Terminators.php 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. <?php
  2. namespace Highlight;
  3. /**
  4. * @internal
  5. *
  6. * @since 9.16.0.0
  7. */
  8. final class Terminators
  9. {
  10. /** @var bool */
  11. private $caseInsensitive;
  12. /** @var array<int, Mode|string> */
  13. private $matchIndexes = array();
  14. /** @var RegEx|null */
  15. private $matcherRe = null;
  16. /** @var array<int, array<int, Mode|string>> */
  17. private $regexes = array();
  18. /** @var int */
  19. private $matchAt = 1;
  20. /** @var Mode */
  21. private $mode;
  22. /** @var int */
  23. public $lastIndex = 0;
  24. /**
  25. * @param bool $caseInsensitive
  26. */
  27. public function __construct($caseInsensitive)
  28. {
  29. $this->caseInsensitive = $caseInsensitive;
  30. }
  31. /**
  32. * @internal
  33. *
  34. * @param Mode $mode
  35. *
  36. * @return self
  37. */
  38. public function _buildModeRegex($mode)
  39. {
  40. $this->mode = $mode;
  41. $term = null;
  42. for ($i = 0; $i < count($mode->contains); ++$i) {
  43. $re = null;
  44. $term = $mode->contains[$i];
  45. if ($term->beginKeywords) {
  46. $re = "\.?(?:" . $term->begin . ")\.?";
  47. } else {
  48. $re = $term->begin;
  49. }
  50. $this->addRule($term, $re);
  51. }
  52. if ($mode->terminator_end) {
  53. $this->addRule('end', $mode->terminator_end);
  54. }
  55. if ($mode->illegal) {
  56. $this->addRule('illegal', $mode->illegal);
  57. }
  58. /** @var array<int, string> $terminators */
  59. $terminators = array();
  60. foreach ($this->regexes as $regex) {
  61. $terminators[] = $regex[1];
  62. }
  63. $this->matcherRe = $this->langRe($this->joinRe($terminators, '|'), true);
  64. $this->lastIndex = 0;
  65. return $this;
  66. }
  67. /**
  68. * @param string $s
  69. *
  70. * @return RegExMatch|null
  71. */
  72. public function exec($s)
  73. {
  74. if (count($this->regexes) === 0) {
  75. return null;
  76. }
  77. $this->matcherRe->lastIndex = $this->lastIndex;
  78. $match = $this->matcherRe->exec($s);
  79. if (!$match) {
  80. return null;
  81. }
  82. /** @var Mode|string $rule */
  83. $rule = null;
  84. for ($i = 0; $i < count($match); ++$i) {
  85. if ($match[$i] !== null && isset($this->matchIndexes[$i])) {
  86. $rule = $this->matchIndexes[$i];
  87. break;
  88. }
  89. }
  90. if (is_string($rule)) {
  91. $match->type = $rule;
  92. } else {
  93. $match->type = "begin";
  94. $match->rule = $rule;
  95. }
  96. return $match;
  97. }
  98. /**
  99. * @param string $value
  100. * @param bool $global
  101. *
  102. * @return RegEx
  103. */
  104. private function langRe($value, $global = false)
  105. {
  106. return RegExUtils::langRe($value, $global, $this->caseInsensitive);
  107. }
  108. /**
  109. * @param Mode|string $rule
  110. * @param string $regex
  111. *
  112. * @return void
  113. */
  114. private function addRule($rule, $regex)
  115. {
  116. $this->matchIndexes[$this->matchAt] = $rule;
  117. $this->regexes[] = array($rule, $regex);
  118. $this->matchAt += $this->reCountMatchGroups($regex) + 1;
  119. }
  120. /**
  121. * joinRe logically computes regexps.join(separator), but fixes the
  122. * backreferences so they continue to match.
  123. *
  124. * it also places each individual regular expression into it's own
  125. * match group, keeping track of the sequencing of those match groups
  126. * is currently an exercise for the caller. :-)
  127. *
  128. * @param array<int, string> $regexps
  129. * @param string $separator
  130. *
  131. * @return string
  132. */
  133. private function joinRe($regexps, $separator)
  134. {
  135. // backreferenceRe matches an open parenthesis or backreference. To avoid
  136. // an incorrect parse, it additionally matches the following:
  137. // - [...] elements, where the meaning of parentheses and escapes change
  138. // - other escape sequences, so we do not misparse escape sequences as
  139. // interesting elements
  140. // - non-matching or lookahead parentheses, which do not capture. These
  141. // follow the '(' with a '?'.
  142. $backreferenceRe = '#\[(?:[^\\\\\]]|\\\.)*\]|\(\??|\\\([1-9][0-9]*)|\\\.#';
  143. $numCaptures = 0;
  144. $ret = '';
  145. $strLen = count($regexps);
  146. for ($i = 0; $i < $strLen; ++$i) {
  147. ++$numCaptures;
  148. $offset = $numCaptures;
  149. $re = $this->reStr($regexps[$i]);
  150. if ($i > 0) {
  151. $ret .= $separator;
  152. }
  153. $ret .= "(";
  154. while (strlen($re) > 0) {
  155. $matches = array();
  156. $matchFound = preg_match($backreferenceRe, $re, $matches, PREG_OFFSET_CAPTURE);
  157. if ($matchFound === 0) {
  158. $ret .= $re;
  159. break;
  160. }
  161. // PHP aliases to match the JS naming conventions
  162. $match = $matches[0];
  163. $index = $match[1];
  164. $ret .= substr($re, 0, $index);
  165. $re = substr($re, $index + strlen($match[0]));
  166. if (substr($match[0], 0, 1) === '\\' && isset($matches[1])) {
  167. // Adjust the backreference.
  168. $ret .= "\\" . strval(intval($matches[1][0]) + $offset);
  169. } else {
  170. $ret .= $match[0];
  171. if ($match[0] == "(") {
  172. ++$numCaptures;
  173. }
  174. }
  175. }
  176. $ret .= ")";
  177. }
  178. return $ret;
  179. }
  180. /**
  181. * @param RegEx|string $re
  182. *
  183. * @return mixed
  184. */
  185. private function reStr($re)
  186. {
  187. if ($re && isset($re->source)) {
  188. return $re->source;
  189. }
  190. return $re;
  191. }
  192. /**
  193. * @param RegEx|string $re
  194. *
  195. * @return int
  196. */
  197. private function reCountMatchGroups($re)
  198. {
  199. $results = array();
  200. $escaped = preg_replace('#(?<!\\\)/#um', '\\/', (string) $re);
  201. preg_match_all("/{$escaped}|/u", '', $results);
  202. return count($results) - 1;
  203. }
  204. }