ParallelRegex.php 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. <?php
  2. /**
  3. * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
  4. * For an intro to the Lexer see:
  5. * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
  6. *
  7. * @author Marcus Baker http://www.lastcraft.com
  8. */
  9. namespace dokuwiki\Parsing\Lexer;
  10. /**
  11. * Compounded regular expression.
  12. *
  13. * Any of the contained patterns could match and when one does it's label is returned.
  14. */
  15. class ParallelRegex
  16. {
  17. /** @var string[] patterns to match */
  18. protected $patterns;
  19. /** @var string[] labels for above patterns */
  20. protected $labels;
  21. /** @var string the compound regex matching all patterns */
  22. protected $regex;
  23. /** @var bool case sensitive matching? */
  24. protected $case;
  25. /**
  26. * Constructor. Starts with no patterns.
  27. *
  28. * @param boolean $case True for case sensitive, false
  29. * for insensitive.
  30. */
  31. public function __construct($case)
  32. {
  33. $this->case = $case;
  34. $this->patterns = array();
  35. $this->labels = array();
  36. $this->regex = null;
  37. }
  38. /**
  39. * Adds a pattern with an optional label.
  40. *
  41. * @param mixed $pattern Perl style regex. Must be UTF-8
  42. * encoded. If its a string, the (, )
  43. * lose their meaning unless they
  44. * form part of a lookahead or
  45. * lookbehind assertation.
  46. * @param bool|string $label Label of regex to be returned
  47. * on a match. Label must be ASCII
  48. */
  49. public function addPattern($pattern, $label = true)
  50. {
  51. $count = count($this->patterns);
  52. $this->patterns[$count] = $pattern;
  53. $this->labels[$count] = $label;
  54. $this->regex = null;
  55. }
  56. /**
  57. * Attempts to match all patterns at once against a string.
  58. *
  59. * @param string $subject String to match against.
  60. * @param string $match First matched portion of
  61. * subject.
  62. * @return bool|string False if no match found, label if label exists, true if not
  63. */
  64. public function apply($subject, &$match)
  65. {
  66. if (count($this->patterns) == 0) {
  67. return false;
  68. }
  69. if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
  70. $match = "";
  71. return false;
  72. }
  73. $match = $matches[0];
  74. $size = count($matches);
  75. // FIXME this could be made faster by storing the labels as keys in a hashmap
  76. for ($i = 1; $i < $size; $i++) {
  77. if ($matches[$i] && isset($this->labels[$i - 1])) {
  78. return $this->labels[$i - 1];
  79. }
  80. }
  81. return true;
  82. }
  83. /**
  84. * Attempts to split the string against all patterns at once
  85. *
  86. * @param string $subject String to match against.
  87. * @param array $split The split result: array containing, pre-match, match & post-match strings
  88. * @return boolean True on success.
  89. *
  90. * @author Christopher Smith <chris@jalakai.co.uk>
  91. */
  92. public function split($subject, &$split)
  93. {
  94. if (count($this->patterns) == 0) {
  95. return false;
  96. }
  97. if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
  98. if (function_exists('preg_last_error')) {
  99. $err = preg_last_error();
  100. switch ($err) {
  101. case PREG_BACKTRACK_LIMIT_ERROR:
  102. msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
  103. break;
  104. case PREG_RECURSION_LIMIT_ERROR:
  105. msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
  106. break;
  107. case PREG_BAD_UTF8_ERROR:
  108. msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
  109. break;
  110. case PREG_INTERNAL_ERROR:
  111. msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
  112. break;
  113. }
  114. }
  115. $split = array($subject, "", "");
  116. return false;
  117. }
  118. $idx = count($matches)-2;
  119. list($pre, $post) = preg_split($this->patterns[$idx].$this->getPerlMatchingFlags(), $subject, 2);
  120. $split = array($pre, $matches[0], $post);
  121. return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
  122. }
  123. /**
  124. * Compounds the patterns into a single
  125. * regular expression separated with the
  126. * "or" operator. Caches the regex.
  127. * Will automatically escape (, ) and / tokens.
  128. *
  129. * @return null|string
  130. */
  131. protected function getCompoundedRegex()
  132. {
  133. if ($this->regex == null) {
  134. $cnt = count($this->patterns);
  135. for ($i = 0; $i < $cnt; $i++) {
  136. /*
  137. * decompose the input pattern into "(", "(?", ")",
  138. * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
  139. * elements.
  140. */
  141. preg_match_all('/\\\\.|' .
  142. '\(\?|' .
  143. '[()]|' .
  144. '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
  145. '[^[()\\\\]+/', $this->patterns[$i], $elts);
  146. $pattern = "";
  147. $level = 0;
  148. foreach ($elts[0] as $elt) {
  149. /*
  150. * for "(", ")" remember the nesting level, add "\"
  151. * only to the non-"(?" ones.
  152. */
  153. switch ($elt) {
  154. case '(':
  155. $pattern .= '\(';
  156. break;
  157. case ')':
  158. if ($level > 0)
  159. $level--; /* closing (? */
  160. else $pattern .= '\\';
  161. $pattern .= ')';
  162. break;
  163. case '(?':
  164. $level++;
  165. $pattern .= '(?';
  166. break;
  167. default:
  168. if (substr($elt, 0, 1) == '\\')
  169. $pattern .= $elt;
  170. else $pattern .= str_replace('/', '\/', $elt);
  171. }
  172. }
  173. $this->patterns[$i] = "($pattern)";
  174. }
  175. $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
  176. }
  177. return $this->regex;
  178. }
  179. /**
  180. * Accessor for perl regex mode flags to use.
  181. * @return string Perl regex flags.
  182. */
  183. protected function getPerlMatchingFlags()
  184. {
  185. return ($this->case ? "msS" : "msSi");
  186. }
  187. }