Lexer.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. <?php
  2. /**
  3. * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
  4. * For an intro to the Lexer see:
  5. * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
  6. *
  7. * @author Marcus Baker http://www.lastcraft.com
  8. */
  9. namespace dokuwiki\Parsing\Lexer;
  10. /**
  11. * Accepts text and breaks it into tokens.
  12. *
  13. * Some optimisation to make the sure the content is only scanned by the PHP regex
  14. * parser once. Lexer modes must not start with leading underscores.
  15. */
  16. class Lexer
  17. {
  18. /** @var ParallelRegex[] */
  19. protected $regexes;
  20. /** @var \Doku_Handler */
  21. protected $handler;
  22. /** @var StateStack */
  23. protected $modeStack;
  24. /** @var array mode "rewrites" */
  25. protected $mode_handlers;
  26. /** @var bool case sensitive? */
  27. protected $case;
  28. /**
  29. * Sets up the lexer in case insensitive matching by default.
  30. *
  31. * @param \Doku_Handler $handler Handling strategy by reference.
  32. * @param string $start Starting handler.
  33. * @param boolean $case True for case sensitive.
  34. */
  35. public function __construct($handler, $start = "accept", $case = false)
  36. {
  37. $this->case = $case;
  38. $this->regexes = array();
  39. $this->handler = $handler;
  40. $this->modeStack = new StateStack($start);
  41. $this->mode_handlers = array();
  42. }
  43. /**
  44. * Adds a token search pattern for a particular parsing mode.
  45. *
  46. * The pattern does not change the current mode.
  47. *
  48. * @param string $pattern Perl style regex, but ( and )
  49. * lose the usual meaning.
  50. * @param string $mode Should only apply this
  51. * pattern when dealing with
  52. * this type of input.
  53. */
  54. public function addPattern($pattern, $mode = "accept")
  55. {
  56. if (! isset($this->regexes[$mode])) {
  57. $this->regexes[$mode] = new ParallelRegex($this->case);
  58. }
  59. $this->regexes[$mode]->addPattern($pattern);
  60. }
  61. /**
  62. * Adds a pattern that will enter a new parsing mode.
  63. *
  64. * Useful for entering parenthesis, strings, tags, etc.
  65. *
  66. * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
  67. * @param string $mode Should only apply this pattern when dealing with this type of input.
  68. * @param string $new_mode Change parsing to this new nested mode.
  69. */
  70. public function addEntryPattern($pattern, $mode, $new_mode)
  71. {
  72. if (! isset($this->regexes[$mode])) {
  73. $this->regexes[$mode] = new ParallelRegex($this->case);
  74. }
  75. $this->regexes[$mode]->addPattern($pattern, $new_mode);
  76. }
  77. /**
  78. * Adds a pattern that will exit the current mode and re-enter the previous one.
  79. *
  80. * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
  81. * @param string $mode Mode to leave.
  82. */
  83. public function addExitPattern($pattern, $mode)
  84. {
  85. if (! isset($this->regexes[$mode])) {
  86. $this->regexes[$mode] = new ParallelRegex($this->case);
  87. }
  88. $this->regexes[$mode]->addPattern($pattern, "__exit");
  89. }
  90. /**
  91. * Adds a pattern that has a special mode.
  92. *
  93. * Acts as an entry and exit pattern in one go, effectively calling a special
  94. * parser handler for this token only.
  95. *
  96. * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
  97. * @param string $mode Should only apply this pattern when dealing with this type of input.
  98. * @param string $special Use this mode for this one token.
  99. */
  100. public function addSpecialPattern($pattern, $mode, $special)
  101. {
  102. if (! isset($this->regexes[$mode])) {
  103. $this->regexes[$mode] = new ParallelRegex($this->case);
  104. }
  105. $this->regexes[$mode]->addPattern($pattern, "_$special");
  106. }
  107. /**
  108. * Adds a mapping from a mode to another handler.
  109. *
  110. * @param string $mode Mode to be remapped.
  111. * @param string $handler New target handler.
  112. */
  113. public function mapHandler($mode, $handler)
  114. {
  115. $this->mode_handlers[$mode] = $handler;
  116. }
  117. /**
  118. * Splits the page text into tokens.
  119. *
  120. * Will fail if the handlers report an error or if no content is consumed. If successful then each
  121. * unparsed and parsed token invokes a call to the held listener.
  122. *
  123. * @param string $raw Raw HTML text.
  124. * @return boolean True on success, else false.
  125. */
  126. public function parse($raw)
  127. {
  128. if (! isset($this->handler)) {
  129. return false;
  130. }
  131. $initialLength = strlen($raw);
  132. $length = $initialLength;
  133. $pos = 0;
  134. while (is_array($parsed = $this->reduce($raw))) {
  135. list($unmatched, $matched, $mode) = $parsed;
  136. $currentLength = strlen($raw);
  137. $matchPos = $initialLength - $currentLength - strlen($matched);
  138. if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
  139. return false;
  140. }
  141. if ($currentLength == $length) {
  142. return false;
  143. }
  144. $length = $currentLength;
  145. $pos = $initialLength - $currentLength;
  146. }
  147. if (!$parsed) {
  148. return false;
  149. }
  150. return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
  151. }
  152. /**
  153. * Gives plugins access to the mode stack
  154. *
  155. * @return StateStack
  156. */
  157. public function getModeStack()
  158. {
  159. return $this->modeStack;
  160. }
  161. /**
  162. * Sends the matched token and any leading unmatched
  163. * text to the parser changing the lexer to a new
  164. * mode if one is listed.
  165. *
  166. * @param string $unmatched Unmatched leading portion.
  167. * @param string $matched Actual token match.
  168. * @param bool|string $mode Mode after match. A boolean false mode causes no change.
  169. * @param int $initialPos
  170. * @param int $matchPos Current byte index location in raw doc thats being parsed
  171. * @return boolean False if there was any error from the parser.
  172. */
  173. protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
  174. {
  175. if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
  176. return false;
  177. }
  178. if ($this->isModeEnd($mode)) {
  179. if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
  180. return false;
  181. }
  182. return $this->modeStack->leave();
  183. }
  184. if ($this->isSpecialMode($mode)) {
  185. $this->modeStack->enter($this->decodeSpecial($mode));
  186. if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
  187. return false;
  188. }
  189. return $this->modeStack->leave();
  190. }
  191. if (is_string($mode)) {
  192. $this->modeStack->enter($mode);
  193. return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
  194. }
  195. return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
  196. }
  197. /**
  198. * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
  199. * mode stack.
  200. *
  201. * @param string $mode Mode to test.
  202. * @return boolean True if this is the exit mode.
  203. */
  204. protected function isModeEnd($mode)
  205. {
  206. return ($mode === "__exit");
  207. }
  208. /**
  209. * Test to see if the mode is one where this mode is entered for this token only and automatically
  210. * leaves immediately afterwoods.
  211. *
  212. * @param string $mode Mode to test.
  213. * @return boolean True if this is the exit mode.
  214. */
  215. protected function isSpecialMode($mode)
  216. {
  217. return (strncmp($mode, "_", 1) == 0);
  218. }
  219. /**
  220. * Strips the magic underscore marking single token modes.
  221. *
  222. * @param string $mode Mode to decode.
  223. * @return string Underlying mode name.
  224. */
  225. protected function decodeSpecial($mode)
  226. {
  227. return substr($mode, 1);
  228. }
  229. /**
  230. * Calls the parser method named after the current mode.
  231. *
  232. * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
  233. *
  234. * @param string $content Text parsed.
  235. * @param boolean $is_match Token is recognised rather
  236. * than unparsed data.
  237. * @param int $pos Current byte index location in raw doc
  238. * thats being parsed
  239. * @return bool
  240. */
  241. protected function invokeHandler($content, $is_match, $pos)
  242. {
  243. if (($content === "") || ($content === false)) {
  244. return true;
  245. }
  246. $handler = $this->modeStack->getCurrent();
  247. if (isset($this->mode_handlers[$handler])) {
  248. $handler = $this->mode_handlers[$handler];
  249. }
  250. // modes starting with plugin_ are all handled by the same
  251. // handler but with an additional parameter
  252. if (substr($handler, 0, 7)=='plugin_') {
  253. list($handler,$plugin) = sexplode('_', $handler, 2, '');
  254. return $this->handler->$handler($content, $is_match, $pos, $plugin);
  255. }
  256. return $this->handler->$handler($content, $is_match, $pos);
  257. }
  258. /**
  259. * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
  260. * unparsed data. Empty strings will not be matched.
  261. *
  262. * @param string $raw The subject to parse. This is the content that will be eaten.
  263. * @return array|bool Three item list of unparsed content followed by the
  264. * recognised token and finally the action the parser is to take.
  265. * True if no match, false if there is a parsing error.
  266. */
  267. protected function reduce(&$raw)
  268. {
  269. if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
  270. return false;
  271. }
  272. if ($raw === "") {
  273. return true;
  274. }
  275. if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
  276. list($unparsed, $match, $raw) = $split;
  277. return array($unparsed, $match, $action);
  278. }
  279. return true;
  280. }
  281. /**
  282. * Escapes regex characters other than (, ) and /
  283. *
  284. * @param string $str
  285. * @return string
  286. */
  287. public static function escape($str)
  288. {
  289. $chars = array(
  290. '/\\\\/',
  291. '/\./',
  292. '/\+/',
  293. '/\*/',
  294. '/\?/',
  295. '/\[/',
  296. '/\^/',
  297. '/\]/',
  298. '/\$/',
  299. '/\{/',
  300. '/\}/',
  301. '/\=/',
  302. '/\!/',
  303. '/\</',
  304. '/\>/',
  305. '/\|/',
  306. '/\:/'
  307. );
  308. $escaped = array(
  309. '\\\\\\\\',
  310. '\.',
  311. '\+',
  312. '\*',
  313. '\?',
  314. '\[',
  315. '\^',
  316. '\]',
  317. '\$',
  318. '\{',
  319. '\}',
  320. '\=',
  321. '\!',
  322. '\<',
  323. '\>',
  324. '\|',
  325. '\:'
  326. );
  327. return preg_replace($chars, $escaped, $str);
  328. }
  329. }