<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores.
 */
class Lexer
{
    /** @var ParallelRegex[] compound regexes, keyed by the mode they apply to */
    protected $regexes;
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks a single-token mode, see isSpecialMode()/decodeSpecial()
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenise.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }

        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;

        // reduce() consumes $raw by reference; it returns an array while tokens keep matching
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);

            // $raw now starts right after the match, so step back over the match to find its offset
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }

            // nothing was consumed in this round: give up instead of looping forever
            if ($currentLength === $length) {
                return false;
            }

            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }

        // reduce() returned false (error) rather than true (no further match)
        if (!$parsed) {
            return false;
        }

        // whatever is left over is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched text to the parser, changing the lexer
     * to a new mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean false mode causes no change.
     * @param int $initialPos Byte index of the unmatched leading portion
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        // must be tested before isSpecialMode(): "__exit" starts with an underscore as well
        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if ($this->isSpecialMode($mode)) {
            // enter the mode for this single token only, then drop straight back out
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }

        // non-string mode: a plain match that leaves the current mode untouched
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // dispatchTokens() also hands over non-string modes; those are never special
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param boolean $is_match Token is recognised rather than unparsed data.
     * @param int $pos Current byte index location in raw doc that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }

        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // NOTE(review): sexplode() is defined outside this file; presumably a padded explode() - confirm
            list($handler, $plugin) = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();

        // no patterns were ever registered for the current mode
        if (!isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }

        $action = $this->regexes[$current]->split($raw, $split);
        if ($action) {
            // $raw is replaced by the remainder following the match
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }

        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One addcslashes() pass instead of a preg_replace() table: in a preg_replace()
        // replacement "\$" is an escaped dollar and yields a bare "$", which would leave
        // "$" unescaped. Doing it in one pass also rules out double escaping of backslashes.
        return addcslashes($str, '\\.+*?[^]${}=!<>|:');
    }

    /**
     * Returns the compound regex of a mode, creating it on first use.
     *
     * @param string $mode Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}