# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2017
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = '3.10'
__tabversion__ = '3.10'

import re
import sys
import types
import copy
import os
import inspect

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')


# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s


# Token class. This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return 'LexToken(%s,%r,%d,%d)' % (self.type, self.value, self.lineno, self.lexpos)

    def __repr__(self):
        return str(self)


# This object is a stand-in for a logging object created by the
# logging module.
class PlyLogger(object):
    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        self.f.write((msg % args) + '\n')

    def warning(self, msg, *args, **kwargs):
        self.f.write('WARNING: ' + (msg % args) + '\n')

    def error(self, msg, *args, **kwargs):
        self.f.write('ERROR: ' + (msg % args) + '\n')

    info = critical
    debug = critical


# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self


# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime. There are only
# a few public methods and attributes:
#
#    input()  - Store a new string in the lexer
#    token()  - Get the next token
#    clone()  - Clone the lexer
#
#    lineno   - Current line number
#    lexpos   - Current position in the input string
# -----------------------------------------------------------------------------
class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = 'INITIAL'     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexstateeoff = {}        # Dictionary of eof functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lexeoff = None           # EOF rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ''           # Ignored characters
        self.lexliterals = ''         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = False      # Optimized mode

    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.
        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, lextab, outputdir=''):
        if isinstance(lextab, types.ModuleType):
            raise IOError("Won't overwrite existing lextab module")
        basetabmodule = lextab.split('.')[-1]
        filename = os.path.join(outputdir, basetabmodule) + '.py'
        with open(filename, 'w') as tf:
            tf.write('# %s.py. This file automatically created by PLY (version %s). Don\'t edit!\n' % (basetabmodule, __version__))
            tf.write('_tabversion = %s\n' % repr(__tabversion__))
            tf.write('_lextokens = set(%s)\n' % repr(tuple(self.lextokens)))
            tf.write('_lexreflags = %s\n' % repr(self.lexreflags))
            tf.write('_lexliterals = %s\n' % repr(self.lexliterals))
            tf.write('_lexstateinfo = %s\n' % repr(self.lexstateinfo))

            # Rewrite the lexstatere table, replacing function objects with function names
            tabre = {}
            for statename, lre in self.lexstatere.items():
                titem = []
                for (pat, func), retext, renames in zip(lre, self.lexstateretext[statename], self.lexstaterenames[statename]):
                    titem.append((retext, _funcs_to_names(func, renames)))
                tabre[statename] = titem

            tf.write('_lexstatere = %s\n' % repr(tabre))
            tf.write('_lexstateignore = %s\n' % repr(self.lexstateignore))

            taberr = {}
            for statename, ef in self.lexstateerrorf.items():
                taberr[statename] = ef.__name__ if ef else None
            tf.write('_lexstateerrorf = %s\n' % repr(taberr))

            tabeof = {}
            for statename, ef in self.lexstateeoff.items():
                tabeof[statename] = ef.__name__ if ef else None
            tf.write('_lexstateeoff = %s\n' % repr(tabeof))

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            exec('import %s' % tabfile)
            lextab = sys.modules[tabfile]

        if getattr(lextab, '_tabversion', '0.0') != __tabversion__:
            raise ImportError('Inconsistent PLY version')

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lextokens_all = self.lextokens | set(self.lexliterals)
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for statename, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for pat, func_name in lre:
                titem.append((re.compile(pat, lextab._lexreflags), _names_to_funcs(func_name, fdict)))

            self.lexstatere[statename] = titem
            self.lexstateretext[statename] = txtitem

        self.lexstateerrorf = {}
        for statename, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[statename] = fdict[ef]

        self.lexstateeoff = {}
        for statename, ef in lextab._lexstateeoff.items():
            self.lexstateeoff[statename] = fdict[ef]

        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError('Expected a string')
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if state not in self.lexstatere:
            raise ValueError('Undefined state')
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, '')
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexeoff = self.lexstateeoff.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n

    # ------------------------------------------------------------
    # opttoken() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it
                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens_all:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.__code__.co_filename, func.__code__.co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = 'error'
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        if self.lexeoff:
            tok = LexToken()
            tok.type = 'eof'
            tok.value = ''
            tok.lineno = self.lineno
            tok.lexpos = lexpos
            tok.lexer = self
            self.lexpos = lexpos
            newtok = self.lexeoff(tok)
            return newtok

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError('No input string given with input()')
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
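
# -----------------------------------------------------------------------------
# Usage sketch (not part of PLY itself): the runtime interface above is driven
# roughly as follows once a lexer has been built with lex(). The token names
# and the sample input are illustrative only.
#
#     import ply.lex as lex
#     lexer = lex.lex()                # build from the t_* rules in the calling module
#     lexer.input('3 + 4 * 10')
#     while True:
#         tok = lexer.token()          # returns LexToken objects, None at end of input
#         if not tok:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)
#
# Because Lexer also implements __iter__/__next__, the same loop can be written
# as:  for tok in lexer: ...
# -----------------------------------------------------------------------------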

# -----------------------------------------------------------------------------
# ==== Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# _get_regex(func)
#
# Returns the regular expression assigned to a function either as a doc string
# or as a .regex attribute attached by the @TOKEN decorator.
# -----------------------------------------------------------------------------
def _get_regex(func):
    return getattr(func, 'regex', func.__doc__)


# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack. This is used to get the environment
# associated with the yacc() call if none was provided.
# -----------------------------------------------------------------------------
def get_caller_module_dict(levels):
    f = sys._getframe(levels)
    ldict = f.f_globals.copy()
    if f.f_globals != f.f_locals:
        ldict.update(f.f_locals)
    return ldict


# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------
def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result


# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------
def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result


# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------
def _form_master_re(relist, reflags, ldict, toknames):
    if not relist:
        return []
    regex = '|'.join(relist)
    try:
        lexre = re.compile(regex, reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find('ignore_') > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])
        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        m = int(len(relist)/2)
        if m == 0:
            m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return (llist+rlist), (lre+rre), (lnames+rnames)
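
# Illustrative sketch (not part of PLY): for a rule list such as
#
#     relist = [r'(?P<t_NUMBER>\d+)', r'(?P<t_PLUS>\+)']
#
# the master regex is r'(?P<t_NUMBER>\d+)|(?P<t_PLUS>\+)'. After a match,
# m.lastindex gives the number of the named group that fired, and lexindexfunc
# maps that group number back to the rule function (or None) and its token
# name, which is exactly how Lexer.token() above dispatches matches.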

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------
def _statetoken(s, names):
    nonstate = 1
    parts = s.split('_')
    for i, part in enumerate(parts[1:], 1):
        if part not in names and part != 'ANY':
            break

    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = '_'.join(parts[i:])
    return (states, tokenname)

# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {'INITIAL': 'inclusive'}
        self.modules = set()
        self.error = False
        self.log = PlyLogger(sys.stderr) if log is None else log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get('tokens', None)
        if not tokens:
            self.log.error('No token list is defined')
            self.error = True
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error('tokens must be a list or tuple')
            self.error = True
            return

        if not tokens:
            self.log.error('tokens is empty')
            self.error = True
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = True
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get('literals', '')
        if not self.literals:
            self.literals = ''

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error('Invalid literal %s. Must be a single character', repr(c))
                    self.error = True

        except TypeError:
            self.log.error('Invalid literals specification. literals must be a sequence of characters')
            self.error = True

    def get_states(self):
        self.states = self.ldict.get('states', None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error('states must be defined as a tuple or list')
                self.error = True
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = True
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error('State name %s must be a string', repr(name))
                        self.error = True
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = True
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = True
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)
    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings
        self.toknames = {}        # Mapping of symbols to token names
        self.funcsym = {}         # Symbols defined as functions
        self.strsym = {}          # Symbols defined as strings
        self.ignore = {}          # Ignore strings by state
        self.errorf = {}          # Error functions by state
        self.eoff = {}            # EOF functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error('No rules of the form t_rulename are defined')
            self.error = True
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, '__call__'):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'eof':
                    for s in states:
                        self.eoff[s] = t
                elif tokname == 'ignore':
                    line = t.__code__.co_firstlineno
                    file = t.__code__.co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string", file, line, t.__name__)
                    self.error = True
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if '\\' in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)
                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = True
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error('%s not defined as a function or string', f)
                self.error = True

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: x[1].__code__.co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            s.sort(key=lambda x: len(x[1]), reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions
            for fname, f in self.funcsym[state]:
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True
                    continue

                if not _get_regex(f):
                    self.log.error("%s:%d: No regular expression defined for rule '%s'", file, line, f.__name__)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (fname, _get_regex(f)), self.reflags)
                    if c.match(''):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = True
                except re.error as e:
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file, line, f.__name__, e)
                    if '#' in _get_regex(f):
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = True

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = True
                    continue

                if tokname not in self.tokens and tokname.find('ignore_') < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = True
                    continue

                try:
                    c = re.compile('(?P<%s>%s)' % (name, r), self.reflags)
                    if (c.match('')):
                        self.log.error("Regular expression for rule '%s' matches empty string", name)
                        self.error = True
                except re.error as e:
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = True

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = True

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = f.__code__.co_firstlineno
                file = f.__code__.co_filename
                module = inspect.getmodule(f)
                self.modules.add(module)

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = f.__code__.co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments", file, line, f.__name__)
                    self.error = True

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = True

        for module in self.modules:
            self.validate_module(module)

    # -----------------------------------------------------------------------------
    # validate_module()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file. This is done using a simple regular expression
    # match on each line in the source code of the given module.
    # -----------------------------------------------------------------------------
    def validate_module(self, module):
        try:
            lines, linen = inspect.getsourcelines(module)
        except IOError:
            return

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen += 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    filename = inspect.getsourcefile(module)
                    self.log.error('%s:%d: Rule %s redefined. Previously defined on line %d', filename, linen, name, prev)
                    self.error = True
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None, object=None, debug=False, optimize=False, lextab='lextab',
        reflags=int(re.VERBOSE), nowarn=False, outputdir=None, debuglog=None, errorlog=None):

    if lextab is None:
        lextab = 'lextab'

    global lexer

    ldict = None
    stateinfo = {'INITIAL': 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token, input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    # Get the module dictionary used for the parser
    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
        # If no __file__ attribute is available, try to obtain it from the __module__ instead
        if '__file__' not in ldict:
            ldict['__file__'] = sys.modules[ldict['__module__']].__file__
    else:
        ldict = get_caller_module_dict(2)

    # Determine if the module is package of a package or not.
    # If so, fix the tabmodule setting so that tables load correctly
    pkg = ldict.get('__package__')
    if pkg and isinstance(lextab, str):
        if '.' not in lextab:
            lextab = pkg + '.' + lextab

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info('lex: tokens = %r', linfo.tokens)
        debuglog.info('lex: literals = %r', linfo.literals)
        debuglog.info('lex: states = %r', linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = set()
    for n in linfo.tokens:
        lexobj.lextokens.add(n)

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    lexobj.lextokens_all = lexobj.lextokens | set(lexobj.lexliterals)

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = {}
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = f.__code__.co_firstlineno
            file = f.__code__.co_filename
            regex_list.append('(?P<%s>%s)' % (fname, _get_regex(f)))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, _get_regex(f), state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append('(?P<%s>%s)' % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions
    if debug:
        debuglog.info('lex: ==== MASTER REGEXS FOLLOW ====')

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i, text in enumerate(re_text):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, text)

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != 'INITIAL' and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere['INITIAL']
    lexobj.lexretext = lexobj.lexstateretext['INITIAL']
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get('INITIAL', '')

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get('INITIAL', None)
    if not lexobj.lexerrorf:
        errorlog.warning('No t_error rule is defined')

    # Set up eof functions
    lexobj.lexstateeoff = linfo.eoff
    lexobj.lexeoff = linfo.eoff.get('INITIAL', None)

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get('INITIAL', None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get('INITIAL', '')

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        if outputdir is None:
            # If no output directory is set, the location of the output files
            # is determined according to the following rules:
            #     - If lextab specifies a package, files go into that package directory
            #     - Otherwise, files go in the same directory as the specifying module
            if isinstance(lextab, types.ModuleType):
                srcfile = lextab.__file__
            else:
                if '.' not in lextab:
                    srcfile = ldict['__file__']
                else:
                    parts = lextab.split('.')
                    pkgname = '.'.join(parts[:-1])
                    exec('import %s' % pkgname)
                    srcfile = getattr(sys.modules[pkgname], '__file__', '')
            outputdir = os.path.dirname(srcfile)
        try:
            lexobj.writetab(lextab, outputdir)
        except IOError as e:
            errorlog.warning("Couldn't write lextab module %r. %s" % (lextab, e))

    return lexobj
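
# -----------------------------------------------------------------------------
# Example (sketch only, not part of this module): a minimal rule module that
# lex() can build a lexer from. The token names and regexes are illustrative;
# any module (or object) defining `tokens`, optional `literals`, t_* rules,
# and t_error in this form is handled the same way by LexerReflect above.
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER', 'PLUS')
#     t_PLUS = r'\+'
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         print("Illegal character %r" % t.value[0])
#         t.lexer.skip(1)
#
#     lexer = lex.lex()      # collects the definitions above and builds the lexer
# -----------------------------------------------------------------------------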

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------
def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write('Reading from standard input (type EOF to end):\n')
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write('(%s,%r,%d,%d)\n' % (tok.type, tok.value, tok.lineno, tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
def TOKEN(r):
    def set_regex(f):
        if hasattr(r, '__call__'):
            f.regex = _get_regex(r)
        else:
            f.regex = r
        return f
    return set_regex


# Alternative spelling of the TOKEN decorator
Token = TOKEN
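
# -----------------------------------------------------------------------------
# Example (sketch only): using @TOKEN to attach a regex that is built at
# runtime and therefore cannot be written as a docstring. The identifier
# pattern and rule name are illustrative.
#
#     from ply.lex import TOKEN
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         # t.value holds the matched text; classify or transform it here
#         return t
# -----------------------------------------------------------------------------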