c_lexer.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. #------------------------------------------------------------------------------
  2. # pycparser: c_lexer.py
  3. #
  4. # CLexer class: lexer for the C language
  5. #
  6. # Eli Bendersky [https://eli.thegreenplace.net/]
  7. # License: BSD
  8. #------------------------------------------------------------------------------
  9. import re
  10. from .ply import lex
  11. from .ply.lex import TOKEN
  12. class CLexer(object):
  13. """ A lexer for the C language. After building it, set the
  14. input text with input(), and call token() to get new
  15. tokens.
  16. The public attribute filename can be set to an initial
  17. filename, but the lexer will update it upon #line
  18. directives.
  19. """
  20. def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
  21. type_lookup_func):
  22. """ Create a new Lexer.
  23. error_func:
  24. An error function. Will be called with an error
  25. message, line and column as arguments, in case of
  26. an error during lexing.
  27. on_lbrace_func, on_rbrace_func:
  28. Called when an LBRACE or RBRACE is encountered
  29. (likely to push/pop type_lookup_func's scope)
  30. type_lookup_func:
  31. A type lookup function. Given a string, it must
  32. return True IFF this string is a name of a type
  33. that was defined with a typedef earlier.
  34. """
  35. self.error_func = error_func
  36. self.on_lbrace_func = on_lbrace_func
  37. self.on_rbrace_func = on_rbrace_func
  38. self.type_lookup_func = type_lookup_func
  39. self.filename = ''
  40. # Keeps track of the last token returned from self.token()
  41. self.last_token = None
  42. # Allow either "# line" or "# <num>" to support GCC's
  43. # cpp output
  44. #
  45. self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
  46. self.pragma_pattern = re.compile(r'[ \t]*pragma\W')
  47. def build(self, **kwargs):
  48. """ Builds the lexer from the specification. Must be
  49. called after the lexer object is created.
  50. This method exists separately, because the PLY
  51. manual warns against calling lex.lex inside
  52. __init__
  53. """
  54. self.lexer = lex.lex(object=self, **kwargs)
  55. def reset_lineno(self):
  56. """ Resets the internal line number counter of the lexer.
  57. """
  58. self.lexer.lineno = 1
  59. def input(self, text):
  60. self.lexer.input(text)
  61. def token(self):
  62. self.last_token = self.lexer.token()
  63. return self.last_token
  64. def find_tok_column(self, token):
  65. """ Find the column of the token in its line.
  66. """
  67. last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
  68. return token.lexpos - last_cr
  69. ######################-- PRIVATE --######################
  70. ##
  71. ## Internal auxiliary methods
  72. ##
  73. def _error(self, msg, token):
  74. location = self._make_tok_location(token)
  75. self.error_func(msg, location[0], location[1])
  76. self.lexer.skip(1)
  77. def _make_tok_location(self, token):
  78. return (token.lineno, self.find_tok_column(token))
  79. ##
  80. ## Reserved keywords
  81. ##
  82. keywords = (
  83. 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
  84. 'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
  85. 'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
  86. 'REGISTER', 'OFFSETOF',
  87. 'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
  88. 'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
  89. 'VOLATILE', 'WHILE', '__INT128',
  90. )
  91. keywords_new = (
  92. '_BOOL', '_COMPLEX',
  93. '_NORETURN', '_THREAD_LOCAL', '_STATIC_ASSERT',
  94. '_ATOMIC', '_ALIGNOF', '_ALIGNAS',
  95. )
  96. keyword_map = {}
  97. for keyword in keywords:
  98. keyword_map[keyword.lower()] = keyword
  99. for keyword in keywords_new:
  100. keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword
  101. ##
  102. ## All the tokens recognized by the lexer
  103. ##
  104. tokens = keywords + keywords_new + (
  105. # Identifiers
  106. 'ID',
  107. # Type identifiers (identifiers previously defined as
  108. # types with typedef)
  109. 'TYPEID',
  110. # constants
  111. 'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
  112. 'FLOAT_CONST', 'HEX_FLOAT_CONST',
  113. 'CHAR_CONST',
  114. 'WCHAR_CONST',
  115. 'U8CHAR_CONST',
  116. 'U16CHAR_CONST',
  117. 'U32CHAR_CONST',
  118. # String literals
  119. 'STRING_LITERAL',
  120. 'WSTRING_LITERAL',
  121. 'U8STRING_LITERAL',
  122. 'U16STRING_LITERAL',
  123. 'U32STRING_LITERAL',
  124. # Operators
  125. 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
  126. 'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
  127. 'LOR', 'LAND', 'LNOT',
  128. 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',
  129. # Assignment
  130. 'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
  131. 'PLUSEQUAL', 'MINUSEQUAL',
  132. 'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
  133. 'OREQUAL',
  134. # Increment/decrement
  135. 'PLUSPLUS', 'MINUSMINUS',
  136. # Structure dereference (->)
  137. 'ARROW',
  138. # Conditional operator (?)
  139. 'CONDOP',
  140. # Delimiters
  141. 'LPAREN', 'RPAREN', # ( )
  142. 'LBRACKET', 'RBRACKET', # [ ]
  143. 'LBRACE', 'RBRACE', # { }
  144. 'COMMA', 'PERIOD', # . ,
  145. 'SEMI', 'COLON', # ; :
  146. # Ellipsis (...)
  147. 'ELLIPSIS',
  148. # pre-processor
  149. 'PPHASH', # '#'
  150. 'PPPRAGMA', # 'pragma'
  151. 'PPPRAGMASTR',
  152. )
  153. ##
  154. ## Regexes for use in tokens
  155. ##
  156. ##
  157. # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
  158. identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'
  159. hex_prefix = '0[xX]'
  160. hex_digits = '[0-9a-fA-F]+'
  161. bin_prefix = '0[bB]'
  162. bin_digits = '[01]+'
  163. # integer constants (K&R2: A.2.5.1)
  164. integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
  165. decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
  166. octal_constant = '0[0-7]*'+integer_suffix_opt
  167. hex_constant = hex_prefix+hex_digits+integer_suffix_opt
  168. bin_constant = bin_prefix+bin_digits+integer_suffix_opt
  169. bad_octal_constant = '0[0-7]*[89]'
  170. # character constants (K&R2: A.2.5.2)
  171. # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  172. # directives with Windows paths as filenames (..\..\dir\file)
  173. # For the same reason, decimal_escape allows all digit sequences. We want to
  174. # parse all correct code, even if it means to sometimes parse incorrect
  175. # code.
  176. #
  177. # The original regexes were taken verbatim from the C syntax definition,
  178. # and were later modified to avoid worst-case exponential running time.
  179. #
  180. # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  181. # decimal_escape = r"""(\d+)"""
  182. # hex_escape = r"""(x[0-9a-fA-F]+)"""
  183. # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
  184. #
  185. # The following modifications were made to avoid the ambiguity that allowed backtracking:
  186. # (https://github.com/eliben/pycparser/issues/61)
  187. #
  188. # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
  189. # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex
  190. # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal
  191. # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
  192. #
  193. # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
  194. # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
  195. simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
  196. decimal_escape = r"""(\d+)(?!\d)"""
  197. hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
  198. bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
  199. escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  200. # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
  201. # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to
  202. escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
  203. cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  204. char_const = "'"+cconst_char+"'"
  205. wchar_const = 'L'+char_const
  206. u8char_const = 'u8'+char_const
  207. u16char_const = 'u'+char_const
  208. u32char_const = 'U'+char_const
  209. multicharacter_constant = "'"+cconst_char+"{2,4}'"
  210. unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  211. bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
  212. # string literals (K&R2: A.2.6)
  213. string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
  214. string_literal = '"'+string_char+'*"'
  215. wstring_literal = 'L'+string_literal
  216. u8string_literal = 'u8'+string_literal
  217. u16string_literal = 'u'+string_literal
  218. u32string_literal = 'U'+string_literal
  219. bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
  220. # floating constants (K&R2: A.2.5.3)
  221. exponent_part = r"""([eE][-+]?[0-9]+)"""
  222. fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  223. floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
  224. binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
  225. hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
  226. hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'
  227. ##
  228. ## Lexer states: used for preprocessor \n-terminated directives
  229. ##
  230. states = (
  231. # ppline: preprocessor line directives
  232. #
  233. ('ppline', 'exclusive'),
  234. # pppragma: pragma
  235. #
  236. ('pppragma', 'exclusive'),
  237. )
  238. def t_PPHASH(self, t):
  239. r'[ \t]*\#'
  240. if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
  241. t.lexer.begin('ppline')
  242. self.pp_line = self.pp_filename = None
  243. elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
  244. t.lexer.begin('pppragma')
  245. else:
  246. t.type = 'PPHASH'
  247. return t
  248. ##
  249. ## Rules for the ppline state
  250. ##
  251. @TOKEN(string_literal)
  252. def t_ppline_FILENAME(self, t):
  253. if self.pp_line is None:
  254. self._error('filename before line number in #line', t)
  255. else:
  256. self.pp_filename = t.value.lstrip('"').rstrip('"')
  257. @TOKEN(decimal_constant)
  258. def t_ppline_LINE_NUMBER(self, t):
  259. if self.pp_line is None:
  260. self.pp_line = t.value
  261. else:
  262. # Ignore: GCC's cpp sometimes inserts a numeric flag
  263. # after the file name
  264. pass
  265. def t_ppline_NEWLINE(self, t):
  266. r'\n'
  267. if self.pp_line is None:
  268. self._error('line number missing in #line', t)
  269. else:
  270. self.lexer.lineno = int(self.pp_line)
  271. if self.pp_filename is not None:
  272. self.filename = self.pp_filename
  273. t.lexer.begin('INITIAL')
  274. def t_ppline_PPLINE(self, t):
  275. r'line'
  276. pass
  277. t_ppline_ignore = ' \t'
  278. def t_ppline_error(self, t):
  279. self._error('invalid #line directive', t)
  280. ##
  281. ## Rules for the pppragma state
  282. ##
  283. def t_pppragma_NEWLINE(self, t):
  284. r'\n'
  285. t.lexer.lineno += 1
  286. t.lexer.begin('INITIAL')
  287. def t_pppragma_PPPRAGMA(self, t):
  288. r'pragma'
  289. return t
  290. t_pppragma_ignore = ' \t'
  291. def t_pppragma_STR(self, t):
  292. '.+'
  293. t.type = 'PPPRAGMASTR'
  294. return t
  295. def t_pppragma_error(self, t):
  296. self._error('invalid #pragma directive', t)
  297. ##
  298. ## Rules for the normal state
  299. ##
  300. t_ignore = ' \t'
  301. # Newlines
  302. def t_NEWLINE(self, t):
  303. r'\n+'
  304. t.lexer.lineno += t.value.count("\n")
  305. # Operators
  306. t_PLUS = r'\+'
  307. t_MINUS = r'-'
  308. t_TIMES = r'\*'
  309. t_DIVIDE = r'/'
  310. t_MOD = r'%'
  311. t_OR = r'\|'
  312. t_AND = r'&'
  313. t_NOT = r'~'
  314. t_XOR = r'\^'
  315. t_LSHIFT = r'<<'
  316. t_RSHIFT = r'>>'
  317. t_LOR = r'\|\|'
  318. t_LAND = r'&&'
  319. t_LNOT = r'!'
  320. t_LT = r'<'
  321. t_GT = r'>'
  322. t_LE = r'<='
  323. t_GE = r'>='
  324. t_EQ = r'=='
  325. t_NE = r'!='
  326. # Assignment operators
  327. t_EQUALS = r'='
  328. t_TIMESEQUAL = r'\*='
  329. t_DIVEQUAL = r'/='
  330. t_MODEQUAL = r'%='
  331. t_PLUSEQUAL = r'\+='
  332. t_MINUSEQUAL = r'-='
  333. t_LSHIFTEQUAL = r'<<='
  334. t_RSHIFTEQUAL = r'>>='
  335. t_ANDEQUAL = r'&='
  336. t_OREQUAL = r'\|='
  337. t_XOREQUAL = r'\^='
  338. # Increment/decrement
  339. t_PLUSPLUS = r'\+\+'
  340. t_MINUSMINUS = r'--'
  341. # ->
  342. t_ARROW = r'->'
  343. # ?
  344. t_CONDOP = r'\?'
  345. # Delimiters
  346. t_LPAREN = r'\('
  347. t_RPAREN = r'\)'
  348. t_LBRACKET = r'\['
  349. t_RBRACKET = r'\]'
  350. t_COMMA = r','
  351. t_PERIOD = r'\.'
  352. t_SEMI = r';'
  353. t_COLON = r':'
  354. t_ELLIPSIS = r'\.\.\.'
  355. # Scope delimiters
  356. # To see why on_lbrace_func is needed, consider:
  357. # typedef char TT;
  358. # void foo(int TT) { TT = 10; }
  359. # TT x = 5;
  360. # Outside the function, TT is a typedef, but inside (starting and ending
  361. # with the braces) it's a parameter. The trouble begins with yacc's
  362. # lookahead token. If we open a new scope in brace_open, then TT has
  363. # already been read and incorrectly interpreted as TYPEID. So, we need
  364. # to open and close scopes from within the lexer.
  365. # Similar for the TT immediately outside the end of the function.
  366. #
  367. @TOKEN(r'\{')
  368. def t_LBRACE(self, t):
  369. self.on_lbrace_func()
  370. return t
  371. @TOKEN(r'\}')
  372. def t_RBRACE(self, t):
  373. self.on_rbrace_func()
  374. return t
  375. t_STRING_LITERAL = string_literal
  376. # The following floating and integer constants are defined as
  377. # functions to impose a strict order (otherwise, decimal
  378. # is placed before the others because its regex is longer,
  379. # and this is bad)
  380. #
  381. @TOKEN(floating_constant)
  382. def t_FLOAT_CONST(self, t):
  383. return t
  384. @TOKEN(hex_floating_constant)
  385. def t_HEX_FLOAT_CONST(self, t):
  386. return t
  387. @TOKEN(hex_constant)
  388. def t_INT_CONST_HEX(self, t):
  389. return t
  390. @TOKEN(bin_constant)
  391. def t_INT_CONST_BIN(self, t):
  392. return t
  393. @TOKEN(bad_octal_constant)
  394. def t_BAD_CONST_OCT(self, t):
  395. msg = "Invalid octal constant"
  396. self._error(msg, t)
  397. @TOKEN(octal_constant)
  398. def t_INT_CONST_OCT(self, t):
  399. return t
  400. @TOKEN(decimal_constant)
  401. def t_INT_CONST_DEC(self, t):
  402. return t
  403. # Must come before bad_char_const, to prevent it from
  404. # catching valid char constants as invalid
  405. #
  406. @TOKEN(multicharacter_constant)
  407. def t_INT_CONST_CHAR(self, t):
  408. return t
  409. @TOKEN(char_const)
  410. def t_CHAR_CONST(self, t):
  411. return t
  412. @TOKEN(wchar_const)
  413. def t_WCHAR_CONST(self, t):
  414. return t
  415. @TOKEN(u8char_const)
  416. def t_U8CHAR_CONST(self, t):
  417. return t
  418. @TOKEN(u16char_const)
  419. def t_U16CHAR_CONST(self, t):
  420. return t
  421. @TOKEN(u32char_const)
  422. def t_U32CHAR_CONST(self, t):
  423. return t
  424. @TOKEN(unmatched_quote)
  425. def t_UNMATCHED_QUOTE(self, t):
  426. msg = "Unmatched '"
  427. self._error(msg, t)
  428. @TOKEN(bad_char_const)
  429. def t_BAD_CHAR_CONST(self, t):
  430. msg = "Invalid char constant %s" % t.value
  431. self._error(msg, t)
  432. @TOKEN(wstring_literal)
  433. def t_WSTRING_LITERAL(self, t):
  434. return t
  435. @TOKEN(u8string_literal)
  436. def t_U8STRING_LITERAL(self, t):
  437. return t
  438. @TOKEN(u16string_literal)
  439. def t_U16STRING_LITERAL(self, t):
  440. return t
  441. @TOKEN(u32string_literal)
  442. def t_U32STRING_LITERAL(self, t):
  443. return t
  444. # unmatched string literals are caught by the preprocessor
  445. @TOKEN(bad_string_literal)
  446. def t_BAD_STRING_LITERAL(self, t):
  447. msg = "String contains invalid escape code"
  448. self._error(msg, t)
  449. @TOKEN(identifier)
  450. def t_ID(self, t):
  451. t.type = self.keyword_map.get(t.value, "ID")
  452. if t.type == 'ID' and self.type_lookup_func(t.value):
  453. t.type = "TYPEID"
  454. return t
  455. def t_error(self, t):
  456. msg = 'Illegal character %s' % repr(t.value[0])
  457. self._error(msg, t)