code_analyzer.py

#!/usr/bin/python3
"""Lexical analysis of formal languages (i.e. code) using Pygments."""

# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Date: $Date: 2022-03-04 16:57:13 +0100 (Fr, 04. Mär 2022) $
# :Copyright: This module has been placed in the public domain.

from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)


class LexerError(ApplicationError):
    pass


class Lexer:
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """
        Set up a lexical analyzer for `code` in `language`.
        """
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
        # Since version 1.2 (released Jan 01, 2010) Pygments has a
        # TokenMergeFilter: ``self.merge(tokens)`` in __iter__ could
        # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
        # However, `merge` below also strips a final newline added by pygments.
        #
        # self.lexer.add_filter('tokenmerge')

    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

        Also strip the final newline (added by pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = next(tokens)
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield lasttype, lastval
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield lasttype, lastval

    def __iter__(self):
        """Parse self.code and yield "classified" tokens.
        """
        if self.lexer is None:
            yield [], self.code
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield classes, value
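

# Usage sketch (illustration, not part of the original module; it assumes
# Pygments is installed and registers a lexer named 'python'):
#
#     for classes, value in Lexer('print("hello")\n', 'python', 'short'):
#         ...  # `classes` is a list of CSS class names, `value` a substring
#
# With an empty `language` or with ``tokennames='none'``, the whole `code`
# string is yielded as a single ``([], code)`` tuple.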


class NumberLines:
    """Insert linenumber-tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split."""

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield ['ln'], self.fmt_str % lineno
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield ttype, line + '\n'
                lineno += 1
                yield ['ln'], self.fmt_str % lineno
            yield ttype, lines[-1]
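

if __name__ == '__main__':
    # Minimal demonstration sketch (not part of the original module): lex a
    # short Python snippet and prepend padded line numbers. The exact token
    # classes depend on the installed Pygments version; without Pygments the
    # snippet is passed through unparsed.
    sample = 'x = 1\nprint(x)\n'
    names = 'short' if with_pygments else 'none'
    tokens = Lexer(sample, 'python', tokennames=names)
    for classes, value in NumberLines(tokens, startline=1, endline=2):
        print(classes, repr(value))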