#!/usr/bin/python3
"""Lexical analysis of formal languages (i.e. code) using Pygments."""

# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Date: $Date: 2022-03-04 16:57:13 +0100 (Fr, 04. Mär 2022) $
# :Copyright: This module has been placed in the public domain.

from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)


class LexerError(ApplicationError):
    pass


class Lexer:
    """Parse `code` lines and yield "classified" tokens.

    Arguments:

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """Set up a lexical analyzer for `code` in `language`."""
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
        # Since version 1.2 (released Jan 01, 2010) Pygments has a
        # TokenMergeFilter: ``self.merge(tokens)`` in `__iter__` could be
        # replaced by ``self.lexer.add_filter('tokenmerge')`` in `__init__`.
        # However, `merge` below also strips a final newline added by Pygments.
        #
        # self.lexer.add_filter('tokenmerge')

    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

        Also strip the final newline (added by pygments).
        """
        tokens = iter(tokens)
        # the default guards against an empty token stream
        # (a bare next() inside a generator would raise RuntimeError):
        (lasttype, lastval) = next(tokens, (None, ''))
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield lasttype, lastval
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield lasttype, lastval
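
    # A sketch of what `merge` does (the token sequence below is an
    # illustrative assumption, not a guaranteed Pygments tokenization):
    #
    #   input:  (Name, 'x'), (Text, ' '), (Text, ' '), (Operator, '='),
    #           (Text, '\n')
    #   output: (Name, 'x'), (Text, '  '), (Operator, '=')
    #
    # The adjacent Text tokens are merged; the trailing newline that
    # Pygments appends is stripped.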

    def __iter__(self):
        """Parse self.code and yield "classified" tokens."""
        if self.lexer is None:
            yield [], self.code
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':    # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:                            # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield classes, value
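

# Usage sketch for `Lexer` (illustrative, not taken from the module itself;
# assumes Pygments is installed and classifies the snippet as shown):
#
#     for classes, value in Lexer('print(1)', 'python', 'short'):
#         print(classes, repr(value))
#
# expected output (short CSS class names; details may vary with the
# Pygments version):
#
#     ['nb'] 'print'
#     ['p'] '('
#     ['mi'] '1'
#     ['p'] ')'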


class NumberLines:
    """Insert linenumber-tokens at the start of every code line.

    Arguments:

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split.
    """

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield ['ln'], self.fmt_str % lineno
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield ttype, line + '\n'
                lineno += 1
                yield ['ln'], self.fmt_str % lineno
            yield ttype, lines[-1]
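

if __name__ == '__main__':
    # Minimal demo sketch (added for illustration, not part of the original
    # module): lex a two-line snippet and prepend line numbers. Falls back
    # to unanalyzed text if Pygments is unavailable.
    sample = 'x = 1\ny = 2'
    language = 'python' if with_pygments else 'text'
    tokens = Lexer(sample, language, tokennames='short')
    for classes, value in NumberLines(tokens, startline=1, endline=2):
        print(classes, repr(value))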