# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""
import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file

#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
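
# A small illustrative sketch of what these tables hold (values come from the
# vendored html5lib entity table, so treat the examples as assumptions about
# that table rather than guarantees):
#
#   ENTITIES["amp"]  -> "&"
#   ENTITIES["gt"]   -> ">"
#   ENTITIES_TRIE.has_keys_with_prefix("am")  -> True
#   ENTITIES_TRIE.has_keys_with_prefix("zz")  -> False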

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]

#: Set of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    [
        "address",
        "article",
        "aside",
        "blockquote",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "ul",
    ]
)


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since the last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is
        called is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
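
# A rough sketch of the buffer lifecycle (illustrative only; in practice the
# wrapped HTMLInputStream is created by the tokenizer, not by hand):
#
#   stream = InputStreamWithMemory(HTMLInputStream("<py at 3"))
#   stream.char()       # returns "<"; the tokenizer enters tagOpenState()
#   stream.start_tag()  # buffer reset to ["<"]
#   stream.char()       # returns "p"
#   stream.char()       # returns "y"
#   stream.get_tag()    # returns "<py" -- the original tag-like text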


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (a dict on Python 3.7+, an OrderedDict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {
                    "type": TAG_TOKEN_TYPE_CHARACTERS,
                    "data": "<" + self.currentToken["name"],
                }
            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"
        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""
            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()
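
# A hedged sketch of the effect of emitCurrentToken (inputs and tag lists are
# illustrative; exact output depends on the rest of the cleaning pipeline):
#
#   with parser.tags = ["b"] and parser.strip = False, the "<fake>" in
#   "<b><fake>x</b>" is emitted as a Characters token holding the original
#   "<fake>" text, which the sanitizer/serializer then escapes.
#
#   with parser.strip = True, "<fake>" becomes "" instead, and a stripped
#   block level start tag such as "<div>" becomes "\n".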


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
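
# A minimal usage sketch (the tag list is illustrative; parseFragment and
# namespaceHTMLElements come from the vendored html5lib HTMLParser):
#
#   parser = BleachHTMLParser(
#       tags=["b", "p"],
#       strip=False,
#       consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<p>tom &amp; jerry</p>")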


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
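
# Expected behavior, sketched (text entity values come from the vendored
# html5lib entity table):
#
#   convert_entity("gt")    -> ">"
#   convert_entity("#65")   -> "A"   (decimal code point)
#   convert_entity("#x41")  -> "A"   (hex code point)
#   convert_entity("xyzzy") -> None  (not a known entity)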


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')
    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        possible_entity += c
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
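
# Sketch of expected behavior:
#
#   match_entity("&amp; more") -> "amp"
#   match_entity("&#x41;")     -> "#x41"
#   match_entity("&amp more")  -> None  (no closing ";")
#   match_entity("&xyzzy;")    -> None  (not a prefix of any known entity)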


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]

                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False

                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken

            else:
                if stoken.startswith("<"):
                    in_tag = True

                yield stoken
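
# A minimal end-to-end sketch. The constructor arguments mirror how bleach's
# Cleaner wires these classes together, but treat them as illustrative:
#
#   parser = BleachHTMLParser(
#       tags=["a"], strip=False, consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment('<a href="/?a=1&b=2">x</a>')
#   serializer = BleachHTMLSerializer(
#       quote_attr_values="always",
#       omit_optional_tags=False,
#       resolve_entities=False,
#       sanitize=False,
#       alphabetical_attributes=False,
#   )
#   serializer.render(getTreeWalker("etree")(dom))
#   # -> '<a href="/?a=1&amp;b=2">x</a>'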