html5lib_shim.py

# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""
import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
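
# A quick illustration of these tables (values come from html5lib's entity
# table; shown here for orientation only):
#
#   ENTITIES["amp;"]                          -> "&"
#   ENTITIES_TRIE.has_keys_with_prefix("am")  -> True
#   ENTITIES_TRIE.has_keys_with_prefix("zz")  -> False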

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]


#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    [
        "address",
        "article",
        "aside",
        "blockquote",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "ul",
    ]
)


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
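
# A minimal sketch of how the tokenizer drives this wrapper (illustrative;
# in practice BleachHTMLTokenizer makes these calls as it changes state):
#
#   stream = InputStreamWithMemory(HTMLInputStream("<p class=x>rest"))
#   stream.char()            # tokenizer consumes "<"
#   stream.start_tag()       # tagOpenState() resets the buffer to ["<"]
#   stream.charsUntil(">")   # consumes "p class=x"
#   stream.get_tag()         # -> "<p class=x"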


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {
                    "type": TAG_TOKEN_TYPE_CHARACTERS,
                    "data": "<" + self.currentToken["name"],
                }
            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"
        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""
            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
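
# A rough sketch of how these classes fit together (argument values here are
# illustrative, not the exact ones bleach's clean() uses):
#
#   parser = BleachHTMLParser(
#       tags=["p", "strong"],
#       strip=False,
#       consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<p>AT&amp;T <blink>hi</blink></p>")
#   walker = getTreeWalker("etree")
#   serializer = BleachHTMLSerializer(quote_attr_values="always")
#   result = serializer.render(walker(dom))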


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
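
# Expected behavior, sketched as examples:
#
#   convert_entity("#x41")   -> "A"    hex numeric entity
#   convert_entity("#65")    -> "A"    decimal numeric entity
#   convert_entity("amp")    -> "&"    named entity
#   convert_entity("bogus")  -> None   ambiguous ampersand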


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
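
# For instance (illustrative; "&bogus;" matches no known entity, so it is
# left alone):
#
#   convert_entities("AT&amp;T &#x2665; &bogus; x")
#   -> "AT&T \u2665 &bogus; x"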


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')
    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        possible_entity += c
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
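
# Examples of the matching rules (illustrative):
#
#   match_entity("&amp; rest")  -> "amp"    known entity, terminated by ";"
#   match_entity("&amp rest")   -> None     no terminating ";"
#   match_entity("&#x41;")      -> "#x41"   numeric entities pass through
#   match_entity("&zzz;")       -> None     not a prefix of any known entity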


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
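
# For example (illustrative):
#
#   list(next_possible_entity("a &amp; b &lt;"))
#   -> ["a ", "&amp; b ", "&lt;"]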


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")
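
    # Sketch of the effect on an attribute-value stoken (illustrative):
    #
    #   list(BleachHTMLSerializer().escape_base_amp("Q &amp; A &amp;gt; x"))
    #   -> ["Q ", "&amp; A ", "&gt;", " x"]
    #
    # The bare & gets re-escaped, while the real &gt; entity is kept intact.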

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken

            else:
                if stoken.startswith("<"):
                    in_tag = True

                yield stoken
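
# html5lib's serializer yields small string tokens ("stokens"); for an anchor
# tag they look roughly like this (illustrative):
#
#   "<a", " ", "href", "=", '"', "?a=1&amp;b=2", '"', ">"
#
# The loop in serialize() above tracks "=" and the surrounding quotes so that
# only the attribute-value stoken is run through escape_base_amp, which undoes
# html5lib's blanket & escaping and re-escapes only ambiguous ampersands.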