from itertools import chain
import re
import warnings

from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach import parse_shim


#: List of allowed tags
ALLOWED_TAGS = [
    "a",
    "abbr",
    "acronym",
    "b",
    "blockquote",
    "code",
    "em",
    "i",
    "li",
    "ol",
    "strong",
    "ul",
]

#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"
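

# A minimal sketch of the invisible-character handling above. The helper name
# is illustrative, not part of bleach's API: control characters are replaced
# with INVISIBLE_REPLACEMENT_CHAR, while tab, lf, and cr pass through.
def _demo_invisible_replacement():
    text = "a\x00b\tc\nd"
    print(repr(INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, text)))
    # -> 'a?b\tc\nd'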


class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

    This is a security-focused cleaner whose sole purpose is to remove
    malicious content from a string so that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed for transforming content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """

    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
        css_sanitizer=None,
    ):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None

        """
        self.tags = tags
        self.attributes = attributes
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []
        self.css_sanitizer = css_sanitizer

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            escape_lt_in_attrs=True,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,
            # clean preserves attr order
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            message = (
                "argument cannot be of '{name}' type, must be of text type".format(
                    name=text.__class__.__name__
                )
            )
            raise TypeError(message)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,
            css_sanitizer=self.css_sanitizer,
            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_protocols=self.protocols,
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
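

# A minimal usage sketch. The helper is illustrative, not part of bleach's
# API: the default Cleaner escapes disallowed tags, while strip=True removes
# them instead.
def _demo_cleaner_usage():
    cleaner = Cleaner()
    print(cleaner.clean("<b>bold</b> <span>span</span>"))
    # -> <b>bold</b> &lt;span&gt;span&lt;/span&gt;

    stripping_cleaner = Cleaner(tags=["b"], strip=True)
    print(stripping_cleaner.clean("<b>bold</b> <i>italic</i>"))
    # -> <b>bold</b> italic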


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):

        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if "*" in attributes:
                attr_val = attributes["*"]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):

        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError("attributes needs to be a callable, a list or a dict")
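

# A short sketch of the three accepted shapes. The helper is illustrative,
# not part of bleach's API; the expected results follow directly from the
# filter functions built above.
def _demo_attribute_shapes():
    # dict shape: per-tag allow lists; "*" applies to every tag
    from_dict = attribute_filter_factory({"a": ["href"], "*": ["title"]})
    print(from_dict("a", "href", "/"))       # True
    print(from_dict("img", "title", "x"))    # True
    print(from_dict("img", "src", "x.png"))  # False

    # list shape: the same attributes are allowed on every tag
    from_list = attribute_filter_factory(["title"])
    print(from_list("abbr", "title", "x"))   # True

    # callable shape: full control over (tag, attr, value)
    from_callable = attribute_filter_factory(
        lambda tag, attr, value: attr.startswith("data-")
    )
    print(from_callable("div", "data-id", "1"))  # True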


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        allowed_elements=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        allowed_protocols=ALLOWED_PROTOCOLS,
        strip_disallowed_elements=False,
        strip_html_comments=True,
        css_sanitizer=None,
        **kwargs,
    ):
        """Creates a BleachSanitizerFilter instance

        :arg source: html5lib TreeWalker stream

        :arg list allowed_elements: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list allowed_protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments
        self.css_sanitizer = css_sanitizer

        # Ignore html5lib's sanitizer deprecation warning--Bleach vendors
        # html5lib and replaces the sanitizer, so the warning doesn't apply
        warnings.filterwarnings(
            "ignore",
            message="html5lib's sanitizer is deprecated",
            category=DeprecationWarning,
            module="bleach._vendor.html5lib",
        )
        super().__init__(
            source,
            allowed_elements=allowed_elements,
            allowed_protocols=allowed_protocols,
            **kwargs,
        )

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                yield from ret
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token["type"] == "Characters":
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        "data": "".join(
                            [char_token["data"] for char_token in characters_buffer]
                        ),
                        "type": "Characters",
                    }
                    characters_buffer = []
                    yield new_token

            elif token["type"] == "Characters":
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            "data": "".join([char_token["data"] for char_token in characters_buffer]),
            "type": "Characters",
        }

        yield new_token

    def __iter__(self):
        return self.merge_characters(
            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
        )

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function with three arguments: tag, attribute name,
        and attribute value. It should return ``True`` or ``False``.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        A sketch of the resulting behavior appears in ``_demo_entity_handling``
        at the bottom of this module.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        A sketch of the resulting behavior appears in
        ``_demo_protocol_filtering`` at the bottom of this module.

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into a normalized one that's
        # easier to match and verify, but shouldn't get returned since it's
        # vastly different than the original value.

        # Convert all character entities in the value
        normalized_uri = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)

        # Remove REPLACEMENT characters
        normalized_uri = normalized_uri.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        normalized_uri = normalized_uri.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = parse_shim.urlparse(normalized_uri)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if normalized_uri.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if (
                ":" in normalized_uri
                and normalized_uri.split(":")[0] in allowed_protocols
            ):
                return value

            # If there's no protocol/scheme specified, then assume it's "http" or
            # "https" and see if that's allowed
            if "http" in allowed_protocols or "https" in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    if self.css_sanitizer:
                        val = self.css_sanitizer.sanitize_css(val)
                    else:
                        # FIXME(willkg): if style is allowed, but no
                        # css_sanitizer was set up, then this is probably a
                        # mistake and we should raise an error here
                        #
                        # For now, we're going to set the value to "" because
                        # there was no sanitizer set (see
                        # _demo_style_and_disallowed_tags at the bottom of
                        # this module)
                        val = ""

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = attrs

        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name)

                attrs.append(
                    ' %s="%s"'
                    % (
                        namespaced_name,
                        # NOTE(willkg): HTMLSerializer escapes attribute values
                        # already, so if we do it here (like HTMLSerializer does),
                        # then we end up double-escaping.
                        v,
                    )
                )
            token["data"] = "<{}{}>".format(token["name"], "".join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]

        return token
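

# The sketches below are illustrative helpers, not part of bleach's API; the
# expected outputs are hedged readings of the filter methods above, traced
# through ``Cleaner.clean``.


def _demo_entity_handling():
    # Per ``sanitize_characters``: recognized entities survive cleaning as
    # Entity tokens, while a bare "&" is escaped by the serializer.
    cleaner = Cleaner()
    print(cleaner.clean("AT&T is &gt; everyone!"))
    # -> AT&amp;T is &gt; everyone!


def _demo_protocol_filtering():
    # Per ``sanitize_uri_value``: hrefs with disallowed protocols are dropped
    # (the tag itself stays), while allowed protocols and bare anchors pass.
    cleaner = Cleaner()
    print(cleaner.clean('<a href="https://example.com">ok</a>'))
    # -> <a href="https://example.com">ok</a>
    print(cleaner.clean('<a href="javascript:alert(1)">nope</a>'))
    # -> <a>nope</a>
    print(cleaner.clean('<a href="#fragment">anchor</a>'))
    # -> <a href="#fragment">anchor</a>


def _demo_style_and_disallowed_tags():
    # Per ``allow_token``: if "style" is allowed but no css_sanitizer is
    # configured, the value is blanked rather than passed through.
    cleaner = Cleaner(tags=["p"], attributes={"p": ["style"]})
    print(cleaner.clean('<p style="color: red">text</p>'))
    # -> <p style="">text</p>

    # Per ``disallowed_token``: with the default strip=False, disallowed tags
    # are re-serialized as escaped text, attributes included.
    default_cleaner = Cleaner()
    print(default_cleaner.clean('<span title="x">hi</span>'))
    # -> &lt;span title="x"&gt;hi&lt;/span&gt;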