# rfc7230.py 2.5 KB

  1. """
  2. This contains a bunch of RFC7230 definitions and regular expressions that are
  3. needed to properly parse HTTP messages.
  4. """
  5. import re
  6. HEXDIG = "[0-9a-fA-F]"
  7. DIGIT = "[0-9]"
  8. WS = "[ \t]"
  9. OWS = WS + "{0,}?"
  10. RWS = WS + "{1,}?"
  11. BWS = OWS
  12. # RFC 7230 Section 3.2.6 "Field Value Components":
  13. # tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
  14. # / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
  15. # / DIGIT / ALPHA
  16. # obs-text = %x80-FF
  17. TCHAR = r"[!#$%&'*+\-.^_`|~0-9A-Za-z]"
  18. OBS_TEXT = r"\x80-\xff"
  19. TOKEN = TCHAR + "{1,}"
  20. # RFC 5234 Appendix B.1 "Core Rules":
  21. # VCHAR = %x21-7E
  22. # ; visible (printing) characters
  23. VCHAR = r"\x21-\x7e"
  24. # The '\\' between \x5b and \x5d is needed to escape \x5d (']')
  25. QDTEXT = "[\t \x21\x23-\x5b\\\x5d-\x7e" + OBS_TEXT + "]"
  26. QUOTED_PAIR = r"\\" + "([\t " + VCHAR + OBS_TEXT + "])"
  27. QUOTED_STRING = '"(?:(?:' + QDTEXT + ")|(?:" + QUOTED_PAIR + '))*"'
  28. # header-field = field-name ":" OWS field-value OWS
  29. # field-name = token
  30. # field-value = *( field-content / obs-fold )
  31. # field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ]
  32. # field-vchar = VCHAR / obs-text
  33. # Errata from: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
  34. # changes field-content to:
  35. #
  36. # field-content = field-vchar [ 1*( SP / HTAB / field-vchar )
  37. # field-vchar ]
  38. FIELD_VCHAR = "[" + VCHAR + OBS_TEXT + "]"
  39. # Field content is more greedy than the ABNF, in that it will match the whole value
  40. FIELD_CONTENT = FIELD_VCHAR + "+(?:[ \t]+" + FIELD_VCHAR + "+)*"
  41. # Which allows the field value here to just see if there is even a value in the first place
  42. FIELD_VALUE = "(?:" + FIELD_CONTENT + ")?"
  43. # chunk-ext = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
  44. # chunk-ext-name = token
  45. # chunk-ext-val = token / quoted-string
  46. CHUNK_EXT_NAME = TOKEN
  47. CHUNK_EXT_VAL = "(?:" + TOKEN + ")|(?:" + QUOTED_STRING + ")"
  48. CHUNK_EXT = (
  49. "(?:;(?P<extension>" + CHUNK_EXT_NAME + ")(?:=(?P<value>" + CHUNK_EXT_VAL + "))?)*"
  50. )
  51. # Pre-compiled regular expressions for use elsewhere
  52. ONLY_HEXDIG_RE = re.compile(("^" + HEXDIG + "+$").encode("latin-1"))
  53. ONLY_DIGIT_RE = re.compile(("^" + DIGIT + "+$").encode("latin-1"))
  54. HEADER_FIELD_RE = re.compile(
  55. (
  56. "^(?P<name>" + TOKEN + "):" + OWS + "(?P<value>" + FIELD_VALUE + ")" + OWS + "$"
  57. ).encode("latin-1")
  58. )
  59. QUOTED_PAIR_RE = re.compile(QUOTED_PAIR)
  60. QUOTED_STRING_RE = re.compile(QUOTED_STRING)
  61. CHUNK_EXT_RE = re.compile(("^" + CHUNK_EXT + "$").encode("latin-1"))