  1. ##############################################################################
  2. #
  3. # Copyright (c) 2001, 2002 Zope Foundation and Contributors.
  4. # All Rights Reserved.
  5. #
  6. # This software is subject to the provisions of the Zope Public License,
  7. # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
  8. # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  9. # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  10. # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  11. # FOR A PARTICULAR PURPOSE.
  12. #
  13. ##############################################################################
  14. """HTTP Request Parser
  15. This server uses asyncore to accept connections and do initial
  16. processing but threads to do work.
  17. """
  18. from io import BytesIO
  19. import re
  20. from urllib import parse
  21. from urllib.parse import unquote_to_bytes
  22. from waitress.buffers import OverflowableBuffer
  23. from waitress.receiver import ChunkedReceiver, FixedStreamReceiver
  24. from waitress.rfc7230 import HEADER_FIELD_RE, ONLY_DIGIT_RE
  25. from waitress.utilities import (
  26. BadRequest,
  27. RequestEntityTooLarge,
  28. RequestHeaderFieldsTooLarge,
  29. ServerNotImplemented,
  30. find_double_newline,
  31. )
  32. def unquote_bytes_to_wsgi(bytestring):
  33. return unquote_to_bytes(bytestring).decode("latin-1")
  34. class ParsingError(Exception):
  35. pass
  36. class TransferEncodingNotImplemented(Exception):
  37. pass
class HTTPRequestParser:
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.

    Raw socket bytes are fed in via received(); once ``completed`` is
    True the parsed request (headers, command, uri, body stream) is
    ready, or ``error`` holds the failure to report to the client.
    """

    completed = False  # Set once request is completed.
    empty = False  # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False  # True when headers have been read
    header_plus = b""  # accumulated raw header bytes awaiting the blank line
    chunked = False  # True when the body uses Transfer-Encoding: chunked
    content_length = 0  # parsed Content-Length value (0 when absent)
    header_bytes_received = 0  # running total, checked against max_request_header_size
    body_bytes_received = 0  # running total, checked against max_request_body_size
    body_rcv = None  # body receiver (Fixed/Chunked) once a body is expected
    version = "1.0"  # HTTP version parsed from the request line
    error = None  # error instance set on any parse or limit failure
    connection_close = False  # True when the connection must not be kept alive

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object (server configuration / limits).
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request. Returns the number of
        bytes consumed. Sets the completed flag once both the header and the
        body have been received.
        """
        if self.completed:
            return 0  # Can't consume any more.

        datalen = len(data)
        br = self.body_rcv

        if br is None:
            # In header.
            max_header = self.adj.max_request_header_size

            s = self.header_plus + data
            index = find_double_newline(s)
            consumed = 0

            if index >= 0:
                # If the headers have ended, and we also have part of the body
                # message in data we still want to validate we aren't going
                # over our limit for received headers.
                self.header_bytes_received = index
                consumed = datalen - (len(s) - index)
            else:
                self.header_bytes_received += datalen
                consumed = datalen

            # If the first line + headers is over the max length, we return a
            # RequestHeaderFieldsTooLarge error rather than continuing to
            # attempt to parse the headers.
            if self.header_bytes_received >= max_header:
                # Parse a synthetic minimal request line so downstream code
                # has a command/version to build the error response with.
                self.parse_header(b"GET / HTTP/1.0\r\n")
                self.error = RequestHeaderFieldsTooLarge(
                    "exceeds max_header of %s" % max_header
                )
                self.completed = True

                return consumed

            if index >= 0:
                # Header finished.
                header_plus = s[:index]

                # Remove preceding blank lines. This is suggested by
                # https://tools.ietf.org/html/rfc7230#section-3.5 to support
                # clients sending an extra CR LF after another request when
                # using HTTP pipelining
                header_plus = header_plus.lstrip()

                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    except TransferEncodingNotImplemented as e:
                        self.error = ServerNotImplemented(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True

                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large
                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    "exceeds max_body of %s" % max_body
                                )
                                self.completed = True
                self.headers_finished = True

                return consumed

            # Header not finished yet.
            self.header_plus = s

            return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size

            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                # (Content-Length bodies were rejected at header time above)
                self.error = RequestEntityTooLarge("exceeds max_body of %s" % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True

                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here.  We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers["CONTENT_LENGTH"] = str(br.__len__())

            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).

        Populates command/version/uri/path/query/fragment and the
        ``headers`` mapping, and sets up ``body_rcv`` when a body is
        expected.  Raises ParsingError for malformed input and
        TransferEncodingNotImplemented for unsupported transfer-codings.
        """
        index = header_plus.find(b"\r\n")

        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 2 :]
        else:
            raise ParsingError("HTTP message header invalid")

        # Reject request smuggling via a bare CR or LF in the request line.
        if b"\r" in first_line or b"\n" in first_line:
            raise ParsingError("Bare CR or LF found in HTTP message")

        self.first_line = first_line  # for testing

        lines = get_header_lines(header)

        headers = self.headers

        for line in lines:
            header = HEADER_FIELD_RE.match(line)

            if not header:
                raise ParsingError("Invalid header")

            key, value = header.group("name", "value")

            if b"_" in key:
                # TODO(xistence): Should we drop this request instead?
                continue

            # Only strip off whitespace that is considered valid whitespace by
            # RFC7230, don't strip the rest
            value = value.strip(b" \t")
            key1 = key.upper().replace(b"-", b"_").decode("latin-1")

            # If a header already exists, we append subsequent values
            # separated by a comma. Applications already need to handle
            # the comma separated values, as HTTP front ends might do
            # the concatenation for you (behavior specified in RFC2616).
            try:
                headers[key1] += (b", " + value).decode("latin-1")
            except KeyError:
                headers[key1] = value.decode("latin-1")

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)

        # self.request_uri is like nginx's request_uri:
        # "full original request URI (with arguments)"
        self.request_uri = uri.decode("latin-1")

        version = version.decode("latin-1")
        command = command.decode("latin-1")
        self.command = command
        self.version = version
        (
            self.proxy_scheme,
            self.proxy_netloc,
            self.path,
            self.query,
            self.fragment,
        ) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get("CONNECTION", "")

        # HTTP/1.0 defaults to closing the connection unless the client
        # explicitly asks for keep-alive; HTTP/1.1 defaults to keep-alive.
        if version == "1.0":
            if connection.lower() != "keep-alive":
                self.connection_close = True

        if version == "1.1":
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop("TRANSFER_ENCODING", "")

            # NB: We can not just call bare strip() here because it will also
            # remove other non-printable characters that we explicitly do not
            # want removed so that if someone attempts to smuggle a request
            # with these characters we don't fall prey to it.
            #
            # For example \x85 is stripped by default, but it is not considered
            # valid whitespace to be stripped by RFC7230.
            encodings = [
                encoding.strip(" \t").lower() for encoding in te.split(",") if encoding
            ]

            for encoding in encodings:
                # Out of the transfer-codings listed in
                # https://tools.ietf.org/html/rfc7230#section-4 we only support
                # chunked at this time.

                # Note: the identity transfer-coding was removed in RFC7230:
                # https://tools.ietf.org/html/rfc7230#appendix-A.2 and is thus
                # not supported
                if encoding not in {"chunked"}:
                    raise TransferEncodingNotImplemented(
                        "Transfer-Encoding requested is not supported."
                    )

            if encodings and encodings[-1] == "chunked":
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            elif encodings:  # pragma: nocover
                raise TransferEncodingNotImplemented(
                    "Transfer-Encoding requested is not supported."
                )

        expect = headers.get("EXPECT", "").lower()
        self.expect_continue = expect == "100-continue"

        if connection.lower() == "close":
            self.connection_close = True

        if not self.chunked:
            cl = headers.get("CONTENT_LENGTH", "0")

            # Strictly digits only: rejects signs, whitespace, and other
            # int()-accepted forms that could enable request smuggling.
            if not ONLY_DIGIT_RE.match(cl.encode("latin-1")):
                raise ParsingError("Content-Length is invalid")

            cl = int(cl)
            self.content_length = cl

            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        """Return a file-like object for the request body.

        An empty BytesIO is returned when no body receiver was created.
        """
        body_rcv = self.body_rcv

        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        """Release the body receiver's buffer, if one was created."""
        body_rcv = self.body_rcv

        if body_rcv is not None:
            body_rcv.getbuf().close()
  280. def split_uri(uri):
  281. # urlsplit handles byte input by returning bytes on py3, so
  282. # scheme, netloc, path, query, and fragment are bytes
  283. scheme = netloc = path = query = fragment = b""
  284. # urlsplit below will treat this as a scheme-less netloc, thereby losing
  285. # the original intent of the request. Here we shamelessly stole 4 lines of
  286. # code from the CPython stdlib to parse out the fragment and query but
  287. # leave the path alone. See
  288. # https://github.com/python/cpython/blob/8c9e9b0cd5b24dfbf1424d1f253d02de80e8f5ef/Lib/urllib/parse.py#L465-L468
  289. # and https://github.com/Pylons/waitress/issues/260
  290. if uri[:2] == b"//":
  291. path = uri
  292. if b"#" in path:
  293. path, fragment = path.split(b"#", 1)
  294. if b"?" in path:
  295. path, query = path.split(b"?", 1)
  296. else:
  297. try:
  298. scheme, netloc, path, query, fragment = parse.urlsplit(uri)
  299. except UnicodeError:
  300. raise ParsingError("Bad URI")
  301. return (
  302. scheme.decode("latin-1"),
  303. netloc.decode("latin-1"),
  304. unquote_bytes_to_wsgi(path),
  305. query.decode("latin-1"),
  306. fragment.decode("latin-1"),
  307. )
  308. def get_header_lines(header):
  309. """
  310. Splits the header into lines, putting multi-line headers together.
  311. """
  312. r = []
  313. lines = header.split(b"\r\n")
  314. for line in lines:
  315. if not line:
  316. continue
  317. if b"\r" in line or b"\n" in line:
  318. raise ParsingError(
  319. 'Bare CR or LF found in header line "%s"' % str(line, "latin-1")
  320. )
  321. if line.startswith((b" ", b"\t")):
  322. if not r:
  323. # https://corte.si/posts/code/pathod/pythonservers/index.html
  324. raise ParsingError('Malformed header line "%s"' % str(line, "latin-1"))
  325. r[-1] += line
  326. else:
  327. r.append(line)
  328. return r
  329. first_line_re = re.compile(
  330. b"([^ ]+) "
  331. b"((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)"
  332. b"(( HTTP/([0-9.]+))$|$)"
  333. )
  334. def crack_first_line(line):
  335. m = first_line_re.match(line)
  336. if m is not None and m.end() == len(line):
  337. if m.group(3):
  338. version = m.group(5)
  339. else:
  340. version = b""
  341. method = m.group(1)
  342. # the request methods that are currently defined are all uppercase:
  343. # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
  344. # the request method is case sensitive according to
  345. # https://tools.ietf.org/html/rfc7231#section-4.1
  346. # By disallowing anything but uppercase methods we save poor
  347. # unsuspecting souls from sending lowercase HTTP methods to waitress
  348. # and having the request complete, while servers like nginx drop the
  349. # request onto the floor.
  350. if method != method.upper():
  351. raise ParsingError('Malformed HTTP method "%s"' % str(method, "latin-1"))
  352. uri = m.group(2)
  353. return method, uri, version
  354. else:
  355. return b"", b"", b""