123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442 |
- ##############################################################################
- #
- # Copyright (c) 2001, 2002 Zope Foundation and Contributors.
- # All Rights Reserved.
- #
- # This software is subject to the provisions of the Zope Public License,
- # Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
- # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
- # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
- # FOR A PARTICULAR PURPOSE.
- #
- ##############################################################################
- """HTTP Request Parser
- This server uses asyncore to accept connections and do initial
- processing but threads to do work.
- """
- from io import BytesIO
- import re
- from urllib import parse
- from urllib.parse import unquote_to_bytes
- from waitress.buffers import OverflowableBuffer
- from waitress.receiver import ChunkedReceiver, FixedStreamReceiver
- from waitress.rfc7230 import HEADER_FIELD_RE, ONLY_DIGIT_RE
- from waitress.utilities import (
- BadRequest,
- RequestEntityTooLarge,
- RequestHeaderFieldsTooLarge,
- ServerNotImplemented,
- find_double_newline,
- )
def unquote_bytes_to_wsgi(bytestring):
    """Percent-decode *bytestring* and return it as a WSGI "native" string.

    WSGI environ values are native strings whose code points mirror the
    raw bytes, so the unquoted bytes are decoded with latin-1.
    """
    raw = unquote_to_bytes(bytestring)
    return raw.decode("latin-1")
class ParsingError(Exception):
    """Raised when an HTTP message cannot be parsed."""
class TransferEncodingNotImplemented(Exception):
    """Raised for a Transfer-Encoding this server does not support."""
class HTTPRequestParser:
    """A structure that collects the HTTP request.

    Once the stream is completed, the instance is passed to
    a server task constructor.
    """

    completed = False  # Set once request is completed.
    empty = False  # Set if no request was made.
    expect_continue = False  # client sent "Expect: 100-continue" header
    headers_finished = False  # True when headers have been read
    header_plus = b""  # accumulated header bytes while the header is partial
    chunked = False  # True for a "Transfer-Encoding: chunked" request
    content_length = 0  # parsed Content-Length value (0 when absent)
    header_bytes_received = 0  # checked against adj.max_request_header_size
    body_bytes_received = 0  # checked against adj.max_request_body_size
    body_rcv = None  # body receiver, set by parse_header when a body is expected
    version = "1.0"  # HTTP version; overwritten by parse_header
    error = None  # set to an error instance when the request is rejected
    connection_close = False  # True when the connection must not be reused

    # Other attributes: first_line, header, headers, command, uri, version,
    # path, query, fragment

    def __init__(self, adj):
        """
        adj is an Adjustments object.
        """
        # headers is a mapping containing keys translated to uppercase
        # with dashes turned into underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """
        Receives the HTTP stream for one request. Returns the number of
        bytes consumed. Sets the completed flag once both the header and the
        body have been received.
        """
        if self.completed:
            return 0  # Can't consume any more.

        datalen = len(data)
        br = self.body_rcv
        if br is None:
            # In header.
            max_header = self.adj.max_request_header_size

            s = self.header_plus + data
            index = find_double_newline(s)
            consumed = 0

            if index >= 0:
                # If the headers have ended, and we also have part of the body
                # message in data we still want to validate we aren't going
                # over our limit for received headers.
                self.header_bytes_received = index
                consumed = datalen - (len(s) - index)
            else:
                self.header_bytes_received += datalen
                consumed = datalen

            # If the first line + headers is over the max length, we return a
            # RequestHeaderFieldsTooLarge error rather than continuing to
            # attempt to parse the headers.
            if self.header_bytes_received >= max_header:
                # Parse a synthetic request line so downstream code has a
                # coherent command/version to build the error response with.
                self.parse_header(b"GET / HTTP/1.0\r\n")
                self.error = RequestHeaderFieldsTooLarge(
                    "exceeds max_header of %s" % max_header
                )
                self.completed = True
                return consumed

            if index >= 0:
                # Header finished.
                header_plus = s[:index]

                # Remove preceding blank lines. This is suggested by
                # https://tools.ietf.org/html/rfc7230#section-3.5 to support
                # clients sending an extra CR LF after another request when
                # using HTTP pipelining
                header_plus = header_plus.lstrip()

                if not header_plus:
                    self.empty = True
                    self.completed = True
                else:
                    try:
                        self.parse_header(header_plus)
                    except ParsingError as e:
                        self.error = BadRequest(e.args[0])
                        self.completed = True
                    except TransferEncodingNotImplemented as e:
                        self.error = ServerNotImplemented(e.args[0])
                        self.completed = True
                    else:
                        if self.body_rcv is None:
                            # no content-length header and not a t-e: chunked
                            # request
                            self.completed = True

                        if self.content_length > 0:
                            max_body = self.adj.max_request_body_size
                            # we won't accept this request if the content-length
                            # is too large
                            if self.content_length >= max_body:
                                self.error = RequestEntityTooLarge(
                                    "exceeds max_body of %s" % max_body
                                )
                                self.completed = True
                self.headers_finished = True
                return consumed

            # Header not finished yet.
            self.header_plus = s
            return datalen
        else:
            # In body.
            consumed = br.received(data)
            self.body_bytes_received += consumed
            max_body = self.adj.max_request_body_size

            if self.body_bytes_received >= max_body:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge("exceeds max_body of %s" % max_body)
                self.completed = True
            elif br.error:
                # garbage in chunked encoding input probably
                self.error = br.error
                self.completed = True
            elif br.completed:
                # The request (with the body) is ready to use.
                self.completed = True

                if self.chunked:
                    # We've converted the chunked transfer encoding request
                    # body into a normal request body, so we know its content
                    # length; set the header here. We already popped the
                    # TRANSFER_ENCODING header in parse_header, so this will
                    # appear to the client to be an entirely non-chunked HTTP
                    # request with a valid content-length.
                    self.headers["CONTENT_LENGTH"] = str(br.__len__())

            return consumed

    def parse_header(self, header_plus):
        """
        Parses the header_plus block of text (the headers plus the
        first line of the request).

        Raises ParsingError for malformed input and
        TransferEncodingNotImplemented for an unsupported transfer-coding.
        """
        index = header_plus.find(b"\r\n")
        if index >= 0:
            first_line = header_plus[:index].rstrip()
            header = header_plus[index + 2 :]
        else:
            raise ParsingError("HTTP message header invalid")

        # A bare CR or LF left in the request line is a smuggling vector;
        # reject outright.
        if b"\r" in first_line or b"\n" in first_line:
            raise ParsingError("Bare CR or LF found in HTTP message")

        self.first_line = first_line  # for testing

        lines = get_header_lines(header)

        headers = self.headers

        for line in lines:
            header = HEADER_FIELD_RE.match(line)

            if not header:
                raise ParsingError("Invalid header")

            key, value = header.group("name", "value")

            if b"_" in key:
                # TODO(xistence): Should we drop this request instead?
                continue

            # Only strip off whitespace that is considered valid whitespace by
            # RFC7230, don't strip the rest
            value = value.strip(b" \t")
            key1 = key.upper().replace(b"-", b"_").decode("latin-1")

            # If a header already exists, we append subsequent values
            # separated by a comma. Applications already need to handle
            # the comma separated values, as HTTP front ends might do
            # the concatenation for you (behavior specified in RFC2616).
            try:
                headers[key1] += (b", " + value).decode("latin-1")
            except KeyError:
                headers[key1] = value.decode("latin-1")

        # command, uri, version will be bytes
        command, uri, version = crack_first_line(first_line)

        # self.request_uri is like nginx's request_uri:
        # "full original request URI (with arguments)"
        self.request_uri = uri.decode("latin-1")

        version = version.decode("latin-1")
        command = command.decode("latin-1")
        self.command = command
        self.version = version
        (
            self.proxy_scheme,
            self.proxy_netloc,
            self.path,
            self.query,
            self.fragment,
        ) = split_uri(uri)
        self.url_scheme = self.adj.url_scheme
        connection = headers.get("CONNECTION", "")

        # HTTP/1.0 defaults to closing the connection unless the client
        # explicitly asked for keep-alive.
        if version == "1.0":
            if connection.lower() != "keep-alive":
                self.connection_close = True

        if version == "1.1":
            # since the server buffers data from chunked transfers and clients
            # never need to deal with chunked requests, downstream clients
            # should not see the HTTP_TRANSFER_ENCODING header; we pop it
            # here
            te = headers.pop("TRANSFER_ENCODING", "")

            # NB: We can not just call bare strip() here because it will also
            # remove other non-printable characters that we explicitly do not
            # want removed so that if someone attempts to smuggle a request
            # with these characters we don't fall prey to it.
            #
            # For example \x85 is stripped by default, but it is not considered
            # valid whitespace to be stripped by RFC7230.
            encodings = [
                encoding.strip(" \t").lower() for encoding in te.split(",") if encoding
            ]

            for encoding in encodings:
                # Out of the transfer-codings listed in
                # https://tools.ietf.org/html/rfc7230#section-4 we only support
                # chunked at this time.

                # Note: the identity transfer-coding was removed in RFC7230:
                # https://tools.ietf.org/html/rfc7230#appendix-A.2 and is thus
                # not supported
                if encoding not in {"chunked"}:
                    raise TransferEncodingNotImplemented(
                        "Transfer-Encoding requested is not supported."
                    )

            if encodings and encodings[-1] == "chunked":
                self.chunked = True
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(buf)
            elif encodings:  # pragma: nocover
                raise TransferEncodingNotImplemented(
                    "Transfer-Encoding requested is not supported."
                )

            expect = headers.get("EXPECT", "").lower()
            self.expect_continue = expect == "100-continue"

        if connection.lower() == "close":
            self.connection_close = True

        if not self.chunked:
            cl = headers.get("CONTENT_LENGTH", "0")

            # Content-Length must be pure digits (no sign, no whitespace);
            # anything else is rejected rather than coerced.
            if not ONLY_DIGIT_RE.match(cl.encode("latin-1")):
                raise ParsingError("Content-Length is invalid")

            cl = int(cl)
            self.content_length = cl

            if cl > 0:
                buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = FixedStreamReceiver(cl, buf)

    def get_body_stream(self):
        """Return a file-like object for the request body (an empty
        BytesIO when no body receiver was created)."""
        body_rcv = self.body_rcv

        if body_rcv is not None:
            return body_rcv.getfile()
        else:
            return BytesIO()

    def close(self):
        """Release the buffer backing the body receiver, if any."""
        body_rcv = self.body_rcv

        if body_rcv is not None:
            body_rcv.getbuf().close()
def split_uri(uri):
    """Split a raw request-URI (bytes) into WSGI-ready components.

    Returns a 5-tuple ``(scheme, netloc, path, query, fragment)``: the
    path is percent-decoded into a latin-1 "WSGI string" and the other
    parts are latin-1 decoded as-is.

    Raises ParsingError when the URI cannot be split.
    """
    scheme = netloc = path = query = fragment = b""

    if uri[:2] == b"//":
        # urlsplit would treat a URI starting with "//" as a scheme-less
        # netloc, thereby losing the original intent of the request. Here we
        # shamelessly stole 4 lines of code from the CPython stdlib to parse
        # out the fragment and query but leave the path alone. See
        # https://github.com/python/cpython/blob/8c9e9b0cd5b24dfbf1424d1f253d02de80e8f5ef/Lib/urllib/parse.py#L465-L468
        # and https://github.com/Pylons/waitress/issues/260
        # (partition yields b"" for a missing separator, matching the
        # initialized defaults above)
        path, _, fragment = uri.partition(b"#")
        path, _, query = path.partition(b"?")
    else:
        try:
            scheme, netloc, path, query, fragment = parse.urlsplit(uri)
        except UnicodeError:
            raise ParsingError("Bad URI")

    return (
        scheme.decode("latin-1"),
        netloc.decode("latin-1"),
        unquote_bytes_to_wsgi(path),
        query.decode("latin-1"),
        fragment.decode("latin-1"),
    )
def get_header_lines(header):
    """Split a raw header block into logical header lines.

    Continuation lines (beginning with SP or HTAB, the deprecated RFC7230
    obs-fold) are appended to the preceding line.

    Raises ParsingError for a bare CR/LF inside a line, or for a
    continuation line with no line before it to continue.
    """
    folded = []

    for raw_line in header.split(b"\r\n"):
        if not raw_line:
            continue

        # A stray CR or LF that survived the CRLF split is a smuggling
        # vector; reject it.
        if b"\r" in raw_line or b"\n" in raw_line:
            raise ParsingError(
                'Bare CR or LF found in header line "%s"' % str(raw_line, "latin-1")
            )

        if raw_line[:1] in (b" ", b"\t"):
            if not folded:
                # https://corte.si/posts/code/pathod/pythonservers/index.html
                raise ParsingError(
                    'Malformed header line "%s"' % str(raw_line, "latin-1")
                )
            folded[-1] += raw_line
        else:
            folded.append(raw_line)

    return folded
# Matches an HTTP request-line: "METHOD SP request-target [SP HTTP/version]".
# Group 1 is the method, group 2 the request-target (an optional
# "scheme://authority" prefix is tolerated so absolute-form URIs match),
# group 3 the optional " HTTP/x.y" tail (possibly empty), and group 5 the
# bare version number when that tail is present.
first_line_re = re.compile(
    b"([^ ]+) "
    b"((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)"
    b"(( HTTP/([0-9.]+))$|$)"
)
def crack_first_line(line):
    """Split an HTTP request-line into ``(method, uri, version)`` bytes.

    Returns ``(b"", b"", b"")`` when the line does not have the expected
    shape; the version is ``b""`` when no " HTTP/x.y" suffix is present.

    Raises ParsingError when the method is not all-uppercase.
    """
    m = first_line_re.match(line)

    # The match must consume the whole line; a partial match means there
    # is trailing garbage after the request-line.
    if m is None or m.end() != len(line):
        return b"", b"", b""

    method = m.group(1)

    # the request methods that are currently defined are all uppercase:
    # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
    # the request method is case sensitive according to
    # https://tools.ietf.org/html/rfc7231#section-4.1
    # By disallowing anything but uppercase methods we save poor
    # unsuspecting souls from sending lowercase HTTP methods to waitress
    # and having the request complete, while servers like nginx drop the
    # request onto the floor.
    if method != method.upper():
        raise ParsingError('Malformed HTTP method "%s"' % str(method, "latin-1"))

    version = m.group(5) if m.group(3) else b""

    return method, m.group(2), version
|