# -*- test-case-name: openid.test.test_fetchers -*-
"""
This module contains the HTTP fetcher interface and several implementations.
"""
__all__ = [
    'fetch', 'getDefaultFetcher', 'setDefaultFetcher', 'HTTPResponse',
    'HTTPFetcher', 'createHTTPFetcher', 'HTTPFetchingError', 'HTTPError'
]

import urllib.request
import urllib.error
import urllib.parse
import http.client
import time
import io
import sys
import contextlib

import openid
import openid.urinorm

# Try to import httplib2 for caching support
# http://bitworking.org/projects/httplib2/
try:
    import httplib2
except ImportError:
    # httplib2 not available
    httplib2 = None

# try to import pycurl, which will let us use CurlHTTPFetcher
try:
    import pycurl
except ImportError:
    pycurl = None

USER_AGENT = "python-openid/%s (%s)" % (openid.__version__, sys.platform)
MAX_RESPONSE_KB = 1024


def fetch(url, body=None, headers=None):
    """Invoke the fetch method on the default fetcher. Most users
    should need only this method.

    @raises Exception: any exceptions that may be raised by the default fetcher
    """
    fetcher = getDefaultFetcher()
    return fetcher.fetch(url, body, headers)
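
# Illustrative usage sketch (not part of the module): the URL is a
# placeholder, and with the default wrapped fetcher any failure surfaces
# as HTTPFetchingError.
#
#     from openid.fetchers import fetch
#     resp = fetch('https://example.com/')              # GET: no body given
#     resp = fetch('https://example.com/', body='a=1')  # POST: body given
#     print(resp.status, resp.final_url)
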
def createHTTPFetcher():
    """Create a default HTTP fetcher instance,
    preferring pycurl to urllib when pycurl is available."""
    if pycurl is None:
        fetcher = Urllib2Fetcher()
    else:
        fetcher = CurlHTTPFetcher()

    return fetcher


# Contains the currently set HTTP fetcher. If it is set to None, the
# library will call createHTTPFetcher() to set it. Do not access this
# variable outside of this module.
_default_fetcher = None


def getDefaultFetcher():
    """Return the default fetcher instance.
    If no fetcher has been set, a default fetcher is created.

    @return: the default fetcher
    @rtype: HTTPFetcher
    """
    global _default_fetcher

    if _default_fetcher is None:
        setDefaultFetcher(createHTTPFetcher())

    return _default_fetcher


def setDefaultFetcher(fetcher, wrap_exceptions=True):
    """Set the default fetcher

    @param fetcher: The fetcher to use as the default HTTP fetcher
    @type fetcher: HTTPFetcher

    @param wrap_exceptions: Whether to wrap exceptions thrown by the
        fetcher in HTTPFetchingError so that they may be caught more
        easily. By default, exceptions will be wrapped. In general,
        unwrapped fetchers are useful for debugging fetching errors
        or if your fetcher raises well-known exceptions that you would
        like to catch.
    @type wrap_exceptions: bool
    """
    global _default_fetcher
    if fetcher is None or not wrap_exceptions:
        _default_fetcher = fetcher
    else:
        _default_fetcher = ExceptionWrappingFetcher(fetcher)
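
# Sketch of installing a custom default fetcher. By default the fetcher
# is wrapped so all errors surface as HTTPFetchingError; passing
# wrap_exceptions=False exposes the underlying library's exceptions.
#
#     setDefaultFetcher(Urllib2Fetcher())
#     setDefaultFetcher(Urllib2Fetcher(), wrap_exceptions=False)
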
def usingCurl():
    """Whether the currently set HTTP fetcher is a Curl HTTP fetcher."""
    fetcher = getDefaultFetcher()
    if isinstance(fetcher, ExceptionWrappingFetcher):
        fetcher = fetcher.fetcher
    return isinstance(fetcher, CurlHTTPFetcher)


class HTTPResponse(object):
    """The response to an HTTP request.

    @ivar final_url: The URL of the response, after following any redirects
    @ivar status: The HTTP status code of the response
    @ivar headers: A dictionary of the response headers
    @ivar body: The response body
    """
    headers = None
    status = None
    body = None
    final_url = None

    def __init__(self, final_url=None, status=None, headers=None, body=None):
        self.final_url = final_url
        self.status = status
        self.headers = headers
        self.body = body

    def __repr__(self):
        return "<%s status %s for %s>" % (self.__class__.__name__, self.status,
                                          self.final_url)


class HTTPFetcher(object):
    """
    This class is the interface for openid HTTP fetchers. This
    interface is only important if you need to write a new fetcher for
    some reason.
    """

    def fetch(self, url, body=None, headers=None):
        """
        This performs an HTTP POST or GET, following redirects along
        the way. If a body is specified, then the request will be a
        POST. Otherwise, it will be a GET.

        @param headers: HTTP headers to include with the request
        @type headers: {str:str}

        @return: An object representing the server's HTTP response. If
            there are network or protocol errors, an exception will be
            raised. HTTP error responses, like 404 or 500, do not
            cause exceptions.
        @rtype: L{HTTPResponse}

        @raise Exception: Different implementations will raise
            different errors based on the underlying HTTP library.
        """
        raise NotImplementedError


def _allowedURL(url):
    parsed = urllib.parse.urlparse(url)
    # scheme is the first item in the tuple
    return parsed[0] in ('http', 'https')


class HTTPFetchingError(Exception):
    """Exception that is wrapped around all exceptions that are raised
    by the underlying fetcher when using the ExceptionWrappingFetcher.

    @ivar why: The exception that caused this exception
    """

    def __init__(self, why=None):
        Exception.__init__(self, why)
        self.why = why


class ExceptionWrappingFetcher(HTTPFetcher):
    """Fetcher that wraps another fetcher, wrapping every exception it
    raises in an L{HTTPFetchingError}.

    @cvar uncaught_exceptions: Exceptions that should be exposed to the
        user if they are raised by the fetch call
    """

    uncaught_exceptions = (SystemExit, KeyboardInterrupt, MemoryError)

    def __init__(self, fetcher):
        self.fetcher = fetcher

    def fetch(self, *args, **kwargs):
        try:
            return self.fetcher.fetch(*args, **kwargs)
        except self.uncaught_exceptions:
            raise
        except:
            exc_cls, exc_inst = sys.exc_info()[:2]
            if exc_inst is None:
                # string exceptions
                exc_inst = exc_cls

            raise HTTPFetchingError(why=exc_inst)
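
# With a wrapped default fetcher, callers can catch one exception type
# instead of library-specific errors. A minimal sketch (placeholder URL):
#
#     try:
#         resp = fetch('https://example.com/')
#     except HTTPFetchingError as err:
#         print('fetch failed:', err.why)  # the original exception
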
class Urllib2Fetcher(HTTPFetcher):
    """An C{L{HTTPFetcher}} that uses C{urllib.request} (the module
    known as urllib2 under Python 2, hence the name).
    """

    # Parameterized for the benefit of testing frameworks, see
    # http://trac.openidenabled.com/trac/ticket/85
    urlopen = staticmethod(urllib.request.urlopen)

    def fetch(self, url, body=None, headers=None):
        if not _allowedURL(url):
            raise ValueError('Bad URL scheme: %r' % (url, ))

        if headers is None:
            headers = {}

        headers.setdefault('User-Agent', "%s Python-urllib/%s" %
                           (USER_AGENT, urllib.request.__version__))

        if isinstance(body, str):
            body = bytes(body, encoding="utf-8")

        req = urllib.request.Request(url, data=body, headers=headers)

        url_resource = None
        try:
            url_resource = self.urlopen(req)
            with contextlib.closing(url_resource):
                return self._makeResponse(url_resource)
        except urllib.error.HTTPError as why:
            # HTTP error responses (e.g. 404, 500) still carry a usable
            # response object; translate them instead of raising.
            with contextlib.closing(why):
                resp = self._makeResponse(why)
                return resp
        except (urllib.error.URLError, http.client.BadStatusLine):
            raise
        except Exception as why:
            raise AssertionError(why)

    def _makeResponse(self, urllib2_response):
        '''
        Construct an HTTPResponse from the urllib response. Attempt to
        decode the response body from bytes to str if the necessary
        information is available.
        '''
        resp = HTTPResponse()
        resp.body = urllib2_response.read(MAX_RESPONSE_KB * 1024)
        resp.final_url = urllib2_response.geturl()
        resp.headers = self._lowerCaseKeys(
            dict(list(urllib2_response.info().items())))

        if hasattr(urllib2_response, 'code'):
            resp.status = urllib2_response.code
        else:
            resp.status = 200

        _, extra_dict = self._parseHeaderValue(
            resp.headers.get("content-type", ""))

        # Try to decode the response body to a string, if there's a
        # charset known; fall back to ISO-8859-1 otherwise, since that's
        # what's suggested in HTTP/1.1
        charset = extra_dict.get('charset', 'latin1')
        try:
            resp.body = resp.body.decode(charset)
        except Exception:
            pass

        return resp

    def _lowerCaseKeys(self, headers_dict):
        new_dict = {}
        for k, v in headers_dict.items():
            new_dict[k.lower()] = v
        return new_dict

    def _parseHeaderValue(self, header_value):
        """
        Parse out a complex header value (such as Content-Type, with a value
        like "text/html; charset=utf-8") into a main value and a dictionary of
        extra information (in this case, 'text/html' and {'charset': 'utf-8'}).
        """
        values = header_value.split(';', 1)
        if len(values) == 1:
            # There's no extra info -- return the main value and an empty dict
            return values[0], {}

        main_value, extra_values = values[0], values[1].split(';')
        extra_dict = {}
        for value_string in extra_values:
            try:
                key, value = value_string.split('=', 1)
                extra_dict[key.strip()] = value.strip()
            except ValueError:
                # Can't unpack it -- must be malformed. Ignore
                pass

        return main_value, extra_dict
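
# For illustration, _parseHeaderValue splits a parameterized header value
# into its main value and a dict of parameters:
#
#     Urllib2Fetcher()._parseHeaderValue('text/html; charset=utf-8')
#     # -> ('text/html', {'charset': 'utf-8'})
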
class HTTPError(HTTPFetchingError):
    """
    This exception is raised by the C{L{CurlHTTPFetcher}} when it
    encounters an exceptional situation fetching a URL.
    """
    pass


# XXX: define what we mean by paranoid, and make sure it is.
class CurlHTTPFetcher(HTTPFetcher):
    """
    An C{L{HTTPFetcher}} that uses pycurl for fetching.
    See U{http://pycurl.sourceforge.net/}.
    """
    ALLOWED_TIME = 20  # seconds

    def __init__(self):
        HTTPFetcher.__init__(self)
        if pycurl is None:
            raise RuntimeError('Cannot find pycurl library')

    def _parseHeaders(self, header_file):
        header_file.seek(0)

        # Remove all non "name: value" header lines from the input
        lines = [line.decode().strip() for line in header_file if b':' in line]

        headers = {}
        for line in lines:
            try:
                name, value = line.split(':', 1)
            except ValueError:
                raise HTTPError("Malformed HTTP header line in response: %r" %
                                (line, ))

            value = value.strip()

            # HTTP headers are case-insensitive
            name = name.lower()
            headers[name] = value

        return headers

    def _checkURL(self, url):
        # XXX: document that this can be overridden to match desired policy
        # XXX: make sure url is well-formed and routeable
        return _allowedURL(url)

    def fetch(self, url, body=None, headers=None):
        stop = int(time.time()) + self.ALLOWED_TIME
        off = self.ALLOWED_TIME

        if headers is None:
            headers = {}

        headers.setdefault('User-Agent',
                           "%s %s" % (USER_AGENT, pycurl.version, ))

        header_list = []
        for header_name, header_value in headers.items():
            header = '%s: %s' % (header_name, header_value)
            header_list.append(header.encode())

        c = pycurl.Curl()
        try:
            c.setopt(pycurl.NOSIGNAL, 1)

            if header_list:
                c.setopt(pycurl.HTTPHEADER, header_list)

            # Presence of a body indicates that we should do a POST
            if body is not None:
                c.setopt(pycurl.POST, 1)
                c.setopt(pycurl.POSTFIELDS, body)

            while off > 0:
                if not self._checkURL(url):
                    raise HTTPError("Fetching URL not allowed: %r" % (url, ))

                data = io.BytesIO()

                def write_data(chunk):
                    # Returning 0 from the write callback makes curl abort
                    # the transfer once the response exceeds MAX_RESPONSE_KB.
                    if data.tell() > (1024 * MAX_RESPONSE_KB):
                        return 0
                    else:
                        return data.write(chunk)

                response_header_data = io.BytesIO()
                c.setopt(pycurl.WRITEFUNCTION, write_data)
                c.setopt(pycurl.HEADERFUNCTION, response_header_data.write)
                c.setopt(pycurl.TIMEOUT, off)
                c.setopt(pycurl.URL, openid.urinorm.urinorm(url))

                c.perform()

                response_headers = self._parseHeaders(response_header_data)
                code = c.getinfo(pycurl.RESPONSE_CODE)
                if code in [301, 302, 303, 307]:
                    url = response_headers.get('location')
                    if url is None:
                        raise HTTPError(
                            'Redirect (%s) returned without a location' % code)

                    # Redirects are always GETs
                    c.setopt(pycurl.POST, 0)

                    # There is no way to reset POSTFIELDS to empty and
                    # reuse the connection, but we only use it once.
                else:
                    resp = HTTPResponse()
                    resp.headers = response_headers
                    resp.status = code
                    resp.final_url = url
                    resp.body = data.getvalue().decode()
                    return resp

                off = stop - int(time.time())

            raise HTTPError("Timed out fetching: %r" % (url, ))
        finally:
            c.close()
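
# Direct use of CurlHTTPFetcher, assuming pycurl is installed (the URL is
# a placeholder). Redirects are followed, and the whole fetch must finish
# within ALLOWED_TIME seconds:
#
#     if pycurl is not None:
#         resp = CurlHTTPFetcher().fetch('https://example.com/')
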
class HTTPLib2Fetcher(HTTPFetcher):
    """A fetcher that uses C{httplib2} for performing HTTP
    requests. This implementation supports HTTP caching.

    @see: http://bitworking.org/projects/httplib2/
    """

    def __init__(self, cache=None):
        """@param cache: An object suitable for use as an C{httplib2}
            cache. If a string is passed, it is assumed to be a
            directory name.
        """
        if httplib2 is None:
            raise RuntimeError('Cannot find httplib2 library. '
                               'See http://bitworking.org/projects/httplib2/')

        super(HTTPLib2Fetcher, self).__init__()

        # An instance of the httplib2 object that performs HTTP requests
        self.httplib2 = httplib2.Http(cache)

        # We want httplib2 to raise exceptions for errors, just like
        # the other fetchers.
        self.httplib2.force_exception_to_status_code = False

    def fetch(self, url, body=None, headers=None):
        """Perform an HTTP request

        @raises Exception: Any exception that can be raised by httplib2

        @see: C{L{HTTPFetcher.fetch}}
        """
        if body:
            method = 'POST'
        else:
            method = 'GET'

        if headers is None:
            headers = {}

        # httplib2 doesn't check to make sure that the URL's scheme is
        # 'http' so we do it here.
        if not (url.startswith('http://') or url.startswith('https://')):
            raise ValueError('URL is not an HTTP URL: %r' % (url, ))

        httplib2_response, content = self.httplib2.request(
            url, method, body=body, headers=headers)

        # Translate the httplib2 response to our HTTP response abstraction

        # When a 400 is returned, there is no "content-location"
        # header set. This seems like a bug to me. I can't think of a
        # case where we really care about the final URL when it is an
        # error response, but being careful about it can't hurt.
        try:
            final_url = httplib2_response['content-location']
        except KeyError:
            # We're assuming that no redirects occurred
            assert not httplib2_response.previous

            # And this should never happen for a successful response
            assert httplib2_response.status != 200

            final_url = url

        return HTTPResponse(
            body=content.decode(),  # TODO Don't assume ASCII
            final_url=final_url,
            headers=dict(list(httplib2_response.items())),
            status=httplib2_response.status, )
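
# Caching setup sketch, assuming httplib2 is installed; the cache
# directory path is a placeholder:
#
#     if httplib2 is not None:
#         setDefaultFetcher(HTTPLib2Fetcher(cache='/tmp/openid-cache'))
#         resp = fetch('https://example.com/')  # may be served from cache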