# urinorm.py
import re
import string

from openid import codecutil  # registers 'oid_percent_escape' encoding handler
  3. # from appendix B of rfc 3986 (http://www.ietf.org/rfc/rfc3986.txt)
  4. uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  5. uri_re = re.compile(uri_pattern)
  6. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  7. #
  8. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  9. # / "*" / "+" / "," / ";" / "="
  10. #
  11. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  12. uri_illegal_char_re = re.compile(r"[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]",
  13. re.UNICODE)
  14. authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
  15. authority_re = re.compile(authority_pattern)
  16. pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
  17. pct_encoded_re = re.compile(pct_encoded_pattern)
  18. _unreserved = [False] * 256
  19. for _ in range(ord('A'), ord('Z') + 1):
  20. _unreserved[_] = True
  21. for _ in range(ord('0'), ord('9') + 1):
  22. _unreserved[_] = True
  23. for _ in range(ord('a'), ord('z') + 1):
  24. _unreserved[_] = True
  25. _unreserved[ord('-')] = True
  26. _unreserved[ord('.')] = True
  27. _unreserved[ord('_')] = True
  28. _unreserved[ord('~')] = True
  29. def _pct_encoded_replace_unreserved(mo):
  30. try:
  31. i = int(mo.group(1), 16)
  32. if _unreserved[i]:
  33. return chr(i)
  34. else:
  35. return mo.group().upper()
  36. except ValueError:
  37. return mo.group()
  38. def _pct_encoded_replace(mo):
  39. try:
  40. return chr(int(mo.group(1), 16))
  41. except ValueError:
  42. return mo.group()
  43. def remove_dot_segments(path):
  44. result_segments = []
  45. while path:
  46. if path.startswith('../'):
  47. path = path[3:]
  48. elif path.startswith('./'):
  49. path = path[2:]
  50. elif path.startswith('/./'):
  51. path = path[2:]
  52. elif path == '/.':
  53. path = '/'
  54. elif path.startswith('/../'):
  55. path = path[3:]
  56. if result_segments:
  57. result_segments.pop()
  58. elif path == '/..':
  59. path = '/'
  60. if result_segments:
  61. result_segments.pop()
  62. elif path == '..' or path == '.':
  63. path = ''
  64. else:
  65. i = 0
  66. if path[0] == '/':
  67. i = 1
  68. i = path.find('/', i)
  69. if i == -1:
  70. i = len(path)
  71. result_segments.append(path[:i])
  72. path = path[i:]
  73. return ''.join(result_segments)
  74. def urinorm(uri):
  75. '''
  76. Normalize a URI
  77. '''
  78. # TODO: use urllib.parse instead of these complex regular expressions
  79. if isinstance(uri, bytes):
  80. uri = str(uri, encoding='utf-8')
  81. uri = uri.encode('ascii', errors='oid_percent_escape').decode('utf-8')
  82. # _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii').decode()
  83. illegal_mo = uri_illegal_char_re.search(uri)
  84. if illegal_mo:
  85. raise ValueError('Illegal characters in URI: %r at position %s' %
  86. (illegal_mo.group(), illegal_mo.start()))
  87. uri_mo = uri_re.match(uri)
  88. scheme = uri_mo.group(2)
  89. if scheme is None:
  90. raise ValueError('No scheme specified')
  91. scheme = scheme.lower()
  92. if scheme not in ('http', 'https'):
  93. raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri, ))
  94. authority = uri_mo.group(4)
  95. if authority is None:
  96. raise ValueError('Not an absolute URI: %r' % (uri, ))
  97. authority_mo = authority_re.match(authority)
  98. if authority_mo is None:
  99. raise ValueError('URI does not have a valid authority: %r' % (uri, ))
  100. userinfo, host, port = authority_mo.groups()
  101. if userinfo is None:
  102. userinfo = ''
  103. if '%' in host:
  104. host = host.lower()
  105. host = pct_encoded_re.sub(_pct_encoded_replace, host)
  106. host = host.encode('idna').decode()
  107. else:
  108. host = host.lower()
  109. if port:
  110. if (port == ':' or (scheme == 'http' and port == ':80') or
  111. (scheme == 'https' and port == ':443')):
  112. port = ''
  113. else:
  114. port = ''
  115. authority = userinfo + host + port
  116. path = uri_mo.group(5)
  117. path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, path)
  118. path = remove_dot_segments(path)
  119. if not path:
  120. path = '/'
  121. query = uri_mo.group(6)
  122. if query is None:
  123. query = ''
  124. fragment = uri_mo.group(8)
  125. if fragment is None:
  126. fragment = ''
  127. return scheme + '://' + authority + path + query + fragment