codecutil.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import codecs
  2. try:
  3. chr(0x10000)
  4. except ValueError:
  5. # narrow python build
  6. UCSCHAR = [
  7. (0xA0, 0xD7FF),
  8. (0xF900, 0xFDCF),
  9. (0xFDF0, 0xFFEF),
  10. ]
  11. IPRIVATE = [
  12. (0xE000, 0xF8FF),
  13. ]
  14. else:
  15. UCSCHAR = [
  16. (0xA0, 0xD7FF),
  17. (0xF900, 0xFDCF),
  18. (0xFDF0, 0xFFEF),
  19. (0x10000, 0x1FFFD),
  20. (0x20000, 0x2FFFD),
  21. (0x30000, 0x3FFFD),
  22. (0x40000, 0x4FFFD),
  23. (0x50000, 0x5FFFD),
  24. (0x60000, 0x6FFFD),
  25. (0x70000, 0x7FFFD),
  26. (0x80000, 0x8FFFD),
  27. (0x90000, 0x9FFFD),
  28. (0xA0000, 0xAFFFD),
  29. (0xB0000, 0xBFFFD),
  30. (0xC0000, 0xCFFFD),
  31. (0xD0000, 0xDFFFD),
  32. (0xE1000, 0xEFFFD),
  33. ]
  34. IPRIVATE = [
  35. (0xE000, 0xF8FF),
  36. (0xF0000, 0xFFFFD),
  37. (0x100000, 0x10FFFD),
  38. ]
  39. _ESCAPE_RANGES = UCSCHAR + IPRIVATE
  40. def _in_escape_range(octet):
  41. for start, end in _ESCAPE_RANGES:
  42. if start <= octet <= end:
  43. return True
  44. return False
  45. def _starts_surrogate_pair(character):
  46. char_value = ord(character)
  47. return 0xD800 <= char_value <= 0xDBFF
  48. def _ends_surrogate_pair(character):
  49. char_value = ord(character)
  50. return 0xDC00 <= char_value <= 0xDFFF
  51. def _pct_encoded_replacements(chunk):
  52. replacements = []
  53. chunk_iter = iter(chunk)
  54. for character in chunk_iter:
  55. codepoint = ord(character)
  56. if _in_escape_range(codepoint):
  57. for char in chr(codepoint).encode("utf-8"):
  58. replacements.append("%%%X" % char)
  59. elif _starts_surrogate_pair(character):
  60. next_character = next(chunk_iter)
  61. for char in (character + next_character).encode("utf-8"):
  62. replacements.append("%%%X" % char)
  63. else:
  64. replacements.append(chr(codepoint))
  65. return replacements
  66. def _pct_escape_handler(err):
  67. '''
  68. Encoding error handler that does percent-escaping of Unicode, to be used
  69. with codecs.register_error
  70. TODO: replace use of this with urllib.parse.quote as appropriate
  71. '''
  72. chunk = err.object[err.start:err.end]
  73. replacements = _pct_encoded_replacements(chunk)
  74. return ("".join(replacements), err.end)
  75. codecs.register_error("oid_percent_escape", _pct_escape_handler)