# uri_validate.py
"""
Regex for URIs

These regexes are directly derived from the collected ABNF in RFC3986
(except for DIGIT, ALPHA and HEXDIG, defined by RFC2234).

They should be processed with re.VERBOSE.

Thanks Mark Nottingham for this code - https://gist.github.com/138549
"""
import re
  9. # basics
  10. DIGIT = r"[\x30-\x39]"
  11. ALPHA = r"[\x41-\x5A\x61-\x7A]"
  12. HEXDIG = r"[\x30-\x39A-Fa-f]"
  13. # pct-encoded = "%" HEXDIG HEXDIG
  14. pct_encoded = r" %% %(HEXDIG)s %(HEXDIG)s" % locals()
  15. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  16. unreserved = r"(?: %(ALPHA)s | %(DIGIT)s | \- | \. | _ | ~ )" % locals()
  17. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  18. gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )"
  19. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  20. # / "*" / "+" / "," / ";" / "="
  21. sub_delims = r"""(?: ! | \$ | & | ' | \( | \) |
  22. \* | \+ | , | ; | = )"""
  23. # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
  24. pchar = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s | : | @ )" % locals(
  25. )
  26. # reserved = gen-delims / sub-delims
  27. reserved = r"(?: %(gen_delims)s | %(sub_delims)s )" % locals()
  28. # scheme
  29. # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  30. scheme = r"%(ALPHA)s (?: %(ALPHA)s | %(DIGIT)s | \+ | \- | \. )*" % locals()
  31. # authority
  32. # dec-octet = DIGIT ; 0-9
  33. # / %x31-39 DIGIT ; 10-99
  34. # / "1" 2DIGIT ; 100-199
  35. # / "2" %x30-34 DIGIT ; 200-249
  36. # / "25" %x30-35 ; 250-255
  37. dec_octet = r"""(?: %(DIGIT)s |
  38. [\x31-\x39] %(DIGIT)s |
  39. 1 %(DIGIT)s{2} |
  40. 2 [\x30-\x34] %(DIGIT)s |
  41. 25 [\x30-\x35]
  42. )
  43. """ % locals()
  44. # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
  45. IPv4address = r"%(dec_octet)s \. %(dec_octet)s \. %(dec_octet)s \. %(dec_octet)s" % locals(
  46. )
  47. # IPv6address
  48. IPv6address = r"([A-Fa-f0-9:]+[:$])[A-Fa-f0-9]{1,4}"
  49. # IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
  50. IPvFuture = r"v %(HEXDIG)s+ \. (?: %(unreserved)s | %(sub_delims)s | : )+" % locals()
  51. # IP-literal = "[" ( IPv6address / IPvFuture ) "]"
  52. IP_literal = r"\[ (?: %(IPv6address)s | %(IPvFuture)s ) \]" % locals()
  53. # reg-name = *( unreserved / pct-encoded / sub-delims )
  54. reg_name = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s )*" % locals()
  55. # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
  56. userinfo = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s | : )" % locals(
  57. )
  58. # host = IP-literal / IPv4address / reg-name
  59. host = r"(?: %(IP_literal)s | %(IPv4address)s | %(reg_name)s )" % locals()
  60. # port = *DIGIT
  61. port = r"(?: %(DIGIT)s )*" % locals()
  62. # authority = [ userinfo "@" ] host [ ":" port ]
  63. authority = r"(?: %(userinfo)s @)? %(host)s (?: : %(port)s)?" % locals()
  64. # Path
  65. # segment = *pchar
  66. segment = r"%(pchar)s*" % locals()
  67. # segment-nz = 1*pchar
  68. segment_nz = r"%(pchar)s+" % locals()
  69. # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
  70. # ; non-zero-length segment without any colon ":"
  71. segment_nz_nc = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s | @ )+" % locals()
  72. # path-abempty = *( "/" segment )
  73. path_abempty = r"(?: / %(segment)s )*" % locals()
  74. # path-absolute = "/" [ segment-nz *( "/" segment ) ]
  75. path_absolute = r"/ (?: %(segment_nz)s (?: / %(segment)s )* )?" % locals()
  76. # path-noscheme = segment-nz-nc *( "/" segment )
  77. path_noscheme = r"%(segment_nz_nc)s (?: / %(segment)s )*" % locals()
  78. # path-rootless = segment-nz *( "/" segment )
  79. path_rootless = r"%(segment_nz)s (?: / %(segment)s )*" % locals()
  80. # path-empty = 0<pchar>
  81. path_empty = r"" # FIXME
  82. # path = path-abempty ; begins with "/" or is empty
  83. # / path-absolute ; begins with "/" but not "//"
  84. # / path-noscheme ; begins with a non-colon segment
  85. # / path-rootless ; begins with a segment
  86. # / path-empty ; zero characters
  87. path = r"""(?: %(path_abempty)s |
  88. %(path_absolute)s |
  89. %(path_noscheme)s |
  90. %(path_rootless)s |
  91. %(path_empty)s
  92. )
  93. """ % locals()
  94. ### Query and Fragment
  95. # query = *( pchar / "/" / "?" )
  96. query = r"(?: %(pchar)s | / | \? )*" % locals()
  97. # fragment = *( pchar / "/" / "?" )
  98. fragment = r"(?: %(pchar)s | / | \? )*" % locals()
  99. # URIs
  100. # hier-part = "//" authority path-abempty
  101. # / path-absolute
  102. # / path-rootless
  103. # / path-empty
  104. hier_part = r"""(?: (?: // %(authority)s %(path_abempty)s ) |
  105. %(path_absolute)s |
  106. %(path_rootless)s |
  107. %(path_empty)s
  108. )
  109. """ % locals()
  110. # relative-part = "//" authority path-abempty
  111. # / path-absolute
  112. # / path-noscheme
  113. # / path-empty
  114. relative_part = r"""(?: (?: // %(authority)s %(path_abempty)s ) |
  115. %(path_absolute)s |
  116. %(path_noscheme)s |
  117. %(path_empty)s
  118. )
  119. """ % locals()
  120. # relative-ref = relative-part [ "?" query ] [ "#" fragment ]
  121. relative_ref = r"%(relative_part)s (?: \? %(query)s)? (?: \# %(fragment)s)?" % locals(
  122. )
  123. # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  124. URI = r"^(?: %(scheme)s : %(hier_part)s (?: \? %(query)s )? (?: \# %(fragment)s )? )$" % locals(
  125. )
  126. # URI-reference = URI / relative-ref
  127. URI_reference = r"^(?: %(URI)s | %(relative_ref)s )$" % locals()
  128. # absolute-URI = scheme ":" hier-part [ "?" query ]
  129. absolute_URI = r"^(?: %(scheme)s : %(hier_part)s (?: \? %(query)s )? )$" % locals(
  130. )
  131. def is_uri(uri):
  132. return re.match(URI, uri, re.VERBOSE)
  133. def is_uri_reference(uri):
  134. return re.match(URI_reference, uri, re.VERBOSE)
  135. def is_absolute_uri(uri):
  136. return re.match(absolute_URI, uri, re.VERBOSE)