parsehtml.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
# Names exported by ``from <this module> import *``.
__all__ = ['findHTMLMeta', 'MetaNotFound']
  2. from html.parser import HTMLParser
  3. import html.entities
  4. import re
  5. import sys
  6. from openid.yadis.constants import YADIS_HEADER_NAME
  7. # Size of the chunks to search at a time (also the amount that gets
  8. # read at a time)
  9. CHUNK_SIZE = 1024 * 16 # 16 KB
  10. class ParseDone(Exception):
  11. """Exception to hold the URI that was located when the parse is
  12. finished. If the parse finishes without finding the URI, set it to
  13. None."""
  14. class MetaNotFound(Exception):
  15. """Exception to hold the content of the page if we did not find
  16. the appropriate <meta> tag"""
# Flags shared by the entity pattern: case-insensitive matching, Unicode
# character classes, and verbose syntax (whitespace inside the pattern is
# ignored, which is why the literal '#' has to be escaped below).
re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE

# Matches one character/entity reference of the form ``&...;``.  Exactly
# one of the named groups participates in a match:
#   hex  -- digits of a hexadecimal reference (``&#x41;``)
#   dec  -- digits of a decimal reference     (``&#65;``)
#   word -- name of a named entity            (``&amp;``)
ent_pat = r'''
&
(?: \#x (?P<hex> [a-f0-9]+ )
| \# (?P<dec> \d+ )
| (?P<word> \w+ )
)
;'''

# Compiled once at import time; used by substituteEntities().
ent_re = re.compile(ent_pat, re_flags)
  26. def substituteMO(mo):
  27. if mo.lastgroup == 'hex':
  28. codepoint = int(mo.group('hex'), 16)
  29. elif mo.lastgroup == 'dec':
  30. codepoint = int(mo.group('dec'))
  31. else:
  32. assert mo.lastgroup == 'word'
  33. codepoint = html.entities.name2codepoint.get(mo.group('word'))
  34. if codepoint is None:
  35. return mo.group()
  36. else:
  37. return chr(codepoint)
  38. def substituteEntities(s):
  39. return ent_re.sub(substituteMO, s)
  40. class YadisHTMLParser(HTMLParser):
  41. """Parser that finds a meta http-equiv tag in the head of a html
  42. document.
  43. When feeding in data, if the tag is matched or it will never be
  44. found, the parser will raise ParseDone with the uri as the first
  45. attribute.
  46. Parsing state diagram
  47. =====================
  48. Any unlisted input does not affect the state::
  49. 1, 2, 5 8
  50. +--------------------------+ +-+
  51. | | | |
  52. 4 | 3 1, 2, 5, 7 v | v
  53. TOP -> HTML -> HEAD ----------> TERMINATED
  54. | | ^ | ^ ^
  55. | | 3 | | | |
  56. | +------------+ +-> FOUND ------+ |
  57. | 6 8 |
  58. | 1, 2 |
  59. +------------------------------------+
  60. 1. any of </body>, </html>, </head> -> TERMINATE
  61. 2. <body> -> TERMINATE
  62. 3. <head> -> HEAD
  63. 4. <html> -> HTML
  64. 5. <html> -> TERMINATE
  65. 6. <meta http-equiv='X-XRDS-Location'> -> FOUND
  66. 7. <head> -> TERMINATE
  67. 8. Any input -> TERMINATE
  68. """
  69. TOP = 0
  70. HTML = 1
  71. HEAD = 2
  72. FOUND = 3
  73. TERMINATED = 4
  74. def __init__(self):
  75. if (sys.version_info.minor <= 2):
  76. # Python 3.2 and below actually require the `strict` argument
  77. # to `html.parser.HTMLParser` -- otherwise it's deprecated and
  78. # we don't want to pass it
  79. super(YadisHTMLParser, self).__init__(strict=False)
  80. else:
  81. super(YadisHTMLParser, self).__init__()
  82. self.phase = self.TOP
  83. def _terminate(self):
  84. self.phase = self.TERMINATED
  85. raise ParseDone(None)
  86. def handle_endtag(self, tag):
  87. # If we ever see an end of head, body, or html, bail out right away.
  88. # [1]
  89. if tag in ['head', 'body', 'html']:
  90. self._terminate()
  91. def handle_starttag(self, tag, attrs):
  92. # if we ever see a start body tag, bail out right away, since
  93. # we want to prevent the meta tag from appearing in the body
  94. # [2]
  95. if tag == 'body':
  96. self._terminate()
  97. if self.phase == self.TOP:
  98. # At the top level, allow a html tag or a head tag to move
  99. # to the head or html phase
  100. if tag == 'head':
  101. # [3]
  102. self.phase = self.HEAD
  103. elif tag == 'html':
  104. # [4]
  105. self.phase = self.HTML
  106. elif self.phase == self.HTML:
  107. # if we are in the html tag, allow a head tag to move to
  108. # the HEAD phase. If we get another html tag, then bail
  109. # out
  110. if tag == 'head':
  111. # [3]
  112. self.phase = self.HEAD
  113. elif tag == 'html':
  114. # [5]
  115. self._terminate()
  116. elif self.phase == self.HEAD:
  117. # If we are in the head phase, look for the appropriate
  118. # meta tag. If we get a head or body tag, bail out.
  119. if tag == 'meta':
  120. attrs_d = dict(attrs)
  121. http_equiv = attrs_d.get('http-equiv', '').lower()
  122. if http_equiv == YADIS_HEADER_NAME.lower():
  123. raw_attr = attrs_d.get('content')
  124. yadis_loc = substituteEntities(raw_attr)
  125. # [6]
  126. self.phase = self.FOUND
  127. raise ParseDone(yadis_loc)
  128. elif tag in ('head', 'html'):
  129. # [5], [7]
  130. self._terminate()
  131. def feed(self, chars):
  132. # [8]
  133. if self.phase in (self.TERMINATED, self.FOUND):
  134. self._terminate()
  135. return super(YadisHTMLParser, self).feed(chars)
  136. def findHTMLMeta(stream):
  137. """Look for a meta http-equiv tag with the YADIS header name.
  138. @param stream: Source of the html text
  139. @type stream: Object that implements a read() method that works
  140. like file.read
  141. @return: The URI from which to fetch the XRDS document
  142. @rtype: str
  143. @raises MetaNotFound: raised with the content that was
  144. searched as the first parameter.
  145. """
  146. parser = YadisHTMLParser()
  147. chunks = []
  148. while 1:
  149. chunk = stream.read(CHUNK_SIZE)
  150. if not chunk:
  151. # End of file
  152. break
  153. chunks.append(chunk)
  154. try:
  155. parser.feed(chunk)
  156. except ParseDone as why:
  157. uri = why.args[0]
  158. if uri is None:
  159. # Parse finished, but we may need the rest of the file
  160. chunks.append(stream.read())
  161. break
  162. else:
  163. return uri
  164. content = ''.join(chunks)
  165. raise MetaNotFound(content)