utils.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. import html.entities
  2. from typing import Dict, List, Optional
  3. from . import config
  4. unifiable_n = {
  5. html.entities.name2codepoint[k]: v
  6. for k, v in config.UNIFIABLE.items()
  7. if k != "nbsp"
  8. }
  9. def hn(tag: str) -> int:
  10. if tag[0] == "h" and len(tag) == 2:
  11. n = tag[1]
  12. if "0" < n <= "9":
  13. return int(n)
  14. return 0
  15. def dumb_property_dict(style: str) -> Dict[str, str]:
  16. """
  17. :returns: A hash of css attributes
  18. """
  19. return {
  20. x.strip().lower(): y.strip().lower()
  21. for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
  22. }
  23. def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
  24. """
  25. :type data: str
  26. :returns: A hash of css selectors, each of which contains a hash of
  27. css attributes.
  28. :rtype: dict
  29. """
  30. # remove @import sentences
  31. data += ";"
  32. importIndex = data.find("@import")
  33. while importIndex != -1:
  34. data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
  35. importIndex = data.find("@import")
  36. # parse the css. reverted from dictionary comprehension in order to
  37. # support older pythons
  38. pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
  39. try:
  40. elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
  41. except ValueError:
  42. elements = {} # not that important
  43. return elements
  44. def element_style(
  45. attrs: Dict[str, Optional[str]],
  46. style_def: Dict[str, Dict[str, str]],
  47. parent_style: Dict[str, str],
  48. ) -> Dict[str, str]:
  49. """
  50. :type attrs: dict
  51. :type style_def: dict
  52. :type style_def: dict
  53. :returns: A hash of the 'final' style attributes of the element
  54. :rtype: dict
  55. """
  56. style = parent_style.copy()
  57. if "class" in attrs:
  58. assert attrs["class"] is not None
  59. for css_class in attrs["class"].split():
  60. css_style = style_def.get("." + css_class, {})
  61. style.update(css_style)
  62. if "style" in attrs:
  63. assert attrs["style"] is not None
  64. immediate_style = dumb_property_dict(attrs["style"])
  65. style.update(immediate_style)
  66. return style
  67. def google_list_style(style: Dict[str, str]) -> str:
  68. """
  69. Finds out whether this is an ordered or unordered list
  70. :type style: dict
  71. :rtype: str
  72. """
  73. if "list-style-type" in style:
  74. list_style = style["list-style-type"]
  75. if list_style in ["disc", "circle", "square", "none"]:
  76. return "ul"
  77. return "ol"
  78. def google_has_height(style: Dict[str, str]) -> bool:
  79. """
  80. Check if the style of the element has the 'height' attribute
  81. explicitly defined
  82. :type style: dict
  83. :rtype: bool
  84. """
  85. return "height" in style
  86. def google_text_emphasis(style: Dict[str, str]) -> List[str]:
  87. """
  88. :type style: dict
  89. :returns: A list of all emphasis modifiers of the element
  90. :rtype: list
  91. """
  92. emphasis = []
  93. if "text-decoration" in style:
  94. emphasis.append(style["text-decoration"])
  95. if "font-style" in style:
  96. emphasis.append(style["font-style"])
  97. if "font-weight" in style:
  98. emphasis.append(style["font-weight"])
  99. return emphasis
  100. def google_fixed_width_font(style: Dict[str, str]) -> bool:
  101. """
  102. Check if the css of the current element defines a fixed width font
  103. :type style: dict
  104. :rtype: bool
  105. """
  106. font_family = ""
  107. if "font-family" in style:
  108. font_family = style["font-family"]
  109. return "courier new" == font_family or "consolas" == font_family
  110. def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
  111. """
  112. Extract numbering from list element attributes
  113. :type attrs: dict
  114. :rtype: int or None
  115. """
  116. if "start" in attrs:
  117. assert attrs["start"] is not None
  118. try:
  119. return int(attrs["start"]) - 1
  120. except ValueError:
  121. pass
  122. return 0
  123. def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
  124. # If it appears to contain a link
  125. # don't wrap
  126. if not wrap_links and config.RE_LINK.search(para):
  127. return True
  128. # If the text begins with four spaces or one tab, it's a code block;
  129. # don't wrap
  130. if para[0:4] == " " or para[0] == "\t":
  131. return True
  132. # If the text begins with only two "--", possibly preceded by
  133. # whitespace, that's an emdash; so wrap.
  134. stripped = para.lstrip()
  135. if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
  136. return False
  137. # I'm not sure what this is for; I thought it was to detect lists,
  138. # but there's a <br>-inside-<span> case in one of the tests that
  139. # also depends upon it.
  140. if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
  141. return not wrap_list_items
  142. # If the text begins with a single -, *, or +, followed by a space,
  143. # or an integer, followed by a ., followed by a space (in either
  144. # case optionally proceeded by whitespace), it's a list; don't wrap.
  145. return bool(
  146. config.RE_ORDERED_LIST_MATCHER.match(stripped)
  147. or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
  148. )
  149. def escape_md(text: str) -> str:
  150. """
  151. Escapes markdown-sensitive characters within other markdown
  152. constructs.
  153. """
  154. return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
  155. def escape_md_section(text: str, snob: bool = False) -> str:
  156. """
  157. Escapes markdown-sensitive characters across whole document sections.
  158. """
  159. text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
  160. if snob:
  161. text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
  162. text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
  163. text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
  164. text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
  165. return text
  166. def reformat_table(lines: List[str], right_margin: int) -> List[str]:
  167. """
  168. Given the lines of a table
  169. padds the cells and returns the new lines
  170. """
  171. # find the maximum width of the columns
  172. max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
  173. max_cols = len(max_width)
  174. for line in lines:
  175. cols = [x.rstrip() for x in line.split("|")]
  176. num_cols = len(cols)
  177. # don't drop any data if colspan attributes result in unequal lengths
  178. if num_cols < max_cols:
  179. cols += [""] * (max_cols - num_cols)
  180. elif max_cols < num_cols:
  181. max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
  182. max_cols = num_cols
  183. max_width = [
  184. max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
  185. ]
  186. # reformat
  187. new_lines = []
  188. for line in lines:
  189. cols = [x.rstrip() for x in line.split("|")]
  190. if set(line.strip()) == set("-|"):
  191. filler = "-"
  192. new_cols = [
  193. x.rstrip() + (filler * (M - len(x.rstrip())))
  194. for x, M in zip(cols, max_width)
  195. ]
  196. else:
  197. filler = " "
  198. new_cols = [
  199. x.rstrip() + (filler * (M - len(x.rstrip())))
  200. for x, M in zip(cols, max_width)
  201. ]
  202. new_lines.append("|".join(new_cols))
  203. return new_lines
  204. def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
  205. """
  206. Provide padding for tables in the text
  207. """
  208. lines = text.split("\n")
  209. table_buffer = [] # type: List[str]
  210. table_started = False
  211. new_lines = []
  212. for line in lines:
  213. # Toggle table started
  214. if config.TABLE_MARKER_FOR_PAD in line:
  215. table_started = not table_started
  216. if not table_started:
  217. table = reformat_table(table_buffer, right_margin)
  218. new_lines.extend(table)
  219. table_buffer = []
  220. new_lines.append("")
  221. continue
  222. # Process lines
  223. if table_started:
  224. table_buffer.append(line)
  225. else:
  226. new_lines.append(line)
  227. return "\n".join(new_lines)