1
0

__init__.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947
  1. """html2text: Turn HTML into equivalent Markdown-structured text."""
  2. import html.entities
  3. import html.parser
  4. import re
  5. import urllib.parse as urlparse
  6. from textwrap import wrap
  7. from typing import Dict, List, Optional, Tuple, Union
  8. from . import config
  9. from .elements import AnchorElement, ListElement
  10. from .typing import OutCallback
  11. from .utils import (
  12. dumb_css_parser,
  13. element_style,
  14. escape_md,
  15. escape_md_section,
  16. google_fixed_width_font,
  17. google_has_height,
  18. google_list_style,
  19. google_text_emphasis,
  20. hn,
  21. list_numbering_start,
  22. pad_tables_in_text,
  23. skipwrap,
  24. unifiable_n,
  25. )
  # Package version tuple. The components look date-based (year, month, day)
  # -- TODO confirm against the release process.
  __version__ = (2020, 1, 16)
  # TODO:
  # Support decoded entities with UNIFIABLE.
  29. class HTML2Text(html.parser.HTMLParser):
  def __init__(
      self,
      out: Optional[OutCallback] = None,
      baseurl: str = "",
      bodywidth: int = config.BODY_WIDTH,
  ) -> None:
      """
      Set up the conversion options and the parser's internal state.

      Input parameters:
        out: optional callback that receives every output fragment; when
          omitted, fragments are collected through self.outtextf.
        baseurl: base URL of the document we process
        bodywidth: stored as self.body_width (consulted by optwrap).
      """
      super().__init__(convert_charrefs=False)

      # Options the original source annotated as "covered in cli".
      self.unicode_snob = config.UNICODE_SNOB
      self.escape_snob = config.ESCAPE_SNOB
      self.body_width = bodywidth
      self.skip_internal_links = config.SKIP_INTERNAL_LINKS
      self.inline_links = config.INLINE_LINKS
      self.protect_links = config.PROTECT_LINKS
      self.google_list_indent = config.GOOGLE_LIST_INDENT
      self.ignore_links = config.IGNORE_ANCHORS
      self.ignore_images = config.IGNORE_IMAGES
      self.images_as_html = config.IMAGES_AS_HTML
      self.images_to_alt = config.IMAGES_TO_ALT
      self.images_with_size = config.IMAGES_WITH_SIZE
      self.ignore_emphasis = config.IGNORE_EMPHASIS
      self.bypass_tables = config.BYPASS_TABLES
      self.ignore_tables = config.IGNORE_TABLES
      self.google_doc = False
      self.ul_item_mark = "*"
      self.emphasis_mark = "_"
      self.single_line_break = config.SINGLE_LINE_BREAK
      self.use_automatic_links = config.USE_AUTOMATIC_LINKS
      self.hide_strikethrough = False
      self.wrap_list_items = config.WRAP_LIST_ITEMS
      self.wrap_links = config.WRAP_LINKS
      self.pad_tables = config.PAD_TABLES
      self.default_image_alt = config.DEFAULT_IMAGE_ALT
      self.open_quote = config.OPEN_QUOTE
      self.close_quote = config.CLOSE_QUOTE

      # Options that carried no such annotation.
      self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
      self.strong_mark = "**"
      self.mark_code = config.MARK_CODE
      self.tag_callback = None

      # Table bookkeeping used while handling <table>/<tr>/<td>/<th>.
      self.split_next_td = False
      self.td_count = 0
      self.table_start = False

      # Output sink: the caller's callback, or the built-in collector.
      self.out = self.outtextf if out is None else out

      # empty list to store output characters before they are "joined"
      self.outtextlist = []  # type: List[str]

      # Output / whitespace state.
      self.quiet = 0
      self.p_p = 0  # number of newline character to print before next output
      self.outcount = 0
      self.start = True
      self.space = False
      self.br_toggle = ""
      self.lastWasNL = False
      self.drop_white_space = 0

      # Link state.
      self.a = []  # type: List[AnchorElement]
      self.astack = []  # type: List[Optional[Dict[str, Optional[str]]]]
      self.maybe_automatic_link = None  # type: Optional[str]
      self.empty_link = False
      self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
      self.acount = 0
      self.baseurl = baseurl

      # Block / inline element state.
      self.list = []  # type: List[ListElement]
      self.lastWasList = False
      self.blockquote = 0
      self.pre = False
      self.startpre = False
      self.code = False
      self.quote = False
      self.inheader = False
      self.emphasis = 0
      self.stressed = False
      self.preceding_stressed = False
      self.preceding_data = ""
      self.current_tag = ""

      # Style tracking (filled from <style> content and the tag stack).
      self.style = 0
      self.style_def = {}  # type: Dict[str, Dict[str, str]]
      self.tag_stack = []  # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]]

      # Current abbreviation definition
      self.abbr_title = None  # type: Optional[str]
      # Last inner HTML (for abbr being defined)
      self.abbr_data = None  # type: Optional[str]
      # Stack of abbreviations to write later
      self.abbr_list = {}  # type: Dict[str, str]

      # NOTE: this writes into the shared config.UNIFIABLE mapping; the
      # placeholder is swapped for a real character in finish().
      config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"
  124. def feed(self, data: str) -> None:
  125. data = data.replace("</' + 'script>", "</ignore>")
  126. super().feed(data)
  def handle(self, data: str) -> str:
      """
      Convert the HTML in *data* and return the resulting text.

      The data is fed to the parser followed by an empty feed, then the
      finished output is passed through optwrap(). When self.pad_tables is
      set the result additionally goes through pad_tables_in_text().
      """
      self.feed(data)
      self.feed("")
      text = self.optwrap(self.finish())
      return pad_tables_in_text(text) if self.pad_tables else text
  def outtextf(self, s: str) -> None:
      """
      Default output sink: append *s* to self.outtextlist.

      For a non-empty *s*, self.lastWasNL records whether it ends with a
      newline; an empty *s* leaves that flag untouched.
      """
      self.outtextlist.append(s)
      if not s:
          return
      self.lastWasNL = s.endswith("\n")
  def finish(self) -> str:
      """
      Close the parser, flush pending output and return the collected text.

      The "&nbsp_place_holder;" marker is replaced by the real no-break space
      when self.unicode_snob is set, otherwise by a plain space.
      """
      self.close()
      self.pbr()
      self.o("", force="end")

      nbsp = html.entities.html5["nbsp;"] if self.unicode_snob else " "
      outtext = "".join(self.outtextlist).replace("&nbsp_place_holder;", nbsp)

      # Reset the buffer so its content does not carry over into the next
      # handling.
      self.outtextlist = []
      return outtext
  def handle_charref(self, c: str) -> None:
      """Resolve the numeric character reference *c* and emit it as entity data."""
      resolved = self.charref(c)
      self.handle_data(resolved, True)
  def handle_entityref(self, c: str) -> None:
      """
      Resolve the named entity *c* and emit it as entity data.

      The resolved text may be empty (e.g. for &lrm;/&rlm; markers that
      should not contribute to the output). Such results are skipped because
      self.handle_data cannot cope with a zero-length string right after, or
      in the middle of, stressed text: the text gets split and
      self.stressed/self.preceding_stressed flip after the first part.
      """
      ref = self.entityref(c)
      if not ref:
          return
      self.handle_data(ref, True)
  def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
      """Forward an opening tag to handle_tag with its attributes as a dict."""
      attr_map = dict(attrs)
      self.handle_tag(tag, attr_map, start=True)
  def handle_endtag(self, tag: str) -> None:
      """Forward a closing tag to handle_tag with an empty attribute dict."""
      no_attrs = {}  # type: Dict[str, Optional[str]]
      self.handle_tag(tag, no_attrs, start=False)
  def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
      """
      Find an already-recorded link equivalent to *attrs*.

      :type attrs: dict
      :returns: The index in self.a of the first entry whose href equals
          attrs["href"] and whose title matches (both absent, or both
          present and equal). None when there is no such entry or when
          attrs has no href.
      :rtype: int or None
      """
      if "href" not in attrs:
          return None

      wanted_href = attrs["href"]
      for position, anchor in enumerate(self.a):
          known = anchor.attrs
          if "href" not in known or known["href"] != wanted_href:
              continue
          if "title" not in known and "title" not in attrs:
              # Neither side carries a title: the href alone decides.
              return position
          if (
              "title" in known
              and "title" in attrs
              and known["title"] == attrs["title"]
          ):
              return position
      return None
  def handle_emphasis(
      self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
  ) -> None:
      """
      Emit opening or closing emphasis marks derived from CSS styles.

      An emphasis counts only when the tag's own style has it and the
      parent's style does not. *start* selects between opening and closing
      marks.
      """
      own = google_text_emphasis(tag_style)
      inherited = google_text_emphasis(parent_style)

      # handle Google's text emphasis
      strikethrough = "line-through" in own and self.hide_strikethrough

      # google and others may mark a font's weight as `bold` or `700`
      bold = any(
          marker in own and marker not in inherited
          for marker in config.BOLD_TEXT_STYLE_VALUES
      )
      italic = "italic" in own and "italic" not in inherited
      fixed = (
          google_fixed_width_font(tag_style)
          and not google_fixed_width_font(parent_style)
          and not self.pre
      )
      any_mark = bold or italic or fixed

      if start:
          # crossed-out text must be handled before other attributes
          # in order not to output qualifiers unnecessarily
          if any_mark:
              self.emphasis += 1
          if strikethrough:
              self.quiet += 1
          opening = (
              (italic, self.emphasis_mark),
              (bold, self.strong_mark),
              (fixed, "`"),
          )
          for wanted, mark in opening:
              if wanted:
                  self.o(mark)
                  self.drop_white_space += 1
          if fixed:
              self.code = True
          return

      def close_mark(mark: str) -> None:
          # A pending drop_white_space means the emphasis was empty: cancel
          # it instead of writing a closing mark.
          if self.drop_white_space:
              self.drop_white_space -= 1
          else:
              self.o(mark)

      if any_mark:
          # there must not be whitespace before closing emphasis mark
          self.emphasis -= 1
          self.space = False
      if fixed:
          close_mark("`")
          self.code = False
      if bold:
          close_mark(self.strong_mark)
      if italic:
          close_mark(self.emphasis_mark)
      # space is only allowed after *all* emphasis marks
      if (bold or italic) and not self.emphasis:
          self.o(" ")
      if strikethrough:
          self.quiet -= 1
  def handle_tag(
      self, tag: str, attrs: Dict[str, Optional[str]], start: bool
  ) -> None:
      """
      Translate one opening or closing HTML tag into Markdown output/state.

      Called by handle_starttag (start=True, attrs from the parser) and
      handle_endtag (start=False, attrs empty). The method is a long sequence
      of independent ``if tag ...`` sections; each emits text through self.o
      and/or updates the converter's state flags. Two early returns exist:
      closing header tags and images written as raw HTML / automatic links.

      :param tag: tag name as reported by the parser
      :param attrs: attribute mapping; may be mutated (href of <a>/<img>)
      :param start: True for an opening tag, False for a closing tag
      """
      self.current_tag = tag
      # A user callback returning exactly True takes over the tag entirely.
      if self.tag_callback is not None:
          if self.tag_callback(self, tag, attrs, start) is True:
              return
      # first thing inside the anchor tag is another tag
      # that produces some output
      if (
          start
          and self.maybe_automatic_link is not None
          and tag not in ["p", "div", "style", "dl", "dt"]
          and (tag != "img" or self.ignore_images)
      ):
          self.o("[")
          self.maybe_automatic_link = None
          self.empty_link = False
      if self.google_doc:
          # the attrs parameter is empty for a closing tag. in addition, we
          # need the attributes of the parent nodes in order to get a
          # complete style description for the current element. we assume
          # that google docs export well formed html.
          # NOTE: tag_style/parent_style are only bound inside this branch;
          # every later use is guarded by self.google_doc as well.
          parent_style = {}  # type: Dict[str, str]
          if start:
              if self.tag_stack:
                  parent_style = self.tag_stack[-1][2]
              tag_style = element_style(attrs, self.style_def, parent_style)
              self.tag_stack.append((tag, attrs, tag_style))
          else:
              # Recover the attrs/style recorded when the tag was opened;
              # an empty stack falls back to empty values.
              dummy, attrs, tag_style = (
                  self.tag_stack.pop() if self.tag_stack else (None, {}, {})
              )
              if self.tag_stack:
                  parent_style = self.tag_stack[-1][2]
      # Headers: paragraph break, then "#" repeated hn(tag) times on open.
      if hn(tag):
          self.p()
          if start:
              self.inheader = True
              self.o(hn(tag) * "#" + " ")
          else:
              self.inheader = False
              return  # prevent redundant emphasis marks on headers
      if tag in ["p", "div"]:
          if self.google_doc:
              if start and google_has_height(tag_style):
                  self.p()
              else:
                  self.soft_br()
          elif self.astack and tag == "div":
              # A <div> inside an open anchor does not break the paragraph.
              pass
          else:
              self.p()
      if tag == "br" and start:
          # NOTE(review): a Markdown hard line break conventionally needs two
          # trailing spaces before the newline; only one is visible in these
          # literals -- confirm they were not whitespace-collapsed.
          if self.blockquote > 0:
              self.o(" \n> ")
          else:
              self.o(" \n")
      if tag == "hr" and start:
          self.p()
          self.o("* * *")
          self.p()
      # Content of these elements is suppressed via the quiet counter.
      if tag in ["head", "style", "script"]:
          if start:
              self.quiet += 1
          else:
              self.quiet -= 1
      # Inside <style>, handle_data feeds text to the CSS parser.
      if tag == "style":
          if start:
              self.style += 1
          else:
              self.style -= 1
      if tag in ["body"]:
          self.quiet = 0  # sites like 9rules.com never close <head>
      if tag == "blockquote":
          if start:
              self.p()
              self.o("> ", force=True)
              self.start = True
              self.blockquote += 1
          else:
              self.blockquote -= 1
              self.p()

      def no_preceding_space(self: HTML2Text) -> bool:
          # True when the last emitted data chunk ends in a non-whitespace
          # character.
          return bool(
              self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1])
          )

      # Inline emphasis: an opening mark gets a leading space when the
      # preceding data did not end in whitespace.
      if tag in ["em", "i", "u"] and not self.ignore_emphasis:
          if start and no_preceding_space(self):
              emphasis = " " + self.emphasis_mark
          else:
              emphasis = self.emphasis_mark
          self.o(emphasis)
          if start:
              self.stressed = True
      if tag in ["strong", "b"] and not self.ignore_emphasis:
          if start and no_preceding_space(self):
              strong = " " + self.strong_mark
          else:
              strong = self.strong_mark
          self.o(strong)
          if start:
              self.stressed = True
      # Unlike the two sections above, this one does not consult
      # self.ignore_emphasis.
      if tag in ["del", "strike", "s"]:
          if start and no_preceding_space(self):
              strike = " ~~"
          else:
              strike = "~~"
          self.o(strike)
          if start:
              self.stressed = True
      if self.google_doc:
          if not self.inheader:
              # handle some font attributes, but leave headers clean
              self.handle_emphasis(start, tag_style, parent_style)
      if tag in ["kbd", "code", "tt"] and not self.pre:
          self.o("`")  # TODO: `` `this` ``
          self.code = not self.code
      # <abbr>: self.o accumulates the inner text into abbr_data while it is
      # not None; on close the (text -> title) pair is stored for the footer.
      if tag == "abbr":
          if start:
              self.abbr_title = None
              self.abbr_data = ""
              if "title" in attrs:
                  self.abbr_title = attrs["title"]
          else:
              if self.abbr_title is not None:
                  assert self.abbr_data is not None
                  self.abbr_list[self.abbr_data] = self.abbr_title
                  self.abbr_title = None
              self.abbr_data = None
      # <q>: the quote flag toggles on every <q> event, open or close.
      if tag == "q":
          if not self.quote:
              self.o(self.open_quote)
          else:
              self.o(self.close_quote)
          self.quote = not self.quote

      def link_url(self: HTML2Text, link: str, title: str = "") -> None:
          # Emit the closing half of an inline link: "](url "title")", with
          # the URL resolved against self.baseurl.
          url = urlparse.urljoin(self.baseurl, link)
          title = ' "{}"'.format(title) if title.strip() else ""
          self.o("]({url}{title})".format(url=escape_md(url), title=title))

      if tag == "a" and not self.ignore_links:
          if start:
              if (
                  "href" in attrs
                  and attrs["href"] is not None
                  and not (self.skip_internal_links and attrs["href"].startswith("#"))
              ):
                  # Defer the "[": the link may turn out to be an automatic
                  # link (<url>) once its text is known.
                  self.astack.append(attrs)
                  self.maybe_automatic_link = attrs["href"]
                  self.empty_link = True
                  if self.protect_links:
                      attrs["href"] = "<" + attrs["href"] + ">"
              else:
                  # Keep the stack balanced for anchors that are not links.
                  self.astack.append(None)
          else:
              if self.astack:
                  a = self.astack.pop()
                  if self.maybe_automatic_link and not self.empty_link:
                      # Already written as an automatic link; nothing to close.
                      self.maybe_automatic_link = None
                  elif a:
                      assert a["href"] is not None
                      if self.empty_link:
                          self.o("[")
                          self.empty_link = False
                          self.maybe_automatic_link = None
                      if self.inline_links:
                          title = a.get("title") or ""
                          title = escape_md(title)
                          link_url(self, a["href"], title)
                      else:
                          # Reference-style link: reuse an existing number
                          # for an equivalent link, else allocate a new one.
                          i = self.previousIndex(a)
                          if i is not None:
                              a_props = self.a[i]
                          else:
                              self.acount += 1
                              a_props = AnchorElement(a, self.acount, self.outcount)
                              self.a.append(a_props)
                          self.o("][" + str(a_props.count) + "]")
      if tag == "img" and start and not self.ignore_images:
          if "src" in attrs:
              assert attrs["src"] is not None
              if not self.images_to_alt:
                  attrs["href"] = attrs["src"]
              alt = attrs.get("alt") or self.default_image_alt
              # If we have images_with_size, write raw html including width,
              # height, and alt attributes
              # NOTE(review): attribute values are concatenated unescaped
              # into single-quoted HTML attributes.
              if self.images_as_html or (
                  self.images_with_size and ("width" in attrs or "height" in attrs)
              ):
                  self.o("<img src='" + attrs["src"] + "' ")
                  if "width" in attrs:
                      assert attrs["width"] is not None
                      self.o("width='" + attrs["width"] + "' ")
                  if "height" in attrs:
                      assert attrs["height"] is not None
                      self.o("height='" + attrs["height"] + "' ")
                  if alt:
                      self.o("alt='" + alt + "' ")
                  self.o("/>")
                  return
              # If we have a link to create, output the start
              if self.maybe_automatic_link is not None:
                  href = self.maybe_automatic_link
                  if (
                      self.images_to_alt
                      and escape_md(alt) == href
                      and self.absolute_url_matcher.match(href)
                  ):
                      self.o("<" + escape_md(alt) + ">")
                      self.empty_link = False
                      return
                  else:
                      self.o("[")
                      self.maybe_automatic_link = None
                      self.empty_link = False
              # If we have images_to_alt, we discard the image itself,
              # considering only the alt text.
              if self.images_to_alt:
                  self.o(escape_md(alt))
              else:
                  self.o("![" + escape_md(alt) + "]")
                  if self.inline_links:
                      href = attrs.get("href") or ""
                      self.o(
                          "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
                      )
                  else:
                      i = self.previousIndex(attrs)
                      if i is not None:
                          a_props = self.a[i]
                      else:
                          self.acount += 1
                          a_props = AnchorElement(attrs, self.acount, self.outcount)
                          self.a.append(a_props)
                      self.o("[" + str(a_props.count) + "]")
      # Definition lists.
      if tag == "dl" and start:
          self.p()
      if tag == "dt" and not start:
          self.pbr()
      if tag == "dd" and start:
          self.o(" ")
      if tag == "dd" and not start:
          self.pbr()
      if tag in ["ol", "ul"]:
          # Google Docs create sub lists as top level lists
          if not self.list and not self.lastWasList:
              self.p()
          if start:
              if self.google_doc:
                  list_style = google_list_style(tag_style)
              else:
                  list_style = tag
              numbering_start = list_numbering_start(attrs)
              self.list.append(ListElement(list_style, numbering_start))
          else:
              if self.list:
                  self.list.pop()
                  # Closing the outermost list adds a newline.
                  if not self.google_doc and not self.list:
                      self.o("\n")
          self.lastWasList = True
      else:
          self.lastWasList = False
      if tag == "li":
          self.pbr()
          if start:
              # An <li> outside any list is treated as an unordered item.
              if self.list:
                  li = self.list[-1]
              else:
                  li = ListElement("ul", 0)
              if self.google_doc:
                  nest_count = self.google_nest_count(tag_style)
              else:
                  nest_count = len(self.list)
              # TODO: line up <ol><li>s > 9 correctly.
              self.o(" " * nest_count)
              if li.name == "ul":
                  self.o(self.ul_item_mark + " ")
              elif li.name == "ol":
                  li.num += 1
                  self.o(str(li.num) + ". ")
              self.start = True
      # Tables: three mutually exclusive modes -- ignore, bypass (raw HTML
      # tags are written out), or conversion to pipe-separated rows.
      if tag in ["table", "tr", "td", "th"]:
          if self.ignore_tables:
              if tag == "tr":
                  if start:
                      pass
                  else:
                      self.soft_br()
              else:
                  pass
          elif self.bypass_tables:
              if start:
                  self.soft_br()
              if tag in ["td", "th"]:
                  if start:
                      self.o("<{}>\n\n".format(tag))
                  else:
                      self.o("\n</{}>".format(tag))
              else:
                  if start:
                      self.o("<{}>".format(tag))
                  else:
                      self.o("</{}>".format(tag))
          else:
              if tag == "table":
                  if start:
                      self.table_start = True
                      # Marker tags delimit the table for later padding.
                      if self.pad_tables:
                          self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                          self.o(" \n")
                  else:
                      if self.pad_tables:
                          self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                          self.o(" \n")
              if tag in ["td", "th"] and start:
                  # Every cell after the first in a row gets a separator.
                  if self.split_next_td:
                      self.o("| ")
                  self.split_next_td = True
              if tag == "tr" and start:
                  self.td_count = 0
              if tag == "tr" and not start:
                  self.split_next_td = False
                  self.soft_br()
              if tag == "tr" and not start and self.table_start:
                  # Underline table header
                  self.o("|".join(["---"] * self.td_count))
                  self.soft_br()
                  self.table_start = False
              if tag in ["td", "th"] and start:
                  self.td_count += 1
      if tag == "pre":
          if start:
              self.startpre = True
              self.pre = True
          else:
              self.pre = False
              if self.mark_code:
                  self.out("\n[/code]")
          self.p()
  600. # TODO: Add docstring for these one letter functions
  def pbr(self) -> None:
      """Request one pending line break unless a break is already pending."""
      if self.p_p != 0:
          return
      self.p_p = 1
  def p(self) -> None:
      """Request a paragraph break: 1 pending newline in single-line-break mode, else 2."""
      if self.single_line_break:
          self.p_p = 1
      else:
          self.p_p = 2
  def soft_br(self) -> None:
      """Request a soft break: a pending line break plus a br_toggle prefix."""
      self.pbr()
      # self.o writes br_toggle in front of the pending newline(s).
      self.br_toggle = " "
  def o(
      self, data: str, puredata: bool = False, force: Union[bool, str] = False
  ) -> None:
      """
      Deal with indentation and whitespace

      Central output routine: every piece of generated text goes through
      here before reaching self.out.

      :param data: text to emit
      :param puredata: True for document text (handle_data passes it); runs
          of whitespace are then collapsed unless inside <pre>
      :param force: truthy to emit even when data is empty; the string
          "end" additionally flushes pending link references and
          abbreviation definitions
      """
      # While an <abbr> is open, collect its inner text.
      if self.abbr_data is not None:
          self.abbr_data += data
      # Nothing is written while quiet is non-zero.
      if not self.quiet:
          if self.google_doc:
              # prevent white space immediately after 'begin emphasis'
              # marks ('**' and '_')
              lstripped_data = data.lstrip()
              if self.drop_white_space and not (self.pre or self.code):
                  data = lstripped_data
              if lstripped_data != "":
                  self.drop_white_space = 0
          if puredata and not self.pre:
              # This is a very dangerous call ... it could mess up
              # all handling of &nbsp; when not handled properly
              # (see entityref)
              data = re.sub(r"\s+", r" ", data)
              # A leading space is not written directly; it is remembered in
              # self.space and emitted further down if still wanted.
              if data and data[0] == " ":
                  self.space = True
                  data = data[1:]
          if not data and not force:
              return
          if self.startpre:
              # self.out(" :") #TODO: not output when already one there
              if not data.startswith("\n") and not data.startswith("\r\n"):
                  # <pre>stuff...
                  data = "\n" + data
              if self.mark_code:
                  self.out("\n[code]")
                  self.p_p = 0
          # Build the prefix written after each newline: one ">" per
          # blockquote level, plus indentation inside <pre>.
          bq = ">" * self.blockquote
          if not (force and data and data[0] == ">") and self.blockquote:
              bq += " "
          if self.pre:
              if not self.list:
                  bq += " "
              # else: list content is already partially indented
              bq += " " * len(self.list)
              data = data.replace("\n", "\n" + bq)
          if self.startpre:
              self.startpre = False
              if self.list:
                  # use existing initial indentation
                  data = data.lstrip("\n")
          # At the start of a block, pending space and breaks are dropped.
          if self.start:
              self.space = False
              self.p_p = 0
              self.start = False
          if force == "end":
              # It's the end.
              self.p_p = 0
              self.out("\n")
              self.space = False
          # Write the pending line breaks requested through pbr()/p()/soft_br().
          if self.p_p:
              self.out((self.br_toggle + "\n" + bq) * self.p_p)
              self.space = False
              self.br_toggle = ""
          if self.space:
              if not self.lastWasNL:
                  self.out(" ")
              self.space = False
          # Reference-style link definitions: flushed at the end, or after a
          # paragraph when links_each_paragraph is set.
          if self.a and (
              (self.p_p == 2 and self.links_each_paragraph) or force == "end"
          ):
              if force == "end":
                  self.out("\n")
              newa = []
              for link in self.a:
                  # Only links whose text was already written are emitted;
                  # the others are kept for a later flush.
                  if self.outcount > link.outcount:
                      self.out(
                          " ["
                          + str(link.count)
                          + "]: "
                          + urlparse.urljoin(self.baseurl, link.attrs["href"])
                      )
                      if "title" in link.attrs:
                          assert link.attrs["title"] is not None
                          self.out(" (" + link.attrs["title"] + ")")
                      self.out("\n")
                  else:
                      newa.append(link)
              # Don't need an extra line when nothing was done.
              if self.a != newa:
                  self.out("\n")
              self.a = newa
          # Abbreviation definitions are written once, at the very end.
          if self.abbr_list and force == "end":
              for abbr, definition in self.abbr_list.items():
                  self.out(" *[" + abbr + "]: " + definition + "\n")
          self.p_p = 0
          self.out(data)
          self.outcount += 1
  def handle_data(self, data: str, entity_char: bool = False) -> None:
      """
      Process a chunk of document text and send it to self.o.

      :param data: the text; empty input is ignored
      :param entity_char: True when the text comes from a character or
          entity reference, in which case Markdown escaping is skipped
      """
      if not data:
          # Data may be empty for some HTML entities. For example,
          # LEFT-TO-RIGHT MARK.
          return

      if self.stressed:
          # First text right after an opening emphasis mark.
          data = data.strip()
          self.stressed = False
          self.preceding_stressed = True
      elif self.preceding_stressed:
          needs_gap = (
              re.match(r"[^\s.!?]", data[0])
              and not hn(self.current_tag)
              and self.current_tag not in ["a", "code", "pre"]
          )
          if needs_gap:
              # should match a letter or common punctuation
              data = " " + data
          self.preceding_stressed = False

      if self.style:
          # Text inside <style> is also parsed as CSS definitions.
          self.style_def.update(dumb_css_parser(data))

      pending_link = self.maybe_automatic_link
      if pending_link is not None:
          is_autolink = (
              pending_link == data
              and self.absolute_url_matcher.match(pending_link)
              and self.use_automatic_links
          )
          if is_autolink:
              self.o("<" + data + ">")
              self.empty_link = False
              return
          # Ordinary link text: open the bracket that was deferred.
          self.o("[")
          self.maybe_automatic_link = None
          self.empty_link = False

      plain_text = not (self.code or self.pre or entity_char)
      if plain_text:
          data = escape_md_section(data, snob=self.escape_snob)
      self.preceding_data = data
      self.o(data, puredata=True)
  def charref(self, name: str) -> str:
      """
      Resolve a numeric character reference to text.

      :param name: the reference body without "&#" and ";", either decimal
          ("65") or hexadecimal with an "x"/"X" prefix ("x41")
      :returns: the replacement from unifiable_n when self.unicode_snob is
          off and the code point is listed there; otherwise the character
          itself, or "" when the number is not a valid code point
      """
      if name[0] in ["x", "X"]:
          c = int(name[1:], 16)
      else:
          c = int(name)
      if not self.unicode_snob and c in unifiable_n:
          return unifiable_n[c]
      try:
          return chr(c)
      except (ValueError, OverflowError):
          # ValueError: code point beyond 0x10FFFF. OverflowError: number too
          # large for chr() to accept at all (e.g. "&#99999999999;").
          return ""
  def entityref(self, c: str) -> str:
      """
      Resolve a named entity reference to text.

      :param c: entity name without "&" and ";"
      :returns: the config.UNIFIABLE replacement when self.unicode_snob is
          off and the name is listed there; the HTML5 character otherwise
          (for "nbsp" the config.UNIFIABLE entry is used instead); unknown
          names are returned verbatim as "&name;"
      """
      if not self.unicode_snob and c in config.UNIFIABLE:
          return config.UNIFIABLE[c]

      ch = html.entities.html5.get(c + ";")
      if ch is None:
          return "&" + c + ";"
      if c == "nbsp":
          return config.UNIFIABLE[c]
      return ch
  def google_nest_count(self, style: Dict[str, str]) -> int:
      """
      Calculate the nesting count of google doc lists

      The depth is the integer part of the "margin-left" value (its last two
      characters are taken to be the unit and dropped) divided by
      self.google_list_indent. A missing margin, or one whose numeric part is
      not a plain integer (e.g. "0", "auto", "1.5em"), counts as depth 0
      instead of raising ValueError.

      :type style: dict
      :rtype: int
      """
      if "margin-left" not in style:
          return 0
      try:
          margin = int(style["margin-left"][:-2])
      except ValueError:
          # Unparsable margin: treat the item as not nested rather than
          # aborting the whole conversion.
          return 0
      return margin // self.google_list_indent
  def optwrap(self, text: str) -> str:
      """
      Wrap all paragraphs in the provided text.

      Returns *text* unchanged when self.body_width is falsy. Otherwise each
      line is wrapped to self.body_width unless skipwrap() says to leave it
      alone, and runs of blank lines are limited to two newlines.

      :type text: str
      :rtype: str
      """
      if not self.body_width:
          return text

      pieces = []  # type: List[str]
      newlines = 0
      # I cannot think of a better solution for now.
      # To avoid the non-wrap behaviour for entire paras
      # because of the presence of a link in it
      if not self.wrap_links:
          self.inline_links = False

      for para in text.split("\n"):
          if not para:
              # Blank line: allow at most two consecutive newlines.
              if newlines < 2:
                  pieces.append("\n")
                  newlines += 1
              continue

          if skipwrap(para, self.wrap_links, self.wrap_list_items):
              # Warning for the tempted!!!
              # Be aware that obvious replacement of this with
              # line.isspace()
              # DOES NOT work! Explanations are welcome.
              if not config.RE_SPACE.match(para):
                  pieces.append(para + "\n")
                  newlines = 1
              continue

          if para.startswith(" " + self.ul_item_mark):
              # list item continuation: add a double indent to the
              # new lines
              indent = " "
          elif para.startswith("> "):
              # blockquote continuation: add the greater than symbol
              # to the new lines
              indent = "> "
          else:
              indent = ""

          wrapped = wrap(
              para,
              self.body_width,
              break_long_words=False,
              subsequent_indent=indent,
          )
          pieces.append("\n".join(wrapped))

          if para.endswith(" "):
              pieces.append(" \n")
              newlines = 1
          elif indent:
              pieces.append("\n")
              newlines = 1
          else:
              pieces.append("\n\n")
              newlines = 2

      return "".join(pieces)
  def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
      """
      Convert an HTML string to text with a fresh HTML2Text converter.

      :param html: the HTML source (this parameter name shadows the ``html``
          module inside the function)
      :param baseurl: passed through to HTML2Text
      :param bodywidth: wrap width; None selects config.BODY_WIDTH
      """
      width = config.BODY_WIDTH if bodywidth is None else bodywidth
      converter = HTML2Text(baseurl=baseurl, bodywidth=width)
      return converter.handle(html)