cli.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. import argparse
  2. import sys
  3. from . import HTML2Text, __version__, config
  4. def main() -> None:
  5. baseurl = ""
  6. class bcolors:
  7. HEADER = "\033[95m"
  8. OKBLUE = "\033[94m"
  9. OKGREEN = "\033[92m"
  10. WARNING = "\033[93m"
  11. FAIL = "\033[91m"
  12. ENDC = "\033[0m"
  13. BOLD = "\033[1m"
  14. UNDERLINE = "\033[4m"
  15. p = argparse.ArgumentParser()
  16. p.add_argument(
  17. "--default-image-alt",
  18. dest="default_image_alt",
  19. default=config.DEFAULT_IMAGE_ALT,
  20. help="The default alt string for images with missing ones",
  21. )
  22. p.add_argument(
  23. "--pad-tables",
  24. dest="pad_tables",
  25. action="store_true",
  26. default=config.PAD_TABLES,
  27. help="pad the cells to equal column width in tables",
  28. )
  29. p.add_argument(
  30. "--no-wrap-links",
  31. dest="wrap_links",
  32. action="store_false",
  33. default=config.WRAP_LINKS,
  34. help="don't wrap links during conversion",
  35. )
  36. p.add_argument(
  37. "--wrap-list-items",
  38. dest="wrap_list_items",
  39. action="store_true",
  40. default=config.WRAP_LIST_ITEMS,
  41. help="wrap list items during conversion",
  42. )
  43. p.add_argument(
  44. "--ignore-emphasis",
  45. dest="ignore_emphasis",
  46. action="store_true",
  47. default=config.IGNORE_EMPHASIS,
  48. help="don't include any formatting for emphasis",
  49. )
  50. p.add_argument(
  51. "--reference-links",
  52. dest="inline_links",
  53. action="store_false",
  54. default=config.INLINE_LINKS,
  55. help="use reference style links instead of inline links",
  56. )
  57. p.add_argument(
  58. "--ignore-links",
  59. dest="ignore_links",
  60. action="store_true",
  61. default=config.IGNORE_ANCHORS,
  62. help="don't include any formatting for links",
  63. )
  64. p.add_argument(
  65. "--protect-links",
  66. dest="protect_links",
  67. action="store_true",
  68. default=config.PROTECT_LINKS,
  69. help="protect links from line breaks surrounding them with angle brackets",
  70. )
  71. p.add_argument(
  72. "--ignore-images",
  73. dest="ignore_images",
  74. action="store_true",
  75. default=config.IGNORE_IMAGES,
  76. help="don't include any formatting for images",
  77. )
  78. p.add_argument(
  79. "--images-as-html",
  80. dest="images_as_html",
  81. action="store_true",
  82. default=config.IMAGES_AS_HTML,
  83. help=(
  84. "Always write image tags as raw html; preserves `height`, `width` and "
  85. "`alt` if possible."
  86. ),
  87. )
  88. p.add_argument(
  89. "--images-to-alt",
  90. dest="images_to_alt",
  91. action="store_true",
  92. default=config.IMAGES_TO_ALT,
  93. help="Discard image data, only keep alt text",
  94. )
  95. p.add_argument(
  96. "--images-with-size",
  97. dest="images_with_size",
  98. action="store_true",
  99. default=config.IMAGES_WITH_SIZE,
  100. help=(
  101. "Write image tags with height and width attrs as raw html to retain "
  102. "dimensions"
  103. ),
  104. )
  105. p.add_argument(
  106. "-g",
  107. "--google-doc",
  108. action="store_true",
  109. dest="google_doc",
  110. default=False,
  111. help="convert an html-exported Google Document",
  112. )
  113. p.add_argument(
  114. "-d",
  115. "--dash-unordered-list",
  116. action="store_true",
  117. dest="ul_style_dash",
  118. default=False,
  119. help="use a dash rather than a star for unordered list items",
  120. )
  121. p.add_argument(
  122. "-e",
  123. "--asterisk-emphasis",
  124. action="store_true",
  125. dest="em_style_asterisk",
  126. default=False,
  127. help="use an asterisk rather than an underscore for emphasized text",
  128. )
  129. p.add_argument(
  130. "-b",
  131. "--body-width",
  132. dest="body_width",
  133. type=int,
  134. default=config.BODY_WIDTH,
  135. help="number of characters per output line, 0 for no wrap",
  136. )
  137. p.add_argument(
  138. "-i",
  139. "--google-list-indent",
  140. dest="list_indent",
  141. type=int,
  142. default=config.GOOGLE_LIST_INDENT,
  143. help="number of pixels Google indents nested lists",
  144. )
  145. p.add_argument(
  146. "-s",
  147. "--hide-strikethrough",
  148. action="store_true",
  149. dest="hide_strikethrough",
  150. default=False,
  151. help="hide strike-through text. only relevant when -g is " "specified as well",
  152. )
  153. p.add_argument(
  154. "--escape-all",
  155. action="store_true",
  156. dest="escape_snob",
  157. default=False,
  158. help=(
  159. "Escape all special characters. Output is less readable, but avoids "
  160. "corner case formatting issues."
  161. ),
  162. )
  163. p.add_argument(
  164. "--bypass-tables",
  165. action="store_true",
  166. dest="bypass_tables",
  167. default=config.BYPASS_TABLES,
  168. help="Format tables in HTML rather than Markdown syntax.",
  169. )
  170. p.add_argument(
  171. "--ignore-tables",
  172. action="store_true",
  173. dest="ignore_tables",
  174. default=config.IGNORE_TABLES,
  175. help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
  176. )
  177. p.add_argument(
  178. "--single-line-break",
  179. action="store_true",
  180. dest="single_line_break",
  181. default=config.SINGLE_LINE_BREAK,
  182. help=(
  183. "Use a single line break after a block element rather than two line "
  184. "breaks. NOTE: Requires --body-width=0"
  185. ),
  186. )
  187. p.add_argument(
  188. "--unicode-snob",
  189. action="store_true",
  190. dest="unicode_snob",
  191. default=config.UNICODE_SNOB,
  192. help="Use unicode throughout document",
  193. )
  194. p.add_argument(
  195. "--no-automatic-links",
  196. action="store_false",
  197. dest="use_automatic_links",
  198. default=config.USE_AUTOMATIC_LINKS,
  199. help="Do not use automatic links wherever applicable",
  200. )
  201. p.add_argument(
  202. "--no-skip-internal-links",
  203. action="store_false",
  204. dest="skip_internal_links",
  205. default=config.SKIP_INTERNAL_LINKS,
  206. help="Do not skip internal links",
  207. )
  208. p.add_argument(
  209. "--links-after-para",
  210. action="store_true",
  211. dest="links_each_paragraph",
  212. default=config.LINKS_EACH_PARAGRAPH,
  213. help="Put links after each paragraph instead of document",
  214. )
  215. p.add_argument(
  216. "--mark-code",
  217. action="store_true",
  218. dest="mark_code",
  219. default=config.MARK_CODE,
  220. help="Mark program code blocks with [code]...[/code]",
  221. )
  222. p.add_argument(
  223. "--decode-errors",
  224. dest="decode_errors",
  225. default=config.DECODE_ERRORS,
  226. help=(
  227. "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
  228. "acceptable values"
  229. ),
  230. )
  231. p.add_argument(
  232. "--open-quote",
  233. dest="open_quote",
  234. default=config.OPEN_QUOTE,
  235. help="The character used to open quotes",
  236. )
  237. p.add_argument(
  238. "--close-quote",
  239. dest="close_quote",
  240. default=config.CLOSE_QUOTE,
  241. help="The character used to close quotes",
  242. )
  243. p.add_argument(
  244. "--version", action="version", version=".".join(map(str, __version__))
  245. )
  246. p.add_argument("filename", nargs="?")
  247. p.add_argument("encoding", nargs="?", default="utf-8")
  248. args = p.parse_args()
  249. if args.filename and args.filename != "-":
  250. with open(args.filename, "rb") as fp:
  251. data = fp.read()
  252. else:
  253. data = sys.stdin.buffer.read()
  254. try:
  255. html = data.decode(args.encoding, args.decode_errors)
  256. except UnicodeDecodeError as err:
  257. warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
  258. warning += " Use the " + bcolors.OKGREEN
  259. warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
  260. print(warning)
  261. raise err
  262. h = HTML2Text(baseurl=baseurl)
  263. # handle options
  264. if args.ul_style_dash:
  265. h.ul_item_mark = "-"
  266. if args.em_style_asterisk:
  267. h.emphasis_mark = "*"
  268. h.strong_mark = "__"
  269. h.body_width = args.body_width
  270. h.google_list_indent = args.list_indent
  271. h.ignore_emphasis = args.ignore_emphasis
  272. h.ignore_links = args.ignore_links
  273. h.protect_links = args.protect_links
  274. h.ignore_images = args.ignore_images
  275. h.images_as_html = args.images_as_html
  276. h.images_to_alt = args.images_to_alt
  277. h.images_with_size = args.images_with_size
  278. h.google_doc = args.google_doc
  279. h.hide_strikethrough = args.hide_strikethrough
  280. h.escape_snob = args.escape_snob
  281. h.bypass_tables = args.bypass_tables
  282. h.ignore_tables = args.ignore_tables
  283. h.single_line_break = args.single_line_break
  284. h.inline_links = args.inline_links
  285. h.unicode_snob = args.unicode_snob
  286. h.use_automatic_links = args.use_automatic_links
  287. h.skip_internal_links = args.skip_internal_links
  288. h.links_each_paragraph = args.links_each_paragraph
  289. h.mark_code = args.mark_code
  290. h.wrap_links = args.wrap_links
  291. h.wrap_list_items = args.wrap_list_items
  292. h.pad_tables = args.pad_tables
  293. h.default_image_alt = args.default_image_alt
  294. h.open_quote = args.open_quote
  295. h.close_quote = args.close_quote
  296. sys.stdout.write(h.handle(html))