123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306 |
- import argparse
- import sys
- from . import HTML2Text, __version__, config
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the command-line interface.

    Option defaults are taken from the ``config`` module where a matching
    setting exists, so running without flags mirrors the library defaults.

    Returns:
        A parser exposing every conversion option plus the optional
        positional ``filename`` and ``encoding`` arguments.
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help=(
            "hide strike-through text. only relevant when -g is "
            "specified as well"
        ),
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters. Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help=(
            "Ignore table-related tags (table, th, td, tr) "
            "while keeping rows."
        ),
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    return p


def main() -> None:
    """Run the command-line converter.

    Parses ``sys.argv``, reads raw bytes from the named file (or from
    standard input when no filename is given or the filename is ``-``),
    decodes them with the requested encoding and error handler, copies the
    parsed options onto an ``HTML2Text`` instance and writes the result of
    ``handle()`` to standard output.

    Raises:
        UnicodeDecodeError: If the input cannot be decoded with the chosen
            encoding and error handler. A hint suggesting
            ``--decode-errors=ignore`` is written to standard error before
            the exception propagates.
    """
    baseurl = ""

    class bcolors:
        # ANSI escape sequences used to colour the decode-error hint.
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    args = _build_parser().parse_args()

    # Read bytes rather than text so the user-selected encoding is applied
    # explicitly below instead of the platform default.
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError:
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        # Diagnostics go to stderr: stdout carries the converted document
        # and must stay clean for pipes and redirects.
        print(warning, file=sys.stderr)
        raise

    h = HTML2Text(baseurl=baseurl)

    # Style switches only override the converter's marks when requested.
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        # Emphasis takes "*", so strong text is switched to "__".
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    # Every remaining option is copied unconditionally; its default already
    # came from the parser.
    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote

    sys.stdout.write(h.handle(html))
|