123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283 |
- import html.entities
- from typing import Dict, List, Optional
- from . import config
# Code-point-keyed view of config.UNIFIABLE: each entity name is translated
# to its numeric code point through html.entities.name2codepoint, while the
# associated value is carried over unchanged. The "nbsp" entry is left out.
unifiable_n = {
    html.entities.name2codepoint[entity_name]: value
    for entity_name, value in config.UNIFIABLE.items()
    if entity_name != "nbsp"
}
def hn(tag: str) -> int:
    """
    Return the heading level encoded in a two-character ``hN`` tag name.

    :type tag: str
    :returns: The digit N (1-9) when ``tag`` is ``"h"`` followed by a
    single digit from 1 to 9; 0 for every other tag name, including the
    empty string and ``"h0"``.
    :rtype: int
    """
    # Test the length first so that an empty tag name falls through to 0
    # instead of raising IndexError on tag[0].
    if len(tag) == 2 and tag[0] == "h":
        level = tag[1]
        # Single-character string comparison: only "1".."9" lie in this range.
        if "0" < level <= "9":
            return int(level)
    return 0
def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Split a ``;``-separated list of ``name: value`` declarations.

    Names and values are stripped of surrounding whitespace and lowercased.
    Pieces without a ``:`` are ignored, and only the first ``:`` of a piece
    separates the name from the value.

    :returns: A hash of css attributes
    """
    properties = {}  # type: Dict[str, str]
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        name, value = declaration.split(":", 1)
        properties[name.strip().lower()] = value.strip().lower()
    return properties
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a stylesheet into its rules.

    :type data: str
    :returns: A hash of css selectors, each of which contains a hash of
    css attributes.
    :rtype: dict
    """
    # Drop every "@import ...;" statement. A ";" is appended up front so
    # that the search for the terminating ";" always finds one.
    data += ";"
    while True:
        import_start = data.find("@import")
        if import_start == -1:
            break
        import_end = data.find(";", import_start)
        data = data[:import_start] + data[import_end + 1 :]

    # Each "}"-terminated chunk containing a "{" is one rule, split into
    # its selector and its declaration body.
    rules = [chunk.split("{") for chunk in data.split("}") if "{" in chunk]
    try:
        return {
            selector.strip(): dumb_property_dict(body) for selector, body in rules
        }
    except ValueError:
        # A chunk with more than one "{" cannot be unpacked into a
        # (selector, body) pair; the whole stylesheet is then ignored.
        return {}  # not that important
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Compute the style of an element from its parent's style, its CSS
    classes and its inline ``style`` attribute, applied in that order so
    that later sources override earlier ones.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict
    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    # Work on a copy so the caller's parent_style is never mutated.
    style = parent_style.copy()

    # An attribute written without a value (e.g. <p class>) is reported as
    # None; treat it like an absent attribute instead of asserting.
    css_classes = attrs.get("class")
    if css_classes:
        for css_class in css_classes.split():
            # Classes are looked up by their ".name" selector; classes with
            # no matching rule contribute nothing.
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style:
        style.update(dumb_property_dict(inline_style))
    return style
def google_list_style(style: Dict[str, str]) -> str:
    """
    Classify a list as ordered or unordered from its ``list-style-type``.

    :type style: dict
    :returns: "ul" when ``list-style-type`` is one of disc, circle, square
    or none; "ol" otherwise, including when the property is absent
    :rtype: str
    """
    unordered_types = ("disc", "circle", "square", "none")
    if style.get("list-style-type") in unordered_types:
        return "ul"
    return "ol"
def google_has_height(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's style carries an explicit 'height' entry.

    Only the presence of the key matters; its value is not inspected.

    :type style: dict
    :rtype: bool
    """
    height_defined = "height" in style
    return height_defined
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Collect the values of the emphasis-related properties of an element.

    :type style: dict
    :returns: A list of all emphasis modifiers of the element, i.e. the
    values of text-decoration, font-style and font-weight (in that order)
    for those properties that are present
    :rtype: list
    """
    emphasis_properties = ("text-decoration", "font-style", "font-weight")
    return [style[name] for name in emphasis_properties if name in style]
def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's 'font-family' names a fixed width font.

    The value must be exactly "courier new" or "consolas"; a missing
    property counts as not fixed width.

    :type style: dict
    :rtype: bool
    """
    fixed_width_families = ("courier new", "consolas")
    return style.get("font-family", "") in fixed_width_families
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict
    :returns: The value of the ``start`` attribute minus one, or 0 when
    the attribute is absent, has no value, or is not an integer
    :rtype: int
    """
    start = attrs.get("start")
    # A valueless attribute (e.g. <ol start>) is reported as None; fall
    # back to the default instead of asserting.
    if start is None:
        return 0
    try:
        return int(start) - 1
    except ValueError:
        # Non-numeric start value: use the default numbering.
        return 0
def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
    """
    Decide whether a paragraph must be emitted without line wrapping.

    :type para: str
    :param wrap_links: when False, a paragraph matching config.RE_LINK is
    never wrapped
    :param wrap_list_items: when False, a paragraph starting with "-" or a
    single "*" is never wrapped
    :returns: True if the paragraph must not be wrapped
    :rtype: bool
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. startswith() also copes with an empty paragraph, where
    # indexing para[0] would raise IndexError.
    if para.startswith(("    ", "\t")):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # A leading "-" or a single leading "*" (but not "**") is governed by
    # wrap_list_items. Besides list items, a <br>-inside-<span> case in the
    # tests reportedly depends on this branch as well.
    if stripped[0:1] in ("-", "*") and stripped[0:2] != "**":
        return not wrap_list_items

    # Anything the configured ordered/unordered list matchers recognise at
    # the start of the text (leading whitespace already removed) is a
    # list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
def escape_md(text: str) -> str:
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.

    Every match of config.RE_MD_CHARS_MATCHER is replaced by a backslash
    followed by the text of its first group.
    """
    escaped = config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
    return escaped
def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Escapes markdown-sensitive characters across whole document sections.

    Backslash matches are escaped first; with ``snob`` set, the matches of
    config.RE_MD_CHARS_MATCHER_ALL are escaped as well. The dot, plus and
    dash matchers then get a backslash inserted between their first and
    second groups, in that order.
    """
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    prefixed_matchers = (
        config.RE_MD_DOT_MATCHER,
        config.RE_MD_PLUS_MATCHER,
        config.RE_MD_DASH_MATCHER,
    )
    for matcher in prefixed_matchers:
        text = matcher.sub(r"\1\\\2", text)

    return text
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table, pad the cells so that every column has a
    uniform width and return the new lines.

    :param lines: table rows whose cells are separated by "|"
    :param right_margin: number of filler characters kept after the widest
    cell of each column
    :returns: The padded rows, one per input line; an empty list for an
    empty table
    :rtype: list
    """
    # Nothing to measure or pad; lines[0] below would raise IndexError.
    if not lines:
        return []

    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            # This row has columns not seen so far; start tracking them.
            max_width += [len(x) + right_margin for x in cols[max_cols:]]
            max_cols = num_cols

        max_width = [
            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
        ]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        # A row made up solely of "-" and "|" is padded with "-" so the
        # rule spans the full column width; every other row uses spaces.
        filler = "-" if set(line.strip()) == set("-|") else " "
        # zip() stops at the row's own cell count, so short rows are not
        # extended with extra cells.
        new_cols = [x.ljust(width, filler) for x, width in zip(cols, max_width)]
        new_lines.append("|".join(new_cols))
    return new_lines
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text

    Lines containing config.TABLE_MARKER_FOR_PAD toggle between "inside a
    table" and "outside a table"; the marker lines themselves are removed.
    Lines collected inside a table are passed through reformat_table() and
    followed by one empty line; all other lines are kept unchanged.

    :param text: the text to process
    :param right_margin: forwarded to reformat_table()
    :returns: The text with its tables padded
    :rtype: str
    """
    lines = text.split("\n")
    table_buffer = []  # type: List[str]
    table_started = False
    new_lines = []  # type: List[str]
    for line in lines:
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Closing marker: emit the collected rows. A table with no
                # rows is skipped so reformat_table() is never handed an
                # empty list.
                if table_buffer:
                    new_lines.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append("")
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # A table whose closing marker never arrived would otherwise lose all
    # of its buffered rows; emit them rather than dropping content.
    if table_buffer:
        new_lines.extend(reformat_table(table_buffer, right_margin))
    return "\n".join(new_lines)
|