123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545 |
- # $Id: frontmatter.py 9030 2022-03-05 23:28:32Z milde $
- # Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
- # Copyright: This module has been placed in the public domain.
- """
Transforms related to the front matter of a document or a section
(information found before the main text):

- `DocTitle`: Used to transform a lone top-level section's title to
  the document title, promote a remaining lone top-level section's
  title to the document subtitle, and determine the document's title
  metadata (document['title']) based on the document title and/or the
  "title" setting.
- `SectionSubTitle`: Used to transform a lone subsection into a
  subtitle.
- `DocInfo`: Used to transform a bibliographic field list into docinfo
  elements.
- """
- __docformat__ = 'reStructuredText'
- import re
- from docutils import nodes, utils
- from docutils.transforms import TransformError, Transform
class TitlePromoter(Transform):
    """
    Shared machinery for the `DocTitle` and `SectionSubTitle` transforms.

    This is an abstract base class; it only provides the promotion
    helpers used by its subclasses.
    """

    def promote_title(self, node):
        """
        Lift the title of a lone child section up into `node`.

        The tree ::

            <node>
                <section>
                    <title>
                    ...

        is rewritten to ::

            <node>
                <title>
                ...

        `node` is normally a document.

        Return True if a section was promoted, False if no promotion
        candidate was found.  Raise `TypeError` if `node` is not a
        `nodes.Element`.
        """
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        # A node that already starts with a title must not get another.
        assert not len(node) or not isinstance(node[0], nodes.title)

        lone_section, position = self.candidate_index(node)
        if position is None:
            return False

        # Merge the section's attributes into the node.
        # NOTE: `replace=True` lets the section's values replace
        #       attributes that already exist in node; pass False to
        #       keep the node's values instead.
        # NOTE: `and_source=True` also copies the 'source' attribute
        #       from the section; drop it to NOT copy that attribute.
        node.update_all_atts_concatenating(
            lone_section, replace=True, and_source=True)

        # Rebuild the child list (setup_child is called automatically
        # for all nodes).
        title_part = lone_section[:1]      # the section title
        leading_part = node[:position]     # what preceded the section
        remaining_part = lone_section[1:]  # the rest of the section
        node[:] = title_part + leading_part + remaining_part
        assert isinstance(node[0], nodes.title)
        return True

    def promote_subtitle(self, node):
        """
        Turn the title of a lone child section into a subtitle of `node`.

        The tree ::

            <node>
                <title>
                <section>
                    <title>
                    ...

        is rewritten to ::

            <node>
                <title>
                <subtitle>
                ...

        Return True if a subsection was promoted, False if no promotion
        candidate was found.  Raise `TypeError` if `node` is not a
        `nodes.Element`.
        """
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        lone_subsection, position = self.candidate_index(node)
        if position is None:
            return False

        new_subtitle = nodes.subtitle()

        # Give the new subtitle the subsection's attributes.
        # NOTE: `replace=True` lets the subsection's values replace
        #       attributes that already exist on the target; pass False
        #       to keep the existing values instead.
        # NOTE: `and_source=True` also copies the 'source' attribute
        #       from the subsection; drop it to NOT copy that attribute.
        new_subtitle.update_all_atts_concatenating(
            lone_subsection, replace=True, and_source=True)

        # The subtitle takes over the children of the subsection's title.
        new_subtitle[:] = lone_subsection[0][:]

        title_part = node[:1]               # the existing title
        leading_part = node[1:position]     # what preceded the subsection
        remaining_part = lone_subsection[1:]  # the rest of the subsection
        node[:] = title_part + [new_subtitle] + leading_part + remaining_part
        return True

    def candidate_index(self, node):
        """
        Locate the section eligible for promotion inside `node`.

        The candidate is the first child that is not a
        `nodes.PreBibliographic` instance; it qualifies only if it is
        also the last child of `node` and is a `nodes.section`.

        Return the tuple ``(candidate, index)``, or ``(None, None)`` if
        there is no valid candidate.
        """
        position = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        if position is None:
            return None, None
        if len(node) > position + 1:
            # Something follows the candidate: it is not a lone section.
            return None, None
        candidate = node[position]
        if isinstance(candidate, nodes.section):
            return candidate, position
        return None, None
class DocTitle(TitlePromoter):
    """
    Promote section titles to the document title and subtitle.

    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly. Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its
       first non-comment element, the top-level section's title
       becomes the document's title, and the top-level section's
       contents become the document's immediate contents. The lone
       top-level section header must be the first non-comment element
       in the document.

       For example, take this input text::

           =================
            Top-Level Title
           =================

           A paragraph.

       Once parsed, it looks like this::

           <document>
               <section names="top-level title">
                   <title>
                       Top-Level Title
                   <paragraph>
                       A paragraph.

       After running the DocTitle transform, we have::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <paragraph>
                   A paragraph.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single
       second-level section as its first non-comment element, that
       section's title is promoted to the document's subtitle, and
       that section's contents become the document's immediate
       contents. Given this input text::

           =================
            Top-Level Title
           =================

           Second-Level Title
           ~~~~~~~~~~~~~~~~~~

           A paragraph.

       After parsing and running the Section Promotion transform, the
       result is::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <subtitle names="second-level title">
                   Second-Level Title
               <paragraph>
                   A paragraph.

       (Note that the implicit hyperlink target generated by the
       "Second-Level Title" is preserved on the "subtitle" element
       itself.)

    Any comment elements occurring before the document title or
    subtitle are accumulated and inserted as the first body elements
    after the title(s).

    This transform also sets the document's metadata title
    (document['title']).

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    """

    default_priority = 320

    def set_metadata(self):
        """
        Fill in the document['title'] metadata title.

        The value comes from the first available of these sources:

        * An already existing document['title'] attribute (left as is).
        * The "title" setting.
        * The document title node (as promoted by promote_title).
        """
        document = self.document
        if document.hasattr('title'):
            return
        configured_title = document.settings.title
        if configured_title is not None:
            document['title'] = configured_title
            return
        if len(document) and isinstance(document[0], nodes.title):
            document['title'] = document[0].astext()

    def apply(self):
        """
        Run the title/subtitle promotion, then set the metadata title.

        Promotion is skipped when the "doctitle_xform" setting is
        false; the metadata title is set in either case.
        """
        document = self.document
        promotion_enabled = document.settings.setdefault(
            'doctitle_xform', True)
        # promote_title/promote_subtitle come from the TitlePromoter
        # base class; a subtitle is only attempted after a title has
        # actually been promoted.
        if promotion_enabled and self.promote_title(document):
            self.promote_subtitle(document)
        self.set_metadata()
class SectionSubTitle(TitlePromoter):
    """
    Promote lone subsections to section subtitles.

    This works like document subtitles, but for sections. For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle
                ...

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle
            ...

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self):
        """
        Try a subtitle promotion on every section of the document.

        Does nothing when the "sectsubtitle_xform" setting is false.
        """
        document = self.document
        if document.settings.setdefault('sectsubtitle_xform', True):
            # Consume findall() lazily: the tree is modified while we
            # walk it, but only in the part not yet visited, so the
            # iterator is not corrupted.
            for section_node in document.findall(nodes.section):
                self.promote_subtitle(section_node)
class DocInfo(Transform):
    """
    Transform a bibliographic field list into docinfo elements.

    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description. This transform
    should be run *after* the `DocTitle` transform.

    Given a field list as the first non-comment element after the
    document title and subtitle (if present), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the "docinfo" element (except for a
    dedication and/or an abstract, which become "topic" elements after
    "docinfo").

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile$
            ...

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py
            ...

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform. The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $". Any
    RCS keyword can be processed in any bibliographic field. The
    dollar signs and leading RCS keyword name are removed. Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended. The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone). The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

      (Since the source file for this text is itself stored under CVS,
      we can't show an example of the "Date" RCS keyword because we
      can't prevent any RCS keywords used in this explanation from
      being expanded. Only the "RCSfile" keyword is stable; its
      expansion text changes only if the file name changes.)

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    .. _reStructuredText Markup Specification:
       https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
          'author': nodes.author,
          'authors': nodes.authors,
          'organization': nodes.organization,
          'address': nodes.address,
          'contact': nodes.contact,
          'version': nodes.version,
          'revision': nodes.revision,
          'status': nodes.status,
          'date': nodes.date,
          'copyright': nodes.copyright,
          'dedication': nodes.topic,
          'abstract': nodes.topic}
    """Canonical field name (lowcased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self):
        """
        Replace a leading bibliographic field list with docinfo nodes.

        Does nothing when the "docinfo_xform" setting is false, when
        every child of the document is a `nodes.PreBibliographic`
        instance, or when the first other child is not a field list.
        """
        if not self.document.settings.setdefault('docinfo_xform', True):
            return
        document = self.document
        index = document.first_child_not_matching_class(
            nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            # Insertion point for the result: the first child that is
            # neither titular, decorative, nor a meta node.
            biblioindex = document.first_child_not_matching_class(
                (nodes.Titular, nodes.Decorative, nodes.meta))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]         # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist

    def extract_bibliographic(self, field_list):
        """
        Convert the fields of `field_list` and return the new nodes.

        Return a list holding the "docinfo" element (if it has any
        children), followed by the "dedication" and "abstract" topics
        (if present, in that order).

        A field that cannot be converted (a `TransformError` is raised
        while processing it) is appended to the docinfo unchanged,
        apart from RCS keyword clean-up and an added class value
        derived from its normalized name.
        """
        docinfo = nodes.docinfo()
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                          field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    title[0].rawsource = labels[canonical]
                    topics[canonical] = biblioclass(
                        '', title, classes=[canonical], *field[1].children)
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                # Keep the field as a generic docinfo field.
                if len(field[-1]) == 1 \
                       and isinstance(field[-1][0], nodes.paragraph):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                # Tag the untransformed field with a class value made
                # from its normalized name (skipped if that is empty).
                classvalue = nodes.make_id(normedname)
                if classvalue:
                    field['classes'].append(classvalue)
                docinfo.append(field)
        nodelist = []
        if len(docinfo) != 0:
            nodelist.append(docinfo)
        for name in ('dedication', 'abstract'):
            if topics[name]:
                nodelist.append(topics[name])
        return nodelist

    def check_empty_biblio_field(self, field, name):
        """
        Check that the last child of `field` is not empty.

        Return 1 if it has at least one child.  Otherwise append a
        warning to it and return None.
        """
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                  'Cannot extract empty bibliographic field "%s".' % name,
                  base_node=field)
            return None
        return 1

    def check_compound_biblio_field(self, field, name):
        """
        Check that the last child of `field` holds a single paragraph.

        Return 1 if it does.  Otherwise append a warning to it and
        return None.
        """
        if len(field[-1]) > 1:
            field[-1] += self.document.reporter.warning(
                  'Cannot extract compound bibliographic field "%s".' % name,
                  base_node=field)
            return None
        if not isinstance(field[-1][0], nodes.paragraph):
            field[-1] += self.document.reporter.warning(
                  'Cannot extract bibliographic field "%s" containing '
                  'anything other than a single paragraph.' % name,
                  base_node=field)
            return None
        return 1

    # (pattern, replacement) pairs passed to `utils.clean_rcs_keywords`.
    # The leading r'\$' is a separate literal so that the keyword
    # patterns in this source are not themselves of the "$keyword:" form.
    rcs_keyword_substitutions = [
          (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                      r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
          (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
          (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]

    def extract_authors(self, field, name, docinfo):
        """
        Append an "authors" element built from `field` to `docinfo`.

        The field body may be a single paragraph, a single bullet
        list, or several paragraphs.  If no author can be extracted, a
        warning is appended to the last child of `field` and the
        `TransformError` is re-raised.
        """
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            # Skip empty author entries.
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                  'Bibliographic field "%s" incompatible with extraction: '
                  'it must contain either a single paragraph (with authors '
                  'separated by one of "%s"), multiple paragraphs (one per '
                  'author), or a bullet list with one paragraph (one author) '
                  'per item.'
                  % (name, ''.join(self.language.author_separators)),
                  base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """Return list of Text nodes with author names in `field`.

        Author names must be separated by one of the "author separators"
        defined for the document language (default: ";" or ",").
        The separators are tried in order; the first one that actually
        splits the text is used.  Raise `TransformError` if the field
        body contains no text.
        """
        # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
        text = ''.join(str(node)
                       for node in field[1].findall(nodes.Text))
        if not text:
            raise TransformError
        # Default to one author so that the result is defined even if
        # the language declares no separators at all.
        authornames = [text]
        for authorsep in self.language.author_separators:
            # Separators are literal strings, so escape them before
            # building the pattern.  Don't split at escaped `authorsep`
            # (i.e. one preceded by a null character):
            pattern = '(?<!\x00)%s' % re.escape(authorsep)
            authornames = re.split(pattern, text)
            if len(authornames) > 1:
                break
        authornames = (name.strip() for name in authornames)
        return [[nodes.Text(name)] for name in authornames if name]

    def authors_from_bullet_list(self, field):
        """
        Return one list of child nodes per bullet list item in `field`.

        Comments in the list are skipped.  Raise `TransformError` if an
        item is not exactly one paragraph or if no author is found.
        """
        authors = []
        for item in field[1][0]:
            if isinstance(item, nodes.comment):
                continue
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        """
        Return one list of child nodes per paragraph in the field body.

        Comments are skipped.  Raise `TransformError` if the body
        contains anything other than paragraphs and comments.
        """
        for item in field[1]:
            if not isinstance(item, (nodes.paragraph, nodes.comment)):
                raise TransformError
        authors = [item.children for item in field[1]
                   if not isinstance(item, nodes.comment)]
        return authors
|