1
0

docutils_xml.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. # $Id: docutils_xml.py 9038 2022-03-05 23:31:46Z milde $
  2. # Author: David Goodger, Paul Tremblay, Guenter Milde
  3. # Maintainer: docutils-develop@lists.sourceforge.net
  4. # Copyright: This module has been placed in the public domain.
  5. """
  6. Simple document tree Writer, writes Docutils XML according to
  7. https://docutils.sourceforge.io/docs/ref/docutils.dtd.
  8. """
  9. __docformat__ = 'reStructuredText'
  10. from io import StringIO
  11. import xml.sax.saxutils
  12. import docutils
  13. from docutils import frontend, writers, nodes
  14. class RawXmlError(docutils.ApplicationError):
  15. pass
  16. class Writer(writers.Writer):
  17. supported = ('xml',)
  18. """Formats this writer supports."""
  19. settings_spec = (
  20. '"Docutils XML" Writer Options',
  21. None,
  22. (('Generate XML with newlines before and after tags.',
  23. ['--newlines'],
  24. {'action': 'store_true', 'validator': frontend.validate_boolean}),
  25. ('Generate XML with indents and newlines.',
  26. ['--indents'], # TODO use integer value for number of spaces?
  27. {'action': 'store_true', 'validator': frontend.validate_boolean}),
  28. ('Omit the XML declaration. Use with caution.',
  29. ['--no-xml-declaration'],
  30. {'dest': 'xml_declaration', 'default': 1, 'action': 'store_false',
  31. 'validator': frontend.validate_boolean}),
  32. ('Omit the DOCTYPE declaration.',
  33. ['--no-doctype'],
  34. {'dest': 'doctype_declaration', 'default': 1,
  35. 'action': 'store_false', 'validator': frontend.validate_boolean}),))
  36. settings_defaults = {'output_encoding_error_handler': 'xmlcharrefreplace'}
  37. config_section = 'docutils_xml writer'
  38. config_section_dependencies = ('writers',)
  39. output = None
  40. """Final translated form of `document`."""
  41. def __init__(self):
  42. writers.Writer.__init__(self)
  43. self.translator_class = XMLTranslator
  44. def translate(self):
  45. self.visitor = visitor = self.translator_class(self.document)
  46. self.document.walkabout(visitor)
  47. self.output = ''.join(visitor.output)
  48. class XMLTranslator(nodes.GenericNodeVisitor):
  49. xml_declaration = '<?xml version="1.0" encoding="%s"?>\n'
  50. # TODO: add stylesheet options similar to HTML and LaTeX writers?
  51. # xml_stylesheet = '<?xml-stylesheet type="text/xsl" href="%s"?>\n'
  52. doctype = (
  53. '<!DOCTYPE document PUBLIC'
  54. ' "+//IDN docutils.sourceforge.net//DTD Docutils Generic//EN//XML"'
  55. ' "http://docutils.sourceforge.net/docs/ref/docutils.dtd">\n')
  56. generator = '<!-- Generated by Docutils %s -->\n'
  57. xmlparser = xml.sax.make_parser()
  58. """SAX parser instance to check/extract raw XML."""
  59. xmlparser.setFeature(
  60. "http://xml.org/sax/features/external-general-entities", True)
  61. def __init__(self, document):
  62. nodes.NodeVisitor.__init__(self, document)
  63. # Reporter
  64. self.warn = self.document.reporter.warning
  65. self.error = self.document.reporter.error
  66. # Settings
  67. self.settings = settings = document.settings
  68. self.indent = self.newline = ''
  69. if settings.newlines:
  70. self.newline = '\n'
  71. if settings.indents:
  72. self.newline = '\n'
  73. self.indent = ' ' # TODO make this configurable?
  74. self.level = 0 # indentation level
  75. self.in_simple = 0 # level of nesting inside mixed-content elements
  76. self.fixed_text = 0 # level of nesting inside FixedText elements
  77. # Output
  78. self.output = []
  79. if settings.xml_declaration:
  80. self.output.append(
  81. self.xml_declaration % settings.output_encoding)
  82. if settings.doctype_declaration:
  83. self.output.append(self.doctype)
  84. self.output.append(self.generator % docutils.__version__)
  85. # initialize XML parser
  86. self.the_handle = TestXml()
  87. self.xmlparser.setContentHandler(self.the_handle)
  88. # generic visit and depart methods
  89. # --------------------------------
  90. simple_nodes = (nodes.TextElement,
  91. nodes.image, nodes.colspec, nodes.transition)
  92. def default_visit(self, node):
  93. """Default node visit method."""
  94. if not self.in_simple:
  95. self.output.append(self.indent*self.level)
  96. self.output.append(node.starttag(xml.sax.saxutils.quoteattr))
  97. self.level += 1
  98. # @@ make nodes.literal an instance of FixedTextElement?
  99. if isinstance(node, (nodes.FixedTextElement, nodes.literal)):
  100. self.fixed_text += 1
  101. if isinstance(node, self.simple_nodes):
  102. self.in_simple += 1
  103. if not self.in_simple:
  104. self.output.append(self.newline)
  105. def default_departure(self, node):
  106. """Default node depart method."""
  107. self.level -= 1
  108. if not self.in_simple:
  109. self.output.append(self.indent*self.level)
  110. self.output.append(node.endtag())
  111. if isinstance(node, (nodes.FixedTextElement, nodes.literal)):
  112. self.fixed_text -= 1
  113. if isinstance(node, self.simple_nodes):
  114. self.in_simple -= 1
  115. if not self.in_simple:
  116. self.output.append(self.newline)
  117. # specific visit and depart methods
  118. # ---------------------------------
  119. def visit_Text(self, node):
  120. text = xml.sax.saxutils.escape(node.astext())
  121. # indent text if we are not in a FixedText element:
  122. if not self.fixed_text:
  123. text = text.replace('\n', '\n'+self.indent*self.level)
  124. self.output.append(text)
  125. def depart_Text(self, node):
  126. pass
  127. def visit_raw(self, node):
  128. if 'xml' not in node.get('format', '').split():
  129. # skip other raw content?
  130. # raise nodes.SkipNode
  131. self.default_visit(node)
  132. return
  133. # wrap in <raw> element
  134. self.default_visit(node) # or not?
  135. xml_string = node.astext()
  136. self.output.append(xml_string)
  137. self.default_departure(node) # or not?
  138. # Check validity of raw XML:
  139. try:
  140. self.xmlparser.parse(StringIO(xml_string))
  141. except xml.sax._exceptions.SAXParseException:
  142. col_num = self.the_handle.locator.getColumnNumber()
  143. line_num = self.the_handle.locator.getLineNumber()
  144. srcline = node.line
  145. if not isinstance(node.parent, nodes.TextElement):
  146. srcline += 2 # directive content start line
  147. msg = 'Invalid raw XML in column %d, line offset %d:\n%s' % (
  148. col_num, line_num, node.astext())
  149. self.warn(msg, source=node.source, line=srcline+line_num-1)
  150. raise nodes.SkipNode # content already processed
  151. class TestXml(xml.sax.handler.ContentHandler):
  152. def setDocumentLocator(self, locator):
  153. self.locator = locator