io.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. # $Id: io.py 9100 2022-07-04 21:06:49Z milde $
  2. # Author: David Goodger <goodger@python.org>
  3. # Copyright: This module has been placed in the public domain.
  4. """
  5. I/O classes provide a uniform API for low-level input and output. Subclasses
  6. exist for a variety of input/output mechanisms.
  7. """
  8. __docformat__ = 'reStructuredText'
  9. import codecs
  10. import locale
  11. import os
  12. import re
  13. import sys
  14. import warnings
  15. from docutils import TransformSpec
  16. # Guess the locale's preferred encoding.
  17. # If no valid guess can be made, _locale_encoding is set to `None`:
  18. #
  19. # TODO: check whether this is set correctly with every OS and Python version
  20. # or whether front-end tools need to call `locale.setlocale()`
  21. # before importing this module
  22. try:
  23. # Return locale encoding also in UTF-8 mode
  24. _locale_encoding = locale.getlocale()[1] or locale.getdefaultlocale()[1]
  25. _locale_encoding = _locale_encoding.lower()
  26. except ValueError as error: # OS X may set UTF-8 without language code
  27. # See https://bugs.python.org/issue18378 fixed in 3.8
  28. # and https://sourceforge.net/p/docutils/bugs/298/.
  29. # Drop the special case after requiring Python >= 3.8
  30. if "unknown locale: UTF-8" in error.args:
  31. _locale_encoding = "utf-8"
  32. else:
  33. _locale_encoding = None
  34. except: # noqa any other problems determining the locale -> use None
  35. _locale_encoding = None
  36. try:
  37. codecs.lookup(_locale_encoding)
  38. except (LookupError, TypeError):
  39. _locale_encoding = None
  40. class InputError(OSError): pass
  41. class OutputError(OSError): pass
  42. def check_encoding(stream, encoding):
  43. """Test, whether the encoding of `stream` matches `encoding`.
  44. Returns
  45. :None: if `encoding` or `stream.encoding` are not a valid encoding
  46. argument (e.g. ``None``) or `stream.encoding is missing.
  47. :True: if the encoding argument resolves to the same value as `encoding`,
  48. :False: if the encodings differ.
  49. """
  50. try:
  51. return codecs.lookup(stream.encoding) == codecs.lookup(encoding)
  52. except (LookupError, AttributeError, TypeError):
  53. return None
  54. def error_string(err):
  55. """Return string representation of Exception `err`.
  56. """
  57. return f'{err.__class__.__name__}: {err}'
  58. class Input(TransformSpec):
  59. """
  60. Abstract base class for input wrappers.
  61. """
  62. component_type = 'input'
  63. default_source_path = None
  64. def __init__(self, source=None, source_path=None, encoding=None,
  65. error_handler='strict'):
  66. self.encoding = encoding
  67. """Text encoding for the input source."""
  68. self.error_handler = error_handler
  69. """Text decoding error handler."""
  70. self.source = source
  71. """The source of input data."""
  72. self.source_path = source_path
  73. """A text reference to the source."""
  74. if not source_path:
  75. self.source_path = self.default_source_path
  76. self.successful_encoding = None
  77. """The encoding that successfully decoded the source data."""
  78. def __repr__(self):
  79. return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
  80. self.source_path)
  81. def read(self):
  82. raise NotImplementedError
  83. def decode(self, data):
  84. """
  85. Decode `data` if required.
  86. Return Unicode `str` instances unchanged (nothing to decode).
  87. If `self.encoding` is None, determine encoding from data
  88. or try UTF-8, locale encoding, and (as last ressort) 'latin-1'.
  89. The client application should call ``locale.setlocale`` at the
  90. beginning of processing::
  91. locale.setlocale(locale.LC_ALL, '')
  92. Raise UnicodeError if unsuccessful.
  93. """
  94. if self.encoding and self.encoding.lower() == 'unicode':
  95. assert isinstance(data, str), ('input encoding is "unicode" '
  96. 'but input is not a `str` object')
  97. if isinstance(data, str):
  98. # nothing to decode
  99. return data
  100. if self.encoding:
  101. # We believe the user/application when the encoding is
  102. # explicitly given.
  103. encoding_candidates = [self.encoding]
  104. else:
  105. data_encoding = self.determine_encoding_from_data(data)
  106. if data_encoding:
  107. # If the data declares its encoding (explicitly or via a BOM),
  108. # we believe it.
  109. encoding_candidates = [data_encoding]
  110. else:
  111. # Apply heuristics only if no encoding is explicitly given and
  112. # no BOM found. Start with UTF-8, because that only matches
  113. # data that *IS* UTF-8:
  114. encoding_candidates = ['utf-8']
  115. # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
  116. # to respect UTF-8 mode (API change).
  117. # (Check if it is a valid encoding and not UTF-8)
  118. if _locale_encoding and _locale_encoding != 'utf-8':
  119. encoding_candidates.append(_locale_encoding)
  120. # TODO: don't fall back to 'latin-1' (API change).
  121. encoding_candidates.append('latin-1')
  122. for enc in encoding_candidates:
  123. try:
  124. decoded = str(data, enc, self.error_handler)
  125. self.successful_encoding = enc
  126. # Return decoded, removing BOM and other ZWNBSPs.
  127. # TODO: only remove BOM (ZWNBSP at start of data, API change).
  128. return decoded.replace('\ufeff', '')
  129. except (UnicodeError, LookupError) as err:
  130. # keep exception instance for use outside of the "for" loop.
  131. error = err
  132. raise UnicodeError(
  133. 'Unable to decode input data. Tried the following encodings: '
  134. f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
  135. f'({error_string(error)})')
  136. coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
  137. """Encoding declaration pattern."""
  138. byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
  139. (codecs.BOM_UTF16_BE, 'utf-16-be'),
  140. (codecs.BOM_UTF16_LE, 'utf-16-le'),)
  141. """Sequence of (start_bytes, encoding) tuples for encoding detection.
  142. The first bytes of input data are checked against the start_bytes strings.
  143. A match indicates the given encoding."""
  144. def determine_encoding_from_data(self, data):
  145. """
  146. Try to determine the encoding of `data` by looking *in* `data`.
  147. Check for a byte order mark (BOM) or an encoding declaration.
  148. """
  149. # check for a byte order mark:
  150. for start_bytes, encoding in self.byte_order_marks:
  151. if data.startswith(start_bytes):
  152. return encoding
  153. # check for an encoding declaration pattern in first 2 lines of file:
  154. for line in data.splitlines()[:2]:
  155. match = self.coding_slug.search(line)
  156. if match:
  157. return match.group(1).decode('ascii')
  158. return None
  159. def isatty(self):
  160. try:
  161. return self.source.isatty()
  162. except AttributeError:
  163. return False
  164. class Output(TransformSpec):
  165. """
  166. Abstract base class for output wrappers.
  167. """
  168. component_type = 'output'
  169. default_destination_path = None
  170. def __init__(self, destination=None, destination_path=None,
  171. encoding=None, error_handler='strict'):
  172. self.encoding = encoding
  173. """Text encoding for the output destination."""
  174. self.error_handler = error_handler or 'strict'
  175. """Text encoding error handler."""
  176. self.destination = destination
  177. """The destination for output data."""
  178. self.destination_path = destination_path
  179. """A text reference to the destination."""
  180. if not destination_path:
  181. self.destination_path = self.default_destination_path
  182. def __repr__(self):
  183. return ('%s: destination=%r, destination_path=%r'
  184. % (self.__class__, self.destination, self.destination_path))
  185. def write(self, data):
  186. """`data` is a Unicode string, to be encoded by `self.encode`."""
  187. raise NotImplementedError
  188. def encode(self, data):
  189. if self.encoding and self.encoding.lower() == 'unicode':
  190. assert isinstance(data, str), (
  191. 'the encoding given is "unicode" but the output is not '
  192. 'a Unicode string')
  193. return data
  194. if not isinstance(data, str):
  195. # Non-unicode (e.g. bytes) output.
  196. return data
  197. else:
  198. return data.encode(self.encoding, self.error_handler)
  199. class ErrorOutput:
  200. """
  201. Wrapper class for file-like error streams with
  202. failsafe de- and encoding of `str`, `bytes`, `unicode` and
  203. `Exception` instances.
  204. """
  205. def __init__(self, destination=None, encoding=None,
  206. encoding_errors='backslashreplace',
  207. decoding_errors='replace'):
  208. """
  209. :Parameters:
  210. - `destination`: a file-like object,
  211. a string (path to a file),
  212. `None` (write to `sys.stderr`, default), or
  213. evaluating to `False` (write() requests are ignored).
  214. - `encoding`: `destination` text encoding. Guessed if None.
  215. - `encoding_errors`: how to treat encoding errors.
  216. """
  217. if destination is None:
  218. destination = sys.stderr
  219. elif not destination:
  220. destination = False
  221. # if `destination` is a file name, open it
  222. elif isinstance(destination, str):
  223. destination = open(destination, 'w')
  224. self.destination = destination
  225. """Where warning output is sent."""
  226. self.encoding = (encoding or getattr(destination, 'encoding', None)
  227. or _locale_encoding or 'ascii')
  228. """The output character encoding."""
  229. self.encoding_errors = encoding_errors
  230. """Encoding error handler."""
  231. self.decoding_errors = decoding_errors
  232. """Decoding error handler."""
  233. def write(self, data):
  234. """
  235. Write `data` to self.destination. Ignore, if self.destination is False.
  236. `data` can be a `bytes`, `str`, or `Exception` instance.
  237. """
  238. if not self.destination:
  239. return
  240. if isinstance(data, Exception):
  241. data = str(data)
  242. try:
  243. self.destination.write(data)
  244. except UnicodeEncodeError:
  245. self.destination.write(data.encode(self.encoding,
  246. self.encoding_errors))
  247. except TypeError:
  248. if isinstance(data, str): # destination may expect bytes
  249. self.destination.write(data.encode(self.encoding,
  250. self.encoding_errors))
  251. elif self.destination in (sys.stderr, sys.stdout):
  252. # write bytes to raw stream
  253. self.destination.buffer.write(data)
  254. else:
  255. self.destination.write(str(data, self.encoding,
  256. self.decoding_errors))
  257. def close(self):
  258. """
  259. Close the error-output stream.
  260. Ignored if the destination is` sys.stderr` or `sys.stdout` or has no
  261. close() method.
  262. """
  263. if self.destination in (sys.stdout, sys.stderr):
  264. return
  265. try:
  266. self.destination.close()
  267. except AttributeError:
  268. pass
  269. def isatty(self):
  270. try:
  271. return self.destination.isatty()
  272. except AttributeError:
  273. return False
  274. class FileInput(Input):
  275. """
  276. Input for single, simple file-like objects.
  277. """
  278. def __init__(self, source=None, source_path=None,
  279. encoding=None, error_handler='strict',
  280. autoclose=True, mode='r'):
  281. """
  282. :Parameters:
  283. - `source`: either a file-like object (which is read directly), or
  284. `None` (which implies `sys.stdin` if no `source_path` given).
  285. - `source_path`: a path to a file, which is opened and then read.
  286. - `encoding`: the expected text encoding of the input file.
  287. - `error_handler`: the encoding error handler to use.
  288. - `autoclose`: close automatically after read (except when
  289. `sys.stdin` is the source).
  290. - `mode`: how the file is to be opened (see standard function
  291. `open`). The default is read only ('r').
  292. """
  293. Input.__init__(self, source, source_path, encoding, error_handler)
  294. self.autoclose = autoclose
  295. self._stderr = ErrorOutput()
  296. if source is None:
  297. if source_path:
  298. try:
  299. self.source = open(source_path, mode,
  300. encoding=self.encoding or 'utf-8-sig',
  301. errors=self.error_handler)
  302. except OSError as error:
  303. raise InputError(error.errno, error.strerror, source_path)
  304. else:
  305. self.source = sys.stdin
  306. elif check_encoding(self.source, self.encoding) is False:
  307. # TODO: re-open, warn or raise error?
  308. raise UnicodeError('Encoding clash: encoding given is "%s" '
  309. 'but source is opened with encoding "%s".' %
  310. (self.encoding, self.source.encoding))
  311. if not source_path:
  312. try:
  313. self.source_path = self.source.name
  314. except AttributeError:
  315. pass
  316. def read(self):
  317. """
  318. Read and decode a single file and return the data (Unicode string).
  319. """
  320. try:
  321. if self.source is sys.stdin:
  322. # read as binary data to circumvent auto-decoding
  323. data = self.source.buffer.read()
  324. else:
  325. data = self.source.read()
  326. except (UnicodeError, LookupError):
  327. if not self.encoding and self.source_path:
  328. # re-read in binary mode and decode with heuristics
  329. b_source = open(self.source_path, 'rb')
  330. data = b_source.read()
  331. b_source.close()
  332. else:
  333. raise
  334. finally:
  335. if self.autoclose:
  336. self.close()
  337. data = self.decode(data)
  338. # normalise newlines
  339. return '\n'.join(data.splitlines()+[''])
  340. def readlines(self):
  341. """
  342. Return lines of a single file as list of Unicode strings.
  343. """
  344. return self.read().splitlines(True)
  345. def close(self):
  346. if self.source is not sys.stdin:
  347. self.source.close()
class FileOutput(Output):
    """
    Output for single, simple file-like objects.

    A destination given as path is opened lazily by the first `write()`.
    """

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        # Set to False below if the file still has to be opened by `write()`.
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            # instance attribute overrides the class default
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # only a path is known: defer opening until `write()`
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            # A mode mismatch is only reported, not treated as an error.
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            # fall back to the name of the file object, if it has one
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """
        Open `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            # binary mode: no text-layer arguments are passed to `open()`
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Encode `data`, write it to a single file, and return it.

        With Python 3 or binary output mode, `data` is returned unchanged,
        except when specified encoding and output encoding differ.

        Raise `UnicodeError` if writing fails with an encoding error.
        """
        if not self.opened:
            self.open()
        if ('b' not in self.mode
            and check_encoding(self.destination, self.encoding) is False):
            # Text-mode destination with a different encoding:
            # encode here and write the bytes (see fallback below).
            data = self.encode(data)
            if os.linesep != '\n':
                # fix endings
                data = data.replace(b'\n', bytes(os.linesep, 'ascii'))

        try:
            self.destination.write(data)
        except TypeError as err:
            if isinstance(data, bytes):
                # text stream refused bytes: try its underlying binary buffer
                try:
                    self.destination.buffer.write(data)
                except AttributeError:
                    if check_encoding(self.destination,
                                      self.encoding) is False:
                        raise ValueError(
                            'Encoding of %s (%s) differs \n'
                            ' from specified encoding (%s)' %
                            (self.destination_path or 'destination',
                             self.destination.encoding,
                             self.encoding))
                    else:
                        raise err
            # NOTE(review): with the nesting as reconstructed here, a
            # TypeError for non-`bytes` data is not re-raised -- confirm
            # against the upstream file that this is intended.
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                '%s.\n(%s)' % (self.encoding, error_string(err)))
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
  458. class BinaryFileOutput(FileOutput):
  459. """
  460. A version of docutils.io.FileOutput which writes to a binary file.
  461. """
  462. # Used by core.publish_cmdline_to_binary() which in turn is used by
  463. # rst2odt (OpenOffice writer)
  464. mode = 'wb'
  465. class StringInput(Input):
  466. """
  467. Direct string input.
  468. """
  469. default_source_path = '<string>'
  470. def read(self):
  471. """Decode and return the source string."""
  472. return self.decode(self.source)
  473. class StringOutput(Output):
  474. """
  475. Direct string output.
  476. """
  477. default_destination_path = '<string>'
  478. def write(self, data):
  479. """Encode `data`, store it in `self.destination`, and return it."""
  480. self.destination = self.encode(data)
  481. return self.destination
  482. class NullInput(Input):
  483. """
  484. Degenerate input: read nothing.
  485. """
  486. default_source_path = 'null input'
  487. def read(self):
  488. """Return a null string."""
  489. return ''
  490. class NullOutput(Output):
  491. """
  492. Degenerate output: write nothing.
  493. """
  494. default_destination_path = 'null output'
  495. def write(self, data):
  496. """Do nothing ([don't even] send data to the bit bucket)."""
  497. pass
  498. class DocTreeInput(Input):
  499. """
  500. Adapter for document tree input.
  501. The document tree must be passed in the ``source`` parameter.
  502. """
  503. default_source_path = 'doctree input'
  504. def read(self):
  505. """Return the document tree."""
  506. return self.source