123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611 |
- # $Id: io.py 9100 2022-07-04 21:06:49Z milde $
- # Author: David Goodger <goodger@python.org>
- # Copyright: This module has been placed in the public domain.
- """
- I/O classes provide a uniform API for low-level input and output. Subclasses
- exist for a variety of input/output mechanisms.
- """
- __docformat__ = 'reStructuredText'
- import codecs
- import locale
- import os
- import re
- import sys
- import warnings
- from docutils import TransformSpec
# Guess the locale's preferred encoding.
# If no valid guess can be made, `_locale_encoding` is set to `None`.
#
# TODO: check whether this is set correctly with every OS and Python version
#       or whether front-end tools need to call `locale.setlocale()`
#       before importing this module
try:
    # Return locale encoding also in UTF-8 mode.
    # `locale.getdefaultlocale()` is deprecated since Python 3.11; silence
    # its DeprecationWarning so that merely importing this module stays quiet
    # (and does not fail when warnings are turned into errors).
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', DeprecationWarning)
        _locale_encoding = (locale.getlocale()[1]
                            or locale.getdefaultlocale()[1])
    # Raises AttributeError if no encoding was found (`None`);
    # handled by the generic handler below.
    _locale_encoding = _locale_encoding.lower()
except ValueError as error:  # OS X may set UTF-8 without language code
    # See https://bugs.python.org/issue18378 fixed in 3.8
    # and https://sourceforge.net/p/docutils/bugs/298/.
    # Drop the special case after requiring Python >= 3.8
    if "unknown locale: UTF-8" in error.args:
        _locale_encoding = "utf-8"
    else:
        _locale_encoding = None
except Exception:  # any other problems determining the locale -> use None
    # `Exception` (not a bare ``except``) so that KeyboardInterrupt and
    # SystemExit are not swallowed during import.
    _locale_encoding = None
# Discard the guess if Python has no codec for it
# (`codecs.lookup(None)` raises TypeError).
try:
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    _locale_encoding = None
class InputError(OSError):
    """`OSError` subclass for problems with an input source."""
class OutputError(OSError):
    """`OSError` subclass for problems with an output destination."""
def check_encoding(stream, encoding):
    """Compare the encoding of `stream` with `encoding`.

    Both names are resolved with `codecs.lookup()`, so aliases of the same
    codec (e.g. ``utf8`` and ``utf-8``) compare equal.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing,
    :True:  if `stream.encoding` resolves to the same codec as `encoding`,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # unknown codec name, no `encoding` attribute, or a non-string value
        return None
    return stream_codec == wanted_codec
def error_string(err):
    """Format Exception `err` as ``<class name>: <message>``."""
    class_name = err.__class__.__name__
    return '{}: {}'.format(class_name, err)
class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Subclasses must implement `read()`.  `decode()` turns `bytes` input
    into `str`, using `self.encoding` or (if that is not set) heuristics.
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Return the input data. Must be implemented by subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8, locale encoding, and (as last resort) 'latin-1'.
        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        On success, the encoding used is stored in
        `self.successful_encoding`.

        Raise UnicodeError if unsuccessful (the error of the last encoding
        tried is attached as the exception's cause).
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but input is not a `str` object')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # If the data declares its encoding (explicitly or via a BOM),
                # we believe it.
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics only if no encoding is explicitly given and
                # no BOM found.  Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # TODO: use `locale.getpreferredencoding(do_setlocale=True)`
                # to respect UTF-8 mode (API change).
                # (Check if it is a valid encoding and not UTF-8)
                if _locale_encoding and _locale_encoding != 'utf-8':
                    encoding_candidates.append(_locale_encoding)
                # TODO: don't fall back to 'latin-1' (API change).
                encoding_candidates.append('latin-1')
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, self.error_handler)
                self.successful_encoding = enc
                # Return decoded, removing BOM and other ZWNBSPs.
                # TODO: only remove BOM (ZWNBSP at start of data, API change).
                return decoded.replace('\ufeff', '')
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                # (`encoding_candidates` is never empty, so `error` is
                # always bound when the loop ends without returning.)
                error = err
        # Chain the last failure so its traceback is not lost.
        raise UnicodeError(
            'Unable to decode input data.  Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(error)})') from error

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name (`str`) or None if nothing was found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self):
        """Return ``self.source.isatty()``; False if there is no such method.
        """
        try:
            return self.source.isatty()
        except AttributeError:
            return False
class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    Subclasses must implement `write()`; `encode()` converts `str` data
    to `self.encoding`.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # fall back to the class default for a missing/empty path
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """Return `data` encoded with `self.encoding`.

        With the pseudo-encoding "unicode", `data` must be a `str` and is
        returned unchanged.  Non-`str` data (e.g. `bytes`) is passed
        through unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, str):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data
class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, and
    `Exception` instances.
    """

    def __init__(self, destination=None, encoding=None,
                 encoding_errors='backslashreplace',
                 decoding_errors='replace'):
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors
              (when `bytes` are written to a text stream).
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        # if `destination` is a file name, open it
        elif isinstance(destination, str):
            destination = open(destination, 'w')

        self.destination = destination
        """Where warning output is sent."""

        self.encoding = (encoding or getattr(destination, 'encoding', None)
                         or _locale_encoding or 'ascii')
        """The output character encoding."""

        self.encoding_errors = encoding_errors
        """Encoding error handler."""

        self.decoding_errors = decoding_errors
        """Decoding error handler."""

    def write(self, data):
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.
        """
        if not self.destination:
            return
        if isinstance(data, Exception):
            data = str(data)
        try:
            self.destination.write(data)
        except UnicodeEncodeError:
            # the stream cannot represent some character(s) of `data`
            self._write_escaped(data)
        except TypeError:
            if isinstance(data, str):  # destination may expect bytes
                self.destination.write(data.encode(self.encoding,
                                                   self.encoding_errors))
            elif self.destination in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                self.destination.buffer.write(data)
            else:
                self.destination.write(str(data, self.encoding,
                                           self.decoding_errors))

    def _write_escaped(self, data):
        """Write `str` `data` that the destination failed to encode.

        First try to write the data encoded with `self.encoding` and
        `self.encoding_errors`.  A text stream rejects `bytes` with a
        `TypeError`; in this case, write the text with unencodable
        characters replaced according to `self.encoding_errors`.
        """
        encoded = data.encode(self.encoding, self.encoding_errors)
        try:
            self.destination.write(encoded)
        except TypeError:
            # Use the encoding of the stream (if known) to find out which
            # characters must be replaced.
            stream_encoding = (getattr(self.destination, 'encoding', None)
                               or self.encoding)
            escaped = data.encode(stream_encoding, self.encoding_errors)
            self.destination.write(escaped.decode(stream_encoding,
                                                  self.decoding_errors))

    def close(self):
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self):
        """Return ``self.destination.isatty()``; False if there is no such
        method.
        """
        try:
            return self.destination.isatty()
        except AttributeError:
            return False
class FileInput(Input):
    """
    Input for single, simple file-like objects.
    """

    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, mode='r'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened and
        `UnicodeError` if `source` is opened with an encoding different
        from `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                try:
                    # 'utf-8-sig' (the default) strips a leading BOM
                    self.source = open(source_path, mode,
                                       encoding=self.encoding or 'utf-8-sig',
                                       errors=self.error_handler)
                except OSError as error:
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        Line endings are normalised to ``\\n`` and the result ends with a
        newline (unless it is empty).
        """
        try:
            if self.source is sys.stdin:
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics
                # (the context manager closes the file also if reading fails)
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        data = self.decode(data)
        # normalise newlines
        return '\n'.join(data.splitlines()+[''])

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close `self.source` unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):
    """
    Output for single, simple file-like objects.
    """

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # the file is opened on demand by `write()`
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open the file at `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding and error handler for text mode only
        if 'b' in self.mode:
            open_args = {}
        else:
            open_args = {'encoding': self.encoding,
                         'errors': self.error_handler}
        try:
            self.destination = open(self.destination_path, self.mode,
                                    **open_args)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Encode `data`, write it to a single file, and return it.

        In binary output mode, or if the specified encoding matches the
        encoding of the destination, `data` is written and returned
        unchanged.
        """
        if not self.opened:
            self.open()
        if ('b' not in self.mode
            and check_encoding(self.destination, self.encoding) is False):
            # the destination would use another encoding: pre-encode the data
            data = self.encode(data)
            if os.linesep != '\n':
                # fix endings
                data = data.replace(b'\n', bytes(os.linesep, 'ascii'))

        try:
            self.destination.write(data)
        except TypeError as err:
            if isinstance(data, bytes):
                # text stream: try to write to the underlying binary buffer
                try:
                    self.destination.buffer.write(data)
                except AttributeError:
                    if check_encoding(self.destination,
                                      self.encoding) is not False:
                        raise err
                    raise ValueError(
                        'Encoding of %s (%s) differs \n'
                        ' from specified encoding (%s)' %
                        (self.destination_path or 'destination',
                         self.destination.encoding,
                         self.encoding))
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                '%s.\n(%s)' % (self.encoding, error_string(err)))
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination in (sys.stdout, sys.stderr):
            return
        self.destination.close()
        self.opened = False
class BinaryFileOutput(FileOutput):
    """
    `FileOutput` variant that opens the destination file in binary mode.
    """

    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # rst2odt (OpenOffice writer)
    mode = 'wb'
class StringInput(Input):
    """
    Input from a string passed in the `source` argument.
    """

    default_source_path = '<string>'

    def read(self):
        """Return `self.source`, decoded with `self.decode()`."""
        source = self.source
        return self.decode(source)
class StringOutput(Output):
    """
    Output that is kept in `self.destination` instead of being written
    to a file.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, keep the result in `self.destination`, return it.
        """
        encoded = self.encode(data)
        self.destination = encoded
        return self.destination
class NullInput(Input):
    """
    Degenerate input: there is nothing to read.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty string."""
        nothing = ''
        return nothing
class NullOutput(Output):
    """
    Degenerate output: `data` is discarded.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Ignore `data` (it is not even sent to the bit bucket)."""
        return None
class DocTreeInput(Input):
    """
    Adapter for document tree input.

    Pass the document tree in the ``source`` parameter;
    `read()` hands it back unchanged.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree stored in `self.source`."""
        doctree = self.source
        return doctree
|