legacy.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. from typing import Dict, Optional, Union
  2. from .api import from_bytes
  3. from .constant import CHARDET_CORRESPONDENCE
  4. def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
  5. """
  6. chardet legacy method
  7. Detect the encoding of the given byte string. It should be mostly backward-compatible.
  8. Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
  9. This function is deprecated and should be used to migrate your project easily, consult the documentation for
  10. further information. Not planned for removal.
  11. :param byte_str: The byte sequence to examine.
  12. """
  13. if not isinstance(byte_str, (bytearray, bytes)):
  14. raise TypeError( # pragma: nocover
  15. "Expected object of type bytes or bytearray, got: "
  16. "{0}".format(type(byte_str))
  17. )
  18. if isinstance(byte_str, bytearray):
  19. byte_str = bytes(byte_str)
  20. r = from_bytes(byte_str).best()
  21. encoding = r.encoding if r is not None else None
  22. language = r.language if r is not None and r.language != "Unknown" else ""
  23. confidence = 1.0 - r.chaos if r is not None else None
  24. # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
  25. # but chardet does return 'utf-8-sig' and it is a valid codec name.
  26. if r is not None and encoding == "utf_8" and r.bom:
  27. encoding += "_sig"
  28. return {
  29. "encoding": encoding
  30. if encoding not in CHARDET_CORRESPONDENCE
  31. else CHARDET_CORRESPONDENCE[encoding],
  32. "language": language,
  33. "confidence": confidence,
  34. }