Recipes¶
These are some additional possible uses for chardetng_py.
If there’s sufficient interest, we can stabilise these and include them in the main package.
Detect the encoding of a bytestring and return a CodecInfo object¶
def detect_codec(
    byte_str: Union[bytes, bytearray], *, allow_utf8: bool = True
) -> codecs.CodecInfo:
    r"""Guess the encoding of ``byte_str`` and look up the matching codec.

    Parameters
    ----------
    byte_str : bytes or bytearray
        Buffer whose encoding should be guessed.
    allow_utf8 : bool, keyword-only
        Forwarded unchanged to ``detect``.

    Returns
    -------
    codecs.CodecInfo
        Whatever ``codecs.lookup`` returns for the detected encoding name.

    Examples
    --------
    >>> codec = detect_codec(b"Jakby r\xeaka Boga")
    >>> codec.name
    'cp1254'
    """
    # Detection yields an encoding name; the codec registry resolves it.
    encoding_name = detect(byte_str, allow_utf8=allow_utf8)
    return codecs.lookup(encoding_name)
Detect the encoding of a bytestring and return the decoded string¶
def decode(
    byte_str: Union[bytes, bytearray],
    errors: Literal[
        "strict", "ignore", "replace", "backslashreplace", "surrogateescape"
    ] = "strict",
    *,
    allow_utf8: bool = True,
) -> str:
    r"""Guess the encoding of ``byte_str`` and decode it to ``str``.

    Parameters
    ----------
    byte_str : bytes or bytearray
        Buffer to decode.
    errors : "strict" or "ignore" or "replace" or "backslashreplace" or "surrogateescape"
        Error handler handed to ``byte_str.decode``. See the
        [Python documentation](https://docs.python.org/3/library/codecs.html#error-handlers).
    allow_utf8 : bool, keyword-only
        Forwarded unchanged to ``detect``.

    Returns
    -------
    str
        ``byte_str`` decoded with the detected encoding.

    Examples
    --------
    >>> decode(b"Jakby r\xeaka Boga")
    'Jakby rêka Boga'
    """
    # Step 1: ask the detector for an encoding name.
    guessed_encoding = detect(byte_str, allow_utf8=allow_utf8)
    # Step 2: decode with that encoding, honouring the caller's error handler.
    return byte_str.decode(guessed_encoding, errors=errors)
Open a file, incrementally determine its encoding and return a TextIOWrapper¶
This is a neat trick that allows you to open a file and detect its encoding with a fixed amount of memory. The other bindings I’ve found don’t support this use-case and you end up having to read the entire file into memory, which is a problem for huge files.
This also lets you pass a text file of unknown encoding directly to csv.reader or csv.DictReader, for example.
# Reads entire file
# We could add support for reading to some fixed position
def _detect_buffer(buffer: IO[bytes], *, allow_utf8: bool = True, **kwargs):
    """Wrap a binary stream in a ``TextIOWrapper`` using a detected encoding.

    The stream is read in ``io.DEFAULT_BUFFER_SIZE`` chunks from its current
    position until a read returns nothing, feeding each chunk to an
    ``EncodingDetector``. The stream is then rewound to where it started, so
    the returned wrapper reads from the original position.

    Parameters
    ----------
    buffer : IO[bytes]
        Binary stream; it must support ``tell``, ``read`` and ``seek``.
    allow_utf8 : bool, keyword-only
        Forwarded to ``EncodingDetector.guess``.
    **kwargs
        Extra keyword arguments forwarded to ``io.TextIOWrapper``.

    Returns
    -------
    io.TextIOWrapper
        Text wrapper around ``buffer`` using the guessed encoding.
    """
    start_offset = buffer.tell()
    detector = EncodingDetector()

    # The chunk size is simply the io module default; it has not been tuned.
    while True:
        block = buffer.read(io.DEFAULT_BUFFER_SIZE)
        if not block:
            break
        detector.feed(block, last=False)
    # An empty final feed tells the detector that the input has ended.
    detector.feed(b"", last=True)

    # Rewind so the text wrapper starts where the caller left the stream.
    buffer.seek(start_offset)
    guessed_encoding = detector.guess(tld=None, allow_utf8=allow_utf8)
    return io.TextIOWrapper(buffer, encoding=guessed_encoding, **kwargs)
# Could be nice to have an async one as well
# unfortunately async fs tools aren't in std lib
@contextmanager
def detect_open(
    file: Union[bytes, str, PathLike], mode: Literal["r", "rt"] = "r", **kwargs
):
    """Open a file for reading as text, detecting its encoding from its bytes.

    Use as a context manager. The file is opened in binary mode, handed to
    ``_detect_buffer`` and the resulting text wrapper is yielded. The
    underlying file is closed when the ``with`` block exits.

    Parameters
    ----------
    file : bytes or str or PathLike
        Path of the file to open.
    mode : "r" or "rt"
        Only text reading is supported.
    **kwargs
        ``buffering``, ``closefd`` and ``opener`` are passed to ``open``.
        Every other keyword (for example ``allow_utf8``, ``errors`` or
        ``newline``) is passed to ``_detect_buffer``.

    Raises
    ------
    NotImplementedError
        If ``mode`` is anything other than ``"r"`` or ``"rt"``.
    ValueError
        If ``encoding`` is passed; the encoding is always detected.
    """
    if mode not in {"r", "rt"}:
        # NotImplemented is a constant, not an exception; calling it raises
        # a confusing TypeError. NotImplementedError is the intended signal.
        raise NotImplementedError("Only reading supported at the moment")
    # TODO Could support r+ and w+ modes of operation?

    # The whole point is that the encoding is detected, so an explicit one
    # is a caller error.
    if "encoding" in kwargs:
        raise ValueError(
            "'encoding' cannot be passed to detect_open; it is detected "
            "from the file contents"
        )

    # open() in binary mode rejects text-layer options such as errors= and
    # newline=, so only the options valid for a binary open are sent there.
    binary_open_options = ("buffering", "closefd", "opener")
    open_kwargs = {k: v for k, v in kwargs.items() if k in binary_open_options}
    text_kwargs = {
        k: v for k, v in kwargs.items() if k not in binary_open_options
    }

    with open(file, mode="rb", **open_kwargs) as f:
        yield _detect_buffer(f, **text_kwargs)