# xref: /third_party/python/Lib/codecs.py (revision 7db96d56)
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import builtins
import sys

### Registry and builtin stateless codec functions

# Everything public in the _codecs extension module is re-exported from
# here.  This module cannot work without it, so a failed import is
# reported as a SystemError (chained to the original ImportError).
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why

# Public API of this module (the names exported by "from codecs import *").
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# The unsuffixed names follow the byte order of the running interpreter.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).
# Despite the "32"/"64" in the names, these alias the UTF-16 and UTF-32
# marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The object is a 4-tuple (encode, decode, streamreader, streamwriter).
    The same four entries, together with incrementalencoder,
    incrementaldecoder and name, are also available as attributes.
    """

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        # Only the four classic entries are part of the tuple value; the
        # remaining pieces are plain instance attributes.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        # Leave the class-level default in place unless a value is given.
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be a bytes object or one which provides the
            read-only buffer interface -- for example, buffer objects
            and memory mapped files.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the (not yet
    encoded) input in a buffer between calls to encode().

    Subclasses implement _buffer_encode(); encode() takes care of
    prepending the buffered input and of storing whatever was not consumed.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0, like the base class does.
        return self.buffer or 0

    def setstate(self, state):
        # Accept the 0 produced by getstate() for an empty buffer.
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.

    Subclasses implement _buffer_decode(); decode() takes care of
    prepending the buffered bytes and of storing whatever was not consumed.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    """ Wraps a writable stream and encodes everything written to it.

        write() passes its argument through self.encode() (inherited
        from Codec, to be supplied by a concrete codec) and writes the
        result to self.stream.  Attributes not found on the writer are
        looked up on the underlying stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # The consumed length reported by encode() is not needed here.
        data, _ = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Seeks on the underlying stream; rewinding to the very start
            (offset 0, whence 0) also resets the writer.
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Copying/unpickling creates an instance without calling __init__;
        # with no 'stream' attribute set, __getattr__ would then recurse
        # forever.  Refuse serialization explicitly instead.
        raise TypeError("can't serialize %s" % self.__class__.__name__)

4177db96d56Sopenharmony_ci###
4187db96d56Sopenharmony_ci
4197db96d56Sopenharmony_ciclass StreamReader(Codec):
4207db96d56Sopenharmony_ci
4217db96d56Sopenharmony_ci    charbuffertype = str
4227db96d56Sopenharmony_ci
4237db96d56Sopenharmony_ci    def __init__(self, stream, errors='strict'):
4247db96d56Sopenharmony_ci
4257db96d56Sopenharmony_ci        """ Creates a StreamReader instance.
4267db96d56Sopenharmony_ci
4277db96d56Sopenharmony_ci            stream must be a file-like object open for reading.
4287db96d56Sopenharmony_ci
4297db96d56Sopenharmony_ci            The StreamReader may use different error handling
4307db96d56Sopenharmony_ci            schemes by providing the errors keyword argument. These
4317db96d56Sopenharmony_ci            parameters are predefined:
4327db96d56Sopenharmony_ci
4337db96d56Sopenharmony_ci             'strict' - raise a ValueError (or a subclass)
4347db96d56Sopenharmony_ci             'ignore' - ignore the character and continue with the next
4357db96d56Sopenharmony_ci             'replace'- replace with a suitable replacement character
4367db96d56Sopenharmony_ci             'backslashreplace' - Replace with backslashed escape sequences;
4377db96d56Sopenharmony_ci
4387db96d56Sopenharmony_ci            The set of allowed parameter values can be extended via
4397db96d56Sopenharmony_ci            register_error.
4407db96d56Sopenharmony_ci        """
4417db96d56Sopenharmony_ci        self.stream = stream
4427db96d56Sopenharmony_ci        self.errors = errors
4437db96d56Sopenharmony_ci        self.bytebuffer = b""
4447db96d56Sopenharmony_ci        self._empty_charbuffer = self.charbuffertype()
4457db96d56Sopenharmony_ci        self.charbuffer = self._empty_charbuffer
4467db96d56Sopenharmony_ci        self.linebuffer = None
4477db96d56Sopenharmony_ci
4487db96d56Sopenharmony_ci    def decode(self, input, errors='strict'):
4497db96d56Sopenharmony_ci        raise NotImplementedError
4507db96d56Sopenharmony_ci
4517db96d56Sopenharmony_ci    def read(self, size=-1, chars=-1, firstline=False):
4527db96d56Sopenharmony_ci
4537db96d56Sopenharmony_ci        """ Decodes data from the stream self.stream and returns the
4547db96d56Sopenharmony_ci            resulting object.
4557db96d56Sopenharmony_ci
4567db96d56Sopenharmony_ci            chars indicates the number of decoded code points or bytes to
4577db96d56Sopenharmony_ci            return. read() will never return more data than requested,
4587db96d56Sopenharmony_ci            but it might return less, if there is not enough available.
4597db96d56Sopenharmony_ci
4607db96d56Sopenharmony_ci            size indicates the approximate maximum number of decoded
4617db96d56Sopenharmony_ci            bytes or code points to read for decoding. The decoder
4627db96d56Sopenharmony_ci            can modify this setting as appropriate. The default value
4637db96d56Sopenharmony_ci            -1 indicates to read and decode as much as possible.  size
4647db96d56Sopenharmony_ci            is intended to prevent having to decode huge files in one
4657db96d56Sopenharmony_ci            step.
4667db96d56Sopenharmony_ci
4677db96d56Sopenharmony_ci            If firstline is true, and a UnicodeDecodeError happens
4687db96d56Sopenharmony_ci            after the first line terminator in the input only the first line
4697db96d56Sopenharmony_ci            will be returned, the rest of the input will be kept until the
4707db96d56Sopenharmony_ci            next call to read().
4717db96d56Sopenharmony_ci
4727db96d56Sopenharmony_ci            The method should use a greedy read strategy, meaning that
4737db96d56Sopenharmony_ci            it should read as much data as is allowed within the
4747db96d56Sopenharmony_ci            definition of the encoding and the given size, e.g.  if
4757db96d56Sopenharmony_ci            optional encoding endings or state markers are available
4767db96d56Sopenharmony_ci            on the stream, these should be read too.
4777db96d56Sopenharmony_ci        """
4787db96d56Sopenharmony_ci        # If we have lines cached, first merge them back into characters
4797db96d56Sopenharmony_ci        if self.linebuffer:
4807db96d56Sopenharmony_ci            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
4817db96d56Sopenharmony_ci            self.linebuffer = None
4827db96d56Sopenharmony_ci
4837db96d56Sopenharmony_ci        if chars < 0:
4847db96d56Sopenharmony_ci            # For compatibility with other read() methods that take a
4857db96d56Sopenharmony_ci            # single argument
4867db96d56Sopenharmony_ci            chars = size
4877db96d56Sopenharmony_ci
4887db96d56Sopenharmony_ci        # read until we get the required number of characters (if available)
4897db96d56Sopenharmony_ci        while True:
4907db96d56Sopenharmony_ci            # can the request be satisfied from the character buffer?
4917db96d56Sopenharmony_ci            if chars >= 0:
4927db96d56Sopenharmony_ci                if len(self.charbuffer) >= chars:
4937db96d56Sopenharmony_ci                    break
4947db96d56Sopenharmony_ci            # we need more data
4957db96d56Sopenharmony_ci            if size < 0:
4967db96d56Sopenharmony_ci                newdata = self.stream.read()
4977db96d56Sopenharmony_ci            else:
4987db96d56Sopenharmony_ci                newdata = self.stream.read(size)
4997db96d56Sopenharmony_ci            # decode bytes (those remaining from the last call included)
5007db96d56Sopenharmony_ci            data = self.bytebuffer + newdata
5017db96d56Sopenharmony_ci            if not data:
5027db96d56Sopenharmony_ci                break
5037db96d56Sopenharmony_ci            try:
5047db96d56Sopenharmony_ci                newchars, decodedbytes = self.decode(data, self.errors)
5057db96d56Sopenharmony_ci            except UnicodeDecodeError as exc:
5067db96d56Sopenharmony_ci                if firstline:
5077db96d56Sopenharmony_ci                    newchars, decodedbytes = \
5087db96d56Sopenharmony_ci                        self.decode(data[:exc.start], self.errors)
5097db96d56Sopenharmony_ci                    lines = newchars.splitlines(keepends=True)
5107db96d56Sopenharmony_ci                    if len(lines)<=1:
5117db96d56Sopenharmony_ci                        raise
5127db96d56Sopenharmony_ci                else:
5137db96d56Sopenharmony_ci                    raise
5147db96d56Sopenharmony_ci            # keep undecoded bytes until the next call
5157db96d56Sopenharmony_ci            self.bytebuffer = data[decodedbytes:]
5167db96d56Sopenharmony_ci            # put new characters in the character buffer
5177db96d56Sopenharmony_ci            self.charbuffer += newchars
5187db96d56Sopenharmony_ci            # there was no data available
5197db96d56Sopenharmony_ci            if not newdata:
5207db96d56Sopenharmony_ci                break
5217db96d56Sopenharmony_ci        if chars < 0:
5227db96d56Sopenharmony_ci            # Return everything we've got
5237db96d56Sopenharmony_ci            result = self.charbuffer
5247db96d56Sopenharmony_ci            self.charbuffer = self._empty_charbuffer
5257db96d56Sopenharmony_ci        else:
5267db96d56Sopenharmony_ci            # Return the first chars characters
5277db96d56Sopenharmony_ci            result = self.charbuffer[:chars]
5287db96d56Sopenharmony_ci            self.charbuffer = self.charbuffer[chars:]
5297db96d56Sopenharmony_ci        return result
5307db96d56Sopenharmony_ci
def readline(self, size=None, keepends=True):

    """ Read one line from the input stream and return the
        decoded data.

        size, if given, is passed as size argument to the
        read() method; in that case the read loop makes a single
        attempt and may therefore return an incomplete line.
        Without size, reading starts with a chunk size of 72 which
        is doubled after every unsuccessful attempt while it is
        still below 8000.

        keepends controls whether the line ending is kept on the
        returned line (the default) or stripped from it.

        If no data is available and nothing was collected, the
        empty buffer value (self._empty_charbuffer) is returned.

    """
    # If we have lines cached from an earlier read, return
    # them unconditionally
    if self.linebuffer:
        line = self.linebuffer[0]
        del self.linebuffer[0]
        if len(self.linebuffer) == 1:
            # revert to charbuffer mode; we might need more data
            # next time (the last cached entry may be an incomplete line)
            self.charbuffer = self.linebuffer[0]
            self.linebuffer = None
        if not keepends:
            line = line.splitlines(keepends=False)[0]
        return line

    # A falsy size (None or 0) selects the default initial chunk size
    readsize = size or 72
    line = self._empty_charbuffer
    # If size is given, we call read() only once
    while True:
        data = self.read(readsize, firstline=True)
        if data:
            # If we're at a "\r" read one extra character (which might
            # be a "\n") to get a proper line ending. If the stream is
            # temporarily exhausted we return the wrong line ending.
            if (isinstance(data, str) and data.endswith("\r")) or \
               (isinstance(data, bytes) and data.endswith(b"\r")):
                data += self.read(size=1, chars=1)

        line += data
        lines = line.splitlines(keepends=True)
        if lines:
            if len(lines) > 1:
                # More than one line result; the first line is a full line
                # to return
                line = lines[0]
                del lines[0]
                if len(lines) > 1:
                    # cache the remaining lines; whatever read() left in
                    # charbuffer is appended to the last cached entry
                    lines[-1] += self.charbuffer
                    self.linebuffer = lines
                    self.charbuffer = None
                else:
                    # only one remaining line, put it back into charbuffer
                    self.charbuffer = lines[0] + self.charbuffer
                if not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # Exactly one entry: it is complete only if it carries a
            # line ending, which is detected by stripping and comparing
            line0withend = lines[0]
            line0withoutend = lines[0].splitlines(keepends=False)[0]
            if line0withend != line0withoutend: # We really have a line end
                # Put the rest back together and keep it until the next call
                self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                  self.charbuffer
                if keepends:
                    line = line0withend
                else:
                    line = line0withoutend
                break
        # we didn't get anything or this was our only try
        if not data or size is not None:
            if line and not keepends:
                line = line.splitlines(keepends=False)[0]
            break
        # Grow the chunk size for the next attempt
        if readsize < 8000:
            readsize *= 2
    return line
6057db96d56Sopenharmony_ci
def readlines(self, sizehint=None, keepends=True):

    """ Read everything available on the input stream and return
        it as a list of lines.

        The whole stream is decoded with a single read() call and
        then split into lines; line endings are kept in the list
        entries unless keepends is false.

        sizehint, if given, is ignored since there is no efficient
        way to find the true end-of-line.

    """
    return self.read().splitlines(keepends)
6207db96d56Sopenharmony_ci
def reset(self):

    """ Clear the buffers the reader uses to keep internal state.

        The cached lines, the decoded-character buffer and the
        buffer of not yet decoded bytes are all discarded.

        Note that no stream repositioning takes place here.
        This method is primarily intended to be able to recover
        from decoding errors.

    """
    self.linebuffer = None
    self.charbuffer = self._empty_charbuffer
    self.bytebuffer = b""
6337db96d56Sopenharmony_ci
def seek(self, offset, whence=0):
    """ Reposition the underlying input stream.

        offset and whence are handed to the stream's own seek()
        method.  Afterwards the codec buffers are reset, because
        their content no longer matches the new stream position.
    """
    underlying = self.stream
    underlying.seek(offset, whence)
    self.reset()
6417db96d56Sopenharmony_ci
def __next__(self):

    """ Return the next decoded line from the input stream.

        Raises StopIteration as soon as readline() returns an
        empty result.
    """
    upcoming = self.readline()
    if not upcoming:
        raise StopIteration
    return upcoming
6497db96d56Sopenharmony_ci
def __iter__(self):
    """ Return self; the reader is its own iterator (see __next__)."""
    return self
6527db96d56Sopenharmony_ci
def __getattr__(self, name,
                getattr=getattr):

    """ Inherit all other methods from the underlying stream.

        Called only for attributes that normal lookup did not find;
        the request is forwarded to self.stream.  The getattr default
        argument binds the builtin when the method is defined
        (presumably a lookup shortcut) and is not meant to be passed
        by callers.
    """
    return getattr(self.stream, name)
6597db96d56Sopenharmony_ci
def __enter__(self):
    """ Enter a with-block; the reader itself is the managed object."""
    return self
6627db96d56Sopenharmony_ci
def __exit__(self, type, value, tb):
    """ Leave a with-block by closing the underlying stream.

        Nothing is returned, so an exception raised inside the
        with-block is not suppressed.
    """
    self.stream.close()
6657db96d56Sopenharmony_ci
6667db96d56Sopenharmony_ci###
6677db96d56Sopenharmony_ci
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is delegated to a reader object and writing to a
        writer object, both created on the same stream.  Attributes
        not defined here are looked up on the stream itself.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read and decode data via the reader; size is passed on
            to the reader's read() method.
        """
        return self.reader.read(size)

    def readline(self, size=None, keepends=True):

        """ Read one decoded line via the reader.

            size and keepends have the same meaning as for the
            reader's readline() method; keepends is accepted here so
            that the wrapper offers the same signature as the reader.
        """
        return self.reader.readline(size, keepends)

    def readlines(self, sizehint=None, keepends=True):

        """ Read all remaining decoded lines via the reader.

            sizehint and keepends have the same meaning as for the
            reader's readlines() method.
        """
        return self.reader.readlines(sizehint, keepends)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        """ Return self; iteration yields decoded lines."""
        return self

    def write(self, data):

        """ Encode data and write it to the stream via the writer."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Encode and write a sequence of strings via the writer."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the internal state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the stream and drop the reader's buffers.

            The writer is only reset when seeking to the very start
            of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream
        self.stream.close()
7527db96d56Sopenharmony_ci
7537db96d56Sopenharmony_ci###
7547db96d56Sopenharmony_ci
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Writing: data handed to the StreamRecoder is first decoded
        into an intermediate format (depending on the "decode" codec)
        and then written to the underlying stream using an instance
        of the provided Writer class.

        Reading: data is fetched from the underlying stream using a
        Reader instance, then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        # Frontend codec functions
        self.encode = encode
        self.decode = decode
        # Backend reader/writer working directly on the stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read via the backend reader and return the result
            re-encoded with the frontend encoder.
        """
        intermediate = self.reader.read(size)
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.

            size is only passed to the reader when it is not None.
        """
        if size is not None:
            intermediate = self.reader.readline(size)
        else:
            intermediate = self.reader.readline()
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, re-encode it and
            return it split into lines (line endings kept).

            sizehint is not used.
        """
        intermediate = self.reader.read()
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder."""
        intermediate = next(self.reader)
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def __iter__(self):
        """ Return self; iteration yields re-encoded lines."""
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result through the backend writer.
        """
        intermediate, _consumed = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    def writelines(self, list):

        """ Join the given bytes chunks, decode them with the frontend
            decoder and write the result through the backend writer.
        """
        joined = b''.join(list)
        intermediate, _consumed = self.decode(joined, self.errors)
        return self.writer.write(intermediate)

    def reset(self):

        """ Reset the internal state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the stream through both the reader and the writer."""
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Enter a with-block; the recoder itself is the managed object."""
        return self

    def __exit__(self, type, value, tb):
        """ Leave a with-block by closing the underlying stream."""
        self.stream.close()
8687db96d56Sopenharmony_ci
8697db96d56Sopenharmony_ci### Shortcuts
8707db96d56Sopenharmony_ci
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, then the
        underlying encoded files are always opened in binary mode:
        a 'b' is added to mode when missing and an explicit 't'
        (text mode) flag is dropped, since the builtin open() does
        not accept both flags together.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file object returned by the builtin
        open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If wrapping fails (e.g. LookupError for an unknown encoding),
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode; 't' has to go
        # because builtins.open() rejects text and binary mode at once
        mode = mode.replace('t', '') + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file if the codec cannot be set up
        file.close()
        raise
9197db96d56Sopenharmony_ci
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Writing: data given to the wrapper is decoded according to
        data_encoding and then encoded to the underlying file using
        file_encoding. The intermediate data type will usually be
        Unicode but depends on the specified codecs.

        Reading: bytes read from the file are decoded using
        file_encoding and then passed back to the caller encoded
        using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
9557db96d56Sopenharmony_ci
9567db96d56Sopenharmony_ci### Helpers for codec lookup
9577db96d56Sopenharmony_ci
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
9677db96d56Sopenharmony_ci
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
9777db96d56Sopenharmony_ci
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
9917db96d56Sopenharmony_ci
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
10057db96d56Sopenharmony_ci
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
10157db96d56Sopenharmony_ci
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
10257db96d56Sopenharmony_ci
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder
    and yields every non-empty piece of output. Once the iterator is
    exhausted the encoder is flushed with a final call, and whatever that
    produces is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # Final call: flush anything the encoder still holds back
    tail = encoder.encode("", True)
    if tail:
        yield tail
10437db96d56Sopenharmony_ci
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input chunks from the iterator using an IncrementalDecoder
    and yields every non-empty piece of output. Once the iterator is
    exhausted the decoder is flushed with a final call, and whatever that
    produces is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # Final call: flush anything the decoder still holds back
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
10617db96d56Sopenharmony_ci
10627db96d56Sopenharmony_ci### Helpers for charmap-based codecs
10637db96d56Sopenharmony_ci
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is mapped to itself.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
10737db96d56Sopenharmony_ci
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps each target of the decoding map back to
        its key. If a target occurs multiple times in the decoding
        map, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and becomes undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
10947db96d56Sopenharmony_ci
10957db96d56Sopenharmony_ci### error handlers
10967db96d56Sopenharmony_ci
try:
    # Fetch every standard error handler in one step; if any lookup
    # fails, none of the names below is bound by this statement.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = (lookup_error("strict"),
                            lookup_error("ignore"),
                            lookup_error("replace"),
                            lookup_error("xmlcharrefreplace"),
                            lookup_error("backslashreplace"),
                            lookup_error("namereplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = \
        xmlcharrefreplace_errors = backslashreplace_errors = \
        namereplace_errors = None
11127db96d56Sopenharmony_ci
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the guarded import below never executes at
# runtime; it is present only as an import statement in this module's
# code.
_false = 0
if _false:
    import encodings
11187db96d56Sopenharmony_ci
11197db96d56Sopenharmony_ci### Tests
11207db96d56Sopenharmony_ci
if __name__ == '__main__':

    # This branch runs only when the file is executed as a script; it
    # rebinds sys.stdout and sys.stdin to EncodedFile wrappers.
    # NOTE(review): EncodedFile is defined earlier in this file, outside
    # this section; its arguments are presumably
    # (file, data_encoding, file_encoding) -- confirm against its definition.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
1128