""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import builtins
import sys

### Registry and builtin stateless codec functions

# The C accelerator module supplies the registry functions (register,
# lookup, encode, decode, ...); without it this module cannot work at all.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  The "32"/"64" suffixes are
# misleading: they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The instance is a 4-tuple (encode, decode, streamreader, streamwriter)
    that additionally exposes every constructor argument as an attribute.
    """

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Build the 4-tuple and attach all codec entry points as attributes.

        _is_text_encoding, when not None, overrides the class-level
        default on this instance only.
        """
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be a bytes object or one which provides the
            read-only buffer interface -- for example, buffer objects
            and memory mapped files.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the not yet
    encoded input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered leftovers plus input and return the output;
        whatever _buffer_encode() did not consume stays buffered.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0, like the base class does
        return self.buffer or 0

    def setstate(self, state):
        # accept the 0 produced by getstate() for an empty buffer
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered leftovers plus input and return the output;
        whatever _buffer_decode() did not consume stays buffered.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily.
See encodings/utf_8.py for an example on how this is 3437db96d56Sopenharmony_ci# done. 3447db96d56Sopenharmony_ci# 3457db96d56Sopenharmony_ci 3467db96d56Sopenharmony_ciclass StreamWriter(Codec): 3477db96d56Sopenharmony_ci 3487db96d56Sopenharmony_ci def __init__(self, stream, errors='strict'): 3497db96d56Sopenharmony_ci 3507db96d56Sopenharmony_ci """ Creates a StreamWriter instance. 3517db96d56Sopenharmony_ci 3527db96d56Sopenharmony_ci stream must be a file-like object open for writing. 3537db96d56Sopenharmony_ci 3547db96d56Sopenharmony_ci The StreamWriter may use different error handling 3557db96d56Sopenharmony_ci schemes by providing the errors keyword argument. These 3567db96d56Sopenharmony_ci parameters are predefined: 3577db96d56Sopenharmony_ci 3587db96d56Sopenharmony_ci 'strict' - raise a ValueError (or a subclass) 3597db96d56Sopenharmony_ci 'ignore' - ignore the character and continue with the next 3607db96d56Sopenharmony_ci 'replace'- replace with a suitable replacement character 3617db96d56Sopenharmony_ci 'xmlcharrefreplace' - Replace with the appropriate XML 3627db96d56Sopenharmony_ci character reference. 3637db96d56Sopenharmony_ci 'backslashreplace' - Replace with backslashed escape 3647db96d56Sopenharmony_ci sequences. 3657db96d56Sopenharmony_ci 'namereplace' - Replace with \\N{...} escape sequences. 3667db96d56Sopenharmony_ci 3677db96d56Sopenharmony_ci The set of allowed parameter values can be extended via 3687db96d56Sopenharmony_ci register_error. 3697db96d56Sopenharmony_ci """ 3707db96d56Sopenharmony_ci self.stream = stream 3717db96d56Sopenharmony_ci self.errors = errors 3727db96d56Sopenharmony_ci 3737db96d56Sopenharmony_ci def write(self, object): 3747db96d56Sopenharmony_ci 3757db96d56Sopenharmony_ci """ Writes the object's contents encoded to self.stream. 
3767db96d56Sopenharmony_ci """ 3777db96d56Sopenharmony_ci data, consumed = self.encode(object, self.errors) 3787db96d56Sopenharmony_ci self.stream.write(data) 3797db96d56Sopenharmony_ci 3807db96d56Sopenharmony_ci def writelines(self, list): 3817db96d56Sopenharmony_ci 3827db96d56Sopenharmony_ci """ Writes the concatenated list of strings to the stream 3837db96d56Sopenharmony_ci using .write(). 3847db96d56Sopenharmony_ci """ 3857db96d56Sopenharmony_ci self.write(''.join(list)) 3867db96d56Sopenharmony_ci 3877db96d56Sopenharmony_ci def reset(self): 3887db96d56Sopenharmony_ci 3897db96d56Sopenharmony_ci """ Resets the codec buffers used for keeping internal state. 3907db96d56Sopenharmony_ci 3917db96d56Sopenharmony_ci Calling this method should ensure that the data on the 3927db96d56Sopenharmony_ci output is put into a clean state, that allows appending 3937db96d56Sopenharmony_ci of new fresh data without having to rescan the whole 3947db96d56Sopenharmony_ci stream to recover state. 3957db96d56Sopenharmony_ci 3967db96d56Sopenharmony_ci """ 3977db96d56Sopenharmony_ci pass 3987db96d56Sopenharmony_ci 3997db96d56Sopenharmony_ci def seek(self, offset, whence=0): 4007db96d56Sopenharmony_ci self.stream.seek(offset, whence) 4017db96d56Sopenharmony_ci if whence == 0 and offset == 0: 4027db96d56Sopenharmony_ci self.reset() 4037db96d56Sopenharmony_ci 4047db96d56Sopenharmony_ci def __getattr__(self, name, 4057db96d56Sopenharmony_ci getattr=getattr): 4067db96d56Sopenharmony_ci 4077db96d56Sopenharmony_ci """ Inherit all other methods from the underlying stream. 
4087db96d56Sopenharmony_ci """ 4097db96d56Sopenharmony_ci return getattr(self.stream, name) 4107db96d56Sopenharmony_ci 4117db96d56Sopenharmony_ci def __enter__(self): 4127db96d56Sopenharmony_ci return self 4137db96d56Sopenharmony_ci 4147db96d56Sopenharmony_ci def __exit__(self, type, value, tb): 4157db96d56Sopenharmony_ci self.stream.close() 4167db96d56Sopenharmony_ci 4177db96d56Sopenharmony_ci### 4187db96d56Sopenharmony_ci 4197db96d56Sopenharmony_ciclass StreamReader(Codec): 4207db96d56Sopenharmony_ci 4217db96d56Sopenharmony_ci charbuffertype = str 4227db96d56Sopenharmony_ci 4237db96d56Sopenharmony_ci def __init__(self, stream, errors='strict'): 4247db96d56Sopenharmony_ci 4257db96d56Sopenharmony_ci """ Creates a StreamReader instance. 4267db96d56Sopenharmony_ci 4277db96d56Sopenharmony_ci stream must be a file-like object open for reading. 4287db96d56Sopenharmony_ci 4297db96d56Sopenharmony_ci The StreamReader may use different error handling 4307db96d56Sopenharmony_ci schemes by providing the errors keyword argument. These 4317db96d56Sopenharmony_ci parameters are predefined: 4327db96d56Sopenharmony_ci 4337db96d56Sopenharmony_ci 'strict' - raise a ValueError (or a subclass) 4347db96d56Sopenharmony_ci 'ignore' - ignore the character and continue with the next 4357db96d56Sopenharmony_ci 'replace'- replace with a suitable replacement character 4367db96d56Sopenharmony_ci 'backslashreplace' - Replace with backslashed escape sequences; 4377db96d56Sopenharmony_ci 4387db96d56Sopenharmony_ci The set of allowed parameter values can be extended via 4397db96d56Sopenharmony_ci register_error. 
4407db96d56Sopenharmony_ci """ 4417db96d56Sopenharmony_ci self.stream = stream 4427db96d56Sopenharmony_ci self.errors = errors 4437db96d56Sopenharmony_ci self.bytebuffer = b"" 4447db96d56Sopenharmony_ci self._empty_charbuffer = self.charbuffertype() 4457db96d56Sopenharmony_ci self.charbuffer = self._empty_charbuffer 4467db96d56Sopenharmony_ci self.linebuffer = None 4477db96d56Sopenharmony_ci 4487db96d56Sopenharmony_ci def decode(self, input, errors='strict'): 4497db96d56Sopenharmony_ci raise NotImplementedError 4507db96d56Sopenharmony_ci 4517db96d56Sopenharmony_ci def read(self, size=-1, chars=-1, firstline=False): 4527db96d56Sopenharmony_ci 4537db96d56Sopenharmony_ci """ Decodes data from the stream self.stream and returns the 4547db96d56Sopenharmony_ci resulting object. 4557db96d56Sopenharmony_ci 4567db96d56Sopenharmony_ci chars indicates the number of decoded code points or bytes to 4577db96d56Sopenharmony_ci return. read() will never return more data than requested, 4587db96d56Sopenharmony_ci but it might return less, if there is not enough available. 4597db96d56Sopenharmony_ci 4607db96d56Sopenharmony_ci size indicates the approximate maximum number of decoded 4617db96d56Sopenharmony_ci bytes or code points to read for decoding. The decoder 4627db96d56Sopenharmony_ci can modify this setting as appropriate. The default value 4637db96d56Sopenharmony_ci -1 indicates to read and decode as much as possible. size 4647db96d56Sopenharmony_ci is intended to prevent having to decode huge files in one 4657db96d56Sopenharmony_ci step. 4667db96d56Sopenharmony_ci 4677db96d56Sopenharmony_ci If firstline is true, and a UnicodeDecodeError happens 4687db96d56Sopenharmony_ci after the first line terminator in the input only the first line 4697db96d56Sopenharmony_ci will be returned, the rest of the input will be kept until the 4707db96d56Sopenharmony_ci next call to read(). 
4717db96d56Sopenharmony_ci 4727db96d56Sopenharmony_ci The method should use a greedy read strategy, meaning that 4737db96d56Sopenharmony_ci it should read as much data as is allowed within the 4747db96d56Sopenharmony_ci definition of the encoding and the given size, e.g. if 4757db96d56Sopenharmony_ci optional encoding endings or state markers are available 4767db96d56Sopenharmony_ci on the stream, these should be read too. 4777db96d56Sopenharmony_ci """ 4787db96d56Sopenharmony_ci # If we have lines cached, first merge them back into characters 4797db96d56Sopenharmony_ci if self.linebuffer: 4807db96d56Sopenharmony_ci self.charbuffer = self._empty_charbuffer.join(self.linebuffer) 4817db96d56Sopenharmony_ci self.linebuffer = None 4827db96d56Sopenharmony_ci 4837db96d56Sopenharmony_ci if chars < 0: 4847db96d56Sopenharmony_ci # For compatibility with other read() methods that take a 4857db96d56Sopenharmony_ci # single argument 4867db96d56Sopenharmony_ci chars = size 4877db96d56Sopenharmony_ci 4887db96d56Sopenharmony_ci # read until we get the required number of characters (if available) 4897db96d56Sopenharmony_ci while True: 4907db96d56Sopenharmony_ci # can the request be satisfied from the character buffer? 
4917db96d56Sopenharmony_ci if chars >= 0: 4927db96d56Sopenharmony_ci if len(self.charbuffer) >= chars: 4937db96d56Sopenharmony_ci break 4947db96d56Sopenharmony_ci # we need more data 4957db96d56Sopenharmony_ci if size < 0: 4967db96d56Sopenharmony_ci newdata = self.stream.read() 4977db96d56Sopenharmony_ci else: 4987db96d56Sopenharmony_ci newdata = self.stream.read(size) 4997db96d56Sopenharmony_ci # decode bytes (those remaining from the last call included) 5007db96d56Sopenharmony_ci data = self.bytebuffer + newdata 5017db96d56Sopenharmony_ci if not data: 5027db96d56Sopenharmony_ci break 5037db96d56Sopenharmony_ci try: 5047db96d56Sopenharmony_ci newchars, decodedbytes = self.decode(data, self.errors) 5057db96d56Sopenharmony_ci except UnicodeDecodeError as exc: 5067db96d56Sopenharmony_ci if firstline: 5077db96d56Sopenharmony_ci newchars, decodedbytes = \ 5087db96d56Sopenharmony_ci self.decode(data[:exc.start], self.errors) 5097db96d56Sopenharmony_ci lines = newchars.splitlines(keepends=True) 5107db96d56Sopenharmony_ci if len(lines)<=1: 5117db96d56Sopenharmony_ci raise 5127db96d56Sopenharmony_ci else: 5137db96d56Sopenharmony_ci raise 5147db96d56Sopenharmony_ci # keep undecoded bytes until the next call 5157db96d56Sopenharmony_ci self.bytebuffer = data[decodedbytes:] 5167db96d56Sopenharmony_ci # put new characters in the character buffer 5177db96d56Sopenharmony_ci self.charbuffer += newchars 5187db96d56Sopenharmony_ci # there was no data available 5197db96d56Sopenharmony_ci if not newdata: 5207db96d56Sopenharmony_ci break 5217db96d56Sopenharmony_ci if chars < 0: 5227db96d56Sopenharmony_ci # Return everything we've got 5237db96d56Sopenharmony_ci result = self.charbuffer 5247db96d56Sopenharmony_ci self.charbuffer = self._empty_charbuffer 5257db96d56Sopenharmony_ci else: 5267db96d56Sopenharmony_ci # Return the first chars characters 5277db96d56Sopenharmony_ci result = self.charbuffer[:chars] 5287db96d56Sopenharmony_ci self.charbuffer = self.charbuffer[chars:] 
5297db96d56Sopenharmony_ci return result 5307db96d56Sopenharmony_ci 5317db96d56Sopenharmony_ci def readline(self, size=None, keepends=True): 5327db96d56Sopenharmony_ci 5337db96d56Sopenharmony_ci """ Read one line from the input stream and return the 5347db96d56Sopenharmony_ci decoded data. 5357db96d56Sopenharmony_ci 5367db96d56Sopenharmony_ci size, if given, is passed as size argument to the 5377db96d56Sopenharmony_ci read() method. 5387db96d56Sopenharmony_ci 5397db96d56Sopenharmony_ci """ 5407db96d56Sopenharmony_ci # If we have lines cached from an earlier read, return 5417db96d56Sopenharmony_ci # them unconditionally 5427db96d56Sopenharmony_ci if self.linebuffer: 5437db96d56Sopenharmony_ci line = self.linebuffer[0] 5447db96d56Sopenharmony_ci del self.linebuffer[0] 5457db96d56Sopenharmony_ci if len(self.linebuffer) == 1: 5467db96d56Sopenharmony_ci # revert to charbuffer mode; we might need more data 5477db96d56Sopenharmony_ci # next time 5487db96d56Sopenharmony_ci self.charbuffer = self.linebuffer[0] 5497db96d56Sopenharmony_ci self.linebuffer = None 5507db96d56Sopenharmony_ci if not keepends: 5517db96d56Sopenharmony_ci line = line.splitlines(keepends=False)[0] 5527db96d56Sopenharmony_ci return line 5537db96d56Sopenharmony_ci 5547db96d56Sopenharmony_ci readsize = size or 72 5557db96d56Sopenharmony_ci line = self._empty_charbuffer 5567db96d56Sopenharmony_ci # If size is given, we call read() only once 5577db96d56Sopenharmony_ci while True: 5587db96d56Sopenharmony_ci data = self.read(readsize, firstline=True) 5597db96d56Sopenharmony_ci if data: 5607db96d56Sopenharmony_ci # If we're at a "\r" read one extra character (which might 5617db96d56Sopenharmony_ci # be a "\n") to get a proper line ending. If the stream is 5627db96d56Sopenharmony_ci # temporarily exhausted we return the wrong line ending. 
5637db96d56Sopenharmony_ci if (isinstance(data, str) and data.endswith("\r")) or \ 5647db96d56Sopenharmony_ci (isinstance(data, bytes) and data.endswith(b"\r")): 5657db96d56Sopenharmony_ci data += self.read(size=1, chars=1) 5667db96d56Sopenharmony_ci 5677db96d56Sopenharmony_ci line += data 5687db96d56Sopenharmony_ci lines = line.splitlines(keepends=True) 5697db96d56Sopenharmony_ci if lines: 5707db96d56Sopenharmony_ci if len(lines) > 1: 5717db96d56Sopenharmony_ci # More than one line result; the first line is a full line 5727db96d56Sopenharmony_ci # to return 5737db96d56Sopenharmony_ci line = lines[0] 5747db96d56Sopenharmony_ci del lines[0] 5757db96d56Sopenharmony_ci if len(lines) > 1: 5767db96d56Sopenharmony_ci # cache the remaining lines 5777db96d56Sopenharmony_ci lines[-1] += self.charbuffer 5787db96d56Sopenharmony_ci self.linebuffer = lines 5797db96d56Sopenharmony_ci self.charbuffer = None 5807db96d56Sopenharmony_ci else: 5817db96d56Sopenharmony_ci # only one remaining line, put it back into charbuffer 5827db96d56Sopenharmony_ci self.charbuffer = lines[0] + self.charbuffer 5837db96d56Sopenharmony_ci if not keepends: 5847db96d56Sopenharmony_ci line = line.splitlines(keepends=False)[0] 5857db96d56Sopenharmony_ci break 5867db96d56Sopenharmony_ci line0withend = lines[0] 5877db96d56Sopenharmony_ci line0withoutend = lines[0].splitlines(keepends=False)[0] 5887db96d56Sopenharmony_ci if line0withend != line0withoutend: # We really have a line end 5897db96d56Sopenharmony_ci # Put the rest back together and keep it until the next call 5907db96d56Sopenharmony_ci self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \ 5917db96d56Sopenharmony_ci self.charbuffer 5927db96d56Sopenharmony_ci if keepends: 5937db96d56Sopenharmony_ci line = line0withend 5947db96d56Sopenharmony_ci else: 5957db96d56Sopenharmony_ci line = line0withoutend 5967db96d56Sopenharmony_ci break 5977db96d56Sopenharmony_ci # we didn't get anything or this was our only try 5987db96d56Sopenharmony_ci if not 
data or size is not None: 5997db96d56Sopenharmony_ci if line and not keepends: 6007db96d56Sopenharmony_ci line = line.splitlines(keepends=False)[0] 6017db96d56Sopenharmony_ci break 6027db96d56Sopenharmony_ci if readsize < 8000: 6037db96d56Sopenharmony_ci readsize *= 2 6047db96d56Sopenharmony_ci return line 6057db96d56Sopenharmony_ci 6067db96d56Sopenharmony_ci def readlines(self, sizehint=None, keepends=True): 6077db96d56Sopenharmony_ci 6087db96d56Sopenharmony_ci """ Read all lines available on the input stream 6097db96d56Sopenharmony_ci and return them as a list. 6107db96d56Sopenharmony_ci 6117db96d56Sopenharmony_ci Line breaks are implemented using the codec's decoder 6127db96d56Sopenharmony_ci method and are included in the list entries. 6137db96d56Sopenharmony_ci 6147db96d56Sopenharmony_ci sizehint, if given, is ignored since there is no efficient 6157db96d56Sopenharmony_ci way to finding the true end-of-line. 6167db96d56Sopenharmony_ci 6177db96d56Sopenharmony_ci """ 6187db96d56Sopenharmony_ci data = self.read() 6197db96d56Sopenharmony_ci return data.splitlines(keepends) 6207db96d56Sopenharmony_ci 6217db96d56Sopenharmony_ci def reset(self): 6227db96d56Sopenharmony_ci 6237db96d56Sopenharmony_ci """ Resets the codec buffers used for keeping internal state. 6247db96d56Sopenharmony_ci 6257db96d56Sopenharmony_ci Note that no stream repositioning should take place. 6267db96d56Sopenharmony_ci This method is primarily intended to be able to recover 6277db96d56Sopenharmony_ci from decoding errors. 6287db96d56Sopenharmony_ci 6297db96d56Sopenharmony_ci """ 6307db96d56Sopenharmony_ci self.bytebuffer = b"" 6317db96d56Sopenharmony_ci self.charbuffer = self._empty_charbuffer 6327db96d56Sopenharmony_ci self.linebuffer = None 6337db96d56Sopenharmony_ci 6347db96d56Sopenharmony_ci def seek(self, offset, whence=0): 6357db96d56Sopenharmony_ci """ Set the input stream's current position. 
6367db96d56Sopenharmony_ci 6377db96d56Sopenharmony_ci Resets the codec buffers used for keeping state. 6387db96d56Sopenharmony_ci """ 6397db96d56Sopenharmony_ci self.stream.seek(offset, whence) 6407db96d56Sopenharmony_ci self.reset() 6417db96d56Sopenharmony_ci 6427db96d56Sopenharmony_ci def __next__(self): 6437db96d56Sopenharmony_ci 6447db96d56Sopenharmony_ci """ Return the next decoded line from the input stream.""" 6457db96d56Sopenharmony_ci line = self.readline() 6467db96d56Sopenharmony_ci if line: 6477db96d56Sopenharmony_ci return line 6487db96d56Sopenharmony_ci raise StopIteration 6497db96d56Sopenharmony_ci 6507db96d56Sopenharmony_ci def __iter__(self): 6517db96d56Sopenharmony_ci return self 6527db96d56Sopenharmony_ci 6537db96d56Sopenharmony_ci def __getattr__(self, name, 6547db96d56Sopenharmony_ci getattr=getattr): 6557db96d56Sopenharmony_ci 6567db96d56Sopenharmony_ci """ Inherit all other methods from the underlying stream. 6577db96d56Sopenharmony_ci """ 6587db96d56Sopenharmony_ci return getattr(self.stream, name) 6597db96d56Sopenharmony_ci 6607db96d56Sopenharmony_ci def __enter__(self): 6617db96d56Sopenharmony_ci return self 6627db96d56Sopenharmony_ci 6637db96d56Sopenharmony_ci def __exit__(self, type, value, tb): 6647db96d56Sopenharmony_ci self.stream.close() 6657db96d56Sopenharmony_ci 6667db96d56Sopenharmony_ci### 6677db96d56Sopenharmony_ci 6687db96d56Sopenharmony_ciclass StreamReaderWriter: 6697db96d56Sopenharmony_ci 6707db96d56Sopenharmony_ci """ StreamReaderWriter instances allow wrapping streams which 6717db96d56Sopenharmony_ci work in both read and write modes. 6727db96d56Sopenharmony_ci 6737db96d56Sopenharmony_ci The design is such that one can use the factory functions 6747db96d56Sopenharmony_ci returned by the codec.lookup() function to construct the 6757db96d56Sopenharmony_ci instance. 
6767db96d56Sopenharmony_ci 6777db96d56Sopenharmony_ci """ 6787db96d56Sopenharmony_ci # Optional attributes set by the file wrappers below 6797db96d56Sopenharmony_ci encoding = 'unknown' 6807db96d56Sopenharmony_ci 6817db96d56Sopenharmony_ci def __init__(self, stream, Reader, Writer, errors='strict'): 6827db96d56Sopenharmony_ci 6837db96d56Sopenharmony_ci """ Creates a StreamReaderWriter instance. 6847db96d56Sopenharmony_ci 6857db96d56Sopenharmony_ci stream must be a Stream-like object. 6867db96d56Sopenharmony_ci 6877db96d56Sopenharmony_ci Reader, Writer must be factory functions or classes 6887db96d56Sopenharmony_ci providing the StreamReader, StreamWriter interface resp. 6897db96d56Sopenharmony_ci 6907db96d56Sopenharmony_ci Error handling is done in the same way as defined for the 6917db96d56Sopenharmony_ci StreamWriter/Readers. 6927db96d56Sopenharmony_ci 6937db96d56Sopenharmony_ci """ 6947db96d56Sopenharmony_ci self.stream = stream 6957db96d56Sopenharmony_ci self.reader = Reader(stream, errors) 6967db96d56Sopenharmony_ci self.writer = Writer(stream, errors) 6977db96d56Sopenharmony_ci self.errors = errors 6987db96d56Sopenharmony_ci 6997db96d56Sopenharmony_ci def read(self, size=-1): 7007db96d56Sopenharmony_ci 7017db96d56Sopenharmony_ci return self.reader.read(size) 7027db96d56Sopenharmony_ci 7037db96d56Sopenharmony_ci def readline(self, size=None): 7047db96d56Sopenharmony_ci 7057db96d56Sopenharmony_ci return self.reader.readline(size) 7067db96d56Sopenharmony_ci 7077db96d56Sopenharmony_ci def readlines(self, sizehint=None): 7087db96d56Sopenharmony_ci 7097db96d56Sopenharmony_ci return self.reader.readlines(sizehint) 7107db96d56Sopenharmony_ci 7117db96d56Sopenharmony_ci def __next__(self): 7127db96d56Sopenharmony_ci 7137db96d56Sopenharmony_ci """ Return the next decoded line from the input stream.""" 7147db96d56Sopenharmony_ci return next(self.reader) 7157db96d56Sopenharmony_ci 7167db96d56Sopenharmony_ci def __iter__(self): 7177db96d56Sopenharmony_ci return self 
7187db96d56Sopenharmony_ci 7197db96d56Sopenharmony_ci def write(self, data): 7207db96d56Sopenharmony_ci 7217db96d56Sopenharmony_ci return self.writer.write(data) 7227db96d56Sopenharmony_ci 7237db96d56Sopenharmony_ci def writelines(self, list): 7247db96d56Sopenharmony_ci 7257db96d56Sopenharmony_ci return self.writer.writelines(list) 7267db96d56Sopenharmony_ci 7277db96d56Sopenharmony_ci def reset(self): 7287db96d56Sopenharmony_ci 7297db96d56Sopenharmony_ci self.reader.reset() 7307db96d56Sopenharmony_ci self.writer.reset() 7317db96d56Sopenharmony_ci 7327db96d56Sopenharmony_ci def seek(self, offset, whence=0): 7337db96d56Sopenharmony_ci self.stream.seek(offset, whence) 7347db96d56Sopenharmony_ci self.reader.reset() 7357db96d56Sopenharmony_ci if whence == 0 and offset == 0: 7367db96d56Sopenharmony_ci self.writer.reset() 7377db96d56Sopenharmony_ci 7387db96d56Sopenharmony_ci def __getattr__(self, name, 7397db96d56Sopenharmony_ci getattr=getattr): 7407db96d56Sopenharmony_ci 7417db96d56Sopenharmony_ci """ Inherit all other methods from the underlying stream. 7427db96d56Sopenharmony_ci """ 7437db96d56Sopenharmony_ci return getattr(self.stream, name) 7447db96d56Sopenharmony_ci 7457db96d56Sopenharmony_ci # these are needed to make "with StreamReaderWriter(...)" work properly 7467db96d56Sopenharmony_ci 7477db96d56Sopenharmony_ci def __enter__(self): 7487db96d56Sopenharmony_ci return self 7497db96d56Sopenharmony_ci 7507db96d56Sopenharmony_ci def __exit__(self, type, value, tb): 7517db96d56Sopenharmony_ci self.stream.close() 7527db96d56Sopenharmony_ci 7537db96d56Sopenharmony_ci### 7547db96d56Sopenharmony_ci 7557db96d56Sopenharmony_ciclass StreamRecoder: 7567db96d56Sopenharmony_ci 7577db96d56Sopenharmony_ci """ StreamRecoder instances translate data from one encoding to another. 7587db96d56Sopenharmony_ci 7597db96d56Sopenharmony_ci They use the complete set of APIs returned by the 7607db96d56Sopenharmony_ci codecs.lookup() function to implement their task. 
7617db96d56Sopenharmony_ci 7627db96d56Sopenharmony_ci Data written to the StreamRecoder is first decoded into an 7637db96d56Sopenharmony_ci intermediate format (depending on the "decode" codec) and then 7647db96d56Sopenharmony_ci written to the underlying stream using an instance of the provided 7657db96d56Sopenharmony_ci Writer class. 7667db96d56Sopenharmony_ci 7677db96d56Sopenharmony_ci In the other direction, data is read from the underlying stream using 7687db96d56Sopenharmony_ci a Reader instance and then encoded and returned to the caller. 7697db96d56Sopenharmony_ci 7707db96d56Sopenharmony_ci """ 7717db96d56Sopenharmony_ci # Optional attributes set by the file wrappers below 7727db96d56Sopenharmony_ci data_encoding = 'unknown' 7737db96d56Sopenharmony_ci file_encoding = 'unknown' 7747db96d56Sopenharmony_ci 7757db96d56Sopenharmony_ci def __init__(self, stream, encode, decode, Reader, Writer, 7767db96d56Sopenharmony_ci errors='strict'): 7777db96d56Sopenharmony_ci 7787db96d56Sopenharmony_ci """ Creates a StreamRecoder instance which implements a two-way 7797db96d56Sopenharmony_ci conversion: encode and decode work on the frontend (the 7807db96d56Sopenharmony_ci data visible to .read() and .write()) while Reader and Writer 7817db96d56Sopenharmony_ci work on the backend (the data in stream). 7827db96d56Sopenharmony_ci 7837db96d56Sopenharmony_ci You can use these objects to do transparent 7847db96d56Sopenharmony_ci transcodings from e.g. latin-1 to utf-8 and back. 7857db96d56Sopenharmony_ci 7867db96d56Sopenharmony_ci stream must be a file-like object. 7877db96d56Sopenharmony_ci 7887db96d56Sopenharmony_ci encode and decode must adhere to the Codec interface; Reader and 7897db96d56Sopenharmony_ci Writer must be factory functions or classes providing the 7907db96d56Sopenharmony_ci StreamReader and StreamWriter interfaces resp. 
7917db96d56Sopenharmony_ci 7927db96d56Sopenharmony_ci Error handling is done in the same way as defined for the 7937db96d56Sopenharmony_ci StreamWriter/Readers. 7947db96d56Sopenharmony_ci 7957db96d56Sopenharmony_ci """ 7967db96d56Sopenharmony_ci self.stream = stream 7977db96d56Sopenharmony_ci self.encode = encode 7987db96d56Sopenharmony_ci self.decode = decode 7997db96d56Sopenharmony_ci self.reader = Reader(stream, errors) 8007db96d56Sopenharmony_ci self.writer = Writer(stream, errors) 8017db96d56Sopenharmony_ci self.errors = errors 8027db96d56Sopenharmony_ci 8037db96d56Sopenharmony_ci def read(self, size=-1): 8047db96d56Sopenharmony_ci 8057db96d56Sopenharmony_ci data = self.reader.read(size) 8067db96d56Sopenharmony_ci data, bytesencoded = self.encode(data, self.errors) 8077db96d56Sopenharmony_ci return data 8087db96d56Sopenharmony_ci 8097db96d56Sopenharmony_ci def readline(self, size=None): 8107db96d56Sopenharmony_ci 8117db96d56Sopenharmony_ci if size is None: 8127db96d56Sopenharmony_ci data = self.reader.readline() 8137db96d56Sopenharmony_ci else: 8147db96d56Sopenharmony_ci data = self.reader.readline(size) 8157db96d56Sopenharmony_ci data, bytesencoded = self.encode(data, self.errors) 8167db96d56Sopenharmony_ci return data 8177db96d56Sopenharmony_ci 8187db96d56Sopenharmony_ci def readlines(self, sizehint=None): 8197db96d56Sopenharmony_ci 8207db96d56Sopenharmony_ci data = self.reader.read() 8217db96d56Sopenharmony_ci data, bytesencoded = self.encode(data, self.errors) 8227db96d56Sopenharmony_ci return data.splitlines(keepends=True) 8237db96d56Sopenharmony_ci 8247db96d56Sopenharmony_ci def __next__(self): 8257db96d56Sopenharmony_ci 8267db96d56Sopenharmony_ci """ Return the next decoded line from the input stream.""" 8277db96d56Sopenharmony_ci data = next(self.reader) 8287db96d56Sopenharmony_ci data, bytesencoded = self.encode(data, self.errors) 8297db96d56Sopenharmony_ci return data 8307db96d56Sopenharmony_ci 8317db96d56Sopenharmony_ci def __iter__(self): 
8327db96d56Sopenharmony_ci return self 8337db96d56Sopenharmony_ci 8347db96d56Sopenharmony_ci def write(self, data): 8357db96d56Sopenharmony_ci 8367db96d56Sopenharmony_ci data, bytesdecoded = self.decode(data, self.errors) 8377db96d56Sopenharmony_ci return self.writer.write(data) 8387db96d56Sopenharmony_ci 8397db96d56Sopenharmony_ci def writelines(self, list): 8407db96d56Sopenharmony_ci 8417db96d56Sopenharmony_ci data = b''.join(list) 8427db96d56Sopenharmony_ci data, bytesdecoded = self.decode(data, self.errors) 8437db96d56Sopenharmony_ci return self.writer.write(data) 8447db96d56Sopenharmony_ci 8457db96d56Sopenharmony_ci def reset(self): 8467db96d56Sopenharmony_ci 8477db96d56Sopenharmony_ci self.reader.reset() 8487db96d56Sopenharmony_ci self.writer.reset() 8497db96d56Sopenharmony_ci 8507db96d56Sopenharmony_ci def seek(self, offset, whence=0): 8517db96d56Sopenharmony_ci # Seeks must be propagated to both the readers and writers 8527db96d56Sopenharmony_ci # as they might need to reset their internal buffers. 8537db96d56Sopenharmony_ci self.reader.seek(offset, whence) 8547db96d56Sopenharmony_ci self.writer.seek(offset, whence) 8557db96d56Sopenharmony_ci 8567db96d56Sopenharmony_ci def __getattr__(self, name, 8577db96d56Sopenharmony_ci getattr=getattr): 8587db96d56Sopenharmony_ci 8597db96d56Sopenharmony_ci """ Inherit all other methods from the underlying stream. 
8607db96d56Sopenharmony_ci """ 8617db96d56Sopenharmony_ci return getattr(self.stream, name) 8627db96d56Sopenharmony_ci 8637db96d56Sopenharmony_ci def __enter__(self): 8647db96d56Sopenharmony_ci return self 8657db96d56Sopenharmony_ci 8667db96d56Sopenharmony_ci def __exit__(self, type, value, tb): 8677db96d56Sopenharmony_ci self.stream.close() 8687db96d56Sopenharmony_ci 8697db96d56Sopenharmony_ci### Shortcuts 8707db96d56Sopenharmony_ci 8717db96d56Sopenharmony_cidef open(filename, mode='r', encoding=None, errors='strict', buffering=-1): 8727db96d56Sopenharmony_ci 8737db96d56Sopenharmony_ci """ Open an encoded file using the given mode and return 8747db96d56Sopenharmony_ci a wrapped version providing transparent encoding/decoding. 8757db96d56Sopenharmony_ci 8767db96d56Sopenharmony_ci Note: The wrapped version will only accept the object format 8777db96d56Sopenharmony_ci defined by the codecs, i.e. Unicode objects for most builtin 8787db96d56Sopenharmony_ci codecs. Output is also codec dependent and will usually be 8797db96d56Sopenharmony_ci Unicode as well. 8807db96d56Sopenharmony_ci 8817db96d56Sopenharmony_ci If encoding is not None, then the 8827db96d56Sopenharmony_ci underlying encoded files are always opened in binary mode. 8837db96d56Sopenharmony_ci The default file mode is 'r', meaning to open the file in read mode. 8847db96d56Sopenharmony_ci 8857db96d56Sopenharmony_ci encoding specifies the encoding which is to be used for the 8867db96d56Sopenharmony_ci file. 8877db96d56Sopenharmony_ci 8887db96d56Sopenharmony_ci errors may be given to define the error handling. It defaults 8897db96d56Sopenharmony_ci to 'strict' which causes ValueErrors to be raised in case an 8907db96d56Sopenharmony_ci encoding error occurs. 8917db96d56Sopenharmony_ci 8927db96d56Sopenharmony_ci buffering has the same meaning as for the builtin open() API. 8937db96d56Sopenharmony_ci It defaults to -1 which means that the default buffer size will 8947db96d56Sopenharmony_ci be used. 
8957db96d56Sopenharmony_ci 8967db96d56Sopenharmony_ci The returned wrapped file object provides an extra attribute 8977db96d56Sopenharmony_ci .encoding which allows querying the used encoding. This 8987db96d56Sopenharmony_ci attribute is only available if an encoding was specified as 8997db96d56Sopenharmony_ci parameter. 9007db96d56Sopenharmony_ci 9017db96d56Sopenharmony_ci """ 9027db96d56Sopenharmony_ci if encoding is not None and \ 9037db96d56Sopenharmony_ci 'b' not in mode: 9047db96d56Sopenharmony_ci # Force opening of the file in binary mode 9057db96d56Sopenharmony_ci mode = mode + 'b' 9067db96d56Sopenharmony_ci file = builtins.open(filename, mode, buffering) 9077db96d56Sopenharmony_ci if encoding is None: 9087db96d56Sopenharmony_ci return file 9097db96d56Sopenharmony_ci 9107db96d56Sopenharmony_ci try: 9117db96d56Sopenharmony_ci info = lookup(encoding) 9127db96d56Sopenharmony_ci srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 9137db96d56Sopenharmony_ci # Add attributes to simplify introspection 9147db96d56Sopenharmony_ci srw.encoding = encoding 9157db96d56Sopenharmony_ci return srw 9167db96d56Sopenharmony_ci except: 9177db96d56Sopenharmony_ci file.close() 9187db96d56Sopenharmony_ci raise 9197db96d56Sopenharmony_ci 9207db96d56Sopenharmony_cidef EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 9217db96d56Sopenharmony_ci 9227db96d56Sopenharmony_ci """ Return a wrapped version of file which provides transparent 9237db96d56Sopenharmony_ci encoding translation. 9247db96d56Sopenharmony_ci 9257db96d56Sopenharmony_ci Data written to the wrapped file is decoded according 9267db96d56Sopenharmony_ci to the given data_encoding and then encoded to the underlying 9277db96d56Sopenharmony_ci file using file_encoding. The intermediate data type 9287db96d56Sopenharmony_ci will usually be Unicode but depends on the specified codecs. 
9297db96d56Sopenharmony_ci 9307db96d56Sopenharmony_ci Bytes read from the file are decoded using file_encoding and then 9317db96d56Sopenharmony_ci passed back to the caller encoded using data_encoding. 9327db96d56Sopenharmony_ci 9337db96d56Sopenharmony_ci If file_encoding is not given, it defaults to data_encoding. 9347db96d56Sopenharmony_ci 9357db96d56Sopenharmony_ci errors may be given to define the error handling. It defaults 9367db96d56Sopenharmony_ci to 'strict' which causes ValueErrors to be raised in case an 9377db96d56Sopenharmony_ci encoding error occurs. 9387db96d56Sopenharmony_ci 9397db96d56Sopenharmony_ci The returned wrapped file object provides two extra attributes 9407db96d56Sopenharmony_ci .data_encoding and .file_encoding which reflect the given 9417db96d56Sopenharmony_ci parameters of the same name. The attributes can be used for 9427db96d56Sopenharmony_ci introspection by Python programs. 9437db96d56Sopenharmony_ci 9447db96d56Sopenharmony_ci """ 9457db96d56Sopenharmony_ci if file_encoding is None: 9467db96d56Sopenharmony_ci file_encoding = data_encoding 9477db96d56Sopenharmony_ci data_info = lookup(data_encoding) 9487db96d56Sopenharmony_ci file_info = lookup(file_encoding) 9497db96d56Sopenharmony_ci sr = StreamRecoder(file, data_info.encode, data_info.decode, 9507db96d56Sopenharmony_ci file_info.streamreader, file_info.streamwriter, errors) 9517db96d56Sopenharmony_ci # Add attributes to simplify introspection 9527db96d56Sopenharmony_ci sr.data_encoding = data_encoding 9537db96d56Sopenharmony_ci sr.file_encoding = file_encoding 9547db96d56Sopenharmony_ci return sr 9557db96d56Sopenharmony_ci 9567db96d56Sopenharmony_ci### Helpers for codec lookup 9577db96d56Sopenharmony_ci 9587db96d56Sopenharmony_cidef getencoder(encoding): 9597db96d56Sopenharmony_ci 9607db96d56Sopenharmony_ci """ Lookup up the codec for the given encoding and return 9617db96d56Sopenharmony_ci its encoder function. 
9627db96d56Sopenharmony_ci 9637db96d56Sopenharmony_ci Raises a LookupError in case the encoding cannot be found. 9647db96d56Sopenharmony_ci 9657db96d56Sopenharmony_ci """ 9667db96d56Sopenharmony_ci return lookup(encoding).encode 9677db96d56Sopenharmony_ci 9687db96d56Sopenharmony_cidef getdecoder(encoding): 9697db96d56Sopenharmony_ci 9707db96d56Sopenharmony_ci """ Lookup up the codec for the given encoding and return 9717db96d56Sopenharmony_ci its decoder function. 9727db96d56Sopenharmony_ci 9737db96d56Sopenharmony_ci Raises a LookupError in case the encoding cannot be found. 9747db96d56Sopenharmony_ci 9757db96d56Sopenharmony_ci """ 9767db96d56Sopenharmony_ci return lookup(encoding).decode 9777db96d56Sopenharmony_ci 9787db96d56Sopenharmony_cidef getincrementalencoder(encoding): 9797db96d56Sopenharmony_ci 9807db96d56Sopenharmony_ci """ Lookup up the codec for the given encoding and return 9817db96d56Sopenharmony_ci its IncrementalEncoder class or factory function. 9827db96d56Sopenharmony_ci 9837db96d56Sopenharmony_ci Raises a LookupError in case the encoding cannot be found 9847db96d56Sopenharmony_ci or the codecs doesn't provide an incremental encoder. 9857db96d56Sopenharmony_ci 9867db96d56Sopenharmony_ci """ 9877db96d56Sopenharmony_ci encoder = lookup(encoding).incrementalencoder 9887db96d56Sopenharmony_ci if encoder is None: 9897db96d56Sopenharmony_ci raise LookupError(encoding) 9907db96d56Sopenharmony_ci return encoder 9917db96d56Sopenharmony_ci 9927db96d56Sopenharmony_cidef getincrementaldecoder(encoding): 9937db96d56Sopenharmony_ci 9947db96d56Sopenharmony_ci """ Lookup up the codec for the given encoding and return 9957db96d56Sopenharmony_ci its IncrementalDecoder class or factory function. 9967db96d56Sopenharmony_ci 9977db96d56Sopenharmony_ci Raises a LookupError in case the encoding cannot be found 9987db96d56Sopenharmony_ci or the codecs doesn't provide an incremental decoder. 
9997db96d56Sopenharmony_ci 10007db96d56Sopenharmony_ci """ 10017db96d56Sopenharmony_ci decoder = lookup(encoding).incrementaldecoder 10027db96d56Sopenharmony_ci if decoder is None: 10037db96d56Sopenharmony_ci raise LookupError(encoding) 10047db96d56Sopenharmony_ci return decoder 10057db96d56Sopenharmony_ci 10067db96d56Sopenharmony_cidef getreader(encoding): 10077db96d56Sopenharmony_ci 10087db96d56Sopenharmony_ci """ Lookup up the codec for the given encoding and return 10097db96d56Sopenharmony_ci its StreamReader class or factory function. 10107db96d56Sopenharmony_ci 10117db96d56Sopenharmony_ci Raises a LookupError in case the encoding cannot be found. 10127db96d56Sopenharmony_ci 10137db96d56Sopenharmony_ci """ 10147db96d56Sopenharmony_ci return lookup(encoding).streamreader 10157db96d56Sopenharmony_ci 10167db96d56Sopenharmony_cidef getwriter(encoding): 10177db96d56Sopenharmony_ci 10187db96d56Sopenharmony_ci """ Lookup up the codec for the given encoding and return 10197db96d56Sopenharmony_ci its StreamWriter class or factory function. 10207db96d56Sopenharmony_ci 10217db96d56Sopenharmony_ci Raises a LookupError in case the encoding cannot be found. 10227db96d56Sopenharmony_ci 10237db96d56Sopenharmony_ci """ 10247db96d56Sopenharmony_ci return lookup(encoding).streamwriter 10257db96d56Sopenharmony_ci 10267db96d56Sopenharmony_cidef iterencode(iterator, encoding, errors='strict', **kwargs): 10277db96d56Sopenharmony_ci """ 10287db96d56Sopenharmony_ci Encoding iterator. 10297db96d56Sopenharmony_ci 10307db96d56Sopenharmony_ci Encodes the input strings from the iterator using an IncrementalEncoder. 10317db96d56Sopenharmony_ci 10327db96d56Sopenharmony_ci errors and kwargs are passed through to the IncrementalEncoder 10337db96d56Sopenharmony_ci constructor. 
10347db96d56Sopenharmony_ci """ 10357db96d56Sopenharmony_ci encoder = getincrementalencoder(encoding)(errors, **kwargs) 10367db96d56Sopenharmony_ci for input in iterator: 10377db96d56Sopenharmony_ci output = encoder.encode(input) 10387db96d56Sopenharmony_ci if output: 10397db96d56Sopenharmony_ci yield output 10407db96d56Sopenharmony_ci output = encoder.encode("", True) 10417db96d56Sopenharmony_ci if output: 10427db96d56Sopenharmony_ci yield output 10437db96d56Sopenharmony_ci 10447db96d56Sopenharmony_cidef iterdecode(iterator, encoding, errors='strict', **kwargs): 10457db96d56Sopenharmony_ci """ 10467db96d56Sopenharmony_ci Decoding iterator. 10477db96d56Sopenharmony_ci 10487db96d56Sopenharmony_ci Decodes the input strings from the iterator using an IncrementalDecoder. 10497db96d56Sopenharmony_ci 10507db96d56Sopenharmony_ci errors and kwargs are passed through to the IncrementalDecoder 10517db96d56Sopenharmony_ci constructor. 10527db96d56Sopenharmony_ci """ 10537db96d56Sopenharmony_ci decoder = getincrementaldecoder(encoding)(errors, **kwargs) 10547db96d56Sopenharmony_ci for input in iterator: 10557db96d56Sopenharmony_ci output = decoder.decode(input) 10567db96d56Sopenharmony_ci if output: 10577db96d56Sopenharmony_ci yield output 10587db96d56Sopenharmony_ci output = decoder.decode(b"", True) 10597db96d56Sopenharmony_ci if output: 10607db96d56Sopenharmony_ci yield output 10617db96d56Sopenharmony_ci 10627db96d56Sopenharmony_ci### Helpers for charmap-based codecs 10637db96d56Sopenharmony_ci 10647db96d56Sopenharmony_cidef make_identity_dict(rng): 10657db96d56Sopenharmony_ci 10667db96d56Sopenharmony_ci """ make_identity_dict(rng) -> dict 10677db96d56Sopenharmony_ci 10687db96d56Sopenharmony_ci Return a dictionary where elements of the rng sequence are 10697db96d56Sopenharmony_ci mapped to themselves. 
10707db96d56Sopenharmony_ci 10717db96d56Sopenharmony_ci """ 10727db96d56Sopenharmony_ci return {i:i for i in rng} 10737db96d56Sopenharmony_ci 10747db96d56Sopenharmony_cidef make_encoding_map(decoding_map): 10757db96d56Sopenharmony_ci 10767db96d56Sopenharmony_ci """ Creates an encoding map from a decoding map. 10777db96d56Sopenharmony_ci 10787db96d56Sopenharmony_ci If a target mapping in the decoding map occurs multiple 10797db96d56Sopenharmony_ci times, then that target is mapped to None (undefined mapping), 10807db96d56Sopenharmony_ci causing an exception when encountered by the charmap codec 10817db96d56Sopenharmony_ci during translation. 10827db96d56Sopenharmony_ci 10837db96d56Sopenharmony_ci One example where this happens is cp875.py which decodes 10847db96d56Sopenharmony_ci multiple character to \\u001a. 10857db96d56Sopenharmony_ci 10867db96d56Sopenharmony_ci """ 10877db96d56Sopenharmony_ci m = {} 10887db96d56Sopenharmony_ci for k,v in decoding_map.items(): 10897db96d56Sopenharmony_ci if not v in m: 10907db96d56Sopenharmony_ci m[v] = k 10917db96d56Sopenharmony_ci else: 10927db96d56Sopenharmony_ci m[v] = None 10937db96d56Sopenharmony_ci return m 10947db96d56Sopenharmony_ci 10957db96d56Sopenharmony_ci### error handlers 10967db96d56Sopenharmony_ci 10977db96d56Sopenharmony_citry: 10987db96d56Sopenharmony_ci strict_errors = lookup_error("strict") 10997db96d56Sopenharmony_ci ignore_errors = lookup_error("ignore") 11007db96d56Sopenharmony_ci replace_errors = lookup_error("replace") 11017db96d56Sopenharmony_ci xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 11027db96d56Sopenharmony_ci backslashreplace_errors = lookup_error("backslashreplace") 11037db96d56Sopenharmony_ci namereplace_errors = lookup_error("namereplace") 11047db96d56Sopenharmony_ciexcept LookupError: 11057db96d56Sopenharmony_ci # In --disable-unicode builds, these error handler are missing 11067db96d56Sopenharmony_ci strict_errors = None 11077db96d56Sopenharmony_ci ignore_errors = None 
11087db96d56Sopenharmony_ci replace_errors = None 11097db96d56Sopenharmony_ci xmlcharrefreplace_errors = None 11107db96d56Sopenharmony_ci backslashreplace_errors = None 11117db96d56Sopenharmony_ci namereplace_errors = None 11127db96d56Sopenharmony_ci 11137db96d56Sopenharmony_ci# Tell modulefinder that using codecs probably needs the encodings 11147db96d56Sopenharmony_ci# package 11157db96d56Sopenharmony_ci_false = 0 11167db96d56Sopenharmony_ciif _false: 11177db96d56Sopenharmony_ci import encodings 11187db96d56Sopenharmony_ci 11197db96d56Sopenharmony_ci### Tests 11207db96d56Sopenharmony_ci 11217db96d56Sopenharmony_ciif __name__ == '__main__': 11227db96d56Sopenharmony_ci 11237db96d56Sopenharmony_ci # Make stdout translate Latin-1 output into UTF-8 output 11247db96d56Sopenharmony_ci sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 11257db96d56Sopenharmony_ci 11267db96d56Sopenharmony_ci # Have stdin translate Latin-1 input into UTF-8 input 11277db96d56Sopenharmony_ci sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1128