"""Routines for manipulating RFC 2047 encoded words.

This is currently a package-private API, but will be considered for promotion
to a public API if there is demand.

"""

# An encoded word looks like this:
#
#        =?charset[*lang]?cte?encoded_string?=
#
# for more information about charset see the charset module.  Here it is one
# of the preferred MIME charset names (hopefully; you never know when parsing).
# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case).  In
# theory other letters could be used for other encodings, but in practice this
# (almost?) never happens.  There could be a public API for adding entries
# to the CTE tables, but YAGNI for now.  'q' is Quoted Printable, 'b' is
# Base64.  The meaning of encoded_string should be obvious.  'lang' is optional
# as indicated by the brackets (they are not part of the syntax) but is almost
# never encountered in practice.
#
# The general interface for a CTE decoder is that it takes the encoded_string
# as its argument, and returns a tuple (cte_decoded_string, defects).  The
# cte_decoded_string is the original binary that was encoded using the
# specified cte.  'defects' is a list of MessageDefect instances indicating any
# problems encountered during conversion.  'charset' and 'lang' are the
# corresponding strings extracted from the EW, case preserved.
#
# The general interface for a CTE encoder is that it takes a binary sequence
# as input and returns the cte_encoded_string, which is an ascii-only string.
#
# Each encoder must also supply a length function that takes the binary
# sequence as its argument and returns the length of the resulting encoded
# string.
#
# The main API functions for the module are decode, which calls the decoder
# referenced by the cte specifier, and encode, which adds the appropriate
# RFC 2047 "chrome" to the encoded string, and can optionally automatically
# select the shortest possible encoding.  See their docstrings below for
# details.

import re
import base64
import binascii
import functools
from string import ascii_letters, digits
from email import errors

__all__ = ['decode_q',
           'encode_q',
           'decode_b',
           'encode_b',
           'len_q',
           'len_b',
           'decode',
           'encode',
           ]

#
# Quoted Printable
#

# regex based decoder: replaces every '=XX' hex escape with the byte it names.
_q_byte_subber = functools.partial(re.compile(br'=([a-fA-F0-9]{2})').sub,
        lambda m: bytes.fromhex(m.group(1).decode()))


def decode_q(encoded):
    """Decode the 'q' (quoted-printable) payload of an encoded word.

    'encoded' is a bytes object.  Underscores are turned into spaces and
    every '=XX' hex escape into the corresponding byte; anything that is not
    a well formed escape is passed through unchanged.

    Returns a (decoded_bytes, defects) tuple.  This decoder never reports
    defects, so the list is always empty.
    """
    encoded = encoded.replace(b'_', b' ')
    return _q_byte_subber(encoded), []


# dict mapping bytes to their encoded form
class _QByteMap(dict):
    """Lazily populated map from a byte value (int) to its 'q' encoding."""

    # Bytes that may appear literally inside a 'q' encoded word.
    safe = b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii')

    def __missing__(self, key):
        # Compute the encoding on first use and cache it in the dict so that
        # subsequent lookups of the same byte are plain dict hits.
        if key in self.safe:
            self[key] = chr(key)
        else:
            self[key] = "={:02X}".format(key)
        return self[key]


_q_byte_map = _QByteMap()

# In headers spaces are mapped to '_'.
_q_byte_map[ord(' ')] = '_'


def encode_q(bstring):
    """Return the 'q' encoding of the bytes 'bstring' as an ascii-only str."""
    return ''.join(_q_byte_map[x] for x in bstring)


def len_q(bstring):
    """Return the length encode_q(bstring) would have, without building it."""
    return sum(len(_q_byte_map[x]) for x in bstring)


#
# Base64
#

def decode_b(encoded):
    """Decode the 'b' (base64) payload of an encoded word.

    'encoded' is a bytes object.  Returns a (decoded_bytes, defects) tuple.
    Progressively more lenient decodings are attempted, and the defects list
    records what had to be tolerated (bad padding and/or characters outside
    the base64 alphabet).  If nothing works the input is returned unchanged
    together with an InvalidBase64LengthDefect.
    """
    # First try decoding with validate=True, fixing the padding if needed.
    # This will succeed only if encoded includes no invalid characters.
    pad_err = len(encoded) % 4
    missing_padding = b'==='[:4-pad_err] if pad_err else b''
    try:
        return (
            base64.b64decode(encoded + missing_padding, validate=True),
            [errors.InvalidBase64PaddingDefect()] if pad_err else [],
        )
    except binascii.Error:
        # Since we had correct padding, this is likely an invalid char error.
        #
        # The non-alphabet characters are ignored as far as padding
        # goes, but we don't know how many there are.  So try without adding
        # padding to see if it works.
        try:
            return (
                base64.b64decode(encoded, validate=False),
                [errors.InvalidBase64CharactersDefect()],
            )
        except binascii.Error:
            # Add as much padding as could possibly be necessary (extra padding
            # is ignored).
            try:
                return (
                    base64.b64decode(encoded + b'==', validate=False),
                    [errors.InvalidBase64CharactersDefect(),
                     errors.InvalidBase64PaddingDefect()],
                )
            except binascii.Error:
                # This only happens when the encoded string's length is 1 more
                # than a multiple of 4, which is invalid.
                #
                # bpo-27397: Just return the encoded string since there's no
                # way to decode.
                return encoded, [errors.InvalidBase64LengthDefect()]


def encode_b(bstring):
    """Return the base64 encoding of the bytes 'bstring' as an ascii str."""
    return base64.b64encode(bstring).decode('ascii')


def len_b(bstring):
    """Return the length encode_b(bstring) would have, without building it."""
    groups_of_3, leftover = divmod(len(bstring), 3)
    # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in.
    return groups_of_3 * 4 + (4 if leftover else 0)


_cte_decoders = {
    'q': decode_q,
    'b': decode_b,
    }


def decode(ew):
    """Decode encoded word and return (string, charset, lang, defects) tuple.

    An RFC 2047/2231 encoded word has the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' may be omitted but the other parts may not be.

    This function expects exactly such a string (that is, it does not check the
    syntax and may raise errors if the string is not well formed), and returns
    the encoded_string decoded first from its Content Transfer Encoding and
    then from the resulting bytes into unicode using the specified charset.  If
    the cte-decoded string does not successfully decode using the specified
    character set, a defect is added to the defects list and the undecodable
    octets are preserved as lone surrogates (the 'surrogateescape' error
    handler).  If the charset itself is not known, the bytes are decoded as
    ascii with 'surrogateescape' and, unless the charset is 'unknown-8bit'
    (compared case-insensitively), a CharsetError defect is added.

    The specified charset and language are returned.  The default for language,
    which is rarely if ever encountered, is the empty string.

    """
    _, charset, cte, cte_string, _ = ew.split('?')
    charset, _, lang = charset.partition('*')
    cte = cte.lower()
    # Recover the original bytes and do CTE decoding.
    bstring = cte_string.encode('ascii', 'surrogateescape')
    bstring, defects = _cte_decoders[cte](bstring)
    # Turn the CTE decoded bytes into unicode.
    try:
        string = bstring.decode(charset)
    except UnicodeDecodeError:
        defects.append(errors.UndecodableBytesDefect("Encoded word "
            f"contains bytes not decodable using {charset!r} charset"))
        string = bstring.decode(charset, 'surrogateescape')
    except (LookupError, UnicodeEncodeError):
        string = bstring.decode('ascii', 'surrogateescape')
        if charset.lower() != 'unknown-8bit':
            defects.append(errors.CharsetError(f"Unknown charset {charset!r} "
                f"in encoded word; decoded as unknown bytes"))
    return string, charset, lang, defects


_cte_encoders = {
    'q': encode_q,
    'b': encode_b,
    }

_cte_encode_length = {
    'q': len_q,
    'b': len_b,
    }


def encode(string, charset='utf-8', encoding=None, lang=''):
    """Encode string using the CTE encoding that produces the shorter result.

    Produces an RFC 2047/2231 encoded word of the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' is omitted unless the 'lang' parameter is given a value.
    Optional argument charset (defaults to utf-8) specifies the charset to use
    to encode the string to binary before CTE encoding it.  The pseudo charset
    'unknown-8bit' (matched case-insensitively, as in decode) means the string
    holds surrogate-escaped bytes, which are recovered as-is.  Optional
    argument 'encoding' is the cte specifier for the encoding that should be
    used ('q' or 'b'); if it is None (the default) the encoding which produces
    the shortest encoded sequence is used, except that 'q' is preferred if it
    is up to five characters longer.  Optional argument 'lang' (default '')
    gives the RFC 2231 language string to specify in the encoded word.

    """
    # Compare case-insensitively so that e.g. 'Unknown-8bit', which decode()
    # accepts, does not fall through to str.encode() and raise LookupError.
    if charset.lower() == 'unknown-8bit':
        bstring = string.encode('ascii', 'surrogateescape')
    else:
        bstring = string.encode(charset)
    if encoding is None:
        qlen = _cte_encode_length['q'](bstring)
        blen = _cte_encode_length['b'](bstring)
        # Bias toward q.  5 is arbitrary.
        encoding = 'q' if qlen - blen < 5 else 'b'
    encoded = _cte_encoders[encoding](bstring)
    if lang:
        lang = '*' + lang
    return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded)