17db96d56Sopenharmony_ci""" Routines for manipulating RFC2047 encoded words.
27db96d56Sopenharmony_ci
37db96d56Sopenharmony_ciThis is currently a package-private API, but will be considered for promotion
47db96d56Sopenharmony_cito a public API if there is demand.
57db96d56Sopenharmony_ci
67db96d56Sopenharmony_ci"""
77db96d56Sopenharmony_ci
# An encoded word looks like this:
97db96d56Sopenharmony_ci#
107db96d56Sopenharmony_ci#        =?charset[*lang]?cte?encoded_string?=
117db96d56Sopenharmony_ci#
# For more information about charset see the charset module.  Here it is one
137db96d56Sopenharmony_ci# of the preferred MIME charset names (hopefully; you never know when parsing).
147db96d56Sopenharmony_ci# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case).  In
157db96d56Sopenharmony_ci# theory other letters could be used for other encodings, but in practice this
167db96d56Sopenharmony_ci# (almost?) never happens.  There could be a public API for adding entries
177db96d56Sopenharmony_ci# to the CTE tables, but YAGNI for now.  'q' is Quoted Printable, 'b' is
187db96d56Sopenharmony_ci# Base64.  The meaning of encoded_string should be obvious.  'lang' is optional
197db96d56Sopenharmony_ci# as indicated by the brackets (they are not part of the syntax) but is almost
207db96d56Sopenharmony_ci# never encountered in practice.
217db96d56Sopenharmony_ci#
227db96d56Sopenharmony_ci# The general interface for a CTE decoder is that it takes the encoded_string
237db96d56Sopenharmony_ci# as its argument, and returns a tuple (cte_decoded_string, defects).  The
247db96d56Sopenharmony_ci# cte_decoded_string is the original binary that was encoded using the
257db96d56Sopenharmony_ci# specified cte.  'defects' is a list of MessageDefect instances indicating any
267db96d56Sopenharmony_ci# problems encountered during conversion.  'charset' and 'lang' are the
277db96d56Sopenharmony_ci# corresponding strings extracted from the EW, case preserved.
287db96d56Sopenharmony_ci#
297db96d56Sopenharmony_ci# The general interface for a CTE encoder is that it takes a binary sequence
307db96d56Sopenharmony_ci# as input and returns the cte_encoded_string, which is an ascii-only string.
317db96d56Sopenharmony_ci#
# Each encoder must also supply a length function that takes the binary
337db96d56Sopenharmony_ci# sequence as its argument and returns the length of the resulting encoded
347db96d56Sopenharmony_ci# string.
357db96d56Sopenharmony_ci#
367db96d56Sopenharmony_ci# The main API functions for the module are decode, which calls the decoder
377db96d56Sopenharmony_ci# referenced by the cte specifier, and encode, which adds the appropriate
387db96d56Sopenharmony_ci# RFC 2047 "chrome" to the encoded string, and can optionally automatically
397db96d56Sopenharmony_ci# select the shortest possible encoding.  See their docstrings below for
407db96d56Sopenharmony_ci# details.
417db96d56Sopenharmony_ci
427db96d56Sopenharmony_ciimport re
437db96d56Sopenharmony_ciimport base64
447db96d56Sopenharmony_ciimport binascii
457db96d56Sopenharmony_ciimport functools
467db96d56Sopenharmony_cifrom string import ascii_letters, digits
477db96d56Sopenharmony_cifrom email import errors
487db96d56Sopenharmony_ci
497db96d56Sopenharmony_ci__all__ = ['decode_q',
507db96d56Sopenharmony_ci           'encode_q',
517db96d56Sopenharmony_ci           'decode_b',
527db96d56Sopenharmony_ci           'encode_b',
537db96d56Sopenharmony_ci           'len_q',
547db96d56Sopenharmony_ci           'len_b',
557db96d56Sopenharmony_ci           'decode',
567db96d56Sopenharmony_ci           'encode',
577db96d56Sopenharmony_ci           ]
587db96d56Sopenharmony_ci
597db96d56Sopenharmony_ci#
607db96d56Sopenharmony_ci# Quoted Printable
617db96d56Sopenharmony_ci#
627db96d56Sopenharmony_ci
# Regex-based decoder: each "=XX" escape (two hex digits, either case) is
# replaced by the single byte whose value is XX.  Anything that does not
# match that exact shape is left in place.
_q_byte_subber = functools.partial(
    re.compile(br'=([a-fA-F0-9]{2})').sub,
    lambda match: bytes([int(match.group(1), 16)]),
)


def decode_q(encoded):
    """Decode the Q-encoded bytes *encoded*.

    Underscores are turned into spaces first, then every "=XX" hex escape is
    expanded.  Returns a (decoded_bytes, defects) tuple; this decoder never
    reports a defect, so the list is always empty.
    """
    with_spaces = encoded.replace(b'_', b' ')
    return _q_byte_subber(with_spaces), []
707db96d56Sopenharmony_ci
717db96d56Sopenharmony_ci
# Lazily filled table mapping a byte value (an int) to its Q-encoded text.
class _QByteMap(dict):
    """Dict that computes, stores and returns an entry on first lookup."""

    # Byte values that are emitted as themselves rather than as "=XX".
    safe = b'-!*+/' + (ascii_letters + digits).encode('ascii')

    def __missing__(self, key):
        # Safe bytes map to their own character; every other byte becomes
        # '=' followed by two upper-case hex digits.  The result is cached
        # in the dict so __missing__ runs at most once per byte value.
        encoded = chr(key) if key in self.safe else "={:02X}".format(key)
        self[key] = encoded
        return encoded


# In headers spaces are mapped to '_'; seeding the entry up front keeps
# __missing__ from ever computing "=20" for it.
_q_byte_map = _QByteMap({ord(' '): '_'})


def encode_q(bstring):
    """Return the Q encoding of the byte sequence *bstring* as a str."""
    pieces = [_q_byte_map[byte] for byte in bstring]
    return ''.join(pieces)
917db96d56Sopenharmony_ci
def len_q(bstring):
    """Return the total length of the Q-encoded forms of *bstring*'s bytes.

    Each byte is looked up in the module's byte -> text table and the
    lengths of the resulting pieces are added up; no string is built.
    """
    total = 0
    for byte in bstring:
        total += len(_q_byte_map[byte])
    return total
947db96d56Sopenharmony_ci
957db96d56Sopenharmony_ci
967db96d56Sopenharmony_ci#
977db96d56Sopenharmony_ci# Base64
987db96d56Sopenharmony_ci#
997db96d56Sopenharmony_ci
def decode_b(encoded):
    """Decode base64 bytes, tolerating bad padding and stray characters.

    Returns a (decoded_bytes, defects) tuple.  Three attempts are made, from
    strictest to most lenient; the defects list records which leniency was
    needed.  If nothing works, the input is returned unchanged together with
    an InvalidBase64LengthDefect.
    """
    # Attempt 1: strict decoding (validate=True) after topping the input up
    # to a multiple of four with '='.  This succeeds only if the input
    # contains no invalid characters.
    excess = len(encoded) % 4
    padding = b'=' * (4 - excess) if excess else b''
    try:
        decoded = base64.b64decode(encoded + padding, validate=True)
    except binascii.Error:
        pass
    else:
        return decoded, ([errors.InvalidBase64PaddingDefect()] if excess else [])

    # The padding was already corrected, so the failure is most likely an
    # invalid character.  Non-alphabet characters are ignored as far as
    # padding goes, but we don't know how many there are.
    # Attempt 2: lenient decoding of the input exactly as given.
    try:
        decoded = base64.b64decode(encoded, validate=False)
    except binascii.Error:
        pass
    else:
        return decoded, [errors.InvalidBase64CharactersDefect()]

    # Attempt 3: lenient decoding with as much padding as could possibly be
    # necessary (extra padding is ignored).
    try:
        decoded = base64.b64decode(encoded + b'==', validate=False)
    except binascii.Error:
        # This only happens when the encoded string's length is 1 more than
        # a multiple of 4, which is invalid.
        #
        # bpo-27397: Just return the encoded string since there's no way to
        # decode.
        return encoded, [errors.InvalidBase64LengthDefect()]
    return decoded, [errors.InvalidBase64CharactersDefect(),
                     errors.InvalidBase64PaddingDefect()]
1377db96d56Sopenharmony_ci
def encode_b(bstring):
    """Return the base64 encoding of *bstring* as an ASCII-only str."""
    encoded = base64.b64encode(bstring)
    return str(encoded, 'ascii')
1407db96d56Sopenharmony_ci
def len_b(bstring):
    """Return the length of the base64 encoding of *bstring*.

    Only len(bstring) is consulted; nothing is actually encoded.
    """
    # Every started group of 3 input bytes yields 4 output characters, so
    # round the group count up before multiplying.
    started_groups = (len(bstring) + 2) // 3
    return started_groups * 4
1457db96d56Sopenharmony_ci
1467db96d56Sopenharmony_ci
# Dispatch table: lower-case cte letter -> CTE decoder function.
_cte_decoders = {'q': decode_q, 'b': decode_b}


def decode(ew):
    """Decode encoded word and return (string, charset, lang, defects) tuple.

    An RFC 2047 encoded word (with the optional RFC 2231 language tag) has
    the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' may be omitted but the other parts may not be.

    The syntax of *ew* is not checked.  The word is split on '?' and must
    yield exactly five pieces, otherwise ValueError is raised; a cte letter
    with no registered decoder raises KeyError.

    The encoded_string is first decoded from its Content Transfer Encoding
    and the resulting bytes are then decoded to str using the named charset:

      * If the bytes are not valid in that charset, an UndecodableBytesDefect
        is added and the bytes are decoded again with the 'surrogateescape'
        error handler.
      * If the charset cannot be looked up, the bytes are decoded as ASCII
        with 'surrogateescape' and a CharsetError defect is added -- unless
        the charset is 'unknown-8bit' (compared case-insensitively), which
        is accepted silently.

    The charset and language are returned exactly as written in the encoded
    word.  The language defaults to the empty string when '*lang' is absent.

    """
    _, charset_spec, cte, payload, _ = ew.split('?')
    charset, _, lang = charset_spec.partition('*')

    # Recover the original bytes and do CTE decoding.
    raw = payload.encode('ascii', 'surrogateescape')
    cte_decoder = _cte_decoders[cte.lower()]
    raw, defects = cte_decoder(raw)

    # Turn the CTE decoded bytes into unicode.
    try:
        text = raw.decode(charset)
    except UnicodeDecodeError:
        message = ("Encoded word contains bytes not decodable using "
                   f"{charset!r} charset")
        defects.append(errors.UndecodableBytesDefect(message))
        text = raw.decode(charset, 'surrogateescape')
    except (LookupError, UnicodeEncodeError):
        text = raw.decode('ascii', 'surrogateescape')
        if charset.lower() != 'unknown-8bit':
            message = (f"Unknown charset {charset!r} in encoded word; "
                       "decoded as unknown bytes")
            defects.append(errors.CharsetError(message))
    return text, charset, lang, defects
1927db96d56Sopenharmony_ci
1937db96d56Sopenharmony_ci
# Dispatch table: cte letter -> function turning bytes into CTE-encoded text.
_cte_encoders = {
    'q': encode_q,
    'b': encode_b,
    }

# Dispatch table: cte letter -> function returning the length that the
# matching encoder would produce for a given byte sequence.
_cte_encode_length = {
    'q': len_q,
    'b': len_b,
    }

def encode(string, charset='utf-8', encoding=None, lang=''):
    """Encode string using the CTE encoding that produces the shorter result.

    Produces an RFC 2047 encoded word (with the optional RFC 2231 language
    tag) of the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' is omitted unless the 'lang' parameter is given a value.

    Optional argument charset (defaults to utf-8) specifies the charset to use
    to encode the string to binary before CTE encoding it.  The pseudo-charset
    'unknown-8bit' (matched case-insensitively) is special: the string is
    encoded as ASCII with the 'surrogateescape' error handler, which turns
    surrogate-escaped characters back into the raw bytes they stand for.

    Optional argument 'encoding' is the cte specifier for the encoding that
    should be used ('q' or 'b'); if it is None (the default) the encoding
    which produces the shortest encoded sequence is used, except that 'q' is
    preferred as long as it is fewer than five characters longer than 'b'.

    Optional argument 'lang' (default '') gives the language string to
    specify in the encoded word.

    Raises LookupError for an unknown charset, UnicodeEncodeError if the
    string cannot be encoded in the charset, and KeyError if 'encoding' is
    not a key of the encoder table.

    """
    # Compare case-insensitively so that a charset spelled e.g. 'Unknown-8bit'
    # is handled here instead of being passed to str.encode(), which would
    # fail with LookupError because no such codec exists.
    if charset.lower() == 'unknown-8bit':
        bstring = string.encode('ascii', 'surrogateescape')
    else:
        bstring = string.encode(charset)
    if encoding is None:
        qlen = _cte_encode_length['q'](bstring)
        blen = _cte_encode_length['b'](bstring)
        # Bias toward q.  5 is arbitrary.
        encoding = 'q' if qlen - blen < 5 else 'b'
    encoded = _cte_encoders[encoding](bstring)
    if lang:
        lang = '*' + lang
    return "=?{}{}?{}?{}?=".format(charset, lang, encoding, encoded)
234