17db96d56Sopenharmony_ci# Copyright (C) 2002-2007 Python Software Foundation 27db96d56Sopenharmony_ci# Author: Ben Gertzfield, Barry Warsaw 37db96d56Sopenharmony_ci# Contact: email-sig@python.org 47db96d56Sopenharmony_ci 57db96d56Sopenharmony_ci"""Header encoding and decoding functionality.""" 67db96d56Sopenharmony_ci 77db96d56Sopenharmony_ci__all__ = [ 87db96d56Sopenharmony_ci 'Header', 97db96d56Sopenharmony_ci 'decode_header', 107db96d56Sopenharmony_ci 'make_header', 117db96d56Sopenharmony_ci ] 127db96d56Sopenharmony_ci 137db96d56Sopenharmony_ciimport re 147db96d56Sopenharmony_ciimport binascii 157db96d56Sopenharmony_ci 167db96d56Sopenharmony_ciimport email.quoprimime 177db96d56Sopenharmony_ciimport email.base64mime 187db96d56Sopenharmony_ci 197db96d56Sopenharmony_cifrom email.errors import HeaderParseError 207db96d56Sopenharmony_cifrom email import charset as _charset 217db96d56Sopenharmony_ciCharset = _charset.Charset 227db96d56Sopenharmony_ci 237db96d56Sopenharmony_ciNL = '\n' 247db96d56Sopenharmony_ciSPACE = ' ' 257db96d56Sopenharmony_ciBSPACE = b' ' 267db96d56Sopenharmony_ciSPACE8 = ' ' * 8 277db96d56Sopenharmony_ciEMPTYSTRING = '' 287db96d56Sopenharmony_ciMAXLINELEN = 78 297db96d56Sopenharmony_ciFWS = ' \t' 307db96d56Sopenharmony_ci 317db96d56Sopenharmony_ciUSASCII = Charset('us-ascii') 327db96d56Sopenharmony_ciUTF8 = Charset('utf-8') 337db96d56Sopenharmony_ci 347db96d56Sopenharmony_ci# Match encoded-word strings in the form =?charset?q?Hello_World?= 357db96d56Sopenharmony_ciecre = re.compile(r''' 367db96d56Sopenharmony_ci =\? # literal =? 377db96d56Sopenharmony_ci (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset 387db96d56Sopenharmony_ci \? # literal ? 397db96d56Sopenharmony_ci (?P<encoding>[qQbB]) # either a "q" or a "b", case insensitive 407db96d56Sopenharmony_ci \? # literal ? 417db96d56Sopenharmony_ci (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string 427db96d56Sopenharmony_ci \?= # literal ?= 437db96d56Sopenharmony_ci ''', re.VERBOSE | re.MULTILINE) 447db96d56Sopenharmony_ci 457db96d56Sopenharmony_ci# Field name regexp, including trailing colon, but not separating whitespace, 467db96d56Sopenharmony_ci# according to RFC 2822. Character range is from tilde to exclamation mark. 477db96d56Sopenharmony_ci# For use with .match() 487db96d56Sopenharmony_cifcre = re.compile(r'[\041-\176]+:$') 497db96d56Sopenharmony_ci 507db96d56Sopenharmony_ci# Find a header embedded in a putative header value. Used to check for 517db96d56Sopenharmony_ci# header injection attack. 527db96d56Sopenharmony_ci_embedded_header = re.compile(r'\n[^ \t]+:') 537db96d56Sopenharmony_ci 547db96d56Sopenharmony_ci 557db96d56Sopenharmony_ci 567db96d56Sopenharmony_ci# Helpers 577db96d56Sopenharmony_ci_max_append = email.quoprimime._max_append 587db96d56Sopenharmony_ci 597db96d56Sopenharmony_ci 607db96d56Sopenharmony_ci 617db96d56Sopenharmony_cidef decode_header(header): 627db96d56Sopenharmony_ci """Decode a message header value without converting charset. 637db96d56Sopenharmony_ci 647db96d56Sopenharmony_ci Returns a list of (string, charset) pairs containing each of the decoded 657db96d56Sopenharmony_ci parts of the header. Charset is None for non-encoded parts of the header, 667db96d56Sopenharmony_ci otherwise a lower-case string containing the name of the character set 677db96d56Sopenharmony_ci specified in the encoded string. 687db96d56Sopenharmony_ci 697db96d56Sopenharmony_ci header may be a string that may or may not contain RFC2047 encoded words, 707db96d56Sopenharmony_ci or it may be a Header object. 717db96d56Sopenharmony_ci 727db96d56Sopenharmony_ci An email.errors.HeaderParseError may be raised when certain decoding error 737db96d56Sopenharmony_ci occurs (e.g. a base64 decoding exception). 747db96d56Sopenharmony_ci """ 757db96d56Sopenharmony_ci # If it is a Header object, we can just return the encoded chunks. 767db96d56Sopenharmony_ci if hasattr(header, '_chunks'): 777db96d56Sopenharmony_ci return [(_charset._encode(string, str(charset)), str(charset)) 787db96d56Sopenharmony_ci for string, charset in header._chunks] 797db96d56Sopenharmony_ci # If no encoding, just return the header with no charset. 807db96d56Sopenharmony_ci if not ecre.search(header): 817db96d56Sopenharmony_ci return [(header, None)] 827db96d56Sopenharmony_ci # First step is to parse all the encoded parts into triplets of the form 837db96d56Sopenharmony_ci # (encoded_string, encoding, charset). For unencoded strings, the last 847db96d56Sopenharmony_ci # two parts will be None. 857db96d56Sopenharmony_ci words = [] 867db96d56Sopenharmony_ci for line in header.splitlines(): 877db96d56Sopenharmony_ci parts = ecre.split(line) 887db96d56Sopenharmony_ci first = True 897db96d56Sopenharmony_ci while parts: 907db96d56Sopenharmony_ci unencoded = parts.pop(0) 917db96d56Sopenharmony_ci if first: 927db96d56Sopenharmony_ci unencoded = unencoded.lstrip() 937db96d56Sopenharmony_ci first = False 947db96d56Sopenharmony_ci if unencoded: 957db96d56Sopenharmony_ci words.append((unencoded, None, None)) 967db96d56Sopenharmony_ci if parts: 977db96d56Sopenharmony_ci charset = parts.pop(0).lower() 987db96d56Sopenharmony_ci encoding = parts.pop(0).lower() 997db96d56Sopenharmony_ci encoded = parts.pop(0) 1007db96d56Sopenharmony_ci words.append((encoded, encoding, charset)) 1017db96d56Sopenharmony_ci # Now loop over words and remove words that consist of whitespace 1027db96d56Sopenharmony_ci # between two encoded strings. 1037db96d56Sopenharmony_ci droplist = [] 1047db96d56Sopenharmony_ci for n, w in enumerate(words): 1057db96d56Sopenharmony_ci if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace(): 1067db96d56Sopenharmony_ci droplist.append(n-1) 1077db96d56Sopenharmony_ci for d in reversed(droplist): 1087db96d56Sopenharmony_ci del words[d] 1097db96d56Sopenharmony_ci 1107db96d56Sopenharmony_ci # The next step is to decode each encoded word by applying the reverse 1117db96d56Sopenharmony_ci # base64 or quopri transformation. decoded_words is now a list of the 1127db96d56Sopenharmony_ci # form (decoded_word, charset). 1137db96d56Sopenharmony_ci decoded_words = [] 1147db96d56Sopenharmony_ci for encoded_string, encoding, charset in words: 1157db96d56Sopenharmony_ci if encoding is None: 1167db96d56Sopenharmony_ci # This is an unencoded word. 1177db96d56Sopenharmony_ci decoded_words.append((encoded_string, charset)) 1187db96d56Sopenharmony_ci elif encoding == 'q': 1197db96d56Sopenharmony_ci word = email.quoprimime.header_decode(encoded_string) 1207db96d56Sopenharmony_ci decoded_words.append((word, charset)) 1217db96d56Sopenharmony_ci elif encoding == 'b': 1227db96d56Sopenharmony_ci paderr = len(encoded_string) % 4 # Postel's law: add missing padding 1237db96d56Sopenharmony_ci if paderr: 1247db96d56Sopenharmony_ci encoded_string += '==='[:4 - paderr] 1257db96d56Sopenharmony_ci try: 1267db96d56Sopenharmony_ci word = email.base64mime.decode(encoded_string) 1277db96d56Sopenharmony_ci except binascii.Error: 1287db96d56Sopenharmony_ci raise HeaderParseError('Base64 decoding error') 1297db96d56Sopenharmony_ci else: 1307db96d56Sopenharmony_ci decoded_words.append((word, charset)) 1317db96d56Sopenharmony_ci else: 1327db96d56Sopenharmony_ci raise AssertionError('Unexpected encoding: ' + encoding) 1337db96d56Sopenharmony_ci # Now convert all words to bytes and collapse consecutive runs of 1347db96d56Sopenharmony_ci # similarly encoded words. 1357db96d56Sopenharmony_ci collapsed = [] 1367db96d56Sopenharmony_ci last_word = last_charset = None 1377db96d56Sopenharmony_ci for word, charset in decoded_words: 1387db96d56Sopenharmony_ci if isinstance(word, str): 1397db96d56Sopenharmony_ci word = bytes(word, 'raw-unicode-escape') 1407db96d56Sopenharmony_ci if last_word is None: 1417db96d56Sopenharmony_ci last_word = word 1427db96d56Sopenharmony_ci last_charset = charset 1437db96d56Sopenharmony_ci elif charset != last_charset: 1447db96d56Sopenharmony_ci collapsed.append((last_word, last_charset)) 1457db96d56Sopenharmony_ci last_word = word 1467db96d56Sopenharmony_ci last_charset = charset 1477db96d56Sopenharmony_ci elif last_charset is None: 1487db96d56Sopenharmony_ci last_word += BSPACE + word 1497db96d56Sopenharmony_ci else: 1507db96d56Sopenharmony_ci last_word += word 1517db96d56Sopenharmony_ci collapsed.append((last_word, last_charset)) 1527db96d56Sopenharmony_ci return collapsed 1537db96d56Sopenharmony_ci 1547db96d56Sopenharmony_ci 1557db96d56Sopenharmony_ci 1567db96d56Sopenharmony_cidef make_header(decoded_seq, maxlinelen=None, header_name=None, 1577db96d56Sopenharmony_ci continuation_ws=' '): 1587db96d56Sopenharmony_ci """Create a Header from a sequence of pairs as returned by decode_header() 1597db96d56Sopenharmony_ci 1607db96d56Sopenharmony_ci decode_header() takes a header value string and returns a sequence of 1617db96d56Sopenharmony_ci pairs of the format (decoded_string, charset) where charset is the string 1627db96d56Sopenharmony_ci name of the character set. 1637db96d56Sopenharmony_ci 1647db96d56Sopenharmony_ci This function takes one of those sequence of pairs and returns a Header 1657db96d56Sopenharmony_ci instance. Optional maxlinelen, header_name, and continuation_ws are as in 1667db96d56Sopenharmony_ci the Header constructor. 1677db96d56Sopenharmony_ci """ 1687db96d56Sopenharmony_ci h = Header(maxlinelen=maxlinelen, header_name=header_name, 1697db96d56Sopenharmony_ci continuation_ws=continuation_ws) 1707db96d56Sopenharmony_ci for s, charset in decoded_seq: 1717db96d56Sopenharmony_ci # None means us-ascii but we can simply pass it on to h.append() 1727db96d56Sopenharmony_ci if charset is not None and not isinstance(charset, Charset): 1737db96d56Sopenharmony_ci charset = Charset(charset) 1747db96d56Sopenharmony_ci h.append(s, charset) 1757db96d56Sopenharmony_ci return h 1767db96d56Sopenharmony_ci 1777db96d56Sopenharmony_ci 1787db96d56Sopenharmony_ci 1797db96d56Sopenharmony_ciclass Header: 1807db96d56Sopenharmony_ci def __init__(self, s=None, charset=None, 1817db96d56Sopenharmony_ci maxlinelen=None, header_name=None, 1827db96d56Sopenharmony_ci continuation_ws=' ', errors='strict'): 1837db96d56Sopenharmony_ci """Create a MIME-compliant header that can contain many character sets. 1847db96d56Sopenharmony_ci 1857db96d56Sopenharmony_ci Optional s is the initial header value. If None, the initial header 1867db96d56Sopenharmony_ci value is not set. You can later append to the header with .append() 1877db96d56Sopenharmony_ci method calls. s may be a byte string or a Unicode string, but see the 1887db96d56Sopenharmony_ci .append() documentation for semantics. 1897db96d56Sopenharmony_ci 1907db96d56Sopenharmony_ci Optional charset serves two purposes: it has the same meaning as the 1917db96d56Sopenharmony_ci charset argument to the .append() method. It also sets the default 1927db96d56Sopenharmony_ci character set for all subsequent .append() calls that omit the charset 1937db96d56Sopenharmony_ci argument. If charset is not provided in the constructor, the us-ascii 1947db96d56Sopenharmony_ci charset is used both as s's initial charset and as the default for 1957db96d56Sopenharmony_ci subsequent .append() calls. 1967db96d56Sopenharmony_ci 1977db96d56Sopenharmony_ci The maximum line length can be specified explicitly via maxlinelen. For 1987db96d56Sopenharmony_ci splitting the first line to a shorter value (to account for the field 1997db96d56Sopenharmony_ci header which isn't included in s, e.g. `Subject') pass in the name of 2007db96d56Sopenharmony_ci the field in header_name. The default maxlinelen is 78 as recommended 2017db96d56Sopenharmony_ci by RFC 2822. 2027db96d56Sopenharmony_ci 2037db96d56Sopenharmony_ci continuation_ws must be RFC 2822 compliant folding whitespace (usually 2047db96d56Sopenharmony_ci either a space or a hard tab) which will be prepended to continuation 2057db96d56Sopenharmony_ci lines. 2067db96d56Sopenharmony_ci 2077db96d56Sopenharmony_ci errors is passed through to the .append() call. 2087db96d56Sopenharmony_ci """ 2097db96d56Sopenharmony_ci if charset is None: 2107db96d56Sopenharmony_ci charset = USASCII 2117db96d56Sopenharmony_ci elif not isinstance(charset, Charset): 2127db96d56Sopenharmony_ci charset = Charset(charset) 2137db96d56Sopenharmony_ci self._charset = charset 2147db96d56Sopenharmony_ci self._continuation_ws = continuation_ws 2157db96d56Sopenharmony_ci self._chunks = [] 2167db96d56Sopenharmony_ci if s is not None: 2177db96d56Sopenharmony_ci self.append(s, charset, errors) 2187db96d56Sopenharmony_ci if maxlinelen is None: 2197db96d56Sopenharmony_ci maxlinelen = MAXLINELEN 2207db96d56Sopenharmony_ci self._maxlinelen = maxlinelen 2217db96d56Sopenharmony_ci if header_name is None: 2227db96d56Sopenharmony_ci self._headerlen = 0 2237db96d56Sopenharmony_ci else: 2247db96d56Sopenharmony_ci # Take the separating colon and space into account. 2257db96d56Sopenharmony_ci self._headerlen = len(header_name) + 2 2267db96d56Sopenharmony_ci 2277db96d56Sopenharmony_ci def __str__(self): 2287db96d56Sopenharmony_ci """Return the string value of the header.""" 2297db96d56Sopenharmony_ci self._normalize() 2307db96d56Sopenharmony_ci uchunks = [] 2317db96d56Sopenharmony_ci lastcs = None 2327db96d56Sopenharmony_ci lastspace = None 2337db96d56Sopenharmony_ci for string, charset in self._chunks: 2347db96d56Sopenharmony_ci # We must preserve spaces between encoded and non-encoded word 2357db96d56Sopenharmony_ci # boundaries, which means for us we need to add a space when we go 2367db96d56Sopenharmony_ci # from a charset to None/us-ascii, or from None/us-ascii to a 2377db96d56Sopenharmony_ci # charset. Only do this for the second and subsequent chunks. 2387db96d56Sopenharmony_ci # Don't add a space if the None/us-ascii string already has 2397db96d56Sopenharmony_ci # a space (trailing or leading depending on transition) 2407db96d56Sopenharmony_ci nextcs = charset 2417db96d56Sopenharmony_ci if nextcs == _charset.UNKNOWN8BIT: 2427db96d56Sopenharmony_ci original_bytes = string.encode('ascii', 'surrogateescape') 2437db96d56Sopenharmony_ci string = original_bytes.decode('ascii', 'replace') 2447db96d56Sopenharmony_ci if uchunks: 2457db96d56Sopenharmony_ci hasspace = string and self._nonctext(string[0]) 2467db96d56Sopenharmony_ci if lastcs not in (None, 'us-ascii'): 2477db96d56Sopenharmony_ci if nextcs in (None, 'us-ascii') and not hasspace: 2487db96d56Sopenharmony_ci uchunks.append(SPACE) 2497db96d56Sopenharmony_ci nextcs = None 2507db96d56Sopenharmony_ci elif nextcs not in (None, 'us-ascii') and not lastspace: 2517db96d56Sopenharmony_ci uchunks.append(SPACE) 2527db96d56Sopenharmony_ci lastspace = string and self._nonctext(string[-1]) 2537db96d56Sopenharmony_ci lastcs = nextcs 2547db96d56Sopenharmony_ci uchunks.append(string) 2557db96d56Sopenharmony_ci return EMPTYSTRING.join(uchunks) 2567db96d56Sopenharmony_ci 2577db96d56Sopenharmony_ci # Rich comparison operators for equality only. BAW: does it make sense to 2587db96d56Sopenharmony_ci # have or explicitly disable <, <=, >, >= operators? 2597db96d56Sopenharmony_ci def __eq__(self, other): 2607db96d56Sopenharmony_ci # other may be a Header or a string. Both are fine so coerce 2617db96d56Sopenharmony_ci # ourselves to a unicode (of the unencoded header value), swap the 2627db96d56Sopenharmony_ci # args and do another comparison. 2637db96d56Sopenharmony_ci return other == str(self) 2647db96d56Sopenharmony_ci 2657db96d56Sopenharmony_ci def append(self, s, charset=None, errors='strict'): 2667db96d56Sopenharmony_ci """Append a string to the MIME header. 2677db96d56Sopenharmony_ci 2687db96d56Sopenharmony_ci Optional charset, if given, should be a Charset instance or the name 2697db96d56Sopenharmony_ci of a character set (which will be converted to a Charset instance). A 2707db96d56Sopenharmony_ci value of None (the default) means that the charset given in the 2717db96d56Sopenharmony_ci constructor is used. 2727db96d56Sopenharmony_ci 2737db96d56Sopenharmony_ci s may be a byte string or a Unicode string. If it is a byte string 2747db96d56Sopenharmony_ci (i.e. isinstance(s, str) is false), then charset is the encoding of 2757db96d56Sopenharmony_ci that byte string, and a UnicodeError will be raised if the string 2767db96d56Sopenharmony_ci cannot be decoded with that charset. If s is a Unicode string, then 2777db96d56Sopenharmony_ci charset is a hint specifying the character set of the characters in 2787db96d56Sopenharmony_ci the string. In either case, when producing an RFC 2822 compliant 2797db96d56Sopenharmony_ci header using RFC 2047 rules, the string will be encoded using the 2807db96d56Sopenharmony_ci output codec of the charset. If the string cannot be encoded to the 2817db96d56Sopenharmony_ci output codec, a UnicodeError will be raised. 2827db96d56Sopenharmony_ci 2837db96d56Sopenharmony_ci Optional `errors' is passed as the errors argument to the decode 2847db96d56Sopenharmony_ci call if s is a byte string. 2857db96d56Sopenharmony_ci """ 2867db96d56Sopenharmony_ci if charset is None: 2877db96d56Sopenharmony_ci charset = self._charset 2887db96d56Sopenharmony_ci elif not isinstance(charset, Charset): 2897db96d56Sopenharmony_ci charset = Charset(charset) 2907db96d56Sopenharmony_ci if not isinstance(s, str): 2917db96d56Sopenharmony_ci input_charset = charset.input_codec or 'us-ascii' 2927db96d56Sopenharmony_ci if input_charset == _charset.UNKNOWN8BIT: 2937db96d56Sopenharmony_ci s = s.decode('us-ascii', 'surrogateescape') 2947db96d56Sopenharmony_ci else: 2957db96d56Sopenharmony_ci s = s.decode(input_charset, errors) 2967db96d56Sopenharmony_ci # Ensure that the bytes we're storing can be decoded to the output 2977db96d56Sopenharmony_ci # character set, otherwise an early error is raised. 2987db96d56Sopenharmony_ci output_charset = charset.output_codec or 'us-ascii' 2997db96d56Sopenharmony_ci if output_charset != _charset.UNKNOWN8BIT: 3007db96d56Sopenharmony_ci try: 3017db96d56Sopenharmony_ci s.encode(output_charset, errors) 3027db96d56Sopenharmony_ci except UnicodeEncodeError: 3037db96d56Sopenharmony_ci if output_charset!='us-ascii': 3047db96d56Sopenharmony_ci raise 3057db96d56Sopenharmony_ci charset = UTF8 3067db96d56Sopenharmony_ci self._chunks.append((s, charset)) 3077db96d56Sopenharmony_ci 3087db96d56Sopenharmony_ci def _nonctext(self, s): 3097db96d56Sopenharmony_ci """True if string s is not a ctext character of RFC822. 3107db96d56Sopenharmony_ci """ 3117db96d56Sopenharmony_ci return s.isspace() or s in ('(', ')', '\\') 3127db96d56Sopenharmony_ci 3137db96d56Sopenharmony_ci def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'): 3147db96d56Sopenharmony_ci r"""Encode a message header into an RFC-compliant format. 3157db96d56Sopenharmony_ci 3167db96d56Sopenharmony_ci There are many issues involved in converting a given string for use in 3177db96d56Sopenharmony_ci an email header. Only certain character sets are readable in most 3187db96d56Sopenharmony_ci email clients, and as header strings can only contain a subset of 3197db96d56Sopenharmony_ci 7-bit ASCII, care must be taken to properly convert and encode (with 3207db96d56Sopenharmony_ci Base64 or quoted-printable) header strings. In addition, there is a 3217db96d56Sopenharmony_ci 75-character length limit on any given encoded header field, so 3227db96d56Sopenharmony_ci line-wrapping must be performed, even with double-byte character sets. 3237db96d56Sopenharmony_ci 3247db96d56Sopenharmony_ci Optional maxlinelen specifies the maximum length of each generated 3257db96d56Sopenharmony_ci line, exclusive of the linesep string. Individual lines may be longer 3267db96d56Sopenharmony_ci than maxlinelen if a folding point cannot be found. The first line 3277db96d56Sopenharmony_ci will be shorter by the length of the header name plus ": " if a header 3287db96d56Sopenharmony_ci name was specified at Header construction time. The default value for 3297db96d56Sopenharmony_ci maxlinelen is determined at header construction time. 3307db96d56Sopenharmony_ci 3317db96d56Sopenharmony_ci Optional splitchars is a string containing characters which should be 3327db96d56Sopenharmony_ci given extra weight by the splitting algorithm during normal header 3337db96d56Sopenharmony_ci wrapping. This is in very rough support of RFC 2822's `higher level 3347db96d56Sopenharmony_ci syntactic breaks': split points preceded by a splitchar are preferred 3357db96d56Sopenharmony_ci during line splitting, with the characters preferred in the order in 3367db96d56Sopenharmony_ci which they appear in the string. Space and tab may be included in the 3377db96d56Sopenharmony_ci string to indicate whether preference should be given to one over the 3387db96d56Sopenharmony_ci other as a split point when other split chars do not appear in the line 3397db96d56Sopenharmony_ci being split. Splitchars does not affect RFC 2047 encoded lines. 3407db96d56Sopenharmony_ci 3417db96d56Sopenharmony_ci Optional linesep is a string to be used to separate the lines of 3427db96d56Sopenharmony_ci the value. The default value is the most useful for typical 3437db96d56Sopenharmony_ci Python applications, but it can be set to \r\n to produce RFC-compliant 3447db96d56Sopenharmony_ci line separators when needed. 3457db96d56Sopenharmony_ci """ 3467db96d56Sopenharmony_ci self._normalize() 3477db96d56Sopenharmony_ci if maxlinelen is None: 3487db96d56Sopenharmony_ci maxlinelen = self._maxlinelen 3497db96d56Sopenharmony_ci # A maxlinelen of 0 means don't wrap. For all practical purposes, 3507db96d56Sopenharmony_ci # choosing a huge number here accomplishes that and makes the 3517db96d56Sopenharmony_ci # _ValueFormatter algorithm much simpler. 3527db96d56Sopenharmony_ci if maxlinelen == 0: 3537db96d56Sopenharmony_ci maxlinelen = 1000000 3547db96d56Sopenharmony_ci formatter = _ValueFormatter(self._headerlen, maxlinelen, 3557db96d56Sopenharmony_ci self._continuation_ws, splitchars) 3567db96d56Sopenharmony_ci lastcs = None 3577db96d56Sopenharmony_ci hasspace = lastspace = None 3587db96d56Sopenharmony_ci for string, charset in self._chunks: 3597db96d56Sopenharmony_ci if hasspace is not None: 3607db96d56Sopenharmony_ci hasspace = string and self._nonctext(string[0]) 3617db96d56Sopenharmony_ci if lastcs not in (None, 'us-ascii'): 3627db96d56Sopenharmony_ci if not hasspace or charset not in (None, 'us-ascii'): 3637db96d56Sopenharmony_ci formatter.add_transition() 3647db96d56Sopenharmony_ci elif charset not in (None, 'us-ascii') and not lastspace: 3657db96d56Sopenharmony_ci formatter.add_transition() 3667db96d56Sopenharmony_ci lastspace = string and self._nonctext(string[-1]) 3677db96d56Sopenharmony_ci lastcs = charset 3687db96d56Sopenharmony_ci hasspace = False 3697db96d56Sopenharmony_ci lines = string.splitlines() 3707db96d56Sopenharmony_ci if lines: 3717db96d56Sopenharmony_ci formatter.feed('', lines[0], charset) 3727db96d56Sopenharmony_ci else: 3737db96d56Sopenharmony_ci formatter.feed('', '', charset) 3747db96d56Sopenharmony_ci for line in lines[1:]: 3757db96d56Sopenharmony_ci formatter.newline() 3767db96d56Sopenharmony_ci if charset.header_encoding is not None: 3777db96d56Sopenharmony_ci formatter.feed(self._continuation_ws, ' ' + line.lstrip(), 3787db96d56Sopenharmony_ci charset) 3797db96d56Sopenharmony_ci else: 3807db96d56Sopenharmony_ci sline = line.lstrip() 3817db96d56Sopenharmony_ci fws = line[:len(line)-len(sline)] 3827db96d56Sopenharmony_ci formatter.feed(fws, sline, charset) 3837db96d56Sopenharmony_ci if len(lines) > 1: 3847db96d56Sopenharmony_ci formatter.newline() 3857db96d56Sopenharmony_ci if self._chunks: 3867db96d56Sopenharmony_ci formatter.add_transition() 3877db96d56Sopenharmony_ci value = formatter._str(linesep) 3887db96d56Sopenharmony_ci if _embedded_header.search(value): 3897db96d56Sopenharmony_ci raise HeaderParseError("header value appears to contain " 3907db96d56Sopenharmony_ci "an embedded header: {!r}".format(value)) 3917db96d56Sopenharmony_ci return value 3927db96d56Sopenharmony_ci 3937db96d56Sopenharmony_ci def _normalize(self): 3947db96d56Sopenharmony_ci # Step 1: Normalize the chunks so that all runs of identical charsets 3957db96d56Sopenharmony_ci # get collapsed into a single unicode string. 3967db96d56Sopenharmony_ci chunks = [] 3977db96d56Sopenharmony_ci last_charset = None 3987db96d56Sopenharmony_ci last_chunk = [] 3997db96d56Sopenharmony_ci for string, charset in self._chunks: 4007db96d56Sopenharmony_ci if charset == last_charset: 4017db96d56Sopenharmony_ci last_chunk.append(string) 4027db96d56Sopenharmony_ci else: 4037db96d56Sopenharmony_ci if last_charset is not None: 4047db96d56Sopenharmony_ci chunks.append((SPACE.join(last_chunk), last_charset)) 4057db96d56Sopenharmony_ci last_chunk = [string] 4067db96d56Sopenharmony_ci last_charset = charset 4077db96d56Sopenharmony_ci if last_chunk: 4087db96d56Sopenharmony_ci chunks.append((SPACE.join(last_chunk), last_charset)) 4097db96d56Sopenharmony_ci self._chunks = chunks 4107db96d56Sopenharmony_ci 4117db96d56Sopenharmony_ci 4127db96d56Sopenharmony_ci 4137db96d56Sopenharmony_ciclass _ValueFormatter: 4147db96d56Sopenharmony_ci def __init__(self, headerlen, maxlen, continuation_ws, splitchars): 4157db96d56Sopenharmony_ci self._maxlen = maxlen 4167db96d56Sopenharmony_ci self._continuation_ws = continuation_ws 4177db96d56Sopenharmony_ci self._continuation_ws_len = len(continuation_ws) 4187db96d56Sopenharmony_ci self._splitchars = splitchars 4197db96d56Sopenharmony_ci self._lines = [] 4207db96d56Sopenharmony_ci self._current_line = _Accumulator(headerlen) 4217db96d56Sopenharmony_ci 4227db96d56Sopenharmony_ci def _str(self, linesep): 4237db96d56Sopenharmony_ci self.newline() 4247db96d56Sopenharmony_ci return linesep.join(self._lines) 4257db96d56Sopenharmony_ci 4267db96d56Sopenharmony_ci def __str__(self): 4277db96d56Sopenharmony_ci return self._str(NL) 4287db96d56Sopenharmony_ci 4297db96d56Sopenharmony_ci def newline(self): 4307db96d56Sopenharmony_ci end_of_line = self._current_line.pop() 4317db96d56Sopenharmony_ci if end_of_line != (' ', ''): 4327db96d56Sopenharmony_ci self._current_line.push(*end_of_line) 4337db96d56Sopenharmony_ci if len(self._current_line) > 0: 4347db96d56Sopenharmony_ci if self._current_line.is_onlyws() and self._lines: 4357db96d56Sopenharmony_ci self._lines[-1] += str(self._current_line) 4367db96d56Sopenharmony_ci else: 4377db96d56Sopenharmony_ci self._lines.append(str(self._current_line)) 4387db96d56Sopenharmony_ci self._current_line.reset() 4397db96d56Sopenharmony_ci 4407db96d56Sopenharmony_ci def add_transition(self): 4417db96d56Sopenharmony_ci self._current_line.push(' ', '') 4427db96d56Sopenharmony_ci 4437db96d56Sopenharmony_ci def feed(self, fws, string, charset): 4447db96d56Sopenharmony_ci # If the charset has no header encoding (i.e. it is an ASCII encoding) 4457db96d56Sopenharmony_ci # then we must split the header at the "highest level syntactic break" 4467db96d56Sopenharmony_ci # possible. Note that we don't have a lot of smarts about field 4477db96d56Sopenharmony_ci # syntax; we just try to break on semi-colons, then commas, then 4487db96d56Sopenharmony_ci # whitespace. Eventually, this should be pluggable. 4497db96d56Sopenharmony_ci if charset.header_encoding is None: 4507db96d56Sopenharmony_ci self._ascii_split(fws, string, self._splitchars) 4517db96d56Sopenharmony_ci return 4527db96d56Sopenharmony_ci # Otherwise, we're doing either a Base64 or a quoted-printable 4537db96d56Sopenharmony_ci # encoding which means we don't need to split the line on syntactic 4547db96d56Sopenharmony_ci # breaks. We can basically just find enough characters to fit on the 4557db96d56Sopenharmony_ci # current line, minus the RFC 2047 chrome. What makes this trickier 4567db96d56Sopenharmony_ci # though is that we have to split at octet boundaries, not character 4577db96d56Sopenharmony_ci # boundaries but it's only safe to split at character boundaries so at 4587db96d56Sopenharmony_ci # best we can only get close. 4597db96d56Sopenharmony_ci encoded_lines = charset.header_encode_lines(string, self._maxlengths()) 4607db96d56Sopenharmony_ci # The first element extends the current line, but if it's None then 4617db96d56Sopenharmony_ci # nothing more fit on the current line so start a new line. 4627db96d56Sopenharmony_ci try: 4637db96d56Sopenharmony_ci first_line = encoded_lines.pop(0) 4647db96d56Sopenharmony_ci except IndexError: 4657db96d56Sopenharmony_ci # There are no encoded lines, so we're done. 4667db96d56Sopenharmony_ci return 4677db96d56Sopenharmony_ci if first_line is not None: 4687db96d56Sopenharmony_ci self._append_chunk(fws, first_line) 4697db96d56Sopenharmony_ci try: 4707db96d56Sopenharmony_ci last_line = encoded_lines.pop() 4717db96d56Sopenharmony_ci except IndexError: 4727db96d56Sopenharmony_ci # There was only one line. 4737db96d56Sopenharmony_ci return 4747db96d56Sopenharmony_ci self.newline() 4757db96d56Sopenharmony_ci self._current_line.push(self._continuation_ws, last_line) 4767db96d56Sopenharmony_ci # Everything else are full lines in themselves. 4777db96d56Sopenharmony_ci for line in encoded_lines: 4787db96d56Sopenharmony_ci self._lines.append(self._continuation_ws + line) 4797db96d56Sopenharmony_ci 4807db96d56Sopenharmony_ci def _maxlengths(self): 4817db96d56Sopenharmony_ci # The first line's length. 4827db96d56Sopenharmony_ci yield self._maxlen - len(self._current_line) 4837db96d56Sopenharmony_ci while True: 4847db96d56Sopenharmony_ci yield self._maxlen - self._continuation_ws_len 4857db96d56Sopenharmony_ci 4867db96d56Sopenharmony_ci def _ascii_split(self, fws, string, splitchars): 4877db96d56Sopenharmony_ci # The RFC 2822 header folding algorithm is simple in principle but 4887db96d56Sopenharmony_ci # complex in practice. Lines may be folded any place where "folding 4897db96d56Sopenharmony_ci # white space" appears by inserting a linesep character in front of the 4907db96d56Sopenharmony_ci # FWS. The complication is that not all spaces or tabs qualify as FWS, 4917db96d56Sopenharmony_ci # and we are also supposed to prefer to break at "higher level 4927db96d56Sopenharmony_ci # syntactic breaks". We can't do either of these without intimate 4937db96d56Sopenharmony_ci # knowledge of the structure of structured headers, which we don't have 4947db96d56Sopenharmony_ci # here. So the best we can do here is prefer to break at the specified 4957db96d56Sopenharmony_ci # splitchars, and hope that we don't choose any spaces or tabs that 4967db96d56Sopenharmony_ci # aren't legal FWS. (This is at least better than the old algorithm, 4977db96d56Sopenharmony_ci # where we would sometimes *introduce* FWS after a splitchar, or the 4987db96d56Sopenharmony_ci # algorithm before that, where we would turn all white space runs into 4997db96d56Sopenharmony_ci # single spaces or tabs.) 5007db96d56Sopenharmony_ci parts = re.split("(["+FWS+"]+)", fws+string) 5017db96d56Sopenharmony_ci if parts[0]: 5027db96d56Sopenharmony_ci parts[:0] = [''] 5037db96d56Sopenharmony_ci else: 5047db96d56Sopenharmony_ci parts.pop(0) 5057db96d56Sopenharmony_ci for fws, part in zip(*[iter(parts)]*2): 5067db96d56Sopenharmony_ci self._append_chunk(fws, part) 5077db96d56Sopenharmony_ci 5087db96d56Sopenharmony_ci def _append_chunk(self, fws, string): 5097db96d56Sopenharmony_ci self._current_line.push(fws, string) 5107db96d56Sopenharmony_ci if len(self._current_line) > self._maxlen: 5117db96d56Sopenharmony_ci # Find the best split point, working backward from the end. 5127db96d56Sopenharmony_ci # There might be none, on a long first line. 5137db96d56Sopenharmony_ci for ch in self._splitchars: 5147db96d56Sopenharmony_ci for i in range(self._current_line.part_count()-1, 0, -1): 5157db96d56Sopenharmony_ci if ch.isspace(): 5167db96d56Sopenharmony_ci fws = self._current_line[i][0] 5177db96d56Sopenharmony_ci if fws and fws[0]==ch: 5187db96d56Sopenharmony_ci break 5197db96d56Sopenharmony_ci prevpart = self._current_line[i-1][1] 5207db96d56Sopenharmony_ci if prevpart and prevpart[-1]==ch: 5217db96d56Sopenharmony_ci break 5227db96d56Sopenharmony_ci else: 5237db96d56Sopenharmony_ci continue 5247db96d56Sopenharmony_ci break 5257db96d56Sopenharmony_ci else: 5267db96d56Sopenharmony_ci fws, part = self._current_line.pop() 5277db96d56Sopenharmony_ci if self._current_line._initial_size > 0: 5287db96d56Sopenharmony_ci # There will be a header, so leave it on a line by itself. 5297db96d56Sopenharmony_ci self.newline() 5307db96d56Sopenharmony_ci if not fws: 5317db96d56Sopenharmony_ci # We don't use continuation_ws here because the whitespace 5327db96d56Sopenharmony_ci # after a header should always be a space. 5337db96d56Sopenharmony_ci fws = ' ' 5347db96d56Sopenharmony_ci self._current_line.push(fws, part) 5357db96d56Sopenharmony_ci return 5367db96d56Sopenharmony_ci remainder = self._current_line.pop_from(i) 5377db96d56Sopenharmony_ci self._lines.append(str(self._current_line)) 5387db96d56Sopenharmony_ci self._current_line.reset(remainder) 5397db96d56Sopenharmony_ci 5407db96d56Sopenharmony_ci 5417db96d56Sopenharmony_ciclass _Accumulator(list): 5427db96d56Sopenharmony_ci 5437db96d56Sopenharmony_ci def __init__(self, initial_size=0): 5447db96d56Sopenharmony_ci self._initial_size = initial_size 5457db96d56Sopenharmony_ci super().__init__() 5467db96d56Sopenharmony_ci 5477db96d56Sopenharmony_ci def push(self, fws, string): 5487db96d56Sopenharmony_ci self.append((fws, string)) 5497db96d56Sopenharmony_ci 5507db96d56Sopenharmony_ci def pop_from(self, i=0): 5517db96d56Sopenharmony_ci popped = self[i:] 5527db96d56Sopenharmony_ci self[i:] = [] 5537db96d56Sopenharmony_ci return popped 5547db96d56Sopenharmony_ci 5557db96d56Sopenharmony_ci def pop(self): 5567db96d56Sopenharmony_ci if self.part_count()==0: 5577db96d56Sopenharmony_ci return ('', '') 5587db96d56Sopenharmony_ci return super().pop() 5597db96d56Sopenharmony_ci 5607db96d56Sopenharmony_ci def __len__(self): 5617db96d56Sopenharmony_ci return sum((len(fws)+len(part) for fws, part in self), 5627db96d56Sopenharmony_ci self._initial_size) 5637db96d56Sopenharmony_ci 5647db96d56Sopenharmony_ci def __str__(self): 5657db96d56Sopenharmony_ci return EMPTYSTRING.join((EMPTYSTRING.join((fws, part)) 5667db96d56Sopenharmony_ci for fws, part in self)) 5677db96d56Sopenharmony_ci 5687db96d56Sopenharmony_ci def reset(self, startval=None): 5697db96d56Sopenharmony_ci if startval is None: 5707db96d56Sopenharmony_ci startval = [] 5717db96d56Sopenharmony_ci self[:] = startval 5727db96d56Sopenharmony_ci self._initial_size = 0 5737db96d56Sopenharmony_ci 5747db96d56Sopenharmony_ci def is_onlyws(self): 5757db96d56Sopenharmony_ci return self._initial_size==0 and (not self or str(self).isspace()) 5767db96d56Sopenharmony_ci 5777db96d56Sopenharmony_ci def part_count(self): 5787db96d56Sopenharmony_ci return super().__len__() 579