# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser', 'BytesFeedParser']

import re

from email import errors
from email._policybase import compat32
from collections import deque
from io import StringIO

NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned/yielded when no complete line is buffered yet and the
# input has not been closed.
NeedMoreData = object()


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Text stream of the last partial line pushed into this object.
        # See issue 22233 for why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; it is called with each line read."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the input as closed."""
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no full line is buffered and the
        object has not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line read."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append already-split lines to the queue of complete lines."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops on '' (real or false EOF); NeedMoreData is passed
        # through to the consumer as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line


class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=None, *, policy=compat32):
        """_factory is called with no arguments to create a new message obj

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            try:
                # Probe whether the factory accepts a policy keyword.
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        self._msgstack = []
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator one step, ignoring its exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop the current message; its parent (or None) becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for sub-messages.

        Yields NeedMoreData whenever the input buffer has no complete line
        available; it is resumed after more data is fed or the input is
        closed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # Drain the rest of the input.  The lines read here are not
                # stored, so the epilogue set below is the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Parse raw header `lines` and set them on the current message.

        Continuation lines are folded into the preceding header.  Malformed
        lines are reported through self.policy.handle_defect() and skipped.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Report it through the policy, like the
                    # other defects in this parser, instead of appending to
                    # .defects directly.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                # Reported through the policy for the same reason as above.
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        # Decode as ASCII; non-ASCII bytes are carried through as surrogate
        # escapes rather than raising.
        super().feed(data.decode('ascii', 'surrogateescape'))