17db96d56Sopenharmony_ci# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
27db96d56Sopenharmony_ci# All rights reserved.
37db96d56Sopenharmony_ci
47db96d56Sopenharmony_ci"""Tokenization help for Python programs.
57db96d56Sopenharmony_ci
67db96d56Sopenharmony_cigenerate_tokens(readline) is a generator that breaks a stream of
77db96d56Sopenharmony_citext into Python tokens.  It accepts a readline-like method which is called
87db96d56Sopenharmony_cirepeatedly to get the next line of input (or "" for EOF).  It generates
97db96d56Sopenharmony_ci5-tuples with these members:
107db96d56Sopenharmony_ci
117db96d56Sopenharmony_ci    the token type (see token.py)
127db96d56Sopenharmony_ci    the token (a string)
137db96d56Sopenharmony_ci    the starting (row, column) indices of the token (a 2-tuple of ints)
147db96d56Sopenharmony_ci    the ending (row, column) indices of the token (a 2-tuple of ints)
157db96d56Sopenharmony_ci    the original line (string)
167db96d56Sopenharmony_ci
177db96d56Sopenharmony_ciIt is designed to match the working of the Python tokenizer exactly, except
187db96d56Sopenharmony_cithat it produces COMMENT tokens for comments and gives type OP for all
197db96d56Sopenharmony_cioperators
207db96d56Sopenharmony_ci
217db96d56Sopenharmony_ciOlder entry points
227db96d56Sopenharmony_ci    tokenize_loop(readline, tokeneater)
237db96d56Sopenharmony_ci    tokenize(readline, tokeneater=printtoken)
247db96d56Sopenharmony_ciare the same, except instead of generating tokens, tokeneater is a callback
257db96d56Sopenharmony_cifunction to which the 5 fields described above are passed as 5 arguments,
267db96d56Sopenharmony_cieach time a new token is found."""
277db96d56Sopenharmony_ci
287db96d56Sopenharmony_ci__author__ = 'Ka-Ping Yee <ping@lfw.org>'
297db96d56Sopenharmony_ci__credits__ = \
307db96d56Sopenharmony_ci    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
317db96d56Sopenharmony_ci
327db96d56Sopenharmony_ciimport string, re
337db96d56Sopenharmony_cifrom codecs import BOM_UTF8, lookup
347db96d56Sopenharmony_cifrom lib2to3.pgen2.token import *
357db96d56Sopenharmony_ci
367db96d56Sopenharmony_cifrom . import token
# Public API: every public name of the token module (re-exported through the
# star import above) plus this module's own entry points.  The token module
# object itself is dropped afterwards so it does not linger in the namespace.
_own_entry_points = ["tokenize", "generate_tokens", "untokenize"]
__all__ = [name for name in dir(token) if not name.startswith('_')]
__all__ += _own_entry_points
del token, _own_entry_points
407db96d56Sopenharmony_ci
# Probe for the built-in name ``bytes``; evaluating the bare name raises
# NameError on interpreters that do not define it.
try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str
def group(*choices):
    """Return a parenthesised regex alternation of the given pattern strings."""
    return '(%s)' % '|'.join(choices)


def any(*choices):
    """Return a regex matching zero or more repetitions of group(*choices).

    Note: inside this module the name shadows the built-in any().
    """
    alternation = group(*choices)
    return alternation + '*'


def maybe(*choices):
    """Return a regex matching at most one occurrence of group(*choices)."""
    alternation = group(*choices)
    return alternation + '?'
def _combinations(*l):
    """Return the set of one- and two-part strings built from the arguments.

    Each argument is emitted on its own and concatenated with every other
    argument, except when the two parts are equal ignoring case (so 'r'
    pairs with 'f' and 'F' but never with 'r' or 'R').
    """
    second_parts = l + ("",)
    combined = set()
    for first in l:
        first_folded = first.casefold()
        for second in second_parts:
            if second.casefold() != first_folded:
                combined.add(first + second)
    return combined
557db96d56Sopenharmony_ci
# Building-block patterns.  All of these are plain regex source strings; they
# are only compiled once the complete token patterns have been assembled.

# Horizontal whitespace (space, form feed, tab); never matches a newline.
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to, but not including, the line terminator.
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash-newline continuations, optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

# Integer literals.  Underscores may separate digit groups; the optional
# [lL] suffix and the 'o'-less octal form (e.g. 0777) accept Python 2 style
# spellings.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Float literals: either contain a decimal point (exponent optional) or are
# digits followed by a mandatory exponent.
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: an integer or float immediately followed by j/J.
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried in order, so the forms that can extend a shorter
# literal (imaginary, then float) are listed before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
727db96d56Sopenharmony_ci
# The four "tail end" patterns match the remainder of a string literal whose
# opening quote(s) have already been consumed, up to and including the
# closing quote(s).  A backslash always consumes the character after it.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one letter from u/r/b/f, or an r combined with
# f/b (either order) or following u; each letter in either case.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening delimiter of a triple-quoted string, including any prefix.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
867db96d56Sopenharmony_ci
# Python's re alternation picks the first alternative that matches at the
# current position (it does not look for the longest one), so be sure to put
# the longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
# Newline, the ':=' operator and single-character punctuation.  The backtick
# and the '<>' operator above are Python 2 syntax that is still tokenized.
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things that are not tokens proper but that the scanner must still notice:
# a backslash continuation, a comment, or the start of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Leading whitespace is matched outside the outer group, so group 1 of a
# PseudoToken match spans exactly the token text.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compile once at import time; single3prog/double3prog are also reused as the
# end-of-string matchers for triple-quoted literals.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
1127db96d56Sopenharmony_ci
# Every accepted string prefix: raw/f-string and raw/bytes mixes (any case,
# either order) plus the u/ur family.
_strprefixes = set().union(
    _combinations('r', 'R', 'f', 'F'),
    _combinations('r', 'R', 'b', 'B'),
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'},
)

# Maps a string opener to the compiled pattern that matches the rest of that
# literal.  Bare prefixes map to None so that a chained ``or`` lookup in the
# tokenizer can skip over prefix characters until it reaches the quote.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog}
endprogs.update((prefix + "'''", single3prog) for prefix in _strprefixes)
endprogs.update((prefix + '"""', double3prog) for prefix in _strprefixes)
endprogs.update(dict.fromkeys(_strprefixes))

# All possible openers of triple-quoted and of single-quoted literals, with
# and without a prefix.
triple_quoted = {
    prefix + quotes
    for quotes in ("'''", '"""')
    for prefix in _strprefixes | {""}
}
single_quoted = {
    prefix + quote
    for quote in ("'", '"')
    for prefix in _strprefixes | {""}
}
1357db96d56Sopenharmony_ci
# Width of a tab stop, used when measuring indentation columns.
tabsize = 8


class TokenError(Exception):
    """Raised by generate_tokens() when input ends inside an unfinished
    multi-line string or statement."""


class StopTokenizing(Exception):
    """Caught and discarded by tokenize(); raising it while tokens are being
    delivered ends tokenization early."""
1417db96d56Sopenharmony_ci
def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    """Print one token as ``srow,scol-erow,ecol:<TAB>TYPE<TAB>repr(token)``.

    xxx_todo_changeme and xxx_todo_changeme1 are the (row, column) start and
    end positions of the token.  ``line`` is accepted so the function can be
    used as a tokeneater callback, but it is not printed.
    """
    start_row, start_col = xxx_todo_changeme
    end_row, end_col = xxx_todo_changeme1
    type_name = tok_name[type]
    shown = repr(token)
    print("%d,%d-%d,%d:\t%s\t%s" %
          (start_row, start_col, end_row, end_col, type_name, shown))
1477db96d56Sopenharmony_ci
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the input obtained from readline, passing each token to
    tokeneater.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: each call returns one line of input as
    a string.

    tokeneater must be a callable as well.  It is invoked once per token
    with five arguments, corresponding to the members of the tuples
    generated by generate_tokens().  It defaults to printtoken.

    A StopTokenizing exception escaping from the loop is swallowed here, so
    raising it is a way to end tokenization early.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
1657db96d56Sopenharmony_ci
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater with the five fields of every token from readline.

    Unlike tokenize(), no exception is caught here.
    """
    for tok_type, tok_string, start, end, source_line in generate_tokens(readline):
        tokeneater(tok_type, tok_string, start, end, source_line)
1707db96d56Sopenharmony_ci
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Works in two modes, chosen per token by its length:

    * full mode (5-tuples): the start/end positions are used to re-insert
      the horizontal whitespace between tokens;
    * compatibility mode (2-tuples): only type and text are known, so
      spacing is approximated.  Once a 2-tuple is seen, the rest of the
      stream is handled in this mode.
    """

    def __init__(self):
        self.tokens = []      # output fragments, joined by untokenize()
        self.prev_row = 1     # row on which the next token is expected
        self.prev_col = 0     # column just past the previous token

    def add_whitespace(self, start):
        """Append the spaces needed to move from the previous token's end
        column to the column of ``start`` (a (row, col) pair).

        Only column gaps are filled; a token starting on a later row than
        expected trips the assertion.
        """
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            # A negative offset multiplies to '', which is still appended.
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume ``iterable`` of tokens and return the reconstructed text.

        Position-carrying 5-tuples are processed here; the first 2-tuple
        hands that token and everything after it over to compat().
        """
        # Iterate over one explicit iterator so that compat() continues
        # exactly where this loop stopped.  Handing over the original
        # iterable would make a list restart from its first element and
        # duplicate the tokens already emitted.
        stream = iter(iterable)
        for t in stream:
            if len(t) == 2:
                self.compat(t, stream)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                # The next token is expected at the start of the next row.
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for position-less tokens to self.tokens.

        ``token`` is the token that triggered compatibility mode and
        ``iterable`` supplies the tokens after it.  Names, numbers and the
        async/await keywords get a trailing space; INDENT/DEDENT tokens are
        tracked so that the current indentation string can be emitted before
        the first token of each logical line.
        """
        from itertools import chain

        remaining = iter(iterable)
        if remaining is iterable:
            # A one-shot iterator: the caller has already consumed ``token``
            # from it, so put it back in front.  Previously the token was
            # silently dropped from the output.
            pending = chain([token], remaining)
        else:
            # A re-iterable container starts over from its first element,
            # which is the behaviour direct callers have always relied on.
            pending = remaining

        startline = False
        indents = []
        toks_append = self.tokens.append
        for tok in pending:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a new line inside an indented block.
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
2267db96d56Sopenharmony_ci
# An encoding declaration: a comment containing "coding:" or "coding="
# followed by the encoding name, which is captured in group 1.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A (bytes) line holding nothing but optional whitespace and/or a comment.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only the first 12 characters take part in the comparison.
    candidate = orig_enc[:12].lower().replace("_", "-")
    known = (
        ("utf-8", ("utf-8",)),
        ("iso-8859-1", ("latin-1", "iso-8859-1", "iso-latin-1")),
    )
    for canonical, spellings in known:
        for spelling in spellings:
            # Either the exact spelling or the spelling plus a "-suffix".
            if candidate == spelling or candidate.startswith(spelling + "-"):
                return canonical
    return orig_enc

def detect_encoding(readline):
    """
    Work out which encoding should be used to decode a Python source file.

    readline is a callable returning the next line of the file as bytes; it
    may signal the end of input either by returning an empty bytes object or
    by raising StopIteration.  It is called at most twice.

    Returns a 2-tuple: the encoding name (a string) and the list of lines
    (still bytes) that were read while detecting it.

    The encoding comes from a utf-8 bom and/or an encoding cookie as
    specified in pep-0263.  A cookie on the second line is only looked for
    when the first line is blank or a comment.  If a bom is present the
    result is 'utf-8-sig' (the bom itself is stripped from the returned
    first line), and a cookie naming anything other than utf-8 raises
    SyntaxError.  A cookie naming an unknown charset also raises
    SyntaxError.

    If no encoding is specified, 'utf-8' is returned.
    """
    def next_line():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def declared_encoding(raw_line, has_bom):
        # A line that is not pure ASCII cannot hold a usable cookie.
        try:
            text = raw_line.decode('ascii')
        except UnicodeDecodeError:
            return None
        found = cookie_re.match(text)
        if found is None:
            return None
        name = _get_normal_name(found.group(1))
        try:
            codec = lookup(name)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + name)

        if not has_bom:
            return name
        if codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return name + '-sig'

    first = next_line()
    has_bom = first.startswith(BOM_UTF8)
    if has_bom:
        first = first[3:]
    fallback = 'utf-8-sig' if has_bom else 'utf-8'
    if not first:
        return fallback, []

    consumed = [first]
    declared = declared_encoding(first, has_bom)
    if declared:
        return declared, consumed
    if not blank_re.match(first):
        # Real code on line one: a cookie further down would not count.
        return fallback, consumed

    second = next_line()
    if not second:
        return fallback, consumed

    consumed.append(second)
    declared = declared_encoding(second, has_bom)
    return (declared or fallback), consumed
3137db96d56Sopenharmony_ci
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Every element produced by the iterable must be a token sequence with
    at least two members: the token number and the token value.  When only
    these two members are supplied, positions are unknown and the spacing
    of the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
3347db96d56Sopenharmony_ci
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a line dedents to a
    column that matches no enclosing indentation level.
    """
    # lnum: 1-based number of the current physical line.
    # parenlev: nesting depth of open brackets.
    # continued: set when the previous line ended in a backslash.
    lnum = parenlev = continued = 0
    # contstr: text collected so far of a string literal spanning lines.
    # needcont: set for single-quoted strings, whose lines must each end in
    # a backslash to be continued.
    contstr, needcont = '', 0
    # contline: the physical lines collected so far for that string.
    contline = None
    # Stack of indentation columns currently in effect.
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds a NAME token for 'async' until the following token shows
    # whether it introduces 'async def' / 'async for' (emitted as ASYNC) or
    # is an ordinary name.
    stashed = None
    # True from an 'async def' until the block is dedented out of; while set,
    # 'async' and 'await' are emitted as ASYNC/AWAIT instead of NAME.
    async_def = False
    async_def_indent = 0
    # Set once the NEWLINE ending the 'async def' header has been seen.
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # Treated the same as readline() returning '' at EOF.
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; emit it in one piece and
                # fall through to scan the remainder of the line.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was neither closed nor continued
                # with a backslash: report everything collected as an error.
                # NOTE(review): needcont is not reset on this path, so it
                # appears to stay set for a later multi-line triple-quoted
                # string (which does not assign it) -- confirm.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string; keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            # A final line of only whitespace (no newline) ends the input.
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    # Comment-only line: COMMENT token, then the line ending
                    # as NL; indentation is not examined.
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # NOTE(review): line[pos] cannot be '#' in this branch,
                    # so the index is always False and NL is selected.
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the 'async def'
                # ends the async context.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            # Also ends the async context when a new statement starts at the
            # def's own level after the header line, without any dedent
            # having been processed above.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in string.digits or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break is only NL; otherwise it
                    # ends the logical line with NEWLINE.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # Only the opener matched so far; look for the closing
                    # triple quote on the rest of this line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix characters map to None in endprogs, so this
                        # chain skips them until it reaches the quote.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            # Inside an 'async def' these are keywords.
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        # Hold it back until the next token is known.
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the stashed 'async' as an ASYNC token.
                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Everything else is an operator; track bracket depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: emit the single character as an
                # error token and resume scanning after it.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
5607db96d56Sopenharmony_ci
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named on the command line, or of
    # standard input when no file name is given.
    import sys
    if len(sys.argv) > 1:
        # Use a context manager so the file is closed once tokenizing has
        # finished (or failed) instead of being left open.
        with open(sys.argv[1]) as source:
            tokenize(source.readline)
    else:
        tokenize(sys.stdin.readline)
565