17db96d56Sopenharmony_ci# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 27db96d56Sopenharmony_ci# All rights reserved. 37db96d56Sopenharmony_ci 47db96d56Sopenharmony_ci"""Tokenization help for Python programs. 57db96d56Sopenharmony_ci 67db96d56Sopenharmony_cigenerate_tokens(readline) is a generator that breaks a stream of 77db96d56Sopenharmony_citext into Python tokens. It accepts a readline-like method which is called 87db96d56Sopenharmony_cirepeatedly to get the next line of input (or "" for EOF). It generates 97db96d56Sopenharmony_ci5-tuples with these members: 107db96d56Sopenharmony_ci 117db96d56Sopenharmony_ci the token type (see token.py) 127db96d56Sopenharmony_ci the token (a string) 137db96d56Sopenharmony_ci the starting (row, column) indices of the token (a 2-tuple of ints) 147db96d56Sopenharmony_ci the ending (row, column) indices of the token (a 2-tuple of ints) 157db96d56Sopenharmony_ci the original line (string) 167db96d56Sopenharmony_ci 177db96d56Sopenharmony_ciIt is designed to match the working of the Python tokenizer exactly, except 187db96d56Sopenharmony_cithat it produces COMMENT tokens for comments and gives type OP for all 197db96d56Sopenharmony_cioperators 207db96d56Sopenharmony_ci 217db96d56Sopenharmony_ciOlder entry points 227db96d56Sopenharmony_ci tokenize_loop(readline, tokeneater) 237db96d56Sopenharmony_ci tokenize(readline, tokeneater=printtoken) 247db96d56Sopenharmony_ciare the same, except instead of generating tokens, tokeneater is a callback 257db96d56Sopenharmony_cifunction to which the 5 fields described above are passed as 5 arguments, 267db96d56Sopenharmony_cieach time a new token is found.""" 277db96d56Sopenharmony_ci 287db96d56Sopenharmony_ci__author__ = 'Ka-Ping Yee <ping@lfw.org>' 297db96d56Sopenharmony_ci__credits__ = \ 307db96d56Sopenharmony_ci 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' 317db96d56Sopenharmony_ci 327db96d56Sopenharmony_ciimport string, re 
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
# Public API: every public name of the token module plus this module's
# three entry points.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str


def group(*choices):
    """Return a regex source string matching any one of *choices*."""
    return '(' + '|'.join(choices) + ')'


# NOTE: deliberately shadows the builtin any() inside this module; the name
# is kept because the pattern definitions below (and possibly importers)
# rely on it.
def any(*choices):
    """Return a regex source string matching zero or more of *choices*."""
    return group(*choices) + '*'


def maybe(*choices):
    """Return a regex source string matching zero or one of *choices*."""
    return group(*choices) + '?'


def _combinations(*l):
    """Return the set of one- and two-character strings built from *l*.

    Each element is paired with every element (and with the empty string),
    skipping pairs that are equal when case-folded, so 'r' and 'f' yield
    'r', 'f', 'rf' and 'fr' but never 'rr' or 'rR'.
    """
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )


# Regex source fragments; they are combined into the compiled token
# patterns further down the module.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash-newline continuations, optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

# Integer literals; underscores are accepted as digit separators and the
# hex/octal forms accept an optional trailing l/L.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
# Decimal integers: a non-zero-led digit run (underscores allowed as
# separators) or a lone 0, each with an optional trailing l/L.
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
# Floats with a decimal point ("1.", "1.5", ".5"), optionally with exponent.
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?',
                   r'\.\d+(?:_\d+)*') + maybe(Exponent)
# Floats without a decimal point but with a mandatory exponent ("1e5").
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are ordered so the more specific forms are tried first.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string-literal prefix: one of u/r/b/f, or a two-letter
# combination with r (any case).
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening delimiter of a triple-quoted string, including any prefix.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Every accepted string prefix, in every accepted case combination.
_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

# Maps an opening delimiter to the compiled pattern matching the rest of the
# string.  Bare prefixes map to None so that generate_tokens() can probe
# successive characters of a prefixed token with `or` until it hits the quote.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)

# Column width of a tab stop when measuring indentation.
tabsize = 8


class TokenError(Exception):
    """Raised by generate_tokens() when EOF is hit inside a multi-line
    string or statement."""


class StopTokenizing(Exception):
    """May be raised by a tokeneater callback to make tokenize() stop
    early; tokenize() swallows it."""


def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    """Default tokeneater: print one token as 'srow,scol-erow,ecol: TYPE repr'.

    The third and fourth arguments are the (row, col) start and end tuples;
    `line` is accepted for signature compatibility and not printed.
    """
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))


def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().

    A StopTokenizing exception raised by tokeneater ends tokenization
    quietly.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater(*token) for every token produced from readline."""
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


class Untokenizer:
    """Rebuilds source text from a token stream (see untokenize())."""

    def __init__(self):
        self.tokens = []      # output fragments, joined at the end
        self.prev_row = 1     # row where the next token is expected
        self.prev_col = 0     # column where the previous token ended

    def add_whitespace(self, start):
        """Pad with spaces from the previous token's end column to `start`."""
        row, col = start
        # NOTE(review): this only rejects tokens that start *after* the
        # expected row; in a complete stream from generate_tokens() the rows
        # are equal.  Kept as-is to preserve behaviour.
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            # A negative offset multiplies to '' and adds nothing.
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return source text for `iterable` of 5-tuples (or 2-tuples).

        Full 5-tuples are positioned exactly; as soon as a 2-tuple is seen
        the rest of the stream is handed to compat().
        """
        # Use one shared iterator so compat() continues exactly where this
        # loop stopped, whether `iterable` is a list or a generator.
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for `token` followed by the remaining `iterable`.

        Only the type and string of each token are used, so spacing is
        approximate: names and numbers get a trailing space and indentation
        is replayed from INDENT/DEDENT tokens.  `iterable` must yield only
        the tokens that come *after* `token`.
        """
        from itertools import chain

        startline = False
        indents = []
        toks_append = self.tokens.append
        # `token` was already consumed by the caller; chain it back in so
        # the first token is emitted rather than dropped.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


# Matches a PEP 263 coding cookie in a comment line; group 1 is the name.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches a (bytes) line that is blank or holds only a comment.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    a callable returning successive lines of the file as bytes.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        # A readline that signals EOF with StopIteration is treated like one
        # returning an empty bytes object.
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        # Return the normalised encoding named by a coding cookie on `line`,
        # or None when the line has no cookie (or is not ASCII).
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    # A cookie on the second line only counts when the first line is blank
    # or a comment.
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.

    Raises TokenError on EOF inside a multi-line string or statement, and
    IndentationError on a dedent that matches no enclosing level.
    """
    lnum = parenlev = continued = 0
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing.  A NAME
    # token 'async' is held back in `stashed` until the next token shows
    # whether it introduces 'async def' / 'async for' (emit as ASYNC) or is
    # an ordinary name (emit unchanged, before that next token).
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, line_len = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string continued without a trailing backslash.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < line_len:              # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == line_len: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Blank line: only '\r' or '\n' can reach this branch.
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                # Leaving the body of the enclosing 'async def'.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < line_len:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in string.digits or \
                   (initial == '.' and token != '.'):      # ordinary number
                    # Flush a pending 'async' NAME first so tokens are
                    # emitted in source order.
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if stashed:
                        # Flush before the string, including the multi-line
                        # case where the STRING is yielded on a later line.
                        yield stashed
                        stashed = None
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if stashed:
                        # Same ordering guarantee as for triple-quoted
                        # strings above.
                        yield stashed
                        stashed = None
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix characters map to None in endprogs, so this
                        # picks the pattern for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            # Inside 'async def' both are real keywords.
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-tag the held-back 'async' as ASYNC.
                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # No pattern matched: emit the single character as an error
                # token (after any pending 'async') and move on.
                if stashed:
                    yield stashed
                    stashed = None
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Close the file deterministically once tokenizing is done.
        with open(sys.argv[1]) as source:
            tokenize(source.readline)
    else:
        tokenize(sys.stdin.readline)