"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
import functools
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

# PEP 263 coding cookie, e.g. "# -*- coding: utf-8 -*-".
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A line that is blank apart from whitespace and/or a comment.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A single token: (type, string, start, end, line)."""

    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        # Map a generic OP token to its exact operator type (e.g. PLUS);
        # every other token type is already exact.
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

# Small helpers for building the big regular expressions below.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf').  The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

@functools.lru_cache
def _compile(expr):
    # Cache compiled patterns; the tokenizer compiles the same few
    # expressions over and over in its inner loop.
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
1407db96d56Sopenharmony_ciendpats = {} 1417db96d56Sopenharmony_cifor _prefix in _all_string_prefixes(): 1427db96d56Sopenharmony_ci endpats[_prefix + "'"] = Single 1437db96d56Sopenharmony_ci endpats[_prefix + '"'] = Double 1447db96d56Sopenharmony_ci endpats[_prefix + "'''"] = Single3 1457db96d56Sopenharmony_ci endpats[_prefix + '"""'] = Double3 1467db96d56Sopenharmony_cidel _prefix 1477db96d56Sopenharmony_ci 1487db96d56Sopenharmony_ci# A set of all of the single and triple quoted string prefixes, 1497db96d56Sopenharmony_ci# including the opening quotes. 1507db96d56Sopenharmony_cisingle_quoted = set() 1517db96d56Sopenharmony_citriple_quoted = set() 1527db96d56Sopenharmony_cifor t in _all_string_prefixes(): 1537db96d56Sopenharmony_ci for u in (t + '"', t + "'"): 1547db96d56Sopenharmony_ci single_quoted.add(u) 1557db96d56Sopenharmony_ci for u in (t + '"""', t + "'''"): 1567db96d56Sopenharmony_ci triple_quoted.add(u) 1577db96d56Sopenharmony_cidel t, u 1587db96d56Sopenharmony_ci 1597db96d56Sopenharmony_citabsize = 8 1607db96d56Sopenharmony_ci 1617db96d56Sopenharmony_ciclass TokenError(Exception): pass 1627db96d56Sopenharmony_ci 1637db96d56Sopenharmony_ciclass StopTokenizing(Exception): pass 1647db96d56Sopenharmony_ci 1657db96d56Sopenharmony_ci 1667db96d56Sopenharmony_ciclass Untokenizer: 1677db96d56Sopenharmony_ci 1687db96d56Sopenharmony_ci def __init__(self): 1697db96d56Sopenharmony_ci self.tokens = [] 1707db96d56Sopenharmony_ci self.prev_row = 1 1717db96d56Sopenharmony_ci self.prev_col = 0 1727db96d56Sopenharmony_ci self.encoding = None 1737db96d56Sopenharmony_ci 1747db96d56Sopenharmony_ci def add_whitespace(self, start): 1757db96d56Sopenharmony_ci row, col = start 1767db96d56Sopenharmony_ci if row < self.prev_row or row == self.prev_row and col < self.prev_col: 1777db96d56Sopenharmony_ci raise ValueError("start ({},{}) precedes previous end ({},{})" 1787db96d56Sopenharmony_ci .format(row, col, self.prev_row, self.prev_col)) 1797db96d56Sopenharmony_ci row_offset = row 
- self.prev_row 1807db96d56Sopenharmony_ci if row_offset: 1817db96d56Sopenharmony_ci self.tokens.append("\\\n" * row_offset) 1827db96d56Sopenharmony_ci self.prev_col = 0 1837db96d56Sopenharmony_ci col_offset = col - self.prev_col 1847db96d56Sopenharmony_ci if col_offset: 1857db96d56Sopenharmony_ci self.tokens.append(" " * col_offset) 1867db96d56Sopenharmony_ci 1877db96d56Sopenharmony_ci def untokenize(self, iterable): 1887db96d56Sopenharmony_ci it = iter(iterable) 1897db96d56Sopenharmony_ci indents = [] 1907db96d56Sopenharmony_ci startline = False 1917db96d56Sopenharmony_ci for t in it: 1927db96d56Sopenharmony_ci if len(t) == 2: 1937db96d56Sopenharmony_ci self.compat(t, it) 1947db96d56Sopenharmony_ci break 1957db96d56Sopenharmony_ci tok_type, token, start, end, line = t 1967db96d56Sopenharmony_ci if tok_type == ENCODING: 1977db96d56Sopenharmony_ci self.encoding = token 1987db96d56Sopenharmony_ci continue 1997db96d56Sopenharmony_ci if tok_type == ENDMARKER: 2007db96d56Sopenharmony_ci break 2017db96d56Sopenharmony_ci if tok_type == INDENT: 2027db96d56Sopenharmony_ci indents.append(token) 2037db96d56Sopenharmony_ci continue 2047db96d56Sopenharmony_ci elif tok_type == DEDENT: 2057db96d56Sopenharmony_ci indents.pop() 2067db96d56Sopenharmony_ci self.prev_row, self.prev_col = end 2077db96d56Sopenharmony_ci continue 2087db96d56Sopenharmony_ci elif tok_type in (NEWLINE, NL): 2097db96d56Sopenharmony_ci startline = True 2107db96d56Sopenharmony_ci elif startline and indents: 2117db96d56Sopenharmony_ci indent = indents[-1] 2127db96d56Sopenharmony_ci if start[1] >= len(indent): 2137db96d56Sopenharmony_ci self.tokens.append(indent) 2147db96d56Sopenharmony_ci self.prev_col = len(indent) 2157db96d56Sopenharmony_ci startline = False 2167db96d56Sopenharmony_ci self.add_whitespace(start) 2177db96d56Sopenharmony_ci self.tokens.append(token) 2187db96d56Sopenharmony_ci self.prev_row, self.prev_col = end 2197db96d56Sopenharmony_ci if tok_type in (NEWLINE, NL): 2207db96d56Sopenharmony_ci 
self.prev_row += 1 2217db96d56Sopenharmony_ci self.prev_col = 0 2227db96d56Sopenharmony_ci return "".join(self.tokens) 2237db96d56Sopenharmony_ci 2247db96d56Sopenharmony_ci def compat(self, token, iterable): 2257db96d56Sopenharmony_ci indents = [] 2267db96d56Sopenharmony_ci toks_append = self.tokens.append 2277db96d56Sopenharmony_ci startline = token[0] in (NEWLINE, NL) 2287db96d56Sopenharmony_ci prevstring = False 2297db96d56Sopenharmony_ci 2307db96d56Sopenharmony_ci for tok in _itertools.chain([token], iterable): 2317db96d56Sopenharmony_ci toknum, tokval = tok[:2] 2327db96d56Sopenharmony_ci if toknum == ENCODING: 2337db96d56Sopenharmony_ci self.encoding = tokval 2347db96d56Sopenharmony_ci continue 2357db96d56Sopenharmony_ci 2367db96d56Sopenharmony_ci if toknum in (NAME, NUMBER): 2377db96d56Sopenharmony_ci tokval += ' ' 2387db96d56Sopenharmony_ci 2397db96d56Sopenharmony_ci # Insert a space between two consecutive strings 2407db96d56Sopenharmony_ci if toknum == STRING: 2417db96d56Sopenharmony_ci if prevstring: 2427db96d56Sopenharmony_ci tokval = ' ' + tokval 2437db96d56Sopenharmony_ci prevstring = True 2447db96d56Sopenharmony_ci else: 2457db96d56Sopenharmony_ci prevstring = False 2467db96d56Sopenharmony_ci 2477db96d56Sopenharmony_ci if toknum == INDENT: 2487db96d56Sopenharmony_ci indents.append(tokval) 2497db96d56Sopenharmony_ci continue 2507db96d56Sopenharmony_ci elif toknum == DEDENT: 2517db96d56Sopenharmony_ci indents.pop() 2527db96d56Sopenharmony_ci continue 2537db96d56Sopenharmony_ci elif toknum in (NEWLINE, NL): 2547db96d56Sopenharmony_ci startline = True 2557db96d56Sopenharmony_ci elif startline and indents: 2567db96d56Sopenharmony_ci toks_append(indents[-1]) 2577db96d56Sopenharmony_ci startline = False 2587db96d56Sopenharmony_ci toks_append(tokval) 2597db96d56Sopenharmony_ci 2607db96d56Sopenharmony_ci 2617db96d56Sopenharmony_cidef untokenize(iterable): 2627db96d56Sopenharmony_ci """Transform tokens back into Python source code. 
2637db96d56Sopenharmony_ci It returns a bytes object, encoded using the ENCODING 2647db96d56Sopenharmony_ci token, which is the first token sequence output by tokenize. 2657db96d56Sopenharmony_ci 2667db96d56Sopenharmony_ci Each element returned by the iterable must be a token sequence 2677db96d56Sopenharmony_ci with at least two elements, a token number and token value. If 2687db96d56Sopenharmony_ci only two tokens are passed, the resulting output is poor. 2697db96d56Sopenharmony_ci 2707db96d56Sopenharmony_ci Round-trip invariant for full input: 2717db96d56Sopenharmony_ci Untokenized source will match input source exactly 2727db96d56Sopenharmony_ci 2737db96d56Sopenharmony_ci Round-trip invariant for limited input: 2747db96d56Sopenharmony_ci # Output bytes will tokenize back to the input 2757db96d56Sopenharmony_ci t1 = [tok[:2] for tok in tokenize(f.readline)] 2767db96d56Sopenharmony_ci newcode = untokenize(t1) 2777db96d56Sopenharmony_ci readline = BytesIO(newcode).readline 2787db96d56Sopenharmony_ci t2 = [tok[:2] for tok in tokenize(readline)] 2797db96d56Sopenharmony_ci assert t1 == t2 2807db96d56Sopenharmony_ci """ 2817db96d56Sopenharmony_ci ut = Untokenizer() 2827db96d56Sopenharmony_ci out = ut.untokenize(iterable) 2837db96d56Sopenharmony_ci if ut.encoding is not None: 2847db96d56Sopenharmony_ci out = out.encode(ut.encoding) 2857db96d56Sopenharmony_ci return out 2867db96d56Sopenharmony_ci 2877db96d56Sopenharmony_ci 2887db96d56Sopenharmony_cidef _get_normal_name(orig_enc): 2897db96d56Sopenharmony_ci """Imitates get_normal_name in tokenizer.c.""" 2907db96d56Sopenharmony_ci # Only care about the first 12 characters. 
2917db96d56Sopenharmony_ci enc = orig_enc[:12].lower().replace("_", "-") 2927db96d56Sopenharmony_ci if enc == "utf-8" or enc.startswith("utf-8-"): 2937db96d56Sopenharmony_ci return "utf-8" 2947db96d56Sopenharmony_ci if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 2957db96d56Sopenharmony_ci enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 2967db96d56Sopenharmony_ci return "iso-8859-1" 2977db96d56Sopenharmony_ci return orig_enc 2987db96d56Sopenharmony_ci 2997db96d56Sopenharmony_cidef detect_encoding(readline): 3007db96d56Sopenharmony_ci """ 3017db96d56Sopenharmony_ci The detect_encoding() function is used to detect the encoding that should 3027db96d56Sopenharmony_ci be used to decode a Python source file. It requires one argument, readline, 3037db96d56Sopenharmony_ci in the same way as the tokenize() generator. 3047db96d56Sopenharmony_ci 3057db96d56Sopenharmony_ci It will call readline a maximum of twice, and return the encoding used 3067db96d56Sopenharmony_ci (as a string) and a list of any lines (left as bytes) it has read in. 3077db96d56Sopenharmony_ci 3087db96d56Sopenharmony_ci It detects the encoding from the presence of a utf-8 bom or an encoding 3097db96d56Sopenharmony_ci cookie as specified in pep-0263. If both a bom and a cookie are present, 3107db96d56Sopenharmony_ci but disagree, a SyntaxError will be raised. If the encoding cookie is an 3117db96d56Sopenharmony_ci invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, 3127db96d56Sopenharmony_ci 'utf-8-sig' is returned. 3137db96d56Sopenharmony_ci 3147db96d56Sopenharmony_ci If no encoding is specified, then the default of 'utf-8' will be returned. 
3157db96d56Sopenharmony_ci """ 3167db96d56Sopenharmony_ci try: 3177db96d56Sopenharmony_ci filename = readline.__self__.name 3187db96d56Sopenharmony_ci except AttributeError: 3197db96d56Sopenharmony_ci filename = None 3207db96d56Sopenharmony_ci bom_found = False 3217db96d56Sopenharmony_ci encoding = None 3227db96d56Sopenharmony_ci default = 'utf-8' 3237db96d56Sopenharmony_ci def read_or_stop(): 3247db96d56Sopenharmony_ci try: 3257db96d56Sopenharmony_ci return readline() 3267db96d56Sopenharmony_ci except StopIteration: 3277db96d56Sopenharmony_ci return b'' 3287db96d56Sopenharmony_ci 3297db96d56Sopenharmony_ci def find_cookie(line): 3307db96d56Sopenharmony_ci try: 3317db96d56Sopenharmony_ci # Decode as UTF-8. Either the line is an encoding declaration, 3327db96d56Sopenharmony_ci # in which case it should be pure ASCII, or it must be UTF-8 3337db96d56Sopenharmony_ci # per default encoding. 3347db96d56Sopenharmony_ci line_string = line.decode('utf-8') 3357db96d56Sopenharmony_ci except UnicodeDecodeError: 3367db96d56Sopenharmony_ci msg = "invalid or missing encoding declaration" 3377db96d56Sopenharmony_ci if filename is not None: 3387db96d56Sopenharmony_ci msg = '{} for {!r}'.format(msg, filename) 3397db96d56Sopenharmony_ci raise SyntaxError(msg) 3407db96d56Sopenharmony_ci 3417db96d56Sopenharmony_ci match = cookie_re.match(line_string) 3427db96d56Sopenharmony_ci if not match: 3437db96d56Sopenharmony_ci return None 3447db96d56Sopenharmony_ci encoding = _get_normal_name(match.group(1)) 3457db96d56Sopenharmony_ci try: 3467db96d56Sopenharmony_ci codec = lookup(encoding) 3477db96d56Sopenharmony_ci except LookupError: 3487db96d56Sopenharmony_ci # This behaviour mimics the Python interpreter 3497db96d56Sopenharmony_ci if filename is None: 3507db96d56Sopenharmony_ci msg = "unknown encoding: " + encoding 3517db96d56Sopenharmony_ci else: 3527db96d56Sopenharmony_ci msg = "unknown encoding for {!r}: {}".format(filename, 3537db96d56Sopenharmony_ci encoding) 3547db96d56Sopenharmony_ci 
raise SyntaxError(msg) 3557db96d56Sopenharmony_ci 3567db96d56Sopenharmony_ci if bom_found: 3577db96d56Sopenharmony_ci if encoding != 'utf-8': 3587db96d56Sopenharmony_ci # This behaviour mimics the Python interpreter 3597db96d56Sopenharmony_ci if filename is None: 3607db96d56Sopenharmony_ci msg = 'encoding problem: utf-8' 3617db96d56Sopenharmony_ci else: 3627db96d56Sopenharmony_ci msg = 'encoding problem for {!r}: utf-8'.format(filename) 3637db96d56Sopenharmony_ci raise SyntaxError(msg) 3647db96d56Sopenharmony_ci encoding += '-sig' 3657db96d56Sopenharmony_ci return encoding 3667db96d56Sopenharmony_ci 3677db96d56Sopenharmony_ci first = read_or_stop() 3687db96d56Sopenharmony_ci if first.startswith(BOM_UTF8): 3697db96d56Sopenharmony_ci bom_found = True 3707db96d56Sopenharmony_ci first = first[3:] 3717db96d56Sopenharmony_ci default = 'utf-8-sig' 3727db96d56Sopenharmony_ci if not first: 3737db96d56Sopenharmony_ci return default, [] 3747db96d56Sopenharmony_ci 3757db96d56Sopenharmony_ci encoding = find_cookie(first) 3767db96d56Sopenharmony_ci if encoding: 3777db96d56Sopenharmony_ci return encoding, [first] 3787db96d56Sopenharmony_ci if not blank_re.match(first): 3797db96d56Sopenharmony_ci return default, [first] 3807db96d56Sopenharmony_ci 3817db96d56Sopenharmony_ci second = read_or_stop() 3827db96d56Sopenharmony_ci if not second: 3837db96d56Sopenharmony_ci return default, [first] 3847db96d56Sopenharmony_ci 3857db96d56Sopenharmony_ci encoding = find_cookie(second) 3867db96d56Sopenharmony_ci if encoding: 3877db96d56Sopenharmony_ci return encoding, [first, second] 3887db96d56Sopenharmony_ci 3897db96d56Sopenharmony_ci return default, [first, second] 3907db96d56Sopenharmony_ci 3917db96d56Sopenharmony_ci 3927db96d56Sopenharmony_cidef open(filename): 3937db96d56Sopenharmony_ci """Open a file in read only mode using the encoding detected by 3947db96d56Sopenharmony_ci detect_encoding(). 
3957db96d56Sopenharmony_ci """ 3967db96d56Sopenharmony_ci buffer = _builtin_open(filename, 'rb') 3977db96d56Sopenharmony_ci try: 3987db96d56Sopenharmony_ci encoding, lines = detect_encoding(buffer.readline) 3997db96d56Sopenharmony_ci buffer.seek(0) 4007db96d56Sopenharmony_ci text = TextIOWrapper(buffer, encoding, line_buffering=True) 4017db96d56Sopenharmony_ci text.mode = 'r' 4027db96d56Sopenharmony_ci return text 4037db96d56Sopenharmony_ci except: 4047db96d56Sopenharmony_ci buffer.close() 4057db96d56Sopenharmony_ci raise 4067db96d56Sopenharmony_ci 4077db96d56Sopenharmony_ci 4087db96d56Sopenharmony_cidef tokenize(readline): 4097db96d56Sopenharmony_ci """ 4107db96d56Sopenharmony_ci The tokenize() generator requires one argument, readline, which 4117db96d56Sopenharmony_ci must be a callable object which provides the same interface as the 4127db96d56Sopenharmony_ci readline() method of built-in file objects. Each call to the function 4137db96d56Sopenharmony_ci should return one line of input as bytes. Alternatively, readline 4147db96d56Sopenharmony_ci can be a callable function terminating with StopIteration: 4157db96d56Sopenharmony_ci readline = open(myfile, 'rb').__next__ # Example of alternate readline 4167db96d56Sopenharmony_ci 4177db96d56Sopenharmony_ci The generator produces 5-tuples with these members: the token type; the 4187db96d56Sopenharmony_ci token string; a 2-tuple (srow, scol) of ints specifying the row and 4197db96d56Sopenharmony_ci column where the token begins in the source; a 2-tuple (erow, ecol) of 4207db96d56Sopenharmony_ci ints specifying the row and column where the token ends in the source; 4217db96d56Sopenharmony_ci and the line on which the token was found. The line passed is the 4227db96d56Sopenharmony_ci physical line. 4237db96d56Sopenharmony_ci 4247db96d56Sopenharmony_ci The first token sequence will always be an ENCODING token 4257db96d56Sopenharmony_ci which tells you which encoding was used to decode the bytes stream. 
4267db96d56Sopenharmony_ci """ 4277db96d56Sopenharmony_ci encoding, consumed = detect_encoding(readline) 4287db96d56Sopenharmony_ci empty = _itertools.repeat(b"") 4297db96d56Sopenharmony_ci rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) 4307db96d56Sopenharmony_ci return _tokenize(rl_gen.__next__, encoding) 4317db96d56Sopenharmony_ci 4327db96d56Sopenharmony_ci 4337db96d56Sopenharmony_cidef _tokenize(readline, encoding): 4347db96d56Sopenharmony_ci lnum = parenlev = continued = 0 4357db96d56Sopenharmony_ci numchars = '0123456789' 4367db96d56Sopenharmony_ci contstr, needcont = '', 0 4377db96d56Sopenharmony_ci contline = None 4387db96d56Sopenharmony_ci indents = [0] 4397db96d56Sopenharmony_ci 4407db96d56Sopenharmony_ci if encoding is not None: 4417db96d56Sopenharmony_ci if encoding == "utf-8-sig": 4427db96d56Sopenharmony_ci # BOM will already have been stripped. 4437db96d56Sopenharmony_ci encoding = "utf-8" 4447db96d56Sopenharmony_ci yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') 4457db96d56Sopenharmony_ci last_line = b'' 4467db96d56Sopenharmony_ci line = b'' 4477db96d56Sopenharmony_ci while True: # loop over lines in stream 4487db96d56Sopenharmony_ci try: 4497db96d56Sopenharmony_ci # We capture the value of the line variable here because 4507db96d56Sopenharmony_ci # readline uses the empty string '' to signal end of input, 4517db96d56Sopenharmony_ci # hence `line` itself will always be overwritten at the end 4527db96d56Sopenharmony_ci # of this loop. 
4537db96d56Sopenharmony_ci last_line = line 4547db96d56Sopenharmony_ci line = readline() 4557db96d56Sopenharmony_ci except StopIteration: 4567db96d56Sopenharmony_ci line = b'' 4577db96d56Sopenharmony_ci 4587db96d56Sopenharmony_ci if encoding is not None: 4597db96d56Sopenharmony_ci line = line.decode(encoding) 4607db96d56Sopenharmony_ci lnum += 1 4617db96d56Sopenharmony_ci pos, max = 0, len(line) 4627db96d56Sopenharmony_ci 4637db96d56Sopenharmony_ci if contstr: # continued string 4647db96d56Sopenharmony_ci if not line: 4657db96d56Sopenharmony_ci raise TokenError("EOF in multi-line string", strstart) 4667db96d56Sopenharmony_ci endmatch = endprog.match(line) 4677db96d56Sopenharmony_ci if endmatch: 4687db96d56Sopenharmony_ci pos = end = endmatch.end(0) 4697db96d56Sopenharmony_ci yield TokenInfo(STRING, contstr + line[:end], 4707db96d56Sopenharmony_ci strstart, (lnum, end), contline + line) 4717db96d56Sopenharmony_ci contstr, needcont = '', 0 4727db96d56Sopenharmony_ci contline = None 4737db96d56Sopenharmony_ci elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 4747db96d56Sopenharmony_ci yield TokenInfo(ERRORTOKEN, contstr + line, 4757db96d56Sopenharmony_ci strstart, (lnum, len(line)), contline) 4767db96d56Sopenharmony_ci contstr = '' 4777db96d56Sopenharmony_ci contline = None 4787db96d56Sopenharmony_ci continue 4797db96d56Sopenharmony_ci else: 4807db96d56Sopenharmony_ci contstr = contstr + line 4817db96d56Sopenharmony_ci contline = contline + line 4827db96d56Sopenharmony_ci continue 4837db96d56Sopenharmony_ci 4847db96d56Sopenharmony_ci elif parenlev == 0 and not continued: # new statement 4857db96d56Sopenharmony_ci if not line: break 4867db96d56Sopenharmony_ci column = 0 4877db96d56Sopenharmony_ci while pos < max: # measure leading whitespace 4887db96d56Sopenharmony_ci if line[pos] == ' ': 4897db96d56Sopenharmony_ci column += 1 4907db96d56Sopenharmony_ci elif line[pos] == '\t': 4917db96d56Sopenharmony_ci column = (column//tabsize + 1)*tabsize 
4927db96d56Sopenharmony_ci elif line[pos] == '\f': 4937db96d56Sopenharmony_ci column = 0 4947db96d56Sopenharmony_ci else: 4957db96d56Sopenharmony_ci break 4967db96d56Sopenharmony_ci pos += 1 4977db96d56Sopenharmony_ci if pos == max: 4987db96d56Sopenharmony_ci break 4997db96d56Sopenharmony_ci 5007db96d56Sopenharmony_ci if line[pos] in '#\r\n': # skip comments or blank lines 5017db96d56Sopenharmony_ci if line[pos] == '#': 5027db96d56Sopenharmony_ci comment_token = line[pos:].rstrip('\r\n') 5037db96d56Sopenharmony_ci yield TokenInfo(COMMENT, comment_token, 5047db96d56Sopenharmony_ci (lnum, pos), (lnum, pos + len(comment_token)), line) 5057db96d56Sopenharmony_ci pos += len(comment_token) 5067db96d56Sopenharmony_ci 5077db96d56Sopenharmony_ci yield TokenInfo(NL, line[pos:], 5087db96d56Sopenharmony_ci (lnum, pos), (lnum, len(line)), line) 5097db96d56Sopenharmony_ci continue 5107db96d56Sopenharmony_ci 5117db96d56Sopenharmony_ci if column > indents[-1]: # count indents or dedents 5127db96d56Sopenharmony_ci indents.append(column) 5137db96d56Sopenharmony_ci yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 5147db96d56Sopenharmony_ci while column < indents[-1]: 5157db96d56Sopenharmony_ci if column not in indents: 5167db96d56Sopenharmony_ci raise IndentationError( 5177db96d56Sopenharmony_ci "unindent does not match any outer indentation level", 5187db96d56Sopenharmony_ci ("<tokenize>", lnum, pos, line)) 5197db96d56Sopenharmony_ci indents = indents[:-1] 5207db96d56Sopenharmony_ci 5217db96d56Sopenharmony_ci yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) 5227db96d56Sopenharmony_ci 5237db96d56Sopenharmony_ci else: # continued statement 5247db96d56Sopenharmony_ci if not line: 5257db96d56Sopenharmony_ci raise TokenError("EOF in multi-line statement", (lnum, 0)) 5267db96d56Sopenharmony_ci continued = 0 5277db96d56Sopenharmony_ci 5287db96d56Sopenharmony_ci while pos < max: 5297db96d56Sopenharmony_ci pseudomatch = _compile(PseudoToken).match(line, pos) 
5307db96d56Sopenharmony_ci if pseudomatch: # scan for tokens 5317db96d56Sopenharmony_ci start, end = pseudomatch.span(1) 5327db96d56Sopenharmony_ci spos, epos, pos = (lnum, start), (lnum, end), end 5337db96d56Sopenharmony_ci if start == end: 5347db96d56Sopenharmony_ci continue 5357db96d56Sopenharmony_ci token, initial = line[start:end], line[start] 5367db96d56Sopenharmony_ci 5377db96d56Sopenharmony_ci if (initial in numchars or # ordinary number 5387db96d56Sopenharmony_ci (initial == '.' and token != '.' and token != '...')): 5397db96d56Sopenharmony_ci yield TokenInfo(NUMBER, token, spos, epos, line) 5407db96d56Sopenharmony_ci elif initial in '\r\n': 5417db96d56Sopenharmony_ci if parenlev > 0: 5427db96d56Sopenharmony_ci yield TokenInfo(NL, token, spos, epos, line) 5437db96d56Sopenharmony_ci else: 5447db96d56Sopenharmony_ci yield TokenInfo(NEWLINE, token, spos, epos, line) 5457db96d56Sopenharmony_ci 5467db96d56Sopenharmony_ci elif initial == '#': 5477db96d56Sopenharmony_ci assert not token.endswith("\n") 5487db96d56Sopenharmony_ci yield TokenInfo(COMMENT, token, spos, epos, line) 5497db96d56Sopenharmony_ci 5507db96d56Sopenharmony_ci elif token in triple_quoted: 5517db96d56Sopenharmony_ci endprog = _compile(endpats[token]) 5527db96d56Sopenharmony_ci endmatch = endprog.match(line, pos) 5537db96d56Sopenharmony_ci if endmatch: # all on one line 5547db96d56Sopenharmony_ci pos = endmatch.end(0) 5557db96d56Sopenharmony_ci token = line[start:pos] 5567db96d56Sopenharmony_ci yield TokenInfo(STRING, token, spos, (lnum, pos), line) 5577db96d56Sopenharmony_ci else: 5587db96d56Sopenharmony_ci strstart = (lnum, start) # multiple lines 5597db96d56Sopenharmony_ci contstr = line[start:] 5607db96d56Sopenharmony_ci contline = line 5617db96d56Sopenharmony_ci break 5627db96d56Sopenharmony_ci 5637db96d56Sopenharmony_ci # Check up to the first 3 chars of the token to see if 5647db96d56Sopenharmony_ci # they're in the single_quoted set. 
If so, they start 5657db96d56Sopenharmony_ci # a string. 5667db96d56Sopenharmony_ci # We're using the first 3, because we're looking for 5677db96d56Sopenharmony_ci # "rb'" (for example) at the start of the token. If 5687db96d56Sopenharmony_ci # we switch to longer prefixes, this needs to be 5697db96d56Sopenharmony_ci # adjusted. 5707db96d56Sopenharmony_ci # Note that initial == token[:1]. 5717db96d56Sopenharmony_ci # Also note that single quote checking must come after 5727db96d56Sopenharmony_ci # triple quote checking (above). 5737db96d56Sopenharmony_ci elif (initial in single_quoted or 5747db96d56Sopenharmony_ci token[:2] in single_quoted or 5757db96d56Sopenharmony_ci token[:3] in single_quoted): 5767db96d56Sopenharmony_ci if token[-1] == '\n': # continued string 5777db96d56Sopenharmony_ci strstart = (lnum, start) 5787db96d56Sopenharmony_ci # Again, using the first 3 chars of the 5797db96d56Sopenharmony_ci # token. This is looking for the matching end 5807db96d56Sopenharmony_ci # regex for the correct type of quote 5817db96d56Sopenharmony_ci # character. So it's really looking for 5827db96d56Sopenharmony_ci # endpats["'"] or endpats['"'], by trying to 5837db96d56Sopenharmony_ci # skip string prefix characters, if any. 
5847db96d56Sopenharmony_ci endprog = _compile(endpats.get(initial) or 5857db96d56Sopenharmony_ci endpats.get(token[1]) or 5867db96d56Sopenharmony_ci endpats.get(token[2])) 5877db96d56Sopenharmony_ci contstr, needcont = line[start:], 1 5887db96d56Sopenharmony_ci contline = line 5897db96d56Sopenharmony_ci break 5907db96d56Sopenharmony_ci else: # ordinary string 5917db96d56Sopenharmony_ci yield TokenInfo(STRING, token, spos, epos, line) 5927db96d56Sopenharmony_ci 5937db96d56Sopenharmony_ci elif initial.isidentifier(): # ordinary name 5947db96d56Sopenharmony_ci yield TokenInfo(NAME, token, spos, epos, line) 5957db96d56Sopenharmony_ci elif initial == '\\': # continued stmt 5967db96d56Sopenharmony_ci continued = 1 5977db96d56Sopenharmony_ci else: 5987db96d56Sopenharmony_ci if initial in '([{': 5997db96d56Sopenharmony_ci parenlev += 1 6007db96d56Sopenharmony_ci elif initial in ')]}': 6017db96d56Sopenharmony_ci parenlev -= 1 6027db96d56Sopenharmony_ci yield TokenInfo(OP, token, spos, epos, line) 6037db96d56Sopenharmony_ci else: 6047db96d56Sopenharmony_ci yield TokenInfo(ERRORTOKEN, line[pos], 6057db96d56Sopenharmony_ci (lnum, pos), (lnum, pos+1), line) 6067db96d56Sopenharmony_ci pos += 1 6077db96d56Sopenharmony_ci 6087db96d56Sopenharmony_ci # Add an implicit NEWLINE if the input doesn't end in one 6097db96d56Sopenharmony_ci if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): 6107db96d56Sopenharmony_ci yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') 6117db96d56Sopenharmony_ci for indent in indents[1:]: # pop remaining indent levels 6127db96d56Sopenharmony_ci yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') 6137db96d56Sopenharmony_ci yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') 6147db96d56Sopenharmony_ci 6157db96d56Sopenharmony_ci 6167db96d56Sopenharmony_cidef generate_tokens(readline): 6177db96d56Sopenharmony_ci """Tokenize a source reading Python code as unicode strings. 
6187db96d56Sopenharmony_ci 6197db96d56Sopenharmony_ci This has the same API as tokenize(), except that it expects the *readline* 6207db96d56Sopenharmony_ci callable to return str objects instead of bytes. 6217db96d56Sopenharmony_ci """ 6227db96d56Sopenharmony_ci return _tokenize(readline, None) 6237db96d56Sopenharmony_ci 6247db96d56Sopenharmony_cidef main(): 6257db96d56Sopenharmony_ci import argparse 6267db96d56Sopenharmony_ci 6277db96d56Sopenharmony_ci # Helper error handling routines 6287db96d56Sopenharmony_ci def perror(message): 6297db96d56Sopenharmony_ci sys.stderr.write(message) 6307db96d56Sopenharmony_ci sys.stderr.write('\n') 6317db96d56Sopenharmony_ci 6327db96d56Sopenharmony_ci def error(message, filename=None, location=None): 6337db96d56Sopenharmony_ci if location: 6347db96d56Sopenharmony_ci args = (filename,) + location + (message,) 6357db96d56Sopenharmony_ci perror("%s:%d:%d: error: %s" % args) 6367db96d56Sopenharmony_ci elif filename: 6377db96d56Sopenharmony_ci perror("%s: error: %s" % (filename, message)) 6387db96d56Sopenharmony_ci else: 6397db96d56Sopenharmony_ci perror("error: %s" % message) 6407db96d56Sopenharmony_ci sys.exit(1) 6417db96d56Sopenharmony_ci 6427db96d56Sopenharmony_ci # Parse the arguments and options 6437db96d56Sopenharmony_ci parser = argparse.ArgumentParser(prog='python -m tokenize') 6447db96d56Sopenharmony_ci parser.add_argument(dest='filename', nargs='?', 6457db96d56Sopenharmony_ci metavar='filename.py', 6467db96d56Sopenharmony_ci help='the file to tokenize; defaults to stdin') 6477db96d56Sopenharmony_ci parser.add_argument('-e', '--exact', dest='exact', action='store_true', 6487db96d56Sopenharmony_ci help='display token names using the exact type') 6497db96d56Sopenharmony_ci args = parser.parse_args() 6507db96d56Sopenharmony_ci 6517db96d56Sopenharmony_ci try: 6527db96d56Sopenharmony_ci # Tokenize the input 6537db96d56Sopenharmony_ci if args.filename: 6547db96d56Sopenharmony_ci filename = args.filename 6557db96d56Sopenharmony_ci 
with _builtin_open(filename, 'rb') as f: 6567db96d56Sopenharmony_ci tokens = list(tokenize(f.readline)) 6577db96d56Sopenharmony_ci else: 6587db96d56Sopenharmony_ci filename = "<stdin>" 6597db96d56Sopenharmony_ci tokens = _tokenize(sys.stdin.readline, None) 6607db96d56Sopenharmony_ci 6617db96d56Sopenharmony_ci # Output the tokenization 6627db96d56Sopenharmony_ci for token in tokens: 6637db96d56Sopenharmony_ci token_type = token.type 6647db96d56Sopenharmony_ci if args.exact: 6657db96d56Sopenharmony_ci token_type = token.exact_type 6667db96d56Sopenharmony_ci token_range = "%d,%d-%d,%d:" % (token.start + token.end) 6677db96d56Sopenharmony_ci print("%-20s%-15s%-15r" % 6687db96d56Sopenharmony_ci (token_range, tok_name[token_type], token.string)) 6697db96d56Sopenharmony_ci except IndentationError as err: 6707db96d56Sopenharmony_ci line, column = err.args[1][1:3] 6717db96d56Sopenharmony_ci error(err.args[0], filename, (line, column)) 6727db96d56Sopenharmony_ci except TokenError as err: 6737db96d56Sopenharmony_ci line, column = err.args[1] 6747db96d56Sopenharmony_ci error(err.args[0], filename, (line, column)) 6757db96d56Sopenharmony_ci except SyntaxError as err: 6767db96d56Sopenharmony_ci error(err, filename) 6777db96d56Sopenharmony_ci except OSError as err: 6787db96d56Sopenharmony_ci error(err) 6797db96d56Sopenharmony_ci except KeyboardInterrupt: 6807db96d56Sopenharmony_ci print("interrupted\n") 6817db96d56Sopenharmony_ci except Exception as err: 6827db96d56Sopenharmony_ci perror("unexpected error: %s" % err) 6837db96d56Sopenharmony_ci raise 6847db96d56Sopenharmony_ci 6857db96d56Sopenharmony_cidef _generate_tokens_from_c_tokenizer(source): 6867db96d56Sopenharmony_ci """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" 6877db96d56Sopenharmony_ci import _tokenize as c_tokenizer 6887db96d56Sopenharmony_ci for info in c_tokenizer.TokenizerIter(source): 6897db96d56Sopenharmony_ci tok, type, lineno, end_lineno, col_off, end_col_off, line 
= info 6907db96d56Sopenharmony_ci yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line) 6917db96d56Sopenharmony_ci 6927db96d56Sopenharmony_ci 6937db96d56Sopenharmony_ciif __name__ == "__main__": 6947db96d56Sopenharmony_ci main() 695