17db96d56Sopenharmony_ci# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 27db96d56Sopenharmony_ci# Licensed to PSF under a Contributor Agreement. 37db96d56Sopenharmony_ci 47db96d56Sopenharmony_ci"""Convert graminit.[ch] spit out by pgen to Python code. 57db96d56Sopenharmony_ci 67db96d56Sopenharmony_ciPgen is the Python parser generator. It is useful to quickly create a 77db96d56Sopenharmony_ciparser from a grammar file in Python's grammar notation. But I don't 87db96d56Sopenharmony_ciwant my parsers to be written in C (yet), so I'm translating the 97db96d56Sopenharmony_ciparsing tables to Python data structures and writing a Python parse 107db96d56Sopenharmony_ciengine. 117db96d56Sopenharmony_ci 127db96d56Sopenharmony_ciNote that the token numbers are constants determined by the standard 137db96d56Sopenharmony_ciPython tokenizer. The standard token module defines these numbers and 147db96d56Sopenharmony_citheir names (the names are not used much). The token numbers are 157db96d56Sopenharmony_cihardcoded into the Python tokenizer and into pgen. A Python 167db96d56Sopenharmony_ciimplementation of the Python tokenizer is also available, in the 177db96d56Sopenharmony_cistandard tokenize module. 187db96d56Sopenharmony_ci 197db96d56Sopenharmony_ciOn the other hand, symbol numbers (representing the grammar's 207db96d56Sopenharmony_cinon-terminals) are assigned by pgen based on the actual grammar 217db96d56Sopenharmony_ciinput. 227db96d56Sopenharmony_ci 237db96d56Sopenharmony_ciNote: this module is pretty much obsolete; the pgen module generates 247db96d56Sopenharmony_ciequivalent grammar tables directly from the Grammar.txt input file 257db96d56Sopenharmony_ciwithout having to invoke the Python pgen C program. 267db96d56Sopenharmony_ci 277db96d56Sopenharmony_ci""" 287db96d56Sopenharmony_ci 297db96d56Sopenharmony_ci# Python imports 307db96d56Sopenharmony_ciimport re 317db96d56Sopenharmony_ci 327db96d56Sopenharmony_ci# Local imports 337db96d56Sopenharmony_cifrom pgen2 import grammar, token 347db96d56Sopenharmony_ci 357db96d56Sopenharmony_ci 367db96d56Sopenharmony_ciclass Converter(grammar.Grammar): 377db96d56Sopenharmony_ci """Grammar subclass that reads classic pgen output files. 387db96d56Sopenharmony_ci 397db96d56Sopenharmony_ci The run() method reads the tables as produced by the pgen parser 407db96d56Sopenharmony_ci generator, typically contained in two C files, graminit.h and 417db96d56Sopenharmony_ci graminit.c. The other methods are for internal use only. 427db96d56Sopenharmony_ci 437db96d56Sopenharmony_ci See the base class for more documentation. 447db96d56Sopenharmony_ci 457db96d56Sopenharmony_ci """ 467db96d56Sopenharmony_ci 477db96d56Sopenharmony_ci def run(self, graminit_h, graminit_c): 487db96d56Sopenharmony_ci """Load the grammar tables from the text files written by pgen.""" 497db96d56Sopenharmony_ci self.parse_graminit_h(graminit_h) 507db96d56Sopenharmony_ci self.parse_graminit_c(graminit_c) 517db96d56Sopenharmony_ci self.finish_off() 527db96d56Sopenharmony_ci 537db96d56Sopenharmony_ci def parse_graminit_h(self, filename): 547db96d56Sopenharmony_ci """Parse the .h file written by pgen. (Internal) 557db96d56Sopenharmony_ci 567db96d56Sopenharmony_ci This file is a sequence of #define statements defining the 577db96d56Sopenharmony_ci nonterminals of the grammar as numbers. We build two tables 587db96d56Sopenharmony_ci mapping the numbers to names and back. 597db96d56Sopenharmony_ci 607db96d56Sopenharmony_ci """ 617db96d56Sopenharmony_ci try: 627db96d56Sopenharmony_ci f = open(filename) 637db96d56Sopenharmony_ci except OSError as err: 647db96d56Sopenharmony_ci print("Can't open %s: %s" % (filename, err)) 657db96d56Sopenharmony_ci return False 667db96d56Sopenharmony_ci self.symbol2number = {} 677db96d56Sopenharmony_ci self.number2symbol = {} 687db96d56Sopenharmony_ci lineno = 0 697db96d56Sopenharmony_ci for line in f: 707db96d56Sopenharmony_ci lineno += 1 717db96d56Sopenharmony_ci mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) 727db96d56Sopenharmony_ci if not mo and line.strip(): 737db96d56Sopenharmony_ci print("%s(%s): can't parse %s" % (filename, lineno, 747db96d56Sopenharmony_ci line.strip())) 757db96d56Sopenharmony_ci else: 767db96d56Sopenharmony_ci symbol, number = mo.groups() 777db96d56Sopenharmony_ci number = int(number) 787db96d56Sopenharmony_ci assert symbol not in self.symbol2number 797db96d56Sopenharmony_ci assert number not in self.number2symbol 807db96d56Sopenharmony_ci self.symbol2number[symbol] = number 817db96d56Sopenharmony_ci self.number2symbol[number] = symbol 827db96d56Sopenharmony_ci return True 837db96d56Sopenharmony_ci 847db96d56Sopenharmony_ci def parse_graminit_c(self, filename): 857db96d56Sopenharmony_ci """Parse the .c file written by pgen. (Internal) 867db96d56Sopenharmony_ci 877db96d56Sopenharmony_ci The file looks as follows. The first two lines are always this: 887db96d56Sopenharmony_ci 897db96d56Sopenharmony_ci #include "pgenheaders.h" 907db96d56Sopenharmony_ci #include "grammar.h" 917db96d56Sopenharmony_ci 927db96d56Sopenharmony_ci After that come four blocks: 937db96d56Sopenharmony_ci 947db96d56Sopenharmony_ci 1) one or more state definitions 957db96d56Sopenharmony_ci 2) a table defining dfas 967db96d56Sopenharmony_ci 3) a table defining labels 977db96d56Sopenharmony_ci 4) a struct defining the grammar 987db96d56Sopenharmony_ci 997db96d56Sopenharmony_ci A state definition has the following form: 1007db96d56Sopenharmony_ci - one or more arc arrays, each of the form: 1017db96d56Sopenharmony_ci static arc arcs_<n>_<m>[<k>] = { 1027db96d56Sopenharmony_ci {<i>, <j>}, 1037db96d56Sopenharmony_ci ... 1047db96d56Sopenharmony_ci }; 1057db96d56Sopenharmony_ci - followed by a state array, of the form: 1067db96d56Sopenharmony_ci static state states_<s>[<t>] = { 1077db96d56Sopenharmony_ci {<k>, arcs_<n>_<m>}, 1087db96d56Sopenharmony_ci ... 1097db96d56Sopenharmony_ci }; 1107db96d56Sopenharmony_ci 1117db96d56Sopenharmony_ci """ 1127db96d56Sopenharmony_ci try: 1137db96d56Sopenharmony_ci f = open(filename) 1147db96d56Sopenharmony_ci except OSError as err: 1157db96d56Sopenharmony_ci print("Can't open %s: %s" % (filename, err)) 1167db96d56Sopenharmony_ci return False 1177db96d56Sopenharmony_ci # The code below essentially uses f's iterator-ness! 1187db96d56Sopenharmony_ci lineno = 0 1197db96d56Sopenharmony_ci 1207db96d56Sopenharmony_ci # Expect the two #include lines 1217db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1227db96d56Sopenharmony_ci assert line == '#include "pgenheaders.h"\n', (lineno, line) 1237db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1247db96d56Sopenharmony_ci assert line == '#include "grammar.h"\n', (lineno, line) 1257db96d56Sopenharmony_ci 1267db96d56Sopenharmony_ci # Parse the state definitions 1277db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1287db96d56Sopenharmony_ci allarcs = {} 1297db96d56Sopenharmony_ci states = [] 1307db96d56Sopenharmony_ci while line.startswith("static arc "): 1317db96d56Sopenharmony_ci while line.startswith("static arc "): 1327db96d56Sopenharmony_ci mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", 1337db96d56Sopenharmony_ci line) 1347db96d56Sopenharmony_ci assert mo, (lineno, line) 1357db96d56Sopenharmony_ci n, m, k = list(map(int, mo.groups())) 1367db96d56Sopenharmony_ci arcs = [] 1377db96d56Sopenharmony_ci for _ in range(k): 1387db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1397db96d56Sopenharmony_ci mo = re.match(r"\s+{(\d+), (\d+)},$", line) 1407db96d56Sopenharmony_ci assert mo, (lineno, line) 1417db96d56Sopenharmony_ci i, j = list(map(int, mo.groups())) 1427db96d56Sopenharmony_ci arcs.append((i, j)) 1437db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1447db96d56Sopenharmony_ci assert line == "};\n", (lineno, line) 1457db96d56Sopenharmony_ci allarcs[(n, m)] = arcs 1467db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1477db96d56Sopenharmony_ci mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) 1487db96d56Sopenharmony_ci assert mo, (lineno, line) 1497db96d56Sopenharmony_ci s, t = list(map(int, mo.groups())) 1507db96d56Sopenharmony_ci assert s == len(states), (lineno, line) 1517db96d56Sopenharmony_ci state = [] 1527db96d56Sopenharmony_ci for _ in range(t): 1537db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1547db96d56Sopenharmony_ci mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) 1557db96d56Sopenharmony_ci assert mo, (lineno, line) 1567db96d56Sopenharmony_ci k, n, m = list(map(int, mo.groups())) 1577db96d56Sopenharmony_ci arcs = allarcs[n, m] 1587db96d56Sopenharmony_ci assert k == len(arcs), (lineno, line) 1597db96d56Sopenharmony_ci state.append(arcs) 1607db96d56Sopenharmony_ci states.append(state) 1617db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1627db96d56Sopenharmony_ci assert line == "};\n", (lineno, line) 1637db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1647db96d56Sopenharmony_ci self.states = states 1657db96d56Sopenharmony_ci 1667db96d56Sopenharmony_ci # Parse the dfas 1677db96d56Sopenharmony_ci dfas = {} 1687db96d56Sopenharmony_ci mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) 1697db96d56Sopenharmony_ci assert mo, (lineno, line) 1707db96d56Sopenharmony_ci ndfas = int(mo.group(1)) 1717db96d56Sopenharmony_ci for i in range(ndfas): 1727db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1737db96d56Sopenharmony_ci mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', 1747db96d56Sopenharmony_ci line) 1757db96d56Sopenharmony_ci assert mo, (lineno, line) 1767db96d56Sopenharmony_ci symbol = mo.group(2) 1777db96d56Sopenharmony_ci number, x, y, z = list(map(int, mo.group(1, 3, 4, 5))) 1787db96d56Sopenharmony_ci assert self.symbol2number[symbol] == number, (lineno, line) 1797db96d56Sopenharmony_ci assert self.number2symbol[number] == symbol, (lineno, line) 1807db96d56Sopenharmony_ci assert x == 0, (lineno, line) 1817db96d56Sopenharmony_ci state = states[z] 1827db96d56Sopenharmony_ci assert y == len(state), (lineno, line) 1837db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1847db96d56Sopenharmony_ci mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) 1857db96d56Sopenharmony_ci assert mo, (lineno, line) 1867db96d56Sopenharmony_ci first = {} 1877db96d56Sopenharmony_ci rawbitset = eval(mo.group(1)) 1887db96d56Sopenharmony_ci for i, c in enumerate(rawbitset): 1897db96d56Sopenharmony_ci byte = ord(c) 1907db96d56Sopenharmony_ci for j in range(8): 1917db96d56Sopenharmony_ci if byte & (1<<j): 1927db96d56Sopenharmony_ci first[i*8 + j] = 1 1937db96d56Sopenharmony_ci dfas[number] = (state, first) 1947db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 1957db96d56Sopenharmony_ci assert line == "};\n", (lineno, line) 1967db96d56Sopenharmony_ci self.dfas = dfas 1977db96d56Sopenharmony_ci 1987db96d56Sopenharmony_ci # Parse the labels 1997db96d56Sopenharmony_ci labels = [] 2007db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2017db96d56Sopenharmony_ci mo = re.match(r"static label labels\[(\d+)\] = {$", line) 2027db96d56Sopenharmony_ci assert mo, (lineno, line) 2037db96d56Sopenharmony_ci nlabels = int(mo.group(1)) 2047db96d56Sopenharmony_ci for i in range(nlabels): 2057db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2067db96d56Sopenharmony_ci mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) 2077db96d56Sopenharmony_ci assert mo, (lineno, line) 2087db96d56Sopenharmony_ci x, y = mo.groups() 2097db96d56Sopenharmony_ci x = int(x) 2107db96d56Sopenharmony_ci if y == "0": 2117db96d56Sopenharmony_ci y = None 2127db96d56Sopenharmony_ci else: 2137db96d56Sopenharmony_ci y = eval(y) 2147db96d56Sopenharmony_ci labels.append((x, y)) 2157db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2167db96d56Sopenharmony_ci assert line == "};\n", (lineno, line) 2177db96d56Sopenharmony_ci self.labels = labels 2187db96d56Sopenharmony_ci 2197db96d56Sopenharmony_ci # Parse the grammar struct 2207db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2217db96d56Sopenharmony_ci assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) 2227db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2237db96d56Sopenharmony_ci mo = re.match(r"\s+(\d+),$", line) 2247db96d56Sopenharmony_ci assert mo, (lineno, line) 2257db96d56Sopenharmony_ci ndfas = int(mo.group(1)) 2267db96d56Sopenharmony_ci assert ndfas == len(self.dfas) 2277db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2287db96d56Sopenharmony_ci assert line == "\tdfas,\n", (lineno, line) 2297db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2307db96d56Sopenharmony_ci mo = re.match(r"\s+{(\d+), labels},$", line) 2317db96d56Sopenharmony_ci assert mo, (lineno, line) 2327db96d56Sopenharmony_ci nlabels = int(mo.group(1)) 2337db96d56Sopenharmony_ci assert nlabels == len(self.labels), (lineno, line) 2347db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2357db96d56Sopenharmony_ci mo = re.match(r"\s+(\d+)$", line) 2367db96d56Sopenharmony_ci assert mo, (lineno, line) 2377db96d56Sopenharmony_ci start = int(mo.group(1)) 2387db96d56Sopenharmony_ci assert start in self.number2symbol, (lineno, line) 2397db96d56Sopenharmony_ci self.start = start 2407db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2417db96d56Sopenharmony_ci assert line == "};\n", (lineno, line) 2427db96d56Sopenharmony_ci try: 2437db96d56Sopenharmony_ci lineno, line = lineno+1, next(f) 2447db96d56Sopenharmony_ci except StopIteration: 2457db96d56Sopenharmony_ci pass 2467db96d56Sopenharmony_ci else: 2477db96d56Sopenharmony_ci assert 0, (lineno, line) 2487db96d56Sopenharmony_ci 2497db96d56Sopenharmony_ci def finish_off(self): 2507db96d56Sopenharmony_ci """Create additional useful structures. (Internal).""" 2517db96d56Sopenharmony_ci self.keywords = {} # map from keyword strings to arc labels 2527db96d56Sopenharmony_ci self.tokens = {} # map from numeric token values to arc labels 2537db96d56Sopenharmony_ci for ilabel, (type, value) in enumerate(self.labels): 2547db96d56Sopenharmony_ci if type == token.NAME and value is not None: 2557db96d56Sopenharmony_ci self.keywords[value] = ilabel 2567db96d56Sopenharmony_ci elif value is None: 2577db96d56Sopenharmony_ci self.tokens[type] = ilabel 258