17db96d56Sopenharmony_ci# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
27db96d56Sopenharmony_ci# Licensed to PSF under a Contributor Agreement.
37db96d56Sopenharmony_ci
47db96d56Sopenharmony_ci"""Convert graminit.[ch] spit out by pgen to Python code.
57db96d56Sopenharmony_ci
67db96d56Sopenharmony_ciPgen is the Python parser generator.  It is useful to quickly create a
77db96d56Sopenharmony_ciparser from a grammar file in Python's grammar notation.  But I don't
87db96d56Sopenharmony_ciwant my parsers to be written in C (yet), so I'm translating the
97db96d56Sopenharmony_ciparsing tables to Python data structures and writing a Python parse
107db96d56Sopenharmony_ciengine.
117db96d56Sopenharmony_ci
127db96d56Sopenharmony_ciNote that the token numbers are constants determined by the standard
137db96d56Sopenharmony_ciPython tokenizer.  The standard token module defines these numbers and
147db96d56Sopenharmony_citheir names (the names are not used much).  The token numbers are
157db96d56Sopenharmony_cihardcoded into the Python tokenizer and into pgen.  A Python
167db96d56Sopenharmony_ciimplementation of the Python tokenizer is also available, in the
177db96d56Sopenharmony_cistandard tokenize module.
187db96d56Sopenharmony_ci
197db96d56Sopenharmony_ciOn the other hand, symbol numbers (representing the grammar's
207db96d56Sopenharmony_cinon-terminals) are assigned by pgen based on the actual grammar
217db96d56Sopenharmony_ciinput.
227db96d56Sopenharmony_ci
237db96d56Sopenharmony_ciNote: this module is pretty much obsolete; the pgen module generates
247db96d56Sopenharmony_ciequivalent grammar tables directly from the Grammar.txt input file
257db96d56Sopenharmony_ciwithout having to invoke the Python pgen C program.
267db96d56Sopenharmony_ci
277db96d56Sopenharmony_ci"""
287db96d56Sopenharmony_ci
297db96d56Sopenharmony_ci# Python imports
307db96d56Sopenharmony_ciimport re
317db96d56Sopenharmony_ci
327db96d56Sopenharmony_ci# Local imports
337db96d56Sopenharmony_cifrom pgen2 import grammar, token
347db96d56Sopenharmony_ci
357db96d56Sopenharmony_ci
367db96d56Sopenharmony_ciclass Converter(grammar.Grammar):
377db96d56Sopenharmony_ci    """Grammar subclass that reads classic pgen output files.
387db96d56Sopenharmony_ci
397db96d56Sopenharmony_ci    The run() method reads the tables as produced by the pgen parser
407db96d56Sopenharmony_ci    generator, typically contained in two C files, graminit.h and
417db96d56Sopenharmony_ci    graminit.c.  The other methods are for internal use only.
427db96d56Sopenharmony_ci
437db96d56Sopenharmony_ci    See the base class for more documentation.
447db96d56Sopenharmony_ci
457db96d56Sopenharmony_ci    """
467db96d56Sopenharmony_ci
477db96d56Sopenharmony_ci    def run(self, graminit_h, graminit_c):
487db96d56Sopenharmony_ci        """Load the grammar tables from the text files written by pgen."""
497db96d56Sopenharmony_ci        self.parse_graminit_h(graminit_h)
507db96d56Sopenharmony_ci        self.parse_graminit_c(graminit_c)
517db96d56Sopenharmony_ci        self.finish_off()
527db96d56Sopenharmony_ci
537db96d56Sopenharmony_ci    def parse_graminit_h(self, filename):
547db96d56Sopenharmony_ci        """Parse the .h file written by pgen.  (Internal)
557db96d56Sopenharmony_ci
567db96d56Sopenharmony_ci        This file is a sequence of #define statements defining the
577db96d56Sopenharmony_ci        nonterminals of the grammar as numbers.  We build two tables
587db96d56Sopenharmony_ci        mapping the numbers to names and back.
597db96d56Sopenharmony_ci
607db96d56Sopenharmony_ci        """
617db96d56Sopenharmony_ci        try:
627db96d56Sopenharmony_ci            f = open(filename)
637db96d56Sopenharmony_ci        except OSError as err:
647db96d56Sopenharmony_ci            print("Can't open %s: %s" % (filename, err))
657db96d56Sopenharmony_ci            return False
667db96d56Sopenharmony_ci        self.symbol2number = {}
677db96d56Sopenharmony_ci        self.number2symbol = {}
687db96d56Sopenharmony_ci        lineno = 0
697db96d56Sopenharmony_ci        for line in f:
707db96d56Sopenharmony_ci            lineno += 1
717db96d56Sopenharmony_ci            mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
727db96d56Sopenharmony_ci            if not mo and line.strip():
737db96d56Sopenharmony_ci                print("%s(%s): can't parse %s" % (filename, lineno,
747db96d56Sopenharmony_ci                                                  line.strip()))
757db96d56Sopenharmony_ci            else:
767db96d56Sopenharmony_ci                symbol, number = mo.groups()
777db96d56Sopenharmony_ci                number = int(number)
787db96d56Sopenharmony_ci                assert symbol not in self.symbol2number
797db96d56Sopenharmony_ci                assert number not in self.number2symbol
807db96d56Sopenharmony_ci                self.symbol2number[symbol] = number
817db96d56Sopenharmony_ci                self.number2symbol[number] = symbol
827db96d56Sopenharmony_ci        return True
837db96d56Sopenharmony_ci
847db96d56Sopenharmony_ci    def parse_graminit_c(self, filename):
857db96d56Sopenharmony_ci        """Parse the .c file written by pgen.  (Internal)
867db96d56Sopenharmony_ci
877db96d56Sopenharmony_ci        The file looks as follows.  The first two lines are always this:
887db96d56Sopenharmony_ci
897db96d56Sopenharmony_ci        #include "pgenheaders.h"
907db96d56Sopenharmony_ci        #include "grammar.h"
917db96d56Sopenharmony_ci
927db96d56Sopenharmony_ci        After that come four blocks:
937db96d56Sopenharmony_ci
947db96d56Sopenharmony_ci        1) one or more state definitions
957db96d56Sopenharmony_ci        2) a table defining dfas
967db96d56Sopenharmony_ci        3) a table defining labels
977db96d56Sopenharmony_ci        4) a struct defining the grammar
987db96d56Sopenharmony_ci
997db96d56Sopenharmony_ci        A state definition has the following form:
1007db96d56Sopenharmony_ci        - one or more arc arrays, each of the form:
1017db96d56Sopenharmony_ci          static arc arcs_<n>_<m>[<k>] = {
1027db96d56Sopenharmony_ci                  {<i>, <j>},
1037db96d56Sopenharmony_ci                  ...
1047db96d56Sopenharmony_ci          };
1057db96d56Sopenharmony_ci        - followed by a state array, of the form:
1067db96d56Sopenharmony_ci          static state states_<s>[<t>] = {
1077db96d56Sopenharmony_ci                  {<k>, arcs_<n>_<m>},
1087db96d56Sopenharmony_ci                  ...
1097db96d56Sopenharmony_ci          };
1107db96d56Sopenharmony_ci
1117db96d56Sopenharmony_ci        """
1127db96d56Sopenharmony_ci        try:
1137db96d56Sopenharmony_ci            f = open(filename)
1147db96d56Sopenharmony_ci        except OSError as err:
1157db96d56Sopenharmony_ci            print("Can't open %s: %s" % (filename, err))
1167db96d56Sopenharmony_ci            return False
1177db96d56Sopenharmony_ci        # The code below essentially uses f's iterator-ness!
1187db96d56Sopenharmony_ci        lineno = 0
1197db96d56Sopenharmony_ci
1207db96d56Sopenharmony_ci        # Expect the two #include lines
1217db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
1227db96d56Sopenharmony_ci        assert line == '#include "pgenheaders.h"\n', (lineno, line)
1237db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
1247db96d56Sopenharmony_ci        assert line == '#include "grammar.h"\n', (lineno, line)
1257db96d56Sopenharmony_ci
1267db96d56Sopenharmony_ci        # Parse the state definitions
1277db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
1287db96d56Sopenharmony_ci        allarcs = {}
1297db96d56Sopenharmony_ci        states = []
1307db96d56Sopenharmony_ci        while line.startswith("static arc "):
1317db96d56Sopenharmony_ci            while line.startswith("static arc "):
1327db96d56Sopenharmony_ci                mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
1337db96d56Sopenharmony_ci                              line)
1347db96d56Sopenharmony_ci                assert mo, (lineno, line)
1357db96d56Sopenharmony_ci                n, m, k = list(map(int, mo.groups()))
1367db96d56Sopenharmony_ci                arcs = []
1377db96d56Sopenharmony_ci                for _ in range(k):
1387db96d56Sopenharmony_ci                    lineno, line = lineno+1, next(f)
1397db96d56Sopenharmony_ci                    mo = re.match(r"\s+{(\d+), (\d+)},$", line)
1407db96d56Sopenharmony_ci                    assert mo, (lineno, line)
1417db96d56Sopenharmony_ci                    i, j = list(map(int, mo.groups()))
1427db96d56Sopenharmony_ci                    arcs.append((i, j))
1437db96d56Sopenharmony_ci                lineno, line = lineno+1, next(f)
1447db96d56Sopenharmony_ci                assert line == "};\n", (lineno, line)
1457db96d56Sopenharmony_ci                allarcs[(n, m)] = arcs
1467db96d56Sopenharmony_ci                lineno, line = lineno+1, next(f)
1477db96d56Sopenharmony_ci            mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
1487db96d56Sopenharmony_ci            assert mo, (lineno, line)
1497db96d56Sopenharmony_ci            s, t = list(map(int, mo.groups()))
1507db96d56Sopenharmony_ci            assert s == len(states), (lineno, line)
1517db96d56Sopenharmony_ci            state = []
1527db96d56Sopenharmony_ci            for _ in range(t):
1537db96d56Sopenharmony_ci                lineno, line = lineno+1, next(f)
1547db96d56Sopenharmony_ci                mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
1557db96d56Sopenharmony_ci                assert mo, (lineno, line)
1567db96d56Sopenharmony_ci                k, n, m = list(map(int, mo.groups()))
1577db96d56Sopenharmony_ci                arcs = allarcs[n, m]
1587db96d56Sopenharmony_ci                assert k == len(arcs), (lineno, line)
1597db96d56Sopenharmony_ci                state.append(arcs)
1607db96d56Sopenharmony_ci            states.append(state)
1617db96d56Sopenharmony_ci            lineno, line = lineno+1, next(f)
1627db96d56Sopenharmony_ci            assert line == "};\n", (lineno, line)
1637db96d56Sopenharmony_ci            lineno, line = lineno+1, next(f)
1647db96d56Sopenharmony_ci        self.states = states
1657db96d56Sopenharmony_ci
1667db96d56Sopenharmony_ci        # Parse the dfas
1677db96d56Sopenharmony_ci        dfas = {}
1687db96d56Sopenharmony_ci        mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
1697db96d56Sopenharmony_ci        assert mo, (lineno, line)
1707db96d56Sopenharmony_ci        ndfas = int(mo.group(1))
1717db96d56Sopenharmony_ci        for i in range(ndfas):
1727db96d56Sopenharmony_ci            lineno, line = lineno+1, next(f)
1737db96d56Sopenharmony_ci            mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
1747db96d56Sopenharmony_ci                          line)
1757db96d56Sopenharmony_ci            assert mo, (lineno, line)
1767db96d56Sopenharmony_ci            symbol = mo.group(2)
1777db96d56Sopenharmony_ci            number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
1787db96d56Sopenharmony_ci            assert self.symbol2number[symbol] == number, (lineno, line)
1797db96d56Sopenharmony_ci            assert self.number2symbol[number] == symbol, (lineno, line)
1807db96d56Sopenharmony_ci            assert x == 0, (lineno, line)
1817db96d56Sopenharmony_ci            state = states[z]
1827db96d56Sopenharmony_ci            assert y == len(state), (lineno, line)
1837db96d56Sopenharmony_ci            lineno, line = lineno+1, next(f)
1847db96d56Sopenharmony_ci            mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
1857db96d56Sopenharmony_ci            assert mo, (lineno, line)
1867db96d56Sopenharmony_ci            first = {}
1877db96d56Sopenharmony_ci            rawbitset = eval(mo.group(1))
1887db96d56Sopenharmony_ci            for i, c in enumerate(rawbitset):
1897db96d56Sopenharmony_ci                byte = ord(c)
1907db96d56Sopenharmony_ci                for j in range(8):
1917db96d56Sopenharmony_ci                    if byte & (1<<j):
1927db96d56Sopenharmony_ci                        first[i*8 + j] = 1
1937db96d56Sopenharmony_ci            dfas[number] = (state, first)
1947db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
1957db96d56Sopenharmony_ci        assert line == "};\n", (lineno, line)
1967db96d56Sopenharmony_ci        self.dfas = dfas
1977db96d56Sopenharmony_ci
1987db96d56Sopenharmony_ci        # Parse the labels
1997db96d56Sopenharmony_ci        labels = []
2007db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2017db96d56Sopenharmony_ci        mo = re.match(r"static label labels\[(\d+)\] = {$", line)
2027db96d56Sopenharmony_ci        assert mo, (lineno, line)
2037db96d56Sopenharmony_ci        nlabels = int(mo.group(1))
2047db96d56Sopenharmony_ci        for i in range(nlabels):
2057db96d56Sopenharmony_ci            lineno, line = lineno+1, next(f)
2067db96d56Sopenharmony_ci            mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
2077db96d56Sopenharmony_ci            assert mo, (lineno, line)
2087db96d56Sopenharmony_ci            x, y = mo.groups()
2097db96d56Sopenharmony_ci            x = int(x)
2107db96d56Sopenharmony_ci            if y == "0":
2117db96d56Sopenharmony_ci                y = None
2127db96d56Sopenharmony_ci            else:
2137db96d56Sopenharmony_ci                y = eval(y)
2147db96d56Sopenharmony_ci            labels.append((x, y))
2157db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2167db96d56Sopenharmony_ci        assert line == "};\n", (lineno, line)
2177db96d56Sopenharmony_ci        self.labels = labels
2187db96d56Sopenharmony_ci
2197db96d56Sopenharmony_ci        # Parse the grammar struct
2207db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2217db96d56Sopenharmony_ci        assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
2227db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2237db96d56Sopenharmony_ci        mo = re.match(r"\s+(\d+),$", line)
2247db96d56Sopenharmony_ci        assert mo, (lineno, line)
2257db96d56Sopenharmony_ci        ndfas = int(mo.group(1))
2267db96d56Sopenharmony_ci        assert ndfas == len(self.dfas)
2277db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2287db96d56Sopenharmony_ci        assert line == "\tdfas,\n", (lineno, line)
2297db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2307db96d56Sopenharmony_ci        mo = re.match(r"\s+{(\d+), labels},$", line)
2317db96d56Sopenharmony_ci        assert mo, (lineno, line)
2327db96d56Sopenharmony_ci        nlabels = int(mo.group(1))
2337db96d56Sopenharmony_ci        assert nlabels == len(self.labels), (lineno, line)
2347db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2357db96d56Sopenharmony_ci        mo = re.match(r"\s+(\d+)$", line)
2367db96d56Sopenharmony_ci        assert mo, (lineno, line)
2377db96d56Sopenharmony_ci        start = int(mo.group(1))
2387db96d56Sopenharmony_ci        assert start in self.number2symbol, (lineno, line)
2397db96d56Sopenharmony_ci        self.start = start
2407db96d56Sopenharmony_ci        lineno, line = lineno+1, next(f)
2417db96d56Sopenharmony_ci        assert line == "};\n", (lineno, line)
2427db96d56Sopenharmony_ci        try:
2437db96d56Sopenharmony_ci            lineno, line = lineno+1, next(f)
2447db96d56Sopenharmony_ci        except StopIteration:
2457db96d56Sopenharmony_ci            pass
2467db96d56Sopenharmony_ci        else:
2477db96d56Sopenharmony_ci            assert 0, (lineno, line)
2487db96d56Sopenharmony_ci
2497db96d56Sopenharmony_ci    def finish_off(self):
2507db96d56Sopenharmony_ci        """Create additional useful structures.  (Internal)."""
2517db96d56Sopenharmony_ci        self.keywords = {} # map from keyword strings to arc labels
2527db96d56Sopenharmony_ci        self.tokens = {}   # map from numeric token values to arc labels
2537db96d56Sopenharmony_ci        for ilabel, (type, value) in enumerate(self.labels):
2547db96d56Sopenharmony_ci            if type == token.NAME and value is not None:
2557db96d56Sopenharmony_ci                self.keywords[value] = ilabel
2567db96d56Sopenharmony_ci            elif value is None:
2577db96d56Sopenharmony_ci                self.tokens[type] = ilabel
258