17db96d56Sopenharmony_ci# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 27db96d56Sopenharmony_ci# Licensed to PSF under a Contributor Agreement. 37db96d56Sopenharmony_ci 47db96d56Sopenharmony_ci"""This module defines the data structures used to represent a grammar. 57db96d56Sopenharmony_ci 67db96d56Sopenharmony_ciThese are a bit arcane because they are derived from the data 77db96d56Sopenharmony_cistructures used by Python's 'pgen' parser generator. 87db96d56Sopenharmony_ci 97db96d56Sopenharmony_ciThere's also a table here mapping operators to their names in the 107db96d56Sopenharmony_citoken module; the Python tokenize module reports all operators as the 117db96d56Sopenharmony_cifallback token code OP, but the parser needs the actual token code. 127db96d56Sopenharmony_ci 137db96d56Sopenharmony_ci""" 147db96d56Sopenharmony_ci 157db96d56Sopenharmony_ci# Python imports 167db96d56Sopenharmony_ciimport pickle 177db96d56Sopenharmony_ci 187db96d56Sopenharmony_ci# Local imports 197db96d56Sopenharmony_cifrom . import token 207db96d56Sopenharmony_ci 217db96d56Sopenharmony_ci 227db96d56Sopenharmony_ciclass Grammar(object): 237db96d56Sopenharmony_ci """Pgen parsing tables conversion class. 247db96d56Sopenharmony_ci 257db96d56Sopenharmony_ci Once initialized, this class supplies the grammar tables for the 267db96d56Sopenharmony_ci parsing engine implemented by parse.py. The parsing engine 277db96d56Sopenharmony_ci accesses the instance variables directly. The class here does not 287db96d56Sopenharmony_ci provide initialization of the tables; several subclasses exist to 297db96d56Sopenharmony_ci do this (see the conv and pgen modules). 307db96d56Sopenharmony_ci 317db96d56Sopenharmony_ci The load() method reads the tables from a pickle file, which is 327db96d56Sopenharmony_ci much faster than the other ways offered by subclasses. The pickle 337db96d56Sopenharmony_ci file is written by calling dump() (after loading the grammar 347db96d56Sopenharmony_ci tables using a subclass). The report() method prints a readable 357db96d56Sopenharmony_ci representation of the tables to stdout, for debugging. 367db96d56Sopenharmony_ci 377db96d56Sopenharmony_ci The instance variables are as follows: 387db96d56Sopenharmony_ci 397db96d56Sopenharmony_ci symbol2number -- a dict mapping symbol names to numbers. Symbol 407db96d56Sopenharmony_ci numbers are always 256 or higher, to distinguish 417db96d56Sopenharmony_ci them from token numbers, which are between 0 and 427db96d56Sopenharmony_ci 255 (inclusive). 437db96d56Sopenharmony_ci 447db96d56Sopenharmony_ci number2symbol -- a dict mapping numbers to symbol names; 457db96d56Sopenharmony_ci these two are each other's inverse. 467db96d56Sopenharmony_ci 477db96d56Sopenharmony_ci states -- a list of DFAs, where each DFA is a list of 487db96d56Sopenharmony_ci states, each state is a list of arcs, and each 497db96d56Sopenharmony_ci arc is a (i, j) pair where i is a label and j is 507db96d56Sopenharmony_ci a state number. The DFA number is the index into 517db96d56Sopenharmony_ci this list. (This name is slightly confusing.) 527db96d56Sopenharmony_ci Final states are represented by a special arc of 537db96d56Sopenharmony_ci the form (0, j) where j is its own state number. 547db96d56Sopenharmony_ci 557db96d56Sopenharmony_ci dfas -- a dict mapping symbol numbers to (DFA, first) 567db96d56Sopenharmony_ci pairs, where DFA is an item from the states list 577db96d56Sopenharmony_ci above, and first is a set of tokens that can 587db96d56Sopenharmony_ci begin this grammar rule (represented by a dict 597db96d56Sopenharmony_ci whose values are always 1). 607db96d56Sopenharmony_ci 617db96d56Sopenharmony_ci labels -- a list of (x, y) pairs where x is either a token 627db96d56Sopenharmony_ci number or a symbol number, and y is either None 637db96d56Sopenharmony_ci or a string; the strings are keywords. The label 647db96d56Sopenharmony_ci number is the index in this list; label numbers 657db96d56Sopenharmony_ci are used to mark state transitions (arcs) in the 667db96d56Sopenharmony_ci DFAs. 677db96d56Sopenharmony_ci 687db96d56Sopenharmony_ci start -- the number of the grammar's start symbol. 697db96d56Sopenharmony_ci 707db96d56Sopenharmony_ci keywords -- a dict mapping keyword strings to arc labels. 717db96d56Sopenharmony_ci 727db96d56Sopenharmony_ci tokens -- a dict mapping token numbers to arc labels. 737db96d56Sopenharmony_ci 747db96d56Sopenharmony_ci """ 757db96d56Sopenharmony_ci 767db96d56Sopenharmony_ci def __init__(self): 777db96d56Sopenharmony_ci self.symbol2number = {} 787db96d56Sopenharmony_ci self.number2symbol = {} 797db96d56Sopenharmony_ci self.states = [] 807db96d56Sopenharmony_ci self.dfas = {} 817db96d56Sopenharmony_ci self.labels = [(0, "EMPTY")] 827db96d56Sopenharmony_ci self.keywords = {} 837db96d56Sopenharmony_ci self.tokens = {} 847db96d56Sopenharmony_ci self.symbol2label = {} 857db96d56Sopenharmony_ci self.start = 256 867db96d56Sopenharmony_ci 877db96d56Sopenharmony_ci def dump(self, filename): 887db96d56Sopenharmony_ci """Dump the grammar tables to a pickle file.""" 897db96d56Sopenharmony_ci with open(filename, "wb") as f: 907db96d56Sopenharmony_ci pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL) 917db96d56Sopenharmony_ci 927db96d56Sopenharmony_ci def load(self, filename): 937db96d56Sopenharmony_ci """Load the grammar tables from a pickle file.""" 947db96d56Sopenharmony_ci with open(filename, "rb") as f: 957db96d56Sopenharmony_ci d = pickle.load(f) 967db96d56Sopenharmony_ci self.__dict__.update(d) 977db96d56Sopenharmony_ci 987db96d56Sopenharmony_ci def loads(self, pkl): 997db96d56Sopenharmony_ci """Load the grammar tables from a pickle bytes object.""" 1007db96d56Sopenharmony_ci self.__dict__.update(pickle.loads(pkl)) 1017db96d56Sopenharmony_ci 1027db96d56Sopenharmony_ci def copy(self): 1037db96d56Sopenharmony_ci """ 1047db96d56Sopenharmony_ci Copy the grammar. 1057db96d56Sopenharmony_ci """ 1067db96d56Sopenharmony_ci new = self.__class__() 1077db96d56Sopenharmony_ci for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords", 1087db96d56Sopenharmony_ci "tokens", "symbol2label"): 1097db96d56Sopenharmony_ci setattr(new, dict_attr, getattr(self, dict_attr).copy()) 1107db96d56Sopenharmony_ci new.labels = self.labels[:] 1117db96d56Sopenharmony_ci new.states = self.states[:] 1127db96d56Sopenharmony_ci new.start = self.start 1137db96d56Sopenharmony_ci return new 1147db96d56Sopenharmony_ci 1157db96d56Sopenharmony_ci def report(self): 1167db96d56Sopenharmony_ci """Dump the grammar tables to standard output, for debugging.""" 1177db96d56Sopenharmony_ci from pprint import pprint 1187db96d56Sopenharmony_ci print("s2n") 1197db96d56Sopenharmony_ci pprint(self.symbol2number) 1207db96d56Sopenharmony_ci print("n2s") 1217db96d56Sopenharmony_ci pprint(self.number2symbol) 1227db96d56Sopenharmony_ci print("states") 1237db96d56Sopenharmony_ci pprint(self.states) 1247db96d56Sopenharmony_ci print("dfas") 1257db96d56Sopenharmony_ci pprint(self.dfas) 1267db96d56Sopenharmony_ci print("labels") 1277db96d56Sopenharmony_ci pprint(self.labels) 1287db96d56Sopenharmony_ci print("start", self.start) 1297db96d56Sopenharmony_ci 1307db96d56Sopenharmony_ci 1317db96d56Sopenharmony_ci# Map from operator to number (since tokenize doesn't do this) 1327db96d56Sopenharmony_ci 1337db96d56Sopenharmony_ciopmap_raw = """ 1347db96d56Sopenharmony_ci( LPAR 1357db96d56Sopenharmony_ci) RPAR 1367db96d56Sopenharmony_ci[ LSQB 1377db96d56Sopenharmony_ci] RSQB 1387db96d56Sopenharmony_ci: COLON 1397db96d56Sopenharmony_ci, COMMA 1407db96d56Sopenharmony_ci; SEMI 1417db96d56Sopenharmony_ci+ PLUS 1427db96d56Sopenharmony_ci- MINUS 1437db96d56Sopenharmony_ci* STAR 1447db96d56Sopenharmony_ci/ SLASH 1457db96d56Sopenharmony_ci| VBAR 1467db96d56Sopenharmony_ci& AMPER 1477db96d56Sopenharmony_ci< LESS 1487db96d56Sopenharmony_ci> GREATER 1497db96d56Sopenharmony_ci= EQUAL 1507db96d56Sopenharmony_ci. DOT 1517db96d56Sopenharmony_ci% PERCENT 1527db96d56Sopenharmony_ci` BACKQUOTE 1537db96d56Sopenharmony_ci{ LBRACE 1547db96d56Sopenharmony_ci} RBRACE 1557db96d56Sopenharmony_ci@ AT 1567db96d56Sopenharmony_ci@= ATEQUAL 1577db96d56Sopenharmony_ci== EQEQUAL 1587db96d56Sopenharmony_ci!= NOTEQUAL 1597db96d56Sopenharmony_ci<> NOTEQUAL 1607db96d56Sopenharmony_ci<= LESSEQUAL 1617db96d56Sopenharmony_ci>= GREATEREQUAL 1627db96d56Sopenharmony_ci~ TILDE 1637db96d56Sopenharmony_ci^ CIRCUMFLEX 1647db96d56Sopenharmony_ci<< LEFTSHIFT 1657db96d56Sopenharmony_ci>> RIGHTSHIFT 1667db96d56Sopenharmony_ci** DOUBLESTAR 1677db96d56Sopenharmony_ci+= PLUSEQUAL 1687db96d56Sopenharmony_ci-= MINEQUAL 1697db96d56Sopenharmony_ci*= STAREQUAL 1707db96d56Sopenharmony_ci/= SLASHEQUAL 1717db96d56Sopenharmony_ci%= PERCENTEQUAL 1727db96d56Sopenharmony_ci&= AMPEREQUAL 1737db96d56Sopenharmony_ci|= VBAREQUAL 1747db96d56Sopenharmony_ci^= CIRCUMFLEXEQUAL 1757db96d56Sopenharmony_ci<<= LEFTSHIFTEQUAL 1767db96d56Sopenharmony_ci>>= RIGHTSHIFTEQUAL 1777db96d56Sopenharmony_ci**= DOUBLESTAREQUAL 1787db96d56Sopenharmony_ci// DOUBLESLASH 1797db96d56Sopenharmony_ci//= DOUBLESLASHEQUAL 1807db96d56Sopenharmony_ci-> RARROW 1817db96d56Sopenharmony_ci:= COLONEQUAL 1827db96d56Sopenharmony_ci""" 1837db96d56Sopenharmony_ci 1847db96d56Sopenharmony_ciopmap = {} 1857db96d56Sopenharmony_cifor line in opmap_raw.splitlines(): 1867db96d56Sopenharmony_ci if line: 1877db96d56Sopenharmony_ci op, name = line.split() 1887db96d56Sopenharmony_ci opmap[op] = getattr(token, name) 1897db96d56Sopenharmony_cidel line, op, name 190