1"""A simple non-validating parser for C99.
2
3The functions and regex patterns here are not entirely suitable for
4validating C syntax.  Please rely on a proper compiler for that.
5Instead our goal here is merely matching and extracting information from
6valid C code.
7
8Furthermore, the grammar rules for the C syntax (particularly as
9described in the K&R book) actually describe a superset, of which the
10full C language is a proper subset.  Here are some of the extra
11conditions that must be applied when parsing C code:
12
13* ...
14
15(see: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf)
16
17We have taken advantage of the elements of the C grammar that are used
18only in a few limited contexts, mostly as delimiters.  They allow us to
19focus the regex patterns confidently.  Here are the relevant tokens and
20in which grammar rules they are used:
21
22separators:
23* ";"
24   + (decl) struct/union:  at end of each member decl
25   + (decl) declaration:  at end of each (non-compound) decl
26   + (stmt) expr stmt:  at end of each stmt
27   + (stmt) for:  between exprs in "header"
28   + (stmt) goto:  at end
29   + (stmt) continue:  at end
30   + (stmt) break:  at end
31   + (stmt) return:  at end
32* ","
33   + (decl) struct/union:  between member declators
34   + (decl) param-list:  between params
35   + (decl) enum: between enumerators
36   + (decl) initializer (compound):  between initializers
37   + (expr) postfix:  between func call args
38   + (expr) expression:  between "assignment" exprs
39* ":"
40   + (decl) struct/union:  in member declators
41   + (stmt) label:  between label and stmt
42   + (stmt) case:  between expression and stmt
43   + (stmt) default:  between "default" and stmt
44* "="
45   + (decl) declaration:  between decl and initializer
46   + (decl) enumerator:  between identifier and "initializer"
47   + (expr) assignment:  between "var" and expr
48
49wrappers:
50* "(...)"
51   + (decl) declarator (func ptr):  to wrap ptr/name
52   + (decl) declarator (func ptr):  around params
53   + (decl) declarator:  around sub-declarator (for readability)
54   + (expr) postfix (func call):  around args
55   + (expr) primary:  around sub-expr
56   + (stmt) if:  around condition
57   + (stmt) switch:  around source expr
58   + (stmt) while:  around condition
59   + (stmt) do-while:  around condition
60   + (stmt) for:  around "header"
61* "{...}"
62   + (decl) enum:  around enumerators
63   + (decl) func:  around body
64   + (stmt) compound:  around stmts
65* "[...]"
66   + (decl) declarator:  for arrays
67   + (expr) postfix:  array access
68
69other:
70* "*"
71   + (decl) declarator:  for pointer types
72   + (expr) unary:  for pointer deref
73
74
75To simplify the regular expressions used here, we've taken some
76shortcuts and made certain assumptions about the code we are parsing.
77Some of these allow us to skip context-sensitive matching (e.g. braces)
78or otherwise still match arbitrary C code unambiguously.  In a few
79corner cases, however, the patterns are ambiguous relative to
80arbitrary C code, though they are still unambiguous in the specific
81code we are parsing.
82
83Here are the cases where we've taken shortcuts or made assumptions:
84
85* there is no overlap syntactically between the local context (func
86  bodies) and the global context (other than variable decls), so we
87  do not need to worry about ambiguity due to the overlap:
88   + the global context has no expressions or statements
89   + the local context has no function definitions or type decls
90* no "inline" type declarations (struct, union, enum) in function
91  parameters ~(including function pointers)~
92* no "inline" type decls in function return types
93* no superfluous parentheses in declarators
94* var decls in for loops are always "simple" (e.g. no inline types)
95* only inline struct/union/enum decls may be anonymous (without a name)
96* no function pointers in function pointer parameters
97* for loop "headers" do not have curly braces (e.g. compound init)
98* syntactically, variable decls do not overlap with stmts/exprs, except
99  in the following case:
100    spam (*eggs) (...)
101  This could be either a function pointer variable named "eggs"
102  or a call to a function named "spam", which returns a function
103  pointer that gets called.  The only differentiator is the
104  syntax used in the "..." part.  It will be comma-separated
105  parameters for the former and comma-separated expressions for
106  the latter.  Thus, if we expect such decls or calls then we must
107  parse the decl params.
108"""
109
110"""
111TODO:
112* extract CPython-specific code
113* drop include injection (or only add when needed)
114* track position instead of slicing "text"
115* Parser class instead of the _iter_source() mess
116* alt impl using a state machine (& tokenizer or split on delimiters)
117"""
118
119from ..info import ParsedItem
120from ._info import SourceInfo
121
122
def parse(srclines, **srckwargs):
    """Yield a ParsedItem for each raw item parsed out of the source lines.

    "srclines" and any extra keyword arguments are handed to _parse().
    Passing a str (i.e. a filename) is not supported yet and results
    in NotImplementedError once iteration starts.
    """
    if isinstance(srclines, str):  # a filename
        raise NotImplementedError

    # One name factory is shared by everything parsed during this call.
    make_anon_name = anonymous_names()
    raw_items = _parse(srclines, make_anon_name, **srckwargs)
    yield from (ParsedItem.from_raw(raw) for raw in raw_items)
130
131
132# XXX Later: Add a separate function to deal with preprocessor directives
133# parsed out of raw source.
134
135
def anonymous_names():
    """Return a function that generates sequentially numbered names.

    Each call to the returned function produces "<prefix><N>", where N
    starts at 1 and goes up by one per call, regardless of the prefix
    used.  Every call to anonymous_names() starts a fresh sequence.
    """
    next_id = 1

    def anon_name(prefix='anon-'):
        nonlocal next_id
        # Claim the current number and advance the shared sequence.
        current, next_id = next_id, next_id + 1
        return f'{prefix}{current}'

    return anon_name
144
145
146#############################
147# internal impl
148
149import logging
150
151
152_logger = logging.getLogger(__name__)
153
154
def _parse(srclines, anon_name, **srckwargs):
    """Yield each result that parse_globals() produces for the given lines.

    The extra keyword arguments are passed through to _iter_source(),
    and "anon_name" is passed on to parse_globals().
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from ._global import parse_globals

    # XXX Handle blocks here instead of in parse_globals().
    yield from parse_globals(_iter_source(srclines, **srckwargs), anon_name)
162
163
164# We use defaults that cover most files.  Files with bigger declarations
165# are covered elsewhere (MAX_SIZES in cpython/_parser.py).
166
def _iter_source(lines, *, maxtext=10_000, maxlines=200, showtext=False):
    """Yield the SourceInfo holding the text that should be parsed next.

    "lines" is an iterable of (fileinfo, line) pairs, where each
    fileinfo provides "filename" and "lno".  One SourceInfo is tracked
    per file on the current stack of files.  Each line is added to its
    file's SourceInfo, which is then yielded for as long as its _used()
    method returns true.  (Presumably the consumer updates the
    SourceInfo between yields -- confirm against SourceInfo.)

    "maxtext" and "maxlines" are passed to SourceInfo.too_much(); a
    falsy or non-positive value is passed along as None.  If "showtext"
    is true then srcinfo.text is logged (at debug level) after each
    yield.

    Once the lines run out, the last SourceInfo is yielded one final
    time (a placeholder named "???" if there were no lines at all).

    Raises Exception if too_much() reports true for a SourceInfo or if
    the lines run out while the final SourceInfo is still "_ready".
    """
    maxtext = maxtext if maxtext and maxtext > 0 else None
    maxlines = maxlines if maxlines and maxlines > 0 else None
    filestack = []
    allinfo = {}
    # "lines" should be (fileinfo, data), as produced by the preprocessor code.
    for fileinfo, line in lines:
        filename = fileinfo.filename
        if filename in filestack:
            # We are back in a file seen earlier, so forget every file
            # that was stacked on top of it.
            while filestack[-1] != filename:
                del allinfo[filestack.pop()]
            srcinfo = allinfo[filename]
        else:
            srcinfo = SourceInfo(filename)
            filestack.append(filename)
            allinfo[filename] = srcinfo

        # The value is passed as a logging arg (not pre-formatted) so
        # it is only rendered when debug logging is actually enabled.
        _logger.debug('-> %s', line)
        srcinfo._add_line(line, fileinfo.lno)
        if srcinfo.too_much(maxtext, maxlines):
            break
        yield from _iter_used(srcinfo, showtext)
    else:
        if not filestack:
            srcinfo = SourceInfo('???')
        else:
            srcinfo = allinfo[filestack[-1]]
            yield from _iter_used(srcinfo, showtext)
        yield srcinfo
        if showtext:
            _logger.debug('=> %s', srcinfo.text)
        if not srcinfo._ready:
            return
    # At this point either the file ended prematurely
    # or there's "too much" text.
    filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text
    if len(text) > 500:
        text = text[:500] + '...'
    raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}')


def _iter_used(srcinfo, showtext):
    """Yield "srcinfo" repeatedly for as long as srcinfo._used() is true."""
    while srcinfo._used():
        yield srcinfo
        if showtext:
            _logger.debug('=> %s', srcinfo.text)
215