xref: /third_party/python/Parser/pegen_errors.c (revision 7db96d56)
1#include <Python.h>
2#include <errcode.h>
3
4#include "tokenizer.h"
5#include "pegen.h"
6
7// TOKENIZER ERRORS
8
9void
10_PyPegen_raise_tokenizer_init_error(PyObject *filename)
11{
12    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
13          || PyErr_ExceptionMatches(PyExc_SyntaxError)
14          || PyErr_ExceptionMatches(PyExc_ValueError)
15          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
16        return;
17    }
18    PyObject *errstr = NULL;
19    PyObject *tuple = NULL;
20    PyObject *type;
21    PyObject *value;
22    PyObject *tback;
23    PyErr_Fetch(&type, &value, &tback);
24    errstr = PyObject_Str(value);
25    if (!errstr) {
26        goto error;
27    }
28
29    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
30    if (!tmp) {
31        goto error;
32    }
33
34    tuple = PyTuple_Pack(2, errstr, tmp);
35    Py_DECREF(tmp);
36    if (!value) {
37        goto error;
38    }
39    PyErr_SetObject(PyExc_SyntaxError, tuple);
40
41error:
42    Py_XDECREF(type);
43    Py_XDECREF(value);
44    Py_XDECREF(tback);
45    Py_XDECREF(errstr);
46    Py_XDECREF(tuple);
47}
48
49static inline void
50raise_unclosed_parentheses_error(Parser *p) {
51       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
52       int error_col = p->tok->parencolstack[p->tok->level-1];
53       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
54                                  error_lineno, error_col, error_lineno, -1,
55                                  "'%c' was never closed",
56                                  p->tok->parenstack[p->tok->level-1]);
57}
58
59int
60_Pypegen_tokenizer_error(Parser *p)
61{
62    if (PyErr_Occurred()) {
63        return -1;
64    }
65
66    const char *msg = NULL;
67    PyObject* errtype = PyExc_SyntaxError;
68    Py_ssize_t col_offset = -1;
69    switch (p->tok->done) {
70        case E_TOKEN:
71            msg = "invalid token";
72            break;
73        case E_EOF:
74            if (p->tok->level) {
75                raise_unclosed_parentheses_error(p);
76            } else {
77                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
78            }
79            return -1;
80        case E_DEDENT:
81            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
82            return -1;
83        case E_INTR:
84            if (!PyErr_Occurred()) {
85                PyErr_SetNone(PyExc_KeyboardInterrupt);
86            }
87            return -1;
88        case E_NOMEM:
89            PyErr_NoMemory();
90            return -1;
91        case E_TABSPACE:
92            errtype = PyExc_TabError;
93            msg = "inconsistent use of tabs and spaces in indentation";
94            break;
95        case E_TOODEEP:
96            errtype = PyExc_IndentationError;
97            msg = "too many levels of indentation";
98            break;
99        case E_LINECONT: {
100            col_offset = p->tok->cur - p->tok->buf - 1;
101            msg = "unexpected character after line continuation character";
102            break;
103        }
104        default:
105            msg = "unknown parsing error";
106    }
107
108    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
109                               col_offset >= 0 ? col_offset : 0,
110                               p->tok->lineno, -1, msg);
111    return -1;
112}
113
114int
115_Pypegen_raise_decode_error(Parser *p)
116{
117    assert(PyErr_Occurred());
118    const char *errtype = NULL;
119    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
120        errtype = "unicode error";
121    }
122    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
123        errtype = "value error";
124    }
125    if (errtype) {
126        PyObject *type;
127        PyObject *value;
128        PyObject *tback;
129        PyObject *errstr;
130        PyErr_Fetch(&type, &value, &tback);
131        errstr = PyObject_Str(value);
132        if (errstr) {
133            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
134            Py_DECREF(errstr);
135        }
136        else {
137            PyErr_Clear();
138            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
139        }
140        Py_XDECREF(type);
141        Py_XDECREF(value);
142        Py_XDECREF(tback);
143    }
144
145    return -1;
146}
147
148static int
149_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
150    // Tokenize the whole input to see if there are any tokenization
151    // errors such as mistmatching parentheses. These will get priority
152    // over generic syntax errors only if the line number of the error is
153    // before the one that we had for the generic error.
154
155    // We don't want to tokenize to the end for interactive input
156    if (p->tok->prompt != NULL) {
157        return 0;
158    }
159
160    PyObject *type, *value, *traceback;
161    PyErr_Fetch(&type, &value, &traceback);
162
163    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
164    Py_ssize_t current_err_line = current_token->lineno;
165
166    int ret = 0;
167
168    for (;;) {
169        const char *start;
170        const char *end;
171        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
172            case ERRORTOKEN:
173                if (PyErr_Occurred()) {
174                    ret = -1;
175                    goto exit;
176                }
177                if (p->tok->level != 0) {
178                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
179                    if (current_err_line > error_lineno) {
180                        raise_unclosed_parentheses_error(p);
181                        ret = -1;
182                        goto exit;
183                    }
184                }
185                break;
186            case ENDMARKER:
187                break;
188            default:
189                continue;
190        }
191        break;
192    }
193
194
195exit:
196    if (PyErr_Occurred()) {
197        Py_XDECREF(value);
198        Py_XDECREF(type);
199        Py_XDECREF(traceback);
200    } else {
201        PyErr_Restore(type, value, traceback);
202    }
203    return ret;
204}
205
206// PARSER ERRORS
207
208void *
209_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
210{
211    if (p->fill == 0) {
212        va_list va;
213        va_start(va, errmsg);
214        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
215        va_end(va);
216        return NULL;
217    }
218
219    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
220    Py_ssize_t col_offset;
221    Py_ssize_t end_col_offset = -1;
222    if (t->col_offset == -1) {
223        if (p->tok->cur == p->tok->buf) {
224            col_offset = 0;
225        } else {
226            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
227            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
228        }
229    } else {
230        col_offset = t->col_offset + 1;
231    }
232
233    if (t->end_col_offset != -1) {
234        end_col_offset = t->end_col_offset + 1;
235    }
236
237    va_list va;
238    va_start(va, errmsg);
239    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
240    va_end(va);
241
242    return NULL;
243}
244
245static PyObject *
246get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
247{
248    /* If the file descriptor is interactive, the source lines of the current
249     * (multi-line) statement are stored in p->tok->interactive_src_start.
250     * If not, we're parsing from a string, which means that the whole source
251     * is stored in p->tok->str. */
252    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
253
254    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
255    if (cur_line == NULL) {
256        assert(p->tok->fp_interactive);
257        // We can reach this point if the tokenizer buffers for interactive source have not been
258        // initialized because we failed to decode the original source with the given locale.
259        return PyUnicode_FromStringAndSize("", 0);
260    }
261
262    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
263    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
264
265    for (int i = 0; i < relative_lineno - 1; i++) {
266        char *new_line = strchr(cur_line, '\n');
267        // The assert is here for debug builds but the conditional that
268        // follows is there so in release builds we do not crash at the cost
269        // to report a potentially wrong line.
270        assert(new_line != NULL && new_line + 1 < buf_end);
271        if (new_line == NULL || new_line + 1 > buf_end) {
272            break;
273        }
274        cur_line = new_line + 1;
275    }
276
277    char *next_newline;
278    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
279        next_newline = cur_line + strlen(cur_line);
280    }
281    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
282}
283
284void *
285_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
286                                    Py_ssize_t lineno, Py_ssize_t col_offset,
287                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
288                                    const char *errmsg, va_list va)
289{
290    PyObject *value = NULL;
291    PyObject *errstr = NULL;
292    PyObject *error_line = NULL;
293    PyObject *tmp = NULL;
294    p->error_indicator = 1;
295
296    if (end_lineno == CURRENT_POS) {
297        end_lineno = p->tok->lineno;
298    }
299    if (end_col_offset == CURRENT_POS) {
300        end_col_offset = p->tok->cur - p->tok->line_start;
301    }
302
303    if (p->start_rule == Py_fstring_input) {
304        const char *fstring_msg = "f-string: ";
305        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
306
307        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
308        if (!new_errmsg) {
309            return (void *) PyErr_NoMemory();
310        }
311
312        // Copy both strings into new buffer
313        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
314        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
315        new_errmsg[len] = 0;
316        errmsg = new_errmsg;
317    }
318    errstr = PyUnicode_FromFormatV(errmsg, va);
319    if (!errstr) {
320        goto error;
321    }
322
323    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
324        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
325    }
326    else if (p->start_rule == Py_file_input) {
327        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
328                                                     (int) lineno, p->tok->encoding);
329    }
330
331    if (!error_line) {
332        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
333           then we need to find the error line from some other source, because
334           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
335           failed or we're parsing from a string or the REPL. There's a third edge case where
336           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
337           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
338           does not physically exist */
339        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
340
341        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
342            Py_ssize_t size = p->tok->inp - p->tok->buf;
343            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
344        }
345        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
346            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
347        }
348        else {
349            error_line = PyUnicode_FromStringAndSize("", 0);
350        }
351        if (!error_line) {
352            goto error;
353        }
354    }
355
356    if (p->start_rule == Py_fstring_input) {
357        col_offset -= p->starting_col_offset;
358        end_col_offset -= p->starting_col_offset;
359    }
360
361    Py_ssize_t col_number = col_offset;
362    Py_ssize_t end_col_number = end_col_offset;
363
364    if (p->tok->encoding != NULL) {
365        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
366        if (col_number < 0) {
367            goto error;
368        }
369        if (end_col_number > 0) {
370            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
371            if (end_col_offset < 0) {
372                goto error;
373            } else {
374                end_col_number = end_col_offset;
375            }
376        }
377    }
378    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
379    if (!tmp) {
380        goto error;
381    }
382    value = PyTuple_Pack(2, errstr, tmp);
383    Py_DECREF(tmp);
384    if (!value) {
385        goto error;
386    }
387    PyErr_SetObject(errtype, value);
388
389    Py_DECREF(errstr);
390    Py_DECREF(value);
391    if (p->start_rule == Py_fstring_input) {
392        PyMem_Free((void *)errmsg);
393    }
394    return NULL;
395
396error:
397    Py_XDECREF(errstr);
398    Py_XDECREF(error_line);
399    if (p->start_rule == Py_fstring_input) {
400        PyMem_Free((void *)errmsg);
401    }
402    return NULL;
403}
404
405void
406_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
407    // Existing sintax error
408    if (PyErr_Occurred()) {
409        // Prioritize tokenizer errors to custom syntax errors raised
410        // on the second phase only if the errors come from the parser.
411        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
412        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
413            _PyPegen_tokenize_full_source_to_check_for_errors(p);
414        }
415        // Propagate the existing syntax error.
416        return;
417    }
418    // Initialization error
419    if (p->fill == 0) {
420        RAISE_SYNTAX_ERROR("error at start before reading any input");
421    }
422    // Parser encountered EOF (End of File) unexpectedtly
423    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
424        if (p->tok->level) {
425            raise_unclosed_parentheses_error(p);
426        } else {
427            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
428        }
429        return;
430    }
431    // Indentation error in the tokenizer
432    if (last_token->type == INDENT || last_token->type == DEDENT) {
433        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
434        return;
435    }
436    // Unknown error (generic case)
437
438    // Use the last token we found on the first pass to avoid reporting
439    // incorrect locations for generic syntax errors just because we reached
440    // further away when trying to find specific syntax errors in the second
441    // pass.
442    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
443    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
444    // generic SyntaxError we just raised if errors are found.
445    _PyPegen_tokenize_full_source_to_check_for_errors(p);
446}
447