#include <Python.h>
#include <errcode.h>

#include "tokenizer.h"
#include "pegen.h"

// TOKENIZER ERRORS

void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }
    PyObject *errstr = NULL;
    PyObject *tuple = NULL;
    PyObject *type;
    PyObject *value;
    PyObject *tback;
    PyErr_Fetch(&type, &value, &tback);
    errstr = PyObject_Str(value);
    if (!errstr) {
        goto error;
    }

    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (!tmp) {
        goto error;
    }

    tuple = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!tuple) {
        goto error;
    }
    PyErr_SetObject(PyExc_SyntaxError, tuple);

error:
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(tback);
    Py_XDECREF(errstr);
    Py_XDECREF(tuple);
}

static inline void
raise_unclosed_parentheses_error(Parser *p) {
    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
    int error_col = p->tok->parencolstack[p->tok->level-1];
    RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
                               error_lineno, error_col, error_lineno, -1,
                               "'%c' was never closed",
                               p->tok->parenstack[p->tok->level-1]);
}

int
_Pypegen_tokenizer_error(Parser *p)
{
    if (PyErr_Occurred()) {
        return -1;
    }

    const char *msg = NULL;
    PyObject* errtype = PyExc_SyntaxError;
    Py_ssize_t col_offset = -1;
    switch (p->tok->done) {
        case E_TOKEN:
            msg = "invalid token";
            break;
        case E_EOF:
            if (p->tok->level) {
                raise_unclosed_parentheses_error(p);
            } else {
                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
            }
            return -1;
        case E_DEDENT:
            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
            return -1;
        case E_INTR:
            if (!PyErr_Occurred()) {
                PyErr_SetNone(PyExc_KeyboardInterrupt);
            }
            return -1;
        case E_NOMEM:
            PyErr_NoMemory();
            return -1;
        case E_TABSPACE:
            errtype = PyExc_TabError;
            msg = "inconsistent use of tabs and spaces in indentation";
            break;
        case E_TOODEEP:
            errtype = PyExc_IndentationError;
            msg = "too many levels of indentation";
            break;
        case E_LINECONT: {
            col_offset = p->tok->cur - p->tok->buf - 1;
            msg = "unexpected character after line continuation character";
            break;
        }
        default:
            msg = "unknown parsing error";
    }

    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
                               col_offset >= 0 ? col_offset : 0,
                               p->tok->lineno, -1, msg);
    return -1;
}
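/* Illustrative note (editorial sketch, not part of the original source): the
 * E_EOF branch above is what turns an input such as
 *
 *     x = (1 +
 *
 * into "SyntaxError: '(' was never closed", reported at the opening
 * parenthesis recorded in parenlinenostack/parencolstack, while an
 * indentation line that mixes tabs and spaces reaches the E_TABSPACE branch
 * and is reported as a TabError rather than a plain SyntaxError.
 */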
int
_Pypegen_raise_decode_error(Parser *p)
{
    assert(PyErr_Occurred());
    const char *errtype = NULL;
    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
        errtype = "unicode error";
    }
    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
        errtype = "value error";
    }
    if (errtype) {
        PyObject *type;
        PyObject *value;
        PyObject *tback;
        PyObject *errstr;
        PyErr_Fetch(&type, &value, &tback);
        errstr = PyObject_Str(value);
        if (errstr) {
            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
            Py_DECREF(errstr);
        }
        else {
            PyErr_Clear();
            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
        }
        Py_XDECREF(type);
        Py_XDECREF(value);
        Py_XDECREF(tback);
    }

    return -1;
}

static int
_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
    // Tokenize the whole input to see if there are any tokenization
    // errors such as mismatching parentheses. These will get priority
    // over generic syntax errors only if the line number of the error is
    // before the one that we had for the generic error.

    // We don't want to tokenize to the end for interactive input.
    if (p->tok->prompt != NULL) {
        return 0;
    }

    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);

    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t current_err_line = current_token->lineno;

    int ret = 0;

    for (;;) {
        const char *start;
        const char *end;
        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
            case ERRORTOKEN:
                if (PyErr_Occurred()) {
                    ret = -1;
                    goto exit;
                }
                if (p->tok->level != 0) {
                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
                    if (current_err_line > error_lineno) {
                        raise_unclosed_parentheses_error(p);
                        ret = -1;
                        goto exit;
                    }
                }
                break;
            case ENDMARKER:
                break;
            default:
                continue;
        }
        break;
    }

exit:
    if (PyErr_Occurred()) {
        Py_XDECREF(value);
        Py_XDECREF(type);
        Py_XDECREF(traceback);
    } else {
        PyErr_Restore(type, value, traceback);
    }
    return ret;
}
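/* Illustrative note (editorial sketch, not part of the original source): for a
 * file such as
 *
 *     x = {'a': 3, 'b': 4,
 *     some_other_code = foo()
 *
 * the parser first records a generic error on line 2, but the full
 * re-tokenization above hits EOF with an open '{' from line 1. Because the
 * opening bracket's line comes before the recorded error line, the final
 * exception becomes "'{' was never closed" instead of "invalid syntax".
 */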
// PARSER ERRORS

void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
{
    if (p->fill == 0) {
        va_list va;
        va_start(va, errmsg);
        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
        va_end(va);
        return NULL;
    }

    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
    Py_ssize_t col_offset;
    Py_ssize_t end_col_offset = -1;
    if (t->col_offset == -1) {
        if (p->tok->cur == p->tok->buf) {
            col_offset = 0;
        } else {
            const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
        }
    } else {
        col_offset = t->col_offset + 1;
    }

    if (t->end_col_offset != -1) {
        end_col_offset = t->end_col_offset + 1;
    }

    va_list va;
    va_start(va, errmsg);
    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
    va_end(va);

    return NULL;
}
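/* Illustrative note (editorial, not part of the original source): token column
 * offsets are 0-based, while SyntaxError.offset is 1-based. That is why the
 * known-token path adds 1 to col_offset/end_col_offset before delegating to
 * _PyPegen_raise_error_known_location(); the fallback path instead measures
 * the offset directly from the tokenizer's current position on the line.
 */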
static PyObject *
get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
{
    /* If the file descriptor is interactive, the source lines of the current
     * (multi-line) statement are stored in p->tok->interactive_src_start.
     * If not, we're parsing from a string, which means that the whole source
     * is stored in p->tok->str. */
    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);

    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
    if (cur_line == NULL) {
        assert(p->tok->fp_interactive);
        // We can reach this point if the tokenizer buffers for interactive source have not been
        // initialized because we failed to decode the original source with the given locale.
        return PyUnicode_FromStringAndSize("", 0);
    }

    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;

    for (int i = 0; i < relative_lineno - 1; i++) {
        char *new_line = strchr(cur_line, '\n');
        // The assert is here for debug builds, but the conditional that
        // follows is there so that release builds do not crash, at the cost
        // of reporting a potentially wrong line.
        assert(new_line != NULL && new_line + 1 < buf_end);
        if (new_line == NULL || new_line + 1 > buf_end) {
            break;
        }
        cur_line = new_line + 1;
    }

    char *next_newline;
    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
        next_newline = cur_line + strlen(cur_line);
    }
    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
}
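/* Illustrative note (editorial sketch, not part of the original source): for
 * REPL input the tokenizer buffers only the current multi-line statement, so a
 * lineno of 2 for
 *
 *     >>> if True:
 *     ...     1 +
 *
 * is resolved against interactive_src_start and yields the "    1 +" line.
 * Decoding with the "replace" error handler keeps error reporting working even
 * if that line contains bytes that are not valid UTF-8.
 */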
void *
_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                    Py_ssize_t lineno, Py_ssize_t col_offset,
                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
                                    const char *errmsg, va_list va)
{
    PyObject *value = NULL;
    PyObject *errstr = NULL;
    PyObject *error_line = NULL;
    PyObject *tmp = NULL;
    p->error_indicator = 1;

    if (end_lineno == CURRENT_POS) {
        end_lineno = p->tok->lineno;
    }
    if (end_col_offset == CURRENT_POS) {
        end_col_offset = p->tok->cur - p->tok->line_start;
    }

    if (p->start_rule == Py_fstring_input) {
        const char *fstring_msg = "f-string: ";
        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);

        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
        if (!new_errmsg) {
            return (void *) PyErr_NoMemory();
        }

        // Copy both strings into the new buffer
        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
        new_errmsg[len] = 0;
        errmsg = new_errmsg;
    }
    errstr = PyUnicode_FromFormatV(errmsg, va);
    if (!errstr) {
        goto error;
    }

    if (p->tok->fp_interactive && p->tok->interactive_src_start != NULL) {
        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
    }
    else if (p->start_rule == Py_file_input) {
        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
                                                     (int) lineno, p->tok->encoding);
    }

    if (!error_line) {
        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
           then we need to find the error line from some other source, because
           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
           failed or we're parsing from a string or the REPL. There's a third edge case where
           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
           does not physically exist */
        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);

        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
            Py_ssize_t size = p->tok->inp - p->tok->buf;
            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
        }
        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
        }
        else {
            error_line = PyUnicode_FromStringAndSize("", 0);
        }
        if (!error_line) {
            goto error;
        }
    }

    if (p->start_rule == Py_fstring_input) {
        col_offset -= p->starting_col_offset;
        end_col_offset -= p->starting_col_offset;
    }

    Py_ssize_t col_number = col_offset;
    Py_ssize_t end_col_number = end_col_offset;

    if (p->tok->encoding != NULL) {
        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
        if (col_number < 0) {
            goto error;
        }
        if (end_col_number > 0) {
            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
            if (end_col_offset < 0) {
                goto error;
            } else {
                end_col_number = end_col_offset;
            }
        }
    }
    tmp = Py_BuildValue("(OnnNnn)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
    if (!tmp) {
        goto error;
    }
    value = PyTuple_Pack(2, errstr, tmp);
    Py_DECREF(tmp);
    if (!value) {
        goto error;
    }
    PyErr_SetObject(errtype, value);

    Py_DECREF(errstr);
    Py_DECREF(value);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;

error:
    Py_XDECREF(errstr);
    Py_XDECREF(error_line);
    if (p->start_rule == Py_fstring_input) {
        PyMem_Free((void *)errmsg);
    }
    return NULL;
}
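/* Illustrative note (editorial, not part of the original source): the tuple
 * built above follows the shape SyntaxError accepts as location details:
 * (filename, lineno, offset, text, end_lineno, end_offset), with the offsets
 * converted from byte positions to character positions when a source encoding
 * is in play. PyErr_SetObject(errtype, (msg, details)) is what finally turns
 * it into the exception the user sees.
 */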
void
_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
    // Existing syntax error
    if (PyErr_Occurred()) {
        // Prioritize tokenizer errors over custom syntax errors raised
        // in the second phase, but only if the existing error comes from the parser.
        int is_tok_ok = (p->tok->done == E_DONE || p->tok->done == E_OK);
        if (is_tok_ok && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_tokenize_full_source_to_check_for_errors(p);
        }
        // Propagate the existing syntax error.
        return;
    }
    // Initialization error
    if (p->fill == 0) {
        RAISE_SYNTAX_ERROR("error at start before reading any input");
    }
    // Parser encountered EOF (End of File) unexpectedly
    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
        if (p->tok->level) {
            raise_unclosed_parentheses_error(p);
        } else {
            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
        }
        return;
    }
    // Indentation error in the tokenizer
    if (last_token->type == INDENT || last_token->type == DEDENT) {
        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
        return;
    }
    // Unknown error (generic case)

    // Use the last token we found on the first pass to avoid reporting
    // incorrect locations for generic syntax errors just because we reached
    // further away when trying to find specific syntax errors in the second
    // pass.
    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
    // generic SyntaxError we just raised if errors are found.
    _PyPegen_tokenize_full_source_to_check_for_errors(p);
}