17db96d56Sopenharmony_ci 27db96d56Sopenharmony_ci/* Tokenizer implementation */ 37db96d56Sopenharmony_ci 47db96d56Sopenharmony_ci#define PY_SSIZE_T_CLEAN 57db96d56Sopenharmony_ci#include "Python.h" 67db96d56Sopenharmony_ci#include "pycore_call.h" // _PyObject_CallNoArgs() 77db96d56Sopenharmony_ci 87db96d56Sopenharmony_ci#include <ctype.h> 97db96d56Sopenharmony_ci#include <assert.h> 107db96d56Sopenharmony_ci 117db96d56Sopenharmony_ci#include "tokenizer.h" 127db96d56Sopenharmony_ci#include "errcode.h" 137db96d56Sopenharmony_ci 147db96d56Sopenharmony_ci#include "unicodeobject.h" 157db96d56Sopenharmony_ci#include "bytesobject.h" 167db96d56Sopenharmony_ci#include "fileobject.h" 177db96d56Sopenharmony_ci#include "abstract.h" 187db96d56Sopenharmony_ci 197db96d56Sopenharmony_ci/* Alternate tab spacing */ 207db96d56Sopenharmony_ci#define ALTTABSIZE 1 217db96d56Sopenharmony_ci 227db96d56Sopenharmony_ci#define is_potential_identifier_start(c) (\ 237db96d56Sopenharmony_ci (c >= 'a' && c <= 'z')\ 247db96d56Sopenharmony_ci || (c >= 'A' && c <= 'Z')\ 257db96d56Sopenharmony_ci || c == '_'\ 267db96d56Sopenharmony_ci || (c >= 128)) 277db96d56Sopenharmony_ci 287db96d56Sopenharmony_ci#define is_potential_identifier_char(c) (\ 297db96d56Sopenharmony_ci (c >= 'a' && c <= 'z')\ 307db96d56Sopenharmony_ci || (c >= 'A' && c <= 'Z')\ 317db96d56Sopenharmony_ci || (c >= '0' && c <= '9')\ 327db96d56Sopenharmony_ci || c == '_'\ 337db96d56Sopenharmony_ci || (c >= 128)) 347db96d56Sopenharmony_ci 357db96d56Sopenharmony_ci 367db96d56Sopenharmony_ci/* Don't ever change this -- it would break the portability of Python code */ 377db96d56Sopenharmony_ci#define TABSIZE 8 387db96d56Sopenharmony_ci 397db96d56Sopenharmony_ci/* Forward */ 407db96d56Sopenharmony_cistatic struct tok_state *tok_new(void); 417db96d56Sopenharmony_cistatic int tok_nextc(struct tok_state *tok); 427db96d56Sopenharmony_cistatic void tok_backup(struct tok_state *tok, int c); 437db96d56Sopenharmony_cistatic int syntaxerror(struct tok_state *tok, const char *format, ...); 447db96d56Sopenharmony_ci 457db96d56Sopenharmony_ci/* Spaces in this constant are treated as "zero or more spaces or tabs" when 467db96d56Sopenharmony_ci tokenizing. */ 477db96d56Sopenharmony_cistatic const char* type_comment_prefix = "# type: "; 487db96d56Sopenharmony_ci 497db96d56Sopenharmony_ci/* Create and initialize a new tok_state structure */ 507db96d56Sopenharmony_ci 517db96d56Sopenharmony_cistatic struct tok_state * 527db96d56Sopenharmony_citok_new(void) 537db96d56Sopenharmony_ci{ 547db96d56Sopenharmony_ci struct tok_state *tok = (struct tok_state *)PyMem_Malloc( 557db96d56Sopenharmony_ci sizeof(struct tok_state)); 567db96d56Sopenharmony_ci if (tok == NULL) 577db96d56Sopenharmony_ci return NULL; 587db96d56Sopenharmony_ci tok->buf = tok->cur = tok->inp = NULL; 597db96d56Sopenharmony_ci tok->fp_interactive = 0; 607db96d56Sopenharmony_ci tok->interactive_src_start = NULL; 617db96d56Sopenharmony_ci tok->interactive_src_end = NULL; 627db96d56Sopenharmony_ci tok->start = NULL; 637db96d56Sopenharmony_ci tok->end = NULL; 647db96d56Sopenharmony_ci tok->done = E_OK; 657db96d56Sopenharmony_ci tok->fp = NULL; 667db96d56Sopenharmony_ci tok->input = NULL; 677db96d56Sopenharmony_ci tok->tabsize = TABSIZE; 687db96d56Sopenharmony_ci tok->indent = 0; 697db96d56Sopenharmony_ci tok->indstack[0] = 0; 707db96d56Sopenharmony_ci tok->atbol = 1; 717db96d56Sopenharmony_ci tok->pendin = 0; 727db96d56Sopenharmony_ci tok->prompt = tok->nextprompt = NULL; 737db96d56Sopenharmony_ci tok->lineno = 0; 747db96d56Sopenharmony_ci tok->level = 0; 757db96d56Sopenharmony_ci tok->altindstack[0] = 0; 767db96d56Sopenharmony_ci tok->decoding_state = STATE_INIT; 777db96d56Sopenharmony_ci tok->decoding_erred = 0; 787db96d56Sopenharmony_ci tok->enc = NULL; 797db96d56Sopenharmony_ci tok->encoding = NULL; 807db96d56Sopenharmony_ci tok->cont_line = 0; 817db96d56Sopenharmony_ci tok->filename = NULL; 827db96d56Sopenharmony_ci tok->decoding_readline = NULL; 837db96d56Sopenharmony_ci tok->decoding_buffer = NULL; 847db96d56Sopenharmony_ci tok->type_comments = 0; 857db96d56Sopenharmony_ci tok->async_hacks = 0; 867db96d56Sopenharmony_ci tok->async_def = 0; 877db96d56Sopenharmony_ci tok->async_def_indent = 0; 887db96d56Sopenharmony_ci tok->async_def_nl = 0; 897db96d56Sopenharmony_ci tok->interactive_underflow = IUNDERFLOW_NORMAL; 907db96d56Sopenharmony_ci tok->str = NULL; 917db96d56Sopenharmony_ci tok->report_warnings = 1; 927db96d56Sopenharmony_ci return tok; 937db96d56Sopenharmony_ci} 947db96d56Sopenharmony_ci 957db96d56Sopenharmony_cistatic char * 967db96d56Sopenharmony_cinew_string(const char *s, Py_ssize_t len, struct tok_state *tok) 977db96d56Sopenharmony_ci{ 987db96d56Sopenharmony_ci char* result = (char *)PyMem_Malloc(len + 1); 997db96d56Sopenharmony_ci if (!result) { 1007db96d56Sopenharmony_ci tok->done = E_NOMEM; 1017db96d56Sopenharmony_ci return NULL; 1027db96d56Sopenharmony_ci } 1037db96d56Sopenharmony_ci memcpy(result, s, len); 1047db96d56Sopenharmony_ci result[len] = '\0'; 1057db96d56Sopenharmony_ci return result; 1067db96d56Sopenharmony_ci} 1077db96d56Sopenharmony_ci 1087db96d56Sopenharmony_cistatic char * 1097db96d56Sopenharmony_cierror_ret(struct tok_state *tok) /* XXX */ 1107db96d56Sopenharmony_ci{ 1117db96d56Sopenharmony_ci tok->decoding_erred = 1; 1127db96d56Sopenharmony_ci if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */ 1137db96d56Sopenharmony_ci PyMem_Free(tok->buf); 1147db96d56Sopenharmony_ci tok->buf = tok->cur = tok->inp = NULL; 1157db96d56Sopenharmony_ci tok->start = NULL; 1167db96d56Sopenharmony_ci tok->end = NULL; 1177db96d56Sopenharmony_ci tok->done = E_DECODE; 1187db96d56Sopenharmony_ci return NULL; /* as if it were EOF */ 1197db96d56Sopenharmony_ci} 1207db96d56Sopenharmony_ci 1217db96d56Sopenharmony_ci 1227db96d56Sopenharmony_cistatic const char * 1237db96d56Sopenharmony_ciget_normal_name(const char *s) /* for utf-8 and latin-1 */ 1247db96d56Sopenharmony_ci{ 1257db96d56Sopenharmony_ci char buf[13]; 1267db96d56Sopenharmony_ci int i; 1277db96d56Sopenharmony_ci for (i = 0; i < 12; i++) { 1287db96d56Sopenharmony_ci int c = s[i]; 1297db96d56Sopenharmony_ci if (c == '\0') 1307db96d56Sopenharmony_ci break; 1317db96d56Sopenharmony_ci else if (c == '_') 1327db96d56Sopenharmony_ci buf[i] = '-'; 1337db96d56Sopenharmony_ci else 1347db96d56Sopenharmony_ci buf[i] = tolower(c); 1357db96d56Sopenharmony_ci } 1367db96d56Sopenharmony_ci buf[i] = '\0'; 1377db96d56Sopenharmony_ci if (strcmp(buf, "utf-8") == 0 || 1387db96d56Sopenharmony_ci strncmp(buf, "utf-8-", 6) == 0) 1397db96d56Sopenharmony_ci return "utf-8"; 1407db96d56Sopenharmony_ci else if (strcmp(buf, "latin-1") == 0 || 1417db96d56Sopenharmony_ci strcmp(buf, "iso-8859-1") == 0 || 1427db96d56Sopenharmony_ci strcmp(buf, "iso-latin-1") == 0 || 1437db96d56Sopenharmony_ci strncmp(buf, "latin-1-", 8) == 0 || 1447db96d56Sopenharmony_ci strncmp(buf, "iso-8859-1-", 11) == 0 || 1457db96d56Sopenharmony_ci strncmp(buf, "iso-latin-1-", 12) == 0) 1467db96d56Sopenharmony_ci return "iso-8859-1"; 1477db96d56Sopenharmony_ci else 1487db96d56Sopenharmony_ci return s; 1497db96d56Sopenharmony_ci} 1507db96d56Sopenharmony_ci 1517db96d56Sopenharmony_ci/* Return the coding spec in S, or NULL if none is found. */ 1527db96d56Sopenharmony_ci 1537db96d56Sopenharmony_cistatic int 1547db96d56Sopenharmony_ciget_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) 1557db96d56Sopenharmony_ci{ 1567db96d56Sopenharmony_ci Py_ssize_t i; 1577db96d56Sopenharmony_ci *spec = NULL; 1587db96d56Sopenharmony_ci /* Coding spec must be in a comment, and that comment must be 1597db96d56Sopenharmony_ci * the only statement on the source code line. */ 1607db96d56Sopenharmony_ci for (i = 0; i < size - 6; i++) { 1617db96d56Sopenharmony_ci if (s[i] == '#') 1627db96d56Sopenharmony_ci break; 1637db96d56Sopenharmony_ci if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') 1647db96d56Sopenharmony_ci return 1; 1657db96d56Sopenharmony_ci } 1667db96d56Sopenharmony_ci for (; i < size - 6; i++) { /* XXX inefficient search */ 1677db96d56Sopenharmony_ci const char* t = s + i; 1687db96d56Sopenharmony_ci if (memcmp(t, "coding", 6) == 0) { 1697db96d56Sopenharmony_ci const char* begin = NULL; 1707db96d56Sopenharmony_ci t += 6; 1717db96d56Sopenharmony_ci if (t[0] != ':' && t[0] != '=') 1727db96d56Sopenharmony_ci continue; 1737db96d56Sopenharmony_ci do { 1747db96d56Sopenharmony_ci t++; 1757db96d56Sopenharmony_ci } while (t[0] == ' ' || t[0] == '\t'); 1767db96d56Sopenharmony_ci 1777db96d56Sopenharmony_ci begin = t; 1787db96d56Sopenharmony_ci while (Py_ISALNUM(t[0]) || 1797db96d56Sopenharmony_ci t[0] == '-' || t[0] == '_' || t[0] == '.') 1807db96d56Sopenharmony_ci t++; 1817db96d56Sopenharmony_ci 1827db96d56Sopenharmony_ci if (begin < t) { 1837db96d56Sopenharmony_ci char* r = new_string(begin, t - begin, tok); 1847db96d56Sopenharmony_ci const char* q; 1857db96d56Sopenharmony_ci if (!r) 1867db96d56Sopenharmony_ci return 0; 1877db96d56Sopenharmony_ci q = get_normal_name(r); 1887db96d56Sopenharmony_ci if (r != q) { 1897db96d56Sopenharmony_ci PyMem_Free(r); 1907db96d56Sopenharmony_ci r = new_string(q, strlen(q), tok); 1917db96d56Sopenharmony_ci if (!r) 1927db96d56Sopenharmony_ci return 0; 1937db96d56Sopenharmony_ci } 1947db96d56Sopenharmony_ci *spec = r; 1957db96d56Sopenharmony_ci break; 1967db96d56Sopenharmony_ci } 1977db96d56Sopenharmony_ci } 1987db96d56Sopenharmony_ci } 1997db96d56Sopenharmony_ci return 1; 2007db96d56Sopenharmony_ci} 2017db96d56Sopenharmony_ci 2027db96d56Sopenharmony_ci/* Check whether the line contains a coding spec. If it does, 2037db96d56Sopenharmony_ci invoke the set_readline function for the new encoding. 2047db96d56Sopenharmony_ci This function receives the tok_state and the new encoding. 2057db96d56Sopenharmony_ci Return 1 on success, 0 on failure. */ 2067db96d56Sopenharmony_ci 2077db96d56Sopenharmony_cistatic int 2087db96d56Sopenharmony_cicheck_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, 2097db96d56Sopenharmony_ci int set_readline(struct tok_state *, const char *)) 2107db96d56Sopenharmony_ci{ 2117db96d56Sopenharmony_ci char *cs; 2127db96d56Sopenharmony_ci if (tok->cont_line) { 2137db96d56Sopenharmony_ci /* It's a continuation line, so it can't be a coding spec. */ 2147db96d56Sopenharmony_ci tok->decoding_state = STATE_NORMAL; 2157db96d56Sopenharmony_ci return 1; 2167db96d56Sopenharmony_ci } 2177db96d56Sopenharmony_ci if (!get_coding_spec(line, &cs, size, tok)) { 2187db96d56Sopenharmony_ci return 0; 2197db96d56Sopenharmony_ci } 2207db96d56Sopenharmony_ci if (!cs) { 2217db96d56Sopenharmony_ci Py_ssize_t i; 2227db96d56Sopenharmony_ci for (i = 0; i < size; i++) { 2237db96d56Sopenharmony_ci if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') 2247db96d56Sopenharmony_ci break; 2257db96d56Sopenharmony_ci if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { 2267db96d56Sopenharmony_ci /* Stop checking coding spec after a line containing 2277db96d56Sopenharmony_ci * anything except a comment. */ 2287db96d56Sopenharmony_ci tok->decoding_state = STATE_NORMAL; 2297db96d56Sopenharmony_ci break; 2307db96d56Sopenharmony_ci } 2317db96d56Sopenharmony_ci } 2327db96d56Sopenharmony_ci return 1; 2337db96d56Sopenharmony_ci } 2347db96d56Sopenharmony_ci tok->decoding_state = STATE_NORMAL; 2357db96d56Sopenharmony_ci if (tok->encoding == NULL) { 2367db96d56Sopenharmony_ci assert(tok->decoding_readline == NULL); 2377db96d56Sopenharmony_ci if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { 2387db96d56Sopenharmony_ci error_ret(tok); 2397db96d56Sopenharmony_ci PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); 2407db96d56Sopenharmony_ci PyMem_Free(cs); 2417db96d56Sopenharmony_ci return 0; 2427db96d56Sopenharmony_ci } 2437db96d56Sopenharmony_ci tok->encoding = cs; 2447db96d56Sopenharmony_ci } else { /* then, compare cs with BOM */ 2457db96d56Sopenharmony_ci if (strcmp(tok->encoding, cs) != 0) { 2467db96d56Sopenharmony_ci error_ret(tok); 2477db96d56Sopenharmony_ci PyErr_Format(PyExc_SyntaxError, 2487db96d56Sopenharmony_ci "encoding problem: %s with BOM", cs); 2497db96d56Sopenharmony_ci PyMem_Free(cs); 2507db96d56Sopenharmony_ci return 0; 2517db96d56Sopenharmony_ci } 2527db96d56Sopenharmony_ci PyMem_Free(cs); 2537db96d56Sopenharmony_ci } 2547db96d56Sopenharmony_ci return 1; 2557db96d56Sopenharmony_ci} 2567db96d56Sopenharmony_ci 2577db96d56Sopenharmony_ci/* See whether the file starts with a BOM. If it does, 2587db96d56Sopenharmony_ci invoke the set_readline function with the new encoding. 2597db96d56Sopenharmony_ci Return 1 on success, 0 on failure. */ 2607db96d56Sopenharmony_ci 2617db96d56Sopenharmony_cistatic int 2627db96d56Sopenharmony_cicheck_bom(int get_char(struct tok_state *), 2637db96d56Sopenharmony_ci void unget_char(int, struct tok_state *), 2647db96d56Sopenharmony_ci int set_readline(struct tok_state *, const char *), 2657db96d56Sopenharmony_ci struct tok_state *tok) 2667db96d56Sopenharmony_ci{ 2677db96d56Sopenharmony_ci int ch1, ch2, ch3; 2687db96d56Sopenharmony_ci ch1 = get_char(tok); 2697db96d56Sopenharmony_ci tok->decoding_state = STATE_SEEK_CODING; 2707db96d56Sopenharmony_ci if (ch1 == EOF) { 2717db96d56Sopenharmony_ci return 1; 2727db96d56Sopenharmony_ci } else if (ch1 == 0xEF) { 2737db96d56Sopenharmony_ci ch2 = get_char(tok); 2747db96d56Sopenharmony_ci if (ch2 != 0xBB) { 2757db96d56Sopenharmony_ci unget_char(ch2, tok); 2767db96d56Sopenharmony_ci unget_char(ch1, tok); 2777db96d56Sopenharmony_ci return 1; 2787db96d56Sopenharmony_ci } 2797db96d56Sopenharmony_ci ch3 = get_char(tok); 2807db96d56Sopenharmony_ci if (ch3 != 0xBF) { 2817db96d56Sopenharmony_ci unget_char(ch3, tok); 2827db96d56Sopenharmony_ci unget_char(ch2, tok); 2837db96d56Sopenharmony_ci unget_char(ch1, tok); 2847db96d56Sopenharmony_ci return 1; 2857db96d56Sopenharmony_ci } 2867db96d56Sopenharmony_ci } else { 2877db96d56Sopenharmony_ci unget_char(ch1, tok); 2887db96d56Sopenharmony_ci return 1; 2897db96d56Sopenharmony_ci } 2907db96d56Sopenharmony_ci if (tok->encoding != NULL) 2917db96d56Sopenharmony_ci PyMem_Free(tok->encoding); 2927db96d56Sopenharmony_ci tok->encoding = new_string("utf-8", 5, tok); 2937db96d56Sopenharmony_ci if (!tok->encoding) 2947db96d56Sopenharmony_ci return 0; 2957db96d56Sopenharmony_ci /* No need to set_readline: input is already utf-8 */ 2967db96d56Sopenharmony_ci return 1; 2977db96d56Sopenharmony_ci} 2987db96d56Sopenharmony_ci 2997db96d56Sopenharmony_cistatic int 3007db96d56Sopenharmony_citok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { 3017db96d56Sopenharmony_ci assert(tok->fp_interactive); 3027db96d56Sopenharmony_ci 3037db96d56Sopenharmony_ci if (!line) { 3047db96d56Sopenharmony_ci return 0; 3057db96d56Sopenharmony_ci } 3067db96d56Sopenharmony_ci 3077db96d56Sopenharmony_ci Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; 3087db96d56Sopenharmony_ci Py_ssize_t line_size = strlen(line); 3097db96d56Sopenharmony_ci char last_char = line[line_size > 0 ? line_size - 1 : line_size]; 3107db96d56Sopenharmony_ci if (last_char != '\n') { 3117db96d56Sopenharmony_ci line_size += 1; 3127db96d56Sopenharmony_ci } 3137db96d56Sopenharmony_ci char* new_str = tok->interactive_src_start; 3147db96d56Sopenharmony_ci 3157db96d56Sopenharmony_ci new_str = PyMem_Realloc(new_str, current_size + line_size + 1); 3167db96d56Sopenharmony_ci if (!new_str) { 3177db96d56Sopenharmony_ci if (tok->interactive_src_start) { 3187db96d56Sopenharmony_ci PyMem_Free(tok->interactive_src_start); 3197db96d56Sopenharmony_ci } 3207db96d56Sopenharmony_ci tok->interactive_src_start = NULL; 3217db96d56Sopenharmony_ci tok->interactive_src_end = NULL; 3227db96d56Sopenharmony_ci tok->done = E_NOMEM; 3237db96d56Sopenharmony_ci return -1; 3247db96d56Sopenharmony_ci } 3257db96d56Sopenharmony_ci strcpy(new_str + current_size, line); 3267db96d56Sopenharmony_ci if (last_char != '\n') { 3277db96d56Sopenharmony_ci /* Last line does not end in \n, fake one */ 3287db96d56Sopenharmony_ci new_str[current_size + line_size - 1] = '\n'; 3297db96d56Sopenharmony_ci new_str[current_size + line_size] = '\0'; 3307db96d56Sopenharmony_ci } 3317db96d56Sopenharmony_ci tok->interactive_src_start = new_str; 3327db96d56Sopenharmony_ci tok->interactive_src_end = new_str + current_size + line_size; 3337db96d56Sopenharmony_ci return 0; 3347db96d56Sopenharmony_ci} 3357db96d56Sopenharmony_ci 3367db96d56Sopenharmony_ci 3377db96d56Sopenharmony_ci/* Read a line of text from TOK into S, using the stream in TOK. 3387db96d56Sopenharmony_ci Return NULL on failure, else S. 3397db96d56Sopenharmony_ci 3407db96d56Sopenharmony_ci On entry, tok->decoding_buffer will be one of: 3417db96d56Sopenharmony_ci 1) NULL: need to call tok->decoding_readline to get a new line 3427db96d56Sopenharmony_ci 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and 3437db96d56Sopenharmony_ci stored the result in tok->decoding_buffer 3447db96d56Sopenharmony_ci 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room 3457db96d56Sopenharmony_ci (in the s buffer) to copy entire contents of the line read 3467db96d56Sopenharmony_ci by tok->decoding_readline. tok->decoding_buffer has the overflow. 3477db96d56Sopenharmony_ci In this case, tok_readline_recode is called in a loop (with an expanded buffer) 3487db96d56Sopenharmony_ci until the buffer ends with a '\n' (or until the end of the file is 3497db96d56Sopenharmony_ci reached): see tok_nextc and its calls to tok_reserve_buf. 3507db96d56Sopenharmony_ci*/ 3517db96d56Sopenharmony_ci 3527db96d56Sopenharmony_cistatic int 3537db96d56Sopenharmony_citok_reserve_buf(struct tok_state *tok, Py_ssize_t size) 3547db96d56Sopenharmony_ci{ 3557db96d56Sopenharmony_ci Py_ssize_t cur = tok->cur - tok->buf; 3567db96d56Sopenharmony_ci Py_ssize_t oldsize = tok->inp - tok->buf; 3577db96d56Sopenharmony_ci Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1); 3587db96d56Sopenharmony_ci if (newsize > tok->end - tok->buf) { 3597db96d56Sopenharmony_ci char *newbuf = tok->buf; 3607db96d56Sopenharmony_ci Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf; 3617db96d56Sopenharmony_ci Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf; 3627db96d56Sopenharmony_ci Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf; 3637db96d56Sopenharmony_ci newbuf = (char *)PyMem_Realloc(newbuf, newsize); 3647db96d56Sopenharmony_ci if (newbuf == NULL) { 3657db96d56Sopenharmony_ci tok->done = E_NOMEM; 3667db96d56Sopenharmony_ci return 0; 3677db96d56Sopenharmony_ci } 3687db96d56Sopenharmony_ci tok->buf = newbuf; 3697db96d56Sopenharmony_ci tok->cur = tok->buf + cur; 3707db96d56Sopenharmony_ci tok->inp = tok->buf + oldsize; 3717db96d56Sopenharmony_ci tok->end = tok->buf + newsize; 3727db96d56Sopenharmony_ci tok->start = start < 0 ? NULL : tok->buf + start; 3737db96d56Sopenharmony_ci tok->line_start = line_start < 0 ? NULL : tok->buf + line_start; 3747db96d56Sopenharmony_ci tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start; 3757db96d56Sopenharmony_ci } 3767db96d56Sopenharmony_ci return 1; 3777db96d56Sopenharmony_ci} 3787db96d56Sopenharmony_ci 3797db96d56Sopenharmony_cistatic inline int 3807db96d56Sopenharmony_cicontains_null_bytes(const char* str, size_t size) { 3817db96d56Sopenharmony_ci return memchr(str, 0, size) != NULL; 3827db96d56Sopenharmony_ci} 3837db96d56Sopenharmony_ci 3847db96d56Sopenharmony_cistatic int 3857db96d56Sopenharmony_citok_readline_recode(struct tok_state *tok) { 3867db96d56Sopenharmony_ci PyObject *line; 3877db96d56Sopenharmony_ci const char *buf; 3887db96d56Sopenharmony_ci Py_ssize_t buflen; 3897db96d56Sopenharmony_ci line = tok->decoding_buffer; 3907db96d56Sopenharmony_ci if (line == NULL) { 3917db96d56Sopenharmony_ci line = PyObject_CallNoArgs(tok->decoding_readline); 3927db96d56Sopenharmony_ci if (line == NULL) { 3937db96d56Sopenharmony_ci error_ret(tok); 3947db96d56Sopenharmony_ci goto error; 3957db96d56Sopenharmony_ci } 3967db96d56Sopenharmony_ci } 3977db96d56Sopenharmony_ci else { 3987db96d56Sopenharmony_ci tok->decoding_buffer = NULL; 3997db96d56Sopenharmony_ci } 4007db96d56Sopenharmony_ci buf = PyUnicode_AsUTF8AndSize(line, &buflen); 4017db96d56Sopenharmony_ci if (buf == NULL) { 4027db96d56Sopenharmony_ci error_ret(tok); 4037db96d56Sopenharmony_ci goto error; 4047db96d56Sopenharmony_ci } 4057db96d56Sopenharmony_ci // Make room for the null terminator *and* potentially 4067db96d56Sopenharmony_ci // an extra newline character that we may need to artificially 4077db96d56Sopenharmony_ci // add. 4087db96d56Sopenharmony_ci size_t buffer_size = buflen + 2; 4097db96d56Sopenharmony_ci if (!tok_reserve_buf(tok, buffer_size)) { 4107db96d56Sopenharmony_ci goto error; 4117db96d56Sopenharmony_ci } 4127db96d56Sopenharmony_ci memcpy(tok->inp, buf, buflen); 4137db96d56Sopenharmony_ci tok->inp += buflen; 4147db96d56Sopenharmony_ci *tok->inp = '\0'; 4157db96d56Sopenharmony_ci if (tok->fp_interactive && 4167db96d56Sopenharmony_ci tok_concatenate_interactive_new_line(tok, buf) == -1) { 4177db96d56Sopenharmony_ci goto error; 4187db96d56Sopenharmony_ci } 4197db96d56Sopenharmony_ci Py_DECREF(line); 4207db96d56Sopenharmony_ci return 1; 4217db96d56Sopenharmony_cierror: 4227db96d56Sopenharmony_ci Py_XDECREF(line); 4237db96d56Sopenharmony_ci return 0; 4247db96d56Sopenharmony_ci} 4257db96d56Sopenharmony_ci 4267db96d56Sopenharmony_ci/* Set the readline function for TOK to a StreamReader's 4277db96d56Sopenharmony_ci readline function. The StreamReader is named ENC. 4287db96d56Sopenharmony_ci 4297db96d56Sopenharmony_ci This function is called from check_bom and check_coding_spec. 4307db96d56Sopenharmony_ci 4317db96d56Sopenharmony_ci ENC is usually identical to the future value of tok->encoding, 4327db96d56Sopenharmony_ci except for the (currently unsupported) case of UTF-16. 4337db96d56Sopenharmony_ci 4347db96d56Sopenharmony_ci Return 1 on success, 0 on failure. */ 4357db96d56Sopenharmony_ci 4367db96d56Sopenharmony_cistatic int 4377db96d56Sopenharmony_cifp_setreadl(struct tok_state *tok, const char* enc) 4387db96d56Sopenharmony_ci{ 4397db96d56Sopenharmony_ci PyObject *readline, *io, *stream; 4407db96d56Sopenharmony_ci int fd; 4417db96d56Sopenharmony_ci long pos; 4427db96d56Sopenharmony_ci 4437db96d56Sopenharmony_ci fd = fileno(tok->fp); 4447db96d56Sopenharmony_ci /* Due to buffering the file offset for fd can be different from the file 4457db96d56Sopenharmony_ci * position of tok->fp. If tok->fp was opened in text mode on Windows, 4467db96d56Sopenharmony_ci * its file position counts CRLF as one char and can't be directly mapped 4477db96d56Sopenharmony_ci * to the file offset for fd. Instead we step back one byte and read to 4487db96d56Sopenharmony_ci * the end of line.*/ 4497db96d56Sopenharmony_ci pos = ftell(tok->fp); 4507db96d56Sopenharmony_ci if (pos == -1 || 4517db96d56Sopenharmony_ci lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { 4527db96d56Sopenharmony_ci PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); 4537db96d56Sopenharmony_ci return 0; 4547db96d56Sopenharmony_ci } 4557db96d56Sopenharmony_ci 4567db96d56Sopenharmony_ci io = PyImport_ImportModule("io"); 4577db96d56Sopenharmony_ci if (io == NULL) { 4587db96d56Sopenharmony_ci return 0; 4597db96d56Sopenharmony_ci } 4607db96d56Sopenharmony_ci stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO", 4617db96d56Sopenharmony_ci fd, "r", -1, enc, Py_None, Py_None, Py_False); 4627db96d56Sopenharmony_ci Py_DECREF(io); 4637db96d56Sopenharmony_ci if (stream == NULL) { 4647db96d56Sopenharmony_ci return 0; 4657db96d56Sopenharmony_ci } 4667db96d56Sopenharmony_ci 4677db96d56Sopenharmony_ci readline = PyObject_GetAttr(stream, &_Py_ID(readline)); 4687db96d56Sopenharmony_ci Py_DECREF(stream); 4697db96d56Sopenharmony_ci if (readline == NULL) { 4707db96d56Sopenharmony_ci return 0; 4717db96d56Sopenharmony_ci } 4727db96d56Sopenharmony_ci Py_XSETREF(tok->decoding_readline, readline); 4737db96d56Sopenharmony_ci 4747db96d56Sopenharmony_ci if (pos > 0) { 4757db96d56Sopenharmony_ci PyObject *bufobj = _PyObject_CallNoArgs(readline); 4767db96d56Sopenharmony_ci if (bufobj == NULL) { 4777db96d56Sopenharmony_ci return 0; 4787db96d56Sopenharmony_ci } 4797db96d56Sopenharmony_ci Py_DECREF(bufobj); 4807db96d56Sopenharmony_ci } 4817db96d56Sopenharmony_ci 4827db96d56Sopenharmony_ci return 1; 4837db96d56Sopenharmony_ci} 4847db96d56Sopenharmony_ci 4857db96d56Sopenharmony_ci/* Fetch the next byte from TOK. */ 4867db96d56Sopenharmony_ci 4877db96d56Sopenharmony_cistatic int fp_getc(struct tok_state *tok) { 4887db96d56Sopenharmony_ci return getc(tok->fp); 4897db96d56Sopenharmony_ci} 4907db96d56Sopenharmony_ci 4917db96d56Sopenharmony_ci/* Unfetch the last byte back into TOK. */ 4927db96d56Sopenharmony_ci 4937db96d56Sopenharmony_cistatic void fp_ungetc(int c, struct tok_state *tok) { 4947db96d56Sopenharmony_ci ungetc(c, tok->fp); 4957db96d56Sopenharmony_ci} 4967db96d56Sopenharmony_ci 4977db96d56Sopenharmony_ci/* Check whether the characters at s start a valid 4987db96d56Sopenharmony_ci UTF-8 sequence. Return the number of characters forming 4997db96d56Sopenharmony_ci the sequence if yes, 0 if not. The special cases match 5007db96d56Sopenharmony_ci those in stringlib/codecs.h:utf8_decode. 5017db96d56Sopenharmony_ci*/ 5027db96d56Sopenharmony_cistatic int 5037db96d56Sopenharmony_civalid_utf8(const unsigned char* s) 5047db96d56Sopenharmony_ci{ 5057db96d56Sopenharmony_ci int expected = 0; 5067db96d56Sopenharmony_ci int length; 5077db96d56Sopenharmony_ci if (*s < 0x80) { 5087db96d56Sopenharmony_ci /* single-byte code */ 5097db96d56Sopenharmony_ci return 1; 5107db96d56Sopenharmony_ci } 5117db96d56Sopenharmony_ci else if (*s < 0xE0) { 5127db96d56Sopenharmony_ci /* \xC2\x80-\xDF\xBF -- 0080-07FF */ 5137db96d56Sopenharmony_ci if (*s < 0xC2) { 5147db96d56Sopenharmony_ci /* invalid sequence 5157db96d56Sopenharmony_ci \x80-\xBF -- continuation byte 5167db96d56Sopenharmony_ci \xC0-\xC1 -- fake 0000-007F */ 5177db96d56Sopenharmony_ci return 0; 5187db96d56Sopenharmony_ci } 5197db96d56Sopenharmony_ci expected = 1; 5207db96d56Sopenharmony_ci } 5217db96d56Sopenharmony_ci else if (*s < 0xF0) { 5227db96d56Sopenharmony_ci /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ 5237db96d56Sopenharmony_ci if (*s == 0xE0 && *(s + 1) < 0xA0) { 5247db96d56Sopenharmony_ci /* invalid sequence 5257db96d56Sopenharmony_ci \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ 5267db96d56Sopenharmony_ci return 0; 5277db96d56Sopenharmony_ci } 5287db96d56Sopenharmony_ci else if (*s == 0xED && *(s + 1) >= 0xA0) { 5297db96d56Sopenharmony_ci /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF 5307db96d56Sopenharmony_ci will result in surrogates in range D800-DFFF. Surrogates are 5317db96d56Sopenharmony_ci not valid UTF-8 so they are rejected. 5327db96d56Sopenharmony_ci See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 5337db96d56Sopenharmony_ci (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 5347db96d56Sopenharmony_ci return 0; 5357db96d56Sopenharmony_ci } 5367db96d56Sopenharmony_ci expected = 2; 5377db96d56Sopenharmony_ci } 5387db96d56Sopenharmony_ci else if (*s < 0xF5) { 5397db96d56Sopenharmony_ci /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ 5407db96d56Sopenharmony_ci if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { 5417db96d56Sopenharmony_ci /* invalid sequence -- one of: 5427db96d56Sopenharmony_ci \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF 5437db96d56Sopenharmony_ci \xF4\x90\x80\x80- -- 110000- overflow */ 5447db96d56Sopenharmony_ci return 0; 5457db96d56Sopenharmony_ci } 5467db96d56Sopenharmony_ci expected = 3; 5477db96d56Sopenharmony_ci } 5487db96d56Sopenharmony_ci else { 5497db96d56Sopenharmony_ci /* invalid start byte */ 5507db96d56Sopenharmony_ci return 0; 5517db96d56Sopenharmony_ci } 5527db96d56Sopenharmony_ci length = expected + 1; 5537db96d56Sopenharmony_ci for (; expected; expected--) 5547db96d56Sopenharmony_ci if (s[expected] < 0x80 || s[expected] >= 0xC0) 5557db96d56Sopenharmony_ci return 0; 5567db96d56Sopenharmony_ci return length; 5577db96d56Sopenharmony_ci} 5587db96d56Sopenharmony_ci 5597db96d56Sopenharmony_cistatic int 5607db96d56Sopenharmony_ciensure_utf8(char *line, struct tok_state *tok) 5617db96d56Sopenharmony_ci{ 5627db96d56Sopenharmony_ci int badchar = 0; 5637db96d56Sopenharmony_ci unsigned char *c; 5647db96d56Sopenharmony_ci int length; 5657db96d56Sopenharmony_ci for (c = (unsigned char *)line; *c; c += length) { 5667db96d56Sopenharmony_ci if (!(length = valid_utf8(c))) { 5677db96d56Sopenharmony_ci badchar = *c; 5687db96d56Sopenharmony_ci break; 5697db96d56Sopenharmony_ci } 5707db96d56Sopenharmony_ci } 5717db96d56Sopenharmony_ci if (badchar) { 5727db96d56Sopenharmony_ci PyErr_Format(PyExc_SyntaxError, 5737db96d56Sopenharmony_ci "Non-UTF-8 code starting with '\\x%.2x' " 5747db96d56Sopenharmony_ci "in file %U on line %i, " 5757db96d56Sopenharmony_ci "but no encoding declared; " 5767db96d56Sopenharmony_ci "see https://peps.python.org/pep-0263/ for details", 5777db96d56Sopenharmony_ci badchar, tok->filename, tok->lineno); 5787db96d56Sopenharmony_ci return 0; 5797db96d56Sopenharmony_ci } 5807db96d56Sopenharmony_ci return 1; 5817db96d56Sopenharmony_ci} 5827db96d56Sopenharmony_ci 5837db96d56Sopenharmony_ci/* Fetch a byte from TOK, using the string buffer. */ 5847db96d56Sopenharmony_ci 5857db96d56Sopenharmony_cistatic int 5867db96d56Sopenharmony_cibuf_getc(struct tok_state *tok) { 5877db96d56Sopenharmony_ci return Py_CHARMASK(*tok->str++); 5887db96d56Sopenharmony_ci} 5897db96d56Sopenharmony_ci 5907db96d56Sopenharmony_ci/* Unfetch a byte from TOK, using the string buffer. */ 5917db96d56Sopenharmony_ci 5927db96d56Sopenharmony_cistatic void 5937db96d56Sopenharmony_cibuf_ungetc(int c, struct tok_state *tok) { 5947db96d56Sopenharmony_ci tok->str--; 5957db96d56Sopenharmony_ci assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ 5967db96d56Sopenharmony_ci} 5977db96d56Sopenharmony_ci 5987db96d56Sopenharmony_ci/* Set the readline function for TOK to ENC. For the string-based 5997db96d56Sopenharmony_ci tokenizer, this means to just record the encoding. */ 6007db96d56Sopenharmony_ci 6017db96d56Sopenharmony_cistatic int 6027db96d56Sopenharmony_cibuf_setreadl(struct tok_state *tok, const char* enc) { 6037db96d56Sopenharmony_ci tok->enc = enc; 6047db96d56Sopenharmony_ci return 1; 6057db96d56Sopenharmony_ci} 6067db96d56Sopenharmony_ci 6077db96d56Sopenharmony_ci/* Return a UTF-8 encoding Python string object from the 6087db96d56Sopenharmony_ci C byte string STR, which is encoded with ENC. */ 6097db96d56Sopenharmony_ci 6107db96d56Sopenharmony_cistatic PyObject * 6117db96d56Sopenharmony_citranslate_into_utf8(const char* str, const char* enc) { 6127db96d56Sopenharmony_ci PyObject *utf8; 6137db96d56Sopenharmony_ci PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); 6147db96d56Sopenharmony_ci if (buf == NULL) 6157db96d56Sopenharmony_ci return NULL; 6167db96d56Sopenharmony_ci utf8 = PyUnicode_AsUTF8String(buf); 6177db96d56Sopenharmony_ci Py_DECREF(buf); 6187db96d56Sopenharmony_ci return utf8; 6197db96d56Sopenharmony_ci} 6207db96d56Sopenharmony_ci 6217db96d56Sopenharmony_ci 6227db96d56Sopenharmony_cistatic char * 6237db96d56Sopenharmony_citranslate_newlines(const char *s, int exec_input, struct tok_state *tok) { 6247db96d56Sopenharmony_ci int skip_next_lf = 0; 6257db96d56Sopenharmony_ci size_t needed_length = strlen(s) + 2, final_length; 6267db96d56Sopenharmony_ci char *buf, *current; 6277db96d56Sopenharmony_ci char c = '\0'; 6287db96d56Sopenharmony_ci buf = PyMem_Malloc(needed_length); 6297db96d56Sopenharmony_ci if (buf == NULL) { 6307db96d56Sopenharmony_ci tok->done = E_NOMEM; 6317db96d56Sopenharmony_ci return NULL; 6327db96d56Sopenharmony_ci } 6337db96d56Sopenharmony_ci for (current = buf; *s; s++, current++) { 6347db96d56Sopenharmony_ci c = *s; 6357db96d56Sopenharmony_ci if (skip_next_lf) { 6367db96d56Sopenharmony_ci skip_next_lf = 0; 6377db96d56Sopenharmony_ci if (c == '\n') { 6387db96d56Sopenharmony_ci c = *++s; 6397db96d56Sopenharmony_ci if (!c) 6407db96d56Sopenharmony_ci break; 6417db96d56Sopenharmony_ci } 6427db96d56Sopenharmony_ci } 6437db96d56Sopenharmony_ci if (c == '\r') { 6447db96d56Sopenharmony_ci skip_next_lf = 1; 6457db96d56Sopenharmony_ci c = '\n'; 6467db96d56Sopenharmony_ci } 6477db96d56Sopenharmony_ci *current = c; 6487db96d56Sopenharmony_ci } 6497db96d56Sopenharmony_ci /* If this is exec input, add a newline to the end of the string if 6507db96d56Sopenharmony_ci there isn't one already. */ 6517db96d56Sopenharmony_ci if (exec_input && c != '\n') { 6527db96d56Sopenharmony_ci *current = '\n'; 6537db96d56Sopenharmony_ci current++; 6547db96d56Sopenharmony_ci } 6557db96d56Sopenharmony_ci *current = '\0'; 6567db96d56Sopenharmony_ci final_length = current - buf + 1; 6577db96d56Sopenharmony_ci if (final_length < needed_length && final_length) { 6587db96d56Sopenharmony_ci /* should never fail */ 6597db96d56Sopenharmony_ci char* result = PyMem_Realloc(buf, final_length); 6607db96d56Sopenharmony_ci if (result == NULL) { 6617db96d56Sopenharmony_ci PyMem_Free(buf); 6627db96d56Sopenharmony_ci } 6637db96d56Sopenharmony_ci buf = result; 6647db96d56Sopenharmony_ci } 6657db96d56Sopenharmony_ci return buf; 6667db96d56Sopenharmony_ci} 6677db96d56Sopenharmony_ci 6687db96d56Sopenharmony_ci/* Decode a byte string STR for use as the buffer of TOK. 6697db96d56Sopenharmony_ci Look for encoding declarations inside STR, and record them 6707db96d56Sopenharmony_ci inside TOK. */ 6717db96d56Sopenharmony_ci 6727db96d56Sopenharmony_cistatic char * 6737db96d56Sopenharmony_cidecode_str(const char *input, int single, struct tok_state *tok) 6747db96d56Sopenharmony_ci{ 6757db96d56Sopenharmony_ci PyObject* utf8 = NULL; 6767db96d56Sopenharmony_ci char *str; 6777db96d56Sopenharmony_ci const char *s; 6787db96d56Sopenharmony_ci const char *newl[2] = {NULL, NULL}; 6797db96d56Sopenharmony_ci int lineno = 0; 6807db96d56Sopenharmony_ci tok->input = str = translate_newlines(input, single, tok); 6817db96d56Sopenharmony_ci if (str == NULL) 6827db96d56Sopenharmony_ci return NULL; 6837db96d56Sopenharmony_ci tok->enc = NULL; 6847db96d56Sopenharmony_ci tok->str = str; 6857db96d56Sopenharmony_ci if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) 6867db96d56Sopenharmony_ci return error_ret(tok); 6877db96d56Sopenharmony_ci str = tok->str; /* string after BOM if any */ 6887db96d56Sopenharmony_ci assert(str); 6897db96d56Sopenharmony_ci if (tok->enc != NULL) { 6907db96d56Sopenharmony_ci utf8 = translate_into_utf8(str, tok->enc); 6917db96d56Sopenharmony_ci if (utf8 == NULL) 6927db96d56Sopenharmony_ci return error_ret(tok); 6937db96d56Sopenharmony_ci str = PyBytes_AsString(utf8); 6947db96d56Sopenharmony_ci } 6957db96d56Sopenharmony_ci for (s = str;; s++) { 6967db96d56Sopenharmony_ci if (*s == '\0') break; 6977db96d56Sopenharmony_ci else if (*s == '\n') { 6987db96d56Sopenharmony_ci assert(lineno < 2); 6997db96d56Sopenharmony_ci newl[lineno] = s; 7007db96d56Sopenharmony_ci lineno++; 7017db96d56Sopenharmony_ci if (lineno == 2) break; 7027db96d56Sopenharmony_ci } 7037db96d56Sopenharmony_ci } 7047db96d56Sopenharmony_ci tok->enc = NULL; 7057db96d56Sopenharmony_ci /* need to check line 1 and 2 separately since check_coding_spec 7067db96d56Sopenharmony_ci assumes a single line as input */ 7077db96d56Sopenharmony_ci if (newl[0]) { 7087db96d56Sopenharmony_ci if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { 7097db96d56Sopenharmony_ci return NULL; 7107db96d56Sopenharmony_ci } 7117db96d56Sopenharmony_ci if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { 7127db96d56Sopenharmony_ci if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], 7137db96d56Sopenharmony_ci tok, buf_setreadl)) 7147db96d56Sopenharmony_ci return NULL; 7157db96d56Sopenharmony_ci } 7167db96d56Sopenharmony_ci } 7177db96d56Sopenharmony_ci if (tok->enc != NULL) { 7187db96d56Sopenharmony_ci assert(utf8 == NULL); 7197db96d56Sopenharmony_ci utf8 = translate_into_utf8(str, tok->enc); 7207db96d56Sopenharmony_ci if (utf8 == NULL) 7217db96d56Sopenharmony_ci return error_ret(tok); 7227db96d56Sopenharmony_ci str = PyBytes_AS_STRING(utf8); 7237db96d56Sopenharmony_ci } 7247db96d56Sopenharmony_ci assert(tok->decoding_buffer == NULL); 7257db96d56Sopenharmony_ci tok->decoding_buffer = utf8; /* CAUTION */ 7267db96d56Sopenharmony_ci return str; 7277db96d56Sopenharmony_ci} 7287db96d56Sopenharmony_ci 7297db96d56Sopenharmony_ci/* Set up tokenizer for string */ 7307db96d56Sopenharmony_ci 7317db96d56Sopenharmony_cistruct tok_state * 7327db96d56Sopenharmony_ci_PyTokenizer_FromString(const char *str, int exec_input) 7337db96d56Sopenharmony_ci{ 7347db96d56Sopenharmony_ci struct tok_state *tok = tok_new(); 7357db96d56Sopenharmony_ci char *decoded; 7367db96d56Sopenharmony_ci 7377db96d56Sopenharmony_ci if (tok == NULL) 7387db96d56Sopenharmony_ci return NULL; 7397db96d56Sopenharmony_ci decoded = decode_str(str, exec_input, tok); 7407db96d56Sopenharmony_ci if (decoded == NULL) { 7417db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 7427db96d56Sopenharmony_ci return NULL; 7437db96d56Sopenharmony_ci } 7447db96d56Sopenharmony_ci 7457db96d56Sopenharmony_ci tok->buf = tok->cur = tok->inp = decoded; 7467db96d56Sopenharmony_ci tok->end = decoded; 7477db96d56Sopenharmony_ci return tok; 7487db96d56Sopenharmony_ci} 7497db96d56Sopenharmony_ci 7507db96d56Sopenharmony_ci/* Set up tokenizer for UTF-8 string */ 7517db96d56Sopenharmony_ci 7527db96d56Sopenharmony_cistruct tok_state * 7537db96d56Sopenharmony_ci_PyTokenizer_FromUTF8(const char *str, int exec_input) 7547db96d56Sopenharmony_ci{ 7557db96d56Sopenharmony_ci struct tok_state *tok = tok_new(); 7567db96d56Sopenharmony_ci char *translated; 7577db96d56Sopenharmony_ci if (tok == NULL) 7587db96d56Sopenharmony_ci return NULL; 7597db96d56Sopenharmony_ci tok->input = translated = translate_newlines(str, exec_input, tok); 7607db96d56Sopenharmony_ci if (translated == NULL) { 7617db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 7627db96d56Sopenharmony_ci return NULL; 7637db96d56Sopenharmony_ci } 7647db96d56Sopenharmony_ci tok->decoding_state = STATE_NORMAL; 7657db96d56Sopenharmony_ci tok->enc = NULL; 7667db96d56Sopenharmony_ci tok->str = translated; 7677db96d56Sopenharmony_ci tok->encoding = new_string("utf-8", 5, tok); 7687db96d56Sopenharmony_ci if (!tok->encoding) { 7697db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 7707db96d56Sopenharmony_ci return NULL; 7717db96d56Sopenharmony_ci } 7727db96d56Sopenharmony_ci 7737db96d56Sopenharmony_ci tok->buf = tok->cur = tok->inp = translated; 7747db96d56Sopenharmony_ci tok->end = translated; 7757db96d56Sopenharmony_ci return tok; 7767db96d56Sopenharmony_ci} 7777db96d56Sopenharmony_ci 7787db96d56Sopenharmony_ci/* Set up tokenizer for file */ 7797db96d56Sopenharmony_ci 7807db96d56Sopenharmony_cistruct tok_state * 7817db96d56Sopenharmony_ci_PyTokenizer_FromFile(FILE *fp, const char* enc, 7827db96d56Sopenharmony_ci const char *ps1, const char *ps2) 7837db96d56Sopenharmony_ci{ 7847db96d56Sopenharmony_ci struct tok_state *tok = tok_new(); 7857db96d56Sopenharmony_ci if (tok == NULL) 7867db96d56Sopenharmony_ci return NULL; 7877db96d56Sopenharmony_ci if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { 7887db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 7897db96d56Sopenharmony_ci return NULL; 7907db96d56Sopenharmony_ci } 7917db96d56Sopenharmony_ci tok->cur = tok->inp = tok->buf; 7927db96d56Sopenharmony_ci tok->end = tok->buf + BUFSIZ; 7937db96d56Sopenharmony_ci tok->fp = fp; 7947db96d56Sopenharmony_ci tok->prompt = ps1; 7957db96d56Sopenharmony_ci tok->nextprompt = ps2; 7967db96d56Sopenharmony_ci if (enc != NULL) { 7977db96d56Sopenharmony_ci /* Must copy encoding declaration since it 7987db96d56Sopenharmony_ci gets copied into the parse tree. */ 7997db96d56Sopenharmony_ci tok->encoding = new_string(enc, strlen(enc), tok); 8007db96d56Sopenharmony_ci if (!tok->encoding) { 8017db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 8027db96d56Sopenharmony_ci return NULL; 8037db96d56Sopenharmony_ci } 8047db96d56Sopenharmony_ci tok->decoding_state = STATE_NORMAL; 8057db96d56Sopenharmony_ci } 8067db96d56Sopenharmony_ci return tok; 8077db96d56Sopenharmony_ci} 8087db96d56Sopenharmony_ci 8097db96d56Sopenharmony_ci/* Free a tok_state structure */ 8107db96d56Sopenharmony_ci 8117db96d56Sopenharmony_civoid 8127db96d56Sopenharmony_ci_PyTokenizer_Free(struct tok_state *tok) 8137db96d56Sopenharmony_ci{ 8147db96d56Sopenharmony_ci if (tok->encoding != NULL) { 8157db96d56Sopenharmony_ci PyMem_Free(tok->encoding); 8167db96d56Sopenharmony_ci } 8177db96d56Sopenharmony_ci Py_XDECREF(tok->decoding_readline); 8187db96d56Sopenharmony_ci Py_XDECREF(tok->decoding_buffer); 8197db96d56Sopenharmony_ci Py_XDECREF(tok->filename); 8207db96d56Sopenharmony_ci if (tok->fp != NULL && tok->buf != NULL) { 8217db96d56Sopenharmony_ci PyMem_Free(tok->buf); 8227db96d56Sopenharmony_ci } 8237db96d56Sopenharmony_ci if (tok->input) { 8247db96d56Sopenharmony_ci PyMem_Free(tok->input); 8257db96d56Sopenharmony_ci } 8267db96d56Sopenharmony_ci if (tok->interactive_src_start != NULL) { 8277db96d56Sopenharmony_ci PyMem_Free(tok->interactive_src_start); 8287db96d56Sopenharmony_ci } 8297db96d56Sopenharmony_ci PyMem_Free(tok); 8307db96d56Sopenharmony_ci} 8317db96d56Sopenharmony_ci 8327db96d56Sopenharmony_cistatic int 8337db96d56Sopenharmony_citok_readline_raw(struct tok_state *tok) 8347db96d56Sopenharmony_ci{ 8357db96d56Sopenharmony_ci do { 8367db96d56Sopenharmony_ci if (!tok_reserve_buf(tok, BUFSIZ)) { 8377db96d56Sopenharmony_ci return 0; 8387db96d56Sopenharmony_ci } 8397db96d56Sopenharmony_ci int n_chars = (int)(tok->end - tok->inp); 8407db96d56Sopenharmony_ci size_t line_size = 0; 8417db96d56Sopenharmony_ci char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size); 8427db96d56Sopenharmony_ci if (line == NULL) { 8437db96d56Sopenharmony_ci return 1; 8447db96d56Sopenharmony_ci } 8457db96d56Sopenharmony_ci if (tok->fp_interactive && 8467db96d56Sopenharmony_ci tok_concatenate_interactive_new_line(tok, line) == -1) { 8477db96d56Sopenharmony_ci return 0; 8487db96d56Sopenharmony_ci } 8497db96d56Sopenharmony_ci tok->inp += line_size; 8507db96d56Sopenharmony_ci if (tok->inp == tok->buf) { 8517db96d56Sopenharmony_ci return 0; 8527db96d56Sopenharmony_ci } 8537db96d56Sopenharmony_ci } while (tok->inp[-1] != '\n'); 8547db96d56Sopenharmony_ci return 1; 8557db96d56Sopenharmony_ci} 8567db96d56Sopenharmony_ci 8577db96d56Sopenharmony_cistatic int 8587db96d56Sopenharmony_citok_underflow_string(struct tok_state *tok) { 8597db96d56Sopenharmony_ci char *end = strchr(tok->inp, '\n'); 8607db96d56Sopenharmony_ci if (end != NULL) { 8617db96d56Sopenharmony_ci end++; 8627db96d56Sopenharmony_ci } 8637db96d56Sopenharmony_ci else { 8647db96d56Sopenharmony_ci end = strchr(tok->inp, '\0'); 8657db96d56Sopenharmony_ci if (end == tok->inp) { 8667db96d56Sopenharmony_ci tok->done = E_EOF; 8677db96d56Sopenharmony_ci return 0; 8687db96d56Sopenharmony_ci } 8697db96d56Sopenharmony_ci } 8707db96d56Sopenharmony_ci if (tok->start == NULL) { 8717db96d56Sopenharmony_ci tok->buf = tok->cur; 8727db96d56Sopenharmony_ci } 8737db96d56Sopenharmony_ci tok->line_start = tok->cur; 8747db96d56Sopenharmony_ci tok->lineno++; 8757db96d56Sopenharmony_ci tok->inp = end; 8767db96d56Sopenharmony_ci return 1; 8777db96d56Sopenharmony_ci} 8787db96d56Sopenharmony_ci 8797db96d56Sopenharmony_cistatic int 8807db96d56Sopenharmony_citok_underflow_interactive(struct tok_state *tok) { 8817db96d56Sopenharmony_ci if (tok->interactive_underflow == IUNDERFLOW_STOP) { 8827db96d56Sopenharmony_ci tok->done = E_INTERACT_STOP; 8837db96d56Sopenharmony_ci return 1; 8847db96d56Sopenharmony_ci } 8857db96d56Sopenharmony_ci char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt); 8867db96d56Sopenharmony_ci if (newtok != NULL) { 8877db96d56Sopenharmony_ci char *translated = translate_newlines(newtok, 0, tok); 8887db96d56Sopenharmony_ci PyMem_Free(newtok); 8897db96d56Sopenharmony_ci if (translated == NULL) { 8907db96d56Sopenharmony_ci return 0; 8917db96d56Sopenharmony_ci } 8927db96d56Sopenharmony_ci newtok = translated; 8937db96d56Sopenharmony_ci } 8947db96d56Sopenharmony_ci if (tok->encoding && newtok && *newtok) { 8957db96d56Sopenharmony_ci /* Recode to UTF-8 */ 8967db96d56Sopenharmony_ci Py_ssize_t buflen; 8977db96d56Sopenharmony_ci const char* buf; 8987db96d56Sopenharmony_ci PyObject *u = translate_into_utf8(newtok, tok->encoding); 8997db96d56Sopenharmony_ci PyMem_Free(newtok); 9007db96d56Sopenharmony_ci if (u == NULL) { 9017db96d56Sopenharmony_ci tok->done = E_DECODE; 9027db96d56Sopenharmony_ci return 0; 9037db96d56Sopenharmony_ci } 9047db96d56Sopenharmony_ci buflen = PyBytes_GET_SIZE(u); 9057db96d56Sopenharmony_ci buf = PyBytes_AS_STRING(u); 9067db96d56Sopenharmony_ci newtok = PyMem_Malloc(buflen+1); 9077db96d56Sopenharmony_ci if (newtok == NULL) { 9087db96d56Sopenharmony_ci Py_DECREF(u); 9097db96d56Sopenharmony_ci tok->done = E_NOMEM; 9107db96d56Sopenharmony_ci return 0; 9117db96d56Sopenharmony_ci } 9127db96d56Sopenharmony_ci strcpy(newtok, buf); 9137db96d56Sopenharmony_ci Py_DECREF(u); 9147db96d56Sopenharmony_ci } 9157db96d56Sopenharmony_ci if (tok->fp_interactive && 9167db96d56Sopenharmony_ci tok_concatenate_interactive_new_line(tok, newtok) == -1) { 9177db96d56Sopenharmony_ci PyMem_Free(newtok); 9187db96d56Sopenharmony_ci return 0; 9197db96d56Sopenharmony_ci } 9207db96d56Sopenharmony_ci if (tok->nextprompt != NULL) { 9217db96d56Sopenharmony_ci tok->prompt = tok->nextprompt; 9227db96d56Sopenharmony_ci } 9237db96d56Sopenharmony_ci if (newtok == NULL) { 9247db96d56Sopenharmony_ci tok->done = E_INTR; 9257db96d56Sopenharmony_ci } 9267db96d56Sopenharmony_ci else if (*newtok == '\0') { 9277db96d56Sopenharmony_ci PyMem_Free(newtok); 9287db96d56Sopenharmony_ci tok->done = E_EOF; 9297db96d56Sopenharmony_ci } 9307db96d56Sopenharmony_ci else if (tok->start != NULL) { 9317db96d56Sopenharmony_ci Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; 9327db96d56Sopenharmony_ci size_t size = strlen(newtok); 9337db96d56Sopenharmony_ci tok->lineno++; 9347db96d56Sopenharmony_ci if (!tok_reserve_buf(tok, size + 1)) { 9357db96d56Sopenharmony_ci PyMem_Free(tok->buf); 9367db96d56Sopenharmony_ci tok->buf = NULL; 9377db96d56Sopenharmony_ci PyMem_Free(newtok); 9387db96d56Sopenharmony_ci return 0; 9397db96d56Sopenharmony_ci } 9407db96d56Sopenharmony_ci memcpy(tok->cur, newtok, size + 1); 9417db96d56Sopenharmony_ci PyMem_Free(newtok); 9427db96d56Sopenharmony_ci tok->inp += size; 9437db96d56Sopenharmony_ci tok->multi_line_start = tok->buf + cur_multi_line_start; 9447db96d56Sopenharmony_ci } 9457db96d56Sopenharmony_ci else { 9467db96d56Sopenharmony_ci tok->lineno++; 9477db96d56Sopenharmony_ci PyMem_Free(tok->buf); 9487db96d56Sopenharmony_ci tok->buf = newtok; 9497db96d56Sopenharmony_ci tok->cur = tok->buf; 9507db96d56Sopenharmony_ci tok->line_start = tok->buf; 9517db96d56Sopenharmony_ci tok->inp = strchr(tok->buf, '\0'); 9527db96d56Sopenharmony_ci tok->end = tok->inp + 1; 9537db96d56Sopenharmony_ci } 9547db96d56Sopenharmony_ci if (tok->done != E_OK) { 9557db96d56Sopenharmony_ci if (tok->prompt != NULL) { 9567db96d56Sopenharmony_ci PySys_WriteStderr("\n"); 9577db96d56Sopenharmony_ci } 9587db96d56Sopenharmony_ci return 0; 9597db96d56Sopenharmony_ci } 9607db96d56Sopenharmony_ci return 1; 9617db96d56Sopenharmony_ci} 9627db96d56Sopenharmony_ci 9637db96d56Sopenharmony_cistatic int 9647db96d56Sopenharmony_citok_underflow_file(struct tok_state *tok) { 9657db96d56Sopenharmony_ci if (tok->start == NULL) { 9667db96d56Sopenharmony_ci tok->cur = tok->inp = tok->buf; 9677db96d56Sopenharmony_ci } 9687db96d56Sopenharmony_ci if (tok->decoding_state == STATE_INIT) { 9697db96d56Sopenharmony_ci /* We have not yet determined the encoding. 9707db96d56Sopenharmony_ci If an encoding is found, use the file-pointer 9717db96d56Sopenharmony_ci reader functions from now on. */ 9727db96d56Sopenharmony_ci if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) { 9737db96d56Sopenharmony_ci error_ret(tok); 9747db96d56Sopenharmony_ci return 0; 9757db96d56Sopenharmony_ci } 9767db96d56Sopenharmony_ci assert(tok->decoding_state != STATE_INIT); 9777db96d56Sopenharmony_ci } 9787db96d56Sopenharmony_ci /* Read until '\n' or EOF */ 9797db96d56Sopenharmony_ci if (tok->decoding_readline != NULL) { 9807db96d56Sopenharmony_ci /* We already have a codec associated with this input. */ 9817db96d56Sopenharmony_ci if (!tok_readline_recode(tok)) { 9827db96d56Sopenharmony_ci return 0; 9837db96d56Sopenharmony_ci } 9847db96d56Sopenharmony_ci } 9857db96d56Sopenharmony_ci else { 9867db96d56Sopenharmony_ci /* We want a 'raw' read. */ 9877db96d56Sopenharmony_ci if (!tok_readline_raw(tok)) { 9887db96d56Sopenharmony_ci return 0; 9897db96d56Sopenharmony_ci } 9907db96d56Sopenharmony_ci } 9917db96d56Sopenharmony_ci if (tok->inp == tok->cur) { 9927db96d56Sopenharmony_ci tok->done = E_EOF; 9937db96d56Sopenharmony_ci return 0; 9947db96d56Sopenharmony_ci } 9957db96d56Sopenharmony_ci if (tok->inp[-1] != '\n') { 9967db96d56Sopenharmony_ci assert(tok->inp + 1 < tok->end); 9977db96d56Sopenharmony_ci /* Last line does not end in \n, fake one */ 9987db96d56Sopenharmony_ci *tok->inp++ = '\n'; 9997db96d56Sopenharmony_ci *tok->inp = '\0'; 10007db96d56Sopenharmony_ci } 10017db96d56Sopenharmony_ci 10027db96d56Sopenharmony_ci tok->lineno++; 10037db96d56Sopenharmony_ci if (tok->decoding_state != STATE_NORMAL) { 10047db96d56Sopenharmony_ci if (tok->lineno > 2) { 10057db96d56Sopenharmony_ci tok->decoding_state = STATE_NORMAL; 10067db96d56Sopenharmony_ci } 10077db96d56Sopenharmony_ci else if (!check_coding_spec(tok->cur, strlen(tok->cur), 10087db96d56Sopenharmony_ci tok, fp_setreadl)) 10097db96d56Sopenharmony_ci { 10107db96d56Sopenharmony_ci return 0; 10117db96d56Sopenharmony_ci } 10127db96d56Sopenharmony_ci } 10137db96d56Sopenharmony_ci /* The default encoding is UTF-8, so make sure we don't have any 10147db96d56Sopenharmony_ci non-UTF-8 sequences in it. */ 10157db96d56Sopenharmony_ci if (!tok->encoding && !ensure_utf8(tok->cur, tok)) { 10167db96d56Sopenharmony_ci error_ret(tok); 10177db96d56Sopenharmony_ci return 0; 10187db96d56Sopenharmony_ci } 10197db96d56Sopenharmony_ci assert(tok->done == E_OK); 10207db96d56Sopenharmony_ci return tok->done == E_OK; 10217db96d56Sopenharmony_ci} 10227db96d56Sopenharmony_ci 10237db96d56Sopenharmony_ci#if defined(Py_DEBUG) 10247db96d56Sopenharmony_cistatic void 10257db96d56Sopenharmony_ciprint_escape(FILE *f, const char *s, Py_ssize_t size) 10267db96d56Sopenharmony_ci{ 10277db96d56Sopenharmony_ci if (s == NULL) { 10287db96d56Sopenharmony_ci fputs("NULL", f); 10297db96d56Sopenharmony_ci return; 10307db96d56Sopenharmony_ci } 10317db96d56Sopenharmony_ci putc('"', f); 10327db96d56Sopenharmony_ci while (size-- > 0) { 10337db96d56Sopenharmony_ci unsigned char c = *s++; 10347db96d56Sopenharmony_ci switch (c) { 10357db96d56Sopenharmony_ci case '\n': fputs("\\n", f); break; 10367db96d56Sopenharmony_ci case '\r': fputs("\\r", f); break; 10377db96d56Sopenharmony_ci case '\t': fputs("\\t", f); break; 10387db96d56Sopenharmony_ci case '\f': fputs("\\f", f); break; 10397db96d56Sopenharmony_ci case '\'': fputs("\\'", f); break; 10407db96d56Sopenharmony_ci case '"': fputs("\\\"", f); break; 10417db96d56Sopenharmony_ci default: 10427db96d56Sopenharmony_ci if (0x20 <= c && c <= 0x7f) 10437db96d56Sopenharmony_ci putc(c, f); 10447db96d56Sopenharmony_ci else 10457db96d56Sopenharmony_ci fprintf(f, "\\x%02x", c); 10467db96d56Sopenharmony_ci } 10477db96d56Sopenharmony_ci } 10487db96d56Sopenharmony_ci putc('"', f); 10497db96d56Sopenharmony_ci} 10507db96d56Sopenharmony_ci#endif 10517db96d56Sopenharmony_ci 10527db96d56Sopenharmony_ci/* Get next char, updating state; error code goes into tok->done */ 10537db96d56Sopenharmony_ci 10547db96d56Sopenharmony_cistatic int 10557db96d56Sopenharmony_citok_nextc(struct tok_state *tok) 10567db96d56Sopenharmony_ci{ 10577db96d56Sopenharmony_ci int rc; 10587db96d56Sopenharmony_ci for (;;) { 10597db96d56Sopenharmony_ci if (tok->cur != tok->inp) { 10607db96d56Sopenharmony_ci return Py_CHARMASK(*tok->cur++); /* Fast path */ 10617db96d56Sopenharmony_ci } 10627db96d56Sopenharmony_ci if (tok->done != E_OK) { 10637db96d56Sopenharmony_ci return EOF; 10647db96d56Sopenharmony_ci } 10657db96d56Sopenharmony_ci if (tok->fp == NULL) { 10667db96d56Sopenharmony_ci rc = tok_underflow_string(tok); 10677db96d56Sopenharmony_ci } 10687db96d56Sopenharmony_ci else if (tok->prompt != NULL) { 10697db96d56Sopenharmony_ci rc = tok_underflow_interactive(tok); 10707db96d56Sopenharmony_ci } 10717db96d56Sopenharmony_ci else { 10727db96d56Sopenharmony_ci rc = tok_underflow_file(tok); 10737db96d56Sopenharmony_ci } 10747db96d56Sopenharmony_ci#if defined(Py_DEBUG) 10757db96d56Sopenharmony_ci if (Py_DebugFlag) { 10767db96d56Sopenharmony_ci fprintf(stderr, "line[%d] = ", tok->lineno); 10777db96d56Sopenharmony_ci print_escape(stderr, tok->cur, tok->inp - tok->cur); 10787db96d56Sopenharmony_ci fprintf(stderr, " tok->done = %d\n", tok->done); 10797db96d56Sopenharmony_ci } 10807db96d56Sopenharmony_ci#endif 10817db96d56Sopenharmony_ci if (!rc) { 10827db96d56Sopenharmony_ci tok->cur = tok->inp; 10837db96d56Sopenharmony_ci return EOF; 10847db96d56Sopenharmony_ci } 10857db96d56Sopenharmony_ci tok->line_start = tok->cur; 10867db96d56Sopenharmony_ci 10877db96d56Sopenharmony_ci if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { 10887db96d56Sopenharmony_ci syntaxerror(tok, "source code cannot contain null bytes"); 10897db96d56Sopenharmony_ci tok->cur = tok->inp; 10907db96d56Sopenharmony_ci return EOF; 10917db96d56Sopenharmony_ci } 10927db96d56Sopenharmony_ci } 10937db96d56Sopenharmony_ci Py_UNREACHABLE(); 10947db96d56Sopenharmony_ci} 10957db96d56Sopenharmony_ci 10967db96d56Sopenharmony_ci/* Back-up one character */ 10977db96d56Sopenharmony_ci 10987db96d56Sopenharmony_cistatic void 10997db96d56Sopenharmony_citok_backup(struct tok_state *tok, int c) 11007db96d56Sopenharmony_ci{ 11017db96d56Sopenharmony_ci if (c != EOF) { 11027db96d56Sopenharmony_ci if (--tok->cur < tok->buf) { 11037db96d56Sopenharmony_ci Py_FatalError("tokenizer beginning of buffer"); 11047db96d56Sopenharmony_ci } 11057db96d56Sopenharmony_ci if ((int)(unsigned char)*tok->cur != c) { 11067db96d56Sopenharmony_ci Py_FatalError("tok_backup: wrong character"); 11077db96d56Sopenharmony_ci } 11087db96d56Sopenharmony_ci } 11097db96d56Sopenharmony_ci} 11107db96d56Sopenharmony_ci 11117db96d56Sopenharmony_cistatic int 11127db96d56Sopenharmony_ci_syntaxerror_range(struct tok_state *tok, const char *format, 11137db96d56Sopenharmony_ci int col_offset, int end_col_offset, 11147db96d56Sopenharmony_ci va_list vargs) 11157db96d56Sopenharmony_ci{ 11167db96d56Sopenharmony_ci PyObject *errmsg, *errtext, *args; 11177db96d56Sopenharmony_ci errmsg = PyUnicode_FromFormatV(format, vargs); 11187db96d56Sopenharmony_ci if (!errmsg) { 11197db96d56Sopenharmony_ci goto error; 11207db96d56Sopenharmony_ci } 11217db96d56Sopenharmony_ci 11227db96d56Sopenharmony_ci errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, 11237db96d56Sopenharmony_ci "replace"); 11247db96d56Sopenharmony_ci if (!errtext) { 11257db96d56Sopenharmony_ci goto error; 11267db96d56Sopenharmony_ci } 11277db96d56Sopenharmony_ci 11287db96d56Sopenharmony_ci if (col_offset == -1) { 11297db96d56Sopenharmony_ci col_offset = (int)PyUnicode_GET_LENGTH(errtext); 11307db96d56Sopenharmony_ci } 11317db96d56Sopenharmony_ci if (end_col_offset == -1) { 11327db96d56Sopenharmony_ci end_col_offset = col_offset; 11337db96d56Sopenharmony_ci } 11347db96d56Sopenharmony_ci 11357db96d56Sopenharmony_ci Py_ssize_t line_len = strcspn(tok->line_start, "\n"); 11367db96d56Sopenharmony_ci if (line_len != tok->cur - tok->line_start) { 11377db96d56Sopenharmony_ci Py_DECREF(errtext); 11387db96d56Sopenharmony_ci errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, 11397db96d56Sopenharmony_ci "replace"); 11407db96d56Sopenharmony_ci } 11417db96d56Sopenharmony_ci if (!errtext) { 11427db96d56Sopenharmony_ci goto error; 11437db96d56Sopenharmony_ci } 11447db96d56Sopenharmony_ci 11457db96d56Sopenharmony_ci args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, 11467db96d56Sopenharmony_ci col_offset, errtext, tok->lineno, end_col_offset); 11477db96d56Sopenharmony_ci if (args) { 11487db96d56Sopenharmony_ci PyErr_SetObject(PyExc_SyntaxError, args); 11497db96d56Sopenharmony_ci Py_DECREF(args); 11507db96d56Sopenharmony_ci } 11517db96d56Sopenharmony_ci 11527db96d56Sopenharmony_cierror: 11537db96d56Sopenharmony_ci Py_XDECREF(errmsg); 11547db96d56Sopenharmony_ci tok->done = E_ERROR; 11557db96d56Sopenharmony_ci return ERRORTOKEN; 11567db96d56Sopenharmony_ci} 11577db96d56Sopenharmony_ci 11587db96d56Sopenharmony_cistatic int 11597db96d56Sopenharmony_cisyntaxerror(struct tok_state *tok, const char *format, ...) 11607db96d56Sopenharmony_ci{ 11617db96d56Sopenharmony_ci va_list vargs; 11627db96d56Sopenharmony_ci#ifdef HAVE_STDARG_PROTOTYPES 11637db96d56Sopenharmony_ci va_start(vargs, format); 11647db96d56Sopenharmony_ci#else 11657db96d56Sopenharmony_ci va_start(vargs); 11667db96d56Sopenharmony_ci#endif 11677db96d56Sopenharmony_ci int ret = _syntaxerror_range(tok, format, -1, -1, vargs); 11687db96d56Sopenharmony_ci va_end(vargs); 11697db96d56Sopenharmony_ci return ret; 11707db96d56Sopenharmony_ci} 11717db96d56Sopenharmony_ci 11727db96d56Sopenharmony_cistatic int 11737db96d56Sopenharmony_cisyntaxerror_known_range(struct tok_state *tok, 11747db96d56Sopenharmony_ci int col_offset, int end_col_offset, 11757db96d56Sopenharmony_ci const char *format, ...) 11767db96d56Sopenharmony_ci{ 11777db96d56Sopenharmony_ci va_list vargs; 11787db96d56Sopenharmony_ci#ifdef HAVE_STDARG_PROTOTYPES 11797db96d56Sopenharmony_ci va_start(vargs, format); 11807db96d56Sopenharmony_ci#else 11817db96d56Sopenharmony_ci va_start(vargs); 11827db96d56Sopenharmony_ci#endif 11837db96d56Sopenharmony_ci int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs); 11847db96d56Sopenharmony_ci va_end(vargs); 11857db96d56Sopenharmony_ci return ret; 11867db96d56Sopenharmony_ci} 11877db96d56Sopenharmony_ci 11887db96d56Sopenharmony_ci 11897db96d56Sopenharmony_ci 11907db96d56Sopenharmony_cistatic int 11917db96d56Sopenharmony_ciindenterror(struct tok_state *tok) 11927db96d56Sopenharmony_ci{ 11937db96d56Sopenharmony_ci tok->done = E_TABSPACE; 11947db96d56Sopenharmony_ci tok->cur = tok->inp; 11957db96d56Sopenharmony_ci return ERRORTOKEN; 11967db96d56Sopenharmony_ci} 11977db96d56Sopenharmony_ci 11987db96d56Sopenharmony_cistatic int 11997db96d56Sopenharmony_ciparser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) 12007db96d56Sopenharmony_ci{ 12017db96d56Sopenharmony_ci if (!tok->report_warnings) { 12027db96d56Sopenharmony_ci return 0; 12037db96d56Sopenharmony_ci } 12047db96d56Sopenharmony_ci 12057db96d56Sopenharmony_ci PyObject *errmsg; 12067db96d56Sopenharmony_ci va_list vargs; 12077db96d56Sopenharmony_ci#ifdef HAVE_STDARG_PROTOTYPES 12087db96d56Sopenharmony_ci va_start(vargs, format); 12097db96d56Sopenharmony_ci#else 12107db96d56Sopenharmony_ci va_start(vargs); 12117db96d56Sopenharmony_ci#endif 12127db96d56Sopenharmony_ci errmsg = PyUnicode_FromFormatV(format, vargs); 12137db96d56Sopenharmony_ci va_end(vargs); 12147db96d56Sopenharmony_ci if (!errmsg) { 12157db96d56Sopenharmony_ci goto error; 12167db96d56Sopenharmony_ci } 12177db96d56Sopenharmony_ci 12187db96d56Sopenharmony_ci if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, 12197db96d56Sopenharmony_ci tok->lineno, NULL, NULL) < 0) { 12207db96d56Sopenharmony_ci if (PyErr_ExceptionMatches(category)) { 12217db96d56Sopenharmony_ci /* Replace the DeprecationWarning exception with a SyntaxError 12227db96d56Sopenharmony_ci to get a more accurate error report */ 12237db96d56Sopenharmony_ci PyErr_Clear(); 12247db96d56Sopenharmony_ci syntaxerror(tok, "%U", errmsg); 12257db96d56Sopenharmony_ci } 12267db96d56Sopenharmony_ci goto error; 12277db96d56Sopenharmony_ci } 12287db96d56Sopenharmony_ci Py_DECREF(errmsg); 12297db96d56Sopenharmony_ci return 0; 12307db96d56Sopenharmony_ci 12317db96d56Sopenharmony_cierror: 12327db96d56Sopenharmony_ci Py_XDECREF(errmsg); 12337db96d56Sopenharmony_ci tok->done = E_ERROR; 12347db96d56Sopenharmony_ci return -1; 12357db96d56Sopenharmony_ci} 12367db96d56Sopenharmony_ci 12377db96d56Sopenharmony_cistatic int 12387db96d56Sopenharmony_cilookahead(struct tok_state *tok, const char *test) 12397db96d56Sopenharmony_ci{ 12407db96d56Sopenharmony_ci const char *s = test; 12417db96d56Sopenharmony_ci int res = 0; 12427db96d56Sopenharmony_ci while (1) { 12437db96d56Sopenharmony_ci int c = tok_nextc(tok); 12447db96d56Sopenharmony_ci if (*s == 0) { 12457db96d56Sopenharmony_ci res = !is_potential_identifier_char(c); 12467db96d56Sopenharmony_ci } 12477db96d56Sopenharmony_ci else if (c == *s) { 12487db96d56Sopenharmony_ci s++; 12497db96d56Sopenharmony_ci continue; 12507db96d56Sopenharmony_ci } 12517db96d56Sopenharmony_ci 12527db96d56Sopenharmony_ci tok_backup(tok, c); 12537db96d56Sopenharmony_ci while (s != test) { 12547db96d56Sopenharmony_ci tok_backup(tok, *--s); 12557db96d56Sopenharmony_ci } 12567db96d56Sopenharmony_ci return res; 12577db96d56Sopenharmony_ci } 12587db96d56Sopenharmony_ci} 12597db96d56Sopenharmony_ci 12607db96d56Sopenharmony_cistatic int 12617db96d56Sopenharmony_civerify_end_of_number(struct tok_state *tok, int c, const char *kind) 12627db96d56Sopenharmony_ci{ 12637db96d56Sopenharmony_ci /* Emit a deprecation warning only if the numeric literal is immediately 12647db96d56Sopenharmony_ci * followed by one of keywords which can occur after a numeric literal 12657db96d56Sopenharmony_ci * in valid code: "and", "else", "for", "if", "in", "is" and "or". 12667db96d56Sopenharmony_ci * It allows to gradually deprecate existing valid code without adding 12677db96d56Sopenharmony_ci * warning before error in most cases of invalid numeric literal (which 12687db96d56Sopenharmony_ci * would be confusing and break existing tests). 12697db96d56Sopenharmony_ci * Raise a syntax error with slightly better message than plain 12707db96d56Sopenharmony_ci * "invalid syntax" if the numeric literal is immediately followed by 12717db96d56Sopenharmony_ci * other keyword or identifier. 12727db96d56Sopenharmony_ci */ 12737db96d56Sopenharmony_ci int r = 0; 12747db96d56Sopenharmony_ci if (c == 'a') { 12757db96d56Sopenharmony_ci r = lookahead(tok, "nd"); 12767db96d56Sopenharmony_ci } 12777db96d56Sopenharmony_ci else if (c == 'e') { 12787db96d56Sopenharmony_ci r = lookahead(tok, "lse"); 12797db96d56Sopenharmony_ci } 12807db96d56Sopenharmony_ci else if (c == 'f') { 12817db96d56Sopenharmony_ci r = lookahead(tok, "or"); 12827db96d56Sopenharmony_ci } 12837db96d56Sopenharmony_ci else if (c == 'i') { 12847db96d56Sopenharmony_ci int c2 = tok_nextc(tok); 12857db96d56Sopenharmony_ci if (c2 == 'f' || c2 == 'n' || c2 == 's') { 12867db96d56Sopenharmony_ci r = 1; 12877db96d56Sopenharmony_ci } 12887db96d56Sopenharmony_ci tok_backup(tok, c2); 12897db96d56Sopenharmony_ci } 12907db96d56Sopenharmony_ci else if (c == 'o') { 12917db96d56Sopenharmony_ci r = lookahead(tok, "r"); 12927db96d56Sopenharmony_ci } 12937db96d56Sopenharmony_ci else if (c == 'n') { 12947db96d56Sopenharmony_ci r = lookahead(tok, "ot"); 12957db96d56Sopenharmony_ci } 12967db96d56Sopenharmony_ci if (r) { 12977db96d56Sopenharmony_ci tok_backup(tok, c); 12987db96d56Sopenharmony_ci if (parser_warn(tok, PyExc_SyntaxWarning, 12997db96d56Sopenharmony_ci "invalid %s literal", kind)) 13007db96d56Sopenharmony_ci { 13017db96d56Sopenharmony_ci return 0; 13027db96d56Sopenharmony_ci } 13037db96d56Sopenharmony_ci tok_nextc(tok); 13047db96d56Sopenharmony_ci } 13057db96d56Sopenharmony_ci else /* In future releases, only error will remain. */ 13067db96d56Sopenharmony_ci if (is_potential_identifier_char(c)) { 13077db96d56Sopenharmony_ci tok_backup(tok, c); 13087db96d56Sopenharmony_ci syntaxerror(tok, "invalid %s literal", kind); 13097db96d56Sopenharmony_ci return 0; 13107db96d56Sopenharmony_ci } 13117db96d56Sopenharmony_ci return 1; 13127db96d56Sopenharmony_ci} 13137db96d56Sopenharmony_ci 13147db96d56Sopenharmony_ci/* Verify that the identifier follows PEP 3131. 13157db96d56Sopenharmony_ci All identifier strings are guaranteed to be "ready" unicode objects. 13167db96d56Sopenharmony_ci */ 13177db96d56Sopenharmony_cistatic int 13187db96d56Sopenharmony_civerify_identifier(struct tok_state *tok) 13197db96d56Sopenharmony_ci{ 13207db96d56Sopenharmony_ci PyObject *s; 13217db96d56Sopenharmony_ci if (tok->decoding_erred) 13227db96d56Sopenharmony_ci return 0; 13237db96d56Sopenharmony_ci s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); 13247db96d56Sopenharmony_ci if (s == NULL) { 13257db96d56Sopenharmony_ci if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { 13267db96d56Sopenharmony_ci tok->done = E_DECODE; 13277db96d56Sopenharmony_ci } 13287db96d56Sopenharmony_ci else { 13297db96d56Sopenharmony_ci tok->done = E_ERROR; 13307db96d56Sopenharmony_ci } 13317db96d56Sopenharmony_ci return 0; 13327db96d56Sopenharmony_ci } 13337db96d56Sopenharmony_ci Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); 13347db96d56Sopenharmony_ci if (invalid < 0) { 13357db96d56Sopenharmony_ci Py_DECREF(s); 13367db96d56Sopenharmony_ci tok->done = E_ERROR; 13377db96d56Sopenharmony_ci return 0; 13387db96d56Sopenharmony_ci } 13397db96d56Sopenharmony_ci assert(PyUnicode_GET_LENGTH(s) > 0); 13407db96d56Sopenharmony_ci if (invalid < PyUnicode_GET_LENGTH(s)) { 13417db96d56Sopenharmony_ci Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); 13427db96d56Sopenharmony_ci if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { 13437db96d56Sopenharmony_ci /* Determine the offset in UTF-8 encoded input */ 13447db96d56Sopenharmony_ci Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); 13457db96d56Sopenharmony_ci if (s != NULL) { 13467db96d56Sopenharmony_ci Py_SETREF(s, PyUnicode_AsUTF8String(s)); 13477db96d56Sopenharmony_ci } 13487db96d56Sopenharmony_ci if (s == NULL) { 13497db96d56Sopenharmony_ci tok->done = E_ERROR; 13507db96d56Sopenharmony_ci return 0; 13517db96d56Sopenharmony_ci } 13527db96d56Sopenharmony_ci tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); 13537db96d56Sopenharmony_ci } 13547db96d56Sopenharmony_ci Py_DECREF(s); 13557db96d56Sopenharmony_ci // PyUnicode_FromFormatV() does not support %X 13567db96d56Sopenharmony_ci char hex[9]; 13577db96d56Sopenharmony_ci (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch); 13587db96d56Sopenharmony_ci if (Py_UNICODE_ISPRINTABLE(ch)) { 13597db96d56Sopenharmony_ci syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); 13607db96d56Sopenharmony_ci } 13617db96d56Sopenharmony_ci else { 13627db96d56Sopenharmony_ci syntaxerror(tok, "invalid non-printable character U+%s", hex); 13637db96d56Sopenharmony_ci } 13647db96d56Sopenharmony_ci return 0; 13657db96d56Sopenharmony_ci } 13667db96d56Sopenharmony_ci Py_DECREF(s); 13677db96d56Sopenharmony_ci return 1; 13687db96d56Sopenharmony_ci} 13697db96d56Sopenharmony_ci 13707db96d56Sopenharmony_cistatic int 13717db96d56Sopenharmony_citok_decimal_tail(struct tok_state *tok) 13727db96d56Sopenharmony_ci{ 13737db96d56Sopenharmony_ci int c; 13747db96d56Sopenharmony_ci 13757db96d56Sopenharmony_ci while (1) { 13767db96d56Sopenharmony_ci do { 13777db96d56Sopenharmony_ci c = tok_nextc(tok); 13787db96d56Sopenharmony_ci } while (isdigit(c)); 13797db96d56Sopenharmony_ci if (c != '_') { 13807db96d56Sopenharmony_ci break; 13817db96d56Sopenharmony_ci } 13827db96d56Sopenharmony_ci c = tok_nextc(tok); 13837db96d56Sopenharmony_ci if (!isdigit(c)) { 13847db96d56Sopenharmony_ci tok_backup(tok, c); 13857db96d56Sopenharmony_ci syntaxerror(tok, "invalid decimal literal"); 13867db96d56Sopenharmony_ci return 0; 13877db96d56Sopenharmony_ci } 13887db96d56Sopenharmony_ci } 13897db96d56Sopenharmony_ci return c; 13907db96d56Sopenharmony_ci} 13917db96d56Sopenharmony_ci 13927db96d56Sopenharmony_ci/* Get next token, after space stripping etc. */ 13937db96d56Sopenharmony_ci 13947db96d56Sopenharmony_cistatic inline int 13957db96d56Sopenharmony_citok_continuation_line(struct tok_state *tok) { 13967db96d56Sopenharmony_ci int c = tok_nextc(tok); 13977db96d56Sopenharmony_ci if (c != '\n') { 13987db96d56Sopenharmony_ci tok->done = E_LINECONT; 13997db96d56Sopenharmony_ci return -1; 14007db96d56Sopenharmony_ci } 14017db96d56Sopenharmony_ci c = tok_nextc(tok); 14027db96d56Sopenharmony_ci if (c == EOF) { 14037db96d56Sopenharmony_ci tok->done = E_EOF; 14047db96d56Sopenharmony_ci tok->cur = tok->inp; 14057db96d56Sopenharmony_ci return -1; 14067db96d56Sopenharmony_ci } else { 14077db96d56Sopenharmony_ci tok_backup(tok, c); 14087db96d56Sopenharmony_ci } 14097db96d56Sopenharmony_ci return c; 14107db96d56Sopenharmony_ci} 14117db96d56Sopenharmony_ci 14127db96d56Sopenharmony_cistatic int 14137db96d56Sopenharmony_citok_get(struct tok_state *tok, const char **p_start, const char **p_end) 14147db96d56Sopenharmony_ci{ 14157db96d56Sopenharmony_ci int c; 14167db96d56Sopenharmony_ci int blankline, nonascii; 14177db96d56Sopenharmony_ci 14187db96d56Sopenharmony_ci *p_start = *p_end = NULL; 14197db96d56Sopenharmony_ci nextline: 14207db96d56Sopenharmony_ci tok->start = NULL; 14217db96d56Sopenharmony_ci blankline = 0; 14227db96d56Sopenharmony_ci 14237db96d56Sopenharmony_ci /* Get indentation level */ 14247db96d56Sopenharmony_ci if (tok->atbol) { 14257db96d56Sopenharmony_ci int col = 0; 14267db96d56Sopenharmony_ci int altcol = 0; 14277db96d56Sopenharmony_ci tok->atbol = 0; 14287db96d56Sopenharmony_ci int cont_line_col = 0; 14297db96d56Sopenharmony_ci for (;;) { 14307db96d56Sopenharmony_ci c = tok_nextc(tok); 14317db96d56Sopenharmony_ci if (c == ' ') { 14327db96d56Sopenharmony_ci col++, altcol++; 14337db96d56Sopenharmony_ci } 14347db96d56Sopenharmony_ci else if (c == '\t') { 14357db96d56Sopenharmony_ci col = (col / tok->tabsize + 1) * tok->tabsize; 14367db96d56Sopenharmony_ci altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; 14377db96d56Sopenharmony_ci } 14387db96d56Sopenharmony_ci else if (c == '\014') {/* Control-L (formfeed) */ 14397db96d56Sopenharmony_ci col = altcol = 0; /* For Emacs users */ 14407db96d56Sopenharmony_ci } 14417db96d56Sopenharmony_ci else if (c == '\\') { 14427db96d56Sopenharmony_ci // Indentation cannot be split over multiple physical lines 14437db96d56Sopenharmony_ci // using backslashes. This means that if we found a backslash 14447db96d56Sopenharmony_ci // preceded by whitespace, **the first one we find** determines 14457db96d56Sopenharmony_ci // the level of indentation of whatever comes next. 14467db96d56Sopenharmony_ci cont_line_col = cont_line_col ? cont_line_col : col; 14477db96d56Sopenharmony_ci if ((c = tok_continuation_line(tok)) == -1) { 14487db96d56Sopenharmony_ci return ERRORTOKEN; 14497db96d56Sopenharmony_ci } 14507db96d56Sopenharmony_ci } 14517db96d56Sopenharmony_ci else { 14527db96d56Sopenharmony_ci break; 14537db96d56Sopenharmony_ci } 14547db96d56Sopenharmony_ci } 14557db96d56Sopenharmony_ci tok_backup(tok, c); 14567db96d56Sopenharmony_ci if (c == '#' || c == '\n') { 14577db96d56Sopenharmony_ci /* Lines with only whitespace and/or comments 14587db96d56Sopenharmony_ci shouldn't affect the indentation and are 14597db96d56Sopenharmony_ci not passed to the parser as NEWLINE tokens, 14607db96d56Sopenharmony_ci except *totally* empty lines in interactive 14617db96d56Sopenharmony_ci mode, which signal the end of a command group. */ 14627db96d56Sopenharmony_ci if (col == 0 && c == '\n' && tok->prompt != NULL) { 14637db96d56Sopenharmony_ci blankline = 0; /* Let it through */ 14647db96d56Sopenharmony_ci } 14657db96d56Sopenharmony_ci else if (tok->prompt != NULL && tok->lineno == 1) { 14667db96d56Sopenharmony_ci /* In interactive mode, if the first line contains 14677db96d56Sopenharmony_ci only spaces and/or a comment, let it through. */ 14687db96d56Sopenharmony_ci blankline = 0; 14697db96d56Sopenharmony_ci col = altcol = 0; 14707db96d56Sopenharmony_ci } 14717db96d56Sopenharmony_ci else { 14727db96d56Sopenharmony_ci blankline = 1; /* Ignore completely */ 14737db96d56Sopenharmony_ci } 14747db96d56Sopenharmony_ci /* We can't jump back right here since we still 14757db96d56Sopenharmony_ci may need to skip to the end of a comment */ 14767db96d56Sopenharmony_ci } 14777db96d56Sopenharmony_ci if (!blankline && tok->level == 0) { 14787db96d56Sopenharmony_ci col = cont_line_col ? cont_line_col : col; 14797db96d56Sopenharmony_ci altcol = cont_line_col ? cont_line_col : altcol; 14807db96d56Sopenharmony_ci if (col == tok->indstack[tok->indent]) { 14817db96d56Sopenharmony_ci /* No change */ 14827db96d56Sopenharmony_ci if (altcol != tok->altindstack[tok->indent]) { 14837db96d56Sopenharmony_ci return indenterror(tok); 14847db96d56Sopenharmony_ci } 14857db96d56Sopenharmony_ci } 14867db96d56Sopenharmony_ci else if (col > tok->indstack[tok->indent]) { 14877db96d56Sopenharmony_ci /* Indent -- always one */ 14887db96d56Sopenharmony_ci if (tok->indent+1 >= MAXINDENT) { 14897db96d56Sopenharmony_ci tok->done = E_TOODEEP; 14907db96d56Sopenharmony_ci tok->cur = tok->inp; 14917db96d56Sopenharmony_ci return ERRORTOKEN; 14927db96d56Sopenharmony_ci } 14937db96d56Sopenharmony_ci if (altcol <= tok->altindstack[tok->indent]) { 14947db96d56Sopenharmony_ci return indenterror(tok); 14957db96d56Sopenharmony_ci } 14967db96d56Sopenharmony_ci tok->pendin++; 14977db96d56Sopenharmony_ci tok->indstack[++tok->indent] = col; 14987db96d56Sopenharmony_ci tok->altindstack[tok->indent] = altcol; 14997db96d56Sopenharmony_ci } 15007db96d56Sopenharmony_ci else /* col < tok->indstack[tok->indent] */ { 15017db96d56Sopenharmony_ci /* Dedent -- any number, must be consistent */ 15027db96d56Sopenharmony_ci while (tok->indent > 0 && 15037db96d56Sopenharmony_ci col < tok->indstack[tok->indent]) { 15047db96d56Sopenharmony_ci tok->pendin--; 15057db96d56Sopenharmony_ci tok->indent--; 15067db96d56Sopenharmony_ci } 15077db96d56Sopenharmony_ci if (col != tok->indstack[tok->indent]) { 15087db96d56Sopenharmony_ci tok->done = E_DEDENT; 15097db96d56Sopenharmony_ci tok->cur = tok->inp; 15107db96d56Sopenharmony_ci return ERRORTOKEN; 15117db96d56Sopenharmony_ci } 15127db96d56Sopenharmony_ci if (altcol != tok->altindstack[tok->indent]) { 15137db96d56Sopenharmony_ci return indenterror(tok); 15147db96d56Sopenharmony_ci } 15157db96d56Sopenharmony_ci } 15167db96d56Sopenharmony_ci } 15177db96d56Sopenharmony_ci } 15187db96d56Sopenharmony_ci 15197db96d56Sopenharmony_ci tok->start = tok->cur; 15207db96d56Sopenharmony_ci 15217db96d56Sopenharmony_ci /* Return pending indents/dedents */ 15227db96d56Sopenharmony_ci if (tok->pendin != 0) { 15237db96d56Sopenharmony_ci if (tok->pendin < 0) { 15247db96d56Sopenharmony_ci tok->pendin++; 15257db96d56Sopenharmony_ci return DEDENT; 15267db96d56Sopenharmony_ci } 15277db96d56Sopenharmony_ci else { 15287db96d56Sopenharmony_ci tok->pendin--; 15297db96d56Sopenharmony_ci return INDENT; 15307db96d56Sopenharmony_ci } 15317db96d56Sopenharmony_ci } 15327db96d56Sopenharmony_ci 15337db96d56Sopenharmony_ci /* Peek ahead at the next character */ 15347db96d56Sopenharmony_ci c = tok_nextc(tok); 15357db96d56Sopenharmony_ci tok_backup(tok, c); 15367db96d56Sopenharmony_ci /* Check if we are closing an async function */ 15377db96d56Sopenharmony_ci if (tok->async_def 15387db96d56Sopenharmony_ci && !blankline 15397db96d56Sopenharmony_ci /* Due to some implementation artifacts of type comments, 15407db96d56Sopenharmony_ci * a TYPE_COMMENT at the start of a function won't set an 15417db96d56Sopenharmony_ci * indentation level and it will produce a NEWLINE after it. 15427db96d56Sopenharmony_ci * To avoid spuriously ending an async function due to this, 15437db96d56Sopenharmony_ci * wait until we have some non-newline char in front of us. */ 15447db96d56Sopenharmony_ci && c != '\n' 15457db96d56Sopenharmony_ci && tok->level == 0 15467db96d56Sopenharmony_ci /* There was a NEWLINE after ASYNC DEF, 15477db96d56Sopenharmony_ci so we're past the signature. */ 15487db96d56Sopenharmony_ci && tok->async_def_nl 15497db96d56Sopenharmony_ci /* Current indentation level is less than where 15507db96d56Sopenharmony_ci the async function was defined */ 15517db96d56Sopenharmony_ci && tok->async_def_indent >= tok->indent) 15527db96d56Sopenharmony_ci { 15537db96d56Sopenharmony_ci tok->async_def = 0; 15547db96d56Sopenharmony_ci tok->async_def_indent = 0; 15557db96d56Sopenharmony_ci tok->async_def_nl = 0; 15567db96d56Sopenharmony_ci } 15577db96d56Sopenharmony_ci 15587db96d56Sopenharmony_ci again: 15597db96d56Sopenharmony_ci tok->start = NULL; 15607db96d56Sopenharmony_ci /* Skip spaces */ 15617db96d56Sopenharmony_ci do { 15627db96d56Sopenharmony_ci c = tok_nextc(tok); 15637db96d56Sopenharmony_ci } while (c == ' ' || c == '\t' || c == '\014'); 15647db96d56Sopenharmony_ci 15657db96d56Sopenharmony_ci /* Set start of current token */ 15667db96d56Sopenharmony_ci tok->start = tok->cur == NULL ? NULL : tok->cur - 1; 15677db96d56Sopenharmony_ci 15687db96d56Sopenharmony_ci /* Skip comment, unless it's a type comment */ 15697db96d56Sopenharmony_ci if (c == '#') { 15707db96d56Sopenharmony_ci const char *prefix, *p, *type_start; 15717db96d56Sopenharmony_ci 15727db96d56Sopenharmony_ci while (c != EOF && c != '\n') { 15737db96d56Sopenharmony_ci c = tok_nextc(tok); 15747db96d56Sopenharmony_ci } 15757db96d56Sopenharmony_ci 15767db96d56Sopenharmony_ci if (tok->type_comments) { 15777db96d56Sopenharmony_ci p = tok->start; 15787db96d56Sopenharmony_ci prefix = type_comment_prefix; 15797db96d56Sopenharmony_ci while (*prefix && p < tok->cur) { 15807db96d56Sopenharmony_ci if (*prefix == ' ') { 15817db96d56Sopenharmony_ci while (*p == ' ' || *p == '\t') { 15827db96d56Sopenharmony_ci p++; 15837db96d56Sopenharmony_ci } 15847db96d56Sopenharmony_ci } else if (*prefix == *p) { 15857db96d56Sopenharmony_ci p++; 15867db96d56Sopenharmony_ci } else { 15877db96d56Sopenharmony_ci break; 15887db96d56Sopenharmony_ci } 15897db96d56Sopenharmony_ci 15907db96d56Sopenharmony_ci prefix++; 15917db96d56Sopenharmony_ci } 15927db96d56Sopenharmony_ci 15937db96d56Sopenharmony_ci /* This is a type comment if we matched all of type_comment_prefix. */ 15947db96d56Sopenharmony_ci if (!*prefix) { 15957db96d56Sopenharmony_ci int is_type_ignore = 1; 15967db96d56Sopenharmony_ci const char *ignore_end = p + 6; 15977db96d56Sopenharmony_ci tok_backup(tok, c); /* don't eat the newline or EOF */ 15987db96d56Sopenharmony_ci 15997db96d56Sopenharmony_ci type_start = p; 16007db96d56Sopenharmony_ci 16017db96d56Sopenharmony_ci /* A TYPE_IGNORE is "type: ignore" followed by the end of the token 16027db96d56Sopenharmony_ci * or anything ASCII and non-alphanumeric. */ 16037db96d56Sopenharmony_ci is_type_ignore = ( 16047db96d56Sopenharmony_ci tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 16057db96d56Sopenharmony_ci && !(tok->cur > ignore_end 16067db96d56Sopenharmony_ci && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); 16077db96d56Sopenharmony_ci 16087db96d56Sopenharmony_ci if (is_type_ignore) { 16097db96d56Sopenharmony_ci *p_start = ignore_end; 16107db96d56Sopenharmony_ci *p_end = tok->cur; 16117db96d56Sopenharmony_ci 16127db96d56Sopenharmony_ci /* If this type ignore is the only thing on the line, consume the newline also. */ 16137db96d56Sopenharmony_ci if (blankline) { 16147db96d56Sopenharmony_ci tok_nextc(tok); 16157db96d56Sopenharmony_ci tok->atbol = 1; 16167db96d56Sopenharmony_ci } 16177db96d56Sopenharmony_ci return TYPE_IGNORE; 16187db96d56Sopenharmony_ci } else { 16197db96d56Sopenharmony_ci *p_start = type_start; /* after type_comment_prefix */ 16207db96d56Sopenharmony_ci *p_end = tok->cur; 16217db96d56Sopenharmony_ci return TYPE_COMMENT; 16227db96d56Sopenharmony_ci } 16237db96d56Sopenharmony_ci } 16247db96d56Sopenharmony_ci } 16257db96d56Sopenharmony_ci } 16267db96d56Sopenharmony_ci 16277db96d56Sopenharmony_ci if (tok->done == E_INTERACT_STOP) { 16287db96d56Sopenharmony_ci return ENDMARKER; 16297db96d56Sopenharmony_ci } 16307db96d56Sopenharmony_ci 16317db96d56Sopenharmony_ci /* Check for EOF and errors now */ 16327db96d56Sopenharmony_ci if (c == EOF) { 16337db96d56Sopenharmony_ci if (tok->level) { 16347db96d56Sopenharmony_ci return ERRORTOKEN; 16357db96d56Sopenharmony_ci } 16367db96d56Sopenharmony_ci return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; 16377db96d56Sopenharmony_ci } 16387db96d56Sopenharmony_ci 16397db96d56Sopenharmony_ci /* Identifier (most frequent token!) */ 16407db96d56Sopenharmony_ci nonascii = 0; 16417db96d56Sopenharmony_ci if (is_potential_identifier_start(c)) { 16427db96d56Sopenharmony_ci /* Process the various legal combinations of b"", r"", u"", and f"". */ 16437db96d56Sopenharmony_ci int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; 16447db96d56Sopenharmony_ci while (1) { 16457db96d56Sopenharmony_ci if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) 16467db96d56Sopenharmony_ci saw_b = 1; 16477db96d56Sopenharmony_ci /* Since this is a backwards compatibility support literal we don't 16487db96d56Sopenharmony_ci want to support it in arbitrary order like byte literals. */ 16497db96d56Sopenharmony_ci else if (!(saw_b || saw_u || saw_r || saw_f) 16507db96d56Sopenharmony_ci && (c == 'u'|| c == 'U')) { 16517db96d56Sopenharmony_ci saw_u = 1; 16527db96d56Sopenharmony_ci } 16537db96d56Sopenharmony_ci /* ur"" and ru"" are not supported */ 16547db96d56Sopenharmony_ci else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { 16557db96d56Sopenharmony_ci saw_r = 1; 16567db96d56Sopenharmony_ci } 16577db96d56Sopenharmony_ci else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { 16587db96d56Sopenharmony_ci saw_f = 1; 16597db96d56Sopenharmony_ci } 16607db96d56Sopenharmony_ci else { 16617db96d56Sopenharmony_ci break; 16627db96d56Sopenharmony_ci } 16637db96d56Sopenharmony_ci c = tok_nextc(tok); 16647db96d56Sopenharmony_ci if (c == '"' || c == '\'') { 16657db96d56Sopenharmony_ci goto letter_quote; 16667db96d56Sopenharmony_ci } 16677db96d56Sopenharmony_ci } 16687db96d56Sopenharmony_ci while (is_potential_identifier_char(c)) { 16697db96d56Sopenharmony_ci if (c >= 128) { 16707db96d56Sopenharmony_ci nonascii = 1; 16717db96d56Sopenharmony_ci } 16727db96d56Sopenharmony_ci c = tok_nextc(tok); 16737db96d56Sopenharmony_ci } 16747db96d56Sopenharmony_ci tok_backup(tok, c); 16757db96d56Sopenharmony_ci if (nonascii && !verify_identifier(tok)) { 16767db96d56Sopenharmony_ci return ERRORTOKEN; 16777db96d56Sopenharmony_ci } 16787db96d56Sopenharmony_ci 16797db96d56Sopenharmony_ci *p_start = tok->start; 16807db96d56Sopenharmony_ci *p_end = tok->cur; 16817db96d56Sopenharmony_ci 16827db96d56Sopenharmony_ci /* async/await parsing block. */ 16837db96d56Sopenharmony_ci if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { 16847db96d56Sopenharmony_ci /* May be an 'async' or 'await' token. For Python 3.7 or 16857db96d56Sopenharmony_ci later we recognize them unconditionally. For Python 16867db96d56Sopenharmony_ci 3.5 or 3.6 we recognize 'async' in front of 'def', and 16877db96d56Sopenharmony_ci either one inside of 'async def'. (Technically we 16887db96d56Sopenharmony_ci shouldn't recognize these at all for 3.4 or earlier, 16897db96d56Sopenharmony_ci but there's no *valid* Python 3.4 code that would be 16907db96d56Sopenharmony_ci rejected, and async functions will be rejected in a 16917db96d56Sopenharmony_ci later phase.) */ 16927db96d56Sopenharmony_ci if (!tok->async_hacks || tok->async_def) { 16937db96d56Sopenharmony_ci /* Always recognize the keywords. */ 16947db96d56Sopenharmony_ci if (memcmp(tok->start, "async", 5) == 0) { 16957db96d56Sopenharmony_ci return ASYNC; 16967db96d56Sopenharmony_ci } 16977db96d56Sopenharmony_ci if (memcmp(tok->start, "await", 5) == 0) { 16987db96d56Sopenharmony_ci return AWAIT; 16997db96d56Sopenharmony_ci } 17007db96d56Sopenharmony_ci } 17017db96d56Sopenharmony_ci else if (memcmp(tok->start, "async", 5) == 0) { 17027db96d56Sopenharmony_ci /* The current token is 'async'. 17037db96d56Sopenharmony_ci Look ahead one token to see if that is 'def'. */ 17047db96d56Sopenharmony_ci 17057db96d56Sopenharmony_ci struct tok_state ahead_tok; 17067db96d56Sopenharmony_ci const char *ahead_tok_start = NULL; 17077db96d56Sopenharmony_ci const char *ahead_tok_end = NULL; 17087db96d56Sopenharmony_ci int ahead_tok_kind; 17097db96d56Sopenharmony_ci 17107db96d56Sopenharmony_ci memcpy(&ahead_tok, tok, sizeof(ahead_tok)); 17117db96d56Sopenharmony_ci ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, 17127db96d56Sopenharmony_ci &ahead_tok_end); 17137db96d56Sopenharmony_ci 17147db96d56Sopenharmony_ci if (ahead_tok_kind == NAME 17157db96d56Sopenharmony_ci && ahead_tok.cur - ahead_tok.start == 3 17167db96d56Sopenharmony_ci && memcmp(ahead_tok.start, "def", 3) == 0) 17177db96d56Sopenharmony_ci { 17187db96d56Sopenharmony_ci /* The next token is going to be 'def', so instead of 17197db96d56Sopenharmony_ci returning a plain NAME token, return ASYNC. */ 17207db96d56Sopenharmony_ci tok->async_def_indent = tok->indent; 17217db96d56Sopenharmony_ci tok->async_def = 1; 17227db96d56Sopenharmony_ci return ASYNC; 17237db96d56Sopenharmony_ci } 17247db96d56Sopenharmony_ci } 17257db96d56Sopenharmony_ci } 17267db96d56Sopenharmony_ci 17277db96d56Sopenharmony_ci return NAME; 17287db96d56Sopenharmony_ci } 17297db96d56Sopenharmony_ci 17307db96d56Sopenharmony_ci /* Newline */ 17317db96d56Sopenharmony_ci if (c == '\n') { 17327db96d56Sopenharmony_ci tok->atbol = 1; 17337db96d56Sopenharmony_ci if (blankline || tok->level > 0) { 17347db96d56Sopenharmony_ci goto nextline; 17357db96d56Sopenharmony_ci } 17367db96d56Sopenharmony_ci *p_start = tok->start; 17377db96d56Sopenharmony_ci *p_end = tok->cur - 1; /* Leave '\n' out of the string */ 17387db96d56Sopenharmony_ci tok->cont_line = 0; 17397db96d56Sopenharmony_ci if (tok->async_def) { 17407db96d56Sopenharmony_ci /* We're somewhere inside an 'async def' function, and 17417db96d56Sopenharmony_ci we've encountered a NEWLINE after its signature. */ 17427db96d56Sopenharmony_ci tok->async_def_nl = 1; 17437db96d56Sopenharmony_ci } 17447db96d56Sopenharmony_ci return NEWLINE; 17457db96d56Sopenharmony_ci } 17467db96d56Sopenharmony_ci 17477db96d56Sopenharmony_ci /* Period or number starting with period? */ 17487db96d56Sopenharmony_ci if (c == '.') { 17497db96d56Sopenharmony_ci c = tok_nextc(tok); 17507db96d56Sopenharmony_ci if (isdigit(c)) { 17517db96d56Sopenharmony_ci goto fraction; 17527db96d56Sopenharmony_ci } else if (c == '.') { 17537db96d56Sopenharmony_ci c = tok_nextc(tok); 17547db96d56Sopenharmony_ci if (c == '.') { 17557db96d56Sopenharmony_ci *p_start = tok->start; 17567db96d56Sopenharmony_ci *p_end = tok->cur; 17577db96d56Sopenharmony_ci return ELLIPSIS; 17587db96d56Sopenharmony_ci } 17597db96d56Sopenharmony_ci else { 17607db96d56Sopenharmony_ci tok_backup(tok, c); 17617db96d56Sopenharmony_ci } 17627db96d56Sopenharmony_ci tok_backup(tok, '.'); 17637db96d56Sopenharmony_ci } 17647db96d56Sopenharmony_ci else { 17657db96d56Sopenharmony_ci tok_backup(tok, c); 17667db96d56Sopenharmony_ci } 17677db96d56Sopenharmony_ci *p_start = tok->start; 17687db96d56Sopenharmony_ci *p_end = tok->cur; 17697db96d56Sopenharmony_ci return DOT; 17707db96d56Sopenharmony_ci } 17717db96d56Sopenharmony_ci 17727db96d56Sopenharmony_ci /* Number */ 17737db96d56Sopenharmony_ci if (isdigit(c)) { 17747db96d56Sopenharmony_ci if (c == '0') { 17757db96d56Sopenharmony_ci /* Hex, octal or binary -- maybe. */ 17767db96d56Sopenharmony_ci c = tok_nextc(tok); 17777db96d56Sopenharmony_ci if (c == 'x' || c == 'X') { 17787db96d56Sopenharmony_ci /* Hex */ 17797db96d56Sopenharmony_ci c = tok_nextc(tok); 17807db96d56Sopenharmony_ci do { 17817db96d56Sopenharmony_ci if (c == '_') { 17827db96d56Sopenharmony_ci c = tok_nextc(tok); 17837db96d56Sopenharmony_ci } 17847db96d56Sopenharmony_ci if (!isxdigit(c)) { 17857db96d56Sopenharmony_ci tok_backup(tok, c); 17867db96d56Sopenharmony_ci return syntaxerror(tok, "invalid hexadecimal literal"); 17877db96d56Sopenharmony_ci } 17887db96d56Sopenharmony_ci do { 17897db96d56Sopenharmony_ci c = tok_nextc(tok); 17907db96d56Sopenharmony_ci } while (isxdigit(c)); 17917db96d56Sopenharmony_ci } while (c == '_'); 17927db96d56Sopenharmony_ci if (!verify_end_of_number(tok, c, "hexadecimal")) { 17937db96d56Sopenharmony_ci return ERRORTOKEN; 17947db96d56Sopenharmony_ci } 17957db96d56Sopenharmony_ci } 17967db96d56Sopenharmony_ci else if (c == 'o' || c == 'O') { 17977db96d56Sopenharmony_ci /* Octal */ 17987db96d56Sopenharmony_ci c = tok_nextc(tok); 17997db96d56Sopenharmony_ci do { 18007db96d56Sopenharmony_ci if (c == '_') { 18017db96d56Sopenharmony_ci c = tok_nextc(tok); 18027db96d56Sopenharmony_ci } 18037db96d56Sopenharmony_ci if (c < '0' || c >= '8') { 18047db96d56Sopenharmony_ci if (isdigit(c)) { 18057db96d56Sopenharmony_ci return syntaxerror(tok, 18067db96d56Sopenharmony_ci "invalid digit '%c' in octal literal", c); 18077db96d56Sopenharmony_ci } 18087db96d56Sopenharmony_ci else { 18097db96d56Sopenharmony_ci tok_backup(tok, c); 18107db96d56Sopenharmony_ci return syntaxerror(tok, "invalid octal literal"); 18117db96d56Sopenharmony_ci } 18127db96d56Sopenharmony_ci } 18137db96d56Sopenharmony_ci do { 18147db96d56Sopenharmony_ci c = tok_nextc(tok); 18157db96d56Sopenharmony_ci } while ('0' <= c && c < '8'); 18167db96d56Sopenharmony_ci } while (c == '_'); 18177db96d56Sopenharmony_ci if (isdigit(c)) { 18187db96d56Sopenharmony_ci return syntaxerror(tok, 18197db96d56Sopenharmony_ci "invalid digit '%c' in octal literal", c); 18207db96d56Sopenharmony_ci } 18217db96d56Sopenharmony_ci if (!verify_end_of_number(tok, c, "octal")) { 18227db96d56Sopenharmony_ci return ERRORTOKEN; 18237db96d56Sopenharmony_ci } 18247db96d56Sopenharmony_ci } 18257db96d56Sopenharmony_ci else if (c == 'b' || c == 'B') { 18267db96d56Sopenharmony_ci /* Binary */ 18277db96d56Sopenharmony_ci c = tok_nextc(tok); 18287db96d56Sopenharmony_ci do { 18297db96d56Sopenharmony_ci if (c == '_') { 18307db96d56Sopenharmony_ci c = tok_nextc(tok); 18317db96d56Sopenharmony_ci } 18327db96d56Sopenharmony_ci if (c != '0' && c != '1') { 18337db96d56Sopenharmony_ci if (isdigit(c)) { 18347db96d56Sopenharmony_ci return syntaxerror(tok, 18357db96d56Sopenharmony_ci "invalid digit '%c' in binary literal", c); 18367db96d56Sopenharmony_ci } 18377db96d56Sopenharmony_ci else { 18387db96d56Sopenharmony_ci tok_backup(tok, c); 18397db96d56Sopenharmony_ci return syntaxerror(tok, "invalid binary literal"); 18407db96d56Sopenharmony_ci } 18417db96d56Sopenharmony_ci } 18427db96d56Sopenharmony_ci do { 18437db96d56Sopenharmony_ci c = tok_nextc(tok); 18447db96d56Sopenharmony_ci } while (c == '0' || c == '1'); 18457db96d56Sopenharmony_ci } while (c == '_'); 18467db96d56Sopenharmony_ci if (isdigit(c)) { 18477db96d56Sopenharmony_ci return syntaxerror(tok, 18487db96d56Sopenharmony_ci "invalid digit '%c' in binary literal", c); 18497db96d56Sopenharmony_ci } 18507db96d56Sopenharmony_ci if (!verify_end_of_number(tok, c, "binary")) { 18517db96d56Sopenharmony_ci return ERRORTOKEN; 18527db96d56Sopenharmony_ci } 18537db96d56Sopenharmony_ci } 18547db96d56Sopenharmony_ci else { 18557db96d56Sopenharmony_ci int nonzero = 0; 18567db96d56Sopenharmony_ci /* maybe old-style octal; c is first char of it */ 18577db96d56Sopenharmony_ci /* in any case, allow '0' as a literal */ 18587db96d56Sopenharmony_ci while (1) { 18597db96d56Sopenharmony_ci if (c == '_') { 18607db96d56Sopenharmony_ci c = tok_nextc(tok); 18617db96d56Sopenharmony_ci if (!isdigit(c)) { 18627db96d56Sopenharmony_ci tok_backup(tok, c); 18637db96d56Sopenharmony_ci return syntaxerror(tok, "invalid decimal literal"); 18647db96d56Sopenharmony_ci } 18657db96d56Sopenharmony_ci } 18667db96d56Sopenharmony_ci if (c != '0') { 18677db96d56Sopenharmony_ci break; 18687db96d56Sopenharmony_ci } 18697db96d56Sopenharmony_ci c = tok_nextc(tok); 18707db96d56Sopenharmony_ci } 18717db96d56Sopenharmony_ci char* zeros_end = tok->cur; 18727db96d56Sopenharmony_ci if (isdigit(c)) { 18737db96d56Sopenharmony_ci nonzero = 1; 18747db96d56Sopenharmony_ci c = tok_decimal_tail(tok); 18757db96d56Sopenharmony_ci if (c == 0) { 18767db96d56Sopenharmony_ci return ERRORTOKEN; 18777db96d56Sopenharmony_ci } 18787db96d56Sopenharmony_ci } 18797db96d56Sopenharmony_ci if (c == '.') { 18807db96d56Sopenharmony_ci c = tok_nextc(tok); 18817db96d56Sopenharmony_ci goto fraction; 18827db96d56Sopenharmony_ci } 18837db96d56Sopenharmony_ci else if (c == 'e' || c == 'E') { 18847db96d56Sopenharmony_ci goto exponent; 18857db96d56Sopenharmony_ci } 18867db96d56Sopenharmony_ci else if (c == 'j' || c == 'J') { 18877db96d56Sopenharmony_ci goto imaginary; 18887db96d56Sopenharmony_ci } 18897db96d56Sopenharmony_ci else if (nonzero) { 18907db96d56Sopenharmony_ci /* Old-style octal: now disallowed. */ 18917db96d56Sopenharmony_ci tok_backup(tok, c); 18927db96d56Sopenharmony_ci return syntaxerror_known_range( 18937db96d56Sopenharmony_ci tok, (int)(tok->start + 1 - tok->line_start), 18947db96d56Sopenharmony_ci (int)(zeros_end - tok->line_start), 18957db96d56Sopenharmony_ci "leading zeros in decimal integer " 18967db96d56Sopenharmony_ci "literals are not permitted; " 18977db96d56Sopenharmony_ci "use an 0o prefix for octal integers"); 18987db96d56Sopenharmony_ci } 18997db96d56Sopenharmony_ci if (!verify_end_of_number(tok, c, "decimal")) { 19007db96d56Sopenharmony_ci return ERRORTOKEN; 19017db96d56Sopenharmony_ci } 19027db96d56Sopenharmony_ci } 19037db96d56Sopenharmony_ci } 19047db96d56Sopenharmony_ci else { 19057db96d56Sopenharmony_ci /* Decimal */ 19067db96d56Sopenharmony_ci c = tok_decimal_tail(tok); 19077db96d56Sopenharmony_ci if (c == 0) { 19087db96d56Sopenharmony_ci return ERRORTOKEN; 19097db96d56Sopenharmony_ci } 19107db96d56Sopenharmony_ci { 19117db96d56Sopenharmony_ci /* Accept floating point numbers. */ 19127db96d56Sopenharmony_ci if (c == '.') { 19137db96d56Sopenharmony_ci c = tok_nextc(tok); 19147db96d56Sopenharmony_ci fraction: 19157db96d56Sopenharmony_ci /* Fraction */ 19167db96d56Sopenharmony_ci if (isdigit(c)) { 19177db96d56Sopenharmony_ci c = tok_decimal_tail(tok); 19187db96d56Sopenharmony_ci if (c == 0) { 19197db96d56Sopenharmony_ci return ERRORTOKEN; 19207db96d56Sopenharmony_ci } 19217db96d56Sopenharmony_ci } 19227db96d56Sopenharmony_ci } 19237db96d56Sopenharmony_ci if (c == 'e' || c == 'E') { 19247db96d56Sopenharmony_ci int e; 19257db96d56Sopenharmony_ci exponent: 19267db96d56Sopenharmony_ci e = c; 19277db96d56Sopenharmony_ci /* Exponent part */ 19287db96d56Sopenharmony_ci c = tok_nextc(tok); 19297db96d56Sopenharmony_ci if (c == '+' || c == '-') { 19307db96d56Sopenharmony_ci c = tok_nextc(tok); 19317db96d56Sopenharmony_ci if (!isdigit(c)) { 19327db96d56Sopenharmony_ci tok_backup(tok, c); 19337db96d56Sopenharmony_ci return syntaxerror(tok, "invalid decimal literal"); 19347db96d56Sopenharmony_ci } 19357db96d56Sopenharmony_ci } else if (!isdigit(c)) { 19367db96d56Sopenharmony_ci tok_backup(tok, c); 19377db96d56Sopenharmony_ci if (!verify_end_of_number(tok, e, "decimal")) { 19387db96d56Sopenharmony_ci return ERRORTOKEN; 19397db96d56Sopenharmony_ci } 19407db96d56Sopenharmony_ci tok_backup(tok, e); 19417db96d56Sopenharmony_ci *p_start = tok->start; 19427db96d56Sopenharmony_ci *p_end = tok->cur; 19437db96d56Sopenharmony_ci return NUMBER; 19447db96d56Sopenharmony_ci } 19457db96d56Sopenharmony_ci c = tok_decimal_tail(tok); 19467db96d56Sopenharmony_ci if (c == 0) { 19477db96d56Sopenharmony_ci return ERRORTOKEN; 19487db96d56Sopenharmony_ci } 19497db96d56Sopenharmony_ci } 19507db96d56Sopenharmony_ci if (c == 'j' || c == 'J') { 19517db96d56Sopenharmony_ci /* Imaginary part */ 19527db96d56Sopenharmony_ci imaginary: 19537db96d56Sopenharmony_ci c = tok_nextc(tok); 19547db96d56Sopenharmony_ci if (!verify_end_of_number(tok, c, "imaginary")) { 19557db96d56Sopenharmony_ci return ERRORTOKEN; 19567db96d56Sopenharmony_ci } 19577db96d56Sopenharmony_ci } 19587db96d56Sopenharmony_ci else if (!verify_end_of_number(tok, c, "decimal")) { 19597db96d56Sopenharmony_ci return ERRORTOKEN; 19607db96d56Sopenharmony_ci } 19617db96d56Sopenharmony_ci } 19627db96d56Sopenharmony_ci } 19637db96d56Sopenharmony_ci tok_backup(tok, c); 19647db96d56Sopenharmony_ci *p_start = tok->start; 19657db96d56Sopenharmony_ci *p_end = tok->cur; 19667db96d56Sopenharmony_ci return NUMBER; 19677db96d56Sopenharmony_ci } 19687db96d56Sopenharmony_ci 19697db96d56Sopenharmony_ci letter_quote: 19707db96d56Sopenharmony_ci /* String */ 19717db96d56Sopenharmony_ci if (c == '\'' || c == '"') { 19727db96d56Sopenharmony_ci int quote = c; 19737db96d56Sopenharmony_ci int quote_size = 1; /* 1 or 3 */ 19747db96d56Sopenharmony_ci int end_quote_size = 0; 19757db96d56Sopenharmony_ci 19767db96d56Sopenharmony_ci /* Nodes of type STRING, especially multi line strings 19777db96d56Sopenharmony_ci must be handled differently in order to get both 19787db96d56Sopenharmony_ci the starting line number and the column offset right. 19797db96d56Sopenharmony_ci (cf. issue 16806) */ 19807db96d56Sopenharmony_ci tok->first_lineno = tok->lineno; 19817db96d56Sopenharmony_ci tok->multi_line_start = tok->line_start; 19827db96d56Sopenharmony_ci 19837db96d56Sopenharmony_ci /* Find the quote size and start of string */ 19847db96d56Sopenharmony_ci c = tok_nextc(tok); 19857db96d56Sopenharmony_ci if (c == quote) { 19867db96d56Sopenharmony_ci c = tok_nextc(tok); 19877db96d56Sopenharmony_ci if (c == quote) { 19887db96d56Sopenharmony_ci quote_size = 3; 19897db96d56Sopenharmony_ci } 19907db96d56Sopenharmony_ci else { 19917db96d56Sopenharmony_ci end_quote_size = 1; /* empty string found */ 19927db96d56Sopenharmony_ci } 19937db96d56Sopenharmony_ci } 19947db96d56Sopenharmony_ci if (c != quote) { 19957db96d56Sopenharmony_ci tok_backup(tok, c); 19967db96d56Sopenharmony_ci } 19977db96d56Sopenharmony_ci 19987db96d56Sopenharmony_ci /* Get rest of string */ 19997db96d56Sopenharmony_ci while (end_quote_size != quote_size) { 20007db96d56Sopenharmony_ci c = tok_nextc(tok); 20017db96d56Sopenharmony_ci if (tok->done == E_ERROR) { 20027db96d56Sopenharmony_ci return ERRORTOKEN; 20037db96d56Sopenharmony_ci } 20047db96d56Sopenharmony_ci if (tok->done == E_DECODE) { 20057db96d56Sopenharmony_ci break; 20067db96d56Sopenharmony_ci } 20077db96d56Sopenharmony_ci if (c == EOF || (quote_size == 1 && c == '\n')) { 20087db96d56Sopenharmony_ci assert(tok->multi_line_start != NULL); 20097db96d56Sopenharmony_ci // shift the tok_state's location into 20107db96d56Sopenharmony_ci // the start of string, and report the error 20117db96d56Sopenharmony_ci // from the initial quote character 20127db96d56Sopenharmony_ci tok->cur = (char *)tok->start; 20137db96d56Sopenharmony_ci tok->cur++; 20147db96d56Sopenharmony_ci tok->line_start = tok->multi_line_start; 20157db96d56Sopenharmony_ci int start = tok->lineno; 20167db96d56Sopenharmony_ci tok->lineno = tok->first_lineno; 20177db96d56Sopenharmony_ci if (quote_size == 3) { 20187db96d56Sopenharmony_ci syntaxerror(tok, "unterminated triple-quoted string literal" 20197db96d56Sopenharmony_ci " (detected at line %d)", start); 20207db96d56Sopenharmony_ci if (c != '\n') { 20217db96d56Sopenharmony_ci tok->done = E_EOFS; 20227db96d56Sopenharmony_ci } 20237db96d56Sopenharmony_ci return ERRORTOKEN; 20247db96d56Sopenharmony_ci } 20257db96d56Sopenharmony_ci else { 20267db96d56Sopenharmony_ci syntaxerror(tok, "unterminated string literal (detected at" 20277db96d56Sopenharmony_ci " line %d)", start); 20287db96d56Sopenharmony_ci if (c != '\n') { 20297db96d56Sopenharmony_ci tok->done = E_EOLS; 20307db96d56Sopenharmony_ci } 20317db96d56Sopenharmony_ci return ERRORTOKEN; 20327db96d56Sopenharmony_ci } 20337db96d56Sopenharmony_ci } 20347db96d56Sopenharmony_ci if (c == quote) { 20357db96d56Sopenharmony_ci end_quote_size += 1; 20367db96d56Sopenharmony_ci } 20377db96d56Sopenharmony_ci else { 20387db96d56Sopenharmony_ci end_quote_size = 0; 20397db96d56Sopenharmony_ci if (c == '\\') { 20407db96d56Sopenharmony_ci tok_nextc(tok); /* skip escaped char */ 20417db96d56Sopenharmony_ci } 20427db96d56Sopenharmony_ci } 20437db96d56Sopenharmony_ci } 20447db96d56Sopenharmony_ci 20457db96d56Sopenharmony_ci *p_start = tok->start; 20467db96d56Sopenharmony_ci *p_end = tok->cur; 20477db96d56Sopenharmony_ci return STRING; 20487db96d56Sopenharmony_ci } 20497db96d56Sopenharmony_ci 20507db96d56Sopenharmony_ci /* Line continuation */ 20517db96d56Sopenharmony_ci if (c == '\\') { 20527db96d56Sopenharmony_ci if ((c = tok_continuation_line(tok)) == -1) { 20537db96d56Sopenharmony_ci return ERRORTOKEN; 20547db96d56Sopenharmony_ci } 20557db96d56Sopenharmony_ci tok->cont_line = 1; 20567db96d56Sopenharmony_ci goto again; /* Read next line */ 20577db96d56Sopenharmony_ci } 20587db96d56Sopenharmony_ci 20597db96d56Sopenharmony_ci /* Check for two-character token */ 20607db96d56Sopenharmony_ci { 20617db96d56Sopenharmony_ci int c2 = tok_nextc(tok); 20627db96d56Sopenharmony_ci int token = PyToken_TwoChars(c, c2); 20637db96d56Sopenharmony_ci if (token != OP) { 20647db96d56Sopenharmony_ci int c3 = tok_nextc(tok); 20657db96d56Sopenharmony_ci int token3 = PyToken_ThreeChars(c, c2, c3); 20667db96d56Sopenharmony_ci if (token3 != OP) { 20677db96d56Sopenharmony_ci token = token3; 20687db96d56Sopenharmony_ci } 20697db96d56Sopenharmony_ci else { 20707db96d56Sopenharmony_ci tok_backup(tok, c3); 20717db96d56Sopenharmony_ci } 20727db96d56Sopenharmony_ci *p_start = tok->start; 20737db96d56Sopenharmony_ci *p_end = tok->cur; 20747db96d56Sopenharmony_ci return token; 20757db96d56Sopenharmony_ci } 20767db96d56Sopenharmony_ci tok_backup(tok, c2); 20777db96d56Sopenharmony_ci } 20787db96d56Sopenharmony_ci 20797db96d56Sopenharmony_ci /* Keep track of parentheses nesting level */ 20807db96d56Sopenharmony_ci switch (c) { 20817db96d56Sopenharmony_ci case '(': 20827db96d56Sopenharmony_ci case '[': 20837db96d56Sopenharmony_ci case '{': 20847db96d56Sopenharmony_ci if (tok->level >= MAXLEVEL) { 20857db96d56Sopenharmony_ci return syntaxerror(tok, "too many nested parentheses"); 20867db96d56Sopenharmony_ci } 20877db96d56Sopenharmony_ci tok->parenstack[tok->level] = c; 20887db96d56Sopenharmony_ci tok->parenlinenostack[tok->level] = tok->lineno; 20897db96d56Sopenharmony_ci tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); 20907db96d56Sopenharmony_ci tok->level++; 20917db96d56Sopenharmony_ci break; 20927db96d56Sopenharmony_ci case ')': 20937db96d56Sopenharmony_ci case ']': 20947db96d56Sopenharmony_ci case '}': 20957db96d56Sopenharmony_ci if (!tok->level) { 20967db96d56Sopenharmony_ci return syntaxerror(tok, "unmatched '%c'", c); 20977db96d56Sopenharmony_ci } 20987db96d56Sopenharmony_ci tok->level--; 20997db96d56Sopenharmony_ci int opening = tok->parenstack[tok->level]; 21007db96d56Sopenharmony_ci if (!((opening == '(' && c == ')') || 21017db96d56Sopenharmony_ci (opening == '[' && c == ']') || 21027db96d56Sopenharmony_ci (opening == '{' && c == '}'))) 21037db96d56Sopenharmony_ci { 21047db96d56Sopenharmony_ci if (tok->parenlinenostack[tok->level] != tok->lineno) { 21057db96d56Sopenharmony_ci return syntaxerror(tok, 21067db96d56Sopenharmony_ci "closing parenthesis '%c' does not match " 21077db96d56Sopenharmony_ci "opening parenthesis '%c' on line %d", 21087db96d56Sopenharmony_ci c, opening, tok->parenlinenostack[tok->level]); 21097db96d56Sopenharmony_ci } 21107db96d56Sopenharmony_ci else { 21117db96d56Sopenharmony_ci return syntaxerror(tok, 21127db96d56Sopenharmony_ci "closing parenthesis '%c' does not match " 21137db96d56Sopenharmony_ci "opening parenthesis '%c'", 21147db96d56Sopenharmony_ci c, opening); 21157db96d56Sopenharmony_ci } 21167db96d56Sopenharmony_ci } 21177db96d56Sopenharmony_ci break; 21187db96d56Sopenharmony_ci } 21197db96d56Sopenharmony_ci 21207db96d56Sopenharmony_ci if (!Py_UNICODE_ISPRINTABLE(c)) { 21217db96d56Sopenharmony_ci char hex[9]; 21227db96d56Sopenharmony_ci (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); 21237db96d56Sopenharmony_ci return syntaxerror(tok, "invalid non-printable character U+%s", hex); 21247db96d56Sopenharmony_ci } 21257db96d56Sopenharmony_ci 21267db96d56Sopenharmony_ci /* Punctuation character */ 21277db96d56Sopenharmony_ci *p_start = tok->start; 21287db96d56Sopenharmony_ci *p_end = tok->cur; 21297db96d56Sopenharmony_ci return PyToken_OneChar(c); 21307db96d56Sopenharmony_ci} 21317db96d56Sopenharmony_ci 21327db96d56Sopenharmony_ciint 21337db96d56Sopenharmony_ci_PyTokenizer_Get(struct tok_state *tok, 21347db96d56Sopenharmony_ci const char **p_start, const char **p_end) 21357db96d56Sopenharmony_ci{ 21367db96d56Sopenharmony_ci int result = tok_get(tok, p_start, p_end); 21377db96d56Sopenharmony_ci if (tok->decoding_erred) { 21387db96d56Sopenharmony_ci result = ERRORTOKEN; 21397db96d56Sopenharmony_ci tok->done = E_DECODE; 21407db96d56Sopenharmony_ci } 21417db96d56Sopenharmony_ci return result; 21427db96d56Sopenharmony_ci} 21437db96d56Sopenharmony_ci 21447db96d56Sopenharmony_ci#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3)) 21457db96d56Sopenharmony_ci// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's 21467db96d56Sopenharmony_ci// dup() emulation with open() is slow. 21477db96d56Sopenharmony_citypedef union { 21487db96d56Sopenharmony_ci void *cookie; 21497db96d56Sopenharmony_ci int fd; 21507db96d56Sopenharmony_ci} borrowed; 21517db96d56Sopenharmony_ci 21527db96d56Sopenharmony_cistatic ssize_t 21537db96d56Sopenharmony_ciborrow_read(void *cookie, char *buf, size_t size) 21547db96d56Sopenharmony_ci{ 21557db96d56Sopenharmony_ci borrowed b = {.cookie = cookie}; 21567db96d56Sopenharmony_ci return read(b.fd, (void *)buf, size); 21577db96d56Sopenharmony_ci} 21587db96d56Sopenharmony_ci 21597db96d56Sopenharmony_cistatic FILE * 21607db96d56Sopenharmony_cifdopen_borrow(int fd) { 21617db96d56Sopenharmony_ci // supports only reading. seek fails. close and write are no-ops. 21627db96d56Sopenharmony_ci cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL}; 21637db96d56Sopenharmony_ci borrowed b = {.fd = fd}; 21647db96d56Sopenharmony_ci return fopencookie(b.cookie, "r", io_cb); 21657db96d56Sopenharmony_ci} 21667db96d56Sopenharmony_ci#else 21677db96d56Sopenharmony_cistatic FILE * 21687db96d56Sopenharmony_cifdopen_borrow(int fd) { 21697db96d56Sopenharmony_ci fd = _Py_dup(fd); 21707db96d56Sopenharmony_ci if (fd < 0) { 21717db96d56Sopenharmony_ci return NULL; 21727db96d56Sopenharmony_ci } 21737db96d56Sopenharmony_ci return fdopen(fd, "r"); 21747db96d56Sopenharmony_ci} 21757db96d56Sopenharmony_ci#endif 21767db96d56Sopenharmony_ci 21777db96d56Sopenharmony_ci/* Get the encoding of a Python file. Check for the coding cookie and check if 21787db96d56Sopenharmony_ci the file starts with a BOM. 21797db96d56Sopenharmony_ci 21807db96d56Sopenharmony_ci _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the 21817db96d56Sopenharmony_ci encoding in the first or second line of the file (in which case the encoding 21827db96d56Sopenharmony_ci should be assumed to be UTF-8). 21837db96d56Sopenharmony_ci 21847db96d56Sopenharmony_ci The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed 21857db96d56Sopenharmony_ci by the caller. */ 21867db96d56Sopenharmony_ci 21877db96d56Sopenharmony_cichar * 21887db96d56Sopenharmony_ci_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) 21897db96d56Sopenharmony_ci{ 21907db96d56Sopenharmony_ci struct tok_state *tok; 21917db96d56Sopenharmony_ci FILE *fp; 21927db96d56Sopenharmony_ci const char *p_start = NULL; 21937db96d56Sopenharmony_ci const char *p_end = NULL; 21947db96d56Sopenharmony_ci char *encoding = NULL; 21957db96d56Sopenharmony_ci 21967db96d56Sopenharmony_ci fp = fdopen_borrow(fd); 21977db96d56Sopenharmony_ci if (fp == NULL) { 21987db96d56Sopenharmony_ci return NULL; 21997db96d56Sopenharmony_ci } 22007db96d56Sopenharmony_ci tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL); 22017db96d56Sopenharmony_ci if (tok == NULL) { 22027db96d56Sopenharmony_ci fclose(fp); 22037db96d56Sopenharmony_ci return NULL; 22047db96d56Sopenharmony_ci } 22057db96d56Sopenharmony_ci if (filename != NULL) { 22067db96d56Sopenharmony_ci Py_INCREF(filename); 22077db96d56Sopenharmony_ci tok->filename = filename; 22087db96d56Sopenharmony_ci } 22097db96d56Sopenharmony_ci else { 22107db96d56Sopenharmony_ci tok->filename = PyUnicode_FromString("<string>"); 22117db96d56Sopenharmony_ci if (tok->filename == NULL) { 22127db96d56Sopenharmony_ci fclose(fp); 22137db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 22147db96d56Sopenharmony_ci return encoding; 22157db96d56Sopenharmony_ci } 22167db96d56Sopenharmony_ci } 22177db96d56Sopenharmony_ci // We don't want to report warnings here because it could cause infinite recursion 22187db96d56Sopenharmony_ci // if fetching the encoding shows a warning. 22197db96d56Sopenharmony_ci tok->report_warnings = 0; 22207db96d56Sopenharmony_ci while (tok->lineno < 2 && tok->done == E_OK) { 22217db96d56Sopenharmony_ci _PyTokenizer_Get(tok, &p_start, &p_end); 22227db96d56Sopenharmony_ci } 22237db96d56Sopenharmony_ci fclose(fp); 22247db96d56Sopenharmony_ci if (tok->encoding) { 22257db96d56Sopenharmony_ci encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1); 22267db96d56Sopenharmony_ci if (encoding) { 22277db96d56Sopenharmony_ci strcpy(encoding, tok->encoding); 22287db96d56Sopenharmony_ci } 22297db96d56Sopenharmony_ci } 22307db96d56Sopenharmony_ci _PyTokenizer_Free(tok); 22317db96d56Sopenharmony_ci return encoding; 22327db96d56Sopenharmony_ci} 22337db96d56Sopenharmony_ci 22347db96d56Sopenharmony_ci#ifdef Py_DEBUG 22357db96d56Sopenharmony_civoid 22367db96d56Sopenharmony_citok_dump(int type, char *start, char *end) 22377db96d56Sopenharmony_ci{ 22387db96d56Sopenharmony_ci fprintf(stderr, "%s", _PyParser_TokenNames[type]); 22397db96d56Sopenharmony_ci if (type == NAME || type == NUMBER || type == STRING || type == OP) 22407db96d56Sopenharmony_ci fprintf(stderr, "(%.*s)", (int)(end - start), start); 22417db96d56Sopenharmony_ci} 22427db96d56Sopenharmony_ci#endif // Py_DEBUG 2243