xref: /third_party/python/Parser/tokenizer.c (revision 7db96d56)
1
2/* Tokenizer implementation */
3
4#define PY_SSIZE_T_CLEAN
5#include "Python.h"
6#include "pycore_call.h"          // _PyObject_CallNoArgs()
7
8#include <ctype.h>
9#include <assert.h>
10
11#include "tokenizer.h"
12#include "errcode.h"
13
14#include "unicodeobject.h"
15#include "bytesobject.h"
16#include "fileobject.h"
17#include "abstract.h"
18
19/* Alternate tab spacing */
20#define ALTTABSIZE 1
21
22#define is_potential_identifier_start(c) (\
23              (c >= 'a' && c <= 'z')\
24               || (c >= 'A' && c <= 'Z')\
25               || c == '_'\
26               || (c >= 128))
27
28#define is_potential_identifier_char(c) (\
29              (c >= 'a' && c <= 'z')\
30               || (c >= 'A' && c <= 'Z')\
31               || (c >= '0' && c <= '9')\
32               || c == '_'\
33               || (c >= 128))
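/* Note that these are byte-level checks: for the UTF-8 source "héllo" the
   bytes 0xC3 0xA9 of "é" are both >= 128 and are accepted here, while the
   question of whether the decoded identifier is actually legal under
   PEP 3131 is settled later, in verify_identifier(). */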
34
35
36/* Don't ever change this -- it would break the portability of Python code */
37#define TABSIZE 8
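/* The two rulers give, for example: three spaces followed by a tab land on
   column 8 under TABSIZE but on column 4 under ALTTABSIZE.  tok_get() keeps
   both measurements (indstack vs. altindstack) and calls indenterror() when
   they disagree, which is how inconsistent tab/space indentation is
   detected. */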
38
39/* Forward */
40static struct tok_state *tok_new(void);
41static int tok_nextc(struct tok_state *tok);
42static void tok_backup(struct tok_state *tok, int c);
43static int syntaxerror(struct tok_state *tok, const char *format, ...);
44
45/* Spaces in this constant are treated as "zero or more spaces or tabs" when
46   tokenizing. */
47static const char* type_comment_prefix = "# type: ";
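/* Examples of comments matched by this prefix when tok->type_comments is
   enabled (each space in the prefix stands for any run of spaces or tabs):
       # type: int
       #   type:   ignore[assignment]
   The "ignore" form is returned as a TYPE_IGNORE token, every other match as
   TYPE_COMMENT; see the handling in tok_get(). */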
48
49/* Create and initialize a new tok_state structure */
50
51static struct tok_state *
52tok_new(void)
53{
54    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
55                                            sizeof(struct tok_state));
56    if (tok == NULL)
57        return NULL;
58    tok->buf = tok->cur = tok->inp = NULL;
59    tok->fp_interactive = 0;
60    tok->interactive_src_start = NULL;
61    tok->interactive_src_end = NULL;
62    tok->start = NULL;
63    tok->end = NULL;
64    tok->done = E_OK;
65    tok->fp = NULL;
66    tok->input = NULL;
67    tok->tabsize = TABSIZE;
68    tok->indent = 0;
69    tok->indstack[0] = 0;
70    tok->atbol = 1;
71    tok->pendin = 0;
72    tok->prompt = tok->nextprompt = NULL;
73    tok->lineno = 0;
74    tok->level = 0;
75    tok->altindstack[0] = 0;
76    tok->decoding_state = STATE_INIT;
77    tok->decoding_erred = 0;
78    tok->enc = NULL;
79    tok->encoding = NULL;
80    tok->cont_line = 0;
81    tok->filename = NULL;
82    tok->decoding_readline = NULL;
83    tok->decoding_buffer = NULL;
84    tok->type_comments = 0;
85    tok->async_hacks = 0;
86    tok->async_def = 0;
87    tok->async_def_indent = 0;
88    tok->async_def_nl = 0;
89    tok->interactive_underflow = IUNDERFLOW_NORMAL;
90    tok->str = NULL;
91    tok->report_warnings = 1;
92    return tok;
93}
94
95static char *
96new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97{
98    char* result = (char *)PyMem_Malloc(len + 1);
99    if (!result) {
100        tok->done = E_NOMEM;
101        return NULL;
102    }
103    memcpy(result, s, len);
104    result[len] = '\0';
105    return result;
106}
107
108static char *
109error_ret(struct tok_state *tok) /* XXX */
110{
111    tok->decoding_erred = 1;
112    if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
113        PyMem_Free(tok->buf);
114    tok->buf = tok->cur = tok->inp = NULL;
115    tok->start = NULL;
116    tok->end = NULL;
117    tok->done = E_DECODE;
118    return NULL;                /* as if it were EOF */
119}
120
121
122static const char *
123get_normal_name(const char *s)  /* for utf-8 and latin-1 */
124{
125    char buf[13];
126    int i;
127    for (i = 0; i < 12; i++) {
128        int c = s[i];
129        if (c == '\0')
130            break;
131        else if (c == '_')
132            buf[i] = '-';
133        else
134            buf[i] = tolower(c);
135    }
136    buf[i] = '\0';
137    if (strcmp(buf, "utf-8") == 0 ||
138        strncmp(buf, "utf-8-", 6) == 0)
139        return "utf-8";
140    else if (strcmp(buf, "latin-1") == 0 ||
141             strcmp(buf, "iso-8859-1") == 0 ||
142             strcmp(buf, "iso-latin-1") == 0 ||
143             strncmp(buf, "latin-1-", 8) == 0 ||
144             strncmp(buf, "iso-8859-1-", 11) == 0 ||
145             strncmp(buf, "iso-latin-1-", 12) == 0)
146        return "iso-8859-1";
147    else
148        return s;
149}
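/* For example, "UTF_8" and "utf-8-sig" both normalize to "utf-8", while
   "Latin_1" and "ISO-8859-1" normalize to "iso-8859-1".  Anything else, such
   as "cp1252", is returned unchanged. */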
150
151/* Find the coding spec in S and store a copy of it in *spec, or NULL if none is found.  Return 0 only on allocation failure. */
152
153static int
154get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
155{
156    Py_ssize_t i;
157    *spec = NULL;
158    /* Coding spec must be in a comment, and that comment must be
159     * the only statement on the source code line. */
160    for (i = 0; i < size - 6; i++) {
161        if (s[i] == '#')
162            break;
163        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
164            return 1;
165    }
166    for (; i < size - 6; i++) { /* XXX inefficient search */
167        const char* t = s + i;
168        if (memcmp(t, "coding", 6) == 0) {
169            const char* begin = NULL;
170            t += 6;
171            if (t[0] != ':' && t[0] != '=')
172                continue;
173            do {
174                t++;
175            } while (t[0] == ' ' || t[0] == '\t');
176
177            begin = t;
178            while (Py_ISALNUM(t[0]) ||
179                   t[0] == '-' || t[0] == '_' || t[0] == '.')
180                t++;
181
182            if (begin < t) {
183                char* r = new_string(begin, t - begin, tok);
184                const char* q;
185                if (!r)
186                    return 0;
187                q = get_normal_name(r);
188                if (r != q) {
189                    PyMem_Free(r);
190                    r = new_string(q, strlen(q), tok);
191                    if (!r)
192                        return 0;
193                }
194                *spec = r;
195                break;
196            }
197        }
198    }
199    return 1;
200}
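/* Typical PEP 263 declarations picked up here, provided they appear on one
   of the first two source lines (enforced by the callers; see decode_str and
   tok_underflow_file):
       # -*- coding: iso-8859-1 -*-
       # vim: set fileencoding=utf-8 :
   Any non-blank character before the '#' makes the function give up without
   reporting a spec. */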
201
202/* Check whether the line contains a coding spec. If it does,
203   invoke the set_readline function for the new encoding.
204   set_readline receives the tok_state and the new encoding.
205   Return 1 on success, 0 on failure.  */
206
207static int
208check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
209                  int set_readline(struct tok_state *, const char *))
210{
211    char *cs;
212    if (tok->cont_line) {
213        /* It's a continuation line, so it can't be a coding spec. */
214        tok->decoding_state = STATE_NORMAL;
215        return 1;
216    }
217    if (!get_coding_spec(line, &cs, size, tok)) {
218        return 0;
219    }
220    if (!cs) {
221        Py_ssize_t i;
222        for (i = 0; i < size; i++) {
223            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
224                break;
225            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
226                /* Stop checking coding spec after a line containing
227                 * anything except a comment. */
228                tok->decoding_state = STATE_NORMAL;
229                break;
230            }
231        }
232        return 1;
233    }
234    tok->decoding_state = STATE_NORMAL;
235    if (tok->encoding == NULL) {
236        assert(tok->decoding_readline == NULL);
237        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
238            error_ret(tok);
239            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
240            PyMem_Free(cs);
241            return 0;
242        }
243        tok->encoding = cs;
244    } else {                /* then, compare cs with BOM */
245        if (strcmp(tok->encoding, cs) != 0) {
246            error_ret(tok);
247            PyErr_Format(PyExc_SyntaxError,
248                         "encoding problem: %s with BOM", cs);
249            PyMem_Free(cs);
250            return 0;
251        }
252        PyMem_Free(cs);
253    }
254    return 1;
255}
256
257/* See whether the file starts with a BOM. If it does,
258   invoke the set_readline function with the new encoding.
259   Return 1 on success, 0 on failure.  */
260
261static int
262check_bom(int get_char(struct tok_state *),
263          void unget_char(int, struct tok_state *),
264          int set_readline(struct tok_state *, const char *),
265          struct tok_state *tok)
266{
267    int ch1, ch2, ch3;
268    ch1 = get_char(tok);
269    tok->decoding_state = STATE_SEEK_CODING;
270    if (ch1 == EOF) {
271        return 1;
272    } else if (ch1 == 0xEF) {
273        ch2 = get_char(tok);
274        if (ch2 != 0xBB) {
275            unget_char(ch2, tok);
276            unget_char(ch1, tok);
277            return 1;
278        }
279        ch3 = get_char(tok);
280        if (ch3 != 0xBF) {
281            unget_char(ch3, tok);
282            unget_char(ch2, tok);
283            unget_char(ch1, tok);
284            return 1;
285        }
286    } else {
287        unget_char(ch1, tok);
288        return 1;
289    }
290    if (tok->encoding != NULL)
291        PyMem_Free(tok->encoding);
292    tok->encoding = new_string("utf-8", 5, tok);
293    if (!tok->encoding)
294        return 0;
295    /* No need to set_readline: input is already utf-8 */
296    return 1;
297}
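/* The only BOM handled here is UTF-8's, i.e. the byte sequence
   0xEF 0xBB 0xBF: it is consumed and the encoding is pinned to "utf-8".
   Other BOMs (for instance UTF-16's 0xFF 0xFE) are simply pushed back and
   left in the input stream. */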
298
299static int
300tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
301    assert(tok->fp_interactive);
302
303    if (!line) {
304        return 0;
305    }
306
307    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
308    Py_ssize_t line_size = strlen(line);
309    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
310    if (last_char != '\n') {
311        line_size += 1;
312    }
313    char* new_str = tok->interactive_src_start;
314
315    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
316    if (!new_str) {
317        if (tok->interactive_src_start) {
318            PyMem_Free(tok->interactive_src_start);
319        }
320        tok->interactive_src_start = NULL;
321        tok->interactive_src_end = NULL;
322        tok->done = E_NOMEM;
323        return -1;
324    }
325    strcpy(new_str + current_size, line);
326    if (last_char != '\n') {
327        /* Last line does not end in \n, fake one */
328        new_str[current_size + line_size - 1] = '\n';
329        new_str[current_size + line_size] = '\0';
330    }
331    tok->interactive_src_start = new_str;
332    tok->interactive_src_end = new_str + current_size + line_size;
333    return 0;
334}
335
336
337/* Read a line of text from TOK into S, using the stream in TOK.
338   Return NULL on failure, else S.
339
340   On entry, tok->decoding_buffer will be one of:
341     1) NULL: need to call tok->decoding_readline to get a new line
342     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
343       stored the result in tok->decoding_buffer
344     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
345       (in the s buffer) to copy entire contents of the line read
346       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
347       In this case, tok_readline_recode is called in a loop (with an expanded buffer)
348       until the buffer ends with a '\n' (or until the end of the file is
349       reached): see tok_nextc and its calls to tok_reserve_buf.
350*/
351
352static int
353tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
354{
355    Py_ssize_t cur = tok->cur - tok->buf;
356    Py_ssize_t oldsize = tok->inp - tok->buf;
357    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
358    if (newsize > tok->end - tok->buf) {
359        char *newbuf = tok->buf;
360        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
361        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
362        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
363        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
364        if (newbuf == NULL) {
365            tok->done = E_NOMEM;
366            return 0;
367        }
368        tok->buf = newbuf;
369        tok->cur = tok->buf + cur;
370        tok->inp = tok->buf + oldsize;
371        tok->end = tok->buf + newsize;
372        tok->start = start < 0 ? NULL : tok->buf + start;
373        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
374        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
375    }
376    return 1;
377}
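/* Growth example: with oldsize == 4000 bytes already buffered and a request
   for 100 more, newsize = 4000 + Py_MAX(100, 2000) = 6000.  Growing by at
   least 50% each time keeps the number of reallocations logarithmic in the
   total amount of input read. */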
378
379static inline int
380contains_null_bytes(const char* str, size_t size) {
381    return memchr(str, 0, size) != NULL;
382}
383
384static int
385tok_readline_recode(struct tok_state *tok) {
386    PyObject *line;
387    const char *buf;
388    Py_ssize_t buflen;
389    line = tok->decoding_buffer;
390    if (line == NULL) {
391        line = PyObject_CallNoArgs(tok->decoding_readline);
392        if (line == NULL) {
393            error_ret(tok);
394            goto error;
395        }
396    }
397    else {
398        tok->decoding_buffer = NULL;
399    }
400    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
401    if (buf == NULL) {
402        error_ret(tok);
403        goto error;
404    }
405    // Make room for the null terminator *and* potentially
406    // an extra newline character that we may need to artificially
407    // add.
408    size_t buffer_size = buflen + 2;
409    if (!tok_reserve_buf(tok, buffer_size)) {
410        goto error;
411    }
412    memcpy(tok->inp, buf, buflen);
413    tok->inp += buflen;
414    *tok->inp = '\0';
415    if (tok->fp_interactive &&
416        tok_concatenate_interactive_new_line(tok, buf) == -1) {
417        goto error;
418    }
419    Py_DECREF(line);
420    return 1;
421error:
422    Py_XDECREF(line);
423    return 0;
424}
425
426/* Set tok->decoding_readline to the readline method of a text-mode
427   stream opened over TOK's file (via the io module) with encoding ENC.
428
429   This function is passed to check_bom and check_coding_spec.
430
431   ENC is usually identical to the future value of tok->encoding,
432   except for the (currently unsupported) case of UTF-16.
433
434   Return 1 on success, 0 on failure. */
435
436static int
437fp_setreadl(struct tok_state *tok, const char* enc)
438{
439    PyObject *readline, *io, *stream;
440    int fd;
441    long pos;
442
443    fd = fileno(tok->fp);
444    /* Due to buffering the file offset for fd can be different from the file
445     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
446     * its file position counts CRLF as one char and can't be directly mapped
447     * to the file offset for fd.  Instead we step back one byte and read to
448     * the end of line.*/
449    pos = ftell(tok->fp);
450    if (pos == -1 ||
451        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
452        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
453        return 0;
454    }
455
456    io = PyImport_ImportModule("io");
457    if (io == NULL) {
458        return 0;
459    }
460    stream = _PyObject_CallMethod(io, &_Py_ID(open), "isisOOO",
461                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
462    Py_DECREF(io);
463    if (stream == NULL) {
464        return 0;
465    }
466
467    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
468    Py_DECREF(stream);
469    if (readline == NULL) {
470        return 0;
471    }
472    Py_XSETREF(tok->decoding_readline, readline);
473
474    if (pos > 0) {
475        PyObject *bufobj = _PyObject_CallNoArgs(readline);
476        if (bufobj == NULL) {
477            return 0;
478        }
479        Py_DECREF(bufobj);
480    }
481
482    return 1;
483}
484
485/* Fetch the next byte from TOK. */
486
487static int fp_getc(struct tok_state *tok) {
488    return getc(tok->fp);
489}
490
491/* Unfetch the last byte back into TOK.  */
492
493static void fp_ungetc(int c, struct tok_state *tok) {
494    ungetc(c, tok->fp);
495}
496
497/* Check whether the bytes at s start a valid
498   UTF-8 sequence.  Return the length of the sequence in bytes
499   if so, 0 if not.  The special cases match
500   those in stringlib/codecs.h:utf8_decode.
501*/
502static int
503valid_utf8(const unsigned char* s)
504{
505    int expected = 0;
506    int length;
507    if (*s < 0x80) {
508        /* single-byte code */
509        return 1;
510    }
511    else if (*s < 0xE0) {
512        /* \xC2\x80-\xDF\xBF -- 0080-07FF */
513        if (*s < 0xC2) {
514            /* invalid sequence
515               \x80-\xBF -- continuation byte
516               \xC0-\xC1 -- fake 0000-007F */
517            return 0;
518        }
519        expected = 1;
520    }
521    else if (*s < 0xF0) {
522        /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
523        if (*s == 0xE0 && *(s + 1) < 0xA0) {
524            /* invalid sequence
525               \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
526            return 0;
527        }
528        else if (*s == 0xED && *(s + 1) >= 0xA0) {
529            /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
530               will result in surrogates in range D800-DFFF. Surrogates are
531               not valid UTF-8 so they are rejected.
532               See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
533               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
534            return 0;
535        }
536        expected = 2;
537    }
538    else if (*s < 0xF5) {
539        /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
540        if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
541            /* invalid sequence -- one of:
542               \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
543               \xF4\x90\x80\x80- -- 110000- overflow */
544            return 0;
545        }
546        expected = 3;
547    }
548    else {
549        /* invalid start byte */
550        return 0;
551    }
552    length = expected + 1;
553    for (; expected; expected--)
554        if (s[expected] < 0x80 || s[expected] >= 0xC0)
555            return 0;
556    return length;
557}
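/* Worked example: "é" is encoded as 0xC3 0xA9.  The start byte 0xC3 lies in
   the two-byte range 0xC2..0xDF, so expected = 1; the trailing byte 0xA9 is
   a valid continuation byte (0x80..0xBF), so the function returns 2.  A lone
   continuation byte such as 0x80 returns 0. */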
558
559static int
560ensure_utf8(char *line, struct tok_state *tok)
561{
562    int badchar = 0;
563    unsigned char *c;
564    int length;
565    for (c = (unsigned char *)line; *c; c += length) {
566        if (!(length = valid_utf8(c))) {
567            badchar = *c;
568            break;
569        }
570    }
571    if (badchar) {
572        PyErr_Format(PyExc_SyntaxError,
573                     "Non-UTF-8 code starting with '\\x%.2x' "
574                     "in file %U on line %i, "
575                     "but no encoding declared; "
576                     "see https://peps.python.org/pep-0263/ for details",
577                     badchar, tok->filename, tok->lineno);
578        return 0;
579    }
580    return 1;
581}
582
583/* Fetch a byte from TOK, using the string buffer. */
584
585static int
586buf_getc(struct tok_state *tok) {
587    return Py_CHARMASK(*tok->str++);
588}
589
590/* Unfetch a byte from TOK, using the string buffer. */
591
592static void
593buf_ungetc(int c, struct tok_state *tok) {
594    tok->str--;
595    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
596}
597
598/* Set the readline function for TOK to ENC. For the string-based
599   tokenizer, this means to just record the encoding. */
600
601static int
602buf_setreadl(struct tok_state *tok, const char* enc) {
603    tok->enc = enc;
604    return 1;
605}
606
607/* Return a UTF-8 encoded Python bytes object decoded from the
608   C byte string STR, which is encoded with ENC. */
609
610static PyObject *
611translate_into_utf8(const char* str, const char* enc) {
612    PyObject *utf8;
613    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
614    if (buf == NULL)
615        return NULL;
616    utf8 = PyUnicode_AsUTF8String(buf);
617    Py_DECREF(buf);
618    return utf8;
619}
620
621
622static char *
623translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
624    int skip_next_lf = 0;
625    size_t needed_length = strlen(s) + 2, final_length;
626    char *buf, *current;
627    char c = '\0';
628    buf = PyMem_Malloc(needed_length);
629    if (buf == NULL) {
630        tok->done = E_NOMEM;
631        return NULL;
632    }
633    for (current = buf; *s; s++, current++) {
634        c = *s;
635        if (skip_next_lf) {
636            skip_next_lf = 0;
637            if (c == '\n') {
638                c = *++s;
639                if (!c)
640                    break;
641            }
642        }
643        if (c == '\r') {
644            skip_next_lf = 1;
645            c = '\n';
646        }
647        *current = c;
648    }
649    /* If this is exec input, add a newline to the end of the string if
650       there isn't one already. */
651    if (exec_input && c != '\n') {
652        *current = '\n';
653        current++;
654    }
655    *current = '\0';
656    final_length = current - buf + 1;
657    if (final_length < needed_length && final_length) {
658        /* should never fail */
659        char* result = PyMem_Realloc(buf, final_length);
660        if (result == NULL) {
661            PyMem_Free(buf);
662        }
663        buf = result;
664    }
665    return buf;
666}
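/* Example: with exec_input set, "a = 1\r\nb = 2\r" becomes "a = 1\nb = 2\n".
   Both CRLF and a lone CR are folded into a single '\n', and a trailing
   newline is appended only when the translated text does not already end
   with one. */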
667
668/* Decode a byte string STR for use as the buffer of TOK.
669   Look for encoding declarations inside STR, and record them
670   inside TOK.  */
671
672static char *
673decode_str(const char *input, int single, struct tok_state *tok)
674{
675    PyObject* utf8 = NULL;
676    char *str;
677    const char *s;
678    const char *newl[2] = {NULL, NULL};
679    int lineno = 0;
680    tok->input = str = translate_newlines(input, single, tok);
681    if (str == NULL)
682        return NULL;
683    tok->enc = NULL;
684    tok->str = str;
685    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
686        return error_ret(tok);
687    str = tok->str;             /* string after BOM if any */
688    assert(str);
689    if (tok->enc != NULL) {
690        utf8 = translate_into_utf8(str, tok->enc);
691        if (utf8 == NULL)
692            return error_ret(tok);
693        str = PyBytes_AsString(utf8);
694    }
695    for (s = str;; s++) {
696        if (*s == '\0') break;
697        else if (*s == '\n') {
698            assert(lineno < 2);
699            newl[lineno] = s;
700            lineno++;
701            if (lineno == 2) break;
702        }
703    }
704    tok->enc = NULL;
705    /* need to check line 1 and 2 separately since check_coding_spec
706       assumes a single line as input */
707    if (newl[0]) {
708        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
709            return NULL;
710        }
711        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
712            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
713                                   tok, buf_setreadl))
714                return NULL;
715        }
716    }
717    if (tok->enc != NULL) {
718        assert(utf8 == NULL);
719        utf8 = translate_into_utf8(str, tok->enc);
720        if (utf8 == NULL)
721            return error_ret(tok);
722        str = PyBytes_AS_STRING(utf8);
723    }
724    assert(tok->decoding_buffer == NULL);
725    tok->decoding_buffer = utf8; /* CAUTION */
726    return str;
727}
728
729/* Set up tokenizer for string */
730
731struct tok_state *
732_PyTokenizer_FromString(const char *str, int exec_input)
733{
734    struct tok_state *tok = tok_new();
735    char *decoded;
736
737    if (tok == NULL)
738        return NULL;
739    decoded = decode_str(str, exec_input, tok);
740    if (decoded == NULL) {
741        _PyTokenizer_Free(tok);
742        return NULL;
743    }
744
745    tok->buf = tok->cur = tok->inp = decoded;
746    tok->end = decoded;
747    return tok;
748}
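/* Minimal usage sketch.  tok_get() below is static to this file, so outside
   code is assumed to go through the _PyTokenizer_* entry points (declared in
   tokenizer.h) and hand the state to the parser:

       struct tok_state *tok = _PyTokenizer_FromString("x = 1\n", 1);
       if (tok == NULL) {
           // out of memory or a source decoding error
       }
       // ... the parser pulls tokens from tok ...
       _PyTokenizer_Free(tok);
*/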
749
750/* Set up tokenizer for UTF-8 string */
751
752struct tok_state *
753_PyTokenizer_FromUTF8(const char *str, int exec_input)
754{
755    struct tok_state *tok = tok_new();
756    char *translated;
757    if (tok == NULL)
758        return NULL;
759    tok->input = translated = translate_newlines(str, exec_input, tok);
760    if (translated == NULL) {
761        _PyTokenizer_Free(tok);
762        return NULL;
763    }
764    tok->decoding_state = STATE_NORMAL;
765    tok->enc = NULL;
766    tok->str = translated;
767    tok->encoding = new_string("utf-8", 5, tok);
768    if (!tok->encoding) {
769        _PyTokenizer_Free(tok);
770        return NULL;
771    }
772
773    tok->buf = tok->cur = tok->inp = translated;
774    tok->end = translated;
775    return tok;
776}
777
778/* Set up tokenizer for file */
779
780struct tok_state *
781_PyTokenizer_FromFile(FILE *fp, const char* enc,
782                      const char *ps1, const char *ps2)
783{
784    struct tok_state *tok = tok_new();
785    if (tok == NULL)
786        return NULL;
787    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
788        _PyTokenizer_Free(tok);
789        return NULL;
790    }
791    tok->cur = tok->inp = tok->buf;
792    tok->end = tok->buf + BUFSIZ;
793    tok->fp = fp;
794    tok->prompt = ps1;
795    tok->nextprompt = ps2;
796    if (enc != NULL) {
797        /* Must copy encoding declaration since it
798           gets copied into the parse tree. */
799        tok->encoding = new_string(enc, strlen(enc), tok);
800        if (!tok->encoding) {
801            _PyTokenizer_Free(tok);
802            return NULL;
803        }
804        tok->decoding_state = STATE_NORMAL;
805    }
806    return tok;
807}
808
809/* Free a tok_state structure */
810
811void
812_PyTokenizer_Free(struct tok_state *tok)
813{
814    if (tok->encoding != NULL) {
815        PyMem_Free(tok->encoding);
816    }
817    Py_XDECREF(tok->decoding_readline);
818    Py_XDECREF(tok->decoding_buffer);
819    Py_XDECREF(tok->filename);
820    if (tok->fp != NULL && tok->buf != NULL) {
821        PyMem_Free(tok->buf);
822    }
823    if (tok->input) {
824        PyMem_Free(tok->input);
825    }
826    if (tok->interactive_src_start != NULL) {
827        PyMem_Free(tok->interactive_src_start);
828    }
829    PyMem_Free(tok);
830}
831
832static int
833tok_readline_raw(struct tok_state *tok)
834{
835    do {
836        if (!tok_reserve_buf(tok, BUFSIZ)) {
837            return 0;
838        }
839        int n_chars = (int)(tok->end - tok->inp);
840        size_t line_size = 0;
841        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
842        if (line == NULL) {
843            return 1;
844        }
845        if (tok->fp_interactive &&
846            tok_concatenate_interactive_new_line(tok, line) == -1) {
847            return 0;
848        }
849        tok->inp += line_size;
850        if (tok->inp == tok->buf) {
851            return 0;
852        }
853    } while (tok->inp[-1] != '\n');
854    return 1;
855}
856
857static int
858tok_underflow_string(struct tok_state *tok) {
859    char *end = strchr(tok->inp, '\n');
860    if (end != NULL) {
861        end++;
862    }
863    else {
864        end = strchr(tok->inp, '\0');
865        if (end == tok->inp) {
866            tok->done = E_EOF;
867            return 0;
868        }
869    }
870    if (tok->start == NULL) {
871        tok->buf = tok->cur;
872    }
873    tok->line_start = tok->cur;
874    tok->lineno++;
875    tok->inp = end;
876    return 1;
877}
878
879static int
880tok_underflow_interactive(struct tok_state *tok) {
881    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
882        tok->done = E_INTERACT_STOP;
883        return 1;
884    }
885    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
886    if (newtok != NULL) {
887        char *translated = translate_newlines(newtok, 0, tok);
888        PyMem_Free(newtok);
889        if (translated == NULL) {
890            return 0;
891        }
892        newtok = translated;
893    }
894    if (tok->encoding && newtok && *newtok) {
895        /* Recode to UTF-8 */
896        Py_ssize_t buflen;
897        const char* buf;
898        PyObject *u = translate_into_utf8(newtok, tok->encoding);
899        PyMem_Free(newtok);
900        if (u == NULL) {
901            tok->done = E_DECODE;
902            return 0;
903        }
904        buflen = PyBytes_GET_SIZE(u);
905        buf = PyBytes_AS_STRING(u);
906        newtok = PyMem_Malloc(buflen+1);
907        if (newtok == NULL) {
908            Py_DECREF(u);
909            tok->done = E_NOMEM;
910            return 0;
911        }
912        strcpy(newtok, buf);
913        Py_DECREF(u);
914    }
915    if (tok->fp_interactive &&
916        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
917        PyMem_Free(newtok);
918        return 0;
919    }
920    if (tok->nextprompt != NULL) {
921        tok->prompt = tok->nextprompt;
922    }
923    if (newtok == NULL) {
924        tok->done = E_INTR;
925    }
926    else if (*newtok == '\0') {
927        PyMem_Free(newtok);
928        tok->done = E_EOF;
929    }
930    else if (tok->start != NULL) {
931        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
932        size_t size = strlen(newtok);
933        tok->lineno++;
934        if (!tok_reserve_buf(tok, size + 1)) {
935            PyMem_Free(tok->buf);
936            tok->buf = NULL;
937            PyMem_Free(newtok);
938            return 0;
939        }
940        memcpy(tok->cur, newtok, size + 1);
941        PyMem_Free(newtok);
942        tok->inp += size;
943        tok->multi_line_start = tok->buf + cur_multi_line_start;
944    }
945    else {
946        tok->lineno++;
947        PyMem_Free(tok->buf);
948        tok->buf = newtok;
949        tok->cur = tok->buf;
950        tok->line_start = tok->buf;
951        tok->inp = strchr(tok->buf, '\0');
952        tok->end = tok->inp + 1;
953    }
954    if (tok->done != E_OK) {
955        if (tok->prompt != NULL) {
956            PySys_WriteStderr("\n");
957        }
958        return 0;
959    }
960    return 1;
961}
962
963static int
964tok_underflow_file(struct tok_state *tok) {
965    if (tok->start == NULL) {
966        tok->cur = tok->inp = tok->buf;
967    }
968    if (tok->decoding_state == STATE_INIT) {
969        /* We have not yet determined the encoding.
970           If an encoding is found, use the file-pointer
971           reader functions from now on. */
972        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
973            error_ret(tok);
974            return 0;
975        }
976        assert(tok->decoding_state != STATE_INIT);
977    }
978    /* Read until '\n' or EOF */
979    if (tok->decoding_readline != NULL) {
980        /* We already have a codec associated with this input. */
981        if (!tok_readline_recode(tok)) {
982            return 0;
983        }
984    }
985    else {
986        /* We want a 'raw' read. */
987        if (!tok_readline_raw(tok)) {
988            return 0;
989        }
990    }
991    if (tok->inp == tok->cur) {
992        tok->done = E_EOF;
993        return 0;
994    }
995    if (tok->inp[-1] != '\n') {
996        assert(tok->inp + 1 < tok->end);
997        /* Last line does not end in \n, fake one */
998        *tok->inp++ = '\n';
999        *tok->inp = '\0';
1000    }
1001
1002    tok->lineno++;
1003    if (tok->decoding_state != STATE_NORMAL) {
1004        if (tok->lineno > 2) {
1005            tok->decoding_state = STATE_NORMAL;
1006        }
1007        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
1008                                    tok, fp_setreadl))
1009        {
1010            return 0;
1011        }
1012    }
1013    /* The default encoding is UTF-8, so make sure we don't have any
1014       non-UTF-8 sequences in it. */
1015    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
1016        error_ret(tok);
1017        return 0;
1018    }
1019    assert(tok->done == E_OK);
1020    return tok->done == E_OK;
1021}
1022
1023#if defined(Py_DEBUG)
1024static void
1025print_escape(FILE *f, const char *s, Py_ssize_t size)
1026{
1027    if (s == NULL) {
1028        fputs("NULL", f);
1029        return;
1030    }
1031    putc('"', f);
1032    while (size-- > 0) {
1033        unsigned char c = *s++;
1034        switch (c) {
1035            case '\n': fputs("\\n", f); break;
1036            case '\r': fputs("\\r", f); break;
1037            case '\t': fputs("\\t", f); break;
1038            case '\f': fputs("\\f", f); break;
1039            case '\'': fputs("\\'", f); break;
1040            case '"': fputs("\\\"", f); break;
1041            default:
1042                if (0x20 <= c && c <= 0x7f)
1043                    putc(c, f);
1044                else
1045                    fprintf(f, "\\x%02x", c);
1046        }
1047    }
1048    putc('"', f);
1049}
1050#endif
1051
1052/* Get next char, updating state; error code goes into tok->done */
1053
1054static int
1055tok_nextc(struct tok_state *tok)
1056{
1057    int rc;
1058    for (;;) {
1059        if (tok->cur != tok->inp) {
1060            return Py_CHARMASK(*tok->cur++); /* Fast path */
1061        }
1062        if (tok->done != E_OK) {
1063           return EOF;
1064        }
1065        if (tok->fp == NULL) {
1066            rc = tok_underflow_string(tok);
1067        }
1068        else if (tok->prompt != NULL) {
1069            rc = tok_underflow_interactive(tok);
1070        }
1071        else {
1072            rc = tok_underflow_file(tok);
1073        }
1074#if defined(Py_DEBUG)
1075        if (Py_DebugFlag) {
1076            fprintf(stderr, "line[%d] = ", tok->lineno);
1077            print_escape(stderr, tok->cur, tok->inp - tok->cur);
1078            fprintf(stderr, "  tok->done = %d\n", tok->done);
1079        }
1080#endif
1081        if (!rc) {
1082            tok->cur = tok->inp;
1083            return EOF;
1084        }
1085        tok->line_start = tok->cur;
1086
1087        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
1088            syntaxerror(tok, "source code cannot contain null bytes");
1089            tok->cur = tok->inp;
1090            return EOF;
1091        }
1092    }
1093    Py_UNREACHABLE();
1094}
1095
1096/* Back-up one character */
1097
1098static void
1099tok_backup(struct tok_state *tok, int c)
1100{
1101    if (c != EOF) {
1102        if (--tok->cur < tok->buf) {
1103            Py_FatalError("tokenizer beginning of buffer");
1104        }
1105        if ((int)(unsigned char)*tok->cur != c) {
1106            Py_FatalError("tok_backup: wrong character");
1107        }
1108    }
1109}
1110
1111static int
1112_syntaxerror_range(struct tok_state *tok, const char *format,
1113                   int col_offset, int end_col_offset,
1114                   va_list vargs)
1115{
1116    PyObject *errmsg, *errtext, *args;
1117    errmsg = PyUnicode_FromFormatV(format, vargs);
1118    if (!errmsg) {
1119        goto error;
1120    }
1121
1122    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1123                                   "replace");
1124    if (!errtext) {
1125        goto error;
1126    }
1127
1128    if (col_offset == -1) {
1129        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1130    }
1131    if (end_col_offset == -1) {
1132        end_col_offset = col_offset;
1133    }
1134
1135    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1136    if (line_len != tok->cur - tok->line_start) {
1137        Py_DECREF(errtext);
1138        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1139                                       "replace");
1140    }
1141    if (!errtext) {
1142        goto error;
1143    }
1144
1145    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1146                         col_offset, errtext, tok->lineno, end_col_offset);
1147    if (args) {
1148        PyErr_SetObject(PyExc_SyntaxError, args);
1149        Py_DECREF(args);
1150    }
1151
1152error:
1153    Py_XDECREF(errmsg);
1154    tok->done = E_ERROR;
1155    return ERRORTOKEN;
1156}
1157
1158static int
1159syntaxerror(struct tok_state *tok, const char *format, ...)
1160{
1161    va_list vargs;
1162#ifdef HAVE_STDARG_PROTOTYPES
1163    va_start(vargs, format);
1164#else
1165    va_start(vargs);
1166#endif
1167    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1168    va_end(vargs);
1169    return ret;
1170}
1171
1172static int
1173syntaxerror_known_range(struct tok_state *tok,
1174                        int col_offset, int end_col_offset,
1175                        const char *format, ...)
1176{
1177    va_list vargs;
1178#ifdef HAVE_STDARG_PROTOTYPES
1179    va_start(vargs, format);
1180#else
1181    va_start(vargs);
1182#endif
1183    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1184    va_end(vargs);
1185    return ret;
1186}
1187
1188
1189
1190static int
1191indenterror(struct tok_state *tok)
1192{
1193    tok->done = E_TABSPACE;
1194    tok->cur = tok->inp;
1195    return ERRORTOKEN;
1196}
1197
1198static int
1199parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
1200{
1201    if (!tok->report_warnings) {
1202        return 0;
1203    }
1204
1205    PyObject *errmsg;
1206    va_list vargs;
1207#ifdef HAVE_STDARG_PROTOTYPES
1208    va_start(vargs, format);
1209#else
1210    va_start(vargs);
1211#endif
1212    errmsg = PyUnicode_FromFormatV(format, vargs);
1213    va_end(vargs);
1214    if (!errmsg) {
1215        goto error;
1216    }
1217
1218    if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,
1219                                 tok->lineno, NULL, NULL) < 0) {
1220        if (PyErr_ExceptionMatches(category)) {
1221            /* Replace the DeprecationWarning exception with a SyntaxError
1222               to get a more accurate error report */
1223            PyErr_Clear();
1224            syntaxerror(tok, "%U", errmsg);
1225        }
1226        goto error;
1227    }
1228    Py_DECREF(errmsg);
1229    return 0;
1230
1231error:
1232    Py_XDECREF(errmsg);
1233    tok->done = E_ERROR;
1234    return -1;
1235}
1236
1237static int
1238lookahead(struct tok_state *tok, const char *test)
1239{
1240    const char *s = test;
1241    int res = 0;
1242    while (1) {
1243        int c = tok_nextc(tok);
1244        if (*s == 0) {
1245            res = !is_potential_identifier_char(c);
1246        }
1247        else if (c == *s) {
1248            s++;
1249            continue;
1250        }
1251
1252        tok_backup(tok, c);
1253        while (s != test) {
1254            tok_backup(tok, *--s);
1255        }
1256        return res;
1257    }
1258}
1259
1260static int
1261verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1262{
1263    /* Emit a deprecation warning only if the numeric literal is immediately
1264     * followed by one of the keywords that can occur after a numeric literal
1265     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
1266     * This allows existing valid code to be deprecated gradually, without
1267     * emitting a warning before the error in most cases of an invalid
1268     * numeric literal (which would be confusing and break existing tests).
1269     * Raise a syntax error with a slightly better message than plain
1270     * "invalid syntax" if the numeric literal is immediately followed by
1271     * another keyword or identifier.
1272     */
1273    int r = 0;
1274    if (c == 'a') {
1275        r = lookahead(tok, "nd");
1276    }
1277    else if (c == 'e') {
1278        r = lookahead(tok, "lse");
1279    }
1280    else if (c == 'f') {
1281        r = lookahead(tok, "or");
1282    }
1283    else if (c == 'i') {
1284        int c2 = tok_nextc(tok);
1285        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1286            r = 1;
1287        }
1288        tok_backup(tok, c2);
1289    }
1290    else if (c == 'o') {
1291        r = lookahead(tok, "r");
1292    }
1293    else if (c == 'n') {
1294        r = lookahead(tok, "ot");
1295    }
1296    if (r) {
1297        tok_backup(tok, c);
1298        if (parser_warn(tok, PyExc_SyntaxWarning,
1299                "invalid %s literal", kind))
1300        {
1301            return 0;
1302        }
1303        tok_nextc(tok);
1304    }
1305    else /* In future releases, only error will remain. */
1306    if (is_potential_identifier_char(c)) {
1307        tok_backup(tok, c);
1308        syntaxerror(tok, "invalid %s literal", kind);
1309        return 0;
1310    }
1311    return 1;
1312}
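/* Concrete example: "0x1in x" (a hexadecimal literal immediately followed by
   the keyword "in") currently only triggers the SyntaxWarning path above,
   whereas "0x1spam" falls through to the is_potential_identifier_char()
   branch and is rejected immediately with "invalid hexadecimal literal". */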
1313
1314/* Verify that the identifier follows PEP 3131.
1315   All identifier strings are guaranteed to be "ready" unicode objects.
1316 */
1317static int
1318verify_identifier(struct tok_state *tok)
1319{
1320    PyObject *s;
1321    if (tok->decoding_erred)
1322        return 0;
1323    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1324    if (s == NULL) {
1325        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1326            tok->done = E_DECODE;
1327        }
1328        else {
1329            tok->done = E_ERROR;
1330        }
1331        return 0;
1332    }
1333    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1334    if (invalid < 0) {
1335        Py_DECREF(s);
1336        tok->done = E_ERROR;
1337        return 0;
1338    }
1339    assert(PyUnicode_GET_LENGTH(s) > 0);
1340    if (invalid < PyUnicode_GET_LENGTH(s)) {
1341        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1342        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1343            /* Determine the offset in UTF-8 encoded input */
1344            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1345            if (s != NULL) {
1346                Py_SETREF(s, PyUnicode_AsUTF8String(s));
1347            }
1348            if (s == NULL) {
1349                tok->done = E_ERROR;
1350                return 0;
1351            }
1352            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1353        }
1354        Py_DECREF(s);
1355        // PyUnicode_FromFormatV() does not support %X
1356        char hex[9];
1357        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1358        if (Py_UNICODE_ISPRINTABLE(ch)) {
1359            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1360        }
1361        else {
1362            syntaxerror(tok, "invalid non-printable character U+%s", hex);
1363        }
1364        return 0;
1365    }
1366    Py_DECREF(s);
1367    return 1;
1368}
1369
1370static int
1371tok_decimal_tail(struct tok_state *tok)
1372{
1373    int c;
1374
1375    while (1) {
1376        do {
1377            c = tok_nextc(tok);
1378        } while (isdigit(c));
1379        if (c != '_') {
1380            break;
1381        }
1382        c = tok_nextc(tok);
1383        if (!isdigit(c)) {
1384            tok_backup(tok, c);
1385            syntaxerror(tok, "invalid decimal literal");
1386            return 0;
1387        }
1388    }
1389    return c;
1390}
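/* Accepts digit runs with single underscores as grouping separators:
   "1_000_000" is consumed in full, while "1__0" or a trailing "1_" stop with
   "invalid decimal literal", because every underscore must be followed by a
   digit. */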
1391
1392/* Get next token, after space stripping etc. */
1393
1394static inline int
1395tok_continuation_line(struct tok_state *tok) {
1396    int c = tok_nextc(tok);
1397    if (c != '\n') {
1398        tok->done = E_LINECONT;
1399        return -1;
1400    }
1401    c = tok_nextc(tok);
1402    if (c == EOF) {
1403        tok->done = E_EOF;
1404        tok->cur = tok->inp;
1405        return -1;
1406    } else {
1407        tok_backup(tok, c);
1408    }
1409    return c;
1410}
1411
1412static int
1413tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1414{
1415    int c;
1416    int blankline, nonascii;
1417
1418    *p_start = *p_end = NULL;
1419  nextline:
1420    tok->start = NULL;
1421    blankline = 0;
1422
1423    /* Get indentation level */
1424    if (tok->atbol) {
1425        int col = 0;
1426        int altcol = 0;
1427        tok->atbol = 0;
1428        int cont_line_col = 0;
1429        for (;;) {
1430            c = tok_nextc(tok);
1431            if (c == ' ') {
1432                col++, altcol++;
1433            }
1434            else if (c == '\t') {
1435                col = (col / tok->tabsize + 1) * tok->tabsize;
1436                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1437            }
1438            else if (c == '\014')  {/* Control-L (formfeed) */
1439                col = altcol = 0; /* For Emacs users */
1440            }
1441            else if (c == '\\') {
1442                // Indentation cannot be split over multiple physical lines
1443                // using backslashes. This means that if we found a backslash
1444                // preceded by whitespace, **the first one we find** determines
1445                // the level of indentation of whatever comes next.
1446                cont_line_col = cont_line_col ? cont_line_col : col;
1447                if ((c = tok_continuation_line(tok)) == -1) {
1448                    return ERRORTOKEN;
1449                }
1450            }
1451            else {
1452                break;
1453            }
1454        }
1455        tok_backup(tok, c);
1456        if (c == '#' || c == '\n') {
1457            /* Lines with only whitespace and/or comments
1458               shouldn't affect the indentation and are
1459               not passed to the parser as NEWLINE tokens,
1460               except *totally* empty lines in interactive
1461               mode, which signal the end of a command group. */
1462            if (col == 0 && c == '\n' && tok->prompt != NULL) {
1463                blankline = 0; /* Let it through */
1464            }
1465            else if (tok->prompt != NULL && tok->lineno == 1) {
1466                /* In interactive mode, if the first line contains
1467                   only spaces and/or a comment, let it through. */
1468                blankline = 0;
1469                col = altcol = 0;
1470            }
1471            else {
1472                blankline = 1; /* Ignore completely */
1473            }
1474            /* We can't jump back right here since we still
1475               may need to skip to the end of a comment */
1476        }
1477        if (!blankline && tok->level == 0) {
1478            col = cont_line_col ? cont_line_col : col;
1479            altcol = cont_line_col ? cont_line_col : altcol;
1480            if (col == tok->indstack[tok->indent]) {
1481                /* No change */
1482                if (altcol != tok->altindstack[tok->indent]) {
1483                    return indenterror(tok);
1484                }
1485            }
1486            else if (col > tok->indstack[tok->indent]) {
1487                /* Indent -- always one */
1488                if (tok->indent+1 >= MAXINDENT) {
1489                    tok->done = E_TOODEEP;
1490                    tok->cur = tok->inp;
1491                    return ERRORTOKEN;
1492                }
1493                if (altcol <= tok->altindstack[tok->indent]) {
1494                    return indenterror(tok);
1495                }
1496                tok->pendin++;
1497                tok->indstack[++tok->indent] = col;
1498                tok->altindstack[tok->indent] = altcol;
1499            }
1500            else /* col < tok->indstack[tok->indent] */ {
1501                /* Dedent -- any number, must be consistent */
1502                while (tok->indent > 0 &&
1503                    col < tok->indstack[tok->indent]) {
1504                    tok->pendin--;
1505                    tok->indent--;
1506                }
1507                if (col != tok->indstack[tok->indent]) {
1508                    tok->done = E_DEDENT;
1509                    tok->cur = tok->inp;
1510                    return ERRORTOKEN;
1511                }
1512                if (altcol != tok->altindstack[tok->indent]) {
1513                    return indenterror(tok);
1514                }
1515            }
1516        }
1517    }
1518
1519    tok->start = tok->cur;
1520
1521    /* Return pending indents/dedents */
1522    if (tok->pendin != 0) {
1523        if (tok->pendin < 0) {
1524            tok->pendin++;
1525            return DEDENT;
1526        }
1527        else {
1528            tok->pendin--;
1529            return INDENT;
1530        }
1531    }
1532
1533    /* Peek ahead at the next character */
1534    c = tok_nextc(tok);
1535    tok_backup(tok, c);
1536    /* Check if we are closing an async function */
1537    if (tok->async_def
1538        && !blankline
1539        /* Due to some implementation artifacts of type comments,
1540         * a TYPE_COMMENT at the start of a function won't set an
1541         * indentation level and it will produce a NEWLINE after it.
1542         * To avoid spuriously ending an async function due to this,
1543         * wait until we have some non-newline char in front of us. */
1544        && c != '\n'
1545        && tok->level == 0
1546        /* There was a NEWLINE after ASYNC DEF,
1547           so we're past the signature. */
1548        && tok->async_def_nl
1549        /* Current indentation level is no deeper than where
1550           the async function was defined */
1551        && tok->async_def_indent >= tok->indent)
1552    {
1553        tok->async_def = 0;
1554        tok->async_def_indent = 0;
1555        tok->async_def_nl = 0;
1556    }
1557
1558 again:
1559    tok->start = NULL;
1560    /* Skip spaces */
1561    do {
1562        c = tok_nextc(tok);
1563    } while (c == ' ' || c == '\t' || c == '\014');
1564
1565    /* Set start of current token */
1566    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
1567
1568    /* Skip comment, unless it's a type comment */
1569    if (c == '#') {
1570        const char *prefix, *p, *type_start;
1571
1572        while (c != EOF && c != '\n') {
1573            c = tok_nextc(tok);
1574        }
1575
1576        if (tok->type_comments) {
1577            p = tok->start;
1578            prefix = type_comment_prefix;
1579            while (*prefix && p < tok->cur) {
1580                if (*prefix == ' ') {
1581                    while (*p == ' ' || *p == '\t') {
1582                        p++;
1583                    }
1584                } else if (*prefix == *p) {
1585                    p++;
1586                } else {
1587                    break;
1588                }
1589
1590                prefix++;
1591            }
1592
1593            /* This is a type comment if we matched all of type_comment_prefix. */
1594            if (!*prefix) {
1595                int is_type_ignore = 1;
1596                const char *ignore_end = p + 6;
1597                tok_backup(tok, c);  /* don't eat the newline or EOF */
1598
1599                type_start = p;
1600
1601                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1602                 * or anything ASCII and non-alphanumeric. */
1603                is_type_ignore = (
1604                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1605                    && !(tok->cur > ignore_end
1606                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1607
1608                if (is_type_ignore) {
1609                    *p_start = ignore_end;
1610                    *p_end = tok->cur;
1611
1612                    /* If this type ignore is the only thing on the line, consume the newline also. */
1613                    if (blankline) {
1614                        tok_nextc(tok);
1615                        tok->atbol = 1;
1616                    }
1617                    return TYPE_IGNORE;
1618                } else {
1619                    *p_start = type_start;  /* after type_comment_prefix */
1620                    *p_end = tok->cur;
1621                    return TYPE_COMMENT;
1622                }
1623            }
1624        }
1625    }
1626
1627    if (tok->done == E_INTERACT_STOP) {
1628        return ENDMARKER;
1629    }
1630
1631    /* Check for EOF and errors now */
1632    if (c == EOF) {
1633        if (tok->level) {
1634            return ERRORTOKEN;
1635        }
1636        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1637    }
1638
1639    /* Identifier (most frequent token!) */
1640    nonascii = 0;
1641    if (is_potential_identifier_start(c)) {
1642        /* Process the various legal combinations of b"", r"", u"", and f"". */
1643        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1644        while (1) {
1645            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1646                saw_b = 1;
1647            /* Since u"" literals exist only for backwards compatibility, we don't
1648               want to support them in arbitrary order the way byte literals are. */
1649            else if (!(saw_b || saw_u || saw_r || saw_f)
1650                     && (c == 'u'|| c == 'U')) {
1651                saw_u = 1;
1652            }
1653            /* ur"" and ru"" are not supported */
1654            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1655                saw_r = 1;
1656            }
1657            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1658                saw_f = 1;
1659            }
1660            else {
1661                break;
1662            }
1663            c = tok_nextc(tok);
1664            if (c == '"' || c == '\'') {
1665                goto letter_quote;
1666            }
1667        }
1668        while (is_potential_identifier_char(c)) {
1669            if (c >= 128) {
1670                nonascii = 1;
1671            }
1672            c = tok_nextc(tok);
1673        }
1674        tok_backup(tok, c);
1675        if (nonascii && !verify_identifier(tok)) {
1676            return ERRORTOKEN;
1677        }
1678
1679        *p_start = tok->start;
1680        *p_end = tok->cur;
1681
1682        /* async/await parsing block. */
1683        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1684            /* May be an 'async' or 'await' token.  For Python 3.7 or
1685               later we recognize them unconditionally.  For Python
1686               3.5 or 3.6 we recognize 'async' in front of 'def', and
1687               either one inside of 'async def'.  (Technically we
1688               shouldn't recognize these at all for 3.4 or earlier,
1689               but there's no *valid* Python 3.4 code that would be
1690               rejected, and async functions will be rejected in a
1691               later phase.) */
1692            if (!tok->async_hacks || tok->async_def) {
1693                /* Always recognize the keywords. */
1694                if (memcmp(tok->start, "async", 5) == 0) {
1695                    return ASYNC;
1696                }
1697                if (memcmp(tok->start, "await", 5) == 0) {
1698                    return AWAIT;
1699                }
1700            }
1701            else if (memcmp(tok->start, "async", 5) == 0) {
1702                /* The current token is 'async'.
1703                   Look ahead one token to see if that is 'def'. */
1704
1705                struct tok_state ahead_tok;
1706                const char *ahead_tok_start = NULL;
1707                const char *ahead_tok_end = NULL;
1708                int ahead_tok_kind;
1709
1710                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1711                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1712                                         &ahead_tok_end);
1713
1714                if (ahead_tok_kind == NAME
1715                    && ahead_tok.cur - ahead_tok.start == 3
1716                    && memcmp(ahead_tok.start, "def", 3) == 0)
1717                {
1718                    /* The next token is going to be 'def', so instead of
1719                       returning a plain NAME token, return ASYNC. */
1720                    tok->async_def_indent = tok->indent;
1721                    tok->async_def = 1;
1722                    return ASYNC;
1723                }
1724            }
1725        }
1726
1727        return NAME;
1728    }
1729
1730    /* Newline */
1731    if (c == '\n') {
1732        tok->atbol = 1;
1733        if (blankline || tok->level > 0) {
1734            goto nextline;
1735        }
1736        *p_start = tok->start;
1737        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1738        tok->cont_line = 0;
1739        if (tok->async_def) {
1740            /* We're somewhere inside an 'async def' function, and
1741               we've encountered a NEWLINE after its signature. */
1742            tok->async_def_nl = 1;
1743        }
1744        return NEWLINE;
1745    }
1746
1747    /* Period or number starting with period? */
1748    if (c == '.') {
1749        c = tok_nextc(tok);
1750        if (isdigit(c)) {
1751            goto fraction;
1752        } else if (c == '.') {
1753            c = tok_nextc(tok);
1754            if (c == '.') {
1755                *p_start = tok->start;
1756                *p_end = tok->cur;
1757                return ELLIPSIS;
1758            }
1759            else {
1760                tok_backup(tok, c);
1761            }
1762            tok_backup(tok, '.');
1763        }
1764        else {
1765            tok_backup(tok, c);
1766        }
1767        *p_start = tok->start;
1768        *p_end = tok->cur;
1769        return DOT;
1770    }
1771
1772    /* Number */
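    /* The branches below accept literals such as "0x_FF", "0o17", "0b10_10",
       "1_000", "3.14", "1e-3" and "2j".  Leading zeros followed by a nonzero
       digit (e.g. "077") are rejected further down with a dedicated message,
       while "0", "00" and "0_0" remain valid spellings of zero. */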
1773    if (isdigit(c)) {
1774        if (c == '0') {
1775            /* Hex, octal or binary -- maybe. */
1776            c = tok_nextc(tok);
1777            if (c == 'x' || c == 'X') {
1778                /* Hex */
1779                c = tok_nextc(tok);
1780                do {
1781                    if (c == '_') {
1782                        c = tok_nextc(tok);
1783                    }
1784                    if (!isxdigit(c)) {
1785                        tok_backup(tok, c);
1786                        return syntaxerror(tok, "invalid hexadecimal literal");
1787                    }
1788                    do {
1789                        c = tok_nextc(tok);
1790                    } while (isxdigit(c));
1791                } while (c == '_');
1792                if (!verify_end_of_number(tok, c, "hexadecimal")) {
1793                    return ERRORTOKEN;
1794                }
1795            }
1796            else if (c == 'o' || c == 'O') {
1797                /* Octal */
1798                c = tok_nextc(tok);
1799                do {
1800                    if (c == '_') {
1801                        c = tok_nextc(tok);
1802                    }
1803                    if (c < '0' || c >= '8') {
1804                        if (isdigit(c)) {
1805                            return syntaxerror(tok,
1806                                    "invalid digit '%c' in octal literal", c);
1807                        }
1808                        else {
1809                            tok_backup(tok, c);
1810                            return syntaxerror(tok, "invalid octal literal");
1811                        }
1812                    }
1813                    do {
1814                        c = tok_nextc(tok);
1815                    } while ('0' <= c && c < '8');
1816                } while (c == '_');
1817                if (isdigit(c)) {
1818                    return syntaxerror(tok,
1819                            "invalid digit '%c' in octal literal", c);
1820                }
1821                if (!verify_end_of_number(tok, c, "octal")) {
1822                    return ERRORTOKEN;
1823                }
1824            }
1825            else if (c == 'b' || c == 'B') {
1826                /* Binary */
1827                c = tok_nextc(tok);
1828                do {
1829                    if (c == '_') {
1830                        c = tok_nextc(tok);
1831                    }
1832                    if (c != '0' && c != '1') {
1833                        if (isdigit(c)) {
1834                            return syntaxerror(tok,
1835                                    "invalid digit '%c' in binary literal", c);
1836                        }
1837                        else {
1838                            tok_backup(tok, c);
1839                            return syntaxerror(tok, "invalid binary literal");
1840                        }
1841                    }
1842                    do {
1843                        c = tok_nextc(tok);
1844                    } while (c == '0' || c == '1');
1845                } while (c == '_');
1846                if (isdigit(c)) {
1847                    return syntaxerror(tok,
1848                            "invalid digit '%c' in binary literal", c);
1849                }
1850                if (!verify_end_of_number(tok, c, "binary")) {
1851                    return ERRORTOKEN;
1852                }
1853            }
1854            else {
1855                int nonzero = 0;
1856                /* maybe old-style octal; c is first char of it */
1857                /* in any case, allow '0' as a literal */
1858                while (1) {
1859                    if (c == '_') {
1860                        c = tok_nextc(tok);
1861                        if (!isdigit(c)) {
1862                            tok_backup(tok, c);
1863                            return syntaxerror(tok, "invalid decimal literal");
1864                        }
1865                    }
1866                    if (c != '0') {
1867                        break;
1868                    }
1869                    c = tok_nextc(tok);
1870                }
1871                char* zeros_end = tok->cur;
1872                if (isdigit(c)) {
1873                    nonzero = 1;
1874                    c = tok_decimal_tail(tok);
1875                    if (c == 0) {
1876                        return ERRORTOKEN;
1877                    }
1878                }
1879                if (c == '.') {
1880                    c = tok_nextc(tok);
1881                    goto fraction;
1882                }
1883                else if (c == 'e' || c == 'E') {
1884                    goto exponent;
1885                }
1886                else if (c == 'j' || c == 'J') {
1887                    goto imaginary;
1888                }
1889                else if (nonzero) {
1890                    /* Old-style octal: now disallowed. */
1891                    tok_backup(tok, c);
1892                    return syntaxerror_known_range(
1893                            tok, (int)(tok->start + 1 - tok->line_start),
1894                            (int)(zeros_end - tok->line_start),
1895                            "leading zeros in decimal integer "
1896                            "literals are not permitted; "
1897                            "use an 0o prefix for octal integers");
1898                }
1899                if (!verify_end_of_number(tok, c, "decimal")) {
1900                    return ERRORTOKEN;
1901                }
1902            }
1903        }
1904        else {
1905            /* Decimal */
1906            c = tok_decimal_tail(tok);
1907            if (c == 0) {
1908                return ERRORTOKEN;
1909            }
1910            {
1911                /* Accept floating point numbers. */
1912                if (c == '.') {
1913                    c = tok_nextc(tok);
1914        fraction:
1915                    /* Fraction */
1916                    if (isdigit(c)) {
1917                        c = tok_decimal_tail(tok);
1918                        if (c == 0) {
1919                            return ERRORTOKEN;
1920                        }
1921                    }
1922                }
1923                if (c == 'e' || c == 'E') {
1924                    int e;
1925                  exponent:
1926                    e = c;
1927                    /* Exponent part */
1928                    c = tok_nextc(tok);
1929                    if (c == '+' || c == '-') {
1930                        c = tok_nextc(tok);
1931                        if (!isdigit(c)) {
1932                            tok_backup(tok, c);
1933                            return syntaxerror(tok, "invalid decimal literal");
1934                        }
1935                    } else if (!isdigit(c)) {
1936                        tok_backup(tok, c);
1937                        if (!verify_end_of_number(tok, e, "decimal")) {
1938                            return ERRORTOKEN;
1939                        }
1940                        tok_backup(tok, e);
1941                        *p_start = tok->start;
1942                        *p_end = tok->cur;
1943                        return NUMBER;
1944                    }
1945                    c = tok_decimal_tail(tok);
1946                    if (c == 0) {
1947                        return ERRORTOKEN;
1948                    }
1949                }
1950                if (c == 'j' || c == 'J') {
1951                    /* Imaginary part */
1952        imaginary:
1953                    c = tok_nextc(tok);
1954                    if (!verify_end_of_number(tok, c, "imaginary")) {
1955                        return ERRORTOKEN;
1956                    }
1957                }
1958                else if (!verify_end_of_number(tok, c, "decimal")) {
1959                    return ERRORTOKEN;
1960                }
1961            }
1962        }
1963        tok_backup(tok, c);
1964        *p_start = tok->start;
1965        *p_end = tok->cur;
1966        return NUMBER;
1967    }
1968
1969  letter_quote:
1970    /* String */
1971    if (c == '\'' || c == '"') {
1972        int quote = c;
1973        int quote_size = 1;             /* 1 or 3 */
1974        int end_quote_size = 0;
1975
1976        /* Nodes of type STRING, especially multi-line strings,
1977           must be handled differently in order to get both
1978           the starting line number and the column offset right.
1979           (cf. issue 16806) */
1980        tok->first_lineno = tok->lineno;
1981        tok->multi_line_start = tok->line_start;
1982
1983        /* Find the quote size and start of string */
1984        c = tok_nextc(tok);
1985        if (c == quote) {
1986            c = tok_nextc(tok);
1987            if (c == quote) {
1988                quote_size = 3;
1989            }
1990            else {
1991                end_quote_size = 1;     /* empty string found */
1992            }
1993        }
1994        if (c != quote) {
1995            tok_backup(tok, c);
1996        }
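        /* At this point quote_size is 1 for ordinary strings and 3 for
           triple-quoted ones; an empty string ("" or '') was detected above
           by pre-setting end_quote_size, so the loop below exits at once. */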
1997
1998        /* Get rest of string */
1999        while (end_quote_size != quote_size) {
2000            c = tok_nextc(tok);
2001            if (tok->done == E_ERROR) {
2002                return ERRORTOKEN;
2003            }
2004            if (tok->done == E_DECODE) {
2005                break;
2006            }
2007            if (c == EOF || (quote_size == 1 && c == '\n')) {
2008                assert(tok->multi_line_start != NULL);
2009            // Shift the tok_state's location to the start of the
2010            // string and report the error from the initial quote
2011            // character.
2012                tok->cur = (char *)tok->start;
2013                tok->cur++;
2014                tok->line_start = tok->multi_line_start;
2015                int start = tok->lineno;
2016                tok->lineno = tok->first_lineno;
2017                if (quote_size == 3) {
2018                    syntaxerror(tok, "unterminated triple-quoted string literal"
2019                                     " (detected at line %d)", start);
2020                    if (c != '\n') {
2021                        tok->done = E_EOFS;
2022                    }
2023                    return ERRORTOKEN;
2024                }
2025                else {
2026                    syntaxerror(tok, "unterminated string literal (detected at"
2027                                     " line %d)", start);
2028                    if (c != '\n') {
2029                        tok->done = E_EOLS;
2030                    }
2031                    return ERRORTOKEN;
2032                }
2033            }
2034            if (c == quote) {
2035                end_quote_size += 1;
2036            }
2037            else {
2038                end_quote_size = 0;
2039                if (c == '\\') {
2040                    tok_nextc(tok);  /* skip escaped char */
2041                }
2042            }
2043        }
2044
2045        *p_start = tok->start;
2046        *p_end = tok->cur;
2047        return STRING;
2048    }
2049
2050    /* Line continuation */
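    /* A backslash at the end of a physical line joins it with the next one
       (e.g. "x = 1 + \" followed by "2" on the next line), so tokenizing
       continues on the following line without emitting NEWLINE. */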
2051    if (c == '\\') {
2052        if ((c = tok_continuation_line(tok)) == -1) {
2053            return ERRORTOKEN;
2054        }
2055        tok->cont_line = 1;
2056        goto again; /* Read next line */
2057    }
2058
2059    /* Check for two-character token */
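    /* For instance '*' followed by '*' maps to DOUBLESTAR, and a trailing '='
       upgrades it to DOUBLESTAREQUAL; if no multi-character operator matches,
       the extra characters are pushed back and handled as one-char tokens. */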
2060    {
2061        int c2 = tok_nextc(tok);
2062        int token = PyToken_TwoChars(c, c2);
2063        if (token != OP) {
2064            int c3 = tok_nextc(tok);
2065            int token3 = PyToken_ThreeChars(c, c2, c3);
2066            if (token3 != OP) {
2067                token = token3;
2068            }
2069            else {
2070                tok_backup(tok, c3);
2071            }
2072            *p_start = tok->start;
2073            *p_end = tok->cur;
2074            return token;
2075        }
2076        tok_backup(tok, c2);
2077    }
2078
2079    /* Keep track of parentheses nesting level */
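    /* parenstack/parenlinenostack/parencolstack record which bracket was
       opened and where, so that a mismatch such as "(1, 2]" can be reported
       against the original '(' (including its line when it differs). */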
2080    switch (c) {
2081    case '(':
2082    case '[':
2083    case '{':
2084        if (tok->level >= MAXLEVEL) {
2085            return syntaxerror(tok, "too many nested parentheses");
2086        }
2087        tok->parenstack[tok->level] = c;
2088        tok->parenlinenostack[tok->level] = tok->lineno;
2089        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2090        tok->level++;
2091        break;
2092    case ')':
2093    case ']':
2094    case '}':
2095        if (!tok->level) {
2096            return syntaxerror(tok, "unmatched '%c'", c);
2097        }
2098        tok->level--;
2099        int opening = tok->parenstack[tok->level];
2100        if (!((opening == '(' && c == ')') ||
2101              (opening == '[' && c == ']') ||
2102              (opening == '{' && c == '}')))
2103        {
2104            if (tok->parenlinenostack[tok->level] != tok->lineno) {
2105                return syntaxerror(tok,
2106                        "closing parenthesis '%c' does not match "
2107                        "opening parenthesis '%c' on line %d",
2108                        c, opening, tok->parenlinenostack[tok->level]);
2109            }
2110            else {
2111                return syntaxerror(tok,
2112                        "closing parenthesis '%c' does not match "
2113                        "opening parenthesis '%c'",
2114                        c, opening);
2115            }
2116        }
2117        break;
2118    }
2119
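    /* Any remaining non-printable character (e.g. a stray 0x01 control byte)
       is reported by code point, here as "U+0001", rather than echoed raw. */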
2120    if (!Py_UNICODE_ISPRINTABLE(c)) {
2121        char hex[9];
2122        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
2123        return syntaxerror(tok, "invalid non-printable character U+%s", hex);
2124    }
2125
2126    /* Punctuation character */
2127    *p_start = tok->start;
2128    *p_end = tok->cur;
2129    return PyToken_OneChar(c);
2130}
2131
2132int
2133_PyTokenizer_Get(struct tok_state *tok,
2134                 const char **p_start, const char **p_end)
2135{
2136    int result = tok_get(tok, p_start, p_end);
2137    if (tok->decoding_erred) {
2138        result = ERRORTOKEN;
2139        tok->done = E_DECODE;
2140    }
2141    return result;
2142}
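/* Illustrative usage sketch (not part of this file): draining a tokenizer
   created elsewhere -- the _PyTokenizer_FromString() arguments shown are an
   assumption, not taken from this file:

       const char *start, *end;
       int type;
       struct tok_state *tok = _PyTokenizer_FromString("x = 1\n", 0);
       if (tok != NULL) {
           while ((type = _PyTokenizer_Get(tok, &start, &end)) != ENDMARKER) {
               if (type == ERRORTOKEN) {
                   break;  // tok->done holds the error code
               }
               // [start, end) delimits the token text in tok's buffer
           }
           _PyTokenizer_Free(tok);
       }
*/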
2143
2144#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
2145// fdopen() on a borrowed fd: WASI does not provide dup(), and Emscripten's
2146// dup() emulation via open() is slow.
2147typedef union {
2148    void *cookie;
2149    int fd;
2150} borrowed;
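// The union lets an int file descriptor travel through fopencookie()'s
// void *cookie parameter and back again without integer/pointer casts.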
2151
2152static ssize_t
2153borrow_read(void *cookie, char *buf, size_t size)
2154{
2155    borrowed b = {.cookie = cookie};
2156    return read(b.fd, (void *)buf, size);
2157}
2158
2159static FILE *
2160fdopen_borrow(int fd) {
2161    // Supports reading only; seek fails, and close and write are no-ops.
2162    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
2163    borrowed b = {.fd = fd};
2164    return fopencookie(b.cookie, "r", io_cb);
2165}
2166#else
2167static FILE *
2168fdopen_borrow(int fd) {
2169    fd = _Py_dup(fd);
2170    if (fd < 0) {
2171        return NULL;
2172    }
2173    return fdopen(fd, "r");
2174}
2175#endif
2176
2177/* Get the encoding of a Python file: look for a coding cookie and check
2178   whether the file starts with a BOM.
2179
2180   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2181   encoding in the first or second line of the file (in which case the encoding
2182   should be assumed to be UTF-8).
2183
2184   The char* returned is allocated via PyMem_Malloc() and thus must be freed
2185   by the caller with PyMem_Free(). */
2186
2187char *
2188_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2189{
2190    struct tok_state *tok;
2191    FILE *fp;
2192    const char *p_start = NULL;
2193    const char *p_end = NULL;
2194    char *encoding = NULL;
2195
2196    fp = fdopen_borrow(fd);
2197    if (fp == NULL) {
2198        return NULL;
2199    }
2200    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2201    if (tok == NULL) {
2202        fclose(fp);
2203        return NULL;
2204    }
2205    if (filename != NULL) {
2206        Py_INCREF(filename);
2207        tok->filename = filename;
2208    }
2209    else {
2210        tok->filename = PyUnicode_FromString("<string>");
2211        if (tok->filename == NULL) {
2212            fclose(fp);
2213            _PyTokenizer_Free(tok);
2214            return encoding;
2215        }
2216    }
2217    // Don't report warnings here: doing so could cause infinite recursion
2218    // if fetching the encoding itself triggers a warning.
2219    tok->report_warnings = 0;
2220    while (tok->lineno < 2 && tok->done == E_OK) {
2221        _PyTokenizer_Get(tok, &p_start, &p_end);
2222    }
2223    fclose(fp);
2224    if (tok->encoding) {
2225        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2226        if (encoding) {
2227            strcpy(encoding, tok->encoding);
2228        }
2229    }
2230    _PyTokenizer_Free(tok);
2231    return encoding;
2232}
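/* Illustrative usage sketch (not part of this file): a caller probing a
   script's declared encoding, with `fd` and `filename` assumed to exist:

       char *enc = _PyTokenizer_FindEncodingFilename(fd, filename);
       if (enc != NULL) {
           // ... use enc ...
           PyMem_Free(enc);   // buffer came from PyMem_Malloc()
       }
       else {
           // no cookie or BOM found (or an error occurred); assume UTF-8
       }
*/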
2233
2234#ifdef Py_DEBUG
2235void
2236tok_dump(int type, char *start, char *end)
2237{
2238    fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2239    if (type == NAME || type == NUMBER || type == STRING || type == OP)
2240        fprintf(stderr, "(%.*s)", (int)(end - start), start);
2241}
2242#endif  // Py_DEBUG
2243