1/*
2    unicode_format.h -- implementation of str.format().
3*/
4
5#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
6
7/************************************************************************/
8/***********   Global data structures and forward declarations  *********/
9/************************************************************************/
10
11/*
12   A SubString consists of the characters between two string or
13   unicode pointers.
14*/
15typedef struct {
16    PyObject *str; /* borrowed reference */
17    Py_ssize_t start, end;
18} SubString;
19
20
/* Which field numbering style, if any, the format string has used so far. */
typedef enum {
    ANS_INIT = 0,     /* no numbering style chosen yet */
    ANS_AUTO = 1,     /* fields with an omitted name: "{}" */
    ANS_MANUAL = 2    /* fields with an explicit number: "{0}" */
} AutoNumberState;
26
/* Keeps track of our auto-numbering state, and which number field we're on.
   One AutoNumber is shared by all the fields of a single format call. */
typedef struct {
    /* ANS_INIT until the first numerically indexed field is seen, then
       ANS_AUTO or ANS_MANUAL (set in field_name_split) */
    AutoNumberState an_state;
    /* index given to the next field whose name is omitted;
       post-incremented each time one is handed out */
    int an_field_number;
} AutoNumber;
32
33
/* Forward declaration for recursion: output_markup() calls
   build_string() to expand a nested format_spec, and build_string()
   reaches output_markup() again through do_markup(). */
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int recursion_depth, AutoNumber *auto_number);
38
39
40
41/************************************************************************/
42/**************************  Utility  functions  ************************/
43/************************************************************************/
44
45static void
46AutoNumber_Init(AutoNumber *auto_number)
47{
48    auto_number->an_state = ANS_INIT;
49    auto_number->an_field_number = 0;
50}
51
52/* fill in a SubString from a pointer and length */
53Py_LOCAL_INLINE(void)
54SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
55{
56    str->str = s;
57    str->start = start;
58    str->end = end;
59}
60
61/* return a new string.  if str->str is NULL, return None */
62Py_LOCAL_INLINE(PyObject *)
63SubString_new_object(SubString *str)
64{
65    if (str->str == NULL)
66        Py_RETURN_NONE;
67    return PyUnicode_Substring(str->str, str->start, str->end);
68}
69
70/* return a new string.  if str->str is NULL, return a new empty string */
71Py_LOCAL_INLINE(PyObject *)
72SubString_new_object_or_empty(SubString *str)
73{
74    if (str->str == NULL) {
75        return PyUnicode_New(0, 0);
76    }
77    return SubString_new_object(str);
78}
79
80/* Return 1 if an error has been detected switching between automatic
81   field numbering and manual field specification, else return 0. Set
82   ValueError on error. */
83static int
84autonumber_state_error(AutoNumberState state, int field_name_is_empty)
85{
86    if (state == ANS_MANUAL) {
87        if (field_name_is_empty) {
88            PyErr_SetString(PyExc_ValueError, "cannot switch from "
89                            "manual field specification to "
90                            "automatic field numbering");
91            return 1;
92        }
93    }
94    else {
95        if (!field_name_is_empty) {
96            PyErr_SetString(PyExc_ValueError, "cannot switch from "
97                            "automatic field numbering to "
98                            "manual field specification");
99            return 1;
100        }
101    }
102    return 0;
103}
104
105
106/************************************************************************/
107/***********  Format string parsing -- integers and identifiers *********/
108/************************************************************************/
109
110static Py_ssize_t
111get_integer(const SubString *str)
112{
113    Py_ssize_t accumulator = 0;
114    Py_ssize_t digitval;
115    Py_ssize_t i;
116
117    /* empty string is an error */
118    if (str->start >= str->end)
119        return -1;
120
121    for (i = str->start; i < str->end; i++) {
122        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
123        if (digitval < 0)
124            return -1;
125        /*
126           Detect possible overflow before it happens:
127
128              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
129              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
130        */
131        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
132            PyErr_Format(PyExc_ValueError,
133                         "Too many decimal digits in format string");
134            return -1;
135        }
136        accumulator = accumulator * 10 + digitval;
137    }
138    return accumulator;
139}
140
141/************************************************************************/
142/******** Functions to get field objects and specification strings ******/
143/************************************************************************/
144
145/* do the equivalent of obj.name */
146static PyObject *
147getattr(PyObject *obj, SubString *name)
148{
149    PyObject *newobj;
150    PyObject *str = SubString_new_object(name);
151    if (str == NULL)
152        return NULL;
153    newobj = PyObject_GetAttr(obj, str);
154    Py_DECREF(str);
155    return newobj;
156}
157
158/* do the equivalent of obj[idx], where obj is a sequence */
159static PyObject *
160getitem_sequence(PyObject *obj, Py_ssize_t idx)
161{
162    return PySequence_GetItem(obj, idx);
163}
164
165/* do the equivalent of obj[idx], where obj is not a sequence */
166static PyObject *
167getitem_idx(PyObject *obj, Py_ssize_t idx)
168{
169    PyObject *newobj;
170    PyObject *idx_obj = PyLong_FromSsize_t(idx);
171    if (idx_obj == NULL)
172        return NULL;
173    newobj = PyObject_GetItem(obj, idx_obj);
174    Py_DECREF(idx_obj);
175    return newobj;
176}
177
178/* do the equivalent of obj[name] */
179static PyObject *
180getitem_str(PyObject *obj, SubString *name)
181{
182    PyObject *newobj;
183    PyObject *str = SubString_new_object(name);
184    if (str == NULL)
185        return NULL;
186    newobj = PyObject_GetItem(obj, str);
187    Py_DECREF(str);
188    return newobj;
189}
190
/* Iterates over the ".attr" and "[item]" parts that follow the first
   component of a field name. */
typedef struct {
    /* the entire string we're parsing.  we assume that someone else
       is managing its lifetime, and that it will exist for the
       lifetime of the iterator.  can be empty */
    SubString str;

    /* index of the next unread character of str; starts at str.start
       and advances as parts are returned */
    Py_ssize_t index;
} FieldNameIterator;
200
201
202static int
203FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
204                       Py_ssize_t start, Py_ssize_t end)
205{
206    SubString_init(&self->str, s, start, end);
207    self->index = start;
208    return 1;
209}
210
211static int
212_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
213{
214    Py_UCS4 c;
215
216    name->str = self->str.str;
217    name->start = self->index;
218
219    /* return everything until '.' or '[' */
220    while (self->index < self->str.end) {
221        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
222        switch (c) {
223        case '[':
224        case '.':
225            /* backup so that we this character will be seen next time */
226            self->index--;
227            break;
228        default:
229            continue;
230        }
231        break;
232    }
233    /* end of string is okay */
234    name->end = self->index;
235    return 1;
236}
237
238static int
239_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
240{
241    int bracket_seen = 0;
242    Py_UCS4 c;
243
244    name->str = self->str.str;
245    name->start = self->index;
246
247    /* return everything until ']' */
248    while (self->index < self->str.end) {
249        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
250        switch (c) {
251        case ']':
252            bracket_seen = 1;
253            break;
254        default:
255            continue;
256        }
257        break;
258    }
259    /* make sure we ended with a ']' */
260    if (!bracket_seen) {
261        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
262        return 0;
263    }
264
265    /* end of string is okay */
266    /* don't include the ']' */
267    name->end = self->index-1;
268    return 1;
269}
270
271/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
272static int
273FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
274                       Py_ssize_t *name_idx, SubString *name)
275{
276    /* check at end of input */
277    if (self->index >= self->str.end)
278        return 1;
279
280    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
281    case '.':
282        *is_attribute = 1;
283        if (_FieldNameIterator_attr(self, name) == 0)
284            return 0;
285        *name_idx = -1;
286        break;
287    case '[':
288        *is_attribute = 0;
289        if (_FieldNameIterator_item(self, name) == 0)
290            return 0;
291        *name_idx = get_integer(name);
292        if (*name_idx == -1 && PyErr_Occurred())
293            return 0;
294        break;
295    default:
296        /* Invalid character follows ']' */
297        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
298                        "follow ']' in format field specifier");
299        return 0;
300    }
301
302    /* empty string is an error */
303    if (name->start == name->end) {
304        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
305        return 0;
306    }
307
308    return 2;
309}
310
311
312/* input: field_name
313   output: 'first' points to the part before the first '[' or '.'
314           'first_idx' is -1 if 'first' is not an integer, otherwise
315                       it's the value of first converted to an integer
316           'rest' is an iterator to return the rest
317*/
318static int
319field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
320                 Py_ssize_t *first_idx, FieldNameIterator *rest,
321                 AutoNumber *auto_number)
322{
323    Py_UCS4 c;
324    Py_ssize_t i = start;
325    int field_name_is_empty;
326    int using_numeric_index;
327
328    /* find the part up until the first '.' or '[' */
329    while (i < end) {
330        switch (c = PyUnicode_READ_CHAR(str, i++)) {
331        case '[':
332        case '.':
333            /* backup so that we this character is available to the
334               "rest" iterator */
335            i--;
336            break;
337        default:
338            continue;
339        }
340        break;
341    }
342
343    /* set up the return values */
344    SubString_init(first, str, start, i);
345    FieldNameIterator_init(rest, str, i, end);
346
347    /* see if "first" is an integer, in which case it's used as an index */
348    *first_idx = get_integer(first);
349    if (*first_idx == -1 && PyErr_Occurred())
350        return 0;
351
352    field_name_is_empty = first->start >= first->end;
353
354    /* If the field name is omitted or if we have a numeric index
355       specified, then we're doing numeric indexing into args. */
356    using_numeric_index = field_name_is_empty || *first_idx != -1;
357
358    /* We always get here exactly one time for each field we're
359       processing. And we get here in field order (counting by left
360       braces). So this is the perfect place to handle automatic field
361       numbering if the field name is omitted. */
362
363    /* Check if we need to do the auto-numbering. It's not needed if
364       we're called from string.Format routines, because it's handled
365       in that class by itself. */
366    if (auto_number) {
367        /* Initialize our auto numbering state if this is the first
368           time we're either auto-numbering or manually numbering. */
369        if (auto_number->an_state == ANS_INIT && using_numeric_index)
370            auto_number->an_state = field_name_is_empty ?
371                ANS_AUTO : ANS_MANUAL;
372
373        /* Make sure our state is consistent with what we're doing
374           this time through. Only check if we're using a numeric
375           index. */
376        if (using_numeric_index)
377            if (autonumber_state_error(auto_number->an_state,
378                                       field_name_is_empty))
379                return 0;
380        /* Zero length field means we want to do auto-numbering of the
381           fields. */
382        if (field_name_is_empty)
383            *first_idx = (auto_number->an_field_number)++;
384    }
385
386    return 1;
387}
388
389
390/*
391    get_field_object returns the object inside {}, before the
392    format_spec.  It handles getindex and getattr lookups and consumes
393    the entire input string.
394*/
395static PyObject *
396get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
397                 AutoNumber *auto_number)
398{
399    PyObject *obj = NULL;
400    int ok;
401    int is_attribute;
402    SubString name;
403    SubString first;
404    Py_ssize_t index;
405    FieldNameIterator rest;
406
407    if (!field_name_split(input->str, input->start, input->end, &first,
408                          &index, &rest, auto_number)) {
409        goto error;
410    }
411
412    if (index == -1) {
413        /* look up in kwargs */
414        PyObject *key = SubString_new_object(&first);
415        if (key == NULL) {
416            goto error;
417        }
418        if (kwargs == NULL) {
419            PyErr_SetObject(PyExc_KeyError, key);
420            Py_DECREF(key);
421            goto error;
422        }
423        /* Use PyObject_GetItem instead of PyDict_GetItem because this
424           code is no longer just used with kwargs. It might be passed
425           a non-dict when called through format_map. */
426        obj = PyObject_GetItem(kwargs, key);
427        Py_DECREF(key);
428        if (obj == NULL) {
429            goto error;
430        }
431    }
432    else {
433        /* If args is NULL, we have a format string with a positional field
434           with only kwargs to retrieve it from. This can only happen when
435           used with format_map(), where positional arguments are not
436           allowed. */
437        if (args == NULL) {
438            PyErr_SetString(PyExc_ValueError, "Format string contains "
439                            "positional fields");
440            goto error;
441        }
442
443        /* look up in args */
444        obj = PySequence_GetItem(args, index);
445        if (obj == NULL) {
446            PyErr_Format(PyExc_IndexError,
447                         "Replacement index %zd out of range for positional "
448                         "args tuple",
449                         index);
450             goto error;
451        }
452    }
453
454    /* iterate over the rest of the field_name */
455    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
456                                        &name)) == 2) {
457        PyObject *tmp;
458
459        if (is_attribute)
460            /* getattr lookup "." */
461            tmp = getattr(obj, &name);
462        else
463            /* getitem lookup "[]" */
464            if (index == -1)
465                tmp = getitem_str(obj, &name);
466            else
467                if (PySequence_Check(obj))
468                    tmp = getitem_sequence(obj, index);
469                else
470                    /* not a sequence */
471                    tmp = getitem_idx(obj, index);
472        if (tmp == NULL)
473            goto error;
474
475        /* assign to obj */
476        Py_DECREF(obj);
477        obj = tmp;
478    }
479    /* end of iterator, this is the non-error case */
480    if (ok == 1)
481        return obj;
482error:
483    Py_XDECREF(obj);
484    return NULL;
485}
486
487/************************************************************************/
488/*****************  Field rendering functions  **************************/
489/************************************************************************/
490
491/*
492    render_field() is the main function in this section.  It takes the
493    field object and field specification string generated by
494    get_field_and_spec, and renders the field into the output string.
495
496    render_field calls fieldobj.__format__(format_spec) method, and
497    appends to the output.
498*/
499static int
500render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
501{
502    int ok = 0;
503    PyObject *result = NULL;
504    PyObject *format_spec_object = NULL;
505    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
506    int err;
507
508    /* If we know the type exactly, skip the lookup of __format__ and just
509       call the formatter directly. */
510    if (PyUnicode_CheckExact(fieldobj))
511        formatter = _PyUnicode_FormatAdvancedWriter;
512    else if (PyLong_CheckExact(fieldobj))
513        formatter = _PyLong_FormatAdvancedWriter;
514    else if (PyFloat_CheckExact(fieldobj))
515        formatter = _PyFloat_FormatAdvancedWriter;
516    else if (PyComplex_CheckExact(fieldobj))
517        formatter = _PyComplex_FormatAdvancedWriter;
518
519    if (formatter) {
520        /* we know exactly which formatter will be called when __format__ is
521           looked up, so call it directly, instead. */
522        err = formatter(writer, fieldobj, format_spec->str,
523                        format_spec->start, format_spec->end);
524        return (err == 0);
525    }
526    else {
527        /* We need to create an object out of the pointers we have, because
528           __format__ takes a string/unicode object for format_spec. */
529        if (format_spec->str)
530            format_spec_object = PyUnicode_Substring(format_spec->str,
531                                                     format_spec->start,
532                                                     format_spec->end);
533        else
534            format_spec_object = PyUnicode_New(0, 0);
535        if (format_spec_object == NULL)
536            goto done;
537
538        result = PyObject_Format(fieldobj, format_spec_object);
539    }
540    if (result == NULL)
541        goto done;
542
543    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
544        goto done;
545    ok = 1;
546
547done:
548    Py_XDECREF(format_spec_object);
549    Py_XDECREF(result);
550    return ok;
551}
552
553static int
554parse_field(SubString *str, SubString *field_name, SubString *format_spec,
555            int *format_spec_needs_expanding, Py_UCS4 *conversion)
556{
557    /* Note this function works if the field name is zero length,
558       which is good.  Zero length field names are handled later, in
559       field_name_split. */
560
561    Py_UCS4 c = 0;
562
563    /* initialize these, as they may be empty */
564    *conversion = '\0';
565    SubString_init(format_spec, NULL, 0, 0);
566
567    /* Search for the field name.  it's terminated by the end of
568       the string, or a ':' or '!' */
569    field_name->str = str->str;
570    field_name->start = str->start;
571    while (str->start < str->end) {
572        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
573        case '{':
574            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
575            return 0;
576        case '[':
577            for (; str->start < str->end; str->start++)
578                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
579                    break;
580            continue;
581        case '}':
582        case ':':
583        case '!':
584            break;
585        default:
586            continue;
587        }
588        break;
589    }
590
591    field_name->end = str->start - 1;
592    if (c == '!' || c == ':') {
593        Py_ssize_t count;
594        /* we have a format specifier and/or a conversion */
595        /* don't include the last character */
596
597        /* see if there's a conversion specifier */
598        if (c == '!') {
599            /* there must be another character present */
600            if (str->start >= str->end) {
601                PyErr_SetString(PyExc_ValueError,
602                                "end of string while looking for conversion "
603                                "specifier");
604                return 0;
605            }
606            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
607
608            if (str->start < str->end) {
609                c = PyUnicode_READ_CHAR(str->str, str->start++);
610                if (c == '}')
611                    return 1;
612                if (c != ':') {
613                    PyErr_SetString(PyExc_ValueError,
614                                    "expected ':' after conversion specifier");
615                    return 0;
616                }
617            }
618        }
619        format_spec->str = str->str;
620        format_spec->start = str->start;
621        count = 1;
622        while (str->start < str->end) {
623            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
624            case '{':
625                *format_spec_needs_expanding = 1;
626                count++;
627                break;
628            case '}':
629                count--;
630                if (count == 0) {
631                    format_spec->end = str->start - 1;
632                    return 1;
633                }
634                break;
635            default:
636                break;
637            }
638        }
639
640        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
641        return 0;
642    }
643    else if (c != '}') {
644        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
645        return 0;
646    }
647
648    return 1;
649}
650
651/************************************************************************/
652/******* Output string allocation and escape-to-markup processing  ******/
653/************************************************************************/
654
/* MarkupIterator breaks the string into pieces of either literal
   text, or things inside {} that need to be marked up.  It is
   designed to make it easy to wrap a Python iterator around it, for
   use with the Formatter class. */

typedef struct {
    /* the input not yet consumed; str.start moves forward as
       MarkupIterator_next() reads pieces */
    SubString str;
} MarkupIterator;
663
664static int
665MarkupIterator_init(MarkupIterator *self, PyObject *str,
666                    Py_ssize_t start, Py_ssize_t end)
667{
668    SubString_init(&self->str, str, start, end);
669    return 1;
670}
671
672/* returns 0 on error, 1 on non-error termination, and 2 if it got a
673   string (or something to be expanded) */
674static int
675MarkupIterator_next(MarkupIterator *self, SubString *literal,
676                    int *field_present, SubString *field_name,
677                    SubString *format_spec, Py_UCS4 *conversion,
678                    int *format_spec_needs_expanding)
679{
680    int at_end;
681    Py_UCS4 c = 0;
682    Py_ssize_t start;
683    Py_ssize_t len;
684    int markup_follows = 0;
685
686    /* initialize all of the output variables */
687    SubString_init(literal, NULL, 0, 0);
688    SubString_init(field_name, NULL, 0, 0);
689    SubString_init(format_spec, NULL, 0, 0);
690    *conversion = '\0';
691    *format_spec_needs_expanding = 0;
692    *field_present = 0;
693
694    /* No more input, end of iterator.  This is the normal exit
695       path. */
696    if (self->str.start >= self->str.end)
697        return 1;
698
699    start = self->str.start;
700
701    /* First read any literal text. Read until the end of string, an
702       escaped '{' or '}', or an unescaped '{'.  In order to never
703       allocate memory and so I can just pass pointers around, if
704       there's an escaped '{' or '}' then we'll return the literal
705       including the brace, but no format object.  The next time
706       through, we'll return the rest of the literal, skipping past
707       the second consecutive brace. */
708    while (self->str.start < self->str.end) {
709        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
710        case '{':
711        case '}':
712            markup_follows = 1;
713            break;
714        default:
715            continue;
716        }
717        break;
718    }
719
720    at_end = self->str.start >= self->str.end;
721    len = self->str.start - start;
722
723    if ((c == '}') && (at_end ||
724                       (c != PyUnicode_READ_CHAR(self->str.str,
725                                                 self->str.start)))) {
726        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
727                        "in format string");
728        return 0;
729    }
730    if (at_end && c == '{') {
731        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
732                        "in format string");
733        return 0;
734    }
735    if (!at_end) {
736        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
737            /* escaped } or {, skip it in the input.  there is no
738               markup object following us, just this literal text */
739            self->str.start++;
740            markup_follows = 0;
741        }
742        else
743            len--;
744    }
745
746    /* record the literal text */
747    literal->str = self->str.str;
748    literal->start = start;
749    literal->end = start + len;
750
751    if (!markup_follows)
752        return 2;
753
754    /* this is markup; parse the field */
755    *field_present = 1;
756    if (!parse_field(&self->str, field_name, format_spec,
757                     format_spec_needs_expanding, conversion))
758        return 0;
759    return 2;
760}
761
762
763/* do the !r or !s conversion on obj */
764static PyObject *
765do_conversion(PyObject *obj, Py_UCS4 conversion)
766{
767    /* XXX in pre-3.0, do we need to convert this to unicode, since it
768       might have returned a string? */
769    switch (conversion) {
770    case 'r':
771        return PyObject_Repr(obj);
772    case 's':
773        return PyObject_Str(obj);
774    case 'a':
775        return PyObject_ASCII(obj);
776    default:
777        if (conversion > 32 && conversion < 127) {
778                /* It's the ASCII subrange; casting to char is safe
779                   (assuming the execution character set is an ASCII
780                   superset). */
781                PyErr_Format(PyExc_ValueError,
782                     "Unknown conversion specifier %c",
783                     (char)conversion);
784        } else
785                PyErr_Format(PyExc_ValueError,
786                     "Unknown conversion specifier \\x%x",
787                     (unsigned int)conversion);
788        return NULL;
789    }
790}
791
792/* given:
793
794   {field_name!conversion:format_spec}
795
796   compute the result and write it to output.
797   format_spec_needs_expanding is an optimization.  if it's false,
798   just output the string directly, otherwise recursively expand the
799   format_spec string.
800
801   field_name is allowed to be zero length, in which case we
802   are doing auto field numbering.
803*/
804
805static int
806output_markup(SubString *field_name, SubString *format_spec,
807              int format_spec_needs_expanding, Py_UCS4 conversion,
808              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
809              int recursion_depth, AutoNumber *auto_number)
810{
811    PyObject *tmp = NULL;
812    PyObject *fieldobj = NULL;
813    SubString expanded_format_spec;
814    SubString *actual_format_spec;
815    int result = 0;
816
817    /* convert field_name to an object */
818    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
819    if (fieldobj == NULL)
820        goto done;
821
822    if (conversion != '\0') {
823        tmp = do_conversion(fieldobj, conversion);
824        if (tmp == NULL || PyUnicode_READY(tmp) == -1)
825            goto done;
826
827        /* do the assignment, transferring ownership: fieldobj = tmp */
828        Py_DECREF(fieldobj);
829        fieldobj = tmp;
830        tmp = NULL;
831    }
832
833    /* if needed, recursively compute the format_spec */
834    if (format_spec_needs_expanding) {
835        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
836                           auto_number);
837        if (tmp == NULL || PyUnicode_READY(tmp) == -1)
838            goto done;
839
840        /* note that in the case we're expanding the format string,
841           tmp must be kept around until after the call to
842           render_field. */
843        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
844        actual_format_spec = &expanded_format_spec;
845    }
846    else
847        actual_format_spec = format_spec;
848
849    if (render_field(fieldobj, actual_format_spec, writer) == 0)
850        goto done;
851
852    result = 1;
853
854done:
855    Py_XDECREF(fieldobj);
856    Py_XDECREF(tmp);
857
858    return result;
859}
860
861/*
862    do_markup is the top-level loop for the format() method.  It
863    searches through the format string for escapes to markup codes, and
864    calls other functions to move non-markup text to the output,
865    and to perform the markup to the output.
866*/
867static int
868do_markup(SubString *input, PyObject *args, PyObject *kwargs,
869          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
870{
871    MarkupIterator iter;
872    int format_spec_needs_expanding;
873    int result;
874    int field_present;
875    SubString literal;
876    SubString field_name;
877    SubString format_spec;
878    Py_UCS4 conversion;
879
880    MarkupIterator_init(&iter, input->str, input->start, input->end);
881    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
882                                         &field_name, &format_spec,
883                                         &conversion,
884                                         &format_spec_needs_expanding)) == 2) {
885        if (literal.end != literal.start) {
886            if (!field_present && iter.str.start == iter.str.end)
887                writer->overallocate = 0;
888            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
889                                                literal.start, literal.end) < 0)
890                return 0;
891        }
892
893        if (field_present) {
894            if (iter.str.start == iter.str.end)
895                writer->overallocate = 0;
896            if (!output_markup(&field_name, &format_spec,
897                               format_spec_needs_expanding, conversion, writer,
898                               args, kwargs, recursion_depth, auto_number))
899                return 0;
900        }
901    }
902    return result;
903}
904
905
906/*
907    build_string allocates the output string and then
908    calls do_markup to do the heavy lifting.
909*/
910static PyObject *
911build_string(SubString *input, PyObject *args, PyObject *kwargs,
912             int recursion_depth, AutoNumber *auto_number)
913{
914    _PyUnicodeWriter writer;
915
916    /* check the recursion level */
917    if (recursion_depth <= 0) {
918        PyErr_SetString(PyExc_ValueError,
919                        "Max string recursion exceeded");
920        return NULL;
921    }
922
923    _PyUnicodeWriter_Init(&writer);
924    writer.overallocate = 1;
925    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
926
927    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
928                   auto_number)) {
929        _PyUnicodeWriter_Dealloc(&writer);
930        return NULL;
931    }
932
933    return _PyUnicodeWriter_Finish(&writer);
934}
935
936/************************************************************************/
937/*********** main routine ***********************************************/
938/************************************************************************/
939
940/* this is the main entry point */
941static PyObject *
942do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
943{
944    SubString input;
945
946    /* PEP 3101 says only 2 levels, so that
947       "{0:{1}}".format('abc', 's')            # works
948       "{0:{1:{2}}}".format('abc', 's', '')    # fails
949    */
950    int recursion_depth = 2;
951
952    AutoNumber auto_number;
953
954    if (PyUnicode_READY(self) == -1)
955        return NULL;
956
957    AutoNumber_Init(&auto_number);
958    SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
959    return build_string(&input, args, kwargs, recursion_depth, &auto_number);
960}
961
962static PyObject *
963do_string_format_map(PyObject *self, PyObject *obj)
964{
965    return do_string_format(self, NULL, obj);
966}
967
968
969/************************************************************************/
970/*********** formatteriterator ******************************************/
971/************************************************************************/
972
973/* This is used to implement string.Formatter.vparse().  It exists so
974   Formatter can share code with the built in unicode.format() method.
975   It's really just a wrapper around MarkupIterator that is callable
976   from Python. */
977
978typedef struct {
979    PyObject_HEAD
980    PyObject *str;
981    MarkupIterator it_markup;
982} formatteriterobject;
983
984static void
985formatteriter_dealloc(formatteriterobject *it)
986{
987    Py_XDECREF(it->str);
988    PyObject_Free(it);
989}
990
991/* returns a tuple:
992   (literal, field_name, format_spec, conversion)
993
994   literal is any literal text to output.  might be zero length
995   field_name is the string before the ':'.  might be None
996   format_spec is the string after the ':'.  mibht be None
997   conversion is either None, or the string after the '!'
998*/
999static PyObject *
1000formatteriter_next(formatteriterobject *it)
1001{
1002    SubString literal;
1003    SubString field_name;
1004    SubString format_spec;
1005    Py_UCS4 conversion;
1006    int format_spec_needs_expanding;
1007    int field_present;
1008    int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1009                                     &field_name, &format_spec, &conversion,
1010                                     &format_spec_needs_expanding);
1011
1012    /* all of the SubString objects point into it->str, so no
1013       memory management needs to be done on them */
1014    assert(0 <= result && result <= 2);
1015    if (result == 0 || result == 1)
1016        /* if 0, error has already been set, if 1, iterator is empty */
1017        return NULL;
1018    else {
1019        PyObject *literal_str = NULL;
1020        PyObject *field_name_str = NULL;
1021        PyObject *format_spec_str = NULL;
1022        PyObject *conversion_str = NULL;
1023        PyObject *tuple = NULL;
1024
1025        literal_str = SubString_new_object(&literal);
1026        if (literal_str == NULL)
1027            goto done;
1028
1029        field_name_str = SubString_new_object(&field_name);
1030        if (field_name_str == NULL)
1031            goto done;
1032
1033        /* if field_name is non-zero length, return a string for
1034           format_spec (even if zero length), else return None */
1035        format_spec_str = (field_present ?
1036                           SubString_new_object_or_empty :
1037                           SubString_new_object)(&format_spec);
1038        if (format_spec_str == NULL)
1039            goto done;
1040
1041        /* if the conversion is not specified, return a None,
1042           otherwise create a one length string with the conversion
1043           character */
1044        if (conversion == '\0') {
1045            conversion_str = Py_None;
1046            Py_INCREF(conversion_str);
1047        }
1048        else
1049            conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1050                                                       &conversion, 1);
1051        if (conversion_str == NULL)
1052            goto done;
1053
1054        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1055                             conversion_str);
1056    done:
1057        Py_XDECREF(literal_str);
1058        Py_XDECREF(field_name_str);
1059        Py_XDECREF(format_spec_str);
1060        Py_XDECREF(conversion_str);
1061        return tuple;
1062    }
1063}
1064
1065static PyMethodDef formatteriter_methods[] = {
1066    {NULL,              NULL}           /* sentinel */
1067};
1068
1069static PyTypeObject PyFormatterIter_Type = {
1070    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1071    "formatteriterator",                /* tp_name */
1072    sizeof(formatteriterobject),        /* tp_basicsize */
1073    0,                                  /* tp_itemsize */
1074    /* methods */
1075    (destructor)formatteriter_dealloc,  /* tp_dealloc */
1076    0,                                  /* tp_vectorcall_offset */
1077    0,                                  /* tp_getattr */
1078    0,                                  /* tp_setattr */
1079    0,                                  /* tp_as_async */
1080    0,                                  /* tp_repr */
1081    0,                                  /* tp_as_number */
1082    0,                                  /* tp_as_sequence */
1083    0,                                  /* tp_as_mapping */
1084    0,                                  /* tp_hash */
1085    0,                                  /* tp_call */
1086    0,                                  /* tp_str */
1087    PyObject_GenericGetAttr,            /* tp_getattro */
1088    0,                                  /* tp_setattro */
1089    0,                                  /* tp_as_buffer */
1090    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1091    0,                                  /* tp_doc */
1092    0,                                  /* tp_traverse */
1093    0,                                  /* tp_clear */
1094    0,                                  /* tp_richcompare */
1095    0,                                  /* tp_weaklistoffset */
1096    PyObject_SelfIter,                  /* tp_iter */
1097    (iternextfunc)formatteriter_next,   /* tp_iternext */
1098    formatteriter_methods,              /* tp_methods */
1099    0,
1100};
1101
1102/* unicode_formatter_parser is used to implement
1103   string.Formatter.vformat.  it parses a string and returns tuples
1104   describing the parsed elements.  It's a wrapper around
1105   stringlib/string_format.h's MarkupIterator */
1106static PyObject *
1107formatter_parser(PyObject *ignored, PyObject *self)
1108{
1109    formatteriterobject *it;
1110
1111    if (!PyUnicode_Check(self)) {
1112        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1113        return NULL;
1114    }
1115
1116    if (PyUnicode_READY(self) == -1)
1117        return NULL;
1118
1119    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1120    if (it == NULL)
1121        return NULL;
1122
1123    /* take ownership, give the object to the iterator */
1124    Py_INCREF(self);
1125    it->str = self;
1126
1127    /* initialize the contained MarkupIterator */
1128    MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1129    return (PyObject *)it;
1130}
1131
1132
1133/************************************************************************/
1134/*********** fieldnameiterator ******************************************/
1135/************************************************************************/
1136
1137
1138/* This is used to implement string.Formatter.vparse().  It parses the
1139   field name into attribute and item values.  It's a Python-callable
1140   wrapper around FieldNameIterator */
1141
1142typedef struct {
1143    PyObject_HEAD
1144    PyObject *str;
1145    FieldNameIterator it_field;
1146} fieldnameiterobject;
1147
1148static void
1149fieldnameiter_dealloc(fieldnameiterobject *it)
1150{
1151    Py_XDECREF(it->str);
1152    PyObject_Free(it);
1153}
1154
1155/* returns a tuple:
1156   (is_attr, value)
1157   is_attr is true if we used attribute syntax (e.g., '.foo')
1158              false if we used index syntax (e.g., '[foo]')
1159   value is an integer or string
1160*/
1161static PyObject *
1162fieldnameiter_next(fieldnameiterobject *it)
1163{
1164    int result;
1165    int is_attr;
1166    Py_ssize_t idx;
1167    SubString name;
1168
1169    result = FieldNameIterator_next(&it->it_field, &is_attr,
1170                                    &idx, &name);
1171    if (result == 0 || result == 1)
1172        /* if 0, error has already been set, if 1, iterator is empty */
1173        return NULL;
1174    else {
1175        PyObject* result = NULL;
1176        PyObject* is_attr_obj = NULL;
1177        PyObject* obj = NULL;
1178
1179        is_attr_obj = PyBool_FromLong(is_attr);
1180        if (is_attr_obj == NULL)
1181            goto done;
1182
1183        /* either an integer or a string */
1184        if (idx != -1)
1185            obj = PyLong_FromSsize_t(idx);
1186        else
1187            obj = SubString_new_object(&name);
1188        if (obj == NULL)
1189            goto done;
1190
1191        /* return a tuple of values */
1192        result = PyTuple_Pack(2, is_attr_obj, obj);
1193
1194    done:
1195        Py_XDECREF(is_attr_obj);
1196        Py_XDECREF(obj);
1197        return result;
1198    }
1199}
1200
1201static PyMethodDef fieldnameiter_methods[] = {
1202    {NULL,              NULL}           /* sentinel */
1203};
1204
1205static PyTypeObject PyFieldNameIter_Type = {
1206    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1207    "fieldnameiterator",                /* tp_name */
1208    sizeof(fieldnameiterobject),        /* tp_basicsize */
1209    0,                                  /* tp_itemsize */
1210    /* methods */
1211    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
1212    0,                                  /* tp_vectorcall_offset */
1213    0,                                  /* tp_getattr */
1214    0,                                  /* tp_setattr */
1215    0,                                  /* tp_as_async */
1216    0,                                  /* tp_repr */
1217    0,                                  /* tp_as_number */
1218    0,                                  /* tp_as_sequence */
1219    0,                                  /* tp_as_mapping */
1220    0,                                  /* tp_hash */
1221    0,                                  /* tp_call */
1222    0,                                  /* tp_str */
1223    PyObject_GenericGetAttr,            /* tp_getattro */
1224    0,                                  /* tp_setattro */
1225    0,                                  /* tp_as_buffer */
1226    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1227    0,                                  /* tp_doc */
1228    0,                                  /* tp_traverse */
1229    0,                                  /* tp_clear */
1230    0,                                  /* tp_richcompare */
1231    0,                                  /* tp_weaklistoffset */
1232    PyObject_SelfIter,                  /* tp_iter */
1233    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
1234    fieldnameiter_methods,              /* tp_methods */
1235    0};
1236
1237/* unicode_formatter_field_name_split is used to implement
1238   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1239   returns a tuple of (first, rest): "first", the part before the
1240   first '.' or '['; and "rest", an iterator for the rest of the field
1241   name.  it's a wrapper around stringlib/string_format.h's
1242   field_name_split.  The iterator it returns is a
1243   FieldNameIterator */
1244static PyObject *
1245formatter_field_name_split(PyObject *ignored, PyObject *self)
1246{
1247    SubString first;
1248    Py_ssize_t first_idx;
1249    fieldnameiterobject *it;
1250
1251    PyObject *first_obj = NULL;
1252    PyObject *result = NULL;
1253
1254    if (!PyUnicode_Check(self)) {
1255        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1256        return NULL;
1257    }
1258
1259    if (PyUnicode_READY(self) == -1)
1260        return NULL;
1261
1262    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1263    if (it == NULL)
1264        return NULL;
1265
1266    /* take ownership, give the object to the iterator.  this is
1267       just to keep the field_name alive */
1268    Py_INCREF(self);
1269    it->str = self;
1270
1271    /* Pass in auto_number = NULL. We'll return an empty string for
1272       first_obj in that case. */
1273    if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1274                          &first, &first_idx, &it->it_field, NULL))
1275        goto done;
1276
1277    /* first becomes an integer, if possible; else a string */
1278    if (first_idx != -1)
1279        first_obj = PyLong_FromSsize_t(first_idx);
1280    else
1281        /* convert "first" into a string object */
1282        first_obj = SubString_new_object(&first);
1283    if (first_obj == NULL)
1284        goto done;
1285
1286    /* return a tuple of values */
1287    result = PyTuple_Pack(2, first_obj, it);
1288
1289done:
1290    Py_XDECREF(it);
1291    Py_XDECREF(first_obj);
1292    return result;
1293}
1294