1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "pycore_abstract.h"      // _PyIndex_Check()
44#include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
46#include "pycore_bytes_methods.h" // _Py_bytes_lower()
47#include "pycore_format.h"        // F_LJUST
48#include "pycore_initconfig.h"    // _PyStatus_OK()
49#include "pycore_interp.h"        // PyInterpreterState.fs_codec
50#include "pycore_long.h"          // _PyLong_FormatWriter()
51#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
53#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
54#include "pycore_pystate.h"       // _PyInterpreterState_GET()
55#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
56#include "pycore_unicodeobject.h" // struct _Py_unicode_state
57#include "stringlib/eq.h"         // unicode_eq()
58
59#ifdef MS_WINDOWS
60#include <windows.h>
61#endif
62
63#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
65#endif
66
67/* Uncomment to display statistics on interned strings at exit
68   in _PyUnicode_ClearInterned(). */
69/* #define INTERNED_STATS 1 */
70
71
72/*[clinic input]
73class str "PyObject *" "&PyUnicode_Type"
74[clinic start generated code]*/
75/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76
77/*[python input]
78class Py_UCS4_converter(CConverter):
79    type = 'Py_UCS4'
80    converter = 'convert_uc'
81
82    def converter_init(self):
83        if self.default is not unspecified:
84            self.c_default = ascii(self.default)
85            if len(self.c_default) > 4 or self.c_default[0] != "'":
86                self.c_default = hex(ord(self.default))
87
88[python start generated code]*/
89/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90
91/* --- Globals ------------------------------------------------------------
92
93NOTE: In the interpreter's initialization phase, some globals are currently
94      initialized dynamically as needed. In the process Unicode objects may
95      be created before the Unicode type is ready.
96
97*/
98
99
100#ifdef __cplusplus
101extern "C" {
102#endif
103
104// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105// The value must be the same in fileutils.c.
106#define MAX_UNICODE 0x10ffff
107
/* Sanity check used by the macros and asserts of this file.  Debug builds
   run _PyUnicode_CheckConsistency() with check_content=0 (the O(n) content
   scan is skipped); other builds only test the type with PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
113
/* UTF-8 representation of the string.  _PyUnicode_UTF8() reads the utf8
   field directly.  PyUnicode_UTF8() first asserts that the object is a
   checked, ready string and, for compact ASCII strings, evaluates to the
   memory located right after the PyASCIIObject header instead of reading
   the field. */
#define _PyUnicode_UTF8(op)                             \
    (_PyCompactUnicodeObject_CAST(op)->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)(_PyASCIIObject_CAST(op) + 1)) :       \
         _PyUnicode_UTF8(op))
/* Length of the UTF-8 representation.  _PyUnicode_UTF8_LENGTH() reads the
   utf8_length field directly; PyUnicode_UTF8_LENGTH() uses the string length
   for compact ASCII strings. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         _PyASCIIObject_CAST(op)->length :              \
         _PyUnicode_UTF8_LENGTH(op))
/* Direct access to the wstr field. */
#define _PyUnicode_WSTR(op)                             \
    (_PyASCIIObject_CAST(op)->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wstr representation: the string length for compact ASCII
   strings, the wstr_length field otherwise. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     _PyASCIIObject_CAST(op)->length :                  \
     _PyCompactUnicodeObject_CAST(op)->wstr_length)
/* Direct access to the wstr_length, length, state and hash fields
   (no checks are performed). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)                            \
    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)                             \
    (_PyASCIIObject_CAST(op)->hash)
/* Same as reading state.kind / length, but asserting _PyUnicode_CHECK(op)
   first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->length)
/* Direct access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (_PyUnicodeObject_CAST(op)->data.any)
155
/* File-local replacement of PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   evaluates to 0 if the string is already ready, and otherwise to the result
   of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
162
/* true if the utf8 pointer is the same address as the character data.
   Asserts that the string is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the same address as the character data.
   The comparison uses the macro argument `op` on both sides, so the macro
   works whatever the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
170
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is set, and it differs from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
184
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast to to_type.  The
   main loop handles four characters per iteration (the count is rounded
   down to a multiple of 4 with _Py_SIZE_ROUND_DOWN); the trailing loop
   copies the remaining characters one at a time.  The macro declares the
   locals _to, _iter, _end, n and _unrolled_end in its own block. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
208
/* Evaluate to the address (cast to PyObject*) of the single-character string
   singleton for code point ch: entries below 128 come from the `ascii`
   table, the others from the `latin1` table (indexed by ch - 128).  No
   reference is taken.  ch is expected to be a Latin-1 code point (callers in
   this file pass a Py_UCS1).

   The argument is parenthesized so that any expression can be passed, but it
   is still evaluated more than once: avoid side effects. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
213
/* Divisor controlling how much extra room is requested when a buffer is
   overallocated (presumably extra = size / OVERALLOCATE_FACTOR; the users
   of this macro are outside this part of the file). */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
221
222/* This dictionary holds all interned unicode strings.  Note that references
223   to strings in this dictionary are *not* counted in the string's ob_refcnt.
224   When the interned string reaches a refcnt of 0 the string deallocation
225   function will delete the reference from this dictionary.
226
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
229*/
230static PyObject *interned = NULL;
231
232/* Forward declaration */
233static inline int
234_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
235static inline void
236_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
237static PyObject *
238unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
239                    const char *errors);
240static PyObject *
241unicode_decode_utf8(const char *s, Py_ssize_t size,
242                    _Py_error_handler error_handler, const char *errors,
243                    Py_ssize_t *consumed);
244#ifdef Py_DEBUG
245static inline int unicode_is_finalizing(void);
246static int unicode_is_singleton(PyObject *unicode);
247#endif
248
249
250// Return a borrowed reference to the empty string singleton.
251static inline PyObject* unicode_get_empty(void)
252{
253    _Py_DECLARE_STR(empty, "");
254    return &_Py_STR(empty);
255}
256
257
258// Return a strong reference to the empty string singleton.
259static inline PyObject* unicode_new_empty(void)
260{
261    PyObject *empty = unicode_get_empty();
262    Py_INCREF(empty);
263    return empty;
264}
265
/* Statement macro: return a new strong reference to the empty string
   singleton from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
270
271static inline void
272unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
273             Py_ssize_t start, Py_ssize_t length)
274{
275    assert(0 <= start);
276    assert(kind != PyUnicode_WCHAR_KIND);
277    switch (kind) {
278    case PyUnicode_1BYTE_KIND: {
279        assert(value <= 0xff);
280        Py_UCS1 ch = (unsigned char)value;
281        Py_UCS1 *to = (Py_UCS1 *)data + start;
282        memset(to, ch, length);
283        break;
284    }
285    case PyUnicode_2BYTE_KIND: {
286        assert(value <= 0xffff);
287        Py_UCS2 ch = (Py_UCS2)value;
288        Py_UCS2 *to = (Py_UCS2 *)data + start;
289        const Py_UCS2 *end = to + length;
290        for (; to < end; ++to) *to = ch;
291        break;
292    }
293    case PyUnicode_4BYTE_KIND: {
294        assert(value <= MAX_UNICODE);
295        Py_UCS4 ch = value;
296        Py_UCS4 * to = (Py_UCS4 *)data + start;
297        const Py_UCS4 *end = to + length;
298        for (; to < end; ++to) *to = ch;
299        break;
300    }
301    default: Py_UNREACHABLE();
302    }
303}
304
305
/* Fast detection of the most frequent whitespace characters: table indexed
   by ASCII code point (128 entries), 1 for whitespace and 0 otherwise.
   Laid out as 16 code points per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
336
337/* forward */
338static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
339static PyObject* get_latin1_char(unsigned char ch);
340static int unicode_modifiable(PyObject *unicode);
341
342
343static PyObject *
344_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
345static PyObject *
346_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347static PyObject *
348_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349
350static PyObject *
351unicode_encode_call_errorhandler(const char *errors,
352       PyObject **errorHandler,const char *encoding, const char *reason,
353       PyObject *unicode, PyObject **exceptionObject,
354       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355
356static void
357raise_encode_exception(PyObject **exceptionObject,
358                       const char *encoding,
359                       PyObject *unicode,
360                       Py_ssize_t startpos, Py_ssize_t endpos,
361                       const char *reason);
362
/* Fast detection of line breaks: table indexed by ASCII code point
   (128 entries), 1 for a line break character and 0 otherwise.
   Laid out as 16 code points per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
390
391static int convert_uc(PyObject *obj, void *addr);
392
393struct encoding_map;
394#include "clinic/unicodeobject.c.h"
395
396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
398{
399    if (errors == NULL || strcmp(errors, "strict") == 0) {
400        return _Py_ERROR_STRICT;
401    }
402    if (strcmp(errors, "surrogateescape") == 0) {
403        return _Py_ERROR_SURROGATEESCAPE;
404    }
405    if (strcmp(errors, "replace") == 0) {
406        return _Py_ERROR_REPLACE;
407    }
408    if (strcmp(errors, "ignore") == 0) {
409        return _Py_ERROR_IGNORE;
410    }
411    if (strcmp(errors, "backslashreplace") == 0) {
412        return _Py_ERROR_BACKSLASHREPLACE;
413    }
414    if (strcmp(errors, "surrogatepass") == 0) {
415        return _Py_ERROR_SURROGATEPASS;
416    }
417    if (strcmp(errors, "xmlcharrefreplace") == 0) {
418        return _Py_ERROR_XMLCHARREFREPLACE;
419    }
420    return _Py_ERROR_OTHER;
421}
422
423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428        return _Py_ERROR_STRICT;
429    }
430    if (wcscmp(errors, L"surrogateescape") == 0) {
431        return _Py_ERROR_SURROGATEESCAPE;
432    }
433    if (wcscmp(errors, L"replace") == 0) {
434        return _Py_ERROR_REPLACE;
435    }
436    if (wcscmp(errors, L"ignore") == 0) {
437        return _Py_ERROR_IGNORE;
438    }
439    if (wcscmp(errors, L"backslashreplace") == 0) {
440        return _Py_ERROR_BACKSLASHREPLACE;
441    }
442    if (wcscmp(errors, L"surrogatepass") == 0) {
443        return _Py_ERROR_SURROGATEPASS;
444    }
445    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446        return _Py_ERROR_XMLCHARREFREPLACE;
447    }
448    return _Py_ERROR_OTHER;
449}
450
451
452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455    if (encoding == NULL && errors == NULL) {
456        return 0;
457    }
458
459    PyInterpreterState *interp = _PyInterpreterState_GET();
460#ifndef Py_DEBUG
461    /* In release mode, only check in development mode (-X dev) */
462    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
463        return 0;
464    }
465#else
466    /* Always check in debug mode */
467#endif
468
469    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
471    if (!interp->unicode.fs_codec.encoding) {
472        return 0;
473    }
474
475    /* Disable checks during Python finalization. For example, it allows to
476       call _PyObject_Dump() during finalization for debugging purpose. */
477    if (interp->finalizing) {
478        return 0;
479    }
480
481    if (encoding != NULL) {
482        PyObject *handler = _PyCodec_Lookup(encoding);
483        if (handler == NULL) {
484            return -1;
485        }
486        Py_DECREF(handler);
487    }
488
489    if (errors != NULL) {
490        PyObject *handler = PyCodec_LookupError(errors);
491        if (handler == NULL) {
492            return -1;
493        }
494        Py_DECREF(handler);
495    }
496    return 0;
497}
498
499
/* Verify the internal invariants of the Unicode object `op`.

   The state flags (kind, compact, ascii, ready, interned) are checked
   against the data, utf8 and wstr pointers and their cached lengths.  If
   `check_content` is non-zero, the characters are also scanned (O(n)) to
   verify that the narrowest possible kind is used and that the buffer is
   terminated by a null character.

   Return 1 when every check passed.  A failed check is reported through
   _PyObject_ASSERT_FAILED_MSG() with the text of the failing expression. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    unsigned int kind = ascii->state.kind;

    /* Compact ASCII string: 1 byte per character and always ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII string: the characters are stored right
               after the PyCompactUnicodeObject header and the UTF-8 buffer
               is never that memory. */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready: only the wstr buffer exists; length, hash,
                   flags, data and utf8 are still in their initial state. */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact string: the characters live in a
                   separate data buffer. */
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                /* An ASCII string shares its data buffer as UTF-8
                   representation; other strings must not. */
                if (ascii->state.ascii) {
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* wstr must be shared with the data exactly when the kind has the
           same width as wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* A missing buffer must have a zero cached length. */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        /* find the largest code point of the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest code point must need exactly this kind (and match
           the ascii flag for 1-byte strings) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be a null character */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
616
617
618static PyObject*
619unicode_result_wchar(PyObject *unicode)
620{
621#ifndef Py_DEBUG
622    Py_ssize_t len;
623
624    len = _PyUnicode_WSTR_LENGTH(unicode);
625    if (len == 0) {
626        Py_DECREF(unicode);
627        _Py_RETURN_UNICODE_EMPTY();
628    }
629
630    if (len == 1) {
631        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
632        if ((Py_UCS4)ch < 256) {
633            Py_DECREF(unicode);
634            return get_latin1_char((unsigned char)ch);
635        }
636    }
637
638    if (_PyUnicode_Ready(unicode) < 0) {
639        Py_DECREF(unicode);
640        return NULL;
641    }
642#else
643    assert(Py_REFCNT(unicode) == 1);
644
645    /* don't make the result ready in debug mode to ensure that the caller
646       makes the string ready before using it */
647    assert(_PyUnicode_CheckConsistency(unicode, 1));
648#endif
649    return unicode;
650}
651
652static PyObject*
653unicode_result_ready(PyObject *unicode)
654{
655    Py_ssize_t length;
656
657    length = PyUnicode_GET_LENGTH(unicode);
658    if (length == 0) {
659        PyObject *empty = unicode_get_empty();
660        if (unicode != empty) {
661            Py_DECREF(unicode);
662            Py_INCREF(empty);
663        }
664        return empty;
665    }
666
667    if (length == 1) {
668        int kind = PyUnicode_KIND(unicode);
669        if (kind == PyUnicode_1BYTE_KIND) {
670            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
671            Py_UCS1 ch = data[0];
672            PyObject *latin1_char = LATIN1(ch);
673            if (unicode != latin1_char) {
674                Py_INCREF(latin1_char);
675                Py_DECREF(unicode);
676            }
677            return latin1_char;
678        }
679    }
680
681    assert(_PyUnicode_CheckConsistency(unicode, 1));
682    return unicode;
683}
684
685static PyObject*
686unicode_result(PyObject *unicode)
687{
688    assert(_PyUnicode_CHECK(unicode));
689    if (PyUnicode_IS_READY(unicode))
690        return unicode_result_ready(unicode);
691    else
692        return unicode_result_wchar(unicode);
693}
694
695static PyObject*
696unicode_result_unchanged(PyObject *unicode)
697{
698    if (PyUnicode_CheckExact(unicode)) {
699        if (PyUnicode_READY(unicode) == -1)
700            return NULL;
701        Py_INCREF(unicode);
702        return unicode;
703    }
704    else
705        /* Subtype -- return genuine unicode string with the same value. */
706        return _PyUnicode_Copy(unicode);
707}
708
709/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
710   ASCII, Latin1, UTF-8, etc. */
711static char*
712backslashreplace(_PyBytesWriter *writer, char *str,
713                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
714{
715    Py_ssize_t size, i;
716    Py_UCS4 ch;
717    enum PyUnicode_Kind kind;
718    const void *data;
719
720    assert(PyUnicode_IS_READY(unicode));
721    kind = PyUnicode_KIND(unicode);
722    data = PyUnicode_DATA(unicode);
723
724    size = 0;
725    /* determine replacement size */
726    for (i = collstart; i < collend; ++i) {
727        Py_ssize_t incr;
728
729        ch = PyUnicode_READ(kind, data, i);
730        if (ch < 0x100)
731            incr = 2+2;
732        else if (ch < 0x10000)
733            incr = 2+4;
734        else {
735            assert(ch <= MAX_UNICODE);
736            incr = 2+8;
737        }
738        if (size > PY_SSIZE_T_MAX - incr) {
739            PyErr_SetString(PyExc_OverflowError,
740                            "encoded result is too long for a Python string");
741            return NULL;
742        }
743        size += incr;
744    }
745
746    str = _PyBytesWriter_Prepare(writer, str, size);
747    if (str == NULL)
748        return NULL;
749
750    /* generate replacement */
751    for (i = collstart; i < collend; ++i) {
752        ch = PyUnicode_READ(kind, data, i);
753        *str++ = '\\';
754        if (ch >= 0x00010000) {
755            *str++ = 'U';
756            *str++ = Py_hexdigits[(ch>>28)&0xf];
757            *str++ = Py_hexdigits[(ch>>24)&0xf];
758            *str++ = Py_hexdigits[(ch>>20)&0xf];
759            *str++ = Py_hexdigits[(ch>>16)&0xf];
760            *str++ = Py_hexdigits[(ch>>12)&0xf];
761            *str++ = Py_hexdigits[(ch>>8)&0xf];
762        }
763        else if (ch >= 0x100) {
764            *str++ = 'u';
765            *str++ = Py_hexdigits[(ch>>12)&0xf];
766            *str++ = Py_hexdigits[(ch>>8)&0xf];
767        }
768        else
769            *str++ = 'x';
770        *str++ = Py_hexdigits[(ch>>4)&0xf];
771        *str++ = Py_hexdigits[ch&0xf];
772    }
773    return str;
774}
775
776/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
777   ASCII, Latin1, UTF-8, etc. */
778static char*
779xmlcharrefreplace(_PyBytesWriter *writer, char *str,
780                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781{
782    Py_ssize_t size, i;
783    Py_UCS4 ch;
784    enum PyUnicode_Kind kind;
785    const void *data;
786
787    assert(PyUnicode_IS_READY(unicode));
788    kind = PyUnicode_KIND(unicode);
789    data = PyUnicode_DATA(unicode);
790
791    size = 0;
792    /* determine replacement size */
793    for (i = collstart; i < collend; ++i) {
794        Py_ssize_t incr;
795
796        ch = PyUnicode_READ(kind, data, i);
797        if (ch < 10)
798            incr = 2+1+1;
799        else if (ch < 100)
800            incr = 2+2+1;
801        else if (ch < 1000)
802            incr = 2+3+1;
803        else if (ch < 10000)
804            incr = 2+4+1;
805        else if (ch < 100000)
806            incr = 2+5+1;
807        else if (ch < 1000000)
808            incr = 2+6+1;
809        else {
810            assert(ch <= MAX_UNICODE);
811            incr = 2+7+1;
812        }
813        if (size > PY_SSIZE_T_MAX - incr) {
814            PyErr_SetString(PyExc_OverflowError,
815                            "encoded result is too long for a Python string");
816            return NULL;
817        }
818        size += incr;
819    }
820
821    str = _PyBytesWriter_Prepare(writer, str, size);
822    if (str == NULL)
823        return NULL;
824
825    /* generate replacement */
826    for (i = collstart; i < collend; ++i) {
827        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
828        if (size < 0) {
829            return NULL;
830        }
831        str += size;
832    }
833    return str;
834}
835
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */

/* BLOOM_WIDTH: number of bit positions used by a mask, the largest of 128,
   64 and 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a mask. */
#define BLOOM_MASK unsigned long

/* Initialized with every bit set: with this value BLOOM() is non-zero for
   any character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit of character ch is set in mask.  A zero result means
   the character was not added to the mask; a non-zero result may be a
   collision and has to be confirmed by an exact test. */
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: ASCII characters are looked up in ascii_linebreak; other
   characters are first filtered with bloom_linebreak and then confirmed with
   Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
863
864static inline BLOOM_MASK
865make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
866{
867#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
868    do {                                               \
869        TYPE *data = (TYPE *)PTR;                      \
870        TYPE *end = data + LEN;                        \
871        Py_UCS4 ch;                                    \
872        for (; data != end; data++) {                  \
873            ch = *data;                                \
874            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875        }                                              \
876        break;                                         \
877    } while (0)
878
879    /* calculate simple bloom-style bitmask for a given unicode string */
880
881    BLOOM_MASK mask;
882
883    mask = 0;
884    switch (kind) {
885    case PyUnicode_1BYTE_KIND:
886        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887        break;
888    case PyUnicode_2BYTE_KIND:
889        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890        break;
891    case PyUnicode_4BYTE_KIND:
892        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893        break;
894    default:
895        Py_UNREACHABLE();
896    }
897    return mask;
898
899#undef BLOOM_UPDATE
900}
901
902static int
903ensure_unicode(PyObject *obj)
904{
905    if (!PyUnicode_Check(obj)) {
906        PyErr_Format(PyExc_TypeError,
907                     "must be str, not %.100s",
908                     Py_TYPE(obj)->tp_name);
909        return -1;
910    }
911    return PyUnicode_READY(obj);
912}
913
914/* Compilation of templated routines */
915
916#define STRINGLIB_GET_EMPTY() unicode_get_empty()
917
918#include "stringlib/asciilib.h"
919#include "stringlib/fastsearch.h"
920#include "stringlib/partition.h"
921#include "stringlib/split.h"
922#include "stringlib/count.h"
923#include "stringlib/find.h"
924#include "stringlib/find_max_char.h"
925#include "stringlib/undef.h"
926
927#include "stringlib/ucs1lib.h"
928#include "stringlib/fastsearch.h"
929#include "stringlib/partition.h"
930#include "stringlib/split.h"
931#include "stringlib/count.h"
932#include "stringlib/find.h"
933#include "stringlib/replace.h"
934#include "stringlib/find_max_char.h"
935#include "stringlib/undef.h"
936
937#include "stringlib/ucs2lib.h"
938#include "stringlib/fastsearch.h"
939#include "stringlib/partition.h"
940#include "stringlib/split.h"
941#include "stringlib/count.h"
942#include "stringlib/find.h"
943#include "stringlib/replace.h"
944#include "stringlib/find_max_char.h"
945#include "stringlib/undef.h"
946
947#include "stringlib/ucs4lib.h"
948#include "stringlib/fastsearch.h"
949#include "stringlib/partition.h"
950#include "stringlib/split.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
953#include "stringlib/replace.h"
954#include "stringlib/find_max_char.h"
955#include "stringlib/undef.h"
956
957_Py_COMP_DIAG_PUSH
958_Py_COMP_DIAG_IGNORE_DEPR_DECLS
959#include "stringlib/unicodedefs.h"
960#include "stringlib/fastsearch.h"
961#include "stringlib/count.h"
962#include "stringlib/find.h"
963#include "stringlib/undef.h"
964_Py_COMP_DIAG_POP
965
966#undef STRINGLIB_GET_EMPTY
967
968/* --- Unicode Object ----------------------------------------------------- */
969
970static inline Py_ssize_t
971findchar(const void *s, int kind,
972         Py_ssize_t size, Py_UCS4 ch,
973         int direction)
974{
975    switch (kind) {
976    case PyUnicode_1BYTE_KIND:
977        if ((Py_UCS1) ch != ch)
978            return -1;
979        if (direction > 0)
980            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
981        else
982            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
983    case PyUnicode_2BYTE_KIND:
984        if ((Py_UCS2) ch != ch)
985            return -1;
986        if (direction > 0)
987            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
988        else
989            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
990    case PyUnicode_4BYTE_KIND:
991        if (direction > 0)
992            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
993        else
994            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
995    default:
996        Py_UNREACHABLE();
997    }
998}
999
1000#ifdef Py_DEBUG
1001/* Fill the data of a Unicode string with invalid characters to detect bugs
1002   earlier.
1003
1004   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1005   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1006   invalid character in Unicode 6.0. */
1007static void
1008unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1009{
1010    int kind = PyUnicode_KIND(unicode);
1011    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1012    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1013    if (length <= old_length)
1014        return;
1015    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1016}
1017#endif
1018
/* Resize a compact string to `length` characters by reallocating the
   object itself (header and character data live in one allocation).

   The string must be modifiable, ready and compact (asserted).

   Returns the resized object, which may be located at a different address
   than `unicode` because of the realloc, so the caller must use the
   returned pointer.  On failure a MemoryError is set and NULL is returned. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    /* Remembered so the newly added characters can be poisoned below. */
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value is used as the size of one character in bytes. */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Sample this before the realloc: the answer is needed afterwards to
       decide how the wstr pointer has to be fixed up. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Reject lengths for which header + (length + 1) characters would
       overflow Py_ssize_t. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* +1 character for the trailing null written at the end. */
    new_size = (struct_size + (length + 1) * char_size);

    /* Drop a separately allocated UTF-8 buffer, if any. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): the object is unregistered from the reference debugging
       bookkeeping here and registered again with _Py_NewReference() on both
       the success and the failure path below -- presumably because the
       realloc may move it. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* Realloc failed: `unicode` is still the live object. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: re-point it at the data of the
           (possibly moved) object. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* A separately allocated wstr buffer is released and cleared. */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Null-terminate the character data. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1089
1090static int
1091resize_inplace(PyObject *unicode, Py_ssize_t length)
1092{
1093    wchar_t *wstr;
1094    Py_ssize_t new_size;
1095    assert(!PyUnicode_IS_COMPACT(unicode));
1096    assert(Py_REFCNT(unicode) == 1);
1097
1098    if (PyUnicode_IS_READY(unicode)) {
1099        Py_ssize_t char_size;
1100        int share_wstr, share_utf8;
1101        void *data;
1102#ifdef Py_DEBUG
1103        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1104#endif
1105
1106        data = _PyUnicode_DATA_ANY(unicode);
1107        char_size = PyUnicode_KIND(unicode);
1108        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1109        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1110
1111        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1112            PyErr_NoMemory();
1113            return -1;
1114        }
1115        new_size = (length + 1) * char_size;
1116
1117        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1118        {
1119            PyObject_Free(_PyUnicode_UTF8(unicode));
1120            _PyUnicode_UTF8(unicode) = NULL;
1121            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1122        }
1123
1124        data = (PyObject *)PyObject_Realloc(data, new_size);
1125        if (data == NULL) {
1126            PyErr_NoMemory();
1127            return -1;
1128        }
1129        _PyUnicode_DATA_ANY(unicode) = data;
1130        if (share_wstr) {
1131            _PyUnicode_WSTR(unicode) = data;
1132            _PyUnicode_WSTR_LENGTH(unicode) = length;
1133        }
1134        if (share_utf8) {
1135            _PyUnicode_UTF8(unicode) = data;
1136            _PyUnicode_UTF8_LENGTH(unicode) = length;
1137        }
1138        _PyUnicode_LENGTH(unicode) = length;
1139        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1140#ifdef Py_DEBUG
1141        unicode_fill_invalid(unicode, old_length);
1142#endif
1143        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1144            assert(_PyUnicode_CheckConsistency(unicode, 0));
1145            return 0;
1146        }
1147    }
1148    assert(_PyUnicode_WSTR(unicode) != NULL);
1149
1150    /* check for integer overflow */
1151    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1152        PyErr_NoMemory();
1153        return -1;
1154    }
1155    new_size = sizeof(wchar_t) * (length + 1);
1156    wstr =  _PyUnicode_WSTR(unicode);
1157    wstr = PyObject_Realloc(wstr, new_size);
1158    if (!wstr) {
1159        PyErr_NoMemory();
1160        return -1;
1161    }
1162    _PyUnicode_WSTR(unicode) = wstr;
1163    _PyUnicode_WSTR(unicode)[length] = 0;
1164    _PyUnicode_WSTR_LENGTH(unicode) = length;
1165    assert(_PyUnicode_CheckConsistency(unicode, 0));
1166    return 0;
1167}
1168
1169static PyObject*
1170resize_copy(PyObject *unicode, Py_ssize_t length)
1171{
1172    Py_ssize_t copy_length;
1173    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1174        PyObject *copy;
1175
1176        assert(PyUnicode_IS_READY(unicode));
1177
1178        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1179        if (copy == NULL)
1180            return NULL;
1181
1182        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1184        return copy;
1185    }
1186    else {
1187        PyObject *w;
1188
1189        w = (PyObject*)_PyUnicode_New(length);
1190        if (w == NULL)
1191            return NULL;
1192        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1193        copy_length = Py_MIN(copy_length, length);
1194        memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1195                  copy_length * sizeof(wchar_t));
1196        return w;
1197    }
1198}
1199
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1208
1209static PyUnicodeObject *
1210_PyUnicode_New(Py_ssize_t length)
1211{
1212    PyUnicodeObject *unicode;
1213    size_t new_size;
1214
1215    /* Optimization for empty strings */
1216    if (length == 0) {
1217        return (PyUnicodeObject *)unicode_new_empty();
1218    }
1219
1220    /* Ensure we won't overflow the size. */
1221    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1222        return (PyUnicodeObject *)PyErr_NoMemory();
1223    }
1224    if (length < 0) {
1225        PyErr_SetString(PyExc_SystemError,
1226                        "Negative size passed to _PyUnicode_New");
1227        return NULL;
1228    }
1229
1230    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1231    if (unicode == NULL)
1232        return NULL;
1233    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1234
1235    _PyUnicode_WSTR_LENGTH(unicode) = length;
1236    _PyUnicode_HASH(unicode) = -1;
1237    _PyUnicode_STATE(unicode).interned = 0;
1238    _PyUnicode_STATE(unicode).kind = 0;
1239    _PyUnicode_STATE(unicode).compact = 0;
1240    _PyUnicode_STATE(unicode).ready = 0;
1241    _PyUnicode_STATE(unicode).ascii = 0;
1242    _PyUnicode_DATA_ANY(unicode) = NULL;
1243    _PyUnicode_LENGTH(unicode) = 0;
1244    _PyUnicode_UTF8(unicode) = NULL;
1245    _PyUnicode_UTF8_LENGTH(unicode) = 0;
1246
1247    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1248    if (!_PyUnicode_WSTR(unicode)) {
1249        Py_DECREF(unicode);
1250        PyErr_NoMemory();
1251        return NULL;
1252    }
1253
1254    /* Initialize the first element to guard against cases where
1255     * the caller fails before initializing str -- unicode_resize()
1256     * reads str[0], and the Keep-Alive optimization can keep memory
1257     * allocated for str alive across a call to unicode_dealloc(unicode).
1258     * We don't want unicode_resize to read uninitialized memory in
1259     * that case.
1260     */
1261    _PyUnicode_WSTR(unicode)[0] = 0;
1262    _PyUnicode_WSTR(unicode)[length] = 0;
1263
1264    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1265    return unicode;
1266}
1267
1268static const char*
1269unicode_kind_name(PyObject *unicode)
1270{
1271    /* don't check consistency: unicode_kind_name() is called from
1272       _PyUnicode_Dump() */
1273    if (!PyUnicode_IS_COMPACT(unicode))
1274    {
1275        if (!PyUnicode_IS_READY(unicode))
1276            return "wstr";
1277        switch (PyUnicode_KIND(unicode))
1278        {
1279        case PyUnicode_1BYTE_KIND:
1280            if (PyUnicode_IS_ASCII(unicode))
1281                return "legacy ascii";
1282            else
1283                return "legacy latin1";
1284        case PyUnicode_2BYTE_KIND:
1285            return "legacy UCS2";
1286        case PyUnicode_4BYTE_KIND:
1287            return "legacy UCS4";
1288        default:
1289            return "<legacy invalid kind>";
1290        }
1291    }
1292    assert(PyUnicode_IS_READY(unicode));
1293    switch (PyUnicode_KIND(unicode)) {
1294    case PyUnicode_1BYTE_KIND:
1295        if (PyUnicode_IS_ASCII(unicode))
1296            return "ascii";
1297        else
1298            return "latin1";
1299    case PyUnicode_2BYTE_KIND:
1300        return "UCS2";
1301    case PyUnicode_4BYTE_KIND:
1302        return "UCS4";
1303    default:
1304        return "<invalid compact kind>";
1305    }
1306}
1307
1308#ifdef Py_DEBUG
1309/* Functions wrapping macros for use in debugger */
1310const char *_PyUnicode_utf8(void *unicode_raw){
1311    PyObject *unicode = _PyObject_CAST(unicode_raw);
1312    return PyUnicode_UTF8(unicode);
1313}
1314
1315const void *_PyUnicode_compact_data(void *unicode_raw) {
1316    PyObject *unicode = _PyObject_CAST(unicode_raw);
1317    return _PyUnicode_COMPACT_DATA(unicode);
1318}
1319const void *_PyUnicode_data(void *unicode_raw) {
1320    PyObject *unicode = _PyObject_CAST(unicode_raw);
1321    printf("obj %p\n", (void*)unicode);
1322    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327    return PyUnicode_DATA(unicode);
1328}
1329
1330void
1331_PyUnicode_Dump(PyObject *op)
1332{
1333    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1334    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1335    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1336    const void *data;
1337
1338    if (ascii->state.compact)
1339    {
1340        if (ascii->state.ascii)
1341            data = (ascii + 1);
1342        else
1343            data = (compact + 1);
1344    }
1345    else
1346        data = unicode->data.any;
1347    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1348
1349    if (ascii->wstr == data)
1350        printf("shared ");
1351    printf("wstr=%p", (void *)ascii->wstr);
1352
1353    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1354        printf(" (%zu), ", compact->wstr_length);
1355        if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1356            printf("shared ");
1357        }
1358        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1359    }
1360    printf(", data=%p\n", data);
1361}
1362#endif
1363
1364
1365PyObject *
1366PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1367{
1368    /* Optimization for empty strings */
1369    if (size == 0) {
1370        return unicode_new_empty();
1371    }
1372
1373    PyObject *obj;
1374    PyCompactUnicodeObject *unicode;
1375    void *data;
1376    enum PyUnicode_Kind kind;
1377    int is_sharing, is_ascii;
1378    Py_ssize_t char_size;
1379    Py_ssize_t struct_size;
1380
1381    is_ascii = 0;
1382    is_sharing = 0;
1383    struct_size = sizeof(PyCompactUnicodeObject);
1384    if (maxchar < 128) {
1385        kind = PyUnicode_1BYTE_KIND;
1386        char_size = 1;
1387        is_ascii = 1;
1388        struct_size = sizeof(PyASCIIObject);
1389    }
1390    else if (maxchar < 256) {
1391        kind = PyUnicode_1BYTE_KIND;
1392        char_size = 1;
1393    }
1394    else if (maxchar < 65536) {
1395        kind = PyUnicode_2BYTE_KIND;
1396        char_size = 2;
1397        if (sizeof(wchar_t) == 2)
1398            is_sharing = 1;
1399    }
1400    else {
1401        if (maxchar > MAX_UNICODE) {
1402            PyErr_SetString(PyExc_SystemError,
1403                            "invalid maximum character passed to PyUnicode_New");
1404            return NULL;
1405        }
1406        kind = PyUnicode_4BYTE_KIND;
1407        char_size = 4;
1408        if (sizeof(wchar_t) == 4)
1409            is_sharing = 1;
1410    }
1411
1412    /* Ensure we won't overflow the size. */
1413    if (size < 0) {
1414        PyErr_SetString(PyExc_SystemError,
1415                        "Negative size passed to PyUnicode_New");
1416        return NULL;
1417    }
1418    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1419        return PyErr_NoMemory();
1420
1421    /* Duplicated allocation code from _PyObject_New() instead of a call to
1422     * PyObject_New() so we are able to allocate space for the object and
1423     * it's data buffer.
1424     */
1425    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1426    if (obj == NULL) {
1427        return PyErr_NoMemory();
1428    }
1429    _PyObject_Init(obj, &PyUnicode_Type);
1430
1431    unicode = (PyCompactUnicodeObject *)obj;
1432    if (is_ascii)
1433        data = ((PyASCIIObject*)obj) + 1;
1434    else
1435        data = unicode + 1;
1436    _PyUnicode_LENGTH(unicode) = size;
1437    _PyUnicode_HASH(unicode) = -1;
1438    _PyUnicode_STATE(unicode).interned = 0;
1439    _PyUnicode_STATE(unicode).kind = kind;
1440    _PyUnicode_STATE(unicode).compact = 1;
1441    _PyUnicode_STATE(unicode).ready = 1;
1442    _PyUnicode_STATE(unicode).ascii = is_ascii;
1443    if (is_ascii) {
1444        ((char*)data)[size] = 0;
1445        _PyUnicode_WSTR(unicode) = NULL;
1446    }
1447    else if (kind == PyUnicode_1BYTE_KIND) {
1448        ((char*)data)[size] = 0;
1449        _PyUnicode_WSTR(unicode) = NULL;
1450        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451        unicode->utf8 = NULL;
1452        unicode->utf8_length = 0;
1453    }
1454    else {
1455        unicode->utf8 = NULL;
1456        unicode->utf8_length = 0;
1457        if (kind == PyUnicode_2BYTE_KIND)
1458            ((Py_UCS2*)data)[size] = 0;
1459        else /* kind == PyUnicode_4BYTE_KIND */
1460            ((Py_UCS4*)data)[size] = 0;
1461        if (is_sharing) {
1462            _PyUnicode_WSTR_LENGTH(unicode) = size;
1463            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1464        }
1465        else {
1466            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467            _PyUnicode_WSTR(unicode) = NULL;
1468        }
1469    }
1470#ifdef Py_DEBUG
1471    unicode_fill_invalid((PyObject*)unicode, 0);
1472#endif
1473    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1474    return obj;
1475}
1476
1477#if SIZEOF_WCHAR_T == 2
1478/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1479   will decode surrogate pairs, the other conversions are implemented as macros
1480   for efficiency.
1481
1482   This function assumes that unicode can hold one more code point than wstr
1483   characters for a terminating null character. */
1484static void
1485unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1486                              PyObject *unicode)
1487{
1488    const wchar_t *iter;
1489    Py_UCS4 *ucs4_out;
1490
1491    assert(unicode != NULL);
1492    assert(_PyUnicode_CHECK(unicode));
1493    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1494    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1495
1496    for (iter = begin; iter < end; ) {
1497        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1498                           _PyUnicode_GET_LENGTH(unicode)));
1499        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1500            && (iter+1) < end
1501            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1502        {
1503            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1504            iter += 2;
1505        }
1506        else {
1507            *ucs4_out++ = *iter;
1508            iter++;
1509        }
1510    }
1511    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1512                        _PyUnicode_GET_LENGTH(unicode)));
1513
1514}
1515#endif
1516
1517static int
1518unicode_check_modifiable(PyObject *unicode)
1519{
1520    if (!unicode_modifiable(unicode)) {
1521        PyErr_SetString(PyExc_SystemError,
1522                        "Cannot modify a string currently used");
1523        return -1;
1524    }
1525    return 0;
1526}
1527
1528static int
1529_copy_characters(PyObject *to, Py_ssize_t to_start,
1530                 PyObject *from, Py_ssize_t from_start,
1531                 Py_ssize_t how_many, int check_maxchar)
1532{
1533    unsigned int from_kind, to_kind;
1534    const void *from_data;
1535    void *to_data;
1536
1537    assert(0 <= how_many);
1538    assert(0 <= from_start);
1539    assert(0 <= to_start);
1540    assert(PyUnicode_Check(from));
1541    assert(PyUnicode_IS_READY(from));
1542    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1543
1544    assert(PyUnicode_Check(to));
1545    assert(PyUnicode_IS_READY(to));
1546    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1547
1548    if (how_many == 0)
1549        return 0;
1550
1551    from_kind = PyUnicode_KIND(from);
1552    from_data = PyUnicode_DATA(from);
1553    to_kind = PyUnicode_KIND(to);
1554    to_data = PyUnicode_DATA(to);
1555
1556#ifdef Py_DEBUG
1557    if (!check_maxchar
1558        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1559    {
1560        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1561        Py_UCS4 ch;
1562        Py_ssize_t i;
1563        for (i=0; i < how_many; i++) {
1564            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1565            assert(ch <= to_maxchar);
1566        }
1567    }
1568#endif
1569
1570    if (from_kind == to_kind) {
1571        if (check_maxchar
1572            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1573        {
1574            /* Writing Latin-1 characters into an ASCII string requires to
1575               check that all written characters are pure ASCII */
1576            Py_UCS4 max_char;
1577            max_char = ucs1lib_find_max_char(from_data,
1578                                             (const Py_UCS1*)from_data + how_many);
1579            if (max_char >= 128)
1580                return -1;
1581        }
1582        memcpy((char*)to_data + to_kind * to_start,
1583                  (const char*)from_data + from_kind * from_start,
1584                  to_kind * how_many);
1585    }
1586    else if (from_kind == PyUnicode_1BYTE_KIND
1587             && to_kind == PyUnicode_2BYTE_KIND)
1588    {
1589        _PyUnicode_CONVERT_BYTES(
1590            Py_UCS1, Py_UCS2,
1591            PyUnicode_1BYTE_DATA(from) + from_start,
1592            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1593            PyUnicode_2BYTE_DATA(to) + to_start
1594            );
1595    }
1596    else if (from_kind == PyUnicode_1BYTE_KIND
1597             && to_kind == PyUnicode_4BYTE_KIND)
1598    {
1599        _PyUnicode_CONVERT_BYTES(
1600            Py_UCS1, Py_UCS4,
1601            PyUnicode_1BYTE_DATA(from) + from_start,
1602            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1603            PyUnicode_4BYTE_DATA(to) + to_start
1604            );
1605    }
1606    else if (from_kind == PyUnicode_2BYTE_KIND
1607             && to_kind == PyUnicode_4BYTE_KIND)
1608    {
1609        _PyUnicode_CONVERT_BYTES(
1610            Py_UCS2, Py_UCS4,
1611            PyUnicode_2BYTE_DATA(from) + from_start,
1612            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1613            PyUnicode_4BYTE_DATA(to) + to_start
1614            );
1615    }
1616    else {
1617        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1618
1619        if (!check_maxchar) {
1620            if (from_kind == PyUnicode_2BYTE_KIND
1621                && to_kind == PyUnicode_1BYTE_KIND)
1622            {
1623                _PyUnicode_CONVERT_BYTES(
1624                    Py_UCS2, Py_UCS1,
1625                    PyUnicode_2BYTE_DATA(from) + from_start,
1626                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627                    PyUnicode_1BYTE_DATA(to) + to_start
1628                    );
1629            }
1630            else if (from_kind == PyUnicode_4BYTE_KIND
1631                     && to_kind == PyUnicode_1BYTE_KIND)
1632            {
1633                _PyUnicode_CONVERT_BYTES(
1634                    Py_UCS4, Py_UCS1,
1635                    PyUnicode_4BYTE_DATA(from) + from_start,
1636                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1637                    PyUnicode_1BYTE_DATA(to) + to_start
1638                    );
1639            }
1640            else if (from_kind == PyUnicode_4BYTE_KIND
1641                     && to_kind == PyUnicode_2BYTE_KIND)
1642            {
1643                _PyUnicode_CONVERT_BYTES(
1644                    Py_UCS4, Py_UCS2,
1645                    PyUnicode_4BYTE_DATA(from) + from_start,
1646                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1647                    PyUnicode_2BYTE_DATA(to) + to_start
1648                    );
1649            }
1650            else {
1651                Py_UNREACHABLE();
1652            }
1653        }
1654        else {
1655            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1656            Py_UCS4 ch;
1657            Py_ssize_t i;
1658
1659            for (i=0; i < how_many; i++) {
1660                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1661                if (ch > to_maxchar)
1662                    return -1;
1663                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1664            }
1665        }
1666    }
1667    return 0;
1668}
1669
1670void
1671_PyUnicode_FastCopyCharacters(
1672    PyObject *to, Py_ssize_t to_start,
1673    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1674{
1675    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1676}
1677
1678Py_ssize_t
1679PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1680                         PyObject *from, Py_ssize_t from_start,
1681                         Py_ssize_t how_many)
1682{
1683    int err;
1684
1685    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1686        PyErr_BadInternalCall();
1687        return -1;
1688    }
1689
1690    if (PyUnicode_READY(from) == -1)
1691        return -1;
1692    if (PyUnicode_READY(to) == -1)
1693        return -1;
1694
1695    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1696        PyErr_SetString(PyExc_IndexError, "string index out of range");
1697        return -1;
1698    }
1699    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1700        PyErr_SetString(PyExc_IndexError, "string index out of range");
1701        return -1;
1702    }
1703    if (how_many < 0) {
1704        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1705        return -1;
1706    }
1707    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1708    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1709        PyErr_Format(PyExc_SystemError,
1710                     "Cannot write %zi characters at %zi "
1711                     "in a string of %zi characters",
1712                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1713        return -1;
1714    }
1715
1716    if (how_many == 0)
1717        return 0;
1718
1719    if (unicode_check_modifiable(to))
1720        return -1;
1721
1722    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1723    if (err) {
1724        PyErr_Format(PyExc_SystemError,
1725                     "Cannot copy %s characters "
1726                     "into a string of %s characters",
1727                     unicode_kind_name(from),
1728                     unicode_kind_name(to));
1729        return -1;
1730    }
1731    return how_many;
1732}
1733
1734/* Find the maximum code point and count the number of surrogate pairs so a
1735   correct string length can be computed before converting a string to UCS4.
1736   This function counts single surrogates as a character and not as a pair.
1737
1738   Return 0 on success, or -1 on error. */
1739static int
1740find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1741                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1742{
1743    const wchar_t *iter;
1744    Py_UCS4 ch;
1745
1746    assert(num_surrogates != NULL && maxchar != NULL);
1747    *num_surrogates = 0;
1748    *maxchar = 0;
1749
1750    for (iter = begin; iter < end; ) {
1751#if SIZEOF_WCHAR_T == 2
1752        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1753            && (iter+1) < end
1754            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1755        {
1756            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1757            ++(*num_surrogates);
1758            iter += 2;
1759        }
1760        else
1761#endif
1762        {
1763            ch = *iter;
1764            iter++;
1765        }
1766        if (ch > *maxchar) {
1767            *maxchar = ch;
1768            if (*maxchar > MAX_UNICODE) {
1769                PyErr_Format(PyExc_ValueError,
1770                             "character U+%x is not in range [U+0000; U+%x]",
1771                             ch, MAX_UNICODE);
1772                return -1;
1773            }
1774        }
1775    }
1776    return 0;
1777}
1778
1779int
1780_PyUnicode_Ready(PyObject *unicode)
1781{
1782    wchar_t *end;
1783    Py_UCS4 maxchar = 0;
1784    Py_ssize_t num_surrogates;
1785#if SIZEOF_WCHAR_T == 2
1786    Py_ssize_t length_wo_surrogates;
1787#endif
1788
1789    /* _PyUnicode_Ready() is only intended for old-style API usage where
1790       strings were created using _PyObject_New() and where no canonical
1791       representation (the str field) has been set yet aka strings
1792       which are not yet ready. */
1793    assert(_PyUnicode_CHECK(unicode));
1794    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1795    assert(_PyUnicode_WSTR(unicode) != NULL);
1796    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1797    assert(_PyUnicode_UTF8(unicode) == NULL);
1798    /* Actually, it should neither be interned nor be anything else: */
1799    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1800
1801    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1802    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1803                                &maxchar, &num_surrogates) == -1)
1804        return -1;
1805
1806    if (maxchar < 256) {
1807        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1808        if (!_PyUnicode_DATA_ANY(unicode)) {
1809            PyErr_NoMemory();
1810            return -1;
1811        }
1812        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1813                                _PyUnicode_WSTR(unicode), end,
1814                                PyUnicode_1BYTE_DATA(unicode));
1815        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1816        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1817        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1818        if (maxchar < 128) {
1819            _PyUnicode_STATE(unicode).ascii = 1;
1820            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1821            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1822        }
1823        else {
1824            _PyUnicode_STATE(unicode).ascii = 0;
1825            _PyUnicode_UTF8(unicode) = NULL;
1826            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1827        }
1828        PyObject_Free(_PyUnicode_WSTR(unicode));
1829        _PyUnicode_WSTR(unicode) = NULL;
1830        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1831    }
1832    /* In this case we might have to convert down from 4-byte native
1833       wchar_t to 2-byte unicode. */
1834    else if (maxchar < 65536) {
1835        assert(num_surrogates == 0 &&
1836               "FindMaxCharAndNumSurrogatePairs() messed up");
1837
1838#if SIZEOF_WCHAR_T == 2
1839        /* We can share representations and are done. */
1840        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1841        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1842        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1843        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1844        _PyUnicode_UTF8(unicode) = NULL;
1845        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1846#else
1847        /* sizeof(wchar_t) == 4 */
1848        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1849            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1850        if (!_PyUnicode_DATA_ANY(unicode)) {
1851            PyErr_NoMemory();
1852            return -1;
1853        }
1854        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1855                                _PyUnicode_WSTR(unicode), end,
1856                                PyUnicode_2BYTE_DATA(unicode));
1857        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1858        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1859        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1860        _PyUnicode_UTF8(unicode) = NULL;
1861        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1862        PyObject_Free(_PyUnicode_WSTR(unicode));
1863        _PyUnicode_WSTR(unicode) = NULL;
1864        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1865#endif
1866    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1868    else {
1869#if SIZEOF_WCHAR_T == 2
1870        /* in case the native representation is 2-bytes, we need to allocate a
1871           new normalized 4-byte version. */
1872        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1873        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1874            PyErr_NoMemory();
1875            return -1;
1876        }
1877        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
1878        if (!_PyUnicode_DATA_ANY(unicode)) {
1879            PyErr_NoMemory();
1880            return -1;
1881        }
1882        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1883        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1884        _PyUnicode_UTF8(unicode) = NULL;
1885        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1886        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1887        _PyUnicode_STATE(unicode).ready = 1;
1888        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1889        PyObject_Free(_PyUnicode_WSTR(unicode));
1890        _PyUnicode_WSTR(unicode) = NULL;
1891        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1892#else
1893        assert(num_surrogates == 0);
1894
1895        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1896        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897        _PyUnicode_UTF8(unicode) = NULL;
1898        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1899        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1900#endif
1901        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1902    }
1903    _PyUnicode_STATE(unicode).ready = 1;
1904    assert(_PyUnicode_CheckConsistency(unicode, 1));
1905    return 0;
1906}
1907
/* Destroy a Unicode object.

   A mortal interned string is first removed from the `interned` dict.
   Then the wstr buffer, the utf8 buffer and (for non-compact strings) the
   character data buffer are freed when the corresponding ownership check
   passes, and finally the object itself is released through the tp_free
   slot of its type. */
static void
unicode_dealloc(PyObject *unicode)
{
#ifdef Py_DEBUG
    /* Debug builds only: a singleton reaching this function while the
       unicode machinery is not finalizing is reported as a fatal
       refcount error. */
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
        _Py_FatalRefcountError("deallocating an Unicode singleton");
    }
#endif

    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        /* Nothing to unregister. */
        break;
    case SSTATE_INTERNED_MORTAL:
    {
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
           references (key and value) which were ignored by
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
           to prevent calling unicode_dealloc() again. Adjust refcnt after
           PyDict_DelItem(). */
        assert(Py_REFCNT(unicode) == 0);
        Py_SET_REFCNT(unicode, 3);
        if (PyDict_DelItem(interned, unicode) != 0) {
            /* A deallocator cannot propagate an error; report it as
               unraisable and carry on. */
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                      NULL);
        }
        assert(Py_REFCNT(unicode) == 1);
        /* Put the refcount back to zero before the memory is released. */
        Py_SET_REFCNT(unicode, 0);
        break;
    }

    case SSTATE_INTERNED_IMMORTAL:
        /* Immortal interned strings are never expected to get here. */
        _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
        break;

    default:
        Py_UNREACHABLE();
    }

    /* Free the wstr buffer only when the object has wstr memory of its
       own (as decided by _PyUnicode_HAS_WSTR_MEMORY). */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
    }
    /* Same for the utf8 buffer. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
    }
    /* Only non-compact strings have a separately freed data buffer. */
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1958
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the shared empty string or one of the shared
   one-character Latin-1 strings, 0 otherwise. Debug builds only. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *base = _PyASCIIObject_CAST(unicode);
    /* Only a ready (non-wchar kind) string of length 1 can be one of the
       Latin-1 singletons. */
    if (base->state.kind == PyUnicode_WCHAR_KIND || base->length != 1) {
        return 0;
    }

    Py_UCS4 only_char = PyUnicode_READ_CHAR(unicode, 0);
    if (only_char >= 256) {
        return 0;
    }
    /* Identity comparison: an equal but distinct object is not the
       singleton. */
    return (LATIN1(only_char) == unicode) ? 1 : 0;
}
#endif
1977
1978static int
1979unicode_modifiable(PyObject *unicode)
1980{
1981    assert(_PyUnicode_CHECK(unicode));
1982    if (Py_REFCNT(unicode) != 1)
1983        return 0;
1984    if (_PyUnicode_HASH(unicode) != -1)
1985        return 0;
1986    if (PyUnicode_CHECK_INTERNED(unicode))
1987        return 0;
1988    if (!PyUnicode_CheckExact(unicode))
1989        return 0;
1990#ifdef Py_DEBUG
1991    /* singleton refcount is greater than 1 */
1992    assert(!unicode_is_singleton(unicode));
1993#endif
1994    return 1;
1995}
1996
1997static int
1998unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1999{
2000    PyObject *unicode;
2001    Py_ssize_t old_length;
2002
2003    assert(p_unicode != NULL);
2004    unicode = *p_unicode;
2005
2006    assert(unicode != NULL);
2007    assert(PyUnicode_Check(unicode));
2008    assert(0 <= length);
2009
2010    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011        old_length = PyUnicode_WSTR_LENGTH(unicode);
2012    else
2013        old_length = PyUnicode_GET_LENGTH(unicode);
2014    if (old_length == length)
2015        return 0;
2016
2017    if (length == 0) {
2018        PyObject *empty = unicode_new_empty();
2019        Py_SETREF(*p_unicode, empty);
2020        return 0;
2021    }
2022
2023    if (!unicode_modifiable(unicode)) {
2024        PyObject *copy = resize_copy(unicode, length);
2025        if (copy == NULL)
2026            return -1;
2027        Py_SETREF(*p_unicode, copy);
2028        return 0;
2029    }
2030
2031    if (PyUnicode_IS_COMPACT(unicode)) {
2032        PyObject *new_unicode = resize_compact(unicode, length);
2033        if (new_unicode == NULL)
2034            return -1;
2035        *p_unicode = new_unicode;
2036        return 0;
2037    }
2038    return resize_inplace(unicode, length);
2039}
2040
2041int
2042PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2043{
2044    PyObject *unicode;
2045    if (p_unicode == NULL) {
2046        PyErr_BadInternalCall();
2047        return -1;
2048    }
2049    unicode = *p_unicode;
2050    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2051    {
2052        PyErr_BadInternalCall();
2053        return -1;
2054    }
2055    return unicode_resize(p_unicode, length);
2056}
2057
2058/* Copy an ASCII or latin1 char* string into a Python Unicode string.
2059
2060   WARNING: The function doesn't copy the terminating null character and
2061   doesn't check the maximum character (may write a latin1 character in an
2062   ASCII string). */
2063static void
2064unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2065                   const char *str, Py_ssize_t len)
2066{
2067    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068    const void *data = PyUnicode_DATA(unicode);
2069    const char *end = str + len;
2070
2071    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2072    switch (kind) {
2073    case PyUnicode_1BYTE_KIND: {
2074#ifdef Py_DEBUG
2075        if (PyUnicode_IS_ASCII(unicode)) {
2076            Py_UCS4 maxchar = ucs1lib_find_max_char(
2077                (const Py_UCS1*)str,
2078                (const Py_UCS1*)str + len);
2079            assert(maxchar < 128);
2080        }
2081#endif
2082        memcpy((char *) data + index, str, len);
2083        break;
2084    }
2085    case PyUnicode_2BYTE_KIND: {
2086        Py_UCS2 *start = (Py_UCS2 *)data + index;
2087        Py_UCS2 *ucs2 = start;
2088
2089        for (; str < end; ++ucs2, ++str)
2090            *ucs2 = (Py_UCS2)*str;
2091
2092        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2093        break;
2094    }
2095    case PyUnicode_4BYTE_KIND: {
2096        Py_UCS4 *start = (Py_UCS4 *)data + index;
2097        Py_UCS4 *ucs4 = start;
2098
2099        for (; str < end; ++ucs4, ++str)
2100            *ucs4 = (Py_UCS4)*str;
2101
2102        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2103        break;
2104    }
2105    default:
2106        Py_UNREACHABLE();
2107    }
2108}
2109
2110static PyObject*
2111get_latin1_char(Py_UCS1 ch)
2112{
2113    return Py_NewRef(LATIN1(ch));
2114}
2115
2116static PyObject*
2117unicode_char(Py_UCS4 ch)
2118{
2119    PyObject *unicode;
2120
2121    assert(ch <= MAX_UNICODE);
2122
2123    if (ch < 256) {
2124        return get_latin1_char(ch);
2125    }
2126
2127    unicode = PyUnicode_New(1, ch);
2128    if (unicode == NULL)
2129        return NULL;
2130
2131    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2134    } else {
2135        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2137    }
2138    assert(_PyUnicode_CheckConsistency(unicode, 1));
2139    return unicode;
2140}
2141
2142PyObject *
2143PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2144{
2145    if (u == NULL) {
2146        if (size > 0) {
2147            if (PyErr_WarnEx(PyExc_DeprecationWarning,
2148                    "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2149                    "use PyUnicode_New() instead", 1) < 0) {
2150                return NULL;
2151            }
2152        }
2153        return (PyObject*)_PyUnicode_New(size);
2154    }
2155
2156    if (size < 0) {
2157        PyErr_BadInternalCall();
2158        return NULL;
2159    }
2160
2161    return PyUnicode_FromWideChar(u, size);
2162}
2163
2164PyObject *
2165PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2166{
2167    PyObject *unicode;
2168    Py_UCS4 maxchar = 0;
2169    Py_ssize_t num_surrogates;
2170
2171    if (u == NULL && size != 0) {
2172        PyErr_BadInternalCall();
2173        return NULL;
2174    }
2175
2176    if (size == -1) {
2177        size = wcslen(u);
2178    }
2179
2180    /* If the Unicode data is known at construction time, we can apply
2181       some optimizations which share commonly used objects. */
2182
2183    /* Optimization for empty strings */
2184    if (size == 0)
2185        _Py_RETURN_UNICODE_EMPTY();
2186
2187#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2188    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2189       non-Unicode locales and hence needs conversion to UCS-4 first. */
2190    if (_Py_LocaleUsesNonUnicodeWchar()) {
2191        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2192        if (!converted) {
2193            return NULL;
2194        }
2195        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2196        PyMem_Free(converted);
2197        return unicode;
2198    }
2199#endif
2200
2201    /* Single character Unicode objects in the Latin-1 range are
2202       shared when using this constructor */
2203    if (size == 1 && (Py_UCS4)*u < 256)
2204        return get_latin1_char((unsigned char)*u);
2205
2206    /* If not empty and not single character, copy the Unicode data
2207       into the new object */
2208    if (find_maxchar_surrogates(u, u + size,
2209                                &maxchar, &num_surrogates) == -1)
2210        return NULL;
2211
2212    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2213    if (!unicode)
2214        return NULL;
2215
2216    switch (PyUnicode_KIND(unicode)) {
2217    case PyUnicode_1BYTE_KIND:
2218        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2219                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
2220        break;
2221    case PyUnicode_2BYTE_KIND:
2222#if Py_UNICODE_SIZE == 2
2223        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2224#else
2225        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2226                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
2227#endif
2228        break;
2229    case PyUnicode_4BYTE_KIND:
2230#if SIZEOF_WCHAR_T == 2
2231        /* This is the only case which has to process surrogates, thus
2232           a simple copy loop is not enough and we need a function. */
2233        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2234#else
2235        assert(num_surrogates == 0);
2236        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2237#endif
2238        break;
2239    default:
2240        Py_UNREACHABLE();
2241    }
2242
2243    return unicode_result(unicode);
2244}
2245
2246PyObject *
2247PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2248{
2249    if (size < 0) {
2250        PyErr_SetString(PyExc_SystemError,
2251                        "Negative size passed to PyUnicode_FromStringAndSize");
2252        return NULL;
2253    }
2254    if (u != NULL) {
2255        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2256    }
2257    else {
2258        if (size > 0) {
2259            if (PyErr_WarnEx(PyExc_DeprecationWarning,
2260                    "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2261                    "use PyUnicode_New() instead", 1) < 0) {
2262                return NULL;
2263            }
2264        }
2265        return (PyObject *)_PyUnicode_New(size);
2266    }
2267}
2268
2269PyObject *
2270PyUnicode_FromString(const char *u)
2271{
2272    size_t size = strlen(u);
2273    if (size > PY_SSIZE_T_MAX) {
2274        PyErr_SetString(PyExc_OverflowError, "input too long");
2275        return NULL;
2276    }
2277    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2278}
2279
2280
2281PyObject *
2282_PyUnicode_FromId(_Py_Identifier *id)
2283{
2284    PyInterpreterState *interp = _PyInterpreterState_GET();
2285    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2286
2287    Py_ssize_t index = _Py_atomic_size_get(&id->index);
2288    if (index < 0) {
2289        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2290
2291        PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2292        // Check again to detect concurrent access. Another thread can have
2293        // initialized the index while this thread waited for the lock.
2294        index = _Py_atomic_size_get(&id->index);
2295        if (index < 0) {
2296            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2297            index = rt_ids->next_index;
2298            rt_ids->next_index++;
2299            _Py_atomic_size_set(&id->index, index);
2300        }
2301        PyThread_release_lock(rt_ids->lock);
2302    }
2303    assert(index >= 0);
2304
2305    PyObject *obj;
2306    if (index < ids->size) {
2307        obj = ids->array[index];
2308        if (obj) {
2309            // Return a borrowed reference
2310            return obj;
2311        }
2312    }
2313
2314    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2315                                       NULL, NULL);
2316    if (!obj) {
2317        return NULL;
2318    }
2319    PyUnicode_InternInPlace(&obj);
2320
2321    if (index >= ids->size) {
2322        // Overallocate to reduce the number of realloc
2323        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2324        Py_ssize_t item_size = sizeof(ids->array[0]);
2325        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2326        if (new_array == NULL) {
2327            PyErr_NoMemory();
2328            return NULL;
2329        }
2330        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2331        ids->array = new_array;
2332        ids->size = new_size;
2333    }
2334
2335    // The array stores a strong reference
2336    ids->array[index] = obj;
2337
2338    // Return a borrowed reference
2339    return obj;
2340}
2341
2342
2343static void
2344unicode_clear_identifiers(struct _Py_unicode_state *state)
2345{
2346    struct _Py_unicode_ids *ids = &state->ids;
2347    for (Py_ssize_t i=0; i < ids->size; i++) {
2348        Py_XDECREF(ids->array[i]);
2349    }
2350    ids->size = 0;
2351    PyMem_Free(ids->array);
2352    ids->array = NULL;
2353    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2354    // after Py_Finalize().
2355}
2356
2357
2358/* Internal function, doesn't check maximum character */
2359
2360PyObject*
2361_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2362{
2363    const unsigned char *s = (const unsigned char *)buffer;
2364    PyObject *unicode;
2365    if (size == 1) {
2366#ifdef Py_DEBUG
2367        assert((unsigned char)s[0] < 128);
2368#endif
2369        return get_latin1_char(s[0]);
2370    }
2371    unicode = PyUnicode_New(size, 127);
2372    if (!unicode)
2373        return NULL;
2374    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375    assert(_PyUnicode_CheckConsistency(unicode, 1));
2376    return unicode;
2377}
2378
2379static Py_UCS4
2380kind_maxchar_limit(unsigned int kind)
2381{
2382    switch (kind) {
2383    case PyUnicode_1BYTE_KIND:
2384        return 0x80;
2385    case PyUnicode_2BYTE_KIND:
2386        return 0x100;
2387    case PyUnicode_4BYTE_KIND:
2388        return 0x10000;
2389    default:
2390        Py_UNREACHABLE();
2391    }
2392}
2393
2394static PyObject*
2395_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2396{
2397    PyObject *res;
2398    unsigned char max_char;
2399
2400    if (size == 0) {
2401        _Py_RETURN_UNICODE_EMPTY();
2402    }
2403    assert(size > 0);
2404    if (size == 1) {
2405        return get_latin1_char(u[0]);
2406    }
2407
2408    max_char = ucs1lib_find_max_char(u, u + size);
2409    res = PyUnicode_New(size, max_char);
2410    if (!res)
2411        return NULL;
2412    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2413    assert(_PyUnicode_CheckConsistency(res, 1));
2414    return res;
2415}
2416
2417static PyObject*
2418_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2419{
2420    PyObject *res;
2421    Py_UCS2 max_char;
2422
2423    if (size == 0)
2424        _Py_RETURN_UNICODE_EMPTY();
2425    assert(size > 0);
2426    if (size == 1)
2427        return unicode_char(u[0]);
2428
2429    max_char = ucs2lib_find_max_char(u, u + size);
2430    res = PyUnicode_New(size, max_char);
2431    if (!res)
2432        return NULL;
2433    if (max_char >= 256)
2434        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2435    else {
2436        _PyUnicode_CONVERT_BYTES(
2437            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2438    }
2439    assert(_PyUnicode_CheckConsistency(res, 1));
2440    return res;
2441}
2442
2443static PyObject*
2444_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2445{
2446    PyObject *res;
2447    Py_UCS4 max_char;
2448
2449    if (size == 0)
2450        _Py_RETURN_UNICODE_EMPTY();
2451    assert(size > 0);
2452    if (size == 1)
2453        return unicode_char(u[0]);
2454
2455    max_char = ucs4lib_find_max_char(u, u + size);
2456    res = PyUnicode_New(size, max_char);
2457    if (!res)
2458        return NULL;
2459    if (max_char < 256)
2460        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2461                                 PyUnicode_1BYTE_DATA(res));
2462    else if (max_char < 0x10000)
2463        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2464                                 PyUnicode_2BYTE_DATA(res));
2465    else
2466        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2467    assert(_PyUnicode_CheckConsistency(res, 1));
2468    return res;
2469}
2470
2471PyObject*
2472PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2473{
2474    if (size < 0) {
2475        PyErr_SetString(PyExc_ValueError, "size must be positive");
2476        return NULL;
2477    }
2478    switch (kind) {
2479    case PyUnicode_1BYTE_KIND:
2480        return _PyUnicode_FromUCS1(buffer, size);
2481    case PyUnicode_2BYTE_KIND:
2482        return _PyUnicode_FromUCS2(buffer, size);
2483    case PyUnicode_4BYTE_KIND:
2484        return _PyUnicode_FromUCS4(buffer, size);
2485    default:
2486        PyErr_SetString(PyExc_SystemError, "invalid kind");
2487        return NULL;
2488    }
2489}
2490
2491Py_UCS4
2492_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2493{
2494    enum PyUnicode_Kind kind;
2495    const void *startptr, *endptr;
2496
2497    assert(PyUnicode_IS_READY(unicode));
2498    assert(0 <= start);
2499    assert(end <= PyUnicode_GET_LENGTH(unicode));
2500    assert(start <= end);
2501
2502    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503        return PyUnicode_MAX_CHAR_VALUE(unicode);
2504
2505    if (start == end)
2506        return 127;
2507
2508    if (PyUnicode_IS_ASCII(unicode))
2509        return 127;
2510
2511    kind = PyUnicode_KIND(unicode);
2512    startptr = PyUnicode_DATA(unicode);
2513    endptr = (char *)startptr + end * kind;
2514    startptr = (char *)startptr + start * kind;
2515    switch(kind) {
2516    case PyUnicode_1BYTE_KIND:
2517        return ucs1lib_find_max_char(startptr, endptr);
2518    case PyUnicode_2BYTE_KIND:
2519        return ucs2lib_find_max_char(startptr, endptr);
2520    case PyUnicode_4BYTE_KIND:
2521        return ucs4lib_find_max_char(startptr, endptr);
2522    default:
2523        Py_UNREACHABLE();
2524    }
2525}
2526
2527/* Ensure that a string uses the most efficient storage, if it is not the
2528   case: create a new string with of the right kind. Write NULL into *p_unicode
2529   on error. */
2530static void
2531unicode_adjust_maxchar(PyObject **p_unicode)
2532{
2533    PyObject *unicode, *copy;
2534    Py_UCS4 max_char;
2535    Py_ssize_t len;
2536    unsigned int kind;
2537
2538    assert(p_unicode != NULL);
2539    unicode = *p_unicode;
2540    assert(PyUnicode_IS_READY(unicode));
2541    if (PyUnicode_IS_ASCII(unicode))
2542        return;
2543
2544    len = PyUnicode_GET_LENGTH(unicode);
2545    kind = PyUnicode_KIND(unicode);
2546    if (kind == PyUnicode_1BYTE_KIND) {
2547        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2548        max_char = ucs1lib_find_max_char(u, u + len);
2549        if (max_char >= 128)
2550            return;
2551    }
2552    else if (kind == PyUnicode_2BYTE_KIND) {
2553        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2554        max_char = ucs2lib_find_max_char(u, u + len);
2555        if (max_char >= 256)
2556            return;
2557    }
2558    else if (kind == PyUnicode_4BYTE_KIND) {
2559        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2560        max_char = ucs4lib_find_max_char(u, u + len);
2561        if (max_char >= 0x10000)
2562            return;
2563    }
2564    else
2565        Py_UNREACHABLE();
2566
2567    copy = PyUnicode_New(len, max_char);
2568    if (copy != NULL)
2569        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570    Py_DECREF(unicode);
2571    *p_unicode = copy;
2572}
2573
2574PyObject*
2575_PyUnicode_Copy(PyObject *unicode)
2576{
2577    Py_ssize_t length;
2578    PyObject *copy;
2579
2580    if (!PyUnicode_Check(unicode)) {
2581        PyErr_BadInternalCall();
2582        return NULL;
2583    }
2584    if (PyUnicode_READY(unicode) == -1)
2585        return NULL;
2586
2587    length = PyUnicode_GET_LENGTH(unicode);
2588    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2589    if (!copy)
2590        return NULL;
2591    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2592
2593    memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594              length * PyUnicode_KIND(unicode));
2595    assert(_PyUnicode_CheckConsistency(copy, 1));
2596    return copy;
2597}
2598
2599
2600/* Widen Unicode objects to larger buffers. Don't write terminating null
2601   character. Return NULL on error. */
2602
2603static void*
2604unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2605{
2606    void *result;
2607
2608    assert(skind < kind);
2609    switch (kind) {
2610    case PyUnicode_2BYTE_KIND:
2611        result = PyMem_New(Py_UCS2, len);
2612        if (!result)
2613            return PyErr_NoMemory();
2614        assert(skind == PyUnicode_1BYTE_KIND);
2615        _PyUnicode_CONVERT_BYTES(
2616            Py_UCS1, Py_UCS2,
2617            (const Py_UCS1 *)data,
2618            ((const Py_UCS1 *)data) + len,
2619            result);
2620        return result;
2621    case PyUnicode_4BYTE_KIND:
2622        result = PyMem_New(Py_UCS4, len);
2623        if (!result)
2624            return PyErr_NoMemory();
2625        if (skind == PyUnicode_2BYTE_KIND) {
2626            _PyUnicode_CONVERT_BYTES(
2627                Py_UCS2, Py_UCS4,
2628                (const Py_UCS2 *)data,
2629                ((const Py_UCS2 *)data) + len,
2630                result);
2631        }
2632        else {
2633            assert(skind == PyUnicode_1BYTE_KIND);
2634            _PyUnicode_CONVERT_BYTES(
2635                Py_UCS1, Py_UCS4,
2636                (const Py_UCS1 *)data,
2637                ((const Py_UCS1 *)data) + len,
2638                result);
2639        }
2640        return result;
2641    default:
2642        Py_UNREACHABLE();
2643        return NULL;
2644    }
2645}
2646
2647static Py_UCS4*
2648as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649        int copy_null)
2650{
2651    int kind;
2652    const void *data;
2653    Py_ssize_t len, targetlen;
2654    if (PyUnicode_READY(string) == -1)
2655        return NULL;
2656    kind = PyUnicode_KIND(string);
2657    data = PyUnicode_DATA(string);
2658    len = PyUnicode_GET_LENGTH(string);
2659    targetlen = len;
2660    if (copy_null)
2661        targetlen++;
2662    if (!target) {
2663        target = PyMem_New(Py_UCS4, targetlen);
2664        if (!target) {
2665            PyErr_NoMemory();
2666            return NULL;
2667        }
2668    }
2669    else {
2670        if (targetsize < targetlen) {
2671            PyErr_Format(PyExc_SystemError,
2672                         "string is longer than the buffer");
2673            if (copy_null && 0 < targetsize)
2674                target[0] = 0;
2675            return NULL;
2676        }
2677    }
2678    if (kind == PyUnicode_1BYTE_KIND) {
2679        const Py_UCS1 *start = (const Py_UCS1 *) data;
2680        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2681    }
2682    else if (kind == PyUnicode_2BYTE_KIND) {
2683        const Py_UCS2 *start = (const Py_UCS2 *) data;
2684        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2685    }
2686    else if (kind == PyUnicode_4BYTE_KIND) {
2687        memcpy(target, data, len * sizeof(Py_UCS4));
2688    }
2689    else {
2690        Py_UNREACHABLE();
2691    }
2692    if (copy_null)
2693        target[len] = 0;
2694    return target;
2695}
2696
2697Py_UCS4*
2698PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2699                 int copy_null)
2700{
2701    if (target == NULL || targetsize < 0) {
2702        PyErr_BadInternalCall();
2703        return NULL;
2704    }
2705    return as_ucs4(string, target, targetsize, copy_null);
2706}
2707
2708Py_UCS4*
2709PyUnicode_AsUCS4Copy(PyObject *string)
2710{
2711    return as_ucs4(string, NULL, 0, 1);
2712}
2713
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).

   With integer division, 1 + (x - 1) / 22 equals ceil(x / 22) for a
   positive x, so the expression below is 1 (sign) plus
   ceil(SIZEOF_LONG_LONG * 53 / 22) (digits). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2718
2719static int
2720unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2721                             Py_ssize_t width, Py_ssize_t precision)
2722{
2723    Py_ssize_t length, fill, arglen;
2724    Py_UCS4 maxchar;
2725
2726    if (PyUnicode_READY(str) == -1)
2727        return -1;
2728
2729    length = PyUnicode_GET_LENGTH(str);
2730    if ((precision == -1 || precision >= length)
2731        && width <= length)
2732        return _PyUnicodeWriter_WriteStr(writer, str);
2733
2734    if (precision != -1)
2735        length = Py_MIN(precision, length);
2736
2737    arglen = Py_MAX(length, width);
2738    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2739        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2740    else
2741        maxchar = writer->maxchar;
2742
2743    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2744        return -1;
2745
2746    if (width > length) {
2747        fill = width - length;
2748        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2749            return -1;
2750        writer->pos += fill;
2751    }
2752
2753    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2754                                  str, 0, length);
2755    writer->pos += length;
2756    return 0;
2757}
2758
2759static int
2760unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2761                              Py_ssize_t width, Py_ssize_t precision)
2762{
2763    /* UTF-8 */
2764    Py_ssize_t length;
2765    PyObject *unicode;
2766    int res;
2767
2768    if (precision == -1) {
2769        length = strlen(str);
2770    }
2771    else {
2772        length = 0;
2773        while (length < precision && str[length]) {
2774            length++;
2775        }
2776    }
2777    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778    if (unicode == NULL)
2779        return -1;
2780
2781    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782    Py_DECREF(unicode);
2783    return res;
2784}
2785
2786static const char*
2787unicode_fromformat_arg(_PyUnicodeWriter *writer,
2788                       const char *f, va_list *vargs)
2789{
2790    const char *p;
2791    Py_ssize_t len;
2792    int zeropad;
2793    Py_ssize_t width;
2794    Py_ssize_t precision;
2795    int longflag;
2796    int longlongflag;
2797    int size_tflag;
2798    Py_ssize_t fill;
2799
2800    p = f;
2801    f++;
2802    zeropad = 0;
2803    if (*f == '0') {
2804        zeropad = 1;
2805        f++;
2806    }
2807
2808    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2809    width = -1;
2810    if (Py_ISDIGIT((unsigned)*f)) {
2811        width = *f - '0';
2812        f++;
2813        while (Py_ISDIGIT((unsigned)*f)) {
2814            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2815                PyErr_SetString(PyExc_ValueError,
2816                                "width too big");
2817                return NULL;
2818            }
2819            width = (width * 10) + (*f - '0');
2820            f++;
2821        }
2822    }
2823    precision = -1;
2824    if (*f == '.') {
2825        f++;
2826        if (Py_ISDIGIT((unsigned)*f)) {
2827            precision = (*f - '0');
2828            f++;
2829            while (Py_ISDIGIT((unsigned)*f)) {
2830                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2831                    PyErr_SetString(PyExc_ValueError,
2832                                    "precision too big");
2833                    return NULL;
2834                }
2835                precision = (precision * 10) + (*f - '0');
2836                f++;
2837            }
2838        }
2839        if (*f == '%') {
2840            /* "%.3%s" => f points to "3" */
2841            f--;
2842        }
2843    }
2844    if (*f == '\0') {
2845        /* bogus format "%.123" => go backward, f points to "3" */
2846        f--;
2847    }
2848
2849    /* Handle %ld, %lu, %lld and %llu. */
2850    longflag = 0;
2851    longlongflag = 0;
2852    size_tflag = 0;
2853    if (*f == 'l') {
2854        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2855            longflag = 1;
2856            ++f;
2857        }
2858        else if (f[1] == 'l' &&
2859                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2860            longlongflag = 1;
2861            f += 2;
2862        }
2863    }
2864    /* handle the size_t flag. */
2865    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2866        size_tflag = 1;
2867        ++f;
2868    }
2869
2870    if (f[1] == '\0')
2871        writer->overallocate = 0;
2872
2873    switch (*f) {
2874    case 'c':
2875    {
2876        int ordinal = va_arg(*vargs, int);
2877        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2878            PyErr_SetString(PyExc_OverflowError,
2879                            "character argument not in range(0x110000)");
2880            return NULL;
2881        }
2882        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2883            return NULL;
2884        break;
2885    }
2886
2887    case 'i':
2888    case 'd':
2889    case 'u':
2890    case 'x':
2891    {
2892        /* used by sprintf */
2893        char buffer[MAX_LONG_LONG_CHARS];
2894        Py_ssize_t arglen;
2895
2896        if (*f == 'u') {
2897            if (longflag) {
2898                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2899            }
2900            else if (longlongflag) {
2901                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2902            }
2903            else if (size_tflag) {
2904                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2905            }
2906            else {
2907                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2908            }
2909        }
2910        else if (*f == 'x') {
2911            len = sprintf(buffer, "%x", va_arg(*vargs, int));
2912        }
2913        else {
2914            if (longflag) {
2915                len = sprintf(buffer, "%li", va_arg(*vargs, long));
2916            }
2917            else if (longlongflag) {
2918                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2919            }
2920            else if (size_tflag) {
2921                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2922            }
2923            else {
2924                len = sprintf(buffer, "%i", va_arg(*vargs, int));
2925            }
2926        }
2927        assert(len >= 0);
2928
2929        if (precision < len)
2930            precision = len;
2931
2932        arglen = Py_MAX(precision, width);
2933        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2934            return NULL;
2935
2936        if (width > precision) {
2937            Py_UCS4 fillchar;
2938            fill = width - precision;
2939            fillchar = zeropad?'0':' ';
2940            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2941                return NULL;
2942            writer->pos += fill;
2943        }
2944        if (precision > len) {
2945            fill = precision - len;
2946            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2947                return NULL;
2948            writer->pos += fill;
2949        }
2950
2951        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2952            return NULL;
2953        break;
2954    }
2955
2956    case 'p':
2957    {
2958        char number[MAX_LONG_LONG_CHARS];
2959
2960        len = sprintf(number, "%p", va_arg(*vargs, void*));
2961        assert(len >= 0);
2962
2963        /* %p is ill-defined:  ensure leading 0x. */
2964        if (number[1] == 'X')
2965            number[1] = 'x';
2966        else if (number[1] != 'x') {
2967            memmove(number + 2, number,
2968                    strlen(number) + 1);
2969            number[0] = '0';
2970            number[1] = 'x';
2971            len += 2;
2972        }
2973
2974        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2975            return NULL;
2976        break;
2977    }
2978
2979    case 's':
2980    {
2981        /* UTF-8 */
2982        const char *s = va_arg(*vargs, const char*);
2983        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2984            return NULL;
2985        break;
2986    }
2987
2988    case 'U':
2989    {
2990        PyObject *obj = va_arg(*vargs, PyObject *);
2991        assert(obj && _PyUnicode_CHECK(obj));
2992
2993        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2994            return NULL;
2995        break;
2996    }
2997
2998    case 'V':
2999    {
3000        PyObject *obj = va_arg(*vargs, PyObject *);
3001        const char *str = va_arg(*vargs, const char *);
3002        if (obj) {
3003            assert(_PyUnicode_CHECK(obj));
3004            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3005                return NULL;
3006        }
3007        else {
3008            assert(str != NULL);
3009            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
3010                return NULL;
3011        }
3012        break;
3013    }
3014
3015    case 'S':
3016    {
3017        PyObject *obj = va_arg(*vargs, PyObject *);
3018        PyObject *str;
3019        assert(obj);
3020        str = PyObject_Str(obj);
3021        if (!str)
3022            return NULL;
3023        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
3024            Py_DECREF(str);
3025            return NULL;
3026        }
3027        Py_DECREF(str);
3028        break;
3029    }
3030
3031    case 'R':
3032    {
3033        PyObject *obj = va_arg(*vargs, PyObject *);
3034        PyObject *repr;
3035        assert(obj);
3036        repr = PyObject_Repr(obj);
3037        if (!repr)
3038            return NULL;
3039        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
3040            Py_DECREF(repr);
3041            return NULL;
3042        }
3043        Py_DECREF(repr);
3044        break;
3045    }
3046
3047    case 'A':
3048    {
3049        PyObject *obj = va_arg(*vargs, PyObject *);
3050        PyObject *ascii;
3051        assert(obj);
3052        ascii = PyObject_ASCII(obj);
3053        if (!ascii)
3054            return NULL;
3055        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3056            Py_DECREF(ascii);
3057            return NULL;
3058        }
3059        Py_DECREF(ascii);
3060        break;
3061    }
3062
3063    case '%':
3064        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3065            return NULL;
3066        break;
3067
3068    default:
3069        /* if we stumble upon an unknown formatting code, copy the rest
3070           of the format string to the output string. (we cannot just
3071           skip the code, since there's no way to know what's in the
3072           argument list) */
3073        len = strlen(p);
3074        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3075            return NULL;
3076        f = p+len;
3077        return f;
3078    }
3079
3080    f++;
3081    return f;
3082}
3083
3084PyObject *
3085PyUnicode_FromFormatV(const char *format, va_list vargs)
3086{
3087    va_list vargs2;
3088    const char *f;
3089    _PyUnicodeWriter writer;
3090
3091    _PyUnicodeWriter_Init(&writer);
3092    writer.min_length = strlen(format) + 100;
3093    writer.overallocate = 1;
3094
3095    // Copy varags to be able to pass a reference to a subfunction.
3096    va_copy(vargs2, vargs);
3097
3098    for (f = format; *f; ) {
3099        if (*f == '%') {
3100            f = unicode_fromformat_arg(&writer, f, &vargs2);
3101            if (f == NULL)
3102                goto fail;
3103        }
3104        else {
3105            const char *p;
3106            Py_ssize_t len;
3107
3108            p = f;
3109            do
3110            {
3111                if ((unsigned char)*p > 127) {
3112                    PyErr_Format(PyExc_ValueError,
3113                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3114                        "string, got a non-ASCII byte: 0x%02x",
3115                        (unsigned char)*p);
3116                    goto fail;
3117                }
3118                p++;
3119            }
3120            while (*p != '\0' && *p != '%');
3121            len = p - f;
3122
3123            if (*p == '\0')
3124                writer.overallocate = 0;
3125
3126            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3127                goto fail;
3128
3129            f = p;
3130        }
3131    }
3132    va_end(vargs2);
3133    return _PyUnicodeWriter_Finish(&writer);
3134
3135  fail:
3136    va_end(vargs2);
3137    _PyUnicodeWriter_Dealloc(&writer);
3138    return NULL;
3139}
3140
3141PyObject *
3142PyUnicode_FromFormat(const char *format, ...)
3143{
3144    PyObject* ret;
3145    va_list vargs;
3146
3147#ifdef HAVE_STDARG_PROTOTYPES
3148    va_start(vargs, format);
3149#else
3150    va_start(vargs);
3151#endif
3152    ret = PyUnicode_FromFormatV(format, vargs);
3153    va_end(vargs);
3154    return ret;
3155}
3156
3157static Py_ssize_t
3158unicode_get_widechar_size(PyObject *unicode)
3159{
3160    Py_ssize_t res;
3161
3162    assert(unicode != NULL);
3163    assert(_PyUnicode_CHECK(unicode));
3164
3165#if USE_UNICODE_WCHAR_CACHE
3166    if (_PyUnicode_WSTR(unicode) != NULL) {
3167        return PyUnicode_WSTR_LENGTH(unicode);
3168    }
3169#endif /* USE_UNICODE_WCHAR_CACHE */
3170    assert(PyUnicode_IS_READY(unicode));
3171
3172    res = _PyUnicode_LENGTH(unicode);
3173#if SIZEOF_WCHAR_T == 2
3174    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3176        const Py_UCS4 *end = s + res;
3177        for (; s < end; ++s) {
3178            if (*s > 0xFFFF) {
3179                ++res;
3180            }
3181        }
3182    }
3183#endif
3184    return res;
3185}
3186
3187static void
3188unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3189{
3190    assert(unicode != NULL);
3191    assert(_PyUnicode_CHECK(unicode));
3192
3193#if USE_UNICODE_WCHAR_CACHE
3194    const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3195    if (wstr != NULL) {
3196        memcpy(w, wstr, size * sizeof(wchar_t));
3197        return;
3198    }
3199#else /* USE_UNICODE_WCHAR_CACHE */
3200    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3202        return;
3203    }
3204#endif /* USE_UNICODE_WCHAR_CACHE */
3205    assert(PyUnicode_IS_READY(unicode));
3206
3207    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3209        for (; size--; ++s, ++w) {
3210            *w = *s;
3211        }
3212    }
3213    else {
3214#if SIZEOF_WCHAR_T == 4
3215        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3217        for (; size--; ++s, ++w) {
3218            *w = *s;
3219        }
3220#else
3221        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3223        for (; size--; ++s, ++w) {
3224            Py_UCS4 ch = *s;
3225            if (ch > 0xFFFF) {
3226                assert(ch <= MAX_UNICODE);
3227                /* encode surrogate pair in this case */
3228                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3229                if (!size--)
3230                    break;
3231                *w = Py_UNICODE_LOW_SURROGATE(ch);
3232            }
3233            else {
3234                *w = ch;
3235            }
3236        }
3237#endif
3238    }
3239}
3240
3241#ifdef HAVE_WCHAR_H
3242
3243/* Convert a Unicode object to a wide character string.
3244
3245   - If w is NULL: return the number of wide characters (including the null
3246     character) required to convert the unicode object. Ignore size argument.
3247
3248   - Otherwise: return the number of wide characters (excluding the null
3249     character) written into w. Write at most size wide characters (including
3250     the null character). */
3251Py_ssize_t
3252PyUnicode_AsWideChar(PyObject *unicode,
3253                     wchar_t *w,
3254                     Py_ssize_t size)
3255{
3256    Py_ssize_t res;
3257
3258    if (unicode == NULL) {
3259        PyErr_BadInternalCall();
3260        return -1;
3261    }
3262    if (!PyUnicode_Check(unicode)) {
3263        PyErr_BadArgument();
3264        return -1;
3265    }
3266
3267    res = unicode_get_widechar_size(unicode);
3268    if (w == NULL) {
3269        return res + 1;
3270    }
3271
3272    if (size > res) {
3273        size = res + 1;
3274    }
3275    else {
3276        res = size;
3277    }
3278    unicode_copy_as_widechar(unicode, w, size);
3279
3280#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3281    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3282       non-Unicode locales and hence needs conversion first. */
3283    if (_Py_LocaleUsesNonUnicodeWchar()) {
3284        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3285            return -1;
3286        }
3287    }
3288#endif
3289
3290    return res;
3291}
3292
3293wchar_t*
3294PyUnicode_AsWideCharString(PyObject *unicode,
3295                           Py_ssize_t *size)
3296{
3297    wchar_t *buffer;
3298    Py_ssize_t buflen;
3299
3300    if (unicode == NULL) {
3301        PyErr_BadInternalCall();
3302        return NULL;
3303    }
3304    if (!PyUnicode_Check(unicode)) {
3305        PyErr_BadArgument();
3306        return NULL;
3307    }
3308
3309    buflen = unicode_get_widechar_size(unicode);
3310    buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3311    if (buffer == NULL) {
3312        PyErr_NoMemory();
3313        return NULL;
3314    }
3315    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3316
3317#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319       non-Unicode locales and hence needs conversion first. */
3320    if (_Py_LocaleUsesNonUnicodeWchar()) {
3321        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3322            return NULL;
3323        }
3324    }
3325#endif
3326
3327    if (size != NULL) {
3328        *size = buflen;
3329    }
3330    else if (wcslen(buffer) != (size_t)buflen) {
3331        PyMem_Free(buffer);
3332        PyErr_SetString(PyExc_ValueError,
3333                        "embedded null character");
3334        return NULL;
3335    }
3336    return buffer;
3337}
3338
3339#endif /* HAVE_WCHAR_H */
3340
3341int
3342_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3343{
3344    wchar_t **p = (wchar_t **)ptr;
3345    if (obj == NULL) {
3346#if !USE_UNICODE_WCHAR_CACHE
3347        PyMem_Free(*p);
3348#endif /* USE_UNICODE_WCHAR_CACHE */
3349        *p = NULL;
3350        return 1;
3351    }
3352    if (PyUnicode_Check(obj)) {
3353#if USE_UNICODE_WCHAR_CACHE
3354        *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355        if (*p == NULL) {
3356            return 0;
3357        }
3358        return 1;
3359#else /* USE_UNICODE_WCHAR_CACHE */
3360        *p = PyUnicode_AsWideCharString(obj, NULL);
3361        if (*p == NULL) {
3362            return 0;
3363        }
3364        return Py_CLEANUP_SUPPORTED;
3365#endif /* USE_UNICODE_WCHAR_CACHE */
3366    }
3367    PyErr_Format(PyExc_TypeError,
3368                 "argument must be str, not %.50s",
3369                 Py_TYPE(obj)->tp_name);
3370    return 0;
3371}
3372
3373int
3374_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3375{
3376    wchar_t **p = (wchar_t **)ptr;
3377    if (obj == NULL) {
3378#if !USE_UNICODE_WCHAR_CACHE
3379        PyMem_Free(*p);
3380#endif /* USE_UNICODE_WCHAR_CACHE */
3381        *p = NULL;
3382        return 1;
3383    }
3384    if (obj == Py_None) {
3385        *p = NULL;
3386        return 1;
3387    }
3388    if (PyUnicode_Check(obj)) {
3389#if USE_UNICODE_WCHAR_CACHE
3390        *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3391        if (*p == NULL) {
3392            return 0;
3393        }
3394        return 1;
3395#else /* USE_UNICODE_WCHAR_CACHE */
3396        *p = PyUnicode_AsWideCharString(obj, NULL);
3397        if (*p == NULL) {
3398            return 0;
3399        }
3400        return Py_CLEANUP_SUPPORTED;
3401#endif /* USE_UNICODE_WCHAR_CACHE */
3402    }
3403    PyErr_Format(PyExc_TypeError,
3404                 "argument must be str or None, not %.50s",
3405                 Py_TYPE(obj)->tp_name);
3406    return 0;
3407}
3408
3409PyObject *
3410PyUnicode_FromOrdinal(int ordinal)
3411{
3412    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3413        PyErr_SetString(PyExc_ValueError,
3414                        "chr() arg not in range(0x110000)");
3415        return NULL;
3416    }
3417
3418    return unicode_char((Py_UCS4)ordinal);
3419}
3420
3421PyObject *
3422PyUnicode_FromObject(PyObject *obj)
3423{
3424    /* XXX Perhaps we should make this API an alias of
3425       PyObject_Str() instead ?! */
3426    if (PyUnicode_CheckExact(obj)) {
3427        if (PyUnicode_READY(obj) == -1)
3428            return NULL;
3429        Py_INCREF(obj);
3430        return obj;
3431    }
3432    if (PyUnicode_Check(obj)) {
3433        /* For a Unicode subtype that's not a Unicode object,
3434           return a true Unicode object with the same data. */
3435        return _PyUnicode_Copy(obj);
3436    }
3437    PyErr_Format(PyExc_TypeError,
3438                 "Can't convert '%.100s' object to str implicitly",
3439                 Py_TYPE(obj)->tp_name);
3440    return NULL;
3441}
3442
3443PyObject *
3444PyUnicode_FromEncodedObject(PyObject *obj,
3445                            const char *encoding,
3446                            const char *errors)
3447{
3448    Py_buffer buffer;
3449    PyObject *v;
3450
3451    if (obj == NULL) {
3452        PyErr_BadInternalCall();
3453        return NULL;
3454    }
3455
3456    /* Decoding bytes objects is the most common case and should be fast */
3457    if (PyBytes_Check(obj)) {
3458        if (PyBytes_GET_SIZE(obj) == 0) {
3459            if (unicode_check_encoding_errors(encoding, errors) < 0) {
3460                return NULL;
3461            }
3462            _Py_RETURN_UNICODE_EMPTY();
3463        }
3464        return PyUnicode_Decode(
3465                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3466                encoding, errors);
3467    }
3468
3469    if (PyUnicode_Check(obj)) {
3470        PyErr_SetString(PyExc_TypeError,
3471                        "decoding str is not supported");
3472        return NULL;
3473    }
3474
3475    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3476    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3477        PyErr_Format(PyExc_TypeError,
3478                     "decoding to str: need a bytes-like object, %.80s found",
3479                     Py_TYPE(obj)->tp_name);
3480        return NULL;
3481    }
3482
3483    if (buffer.len == 0) {
3484        PyBuffer_Release(&buffer);
3485        if (unicode_check_encoding_errors(encoding, errors) < 0) {
3486            return NULL;
3487        }
3488        _Py_RETURN_UNICODE_EMPTY();
3489    }
3490
3491    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3492    PyBuffer_Release(&buffer);
3493    return v;
3494}
3495
/* Normalize an encoding name into the buffer `lower` of `lower_len` bytes:
   similar to encodings.normalize_encoding(), but the result is also
   converted to lowercase.

   Alphanumeric characters and '.' are kept (lowercased); every run of other
   characters between two kept characters becomes a single '_', and such runs
   at the start or the end of the name are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember that a separator is pending. */
            pending_sep = 1;
            continue;
        }

        /* Emit one '_' for the pending separator, except at the start. */
        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3544
3545PyObject *
3546PyUnicode_Decode(const char *s,
3547                 Py_ssize_t size,
3548                 const char *encoding,
3549                 const char *errors)
3550{
3551    PyObject *buffer = NULL, *unicode;
3552    Py_buffer info;
3553    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3554
3555    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3556        return NULL;
3557    }
3558
3559    if (size == 0) {
3560        _Py_RETURN_UNICODE_EMPTY();
3561    }
3562
3563    if (encoding == NULL) {
3564        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3565    }
3566
3567    /* Shortcuts for common default encodings */
3568    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569        char *lower = buflower;
3570
3571        /* Fast paths */
3572        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573            lower += 3;
3574            if (*lower == '_') {
3575                /* Match "utf8" and "utf_8" */
3576                lower++;
3577            }
3578
3579            if (lower[0] == '8' && lower[1] == 0) {
3580                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3581            }
3582            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3584            }
3585            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3587            }
3588        }
3589        else {
3590            if (strcmp(lower, "ascii") == 0
3591                || strcmp(lower, "us_ascii") == 0) {
3592                return PyUnicode_DecodeASCII(s, size, errors);
3593            }
3594    #ifdef MS_WINDOWS
3595            else if (strcmp(lower, "mbcs") == 0) {
3596                return PyUnicode_DecodeMBCS(s, size, errors);
3597            }
3598    #endif
3599            else if (strcmp(lower, "latin1") == 0
3600                     || strcmp(lower, "latin_1") == 0
3601                     || strcmp(lower, "iso_8859_1") == 0
3602                     || strcmp(lower, "iso8859_1") == 0) {
3603                return PyUnicode_DecodeLatin1(s, size, errors);
3604            }
3605        }
3606    }
3607
3608    /* Decode via the codec registry */
3609    buffer = NULL;
3610    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3611        goto onError;
3612    buffer = PyMemoryView_FromBuffer(&info);
3613    if (buffer == NULL)
3614        goto onError;
3615    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616    if (unicode == NULL)
3617        goto onError;
3618    if (!PyUnicode_Check(unicode)) {
3619        PyErr_Format(PyExc_TypeError,
3620                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3621                     "use codecs.decode() to decode to arbitrary types",
3622                     encoding,
3623                     Py_TYPE(unicode)->tp_name);
3624        Py_DECREF(unicode);
3625        goto onError;
3626    }
3627    Py_DECREF(buffer);
3628    return unicode_result(unicode);
3629
3630  onError:
3631    Py_XDECREF(buffer);
3632    return NULL;
3633}
3634
3635PyObject *
3636PyUnicode_AsDecodedObject(PyObject *unicode,
3637                          const char *encoding,
3638                          const char *errors)
3639{
3640    if (!PyUnicode_Check(unicode)) {
3641        PyErr_BadArgument();
3642        return NULL;
3643    }
3644
3645    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3646                     "PyUnicode_AsDecodedObject() is deprecated; "
3647                     "use PyCodec_Decode() to decode from str", 1) < 0)
3648        return NULL;
3649
3650    if (encoding == NULL)
3651        encoding = PyUnicode_GetDefaultEncoding();
3652
3653    /* Decode via the codec registry */
3654    return PyCodec_Decode(unicode, encoding, errors);
3655}
3656
3657PyObject *
3658PyUnicode_AsDecodedUnicode(PyObject *unicode,
3659                           const char *encoding,
3660                           const char *errors)
3661{
3662    PyObject *v;
3663
3664    if (!PyUnicode_Check(unicode)) {
3665        PyErr_BadArgument();
3666        goto onError;
3667    }
3668
3669    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3670                     "PyUnicode_AsDecodedUnicode() is deprecated; "
3671                     "use PyCodec_Decode() to decode from str to str", 1) < 0)
3672        return NULL;
3673
3674    if (encoding == NULL)
3675        encoding = PyUnicode_GetDefaultEncoding();
3676
3677    /* Decode via the codec registry */
3678    v = PyCodec_Decode(unicode, encoding, errors);
3679    if (v == NULL)
3680        goto onError;
3681    if (!PyUnicode_Check(v)) {
3682        PyErr_Format(PyExc_TypeError,
3683                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3684                     "use codecs.decode() to decode to arbitrary types",
3685                     encoding,
3686                     Py_TYPE(unicode)->tp_name);
3687        Py_DECREF(v);
3688        goto onError;
3689    }
3690    return unicode_result(v);
3691
3692  onError:
3693    return NULL;
3694}
3695
3696PyObject *
3697PyUnicode_AsEncodedObject(PyObject *unicode,
3698                          const char *encoding,
3699                          const char *errors)
3700{
3701    PyObject *v;
3702
3703    if (!PyUnicode_Check(unicode)) {
3704        PyErr_BadArgument();
3705        goto onError;
3706    }
3707
3708    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3709                     "PyUnicode_AsEncodedObject() is deprecated; "
3710                     "use PyUnicode_AsEncodedString() to encode from str to bytes "
3711                     "or PyCodec_Encode() for generic encoding", 1) < 0)
3712        return NULL;
3713
3714    if (encoding == NULL)
3715        encoding = PyUnicode_GetDefaultEncoding();
3716
3717    /* Encode via the codec registry */
3718    v = PyCodec_Encode(unicode, encoding, errors);
3719    if (v == NULL)
3720        goto onError;
3721    return v;
3722
3723  onError:
3724    return NULL;
3725}
3726
3727
3728static PyObject *
3729unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3730                      int current_locale)
3731{
3732    Py_ssize_t wlen;
3733    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3734    if (wstr == NULL) {
3735        return NULL;
3736    }
3737
3738    if ((size_t)wlen != wcslen(wstr)) {
3739        PyErr_SetString(PyExc_ValueError, "embedded null character");
3740        PyMem_Free(wstr);
3741        return NULL;
3742    }
3743
3744    char *str;
3745    size_t error_pos;
3746    const char *reason;
3747    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3748                                 current_locale, error_handler);
3749    PyMem_Free(wstr);
3750
3751    if (res != 0) {
3752        if (res == -2) {
3753            PyObject *exc;
3754            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3755                    "locale", unicode,
3756                    (Py_ssize_t)error_pos,
3757                    (Py_ssize_t)(error_pos+1),
3758                    reason);
3759            if (exc != NULL) {
3760                PyCodec_StrictErrors(exc);
3761                Py_DECREF(exc);
3762            }
3763        }
3764        else if (res == -3) {
3765            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3766        }
3767        else {
3768            PyErr_NoMemory();
3769        }
3770        return NULL;
3771    }
3772
3773    PyObject *bytes = PyBytes_FromString(str);
3774    PyMem_RawFree(str);
3775    return bytes;
3776}
3777
3778PyObject *
3779PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3780{
3781    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3782    return unicode_encode_locale(unicode, error_handler, 1);
3783}
3784
3785PyObject *
3786PyUnicode_EncodeFSDefault(PyObject *unicode)
3787{
3788    PyInterpreterState *interp = _PyInterpreterState_GET();
3789    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3790    if (fs_codec->utf8) {
3791        return unicode_encode_utf8(unicode,
3792                                   fs_codec->error_handler,
3793                                   fs_codec->errors);
3794    }
3795#ifndef _Py_FORCE_UTF8_FS_ENCODING
3796    else if (fs_codec->encoding) {
3797        return PyUnicode_AsEncodedString(unicode,
3798                                         fs_codec->encoding,
3799                                         fs_codec->errors);
3800    }
3801#endif
3802    else {
3803        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3804           machinery is not ready and so cannot be used:
3805           use wcstombs() in this case. */
3806        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3807        const wchar_t *filesystem_errors = config->filesystem_errors;
3808        assert(filesystem_errors != NULL);
3809        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3810        assert(errors != _Py_ERROR_UNKNOWN);
3811#ifdef _Py_FORCE_UTF8_FS_ENCODING
3812        return unicode_encode_utf8(unicode, errors, NULL);
3813#else
3814        return unicode_encode_locale(unicode, errors, 0);
3815#endif
3816    }
3817}
3818
3819PyObject *
3820PyUnicode_AsEncodedString(PyObject *unicode,
3821                          const char *encoding,
3822                          const char *errors)
3823{
3824    PyObject *v;
3825    char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3826
3827    if (!PyUnicode_Check(unicode)) {
3828        PyErr_BadArgument();
3829        return NULL;
3830    }
3831
3832    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3833        return NULL;
3834    }
3835
3836    if (encoding == NULL) {
3837        return _PyUnicode_AsUTF8String(unicode, errors);
3838    }
3839
3840    /* Shortcuts for common default encodings */
3841    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3842        char *lower = buflower;
3843
3844        /* Fast paths */
3845        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3846            lower += 3;
3847            if (*lower == '_') {
3848                /* Match "utf8" and "utf_8" */
3849                lower++;
3850            }
3851
3852            if (lower[0] == '8' && lower[1] == 0) {
3853                return _PyUnicode_AsUTF8String(unicode, errors);
3854            }
3855            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3856                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3857            }
3858            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3859                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3860            }
3861        }
3862        else {
3863            if (strcmp(lower, "ascii") == 0
3864                || strcmp(lower, "us_ascii") == 0) {
3865                return _PyUnicode_AsASCIIString(unicode, errors);
3866            }
3867#ifdef MS_WINDOWS
3868            else if (strcmp(lower, "mbcs") == 0) {
3869                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3870            }
3871#endif
3872            else if (strcmp(lower, "latin1") == 0 ||
3873                     strcmp(lower, "latin_1") == 0 ||
3874                     strcmp(lower, "iso_8859_1") == 0 ||
3875                     strcmp(lower, "iso8859_1") == 0) {
3876                return _PyUnicode_AsLatin1String(unicode, errors);
3877            }
3878        }
3879    }
3880
3881    /* Encode via the codec registry */
3882    v = _PyCodec_EncodeText(unicode, encoding, errors);
3883    if (v == NULL)
3884        return NULL;
3885
3886    /* The normal path */
3887    if (PyBytes_Check(v))
3888        return v;
3889
3890    /* If the codec returns a buffer, raise a warning and convert to bytes */
3891    if (PyByteArray_Check(v)) {
3892        int error;
3893        PyObject *b;
3894
3895        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3896            "encoder %s returned bytearray instead of bytes; "
3897            "use codecs.encode() to encode to arbitrary types",
3898            encoding);
3899        if (error) {
3900            Py_DECREF(v);
3901            return NULL;
3902        }
3903
3904        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3905                                      PyByteArray_GET_SIZE(v));
3906        Py_DECREF(v);
3907        return b;
3908    }
3909
3910    PyErr_Format(PyExc_TypeError,
3911                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3912                 "use codecs.encode() to encode to arbitrary types",
3913                 encoding,
3914                 Py_TYPE(v)->tp_name);
3915    Py_DECREF(v);
3916    return NULL;
3917}
3918
3919PyObject *
3920PyUnicode_AsEncodedUnicode(PyObject *unicode,
3921                           const char *encoding,
3922                           const char *errors)
3923{
3924    PyObject *v;
3925
3926    if (!PyUnicode_Check(unicode)) {
3927        PyErr_BadArgument();
3928        goto onError;
3929    }
3930
3931    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3932                     "PyUnicode_AsEncodedUnicode() is deprecated; "
3933                     "use PyCodec_Encode() to encode from str to str", 1) < 0)
3934        return NULL;
3935
3936    if (encoding == NULL)
3937        encoding = PyUnicode_GetDefaultEncoding();
3938
3939    /* Encode via the codec registry */
3940    v = PyCodec_Encode(unicode, encoding, errors);
3941    if (v == NULL)
3942        goto onError;
3943    if (!PyUnicode_Check(v)) {
3944        PyErr_Format(PyExc_TypeError,
3945                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3946                     "use codecs.encode() to encode to arbitrary types",
3947                     encoding,
3948                     Py_TYPE(v)->tp_name);
3949        Py_DECREF(v);
3950        goto onError;
3951    }
3952    return v;
3953
3954  onError:
3955    return NULL;
3956}
3957
3958static PyObject*
3959unicode_decode_locale(const char *str, Py_ssize_t len,
3960                      _Py_error_handler errors, int current_locale)
3961{
3962    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3963        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3964        return NULL;
3965    }
3966
3967    wchar_t *wstr;
3968    size_t wlen;
3969    const char *reason;
3970    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3971                                 current_locale, errors);
3972    if (res != 0) {
3973        if (res == -2) {
3974            PyObject *exc;
3975            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3976                                        "locale", str, len,
3977                                        (Py_ssize_t)wlen,
3978                                        (Py_ssize_t)(wlen + 1),
3979                                        reason);
3980            if (exc != NULL) {
3981                PyCodec_StrictErrors(exc);
3982                Py_DECREF(exc);
3983            }
3984        }
3985        else if (res == -3) {
3986            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3987        }
3988        else {
3989            PyErr_NoMemory();
3990        }
3991        return NULL;
3992    }
3993
3994    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3995    PyMem_RawFree(wstr);
3996    return unicode;
3997}
3998
3999PyObject*
4000PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4001                              const char *errors)
4002{
4003    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4004    return unicode_decode_locale(str, len, error_handler, 1);
4005}
4006
4007PyObject*
4008PyUnicode_DecodeLocale(const char *str, const char *errors)
4009{
4010    Py_ssize_t size = (Py_ssize_t)strlen(str);
4011    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4012    return unicode_decode_locale(str, size, error_handler, 1);
4013}
4014
4015
4016PyObject*
4017PyUnicode_DecodeFSDefault(const char *s) {
4018    Py_ssize_t size = (Py_ssize_t)strlen(s);
4019    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4020}
4021
4022PyObject*
4023PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4024{
4025    PyInterpreterState *interp = _PyInterpreterState_GET();
4026    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4027    if (fs_codec->utf8) {
4028        return unicode_decode_utf8(s, size,
4029                                   fs_codec->error_handler,
4030                                   fs_codec->errors,
4031                                   NULL);
4032    }
4033#ifndef _Py_FORCE_UTF8_FS_ENCODING
4034    else if (fs_codec->encoding) {
4035        return PyUnicode_Decode(s, size,
4036                                fs_codec->encoding,
4037                                fs_codec->errors);
4038    }
4039#endif
4040    else {
4041        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4042           machinery is not ready and so cannot be used:
4043           use mbstowcs() in this case. */
4044        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4045        const wchar_t *filesystem_errors = config->filesystem_errors;
4046        assert(filesystem_errors != NULL);
4047        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4048        assert(errors != _Py_ERROR_UNKNOWN);
4049#ifdef _Py_FORCE_UTF8_FS_ENCODING
4050        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4051#else
4052        return unicode_decode_locale(s, size, errors, 0);
4053#endif
4054    }
4055}
4056
4057
4058int
4059PyUnicode_FSConverter(PyObject* arg, void* addr)
4060{
4061    PyObject *path = NULL;
4062    PyObject *output = NULL;
4063    Py_ssize_t size;
4064    const char *data;
4065    if (arg == NULL) {
4066        Py_DECREF(*(PyObject**)addr);
4067        *(PyObject**)addr = NULL;
4068        return 1;
4069    }
4070    path = PyOS_FSPath(arg);
4071    if (path == NULL) {
4072        return 0;
4073    }
4074    if (PyBytes_Check(path)) {
4075        output = path;
4076    }
4077    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4078        output = PyUnicode_EncodeFSDefault(path);
4079        Py_DECREF(path);
4080        if (!output) {
4081            return 0;
4082        }
4083        assert(PyBytes_Check(output));
4084    }
4085
4086    size = PyBytes_GET_SIZE(output);
4087    data = PyBytes_AS_STRING(output);
4088    if ((size_t)size != strlen(data)) {
4089        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4090        Py_DECREF(output);
4091        return 0;
4092    }
4093    *(PyObject**)addr = output;
4094    return Py_CLEANUP_SUPPORTED;
4095}
4096
4097
4098int
4099PyUnicode_FSDecoder(PyObject* arg, void* addr)
4100{
4101    int is_buffer = 0;
4102    PyObject *path = NULL;
4103    PyObject *output = NULL;
4104    if (arg == NULL) {
4105        Py_DECREF(*(PyObject**)addr);
4106        *(PyObject**)addr = NULL;
4107        return 1;
4108    }
4109
4110    is_buffer = PyObject_CheckBuffer(arg);
4111    if (!is_buffer) {
4112        path = PyOS_FSPath(arg);
4113        if (path == NULL) {
4114            return 0;
4115        }
4116    }
4117    else {
4118        path = arg;
4119        Py_INCREF(arg);
4120    }
4121
4122    if (PyUnicode_Check(path)) {
4123        output = path;
4124    }
4125    else if (PyBytes_Check(path) || is_buffer) {
4126        PyObject *path_bytes = NULL;
4127
4128        if (!PyBytes_Check(path) &&
4129            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4130            "path should be string, bytes, or os.PathLike, not %.200s",
4131            Py_TYPE(arg)->tp_name)) {
4132                Py_DECREF(path);
4133            return 0;
4134        }
4135        path_bytes = PyBytes_FromObject(path);
4136        Py_DECREF(path);
4137        if (!path_bytes) {
4138            return 0;
4139        }
4140        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4141                                                  PyBytes_GET_SIZE(path_bytes));
4142        Py_DECREF(path_bytes);
4143        if (!output) {
4144            return 0;
4145        }
4146    }
4147    else {
4148        PyErr_Format(PyExc_TypeError,
4149                     "path should be string, bytes, or os.PathLike, not %.200s",
4150                     Py_TYPE(arg)->tp_name);
4151        Py_DECREF(path);
4152        return 0;
4153    }
4154    if (PyUnicode_READY(output) == -1) {
4155        Py_DECREF(output);
4156        return 0;
4157    }
4158    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160        PyErr_SetString(PyExc_ValueError, "embedded null character");
4161        Py_DECREF(output);
4162        return 0;
4163    }
4164    *(PyObject**)addr = output;
4165    return Py_CLEANUP_SUPPORTED;
4166}
4167
4168
4169static int unicode_fill_utf8(PyObject *unicode);
4170
4171const char *
4172PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4173{
4174    if (!PyUnicode_Check(unicode)) {
4175        PyErr_BadArgument();
4176        return NULL;
4177    }
4178    if (PyUnicode_READY(unicode) == -1)
4179        return NULL;
4180
4181    if (PyUnicode_UTF8(unicode) == NULL) {
4182        if (unicode_fill_utf8(unicode) == -1) {
4183            return NULL;
4184        }
4185    }
4186
4187    if (psize)
4188        *psize = PyUnicode_UTF8_LENGTH(unicode);
4189    return PyUnicode_UTF8(unicode);
4190}
4191
4192const char *
4193PyUnicode_AsUTF8(PyObject *unicode)
4194{
4195    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4196}
4197
4198Py_UNICODE *
4199PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4200{
4201    if (!PyUnicode_Check(unicode)) {
4202        PyErr_BadArgument();
4203        return NULL;
4204    }
4205    Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4206    if (w == NULL) {
4207        /* Non-ASCII compact unicode object */
4208        assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209        assert(PyUnicode_IS_READY(unicode));
4210
4211        Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4212        if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4213            PyErr_NoMemory();
4214            return NULL;
4215        }
4216        w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4217        if (w == NULL) {
4218            PyErr_NoMemory();
4219            return NULL;
4220        }
4221        unicode_copy_as_widechar(unicode, w, wlen + 1);
4222        _PyUnicode_WSTR(unicode) = w;
4223        if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224            _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4225        }
4226    }
4227    if (size != NULL)
4228        *size = PyUnicode_WSTR_LENGTH(unicode);
4229    return w;
4230}
4231
4232/* Deprecated APIs */
4233
4234_Py_COMP_DIAG_PUSH
4235_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4236
4237Py_UNICODE *
4238PyUnicode_AsUnicode(PyObject *unicode)
4239{
4240    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4241}
4242
4243const Py_UNICODE *
4244_PyUnicode_AsUnicode(PyObject *unicode)
4245{
4246    Py_ssize_t size;
4247    const Py_UNICODE *wstr;
4248
4249    wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4250    if (wstr && wcslen(wstr) != (size_t)size) {
4251        PyErr_SetString(PyExc_ValueError, "embedded null character");
4252        return NULL;
4253    }
4254    return wstr;
4255}
4256
4257
4258Py_ssize_t
4259PyUnicode_GetSize(PyObject *unicode)
4260{
4261    if (!PyUnicode_Check(unicode)) {
4262        PyErr_BadArgument();
4263        goto onError;
4264    }
4265    if (_PyUnicode_WSTR(unicode) == NULL) {
4266        if (PyUnicode_AsUnicode(unicode) == NULL)
4267            goto onError;
4268    }
4269    return PyUnicode_WSTR_LENGTH(unicode);
4270
4271  onError:
4272    return -1;
4273}
4274
4275_Py_COMP_DIAG_POP
4276
4277Py_ssize_t
4278PyUnicode_GetLength(PyObject *unicode)
4279{
4280    if (!PyUnicode_Check(unicode)) {
4281        PyErr_BadArgument();
4282        return -1;
4283    }
4284    if (PyUnicode_READY(unicode) == -1)
4285        return -1;
4286    return PyUnicode_GET_LENGTH(unicode);
4287}
4288
4289Py_UCS4
4290PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4291{
4292    const void *data;
4293    int kind;
4294
4295    if (!PyUnicode_Check(unicode)) {
4296        PyErr_BadArgument();
4297        return (Py_UCS4)-1;
4298    }
4299    if (PyUnicode_READY(unicode) == -1) {
4300        return (Py_UCS4)-1;
4301    }
4302    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303        PyErr_SetString(PyExc_IndexError, "string index out of range");
4304        return (Py_UCS4)-1;
4305    }
4306    data = PyUnicode_DATA(unicode);
4307    kind = PyUnicode_KIND(unicode);
4308    return PyUnicode_READ(kind, data, index);
4309}
4310
4311int
4312PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313{
4314    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315        PyErr_BadArgument();
4316        return -1;
4317    }
4318    assert(PyUnicode_IS_READY(unicode));
4319    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4320        PyErr_SetString(PyExc_IndexError, "string index out of range");
4321        return -1;
4322    }
4323    if (unicode_check_modifiable(unicode))
4324        return -1;
4325    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4326        PyErr_SetString(PyExc_ValueError, "character out of range");
4327        return -1;
4328    }
4329    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4330                    index, ch);
4331    return 0;
4332}
4333
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4339
4340/* create or adjust a UnicodeDecodeError */
4341static void
4342make_decode_exception(PyObject **exceptionObject,
4343                      const char *encoding,
4344                      const char *input, Py_ssize_t length,
4345                      Py_ssize_t startpos, Py_ssize_t endpos,
4346                      const char *reason)
4347{
4348    if (*exceptionObject == NULL) {
4349        *exceptionObject = PyUnicodeDecodeError_Create(
4350            encoding, input, length, startpos, endpos, reason);
4351    }
4352    else {
4353        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4354            goto onError;
4355        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4356            goto onError;
4357        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4358            goto onError;
4359    }
4360    return;
4361
4362onError:
4363    Py_CLEAR(*exceptionObject);
4364}
4365
4366#ifdef MS_WINDOWS
4367static int
4368widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4369{
4370    if (newsize > *size) {
4371        wchar_t *newbuf = *buf;
4372        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4373            PyErr_NoMemory();
4374            return -1;
4375        }
4376        *buf = newbuf;
4377    }
4378    *size = newsize;
4379    return 0;
4380}
4381
4382/* error handling callback helper:
4383   build arguments, call the callback and check the arguments,
4384   if no exception occurred, copy the replacement to the output
4385   and adjust various state variables.
4386   return 0 on success, -1 on error
4387*/
4388
4389static int
4390unicode_decode_call_errorhandler_wchar(
4391    const char *errors, PyObject **errorHandler,
4392    const char *encoding, const char *reason,
4393    const char **input, const char **inend, Py_ssize_t *startinpos,
4394    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4395    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4396{
4397    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4398
4399    PyObject *restuple = NULL;
4400    PyObject *repunicode = NULL;
4401    Py_ssize_t outsize;
4402    Py_ssize_t insize;
4403    Py_ssize_t requiredsize;
4404    Py_ssize_t newpos;
4405    PyObject *inputobj = NULL;
4406    Py_ssize_t repwlen;
4407
4408    if (*errorHandler == NULL) {
4409        *errorHandler = PyCodec_LookupError(errors);
4410        if (*errorHandler == NULL)
4411            goto onError;
4412    }
4413
4414    make_decode_exception(exceptionObject,
4415        encoding,
4416        *input, *inend - *input,
4417        *startinpos, *endinpos,
4418        reason);
4419    if (*exceptionObject == NULL)
4420        goto onError;
4421
4422    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4423    if (restuple == NULL)
4424        goto onError;
4425    if (!PyTuple_Check(restuple)) {
4426        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4427        goto onError;
4428    }
4429    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4430        goto onError;
4431
4432    /* Copy back the bytes variables, which might have been modified by the
4433       callback */
4434    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4435    if (!inputobj)
4436        goto onError;
4437    *input = PyBytes_AS_STRING(inputobj);
4438    insize = PyBytes_GET_SIZE(inputobj);
4439    *inend = *input + insize;
4440    /* we can DECREF safely, as the exception has another reference,
4441       so the object won't go away. */
4442    Py_DECREF(inputobj);
4443
4444    if (newpos<0)
4445        newpos = insize+newpos;
4446    if (newpos<0 || newpos>insize) {
4447        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4448        goto onError;
4449    }
4450
4451#if USE_UNICODE_WCHAR_CACHE
4452_Py_COMP_DIAG_PUSH
4453_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4454    repwlen = PyUnicode_GetSize(repunicode);
4455    if (repwlen < 0)
4456        goto onError;
4457_Py_COMP_DIAG_POP
4458#else /* USE_UNICODE_WCHAR_CACHE */
4459    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4460    if (repwlen < 0)
4461        goto onError;
4462    repwlen--;
4463#endif /* USE_UNICODE_WCHAR_CACHE */
4464    /* need more space? (at least enough for what we
4465       have+the replacement+the rest of the string (starting
4466       at the new input position), so we won't have to check space
4467       when there are no errors in the rest of the string) */
4468    requiredsize = *outpos;
4469    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4470        goto overflow;
4471    requiredsize += repwlen;
4472    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4473        goto overflow;
4474    requiredsize += insize - newpos;
4475    outsize = *bufsize;
4476    if (requiredsize > outsize) {
4477        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4478            requiredsize = 2*outsize;
4479        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4480            goto onError;
4481        }
4482    }
4483    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4484    *outpos += repwlen;
4485    *endinpos = newpos;
4486    *inptr = *input + newpos;
4487
4488    /* we made it! */
4489    Py_DECREF(restuple);
4490    return 0;
4491
4492  overflow:
4493    PyErr_SetString(PyExc_OverflowError,
4494                    "decoded result is too long for a Python string");
4495
4496  onError:
4497    Py_XDECREF(restuple);
4498    return -1;
4499}
4500#endif   /* MS_WINDOWS */
4501
4502static int
4503unicode_decode_call_errorhandler_writer(
4504    const char *errors, PyObject **errorHandler,
4505    const char *encoding, const char *reason,
4506    const char **input, const char **inend, Py_ssize_t *startinpos,
4507    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4508    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4509{
4510    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4511
4512    PyObject *restuple = NULL;
4513    PyObject *repunicode = NULL;
4514    Py_ssize_t insize;
4515    Py_ssize_t newpos;
4516    Py_ssize_t replen;
4517    Py_ssize_t remain;
4518    PyObject *inputobj = NULL;
4519    int need_to_grow = 0;
4520    const char *new_inptr;
4521
4522    if (*errorHandler == NULL) {
4523        *errorHandler = PyCodec_LookupError(errors);
4524        if (*errorHandler == NULL)
4525            goto onError;
4526    }
4527
4528    make_decode_exception(exceptionObject,
4529        encoding,
4530        *input, *inend - *input,
4531        *startinpos, *endinpos,
4532        reason);
4533    if (*exceptionObject == NULL)
4534        goto onError;
4535
4536    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4537    if (restuple == NULL)
4538        goto onError;
4539    if (!PyTuple_Check(restuple)) {
4540        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4541        goto onError;
4542    }
4543    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4544        goto onError;
4545
4546    /* Copy back the bytes variables, which might have been modified by the
4547       callback */
4548    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4549    if (!inputobj)
4550        goto onError;
4551    remain = *inend - *input - *endinpos;
4552    *input = PyBytes_AS_STRING(inputobj);
4553    insize = PyBytes_GET_SIZE(inputobj);
4554    *inend = *input + insize;
4555    /* we can DECREF safely, as the exception has another reference,
4556       so the object won't go away. */
4557    Py_DECREF(inputobj);
4558
4559    if (newpos<0)
4560        newpos = insize+newpos;
4561    if (newpos<0 || newpos>insize) {
4562        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4563        goto onError;
4564    }
4565
4566    replen = PyUnicode_GET_LENGTH(repunicode);
4567    if (replen > 1) {
4568        writer->min_length += replen - 1;
4569        need_to_grow = 1;
4570    }
4571    new_inptr = *input + newpos;
4572    if (*inend - new_inptr > remain) {
4573        /* We don't know the decoding algorithm here so we make the worst
4574           assumption that one byte decodes to one unicode character.
4575           If unfortunately one byte could decode to more unicode characters,
4576           the decoder may write out-of-bound then.  Is it possible for the
4577           algorithms using this function? */
4578        writer->min_length += *inend - new_inptr - remain;
4579        need_to_grow = 1;
4580    }
4581    if (need_to_grow) {
4582        writer->overallocate = 1;
4583        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4584                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4585            goto onError;
4586    }
4587    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4588        goto onError;
4589
4590    *endinpos = newpos;
4591    *inptr = new_inptr;
4592
4593    /* we made it! */
4594    Py_DECREF(restuple);
4595    return 0;
4596
4597  onError:
4598    Py_XDECREF(restuple);
4599    return -1;
4600}
4601
4602/* --- UTF-7 Codec -------------------------------------------------------- */
4603
4604/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4605
4606/* Three simple macros defining base-64. */
4607
/* True when c is one of the 64 base-64 alphabet characters
   (digits, letters, '+' or '/').  c is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))
4615
/* Map a base-64 character to its 6-bit value: 'A'-'Z' -> 0..25,
   'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, anything else -> 63.
   Only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                              \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :  \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :       \
     (c) == '+' ? 62 : 63)
4623
/* Base-64 character for the low 6 bits of n (higher bits are masked off). */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4628
/* DECODE_DIRECT: true when a byte met in a UTF-7 string stands for
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4636
4637/* The UTF-7 encoder treats ASCII characters differently according to
4638 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4639 * the above).  See RFC2152.  This array identifies these different
4640 * sets:
4641 * 0 : "Set D"
4642 *     alphanumeric and '(),-./:?
4643 * 1 : "Set O"
4644 *     !"#$%&*;<=>@[]^_`{|}
4645 * 2 : "whitespace"
4646 *     ht nl cr sp
4647 * 3 : special (must be base64 encoded)
4648 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4649 */
4650
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   of the 128 ASCII codes, indexed by character value.  The table is
   read-only lookup data, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4670
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127; set D characters (category 0) are always direct,
 * whitespace (category 2) only when directWS is true, and set O
 * (category 1) only when directO is true.  The flag arguments are
 * parenthesised so that compound expressions such as `a || b` can be
 * passed safely.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4682
4683PyObject *
4684PyUnicode_DecodeUTF7(const char *s,
4685                     Py_ssize_t size,
4686                     const char *errors)
4687{
4688    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4689}
4690
4691/* The decoder.  The only state we preserve is our read position,
4692 * i.e. how many characters we have consumed.  So if we end in the
4693 * middle of a shift sequence we have to back off the read position
4694 * and the output to the beginning of the sequence, otherwise we lose
4695 * all the shift state (seen bits, number of bits seen, high
4696 * surrogate). */
4697
4698PyObject *
4699PyUnicode_DecodeUTF7Stateful(const char *s,
4700                             Py_ssize_t size,
4701                             const char *errors,
4702                             Py_ssize_t *consumed)
4703{
4704    const char *starts = s;
4705    Py_ssize_t startinpos;
4706    Py_ssize_t endinpos;
4707    const char *e;
4708    _PyUnicodeWriter writer;
4709    const char *errmsg = "";
4710    int inShift = 0;
4711    Py_ssize_t shiftOutStart;
4712    unsigned int base64bits = 0;
4713    unsigned long base64buffer = 0;
4714    Py_UCS4 surrogate = 0;
4715    PyObject *errorHandler = NULL;
4716    PyObject *exc = NULL;
4717
4718    if (size == 0) {
4719        if (consumed)
4720            *consumed = 0;
4721        _Py_RETURN_UNICODE_EMPTY();
4722    }
4723
4724    /* Start off assuming it's all ASCII. Widen later as necessary. */
4725    _PyUnicodeWriter_Init(&writer);
4726    writer.min_length = size;
4727
4728    shiftOutStart = 0;
4729    e = s + size;
4730
4731    while (s < e) {
4732        Py_UCS4 ch;
4733      restart:
4734        ch = (unsigned char) *s;
4735
4736        if (inShift) { /* in a base-64 section */
4737            if (IS_BASE64(ch)) { /* consume a base-64 character */
4738                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4739                base64bits += 6;
4740                s++;
4741                if (base64bits >= 16) {
4742                    /* we have enough bits for a UTF-16 value */
4743                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4744                    base64bits -= 16;
4745                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4746                    assert(outCh <= 0xffff);
4747                    if (surrogate) {
4748                        /* expecting a second surrogate */
4749                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4750                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4751                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4752                                goto onError;
4753                            surrogate = 0;
4754                            continue;
4755                        }
4756                        else {
4757                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4758                                goto onError;
4759                            surrogate = 0;
4760                        }
4761                    }
4762                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4763                        /* first surrogate */
4764                        surrogate = outCh;
4765                    }
4766                    else {
4767                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4768                            goto onError;
4769                    }
4770                }
4771            }
4772            else { /* now leaving a base-64 section */
4773                inShift = 0;
4774                if (base64bits > 0) { /* left-over bits */
4775                    if (base64bits >= 6) {
4776                        /* We've seen at least one base-64 character */
4777                        s++;
4778                        errmsg = "partial character in shift sequence";
4779                        goto utf7Error;
4780                    }
4781                    else {
4782                        /* Some bits remain; they should be zero */
4783                        if (base64buffer != 0) {
4784                            s++;
4785                            errmsg = "non-zero padding bits in shift sequence";
4786                            goto utf7Error;
4787                        }
4788                    }
4789                }
4790                if (surrogate && DECODE_DIRECT(ch)) {
4791                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4792                        goto onError;
4793                }
4794                surrogate = 0;
4795                if (ch == '-') {
4796                    /* '-' is absorbed; other terminating
4797                       characters are preserved */
4798                    s++;
4799                }
4800            }
4801        }
4802        else if ( ch == '+' ) {
4803            startinpos = s-starts;
4804            s++; /* consume '+' */
4805            if (s < e && *s == '-') { /* '+-' encodes '+' */
4806                s++;
4807                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4808                    goto onError;
4809            }
4810            else if (s < e && !IS_BASE64(*s)) {
4811                s++;
4812                errmsg = "ill-formed sequence";
4813                goto utf7Error;
4814            }
4815            else { /* begin base64-encoded section */
4816                inShift = 1;
4817                surrogate = 0;
4818                shiftOutStart = writer.pos;
4819                base64bits = 0;
4820                base64buffer = 0;
4821            }
4822        }
4823        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4824            s++;
4825            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4826                goto onError;
4827        }
4828        else {
4829            startinpos = s-starts;
4830            s++;
4831            errmsg = "unexpected special character";
4832            goto utf7Error;
4833        }
4834        continue;
4835utf7Error:
4836        endinpos = s-starts;
4837        if (unicode_decode_call_errorhandler_writer(
4838                errors, &errorHandler,
4839                "utf7", errmsg,
4840                &starts, &e, &startinpos, &endinpos, &exc, &s,
4841                &writer))
4842            goto onError;
4843    }
4844
4845    /* end of string */
4846
4847    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4848        /* if we're in an inconsistent state, that's an error */
4849        inShift = 0;
4850        if (surrogate ||
4851                (base64bits >= 6) ||
4852                (base64bits > 0 && base64buffer != 0)) {
4853            endinpos = size;
4854            if (unicode_decode_call_errorhandler_writer(
4855                    errors, &errorHandler,
4856                    "utf7", "unterminated shift sequence",
4857                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4858                    &writer))
4859                goto onError;
4860            if (s < e)
4861                goto restart;
4862        }
4863    }
4864
4865    /* return state */
4866    if (consumed) {
4867        if (inShift) {
4868            *consumed = startinpos;
4869            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4870                PyObject *result = PyUnicode_FromKindAndData(
4871                        writer.kind, writer.data, shiftOutStart);
4872                Py_XDECREF(errorHandler);
4873                Py_XDECREF(exc);
4874                _PyUnicodeWriter_Dealloc(&writer);
4875                return result;
4876            }
4877            writer.pos = shiftOutStart; /* back off output */
4878        }
4879        else {
4880            *consumed = s-starts;
4881        }
4882    }
4883
4884    Py_XDECREF(errorHandler);
4885    Py_XDECREF(exc);
4886    return _PyUnicodeWriter_Finish(&writer);
4887
4888  onError:
4889    Py_XDECREF(errorHandler);
4890    Py_XDECREF(exc);
4891    _PyUnicodeWriter_Dealloc(&writer);
4892    return NULL;
4893}
4894
4895
4896PyObject *
4897_PyUnicode_EncodeUTF7(PyObject *str,
4898                      int base64SetO,
4899                      int base64WhiteSpace,
4900                      const char *errors)
4901{
4902    int kind;
4903    const void *data;
4904    Py_ssize_t len;
4905    PyObject *v;
4906    int inShift = 0;
4907    Py_ssize_t i;
4908    unsigned int base64bits = 0;
4909    unsigned long base64buffer = 0;
4910    char * out;
4911    const char * start;
4912
4913    if (PyUnicode_READY(str) == -1)
4914        return NULL;
4915    kind = PyUnicode_KIND(str);
4916    data = PyUnicode_DATA(str);
4917    len = PyUnicode_GET_LENGTH(str);
4918
4919    if (len == 0)
4920        return PyBytes_FromStringAndSize(NULL, 0);
4921
4922    /* It might be possible to tighten this worst case */
4923    if (len > PY_SSIZE_T_MAX / 8)
4924        return PyErr_NoMemory();
4925    v = PyBytes_FromStringAndSize(NULL, len * 8);
4926    if (v == NULL)
4927        return NULL;
4928
4929    start = out = PyBytes_AS_STRING(v);
4930    for (i = 0; i < len; ++i) {
4931        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4932
4933        if (inShift) {
4934            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4935                /* shifting out */
4936                if (base64bits) { /* output remaining bits */
4937                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4938                    base64buffer = 0;
4939                    base64bits = 0;
4940                }
4941                inShift = 0;
4942                /* Characters not in the BASE64 set implicitly unshift the sequence
4943                   so no '-' is required, except if the character is itself a '-' */
4944                if (IS_BASE64(ch) || ch == '-') {
4945                    *out++ = '-';
4946                }
4947                *out++ = (char) ch;
4948            }
4949            else {
4950                goto encode_char;
4951            }
4952        }
4953        else { /* not in a shift sequence */
4954            if (ch == '+') {
4955                *out++ = '+';
4956                        *out++ = '-';
4957            }
4958            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4959                *out++ = (char) ch;
4960            }
4961            else {
4962                *out++ = '+';
4963                inShift = 1;
4964                goto encode_char;
4965            }
4966        }
4967        continue;
4968encode_char:
4969        if (ch >= 0x10000) {
4970            assert(ch <= MAX_UNICODE);
4971
4972            /* code first surrogate */
4973            base64bits += 16;
4974            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4975            while (base64bits >= 6) {
4976                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4977                base64bits -= 6;
4978            }
4979            /* prepare second surrogate */
4980            ch = Py_UNICODE_LOW_SURROGATE(ch);
4981        }
4982        base64bits += 16;
4983        base64buffer = (base64buffer << 16) | ch;
4984        while (base64bits >= 6) {
4985            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4986            base64bits -= 6;
4987        }
4988    }
4989    if (base64bits)
4990        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4991    if (inShift)
4992        *out++ = '-';
4993    if (_PyBytes_Resize(&v, out - start) < 0)
4994        return NULL;
4995    return v;
4996}
4997
4998#undef IS_BASE64
4999#undef FROM_BASE64
5000#undef TO_BASE64
5001#undef DECODE_DIRECT
5002#undef ENCODE_DIRECT
5003
5004/* --- UTF-8 Codec -------------------------------------------------------- */
5005
5006PyObject *
5007PyUnicode_DecodeUTF8(const char *s,
5008                     Py_ssize_t size,
5009                     const char *errors)
5010{
5011    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5012}
5013
5014#include "stringlib/asciilib.h"
5015#include "stringlib/codecs.h"
5016#include "stringlib/undef.h"
5017
5018#include "stringlib/ucs1lib.h"
5019#include "stringlib/codecs.h"
5020#include "stringlib/undef.h"
5021
5022#include "stringlib/ucs2lib.h"
5023#include "stringlib/codecs.h"
5024#include "stringlib/undef.h"
5025
5026#include "stringlib/ucs4lib.h"
5027#include "stringlib/codecs.h"
5028#include "stringlib/undef.h"
5029
/* Mask with the high bit of every byte of a C 'size_t' set: a word ANDed
   with it is non-zero iff the word contains at least one non-ASCII byte. */
5032#if (SIZEOF_SIZE_T == 8)
5033# define ASCII_CHAR_MASK 0x8080808080808080ULL
5034#elif (SIZEOF_SIZE_T == 4)
5035# define ASCII_CHAR_MASK 0x80808080U
5036#else
5037# error C 'size_t' size should be either 4 or 8!
5038#endif
5039
5040static Py_ssize_t
5041ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5042{
5043    const char *p = start;
5044
5045#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5046    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5047    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5048        /* Fast path, see in STRINGLIB(utf8_decode) for
5049           an explanation. */
5050        /* Help allocation */
5051        const char *_p = p;
5052        Py_UCS1 * q = dest;
5053        while (_p + SIZEOF_SIZE_T <= end) {
5054            size_t value = *(const size_t *) _p;
5055            if (value & ASCII_CHAR_MASK)
5056                break;
5057            *((size_t *)q) = value;
5058            _p += SIZEOF_SIZE_T;
5059            q += SIZEOF_SIZE_T;
5060        }
5061        p = _p;
5062        while (p < end) {
5063            if ((unsigned char)*p & 0x80)
5064                break;
5065            *q++ = *p++;
5066        }
5067        return p - start;
5068    }
5069#endif
5070    while (p < end) {
5071        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5072           for an explanation. */
5073        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5074            /* Help allocation */
5075            const char *_p = p;
5076            while (_p + SIZEOF_SIZE_T <= end) {
5077                size_t value = *(const size_t *) _p;
5078                if (value & ASCII_CHAR_MASK)
5079                    break;
5080                _p += SIZEOF_SIZE_T;
5081            }
5082            p = _p;
5083            if (_p == end)
5084                break;
5085        }
5086        if ((unsigned char)*p & 0x80)
5087            break;
5088        ++p;
5089    }
5090    memcpy(dest, start, p - start);
5091    return p - start;
5092}
5093
5094static PyObject *
5095unicode_decode_utf8(const char *s, Py_ssize_t size,
5096                    _Py_error_handler error_handler, const char *errors,
5097                    Py_ssize_t *consumed)
5098{
5099    if (size == 0) {
5100        if (consumed)
5101            *consumed = 0;
5102        _Py_RETURN_UNICODE_EMPTY();
5103    }
5104
5105    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5106    if (size == 1 && (unsigned char)s[0] < 128) {
5107        if (consumed) {
5108            *consumed = 1;
5109        }
5110        return get_latin1_char((unsigned char)s[0]);
5111    }
5112
5113    const char *starts = s;
5114    const char *end = s + size;
5115
5116    // fast path: try ASCII string.
5117    PyObject *u = PyUnicode_New(size, 127);
5118    if (u == NULL) {
5119        return NULL;
5120    }
5121    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5122    if (s == end) {
5123        return u;
5124    }
5125
5126    // Use _PyUnicodeWriter after fast path is failed.
5127    _PyUnicodeWriter writer;
5128    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5129    writer.pos = s - starts;
5130
5131    Py_ssize_t startinpos, endinpos;
5132    const char *errmsg = "";
5133    PyObject *error_handler_obj = NULL;
5134    PyObject *exc = NULL;
5135
5136    while (s < end) {
5137        Py_UCS4 ch;
5138        int kind = writer.kind;
5139
5140        if (kind == PyUnicode_1BYTE_KIND) {
5141            if (PyUnicode_IS_ASCII(writer.buffer))
5142                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5143            else
5144                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5145        } else if (kind == PyUnicode_2BYTE_KIND) {
5146            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5147        } else {
5148            assert(kind == PyUnicode_4BYTE_KIND);
5149            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5150        }
5151
5152        switch (ch) {
5153        case 0:
5154            if (s == end || consumed)
5155                goto End;
5156            errmsg = "unexpected end of data";
5157            startinpos = s - starts;
5158            endinpos = end - starts;
5159            break;
5160        case 1:
5161            errmsg = "invalid start byte";
5162            startinpos = s - starts;
5163            endinpos = startinpos + 1;
5164            break;
5165        case 2:
5166            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5167                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5168            {
5169                /* Truncated surrogate code in range D800-DFFF */
5170                goto End;
5171            }
5172            /* fall through */
5173        case 3:
5174        case 4:
5175            errmsg = "invalid continuation byte";
5176            startinpos = s - starts;
5177            endinpos = startinpos + ch - 1;
5178            break;
5179        default:
5180            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5181                goto onError;
5182            continue;
5183        }
5184
5185        if (error_handler == _Py_ERROR_UNKNOWN)
5186            error_handler = _Py_GetErrorHandler(errors);
5187
5188        switch (error_handler) {
5189        case _Py_ERROR_IGNORE:
5190            s += (endinpos - startinpos);
5191            break;
5192
5193        case _Py_ERROR_REPLACE:
5194            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5195                goto onError;
5196            s += (endinpos - startinpos);
5197            break;
5198
5199        case _Py_ERROR_SURROGATEESCAPE:
5200        {
5201            Py_ssize_t i;
5202
5203            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5204                goto onError;
5205            for (i=startinpos; i<endinpos; i++) {
5206                ch = (Py_UCS4)(unsigned char)(starts[i]);
5207                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5208                                ch + 0xdc00);
5209                writer.pos++;
5210            }
5211            s += (endinpos - startinpos);
5212            break;
5213        }
5214
5215        default:
5216            if (unicode_decode_call_errorhandler_writer(
5217                    errors, &error_handler_obj,
5218                    "utf-8", errmsg,
5219                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5220                    &writer))
5221                goto onError;
5222        }
5223    }
5224
5225End:
5226    if (consumed)
5227        *consumed = s - starts;
5228
5229    Py_XDECREF(error_handler_obj);
5230    Py_XDECREF(exc);
5231    return _PyUnicodeWriter_Finish(&writer);
5232
5233onError:
5234    Py_XDECREF(error_handler_obj);
5235    Py_XDECREF(exc);
5236    _PyUnicodeWriter_Dealloc(&writer);
5237    return NULL;
5238}
5239
5240
5241PyObject *
5242PyUnicode_DecodeUTF8Stateful(const char *s,
5243                             Py_ssize_t size,
5244                             const char *errors,
5245                             Py_ssize_t *consumed)
5246{
5247    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5248}
5249
5250
5251/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5252   non-zero, use strict error handler otherwise.
5253
5254   On success, write a pointer to a newly allocated wide character string into
5255   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5256   (in number of wchar_t units) into *wlen (if wlen is set).
5257
5258   On memory allocation failure, return -1.
5259
5260   On decoding error (if surrogateescape is zero), return -2. If wlen is
5261   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5262   is not NULL, write the decoding error message into *reason. */
5263int
5264_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5265                 const char **reason, _Py_error_handler errors)
5266{
5267    const char *orig_s = s;
5268    const char *e;
5269    wchar_t *unicode;
5270    Py_ssize_t outpos;
5271
5272    int surrogateescape = 0;
5273    int surrogatepass = 0;
5274    switch (errors)
5275    {
5276    case _Py_ERROR_STRICT:
5277        break;
5278    case _Py_ERROR_SURROGATEESCAPE:
5279        surrogateescape = 1;
5280        break;
5281    case _Py_ERROR_SURROGATEPASS:
5282        surrogatepass = 1;
5283        break;
5284    default:
5285        return -3;
5286    }
5287
5288    /* Note: size will always be longer than the resulting Unicode
5289       character count */
5290    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5291        return -1;
5292    }
5293
5294    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5295    if (!unicode) {
5296        return -1;
5297    }
5298
5299    /* Unpack UTF-8 encoded data */
5300    e = s + size;
5301    outpos = 0;
5302    while (s < e) {
5303        Py_UCS4 ch;
5304#if SIZEOF_WCHAR_T == 4
5305        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5306#else
5307        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5308#endif
5309        if (ch > 0xFF) {
5310#if SIZEOF_WCHAR_T == 4
5311            Py_UNREACHABLE();
5312#else
5313            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5314            /* write a surrogate pair */
5315            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5316            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5317#endif
5318        }
5319        else {
5320            if (!ch && s == e) {
5321                break;
5322            }
5323
5324            if (surrogateescape) {
5325                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5326            }
5327            else {
5328                /* Is it a valid three-byte code? */
5329                if (surrogatepass
5330                    && (e - s) >= 3
5331                    && (s[0] & 0xf0) == 0xe0
5332                    && (s[1] & 0xc0) == 0x80
5333                    && (s[2] & 0xc0) == 0x80)
5334                {
5335                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5336                    s += 3;
5337                    unicode[outpos++] = ch;
5338                }
5339                else {
5340                    PyMem_RawFree(unicode );
5341                    if (reason != NULL) {
5342                        switch (ch) {
5343                        case 0:
5344                            *reason = "unexpected end of data";
5345                            break;
5346                        case 1:
5347                            *reason = "invalid start byte";
5348                            break;
5349                        /* 2, 3, 4 */
5350                        default:
5351                            *reason = "invalid continuation byte";
5352                            break;
5353                        }
5354                    }
5355                    if (wlen != NULL) {
5356                        *wlen = s - orig_s;
5357                    }
5358                    return -2;
5359                }
5360            }
5361        }
5362    }
5363    unicode[outpos] = L'\0';
5364    if (wlen) {
5365        *wlen = outpos;
5366    }
5367    *wstr = unicode;
5368    return 0;
5369}
5370
5371
5372wchar_t*
5373_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5374                               size_t *wlen)
5375{
5376    wchar_t *wstr;
5377    int res = _Py_DecodeUTF8Ex(arg, arglen,
5378                               &wstr, wlen,
5379                               NULL, _Py_ERROR_SURROGATEESCAPE);
5380    if (res != 0) {
5381        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5382        assert(res != -3);
5383        if (wlen) {
5384            *wlen = (size_t)res;
5385        }
5386        return NULL;
5387    }
5388    return wstr;
5389}
5390
5391
5392/* UTF-8 encoder using the surrogateescape error handler .
5393
5394   On success, return 0 and write the newly allocated character string (use
5395   PyMem_Free() to free the memory) into *str.
5396
5397   On encoding failure, return -2 and write the position of the invalid
5398   surrogate character into *error_pos (if error_pos is set) and the decoding
5399   error message into *reason (if reason is set).
5400
5401   On memory allocation failure, return -1. */
5402int
5403_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5404                 const char **reason, int raw_malloc, _Py_error_handler errors)
5405{
5406    const Py_ssize_t max_char_size = 4;
5407    Py_ssize_t len = wcslen(text);
5408
5409    assert(len >= 0);
5410
5411    int surrogateescape = 0;
5412    int surrogatepass = 0;
5413    switch (errors)
5414    {
5415    case _Py_ERROR_STRICT:
5416        break;
5417    case _Py_ERROR_SURROGATEESCAPE:
5418        surrogateescape = 1;
5419        break;
5420    case _Py_ERROR_SURROGATEPASS:
5421        surrogatepass = 1;
5422        break;
5423    default:
5424        return -3;
5425    }
5426
5427    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5428        return -1;
5429    }
5430    char *bytes;
5431    if (raw_malloc) {
5432        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5433    }
5434    else {
5435        bytes = PyMem_Malloc((len + 1) * max_char_size);
5436    }
5437    if (bytes == NULL) {
5438        return -1;
5439    }
5440
5441    char *p = bytes;
5442    Py_ssize_t i;
5443    for (i = 0; i < len; ) {
5444        Py_ssize_t ch_pos = i;
5445        Py_UCS4 ch = text[i];
5446        i++;
5447#if Py_UNICODE_SIZE == 2
5448        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5449            && i < len
5450            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5451        {
5452            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5453            i++;
5454        }
5455#endif
5456
5457        if (ch < 0x80) {
5458            /* Encode ASCII */
5459            *p++ = (char) ch;
5460
5461        }
5462        else if (ch < 0x0800) {
5463            /* Encode Latin-1 */
5464            *p++ = (char)(0xc0 | (ch >> 6));
5465            *p++ = (char)(0x80 | (ch & 0x3f));
5466        }
5467        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5468            /* surrogateescape error handler */
5469            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5470                if (error_pos != NULL) {
5471                    *error_pos = (size_t)ch_pos;
5472                }
5473                if (reason != NULL) {
5474                    *reason = "encoding error";
5475                }
5476                if (raw_malloc) {
5477                    PyMem_RawFree(bytes);
5478                }
5479                else {
5480                    PyMem_Free(bytes);
5481                }
5482                return -2;
5483            }
5484            *p++ = (char)(ch & 0xff);
5485        }
5486        else if (ch < 0x10000) {
5487            *p++ = (char)(0xe0 | (ch >> 12));
5488            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489            *p++ = (char)(0x80 | (ch & 0x3f));
5490        }
5491        else {  /* ch >= 0x10000 */
5492            assert(ch <= MAX_UNICODE);
5493            /* Encode UCS4 Unicode ordinals */
5494            *p++ = (char)(0xf0 | (ch >> 18));
5495            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5496            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497            *p++ = (char)(0x80 | (ch & 0x3f));
5498        }
5499    }
5500    *p++ = '\0';
5501
5502    size_t final_size = (p - bytes);
5503    char *bytes2;
5504    if (raw_malloc) {
5505        bytes2 = PyMem_RawRealloc(bytes, final_size);
5506    }
5507    else {
5508        bytes2 = PyMem_Realloc(bytes, final_size);
5509    }
5510    if (bytes2 == NULL) {
5511        if (error_pos != NULL) {
5512            *error_pos = (size_t)-1;
5513        }
5514        if (raw_malloc) {
5515            PyMem_RawFree(bytes);
5516        }
5517        else {
5518            PyMem_Free(bytes);
5519        }
5520        return -1;
5521    }
5522    *str = bytes2;
5523    return 0;
5524}
5525
5526
5527/* Primary internal function which creates utf8 encoded bytes objects.
5528
5529   Allocation strategy:  if the string is short, convert into a stack buffer
5530   and allocate exactly as much space needed at the end.  Else allocate the
5531   maximum possible needed (4 result bytes per Unicode character), and return
5532   the excess memory at the end.
5533*/
5534static PyObject *
5535unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5536                    const char *errors)
5537{
5538    if (!PyUnicode_Check(unicode)) {
5539        PyErr_BadArgument();
5540        return NULL;
5541    }
5542
5543    if (PyUnicode_READY(unicode) == -1)
5544        return NULL;
5545
5546    if (PyUnicode_UTF8(unicode))
5547        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548                                         PyUnicode_UTF8_LENGTH(unicode));
5549
5550    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551    const void *data = PyUnicode_DATA(unicode);
5552    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5553
5554    _PyBytesWriter writer;
5555    char *end;
5556
5557    switch (kind) {
5558    default:
5559        Py_UNREACHABLE();
5560    case PyUnicode_1BYTE_KIND:
5561        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5562        assert(!PyUnicode_IS_ASCII(unicode));
5563        end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5564        break;
5565    case PyUnicode_2BYTE_KIND:
5566        end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5567        break;
5568    case PyUnicode_4BYTE_KIND:
5569        end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5570        break;
5571    }
5572
5573    if (end == NULL) {
5574        _PyBytesWriter_Dealloc(&writer);
5575        return NULL;
5576    }
5577    return _PyBytesWriter_Finish(&writer, end);
5578}
5579
5580static int
5581unicode_fill_utf8(PyObject *unicode)
5582{
5583    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5584    assert(!PyUnicode_IS_ASCII(unicode));
5585
5586    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587    const void *data = PyUnicode_DATA(unicode);
5588    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5589
5590    _PyBytesWriter writer;
5591    char *end;
5592
5593    switch (kind) {
5594    default:
5595        Py_UNREACHABLE();
5596    case PyUnicode_1BYTE_KIND:
5597        end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5598                                   _Py_ERROR_STRICT, NULL);
5599        break;
5600    case PyUnicode_2BYTE_KIND:
5601        end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5602                                   _Py_ERROR_STRICT, NULL);
5603        break;
5604    case PyUnicode_4BYTE_KIND:
5605        end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5606                                   _Py_ERROR_STRICT, NULL);
5607        break;
5608    }
5609    if (end == NULL) {
5610        _PyBytesWriter_Dealloc(&writer);
5611        return -1;
5612    }
5613
5614    const char *start = writer.use_small_buffer ? writer.small_buffer :
5615                    PyBytes_AS_STRING(writer.buffer);
5616    Py_ssize_t len = end - start;
5617
5618    char *cache = PyObject_Malloc(len + 1);
5619    if (cache == NULL) {
5620        _PyBytesWriter_Dealloc(&writer);
5621        PyErr_NoMemory();
5622        return -1;
5623    }
5624    _PyUnicode_UTF8(unicode) = cache;
5625    _PyUnicode_UTF8_LENGTH(unicode) = len;
5626    memcpy(cache, start, len);
5627    cache[len] = '\0';
5628    _PyBytesWriter_Dealloc(&writer);
5629    return 0;
5630}
5631
5632PyObject *
5633_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5634{
5635    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5636}
5637
5638
5639PyObject *
5640PyUnicode_AsUTF8String(PyObject *unicode)
5641{
5642    return _PyUnicode_AsUTF8String(unicode, NULL);
5643}
5644
5645/* --- UTF-32 Codec ------------------------------------------------------- */
5646
5647PyObject *
5648PyUnicode_DecodeUTF32(const char *s,
5649                      Py_ssize_t size,
5650                      const char *errors,
5651                      int *byteorder)
5652{
5653    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5654}
5655
/* Decode `size` bytes starting at `s` as UTF-32.

   s, size    Input buffer and its length in bytes.
   errors     Error-handler name forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder  Optional in/out.  When NULL or *byteorder == 0, the first four
              bytes are checked for a BOM; a BOM is skipped and selects
              little-endian (-1, bytes FF FE 00 00) or big-endian
              (1, bytes 00 00 FE FF).  *byteorder is written back only when
              that check ran (initial order 0 and size >= 4).
   consumed   Optional out.  When non-NULL, fewer than four trailing bytes
              are not an error: decoding stops there and *consumed receives
              the number of input bytes processed.

   Returns the result of _PyUnicodeWriter_Finish() on success (or the empty
   string when nothing follows the BOM), NULL on failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* Read the first four bytes as a little-endian value; the cast keeps
           the top byte's shift in unsigned arithmetic. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the whole input was consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "no BOM seen": fall back to the byte order
       of the build target. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    /* Name reported to the error handler. */
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    /* One output character per four input bytes, rounded up. */
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Re-read on every pass: the writer's buffer may have changed since
           the previous iteration. */
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: store code points straight into the writer's buffer
               for as long as each one fits the current maximum character and
               is not a surrogate.  `ch` keeps the value that stopped the
               loop so the code below can classify it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    /* Skipped for the 1-byte kind; presumably any surrogate
                       already failed the ch > maxch test there. */
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The fast path ran out of complete 4-byte units (or never had
               one).  Either the input is finished, or the caller accepts a
               partial tail via `consumed`, or the tail is an error. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the current buffer: write it through the
               writer API if it is below 0x110000, otherwise report it. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5800
5801PyObject *
5802_PyUnicode_EncodeUTF32(PyObject *str,
5803                       const char *errors,
5804                       int byteorder)
5805{
5806    enum PyUnicode_Kind kind;
5807    const void *data;
5808    Py_ssize_t len;
5809    PyObject *v;
5810    uint32_t *out;
5811#if PY_LITTLE_ENDIAN
5812    int native_ordering = byteorder <= 0;
5813#else
5814    int native_ordering = byteorder >= 0;
5815#endif
5816    const char *encoding;
5817    Py_ssize_t nsize, pos;
5818    PyObject *errorHandler = NULL;
5819    PyObject *exc = NULL;
5820    PyObject *rep = NULL;
5821
5822    if (!PyUnicode_Check(str)) {
5823        PyErr_BadArgument();
5824        return NULL;
5825    }
5826    if (PyUnicode_READY(str) == -1)
5827        return NULL;
5828    kind = PyUnicode_KIND(str);
5829    data = PyUnicode_DATA(str);
5830    len = PyUnicode_GET_LENGTH(str);
5831
5832    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5833        return PyErr_NoMemory();
5834    nsize = len + (byteorder == 0);
5835    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5836    if (v == NULL)
5837        return NULL;
5838
5839    /* output buffer is 4-bytes aligned */
5840    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5841    out = (uint32_t *)PyBytes_AS_STRING(v);
5842    if (byteorder == 0)
5843        *out++ = 0xFEFF;
5844    if (len == 0)
5845        goto done;
5846
5847    if (byteorder == -1)
5848        encoding = "utf-32-le";
5849    else if (byteorder == 1)
5850        encoding = "utf-32-be";
5851    else
5852        encoding = "utf-32";
5853
5854    if (kind == PyUnicode_1BYTE_KIND) {
5855        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5856        goto done;
5857    }
5858
5859    pos = 0;
5860    while (pos < len) {
5861        Py_ssize_t newpos, repsize, moreunits;
5862
5863        if (kind == PyUnicode_2BYTE_KIND) {
5864            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5865                                        &out, native_ordering);
5866        }
5867        else {
5868            assert(kind == PyUnicode_4BYTE_KIND);
5869            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5870                                        &out, native_ordering);
5871        }
5872        if (pos == len)
5873            break;
5874
5875        rep = unicode_encode_call_errorhandler(
5876                errors, &errorHandler,
5877                encoding, "surrogates not allowed",
5878                str, &exc, pos, pos + 1, &newpos);
5879        if (!rep)
5880            goto error;
5881
5882        if (PyBytes_Check(rep)) {
5883            repsize = PyBytes_GET_SIZE(rep);
5884            if (repsize & 3) {
5885                raise_encode_exception(&exc, encoding,
5886                                       str, pos, pos + 1,
5887                                       "surrogates not allowed");
5888                goto error;
5889            }
5890            moreunits = repsize / 4;
5891        }
5892        else {
5893            assert(PyUnicode_Check(rep));
5894            if (PyUnicode_READY(rep) < 0)
5895                goto error;
5896            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5897            if (!PyUnicode_IS_ASCII(rep)) {
5898                raise_encode_exception(&exc, encoding,
5899                                       str, pos, pos + 1,
5900                                       "surrogates not allowed");
5901                goto error;
5902            }
5903        }
5904        moreunits += pos - newpos;
5905        pos = newpos;
5906
5907        /* four bytes are reserved for each surrogate */
5908        if (moreunits > 0) {
5909            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5910            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5911                /* integer overflow */
5912                PyErr_NoMemory();
5913                goto error;
5914            }
5915            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
5916                goto error;
5917            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5918        }
5919
5920        if (PyBytes_Check(rep)) {
5921            memcpy(out, PyBytes_AS_STRING(rep), repsize);
5922            out += repsize / 4;
5923        } else /* rep is unicode */ {
5924            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5925            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5926                                 &out, native_ordering);
5927        }
5928
5929        Py_CLEAR(rep);
5930    }
5931
5932    /* Cut back to size actually needed. This is necessary for, for example,
5933       encoding of a string containing isolated surrogates and the 'ignore'
5934       handler is used. */
5935    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5936    if (nsize != PyBytes_GET_SIZE(v))
5937      _PyBytes_Resize(&v, nsize);
5938    Py_XDECREF(errorHandler);
5939    Py_XDECREF(exc);
5940  done:
5941    return v;
5942  error:
5943    Py_XDECREF(rep);
5944    Py_XDECREF(errorHandler);
5945    Py_XDECREF(exc);
5946    Py_XDECREF(v);
5947    return NULL;
5948}
5949
5950PyObject *
5951PyUnicode_AsUTF32String(PyObject *unicode)
5952{
5953    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5954}
5955
5956/* --- UTF-16 Codec ------------------------------------------------------- */
5957
5958PyObject *
5959PyUnicode_DecodeUTF16(const char *s,
5960                      Py_ssize_t size,
5961                      const char *errors,
5962                      int *byteorder)
5963{
5964    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5965}
5966
/* Decode `size` bytes starting at `s` as UTF-16.

   s, size    Input buffer and its length in bytes.
   errors     Error-handler name forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder  Optional in/out.  When NULL or *byteorder == 0, the first two
              bytes are checked for a BOM; a BOM is skipped and selects
              little-endian (-1, bytes FF FE) or big-endian (1, bytes FE FF).
              *byteorder is written back only when that check ran (initial
              order 0 and size >= 2).
   consumed   Optional out.  When non-NULL, an incomplete tail (status 0 or 1
              in the switch below) is not an error: decoding stops and
              *consumed receives the number of input bytes processed.

   Returns the result of _PyUnicodeWriter_Finish() on success (or the empty
   string when nothing follows the BOM), NULL on failure. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* First two bytes read as a little-endian 16-bit value. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the whole input was consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 here means "no BOM seen" and selects the build target's byte
       order; `encoding` is the name reported to the error handler. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    /* One output character per two input bytes, rounded up. */
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Pick the helper that matches the writer's current storage
               kind.  Each helper advances q and writer.pos and returns
               either a status code (0-3, handled by the switch below) or a
               code point it did not store itself. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step back over the two bytes already read so that they are
               either left unconsumed or reported as part of the error. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* Error span: the two bytes just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* Error span: the first two of the four bytes just before q. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* A code point handed back by the helper: store it through the
               writer API and resume decoding. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6121
6122PyObject *
6123_PyUnicode_EncodeUTF16(PyObject *str,
6124                       const char *errors,
6125                       int byteorder)
6126{
6127    enum PyUnicode_Kind kind;
6128    const void *data;
6129    Py_ssize_t len;
6130    PyObject *v;
6131    unsigned short *out;
6132    Py_ssize_t pairs;
6133#if PY_BIG_ENDIAN
6134    int native_ordering = byteorder >= 0;
6135#else
6136    int native_ordering = byteorder <= 0;
6137#endif
6138    const char *encoding;
6139    Py_ssize_t nsize, pos;
6140    PyObject *errorHandler = NULL;
6141    PyObject *exc = NULL;
6142    PyObject *rep = NULL;
6143
6144    if (!PyUnicode_Check(str)) {
6145        PyErr_BadArgument();
6146        return NULL;
6147    }
6148    if (PyUnicode_READY(str) == -1)
6149        return NULL;
6150    kind = PyUnicode_KIND(str);
6151    data = PyUnicode_DATA(str);
6152    len = PyUnicode_GET_LENGTH(str);
6153
6154    pairs = 0;
6155    if (kind == PyUnicode_4BYTE_KIND) {
6156        const Py_UCS4 *in = (const Py_UCS4 *)data;
6157        const Py_UCS4 *end = in + len;
6158        while (in < end) {
6159            if (*in++ >= 0x10000) {
6160                pairs++;
6161            }
6162        }
6163    }
6164    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6165        return PyErr_NoMemory();
6166    }
6167    nsize = len + pairs + (byteorder == 0);
6168    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6169    if (v == NULL) {
6170        return NULL;
6171    }
6172
6173    /* output buffer is 2-bytes aligned */
6174    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6175    out = (unsigned short *)PyBytes_AS_STRING(v);
6176    if (byteorder == 0) {
6177        *out++ = 0xFEFF;
6178    }
6179    if (len == 0) {
6180        goto done;
6181    }
6182
6183    if (kind == PyUnicode_1BYTE_KIND) {
6184        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6185        goto done;
6186    }
6187
6188    if (byteorder < 0) {
6189        encoding = "utf-16-le";
6190    }
6191    else if (byteorder > 0) {
6192        encoding = "utf-16-be";
6193    }
6194    else {
6195        encoding = "utf-16";
6196    }
6197
6198    pos = 0;
6199    while (pos < len) {
6200        Py_ssize_t newpos, repsize, moreunits;
6201
6202        if (kind == PyUnicode_2BYTE_KIND) {
6203            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6204                                        &out, native_ordering);
6205        }
6206        else {
6207            assert(kind == PyUnicode_4BYTE_KIND);
6208            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6209                                        &out, native_ordering);
6210        }
6211        if (pos == len)
6212            break;
6213
6214        rep = unicode_encode_call_errorhandler(
6215                errors, &errorHandler,
6216                encoding, "surrogates not allowed",
6217                str, &exc, pos, pos + 1, &newpos);
6218        if (!rep)
6219            goto error;
6220
6221        if (PyBytes_Check(rep)) {
6222            repsize = PyBytes_GET_SIZE(rep);
6223            if (repsize & 1) {
6224                raise_encode_exception(&exc, encoding,
6225                                       str, pos, pos + 1,
6226                                       "surrogates not allowed");
6227                goto error;
6228            }
6229            moreunits = repsize / 2;
6230        }
6231        else {
6232            assert(PyUnicode_Check(rep));
6233            if (PyUnicode_READY(rep) < 0)
6234                goto error;
6235            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6236            if (!PyUnicode_IS_ASCII(rep)) {
6237                raise_encode_exception(&exc, encoding,
6238                                       str, pos, pos + 1,
6239                                       "surrogates not allowed");
6240                goto error;
6241            }
6242        }
6243        moreunits += pos - newpos;
6244        pos = newpos;
6245
6246        /* two bytes are reserved for each surrogate */
6247        if (moreunits > 0) {
6248            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6249            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6250                /* integer overflow */
6251                PyErr_NoMemory();
6252                goto error;
6253            }
6254            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6255                goto error;
6256            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6257        }
6258
6259        if (PyBytes_Check(rep)) {
6260            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6261            out += repsize / 2;
6262        } else /* rep is unicode */ {
6263            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6264            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6265                                 &out, native_ordering);
6266        }
6267
6268        Py_CLEAR(rep);
6269    }
6270
6271    /* Cut back to size actually needed. This is necessary for, for example,
6272    encoding of a string containing isolated surrogates and the 'ignore' handler
6273    is used. */
6274    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6275    if (nsize != PyBytes_GET_SIZE(v))
6276      _PyBytes_Resize(&v, nsize);
6277    Py_XDECREF(errorHandler);
6278    Py_XDECREF(exc);
6279  done:
6280    return v;
6281  error:
6282    Py_XDECREF(rep);
6283    Py_XDECREF(errorHandler);
6284    Py_XDECREF(exc);
6285    Py_XDECREF(v);
6286    return NULL;
6287#undef STORECHAR
6288}
6289
6290PyObject *
6291PyUnicode_AsUTF16String(PyObject *unicode)
6292{
6293    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6294}
6295
6296/* --- Unicode Escape Codec ----------------------------------------------- */
6297
/* Name-lookup C API obtained from the PyUnicodeData_CAPSULE_NAME capsule.
   Starts out NULL; the unicode-escape decoder imports the capsule the first
   time it meets a \N escape and keeps the pointer here for later calls. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6299
/* Decode the `size` bytes at `s` as a unicode-escape sequence.

   Bytes other than a backslash become the code point with the same value.
   After a backslash the decoder recognises: newline (dropped), the
   single-character escapes \\ \' \" \b \f \t \n \r \v \a, octal escapes of
   up to three digits, \xXX, \uXXXX, \UXXXXXXXX and \N{name}.  Any other
   character after a backslash is kept together with the backslash.

   errors                Error-handler name forwarded to
                         unicode_decode_call_errorhandler_writer().
   consumed              Optional out.  Set to 0 for empty input.  When
                         non-NULL and an escape is cut short by the end of
                         the input, decoding stops and *consumed receives the
                         offset of that escape's backslash.  This function
                         does not write *consumed on any other path.
   first_invalid_escape  Required out.  Reset to NULL on entry, then set at
                         most once: to the character after the backslash of
                         the first unrecognised escape, or to the first digit
                         of the first three-digit octal escape above 0377.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on failure.

   NOTE(review): `starts`, `end` and `s` are passed by address to the error
   handler call below.  If that call can repoint them at another buffer, a
   pointer stored in *first_invalid_escape afterwards would not refer to the
   caller's input -- confirm before dereferencing it in callers. */
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

        /* WRITE_ASCII_CHAR stores directly into the writer buffer and only
           asserts that the value is ASCII and that room is left. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

        /* WRITE_CHAR stores directly when the value fits the writer's
           current maxchar and otherwise goes through
           _PyUnicodeWriter_WriteCharInline(), jumping to onError if that
           fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape; used for error
           reporting and for *consumed. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto incomplete;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* backslash-newline is dropped; the cases after it are the
               single-character escapes */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            /* Accumulate up to three octal digits. */
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            /* Values above 0377 are still written, but the escape is
               recorded as invalid. */
            if (ch > 0377) {
                if (*first_invalid_escape == NULL) {
                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
                                                    already incremented s. */
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Read exactly `count` hex digits; running out of input is
               "incomplete", a non-hex character is an error. */
            for (ch = 0; count; ++s, --count) {
                if (s >= end) {
                    goto incomplete;
                }
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    goto error;
                }
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_capi == NULL) {
                /* load the unicode data module */
                ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_capi == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s >= end) {
                goto incomplete;
            }
            if (*s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                if (s >= end) {
                    goto incomplete;
                }
                namelen = s - start;
                /* An empty name ("\N{}") falls through to the error below
                   with the "malformed" message. */
                if (namelen) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_capi->getcode(start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognised escape: remember the first one and emit the
               backslash and the character unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

        /* Reached only by goto: the input ended inside an escape.  With a
           `consumed` pointer the escape is left for the caller; without one
           control falls through to the error handling. */
      incomplete:
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Ask the writer for room for the bytes still to be decoded on top
           of what has been written so far. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6543
6544PyObject *
6545_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6546                              Py_ssize_t size,
6547                              const char *errors,
6548                              Py_ssize_t *consumed)
6549{
6550    const char *first_invalid_escape;
6551    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6552                                                      consumed,
6553                                                      &first_invalid_escape);
6554    if (result == NULL)
6555        return NULL;
6556    if (first_invalid_escape != NULL) {
6557        unsigned char c = *first_invalid_escape;
6558        if ('4' <= c && c <= '7') {
6559            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6560                                 "invalid octal escape sequence '\\%.3s'",
6561                                 first_invalid_escape) < 0)
6562            {
6563                Py_DECREF(result);
6564                return NULL;
6565            }
6566        }
6567        else {
6568            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6569                                 "invalid escape sequence '\\%c'",
6570                                 c) < 0)
6571            {
6572                Py_DECREF(result);
6573                return NULL;
6574            }
6575        }
6576    }
6577    return result;
6578}
6579
6580PyObject *
6581PyUnicode_DecodeUnicodeEscape(const char *s,
6582                              Py_ssize_t size,
6583                              const char *errors)
6584{
6585    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6586}
6587
6588/* Return a Unicode-Escape string version of the Unicode object. */
6589
6590PyObject *
6591PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6592{
6593    Py_ssize_t i, len;
6594    PyObject *repr;
6595    char *p;
6596    enum PyUnicode_Kind kind;
6597    const void *data;
6598    Py_ssize_t expandsize;
6599
6600    /* Initial allocation is based on the longest-possible character
6601       escape.
6602
6603       For UCS1 strings it's '\xxx', 4 bytes per source character.
6604       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6605       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6606    */
6607
6608    if (!PyUnicode_Check(unicode)) {
6609        PyErr_BadArgument();
6610        return NULL;
6611    }
6612    if (PyUnicode_READY(unicode) == -1) {
6613        return NULL;
6614    }
6615
6616    len = PyUnicode_GET_LENGTH(unicode);
6617    if (len == 0) {
6618        return PyBytes_FromStringAndSize(NULL, 0);
6619    }
6620
6621    kind = PyUnicode_KIND(unicode);
6622    data = PyUnicode_DATA(unicode);
6623    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6624       bytes, and 1 byte characters 4. */
6625    expandsize = kind * 2 + 2;
6626    if (len > PY_SSIZE_T_MAX / expandsize) {
6627        return PyErr_NoMemory();
6628    }
6629    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6630    if (repr == NULL) {
6631        return NULL;
6632    }
6633
6634    p = PyBytes_AS_STRING(repr);
6635    for (i = 0; i < len; i++) {
6636        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6637
6638        /* U+0000-U+00ff range */
6639        if (ch < 0x100) {
6640            if (ch >= ' ' && ch < 127) {
6641                if (ch != '\\') {
6642                    /* Copy printable US ASCII as-is */
6643                    *p++ = (char) ch;
6644                }
6645                /* Escape backslashes */
6646                else {
6647                    *p++ = '\\';
6648                    *p++ = '\\';
6649                }
6650            }
6651
6652            /* Map special whitespace to '\t', \n', '\r' */
6653            else if (ch == '\t') {
6654                *p++ = '\\';
6655                *p++ = 't';
6656            }
6657            else if (ch == '\n') {
6658                *p++ = '\\';
6659                *p++ = 'n';
6660            }
6661            else if (ch == '\r') {
6662                *p++ = '\\';
6663                *p++ = 'r';
6664            }
6665
6666            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6667            else {
6668                *p++ = '\\';
6669                *p++ = 'x';
6670                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6671                *p++ = Py_hexdigits[ch & 0x000F];
6672            }
6673        }
6674        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6675        else if (ch < 0x10000) {
6676            *p++ = '\\';
6677            *p++ = 'u';
6678            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6679            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6680            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6681            *p++ = Py_hexdigits[ch & 0x000F];
6682        }
6683        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6684        else {
6685
6686            /* Make sure that the first two digits are zero */
6687            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6688            *p++ = '\\';
6689            *p++ = 'U';
6690            *p++ = '0';
6691            *p++ = '0';
6692            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6693            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6694            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6695            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6696            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6697            *p++ = Py_hexdigits[ch & 0x0000000F];
6698        }
6699    }
6700
6701    assert(p - PyBytes_AS_STRING(repr) > 0);
6702    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6703        return NULL;
6704    }
6705    return repr;
6706}
6707
6708/* --- Raw Unicode Escape Codec ------------------------------------------- */
6709
/* Decode the `size` bytes at `s` using the "rawunicodeescape" codec.

   Only '\uXXXX' (4 hex digits) and '\UXXXXXXXX' (8 hex digits) are treated
   as escapes.  Every other byte is written to the result as the code point
   with the same value; a backslash followed by any other byte is written as
   that backslash plus the byte.

   `errors` names the error handler invoked for malformed escapes.

   `consumed` may be NULL.  When it is not NULL, an escape that is cut off by
   the end of the input is not reported as an error: decoding stops and
   *consumed is set to the offset of the backslash starting that escape.
   Note that *consumed is written only in that case and for empty input; it
   is left untouched when the whole input is decoded.
   NOTE(review): callers presumably initialize *consumed to `size` before
   calling -- confirm.

   Returns the decoded object, or NULL on error. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Write one code point: directly into the buffer when it fits the writer's
   current maxchar, otherwise through _PyUnicodeWriter_WriteCharInline(). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals.
           A backslash that is the very last byte is also written literally
           when no `consumed` pointer was given. */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash (s already points past it). */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            /* Trailing backslash; only reachable with a `consumed` pointer
               because of the test above. */
            assert(consumed);
            // Set message to silence a compiler warning.
            // Actually it is never used.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not a recognized escape: emit the backslash and the byte
               that follows it unchanged. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                /* A non-hex byte is reported with the "truncated" message
                   selected above. */
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* With a `consumed` pointer, stop at the start of the incomplete
           escape; without one, fall through and treat it as an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Reserve room for what has been written plus the rest of the
           input before calling the error handler. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6845
6846PyObject *
6847PyUnicode_DecodeRawUnicodeEscape(const char *s,
6848                                 Py_ssize_t size,
6849                                 const char *errors)
6850{
6851    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6852}
6853
6854
6855PyObject *
6856PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6857{
6858    PyObject *repr;
6859    char *p;
6860    Py_ssize_t expandsize, pos;
6861    int kind;
6862    const void *data;
6863    Py_ssize_t len;
6864
6865    if (!PyUnicode_Check(unicode)) {
6866        PyErr_BadArgument();
6867        return NULL;
6868    }
6869    if (PyUnicode_READY(unicode) == -1) {
6870        return NULL;
6871    }
6872    kind = PyUnicode_KIND(unicode);
6873    data = PyUnicode_DATA(unicode);
6874    len = PyUnicode_GET_LENGTH(unicode);
6875    if (kind == PyUnicode_1BYTE_KIND) {
6876        return PyBytes_FromStringAndSize(data, len);
6877    }
6878
6879    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6880       bytes, and 1 byte characters 4. */
6881    expandsize = kind * 2 + 2;
6882
6883    if (len > PY_SSIZE_T_MAX / expandsize) {
6884        return PyErr_NoMemory();
6885    }
6886    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6887    if (repr == NULL) {
6888        return NULL;
6889    }
6890    if (len == 0) {
6891        return repr;
6892    }
6893
6894    p = PyBytes_AS_STRING(repr);
6895    for (pos = 0; pos < len; pos++) {
6896        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6897
6898        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6899        if (ch < 0x100) {
6900            *p++ = (char) ch;
6901        }
6902        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6903        else if (ch < 0x10000) {
6904            *p++ = '\\';
6905            *p++ = 'u';
6906            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6907            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6908            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6909            *p++ = Py_hexdigits[ch & 15];
6910        }
6911        /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6912        else {
6913            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6914            *p++ = '\\';
6915            *p++ = 'U';
6916            *p++ = '0';
6917            *p++ = '0';
6918            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6919            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6920            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6921            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6922            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6923            *p++ = Py_hexdigits[ch & 15];
6924        }
6925    }
6926
6927    assert(p > PyBytes_AS_STRING(repr));
6928    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6929        return NULL;
6930    }
6931    return repr;
6932}
6933
6934/* --- Latin-1 Codec ------------------------------------------------------ */
6935
6936PyObject *
6937PyUnicode_DecodeLatin1(const char *s,
6938                       Py_ssize_t size,
6939                       const char *errors)
6940{
6941    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6942    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6943}
6944
6945/* create or adjust a UnicodeEncodeError */
6946static void
6947make_encode_exception(PyObject **exceptionObject,
6948                      const char *encoding,
6949                      PyObject *unicode,
6950                      Py_ssize_t startpos, Py_ssize_t endpos,
6951                      const char *reason)
6952{
6953    if (*exceptionObject == NULL) {
6954        *exceptionObject = PyObject_CallFunction(
6955            PyExc_UnicodeEncodeError, "sOnns",
6956            encoding, unicode, startpos, endpos, reason);
6957    }
6958    else {
6959        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6960            goto onError;
6961        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6962            goto onError;
6963        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6964            goto onError;
6965        return;
6966      onError:
6967        Py_CLEAR(*exceptionObject);
6968    }
6969}
6970
6971/* raises a UnicodeEncodeError */
6972static void
6973raise_encode_exception(PyObject **exceptionObject,
6974                       const char *encoding,
6975                       PyObject *unicode,
6976                       Py_ssize_t startpos, Py_ssize_t endpos,
6977                       const char *reason)
6978{
6979    make_encode_exception(exceptionObject,
6980                          encoding, unicode, startpos, endpos, reason);
6981    if (*exceptionObject != NULL)
6982        PyCodec_StrictErrors(*exceptionObject);
6983}
6984
6985/* error handling callback helper:
6986   build arguments, call the callback and check the arguments,
6987   put the result into newpos and return the replacement string, which
6988   has to be freed by the caller */
6989static PyObject *
6990unicode_encode_call_errorhandler(const char *errors,
6991                                 PyObject **errorHandler,
6992                                 const char *encoding, const char *reason,
6993                                 PyObject *unicode, PyObject **exceptionObject,
6994                                 Py_ssize_t startpos, Py_ssize_t endpos,
6995                                 Py_ssize_t *newpos)
6996{
6997    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6998    Py_ssize_t len;
6999    PyObject *restuple;
7000    PyObject *resunicode;
7001
7002    if (*errorHandler == NULL) {
7003        *errorHandler = PyCodec_LookupError(errors);
7004        if (*errorHandler == NULL)
7005            return NULL;
7006    }
7007
7008    if (PyUnicode_READY(unicode) == -1)
7009        return NULL;
7010    len = PyUnicode_GET_LENGTH(unicode);
7011
7012    make_encode_exception(exceptionObject,
7013                          encoding, unicode, startpos, endpos, reason);
7014    if (*exceptionObject == NULL)
7015        return NULL;
7016
7017    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7018    if (restuple == NULL)
7019        return NULL;
7020    if (!PyTuple_Check(restuple)) {
7021        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7022        Py_DECREF(restuple);
7023        return NULL;
7024    }
7025    if (!PyArg_ParseTuple(restuple, argparse,
7026                          &resunicode, newpos)) {
7027        Py_DECREF(restuple);
7028        return NULL;
7029    }
7030    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7031        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7032        Py_DECREF(restuple);
7033        return NULL;
7034    }
7035    if (*newpos<0)
7036        *newpos = len + *newpos;
7037    if (*newpos<0 || *newpos>len) {
7038        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7039        Py_DECREF(restuple);
7040        return NULL;
7041    }
7042    Py_INCREF(resunicode);
7043    Py_DECREF(restuple);
7044    return resunicode;
7045}
7046
/* Encode `unicode` to a bytes object in which every character below `limit`
   becomes the byte with the same value.  The callers in this file pass 256
   (reported as "latin-1") or 128 (reported as "ascii").

   Runs of characters >= limit are handled according to the error handler
   named by `errors`: the well-known handlers are implemented inline and
   everything else goes through unicode_encode_call_errorhandler().

   Returns the encoded bytes object, or NULL on error. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* One '?' per unencodable character; this fits in the
                   space preallocated for the run. */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Turn each character in U+DC80..U+DCFF back into the byte
                   0x80..0xFF; stop at the first character outside that
                   range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                /* The whole run was handled. */
                if (i >= collend)
                    break;
                /* Otherwise hand the remaining characters of the run to
                   the generic handler path below. */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler moved the position backwards, so
                       collstart - newpos characters will be processed
                       again: make room for them. */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    /* A str replacement must itself be encodable: one-byte
                       kind when limit is 256, ASCII otherwise. */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                /* Resume at the position chosen by the error handler. */
                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7228
7229PyObject *
7230_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7231{
7232    if (!PyUnicode_Check(unicode)) {
7233        PyErr_BadArgument();
7234        return NULL;
7235    }
7236    if (PyUnicode_READY(unicode) == -1)
7237        return NULL;
7238    /* Fast path: if it is a one-byte string, construct
7239       bytes object directly. */
7240    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7241        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7242                                         PyUnicode_GET_LENGTH(unicode));
7243    /* Non-Latin-1 characters present. Defer to above function to
7244       raise the exception. */
7245    return unicode_encode_ucs1(unicode, errors, 256);
7246}
7247
7248PyObject*
7249PyUnicode_AsLatin1String(PyObject *unicode)
7250{
7251    return _PyUnicode_AsLatin1String(unicode, NULL);
7252}
7253
7254/* --- 7-bit ASCII Codec -------------------------------------------------- */
7255
/* Decode the `size` bytes at `s` as 7-bit ASCII.

   Bytes below 128 map to the code point with the same value.  Any other
   byte is handled according to the error handler named by `errors`: the
   replace, surrogateescape and ignore handlers are implemented inline and
   everything else goes through unicode_decode_call_errorhandler_writer().

   Returns the decoded object, or NULL on error. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    const char *e = s + size;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        return get_latin1_char((unsigned char)s[0]);
    }

    // Shortcut for simple case: decode straight into a new ASCII string and
    // return it if ascii_decode() reports that all `size` bytes were handled.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
    if (outpos == size) {
        return u;
    }

    /* Slow path: continue after the part already decoded, using `u` as
       the writer's buffer.
       NOTE(review): `u` is not released separately on the error path, so
       the writer presumably owns it from here on -- confirm. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = outpos;

    s += outpos;
    /* Local copies of the writer's kind and data; they are refreshed after
       every call that may change the writer's buffer. */
    int kind = writer.kind;
    void *data = writer.data;
    Py_ssize_t startinpos, endinpos;

    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        /* Resolve the handler name once, on the first error. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            /* replace writes U+FFFD; surrogateescape writes the byte
               value plus 0xdc00. */
            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* Drop the offending byte. */
            ++s;
            break;

        default:
            /* Report the single offending byte to the generic handler. */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7354
7355PyObject *
7356_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7357{
7358    if (!PyUnicode_Check(unicode)) {
7359        PyErr_BadArgument();
7360        return NULL;
7361    }
7362    if (PyUnicode_READY(unicode) == -1)
7363        return NULL;
7364    /* Fast path: if it is an ASCII-only string, construct bytes object
7365       directly. Else defer to above function to raise the exception. */
7366    if (PyUnicode_IS_ASCII(unicode))
7367        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7368                                         PyUnicode_GET_LENGTH(unicode));
7369    return unicode_encode_ucs1(unicode, errors, 128);
7370}
7371
7372PyObject *
7373PyUnicode_AsASCIIString(PyObject *unicode)
7374{
7375    return _PyUnicode_AsASCIIString(unicode, NULL);
7376}
7377
7378#ifdef MS_WINDOWS
7379
7380/* --- MBCS codecs for Windows -------------------------------------------- */
7381
7382#if SIZEOF_INT < SIZEOF_SIZE_T
7383#define NEED_RETRY
7384#endif
7385
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7390#define DECODING_CHUNK_SIZE (INT_MAX/4)
7391
7392#ifndef WC_ERR_INVALID_CHARS
7393#  define WC_ERR_INVALID_CHARS 0x0080
7394#endif
7395
7396static const char*
7397code_page_name(UINT code_page, PyObject **obj)
7398{
7399    *obj = NULL;
7400    if (code_page == CP_ACP)
7401        return "mbcs";
7402    if (code_page == CP_UTF7)
7403        return "CP_UTF7";
7404    if (code_page == CP_UTF8)
7405        return "CP_UTF8";
7406
7407    *obj = PyBytes_FromFormat("cp%u", code_page);
7408    if (*obj == NULL)
7409        return NULL;
7410    return PyBytes_AS_STRING(*obj);
7411}
7412
7413static DWORD
7414decode_code_page_flags(UINT code_page)
7415{
7416    if (code_page == CP_UTF7) {
7417        /* The CP_UTF7 decoder only supports flags=0 */
7418        return 0;
7419    }
7420    else
7421        return MB_ERR_INVALID_CHARS;
7422}
7423
7424/*
7425 * Decode a byte string from a Windows code page into unicode object in strict
7426 * mode.
7427 *
7428 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7429 * OSError and returns -1 on other error.
7430 */
7431static int
7432decode_code_page_strict(UINT code_page,
7433                        wchar_t **buf,
7434                        Py_ssize_t *bufsize,
7435                        const char *in,
7436                        int insize)
7437{
7438    DWORD flags = MB_ERR_INVALID_CHARS;
7439    wchar_t *out;
7440    DWORD outsize;
7441
7442    /* First get the size of the result */
7443    assert(insize > 0);
7444    while ((outsize = MultiByteToWideChar(code_page, flags,
7445                                          in, insize, NULL, 0)) <= 0)
7446    {
7447        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7448            goto error;
7449        }
7450        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7451        flags = 0;
7452    }
7453
7454    /* Extend a wchar_t* buffer */
7455    Py_ssize_t n = *bufsize;   /* Get the current length */
7456    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7457        return -1;
7458    }
7459    out = *buf + n;
7460
7461    /* Do the conversion */
7462    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7463    if (outsize <= 0)
7464        goto error;
7465    return insize;
7466
7467error:
7468    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7469        return -2;
7470    PyErr_SetFromWindowsErr(0);
7471    return -1;
7472}
7473
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time so that undecodable bytes can
 * be handed to the error handler named by 'errors'.  Decoded wchar_t items
 * are appended to *buf after its first *bufsize items; on success *bufsize is
 * set to the number of items actually used.  'size' must be > 0.  When
 * 'final' is zero, a failed decoding attempt that reaches the end of the
 * input stops the loop and leaves the remaining bytes unconsumed.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* encoding_obj is released in the common exit path below. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        /* The start and end positions are both passed as 0 here. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Reserve Py_ARRAY_LENGTH(buffer) items per input byte; check first that
       n + size * Py_ARRAY_LENGTH(buffer) cannot overflow. */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* Try 1 byte first and grow the attempt by one byte after each
           failure. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            /* Any failure other than these two is raised as an OSError. */
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report exactly one undecodable byte to the error handler. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            /* startin, endin, in, buf, bufsize and outpos are passed by
               address, so the call may update them; 'out' is recomputed from
               outpos afterwards. */
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* Success: consume the bytes and append the decoded item(s). */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Common exit path: reached on success (ret >= 0) and on failure
       (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7603
7604static PyObject *
7605decode_code_page_stateful(int code_page,
7606                          const char *s, Py_ssize_t size,
7607                          const char *errors, Py_ssize_t *consumed)
7608{
7609    wchar_t *buf = NULL;
7610    Py_ssize_t bufsize = 0;
7611    int chunk_size, final, converted, done;
7612
7613    if (code_page < 0) {
7614        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7615        return NULL;
7616    }
7617    if (size < 0) {
7618        PyErr_BadInternalCall();
7619        return NULL;
7620    }
7621
7622    if (consumed)
7623        *consumed = 0;
7624
7625    do
7626    {
7627#ifdef NEED_RETRY
7628        if (size > DECODING_CHUNK_SIZE) {
7629            chunk_size = DECODING_CHUNK_SIZE;
7630            final = 0;
7631            done = 0;
7632        }
7633        else
7634#endif
7635        {
7636            chunk_size = (int)size;
7637            final = (consumed == NULL);
7638            done = 1;
7639        }
7640
7641        if (chunk_size == 0 && done) {
7642            if (buf != NULL)
7643                break;
7644            _Py_RETURN_UNICODE_EMPTY();
7645        }
7646
7647        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7648                                            s, chunk_size);
7649        if (converted == -2)
7650            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7651                                                s, chunk_size,
7652                                                errors, final);
7653        assert(converted != 0 || done);
7654
7655        if (converted < 0) {
7656            PyMem_Free(buf);
7657            return NULL;
7658        }
7659
7660        if (consumed)
7661            *consumed += converted;
7662
7663        s += converted;
7664        size -= converted;
7665    } while (!done);
7666
7667    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7668    PyMem_Free(buf);
7669    return v;
7670}
7671
7672PyObject *
7673PyUnicode_DecodeCodePageStateful(int code_page,
7674                                 const char *s,
7675                                 Py_ssize_t size,
7676                                 const char *errors,
7677                                 Py_ssize_t *consumed)
7678{
7679    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7680}
7681
7682PyObject *
7683PyUnicode_DecodeMBCSStateful(const char *s,
7684                             Py_ssize_t size,
7685                             const char *errors,
7686                             Py_ssize_t *consumed)
7687{
7688    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7689}
7690
7691PyObject *
7692PyUnicode_DecodeMBCS(const char *s,
7693                     Py_ssize_t size,
7694                     const char *errors)
7695{
7696    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7697}
7698
7699static DWORD
7700encode_code_page_flags(UINT code_page, const char *errors)
7701{
7702    if (code_page == CP_UTF8) {
7703        return WC_ERR_INVALID_CHARS;
7704    }
7705    else if (code_page == CP_UTF7) {
7706        /* CP_UTF7 only supports flags=0 */
7707        return 0;
7708    }
7709    else {
7710        if (errors != NULL && strcmp(errors, "replace") == 0)
7711            return 0;
7712        else
7713            return WC_NO_BEST_FIT_CHARS;
7714    }
7715}
7716
7717/*
7718 * Encode a Unicode string to a Windows code page into a byte string in strict
7719 * mode.
7720 *
7721 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7722 * an OSError and returns -1 on other error.
7723 */
7724static int
7725encode_code_page_strict(UINT code_page, PyObject **outbytes,
7726                        PyObject *unicode, Py_ssize_t offset, int len,
7727                        const char* errors)
7728{
7729    BOOL usedDefaultChar = FALSE;
7730    BOOL *pusedDefaultChar = &usedDefaultChar;
7731    int outsize;
7732    wchar_t *p;
7733    Py_ssize_t size;
7734    const DWORD flags = encode_code_page_flags(code_page, NULL);
7735    char *out;
7736    /* Create a substring so that we can get the UTF-16 representation
7737       of just the slice under consideration. */
7738    PyObject *substring;
7739    int ret = -1;
7740
7741    assert(len > 0);
7742
7743    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7744        pusedDefaultChar = &usedDefaultChar;
7745    else
7746        pusedDefaultChar = NULL;
7747
7748    substring = PyUnicode_Substring(unicode, offset, offset+len);
7749    if (substring == NULL)
7750        return -1;
7751#if USE_UNICODE_WCHAR_CACHE
7752_Py_COMP_DIAG_PUSH
7753_Py_COMP_DIAG_IGNORE_DEPR_DECLS
7754    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7755    if (p == NULL) {
7756        Py_DECREF(substring);
7757        return -1;
7758    }
7759_Py_COMP_DIAG_POP
7760#else /* USE_UNICODE_WCHAR_CACHE */
7761    p = PyUnicode_AsWideCharString(substring, &size);
7762    Py_CLEAR(substring);
7763    if (p == NULL) {
7764        return -1;
7765    }
7766#endif /* USE_UNICODE_WCHAR_CACHE */
7767    assert(size <= INT_MAX);
7768
7769    /* First get the size of the result */
7770    outsize = WideCharToMultiByte(code_page, flags,
7771                                  p, (int)size,
7772                                  NULL, 0,
7773                                  NULL, pusedDefaultChar);
7774    if (outsize <= 0)
7775        goto error;
7776    /* If we used a default char, then we failed! */
7777    if (pusedDefaultChar && *pusedDefaultChar) {
7778        ret = -2;
7779        goto done;
7780    }
7781
7782    if (*outbytes == NULL) {
7783        /* Create string object */
7784        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7785        if (*outbytes == NULL) {
7786            goto done;
7787        }
7788        out = PyBytes_AS_STRING(*outbytes);
7789    }
7790    else {
7791        /* Extend string object */
7792        const Py_ssize_t n = PyBytes_Size(*outbytes);
7793        if (outsize > PY_SSIZE_T_MAX - n) {
7794            PyErr_NoMemory();
7795            goto done;
7796        }
7797        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7798            goto done;
7799        }
7800        out = PyBytes_AS_STRING(*outbytes) + n;
7801    }
7802
7803    /* Do the conversion */
7804    outsize = WideCharToMultiByte(code_page, flags,
7805                                  p, (int)size,
7806                                  out, outsize,
7807                                  NULL, pusedDefaultChar);
7808    if (outsize <= 0)
7809        goto error;
7810    if (pusedDefaultChar && *pusedDefaultChar) {
7811        ret = -2;
7812        goto done;
7813    }
7814    ret = 0;
7815
7816done:
7817#if USE_UNICODE_WCHAR_CACHE
7818    Py_DECREF(substring);
7819#else /* USE_UNICODE_WCHAR_CACHE */
7820    PyMem_Free(p);
7821#endif /* USE_UNICODE_WCHAR_CACHE */
7822    return ret;
7823
7824error:
7825    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7826        ret = -2;
7827        goto done;
7828    }
7829    PyErr_SetFromWindowsErr(0);
7830    goto done;
7831}
7832
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The characters unicode[unicode_offset : unicode_offset + insize] are
 * encoded one at a time and appended to *outbytes (created here when it is
 * NULL).  Characters that cannot be encoded are passed to the error handler
 * named by 'errors'; its replacement is copied to the output as is when it
 * is a bytes object, or character by character when it is a str containing
 * only characters <= 127.  insize must be > 0.
 *
 * Returns 0 if succeed, or raise an exception (OSError, UnicodeEncodeError,
 * MemoryError, or one raised by the error handler) and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* encoding_obj is released on every exit path below. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        /* The start and end positions are both passed as 0 here. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        /* exc was already released above, so clean up by hand instead of
           jumping to the shared exit path. */
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No "used default char" output is requested for CP_UTF8 and CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve Py_ARRAY_LENGTH(buffer) bytes per input character, after
       checking that the multiplication cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Characters >= 0x10000 are passed to the Windows API as a
           surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Encoded without substituting a default character: copy the
               bytes and move on to the next character. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character at 'pos' could not be encoded: ask the error
           handler for a replacement and for the position to resume at. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* Extra output space needed: the replacement length, adjusted by
           how far the handler moved the input position. */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Resizing may move the bytes object: remember the write
                   offset and recompute 'out' afterwards. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Same offset-preserving resize as in the bytes branch. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            /* Each replacement character becomes one output byte, so it
               must be <= 127. */
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the bytes object down to what was actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Common exit path: reached on success (ret == 0) and on failure
       (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8023
8024static PyObject *
8025encode_code_page(int code_page,
8026                 PyObject *unicode,
8027                 const char *errors)
8028{
8029    Py_ssize_t len;
8030    PyObject *outbytes = NULL;
8031    Py_ssize_t offset;
8032    int chunk_len, ret, done;
8033
8034    if (!PyUnicode_Check(unicode)) {
8035        PyErr_BadArgument();
8036        return NULL;
8037    }
8038
8039    if (PyUnicode_READY(unicode) == -1)
8040        return NULL;
8041    len = PyUnicode_GET_LENGTH(unicode);
8042
8043    if (code_page < 0) {
8044        PyErr_SetString(PyExc_ValueError, "invalid code page number");
8045        return NULL;
8046    }
8047
8048    if (len == 0)
8049        return PyBytes_FromStringAndSize(NULL, 0);
8050
8051    offset = 0;
8052    do
8053    {
8054#ifdef NEED_RETRY
8055        if (len > DECODING_CHUNK_SIZE) {
8056            chunk_len = DECODING_CHUNK_SIZE;
8057            done = 0;
8058        }
8059        else
8060#endif
8061        {
8062            chunk_len = (int)len;
8063            done = 1;
8064        }
8065
8066        ret = encode_code_page_strict(code_page, &outbytes,
8067                                      unicode, offset, chunk_len,
8068                                      errors);
8069        if (ret == -2)
8070            ret = encode_code_page_errors(code_page, &outbytes,
8071                                          unicode, offset,
8072                                          chunk_len, errors);
8073        if (ret < 0) {
8074            Py_XDECREF(outbytes);
8075            return NULL;
8076        }
8077
8078        offset += chunk_len;
8079        len -= chunk_len;
8080    } while (!done);
8081
8082    return outbytes;
8083}
8084
8085PyObject *
8086PyUnicode_EncodeCodePage(int code_page,
8087                         PyObject *unicode,
8088                         const char *errors)
8089{
8090    return encode_code_page(code_page, unicode, errors);
8091}
8092
8093PyObject *
8094PyUnicode_AsMBCSString(PyObject *unicode)
8095{
8096    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8097}
8098
8099#undef NEED_RETRY
8100
8101#endif /* MS_WINDOWS */
8102
8103/* --- Character Mapping Codec -------------------------------------------- */
8104
/*
 * Decode 'size' bytes at 's' through a str 'mapping' and append the result
 * to 'writer'.
 *
 * Each input byte is used as an index into 'mapping'.  A byte that is past
 * the end of the mapping, or that maps to 0xFFFE, is undefined and is passed
 * to the error handler named by 'errors'.
 *
 * Returns 0 on success, -1 on error.
 */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        /* The loop writes straight into the writer's buffer without a
           per-character size check.
           NOTE(review): this presumably relies on the caller having
           prepared the writer for 'size' characters -- confirm. */
        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maximum character to 0xff; its fields
                   are re-read afterwards. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Fast inner loops for a 2-byte mapping with at least 256
               entries.  They run while the characters can be stored
               directly; otherwise they jump to the Error label below so the
               generic code handles the current byte. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the writer's current maximum character:
                       leave the fast loop. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Undefined mapping: leave the fast loop. */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: look up one byte. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Also entered from the fast loops above, with 'x' already set for
           the byte at 's'. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            /* 'starts', 'e' and 's' are passed by address, so the handler
               call may update them. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8223
8224static int
8225charmap_decode_mapping(const char *s,
8226                       Py_ssize_t size,
8227                       PyObject *mapping,
8228                       const char *errors,
8229                       _PyUnicodeWriter *writer)
8230{
8231    const char *starts = s;
8232    const char *e;
8233    Py_ssize_t startinpos, endinpos;
8234    PyObject *errorHandler = NULL, *exc = NULL;
8235    unsigned char ch;
8236    PyObject *key, *item = NULL;
8237
8238    e = s + size;
8239
8240    while (s < e) {
8241        ch = *s;
8242
8243        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8244        key = PyLong_FromLong((long)ch);
8245        if (key == NULL)
8246            goto onError;
8247
8248        item = PyObject_GetItem(mapping, key);
8249        Py_DECREF(key);
8250        if (item == NULL) {
8251            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8252                /* No mapping found means: mapping is undefined. */
8253                PyErr_Clear();
8254                goto Undefined;
8255            } else
8256                goto onError;
8257        }
8258
8259        /* Apply mapping */
8260        if (item == Py_None)
8261            goto Undefined;
8262        if (PyLong_Check(item)) {
8263            long value = PyLong_AS_LONG(item);
8264            if (value == 0xFFFE)
8265                goto Undefined;
8266            if (value < 0 || value > MAX_UNICODE) {
8267                PyErr_Format(PyExc_TypeError,
8268                             "character mapping must be in range(0x%x)",
8269                             (unsigned long)MAX_UNICODE + 1);
8270                goto onError;
8271            }
8272
8273            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8274                goto onError;
8275        }
8276        else if (PyUnicode_Check(item)) {
8277            if (PyUnicode_READY(item) == -1)
8278                goto onError;
8279            if (PyUnicode_GET_LENGTH(item) == 1) {
8280                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8281                if (value == 0xFFFE)
8282                    goto Undefined;
8283                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8284                    goto onError;
8285            }
8286            else {
8287                writer->overallocate = 1;
8288                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8289                    goto onError;
8290            }
8291        }
8292        else {
8293            /* wrong return value */
8294            PyErr_SetString(PyExc_TypeError,
8295                            "character mapping must return integer, None or str");
8296            goto onError;
8297        }
8298        Py_CLEAR(item);
8299        ++s;
8300        continue;
8301
8302Undefined:
8303        /* undefined mapping */
8304        Py_CLEAR(item);
8305        startinpos = s-starts;
8306        endinpos = startinpos+1;
8307        if (unicode_decode_call_errorhandler_writer(
8308                errors, &errorHandler,
8309                "charmap", "character maps to <undefined>",
8310                &starts, &e, &startinpos, &endinpos, &exc, &s,
8311                writer)) {
8312            goto onError;
8313        }
8314    }
8315    Py_XDECREF(errorHandler);
8316    Py_XDECREF(exc);
8317    return 0;
8318
8319onError:
8320    Py_XDECREF(item);
8321    Py_XDECREF(errorHandler);
8322    Py_XDECREF(exc);
8323    return -1;
8324}
8325
8326PyObject *
8327PyUnicode_DecodeCharmap(const char *s,
8328                        Py_ssize_t size,
8329                        PyObject *mapping,
8330                        const char *errors)
8331{
8332    _PyUnicodeWriter writer;
8333
8334    /* Default to Latin-1 */
8335    if (mapping == NULL)
8336        return PyUnicode_DecodeLatin1(s, size, errors);
8337
8338    if (size == 0)
8339        _Py_RETURN_UNICODE_EMPTY();
8340    _PyUnicodeWriter_Init(&writer);
8341    writer.min_length = size;
8342    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8343        goto onError;
8344
8345    if (PyUnicode_CheckExact(mapping)) {
8346        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8347            goto onError;
8348    }
8349    else {
8350        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8351            goto onError;
8352    }
8353    return _PyUnicodeWriter_Finish(&writer);
8354
8355  onError:
8356    _PyUnicodeWriter_Dealloc(&writer);
8357    return NULL;
8358}
8359
8360/* Charmap encoding: the lookup table */
8361
8362/*[clinic input]
8363class EncodingMap "struct encoding_map *" "&EncodingMapType"
8364[clinic start generated code]*/
8365/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8366
/* Lookup table object built by PyUnicode_BuildEncodingMap(): a three-level
   trie.  The object is allocated with 16*count2 + 128*count3 - 1 bytes
   beyond sizeof(struct encoding_map) to hold the variable-length tail. */
struct encoding_map {
    PyObject_HEAD
    /* First level, indexed by (ch >> 11); the builder initializes every
       entry to 0xFF and overwrites the ones that are used. */
    unsigned char level1[32];
    /* Number of second-level blocks (16 bytes each) and of third-level
       blocks (128 bytes each) stored in level23. */
    int count2, count3;
    /* Variable-length tail: 16*count2 bytes of second-level data followed
       by 128*count3 bytes of third-level data.  Declared with one element;
       the real size comes from the allocation. */
    unsigned char level23[1];
};
8373
8374/*[clinic input]
8375EncodingMap.size
8376
8377Return the size (in bytes) of this object.
8378[clinic start generated code]*/
8379
8380static PyObject *
8381EncodingMap_size_impl(struct encoding_map *self)
8382/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8383{
8384    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8385                           128*self->count3);
8386}
8387
/* Method table for EncodingMap objects, installed through
   EncodingMapType.tp_methods.  The single entry comes from the
   ENCODINGMAP_SIZE_METHODDEF macro; the {NULL, NULL} entry ends the
   table. */
static PyMethodDef encoding_map_methods[] = {
    ENCODINGMAP_SIZE_METHODDEF
    {NULL, NULL}
};
8392
/* Type object for struct encoding_map instances.  Only the name, basic
   size, flags and method table are set; every other slot is left zero.
   NOTE(review): the type pointer is NULL in the static initializer and is
   presumably filled in when the type is readied elsewhere -- confirm. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "EncodingMap",
    /* Size of the fixed part only; the variable-length tail of level23 is
       added by the allocation in PyUnicode_BuildEncodingMap(). */
    .tp_basicsize = sizeof(struct encoding_map),
    /* methods */
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_methods = encoding_map_methods,
};
8401
8402PyObject*
8403PyUnicode_BuildEncodingMap(PyObject* string)
8404{
8405    PyObject *result;
8406    struct encoding_map *mresult;
8407    int i;
8408    int need_dict = 0;
8409    unsigned char level1[32];
8410    unsigned char level2[512];
8411    unsigned char *mlevel1, *mlevel2, *mlevel3;
8412    int count2 = 0, count3 = 0;
8413    int kind;
8414    const void *data;
8415    Py_ssize_t length;
8416    Py_UCS4 ch;
8417
8418    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8419        PyErr_BadArgument();
8420        return NULL;
8421    }
8422    kind = PyUnicode_KIND(string);
8423    data = PyUnicode_DATA(string);
8424    length = PyUnicode_GET_LENGTH(string);
8425    length = Py_MIN(length, 256);
8426    memset(level1, 0xFF, sizeof level1);
8427    memset(level2, 0xFF, sizeof level2);
8428
8429    /* If there isn't a one-to-one mapping of NULL to \0,
8430       or if there are non-BMP characters, we need to use
8431       a mapping dictionary. */
8432    if (PyUnicode_READ(kind, data, 0) != 0)
8433        need_dict = 1;
8434    for (i = 1; i < length; i++) {
8435        int l1, l2;
8436        ch = PyUnicode_READ(kind, data, i);
8437        if (ch == 0 || ch > 0xFFFF) {
8438            need_dict = 1;
8439            break;
8440        }
8441        if (ch == 0xFFFE)
8442            /* unmapped character */
8443            continue;
8444        l1 = ch >> 11;
8445        l2 = ch >> 7;
8446        if (level1[l1] == 0xFF)
8447            level1[l1] = count2++;
8448        if (level2[l2] == 0xFF)
8449            level2[l2] = count3++;
8450    }
8451
8452    if (count2 >= 0xFF || count3 >= 0xFF)
8453        need_dict = 1;
8454
8455    if (need_dict) {
8456        PyObject *result = PyDict_New();
8457        PyObject *key, *value;
8458        if (!result)
8459            return NULL;
8460        for (i = 0; i < length; i++) {
8461            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8462            value = PyLong_FromLong(i);
8463            if (!key || !value)
8464                goto failed1;
8465            if (PyDict_SetItem(result, key, value) == -1)
8466                goto failed1;
8467            Py_DECREF(key);
8468            Py_DECREF(value);
8469        }
8470        return result;
8471      failed1:
8472        Py_XDECREF(key);
8473        Py_XDECREF(value);
8474        Py_DECREF(result);
8475        return NULL;
8476    }
8477
8478    /* Create a three-level trie */
8479    result = PyObject_Malloc(sizeof(struct encoding_map) +
8480                             16*count2 + 128*count3 - 1);
8481    if (!result) {
8482        return PyErr_NoMemory();
8483    }
8484
8485    _PyObject_Init(result, &EncodingMapType);
8486    mresult = (struct encoding_map*)result;
8487    mresult->count2 = count2;
8488    mresult->count3 = count3;
8489    mlevel1 = mresult->level1;
8490    mlevel2 = mresult->level23;
8491    mlevel3 = mresult->level23 + 16*count2;
8492    memcpy(mlevel1, level1, 32);
8493    memset(mlevel2, 0xFF, 16*count2);
8494    memset(mlevel3, 0, 128*count3);
8495    count3 = 0;
8496    for (i = 1; i < length; i++) {
8497        int o1, o2, o3, i2, i3;
8498        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8499        if (ch == 0xFFFE)
8500            /* unmapped character */
8501            continue;
8502        o1 = ch>>11;
8503        o2 = (ch>>7) & 0xF;
8504        i2 = 16*mlevel1[o1] + o2;
8505        if (mlevel2[i2] == 0xFF)
8506            mlevel2[i2] = count3++;
8507        o3 = ch & 0x7F;
8508        i3 = 128*mlevel2[i2] + o3;
8509        mlevel3[i3] = i;
8510    }
8511    return result;
8512}
8513
8514static int
8515encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8516{
8517    struct encoding_map *map = (struct encoding_map*)mapping;
8518    int l1 = c>>11;
8519    int l2 = (c>>7) & 0xF;
8520    int l3 = c & 0x7F;
8521    int i;
8522
8523    if (c > 0xFFFF)
8524        return -1;
8525    if (c == 0)
8526        return 0;
8527    /* level 1*/
8528    i = map->level1[l1];
8529    if (i == 0xFF) {
8530        return -1;
8531    }
8532    /* level 2*/
8533    i = map->level23[16*i+l2];
8534    if (i == 0xFF) {
8535        return -1;
8536    }
8537    /* level 3 */
8538    i = map->level23[16*map->count2 + 128*i + l3];
8539    if (i == 0) {
8540        return -1;
8541    }
8542    return i;
8543}
8544
8545/* Lookup the character ch in the mapping. If the character
8546   can't be found, Py_None is returned (or NULL, if another
8547   error occurred). */
8548static PyObject *
8549charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8550{
8551    PyObject *w = PyLong_FromLong((long)c);
8552    PyObject *x;
8553
8554    if (w == NULL)
8555        return NULL;
8556    x = PyObject_GetItem(mapping, w);
8557    Py_DECREF(w);
8558    if (x == NULL) {
8559        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8560            /* No mapping found means: mapping is undefined. */
8561            PyErr_Clear();
8562            Py_RETURN_NONE;
8563        } else
8564            return NULL;
8565    }
8566    else if (x == Py_None)
8567        return x;
8568    else if (PyLong_Check(x)) {
8569        long value = PyLong_AS_LONG(x);
8570        if (value < 0 || value > 255) {
8571            PyErr_SetString(PyExc_TypeError,
8572                            "character mapping must be in range(256)");
8573            Py_DECREF(x);
8574            return NULL;
8575        }
8576        return x;
8577    }
8578    else if (PyBytes_Check(x))
8579        return x;
8580    else {
8581        /* wrong return value */
8582        PyErr_Format(PyExc_TypeError,
8583                     "character mapping must return integer, bytes or None, not %.400s",
8584                     Py_TYPE(x)->tp_name);
8585        Py_DECREF(x);
8586        return NULL;
8587    }
8588}
8589
8590static int
8591charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8592{
8593    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8594    /* exponentially overallocate to minimize reallocations */
8595    if (requiredsize < 2*outsize)
8596        requiredsize = 2*outsize;
8597    if (_PyBytes_Resize(outobj, requiredsize))
8598        return -1;
8599    return 0;
8600}
8601
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* output was written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8605/* lookup the character, put the result in the output string and adjust
8606   various state variables. Resize the output bytes object if not enough
8607   space is available. Return a new reference to the object that
8608   was put in the output buffer, or Py_None, if the mapping was undefined
8609   (in which case no character was written) or NULL, if a
8610   reallocation error occurred. The caller must decref the result */
8611static charmapencode_result
8612charmapencode_output(Py_UCS4 c, PyObject *mapping,
8613                     PyObject **outobj, Py_ssize_t *outpos)
8614{
8615    PyObject *rep;
8616    char *outstart;
8617    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8618
8619    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8620        int res = encoding_map_lookup(c, mapping);
8621        Py_ssize_t requiredsize = *outpos+1;
8622        if (res == -1)
8623            return enc_FAILED;
8624        if (outsize<requiredsize)
8625            if (charmapencode_resize(outobj, outpos, requiredsize))
8626                return enc_EXCEPTION;
8627        outstart = PyBytes_AS_STRING(*outobj);
8628        outstart[(*outpos)++] = (char)res;
8629        return enc_SUCCESS;
8630    }
8631
8632    rep = charmapencode_lookup(c, mapping);
8633    if (rep==NULL)
8634        return enc_EXCEPTION;
8635    else if (rep==Py_None) {
8636        Py_DECREF(rep);
8637        return enc_FAILED;
8638    } else {
8639        if (PyLong_Check(rep)) {
8640            Py_ssize_t requiredsize = *outpos+1;
8641            if (outsize<requiredsize)
8642                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8643                    Py_DECREF(rep);
8644                    return enc_EXCEPTION;
8645                }
8646            outstart = PyBytes_AS_STRING(*outobj);
8647            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8648        }
8649        else {
8650            const char *repchars = PyBytes_AS_STRING(rep);
8651            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8652            Py_ssize_t requiredsize = *outpos+repsize;
8653            if (outsize<requiredsize)
8654                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8655                    Py_DECREF(rep);
8656                    return enc_EXCEPTION;
8657                }
8658            outstart = PyBytes_AS_STRING(*outobj);
8659            memcpy(outstart + *outpos, repchars, repsize);
8660            *outpos += repsize;
8661        }
8662    }
8663    Py_DECREF(rep);
8664    return enc_SUCCESS;
8665}
8666
/* Handle an unencodable character met by _PyUnicode_EncodeCharmap().

   On entry *inpos is the index in `unicode` of a character for which
   `mapping` has no entry.  The run of consecutive unencodable characters
   starting there is collected and then treated according to the error
   handler named by `errors`:

     strict             raise the encode exception;
     replace            emit '?' (encoded through mapping) per character;
     ignore             emit nothing;
     xmlcharrefreplace  emit "&#N;" (encoded through mapping) per character;
     anything else      call the registered error handler callback and emit
                        its bytes verbatim or its str encoded through mapping.

   *error_handler caches the resolved handler kind between calls, and
   *error_handler_obj / *exceptionObject are handed to the helpers that
   look up the callback and create the exception.  Output is appended to
   *res at *respos.  On success *inpos is moved to where encoding should
   resume.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: extend [collstartpos, collendpos)
       until a character that the mapping can encode is found */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            /* trie lookup, no objects involved */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* generic mapping: Py_None means "undefined" */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* one '?' per unencodable character; '?' itself must be encodable */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the whole collected run */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate the "&#N;" replacement in a scratch buffer, then encode
           it character by character through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other handler: call the callback, which hands back the
           replacement object and the position to resume at (newpos) */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the callback returned a str, so each of its
           characters is encoded through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                /* the replacement itself is unencodable: report the
                   original range */
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8814
8815PyObject *
8816_PyUnicode_EncodeCharmap(PyObject *unicode,
8817                         PyObject *mapping,
8818                         const char *errors)
8819{
8820    /* output object */
8821    PyObject *res = NULL;
8822    /* current input position */
8823    Py_ssize_t inpos = 0;
8824    Py_ssize_t size;
8825    /* current output position */
8826    Py_ssize_t respos = 0;
8827    PyObject *error_handler_obj = NULL;
8828    PyObject *exc = NULL;
8829    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8830    const void *data;
8831    int kind;
8832
8833    if (PyUnicode_READY(unicode) == -1)
8834        return NULL;
8835    size = PyUnicode_GET_LENGTH(unicode);
8836    data = PyUnicode_DATA(unicode);
8837    kind = PyUnicode_KIND(unicode);
8838
8839    /* Default to Latin-1 */
8840    if (mapping == NULL)
8841        return unicode_encode_ucs1(unicode, errors, 256);
8842
8843    /* allocate enough for a simple encoding without
8844       replacements, if we need more, we'll resize */
8845    res = PyBytes_FromStringAndSize(NULL, size);
8846    if (res == NULL)
8847        goto onError;
8848    if (size == 0)
8849        return res;
8850
8851    while (inpos<size) {
8852        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8853        /* try to encode it */
8854        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8855        if (x==enc_EXCEPTION) /* error */
8856            goto onError;
8857        if (x==enc_FAILED) { /* unencodable character */
8858            if (charmap_encoding_error(unicode, &inpos, mapping,
8859                                       &exc,
8860                                       &error_handler, &error_handler_obj, errors,
8861                                       &res, &respos)) {
8862                goto onError;
8863            }
8864        }
8865        else
8866            /* done with this character => adjust input position */
8867            ++inpos;
8868    }
8869
8870    /* Resize if we allocated to much */
8871    if (respos<PyBytes_GET_SIZE(res))
8872        if (_PyBytes_Resize(&res, respos) < 0)
8873            goto onError;
8874
8875    Py_XDECREF(exc);
8876    Py_XDECREF(error_handler_obj);
8877    return res;
8878
8879  onError:
8880    Py_XDECREF(res);
8881    Py_XDECREF(exc);
8882    Py_XDECREF(error_handler_obj);
8883    return NULL;
8884}
8885
8886PyObject *
8887PyUnicode_AsCharmapString(PyObject *unicode,
8888                          PyObject *mapping)
8889{
8890    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8891        PyErr_BadArgument();
8892        return NULL;
8893    }
8894    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8895}
8896
8897/* create or adjust a UnicodeTranslateError */
8898static void
8899make_translate_exception(PyObject **exceptionObject,
8900                         PyObject *unicode,
8901                         Py_ssize_t startpos, Py_ssize_t endpos,
8902                         const char *reason)
8903{
8904    if (*exceptionObject == NULL) {
8905        *exceptionObject = _PyUnicodeTranslateError_Create(
8906            unicode, startpos, endpos, reason);
8907    }
8908    else {
8909        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8910            goto onError;
8911        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8912            goto onError;
8913        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8914            goto onError;
8915        return;
8916      onError:
8917        Py_CLEAR(*exceptionObject);
8918    }
8919}
8920
8921/* error handling callback helper:
8922   build arguments, call the callback and check the arguments,
8923   put the result into newpos and return the replacement string, which
8924   has to be freed by the caller */
8925static PyObject *
8926unicode_translate_call_errorhandler(const char *errors,
8927                                    PyObject **errorHandler,
8928                                    const char *reason,
8929                                    PyObject *unicode, PyObject **exceptionObject,
8930                                    Py_ssize_t startpos, Py_ssize_t endpos,
8931                                    Py_ssize_t *newpos)
8932{
8933    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8934
8935    Py_ssize_t i_newpos;
8936    PyObject *restuple;
8937    PyObject *resunicode;
8938
8939    if (*errorHandler == NULL) {
8940        *errorHandler = PyCodec_LookupError(errors);
8941        if (*errorHandler == NULL)
8942            return NULL;
8943    }
8944
8945    make_translate_exception(exceptionObject,
8946                             unicode, startpos, endpos, reason);
8947    if (*exceptionObject == NULL)
8948        return NULL;
8949
8950    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8951    if (restuple == NULL)
8952        return NULL;
8953    if (!PyTuple_Check(restuple)) {
8954        PyErr_SetString(PyExc_TypeError, &argparse[3]);
8955        Py_DECREF(restuple);
8956        return NULL;
8957    }
8958    if (!PyArg_ParseTuple(restuple, argparse,
8959                          &resunicode, &i_newpos)) {
8960        Py_DECREF(restuple);
8961        return NULL;
8962    }
8963    if (i_newpos<0)
8964        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8965    else
8966        *newpos = i_newpos;
8967    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8968        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8969        Py_DECREF(restuple);
8970        return NULL;
8971    }
8972    Py_INCREF(resunicode);
8973    Py_DECREF(restuple);
8974    return resunicode;
8975}
8976
8977/* Lookup the character ch in the mapping and put the result in result,
8978   which must be decrefed by the caller.
8979   Return 0 on success, -1 on error */
8980static int
8981charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8982{
8983    PyObject *w = PyLong_FromLong((long)c);
8984    PyObject *x;
8985
8986    if (w == NULL)
8987        return -1;
8988    x = PyObject_GetItem(mapping, w);
8989    Py_DECREF(w);
8990    if (x == NULL) {
8991        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8992            /* No mapping found means: use 1:1 mapping. */
8993            PyErr_Clear();
8994            *result = NULL;
8995            return 0;
8996        } else
8997            return -1;
8998    }
8999    else if (x == Py_None) {
9000        *result = x;
9001        return 0;
9002    }
9003    else if (PyLong_Check(x)) {
9004        long value = PyLong_AS_LONG(x);
9005        if (value < 0 || value > MAX_UNICODE) {
9006            PyErr_Format(PyExc_ValueError,
9007                         "character mapping must be in range(0x%x)",
9008                         MAX_UNICODE+1);
9009            Py_DECREF(x);
9010            return -1;
9011        }
9012        *result = x;
9013        return 0;
9014    }
9015    else if (PyUnicode_Check(x)) {
9016        *result = x;
9017        return 0;
9018    }
9019    else {
9020        /* wrong return value */
9021        PyErr_SetString(PyExc_TypeError,
9022                        "character mapping must return integer, None or str");
9023        Py_DECREF(x);
9024        return -1;
9025    }
9026}
9027
9028/* lookup the character, write the result into the writer.
9029   Return 1 if the result was written into the writer, return 0 if the mapping
9030   was undefined, raise an exception return -1 on error. */
9031static int
9032charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9033                        _PyUnicodeWriter *writer)
9034{
9035    PyObject *item;
9036
9037    if (charmaptranslate_lookup(ch, mapping, &item))
9038        return -1;
9039
9040    if (item == NULL) {
9041        /* not found => default to 1:1 mapping */
9042        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9043            return -1;
9044        }
9045        return 1;
9046    }
9047
9048    if (item == Py_None) {
9049        Py_DECREF(item);
9050        return 0;
9051    }
9052
9053    if (PyLong_Check(item)) {
9054        long ch = (Py_UCS4)PyLong_AS_LONG(item);
9055        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9056           used it */
9057        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9058            Py_DECREF(item);
9059            return -1;
9060        }
9061        Py_DECREF(item);
9062        return 1;
9063    }
9064
9065    if (!PyUnicode_Check(item)) {
9066        Py_DECREF(item);
9067        return -1;
9068    }
9069
9070    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9071        Py_DECREF(item);
9072        return -1;
9073    }
9074
9075    Py_DECREF(item);
9076    return 1;
9077}
9078
9079static int
9080unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9081                              Py_UCS1 *translate)
9082{
9083    PyObject *item = NULL;
9084    int ret = 0;
9085
9086    if (charmaptranslate_lookup(ch, mapping, &item)) {
9087        return -1;
9088    }
9089
9090    if (item == Py_None) {
9091        /* deletion */
9092        translate[ch] = 0xfe;
9093    }
9094    else if (item == NULL) {
9095        /* not found => default to 1:1 mapping */
9096        translate[ch] = ch;
9097        return 1;
9098    }
9099    else if (PyLong_Check(item)) {
9100        long replace = PyLong_AS_LONG(item);
9101        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9102           used it */
9103        if (127 < replace) {
9104            /* invalid character or character outside ASCII:
9105               skip the fast translate */
9106            goto exit;
9107        }
9108        translate[ch] = (Py_UCS1)replace;
9109    }
9110    else if (PyUnicode_Check(item)) {
9111        Py_UCS4 replace;
9112
9113        if (PyUnicode_READY(item) == -1) {
9114            Py_DECREF(item);
9115            return -1;
9116        }
9117        if (PyUnicode_GET_LENGTH(item) != 1)
9118            goto exit;
9119
9120        replace = PyUnicode_READ_CHAR(item, 0);
9121        if (replace > 127)
9122            goto exit;
9123        translate[ch] = (Py_UCS1)replace;
9124    }
9125    else {
9126        /* not None, NULL, long or unicode */
9127        goto exit;
9128    }
9129    ret = 1;
9130
9131  exit:
9132    Py_DECREF(item);
9133    return ret;
9134}
9135
9136/* Fast path for ascii => ascii translation. Return 1 if the whole string
9137   was translated into writer, return 0 if the input string was partially
9138   translated into writer, raise an exception and return -1 on error. */
9139static int
9140unicode_fast_translate(PyObject *input, PyObject *mapping,
9141                       _PyUnicodeWriter *writer, int ignore,
9142                       Py_ssize_t *input_pos)
9143{
9144    Py_UCS1 ascii_table[128], ch, ch2;
9145    Py_ssize_t len;
9146    const Py_UCS1 *in, *end;
9147    Py_UCS1 *out;
9148    int res = 0;
9149
9150    len = PyUnicode_GET_LENGTH(input);
9151
9152    memset(ascii_table, 0xff, 128);
9153
9154    in = PyUnicode_1BYTE_DATA(input);
9155    end = in + len;
9156
9157    assert(PyUnicode_IS_ASCII(writer->buffer));
9158    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9159    out = PyUnicode_1BYTE_DATA(writer->buffer);
9160
9161    for (; in < end; in++) {
9162        ch = *in;
9163        ch2 = ascii_table[ch];
9164        if (ch2 == 0xff) {
9165            int translate = unicode_fast_translate_lookup(mapping, ch,
9166                                                          ascii_table);
9167            if (translate < 0)
9168                return -1;
9169            if (translate == 0)
9170                goto exit;
9171            ch2 = ascii_table[ch];
9172        }
9173        if (ch2 == 0xfe) {
9174            if (ignore)
9175                continue;
9176            goto exit;
9177        }
9178        assert(ch2 < 128);
9179        *out = ch2;
9180        out++;
9181    }
9182    res = 1;
9183
9184exit:
9185    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9186    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9187    return res;
9188}
9189
9190static PyObject *
9191_PyUnicode_TranslateCharmap(PyObject *input,
9192                            PyObject *mapping,
9193                            const char *errors)
9194{
9195    /* input object */
9196    const void *data;
9197    Py_ssize_t size, i;
9198    int kind;
9199    /* output buffer */
9200    _PyUnicodeWriter writer;
9201    /* error handler */
9202    const char *reason = "character maps to <undefined>";
9203    PyObject *errorHandler = NULL;
9204    PyObject *exc = NULL;
9205    int ignore;
9206    int res;
9207
9208    if (mapping == NULL) {
9209        PyErr_BadArgument();
9210        return NULL;
9211    }
9212
9213    if (PyUnicode_READY(input) == -1)
9214        return NULL;
9215    data = PyUnicode_DATA(input);
9216    kind = PyUnicode_KIND(input);
9217    size = PyUnicode_GET_LENGTH(input);
9218
9219    if (size == 0)
9220        return PyUnicode_FromObject(input);
9221
9222    /* allocate enough for a simple 1:1 translation without
9223       replacements, if we need more, we'll resize */
9224    _PyUnicodeWriter_Init(&writer);
9225    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9226        goto onError;
9227
9228    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9229
9230    if (PyUnicode_READY(input) == -1)
9231        return NULL;
9232    if (PyUnicode_IS_ASCII(input)) {
9233        res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9234        if (res < 0) {
9235            _PyUnicodeWriter_Dealloc(&writer);
9236            return NULL;
9237        }
9238        if (res == 1)
9239            return _PyUnicodeWriter_Finish(&writer);
9240    }
9241    else {
9242        i = 0;
9243    }
9244
9245    while (i<size) {
9246        /* try to encode it */
9247        int translate;
9248        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9249        Py_ssize_t newpos;
9250        /* startpos for collecting untranslatable chars */
9251        Py_ssize_t collstart;
9252        Py_ssize_t collend;
9253        Py_UCS4 ch;
9254
9255        ch = PyUnicode_READ(kind, data, i);
9256        translate = charmaptranslate_output(ch, mapping, &writer);
9257        if (translate < 0)
9258            goto onError;
9259
9260        if (translate != 0) {
9261            /* it worked => adjust input pointer */
9262            ++i;
9263            continue;
9264        }
9265
9266        /* untranslatable character */
9267        collstart = i;
9268        collend = i+1;
9269
9270        /* find all untranslatable characters */
9271        while (collend < size) {
9272            PyObject *x;
9273            ch = PyUnicode_READ(kind, data, collend);
9274            if (charmaptranslate_lookup(ch, mapping, &x))
9275                goto onError;
9276            Py_XDECREF(x);
9277            if (x != Py_None)
9278                break;
9279            ++collend;
9280        }
9281
9282        if (ignore) {
9283            i = collend;
9284        }
9285        else {
9286            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9287                                                             reason, input, &exc,
9288                                                             collstart, collend, &newpos);
9289            if (repunicode == NULL)
9290                goto onError;
9291            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9292                Py_DECREF(repunicode);
9293                goto onError;
9294            }
9295            Py_DECREF(repunicode);
9296            i = newpos;
9297        }
9298    }
9299    Py_XDECREF(exc);
9300    Py_XDECREF(errorHandler);
9301    return _PyUnicodeWriter_Finish(&writer);
9302
9303  onError:
9304    _PyUnicodeWriter_Dealloc(&writer);
9305    Py_XDECREF(exc);
9306    Py_XDECREF(errorHandler);
9307    return NULL;
9308}
9309
9310PyObject *
9311PyUnicode_Translate(PyObject *str,
9312                    PyObject *mapping,
9313                    const char *errors)
9314{
9315    if (ensure_unicode(str) < 0)
9316        return NULL;
9317    return _PyUnicode_TranslateCharmap(str, mapping, errors);
9318}
9319
9320PyObject *
9321_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9322{
9323    if (!PyUnicode_Check(unicode)) {
9324        PyErr_BadInternalCall();
9325        return NULL;
9326    }
9327    if (PyUnicode_READY(unicode) == -1)
9328        return NULL;
9329    if (PyUnicode_IS_ASCII(unicode)) {
9330        /* If the string is already ASCII, just return the same string */
9331        Py_INCREF(unicode);
9332        return unicode;
9333    }
9334
9335    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9336    PyObject *result = PyUnicode_New(len, 127);
9337    if (result == NULL) {
9338        return NULL;
9339    }
9340
9341    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9342    int kind = PyUnicode_KIND(unicode);
9343    const void *data = PyUnicode_DATA(unicode);
9344    Py_ssize_t i;
9345    for (i = 0; i < len; ++i) {
9346        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9347        if (ch < 127) {
9348            out[i] = ch;
9349        }
9350        else if (Py_UNICODE_ISSPACE(ch)) {
9351            out[i] = ' ';
9352        }
9353        else {
9354            int decimal = Py_UNICODE_TODECIMAL(ch);
9355            if (decimal < 0) {
9356                out[i] = '?';
9357                out[i+1] = '\0';
9358                _PyUnicode_LENGTH(result) = i + 1;
9359                break;
9360            }
9361            out[i] = '0' + decimal;
9362        }
9363    }
9364
9365    assert(_PyUnicode_CheckConsistency(result, 1));
9366    return result;
9367}
9368
9369/* --- Helpers ------------------------------------------------------------ */
9370
/* Helper macro to fix up start/end slice values against a sequence of
   length len: end is clipped to len, negative values are taken relative
   to the end of the sequence and then clipped to 0.

   start and end must be modifiable lvalues; each argument is evaluated
   more than once, so do not pass expressions with side effects.  The
   expansion is a single statement (do { } while (0)) so the macro is safe
   as the body of an unbraced if/else; call it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9385
9386static Py_ssize_t
9387any_find_slice(PyObject* s1, PyObject* s2,
9388               Py_ssize_t start,
9389               Py_ssize_t end,
9390               int direction)
9391{
9392    int kind1, kind2;
9393    const void *buf1, *buf2;
9394    Py_ssize_t len1, len2, result;
9395
9396    kind1 = PyUnicode_KIND(s1);
9397    kind2 = PyUnicode_KIND(s2);
9398    if (kind1 < kind2)
9399        return -1;
9400
9401    len1 = PyUnicode_GET_LENGTH(s1);
9402    len2 = PyUnicode_GET_LENGTH(s2);
9403    ADJUST_INDICES(start, end, len1);
9404    if (end - start < len2)
9405        return -1;
9406
9407    buf1 = PyUnicode_DATA(s1);
9408    buf2 = PyUnicode_DATA(s2);
9409    if (len2 == 1) {
9410        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9411        result = findchar((const char *)buf1 + kind1*start,
9412                          kind1, end - start, ch, direction);
9413        if (result == -1)
9414            return -1;
9415        else
9416            return start + result;
9417    }
9418
9419    if (kind2 != kind1) {
9420        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9421        if (!buf2)
9422            return -2;
9423    }
9424
9425    if (direction > 0) {
9426        switch (kind1) {
9427        case PyUnicode_1BYTE_KIND:
9428            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9429                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9430            else
9431                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9432            break;
9433        case PyUnicode_2BYTE_KIND:
9434            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9435            break;
9436        case PyUnicode_4BYTE_KIND:
9437            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9438            break;
9439        default:
9440            Py_UNREACHABLE();
9441        }
9442    }
9443    else {
9444        switch (kind1) {
9445        case PyUnicode_1BYTE_KIND:
9446            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9447                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9448            else
9449                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9450            break;
9451        case PyUnicode_2BYTE_KIND:
9452            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453            break;
9454        case PyUnicode_4BYTE_KIND:
9455            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456            break;
9457        default:
9458            Py_UNREACHABLE();
9459        }
9460    }
9461
9462    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9463    if (kind2 != kind1)
9464        PyMem_Free((void *)buf2);
9465
9466    return result;
9467}
9468
9469/* _PyUnicode_InsertThousandsGrouping() helper functions */
9470#include "stringlib/localeutil.h"
9471
/**
 * _PyUnicode_InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to select counting mode.
 * @n_buffer: Offset of the initial output position: the starting buffer
 *            position is writer->pos + @n_buffer when filling and
 *            @n_buffer when counting.
 * @digits: String the digits are read from. Must be non-NULL when filling
 *          and NULL when counting (asserted).
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL when filling and non-NULL when counting (asserted).
 *           When counting, *@maxchar is set to 127 before being handed to
 *           InsertThousandsGrouping_fill() (presumably that helper raises it
 *           to cover @thousands_sep -- TODO confirm against localeutil.h).
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 if readying @digits or @thousands_sep fails, otherwise
 *  the total number of characters (digits, padding zeros and separators).
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    /* A negative minimum width means "no minimum". */
    min_width = Py_MAX(0, min_width);
    /* Filling mode takes digits and no maxchar; counting mode is the
       opposite. */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;       /* Running total of output characters. */
    Py_ssize_t n_zeros;         /* Padding zeros in the current group. */
    int loop_broken = 0;        /* Set when the group loop exits via break. */
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;             /* Width of the current group. */
    Py_ssize_t n_chars;         /* Real digits in the current group. */
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start at the END of their regions (one past the last
       digit / the supplied output offset); they are passed by address to
       the fill helper, which updates them. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        /* Counting mode: start from the ASCII range. */
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Never emit a group wider than what is still needed (digits left
           or width still to pad), but always at least one character. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Accumulate the size of this group (done in both modes). */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Hand the group to the fill helper. It is called in both modes;
           NOTE(review): presumably it only touches *maxchar when writer is
           NULL -- confirm in stringlib/localeutil.h. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        /* Done once all digits are consumed and the minimum width is met. */
        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The separator preceding the next group also counts toward the
           minimum width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement: the generator
           ran out of group widths, so everything left goes into one final
           group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Hand the final group to the fill helper. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9613
9614
9615Py_ssize_t
9616PyUnicode_Count(PyObject *str,
9617                PyObject *substr,
9618                Py_ssize_t start,
9619                Py_ssize_t end)
9620{
9621    Py_ssize_t result;
9622    int kind1, kind2;
9623    const void *buf1 = NULL, *buf2 = NULL;
9624    Py_ssize_t len1, len2;
9625
9626    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9627        return -1;
9628
9629    kind1 = PyUnicode_KIND(str);
9630    kind2 = PyUnicode_KIND(substr);
9631    if (kind1 < kind2)
9632        return 0;
9633
9634    len1 = PyUnicode_GET_LENGTH(str);
9635    len2 = PyUnicode_GET_LENGTH(substr);
9636    ADJUST_INDICES(start, end, len1);
9637    if (end - start < len2)
9638        return 0;
9639
9640    buf1 = PyUnicode_DATA(str);
9641    buf2 = PyUnicode_DATA(substr);
9642    if (kind2 != kind1) {
9643        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9644        if (!buf2)
9645            goto onError;
9646    }
9647
9648    switch (kind1) {
9649    case PyUnicode_1BYTE_KIND:
9650        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9651            result = asciilib_count(
9652                ((const Py_UCS1*)buf1) + start, end - start,
9653                buf2, len2, PY_SSIZE_T_MAX
9654                );
9655        else
9656            result = ucs1lib_count(
9657                ((const Py_UCS1*)buf1) + start, end - start,
9658                buf2, len2, PY_SSIZE_T_MAX
9659                );
9660        break;
9661    case PyUnicode_2BYTE_KIND:
9662        result = ucs2lib_count(
9663            ((const Py_UCS2*)buf1) + start, end - start,
9664            buf2, len2, PY_SSIZE_T_MAX
9665            );
9666        break;
9667    case PyUnicode_4BYTE_KIND:
9668        result = ucs4lib_count(
9669            ((const Py_UCS4*)buf1) + start, end - start,
9670            buf2, len2, PY_SSIZE_T_MAX
9671            );
9672        break;
9673    default:
9674        Py_UNREACHABLE();
9675    }
9676
9677    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9678    if (kind2 != kind1)
9679        PyMem_Free((void *)buf2);
9680
9681    return result;
9682  onError:
9683    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9684    if (kind2 != kind1)
9685        PyMem_Free((void *)buf2);
9686    return -1;
9687}
9688
9689Py_ssize_t
9690PyUnicode_Find(PyObject *str,
9691               PyObject *substr,
9692               Py_ssize_t start,
9693               Py_ssize_t end,
9694               int direction)
9695{
9696    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9697        return -2;
9698
9699    return any_find_slice(str, substr, start, end, direction);
9700}
9701
9702Py_ssize_t
9703PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9704                   Py_ssize_t start, Py_ssize_t end,
9705                   int direction)
9706{
9707    int kind;
9708    Py_ssize_t len, result;
9709    if (PyUnicode_READY(str) == -1)
9710        return -2;
9711    len = PyUnicode_GET_LENGTH(str);
9712    ADJUST_INDICES(start, end, len);
9713    if (end - start < 1)
9714        return -1;
9715    kind = PyUnicode_KIND(str);
9716    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9717                      kind, end-start, ch, direction);
9718    if (result == -1)
9719        return -1;
9720    else
9721        return start + result;
9722}
9723
9724static int
9725tailmatch(PyObject *self,
9726          PyObject *substring,
9727          Py_ssize_t start,
9728          Py_ssize_t end,
9729          int direction)
9730{
9731    int kind_self;
9732    int kind_sub;
9733    const void *data_self;
9734    const void *data_sub;
9735    Py_ssize_t offset;
9736    Py_ssize_t i;
9737    Py_ssize_t end_sub;
9738
9739    if (PyUnicode_READY(self) == -1 ||
9740        PyUnicode_READY(substring) == -1)
9741        return -1;
9742
9743    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9744    end -= PyUnicode_GET_LENGTH(substring);
9745    if (end < start)
9746        return 0;
9747
9748    if (PyUnicode_GET_LENGTH(substring) == 0)
9749        return 1;
9750
9751    kind_self = PyUnicode_KIND(self);
9752    data_self = PyUnicode_DATA(self);
9753    kind_sub = PyUnicode_KIND(substring);
9754    data_sub = PyUnicode_DATA(substring);
9755    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9756
9757    if (direction > 0)
9758        offset = end;
9759    else
9760        offset = start;
9761
9762    if (PyUnicode_READ(kind_self, data_self, offset) ==
9763        PyUnicode_READ(kind_sub, data_sub, 0) &&
9764        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9765        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9766        /* If both are of the same kind, memcmp is sufficient */
9767        if (kind_self == kind_sub) {
9768            return ! memcmp((char *)data_self +
9769                                (offset * PyUnicode_KIND(substring)),
9770                            data_sub,
9771                            PyUnicode_GET_LENGTH(substring) *
9772                                PyUnicode_KIND(substring));
9773        }
9774        /* otherwise we have to compare each character by first accessing it */
9775        else {
9776            /* We do not need to compare 0 and len(substring)-1 because
9777               the if statement above ensured already that they are equal
9778               when we end up here. */
9779            for (i = 1; i < end_sub; ++i) {
9780                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9781                    PyUnicode_READ(kind_sub, data_sub, i))
9782                    return 0;
9783            }
9784            return 1;
9785        }
9786    }
9787
9788    return 0;
9789}
9790
9791Py_ssize_t
9792PyUnicode_Tailmatch(PyObject *str,
9793                    PyObject *substr,
9794                    Py_ssize_t start,
9795                    Py_ssize_t end,
9796                    int direction)
9797{
9798    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9799        return -1;
9800
9801    return tailmatch(str, substr, start, end, direction);
9802}
9803
9804static PyObject *
9805ascii_upper_or_lower(PyObject *self, int lower)
9806{
9807    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9808    const char *data = PyUnicode_DATA(self);
9809    char *resdata;
9810    PyObject *res;
9811
9812    res = PyUnicode_New(len, 127);
9813    if (res == NULL)
9814        return NULL;
9815    resdata = PyUnicode_DATA(res);
9816    if (lower)
9817        _Py_bytes_lower(resdata, data, len);
9818    else
9819        _Py_bytes_upper(resdata, data, len);
9820    return res;
9821}
9822
9823static Py_UCS4
9824handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9825{
9826    Py_ssize_t j;
9827    int final_sigma;
9828    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9829    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9830
9831     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9832
9833    where ! is a negation and \p{xxx} is a character with property xxx.
9834    */
9835    for (j = i - 1; j >= 0; j--) {
9836        c = PyUnicode_READ(kind, data, j);
9837        if (!_PyUnicode_IsCaseIgnorable(c))
9838            break;
9839    }
9840    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9841    if (final_sigma) {
9842        for (j = i + 1; j < length; j++) {
9843            c = PyUnicode_READ(kind, data, j);
9844            if (!_PyUnicode_IsCaseIgnorable(c))
9845                break;
9846        }
9847        final_sigma = j == length || !_PyUnicode_IsCased(c);
9848    }
9849    return (final_sigma) ? 0x3C2 : 0x3C3;
9850}
9851
9852static int
9853lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9854           Py_UCS4 c, Py_UCS4 *mapped)
9855{
9856    /* Obscure special case. */
9857    if (c == 0x3A3) {
9858        mapped[0] = handle_capital_sigma(kind, data, length, i);
9859        return 1;
9860    }
9861    return _PyUnicode_ToLowerFull(c, mapped);
9862}
9863
9864static Py_ssize_t
9865do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9866{
9867    Py_ssize_t i, k = 0;
9868    int n_res, j;
9869    Py_UCS4 c, mapped[3];
9870
9871    c = PyUnicode_READ(kind, data, 0);
9872    n_res = _PyUnicode_ToTitleFull(c, mapped);
9873    for (j = 0; j < n_res; j++) {
9874        *maxchar = Py_MAX(*maxchar, mapped[j]);
9875        res[k++] = mapped[j];
9876    }
9877    for (i = 1; i < length; i++) {
9878        c = PyUnicode_READ(kind, data, i);
9879        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9880        for (j = 0; j < n_res; j++) {
9881            *maxchar = Py_MAX(*maxchar, mapped[j]);
9882            res[k++] = mapped[j];
9883        }
9884    }
9885    return k;
9886}
9887
9888static Py_ssize_t
9889do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9890    Py_ssize_t i, k = 0;
9891
9892    for (i = 0; i < length; i++) {
9893        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9894        int n_res, j;
9895        if (Py_UNICODE_ISUPPER(c)) {
9896            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9897        }
9898        else if (Py_UNICODE_ISLOWER(c)) {
9899            n_res = _PyUnicode_ToUpperFull(c, mapped);
9900        }
9901        else {
9902            n_res = 1;
9903            mapped[0] = c;
9904        }
9905        for (j = 0; j < n_res; j++) {
9906            *maxchar = Py_MAX(*maxchar, mapped[j]);
9907            res[k++] = mapped[j];
9908        }
9909    }
9910    return k;
9911}
9912
9913static Py_ssize_t
9914do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9915                  Py_UCS4 *maxchar, int lower)
9916{
9917    Py_ssize_t i, k = 0;
9918
9919    for (i = 0; i < length; i++) {
9920        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9921        int n_res, j;
9922        if (lower)
9923            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924        else
9925            n_res = _PyUnicode_ToUpperFull(c, mapped);
9926        for (j = 0; j < n_res; j++) {
9927            *maxchar = Py_MAX(*maxchar, mapped[j]);
9928            res[k++] = mapped[j];
9929        }
9930    }
9931    return k;
9932}
9933
9934static Py_ssize_t
9935do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9936{
9937    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9938}
9939
9940static Py_ssize_t
9941do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9942{
9943    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9944}
9945
9946static Py_ssize_t
9947do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9948{
9949    Py_ssize_t i, k = 0;
9950
9951    for (i = 0; i < length; i++) {
9952        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9953        Py_UCS4 mapped[3];
9954        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9955        for (j = 0; j < n_res; j++) {
9956            *maxchar = Py_MAX(*maxchar, mapped[j]);
9957            res[k++] = mapped[j];
9958        }
9959    }
9960    return k;
9961}
9962
9963static Py_ssize_t
9964do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9965{
9966    Py_ssize_t i, k = 0;
9967    int previous_is_cased;
9968
9969    previous_is_cased = 0;
9970    for (i = 0; i < length; i++) {
9971        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9972        Py_UCS4 mapped[3];
9973        int n_res, j;
9974
9975        if (previous_is_cased)
9976            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9977        else
9978            n_res = _PyUnicode_ToTitleFull(c, mapped);
9979
9980        for (j = 0; j < n_res; j++) {
9981            *maxchar = Py_MAX(*maxchar, mapped[j]);
9982            res[k++] = mapped[j];
9983        }
9984
9985        previous_is_cased = _PyUnicode_IsCased(c);
9986    }
9987    return k;
9988}
9989
9990static PyObject *
9991case_operation(PyObject *self,
9992               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9993{
9994    PyObject *res = NULL;
9995    Py_ssize_t length, newlength = 0;
9996    int kind, outkind;
9997    const void *data;
9998    void *outdata;
9999    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10000
10001    assert(PyUnicode_IS_READY(self));
10002
10003    kind = PyUnicode_KIND(self);
10004    data = PyUnicode_DATA(self);
10005    length = PyUnicode_GET_LENGTH(self);
10006    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10007        PyErr_SetString(PyExc_OverflowError, "string is too long");
10008        return NULL;
10009    }
10010    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10011    if (tmp == NULL)
10012        return PyErr_NoMemory();
10013    newlength = perform(kind, data, length, tmp, &maxchar);
10014    res = PyUnicode_New(newlength, maxchar);
10015    if (res == NULL)
10016        goto leave;
10017    tmpend = tmp + newlength;
10018    outdata = PyUnicode_DATA(res);
10019    outkind = PyUnicode_KIND(res);
10020    switch (outkind) {
10021    case PyUnicode_1BYTE_KIND:
10022        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10023        break;
10024    case PyUnicode_2BYTE_KIND:
10025        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10026        break;
10027    case PyUnicode_4BYTE_KIND:
10028        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10029        break;
10030    default:
10031        Py_UNREACHABLE();
10032    }
10033  leave:
10034    PyMem_Free(tmp);
10035    return res;
10036}
10037
10038PyObject *
10039PyUnicode_Join(PyObject *separator, PyObject *seq)
10040{
10041    PyObject *res;
10042    PyObject *fseq;
10043    Py_ssize_t seqlen;
10044    PyObject **items;
10045
10046    fseq = PySequence_Fast(seq, "can only join an iterable");
10047    if (fseq == NULL) {
10048        return NULL;
10049    }
10050
10051    /* NOTE: the following code can't call back into Python code,
10052     * so we are sure that fseq won't be mutated.
10053     */
10054
10055    items = PySequence_Fast_ITEMS(fseq);
10056    seqlen = PySequence_Fast_GET_SIZE(fseq);
10057    res = _PyUnicode_JoinArray(separator, items, seqlen);
10058    Py_DECREF(fseq);
10059    return res;
10060}
10061
/* Join the `seqlen` objects in `items`, inserting `separator` between
 * consecutive items, and return the result as a new reference.
 *
 * - A NULL `separator` means a single space.
 * - An empty array yields the empty string; a single item that is an exact
 *   str is returned as-is (with a new reference).
 * - Raises TypeError if `separator` or any item is not a str, and
 *   OverflowError if the total length does not fit in Py_ssize_t.
 *
 * Returns NULL on failure.
 */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single (non-exact) item never needs a separator. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the "same kind everywhere" check
           performed in the pre-pass below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    /* Debug builds always take the _PyUnicode_FastCopyCharacters() path. */
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by a separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Reject the addition before performing it so sz cannot overflow. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* memcpy is only usable while every piece (separator included)
           has the same kind as its predecessor. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Fast path: all pieces share one kind, so raw bytes can be copied;
           res_data is a byte cursor advanced by kind * (number of chars). */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* General path: copy character-wise; res_offset counts characters
           already written to res. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* Shared cleanup: both pointers may still be NULL here. */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10233
10234void
10235_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10236                    Py_UCS4 fill_char)
10237{
10238    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10239    void *data = PyUnicode_DATA(unicode);
10240    assert(PyUnicode_IS_READY(unicode));
10241    assert(unicode_modifiable(unicode));
10242    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10243    assert(start >= 0);
10244    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10245    unicode_fill(kind, data, fill_char, start, length);
10246}
10247
10248Py_ssize_t
10249PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10250               Py_UCS4 fill_char)
10251{
10252    Py_ssize_t maxlen;
10253
10254    if (!PyUnicode_Check(unicode)) {
10255        PyErr_BadInternalCall();
10256        return -1;
10257    }
10258    if (PyUnicode_READY(unicode) == -1)
10259        return -1;
10260    if (unicode_check_modifiable(unicode))
10261        return -1;
10262
10263    if (start < 0) {
10264        PyErr_SetString(PyExc_IndexError, "string index out of range");
10265        return -1;
10266    }
10267    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10268        PyErr_SetString(PyExc_ValueError,
10269                         "fill character is bigger than "
10270                         "the string maximum character");
10271        return -1;
10272    }
10273
10274    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10275    length = Py_MIN(maxlen, length);
10276    if (length <= 0)
10277        return 0;
10278
10279    _PyUnicode_FastFill(unicode, start, length, fill_char);
10280    return length;
10281}
10282
10283static PyObject *
10284pad(PyObject *self,
10285    Py_ssize_t left,
10286    Py_ssize_t right,
10287    Py_UCS4 fill)
10288{
10289    PyObject *u;
10290    Py_UCS4 maxchar;
10291    int kind;
10292    void *data;
10293
10294    if (left < 0)
10295        left = 0;
10296    if (right < 0)
10297        right = 0;
10298
10299    if (left == 0 && right == 0)
10300        return unicode_result_unchanged(self);
10301
10302    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10303        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10304        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10305        return NULL;
10306    }
10307    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10308    maxchar = Py_MAX(maxchar, fill);
10309    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10310    if (!u)
10311        return NULL;
10312
10313    kind = PyUnicode_KIND(u);
10314    data = PyUnicode_DATA(u);
10315    if (left)
10316        unicode_fill(kind, data, fill, 0, left);
10317    if (right)
10318        unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10319    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10320    assert(_PyUnicode_CheckConsistency(u, 1));
10321    return u;
10322}
10323
10324PyObject *
10325PyUnicode_Splitlines(PyObject *string, int keepends)
10326{
10327    PyObject *list;
10328
10329    if (ensure_unicode(string) < 0)
10330        return NULL;
10331
10332    switch (PyUnicode_KIND(string)) {
10333    case PyUnicode_1BYTE_KIND:
10334        if (PyUnicode_IS_ASCII(string))
10335            list = asciilib_splitlines(
10336                string, PyUnicode_1BYTE_DATA(string),
10337                PyUnicode_GET_LENGTH(string), keepends);
10338        else
10339            list = ucs1lib_splitlines(
10340                string, PyUnicode_1BYTE_DATA(string),
10341                PyUnicode_GET_LENGTH(string), keepends);
10342        break;
10343    case PyUnicode_2BYTE_KIND:
10344        list = ucs2lib_splitlines(
10345            string, PyUnicode_2BYTE_DATA(string),
10346            PyUnicode_GET_LENGTH(string), keepends);
10347        break;
10348    case PyUnicode_4BYTE_KIND:
10349        list = ucs4lib_splitlines(
10350            string, PyUnicode_4BYTE_DATA(string),
10351            PyUnicode_GET_LENGTH(string), keepends);
10352        break;
10353    default:
10354        Py_UNREACHABLE();
10355    }
10356    return list;
10357}
10358
10359static PyObject *
10360split(PyObject *self,
10361      PyObject *substring,
10362      Py_ssize_t maxcount)
10363{
10364    int kind1, kind2;
10365    const void *buf1, *buf2;
10366    Py_ssize_t len1, len2;
10367    PyObject* out;
10368
10369    if (maxcount < 0)
10370        maxcount = PY_SSIZE_T_MAX;
10371
10372    if (PyUnicode_READY(self) == -1)
10373        return NULL;
10374
10375    if (substring == NULL)
10376        switch (PyUnicode_KIND(self)) {
10377        case PyUnicode_1BYTE_KIND:
10378            if (PyUnicode_IS_ASCII(self))
10379                return asciilib_split_whitespace(
10380                    self,  PyUnicode_1BYTE_DATA(self),
10381                    PyUnicode_GET_LENGTH(self), maxcount
10382                    );
10383            else
10384                return ucs1lib_split_whitespace(
10385                    self,  PyUnicode_1BYTE_DATA(self),
10386                    PyUnicode_GET_LENGTH(self), maxcount
10387                    );
10388        case PyUnicode_2BYTE_KIND:
10389            return ucs2lib_split_whitespace(
10390                self,  PyUnicode_2BYTE_DATA(self),
10391                PyUnicode_GET_LENGTH(self), maxcount
10392                );
10393        case PyUnicode_4BYTE_KIND:
10394            return ucs4lib_split_whitespace(
10395                self,  PyUnicode_4BYTE_DATA(self),
10396                PyUnicode_GET_LENGTH(self), maxcount
10397                );
10398        default:
10399            Py_UNREACHABLE();
10400        }
10401
10402    if (PyUnicode_READY(substring) == -1)
10403        return NULL;
10404
10405    kind1 = PyUnicode_KIND(self);
10406    kind2 = PyUnicode_KIND(substring);
10407    len1 = PyUnicode_GET_LENGTH(self);
10408    len2 = PyUnicode_GET_LENGTH(substring);
10409    if (kind1 < kind2 || len1 < len2) {
10410        out = PyList_New(1);
10411        if (out == NULL)
10412            return NULL;
10413        Py_INCREF(self);
10414        PyList_SET_ITEM(out, 0, self);
10415        return out;
10416    }
10417    buf1 = PyUnicode_DATA(self);
10418    buf2 = PyUnicode_DATA(substring);
10419    if (kind2 != kind1) {
10420        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421        if (!buf2)
10422            return NULL;
10423    }
10424
10425    switch (kind1) {
10426    case PyUnicode_1BYTE_KIND:
10427        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428            out = asciilib_split(
10429                self,  buf1, len1, buf2, len2, maxcount);
10430        else
10431            out = ucs1lib_split(
10432                self,  buf1, len1, buf2, len2, maxcount);
10433        break;
10434    case PyUnicode_2BYTE_KIND:
10435        out = ucs2lib_split(
10436            self,  buf1, len1, buf2, len2, maxcount);
10437        break;
10438    case PyUnicode_4BYTE_KIND:
10439        out = ucs4lib_split(
10440            self,  buf1, len1, buf2, len2, maxcount);
10441        break;
10442    default:
10443        out = NULL;
10444    }
10445    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446    if (kind2 != kind1)
10447        PyMem_Free((void *)buf2);
10448    return out;
10449}
10450
10451static PyObject *
10452rsplit(PyObject *self,
10453       PyObject *substring,
10454       Py_ssize_t maxcount)
10455{
10456    int kind1, kind2;
10457    const void *buf1, *buf2;
10458    Py_ssize_t len1, len2;
10459    PyObject* out;
10460
10461    if (maxcount < 0)
10462        maxcount = PY_SSIZE_T_MAX;
10463
10464    if (PyUnicode_READY(self) == -1)
10465        return NULL;
10466
10467    if (substring == NULL)
10468        switch (PyUnicode_KIND(self)) {
10469        case PyUnicode_1BYTE_KIND:
10470            if (PyUnicode_IS_ASCII(self))
10471                return asciilib_rsplit_whitespace(
10472                    self,  PyUnicode_1BYTE_DATA(self),
10473                    PyUnicode_GET_LENGTH(self), maxcount
10474                    );
10475            else
10476                return ucs1lib_rsplit_whitespace(
10477                    self,  PyUnicode_1BYTE_DATA(self),
10478                    PyUnicode_GET_LENGTH(self), maxcount
10479                    );
10480        case PyUnicode_2BYTE_KIND:
10481            return ucs2lib_rsplit_whitespace(
10482                self,  PyUnicode_2BYTE_DATA(self),
10483                PyUnicode_GET_LENGTH(self), maxcount
10484                );
10485        case PyUnicode_4BYTE_KIND:
10486            return ucs4lib_rsplit_whitespace(
10487                self,  PyUnicode_4BYTE_DATA(self),
10488                PyUnicode_GET_LENGTH(self), maxcount
10489                );
10490        default:
10491            Py_UNREACHABLE();
10492        }
10493
10494    if (PyUnicode_READY(substring) == -1)
10495        return NULL;
10496
10497    kind1 = PyUnicode_KIND(self);
10498    kind2 = PyUnicode_KIND(substring);
10499    len1 = PyUnicode_GET_LENGTH(self);
10500    len2 = PyUnicode_GET_LENGTH(substring);
10501    if (kind1 < kind2 || len1 < len2) {
10502        out = PyList_New(1);
10503        if (out == NULL)
10504            return NULL;
10505        Py_INCREF(self);
10506        PyList_SET_ITEM(out, 0, self);
10507        return out;
10508    }
10509    buf1 = PyUnicode_DATA(self);
10510    buf2 = PyUnicode_DATA(substring);
10511    if (kind2 != kind1) {
10512        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10513        if (!buf2)
10514            return NULL;
10515    }
10516
10517    switch (kind1) {
10518    case PyUnicode_1BYTE_KIND:
10519        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10520            out = asciilib_rsplit(
10521                self,  buf1, len1, buf2, len2, maxcount);
10522        else
10523            out = ucs1lib_rsplit(
10524                self,  buf1, len1, buf2, len2, maxcount);
10525        break;
10526    case PyUnicode_2BYTE_KIND:
10527        out = ucs2lib_rsplit(
10528            self,  buf1, len1, buf2, len2, maxcount);
10529        break;
10530    case PyUnicode_4BYTE_KIND:
10531        out = ucs4lib_rsplit(
10532            self,  buf1, len1, buf2, len2, maxcount);
10533        break;
10534    default:
10535        out = NULL;
10536    }
10537    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10538    if (kind2 != kind1)
10539        PyMem_Free((void *)buf2);
10540    return out;
10541}
10542
10543static Py_ssize_t
10544anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10545            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10546{
10547    switch (kind) {
10548    case PyUnicode_1BYTE_KIND:
10549        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10550            return asciilib_find(buf1, len1, buf2, len2, offset);
10551        else
10552            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10553    case PyUnicode_2BYTE_KIND:
10554        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10555    case PyUnicode_4BYTE_KIND:
10556        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10557    }
10558    Py_UNREACHABLE();
10559}
10560
10561static Py_ssize_t
10562anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10563             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10564{
10565    switch (kind) {
10566    case PyUnicode_1BYTE_KIND:
10567        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10568            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10569        else
10570            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10571    case PyUnicode_2BYTE_KIND:
10572        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10573    case PyUnicode_4BYTE_KIND:
10574        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10575    }
10576    Py_UNREACHABLE();
10577}
10578
10579static void
10580replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10581                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10582{
10583    int kind = PyUnicode_KIND(u);
10584    void *data = PyUnicode_DATA(u);
10585    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10586    if (kind == PyUnicode_1BYTE_KIND) {
10587        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10588                                      (Py_UCS1 *)data + len,
10589                                      u1, u2, maxcount);
10590    }
10591    else if (kind == PyUnicode_2BYTE_KIND) {
10592        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10593                                      (Py_UCS2 *)data + len,
10594                                      u1, u2, maxcount);
10595    }
10596    else {
10597        assert(kind == PyUnicode_4BYTE_KIND);
10598        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10599                                      (Py_UCS4 *)data + len,
10600                                      u1, u2, maxcount);
10601    }
10602}
10603
10604static PyObject *
10605replace(PyObject *self, PyObject *str1,
10606        PyObject *str2, Py_ssize_t maxcount)
10607{
10608    PyObject *u;
10609    const char *sbuf = PyUnicode_DATA(self);
10610    const void *buf1 = PyUnicode_DATA(str1);
10611    const void *buf2 = PyUnicode_DATA(str2);
10612    int srelease = 0, release1 = 0, release2 = 0;
10613    int skind = PyUnicode_KIND(self);
10614    int kind1 = PyUnicode_KIND(str1);
10615    int kind2 = PyUnicode_KIND(str2);
10616    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10617    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10618    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10619    int mayshrink;
10620    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10621
10622    if (slen < len1)
10623        goto nothing;
10624
10625    if (maxcount < 0)
10626        maxcount = PY_SSIZE_T_MAX;
10627    else if (maxcount == 0)
10628        goto nothing;
10629
10630    if (str1 == str2)
10631        goto nothing;
10632
10633    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10634    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10635    if (maxchar < maxchar_str1)
10636        /* substring too wide to be present */
10637        goto nothing;
10638    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10639    /* Replacing str1 with str2 may cause a maxchar reduction in the
10640       result string. */
10641    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10642    maxchar = Py_MAX(maxchar, maxchar_str2);
10643
10644    if (len1 == len2) {
10645        /* same length */
10646        if (len1 == 0)
10647            goto nothing;
10648        if (len1 == 1) {
10649            /* replace characters */
10650            Py_UCS4 u1, u2;
10651            Py_ssize_t pos;
10652
10653            u1 = PyUnicode_READ(kind1, buf1, 0);
10654            pos = findchar(sbuf, skind, slen, u1, 1);
10655            if (pos < 0)
10656                goto nothing;
10657            u2 = PyUnicode_READ(kind2, buf2, 0);
10658            u = PyUnicode_New(slen, maxchar);
10659            if (!u)
10660                goto error;
10661
10662            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10663            replace_1char_inplace(u, pos, u1, u2, maxcount);
10664        }
10665        else {
10666            int rkind = skind;
10667            char *res;
10668            Py_ssize_t i;
10669
10670            if (kind1 < rkind) {
10671                /* widen substring */
10672                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10673                if (!buf1) goto error;
10674                release1 = 1;
10675            }
10676            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10677            if (i < 0)
10678                goto nothing;
10679            if (rkind > kind2) {
10680                /* widen replacement */
10681                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10682                if (!buf2) goto error;
10683                release2 = 1;
10684            }
10685            else if (rkind < kind2) {
10686                /* widen self and buf1 */
10687                rkind = kind2;
10688                if (release1) {
10689                    assert(buf1 != PyUnicode_DATA(str1));
10690                    PyMem_Free((void *)buf1);
10691                    buf1 = PyUnicode_DATA(str1);
10692                    release1 = 0;
10693                }
10694                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10695                if (!sbuf) goto error;
10696                srelease = 1;
10697                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10698                if (!buf1) goto error;
10699                release1 = 1;
10700            }
10701            u = PyUnicode_New(slen, maxchar);
10702            if (!u)
10703                goto error;
10704            assert(PyUnicode_KIND(u) == rkind);
10705            res = PyUnicode_DATA(u);
10706
10707            memcpy(res, sbuf, rkind * slen);
10708            /* change everything in-place, starting with this one */
10709            memcpy(res + rkind * i,
10710                   buf2,
10711                   rkind * len2);
10712            i += len1;
10713
10714            while ( --maxcount > 0) {
10715                i = anylib_find(rkind, self,
10716                                sbuf+rkind*i, slen-i,
10717                                str1, buf1, len1, i);
10718                if (i == -1)
10719                    break;
10720                memcpy(res + rkind * i,
10721                       buf2,
10722                       rkind * len2);
10723                i += len1;
10724            }
10725        }
10726    }
10727    else {
10728        Py_ssize_t n, i, j, ires;
10729        Py_ssize_t new_size;
10730        int rkind = skind;
10731        char *res;
10732
10733        if (kind1 < rkind) {
10734            /* widen substring */
10735            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10736            if (!buf1) goto error;
10737            release1 = 1;
10738        }
10739        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10740        if (n == 0)
10741            goto nothing;
10742        if (kind2 < rkind) {
10743            /* widen replacement */
10744            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10745            if (!buf2) goto error;
10746            release2 = 1;
10747        }
10748        else if (kind2 > rkind) {
10749            /* widen self and buf1 */
10750            rkind = kind2;
10751            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10752            if (!sbuf) goto error;
10753            srelease = 1;
10754            if (release1) {
10755                assert(buf1 != PyUnicode_DATA(str1));
10756                PyMem_Free((void *)buf1);
10757                buf1 = PyUnicode_DATA(str1);
10758                release1 = 0;
10759            }
10760            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10761            if (!buf1) goto error;
10762            release1 = 1;
10763        }
10764        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10765           PyUnicode_GET_LENGTH(str1)); */
10766        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10767                PyErr_SetString(PyExc_OverflowError,
10768                                "replace string is too long");
10769                goto error;
10770        }
10771        new_size = slen + n * (len2 - len1);
10772        if (new_size == 0) {
10773            u = unicode_new_empty();
10774            goto done;
10775        }
10776        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10777            PyErr_SetString(PyExc_OverflowError,
10778                            "replace string is too long");
10779            goto error;
10780        }
10781        u = PyUnicode_New(new_size, maxchar);
10782        if (!u)
10783            goto error;
10784        assert(PyUnicode_KIND(u) == rkind);
10785        res = PyUnicode_DATA(u);
10786        ires = i = 0;
10787        if (len1 > 0) {
10788            while (n-- > 0) {
10789                /* look for next match */
10790                j = anylib_find(rkind, self,
10791                                sbuf + rkind * i, slen-i,
10792                                str1, buf1, len1, i);
10793                if (j == -1)
10794                    break;
10795                else if (j > i) {
10796                    /* copy unchanged part [i:j] */
10797                    memcpy(res + rkind * ires,
10798                           sbuf + rkind * i,
10799                           rkind * (j-i));
10800                    ires += j - i;
10801                }
10802                /* copy substitution string */
10803                if (len2 > 0) {
10804                    memcpy(res + rkind * ires,
10805                           buf2,
10806                           rkind * len2);
10807                    ires += len2;
10808                }
10809                i = j + len1;
10810            }
10811            if (i < slen)
10812                /* copy tail [i:] */
10813                memcpy(res + rkind * ires,
10814                       sbuf + rkind * i,
10815                       rkind * (slen-i));
10816        }
10817        else {
10818            /* interleave */
10819            while (n > 0) {
10820                memcpy(res + rkind * ires,
10821                       buf2,
10822                       rkind * len2);
10823                ires += len2;
10824                if (--n <= 0)
10825                    break;
10826                memcpy(res + rkind * ires,
10827                       sbuf + rkind * i,
10828                       rkind);
10829                ires++;
10830                i++;
10831            }
10832            memcpy(res + rkind * ires,
10833                   sbuf + rkind * i,
10834                   rkind * (slen-i));
10835        }
10836    }
10837
10838    if (mayshrink) {
10839        unicode_adjust_maxchar(&u);
10840        if (u == NULL)
10841            goto error;
10842    }
10843
10844  done:
10845    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10846    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10847    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10848    if (srelease)
10849        PyMem_Free((void *)sbuf);
10850    if (release1)
10851        PyMem_Free((void *)buf1);
10852    if (release2)
10853        PyMem_Free((void *)buf2);
10854    assert(_PyUnicode_CheckConsistency(u, 1));
10855    return u;
10856
10857  nothing:
10858    /* nothing to replace; return original string (when possible) */
10859    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10860    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10861    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10862    if (srelease)
10863        PyMem_Free((void *)sbuf);
10864    if (release1)
10865        PyMem_Free((void *)buf1);
10866    if (release2)
10867        PyMem_Free((void *)buf2);
10868    return unicode_result_unchanged(self);
10869
10870  error:
10871    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10872    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10873    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10874    if (srelease)
10875        PyMem_Free((void *)sbuf);
10876    if (release1)
10877        PyMem_Free((void *)buf1);
10878    if (release2)
10879        PyMem_Free((void *)buf2);
10880    return NULL;
10881}
10882
10883/* --- Unicode Object Methods --------------------------------------------- */
10884
10885/*[clinic input]
10886str.title as unicode_title
10887
10888Return a version of the string where each word is titlecased.
10889
10890More specifically, words start with uppercased characters and all remaining
10891cased characters have lower case.
10892[clinic start generated code]*/
10893
10894static PyObject *
10895unicode_title_impl(PyObject *self)
10896/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10897{
10898    if (PyUnicode_READY(self) == -1)
10899        return NULL;
10900    return case_operation(self, do_title);
10901}
10902
10903/*[clinic input]
10904str.capitalize as unicode_capitalize
10905
10906Return a capitalized version of the string.
10907
10908More specifically, make the first character have upper case and the rest lower
10909case.
10910[clinic start generated code]*/
10911
10912static PyObject *
10913unicode_capitalize_impl(PyObject *self)
10914/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10915{
10916    if (PyUnicode_READY(self) == -1)
10917        return NULL;
10918    if (PyUnicode_GET_LENGTH(self) == 0)
10919        return unicode_result_unchanged(self);
10920    return case_operation(self, do_capitalize);
10921}
10922
10923/*[clinic input]
10924str.casefold as unicode_casefold
10925
10926Return a version of the string suitable for caseless comparisons.
10927[clinic start generated code]*/
10928
10929static PyObject *
10930unicode_casefold_impl(PyObject *self)
10931/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10932{
10933    if (PyUnicode_READY(self) == -1)
10934        return NULL;
10935    if (PyUnicode_IS_ASCII(self))
10936        return ascii_upper_or_lower(self, 1);
10937    return case_operation(self, do_casefold);
10938}
10939
10940
10941/* Argument converter. Accepts a single Unicode character. */
10942
10943static int
10944convert_uc(PyObject *obj, void *addr)
10945{
10946    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10947
10948    if (!PyUnicode_Check(obj)) {
10949        PyErr_Format(PyExc_TypeError,
10950                     "The fill character must be a unicode character, "
10951                     "not %.100s", Py_TYPE(obj)->tp_name);
10952        return 0;
10953    }
10954    if (PyUnicode_READY(obj) < 0)
10955        return 0;
10956    if (PyUnicode_GET_LENGTH(obj) != 1) {
10957        PyErr_SetString(PyExc_TypeError,
10958                        "The fill character must be exactly one character long");
10959        return 0;
10960    }
10961    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10962    return 1;
10963}
10964
10965/*[clinic input]
10966str.center as unicode_center
10967
10968    width: Py_ssize_t
10969    fillchar: Py_UCS4 = ' '
10970    /
10971
10972Return a centered string of length width.
10973
10974Padding is done using the specified fill character (default is a space).
10975[clinic start generated code]*/
10976
10977static PyObject *
10978unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10979/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10980{
10981    Py_ssize_t marg, left;
10982
10983    if (PyUnicode_READY(self) == -1)
10984        return NULL;
10985
10986    if (PyUnicode_GET_LENGTH(self) >= width)
10987        return unicode_result_unchanged(self);
10988
10989    marg = width - PyUnicode_GET_LENGTH(self);
10990    left = marg / 2 + (marg & width & 1);
10991
10992    return pad(self, left, marg - left, fillchar);
10993}
10994
10995/* This function assumes that str1 and str2 are readied by the caller. */
10996
10997static int
10998unicode_compare(PyObject *str1, PyObject *str2)
10999{
11000#define COMPARE(TYPE1, TYPE2) \
11001    do { \
11002        TYPE1* p1 = (TYPE1 *)data1; \
11003        TYPE2* p2 = (TYPE2 *)data2; \
11004        TYPE1* end = p1 + len; \
11005        Py_UCS4 c1, c2; \
11006        for (; p1 != end; p1++, p2++) { \
11007            c1 = *p1; \
11008            c2 = *p2; \
11009            if (c1 != c2) \
11010                return (c1 < c2) ? -1 : 1; \
11011        } \
11012    } \
11013    while (0)
11014
11015    int kind1, kind2;
11016    const void *data1, *data2;
11017    Py_ssize_t len1, len2, len;
11018
11019    kind1 = PyUnicode_KIND(str1);
11020    kind2 = PyUnicode_KIND(str2);
11021    data1 = PyUnicode_DATA(str1);
11022    data2 = PyUnicode_DATA(str2);
11023    len1 = PyUnicode_GET_LENGTH(str1);
11024    len2 = PyUnicode_GET_LENGTH(str2);
11025    len = Py_MIN(len1, len2);
11026
11027    switch(kind1) {
11028    case PyUnicode_1BYTE_KIND:
11029    {
11030        switch(kind2) {
11031        case PyUnicode_1BYTE_KIND:
11032        {
11033            int cmp = memcmp(data1, data2, len);
11034            /* normalize result of memcmp() into the range [-1; 1] */
11035            if (cmp < 0)
11036                return -1;
11037            if (cmp > 0)
11038                return 1;
11039            break;
11040        }
11041        case PyUnicode_2BYTE_KIND:
11042            COMPARE(Py_UCS1, Py_UCS2);
11043            break;
11044        case PyUnicode_4BYTE_KIND:
11045            COMPARE(Py_UCS1, Py_UCS4);
11046            break;
11047        default:
11048            Py_UNREACHABLE();
11049        }
11050        break;
11051    }
11052    case PyUnicode_2BYTE_KIND:
11053    {
11054        switch(kind2) {
11055        case PyUnicode_1BYTE_KIND:
11056            COMPARE(Py_UCS2, Py_UCS1);
11057            break;
11058        case PyUnicode_2BYTE_KIND:
11059        {
11060            COMPARE(Py_UCS2, Py_UCS2);
11061            break;
11062        }
11063        case PyUnicode_4BYTE_KIND:
11064            COMPARE(Py_UCS2, Py_UCS4);
11065            break;
11066        default:
11067            Py_UNREACHABLE();
11068        }
11069        break;
11070    }
11071    case PyUnicode_4BYTE_KIND:
11072    {
11073        switch(kind2) {
11074        case PyUnicode_1BYTE_KIND:
11075            COMPARE(Py_UCS4, Py_UCS1);
11076            break;
11077        case PyUnicode_2BYTE_KIND:
11078            COMPARE(Py_UCS4, Py_UCS2);
11079            break;
11080        case PyUnicode_4BYTE_KIND:
11081        {
11082#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11083            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11084            /* normalize result of wmemcmp() into the range [-1; 1] */
11085            if (cmp < 0)
11086                return -1;
11087            if (cmp > 0)
11088                return 1;
11089#else
11090            COMPARE(Py_UCS4, Py_UCS4);
11091#endif
11092            break;
11093        }
11094        default:
11095            Py_UNREACHABLE();
11096        }
11097        break;
11098    }
11099    default:
11100        Py_UNREACHABLE();
11101    }
11102
11103    if (len1 == len2)
11104        return 0;
11105    if (len1 < len2)
11106        return -1;
11107    else
11108        return 1;
11109
11110#undef COMPARE
11111}
11112
11113static int
11114unicode_compare_eq(PyObject *str1, PyObject *str2)
11115{
11116    int kind;
11117    const void *data1, *data2;
11118    Py_ssize_t len;
11119    int cmp;
11120
11121    len = PyUnicode_GET_LENGTH(str1);
11122    if (PyUnicode_GET_LENGTH(str2) != len)
11123        return 0;
11124    kind = PyUnicode_KIND(str1);
11125    if (PyUnicode_KIND(str2) != kind)
11126        return 0;
11127    data1 = PyUnicode_DATA(str1);
11128    data2 = PyUnicode_DATA(str2);
11129
11130    cmp = memcmp(data1, data2, len * kind);
11131    return (cmp == 0);
11132}
11133
11134int
11135_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11136{
11137    assert(PyUnicode_Check(str1));
11138    assert(PyUnicode_Check(str2));
11139    if (str1 == str2) {
11140        return 1;
11141    }
11142    if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
11143        return -1;
11144    }
11145    return unicode_compare_eq(str1, str2);
11146}
11147
11148
11149int
11150PyUnicode_Compare(PyObject *left, PyObject *right)
11151{
11152    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11153        if (PyUnicode_READY(left) == -1 ||
11154            PyUnicode_READY(right) == -1)
11155            return -1;
11156
11157        /* a string is equal to itself */
11158        if (left == right)
11159            return 0;
11160
11161        return unicode_compare(left, right);
11162    }
11163    PyErr_Format(PyExc_TypeError,
11164                 "Can't compare %.100s and %.100s",
11165                 Py_TYPE(left)->tp_name,
11166                 Py_TYPE(right)->tp_name);
11167    return -1;
11168}
11169
11170int
11171PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11172{
11173    Py_ssize_t i;
11174    int kind;
11175    Py_UCS4 chr;
11176    const unsigned char *ustr = (const unsigned char *)str;
11177
11178    assert(_PyUnicode_CHECK(uni));
11179    if (!PyUnicode_IS_READY(uni)) {
11180        const wchar_t *ws = _PyUnicode_WSTR(uni);
11181        /* Compare Unicode string and source character set string */
11182        for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11183            if (chr != ustr[i])
11184                return (chr < ustr[i]) ? -1 : 1;
11185        }
11186        /* This check keeps Python strings that end in '\0' from comparing equal
11187         to C strings identical up to that point. */
11188        if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11189            return 1; /* uni is longer */
11190        if (ustr[i])
11191            return -1; /* str is longer */
11192        return 0;
11193    }
11194    kind = PyUnicode_KIND(uni);
11195    if (kind == PyUnicode_1BYTE_KIND) {
11196        const void *data = PyUnicode_1BYTE_DATA(uni);
11197        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11198        size_t len, len2 = strlen(str);
11199        int cmp;
11200
11201        len = Py_MIN(len1, len2);
11202        cmp = memcmp(data, str, len);
11203        if (cmp != 0) {
11204            if (cmp < 0)
11205                return -1;
11206            else
11207                return 1;
11208        }
11209        if (len1 > len2)
11210            return 1; /* uni is longer */
11211        if (len1 < len2)
11212            return -1; /* str is longer */
11213        return 0;
11214    }
11215    else {
11216        const void *data = PyUnicode_DATA(uni);
11217        /* Compare Unicode string and source character set string */
11218        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11219            if (chr != (unsigned char)str[i])
11220                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11221        /* This check keeps Python strings that end in '\0' from comparing equal
11222         to C strings identical up to that point. */
11223        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11224            return 1; /* uni is longer */
11225        if (str[i])
11226            return -1; /* str is longer */
11227        return 0;
11228    }
11229}
11230
11231static int
11232non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11233{
11234    size_t i, len;
11235    const wchar_t *p;
11236    len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11237    if (strlen(str) != len)
11238        return 0;
11239    p = _PyUnicode_WSTR(unicode);
11240    assert(p);
11241    for (i = 0; i < len; i++) {
11242        unsigned char c = (unsigned char)str[i];
11243        if (c >= 128 || p[i] != (wchar_t)c)
11244            return 0;
11245    }
11246    return 1;
11247}
11248
11249int
11250_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11251{
11252    size_t len;
11253    assert(_PyUnicode_CHECK(unicode));
11254    assert(str);
11255#ifndef NDEBUG
11256    for (const char *p = str; *p; p++) {
11257        assert((unsigned char)*p < 128);
11258    }
11259#endif
11260    if (PyUnicode_READY(unicode) == -1) {
11261        /* Memory error or bad data */
11262        PyErr_Clear();
11263        return non_ready_unicode_equal_to_ascii_string(unicode, str);
11264    }
11265    if (!PyUnicode_IS_ASCII(unicode))
11266        return 0;
11267    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11268    return strlen(str) == len &&
11269           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11270}
11271
11272int
11273_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11274{
11275    PyObject *right_uni;
11276
11277    assert(_PyUnicode_CHECK(left));
11278    assert(right->string);
11279#ifndef NDEBUG
11280    for (const char *p = right->string; *p; p++) {
11281        assert((unsigned char)*p < 128);
11282    }
11283#endif
11284
11285    if (PyUnicode_READY(left) == -1) {
11286        /* memory error or bad data */
11287        PyErr_Clear();
11288        return non_ready_unicode_equal_to_ascii_string(left, right->string);
11289    }
11290
11291    if (!PyUnicode_IS_ASCII(left))
11292        return 0;
11293
11294    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11295    if (right_uni == NULL) {
11296        /* memory error or bad data */
11297        PyErr_Clear();
11298        return _PyUnicode_EqualToASCIIString(left, right->string);
11299    }
11300
11301    if (left == right_uni)
11302        return 1;
11303
11304    if (PyUnicode_CHECK_INTERNED(left))
11305        return 0;
11306
11307    assert(_PyUnicode_HASH(right_uni) != -1);
11308    Py_hash_t hash = _PyUnicode_HASH(left);
11309    if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11310        return 0;
11311    }
11312
11313    return unicode_compare_eq(left, right_uni);
11314}
11315
11316PyObject *
11317PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11318{
11319    int result;
11320
11321    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11322        Py_RETURN_NOTIMPLEMENTED;
11323
11324    if (PyUnicode_READY(left) == -1 ||
11325        PyUnicode_READY(right) == -1)
11326        return NULL;
11327
11328    if (left == right) {
11329        switch (op) {
11330        case Py_EQ:
11331        case Py_LE:
11332        case Py_GE:
11333            /* a string is equal to itself */
11334            Py_RETURN_TRUE;
11335        case Py_NE:
11336        case Py_LT:
11337        case Py_GT:
11338            Py_RETURN_FALSE;
11339        default:
11340            PyErr_BadArgument();
11341            return NULL;
11342        }
11343    }
11344    else if (op == Py_EQ || op == Py_NE) {
11345        result = unicode_compare_eq(left, right);
11346        result ^= (op == Py_NE);
11347        return PyBool_FromLong(result);
11348    }
11349    else {
11350        result = unicode_compare(left, right);
11351        Py_RETURN_RICHCOMPARE(result, 0, op);
11352    }
11353}
11354
11355int
11356_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11357{
11358    return unicode_eq(aa, bb);
11359}
11360
11361int
11362PyUnicode_Contains(PyObject *str, PyObject *substr)
11363{
11364    int kind1, kind2;
11365    const void *buf1, *buf2;
11366    Py_ssize_t len1, len2;
11367    int result;
11368
11369    if (!PyUnicode_Check(substr)) {
11370        PyErr_Format(PyExc_TypeError,
11371                     "'in <string>' requires string as left operand, not %.100s",
11372                     Py_TYPE(substr)->tp_name);
11373        return -1;
11374    }
11375    if (PyUnicode_READY(substr) == -1)
11376        return -1;
11377    if (ensure_unicode(str) < 0)
11378        return -1;
11379
11380    kind1 = PyUnicode_KIND(str);
11381    kind2 = PyUnicode_KIND(substr);
11382    if (kind1 < kind2)
11383        return 0;
11384    len1 = PyUnicode_GET_LENGTH(str);
11385    len2 = PyUnicode_GET_LENGTH(substr);
11386    if (len1 < len2)
11387        return 0;
11388    buf1 = PyUnicode_DATA(str);
11389    buf2 = PyUnicode_DATA(substr);
11390    if (len2 == 1) {
11391        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11392        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11393        return result;
11394    }
11395    if (kind2 != kind1) {
11396        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11397        if (!buf2)
11398            return -1;
11399    }
11400
11401    switch (kind1) {
11402    case PyUnicode_1BYTE_KIND:
11403        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11404        break;
11405    case PyUnicode_2BYTE_KIND:
11406        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11407        break;
11408    case PyUnicode_4BYTE_KIND:
11409        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11410        break;
11411    default:
11412        Py_UNREACHABLE();
11413    }
11414
11415    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11416    if (kind2 != kind1)
11417        PyMem_Free((void *)buf2);
11418
11419    return result;
11420}
11421
11422/* Concat to string or Unicode object giving a new Unicode object. */
11423
11424PyObject *
11425PyUnicode_Concat(PyObject *left, PyObject *right)
11426{
11427    PyObject *result;
11428    Py_UCS4 maxchar, maxchar2;
11429    Py_ssize_t left_len, right_len, new_len;
11430
11431    if (ensure_unicode(left) < 0)
11432        return NULL;
11433
11434    if (!PyUnicode_Check(right)) {
11435        PyErr_Format(PyExc_TypeError,
11436                     "can only concatenate str (not \"%.200s\") to str",
11437                     Py_TYPE(right)->tp_name);
11438        return NULL;
11439    }
11440    if (PyUnicode_READY(right) < 0)
11441        return NULL;
11442
11443    /* Shortcuts */
11444    PyObject *empty = unicode_get_empty();  // Borrowed reference
11445    if (left == empty) {
11446        return PyUnicode_FromObject(right);
11447    }
11448    if (right == empty) {
11449        return PyUnicode_FromObject(left);
11450    }
11451
11452    left_len = PyUnicode_GET_LENGTH(left);
11453    right_len = PyUnicode_GET_LENGTH(right);
11454    if (left_len > PY_SSIZE_T_MAX - right_len) {
11455        PyErr_SetString(PyExc_OverflowError,
11456                        "strings are too large to concat");
11457        return NULL;
11458    }
11459    new_len = left_len + right_len;
11460
11461    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11462    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11463    maxchar = Py_MAX(maxchar, maxchar2);
11464
11465    /* Concat the two Unicode strings */
11466    result = PyUnicode_New(new_len, maxchar);
11467    if (result == NULL)
11468        return NULL;
11469    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11470    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11471    assert(_PyUnicode_CheckConsistency(result, 1));
11472    return result;
11473}
11474
11475void
11476PyUnicode_Append(PyObject **p_left, PyObject *right)
11477{
11478    PyObject *left, *res;
11479    Py_UCS4 maxchar, maxchar2;
11480    Py_ssize_t left_len, right_len, new_len;
11481
11482    if (p_left == NULL) {
11483        if (!PyErr_Occurred())
11484            PyErr_BadInternalCall();
11485        return;
11486    }
11487    left = *p_left;
11488    if (right == NULL || left == NULL
11489        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11490        if (!PyErr_Occurred())
11491            PyErr_BadInternalCall();
11492        goto error;
11493    }
11494
11495    if (PyUnicode_READY(left) == -1)
11496        goto error;
11497    if (PyUnicode_READY(right) == -1)
11498        goto error;
11499
11500    /* Shortcuts */
11501    PyObject *empty = unicode_get_empty();  // Borrowed reference
11502    if (left == empty) {
11503        Py_DECREF(left);
11504        Py_INCREF(right);
11505        *p_left = right;
11506        return;
11507    }
11508    if (right == empty) {
11509        return;
11510    }
11511
11512    left_len = PyUnicode_GET_LENGTH(left);
11513    right_len = PyUnicode_GET_LENGTH(right);
11514    if (left_len > PY_SSIZE_T_MAX - right_len) {
11515        PyErr_SetString(PyExc_OverflowError,
11516                        "strings are too large to concat");
11517        goto error;
11518    }
11519    new_len = left_len + right_len;
11520
11521    if (unicode_modifiable(left)
11522        && PyUnicode_CheckExact(right)
11523        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11524        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11525           to change the structure size, but characters are stored just after
11526           the structure, and so it requires to move all characters which is
11527           not so different than duplicating the string. */
11528        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11529    {
11530        /* append inplace */
11531        if (unicode_resize(p_left, new_len) != 0)
11532            goto error;
11533
11534        /* copy 'right' into the newly allocated area of 'left' */
11535        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11536    }
11537    else {
11538        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11539        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11540        maxchar = Py_MAX(maxchar, maxchar2);
11541
11542        /* Concat the two Unicode strings */
11543        res = PyUnicode_New(new_len, maxchar);
11544        if (res == NULL)
11545            goto error;
11546        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11547        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11548        Py_DECREF(left);
11549        *p_left = res;
11550    }
11551    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11552    return;
11553
11554error:
11555    Py_CLEAR(*p_left);
11556}
11557
11558void
11559PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11560{
11561    PyUnicode_Append(pleft, right);
11562    Py_XDECREF(right);
11563}
11564
11565/*
11566Wraps stringlib_parse_args_finds() and additionally ensures that the
11567first argument is a unicode object.
11568*/
11569
11570static inline int
11571parse_args_finds_unicode(const char * function_name, PyObject *args,
11572                         PyObject **substring,
11573                         Py_ssize_t *start, Py_ssize_t *end)
11574{
11575    if(stringlib_parse_args_finds(function_name, args, substring,
11576                                  start, end)) {
11577        if (ensure_unicode(*substring) < 0)
11578            return 0;
11579        return 1;
11580    }
11581    return 0;
11582}
11583
11584PyDoc_STRVAR(count__doc__,
11585             "S.count(sub[, start[, end]]) -> int\n\
11586\n\
11587Return the number of non-overlapping occurrences of substring sub in\n\
11588string S[start:end].  Optional arguments start and end are\n\
11589interpreted as in slice notation.");
11590
11591static PyObject *
11592unicode_count(PyObject *self, PyObject *args)
11593{
11594    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11595    Py_ssize_t start = 0;
11596    Py_ssize_t end = PY_SSIZE_T_MAX;
11597    PyObject *result;
11598    int kind1, kind2;
11599    const void *buf1, *buf2;
11600    Py_ssize_t len1, len2, iresult;
11601
11602    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11603        return NULL;
11604
11605    kind1 = PyUnicode_KIND(self);
11606    kind2 = PyUnicode_KIND(substring);
11607    if (kind1 < kind2)
11608        return PyLong_FromLong(0);
11609
11610    len1 = PyUnicode_GET_LENGTH(self);
11611    len2 = PyUnicode_GET_LENGTH(substring);
11612    ADJUST_INDICES(start, end, len1);
11613    if (end - start < len2)
11614        return PyLong_FromLong(0);
11615
11616    buf1 = PyUnicode_DATA(self);
11617    buf2 = PyUnicode_DATA(substring);
11618    if (kind2 != kind1) {
11619        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11620        if (!buf2)
11621            return NULL;
11622    }
11623    switch (kind1) {
11624    case PyUnicode_1BYTE_KIND:
11625        iresult = ucs1lib_count(
11626            ((const Py_UCS1*)buf1) + start, end - start,
11627            buf2, len2, PY_SSIZE_T_MAX
11628            );
11629        break;
11630    case PyUnicode_2BYTE_KIND:
11631        iresult = ucs2lib_count(
11632            ((const Py_UCS2*)buf1) + start, end - start,
11633            buf2, len2, PY_SSIZE_T_MAX
11634            );
11635        break;
11636    case PyUnicode_4BYTE_KIND:
11637        iresult = ucs4lib_count(
11638            ((const Py_UCS4*)buf1) + start, end - start,
11639            buf2, len2, PY_SSIZE_T_MAX
11640            );
11641        break;
11642    default:
11643        Py_UNREACHABLE();
11644    }
11645
11646    result = PyLong_FromSsize_t(iresult);
11647
11648    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11649    if (kind2 != kind1)
11650        PyMem_Free((void *)buf2);
11651
11652    return result;
11653}
11654
11655/*[clinic input]
11656str.encode as unicode_encode
11657
11658    encoding: str(c_default="NULL") = 'utf-8'
11659        The encoding in which to encode the string.
11660    errors: str(c_default="NULL") = 'strict'
11661        The error handling scheme to use for encoding errors.
11662        The default is 'strict' meaning that encoding errors raise a
11663        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11664        'xmlcharrefreplace' as well as any other name registered with
11665        codecs.register_error that can handle UnicodeEncodeErrors.
11666
11667Encode the string using the codec registered for encoding.
11668[clinic start generated code]*/
11669
11670static PyObject *
11671unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11672/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11673{
11674    return PyUnicode_AsEncodedString(self, encoding, errors);
11675}
11676
11677/*[clinic input]
11678str.expandtabs as unicode_expandtabs
11679
11680    tabsize: int = 8
11681
11682Return a copy where all tab characters are expanded using spaces.
11683
11684If tabsize is not given, a tab size of 8 characters is assumed.
11685[clinic start generated code]*/
11686
11687static PyObject *
11688unicode_expandtabs_impl(PyObject *self, int tabsize)
11689/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11690{
11691    Py_ssize_t i, j, line_pos, src_len, incr;
11692    Py_UCS4 ch;
11693    PyObject *u;
11694    const void *src_data;
11695    void *dest_data;
11696    int kind;
11697    int found;
11698
11699    if (PyUnicode_READY(self) == -1)
11700        return NULL;
11701
11702    /* First pass: determine size of output string */
11703    src_len = PyUnicode_GET_LENGTH(self);
11704    i = j = line_pos = 0;
11705    kind = PyUnicode_KIND(self);
11706    src_data = PyUnicode_DATA(self);
11707    found = 0;
11708    for (; i < src_len; i++) {
11709        ch = PyUnicode_READ(kind, src_data, i);
11710        if (ch == '\t') {
11711            found = 1;
11712            if (tabsize > 0) {
11713                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11714                if (j > PY_SSIZE_T_MAX - incr)
11715                    goto overflow;
11716                line_pos += incr;
11717                j += incr;
11718            }
11719        }
11720        else {
11721            if (j > PY_SSIZE_T_MAX - 1)
11722                goto overflow;
11723            line_pos++;
11724            j++;
11725            if (ch == '\n' || ch == '\r')
11726                line_pos = 0;
11727        }
11728    }
11729    if (!found)
11730        return unicode_result_unchanged(self);
11731
11732    /* Second pass: create output string and fill it */
11733    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11734    if (!u)
11735        return NULL;
11736    dest_data = PyUnicode_DATA(u);
11737
11738    i = j = line_pos = 0;
11739
11740    for (; i < src_len; i++) {
11741        ch = PyUnicode_READ(kind, src_data, i);
11742        if (ch == '\t') {
11743            if (tabsize > 0) {
11744                incr = tabsize - (line_pos % tabsize);
11745                line_pos += incr;
11746                unicode_fill(kind, dest_data, ' ', j, incr);
11747                j += incr;
11748            }
11749        }
11750        else {
11751            line_pos++;
11752            PyUnicode_WRITE(kind, dest_data, j, ch);
11753            j++;
11754            if (ch == '\n' || ch == '\r')
11755                line_pos = 0;
11756        }
11757    }
11758    assert (j == PyUnicode_GET_LENGTH(u));
11759    return unicode_result(u);
11760
11761  overflow:
11762    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11763    return NULL;
11764}
11765
11766PyDoc_STRVAR(find__doc__,
11767             "S.find(sub[, start[, end]]) -> int\n\
11768\n\
11769Return the lowest index in S where substring sub is found,\n\
11770such that sub is contained within S[start:end].  Optional\n\
11771arguments start and end are interpreted as in slice notation.\n\
11772\n\
11773Return -1 on failure.");
11774
11775static PyObject *
11776unicode_find(PyObject *self, PyObject *args)
11777{
11778    /* initialize variables to prevent gcc warning */
11779    PyObject *substring = NULL;
11780    Py_ssize_t start = 0;
11781    Py_ssize_t end = 0;
11782    Py_ssize_t result;
11783
11784    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11785        return NULL;
11786
11787    if (PyUnicode_READY(self) == -1)
11788        return NULL;
11789
11790    result = any_find_slice(self, substring, start, end, 1);
11791
11792    if (result == -2)
11793        return NULL;
11794
11795    return PyLong_FromSsize_t(result);
11796}
11797
11798static PyObject *
11799unicode_getitem(PyObject *self, Py_ssize_t index)
11800{
11801    const void *data;
11802    enum PyUnicode_Kind kind;
11803    Py_UCS4 ch;
11804
11805    if (!PyUnicode_Check(self)) {
11806        PyErr_BadArgument();
11807        return NULL;
11808    }
11809    if (PyUnicode_READY(self) == -1) {
11810        return NULL;
11811    }
11812    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11813        PyErr_SetString(PyExc_IndexError, "string index out of range");
11814        return NULL;
11815    }
11816    kind = PyUnicode_KIND(self);
11817    data = PyUnicode_DATA(self);
11818    ch = PyUnicode_READ(kind, data, index);
11819    return unicode_char(ch);
11820}
11821
11822/* Believe it or not, this produces the same value for ASCII strings
11823   as bytes_hash(). */
11824static Py_hash_t
11825unicode_hash(PyObject *self)
11826{
11827    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11828
11829#ifdef Py_DEBUG
11830    assert(_Py_HashSecret_Initialized);
11831#endif
11832    if (_PyUnicode_HASH(self) != -1)
11833        return _PyUnicode_HASH(self);
11834    if (PyUnicode_READY(self) == -1)
11835        return -1;
11836
11837    x = _Py_HashBytes(PyUnicode_DATA(self),
11838                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11839    _PyUnicode_HASH(self) = x;
11840    return x;
11841}
11842
11843PyDoc_STRVAR(index__doc__,
11844             "S.index(sub[, start[, end]]) -> int\n\
11845\n\
11846Return the lowest index in S where substring sub is found,\n\
11847such that sub is contained within S[start:end].  Optional\n\
11848arguments start and end are interpreted as in slice notation.\n\
11849\n\
11850Raises ValueError when the substring is not found.");
11851
11852static PyObject *
11853unicode_index(PyObject *self, PyObject *args)
11854{
11855    /* initialize variables to prevent gcc warning */
11856    Py_ssize_t result;
11857    PyObject *substring = NULL;
11858    Py_ssize_t start = 0;
11859    Py_ssize_t end = 0;
11860
11861    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11862        return NULL;
11863
11864    if (PyUnicode_READY(self) == -1)
11865        return NULL;
11866
11867    result = any_find_slice(self, substring, start, end, 1);
11868
11869    if (result == -2)
11870        return NULL;
11871
11872    if (result < 0) {
11873        PyErr_SetString(PyExc_ValueError, "substring not found");
11874        return NULL;
11875    }
11876
11877    return PyLong_FromSsize_t(result);
11878}
11879
11880/*[clinic input]
11881str.isascii as unicode_isascii
11882
11883Return True if all characters in the string are ASCII, False otherwise.
11884
11885ASCII characters have code points in the range U+0000-U+007F.
11886Empty string is ASCII too.
11887[clinic start generated code]*/
11888
11889static PyObject *
11890unicode_isascii_impl(PyObject *self)
11891/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11892{
11893    if (PyUnicode_READY(self) == -1) {
11894        return NULL;
11895    }
11896    return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11897}
11898
11899/*[clinic input]
11900str.islower as unicode_islower
11901
11902Return True if the string is a lowercase string, False otherwise.
11903
11904A string is lowercase if all cased characters in the string are lowercase and
11905there is at least one cased character in the string.
11906[clinic start generated code]*/
11907
11908static PyObject *
11909unicode_islower_impl(PyObject *self)
11910/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11911{
11912    Py_ssize_t i, length;
11913    int kind;
11914    const void *data;
11915    int cased;
11916
11917    if (PyUnicode_READY(self) == -1)
11918        return NULL;
11919    length = PyUnicode_GET_LENGTH(self);
11920    kind = PyUnicode_KIND(self);
11921    data = PyUnicode_DATA(self);
11922
11923    /* Shortcut for single character strings */
11924    if (length == 1)
11925        return PyBool_FromLong(
11926            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11927
11928    /* Special case for empty strings */
11929    if (length == 0)
11930        Py_RETURN_FALSE;
11931
11932    cased = 0;
11933    for (i = 0; i < length; i++) {
11934        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11935
11936        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11937            Py_RETURN_FALSE;
11938        else if (!cased && Py_UNICODE_ISLOWER(ch))
11939            cased = 1;
11940    }
11941    return PyBool_FromLong(cased);
11942}
11943
11944/*[clinic input]
11945str.isupper as unicode_isupper
11946
11947Return True if the string is an uppercase string, False otherwise.
11948
11949A string is uppercase if all cased characters in the string are uppercase and
11950there is at least one cased character in the string.
11951[clinic start generated code]*/
11952
11953static PyObject *
11954unicode_isupper_impl(PyObject *self)
11955/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11956{
11957    Py_ssize_t i, length;
11958    int kind;
11959    const void *data;
11960    int cased;
11961
11962    if (PyUnicode_READY(self) == -1)
11963        return NULL;
11964    length = PyUnicode_GET_LENGTH(self);
11965    kind = PyUnicode_KIND(self);
11966    data = PyUnicode_DATA(self);
11967
11968    /* Shortcut for single character strings */
11969    if (length == 1)
11970        return PyBool_FromLong(
11971            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11972
11973    /* Special case for empty strings */
11974    if (length == 0)
11975        Py_RETURN_FALSE;
11976
11977    cased = 0;
11978    for (i = 0; i < length; i++) {
11979        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11980
11981        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11982            Py_RETURN_FALSE;
11983        else if (!cased && Py_UNICODE_ISUPPER(ch))
11984            cased = 1;
11985    }
11986    return PyBool_FromLong(cased);
11987}
11988
11989/*[clinic input]
11990str.istitle as unicode_istitle
11991
11992Return True if the string is a title-cased string, False otherwise.
11993
11994In a title-cased string, upper- and title-case characters may only
11995follow uncased characters and lowercase characters only cased ones.
11996[clinic start generated code]*/
11997
11998static PyObject *
11999unicode_istitle_impl(PyObject *self)
12000/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12001{
12002    Py_ssize_t i, length;
12003    int kind;
12004    const void *data;
12005    int cased, previous_is_cased;
12006
12007    if (PyUnicode_READY(self) == -1)
12008        return NULL;
12009    length = PyUnicode_GET_LENGTH(self);
12010    kind = PyUnicode_KIND(self);
12011    data = PyUnicode_DATA(self);
12012
12013    /* Shortcut for single character strings */
12014    if (length == 1) {
12015        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12016        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12017                               (Py_UNICODE_ISUPPER(ch) != 0));
12018    }
12019
12020    /* Special case for empty strings */
12021    if (length == 0)
12022        Py_RETURN_FALSE;
12023
12024    cased = 0;
12025    previous_is_cased = 0;
12026    for (i = 0; i < length; i++) {
12027        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12028
12029        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12030            if (previous_is_cased)
12031                Py_RETURN_FALSE;
12032            previous_is_cased = 1;
12033            cased = 1;
12034        }
12035        else if (Py_UNICODE_ISLOWER(ch)) {
12036            if (!previous_is_cased)
12037                Py_RETURN_FALSE;
12038            previous_is_cased = 1;
12039            cased = 1;
12040        }
12041        else
12042            previous_is_cased = 0;
12043    }
12044    return PyBool_FromLong(cased);
12045}
12046
12047/*[clinic input]
12048str.isspace as unicode_isspace
12049
12050Return True if the string is a whitespace string, False otherwise.
12051
12052A string is whitespace if all characters in the string are whitespace and there
12053is at least one character in the string.
12054[clinic start generated code]*/
12055
12056static PyObject *
12057unicode_isspace_impl(PyObject *self)
12058/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12059{
12060    Py_ssize_t i, length;
12061    int kind;
12062    const void *data;
12063
12064    if (PyUnicode_READY(self) == -1)
12065        return NULL;
12066    length = PyUnicode_GET_LENGTH(self);
12067    kind = PyUnicode_KIND(self);
12068    data = PyUnicode_DATA(self);
12069
12070    /* Shortcut for single character strings */
12071    if (length == 1)
12072        return PyBool_FromLong(
12073            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12074
12075    /* Special case for empty strings */
12076    if (length == 0)
12077        Py_RETURN_FALSE;
12078
12079    for (i = 0; i < length; i++) {
12080        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12081        if (!Py_UNICODE_ISSPACE(ch))
12082            Py_RETURN_FALSE;
12083    }
12084    Py_RETURN_TRUE;
12085}
12086
12087/*[clinic input]
12088str.isalpha as unicode_isalpha
12089
12090Return True if the string is an alphabetic string, False otherwise.
12091
12092A string is alphabetic if all characters in the string are alphabetic and there
12093is at least one character in the string.
12094[clinic start generated code]*/
12095
12096static PyObject *
12097unicode_isalpha_impl(PyObject *self)
12098/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12099{
12100    Py_ssize_t i, length;
12101    int kind;
12102    const void *data;
12103
12104    if (PyUnicode_READY(self) == -1)
12105        return NULL;
12106    length = PyUnicode_GET_LENGTH(self);
12107    kind = PyUnicode_KIND(self);
12108    data = PyUnicode_DATA(self);
12109
12110    /* Shortcut for single character strings */
12111    if (length == 1)
12112        return PyBool_FromLong(
12113            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12114
12115    /* Special case for empty strings */
12116    if (length == 0)
12117        Py_RETURN_FALSE;
12118
12119    for (i = 0; i < length; i++) {
12120        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12121            Py_RETURN_FALSE;
12122    }
12123    Py_RETURN_TRUE;
12124}
12125
12126/*[clinic input]
12127str.isalnum as unicode_isalnum
12128
12129Return True if the string is an alpha-numeric string, False otherwise.
12130
12131A string is alpha-numeric if all characters in the string are alpha-numeric and
12132there is at least one character in the string.
12133[clinic start generated code]*/
12134
12135static PyObject *
12136unicode_isalnum_impl(PyObject *self)
12137/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12138{
12139    int kind;
12140    const void *data;
12141    Py_ssize_t len, i;
12142
12143    if (PyUnicode_READY(self) == -1)
12144        return NULL;
12145
12146    kind = PyUnicode_KIND(self);
12147    data = PyUnicode_DATA(self);
12148    len = PyUnicode_GET_LENGTH(self);
12149
12150    /* Shortcut for single character strings */
12151    if (len == 1) {
12152        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12153        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12154    }
12155
12156    /* Special case for empty strings */
12157    if (len == 0)
12158        Py_RETURN_FALSE;
12159
12160    for (i = 0; i < len; i++) {
12161        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12162        if (!Py_UNICODE_ISALNUM(ch))
12163            Py_RETURN_FALSE;
12164    }
12165    Py_RETURN_TRUE;
12166}
12167
12168/*[clinic input]
12169str.isdecimal as unicode_isdecimal
12170
12171Return True if the string is a decimal string, False otherwise.
12172
12173A string is a decimal string if all characters in the string are decimal and
12174there is at least one character in the string.
12175[clinic start generated code]*/
12176
12177static PyObject *
12178unicode_isdecimal_impl(PyObject *self)
12179/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12180{
12181    Py_ssize_t i, length;
12182    int kind;
12183    const void *data;
12184
12185    if (PyUnicode_READY(self) == -1)
12186        return NULL;
12187    length = PyUnicode_GET_LENGTH(self);
12188    kind = PyUnicode_KIND(self);
12189    data = PyUnicode_DATA(self);
12190
12191    /* Shortcut for single character strings */
12192    if (length == 1)
12193        return PyBool_FromLong(
12194            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12195
12196    /* Special case for empty strings */
12197    if (length == 0)
12198        Py_RETURN_FALSE;
12199
12200    for (i = 0; i < length; i++) {
12201        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12202            Py_RETURN_FALSE;
12203    }
12204    Py_RETURN_TRUE;
12205}
12206
12207/*[clinic input]
12208str.isdigit as unicode_isdigit
12209
12210Return True if the string is a digit string, False otherwise.
12211
12212A string is a digit string if all characters in the string are digits and there
12213is at least one character in the string.
12214[clinic start generated code]*/
12215
12216static PyObject *
12217unicode_isdigit_impl(PyObject *self)
12218/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12219{
12220    Py_ssize_t i, length;
12221    int kind;
12222    const void *data;
12223
12224    if (PyUnicode_READY(self) == -1)
12225        return NULL;
12226    length = PyUnicode_GET_LENGTH(self);
12227    kind = PyUnicode_KIND(self);
12228    data = PyUnicode_DATA(self);
12229
12230    /* Shortcut for single character strings */
12231    if (length == 1) {
12232        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12233        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12234    }
12235
12236    /* Special case for empty strings */
12237    if (length == 0)
12238        Py_RETURN_FALSE;
12239
12240    for (i = 0; i < length; i++) {
12241        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12242            Py_RETURN_FALSE;
12243    }
12244    Py_RETURN_TRUE;
12245}
12246
12247/*[clinic input]
12248str.isnumeric as unicode_isnumeric
12249
12250Return True if the string is a numeric string, False otherwise.
12251
12252A string is numeric if all characters in the string are numeric and there is at
12253least one character in the string.
12254[clinic start generated code]*/
12255
12256static PyObject *
12257unicode_isnumeric_impl(PyObject *self)
12258/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12259{
12260    Py_ssize_t i, length;
12261    int kind;
12262    const void *data;
12263
12264    if (PyUnicode_READY(self) == -1)
12265        return NULL;
12266    length = PyUnicode_GET_LENGTH(self);
12267    kind = PyUnicode_KIND(self);
12268    data = PyUnicode_DATA(self);
12269
12270    /* Shortcut for single character strings */
12271    if (length == 1)
12272        return PyBool_FromLong(
12273            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12274
12275    /* Special case for empty strings */
12276    if (length == 0)
12277        Py_RETURN_FALSE;
12278
12279    for (i = 0; i < length; i++) {
12280        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12281            Py_RETURN_FALSE;
12282    }
12283    Py_RETURN_TRUE;
12284}
12285
12286Py_ssize_t
12287_PyUnicode_ScanIdentifier(PyObject *self)
12288{
12289    Py_ssize_t i;
12290    if (PyUnicode_READY(self) == -1)
12291        return -1;
12292
12293    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12294    if (len == 0) {
12295        /* an empty string is not a valid identifier */
12296        return 0;
12297    }
12298
12299    int kind = PyUnicode_KIND(self);
12300    const void *data = PyUnicode_DATA(self);
12301    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12302    /* PEP 3131 says that the first character must be in
12303       XID_Start and subsequent characters in XID_Continue,
12304       and for the ASCII range, the 2.x rules apply (i.e
12305       start with letters and underscore, continue with
12306       letters, digits, underscore). However, given the current
12307       definition of XID_Start and XID_Continue, it is sufficient
12308       to check just for these, except that _ must be allowed
12309       as starting an identifier.  */
12310    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12311        return 0;
12312    }
12313
12314    for (i = 1; i < len; i++) {
12315        ch = PyUnicode_READ(kind, data, i);
12316        if (!_PyUnicode_IsXidContinue(ch)) {
12317            return i;
12318        }
12319    }
12320    return i;
12321}
12322
12323int
12324PyUnicode_IsIdentifier(PyObject *self)
12325{
12326    if (PyUnicode_IS_READY(self)) {
12327        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12328        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12329        /* an empty string is not a valid identifier */
12330        return len && i == len;
12331    }
12332    else {
12333_Py_COMP_DIAG_PUSH
12334_Py_COMP_DIAG_IGNORE_DEPR_DECLS
12335        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12336        if (len == 0) {
12337            /* an empty string is not a valid identifier */
12338            return 0;
12339        }
12340
12341        const wchar_t *wstr = _PyUnicode_WSTR(self);
12342        Py_UCS4 ch = wstr[i++];
12343#if SIZEOF_WCHAR_T == 2
12344        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12345            && i < len
12346            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12347        {
12348            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12349            i++;
12350        }
12351#endif
12352        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12353            return 0;
12354        }
12355
12356        while (i < len) {
12357            ch = wstr[i++];
12358#if SIZEOF_WCHAR_T == 2
12359            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12360                && i < len
12361                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12362            {
12363                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12364                i++;
12365            }
12366#endif
12367            if (!_PyUnicode_IsXidContinue(ch)) {
12368                return 0;
12369            }
12370        }
12371        return 1;
12372_Py_COMP_DIAG_POP
12373    }
12374}
12375
12376/*[clinic input]
12377str.isidentifier as unicode_isidentifier
12378
12379Return True if the string is a valid Python identifier, False otherwise.
12380
12381Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12382such as "def" or "class".
12383[clinic start generated code]*/
12384
12385static PyObject *
12386unicode_isidentifier_impl(PyObject *self)
12387/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12388{
12389    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12390}
12391
12392/*[clinic input]
12393str.isprintable as unicode_isprintable
12394
12395Return True if the string is printable, False otherwise.
12396
12397A string is printable if all of its characters are considered printable in
12398repr() or if it is empty.
12399[clinic start generated code]*/
12400
12401static PyObject *
12402unicode_isprintable_impl(PyObject *self)
12403/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12404{
12405    Py_ssize_t i, length;
12406    int kind;
12407    const void *data;
12408
12409    if (PyUnicode_READY(self) == -1)
12410        return NULL;
12411    length = PyUnicode_GET_LENGTH(self);
12412    kind = PyUnicode_KIND(self);
12413    data = PyUnicode_DATA(self);
12414
12415    /* Shortcut for single character strings */
12416    if (length == 1)
12417        return PyBool_FromLong(
12418            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12419
12420    for (i = 0; i < length; i++) {
12421        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12422            Py_RETURN_FALSE;
12423        }
12424    }
12425    Py_RETURN_TRUE;
12426}
12427
12428/*[clinic input]
12429str.join as unicode_join
12430
12431    iterable: object
12432    /
12433
12434Concatenate any number of strings.
12435
12436The string whose method is called is inserted in between each given string.
12437The result is returned as a new string.
12438
12439Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12440[clinic start generated code]*/
12441
12442static PyObject *
12443unicode_join(PyObject *self, PyObject *iterable)
12444/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12445{
12446    return PyUnicode_Join(self, iterable);
12447}
12448
12449static Py_ssize_t
12450unicode_length(PyObject *self)
12451{
12452    if (PyUnicode_READY(self) == -1)
12453        return -1;
12454    return PyUnicode_GET_LENGTH(self);
12455}
12456
12457/*[clinic input]
12458str.ljust as unicode_ljust
12459
12460    width: Py_ssize_t
12461    fillchar: Py_UCS4 = ' '
12462    /
12463
12464Return a left-justified string of length width.
12465
12466Padding is done using the specified fill character (default is a space).
12467[clinic start generated code]*/
12468
12469static PyObject *
12470unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12471/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12472{
12473    if (PyUnicode_READY(self) == -1)
12474        return NULL;
12475
12476    if (PyUnicode_GET_LENGTH(self) >= width)
12477        return unicode_result_unchanged(self);
12478
12479    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12480}
12481
12482/*[clinic input]
12483str.lower as unicode_lower
12484
12485Return a copy of the string converted to lowercase.
12486[clinic start generated code]*/
12487
12488static PyObject *
12489unicode_lower_impl(PyObject *self)
12490/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12491{
12492    if (PyUnicode_READY(self) == -1)
12493        return NULL;
12494    if (PyUnicode_IS_ASCII(self))
12495        return ascii_upper_or_lower(self, 1);
12496    return case_operation(self, do_lower);
12497}
12498
/* Strip modes.  The values double as indices into stripfuncnames[], so the
   two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, used when formatting error messages.
   Designated initializers tie each name to its mode explicitly, and the
   table is fully const because nothing ever writes to it. */
static const char * const stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name for strip mode i (one of the macros above). */
#define STRIPNAME(i) (stripfuncnames[i])
12507
12508/* externally visible for str.strip(unicode) */
12509PyObject *
12510_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12511{
12512    const void *data;
12513    int kind;
12514    Py_ssize_t i, j, len;
12515    BLOOM_MASK sepmask;
12516    Py_ssize_t seplen;
12517
12518    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12519        return NULL;
12520
12521    kind = PyUnicode_KIND(self);
12522    data = PyUnicode_DATA(self);
12523    len = PyUnicode_GET_LENGTH(self);
12524    seplen = PyUnicode_GET_LENGTH(sepobj);
12525    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12526                              PyUnicode_DATA(sepobj),
12527                              seplen);
12528
12529    i = 0;
12530    if (striptype != RIGHTSTRIP) {
12531        while (i < len) {
12532            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12533            if (!BLOOM(sepmask, ch))
12534                break;
12535            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12536                break;
12537            i++;
12538        }
12539    }
12540
12541    j = len;
12542    if (striptype != LEFTSTRIP) {
12543        j--;
12544        while (j >= i) {
12545            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12546            if (!BLOOM(sepmask, ch))
12547                break;
12548            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12549                break;
12550            j--;
12551        }
12552
12553        j++;
12554    }
12555
12556    return PyUnicode_Substring(self, i, j);
12557}
12558
12559PyObject*
12560PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12561{
12562    const unsigned char *data;
12563    int kind;
12564    Py_ssize_t length;
12565
12566    if (PyUnicode_READY(self) == -1)
12567        return NULL;
12568
12569    length = PyUnicode_GET_LENGTH(self);
12570    end = Py_MIN(end, length);
12571
12572    if (start == 0 && end == length)
12573        return unicode_result_unchanged(self);
12574
12575    if (start < 0 || end < 0) {
12576        PyErr_SetString(PyExc_IndexError, "string index out of range");
12577        return NULL;
12578    }
12579    if (start >= length || end < start)
12580        _Py_RETURN_UNICODE_EMPTY();
12581
12582    length = end - start;
12583    if (PyUnicode_IS_ASCII(self)) {
12584        data = PyUnicode_1BYTE_DATA(self);
12585        return _PyUnicode_FromASCII((const char*)(data + start), length);
12586    }
12587    else {
12588        kind = PyUnicode_KIND(self);
12589        data = PyUnicode_1BYTE_DATA(self);
12590        return PyUnicode_FromKindAndData(kind,
12591                                         data + kind * start,
12592                                         length);
12593    }
12594}
12595
12596static PyObject *
12597do_strip(PyObject *self, int striptype)
12598{
12599    Py_ssize_t len, i, j;
12600
12601    if (PyUnicode_READY(self) == -1)
12602        return NULL;
12603
12604    len = PyUnicode_GET_LENGTH(self);
12605
12606    if (PyUnicode_IS_ASCII(self)) {
12607        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12608
12609        i = 0;
12610        if (striptype != RIGHTSTRIP) {
12611            while (i < len) {
12612                Py_UCS1 ch = data[i];
12613                if (!_Py_ascii_whitespace[ch])
12614                    break;
12615                i++;
12616            }
12617        }
12618
12619        j = len;
12620        if (striptype != LEFTSTRIP) {
12621            j--;
12622            while (j >= i) {
12623                Py_UCS1 ch = data[j];
12624                if (!_Py_ascii_whitespace[ch])
12625                    break;
12626                j--;
12627            }
12628            j++;
12629        }
12630    }
12631    else {
12632        int kind = PyUnicode_KIND(self);
12633        const void *data = PyUnicode_DATA(self);
12634
12635        i = 0;
12636        if (striptype != RIGHTSTRIP) {
12637            while (i < len) {
12638                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12639                if (!Py_UNICODE_ISSPACE(ch))
12640                    break;
12641                i++;
12642            }
12643        }
12644
12645        j = len;
12646        if (striptype != LEFTSTRIP) {
12647            j--;
12648            while (j >= i) {
12649                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12650                if (!Py_UNICODE_ISSPACE(ch))
12651                    break;
12652                j--;
12653            }
12654            j++;
12655        }
12656    }
12657
12658    return PyUnicode_Substring(self, i, j);
12659}
12660
12661
12662static PyObject *
12663do_argstrip(PyObject *self, int striptype, PyObject *sep)
12664{
12665    if (sep != Py_None) {
12666        if (PyUnicode_Check(sep))
12667            return _PyUnicode_XStrip(self, striptype, sep);
12668        else {
12669            PyErr_Format(PyExc_TypeError,
12670                         "%s arg must be None or str",
12671                         STRIPNAME(striptype));
12672            return NULL;
12673        }
12674    }
12675
12676    return do_strip(self, striptype);
12677}
12678
12679
12680/*[clinic input]
12681str.strip as unicode_strip
12682
12683    chars: object = None
12684    /
12685
12686Return a copy of the string with leading and trailing whitespace removed.
12687
12688If chars is given and not None, remove characters in chars instead.
12689[clinic start generated code]*/
12690
12691static PyObject *
12692unicode_strip_impl(PyObject *self, PyObject *chars)
12693/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12694{
12695    return do_argstrip(self, BOTHSTRIP, chars);
12696}
12697
12698
12699/*[clinic input]
12700str.lstrip as unicode_lstrip
12701
12702    chars: object = None
12703    /
12704
12705Return a copy of the string with leading whitespace removed.
12706
12707If chars is given and not None, remove characters in chars instead.
12708[clinic start generated code]*/
12709
12710static PyObject *
12711unicode_lstrip_impl(PyObject *self, PyObject *chars)
12712/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12713{
12714    return do_argstrip(self, LEFTSTRIP, chars);
12715}
12716
12717
12718/*[clinic input]
12719str.rstrip as unicode_rstrip
12720
12721    chars: object = None
12722    /
12723
12724Return a copy of the string with trailing whitespace removed.
12725
12726If chars is given and not None, remove characters in chars instead.
12727[clinic start generated code]*/
12728
12729static PyObject *
12730unicode_rstrip_impl(PyObject *self, PyObject *chars)
12731/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12732{
12733    return do_argstrip(self, RIGHTSTRIP, chars);
12734}
12735
12736
12737static PyObject*
12738unicode_repeat(PyObject *str, Py_ssize_t len)
12739{
12740    PyObject *u;
12741    Py_ssize_t nchars, n;
12742
12743    if (len < 1)
12744        _Py_RETURN_UNICODE_EMPTY();
12745
12746    /* no repeat, return original string */
12747    if (len == 1)
12748        return unicode_result_unchanged(str);
12749
12750    if (PyUnicode_READY(str) == -1)
12751        return NULL;
12752
12753    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12754        PyErr_SetString(PyExc_OverflowError,
12755                        "repeated string is too long");
12756        return NULL;
12757    }
12758    nchars = len * PyUnicode_GET_LENGTH(str);
12759
12760    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12761    if (!u)
12762        return NULL;
12763    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12764
12765    if (PyUnicode_GET_LENGTH(str) == 1) {
12766        int kind = PyUnicode_KIND(str);
12767        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12768        if (kind == PyUnicode_1BYTE_KIND) {
12769            void *to = PyUnicode_DATA(u);
12770            memset(to, (unsigned char)fill_char, len);
12771        }
12772        else if (kind == PyUnicode_2BYTE_KIND) {
12773            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12774            for (n = 0; n < len; ++n)
12775                ucs2[n] = fill_char;
12776        } else {
12777            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12778            assert(kind == PyUnicode_4BYTE_KIND);
12779            for (n = 0; n < len; ++n)
12780                ucs4[n] = fill_char;
12781        }
12782    }
12783    else {
12784        Py_ssize_t char_size = PyUnicode_KIND(str);
12785        char *to = (char *) PyUnicode_DATA(u);
12786        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12787            PyUnicode_GET_LENGTH(str) * char_size);
12788    }
12789
12790    assert(_PyUnicode_CheckConsistency(u, 1));
12791    return u;
12792}
12793
12794PyObject *
12795PyUnicode_Replace(PyObject *str,
12796                  PyObject *substr,
12797                  PyObject *replstr,
12798                  Py_ssize_t maxcount)
12799{
12800    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12801            ensure_unicode(replstr) < 0)
12802        return NULL;
12803    return replace(str, substr, replstr, maxcount);
12804}
12805
12806/*[clinic input]
12807str.replace as unicode_replace
12808
12809    old: unicode
12810    new: unicode
12811    count: Py_ssize_t = -1
12812        Maximum number of occurrences to replace.
12813        -1 (the default value) means replace all occurrences.
12814    /
12815
12816Return a copy with all occurrences of substring old replaced by new.
12817
12818If the optional argument count is given, only the first count occurrences are
12819replaced.
12820[clinic start generated code]*/
12821
12822static PyObject *
12823unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12824                     Py_ssize_t count)
12825/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12826{
12827    if (PyUnicode_READY(self) == -1)
12828        return NULL;
12829    return replace(self, old, new, count);
12830}
12831
12832/*[clinic input]
12833str.removeprefix as unicode_removeprefix
12834
12835    prefix: unicode
12836    /
12837
12838Return a str with the given prefix string removed if present.
12839
12840If the string starts with the prefix string, return string[len(prefix):].
12841Otherwise, return a copy of the original string.
12842[clinic start generated code]*/
12843
12844static PyObject *
12845unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12846/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12847{
12848    int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12849    if (match == -1) {
12850        return NULL;
12851    }
12852    if (match) {
12853        return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12854                                   PyUnicode_GET_LENGTH(self));
12855    }
12856    return unicode_result_unchanged(self);
12857}
12858
12859/*[clinic input]
12860str.removesuffix as unicode_removesuffix
12861
12862    suffix: unicode
12863    /
12864
12865Return a str with the given suffix string removed if present.
12866
12867If the string ends with the suffix string and that suffix is not empty,
12868return string[:-len(suffix)]. Otherwise, return a copy of the original
12869string.
12870[clinic start generated code]*/
12871
12872static PyObject *
12873unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12874/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12875{
12876    int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12877    if (match == -1) {
12878        return NULL;
12879    }
12880    if (match) {
12881        return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12882                                            - PyUnicode_GET_LENGTH(suffix));
12883    }
12884    return unicode_result_unchanged(self);
12885}
12886
12887static PyObject *
12888unicode_repr(PyObject *unicode)
12889{
12890    PyObject *repr;
12891    Py_ssize_t isize;
12892    Py_ssize_t osize, squote, dquote, i, o;
12893    Py_UCS4 max, quote;
12894    int ikind, okind, unchanged;
12895    const void *idata;
12896    void *odata;
12897
12898    if (PyUnicode_READY(unicode) == -1)
12899        return NULL;
12900
12901    isize = PyUnicode_GET_LENGTH(unicode);
12902    idata = PyUnicode_DATA(unicode);
12903
12904    /* Compute length of output, quote characters, and
12905       maximum character */
12906    osize = 0;
12907    max = 127;
12908    squote = dquote = 0;
12909    ikind = PyUnicode_KIND(unicode);
12910    for (i = 0; i < isize; i++) {
12911        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12912        Py_ssize_t incr = 1;
12913        switch (ch) {
12914        case '\'': squote++; break;
12915        case '"':  dquote++; break;
12916        case '\\': case '\t': case '\r': case '\n':
12917            incr = 2;
12918            break;
12919        default:
12920            /* Fast-path ASCII */
12921            if (ch < ' ' || ch == 0x7f)
12922                incr = 4; /* \xHH */
12923            else if (ch < 0x7f)
12924                ;
12925            else if (Py_UNICODE_ISPRINTABLE(ch))
12926                max = ch > max ? ch : max;
12927            else if (ch < 0x100)
12928                incr = 4; /* \xHH */
12929            else if (ch < 0x10000)
12930                incr = 6; /* \uHHHH */
12931            else
12932                incr = 10; /* \uHHHHHHHH */
12933        }
12934        if (osize > PY_SSIZE_T_MAX - incr) {
12935            PyErr_SetString(PyExc_OverflowError,
12936                            "string is too long to generate repr");
12937            return NULL;
12938        }
12939        osize += incr;
12940    }
12941
12942    quote = '\'';
12943    unchanged = (osize == isize);
12944    if (squote) {
12945        unchanged = 0;
12946        if (dquote)
12947            /* Both squote and dquote present. Use squote,
12948               and escape them */
12949            osize += squote;
12950        else
12951            quote = '"';
12952    }
12953    osize += 2;   /* quotes */
12954
12955    repr = PyUnicode_New(osize, max);
12956    if (repr == NULL)
12957        return NULL;
12958    okind = PyUnicode_KIND(repr);
12959    odata = PyUnicode_DATA(repr);
12960
12961    PyUnicode_WRITE(okind, odata, 0, quote);
12962    PyUnicode_WRITE(okind, odata, osize-1, quote);
12963    if (unchanged) {
12964        _PyUnicode_FastCopyCharacters(repr, 1,
12965                                      unicode, 0,
12966                                      isize);
12967    }
12968    else {
12969        for (i = 0, o = 1; i < isize; i++) {
12970            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12971
12972            /* Escape quotes and backslashes */
12973            if ((ch == quote) || (ch == '\\')) {
12974                PyUnicode_WRITE(okind, odata, o++, '\\');
12975                PyUnicode_WRITE(okind, odata, o++, ch);
12976                continue;
12977            }
12978
12979            /* Map special whitespace to '\t', \n', '\r' */
12980            if (ch == '\t') {
12981                PyUnicode_WRITE(okind, odata, o++, '\\');
12982                PyUnicode_WRITE(okind, odata, o++, 't');
12983            }
12984            else if (ch == '\n') {
12985                PyUnicode_WRITE(okind, odata, o++, '\\');
12986                PyUnicode_WRITE(okind, odata, o++, 'n');
12987            }
12988            else if (ch == '\r') {
12989                PyUnicode_WRITE(okind, odata, o++, '\\');
12990                PyUnicode_WRITE(okind, odata, o++, 'r');
12991            }
12992
12993            /* Map non-printable US ASCII to '\xhh' */
12994            else if (ch < ' ' || ch == 0x7F) {
12995                PyUnicode_WRITE(okind, odata, o++, '\\');
12996                PyUnicode_WRITE(okind, odata, o++, 'x');
12997                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12998                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12999            }
13000
13001            /* Copy ASCII characters as-is */
13002            else if (ch < 0x7F) {
13003                PyUnicode_WRITE(okind, odata, o++, ch);
13004            }
13005
13006            /* Non-ASCII characters */
13007            else {
13008                /* Map Unicode whitespace and control characters
13009                   (categories Z* and C* except ASCII space)
13010                */
13011                if (!Py_UNICODE_ISPRINTABLE(ch)) {
13012                    PyUnicode_WRITE(okind, odata, o++, '\\');
13013                    /* Map 8-bit characters to '\xhh' */
13014                    if (ch <= 0xff) {
13015                        PyUnicode_WRITE(okind, odata, o++, 'x');
13016                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13017                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13018                    }
13019                    /* Map 16-bit characters to '\uxxxx' */
13020                    else if (ch <= 0xffff) {
13021                        PyUnicode_WRITE(okind, odata, o++, 'u');
13022                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13023                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13024                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13025                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13026                    }
13027                    /* Map 21-bit characters to '\U00xxxxxx' */
13028                    else {
13029                        PyUnicode_WRITE(okind, odata, o++, 'U');
13030                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13031                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13032                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13033                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13034                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13035                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13036                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13037                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13038                    }
13039                }
13040                /* Copy characters as-is */
13041                else {
13042                    PyUnicode_WRITE(okind, odata, o++, ch);
13043                }
13044            }
13045        }
13046    }
13047    /* Closing quote already added at the beginning */
13048    assert(_PyUnicode_CheckConsistency(repr, 1));
13049    return repr;
13050}
13051
13052PyDoc_STRVAR(rfind__doc__,
13053             "S.rfind(sub[, start[, end]]) -> int\n\
13054\n\
13055Return the highest index in S where substring sub is found,\n\
13056such that sub is contained within S[start:end].  Optional\n\
13057arguments start and end are interpreted as in slice notation.\n\
13058\n\
13059Return -1 on failure.");
13060
13061static PyObject *
13062unicode_rfind(PyObject *self, PyObject *args)
13063{
13064    /* initialize variables to prevent gcc warning */
13065    PyObject *substring = NULL;
13066    Py_ssize_t start = 0;
13067    Py_ssize_t end = 0;
13068    Py_ssize_t result;
13069
13070    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13071        return NULL;
13072
13073    if (PyUnicode_READY(self) == -1)
13074        return NULL;
13075
13076    result = any_find_slice(self, substring, start, end, -1);
13077
13078    if (result == -2)
13079        return NULL;
13080
13081    return PyLong_FromSsize_t(result);
13082}
13083
13084PyDoc_STRVAR(rindex__doc__,
13085             "S.rindex(sub[, start[, end]]) -> int\n\
13086\n\
13087Return the highest index in S where substring sub is found,\n\
13088such that sub is contained within S[start:end].  Optional\n\
13089arguments start and end are interpreted as in slice notation.\n\
13090\n\
13091Raises ValueError when the substring is not found.");
13092
13093static PyObject *
13094unicode_rindex(PyObject *self, PyObject *args)
13095{
13096    /* initialize variables to prevent gcc warning */
13097    PyObject *substring = NULL;
13098    Py_ssize_t start = 0;
13099    Py_ssize_t end = 0;
13100    Py_ssize_t result;
13101
13102    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13103        return NULL;
13104
13105    if (PyUnicode_READY(self) == -1)
13106        return NULL;
13107
13108    result = any_find_slice(self, substring, start, end, -1);
13109
13110    if (result == -2)
13111        return NULL;
13112
13113    if (result < 0) {
13114        PyErr_SetString(PyExc_ValueError, "substring not found");
13115        return NULL;
13116    }
13117
13118    return PyLong_FromSsize_t(result);
13119}
13120
13121/*[clinic input]
13122str.rjust as unicode_rjust
13123
13124    width: Py_ssize_t
13125    fillchar: Py_UCS4 = ' '
13126    /
13127
13128Return a right-justified string of length width.
13129
13130Padding is done using the specified fill character (default is a space).
13131[clinic start generated code]*/
13132
13133static PyObject *
13134unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13135/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13136{
13137    if (PyUnicode_READY(self) == -1)
13138        return NULL;
13139
13140    if (PyUnicode_GET_LENGTH(self) >= width)
13141        return unicode_result_unchanged(self);
13142
13143    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13144}
13145
13146PyObject *
13147PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13148{
13149    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13150        return NULL;
13151
13152    return split(s, sep, maxsplit);
13153}
13154
13155/*[clinic input]
13156str.split as unicode_split
13157
13158    sep: object = None
13159        The separator used to split the string.
13160
13161        When set to None (the default value), will split on any whitespace
13162        character (including \\n \\r \\t \\f and spaces) and will discard
13163        empty strings from the result.
13164    maxsplit: Py_ssize_t = -1
13165        Maximum number of splits (starting from the left).
13166        -1 (the default value) means no limit.
13167
13168Return a list of the substrings in the string, using sep as the separator string.
13169
13170Note, str.split() is mainly useful for data that has been intentionally
13171delimited.  With natural text that includes punctuation, consider using
13172the regular expression module.
13173
13174[clinic start generated code]*/
13175
13176static PyObject *
13177unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13178/*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13179{
13180    if (sep == Py_None)
13181        return split(self, NULL, maxsplit);
13182    if (PyUnicode_Check(sep))
13183        return split(self, sep, maxsplit);
13184
13185    PyErr_Format(PyExc_TypeError,
13186                 "must be str or None, not %.100s",
13187                 Py_TYPE(sep)->tp_name);
13188    return NULL;
13189}
13190
13191PyObject *
13192PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13193{
13194    PyObject* out;
13195    int kind1, kind2;
13196    const void *buf1, *buf2;
13197    Py_ssize_t len1, len2;
13198
13199    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13200        return NULL;
13201
13202    kind1 = PyUnicode_KIND(str_obj);
13203    kind2 = PyUnicode_KIND(sep_obj);
13204    len1 = PyUnicode_GET_LENGTH(str_obj);
13205    len2 = PyUnicode_GET_LENGTH(sep_obj);
13206    if (kind1 < kind2 || len1 < len2) {
13207        PyObject *empty = unicode_get_empty();  // Borrowed reference
13208        return PyTuple_Pack(3, str_obj, empty, empty);
13209    }
13210    buf1 = PyUnicode_DATA(str_obj);
13211    buf2 = PyUnicode_DATA(sep_obj);
13212    if (kind2 != kind1) {
13213        buf2 = unicode_askind(kind2, buf2, len2, kind1);
13214        if (!buf2)
13215            return NULL;
13216    }
13217
13218    switch (kind1) {
13219    case PyUnicode_1BYTE_KIND:
13220        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13221            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13222        else
13223            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13224        break;
13225    case PyUnicode_2BYTE_KIND:
13226        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13227        break;
13228    case PyUnicode_4BYTE_KIND:
13229        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13230        break;
13231    default:
13232        Py_UNREACHABLE();
13233    }
13234
13235    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13236    if (kind2 != kind1)
13237        PyMem_Free((void *)buf2);
13238
13239    return out;
13240}
13241
13242
13243PyObject *
13244PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13245{
13246    PyObject* out;
13247    int kind1, kind2;
13248    const void *buf1, *buf2;
13249    Py_ssize_t len1, len2;
13250
13251    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13252        return NULL;
13253
13254    kind1 = PyUnicode_KIND(str_obj);
13255    kind2 = PyUnicode_KIND(sep_obj);
13256    len1 = PyUnicode_GET_LENGTH(str_obj);
13257    len2 = PyUnicode_GET_LENGTH(sep_obj);
13258    if (kind1 < kind2 || len1 < len2) {
13259        PyObject *empty = unicode_get_empty();  // Borrowed reference
13260        return PyTuple_Pack(3, empty, empty, str_obj);
13261    }
13262    buf1 = PyUnicode_DATA(str_obj);
13263    buf2 = PyUnicode_DATA(sep_obj);
13264    if (kind2 != kind1) {
13265        buf2 = unicode_askind(kind2, buf2, len2, kind1);
13266        if (!buf2)
13267            return NULL;
13268    }
13269
13270    switch (kind1) {
13271    case PyUnicode_1BYTE_KIND:
13272        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13273            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13274        else
13275            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13276        break;
13277    case PyUnicode_2BYTE_KIND:
13278        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279        break;
13280    case PyUnicode_4BYTE_KIND:
13281        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13282        break;
13283    default:
13284        Py_UNREACHABLE();
13285    }
13286
13287    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13288    if (kind2 != kind1)
13289        PyMem_Free((void *)buf2);
13290
13291    return out;
13292}
13293
13294/*[clinic input]
13295str.partition as unicode_partition
13296
13297    sep: object
13298    /
13299
13300Partition the string into three parts using the given separator.
13301
13302This will search for the separator in the string.  If the separator is found,
13303returns a 3-tuple containing the part before the separator, the separator
13304itself, and the part after it.
13305
13306If the separator is not found, returns a 3-tuple containing the original string
13307and two empty strings.
13308[clinic start generated code]*/
13309
13310static PyObject *
13311unicode_partition(PyObject *self, PyObject *sep)
13312/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13313{
13314    return PyUnicode_Partition(self, sep);
13315}
13316
13317/*[clinic input]
13318str.rpartition as unicode_rpartition = str.partition
13319
13320Partition the string into three parts using the given separator.
13321
13322This will search for the separator in the string, starting at the end. If
13323the separator is found, returns a 3-tuple containing the part before the
13324separator, the separator itself, and the part after it.
13325
13326If the separator is not found, returns a 3-tuple containing two empty strings
13327and the original string.
13328[clinic start generated code]*/
13329
13330static PyObject *
13331unicode_rpartition(PyObject *self, PyObject *sep)
13332/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13333{
13334    return PyUnicode_RPartition(self, sep);
13335}
13336
13337PyObject *
13338PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13339{
13340    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13341        return NULL;
13342
13343    return rsplit(s, sep, maxsplit);
13344}
13345
13346/*[clinic input]
13347str.rsplit as unicode_rsplit = str.split
13348
13349Return a list of the substrings in the string, using sep as the separator string.
13350
13351Splitting starts at the end of the string and works to the front.
13352[clinic start generated code]*/
13353
13354static PyObject *
13355unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13356/*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13357{
13358    if (sep == Py_None)
13359        return rsplit(self, NULL, maxsplit);
13360    if (PyUnicode_Check(sep))
13361        return rsplit(self, sep, maxsplit);
13362
13363    PyErr_Format(PyExc_TypeError,
13364                 "must be str or None, not %.100s",
13365                 Py_TYPE(sep)->tp_name);
13366    return NULL;
13367}
13368
13369/*[clinic input]
13370str.splitlines as unicode_splitlines
13371
13372    keepends: bool(accept={int}) = False
13373
13374Return a list of the lines in the string, breaking at line boundaries.
13375
13376Line breaks are not included in the resulting list unless keepends is given and
13377true.
13378[clinic start generated code]*/
13379
13380static PyObject *
13381unicode_splitlines_impl(PyObject *self, int keepends)
13382/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13383{
13384    return PyUnicode_Splitlines(self, keepends);
13385}
13386
13387static
13388PyObject *unicode_str(PyObject *self)
13389{
13390    return unicode_result_unchanged(self);
13391}
13392
13393/*[clinic input]
13394str.swapcase as unicode_swapcase
13395
13396Convert uppercase characters to lowercase and lowercase characters to uppercase.
13397[clinic start generated code]*/
13398
13399static PyObject *
13400unicode_swapcase_impl(PyObject *self)
13401/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13402{
13403    if (PyUnicode_READY(self) == -1)
13404        return NULL;
13405    return case_operation(self, do_swapcase);
13406}
13407
13408/*[clinic input]
13409
13410@staticmethod
13411str.maketrans as unicode_maketrans
13412
13413  x: object
13414
13415  y: unicode=NULL
13416
13417  z: unicode=NULL
13418
13419  /
13420
13421Return a translation table usable for str.translate().
13422
13423If there is only one argument, it must be a dictionary mapping Unicode
13424ordinals (integers) or characters to Unicode ordinals, strings or None.
13425Character keys will be then converted to ordinals.
13426If there are two arguments, they must be strings of equal length, and
13427in the resulting dictionary, each character in x will be mapped to the
13428character at the same position in y. If there is a third argument, it
13429must be a string, whose characters will be mapped to None in the result.
13430[clinic start generated code]*/
13431
13432static PyObject *
13433unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13434/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13435{
13436    PyObject *new = NULL, *key, *value;
13437    Py_ssize_t i = 0;
13438    int res;
13439
13440    new = PyDict_New();
13441    if (!new)
13442        return NULL;
13443    if (y != NULL) {
13444        int x_kind, y_kind, z_kind;
13445        const void *x_data, *y_data, *z_data;
13446
13447        /* x must be a string too, of equal length */
13448        if (!PyUnicode_Check(x)) {
13449            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13450                            "be a string if there is a second argument");
13451            goto err;
13452        }
13453        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13454            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13455                            "arguments must have equal length");
13456            goto err;
13457        }
13458        /* create entries for translating chars in x to those in y */
13459        x_kind = PyUnicode_KIND(x);
13460        y_kind = PyUnicode_KIND(y);
13461        x_data = PyUnicode_DATA(x);
13462        y_data = PyUnicode_DATA(y);
13463        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13464            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13465            if (!key)
13466                goto err;
13467            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13468            if (!value) {
13469                Py_DECREF(key);
13470                goto err;
13471            }
13472            res = PyDict_SetItem(new, key, value);
13473            Py_DECREF(key);
13474            Py_DECREF(value);
13475            if (res < 0)
13476                goto err;
13477        }
13478        /* create entries for deleting chars in z */
13479        if (z != NULL) {
13480            z_kind = PyUnicode_KIND(z);
13481            z_data = PyUnicode_DATA(z);
13482            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13483                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13484                if (!key)
13485                    goto err;
13486                res = PyDict_SetItem(new, key, Py_None);
13487                Py_DECREF(key);
13488                if (res < 0)
13489                    goto err;
13490            }
13491        }
13492    } else {
13493        int kind;
13494        const void *data;
13495
13496        /* x must be a dict */
13497        if (!PyDict_CheckExact(x)) {
13498            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13499                            "to maketrans it must be a dict");
13500            goto err;
13501        }
13502        /* copy entries into the new dict, converting string keys to int keys */
13503        while (PyDict_Next(x, &i, &key, &value)) {
13504            if (PyUnicode_Check(key)) {
13505                /* convert string keys to integer keys */
13506                PyObject *newkey;
13507                if (PyUnicode_GET_LENGTH(key) != 1) {
13508                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13509                                    "table must be of length 1");
13510                    goto err;
13511                }
13512                kind = PyUnicode_KIND(key);
13513                data = PyUnicode_DATA(key);
13514                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13515                if (!newkey)
13516                    goto err;
13517                res = PyDict_SetItem(new, newkey, value);
13518                Py_DECREF(newkey);
13519                if (res < 0)
13520                    goto err;
13521            } else if (PyLong_Check(key)) {
13522                /* just keep integer keys */
13523                if (PyDict_SetItem(new, key, value) < 0)
13524                    goto err;
13525            } else {
13526                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13527                                "be strings or integers");
13528                goto err;
13529            }
13530        }
13531    }
13532    return new;
13533  err:
13534    Py_DECREF(new);
13535    return NULL;
13536}
13537
13538/*[clinic input]
13539str.translate as unicode_translate
13540
13541    table: object
13542        Translation table, which must be a mapping of Unicode ordinals to
13543        Unicode ordinals, strings, or None.
13544    /
13545
13546Replace each character in the string using the given translation table.
13547
13548The table must implement lookup/indexing via __getitem__, for instance a
13549dictionary or list.  If this operation raises LookupError, the character is
13550left untouched.  Characters mapped to None are deleted.
13551[clinic start generated code]*/
13552
13553static PyObject *
13554unicode_translate(PyObject *self, PyObject *table)
13555/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13556{
13557    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13558}
13559
13560/*[clinic input]
13561str.upper as unicode_upper
13562
13563Return a copy of the string converted to uppercase.
13564[clinic start generated code]*/
13565
13566static PyObject *
13567unicode_upper_impl(PyObject *self)
13568/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13569{
13570    if (PyUnicode_READY(self) == -1)
13571        return NULL;
13572    if (PyUnicode_IS_ASCII(self))
13573        return ascii_upper_or_lower(self, 0);
13574    return case_operation(self, do_upper);
13575}
13576
13577/*[clinic input]
13578str.zfill as unicode_zfill
13579
13580    width: Py_ssize_t
13581    /
13582
13583Pad a numeric string with zeros on the left, to fill a field of the given width.
13584
13585The string is never truncated.
13586[clinic start generated code]*/
13587
13588static PyObject *
13589unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13590/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13591{
13592    Py_ssize_t fill;
13593    PyObject *u;
13594    int kind;
13595    const void *data;
13596    Py_UCS4 chr;
13597
13598    if (PyUnicode_READY(self) == -1)
13599        return NULL;
13600
13601    if (PyUnicode_GET_LENGTH(self) >= width)
13602        return unicode_result_unchanged(self);
13603
13604    fill = width - PyUnicode_GET_LENGTH(self);
13605
13606    u = pad(self, fill, 0, '0');
13607
13608    if (u == NULL)
13609        return NULL;
13610
13611    kind = PyUnicode_KIND(u);
13612    data = PyUnicode_DATA(u);
13613    chr = PyUnicode_READ(kind, data, fill);
13614
13615    if (chr == '+' || chr == '-') {
13616        /* move sign to beginning of string */
13617        PyUnicode_WRITE(kind, data, 0, chr);
13618        PyUnicode_WRITE(kind, data, fill, '0');
13619    }
13620
13621    assert(_PyUnicode_CheckConsistency(u, 1));
13622    return u;
13623}
13624
13625PyDoc_STRVAR(startswith__doc__,
13626             "S.startswith(prefix[, start[, end]]) -> bool\n\
13627\n\
13628Return True if S starts with the specified prefix, False otherwise.\n\
13629With optional start, test S beginning at that position.\n\
13630With optional end, stop comparing S at that position.\n\
13631prefix can also be a tuple of strings to try.");
13632
13633static PyObject *
13634unicode_startswith(PyObject *self,
13635                   PyObject *args)
13636{
13637    PyObject *subobj;
13638    PyObject *substring;
13639    Py_ssize_t start = 0;
13640    Py_ssize_t end = PY_SSIZE_T_MAX;
13641    int result;
13642
13643    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13644        return NULL;
13645    if (PyTuple_Check(subobj)) {
13646        Py_ssize_t i;
13647        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13648            substring = PyTuple_GET_ITEM(subobj, i);
13649            if (!PyUnicode_Check(substring)) {
13650                PyErr_Format(PyExc_TypeError,
13651                             "tuple for startswith must only contain str, "
13652                             "not %.100s",
13653                             Py_TYPE(substring)->tp_name);
13654                return NULL;
13655            }
13656            result = tailmatch(self, substring, start, end, -1);
13657            if (result == -1)
13658                return NULL;
13659            if (result) {
13660                Py_RETURN_TRUE;
13661            }
13662        }
13663        /* nothing matched */
13664        Py_RETURN_FALSE;
13665    }
13666    if (!PyUnicode_Check(subobj)) {
13667        PyErr_Format(PyExc_TypeError,
13668                     "startswith first arg must be str or "
13669                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13670        return NULL;
13671    }
13672    result = tailmatch(self, subobj, start, end, -1);
13673    if (result == -1)
13674        return NULL;
13675    return PyBool_FromLong(result);
13676}
13677
13678
13679PyDoc_STRVAR(endswith__doc__,
13680             "S.endswith(suffix[, start[, end]]) -> bool\n\
13681\n\
13682Return True if S ends with the specified suffix, False otherwise.\n\
13683With optional start, test S beginning at that position.\n\
13684With optional end, stop comparing S at that position.\n\
13685suffix can also be a tuple of strings to try.");
13686
13687static PyObject *
13688unicode_endswith(PyObject *self,
13689                 PyObject *args)
13690{
13691    PyObject *subobj;
13692    PyObject *substring;
13693    Py_ssize_t start = 0;
13694    Py_ssize_t end = PY_SSIZE_T_MAX;
13695    int result;
13696
13697    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13698        return NULL;
13699    if (PyTuple_Check(subobj)) {
13700        Py_ssize_t i;
13701        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13702            substring = PyTuple_GET_ITEM(subobj, i);
13703            if (!PyUnicode_Check(substring)) {
13704                PyErr_Format(PyExc_TypeError,
13705                             "tuple for endswith must only contain str, "
13706                             "not %.100s",
13707                             Py_TYPE(substring)->tp_name);
13708                return NULL;
13709            }
13710            result = tailmatch(self, substring, start, end, +1);
13711            if (result == -1)
13712                return NULL;
13713            if (result) {
13714                Py_RETURN_TRUE;
13715            }
13716        }
13717        Py_RETURN_FALSE;
13718    }
13719    if (!PyUnicode_Check(subobj)) {
13720        PyErr_Format(PyExc_TypeError,
13721                     "endswith first arg must be str or "
13722                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13723        return NULL;
13724    }
13725    result = tailmatch(self, subobj, start, end, +1);
13726    if (result == -1)
13727        return NULL;
13728    return PyBool_FromLong(result);
13729}
13730
13731static inline void
13732_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13733{
13734    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13735    writer->data = PyUnicode_DATA(writer->buffer);
13736
13737    if (!writer->readonly) {
13738        writer->kind = PyUnicode_KIND(writer->buffer);
13739        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13740    }
13741    else {
13742        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13743           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13744        writer->kind = PyUnicode_WCHAR_KIND;
13745        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13746
13747        /* Copy-on-write mode: set buffer size to 0 so
13748         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13749         * next write. */
13750        writer->size = 0;
13751    }
13752}
13753
13754void
13755_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13756{
13757    memset(writer, 0, sizeof(*writer));
13758
13759    /* ASCII is the bare minimum */
13760    writer->min_char = 127;
13761
13762    /* use a value smaller than PyUnicode_1BYTE_KIND() so
13763       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13764    writer->kind = PyUnicode_WCHAR_KIND;
13765    assert(writer->kind <= PyUnicode_1BYTE_KIND);
13766}
13767
13768// Initialize _PyUnicodeWriter with initial buffer
13769static inline void
13770_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13771{
13772    memset(writer, 0, sizeof(*writer));
13773    writer->buffer = buffer;
13774    _PyUnicodeWriter_Update(writer);
13775    writer->min_length = writer->size;
13776}
13777
13778int
13779_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13780                                 Py_ssize_t length, Py_UCS4 maxchar)
13781{
13782    Py_ssize_t newlen;
13783    PyObject *newbuffer;
13784
13785    assert(maxchar <= MAX_UNICODE);
13786
13787    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13788    assert((maxchar > writer->maxchar && length >= 0)
13789           || length > 0);
13790
13791    if (length > PY_SSIZE_T_MAX - writer->pos) {
13792        PyErr_NoMemory();
13793        return -1;
13794    }
13795    newlen = writer->pos + length;
13796
13797    maxchar = Py_MAX(maxchar, writer->min_char);
13798
13799    if (writer->buffer == NULL) {
13800        assert(!writer->readonly);
13801        if (writer->overallocate
13802            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13803            /* overallocate to limit the number of realloc() */
13804            newlen += newlen / OVERALLOCATE_FACTOR;
13805        }
13806        if (newlen < writer->min_length)
13807            newlen = writer->min_length;
13808
13809        writer->buffer = PyUnicode_New(newlen, maxchar);
13810        if (writer->buffer == NULL)
13811            return -1;
13812    }
13813    else if (newlen > writer->size) {
13814        if (writer->overallocate
13815            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13816            /* overallocate to limit the number of realloc() */
13817            newlen += newlen / OVERALLOCATE_FACTOR;
13818        }
13819        if (newlen < writer->min_length)
13820            newlen = writer->min_length;
13821
13822        if (maxchar > writer->maxchar || writer->readonly) {
13823            /* resize + widen */
13824            maxchar = Py_MAX(maxchar, writer->maxchar);
13825            newbuffer = PyUnicode_New(newlen, maxchar);
13826            if (newbuffer == NULL)
13827                return -1;
13828            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13829                                          writer->buffer, 0, writer->pos);
13830            Py_DECREF(writer->buffer);
13831            writer->readonly = 0;
13832        }
13833        else {
13834            newbuffer = resize_compact(writer->buffer, newlen);
13835            if (newbuffer == NULL)
13836                return -1;
13837        }
13838        writer->buffer = newbuffer;
13839    }
13840    else if (maxchar > writer->maxchar) {
13841        assert(!writer->readonly);
13842        newbuffer = PyUnicode_New(writer->size, maxchar);
13843        if (newbuffer == NULL)
13844            return -1;
13845        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13846                                      writer->buffer, 0, writer->pos);
13847        Py_SETREF(writer->buffer, newbuffer);
13848    }
13849    _PyUnicodeWriter_Update(writer);
13850    return 0;
13851
13852#undef OVERALLOCATE_FACTOR
13853}
13854
13855int
13856_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13857                                     enum PyUnicode_Kind kind)
13858{
13859    Py_UCS4 maxchar;
13860
13861    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13862    assert(writer->kind < kind);
13863
13864    switch (kind)
13865    {
13866    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13867    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13868    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13869    default:
13870        Py_UNREACHABLE();
13871    }
13872
13873    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13874}
13875
13876static inline int
13877_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13878{
13879    assert(ch <= MAX_UNICODE);
13880    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13881        return -1;
13882    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13883    writer->pos++;
13884    return 0;
13885}
13886
13887int
13888_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13889{
13890    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13891}
13892
13893int
13894_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13895{
13896    Py_UCS4 maxchar;
13897    Py_ssize_t len;
13898
13899    if (PyUnicode_READY(str) == -1)
13900        return -1;
13901    len = PyUnicode_GET_LENGTH(str);
13902    if (len == 0)
13903        return 0;
13904    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13905    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13906        if (writer->buffer == NULL && !writer->overallocate) {
13907            assert(_PyUnicode_CheckConsistency(str, 1));
13908            writer->readonly = 1;
13909            Py_INCREF(str);
13910            writer->buffer = str;
13911            _PyUnicodeWriter_Update(writer);
13912            writer->pos += len;
13913            return 0;
13914        }
13915        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13916            return -1;
13917    }
13918    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13919                                  str, 0, len);
13920    writer->pos += len;
13921    return 0;
13922}
13923
13924int
13925_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13926                                Py_ssize_t start, Py_ssize_t end)
13927{
13928    Py_UCS4 maxchar;
13929    Py_ssize_t len;
13930
13931    if (PyUnicode_READY(str) == -1)
13932        return -1;
13933
13934    assert(0 <= start);
13935    assert(end <= PyUnicode_GET_LENGTH(str));
13936    assert(start <= end);
13937
13938    if (end == 0)
13939        return 0;
13940
13941    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13942        return _PyUnicodeWriter_WriteStr(writer, str);
13943
13944    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13945        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13946    else
13947        maxchar = writer->maxchar;
13948    len = end - start;
13949
13950    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13951        return -1;
13952
13953    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13954                                  str, start, len);
13955    writer->pos += len;
13956    return 0;
13957}
13958
13959int
13960_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13961                                  const char *ascii, Py_ssize_t len)
13962{
13963    if (len == -1)
13964        len = strlen(ascii);
13965
13966    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13967
13968    if (writer->buffer == NULL && !writer->overallocate) {
13969        PyObject *str;
13970
13971        str = _PyUnicode_FromASCII(ascii, len);
13972        if (str == NULL)
13973            return -1;
13974
13975        writer->readonly = 1;
13976        writer->buffer = str;
13977        _PyUnicodeWriter_Update(writer);
13978        writer->pos += len;
13979        return 0;
13980    }
13981
13982    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13983        return -1;
13984
13985    switch (writer->kind)
13986    {
13987    case PyUnicode_1BYTE_KIND:
13988    {
13989        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13990        Py_UCS1 *data = writer->data;
13991
13992        memcpy(data + writer->pos, str, len);
13993        break;
13994    }
13995    case PyUnicode_2BYTE_KIND:
13996    {
13997        _PyUnicode_CONVERT_BYTES(
13998            Py_UCS1, Py_UCS2,
13999            ascii, ascii + len,
14000            (Py_UCS2 *)writer->data + writer->pos);
14001        break;
14002    }
14003    case PyUnicode_4BYTE_KIND:
14004    {
14005        _PyUnicode_CONVERT_BYTES(
14006            Py_UCS1, Py_UCS4,
14007            ascii, ascii + len,
14008            (Py_UCS4 *)writer->data + writer->pos);
14009        break;
14010    }
14011    default:
14012        Py_UNREACHABLE();
14013    }
14014
14015    writer->pos += len;
14016    return 0;
14017}
14018
14019int
14020_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14021                                   const char *str, Py_ssize_t len)
14022{
14023    Py_UCS4 maxchar;
14024
14025    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14026    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14027        return -1;
14028    unicode_write_cstr(writer->buffer, writer->pos, str, len);
14029    writer->pos += len;
14030    return 0;
14031}
14032
14033PyObject *
14034_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14035{
14036    PyObject *str;
14037
14038    if (writer->pos == 0) {
14039        Py_CLEAR(writer->buffer);
14040        _Py_RETURN_UNICODE_EMPTY();
14041    }
14042
14043    str = writer->buffer;
14044    writer->buffer = NULL;
14045
14046    if (writer->readonly) {
14047        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14048        return str;
14049    }
14050
14051    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14052        PyObject *str2;
14053        str2 = resize_compact(str, writer->pos);
14054        if (str2 == NULL) {
14055            Py_DECREF(str);
14056            return NULL;
14057        }
14058        str = str2;
14059    }
14060
14061    assert(_PyUnicode_CheckConsistency(str, 1));
14062    return unicode_result_ready(str);
14063}
14064
14065void
14066_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14067{
14068    Py_CLEAR(writer->buffer);
14069}
14070
14071#include "stringlib/unicode_format.h"
14072
14073PyDoc_STRVAR(format__doc__,
14074             "S.format(*args, **kwargs) -> str\n\
14075\n\
14076Return a formatted version of S, using substitutions from args and kwargs.\n\
14077The substitutions are identified by braces ('{' and '}').");
14078
14079PyDoc_STRVAR(format_map__doc__,
14080             "S.format_map(mapping) -> str\n\
14081\n\
14082Return a formatted version of S, using substitutions from mapping.\n\
14083The substitutions are identified by braces ('{' and '}').");
14084
14085/*[clinic input]
14086str.__format__ as unicode___format__
14087
14088    format_spec: unicode
14089    /
14090
14091Return a formatted version of the string as described by format_spec.
14092[clinic start generated code]*/
14093
14094static PyObject *
14095unicode___format___impl(PyObject *self, PyObject *format_spec)
14096/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14097{
14098    _PyUnicodeWriter writer;
14099    int ret;
14100
14101    if (PyUnicode_READY(self) == -1)
14102        return NULL;
14103    _PyUnicodeWriter_Init(&writer);
14104    ret = _PyUnicode_FormatAdvancedWriter(&writer,
14105                                          self, format_spec, 0,
14106                                          PyUnicode_GET_LENGTH(format_spec));
14107    if (ret == -1) {
14108        _PyUnicodeWriter_Dealloc(&writer);
14109        return NULL;
14110    }
14111    return _PyUnicodeWriter_Finish(&writer);
14112}
14113
14114/*[clinic input]
14115str.__sizeof__ as unicode_sizeof
14116
14117Return the size of the string in memory, in bytes.
14118[clinic start generated code]*/
14119
14120static PyObject *
14121unicode_sizeof_impl(PyObject *self)
14122/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14123{
14124    Py_ssize_t size;
14125
14126    /* If it's a compact object, account for base structure +
14127       character data. */
14128    if (PyUnicode_IS_COMPACT_ASCII(self))
14129        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14130    else if (PyUnicode_IS_COMPACT(self))
14131        size = sizeof(PyCompactUnicodeObject) +
14132            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14133    else {
14134        /* If it is a two-block object, account for base object, and
14135           for character block if present. */
14136        size = sizeof(PyUnicodeObject);
14137        if (_PyUnicode_DATA_ANY(self))
14138            size += (PyUnicode_GET_LENGTH(self) + 1) *
14139                PyUnicode_KIND(self);
14140    }
14141    /* If the wstr pointer is present, account for it unless it is shared
14142       with the data pointer. Check if the data is not shared. */
14143    if (_PyUnicode_HAS_WSTR_MEMORY(self))
14144        size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14145    if (_PyUnicode_HAS_UTF8_MEMORY(self))
14146        size += PyUnicode_UTF8_LENGTH(self) + 1;
14147
14148    return PyLong_FromSsize_t(size);
14149}
14150
14151static PyObject *
14152unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14153{
14154    PyObject *copy = _PyUnicode_Copy(v);
14155    if (!copy)
14156        return NULL;
14157    return Py_BuildValue("(N)", copy);
14158}
14159
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this part
   of the file (presumably Argument Clinic output -- confirm); each one is
   expected to expand to a complete initializer including its trailing
   comma.  The remaining entries are written out by hand as
   {name, C function, calling-convention flags, docstring}.

   The order of the entries is kept as is; do not sort the table. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() takes keyword arguments, hence the _PyCFunction_CAST and the
       METH_KEYWORDS flag. */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    /* No docstring is given here, so that slot is zero-initialized. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    /* Sentinel: marks the end of the table. */
    {NULL, NULL}
};
14213
14214static PyObject *
14215unicode_mod(PyObject *v, PyObject *w)
14216{
14217    if (!PyUnicode_Check(v))
14218        Py_RETURN_NOTIMPLEMENTED;
14219    return PyUnicode_Format(v, w);
14220}
14221
14222static PyNumberMethods unicode_as_number = {
14223    0,              /*nb_add*/
14224    0,              /*nb_subtract*/
14225    0,              /*nb_multiply*/
14226    unicode_mod,            /*nb_remainder*/
14227};
14228
14229static PySequenceMethods unicode_as_sequence = {
14230    (lenfunc) unicode_length,       /* sq_length */
14231    PyUnicode_Concat,           /* sq_concat */
14232    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
14233    (ssizeargfunc) unicode_getitem,     /* sq_item */
14234    0,                  /* sq_slice */
14235    0,                  /* sq_ass_item */
14236    0,                  /* sq_ass_slice */
14237    PyUnicode_Contains,         /* sq_contains */
14238};
14239
14240static PyObject*
14241unicode_subscript(PyObject* self, PyObject* item)
14242{
14243    if (PyUnicode_READY(self) == -1)
14244        return NULL;
14245
14246    if (_PyIndex_Check(item)) {
14247        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14248        if (i == -1 && PyErr_Occurred())
14249            return NULL;
14250        if (i < 0)
14251            i += PyUnicode_GET_LENGTH(self);
14252        return unicode_getitem(self, i);
14253    } else if (PySlice_Check(item)) {
14254        Py_ssize_t start, stop, step, slicelength, i;
14255        size_t cur;
14256        PyObject *result;
14257        const void *src_data;
14258        void *dest_data;
14259        int src_kind, dest_kind;
14260        Py_UCS4 ch, max_char, kind_limit;
14261
14262        if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14263            return NULL;
14264        }
14265        slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14266                                            &start, &stop, step);
14267
14268        if (slicelength <= 0) {
14269            _Py_RETURN_UNICODE_EMPTY();
14270        } else if (start == 0 && step == 1 &&
14271                   slicelength == PyUnicode_GET_LENGTH(self)) {
14272            return unicode_result_unchanged(self);
14273        } else if (step == 1) {
14274            return PyUnicode_Substring(self,
14275                                       start, start + slicelength);
14276        }
14277        /* General case */
14278        src_kind = PyUnicode_KIND(self);
14279        src_data = PyUnicode_DATA(self);
14280        if (!PyUnicode_IS_ASCII(self)) {
14281            kind_limit = kind_maxchar_limit(src_kind);
14282            max_char = 0;
14283            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14284                ch = PyUnicode_READ(src_kind, src_data, cur);
14285                if (ch > max_char) {
14286                    max_char = ch;
14287                    if (max_char >= kind_limit)
14288                        break;
14289                }
14290            }
14291        }
14292        else
14293            max_char = 127;
14294        result = PyUnicode_New(slicelength, max_char);
14295        if (result == NULL)
14296            return NULL;
14297        dest_kind = PyUnicode_KIND(result);
14298        dest_data = PyUnicode_DATA(result);
14299
14300        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14301            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14302            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14303        }
14304        assert(_PyUnicode_CheckConsistency(result, 1));
14305        return result;
14306    } else {
14307        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14308                     Py_TYPE(item)->tp_name);
14309        return NULL;
14310    }
14311}
14312
14313static PyMappingMethods unicode_as_mapping = {
14314    (lenfunc)unicode_length,        /* mp_length */
14315    (binaryfunc)unicode_subscript,  /* mp_subscript */
14316    (objobjargproc)0,           /* mp_ass_subscript */
14317};
14318
14319
14320/* Helpers for PyUnicode_Format() */
14321
14322struct unicode_formatter_t {
14323    PyObject *args;
14324    int args_owned;
14325    Py_ssize_t arglen, argidx;
14326    PyObject *dict;
14327
14328    enum PyUnicode_Kind fmtkind;
14329    Py_ssize_t fmtcnt, fmtpos;
14330    const void *fmtdata;
14331    PyObject *fmtstr;
14332
14333    _PyUnicodeWriter writer;
14334};
14335
/* Description of a single conversion specification being processed. */
struct unicode_format_arg_t {
    /* Most recently read format character; once the options have been
       parsed this is the conversion type ('s', 'd', 'x', ...). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Field width; the helpers treat -1 as "not specified". */
    Py_ssize_t width;
    /* Precision; the helpers treat -1 as "not specified". */
    int prec;
    /* Set to 1 for the numeric conversions so that the output step applies
       sign handling and zero fill. */
    int sign;
};
14343
14344static PyObject *
14345unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14346{
14347    Py_ssize_t argidx = ctx->argidx;
14348
14349    if (argidx < ctx->arglen) {
14350        ctx->argidx++;
14351        if (ctx->arglen < 0)
14352            return ctx->args;
14353        else
14354            return PyTuple_GetItem(ctx->args, argidx);
14355    }
14356    PyErr_SetString(PyExc_TypeError,
14357                    "not enough arguments for format string");
14358    return NULL;
14359}
14360
14361/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14362
14363/* Format a float into the writer if the writer is not NULL, or into *p_output
14364   otherwise.
14365
14366   Return 0 on success, raise an exception and return -1 on error. */
14367static int
14368formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14369            PyObject **p_output,
14370            _PyUnicodeWriter *writer)
14371{
14372    char *p;
14373    double x;
14374    Py_ssize_t len;
14375    int prec;
14376    int dtoa_flags = 0;
14377
14378    x = PyFloat_AsDouble(v);
14379    if (x == -1.0 && PyErr_Occurred())
14380        return -1;
14381
14382    prec = arg->prec;
14383    if (prec < 0)
14384        prec = 6;
14385
14386    if (arg->flags & F_ALT)
14387        dtoa_flags |= Py_DTSF_ALT;
14388    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14389    if (p == NULL)
14390        return -1;
14391    len = strlen(p);
14392    if (writer) {
14393        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14394            PyMem_Free(p);
14395            return -1;
14396        }
14397    }
14398    else
14399        *p_output = _PyUnicode_FromASCII(p, len);
14400    PyMem_Free(p);
14401    return 0;
14402}
14403
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X, and
 * the alternate form, for Python's long (unbounded) ints.  mainformatlong()
 * falls back to it whenever its fast path does not apply.
 * Return value:  a new str object, or NULL with an exception set on error.
 *     The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 *     The base prefix is present only for o, x and X conversions, and only
 *         when alt is non-zero.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base prefix (mainformatlong() passes
 *              arg->flags & F_ALT)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits + prec is computed as an int below and numnondigits can
       be as large as 3 (sign plus a two-character prefix), so reject
       precisions that would overflow that sum. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* The base-8 and base-16 conversions start with a two-character
           prefix, which is counted as non-digits. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; for a negative number the sign is rewritten
           over the second prefix character so that it directly precedes the
           digits.  From here on buf no longer points at the start of
           result's data. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* The padded text is assembled in a temporary bytes object:
           non-digits first, then the zero fill, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes object, or buf was
       advanced past a stripped prefix, build a fresh str from buf.
       Otherwise result already holds the text and only needs resizing if
       its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14549
14550/* Format an integer or a float as an integer.
14551 * Return 1 if the number has been formatted into the writer,
14552 *        0 if the number has been formatted into *p_output
14553 *       -1 and raise an exception on error */
14554static int
14555mainformatlong(PyObject *v,
14556               struct unicode_format_arg_t *arg,
14557               PyObject **p_output,
14558               _PyUnicodeWriter *writer)
14559{
14560    PyObject *iobj, *res;
14561    char type = (char)arg->ch;
14562
14563    if (!PyNumber_Check(v))
14564        goto wrongtype;
14565
14566    /* make sure number is a type of integer for o, x, and X */
14567    if (!PyLong_Check(v)) {
14568        if (type == 'o' || type == 'x' || type == 'X') {
14569            iobj = _PyNumber_Index(v);
14570        }
14571        else {
14572            iobj = PyNumber_Long(v);
14573        }
14574        if (iobj == NULL ) {
14575            if (PyErr_ExceptionMatches(PyExc_TypeError))
14576                goto wrongtype;
14577            return -1;
14578        }
14579        assert(PyLong_Check(iobj));
14580    }
14581    else {
14582        iobj = v;
14583        Py_INCREF(iobj);
14584    }
14585
14586    if (PyLong_CheckExact(v)
14587        && arg->width == -1 && arg->prec == -1
14588        && !(arg->flags & (F_SIGN | F_BLANK))
14589        && type != 'X')
14590    {
14591        /* Fast path */
14592        int alternate = arg->flags & F_ALT;
14593        int base;
14594
14595        switch(type)
14596        {
14597            default:
14598                Py_UNREACHABLE();
14599            case 'd':
14600            case 'i':
14601            case 'u':
14602                base = 10;
14603                break;
14604            case 'o':
14605                base = 8;
14606                break;
14607            case 'x':
14608            case 'X':
14609                base = 16;
14610                break;
14611        }
14612
14613        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14614            Py_DECREF(iobj);
14615            return -1;
14616        }
14617        Py_DECREF(iobj);
14618        return 1;
14619    }
14620
14621    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14622    Py_DECREF(iobj);
14623    if (res == NULL)
14624        return -1;
14625    *p_output = res;
14626    return 0;
14627
14628wrongtype:
14629    switch(type)
14630    {
14631        case 'o':
14632        case 'x':
14633        case 'X':
14634            PyErr_Format(PyExc_TypeError,
14635                    "%%%c format: an integer is required, "
14636                    "not %.200s",
14637                    type, Py_TYPE(v)->tp_name);
14638            break;
14639        default:
14640            PyErr_Format(PyExc_TypeError,
14641                    "%%%c format: a real number is required, "
14642                    "not %.200s",
14643                    type, Py_TYPE(v)->tp_name);
14644            break;
14645    }
14646    return -1;
14647}
14648
14649static Py_UCS4
14650formatchar(PyObject *v)
14651{
14652    /* presume that the buffer is at least 3 characters long */
14653    if (PyUnicode_Check(v)) {
14654        if (PyUnicode_GET_LENGTH(v) == 1) {
14655            return PyUnicode_READ_CHAR(v, 0);
14656        }
14657        goto onError;
14658    }
14659    else {
14660        int overflow;
14661        long x = PyLong_AsLongAndOverflow(v, &overflow);
14662        if (x == -1 && PyErr_Occurred()) {
14663            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14664                goto onError;
14665            }
14666            return (Py_UCS4) -1;
14667        }
14668
14669        if (x < 0 || x > MAX_UNICODE) {
14670            /* this includes an overflow in converting to C long */
14671            PyErr_SetString(PyExc_OverflowError,
14672                            "%c arg not in range(0x110000)");
14673            return (Py_UCS4) -1;
14674        }
14675
14676        return (Py_UCS4) x;
14677    }
14678
14679  onError:
14680    PyErr_SetString(PyExc_TypeError,
14681                    "%c requires int or char");
14682    return (Py_UCS4) -1;
14683}
14684
/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   arg->ch is examined before anything is read from the format string, so
   the caller is expected to have stored the first character of the
   specification there (presumably the one at ctx->fmtpos -- confirm against
   the caller).  ctx->fmtpos and ctx->fmtcnt are advanced as characters are
   consumed.  On success arg->ch holds the character that follows the parsed
   options (the conversion type), and arg->flags, arg->width and arg->prec
   have been updated for whatever was present.  For "%(name)" the value
   looked up in ctx->dict replaces ctx->args.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format character at the current position (without advancing). */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        /* Step over the opening parenthesis. */
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing parenthesis, hence the - 1. */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Release the value fetched for a previous "%(name)", if any. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* With arglen == -1 and argidx == -2, unicode_format_getnextarg()
           hands out ctx->args itself exactly once. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        /* Not a flag character: leave it in arg->ch for the next stage. */
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* The width is taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left justification with the absolute
           value as the width.
           NOTE(review): if the value is exactly PY_SSIZE_T_MIN the negation
           below overflows -- confirm whether that input needs a guard. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A '.' with nothing usable after it means precision 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* The precision is taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is treated as 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                /* Same overflow guard as for the width, against INT_MAX. */
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* A negative countdown means the format string ended before a
       conversion character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
14863
14864/* Format one argument. Supported conversion specifiers:
14865
14866   - "s", "r", "a": any type
14867   - "i", "d", "u": int or float
14868   - "o", "x", "X": int
14869   - "e", "E", "f", "F", "g", "G": float
14870   - "c": int or str (1 character)
14871
14872   When possible, the output is written directly into the Unicode writer
14873   (ctx->writer). A string is created when padding is required.
14874
14875   Return 0 if the argument has been formatted into *p_str,
14876          1 if the argument has been written into ctx->writer,
14877         -1 on error. */
14878static int
14879unicode_format_arg_format(struct unicode_formatter_t *ctx,
14880                          struct unicode_format_arg_t *arg,
14881                          PyObject **p_str)
14882{
14883    PyObject *v;
14884    _PyUnicodeWriter *writer = &ctx->writer;
14885
14886    if (ctx->fmtcnt == 0)
14887        ctx->writer.overallocate = 0;
14888
14889    v = unicode_format_getnextarg(ctx);
14890    if (v == NULL)
14891        return -1;
14892
14893
14894    switch (arg->ch) {
14895    case 's':
14896    case 'r':
14897    case 'a':
14898        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14899            /* Fast path */
14900            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14901                return -1;
14902            return 1;
14903        }
14904
14905        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14906            *p_str = v;
14907            Py_INCREF(*p_str);
14908        }
14909        else {
14910            if (arg->ch == 's')
14911                *p_str = PyObject_Str(v);
14912            else if (arg->ch == 'r')
14913                *p_str = PyObject_Repr(v);
14914            else
14915                *p_str = PyObject_ASCII(v);
14916        }
14917        break;
14918
14919    case 'i':
14920    case 'd':
14921    case 'u':
14922    case 'o':
14923    case 'x':
14924    case 'X':
14925    {
14926        int ret = mainformatlong(v, arg, p_str, writer);
14927        if (ret != 0)
14928            return ret;
14929        arg->sign = 1;
14930        break;
14931    }
14932
14933    case 'e':
14934    case 'E':
14935    case 'f':
14936    case 'F':
14937    case 'g':
14938    case 'G':
14939        if (arg->width == -1 && arg->prec == -1
14940            && !(arg->flags & (F_SIGN | F_BLANK)))
14941        {
14942            /* Fast path */
14943            if (formatfloat(v, arg, NULL, writer) == -1)
14944                return -1;
14945            return 1;
14946        }
14947
14948        arg->sign = 1;
14949        if (formatfloat(v, arg, p_str, NULL) == -1)
14950            return -1;
14951        break;
14952
14953    case 'c':
14954    {
14955        Py_UCS4 ch = formatchar(v);
14956        if (ch == (Py_UCS4) -1)
14957            return -1;
14958        if (arg->width == -1 && arg->prec == -1) {
14959            /* Fast path */
14960            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14961                return -1;
14962            return 1;
14963        }
14964        *p_str = PyUnicode_FromOrdinal(ch);
14965        break;
14966    }
14967
14968    default:
14969        PyErr_Format(PyExc_ValueError,
14970                     "unsupported format character '%c' (0x%x) "
14971                     "at index %zd",
14972                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14973                     (int)arg->ch,
14974                     ctx->fmtpos - 1);
14975        return -1;
14976    }
14977    if (*p_str == NULL)
14978        return -1;
14979    assert (PyUnicode_Check(*p_str));
14980    return 0;
14981}
14982
/* Helper of unicode_format_arg(): write the already formatted string str
   into ctx->writer, applying the width, precision, sign and alternate-form
   settings stored in arg.

   When no padding, truncation or forced sign is needed, str is written
   unchanged.  Otherwise the output is assembled in this order:
     - sign and "0x"/"0X"/"0o" prefix, if the fill character is '0';
     - left padding with the fill character (unless F_LJUST is set);
     - sign and prefix, if the fill character is a space;
     - the characters of str;
     - right padding, always with spaces.

   arg->width and arg->sign are modified while the layout is computed.

   Return 0 on success, -1 if preparing or writing to the writer fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* '0' is used as fill character only when both arg->sign and the
       F_ZERO flag are set; otherwise padding uses spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no forced sign, so the
           string can be appended as is. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        /* A sign already present in str is stripped here (len and pindex
           skip it) and written separately later, so that it can be placed
           before or after the padding. */
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written at all. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only counts towards maxchar when left padding is
       actually going to be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        /* Only the first pindex+len characters are scanned: a precision
           may have shortened len above. */
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* When the characters alone already fill the whole width, the sign
       needs one additional slot. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes one column out of the remaining field width. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           digits, whether they are written here or after the padding. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left for the right padding step below. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with spaces if needed (the right padding never uses the
       '0' fill character) */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15137
15138/* Helper of PyUnicode_Format(): format one arg.
15139   Return 0 on success, raise an exception and return -1 on error. */
15140static int
15141unicode_format_arg(struct unicode_formatter_t *ctx)
15142{
15143    struct unicode_format_arg_t arg;
15144    PyObject *str;
15145    int ret;
15146
15147    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15148    if (arg.ch == '%') {
15149        ctx->fmtpos++;
15150        ctx->fmtcnt--;
15151        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15152            return -1;
15153        return 0;
15154    }
15155    arg.flags = 0;
15156    arg.width = -1;
15157    arg.prec = -1;
15158    arg.sign = 0;
15159    str = NULL;
15160
15161    ret = unicode_format_arg_parse(ctx, &arg);
15162    if (ret == -1)
15163        return -1;
15164
15165    ret = unicode_format_arg_format(ctx, &arg, &str);
15166    if (ret == -1)
15167        return -1;
15168
15169    if (ret != 1) {
15170        ret = unicode_format_arg_output(ctx, &arg, str);
15171        Py_DECREF(str);
15172        if (ret == -1)
15173            return -1;
15174    }
15175
15176    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15177        PyErr_SetString(PyExc_TypeError,
15178                        "not all arguments converted during string formatting");
15179        return -1;
15180    }
15181    return 0;
15182}
15183
15184PyObject *
15185PyUnicode_Format(PyObject *format, PyObject *args)
15186{
15187    struct unicode_formatter_t ctx;
15188
15189    if (format == NULL || args == NULL) {
15190        PyErr_BadInternalCall();
15191        return NULL;
15192    }
15193
15194    if (ensure_unicode(format) < 0)
15195        return NULL;
15196
15197    ctx.fmtstr = format;
15198    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15199    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15200    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15201    ctx.fmtpos = 0;
15202
15203    _PyUnicodeWriter_Init(&ctx.writer);
15204    ctx.writer.min_length = ctx.fmtcnt + 100;
15205    ctx.writer.overallocate = 1;
15206
15207    if (PyTuple_Check(args)) {
15208        ctx.arglen = PyTuple_Size(args);
15209        ctx.argidx = 0;
15210    }
15211    else {
15212        ctx.arglen = -1;
15213        ctx.argidx = -2;
15214    }
15215    ctx.args_owned = 0;
15216    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15217        ctx.dict = args;
15218    else
15219        ctx.dict = NULL;
15220    ctx.args = args;
15221
15222    while (--ctx.fmtcnt >= 0) {
15223        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15224            Py_ssize_t nonfmtpos;
15225
15226            nonfmtpos = ctx.fmtpos++;
15227            while (ctx.fmtcnt >= 0 &&
15228                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15229                ctx.fmtpos++;
15230                ctx.fmtcnt--;
15231            }
15232            if (ctx.fmtcnt < 0) {
15233                ctx.fmtpos--;
15234                ctx.writer.overallocate = 0;
15235            }
15236
15237            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15238                                                nonfmtpos, ctx.fmtpos) < 0)
15239                goto onError;
15240        }
15241        else {
15242            ctx.fmtpos++;
15243            if (unicode_format_arg(&ctx) == -1)
15244                goto onError;
15245        }
15246    }
15247
15248    if (ctx.argidx < ctx.arglen && !ctx.dict) {
15249        PyErr_SetString(PyExc_TypeError,
15250                        "not all arguments converted during string formatting");
15251        goto onError;
15252    }
15253
15254    if (ctx.args_owned) {
15255        Py_DECREF(ctx.args);
15256    }
15257    return _PyUnicodeWriter_Finish(&ctx.writer);
15258
15259  onError:
15260    _PyUnicodeWriter_Dealloc(&ctx.writer);
15261    if (ctx.args_owned) {
15262        Py_DECREF(ctx.args);
15263    }
15264    return NULL;
15265}
15266
/* Forward declaration: unicode_new_impl() below calls this function before
   its definition. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15269
15270/*[clinic input]
15271@classmethod
15272str.__new__ as unicode_new
15273
15274    object as x: object = NULL
15275    encoding: str = NULL
15276    errors: str = NULL
15277
15278[clinic start generated code]*/
15279
15280static PyObject *
15281unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15282                 const char *errors)
15283/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15284{
15285    PyObject *unicode;
15286    if (x == NULL) {
15287        unicode = unicode_new_empty();
15288    }
15289    else if (encoding == NULL && errors == NULL) {
15290        unicode = PyObject_Str(x);
15291    }
15292    else {
15293        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15294    }
15295
15296    if (unicode != NULL && type != &PyUnicode_Type) {
15297        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15298    }
15299    return unicode;
15300}
15301
/* Create an instance of type (a subtype of str) holding a copy of the
   characters of unicode.

   The new object is allocated with type->tp_alloc() and is not compact:
   its character data is stored in a separately allocated buffer.

   Return a new reference, or NULL on error (unicode cannot be made ready,
   the allocation fails or the buffer size would overflow). */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
{
    PyObject *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    _PyUnicode_LENGTH(self) = length;
    /* In Py_DEBUG builds the hash is only copied at the very end, after
       the consistency check. */
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    /* All buffer pointers start out NULL so that the onError path can
       release self before any buffer has been attached. */
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Decide whether the character buffer can double as the UTF-8
       representation (1-byte data below 128) or as the wchar_t
       representation (character width equal to sizeof(wchar_t)). */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    /* One extra character is allocated in addition to the length
       characters. */
    data = PyObject_Malloc((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* NOTE(review): the byte count uses kind rather than char_size; this
       relies on the PyUnicode_*BYTE_KIND values being equal to the
       character width in bytes -- confirm against the enum definition.
       The "+ 1" presumably copies the terminating null character. */
    memcpy(data, PyUnicode_DATA(unicode),
              kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    return self;

onError:
    Py_DECREF(self);
    return NULL;
}
15393
/* Deallocate op, which must be an exact str instance (asserted, so only
   checked when assertions are enabled), by calling unicode_dealloc()
   directly. */
void
_PyUnicode_ExactDealloc(PyObject *op)
{
    assert(PyUnicode_CheckExact(op));
    unicode_dealloc(op);
}
15400
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15412
/* Forward declaration: unicode_iter() is defined further down but is
   needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.

   The initializer is positional: each entry is labelled with the slot it
   fills, and a 0 entry leaves that slot unset in this static definition.
   The type can be subclassed (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS |
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    0,                            /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15459
15460/* Initialize the Unicode implementation */
15461
15462void
15463_PyUnicode_InitState(PyInterpreterState *interp)
15464{
15465    if (!_Py_IsMainInterpreter(interp)) {
15466        return;
15467    }
15468
15469    /* initialize the linebreak bloom filter */
15470    const Py_UCS2 linebreak[] = {
15471        0x000A, /* LINE FEED */
15472        0x000D, /* CARRIAGE RETURN */
15473        0x001C, /* FILE SEPARATOR */
15474        0x001D, /* GROUP SEPARATOR */
15475        0x001E, /* RECORD SEPARATOR */
15476        0x0085, /* NEXT LINE */
15477        0x2028, /* LINE SEPARATOR */
15478        0x2029, /* PARAGRAPH SEPARATOR */
15479    };
15480    bloom_linebreak = make_bloom_mask(
15481        PyUnicode_2BYTE_KIND, linebreak,
15482        Py_ARRAY_LENGTH(linebreak));
15483}
15484
15485
15486PyStatus
15487_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15488{
15489    if (!_Py_IsMainInterpreter(interp)) {
15490        return _PyStatus_OK();
15491    }
15492
15493#ifdef Py_DEBUG
15494    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
15495
15496    for (int i = 0; i < 256; i++) {
15497        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
15498    }
15499#endif
15500
15501    return _PyStatus_OK();
15502}
15503
15504
15505PyStatus
15506_PyUnicode_InitTypes(PyInterpreterState *interp)
15507{
15508    if (!_Py_IsMainInterpreter(interp)) {
15509        return _PyStatus_OK();
15510    }
15511
15512    if (PyType_Ready(&EncodingMapType) < 0) {
15513        goto error;
15514    }
15515    if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15516        goto error;
15517    }
15518    if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15519        goto error;
15520    }
15521    return _PyStatus_OK();
15522
15523error:
15524    return _PyStatus_ERR("Can't initialize unicode types");
15525}
15526
15527
15528void
15529PyUnicode_InternInPlace(PyObject **p)
15530{
15531    PyObject *s = *p;
15532#ifdef Py_DEBUG
15533    assert(s != NULL);
15534    assert(_PyUnicode_CHECK(s));
15535#else
15536    if (s == NULL || !PyUnicode_Check(s)) {
15537        return;
15538    }
15539#endif
15540
15541    /* If it's a subclass, we don't really know what putting
15542       it in the interned dict might do. */
15543    if (!PyUnicode_CheckExact(s)) {
15544        return;
15545    }
15546
15547    if (PyUnicode_CHECK_INTERNED(s)) {
15548        return;
15549    }
15550
15551    if (PyUnicode_READY(s) == -1) {
15552        PyErr_Clear();
15553        return;
15554    }
15555
15556    if (interned == NULL) {
15557        interned = PyDict_New();
15558        if (interned == NULL) {
15559            PyErr_Clear(); /* Don't leave an exception */
15560            return;
15561        }
15562    }
15563
15564    PyObject *t = PyDict_SetDefault(interned, s, s);
15565    if (t == NULL) {
15566        PyErr_Clear();
15567        return;
15568    }
15569
15570    if (t != s) {
15571        Py_INCREF(t);
15572        Py_SETREF(*p, t);
15573        return;
15574    }
15575
15576    /* The two references in interned dict (key and value) are not counted by
15577       refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15578       this. */
15579    Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15580    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15581}
15582
15583void
15584PyUnicode_InternImmortal(PyObject **p)
15585{
15586    if (PyErr_WarnEx(PyExc_DeprecationWarning,
15587            "PyUnicode_InternImmortal() is deprecated; "
15588            "use PyUnicode_InternInPlace() instead", 1) < 0)
15589    {
15590        // The function has no return value, the exception cannot
15591        // be reported to the caller, so just log it.
15592        PyErr_WriteUnraisable(NULL);
15593    }
15594
15595    PyUnicode_InternInPlace(p);
15596    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15597        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15598        Py_INCREF(*p);
15599    }
15600}
15601
15602PyObject *
15603PyUnicode_InternFromString(const char *cp)
15604{
15605    PyObject *s = PyUnicode_FromString(cp);
15606    if (s == NULL)
15607        return NULL;
15608    PyUnicode_InternInPlace(&s);
15609    return s;
15610}
15611
15612
15613void
15614_PyUnicode_ClearInterned(PyInterpreterState *interp)
15615{
15616    if (!_Py_IsMainInterpreter(interp)) {
15617        // interned dict is shared by all interpreters
15618        return;
15619    }
15620
15621    if (interned == NULL) {
15622        return;
15623    }
15624    assert(PyDict_CheckExact(interned));
15625
15626    /* Interned unicode strings are not forcibly deallocated; rather, we give
15627       them their stolen references back, and then clear and DECREF the
15628       interned dict. */
15629
15630#ifdef INTERNED_STATS
15631    fprintf(stderr, "releasing %zd interned strings\n",
15632            PyDict_GET_SIZE(interned));
15633
15634    Py_ssize_t immortal_size = 0, mortal_size = 0;
15635#endif
15636    Py_ssize_t pos = 0;
15637    PyObject *s, *ignored_value;
15638    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15639        assert(PyUnicode_IS_READY(s));
15640
15641        switch (PyUnicode_CHECK_INTERNED(s)) {
15642        case SSTATE_INTERNED_IMMORTAL:
15643            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15644#ifdef INTERNED_STATS
15645            immortal_size += PyUnicode_GET_LENGTH(s);
15646#endif
15647            break;
15648        case SSTATE_INTERNED_MORTAL:
15649            // Restore the two references (key and value) ignored
15650            // by PyUnicode_InternInPlace().
15651            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15652#ifdef INTERNED_STATS
15653            mortal_size += PyUnicode_GET_LENGTH(s);
15654#endif
15655            break;
15656        case SSTATE_NOT_INTERNED:
15657            /* fall through */
15658        default:
15659            Py_UNREACHABLE();
15660        }
15661        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15662    }
15663#ifdef INTERNED_STATS
15664    fprintf(stderr,
15665            "total size of all interned strings: %zd/%zd mortal/immortal\n",
15666            mortal_size, immortal_size);
15667#endif
15668
15669    PyDict_Clear(interned);
15670    Py_CLEAR(interned);
15671}
15672
15673
15674/********************* Unicode Iterator **************************/
15675
/* Iterator state shared by the two str iterator types
   (PyUnicodeIter_Type and _PyUnicodeASCIIIter_Type). */
typedef struct {
    PyObject_HEAD
    /* Index of the next character to return. */
    Py_ssize_t it_index;
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15681
/* tp_dealloc of the str iterators: untrack the iterator from the GC,
   release the reference to the iterated string (if still held) and free
   the iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15689
/* tp_traverse of the str iterators: the iterated string is the only
   object reference held by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15696
15697static PyObject *
15698unicodeiter_next(unicodeiterobject *it)
15699{
15700    PyObject *seq;
15701
15702    assert(it != NULL);
15703    seq = it->it_seq;
15704    if (seq == NULL)
15705        return NULL;
15706    assert(_PyUnicode_CHECK(seq));
15707
15708    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15709        int kind = PyUnicode_KIND(seq);
15710        const void *data = PyUnicode_DATA(seq);
15711        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15712        it->it_index++;
15713        return unicode_char(chr);
15714    }
15715
15716    it->it_seq = NULL;
15717    Py_DECREF(seq);
15718    return NULL;
15719}
15720
15721static PyObject *
15722unicode_ascii_iter_next(unicodeiterobject *it)
15723{
15724    assert(it != NULL);
15725    PyObject *seq = it->it_seq;
15726    if (seq == NULL) {
15727        return NULL;
15728    }
15729    assert(_PyUnicode_CHECK(seq));
15730    assert(PyUnicode_IS_COMPACT_ASCII(seq));
15731    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15732        const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15733        Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15734                                              data, it->it_index);
15735        it->it_index++;
15736        PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15737        return Py_NewRef(item);
15738    }
15739    it->it_seq = NULL;
15740    Py_DECREF(seq);
15741    return NULL;
15742}
15743
15744static PyObject *
15745unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15746{
15747    Py_ssize_t len = 0;
15748    if (it->it_seq)
15749        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15750    return PyLong_FromSsize_t(len);
15751}
15752
15753PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15754
15755static PyObject *
15756unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15757{
15758    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15759
15760    /* _PyEval_GetBuiltin can invoke arbitrary code,
15761     * call must be before access of iterator pointers.
15762     * see issue #101765 */
15763
15764    if (it->it_seq != NULL) {
15765        return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15766    } else {
15767        PyObject *u = (PyObject *)_PyUnicode_New(0);
15768        if (u == NULL) {
15769            Py_XDECREF(iter);
15770            return NULL;
15771        }
15772        return Py_BuildValue("N(N)", iter, u);
15773    }
15774}
15775
15776PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15777
15778static PyObject *
15779unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15780{
15781    Py_ssize_t index = PyLong_AsSsize_t(state);
15782    if (index == -1 && PyErr_Occurred())
15783        return NULL;
15784    if (it->it_seq != NULL) {
15785        if (index < 0)
15786            index = 0;
15787        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15788            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15789        it->it_index = index;
15790    }
15791    Py_RETURN_NONE;
15792}
15793
15794PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15795
/* Method table shared by PyUnicodeIter_Type and _PyUnicodeASCIIIter_Type:
   length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15805
/* Type of the generic str iterator, created by unicode_iter() for strings
   that are not compact ASCII.  The initializer is positional; each entry
   is labelled with the slot it fills. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_vectorcall_offset */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_as_async */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,
};
15838
/* Type of the iterator created by unicode_iter() for compact ASCII
   strings.  It uses the same object layout, deallocator, traversal
   function and methods as PyUnicodeIter_Type; only tp_iternext differs. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "str_ascii_iterator",
    .tp_basicsize = sizeof(unicodeiterobject),
    .tp_dealloc = (destructor)unicodeiter_dealloc,
    .tp_getattro = PyObject_GenericGetAttr,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
    .tp_traverse = (traverseproc)unicodeiter_traverse,
    .tp_iter = PyObject_SelfIter,
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
    .tp_methods = unicodeiter_methods,
};
15851
15852static PyObject *
15853unicode_iter(PyObject *seq)
15854{
15855    unicodeiterobject *it;
15856
15857    if (!PyUnicode_Check(seq)) {
15858        PyErr_BadInternalCall();
15859        return NULL;
15860    }
15861    if (PyUnicode_READY(seq) == -1)
15862        return NULL;
15863    if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15864        it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15865    }
15866    else {
15867        it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15868    }
15869    if (it == NULL)
15870        return NULL;
15871    it->it_index = 0;
15872    Py_INCREF(seq);
15873    it->it_seq = seq;
15874    _PyObject_GC_TRACK(it);
15875    return (PyObject *)it;
15876}
15877
15878static int
15879encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15880{
15881    int res;
15882    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15883    if (res == -2) {
15884        PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15885        return -1;
15886    }
15887    if (res < 0) {
15888        PyErr_NoMemory();
15889        return -1;
15890    }
15891    return 0;
15892}
15893
15894
15895static int
15896config_get_codec_name(wchar_t **config_encoding)
15897{
15898    char *encoding;
15899    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15900        return -1;
15901    }
15902
15903    PyObject *name_obj = NULL;
15904    PyObject *codec = _PyCodec_Lookup(encoding);
15905    PyMem_RawFree(encoding);
15906
15907    if (!codec)
15908        goto error;
15909
15910    name_obj = PyObject_GetAttrString(codec, "name");
15911    Py_CLEAR(codec);
15912    if (!name_obj) {
15913        goto error;
15914    }
15915
15916    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15917    Py_DECREF(name_obj);
15918    if (wname == NULL) {
15919        goto error;
15920    }
15921
15922    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15923    if (raw_wname == NULL) {
15924        PyMem_Free(wname);
15925        PyErr_NoMemory();
15926        goto error;
15927    }
15928
15929    PyMem_RawFree(*config_encoding);
15930    *config_encoding = raw_wname;
15931
15932    PyMem_Free(wname);
15933    return 0;
15934
15935error:
15936    Py_XDECREF(codec);
15937    Py_XDECREF(name_obj);
15938    return -1;
15939}
15940
15941
15942static PyStatus
15943init_stdio_encoding(PyInterpreterState *interp)
15944{
15945    /* Update the stdio encoding to the normalized Python codec name. */
15946    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15947    if (config_get_codec_name(&config->stdio_encoding) < 0) {
15948        return _PyStatus_ERR("failed to get the Python codec name "
15949                             "of the stdio encoding");
15950    }
15951    return _PyStatus_OK();
15952}
15953
15954
15955static int
15956init_fs_codec(PyInterpreterState *interp)
15957{
15958    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15959
15960    _Py_error_handler error_handler;
15961    error_handler = get_error_handler_wide(config->filesystem_errors);
15962    if (error_handler == _Py_ERROR_UNKNOWN) {
15963        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15964        return -1;
15965    }
15966
15967    char *encoding, *errors;
15968    if (encode_wstr_utf8(config->filesystem_encoding,
15969                         &encoding,
15970                         "filesystem_encoding") < 0) {
15971        return -1;
15972    }
15973
15974    if (encode_wstr_utf8(config->filesystem_errors,
15975                         &errors,
15976                         "filesystem_errors") < 0) {
15977        PyMem_RawFree(encoding);
15978        return -1;
15979    }
15980
15981    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15982    PyMem_RawFree(fs_codec->encoding);
15983    fs_codec->encoding = encoding;
15984    /* encoding has been normalized by init_fs_encoding() */
15985    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15986    PyMem_RawFree(fs_codec->errors);
15987    fs_codec->errors = errors;
15988    fs_codec->error_handler = error_handler;
15989
15990#ifdef _Py_FORCE_UTF8_FS_ENCODING
15991    assert(fs_codec->utf8 == 1);
15992#endif
15993
15994    /* At this point, PyUnicode_EncodeFSDefault() and
15995       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15996       the C implementation of the filesystem encoding. */
15997
15998    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15999       global configuration variables. */
16000    if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16001                                  fs_codec->errors) < 0) {
16002        PyErr_NoMemory();
16003        return -1;
16004    }
16005    return 0;
16006}
16007
16008
16009static PyStatus
16010init_fs_encoding(PyThreadState *tstate)
16011{
16012    PyInterpreterState *interp = tstate->interp;
16013
16014    /* Update the filesystem encoding to the normalized Python codec name.
16015       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16016       (Python codec name). */
16017    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16018    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16019        _Py_DumpPathConfig(tstate);
16020        return _PyStatus_ERR("failed to get the Python codec "
16021                             "of the filesystem encoding");
16022    }
16023
16024    if (init_fs_codec(interp) < 0) {
16025        return _PyStatus_ERR("cannot initialize filesystem codec");
16026    }
16027    return _PyStatus_OK();
16028}
16029
16030
16031PyStatus
16032_PyUnicode_InitEncodings(PyThreadState *tstate)
16033{
16034    PyStatus status = init_fs_encoding(tstate);
16035    if (_PyStatus_EXCEPTION(status)) {
16036        return status;
16037    }
16038
16039    return init_stdio_encoding(tstate->interp);
16040}
16041
16042
16043static void
16044_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16045{
16046    PyMem_RawFree(fs_codec->encoding);
16047    fs_codec->encoding = NULL;
16048    fs_codec->utf8 = 0;
16049    PyMem_RawFree(fs_codec->errors);
16050    fs_codec->errors = NULL;
16051    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16052}
16053
16054
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec(), or -1 with MemoryError set when
   the new strings cannot be allocated (the config is left untouched in
   that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *cfg = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before modifying the config, so a
       failure cannot leave it half-updated. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    PyMem_RawFree(cfg->filesystem_encoding);
    cfg->filesystem_encoding = new_encoding;

    PyMem_RawFree(cfg->filesystem_errors);
    cfg->filesystem_errors = new_errors;

    /* Propagate the new configuration to the interpreter's codec state. */
    int rc = init_fs_codec(interp);
    return rc;
}
#endif
16080
16081
#ifdef Py_DEBUG
/* Debug-build helper: return 1 when `interned` is NULL, 0 otherwise.
   A NULL `interned` is what this file treats as the sign that unicode is
   being finalized. */
static inline int
unicode_is_finalizing(void)
{
    if (interned == NULL) {
        return 1;
    }
    return 0;
}
#endif
16089
16090
16091void
16092_PyUnicode_FiniTypes(PyInterpreterState *interp)
16093{
16094    if (!_Py_IsMainInterpreter(interp)) {
16095        return;
16096    }
16097
16098    _PyStaticType_Dealloc(&EncodingMapType);
16099    _PyStaticType_Dealloc(&PyFieldNameIter_Type);
16100    _PyStaticType_Dealloc(&PyFormatterIter_Type);
16101}
16102
16103
16104static void unicode_static_dealloc(PyObject *op)
16105{
16106    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
16107
16108    assert(ascii->state.compact);
16109
16110    if (ascii->state.ascii) {
16111        if (ascii->wstr) {
16112            PyObject_Free(ascii->wstr);
16113            ascii->wstr = NULL;
16114        }
16115    }
16116    else {
16117        PyCompactUnicodeObject* compact = (PyCompactUnicodeObject*)op;
16118        void* data = (void*)(compact + 1);
16119        if (ascii->wstr && ascii->wstr != data) {
16120            PyObject_Free(ascii->wstr);
16121            ascii->wstr = NULL;
16122            compact->wstr_length = 0;
16123        }
16124        if (compact->utf8) {
16125            PyObject_Free(compact->utf8);
16126            compact->utf8 = NULL;
16127            compact->utf8_length = 0;
16128        }
16129    }
16130}
16131
16132
16133void
16134_PyUnicode_Fini(PyInterpreterState *interp)
16135{
16136    struct _Py_unicode_state *state = &interp->unicode;
16137
16138    if (_Py_IsMainInterpreter(interp)) {
16139        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16140        assert(interned == NULL);
16141        // bpo-47182: force a unicodedata CAPI capsule re-import on
16142        // subsequent initialization of main interpreter.
16143        ucnhash_capi = NULL;
16144    }
16145
16146    _PyUnicode_FiniEncodings(&state->fs_codec);
16147
16148    unicode_clear_identifiers(state);
16149
16150    // Clear the single character singletons
16151    for (int i = 0; i < 128; i++) {
16152        unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
16153    }
16154    for (int i = 0; i < 128; i++) {
16155        unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
16156    }
16157}
16158
16159
/* Non-static entry point for unicode_static_dealloc(): release the
   auxiliary buffers attached to the unicode object `op`.  All the work is
   delegated to unicode_static_dealloc(). */
void
_PyStaticUnicode_Dealloc(PyObject *op)
{
    unicode_static_dealloc(op);
}
16165
16166
16167/* A _string module, to export formatter_parser and formatter_field_name_split
16168   to the string.Formatter class implemented in Python. */
16169
16170static PyMethodDef _string_methods[] = {
16171    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16172     METH_O, PyDoc_STR("split the argument as a field name")},
16173    {"formatter_parser", (PyCFunction) formatter_parser,
16174     METH_O, PyDoc_STR("parse the argument as a format string")},
16175    {NULL, NULL}
16176};
16177
16178static struct PyModuleDef _string_module = {
16179    PyModuleDef_HEAD_INIT,
16180    .m_name = "_string",
16181    .m_doc = PyDoc_STR("string helper module"),
16182    .m_size = 0,
16183    .m_methods = _string_methods,
16184};
16185
16186PyMODINIT_FUNC
16187PyInit__string(void)
16188{
16189    return PyModuleDef_Init(&_string_module);
16190}
16191
16192
16193#ifdef __cplusplus
16194}
16195#endif
16196