1#ifndef Py_CPYTHON_UNICODEOBJECT_H
2#  error "this header file must not be included directly"
3#endif
4
5/* Py_UNICODE was the native Unicode storage format (code unit) used by
6   Python and represents a single Unicode element in the Unicode type.
7   With PEP 393, Py_UNICODE is deprecated and replaced with a
8   typedef to wchar_t. */
9#define PY_UNICODE_TYPE wchar_t
10/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
11
12/* --- Internal Unicode Operations ---------------------------------------- */
13
14#ifndef USE_UNICODE_WCHAR_CACHE
15#  define USE_UNICODE_WCHAR_CACHE 1
16#endif /* USE_UNICODE_WCHAR_CACHE */
17
/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

   The table is indexed with the same Py_UCS4-converted value that is
   range-checked, so the subscript is always in 0..127 even if ch has a
   type wider than Py_UCS4.

   Note: ch is expanded more than once; do not pass an expression with
   side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(Py_UCS4)(ch)] : _PyUnicode_IsWhitespace(ch))
26
/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, decimal, digit or numeric.  The checks are
   tried in that order and stop at the first one that succeeds; ch is
   expanded once per check. */
#define Py_UNICODE_ISALNUM(ch)      \
    (Py_UNICODE_ISALPHA(ch)         \
     || Py_UNICODE_ISDECIMAL(ch)    \
     || Py_UNICODE_ISDIGIT(ch)      \
     || Py_UNICODE_ISNUMERIC(ch))
52
/* Surrogate helpers.  The range tests expand ch twice, so do not pass an
   expression with side effects. */

/* Any surrogate: U+D800..U+DFFF. */
#define Py_UNICODE_IS_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDFFF)
/* High (leading) surrogate: U+D800..U+DBFF. */
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDBFF)
/* Low (trailing) surrogate: U+DC00..U+DFFF. */
#define Py_UNICODE_IS_LOW_SURROGATE(ch) ((ch) >= 0xDC00 && (ch) <= 0xDFFF)

/* Combine a high and a low surrogate into a single Py_UCS4 value: the low
   10 bits of each are concatenated and offset by 0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)       \
    (0x10000 +                                      \
     ((((Py_UCS4)(high) & 0x03FF) << 10) |          \
      ((Py_UCS4)(low) & 0x03FF)))

/* High surrogate for ch: its top 10 bits added to D800 (after removing the
   0x10000 offset, pre-shifted by 10). */
#define Py_UNICODE_HIGH_SURROGATE(ch) ((0xD800 - (0x10000 >> 10)) + ((ch) >> 10))
/* Low surrogate for ch: its bottom 10 bits added to DC00. */
#define Py_UNICODE_LOW_SURROGATE(ch) (((ch) & 0x3FF) + 0xDC00)
65
66/* --- Unicode Type ------------------------------------------------------- */
67
68/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
69   structure. state.ascii and state.compact are set, and the data
70   immediately follow the structure. utf8_length and wstr_length can be found
71   in the length field; the utf8 pointer is equal to the data pointer. */
72typedef struct {
73    /* There are 4 forms of Unicode strings:
74
75       - compact ascii:
76
77         * structure = PyASCIIObject
78         * test: PyUnicode_IS_COMPACT_ASCII(op)
79         * kind = PyUnicode_1BYTE_KIND
80         * compact = 1
81         * ascii = 1
82         * ready = 1
83         * (length is the length of the utf8 and wstr strings)
84         * (data starts just after the structure)
85         * (since ASCII is decoded from UTF-8, the utf8 string are the data)
86
87       - compact:
88
89         * structure = PyCompactUnicodeObject
90         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
91         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
92           PyUnicode_4BYTE_KIND
93         * compact = 1
94         * ready = 1
95         * ascii = 0
96         * utf8 is not shared with data
97         * utf8_length = 0 if utf8 is NULL
98         * wstr is shared with data and wstr_length=length
99           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
100           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
101         * wstr_length = 0 if wstr is NULL
102         * (data starts just after the structure)
103
104       - legacy string, not ready:
105
106         * structure = PyUnicodeObject
107         * test: kind == PyUnicode_WCHAR_KIND
108         * length = 0 (use wstr_length)
109         * hash = -1
110         * kind = PyUnicode_WCHAR_KIND
111         * compact = 0
112         * ascii = 0
113         * ready = 0
114         * interned = SSTATE_NOT_INTERNED
115         * wstr is not NULL
116         * data.any is NULL
117         * utf8 is NULL
118         * utf8_length = 0
119
120       - legacy string, ready:
121
122         * structure = PyUnicodeObject structure
123         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
124         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
125           PyUnicode_4BYTE_KIND
126         * compact = 0
127         * ready = 1
128         * data.any is not NULL
129         * utf8 is shared and utf8_length = length with data.any if ascii = 1
130         * utf8_length = 0 if utf8 is NULL
131         * wstr is shared with data.any and wstr_length = length
132           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
133           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
134         * wstr_length = 0 if wstr is NULL
135
136       Compact strings use only one memory block (structure + characters),
137       whereas legacy strings use one block for the structure and one block
138       for characters.
139
140       Legacy strings are created by PyUnicode_FromUnicode() and
141       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
142       when PyUnicode_READY() is called.
143
144       See also _PyUnicode_CheckConsistency().
145    */
146    PyObject_HEAD
147    Py_ssize_t length;          /* Number of code points in the string */
148    Py_hash_t hash;             /* Hash value; -1 if not set */
149    struct {
150        /*
151           SSTATE_NOT_INTERNED (0)
152           SSTATE_INTERNED_MORTAL (1)
153           SSTATE_INTERNED_IMMORTAL (2)
154
155           If interned != SSTATE_NOT_INTERNED, the two references from the
156           dictionary to this object are *not* counted in ob_refcnt.
157         */
158        unsigned int interned:2;
159        /* Character size:
160
161           - PyUnicode_WCHAR_KIND (0):
162
163             * character type = wchar_t (16 or 32 bits, depending on the
164               platform)
165
166           - PyUnicode_1BYTE_KIND (1):
167
168             * character type = Py_UCS1 (8 bits, unsigned)
169             * all characters are in the range U+0000-U+00FF (latin1)
170             * if ascii is set, all characters are in the range U+0000-U+007F
171               (ASCII), otherwise at least one character is in the range
172               U+0080-U+00FF
173
174           - PyUnicode_2BYTE_KIND (2):
175
176             * character type = Py_UCS2 (16 bits, unsigned)
177             * all characters are in the range U+0000-U+FFFF (BMP)
178             * at least one character is in the range U+0100-U+FFFF
179
180           - PyUnicode_4BYTE_KIND (4):
181
182             * character type = Py_UCS4 (32 bits, unsigned)
183             * all characters are in the range U+0000-U+10FFFF
184             * at least one character is in the range U+10000-U+10FFFF
185         */
186        unsigned int kind:3;
187        /* Compact is with respect to the allocation scheme. Compact unicode
188           objects only require one memory block while non-compact objects use
189           one block for the PyUnicodeObject struct and another for its data
190           buffer. */
191        unsigned int compact:1;
192        /* The string only contains characters in the range U+0000-U+007F (ASCII)
193           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
194           set, use the PyASCIIObject structure. */
195        unsigned int ascii:1;
196        /* The ready flag indicates whether the object layout is initialized
197           completely. This means that this is either a compact object, or
198           the data pointer is filled out. The bit is redundant, and helps
199           to minimize the test in PyUnicode_IS_READY(). */
200        unsigned int ready:1;
201        /* Padding to ensure that PyUnicode_DATA() is always aligned to
202           4 bytes (see issue #19537 on m68k). */
203        unsigned int :24;
204    } state;
205    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
206} PyASCIIObject;
207
208/* Non-ASCII strings allocated through PyUnicode_New use the
209   PyCompactUnicodeObject structure. state.compact is set, and the data
210   immediately follow the structure. */
211typedef struct {
212    PyASCIIObject _base;
213    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
214                                 * terminating \0. */
215    char *utf8;                 /* UTF-8 representation (null-terminated) */
216    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
217                                 * surrogates count as two code points. */
218} PyCompactUnicodeObject;
219
220/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
221   PyUnicodeObject structure. The actual string data is initially in the wstr
222   block, and copied into the data block using _PyUnicode_Ready. */
223typedef struct {
224    PyCompactUnicodeObject _base;
225    union {
226        void *any;
227        Py_UCS1 *latin1;
228        Py_UCS2 *ucs2;
229        Py_UCS4 *ucs4;
230    } data;                     /* Canonical, smallest-form Unicode buffer */
231} PyUnicodeObject;
232
233PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
234    PyObject *op,
235    int check_content);
236
237
/* Checked casts to the three Unicode layouts: each asserts
   PyUnicode_Check(op), then converts op with _Py_CAST.  op is expanded
   twice when assertions are enabled. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyUnicodeObject*, (op)))
247
248
249/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
250
251/* Values for PyASCIIObject.state: */
252
253/* Interning state. */
254#define SSTATE_NOT_INTERNED 0
255#define SSTATE_INTERNED_MORTAL 1
256#define SSTATE_INTERNED_IMMORTAL 2
257
258/* Use only if you know it's a string */
259static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
260    return _PyASCIIObject_CAST(op)->state.interned;
261}
262#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
263#  define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
264#endif
265
266/* Fast check to determine whether an object is ready. Equivalent to:
267   PyUnicode_IS_COMPACT(op) || _PyUnicodeObject_CAST(op)->data.any */
268static inline unsigned int PyUnicode_IS_READY(PyObject *op) {
269    return _PyASCIIObject_CAST(op)->state.ready;
270}
271#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
272#  define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
273#endif
274
275/* Return true if the string contains only ASCII characters, or 0 if not. The
276   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
277   ready. */
278static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
279    assert(PyUnicode_IS_READY(op));
280    return _PyASCIIObject_CAST(op)->state.ascii;
281}
282#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
283#  define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
284#endif
285
286/* Return true if the string is compact or 0 if not.
287   No type checks or Ready calls are performed. */
288static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
289    return _PyASCIIObject_CAST(op)->state.compact;
290}
291#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
292#  define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
293#endif
294
295/* Return true if the string is a compact ASCII string (use PyASCIIObject
296   structure), or 0 if not.  No type checks or Ready calls are performed. */
297static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
298    return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
299}
300#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
301#  define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
302#endif
303
enum PyUnicode_Kind {
    /* The string only has its wstr (wchar_t) representation.  This is only
       possible when the string was created with a legacy API and
       _PyUnicode_Ready() has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    /* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
314
/* Return the state.kind field of op: one of the PyUnicode_*_KIND values
   defined above.  Asserts that op is ready; op is expanded twice when
   assertions are enabled. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_IS_READY(op)), _PyASCIIObject_CAST(op)->state.kind)
319
320/* Return a void pointer to the raw unicode buffer. */
321static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
322    if (PyUnicode_IS_ASCII(op)) {
323        return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
324    }
325    return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
326}
327
328static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
329    void *data;
330    assert(!PyUnicode_IS_COMPACT(op));
331    data = _PyUnicodeObject_CAST(op)->data.any;
332    assert(data != NULL);
333    return data;
334}
335
336static inline void* PyUnicode_DATA(PyObject *op) {
337    if (PyUnicode_IS_COMPACT(op)) {
338        return _PyUnicode_COMPACT_DATA(op);
339    }
340    return _PyUnicode_NONCOMPACT_DATA(op);
341}
342#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
343#  define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
344#endif
345
/* Return PyUnicode_DATA(op) cast to Py_UCS1*, Py_UCS2* or Py_UCS4* for
   direct character access.
   No checks are performed; use PyUnicode_KIND() first to make sure the
   chosen width matches the string. */

#define PyUnicode_1BYTE_DATA(op)  _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op)  _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op)  _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
354
355/* Returns the length of the unicode string. The caller has to make sure that
356   the string has it's canonical representation set before calling
357   this function.  Call PyUnicode_(FAST_)Ready to ensure that. */
358static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
359    assert(PyUnicode_IS_READY(op));
360    return _PyASCIIObject_CAST(op)->length;
361}
362#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
363#  define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
364#endif
365
366/* Write into the canonical representation, this function does not do any sanity
367   checks and is intended for usage in loops.  The caller should cache the
368   kind and data pointers obtained from other function calls.
369   index is the index in the string (starts at 0) and value is the new
370   code point value which should be written to that location. */
371static inline void PyUnicode_WRITE(int kind, void *data,
372                                   Py_ssize_t index, Py_UCS4 value)
373{
374    if (kind == PyUnicode_1BYTE_KIND) {
375        assert(value <= 0xffU);
376        _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
377    }
378    else if (kind == PyUnicode_2BYTE_KIND) {
379        assert(value <= 0xffffU);
380        _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
381    }
382    else {
383        assert(kind == PyUnicode_4BYTE_KIND);
384        assert(value <= 0x10ffffU);
385        _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
386    }
387}
388#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
389#define PyUnicode_WRITE(kind, data, index, value) \
390    PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
391                    (index), _Py_STATIC_CAST(Py_UCS4, value))
392#endif
393
394/* Read a code point from the string's canonical representation.  No checks
395   or ready calls are performed. */
396static inline Py_UCS4 PyUnicode_READ(int kind,
397                                     const void *data, Py_ssize_t index)
398{
399    if (kind == PyUnicode_1BYTE_KIND) {
400        return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
401    }
402    if (kind == PyUnicode_2BYTE_KIND) {
403        return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
404    }
405    assert(kind == PyUnicode_4BYTE_KIND);
406    return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
407}
408#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
409#define PyUnicode_READ(kind, data, index) \
410    PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
411                   _Py_STATIC_CAST(const void*, data), \
412                   (index))
413#endif
414
415/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
416   calls PyUnicode_KIND() and might call it twice.  For single reads, use
417   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
418   cache kind and use PyUnicode_READ instead. */
419static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
420{
421    int kind;
422    assert(PyUnicode_IS_READY(unicode));
423    kind = PyUnicode_KIND(unicode);
424    if (kind == PyUnicode_1BYTE_KIND) {
425        return PyUnicode_1BYTE_DATA(unicode)[index];
426    }
427    if (kind == PyUnicode_2BYTE_KIND) {
428        return PyUnicode_2BYTE_DATA(unicode)[index];
429    }
430    assert(kind == PyUnicode_4BYTE_KIND);
431    return PyUnicode_4BYTE_DATA(unicode)[index];
432}
433#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
434#  define PyUnicode_READ_CHAR(unicode, index) \
435       PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
436#endif
437
438/* Return a maximum character value which is suitable for creating another
439   string based on op.  This is always an approximation but more efficient
440   than iterating over the string. */
441static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
442{
443    int kind;
444
445    assert(PyUnicode_IS_READY(op));
446    if (PyUnicode_IS_ASCII(op)) {
447        return 0x7fU;
448    }
449
450    kind = PyUnicode_KIND(op);
451    if (kind == PyUnicode_1BYTE_KIND) {
452       return 0xffU;
453    }
454    if (kind == PyUnicode_2BYTE_KIND) {
455        return 0xffffU;
456    }
457    assert(kind == PyUnicode_4BYTE_KIND);
458    return 0x10ffffU;
459}
460#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
461#  define PyUnicode_MAX_CHAR_VALUE(op) \
462       PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
463#endif
464
465/* === Public API ========================================================= */
466
467/* --- Plain Py_UNICODE --------------------------------------------------- */
468
469/* With PEP 393, this is the recommended way to allocate a new unicode object.
470   This function will allocate the object and its buffer in a single memory
471   block.  Objects created using this function are not resizable. */
472PyAPI_FUNC(PyObject*) PyUnicode_New(
473    Py_ssize_t size,            /* Number of code points in the new string */
474    Py_UCS4 maxchar             /* maximum code point value in the string */
475    );
476
477/* Initializes the canonical string representation from the deprecated
478   wstr/Py_UNICODE representation. This function is used to convert Unicode
479   objects which were created using the old API to the new flexible format
480   introduced with PEP 393.
481
482   Don't call this function directly, use the public PyUnicode_READY() function
483   instead. */
484PyAPI_FUNC(int) _PyUnicode_Ready(
485    PyObject *unicode           /* Unicode object */
486    );
487
488/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
489   case.  If the canonical representation is not yet set, it will still call
490   _PyUnicode_Ready().
491   Returns 0 on success and -1 on errors. */
492static inline int PyUnicode_READY(PyObject *op)
493{
494    if (PyUnicode_IS_READY(op)) {
495        return 0;
496    }
497    return _PyUnicode_Ready(op);
498}
499#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
500#  define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
501#endif
502
503/* Get a copy of a Unicode string. */
504PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
505    PyObject *unicode
506    );
507
/* Copy characters from one unicode object into another. This function performs
   character conversion when necessary and falls back to memcpy() if possible.
510
511   Fail if to is too small (smaller than *how_many* or smaller than
512   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
513   kind(to), or if *to* has more than 1 reference.
514
   Return the number of written characters, or return -1 and raise an exception
   on error.
517
518   Pseudo-code:
519
520       how_many = min(how_many, len(from) - from_start)
521       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
522       return how_many
523
524   Note: The function doesn't write a terminating null character.
525   */
526PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
527    PyObject *to,
528    Py_ssize_t to_start,
529    PyObject *from,
530    Py_ssize_t from_start,
531    Py_ssize_t how_many
532    );
533
534/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
535   may crash if parameters are invalid (e.g. if the output string
536   is too short). */
537PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
538    PyObject *to,
539    Py_ssize_t to_start,
540    PyObject *from,
541    Py_ssize_t from_start,
542    Py_ssize_t how_many
543    );
544
545/* Fill a string with a character: write fill_char into
546   unicode[start:start+length].
547
548   Fail if fill_char is bigger than the string maximum character, or if the
549   string has more than 1 reference.
550
   Return the number of written characters, or return -1 and raise an exception
   on error. */
553PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
554    PyObject *unicode,
555    Py_ssize_t start,
556    Py_ssize_t length,
557    Py_UCS4 fill_char
558    );
559
560/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
561   if parameters are invalid (e.g. if length is longer than the string). */
562PyAPI_FUNC(void) _PyUnicode_FastFill(
563    PyObject *unicode,
564    Py_ssize_t start,
565    Py_ssize_t length,
566    Py_UCS4 fill_char
567    );
568
569/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
570   Scan the string to find the maximum character. */
571PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
572    int kind,
573    const void *buffer,
574    Py_ssize_t size);
575
576/* Create a new string from a buffer of ASCII characters.
577   WARNING: Don't check if the string contains any non-ASCII character. */
578PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
579    const char *buffer,
580    Py_ssize_t size);
581
582/* Compute the maximum character of the substring unicode[start:end].
583   Return 127 for an empty string. */
584PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
585    PyObject *unicode,
586    Py_ssize_t start,
587    Py_ssize_t end);
588
589/* --- Legacy deprecated API ---------------------------------------------- */
590
591/* Create a Unicode Object from the Py_UNICODE buffer u of the given
592   size.
593
594   u may be NULL which causes the contents to be undefined. It is the
595   user's responsibility to fill in the needed data afterwards. Note
596   that modifying the Unicode object contents after construction is
597   only allowed if u was set to NULL.
598
599   The buffer is copied into the new object. */
600Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
601    const Py_UNICODE *u,        /* Unicode buffer */
602    Py_ssize_t size             /* size of buffer */
603    );
604
605/* Return a read-only pointer to the Unicode object's internal
606   Py_UNICODE buffer.
607   If the wchar_t/Py_UNICODE representation is not yet available, this
608   function will calculate it. */
609Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
610    PyObject *unicode           /* Unicode object */
611    );
612
613/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
614   contains null characters. */
615PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
616    PyObject *unicode           /* Unicode object */
617    );
618
619/* Return a read-only pointer to the Unicode object's internal
620   Py_UNICODE buffer and save the length at size.
621   If the wchar_t/Py_UNICODE representation is not yet available, this
622   function will calculate it. */
623
624Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
625    PyObject *unicode,          /* Unicode object */
626    Py_ssize_t *size            /* location where to save the length */
627    );
628
629
630/* Fast access macros */
631
632Py_DEPRECATED(3.3)
633static inline Py_ssize_t PyUnicode_WSTR_LENGTH(PyObject *op)
634{
635    if (PyUnicode_IS_COMPACT_ASCII(op)) {
636        return _PyASCIIObject_CAST(op)->length;
637    }
638    else {
639        return _PyCompactUnicodeObject_CAST(op)->wstr_length;
640    }
641}
642#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
643#  define PyUnicode_WSTR_LENGTH(op) PyUnicode_WSTR_LENGTH(_PyObject_CAST(op))
644#endif
645
646/* Returns the deprecated Py_UNICODE representation's size in code units
647   (this includes surrogate pairs as 2 units).
648   If the Py_UNICODE representation is not available, it will be computed
649   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */
650
651Py_DEPRECATED(3.3)
652static inline Py_ssize_t PyUnicode_GET_SIZE(PyObject *op)
653{
654    _Py_COMP_DIAG_PUSH
655    _Py_COMP_DIAG_IGNORE_DEPR_DECLS
656    if (_PyASCIIObject_CAST(op)->wstr == _Py_NULL) {
657        (void)PyUnicode_AsUnicode(op);
658        assert(_PyASCIIObject_CAST(op)->wstr != _Py_NULL);
659    }
660    return PyUnicode_WSTR_LENGTH(op);
661    _Py_COMP_DIAG_POP
662}
663#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
664#  define PyUnicode_GET_SIZE(op) PyUnicode_GET_SIZE(_PyObject_CAST(op))
665#endif
666
667Py_DEPRECATED(3.3)
668static inline Py_ssize_t PyUnicode_GET_DATA_SIZE(PyObject *op)
669{
670    _Py_COMP_DIAG_PUSH
671    _Py_COMP_DIAG_IGNORE_DEPR_DECLS
672    return PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE;
673    _Py_COMP_DIAG_POP
674}
675#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
676#  define PyUnicode_GET_DATA_SIZE(op) PyUnicode_GET_DATA_SIZE(_PyObject_CAST(op))
677#endif
678
679/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
680   representation on demand.  Using this macro is very inefficient now,
681   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
682   use PyUnicode_WRITE() and PyUnicode_READ(). */
683
684Py_DEPRECATED(3.3)
685static inline Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *op)
686{
687    wchar_t *wstr = _PyASCIIObject_CAST(op)->wstr;
688    if (wstr != _Py_NULL) {
689        return wstr;
690    }
691
692    _Py_COMP_DIAG_PUSH
693    _Py_COMP_DIAG_IGNORE_DEPR_DECLS
694    return PyUnicode_AsUnicode(op);
695    _Py_COMP_DIAG_POP
696}
697#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
698#  define PyUnicode_AS_UNICODE(op) PyUnicode_AS_UNICODE(_PyObject_CAST(op))
699#endif
700
701Py_DEPRECATED(3.3)
702static inline const char* PyUnicode_AS_DATA(PyObject *op)
703{
704    _Py_COMP_DIAG_PUSH
705    _Py_COMP_DIAG_IGNORE_DEPR_DECLS
706    Py_UNICODE *data = PyUnicode_AS_UNICODE(op);
707    // In C++, casting directly PyUnicode* to const char* is not valid
708    return _Py_STATIC_CAST(const char*, _Py_STATIC_CAST(const void*, data));
709    _Py_COMP_DIAG_POP
710}
711#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
712#  define PyUnicode_AS_DATA(op) PyUnicode_AS_DATA(_PyObject_CAST(op))
713#endif
714
715
716/* --- _PyUnicodeWriter API ----------------------------------------------- */
717
718typedef struct {
719    PyObject *buffer;
720    void *data;
721    enum PyUnicode_Kind kind;
722    Py_UCS4 maxchar;
723    Py_ssize_t size;
724    Py_ssize_t pos;
725
726    /* minimum number of allocated characters (default: 0) */
727    Py_ssize_t min_length;
728
729    /* minimum character (default: 127, ASCII) */
730    Py_UCS4 min_char;
731
732    /* If non-zero, overallocate the buffer (default: 0). */
733    unsigned char overallocate;
734
735    /* If readonly is 1, buffer is a shared string (cannot be modified)
736       and size is set to 0. */
737    unsigned char readonly;
738} _PyUnicodeWriter ;
739
740/* Initialize a Unicode writer.
741 *
742 * By default, the minimum buffer size is 0 character and overallocation is
743 * disabled. Set min_length, min_char and overallocate attributes to control
744 * the allocation of the buffer. */
745PyAPI_FUNC(void)
746_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
747
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Evaluates to 0 without calling anything when the request already fits
   (MAXCHAR <= writer->maxchar and LENGTH <= writer->size - writer->pos) or
   when LENGTH is 0; otherwise defers to _PyUnicodeWriter_PrepareInternal().
   The macro arguments may be expanded more than once.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    ((((MAXCHAR) <= (WRITER)->maxchar                                 \
       && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                 \
      || (LENGTH) == 0)                                               \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))
759
760/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
761   instead. */
762PyAPI_FUNC(int)
763_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
764                                 Py_ssize_t length, Py_UCS4 maxchar);
765
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Evaluates to 0 when the writer's kind is already large enough; otherwise
   defers to _PyUnicodeWriter_PrepareKindInternal().  KIND must not be
   PyUnicode_WCHAR_KIND (asserted).

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     (KIND) > (WRITER)->kind                                          \
     ? _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))         \
     : 0)
776
777/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
778   macro instead. */
779PyAPI_FUNC(int)
780_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
781                                     enum PyUnicode_Kind kind);
782
783/* Append a Unicode character.
784   Return 0 on success, raise an exception and return -1 on error. */
785PyAPI_FUNC(int)
786_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
787    Py_UCS4 ch
788    );
789
790/* Append a Unicode string.
791   Return 0 on success, raise an exception and return -1 on error. */
792PyAPI_FUNC(int)
793_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
794    PyObject *str               /* Unicode string */
795    );
796
797/* Append a substring of a Unicode string.
798   Return 0 on success, raise an exception and return -1 on error. */
799PyAPI_FUNC(int)
800_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
801    PyObject *str,              /* Unicode string */
802    Py_ssize_t start,
803    Py_ssize_t end
804    );
805
806/* Append an ASCII-encoded byte string.
807   Return 0 on success, raise an exception and return -1 on error. */
808PyAPI_FUNC(int)
809_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
810    const char *str,           /* ASCII-encoded byte string */
811    Py_ssize_t len             /* number of bytes, or -1 if unknown */
812    );
813
814/* Append a latin1-encoded byte string.
815   Return 0 on success, raise an exception and return -1 on error. */
816PyAPI_FUNC(int)
817_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
818    const char *str,           /* latin1-encoded byte string */
819    Py_ssize_t len             /* length in bytes */
820    );
821
822/* Get the value of the writer as a Unicode string. Clear the
823   buffer of the writer. Raise an exception and return NULL
824   on error. */
825PyAPI_FUNC(PyObject *)
826_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
827
828/* Deallocate memory of a writer (clear its internal buffer). */
829PyAPI_FUNC(void)
830_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
831
832
833/* Format the object based on the format_spec, as defined in PEP 3101
834   (Advanced String Formatting). */
835PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
836    _PyUnicodeWriter *writer,
837    PyObject *obj,
838    PyObject *format_spec,
839    Py_ssize_t start,
840    Py_ssize_t end);
841
842/* --- Manage the default encoding ---------------------------------------- */
843
844/* Returns a pointer to the default encoding (UTF-8) of the
845   Unicode object unicode.
846
847   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
848   in the unicodeobject.
849
850   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
851   support the previous internal function with the same behaviour.
852
853   Use of this API is DEPRECATED since no size information can be
854   extracted from the returned data.
855*/
856
857PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
858
859#define _PyUnicode_AsString PyUnicode_AsUTF8
860
861/* --- UTF-7 Codecs ------------------------------------------------------- */
862
863PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
864    PyObject *unicode,          /* Unicode object */
865    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
866    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
867    const char *errors          /* error handling */
868    );
869
870/* --- UTF-8 Codecs ------------------------------------------------------- */
871
872PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
873    PyObject *unicode,
874    const char *errors);
875
876/* --- UTF-32 Codecs ------------------------------------------------------ */
877
878PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
879    PyObject *object,           /* Unicode object */
880    const char *errors,         /* error handling */
881    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
882    );
883
884/* --- UTF-16 Codecs ------------------------------------------------------ */
885
886/* Returns a Python string object holding the UTF-16 encoded value of
887   the Unicode data.
888
889   If byteorder is not 0, output is written according to the following
890   byte order:
891
892   byteorder == -1: little endian
893   byteorder == 0:  native byte order (writes a BOM mark)
894   byteorder == 1:  big endian
895
896   If byteorder is 0, the output string will always start with the
897   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
898   prepended.
899*/
900PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
901    PyObject* unicode,          /* Unicode object */
902    const char *errors,         /* error handling */
903    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
904    );
905
906/* --- Unicode-Escape Codecs ---------------------------------------------- */
907
908/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
909PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
910        const char *string,     /* Unicode-Escape encoded string */
911        Py_ssize_t length,      /* size of string */
912        const char *errors,     /* error handling */
913        Py_ssize_t *consumed    /* bytes consumed */
914);
915/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
916   chars. */
917PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
918        const char *string,     /* Unicode-Escape encoded string */
919        Py_ssize_t length,      /* size of string */
920        const char *errors,     /* error handling */
921        Py_ssize_t *consumed,   /* bytes consumed */
922        const char **first_invalid_escape  /* on return, points to first
923                                              invalid escaped char in
924                                              string. */
925);
926
927/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
928
929/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
930PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
931        const char *string,     /* Unicode-Escape encoded string */
932        Py_ssize_t length,      /* size of string */
933        const char *errors,     /* error handling */
934        Py_ssize_t *consumed    /* bytes consumed */
935);
936
937/* --- Latin-1 Codecs ----------------------------------------------------- */
938
939PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
940    PyObject* unicode,
941    const char* errors);
942
943/* --- ASCII Codecs ------------------------------------------------------- */
944
945PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
946    PyObject* unicode,
947    const char* errors);
948
949/* --- Character Map Codecs ----------------------------------------------- */
950
951/* Translate an Unicode object by applying a character mapping table to
952   it and return the resulting Unicode object.
953
954   The mapping table must map Unicode ordinal integers to Unicode strings,
955   Unicode ordinal integers or None (causing deletion of the character).
956
957   Mapping tables may be dictionaries or sequences. Unmapped character
958   ordinals (ones which cause a LookupError) are left untouched and
959   are copied as-is.
960*/
961PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
962    PyObject *unicode,          /* Unicode object */
963    PyObject *mapping,          /* encoding mapping */
964    const char *errors          /* error handling */
965    );
966
967/* --- Decimal Encoder ---------------------------------------------------- */
968
969/* Coverts a Unicode object holding a decimal value to an ASCII string
970   for using in int, float and complex parsers.
971   Transforms code points that have decimal digit property to the
972   corresponding ASCII digit code points.  Transforms spaces to ASCII.
973   Transforms code points starting from the first non-ASCII code point that
974   is neither a decimal digit nor a space to the end into '?'. */
975
976PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
977    PyObject *unicode           /* Unicode object */
978    );
979
980/* --- Methods & Slots ---------------------------------------------------- */
981
982PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
983    PyObject *separator,
984    PyObject *const *items,
985    Py_ssize_t seqlen
986    );
987
988/* Test whether a unicode is equal to ASCII identifier.  Return 1 if true,
989   0 otherwise.  The right argument must be ASCII identifier.
990   Any error occurs inside will be cleared before return. */
991PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
992    PyObject *left,             /* Left string */
993    _Py_Identifier *right       /* Right identifier */
994    );
995
996/* Test whether a unicode is equal to ASCII string.  Return 1 if true,
997   0 otherwise.  The right argument must be ASCII-encoded string.
998   Any error occurs inside will be cleared before return. */
999PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
1000    PyObject *left,
1001    const char *right           /* ASCII-encoded string */
1002    );
1003
1004/* Externally visible for str.strip(unicode) */
1005PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1006    PyObject *self,
1007    int striptype,
1008    PyObject *sepobj
1009    );
1010
1011/* Using explicit passed-in values, insert the thousands grouping
1012   into the string pointed to by buffer.  For the argument descriptions,
1013   see Objects/stringlib/localeutil.h */
1014PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1015    _PyUnicodeWriter *writer,
1016    Py_ssize_t n_buffer,
1017    PyObject *digits,
1018    Py_ssize_t d_pos,
1019    Py_ssize_t n_digits,
1020    Py_ssize_t min_width,
1021    const char *grouping,
1022    PyObject *thousands_sep,
1023    Py_UCS4 *maxchar);
1024
1025/* === Characters Type APIs =============================================== */
1026
1027/* Helper array used by Py_UNICODE_ISSPACE(). */
1028
1029PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1030
1031/* These should not be used directly. Use the Py_UNICODE_IS* and
1032   Py_UNICODE_TO* macros instead.
1033
1034   These APIs are implemented in Objects/unicodectype.c.
1035
1036*/
1037
1038PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1039    Py_UCS4 ch       /* Unicode character */
1040    );
1041
1042PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1043    Py_UCS4 ch       /* Unicode character */
1044    );
1045
1046PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1047    Py_UCS4 ch       /* Unicode character */
1048    );
1049
1050PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1051    Py_UCS4 ch       /* Unicode character */
1052    );
1053
1054PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1055    Py_UCS4 ch       /* Unicode character */
1056    );
1057
1058PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1059    const Py_UCS4 ch         /* Unicode character */
1060    );
1061
1062PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1063    const Py_UCS4 ch         /* Unicode character */
1064    );
1065
1066/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1067    Py_UCS4 ch       /* Unicode character */
1068    );
1069
1070/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1071    Py_UCS4 ch       /* Unicode character */
1072    );
1073
1074Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1075    Py_UCS4 ch       /* Unicode character */
1076    );
1077
1078PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
1079    Py_UCS4 ch,       /* Unicode character */
1080    Py_UCS4 *res
1081    );
1082
1083PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
1084    Py_UCS4 ch,       /* Unicode character */
1085    Py_UCS4 *res
1086    );
1087
1088PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
1089    Py_UCS4 ch,       /* Unicode character */
1090    Py_UCS4 *res
1091    );
1092
1093PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
1094    Py_UCS4 ch,       /* Unicode character */
1095    Py_UCS4 *res
1096    );
1097
1098PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
1099    Py_UCS4 ch         /* Unicode character */
1100    );
1101
1102PyAPI_FUNC(int) _PyUnicode_IsCased(
1103    Py_UCS4 ch         /* Unicode character */
1104    );
1105
1106PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1107    Py_UCS4 ch       /* Unicode character */
1108    );
1109
1110PyAPI_FUNC(int) _PyUnicode_ToDigit(
1111    Py_UCS4 ch       /* Unicode character */
1112    );
1113
1114PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1115    Py_UCS4 ch       /* Unicode character */
1116    );
1117
1118PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1119    Py_UCS4 ch       /* Unicode character */
1120    );
1121
1122PyAPI_FUNC(int) _PyUnicode_IsDigit(
1123    Py_UCS4 ch       /* Unicode character */
1124    );
1125
1126PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1127    Py_UCS4 ch       /* Unicode character */
1128    );
1129
1130PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1131    Py_UCS4 ch       /* Unicode character */
1132    );
1133
1134PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1135    Py_UCS4 ch       /* Unicode character */
1136    );
1137
1138PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
1139
1140/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
1141PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
1142
1143/* Fast equality check when the inputs are known to be exact unicode types
1144   and where the hash values are equal (i.e. a very probable match) */
1145PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
1146
1147/* Equality check. Returns -1 on failure. */
1148PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
1149
1150PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
1151PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
1152
1153PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
1154