1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>               // va_list
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode, so this feature macro is always defined. */
65#define Py_USING_UNICODE
66
/* SIZEOF_WCHAR_T must already be defined when this header is processed;
   otherwise compilation stops with an error. */
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
/* Py_UNICODE_SIZE is an alias for SIZEOF_WCHAR_T. */
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage (Py_UNICODE_SIZE >= 4), set
74   Py_UNICODE_WIDE.  Otherwise, Unicode strings are stored as UCS-2 (with
75   limited support for UTF-16). */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type.  Both defines are commented out
   here, so the checks below only see them when they are defined elsewhere
   (presumably by the build configuration). */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* If the compiler provides a wchar_t type we try to support it
87   through the interface functions PyUnicode_FromWideChar(),
88   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89
/* HAVE_USABLE_WCHAR_T implies HAVE_WCHAR_H: define the latter if it is
   still missing. */
90#ifdef HAVE_USABLE_WCHAR_T
91# ifndef HAVE_WCHAR_H
92#  define HAVE_WCHAR_H
93# endif
94#endif
95
/* Include <wchar.h> only when the platform is known to provide it. */
96#ifdef HAVE_WCHAR_H
97#  include <wchar.h>
98#endif
99
100/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
101   unicode representations: fixed-width unsigned integers of 32, 16 and
   8 bits. */
102typedef uint32_t Py_UCS4;
103typedef uint16_t Py_UCS2;
104typedef uint8_t Py_UCS1;
105
/* Give the declarations that follow C linkage when compiled as C++.  The
   matching closing brace is not in the visible part of this header. */
106#ifdef __cplusplus
107extern "C" {
108#endif
109
110
/* Type objects exported as data.  Judging by the names these are the
   Unicode type and its iterator type; they are defined outside this
   header. */
111PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113
/* PyUnicode_Check(op): tests the type of op for the
   Py_TPFLAGS_UNICODE_SUBCLASS flag via PyType_FastSubclass() -- presumably
   true for PyUnicode_Type and its subtypes.
   PyUnicode_CheckExact(op): tests with Py_IS_TYPE() whether the type of op
   is exactly PyUnicode_Type. */
114#define PyUnicode_Check(op) \
115    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
116#define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
117
118/* --- Constants ---------------------------------------------------------- */
119
120/* This Unicode character will be used as replacement character during
121   decoding if the errors argument is set to "replace". Note: the
122   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123   Unicode 3.0.  The constant is typed as Py_UCS4. */
124
125#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126
127/* === Public API ========================================================= */
128
129/* Create a Unicode object from a buffer of UTF-8 encoded bytes: u points to
   the data and size is the size of the buffer. */
130PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131    const char *u,             /* UTF-8 encoded string */
132    Py_ssize_t size            /* size of buffer */
133    );
134
135/* Like PyUnicode_FromStringAndSize(), but u points to null-terminated
136   UTF-8 encoded bytes.  The size is determined with strlen(). */
137PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138    const char *u              /* UTF-8 encoded string */
139    );
140
141#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Declared only outside the limited API, or when Py_LIMITED_API is at least
   0x03030000.
   NOTE(review): presumably returns the part of str between the indices
   start and end -- confirm the exact index semantics (inclusive/exclusive,
   clamping) against the implementation. */
142PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143    PyObject *str,
144    Py_ssize_t start,
145    Py_ssize_t end);
146#endif
147
148#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149/* Copy the string into the caller-supplied UCS4 buffer, including the null
150   character if copy_null is set. Return NULL and raise an exception on error.
151   Raise a SystemError if the buffer is smaller than the string. Return buffer
   on success.
152
153   buflen is the length of the buffer in (Py_UCS4) characters. */
154PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155    PyObject *unicode,
156    Py_UCS4* buffer,
157    Py_ssize_t buflen,
158    int copy_null);
159
160/* Copy the string into a UCS4 buffer. A new buffer is allocated using
161   PyMem_Malloc; if this fails, NULL is returned with a memory error
162   exception set.
   NOTE(review): since the buffer comes from PyMem_Malloc, the caller
   presumably has to release it with PyMem_Free -- confirm. */
163PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164#endif
165
166#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167/* Get the length of the Unicode object.  Declared only outside the limited
   API, or when Py_LIMITED_API is at least 0x03030000. */
168
169PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170    PyObject *unicode
171);
172#endif
173
174/* Get the number of Py_UNICODE units in the
175   string representation.  Marked as deprecated since 3.3 through
   Py_DEPRECATED. */
176
177Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178    PyObject *unicode           /* Unicode object */
179    );
180
181#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182/* Read the character at position index from the string and return it as a
   Py_UCS4 value.
   NOTE(review): how errors (e.g. an out-of-range index) are reported is
   not visible here -- check the implementation. */
183
184PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185    PyObject *unicode,
186    Py_ssize_t index
187    );
188
189/* Write a character to the string at position index. The string must have
190   been created through PyUnicode_New, must not be shared, and must not have
   been hashed yet.
191
192   Return 0 on success, -1 on error. */
193
194PyAPI_FUNC(int) PyUnicode_WriteChar(
195    PyObject *unicode,
196    Py_ssize_t index,
197    Py_UCS4 character
198    );
199#endif
200
201/* Resize a Unicode object. The length is the number of characters, except
202   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203   is the number of Py_UNICODE characters.
204
205   On success, *unicode is modified to point to the new (resized) object
206   and 0 is returned.
207
208   The function tries to resize the string in place (which is usually faster
209   than allocating a new string and copying characters), or creates a new
   string.
210
211   Error handling is implemented as follows: an exception is set, -1
212   is returned and *unicode is left untouched.
213
214   WARNING: The function doesn't check string content, the result may not be a
215            string in canonical representation. */
216
217PyAPI_FUNC(int) PyUnicode_Resize(
218    PyObject **unicode,         /* Pointer to the Unicode object */
219    Py_ssize_t length           /* New length */
220    );
221
222/* Decode obj to a Unicode object.
223
224   bytes, bytearray and other bytes-like objects are decoded according to the
225   given encoding and error handler. Either encoding or errors can be
226   NULL, in which case the interface uses UTF-8 and "strict" respectively.
227
228   All other objects (including Unicode objects) raise an exception.
229
230   The API returns NULL in case of an error. The caller is responsible
231   for decref'ing the returned object.
232
233*/
234
235PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236    PyObject *obj,              /* Object */
237    const char *encoding,       /* encoding */
238    const char *errors          /* error handling */
239    );
240
241/* Copy an instance of a Unicode subtype to a new true Unicode object if
242   necessary. If obj is already a true Unicode object (not a subtype), return
243   the same object with its refcount *incremented*.
244
245   The API returns NULL in case of an error. The caller is responsible
246   for decref'ing the returned object.
247
248*/
249
250PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251    PyObject *obj      /* Object */
252    );
253
/* Build an object from an ASCII-encoded format string and a va_list of
   arguments.
   NOTE(review): presumably printf-like and returning a new Unicode object;
   the supported conversions are not visible here -- see the
   implementation. */
254PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255    const char *format,   /* ASCII-encoded string  */
256    va_list vargs
257    );
/* Variant of PyUnicode_FromFormatV() that takes a variable number of
   arguments instead of a va_list. */
258PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259    const char *format,   /* ASCII-encoded string  */
260    ...
261    );
262
/* Interning helpers.
   NOTE(review): PyUnicode_InternInPlace() takes a pointer to an object
   pointer, so it presumably may replace *p with the interned object;
   PyUnicode_InternFromString() takes a UTF-8 encoded C string.  Reference
   count behaviour is not visible here -- confirm before relying on it. */
263PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
265    const char *u              /* UTF-8 encoded string */
266    );
267
268// PyUnicode_InternImmortal() is deprecated since Python 3.10
269// and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
270Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
271
272/* --- wchar_t support for platforms which support it --------------------- */
273
/* Everything in this section is declared only when HAVE_WCHAR_H is
   defined. */
274#ifdef HAVE_WCHAR_H
275
276/* Create a Unicode Object from the wchar_t buffer w of the given
277   size.
278
279   The buffer is copied into the new object. */
280
281PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
282    const wchar_t *w,           /* wchar_t buffer */
283    Py_ssize_t size             /* size of buffer */
284    );
285
286/* Copies the Unicode Object contents into the wchar_t buffer w.  At
287   most size wchar_t characters are copied.
288
289   Note that the resulting wchar_t string may or may not be
290   0-terminated.  It is the responsibility of the caller to make sure
291   that the wchar_t string is 0-terminated in case this is required by
292   the application.
293
294   Returns the number of wchar_t characters copied (excluding a
295   possibly trailing 0-termination character) or -1 in case of an
296   error. */
297
298PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
299    PyObject *unicode,          /* Unicode object */
300    wchar_t *w,                 /* wchar_t buffer */
301    Py_ssize_t size             /* size of buffer */
302    );
303
304/* Convert the Unicode object to a wide character string. The output string
305   always ends with a null character. If size is not NULL, write the number of
306   wide characters (excluding the null character) into *size.
307
308   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
309   on success. On error, returns NULL and raises a MemoryError; *size is
310   undefined in that case. */
311
312PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
313    PyObject *unicode,          /* Unicode object */
314    Py_ssize_t *size            /* number of characters of the result */
315    );
316
317#endif
318
319/* --- Unicode ordinals --------------------------------------------------- */
320
321/* Create a Unicode Object from the given Unicode code point ordinal.
322
323   The ordinal must be in range(0x110000), i.e. 0 <= ordinal < 0x110000.
324   A ValueError is raised in case it is not.
325
326*/
327
328PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
329
330/* === Builtin Codecs =====================================================
331
332   Many of these APIs take two arguments, encoding and errors. These
333   parameters have the same semantics as the ones
334   of the builtin str() API.
335
336   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
337
338   Error handling is set by errors which may also be set to NULL
339   meaning to use the default handling defined for the codec. Default
340   error handling for all builtin codecs is "strict" (ValueErrors are
341   raised).
342
343   The codecs all use a similar interface. Only deviations from the
344   generic interface are documented.
345
346*/
347
348/* --- Manage the default encoding ---------------------------------------- */
349
350/* Returns "utf-8".  */
351PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
352
353/* --- Generic Codecs ----------------------------------------------------- */
354
355/* Create a Unicode object by decoding the encoded string s of the
356   given size, using the given encoding and error handling. */
357
358PyAPI_FUNC(PyObject*) PyUnicode_Decode(
359    const char *s,              /* encoded string */
360    Py_ssize_t size,            /* size of buffer */
361    const char *encoding,       /* encoding */
362    const char *errors          /* error handling */
363    );
364
365/* Decode a Unicode object unicode and return the result as Python
366   object.
367
368   This API is DEPRECATED. The only supported standard encoding is rot13.
369   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
370   that decode from str. */
371
372Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
373    PyObject *unicode,          /* Unicode object */
374    const char *encoding,       /* encoding */
375    const char *errors          /* error handling */
376    );
377
378/* Decode a Unicode object unicode and return the result as Unicode
379   object.
380
381   This API is DEPRECATED. The only supported standard encoding is rot13.
382   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
383   that decode from str to str. */
384
385Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
386    PyObject *unicode,          /* Unicode object */
387    const char *encoding,       /* encoding */
388    const char *errors          /* error handling */
389    );
390
391/* Encodes a Unicode object and returns the result as Python
392   object.
393
394   This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
395   since all standard encodings (except rot13) encode str to bytes.
396   Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
397   that encode from str to non-bytes. */
398
399Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
400    PyObject *unicode,          /* Unicode object */
401    const char *encoding,       /* encoding */
402    const char *errors          /* error handling */
403    );
404
405/* Encodes a Unicode object and returns the result as Python string
406   object.
   NOTE(review): "string" here presumably means a bytes object, given that
   the standard encodings encode str to bytes -- confirm. */
407
408PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
409    PyObject *unicode,          /* Unicode object */
410    const char *encoding,       /* encoding */
411    const char *errors          /* error handling */
412    );
413
414/* Encodes a Unicode object and returns the result as Unicode
415   object.
416
417   This API is DEPRECATED.  The only supported standard encoding is rot13.
418   Use PyCodec_Encode() to encode with rot13 and non-standard codecs
419   that encode from str to str. */
420
421Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
422    PyObject *unicode,          /* Unicode object */
423    const char *encoding,       /* encoding */
424    const char *errors          /* error handling */
425    );
426
427/* Build an encoding map from string, a 256 character map.
   NOTE(review): the type and layout of the returned map are not visible
   here -- see the implementation. */
428
429PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
430    PyObject* string            /* 256 character map */
431   );
432
433/* --- UTF-7 Codecs ------------------------------------------------------- */
434
/* Decode length bytes of the UTF-7 encoded buffer string; errors selects
   the error handling. */
435PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
436    const char *string,         /* UTF-7 encoded string */
437    Py_ssize_t length,          /* size of string */
438    const char *errors          /* error handling */
439    );
440
/* Variant of PyUnicode_DecodeUTF7() with an extra consumed argument that
   receives the number of bytes consumed.
   NOTE(review): presumably a non-NULL consumed lets the decoder stop
   before an incomplete trailing sequence instead of reporting an error --
   confirm. */
441PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
442    const char *string,         /* UTF-7 encoded string */
443    Py_ssize_t length,          /* size of string */
444    const char *errors,         /* error handling */
445    Py_ssize_t *consumed        /* bytes consumed */
446    );
447
448/* --- UTF-8 Codecs ------------------------------------------------------- */
449
/* Decode length bytes of the UTF-8 encoded buffer string; errors selects
   the error handling. */
450PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
451    const char *string,         /* UTF-8 encoded string */
452    Py_ssize_t length,          /* size of string */
453    const char *errors          /* error handling */
454    );
455
/* Variant of PyUnicode_DecodeUTF8() with an extra consumed argument that
   receives the number of bytes consumed.
   NOTE(review): the handling of incomplete trailing sequences is not
   visible here -- confirm against the implementation. */
456PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
457    const char *string,         /* UTF-8 encoded string */
458    Py_ssize_t length,          /* size of string */
459    const char *errors,         /* error handling */
460    Py_ssize_t *consumed        /* bytes consumed */
461    );
462
/* Encode the Unicode object unicode using UTF-8 (judging by the name) and
   return the result as a new object. */
463PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
464    PyObject *unicode           /* Unicode object */
465    );
466
467/* Returns a pointer to the default encoding (UTF-8) of the
468   Unicode object unicode and stores the size of the encoded representation,
469   in bytes, in *size.
470
471   In case of an error, *size is not set.
472
473   This function caches the UTF-8 encoded string in the unicodeobject
474   and subsequent calls will return the same string.  The memory is released
475   when the unicodeobject is deallocated.

   Declared only outside the limited API, or when Py_LIMITED_API is at least
   0x030A0000.
476*/
477
478#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
479PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
480    PyObject *unicode,
481    Py_ssize_t *size);
482#endif
483
484/* --- UTF-32 Codecs ------------------------------------------------------ */
485
486/* Decodes length bytes from a UTF-32 encoded buffer string and returns
487   the corresponding Unicode object.
488
489   errors (if non-NULL) defines the error handling. It defaults
490   to "strict".
491
492   If byteorder is non-NULL, the decoder starts decoding using the
493   given byte order:
494
495    *byteorder == -1: little endian
496    *byteorder == 0:  native order
497    *byteorder == 1:  big endian
498
499   In native mode, the first four bytes of the stream are checked for a
500   BOM mark. If found, the BOM mark is analysed, the byte order
501   adjusted and the BOM skipped.  In the other modes, no BOM mark
502   interpretation is done. After completion, *byteorder is set to the
503   current byte order at the end of input data.
504
505   If byteorder is NULL, the codec starts in native order mode.
506
507*/
508
509PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
510    const char *string,         /* UTF-32 encoded string */
511    Py_ssize_t length,          /* size of string */
512    const char *errors,         /* error handling */
513    int *byteorder              /* pointer to byteorder to use
514                                   0=native;-1=LE,1=BE; updated on
515                                   exit */
516    );
517
/* Variant of PyUnicode_DecodeUTF32() with an extra consumed argument that
   receives the number of bytes consumed.
   NOTE(review): the handling of incomplete trailing data is not visible
   here -- confirm against the implementation. */
518PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
519    const char *string,         /* UTF-32 encoded string */
520    Py_ssize_t length,          /* size of string */
521    const char *errors,         /* error handling */
522    int *byteorder,             /* pointer to byteorder to use
523                                   0=native;-1=LE,1=BE; updated on
524                                   exit */
525    Py_ssize_t *consumed        /* bytes consumed */
526    );
527
528/* Returns a Python string using the UTF-32 encoding in native byte
529   order. The string always starts with a BOM mark.
   NOTE(review): "Python string" presumably means a bytes object holding
   the encoded data -- confirm. */
530
531PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
532    PyObject *unicode           /* Unicode object */
533    );
534
535/* Returns a Python string object holding the UTF-32 encoded value of
536   the Unicode data.
537
538   The output is written according to the following
539   byte order:
540
541   byteorder == -1: little endian
542   byteorder == 0:  native byte order (writes a BOM mark)
543   byteorder == 1:  big endian
544
545   If byteorder is 0, the output string will always start with the
546   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
547   prepended.
548
   NOTE(review): no declaration follows this comment in this part of the
   header, so it is unclear which function it documents -- confirm whether
   the comment is stale.

549*/
550
551/* --- UTF-16 Codecs ------------------------------------------------------ */
552
553/* Decodes length bytes from a UTF-16 encoded buffer string and returns
554   the corresponding Unicode object.
555
556   errors (if non-NULL) defines the error handling. It defaults
557   to "strict".
558
559   If byteorder is non-NULL, the decoder starts decoding using the
560   given byte order:
561
562    *byteorder == -1: little endian
563    *byteorder == 0:  native order
564    *byteorder == 1:  big endian
565
566   In native mode, the first two bytes of the stream are checked for a
567   BOM mark. If found, the BOM mark is analysed, the byte order
568   adjusted and the BOM skipped.  In the other modes, no BOM mark
569   interpretation is done. After completion, *byteorder is set to the
570   current byte order at the end of input data.
571
572   If byteorder is NULL, the codec starts in native order mode.
573
574*/
575
576PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
577    const char *string,         /* UTF-16 encoded string */
578    Py_ssize_t length,          /* size of string */
579    const char *errors,         /* error handling */
580    int *byteorder              /* pointer to byteorder to use
581                                   0=native;-1=LE,1=BE; updated on
582                                   exit */
583    );
584
/* Variant of PyUnicode_DecodeUTF16() with an extra consumed argument that
   receives the number of bytes consumed.
   NOTE(review): the handling of incomplete trailing data is not visible
   here -- confirm against the implementation. */
585PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
586    const char *string,         /* UTF-16 encoded string */
587    Py_ssize_t length,          /* size of string */
588    const char *errors,         /* error handling */
589    int *byteorder,             /* pointer to byteorder to use
590                                   0=native;-1=LE,1=BE; updated on
591                                   exit */
592    Py_ssize_t *consumed        /* bytes consumed */
593    );
594
595/* Returns a Python string using the UTF-16 encoding in native byte
596   order. The string always starts with a BOM mark.
   NOTE(review): "Python string" presumably means a bytes object holding
   the encoded data -- confirm. */
597
598PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
599    PyObject *unicode           /* Unicode object */
600    );
601
602/* --- Unicode-Escape Codecs ---------------------------------------------- */
603
/* Decode length bytes of the Unicode-Escape encoded buffer string; errors
   selects the error handling. */
604PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
605    const char *string,         /* Unicode-Escape encoded string */
606    Py_ssize_t length,          /* size of string */
607    const char *errors          /* error handling */
608    );
609
/* Encode the Unicode object unicode with the Unicode-Escape codec (judging
   by the name) and return the result as a new object. */
610PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
611    PyObject *unicode           /* Unicode object */
612    );
613
614/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
615
/* Decode length bytes of the Raw-Unicode-Escape encoded buffer string;
   errors selects the error handling. */
616PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
617    const char *string,         /* Raw-Unicode-Escape encoded string */
618    Py_ssize_t length,          /* size of string */
619    const char *errors          /* error handling */
620    );
621
/* Encode the Unicode object unicode with the Raw-Unicode-Escape codec
   (judging by the name) and return the result as a new object. */
622PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
623    PyObject *unicode           /* Unicode object */
624    );
625
626/* --- Latin-1 Codecs -----------------------------------------------------
627
628   Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
629
/* Decode length bytes of the Latin-1 encoded buffer string; errors selects
   the error handling. */
630PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
631    const char *string,         /* Latin-1 encoded string */
632    Py_ssize_t length,          /* size of string */
633    const char *errors          /* error handling */
634    );
635
/* Encode the Unicode object unicode as Latin-1 (judging by the name) and
   return the result as a new object. */
636PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
637    PyObject *unicode           /* Unicode object */
638    );
639
640/* --- ASCII Codecs -------------------------------------------------------
641
642   Only 7-bit ASCII data is accepted. All other codes generate errors.
643
644*/
645
/* Decode length bytes of the ASCII encoded buffer string; errors selects
   the error handling. */
646PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
647    const char *string,         /* ASCII encoded string */
648    Py_ssize_t length,          /* size of string */
649    const char *errors          /* error handling */
650    );
651
/* Encode the Unicode object unicode as ASCII (judging by the name) and
   return the result as a new object. */
652PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
653    PyObject *unicode           /* Unicode object */
654    );
655
656/* --- Character Map Codecs -----------------------------------------------
657
658   This codec uses mappings to encode and decode characters.
659
660   Decoding mappings must map byte ordinals (integers in the range from 0 to
661   255) to Unicode strings, integers (which are then interpreted as Unicode
662   ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
663   as well as bytes mapped to None, 0xFFFE or '\ufffe' are treated as
664   "undefined mapping" and cause an error.
665
666   Encoding mappings must map Unicode ordinal integers to bytes objects,
667   integers in the range from 0 to 255 or None.  Unmapped character
668   ordinals (ones which cause a LookupError) as well as ordinals mapped to
669   None are treated as "undefined mapping" and cause an error.
670
671*/
672
/* Decode length bytes of the encoded buffer string using the decoding
   mapping; errors selects the error handling. */
673PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
674    const char *string,         /* Encoded string */
675    Py_ssize_t length,          /* size of string */
676    PyObject *mapping,          /* decoding mapping */
677    const char *errors          /* error handling */
678    );
679
/* Encode the Unicode object unicode using the encoding mapping. */
680PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
681    PyObject *unicode,          /* Unicode object */
682    PyObject *mapping           /* encoding mapping */
683    );
684
685/* --- MBCS codecs for Windows -------------------------------------------- */
686
/* The whole section is declared only when MS_WINDOWS is defined. */
687#ifdef MS_WINDOWS
/* Decode length bytes of the MBCS encoded buffer string; errors selects
   the error handling. */
688PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
689    const char *string,         /* MBCS encoded string */
690    Py_ssize_t length,          /* size of string */
691    const char *errors          /* error handling */
692    );
693
/* Variant of PyUnicode_DecodeMBCS() with an extra consumed argument that
   receives the number of bytes consumed. */
694PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
695    const char *string,         /* MBCS encoded string */
696    Py_ssize_t length,          /* size of string */
697    const char *errors,         /* error handling */
698    Py_ssize_t *consumed        /* bytes consumed */
699    );
700
701#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Like PyUnicode_DecodeMBCSStateful(), but the code page is given
   explicitly by number. */
702PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
703    int code_page,              /* code page number */
704    const char *string,         /* encoded string */
705    Py_ssize_t length,          /* size of string */
706    const char *errors,         /* error handling */
707    Py_ssize_t *consumed        /* bytes consumed */
708    );
709#endif
710
/* Encode the Unicode object unicode with the MBCS codec (judging by the
   name) and return the result as a new object. */
711PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
712    PyObject *unicode           /* Unicode object */
713    );
714
715#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Encode the Unicode object unicode using the code page with the given
   number; errors selects the error handling. */
716PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
717    int code_page,              /* code page number */
718    PyObject *unicode,          /* Unicode object */
719    const char *errors          /* error handling */
720    );
721#endif
722
723#endif /* MS_WINDOWS */
724
725/* --- Locale encoding --------------------------------------------------- */
726
727#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
728/* Decode a string from the current locale encoding. The decoder is strict if
729   *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
730   error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
731   be decoded as a surrogate character and *surrogateescape* is not equal to
732   zero, the byte sequence is escaped using the 'surrogateescape' error handler
733   instead of being decoded. *str* must end with a null character but cannot
734   contain embedded null characters.

   NOTE(review): this text speaks of a *surrogateescape* flag, but the
   declaration below takes the error handler as the string *errors* ;
   presumably "strict" and "surrogateescape" select the two modes described
   above -- confirm against the implementation. */
735
736PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
737    const char *str,
738    Py_ssize_t len,
739    const char *errors);
740
741/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
742   length using strlen(). */
743
744PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
745    const char *str,
746    const char *errors);
747
748/* Encode a Unicode object to the current locale encoding. The encoder is
749   strict if *surrogateescape* is equal to zero, otherwise the
750   "surrogateescape" error handler is used. Return a bytes object. The string
751   cannot contain embedded null characters.

   NOTE(review): as above, the declaration takes the error handler as the
   string *errors* rather than a *surrogateescape* flag -- confirm the
   accepted values against the implementation. */
752
753PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
754    PyObject *unicode,
755    const char *errors
756    );
757#endif
758
759/* --- File system encoding ---------------------------------------------- */
760
761/* ParseTuple converter: encode str objects to bytes using
762   PyUnicode_EncodeFSDefault(); bytes objects are output as-is.
   NOTE(review): the second argument is an untyped pointer; presumably it
   is the address of the PyObject* that receives the result -- confirm. */
763
764PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
765
766/* ParseTuple converter: decode bytes objects to unicode using
767   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is.
   NOTE(review): the second argument is an untyped pointer; presumably it
   is the address of the PyObject* that receives the result -- confirm. */
768
769PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
770
771/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
772   and the "surrogateescape" error handler.
773
774   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
775   encoding.
776
777   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
778*/
779
780PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
781    const char *s               /* encoded string */
782    );
783
784/* Decode a string of the given size using Py_FileSystemDefaultEncoding
785   and the "surrogateescape" error handler.
786
787   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
788   encoding.
789*/
790
791PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
792    const char *s,               /* encoded string */
793    Py_ssize_t size              /* size */
794    );
795
796/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
797   "surrogateescape" error handler, and return bytes.
798
799   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
800   encoding.
801*/
802
803PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
804    PyObject *unicode
805    );
806
807/* --- Methods & Slots ----------------------------------------------------
808
809   These are capable of handling Unicode objects and strings on input
810   (we refer to them as strings in the descriptions) and return
811   Unicode objects or integers as appropriate. */
812
813/* Concatenate two strings, giving a new Unicode string. */
814
815PyAPI_FUNC(PyObject*) PyUnicode_Concat(
816    PyObject *left,             /* Left string */
817    PyObject *right             /* Right string */
818    );
819
820/* Concatenate two strings and put the result in *pleft
821   (sets *pleft to NULL on error).  Nothing is returned, so callers have
   to check *pleft to detect failure. */
822
823PyAPI_FUNC(void) PyUnicode_Append(
824    PyObject **pleft,           /* Pointer to left string */
825    PyObject *right             /* Right string */
826    );
827
828/* Concatenate two strings, put the result in *pleft and drop the right
829   object (sets *pleft to NULL on error).
   NOTE(review): "drop" presumably means the reference to right is
   released -- confirm. */
830
831PyAPI_FUNC(void) PyUnicode_AppendAndDel(
832    PyObject **pleft,           /* Pointer to left string */
833    PyObject *right             /* Right string */
834    );
835
836/* Split a string giving a list of Unicode strings.
837
838   If sep is NULL, splitting will be done at all whitespace
839   substrings. Otherwise, splits occur at the given separator.
840
841   At most maxsplit splits will be done. If negative, no limit is set.
842
843   Separators are not included in the resulting list.
844
845*/
846
847PyAPI_FUNC(PyObject*) PyUnicode_Split(
848    PyObject *s,                /* String to split */
849    PyObject *sep,              /* String separator */
850    Py_ssize_t maxsplit         /* Maxsplit count */
851    );
852
853/* Ditto, but split at line breaks.
854
855   CRLF is considered to be one line break. Line breaks are not
856   included in the resulting list unless keepends is true. */
857
858PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
859    PyObject *s,                /* String to split */
860    int keepends                /* If true, line end markers are included */
861    );
862
863/* Partition the string s using the given separator sep. */
864
865PyAPI_FUNC(PyObject*) PyUnicode_Partition(
866    PyObject *s,                /* String to partition */
867    PyObject *sep               /* String separator */
868    );
869
870/* Partition the string s using the given separator sep, searching from
871   the end of the string. */
872
873PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
874    PyObject *s,                /* String to partition */
875    PyObject *sep               /* String separator */
876    );
877
878/* Split a string, giving a list of Unicode strings.
879
880   If sep is NULL, splitting is done at all whitespace substrings.
881   Otherwise, splits occur at the given separator.
882
883   At most maxsplit splits are done; if maxsplit is negative, no limit
884   is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
885   of the string.
886
887   Separators are not included in the resulting list.
888
889*/
890
891PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
892    PyObject *s,                /* String to split */
893    PyObject *sep,              /* String separator, or NULL for whitespace */
894    Py_ssize_t maxsplit         /* Maxsplit count; negative means no limit */
895    );
896
897/* Translate a string by applying a character mapping table to it and
898   return the resulting Unicode object.
899
900   The mapping table must map Unicode ordinal integers to Unicode strings,
901   Unicode ordinal integers or None (causing deletion of the character).
902
903   Mapping tables may be dictionaries or sequences. Unmapped character
904   ordinals (ones which cause a LookupError) are left untouched and
905   are copied as-is.
906
907*/
908
909PyAPI_FUNC(PyObject *) PyUnicode_Translate(
910    PyObject *str,              /* String to translate */
911    PyObject *table,            /* Translate table (dictionary or sequence) */
912    const char *errors          /* Error handling */
913    );
914
915/* Join a sequence of strings using the given separator and return
916   the resulting Unicode string. */
917
918PyAPI_FUNC(PyObject*) PyUnicode_Join(
919    PyObject *separator,        /* Separator string */
920    PyObject *seq               /* Sequence of strings to join */
921    );
922
923/* Return 1 if substr matches str[start:end] at the tail end selected by
924   direction (-1: prefix, +1: suffix), 0 otherwise. */
925
926PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
927    PyObject *str,              /* String */
928    PyObject *substr,           /* Prefix or Suffix string */
929    Py_ssize_t start,           /* Start index */
930    Py_ssize_t end,             /* Stop index */
931    int direction               /* Tail end: -1 prefix, +1 suffix */
932    );
933
934/* Return the first position of substr in str[start:end] using the
935   given search direction, or -1 if not found. -2 is returned if an
936   error occurred and an exception is set. */
937
938PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
939    PyObject *str,              /* String */
940    PyObject *substr,           /* Substring to find */
941    Py_ssize_t start,           /* Start index */
942    Py_ssize_t end,             /* Stop index */
943    int direction               /* Find direction: +1 forward, -1 backward */
944    );
945
946#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
947/* Like PyUnicode_Find, but search for a single character only. */
948PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
949    PyObject *str,              /* String */
950    Py_UCS4 ch,                 /* Character to find */
951    Py_ssize_t start,           /* Start index */
952    Py_ssize_t end,             /* Stop index */
953    int direction               /* Find direction, as for PyUnicode_Find */
954    );
955#endif /* !Py_LIMITED_API || Py_LIMITED_API+0 >= 0x03030000 */
956
957/* Count the number of occurrences of substr in str[start:end]. */
958
959PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
960    PyObject *str,              /* String to search in */
961    PyObject *substr,           /* Substring to count */
962    Py_ssize_t start,           /* Start index */
963    Py_ssize_t end              /* Stop index */
964    );
965
966/* Replace at most maxcount occurrences of substr in str with replstr
967   and return the resulting Unicode object. */
968
969PyAPI_FUNC(PyObject *) PyUnicode_Replace(
970    PyObject *str,              /* String */
971    PyObject *substr,           /* Substring to find */
972    PyObject *replstr,          /* Replacement string */
973    Py_ssize_t maxcount         /* Max. number of replacements to apply;
974                                   -1 = all */
975    );
976
977/* Compare two strings and return -1, 0, 1 for less than, equal, and
978   greater than, respectively.
979   Raise an exception and return -1 on error. */
980
981PyAPI_FUNC(int) PyUnicode_Compare(
982    PyObject *left,             /* Left string */
983    PyObject *right             /* Right string */
984    );
985
986/* Compare a Unicode object with a C string and return -1, 0, 1 for less
987   than, equal, and greater than, respectively.  It is best to pass only
988   ASCII-encoded strings, but the function interprets the input string as
989   ISO-8859-1 if it contains non-ASCII characters.
990   This function does not raise exceptions. */
991
992PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
993    PyObject *left,             /* Unicode object */
994    const char *right           /* ASCII-encoded string */
995    );
996
997/* Rich compare two strings and return one of the following:
998
999   - NULL if an exception was raised
1000   - Py_True or Py_False for successful comparisons
1001   - Py_NotImplemented if the type combination is unknown
1002
1003   Possible values for op:
1004
1005     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1006
1007*/
1008
1009PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1010    PyObject *left,             /* Left string */
1011    PyObject *right,            /* Right string */
1012    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1013    );
1014
1015/* Apply an argument tuple or dictionary to a format string and return
1016   the resulting Unicode string. */
1017
1018PyAPI_FUNC(PyObject *) PyUnicode_Format(
1019    PyObject *format,           /* Format string */
1020    PyObject *args              /* Argument tuple or dictionary */
1021    );
1022
1023/* Check whether element is contained in container and return 1/0
1024   accordingly.
1025
1026   element has to coerce to a one element Unicode string (NOTE(review):
1027   confirm this restriction is still current). -1 is returned on error. */
1028
1029PyAPI_FUNC(int) PyUnicode_Contains(
1030    PyObject *container,        /* Container string */
1031    PyObject *element           /* Element string */
1032    );
1033
1034/* Check whether the argument s is a valid identifier. */
1035
1036PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1037
1038/* === Characters Type APIs =============================================== */
1039
1040#ifndef Py_LIMITED_API
1041#  define Py_CPYTHON_UNICODEOBJECT_H
1042#  include "cpython/unicodeobject.h"
1043#  undef Py_CPYTHON_UNICODEOBJECT_H
1044#endif
1045
1046#ifdef __cplusplus
1047}
1048#endif
1049#endif /* !Py_UNICODEOBJECT_H */
1050