xref: /third_party/python/Python/codecs.c (revision 7db96d56)
1/* ------------------------------------------------------------------------
2
3   Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include "pycore_call.h"          // _PyObject_CallNoArgs()
13#include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14#include "pycore_pystate.h"       // _PyInterpreterState_GET()
15#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
16#include <ctype.h>
17
/* Lowercase hexadecimal digits; indexed by a 4-bit value by the error
   handlers in this file when they format \x, \u and \U escapes. */
const char *Py_hexdigits = "0123456789abcdef";
19
20/* --- Codec Registry ----------------------------------------------------- */
21
22/* Import the standard encodings package which will register the first
23   codec search function.
24
25   This is done in a lazy way so that the Unicode implementation does
26   not downgrade startup time of scripts not needing it.
27
28   ImportErrors are silently ignored by this function. Only one try is
29   made.
30
31*/
32
/* Forward declaration.  Callers in this file invoke it when
   interp->codec_search_path is still NULL and treat a nonzero return
   value as failure. */
static int _PyCodecRegistry_Init(void); /* Forward */
34
35int PyCodec_Register(PyObject *search_function)
36{
37    PyInterpreterState *interp = _PyInterpreterState_GET();
38    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
39        goto onError;
40    if (search_function == NULL) {
41        PyErr_BadArgument();
42        goto onError;
43    }
44    if (!PyCallable_Check(search_function)) {
45        PyErr_SetString(PyExc_TypeError, "argument must be callable");
46        goto onError;
47    }
48    return PyList_Append(interp->codec_search_path, search_function);
49
50 onError:
51    return -1;
52}
53
54int
55PyCodec_Unregister(PyObject *search_function)
56{
57    PyInterpreterState *interp = PyInterpreterState_Get();
58    PyObject *codec_search_path = interp->codec_search_path;
59    /* Do nothing if codec_search_path is not created yet or was cleared. */
60    if (codec_search_path == NULL) {
61        return 0;
62    }
63
64    assert(PyList_CheckExact(codec_search_path));
65    Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
66    for (Py_ssize_t i = 0; i < n; i++) {
67        PyObject *item = PyList_GET_ITEM(codec_search_path, i);
68        if (item == search_function) {
69            if (interp->codec_search_cache != NULL) {
70                assert(PyDict_CheckExact(interp->codec_search_cache));
71                PyDict_Clear(interp->codec_search_cache);
72            }
73            return PyList_SetSlice(codec_search_path, i, i+1, NULL);
74        }
75    }
76    return 0;
77}
78
/* Defined outside this file.  normalizestring() below passes it the input
   string, an output buffer and the buffer size, and treats a zero return
   value as failure. */
extern int _Py_normalize_encoding(const char *, char *, size_t);
80
/* Convert a string to a normalized Python string (decoded from UTF-8):
   all characters are converted to lower case, spaces and hyphens are
   replaced with underscores. */
83
84static
85PyObject *normalizestring(const char *string)
86{
87    size_t len = strlen(string);
88    char *encoding;
89    PyObject *v;
90
91    if (len > PY_SSIZE_T_MAX) {
92        PyErr_SetString(PyExc_OverflowError, "string is too large");
93        return NULL;
94    }
95
96    encoding = PyMem_Malloc(len + 1);
97    if (encoding == NULL)
98        return PyErr_NoMemory();
99
100    if (!_Py_normalize_encoding(string, encoding, len + 1))
101    {
102        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
103        PyMem_Free(encoding);
104        return NULL;
105    }
106
107    v = PyUnicode_FromString(encoding);
108    PyMem_Free(encoding);
109    return v;
110}
111
112/* Lookup the given encoding and return a tuple providing the codec
113   facilities.
114
115   The encoding string is looked up converted to all lower-case
116   characters. This makes encodings looked up through this mechanism
117   effectively case-insensitive.
118
119   If no codec is found, a LookupError is set and NULL returned.
120
121   As side effect, this tries to load the encodings package, if not
122   yet done. This is part of the lazy load strategy for the encodings
123   package.
124
125*/
126
127PyObject *_PyCodec_Lookup(const char *encoding)
128{
129    if (encoding == NULL) {
130        PyErr_BadArgument();
131        return NULL;
132    }
133
134    PyInterpreterState *interp = _PyInterpreterState_GET();
135    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
136        return NULL;
137    }
138
139    /* Convert the encoding to a normalized Python string: all
140       characters are converted to lower case, spaces and hyphens are
141       replaced with underscores. */
142    PyObject *v = normalizestring(encoding);
143    if (v == NULL) {
144        return NULL;
145    }
146    PyUnicode_InternInPlace(&v);
147
148    /* First, try to lookup the name in the registry dictionary */
149    PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
150    if (result != NULL) {
151        Py_INCREF(result);
152        Py_DECREF(v);
153        return result;
154    }
155    else if (PyErr_Occurred()) {
156        goto onError;
157    }
158
159    /* Next, scan the search functions in order of registration */
160    const Py_ssize_t len = PyList_Size(interp->codec_search_path);
161    if (len < 0)
162        goto onError;
163    if (len == 0) {
164        PyErr_SetString(PyExc_LookupError,
165                        "no codec search functions registered: "
166                        "can't find encoding");
167        goto onError;
168    }
169
170    Py_ssize_t i;
171    for (i = 0; i < len; i++) {
172        PyObject *func;
173
174        func = PyList_GetItem(interp->codec_search_path, i);
175        if (func == NULL)
176            goto onError;
177        result = PyObject_CallOneArg(func, v);
178        if (result == NULL)
179            goto onError;
180        if (result == Py_None) {
181            Py_DECREF(result);
182            continue;
183        }
184        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
185            PyErr_SetString(PyExc_TypeError,
186                            "codec search functions must return 4-tuples");
187            Py_DECREF(result);
188            goto onError;
189        }
190        break;
191    }
192    if (i == len) {
193        /* XXX Perhaps we should cache misses too ? */
194        PyErr_Format(PyExc_LookupError,
195                     "unknown encoding: %s", encoding);
196        goto onError;
197    }
198
199    /* Cache and return the result */
200    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
201        Py_DECREF(result);
202        goto onError;
203    }
204    Py_DECREF(v);
205    return result;
206
207 onError:
208    Py_DECREF(v);
209    return NULL;
210}
211
212/* Codec registry encoding check API. */
213
214int PyCodec_KnownEncoding(const char *encoding)
215{
216    PyObject *codecs;
217
218    codecs = _PyCodec_Lookup(encoding);
219    if (!codecs) {
220        PyErr_Clear();
221        return 0;
222    }
223    else {
224        Py_DECREF(codecs);
225        return 1;
226    }
227}
228
229static
230PyObject *args_tuple(PyObject *object,
231                     const char *errors)
232{
233    PyObject *args;
234
235    args = PyTuple_New(1 + (errors != NULL));
236    if (args == NULL)
237        return NULL;
238    Py_INCREF(object);
239    PyTuple_SET_ITEM(args,0,object);
240    if (errors) {
241        PyObject *v;
242
243        v = PyUnicode_FromString(errors);
244        if (v == NULL) {
245            Py_DECREF(args);
246            return NULL;
247        }
248        PyTuple_SET_ITEM(args, 1, v);
249    }
250    return args;
251}
252
253/* Helper function to get a codec item */
254
255static
256PyObject *codec_getitem(const char *encoding, int index)
257{
258    PyObject *codecs;
259    PyObject *v;
260
261    codecs = _PyCodec_Lookup(encoding);
262    if (codecs == NULL)
263        return NULL;
264    v = PyTuple_GET_ITEM(codecs, index);
265    Py_DECREF(codecs);
266    Py_INCREF(v);
267    return v;
268}
269
270/* Helper functions to create an incremental codec. */
271static
272PyObject *codec_makeincrementalcodec(PyObject *codec_info,
273                                     const char *errors,
274                                     const char *attrname)
275{
276    PyObject *ret, *inccodec;
277
278    inccodec = PyObject_GetAttrString(codec_info, attrname);
279    if (inccodec == NULL)
280        return NULL;
281    if (errors)
282        ret = PyObject_CallFunction(inccodec, "s", errors);
283    else
284        ret = _PyObject_CallNoArgs(inccodec);
285    Py_DECREF(inccodec);
286    return ret;
287}
288
289static
290PyObject *codec_getincrementalcodec(const char *encoding,
291                                    const char *errors,
292                                    const char *attrname)
293{
294    PyObject *codec_info, *ret;
295
296    codec_info = _PyCodec_Lookup(encoding);
297    if (codec_info == NULL)
298        return NULL;
299    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
300    Py_DECREF(codec_info);
301    return ret;
302}
303
304/* Helper function to create a stream codec. */
305
306static
307PyObject *codec_getstreamcodec(const char *encoding,
308                               PyObject *stream,
309                               const char *errors,
310                               const int index)
311{
312    PyObject *codecs, *streamcodec, *codeccls;
313
314    codecs = _PyCodec_Lookup(encoding);
315    if (codecs == NULL)
316        return NULL;
317
318    codeccls = PyTuple_GET_ITEM(codecs, index);
319    if (errors != NULL)
320        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
321    else
322        streamcodec = PyObject_CallOneArg(codeccls, stream);
323    Py_DECREF(codecs);
324    return streamcodec;
325}
326
327/* Helpers to work with the result of _PyCodec_Lookup
328
329 */
330PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
331                                             const char *errors)
332{
333    return codec_makeincrementalcodec(codec_info, errors,
334                                      "incrementaldecoder");
335}
336
337PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
338                                             const char *errors)
339{
340    return codec_makeincrementalcodec(codec_info, errors,
341                                      "incrementalencoder");
342}
343
344
345/* Convenience APIs to query the Codec registry.
346
347   All APIs return a codec object with incremented refcount.
348
349 */
350
351PyObject *PyCodec_Encoder(const char *encoding)
352{
353    return codec_getitem(encoding, 0);
354}
355
356PyObject *PyCodec_Decoder(const char *encoding)
357{
358    return codec_getitem(encoding, 1);
359}
360
361PyObject *PyCodec_IncrementalEncoder(const char *encoding,
362                                     const char *errors)
363{
364    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
365}
366
367PyObject *PyCodec_IncrementalDecoder(const char *encoding,
368                                     const char *errors)
369{
370    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
371}
372
373PyObject *PyCodec_StreamReader(const char *encoding,
374                               PyObject *stream,
375                               const char *errors)
376{
377    return codec_getstreamcodec(encoding, stream, errors, 2);
378}
379
380PyObject *PyCodec_StreamWriter(const char *encoding,
381                               PyObject *stream,
382                               const char *errors)
383{
384    return codec_getstreamcodec(encoding, stream, errors, 3);
385}
386
387/* Helper that tries to ensure the reported exception chain indicates the
388 * codec that was invoked to trigger the failure without changing the type
389 * of the exception raised.
390 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* Delegates entirely to _PyErr_TrySetFromCause(), passing a message
     * that names the operation and the codec involved.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
402
403/* Encode an object (e.g. a Unicode object) using the given encoding
404   and return the resulting encoded object (usually a Python string).
405
406   errors is passed to the encoder factory as argument if non-NULL. */
407
408static PyObject *
409_PyCodec_EncodeInternal(PyObject *object,
410                        PyObject *encoder,
411                        const char *encoding,
412                        const char *errors)
413{
414    PyObject *args = NULL, *result = NULL;
415    PyObject *v = NULL;
416
417    args = args_tuple(object, errors);
418    if (args == NULL)
419        goto onError;
420
421    result = PyObject_Call(encoder, args, NULL);
422    if (result == NULL) {
423        wrap_codec_error("encoding", encoding);
424        goto onError;
425    }
426
427    if (!PyTuple_Check(result) ||
428        PyTuple_GET_SIZE(result) != 2) {
429        PyErr_SetString(PyExc_TypeError,
430                        "encoder must return a tuple (object, integer)");
431        goto onError;
432    }
433    v = PyTuple_GET_ITEM(result,0);
434    Py_INCREF(v);
435    /* We don't check or use the second (integer) entry. */
436
437    Py_DECREF(args);
438    Py_DECREF(encoder);
439    Py_DECREF(result);
440    return v;
441
442 onError:
443    Py_XDECREF(result);
444    Py_XDECREF(args);
445    Py_XDECREF(encoder);
446    return NULL;
447}
448
449/* Decode an object (usually a Python string) using the given encoding
450   and return an equivalent object (e.g. a Unicode object).
451
452   errors is passed to the decoder factory as argument if non-NULL. */
453
454static PyObject *
455_PyCodec_DecodeInternal(PyObject *object,
456                        PyObject *decoder,
457                        const char *encoding,
458                        const char *errors)
459{
460    PyObject *args = NULL, *result = NULL;
461    PyObject *v;
462
463    args = args_tuple(object, errors);
464    if (args == NULL)
465        goto onError;
466
467    result = PyObject_Call(decoder, args, NULL);
468    if (result == NULL) {
469        wrap_codec_error("decoding", encoding);
470        goto onError;
471    }
472    if (!PyTuple_Check(result) ||
473        PyTuple_GET_SIZE(result) != 2) {
474        PyErr_SetString(PyExc_TypeError,
475                        "decoder must return a tuple (object,integer)");
476        goto onError;
477    }
478    v = PyTuple_GET_ITEM(result,0);
479    Py_INCREF(v);
480    /* We don't check or use the second (integer) entry. */
481
482    Py_DECREF(args);
483    Py_DECREF(decoder);
484    Py_DECREF(result);
485    return v;
486
487 onError:
488    Py_XDECREF(args);
489    Py_XDECREF(decoder);
490    Py_XDECREF(result);
491    return NULL;
492}
493
494/* Generic encoding/decoding API */
495PyObject *PyCodec_Encode(PyObject *object,
496                         const char *encoding,
497                         const char *errors)
498{
499    PyObject *encoder;
500
501    encoder = PyCodec_Encoder(encoding);
502    if (encoder == NULL)
503        return NULL;
504
505    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
506}
507
508PyObject *PyCodec_Decode(PyObject *object,
509                         const char *encoding,
510                         const char *errors)
511{
512    PyObject *decoder;
513
514    decoder = PyCodec_Decoder(encoding);
515    if (decoder == NULL)
516        return NULL;
517
518    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
519}
520
521/* Text encoding/decoding API */
522PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
523                                       const char *alternate_command)
524{
525    PyObject *codec;
526    PyObject *attr;
527    int is_text_codec;
528
529    codec = _PyCodec_Lookup(encoding);
530    if (codec == NULL)
531        return NULL;
532
533    /* Backwards compatibility: assume any raw tuple describes a text
534     * encoding, and the same for anything lacking the private
535     * attribute.
536     */
537    if (!PyTuple_CheckExact(codec)) {
538        if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
539            Py_DECREF(codec);
540            return NULL;
541        }
542        if (attr != NULL) {
543            is_text_codec = PyObject_IsTrue(attr);
544            Py_DECREF(attr);
545            if (is_text_codec <= 0) {
546                Py_DECREF(codec);
547                if (!is_text_codec)
548                    PyErr_Format(PyExc_LookupError,
549                                 "'%.400s' is not a text encoding; "
550                                 "use %s to handle arbitrary codecs",
551                                 encoding, alternate_command);
552                return NULL;
553            }
554        }
555    }
556
557    /* This appears to be a valid text encoding */
558    return codec;
559}
560
561
562static
563PyObject *codec_getitem_checked(const char *encoding,
564                                const char *alternate_command,
565                                int index)
566{
567    PyObject *codec;
568    PyObject *v;
569
570    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571    if (codec == NULL)
572        return NULL;
573
574    v = PyTuple_GET_ITEM(codec, index);
575    Py_INCREF(v);
576    Py_DECREF(codec);
577    return v;
578}
579
580static PyObject * _PyCodec_TextEncoder(const char *encoding)
581{
582    return codec_getitem_checked(encoding, "codecs.encode()", 0);
583}
584
585static PyObject * _PyCodec_TextDecoder(const char *encoding)
586{
587    return codec_getitem_checked(encoding, "codecs.decode()", 1);
588}
589
590PyObject *_PyCodec_EncodeText(PyObject *object,
591                              const char *encoding,
592                              const char *errors)
593{
594    PyObject *encoder;
595
596    encoder = _PyCodec_TextEncoder(encoding);
597    if (encoder == NULL)
598        return NULL;
599
600    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
601}
602
603PyObject *_PyCodec_DecodeText(PyObject *object,
604                              const char *encoding,
605                              const char *errors)
606{
607    PyObject *decoder;
608
609    decoder = _PyCodec_TextDecoder(encoding);
610    if (decoder == NULL)
611        return NULL;
612
613    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
614}
615
616/* Register the error handling callback function error under the name
617   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
619   callback name, when name is specified as the error parameter
620   in the call to the encode/decode function.
621   Return 0 on success, -1 on error */
622int PyCodec_RegisterError(const char *name, PyObject *error)
623{
624    PyInterpreterState *interp = _PyInterpreterState_GET();
625    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
626        return -1;
627    if (!PyCallable_Check(error)) {
628        PyErr_SetString(PyExc_TypeError, "handler must be callable");
629        return -1;
630    }
631    return PyDict_SetItemString(interp->codec_error_registry,
632                                name, error);
633}
634
635/* Lookup the error handling callback function registered under the
636   name error. As a special case NULL can be passed, in which case
637   the error handling callback for strict encoding will be returned. */
638PyObject *PyCodec_LookupError(const char *name)
639{
640    PyObject *handler = NULL;
641
642    PyInterpreterState *interp = _PyInterpreterState_GET();
643    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
644        return NULL;
645
646    if (name==NULL)
647        name = "strict";
648    handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649    if (handler) {
650        Py_INCREF(handler);
651    }
652    else if (!PyErr_Occurred()) {
653        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654    }
655    return handler;
656}
657
658static void wrong_exception_type(PyObject *exc)
659{
660    PyErr_Format(PyExc_TypeError,
661                 "don't know how to handle %.200s in error callback",
662                 Py_TYPE(exc)->tp_name);
663}
664
665PyObject *PyCodec_StrictErrors(PyObject *exc)
666{
667    if (PyExceptionInstance_Check(exc))
668        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
669    else
670        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
671    return NULL;
672}
673
674
675PyObject *PyCodec_IgnoreErrors(PyObject *exc)
676{
677    Py_ssize_t end;
678
679    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
680        if (PyUnicodeEncodeError_GetEnd(exc, &end))
681            return NULL;
682    }
683    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
684        if (PyUnicodeDecodeError_GetEnd(exc, &end))
685            return NULL;
686    }
687    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
688        if (PyUnicodeTranslateError_GetEnd(exc, &end))
689            return NULL;
690    }
691    else {
692        wrong_exception_type(exc);
693        return NULL;
694    }
695    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
696}
697
698
699PyObject *PyCodec_ReplaceErrors(PyObject *exc)
700{
701    Py_ssize_t start, end, i, len;
702
703    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
704        PyObject *res;
705        Py_UCS1 *outp;
706        if (PyUnicodeEncodeError_GetStart(exc, &start))
707            return NULL;
708        if (PyUnicodeEncodeError_GetEnd(exc, &end))
709            return NULL;
710        len = end - start;
711        res = PyUnicode_New(len, '?');
712        if (res == NULL)
713            return NULL;
714        assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
715        outp = PyUnicode_1BYTE_DATA(res);
716        for (i = 0; i < len; ++i)
717            outp[i] = '?';
718        assert(_PyUnicode_CheckConsistency(res, 1));
719        return Py_BuildValue("(Nn)", res, end);
720    }
721    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
722        if (PyUnicodeDecodeError_GetEnd(exc, &end))
723            return NULL;
724        return Py_BuildValue("(Cn)",
725                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
726                             end);
727    }
728    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
729        PyObject *res;
730        Py_UCS2 *outp;
731        if (PyUnicodeTranslateError_GetStart(exc, &start))
732            return NULL;
733        if (PyUnicodeTranslateError_GetEnd(exc, &end))
734            return NULL;
735        len = end - start;
736        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
737        if (res == NULL)
738            return NULL;
739        assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
740        outp = PyUnicode_2BYTE_DATA(res);
741        for (i = 0; i < len; i++)
742            outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
743        assert(_PyUnicode_CheckConsistency(res, 1));
744        return Py_BuildValue("(Nn)", res, end);
745    }
746    else {
747        wrong_exception_type(exc);
748        return NULL;
749    }
750}
751
752PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753{
754    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
755        PyObject *restuple;
756        PyObject *object;
757        Py_ssize_t i;
758        Py_ssize_t start;
759        Py_ssize_t end;
760        PyObject *res;
761        Py_UCS1 *outp;
762        Py_ssize_t ressize;
763        Py_UCS4 ch;
764        if (PyUnicodeEncodeError_GetStart(exc, &start))
765            return NULL;
766        if (PyUnicodeEncodeError_GetEnd(exc, &end))
767            return NULL;
768        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
769            return NULL;
770        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
771            end = start + PY_SSIZE_T_MAX / (2+7+1);
772        for (i = start, ressize = 0; i < end; ++i) {
773            /* object is guaranteed to be "ready" */
774            ch = PyUnicode_READ_CHAR(object, i);
775            if (ch<10)
776                ressize += 2+1+1;
777            else if (ch<100)
778                ressize += 2+2+1;
779            else if (ch<1000)
780                ressize += 2+3+1;
781            else if (ch<10000)
782                ressize += 2+4+1;
783            else if (ch<100000)
784                ressize += 2+5+1;
785            else if (ch<1000000)
786                ressize += 2+6+1;
787            else
788                ressize += 2+7+1;
789        }
790        /* allocate replacement */
791        res = PyUnicode_New(ressize, 127);
792        if (res == NULL) {
793            Py_DECREF(object);
794            return NULL;
795        }
796        outp = PyUnicode_1BYTE_DATA(res);
797        /* generate replacement */
798        for (i = start; i < end; ++i) {
799            int digits;
800            int base;
801            ch = PyUnicode_READ_CHAR(object, i);
802            *outp++ = '&';
803            *outp++ = '#';
804            if (ch<10) {
805                digits = 1;
806                base = 1;
807            }
808            else if (ch<100) {
809                digits = 2;
810                base = 10;
811            }
812            else if (ch<1000) {
813                digits = 3;
814                base = 100;
815            }
816            else if (ch<10000) {
817                digits = 4;
818                base = 1000;
819            }
820            else if (ch<100000) {
821                digits = 5;
822                base = 10000;
823            }
824            else if (ch<1000000) {
825                digits = 6;
826                base = 100000;
827            }
828            else {
829                digits = 7;
830                base = 1000000;
831            }
832            while (digits-->0) {
833                *outp++ = '0' + ch/base;
834                ch %= base;
835                base /= 10;
836            }
837            *outp++ = ';';
838        }
839        assert(_PyUnicode_CheckConsistency(res, 1));
840        restuple = Py_BuildValue("(Nn)", res, end);
841        Py_DECREF(object);
842        return restuple;
843    }
844    else {
845        wrong_exception_type(exc);
846        return NULL;
847    }
848}
849
850PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
851{
852    PyObject *object;
853    Py_ssize_t i;
854    Py_ssize_t start;
855    Py_ssize_t end;
856    PyObject *res;
857    Py_UCS1 *outp;
858    int ressize;
859    Py_UCS4 c;
860
861    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
862        const unsigned char *p;
863        if (PyUnicodeDecodeError_GetStart(exc, &start))
864            return NULL;
865        if (PyUnicodeDecodeError_GetEnd(exc, &end))
866            return NULL;
867        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
868            return NULL;
869        p = (const unsigned char*)PyBytes_AS_STRING(object);
870        res = PyUnicode_New(4 * (end - start), 127);
871        if (res == NULL) {
872            Py_DECREF(object);
873            return NULL;
874        }
875        outp = PyUnicode_1BYTE_DATA(res);
876        for (i = start; i < end; i++, outp += 4) {
877            unsigned char c = p[i];
878            outp[0] = '\\';
879            outp[1] = 'x';
880            outp[2] = Py_hexdigits[(c>>4)&0xf];
881            outp[3] = Py_hexdigits[c&0xf];
882        }
883
884        assert(_PyUnicode_CheckConsistency(res, 1));
885        Py_DECREF(object);
886        return Py_BuildValue("(Nn)", res, end);
887    }
888    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
889        if (PyUnicodeEncodeError_GetStart(exc, &start))
890            return NULL;
891        if (PyUnicodeEncodeError_GetEnd(exc, &end))
892            return NULL;
893        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
894            return NULL;
895    }
896    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
897        if (PyUnicodeTranslateError_GetStart(exc, &start))
898            return NULL;
899        if (PyUnicodeTranslateError_GetEnd(exc, &end))
900            return NULL;
901        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
902            return NULL;
903    }
904    else {
905        wrong_exception_type(exc);
906        return NULL;
907    }
908
909    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
910        end = start + PY_SSIZE_T_MAX / (1+1+8);
911    for (i = start, ressize = 0; i < end; ++i) {
912        /* object is guaranteed to be "ready" */
913        c = PyUnicode_READ_CHAR(object, i);
914        if (c >= 0x10000) {
915            ressize += 1+1+8;
916        }
917        else if (c >= 0x100) {
918            ressize += 1+1+4;
919        }
920        else
921            ressize += 1+1+2;
922    }
923    res = PyUnicode_New(ressize, 127);
924    if (res == NULL) {
925        Py_DECREF(object);
926        return NULL;
927    }
928    outp = PyUnicode_1BYTE_DATA(res);
929    for (i = start; i < end; ++i) {
930        c = PyUnicode_READ_CHAR(object, i);
931        *outp++ = '\\';
932        if (c >= 0x00010000) {
933            *outp++ = 'U';
934            *outp++ = Py_hexdigits[(c>>28)&0xf];
935            *outp++ = Py_hexdigits[(c>>24)&0xf];
936            *outp++ = Py_hexdigits[(c>>20)&0xf];
937            *outp++ = Py_hexdigits[(c>>16)&0xf];
938            *outp++ = Py_hexdigits[(c>>12)&0xf];
939            *outp++ = Py_hexdigits[(c>>8)&0xf];
940        }
941        else if (c >= 0x100) {
942            *outp++ = 'u';
943            *outp++ = Py_hexdigits[(c>>12)&0xf];
944            *outp++ = Py_hexdigits[(c>>8)&0xf];
945        }
946        else
947            *outp++ = 'x';
948        *outp++ = Py_hexdigits[(c>>4)&0xf];
949        *outp++ = Py_hexdigits[c&0xf];
950    }
951
952    assert(_PyUnicode_CheckConsistency(res, 1));
953    Py_DECREF(object);
954    return Py_BuildValue("(Nn)", res, end);
955}
956
/* Cached pointer obtained from PyCapsule_Import(PyUnicodeData_CAPSULE_NAME);
   filled in on first use by PyCodec_NameReplaceErrors() and reused
   afterwards. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
958
959PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
960{
961    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
962        PyObject *restuple;
963        PyObject *object;
964        Py_ssize_t i;
965        Py_ssize_t start;
966        Py_ssize_t end;
967        PyObject *res;
968        Py_UCS1 *outp;
969        Py_ssize_t ressize;
970        int replsize;
971        Py_UCS4 c;
972        char buffer[256]; /* NAME_MAXLEN */
973        if (PyUnicodeEncodeError_GetStart(exc, &start))
974            return NULL;
975        if (PyUnicodeEncodeError_GetEnd(exc, &end))
976            return NULL;
977        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
978            return NULL;
979        if (!ucnhash_capi) {
980            /* load the unicode data module */
981            ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
982                                            PyUnicodeData_CAPSULE_NAME, 1);
983            if (!ucnhash_capi) {
984                return NULL;
985            }
986        }
987        for (i = start, ressize = 0; i < end; ++i) {
988            /* object is guaranteed to be "ready" */
989            c = PyUnicode_READ_CHAR(object, i);
990            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
991                replsize = 1+1+1+(int)strlen(buffer)+1;
992            }
993            else if (c >= 0x10000) {
994                replsize = 1+1+8;
995            }
996            else if (c >= 0x100) {
997                replsize = 1+1+4;
998            }
999            else
1000                replsize = 1+1+2;
1001            if (ressize > PY_SSIZE_T_MAX - replsize)
1002                break;
1003            ressize += replsize;
1004        }
1005        end = i;
1006        res = PyUnicode_New(ressize, 127);
1007        if (res==NULL)
1008            return NULL;
1009        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1010            i < end; ++i) {
1011            c = PyUnicode_READ_CHAR(object, i);
1012            *outp++ = '\\';
1013            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1014                *outp++ = 'N';
1015                *outp++ = '{';
1016                strcpy((char *)outp, buffer);
1017                outp += strlen(buffer);
1018                *outp++ = '}';
1019                continue;
1020            }
1021            if (c >= 0x00010000) {
1022                *outp++ = 'U';
1023                *outp++ = Py_hexdigits[(c>>28)&0xf];
1024                *outp++ = Py_hexdigits[(c>>24)&0xf];
1025                *outp++ = Py_hexdigits[(c>>20)&0xf];
1026                *outp++ = Py_hexdigits[(c>>16)&0xf];
1027                *outp++ = Py_hexdigits[(c>>12)&0xf];
1028                *outp++ = Py_hexdigits[(c>>8)&0xf];
1029            }
1030            else if (c >= 0x100) {
1031                *outp++ = 'u';
1032                *outp++ = Py_hexdigits[(c>>12)&0xf];
1033                *outp++ = Py_hexdigits[(c>>8)&0xf];
1034            }
1035            else
1036                *outp++ = 'x';
1037            *outp++ = Py_hexdigits[(c>>4)&0xf];
1038            *outp++ = Py_hexdigits[c&0xf];
1039        }
1040
1041        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1042        assert(_PyUnicode_CheckConsistency(res, 1));
1043        restuple = Py_BuildValue("(Nn)", res, end);
1044        Py_DECREF(object);
1045        return restuple;
1046    }
1047    else {
1048        wrong_exception_type(exc);
1049        return NULL;
1050    }
1051}
1052
/* Result codes of get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name as one of the ENC_* codes.

   Recognised spellings are "utf" (any letter case), an optional '-' or
   '_', then "8", "16" or "32"; the 16 and 32 forms may be followed by an
   optional '-' or '_' and "be" or "le" (any letter case).  Without such a
   suffix the byte order is chosen at compile time via WORDS_BIGENDIAN.
   The exact string "CP_UTF8" is accepted as UTF-8 as well.

   On a match *bytelength is set to 3 (UTF-8), 2 (UTF-16) or 4 (UTF-32).
   When ENC_UNKNOWN is returned *bytelength may or may not have been
   written, so callers must not rely on it. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Test encoding[0] first: for a name such as "utf-16-" the
               string ends here and encoding[1] would lie past the
               terminating NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: do not look past the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118
1119/* This handler is declared static until someone demonstrates
1120   a need to call it directly. */
1121static PyObject *
1122PyCodec_SurrogatePassErrors(PyObject *exc)
1123{
1124    PyObject *restuple;
1125    PyObject *object;
1126    PyObject *encode;
1127    const char *encoding;
1128    int code;
1129    int bytelength;
1130    Py_ssize_t i;
1131    Py_ssize_t start;
1132    Py_ssize_t end;
1133    PyObject *res;
1134
1135    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1136        unsigned char *outp;
1137        if (PyUnicodeEncodeError_GetStart(exc, &start))
1138            return NULL;
1139        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140            return NULL;
1141        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142            return NULL;
1143        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144            Py_DECREF(object);
1145            return NULL;
1146        }
1147        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148            Py_DECREF(object);
1149            Py_DECREF(encode);
1150            return NULL;
1151        }
1152        code = get_standard_encoding(encoding, &bytelength);
1153        Py_DECREF(encode);
1154        if (code == ENC_UNKNOWN) {
1155            /* Not supported, fail with original exception */
1156            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157            Py_DECREF(object);
1158            return NULL;
1159        }
1160
1161        if (end - start > PY_SSIZE_T_MAX / bytelength)
1162            end = start + PY_SSIZE_T_MAX / bytelength;
1163        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1164        if (!res) {
1165            Py_DECREF(object);
1166            return NULL;
1167        }
1168        outp = (unsigned char*)PyBytes_AsString(res);
1169        for (i = start; i < end; i++) {
1170            /* object is guaranteed to be "ready" */
1171            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1172            if (!Py_UNICODE_IS_SURROGATE(ch)) {
1173                /* Not a surrogate, fail with original exception */
1174                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175                Py_DECREF(res);
1176                Py_DECREF(object);
1177                return NULL;
1178            }
1179            switch (code) {
1180            case ENC_UTF8:
1181                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184                break;
1185            case ENC_UTF16LE:
1186                *outp++ = (unsigned char) ch;
1187                *outp++ = (unsigned char)(ch >> 8);
1188                break;
1189            case ENC_UTF16BE:
1190                *outp++ = (unsigned char)(ch >> 8);
1191                *outp++ = (unsigned char) ch;
1192                break;
1193            case ENC_UTF32LE:
1194                *outp++ = (unsigned char) ch;
1195                *outp++ = (unsigned char)(ch >> 8);
1196                *outp++ = (unsigned char)(ch >> 16);
1197                *outp++ = (unsigned char)(ch >> 24);
1198                break;
1199            case ENC_UTF32BE:
1200                *outp++ = (unsigned char)(ch >> 24);
1201                *outp++ = (unsigned char)(ch >> 16);
1202                *outp++ = (unsigned char)(ch >> 8);
1203                *outp++ = (unsigned char) ch;
1204                break;
1205            }
1206        }
1207        restuple = Py_BuildValue("(On)", res, end);
1208        Py_DECREF(res);
1209        Py_DECREF(object);
1210        return restuple;
1211    }
1212    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1213        const unsigned char *p;
1214        Py_UCS4 ch = 0;
1215        if (PyUnicodeDecodeError_GetStart(exc, &start))
1216            return NULL;
1217        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218            return NULL;
1219        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220            return NULL;
1221        p = (const unsigned char*)PyBytes_AS_STRING(object);
1222        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223            Py_DECREF(object);
1224            return NULL;
1225        }
1226        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227            Py_DECREF(object);
1228            Py_DECREF(encode);
1229            return NULL;
1230        }
1231        code = get_standard_encoding(encoding, &bytelength);
1232        Py_DECREF(encode);
1233        if (code == ENC_UNKNOWN) {
1234            /* Not supported, fail with original exception */
1235            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236            Py_DECREF(object);
1237            return NULL;
1238        }
1239
1240        /* Try decoding a single surrogate character. If
1241           there are more, let the codec call us again. */
1242        p += start;
1243        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244            switch (code) {
1245            case ENC_UTF8:
1246                if ((p[0] & 0xf0) == 0xe0 &&
1247                    (p[1] & 0xc0) == 0x80 &&
1248                    (p[2] & 0xc0) == 0x80) {
1249                    /* it's a three-byte code */
1250                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251                }
1252                break;
1253            case ENC_UTF16LE:
1254                ch = p[1] << 8 | p[0];
1255                break;
1256            case ENC_UTF16BE:
1257                ch = p[0] << 8 | p[1];
1258                break;
1259            case ENC_UTF32LE:
1260                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261                break;
1262            case ENC_UTF32BE:
1263                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264                break;
1265            }
1266        }
1267
1268        Py_DECREF(object);
1269        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270            /* it's not a surrogate - fail */
1271            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272            return NULL;
1273        }
1274        res = PyUnicode_FromOrdinal(ch);
1275        if (res == NULL)
1276            return NULL;
1277        return Py_BuildValue("(Nn)", res, start + bytelength);
1278    }
1279    else {
1280        wrong_exception_type(exc);
1281        return NULL;
1282    }
1283}
1284
1285static PyObject *
1286PyCodec_SurrogateEscapeErrors(PyObject *exc)
1287{
1288    PyObject *restuple;
1289    PyObject *object;
1290    Py_ssize_t i;
1291    Py_ssize_t start;
1292    Py_ssize_t end;
1293    PyObject *res;
1294
1295    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1296        char *outp;
1297        if (PyUnicodeEncodeError_GetStart(exc, &start))
1298            return NULL;
1299        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300            return NULL;
1301        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302            return NULL;
1303        res = PyBytes_FromStringAndSize(NULL, end-start);
1304        if (!res) {
1305            Py_DECREF(object);
1306            return NULL;
1307        }
1308        outp = PyBytes_AsString(res);
1309        for (i = start; i < end; i++) {
1310            /* object is guaranteed to be "ready" */
1311            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1312            if (ch < 0xdc80 || ch > 0xdcff) {
1313                /* Not a UTF-8b surrogate, fail with original exception */
1314                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315                Py_DECREF(res);
1316                Py_DECREF(object);
1317                return NULL;
1318            }
1319            *outp++ = ch - 0xdc00;
1320        }
1321        restuple = Py_BuildValue("(On)", res, end);
1322        Py_DECREF(res);
1323        Py_DECREF(object);
1324        return restuple;
1325    }
1326    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1327        PyObject *str;
1328        const unsigned char *p;
1329        Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1330        int consumed = 0;
1331        if (PyUnicodeDecodeError_GetStart(exc, &start))
1332            return NULL;
1333        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334            return NULL;
1335        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336            return NULL;
1337        p = (const unsigned char*)PyBytes_AS_STRING(object);
1338        while (consumed < 4 && consumed < end-start) {
1339            /* Refuse to escape ASCII bytes. */
1340            if (p[start+consumed] < 128)
1341                break;
1342            ch[consumed] = 0xdc00 + p[start+consumed];
1343            consumed++;
1344        }
1345        Py_DECREF(object);
1346        if (!consumed) {
1347            /* codec complained about ASCII byte. */
1348            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349            return NULL;
1350        }
1351        str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352        if (str == NULL)
1353            return NULL;
1354        return Py_BuildValue("(Nn)", str, start+consumed);
1355    }
1356    else {
1357        wrong_exception_type(exc);
1358        return NULL;
1359    }
1360}
1361
1362
1363static PyObject *strict_errors(PyObject *self, PyObject *exc)
1364{
1365    return PyCodec_StrictErrors(exc);
1366}
1367
1368
1369static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1370{
1371    return PyCodec_IgnoreErrors(exc);
1372}
1373
1374
1375static PyObject *replace_errors(PyObject *self, PyObject *exc)
1376{
1377    return PyCodec_ReplaceErrors(exc);
1378}
1379
1380
1381static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1382{
1383    return PyCodec_XMLCharRefReplaceErrors(exc);
1384}
1385
1386
1387static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1388{
1389    return PyCodec_BackslashReplaceErrors(exc);
1390}
1391
1392static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1393{
1394    return PyCodec_NameReplaceErrors(exc);
1395}
1396
1397static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
1398{
1399    return PyCodec_SurrogatePassErrors(exc);
1400}
1401
1402static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
1403{
1404    return PyCodec_SurrogateEscapeErrors(exc);
1405}
1406
1407static int _PyCodecRegistry_Init(void)
1408{
1409    static struct {
1410        const char *name;
1411        PyMethodDef def;
1412    } methods[] =
1413    {
1414        {
1415            "strict",
1416            {
1417                "strict_errors",
1418                strict_errors,
1419                METH_O,
1420                PyDoc_STR("Implements the 'strict' error handling, which "
1421                          "raises a UnicodeError on coding errors.")
1422            }
1423        },
1424        {
1425            "ignore",
1426            {
1427                "ignore_errors",
1428                ignore_errors,
1429                METH_O,
1430                PyDoc_STR("Implements the 'ignore' error handling, which "
1431                          "ignores malformed data and continues.")
1432            }
1433        },
1434        {
1435            "replace",
1436            {
1437                "replace_errors",
1438                replace_errors,
1439                METH_O,
1440                PyDoc_STR("Implements the 'replace' error handling, which "
1441                          "replaces malformed data with a replacement marker.")
1442            }
1443        },
1444        {
1445            "xmlcharrefreplace",
1446            {
1447                "xmlcharrefreplace_errors",
1448                xmlcharrefreplace_errors,
1449                METH_O,
1450                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451                          "which replaces an unencodable character with the "
1452                          "appropriate XML character reference.")
1453            }
1454        },
1455        {
1456            "backslashreplace",
1457            {
1458                "backslashreplace_errors",
1459                backslashreplace_errors,
1460                METH_O,
1461                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1462                          "which replaces malformed data with a backslashed "
1463                          "escape sequence.")
1464            }
1465        },
1466        {
1467            "namereplace",
1468            {
1469                "namereplace_errors",
1470                namereplace_errors,
1471                METH_O,
1472                PyDoc_STR("Implements the 'namereplace' error handling, "
1473                          "which replaces an unencodable character with a "
1474                          "\\N{...} escape sequence.")
1475            }
1476        },
1477        {
1478            "surrogatepass",
1479            {
1480                "surrogatepass",
1481                surrogatepass_errors,
1482                METH_O
1483            }
1484        },
1485        {
1486            "surrogateescape",
1487            {
1488                "surrogateescape",
1489                surrogateescape_errors,
1490                METH_O
1491            }
1492        }
1493    };
1494
1495    PyInterpreterState *interp = _PyInterpreterState_GET();
1496    PyObject *mod;
1497
1498    if (interp->codec_search_path != NULL)
1499        return 0;
1500
1501    interp->codec_search_path = PyList_New(0);
1502    if (interp->codec_search_path == NULL) {
1503        return -1;
1504    }
1505
1506    interp->codec_search_cache = PyDict_New();
1507    if (interp->codec_search_cache == NULL) {
1508        return -1;
1509    }
1510
1511    interp->codec_error_registry = PyDict_New();
1512    if (interp->codec_error_registry == NULL) {
1513        return -1;
1514    }
1515
1516    for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1517        PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1518        if (!func) {
1519            return -1;
1520        }
1521
1522        int res = PyCodec_RegisterError(methods[i].name, func);
1523        Py_DECREF(func);
1524        if (res) {
1525            return -1;
1526        }
1527    }
1528
1529    mod = PyImport_ImportModule("encodings");
1530    if (mod == NULL) {
1531        return -1;
1532    }
1533    Py_DECREF(mod);
1534    interp->codecs_initialized = 1;
1535    return 0;
1536}
1537