17db96d56Sopenharmony_ci/* stringlib: codec implementations */
27db96d56Sopenharmony_ci
37db96d56Sopenharmony_ci#if !STRINGLIB_IS_UNICODE
47db96d56Sopenharmony_ci# error "codecs.h is specific to Unicode"
57db96d56Sopenharmony_ci#endif
67db96d56Sopenharmony_ci
77db96d56Sopenharmony_ci#include "pycore_bitutils.h"      // _Py_bswap32()
87db96d56Sopenharmony_ci
97db96d56Sopenharmony_ci/* Mask to quickly check whether a C 'size_t' contains a
107db96d56Sopenharmony_ci   non-ASCII, UTF8-encoded char. */
117db96d56Sopenharmony_ci#if (SIZEOF_SIZE_T == 8)
127db96d56Sopenharmony_ci# define ASCII_CHAR_MASK 0x8080808080808080ULL
137db96d56Sopenharmony_ci#elif (SIZEOF_SIZE_T == 4)
147db96d56Sopenharmony_ci# define ASCII_CHAR_MASK 0x80808080U
157db96d56Sopenharmony_ci#else
167db96d56Sopenharmony_ci# error C 'size_t' size should be either 4 or 8!
177db96d56Sopenharmony_ci#endif
187db96d56Sopenharmony_ci
/* True when `ch` is a UTF-8 continuation byte, i.e. has the bit pattern
   10xxxxxx (0x80..0xBF).  `ch` is evaluated twice, so the argument must not
   have side effects. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)

227db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_UCS4)
237db96d56Sopenharmony_ciSTRINGLIB(utf8_decode)(const char **inptr, const char *end,
247db96d56Sopenharmony_ci                       STRINGLIB_CHAR *dest,
257db96d56Sopenharmony_ci                       Py_ssize_t *outpos)
267db96d56Sopenharmony_ci{
277db96d56Sopenharmony_ci    Py_UCS4 ch;
287db96d56Sopenharmony_ci    const char *s = *inptr;
297db96d56Sopenharmony_ci    STRINGLIB_CHAR *p = dest + *outpos;
307db96d56Sopenharmony_ci
317db96d56Sopenharmony_ci    while (s < end) {
327db96d56Sopenharmony_ci        ch = (unsigned char)*s;
337db96d56Sopenharmony_ci
347db96d56Sopenharmony_ci        if (ch < 0x80) {
357db96d56Sopenharmony_ci            /* Fast path for runs of ASCII characters. Given that common UTF-8
367db96d56Sopenharmony_ci               input will consist of an overwhelming majority of ASCII
377db96d56Sopenharmony_ci               characters, we try to optimize for this case by checking
387db96d56Sopenharmony_ci               as many characters as a C 'size_t' can contain.
397db96d56Sopenharmony_ci               First, check if we can do an aligned read, as most CPUs have
407db96d56Sopenharmony_ci               a penalty for unaligned reads.
417db96d56Sopenharmony_ci            */
427db96d56Sopenharmony_ci            if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
437db96d56Sopenharmony_ci                /* Help register allocation */
447db96d56Sopenharmony_ci                const char *_s = s;
457db96d56Sopenharmony_ci                STRINGLIB_CHAR *_p = p;
467db96d56Sopenharmony_ci                while (_s + SIZEOF_SIZE_T <= end) {
477db96d56Sopenharmony_ci                    /* Read a whole size_t at a time (either 4 or 8 bytes),
487db96d56Sopenharmony_ci                       and do a fast unrolled copy if it only contains ASCII
497db96d56Sopenharmony_ci                       characters. */
507db96d56Sopenharmony_ci                    size_t value = *(const size_t *) _s;
517db96d56Sopenharmony_ci                    if (value & ASCII_CHAR_MASK)
527db96d56Sopenharmony_ci                        break;
537db96d56Sopenharmony_ci#if PY_LITTLE_ENDIAN
547db96d56Sopenharmony_ci                    _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
557db96d56Sopenharmony_ci                    _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
567db96d56Sopenharmony_ci                    _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
577db96d56Sopenharmony_ci                    _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
587db96d56Sopenharmony_ci# if SIZEOF_SIZE_T == 8
597db96d56Sopenharmony_ci                    _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
607db96d56Sopenharmony_ci                    _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
617db96d56Sopenharmony_ci                    _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
627db96d56Sopenharmony_ci                    _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
637db96d56Sopenharmony_ci# endif
647db96d56Sopenharmony_ci#else
657db96d56Sopenharmony_ci# if SIZEOF_SIZE_T == 8
667db96d56Sopenharmony_ci                    _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
677db96d56Sopenharmony_ci                    _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
687db96d56Sopenharmony_ci                    _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
697db96d56Sopenharmony_ci                    _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
707db96d56Sopenharmony_ci                    _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
717db96d56Sopenharmony_ci                    _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
727db96d56Sopenharmony_ci                    _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
737db96d56Sopenharmony_ci                    _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
747db96d56Sopenharmony_ci# else
757db96d56Sopenharmony_ci                    _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
767db96d56Sopenharmony_ci                    _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
777db96d56Sopenharmony_ci                    _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
787db96d56Sopenharmony_ci                    _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
797db96d56Sopenharmony_ci# endif
807db96d56Sopenharmony_ci#endif
817db96d56Sopenharmony_ci                    _s += SIZEOF_SIZE_T;
827db96d56Sopenharmony_ci                    _p += SIZEOF_SIZE_T;
837db96d56Sopenharmony_ci                }
847db96d56Sopenharmony_ci                s = _s;
857db96d56Sopenharmony_ci                p = _p;
867db96d56Sopenharmony_ci                if (s == end)
877db96d56Sopenharmony_ci                    break;
887db96d56Sopenharmony_ci                ch = (unsigned char)*s;
897db96d56Sopenharmony_ci            }
907db96d56Sopenharmony_ci            if (ch < 0x80) {
917db96d56Sopenharmony_ci                s++;
927db96d56Sopenharmony_ci                *p++ = ch;
937db96d56Sopenharmony_ci                continue;
947db96d56Sopenharmony_ci            }
957db96d56Sopenharmony_ci        }
967db96d56Sopenharmony_ci
977db96d56Sopenharmony_ci        if (ch < 0xE0) {
987db96d56Sopenharmony_ci            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
997db96d56Sopenharmony_ci            Py_UCS4 ch2;
1007db96d56Sopenharmony_ci            if (ch < 0xC2) {
1017db96d56Sopenharmony_ci                /* invalid sequence
1027db96d56Sopenharmony_ci                \x80-\xBF -- continuation byte
1037db96d56Sopenharmony_ci                \xC0-\xC1 -- fake 0000-007F */
1047db96d56Sopenharmony_ci                goto InvalidStart;
1057db96d56Sopenharmony_ci            }
1067db96d56Sopenharmony_ci            if (end - s < 2) {
1077db96d56Sopenharmony_ci                /* unexpected end of data: the caller will decide whether
1087db96d56Sopenharmony_ci                   it's an error or not */
1097db96d56Sopenharmony_ci                break;
1107db96d56Sopenharmony_ci            }
1117db96d56Sopenharmony_ci            ch2 = (unsigned char)s[1];
1127db96d56Sopenharmony_ci            if (!IS_CONTINUATION_BYTE(ch2))
1137db96d56Sopenharmony_ci                /* invalid continuation byte */
1147db96d56Sopenharmony_ci                goto InvalidContinuation1;
1157db96d56Sopenharmony_ci            ch = (ch << 6) + ch2 -
1167db96d56Sopenharmony_ci                 ((0xC0 << 6) + 0x80);
1177db96d56Sopenharmony_ci            assert ((ch > 0x007F) && (ch <= 0x07FF));
1187db96d56Sopenharmony_ci            s += 2;
1197db96d56Sopenharmony_ci            if (STRINGLIB_MAX_CHAR <= 0x007F ||
1207db96d56Sopenharmony_ci                (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
1217db96d56Sopenharmony_ci                /* Out-of-range */
1227db96d56Sopenharmony_ci                goto Return;
1237db96d56Sopenharmony_ci            *p++ = ch;
1247db96d56Sopenharmony_ci            continue;
1257db96d56Sopenharmony_ci        }
1267db96d56Sopenharmony_ci
1277db96d56Sopenharmony_ci        if (ch < 0xF0) {
1287db96d56Sopenharmony_ci            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
1297db96d56Sopenharmony_ci            Py_UCS4 ch2, ch3;
1307db96d56Sopenharmony_ci            if (end - s < 3) {
1317db96d56Sopenharmony_ci                /* unexpected end of data: the caller will decide whether
1327db96d56Sopenharmony_ci                   it's an error or not */
1337db96d56Sopenharmony_ci                if (end - s < 2)
1347db96d56Sopenharmony_ci                    break;
1357db96d56Sopenharmony_ci                ch2 = (unsigned char)s[1];
1367db96d56Sopenharmony_ci                if (!IS_CONTINUATION_BYTE(ch2) ||
1377db96d56Sopenharmony_ci                    (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
1387db96d56Sopenharmony_ci                    /* for clarification see comments below */
1397db96d56Sopenharmony_ci                    goto InvalidContinuation1;
1407db96d56Sopenharmony_ci                break;
1417db96d56Sopenharmony_ci            }
1427db96d56Sopenharmony_ci            ch2 = (unsigned char)s[1];
1437db96d56Sopenharmony_ci            ch3 = (unsigned char)s[2];
1447db96d56Sopenharmony_ci            if (!IS_CONTINUATION_BYTE(ch2)) {
1457db96d56Sopenharmony_ci                /* invalid continuation byte */
1467db96d56Sopenharmony_ci                goto InvalidContinuation1;
1477db96d56Sopenharmony_ci            }
1487db96d56Sopenharmony_ci            if (ch == 0xE0) {
1497db96d56Sopenharmony_ci                if (ch2 < 0xA0)
1507db96d56Sopenharmony_ci                    /* invalid sequence
1517db96d56Sopenharmony_ci                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
1527db96d56Sopenharmony_ci                    goto InvalidContinuation1;
1537db96d56Sopenharmony_ci            } else if (ch == 0xED && ch2 >= 0xA0) {
1547db96d56Sopenharmony_ci                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
1557db96d56Sopenharmony_ci                   will result in surrogates in range D800-DFFF. Surrogates are
1567db96d56Sopenharmony_ci                   not valid UTF-8 so they are rejected.
1577db96d56Sopenharmony_ci                   See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1587db96d56Sopenharmony_ci                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
1597db96d56Sopenharmony_ci                goto InvalidContinuation1;
1607db96d56Sopenharmony_ci            }
1617db96d56Sopenharmony_ci            if (!IS_CONTINUATION_BYTE(ch3)) {
1627db96d56Sopenharmony_ci                /* invalid continuation byte */
1637db96d56Sopenharmony_ci                goto InvalidContinuation2;
1647db96d56Sopenharmony_ci            }
1657db96d56Sopenharmony_ci            ch = (ch << 12) + (ch2 << 6) + ch3 -
1667db96d56Sopenharmony_ci                 ((0xE0 << 12) + (0x80 << 6) + 0x80);
1677db96d56Sopenharmony_ci            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1687db96d56Sopenharmony_ci            s += 3;
1697db96d56Sopenharmony_ci            if (STRINGLIB_MAX_CHAR <= 0x07FF ||
1707db96d56Sopenharmony_ci                (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
1717db96d56Sopenharmony_ci                /* Out-of-range */
1727db96d56Sopenharmony_ci                goto Return;
1737db96d56Sopenharmony_ci            *p++ = ch;
1747db96d56Sopenharmony_ci            continue;
1757db96d56Sopenharmony_ci        }
1767db96d56Sopenharmony_ci
1777db96d56Sopenharmony_ci        if (ch < 0xF5) {
1787db96d56Sopenharmony_ci            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
1797db96d56Sopenharmony_ci            Py_UCS4 ch2, ch3, ch4;
1807db96d56Sopenharmony_ci            if (end - s < 4) {
1817db96d56Sopenharmony_ci                /* unexpected end of data: the caller will decide whether
1827db96d56Sopenharmony_ci                   it's an error or not */
1837db96d56Sopenharmony_ci                if (end - s < 2)
1847db96d56Sopenharmony_ci                    break;
1857db96d56Sopenharmony_ci                ch2 = (unsigned char)s[1];
1867db96d56Sopenharmony_ci                if (!IS_CONTINUATION_BYTE(ch2) ||
1877db96d56Sopenharmony_ci                    (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
1887db96d56Sopenharmony_ci                    /* for clarification see comments below */
1897db96d56Sopenharmony_ci                    goto InvalidContinuation1;
1907db96d56Sopenharmony_ci                if (end - s < 3)
1917db96d56Sopenharmony_ci                    break;
1927db96d56Sopenharmony_ci                ch3 = (unsigned char)s[2];
1937db96d56Sopenharmony_ci                if (!IS_CONTINUATION_BYTE(ch3))
1947db96d56Sopenharmony_ci                    goto InvalidContinuation2;
1957db96d56Sopenharmony_ci                break;
1967db96d56Sopenharmony_ci            }
1977db96d56Sopenharmony_ci            ch2 = (unsigned char)s[1];
1987db96d56Sopenharmony_ci            ch3 = (unsigned char)s[2];
1997db96d56Sopenharmony_ci            ch4 = (unsigned char)s[3];
2007db96d56Sopenharmony_ci            if (!IS_CONTINUATION_BYTE(ch2)) {
2017db96d56Sopenharmony_ci                /* invalid continuation byte */
2027db96d56Sopenharmony_ci                goto InvalidContinuation1;
2037db96d56Sopenharmony_ci            }
2047db96d56Sopenharmony_ci            if (ch == 0xF0) {
2057db96d56Sopenharmony_ci                if (ch2 < 0x90)
2067db96d56Sopenharmony_ci                    /* invalid sequence
2077db96d56Sopenharmony_ci                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
2087db96d56Sopenharmony_ci                    goto InvalidContinuation1;
2097db96d56Sopenharmony_ci            } else if (ch == 0xF4 && ch2 >= 0x90) {
2107db96d56Sopenharmony_ci                /* invalid sequence
2117db96d56Sopenharmony_ci                   \xF4\x90\x80\x80- -- 110000- overflow */
2127db96d56Sopenharmony_ci                goto InvalidContinuation1;
2137db96d56Sopenharmony_ci            }
2147db96d56Sopenharmony_ci            if (!IS_CONTINUATION_BYTE(ch3)) {
2157db96d56Sopenharmony_ci                /* invalid continuation byte */
2167db96d56Sopenharmony_ci                goto InvalidContinuation2;
2177db96d56Sopenharmony_ci            }
2187db96d56Sopenharmony_ci            if (!IS_CONTINUATION_BYTE(ch4)) {
2197db96d56Sopenharmony_ci                /* invalid continuation byte */
2207db96d56Sopenharmony_ci                goto InvalidContinuation3;
2217db96d56Sopenharmony_ci            }
2227db96d56Sopenharmony_ci            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
2237db96d56Sopenharmony_ci                 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
2247db96d56Sopenharmony_ci            assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
2257db96d56Sopenharmony_ci            s += 4;
2267db96d56Sopenharmony_ci            if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
2277db96d56Sopenharmony_ci                (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
2287db96d56Sopenharmony_ci                /* Out-of-range */
2297db96d56Sopenharmony_ci                goto Return;
2307db96d56Sopenharmony_ci            *p++ = ch;
2317db96d56Sopenharmony_ci            continue;
2327db96d56Sopenharmony_ci        }
2337db96d56Sopenharmony_ci        goto InvalidStart;
2347db96d56Sopenharmony_ci    }
2357db96d56Sopenharmony_ci    ch = 0;
2367db96d56Sopenharmony_ciReturn:
2377db96d56Sopenharmony_ci    *inptr = s;
2387db96d56Sopenharmony_ci    *outpos = p - dest;
2397db96d56Sopenharmony_ci    return ch;
2407db96d56Sopenharmony_ciInvalidStart:
2417db96d56Sopenharmony_ci    ch = 1;
2427db96d56Sopenharmony_ci    goto Return;
2437db96d56Sopenharmony_ciInvalidContinuation1:
2447db96d56Sopenharmony_ci    ch = 2;
2457db96d56Sopenharmony_ci    goto Return;
2467db96d56Sopenharmony_ciInvalidContinuation2:
2477db96d56Sopenharmony_ci    ch = 3;
2487db96d56Sopenharmony_ci    goto Return;
2497db96d56Sopenharmony_ciInvalidContinuation3:
2507db96d56Sopenharmony_ci    ch = 4;
2517db96d56Sopenharmony_ci    goto Return;
2527db96d56Sopenharmony_ci}
2537db96d56Sopenharmony_ci
/* ASCII_CHAR_MASK is only needed by the UTF-8 decoder above. */
#undef ASCII_CHAR_MASK


2577db96d56Sopenharmony_ci/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
2587db96d56Sopenharmony_ci   PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
2597db96d56Sopenharmony_ci   UCS-1 strings don't need to handle surrogates for example. */
2607db96d56Sopenharmony_ciPy_LOCAL_INLINE(char *)
2617db96d56Sopenharmony_ciSTRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
2627db96d56Sopenharmony_ci                        PyObject *unicode,
2637db96d56Sopenharmony_ci                        const STRINGLIB_CHAR *data,
2647db96d56Sopenharmony_ci                        Py_ssize_t size,
2657db96d56Sopenharmony_ci                        _Py_error_handler error_handler,
2667db96d56Sopenharmony_ci                        const char *errors)
2677db96d56Sopenharmony_ci{
2687db96d56Sopenharmony_ci    Py_ssize_t i;                /* index into data of next input character */
2697db96d56Sopenharmony_ci    char *p;                     /* next free byte in output buffer */
2707db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
2717db96d56Sopenharmony_ci    PyObject *error_handler_obj = NULL;
2727db96d56Sopenharmony_ci    PyObject *exc = NULL;
2737db96d56Sopenharmony_ci    PyObject *rep = NULL;
2747db96d56Sopenharmony_ci#endif
2757db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1
2767db96d56Sopenharmony_ci    const Py_ssize_t max_char_size = 2;
2777db96d56Sopenharmony_ci#elif STRINGLIB_SIZEOF_CHAR == 2
2787db96d56Sopenharmony_ci    const Py_ssize_t max_char_size = 3;
2797db96d56Sopenharmony_ci#else /*  STRINGLIB_SIZEOF_CHAR == 4 */
2807db96d56Sopenharmony_ci    const Py_ssize_t max_char_size = 4;
2817db96d56Sopenharmony_ci#endif
2827db96d56Sopenharmony_ci
2837db96d56Sopenharmony_ci    assert(size >= 0);
2847db96d56Sopenharmony_ci    if (size > PY_SSIZE_T_MAX / max_char_size) {
2857db96d56Sopenharmony_ci        /* integer overflow */
2867db96d56Sopenharmony_ci        PyErr_NoMemory();
2877db96d56Sopenharmony_ci        return NULL;
2887db96d56Sopenharmony_ci    }
2897db96d56Sopenharmony_ci
2907db96d56Sopenharmony_ci    _PyBytesWriter_Init(writer);
2917db96d56Sopenharmony_ci    p = _PyBytesWriter_Alloc(writer, size * max_char_size);
2927db96d56Sopenharmony_ci    if (p == NULL)
2937db96d56Sopenharmony_ci        return NULL;
2947db96d56Sopenharmony_ci
2957db96d56Sopenharmony_ci    for (i = 0; i < size;) {
2967db96d56Sopenharmony_ci        Py_UCS4 ch = data[i++];
2977db96d56Sopenharmony_ci
2987db96d56Sopenharmony_ci        if (ch < 0x80) {
2997db96d56Sopenharmony_ci            /* Encode ASCII */
3007db96d56Sopenharmony_ci            *p++ = (char) ch;
3017db96d56Sopenharmony_ci
3027db96d56Sopenharmony_ci        }
3037db96d56Sopenharmony_ci        else
3047db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
3057db96d56Sopenharmony_ci        if (ch < 0x0800)
3067db96d56Sopenharmony_ci#endif
3077db96d56Sopenharmony_ci        {
3087db96d56Sopenharmony_ci            /* Encode Latin-1 */
3097db96d56Sopenharmony_ci            *p++ = (char)(0xc0 | (ch >> 6));
3107db96d56Sopenharmony_ci            *p++ = (char)(0x80 | (ch & 0x3f));
3117db96d56Sopenharmony_ci        }
3127db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
3137db96d56Sopenharmony_ci        else if (Py_UNICODE_IS_SURROGATE(ch)) {
3147db96d56Sopenharmony_ci            Py_ssize_t startpos, endpos, newpos;
3157db96d56Sopenharmony_ci            Py_ssize_t k;
3167db96d56Sopenharmony_ci            if (error_handler == _Py_ERROR_UNKNOWN) {
3177db96d56Sopenharmony_ci                error_handler = _Py_GetErrorHandler(errors);
3187db96d56Sopenharmony_ci            }
3197db96d56Sopenharmony_ci
3207db96d56Sopenharmony_ci            startpos = i-1;
3217db96d56Sopenharmony_ci            endpos = startpos+1;
3227db96d56Sopenharmony_ci
3237db96d56Sopenharmony_ci            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
3247db96d56Sopenharmony_ci                endpos++;
3257db96d56Sopenharmony_ci
3267db96d56Sopenharmony_ci            /* Only overallocate the buffer if it's not the last write */
3277db96d56Sopenharmony_ci            writer->overallocate = (endpos < size);
3287db96d56Sopenharmony_ci
3297db96d56Sopenharmony_ci            switch (error_handler)
3307db96d56Sopenharmony_ci            {
3317db96d56Sopenharmony_ci            case _Py_ERROR_REPLACE:
3327db96d56Sopenharmony_ci                memset(p, '?', endpos - startpos);
3337db96d56Sopenharmony_ci                p += (endpos - startpos);
3347db96d56Sopenharmony_ci                /* fall through */
3357db96d56Sopenharmony_ci            case _Py_ERROR_IGNORE:
3367db96d56Sopenharmony_ci                i += (endpos - startpos - 1);
3377db96d56Sopenharmony_ci                break;
3387db96d56Sopenharmony_ci
3397db96d56Sopenharmony_ci            case _Py_ERROR_SURROGATEPASS:
3407db96d56Sopenharmony_ci                for (k=startpos; k<endpos; k++) {
3417db96d56Sopenharmony_ci                    ch = data[k];
3427db96d56Sopenharmony_ci                    *p++ = (char)(0xe0 | (ch >> 12));
3437db96d56Sopenharmony_ci                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3447db96d56Sopenharmony_ci                    *p++ = (char)(0x80 | (ch & 0x3f));
3457db96d56Sopenharmony_ci                }
3467db96d56Sopenharmony_ci                i += (endpos - startpos - 1);
3477db96d56Sopenharmony_ci                break;
3487db96d56Sopenharmony_ci
3497db96d56Sopenharmony_ci            case _Py_ERROR_BACKSLASHREPLACE:
3507db96d56Sopenharmony_ci                /* subtract preallocated bytes */
3517db96d56Sopenharmony_ci                writer->min_size -= max_char_size * (endpos - startpos);
3527db96d56Sopenharmony_ci                p = backslashreplace(writer, p,
3537db96d56Sopenharmony_ci                                     unicode, startpos, endpos);
3547db96d56Sopenharmony_ci                if (p == NULL)
3557db96d56Sopenharmony_ci                    goto error;
3567db96d56Sopenharmony_ci                i += (endpos - startpos - 1);
3577db96d56Sopenharmony_ci                break;
3587db96d56Sopenharmony_ci
3597db96d56Sopenharmony_ci            case _Py_ERROR_XMLCHARREFREPLACE:
3607db96d56Sopenharmony_ci                /* subtract preallocated bytes */
3617db96d56Sopenharmony_ci                writer->min_size -= max_char_size * (endpos - startpos);
3627db96d56Sopenharmony_ci                p = xmlcharrefreplace(writer, p,
3637db96d56Sopenharmony_ci                                      unicode, startpos, endpos);
3647db96d56Sopenharmony_ci                if (p == NULL)
3657db96d56Sopenharmony_ci                    goto error;
3667db96d56Sopenharmony_ci                i += (endpos - startpos - 1);
3677db96d56Sopenharmony_ci                break;
3687db96d56Sopenharmony_ci
3697db96d56Sopenharmony_ci            case _Py_ERROR_SURROGATEESCAPE:
3707db96d56Sopenharmony_ci                for (k=startpos; k<endpos; k++) {
3717db96d56Sopenharmony_ci                    ch = data[k];
3727db96d56Sopenharmony_ci                    if (!(0xDC80 <= ch && ch <= 0xDCFF))
3737db96d56Sopenharmony_ci                        break;
3747db96d56Sopenharmony_ci                    *p++ = (char)(ch & 0xff);
3757db96d56Sopenharmony_ci                }
3767db96d56Sopenharmony_ci                if (k >= endpos) {
3777db96d56Sopenharmony_ci                    i += (endpos - startpos - 1);
3787db96d56Sopenharmony_ci                    break;
3797db96d56Sopenharmony_ci                }
3807db96d56Sopenharmony_ci                startpos = k;
3817db96d56Sopenharmony_ci                assert(startpos < endpos);
3827db96d56Sopenharmony_ci                /* fall through */
3837db96d56Sopenharmony_ci            default:
3847db96d56Sopenharmony_ci                rep = unicode_encode_call_errorhandler(
3857db96d56Sopenharmony_ci                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
3867db96d56Sopenharmony_ci                      unicode, &exc, startpos, endpos, &newpos);
3877db96d56Sopenharmony_ci                if (!rep)
3887db96d56Sopenharmony_ci                    goto error;
3897db96d56Sopenharmony_ci
3907db96d56Sopenharmony_ci                if (newpos < startpos) {
3917db96d56Sopenharmony_ci                    writer->overallocate = 1;
3927db96d56Sopenharmony_ci                    p = _PyBytesWriter_Prepare(writer, p,
3937db96d56Sopenharmony_ci                                               max_char_size * (startpos - newpos));
3947db96d56Sopenharmony_ci                    if (p == NULL)
3957db96d56Sopenharmony_ci                        goto error;
3967db96d56Sopenharmony_ci                }
3977db96d56Sopenharmony_ci                else {
3987db96d56Sopenharmony_ci                    /* subtract preallocated bytes */
3997db96d56Sopenharmony_ci                    writer->min_size -= max_char_size * (newpos - startpos);
4007db96d56Sopenharmony_ci                    /* Only overallocate the buffer if it's not the last write */
4017db96d56Sopenharmony_ci                    writer->overallocate = (newpos < size);
4027db96d56Sopenharmony_ci                }
4037db96d56Sopenharmony_ci
4047db96d56Sopenharmony_ci                if (PyBytes_Check(rep)) {
4057db96d56Sopenharmony_ci                    p = _PyBytesWriter_WriteBytes(writer, p,
4067db96d56Sopenharmony_ci                                                  PyBytes_AS_STRING(rep),
4077db96d56Sopenharmony_ci                                                  PyBytes_GET_SIZE(rep));
4087db96d56Sopenharmony_ci                }
4097db96d56Sopenharmony_ci                else {
4107db96d56Sopenharmony_ci                    /* rep is unicode */
4117db96d56Sopenharmony_ci                    if (PyUnicode_READY(rep) < 0)
4127db96d56Sopenharmony_ci                        goto error;
4137db96d56Sopenharmony_ci
4147db96d56Sopenharmony_ci                    if (!PyUnicode_IS_ASCII(rep)) {
4157db96d56Sopenharmony_ci                        raise_encode_exception(&exc, "utf-8", unicode,
4167db96d56Sopenharmony_ci                                               startpos, endpos,
4177db96d56Sopenharmony_ci                                               "surrogates not allowed");
4187db96d56Sopenharmony_ci                        goto error;
4197db96d56Sopenharmony_ci                    }
4207db96d56Sopenharmony_ci
4217db96d56Sopenharmony_ci                    p = _PyBytesWriter_WriteBytes(writer, p,
4227db96d56Sopenharmony_ci                                                  PyUnicode_DATA(rep),
4237db96d56Sopenharmony_ci                                                  PyUnicode_GET_LENGTH(rep));
4247db96d56Sopenharmony_ci                }
4257db96d56Sopenharmony_ci
4267db96d56Sopenharmony_ci                if (p == NULL)
4277db96d56Sopenharmony_ci                    goto error;
4287db96d56Sopenharmony_ci                Py_CLEAR(rep);
4297db96d56Sopenharmony_ci
4307db96d56Sopenharmony_ci                i = newpos;
4317db96d56Sopenharmony_ci            }
4327db96d56Sopenharmony_ci
4337db96d56Sopenharmony_ci            /* If overallocation was disabled, ensure that it was the last
4347db96d56Sopenharmony_ci               write. Otherwise, we missed an optimization */
4357db96d56Sopenharmony_ci            assert(writer->overallocate || i == size);
4367db96d56Sopenharmony_ci        }
4377db96d56Sopenharmony_ci        else
4387db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 2
4397db96d56Sopenharmony_ci        if (ch < 0x10000)
4407db96d56Sopenharmony_ci#endif
4417db96d56Sopenharmony_ci        {
4427db96d56Sopenharmony_ci            *p++ = (char)(0xe0 | (ch >> 12));
4437db96d56Sopenharmony_ci            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4447db96d56Sopenharmony_ci            *p++ = (char)(0x80 | (ch & 0x3f));
4457db96d56Sopenharmony_ci        }
4467db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 2
4477db96d56Sopenharmony_ci        else /* ch >= 0x10000 */
4487db96d56Sopenharmony_ci        {
4497db96d56Sopenharmony_ci            assert(ch <= MAX_UNICODE);
4507db96d56Sopenharmony_ci            /* Encode UCS4 Unicode ordinals */
4517db96d56Sopenharmony_ci            *p++ = (char)(0xf0 | (ch >> 18));
4527db96d56Sopenharmony_ci            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4537db96d56Sopenharmony_ci            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4547db96d56Sopenharmony_ci            *p++ = (char)(0x80 | (ch & 0x3f));
4557db96d56Sopenharmony_ci        }
4567db96d56Sopenharmony_ci#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
4577db96d56Sopenharmony_ci#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
4587db96d56Sopenharmony_ci    }
4597db96d56Sopenharmony_ci
4607db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
4617db96d56Sopenharmony_ci    Py_XDECREF(error_handler_obj);
4627db96d56Sopenharmony_ci    Py_XDECREF(exc);
4637db96d56Sopenharmony_ci#endif
4647db96d56Sopenharmony_ci    return p;
4657db96d56Sopenharmony_ci
4667db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
4677db96d56Sopenharmony_ci error:
4687db96d56Sopenharmony_ci    Py_XDECREF(rep);
4697db96d56Sopenharmony_ci    Py_XDECREF(error_handler_obj);
4707db96d56Sopenharmony_ci    Py_XDECREF(exc);
4717db96d56Sopenharmony_ci    return NULL;
4727db96d56Sopenharmony_ci#endif
4737db96d56Sopenharmony_ci}
4747db96d56Sopenharmony_ci
4757db96d56Sopenharmony_ci/* The pattern for constructing UCS2-repeated masks. */
4767db96d56Sopenharmony_ci#if SIZEOF_LONG == 8
4777db96d56Sopenharmony_ci# define UCS2_REPEAT_MASK 0x0001000100010001ul
4787db96d56Sopenharmony_ci#elif SIZEOF_LONG == 4
4797db96d56Sopenharmony_ci# define UCS2_REPEAT_MASK 0x00010001ul
4807db96d56Sopenharmony_ci#else
4817db96d56Sopenharmony_ci# error C 'long' size should be either 4 or 8!
4827db96d56Sopenharmony_ci#endif
4837db96d56Sopenharmony_ci
/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains a
   non-ASCII or non-Latin1 UTF16-encoded characters: in every 16-bit lane it
   selects the bits that lie above STRINGLIB_MAX_CHAR. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.  Only the top bit of every 16-bit lane is tested.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping: selects the low byte of every 16-bit
   lane of a C 'long'. */
#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap the two bytes inside each 16-bit lane of `value`.  `value` is
   evaluated twice, so the argument must not have side effects. */
#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))

5037db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_UCS4)
5047db96d56Sopenharmony_ciSTRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
5057db96d56Sopenharmony_ci                        STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
5067db96d56Sopenharmony_ci                        int native_ordering)
5077db96d56Sopenharmony_ci{
5087db96d56Sopenharmony_ci    Py_UCS4 ch;
5097db96d56Sopenharmony_ci    const unsigned char *q = *inptr;
5107db96d56Sopenharmony_ci    STRINGLIB_CHAR *p = dest + *outpos;
5117db96d56Sopenharmony_ci    /* Offsets from q for retrieving byte pairs in the right order. */
5127db96d56Sopenharmony_ci#if PY_LITTLE_ENDIAN
5137db96d56Sopenharmony_ci    int ihi = !!native_ordering, ilo = !native_ordering;
5147db96d56Sopenharmony_ci#else
5157db96d56Sopenharmony_ci    int ihi = !native_ordering, ilo = !!native_ordering;
5167db96d56Sopenharmony_ci#endif
5177db96d56Sopenharmony_ci    --e;
5187db96d56Sopenharmony_ci
5197db96d56Sopenharmony_ci    while (q < e) {
5207db96d56Sopenharmony_ci        Py_UCS4 ch2;
5217db96d56Sopenharmony_ci        /* First check for possible aligned read of a C 'long'. Unaligned
5227db96d56Sopenharmony_ci           reads are more expensive, better to defer to another iteration. */
5237db96d56Sopenharmony_ci        if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
5247db96d56Sopenharmony_ci            /* Fast path for runs of in-range non-surrogate chars. */
5257db96d56Sopenharmony_ci            const unsigned char *_q = q;
5267db96d56Sopenharmony_ci            while (_q + SIZEOF_LONG <= e) {
5277db96d56Sopenharmony_ci                unsigned long block = * (const unsigned long *) _q;
5287db96d56Sopenharmony_ci                if (native_ordering) {
5297db96d56Sopenharmony_ci                    /* Can use buffer directly */
5307db96d56Sopenharmony_ci                    if (block & FAST_CHAR_MASK)
5317db96d56Sopenharmony_ci                        break;
5327db96d56Sopenharmony_ci                }
5337db96d56Sopenharmony_ci                else {
5347db96d56Sopenharmony_ci                    /* Need to byte-swap */
5357db96d56Sopenharmony_ci                    if (block & SWAB(FAST_CHAR_MASK))
5367db96d56Sopenharmony_ci                        break;
5377db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1
5387db96d56Sopenharmony_ci                    block >>= 8;
5397db96d56Sopenharmony_ci#else
5407db96d56Sopenharmony_ci                    block = SWAB(block);
5417db96d56Sopenharmony_ci#endif
5427db96d56Sopenharmony_ci                }
5437db96d56Sopenharmony_ci#if PY_LITTLE_ENDIAN
5447db96d56Sopenharmony_ci# if SIZEOF_LONG == 4
5457db96d56Sopenharmony_ci                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
5467db96d56Sopenharmony_ci                p[1] = (STRINGLIB_CHAR)(block >> 16);
5477db96d56Sopenharmony_ci# elif SIZEOF_LONG == 8
5487db96d56Sopenharmony_ci                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
5497db96d56Sopenharmony_ci                p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
5507db96d56Sopenharmony_ci                p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
5517db96d56Sopenharmony_ci                p[3] = (STRINGLIB_CHAR)(block >> 48);
5527db96d56Sopenharmony_ci# endif
5537db96d56Sopenharmony_ci#else
5547db96d56Sopenharmony_ci# if SIZEOF_LONG == 4
5557db96d56Sopenharmony_ci                p[0] = (STRINGLIB_CHAR)(block >> 16);
5567db96d56Sopenharmony_ci                p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
5577db96d56Sopenharmony_ci# elif SIZEOF_LONG == 8
5587db96d56Sopenharmony_ci                p[0] = (STRINGLIB_CHAR)(block >> 48);
5597db96d56Sopenharmony_ci                p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
5607db96d56Sopenharmony_ci                p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
5617db96d56Sopenharmony_ci                p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
5627db96d56Sopenharmony_ci# endif
5637db96d56Sopenharmony_ci#endif
5647db96d56Sopenharmony_ci                _q += SIZEOF_LONG;
5657db96d56Sopenharmony_ci                p += SIZEOF_LONG / 2;
5667db96d56Sopenharmony_ci            }
5677db96d56Sopenharmony_ci            q = _q;
5687db96d56Sopenharmony_ci            if (q >= e)
5697db96d56Sopenharmony_ci                break;
5707db96d56Sopenharmony_ci        }
5717db96d56Sopenharmony_ci
5727db96d56Sopenharmony_ci        ch = (q[ihi] << 8) | q[ilo];
5737db96d56Sopenharmony_ci        q += 2;
5747db96d56Sopenharmony_ci        if (!Py_UNICODE_IS_SURROGATE(ch)) {
5757db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR < 2
5767db96d56Sopenharmony_ci            if (ch > STRINGLIB_MAX_CHAR)
5777db96d56Sopenharmony_ci                /* Out-of-range */
5787db96d56Sopenharmony_ci                goto Return;
5797db96d56Sopenharmony_ci#endif
5807db96d56Sopenharmony_ci            *p++ = (STRINGLIB_CHAR)ch;
5817db96d56Sopenharmony_ci            continue;
5827db96d56Sopenharmony_ci        }
5837db96d56Sopenharmony_ci
5847db96d56Sopenharmony_ci        /* UTF-16 code pair: */
5857db96d56Sopenharmony_ci        if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
5867db96d56Sopenharmony_ci            goto IllegalEncoding;
5877db96d56Sopenharmony_ci        if (q >= e)
5887db96d56Sopenharmony_ci            goto UnexpectedEnd;
5897db96d56Sopenharmony_ci        ch2 = (q[ihi] << 8) | q[ilo];
5907db96d56Sopenharmony_ci        q += 2;
5917db96d56Sopenharmony_ci        if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
5927db96d56Sopenharmony_ci            goto IllegalSurrogate;
5937db96d56Sopenharmony_ci        ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
5947db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR < 4
5957db96d56Sopenharmony_ci        /* Out-of-range */
5967db96d56Sopenharmony_ci        goto Return;
5977db96d56Sopenharmony_ci#else
5987db96d56Sopenharmony_ci        *p++ = (STRINGLIB_CHAR)ch;
5997db96d56Sopenharmony_ci#endif
6007db96d56Sopenharmony_ci    }
6017db96d56Sopenharmony_ci    ch = 0;
6027db96d56Sopenharmony_ciReturn:
6037db96d56Sopenharmony_ci    *inptr = q;
6047db96d56Sopenharmony_ci    *outpos = p - dest;
6057db96d56Sopenharmony_ci    return ch;
6067db96d56Sopenharmony_ciUnexpectedEnd:
6077db96d56Sopenharmony_ci    ch = 1;
6087db96d56Sopenharmony_ci    goto Return;
6097db96d56Sopenharmony_ciIllegalEncoding:
6107db96d56Sopenharmony_ci    ch = 2;
6117db96d56Sopenharmony_ci    goto Return;
6127db96d56Sopenharmony_ciIllegalSurrogate:
6137db96d56Sopenharmony_ci    ch = 3;
6147db96d56Sopenharmony_ci    goto Return;
6157db96d56Sopenharmony_ci}
6167db96d56Sopenharmony_ci#undef UCS2_REPEAT_MASK
6177db96d56Sopenharmony_ci#undef FAST_CHAR_MASK
6187db96d56Sopenharmony_ci#undef STRIPPED_MASK
6197db96d56Sopenharmony_ci#undef SWAB
6207db96d56Sopenharmony_ci
6217db96d56Sopenharmony_ci
6227db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR >= 0x80
6237db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_ssize_t)
6247db96d56Sopenharmony_ciSTRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
6257db96d56Sopenharmony_ci                        Py_ssize_t len,
6267db96d56Sopenharmony_ci                        unsigned short **outptr,
6277db96d56Sopenharmony_ci                        int native_ordering)
6287db96d56Sopenharmony_ci{
6297db96d56Sopenharmony_ci    unsigned short *out = *outptr;
6307db96d56Sopenharmony_ci    const STRINGLIB_CHAR *end = in + len;
6317db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1
6327db96d56Sopenharmony_ci    if (native_ordering) {
6337db96d56Sopenharmony_ci        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
6347db96d56Sopenharmony_ci        while (in < unrolled_end) {
6357db96d56Sopenharmony_ci            out[0] = in[0];
6367db96d56Sopenharmony_ci            out[1] = in[1];
6377db96d56Sopenharmony_ci            out[2] = in[2];
6387db96d56Sopenharmony_ci            out[3] = in[3];
6397db96d56Sopenharmony_ci            in += 4; out += 4;
6407db96d56Sopenharmony_ci        }
6417db96d56Sopenharmony_ci        while (in < end) {
6427db96d56Sopenharmony_ci            *out++ = *in++;
6437db96d56Sopenharmony_ci        }
6447db96d56Sopenharmony_ci    } else {
6457db96d56Sopenharmony_ci# define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
6467db96d56Sopenharmony_ci        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
6477db96d56Sopenharmony_ci        while (in < unrolled_end) {
6487db96d56Sopenharmony_ci            out[0] = SWAB2(in[0]);
6497db96d56Sopenharmony_ci            out[1] = SWAB2(in[1]);
6507db96d56Sopenharmony_ci            out[2] = SWAB2(in[2]);
6517db96d56Sopenharmony_ci            out[3] = SWAB2(in[3]);
6527db96d56Sopenharmony_ci            in += 4; out += 4;
6537db96d56Sopenharmony_ci        }
6547db96d56Sopenharmony_ci        while (in < end) {
6557db96d56Sopenharmony_ci            Py_UCS4 ch = *in++;
6567db96d56Sopenharmony_ci            *out++ = SWAB2((Py_UCS2)ch);
6577db96d56Sopenharmony_ci        }
6587db96d56Sopenharmony_ci#undef SWAB2
6597db96d56Sopenharmony_ci    }
6607db96d56Sopenharmony_ci    *outptr = out;
6617db96d56Sopenharmony_ci    return len;
6627db96d56Sopenharmony_ci#else
6637db96d56Sopenharmony_ci    if (native_ordering) {
6647db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR < 0x10000
6657db96d56Sopenharmony_ci        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
6667db96d56Sopenharmony_ci        while (in < unrolled_end) {
6677db96d56Sopenharmony_ci            /* check if any character is a surrogate character */
6687db96d56Sopenharmony_ci            if (((in[0] ^ 0xd800) &
6697db96d56Sopenharmony_ci                 (in[1] ^ 0xd800) &
6707db96d56Sopenharmony_ci                 (in[2] ^ 0xd800) &
6717db96d56Sopenharmony_ci                 (in[3] ^ 0xd800) & 0xf800) == 0)
6727db96d56Sopenharmony_ci                break;
6737db96d56Sopenharmony_ci            out[0] = in[0];
6747db96d56Sopenharmony_ci            out[1] = in[1];
6757db96d56Sopenharmony_ci            out[2] = in[2];
6767db96d56Sopenharmony_ci            out[3] = in[3];
6777db96d56Sopenharmony_ci            in += 4; out += 4;
6787db96d56Sopenharmony_ci        }
6797db96d56Sopenharmony_ci#endif
6807db96d56Sopenharmony_ci        while (in < end) {
6817db96d56Sopenharmony_ci            Py_UCS4 ch;
6827db96d56Sopenharmony_ci            ch = *in++;
6837db96d56Sopenharmony_ci            if (ch < 0xd800)
6847db96d56Sopenharmony_ci                *out++ = ch;
6857db96d56Sopenharmony_ci            else if (ch < 0xe000)
6867db96d56Sopenharmony_ci                /* reject surrogate characters (U+D800-U+DFFF) */
6877db96d56Sopenharmony_ci                goto fail;
6887db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR >= 0x10000
6897db96d56Sopenharmony_ci            else if (ch >= 0x10000) {
6907db96d56Sopenharmony_ci                out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
6917db96d56Sopenharmony_ci                out[1] = Py_UNICODE_LOW_SURROGATE(ch);
6927db96d56Sopenharmony_ci                out += 2;
6937db96d56Sopenharmony_ci            }
6947db96d56Sopenharmony_ci#endif
6957db96d56Sopenharmony_ci            else
6967db96d56Sopenharmony_ci                *out++ = ch;
6977db96d56Sopenharmony_ci        }
6987db96d56Sopenharmony_ci    } else {
6997db96d56Sopenharmony_ci#define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
7007db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR < 0x10000
7017db96d56Sopenharmony_ci        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
7027db96d56Sopenharmony_ci        while (in < unrolled_end) {
7037db96d56Sopenharmony_ci            /* check if any character is a surrogate character */
7047db96d56Sopenharmony_ci            if (((in[0] ^ 0xd800) &
7057db96d56Sopenharmony_ci                 (in[1] ^ 0xd800) &
7067db96d56Sopenharmony_ci                 (in[2] ^ 0xd800) &
7077db96d56Sopenharmony_ci                 (in[3] ^ 0xd800) & 0xf800) == 0)
7087db96d56Sopenharmony_ci                break;
7097db96d56Sopenharmony_ci            out[0] = SWAB2(in[0]);
7107db96d56Sopenharmony_ci            out[1] = SWAB2(in[1]);
7117db96d56Sopenharmony_ci            out[2] = SWAB2(in[2]);
7127db96d56Sopenharmony_ci            out[3] = SWAB2(in[3]);
7137db96d56Sopenharmony_ci            in += 4; out += 4;
7147db96d56Sopenharmony_ci        }
7157db96d56Sopenharmony_ci#endif
7167db96d56Sopenharmony_ci        while (in < end) {
7177db96d56Sopenharmony_ci            Py_UCS4 ch = *in++;
7187db96d56Sopenharmony_ci            if (ch < 0xd800)
7197db96d56Sopenharmony_ci                *out++ = SWAB2((Py_UCS2)ch);
7207db96d56Sopenharmony_ci            else if (ch < 0xe000)
7217db96d56Sopenharmony_ci                /* reject surrogate characters (U+D800-U+DFFF) */
7227db96d56Sopenharmony_ci                goto fail;
7237db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR >= 0x10000
7247db96d56Sopenharmony_ci            else if (ch >= 0x10000) {
7257db96d56Sopenharmony_ci                Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
7267db96d56Sopenharmony_ci                Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
7277db96d56Sopenharmony_ci                out[0] = SWAB2(ch1);
7287db96d56Sopenharmony_ci                out[1] = SWAB2(ch2);
7297db96d56Sopenharmony_ci                out += 2;
7307db96d56Sopenharmony_ci            }
7317db96d56Sopenharmony_ci#endif
7327db96d56Sopenharmony_ci            else
7337db96d56Sopenharmony_ci                *out++ = SWAB2((Py_UCS2)ch);
7347db96d56Sopenharmony_ci        }
7357db96d56Sopenharmony_ci#undef SWAB2
7367db96d56Sopenharmony_ci    }
7377db96d56Sopenharmony_ci    *outptr = out;
7387db96d56Sopenharmony_ci    return len;
7397db96d56Sopenharmony_ci  fail:
7407db96d56Sopenharmony_ci    *outptr = out;
7417db96d56Sopenharmony_ci    return len - (end - in + 1);
7427db96d56Sopenharmony_ci#endif
7437db96d56Sopenharmony_ci}
7447db96d56Sopenharmony_ci
7457db96d56Sopenharmony_cistatic inline uint32_t
7467db96d56Sopenharmony_ciSTRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
7477db96d56Sopenharmony_ci{
7487db96d56Sopenharmony_ci    uint32_t word = ch;
7497db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1
7507db96d56Sopenharmony_ci    /* high bytes are zero */
7517db96d56Sopenharmony_ci    return (word << 24);
7527db96d56Sopenharmony_ci#elif STRINGLIB_SIZEOF_CHAR == 2
7537db96d56Sopenharmony_ci    /* high bytes are zero */
7547db96d56Sopenharmony_ci    return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
7557db96d56Sopenharmony_ci#else
7567db96d56Sopenharmony_ci    return _Py_bswap32(word);
7577db96d56Sopenharmony_ci#endif
7587db96d56Sopenharmony_ci}
7597db96d56Sopenharmony_ci
7607db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_ssize_t)
7617db96d56Sopenharmony_ciSTRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
7627db96d56Sopenharmony_ci                        Py_ssize_t len,
7637db96d56Sopenharmony_ci                        uint32_t **outptr,
7647db96d56Sopenharmony_ci                        int native_ordering)
7657db96d56Sopenharmony_ci{
7667db96d56Sopenharmony_ci    uint32_t *out = *outptr;
7677db96d56Sopenharmony_ci    const STRINGLIB_CHAR *end = in + len;
7687db96d56Sopenharmony_ci    if (native_ordering) {
7697db96d56Sopenharmony_ci        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
7707db96d56Sopenharmony_ci        while (in < unrolled_end) {
7717db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
7727db96d56Sopenharmony_ci            /* check if any character is a surrogate character */
7737db96d56Sopenharmony_ci            if (((in[0] ^ 0xd800) &
7747db96d56Sopenharmony_ci                 (in[1] ^ 0xd800) &
7757db96d56Sopenharmony_ci                 (in[2] ^ 0xd800) &
7767db96d56Sopenharmony_ci                 (in[3] ^ 0xd800) & 0xf800) == 0)
7777db96d56Sopenharmony_ci                break;
7787db96d56Sopenharmony_ci#endif
7797db96d56Sopenharmony_ci            out[0] = in[0];
7807db96d56Sopenharmony_ci            out[1] = in[1];
7817db96d56Sopenharmony_ci            out[2] = in[2];
7827db96d56Sopenharmony_ci            out[3] = in[3];
7837db96d56Sopenharmony_ci            in += 4; out += 4;
7847db96d56Sopenharmony_ci        }
7857db96d56Sopenharmony_ci        while (in < end) {
7867db96d56Sopenharmony_ci            Py_UCS4 ch;
7877db96d56Sopenharmony_ci            ch = *in++;
7887db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
7897db96d56Sopenharmony_ci            if (Py_UNICODE_IS_SURROGATE(ch)) {
7907db96d56Sopenharmony_ci                /* reject surrogate characters (U+D800-U+DFFF) */
7917db96d56Sopenharmony_ci                goto fail;
7927db96d56Sopenharmony_ci            }
7937db96d56Sopenharmony_ci#endif
7947db96d56Sopenharmony_ci            *out++ = ch;
7957db96d56Sopenharmony_ci        }
7967db96d56Sopenharmony_ci    } else {
7977db96d56Sopenharmony_ci        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
7987db96d56Sopenharmony_ci        while (in < unrolled_end) {
7997db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
8007db96d56Sopenharmony_ci            /* check if any character is a surrogate character */
8017db96d56Sopenharmony_ci            if (((in[0] ^ 0xd800) &
8027db96d56Sopenharmony_ci                 (in[1] ^ 0xd800) &
8037db96d56Sopenharmony_ci                 (in[2] ^ 0xd800) &
8047db96d56Sopenharmony_ci                 (in[3] ^ 0xd800) & 0xf800) == 0)
8057db96d56Sopenharmony_ci                break;
8067db96d56Sopenharmony_ci#endif
8077db96d56Sopenharmony_ci            out[0] = STRINGLIB(SWAB4)(in[0]);
8087db96d56Sopenharmony_ci            out[1] = STRINGLIB(SWAB4)(in[1]);
8097db96d56Sopenharmony_ci            out[2] = STRINGLIB(SWAB4)(in[2]);
8107db96d56Sopenharmony_ci            out[3] = STRINGLIB(SWAB4)(in[3]);
8117db96d56Sopenharmony_ci            in += 4; out += 4;
8127db96d56Sopenharmony_ci        }
8137db96d56Sopenharmony_ci        while (in < end) {
8147db96d56Sopenharmony_ci            Py_UCS4 ch = *in++;
8157db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
8167db96d56Sopenharmony_ci            if (Py_UNICODE_IS_SURROGATE(ch)) {
8177db96d56Sopenharmony_ci                /* reject surrogate characters (U+D800-U+DFFF) */
8187db96d56Sopenharmony_ci                goto fail;
8197db96d56Sopenharmony_ci            }
8207db96d56Sopenharmony_ci#endif
8217db96d56Sopenharmony_ci            *out++ = STRINGLIB(SWAB4)(ch);
8227db96d56Sopenharmony_ci        }
8237db96d56Sopenharmony_ci    }
8247db96d56Sopenharmony_ci    *outptr = out;
8257db96d56Sopenharmony_ci    return len;
8267db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1
8277db96d56Sopenharmony_ci  fail:
8287db96d56Sopenharmony_ci    *outptr = out;
8297db96d56Sopenharmony_ci    return len - (end - in + 1);
8307db96d56Sopenharmony_ci#endif
8317db96d56Sopenharmony_ci}
8327db96d56Sopenharmony_ci
8337db96d56Sopenharmony_ci#endif
834