17db96d56Sopenharmony_ci/* stringlib: codec implementations */ 27db96d56Sopenharmony_ci 37db96d56Sopenharmony_ci#if !STRINGLIB_IS_UNICODE 47db96d56Sopenharmony_ci# error "codecs.h is specific to Unicode" 57db96d56Sopenharmony_ci#endif 67db96d56Sopenharmony_ci 77db96d56Sopenharmony_ci#include "pycore_bitutils.h" // _Py_bswap32() 87db96d56Sopenharmony_ci 97db96d56Sopenharmony_ci/* Mask to quickly check whether a C 'size_t' contains a 107db96d56Sopenharmony_ci non-ASCII, UTF8-encoded char. */ 117db96d56Sopenharmony_ci#if (SIZEOF_SIZE_T == 8) 127db96d56Sopenharmony_ci# define ASCII_CHAR_MASK 0x8080808080808080ULL 137db96d56Sopenharmony_ci#elif (SIZEOF_SIZE_T == 4) 147db96d56Sopenharmony_ci# define ASCII_CHAR_MASK 0x80808080U 157db96d56Sopenharmony_ci#else 167db96d56Sopenharmony_ci# error C 'size_t' size should be either 4 or 8! 177db96d56Sopenharmony_ci#endif 187db96d56Sopenharmony_ci 197db96d56Sopenharmony_ci/* 10xxxxxx */ 207db96d56Sopenharmony_ci#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) 217db96d56Sopenharmony_ci 227db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_UCS4) 237db96d56Sopenharmony_ciSTRINGLIB(utf8_decode)(const char **inptr, const char *end, 247db96d56Sopenharmony_ci STRINGLIB_CHAR *dest, 257db96d56Sopenharmony_ci Py_ssize_t *outpos) 267db96d56Sopenharmony_ci{ 277db96d56Sopenharmony_ci Py_UCS4 ch; 287db96d56Sopenharmony_ci const char *s = *inptr; 297db96d56Sopenharmony_ci STRINGLIB_CHAR *p = dest + *outpos; 307db96d56Sopenharmony_ci 317db96d56Sopenharmony_ci while (s < end) { 327db96d56Sopenharmony_ci ch = (unsigned char)*s; 337db96d56Sopenharmony_ci 347db96d56Sopenharmony_ci if (ch < 0x80) { 357db96d56Sopenharmony_ci /* Fast path for runs of ASCII characters. Given that common UTF-8 367db96d56Sopenharmony_ci input will consist of an overwhelming majority of ASCII 377db96d56Sopenharmony_ci characters, we try to optimize for this case by checking 387db96d56Sopenharmony_ci as many characters as a C 'size_t' can contain. 
397db96d56Sopenharmony_ci First, check if we can do an aligned read, as most CPUs have 407db96d56Sopenharmony_ci a penalty for unaligned reads. 417db96d56Sopenharmony_ci */ 427db96d56Sopenharmony_ci if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { 437db96d56Sopenharmony_ci /* Help register allocation */ 447db96d56Sopenharmony_ci const char *_s = s; 457db96d56Sopenharmony_ci STRINGLIB_CHAR *_p = p; 467db96d56Sopenharmony_ci while (_s + SIZEOF_SIZE_T <= end) { 477db96d56Sopenharmony_ci /* Read a whole size_t at a time (either 4 or 8 bytes), 487db96d56Sopenharmony_ci and do a fast unrolled copy if it only contains ASCII 497db96d56Sopenharmony_ci characters. */ 507db96d56Sopenharmony_ci size_t value = *(const size_t *) _s; 517db96d56Sopenharmony_ci if (value & ASCII_CHAR_MASK) 527db96d56Sopenharmony_ci break; 537db96d56Sopenharmony_ci#if PY_LITTLE_ENDIAN 547db96d56Sopenharmony_ci _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); 557db96d56Sopenharmony_ci _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); 567db96d56Sopenharmony_ci _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); 577db96d56Sopenharmony_ci _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); 587db96d56Sopenharmony_ci# if SIZEOF_SIZE_T == 8 597db96d56Sopenharmony_ci _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); 607db96d56Sopenharmony_ci _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); 617db96d56Sopenharmony_ci _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); 627db96d56Sopenharmony_ci _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); 637db96d56Sopenharmony_ci# endif 647db96d56Sopenharmony_ci#else 657db96d56Sopenharmony_ci# if SIZEOF_SIZE_T == 8 667db96d56Sopenharmony_ci _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); 677db96d56Sopenharmony_ci _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); 687db96d56Sopenharmony_ci _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); 697db96d56Sopenharmony_ci _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); 707db96d56Sopenharmony_ci _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); 
717db96d56Sopenharmony_ci _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); 727db96d56Sopenharmony_ci _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); 737db96d56Sopenharmony_ci _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); 747db96d56Sopenharmony_ci# else 757db96d56Sopenharmony_ci _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); 767db96d56Sopenharmony_ci _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); 777db96d56Sopenharmony_ci _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); 787db96d56Sopenharmony_ci _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); 797db96d56Sopenharmony_ci# endif 807db96d56Sopenharmony_ci#endif 817db96d56Sopenharmony_ci _s += SIZEOF_SIZE_T; 827db96d56Sopenharmony_ci _p += SIZEOF_SIZE_T; 837db96d56Sopenharmony_ci } 847db96d56Sopenharmony_ci s = _s; 857db96d56Sopenharmony_ci p = _p; 867db96d56Sopenharmony_ci if (s == end) 877db96d56Sopenharmony_ci break; 887db96d56Sopenharmony_ci ch = (unsigned char)*s; 897db96d56Sopenharmony_ci } 907db96d56Sopenharmony_ci if (ch < 0x80) { 917db96d56Sopenharmony_ci s++; 927db96d56Sopenharmony_ci *p++ = ch; 937db96d56Sopenharmony_ci continue; 947db96d56Sopenharmony_ci } 957db96d56Sopenharmony_ci } 967db96d56Sopenharmony_ci 977db96d56Sopenharmony_ci if (ch < 0xE0) { 987db96d56Sopenharmony_ci /* \xC2\x80-\xDF\xBF -- 0080-07FF */ 997db96d56Sopenharmony_ci Py_UCS4 ch2; 1007db96d56Sopenharmony_ci if (ch < 0xC2) { 1017db96d56Sopenharmony_ci /* invalid sequence 1027db96d56Sopenharmony_ci \x80-\xBF -- continuation byte 1037db96d56Sopenharmony_ci \xC0-\xC1 -- fake 0000-007F */ 1047db96d56Sopenharmony_ci goto InvalidStart; 1057db96d56Sopenharmony_ci } 1067db96d56Sopenharmony_ci if (end - s < 2) { 1077db96d56Sopenharmony_ci /* unexpected end of data: the caller will decide whether 1087db96d56Sopenharmony_ci it's an error or not */ 1097db96d56Sopenharmony_ci break; 1107db96d56Sopenharmony_ci } 1117db96d56Sopenharmony_ci ch2 = (unsigned char)s[1]; 1127db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch2)) 1137db96d56Sopenharmony_ci /* invalid 
continuation byte */ 1147db96d56Sopenharmony_ci goto InvalidContinuation1; 1157db96d56Sopenharmony_ci ch = (ch << 6) + ch2 - 1167db96d56Sopenharmony_ci ((0xC0 << 6) + 0x80); 1177db96d56Sopenharmony_ci assert ((ch > 0x007F) && (ch <= 0x07FF)); 1187db96d56Sopenharmony_ci s += 2; 1197db96d56Sopenharmony_ci if (STRINGLIB_MAX_CHAR <= 0x007F || 1207db96d56Sopenharmony_ci (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) 1217db96d56Sopenharmony_ci /* Out-of-range */ 1227db96d56Sopenharmony_ci goto Return; 1237db96d56Sopenharmony_ci *p++ = ch; 1247db96d56Sopenharmony_ci continue; 1257db96d56Sopenharmony_ci } 1267db96d56Sopenharmony_ci 1277db96d56Sopenharmony_ci if (ch < 0xF0) { 1287db96d56Sopenharmony_ci /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ 1297db96d56Sopenharmony_ci Py_UCS4 ch2, ch3; 1307db96d56Sopenharmony_ci if (end - s < 3) { 1317db96d56Sopenharmony_ci /* unexpected end of data: the caller will decide whether 1327db96d56Sopenharmony_ci it's an error or not */ 1337db96d56Sopenharmony_ci if (end - s < 2) 1347db96d56Sopenharmony_ci break; 1357db96d56Sopenharmony_ci ch2 = (unsigned char)s[1]; 1367db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch2) || 1377db96d56Sopenharmony_ci (ch2 < 0xA0 ? 
ch == 0xE0 : ch == 0xED)) 1387db96d56Sopenharmony_ci /* for clarification see comments below */ 1397db96d56Sopenharmony_ci goto InvalidContinuation1; 1407db96d56Sopenharmony_ci break; 1417db96d56Sopenharmony_ci } 1427db96d56Sopenharmony_ci ch2 = (unsigned char)s[1]; 1437db96d56Sopenharmony_ci ch3 = (unsigned char)s[2]; 1447db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch2)) { 1457db96d56Sopenharmony_ci /* invalid continuation byte */ 1467db96d56Sopenharmony_ci goto InvalidContinuation1; 1477db96d56Sopenharmony_ci } 1487db96d56Sopenharmony_ci if (ch == 0xE0) { 1497db96d56Sopenharmony_ci if (ch2 < 0xA0) 1507db96d56Sopenharmony_ci /* invalid sequence 1517db96d56Sopenharmony_ci \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ 1527db96d56Sopenharmony_ci goto InvalidContinuation1; 1537db96d56Sopenharmony_ci } else if (ch == 0xED && ch2 >= 0xA0) { 1547db96d56Sopenharmony_ci /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF 1557db96d56Sopenharmony_ci will result in surrogates in range D800-DFFF. Surrogates are 1567db96d56Sopenharmony_ci not valid UTF-8 so they are rejected. 
1577db96d56Sopenharmony_ci See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 1587db96d56Sopenharmony_ci (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 1597db96d56Sopenharmony_ci goto InvalidContinuation1; 1607db96d56Sopenharmony_ci } 1617db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch3)) { 1627db96d56Sopenharmony_ci /* invalid continuation byte */ 1637db96d56Sopenharmony_ci goto InvalidContinuation2; 1647db96d56Sopenharmony_ci } 1657db96d56Sopenharmony_ci ch = (ch << 12) + (ch2 << 6) + ch3 - 1667db96d56Sopenharmony_ci ((0xE0 << 12) + (0x80 << 6) + 0x80); 1677db96d56Sopenharmony_ci assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 1687db96d56Sopenharmony_ci s += 3; 1697db96d56Sopenharmony_ci if (STRINGLIB_MAX_CHAR <= 0x07FF || 1707db96d56Sopenharmony_ci (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) 1717db96d56Sopenharmony_ci /* Out-of-range */ 1727db96d56Sopenharmony_ci goto Return; 1737db96d56Sopenharmony_ci *p++ = ch; 1747db96d56Sopenharmony_ci continue; 1757db96d56Sopenharmony_ci } 1767db96d56Sopenharmony_ci 1777db96d56Sopenharmony_ci if (ch < 0xF5) { 1787db96d56Sopenharmony_ci /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ 1797db96d56Sopenharmony_ci Py_UCS4 ch2, ch3, ch4; 1807db96d56Sopenharmony_ci if (end - s < 4) { 1817db96d56Sopenharmony_ci /* unexpected end of data: the caller will decide whether 1827db96d56Sopenharmony_ci it's an error or not */ 1837db96d56Sopenharmony_ci if (end - s < 2) 1847db96d56Sopenharmony_ci break; 1857db96d56Sopenharmony_ci ch2 = (unsigned char)s[1]; 1867db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch2) || 1877db96d56Sopenharmony_ci (ch2 < 0x90 ? 
ch == 0xF0 : ch == 0xF4)) 1887db96d56Sopenharmony_ci /* for clarification see comments below */ 1897db96d56Sopenharmony_ci goto InvalidContinuation1; 1907db96d56Sopenharmony_ci if (end - s < 3) 1917db96d56Sopenharmony_ci break; 1927db96d56Sopenharmony_ci ch3 = (unsigned char)s[2]; 1937db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch3)) 1947db96d56Sopenharmony_ci goto InvalidContinuation2; 1957db96d56Sopenharmony_ci break; 1967db96d56Sopenharmony_ci } 1977db96d56Sopenharmony_ci ch2 = (unsigned char)s[1]; 1987db96d56Sopenharmony_ci ch3 = (unsigned char)s[2]; 1997db96d56Sopenharmony_ci ch4 = (unsigned char)s[3]; 2007db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch2)) { 2017db96d56Sopenharmony_ci /* invalid continuation byte */ 2027db96d56Sopenharmony_ci goto InvalidContinuation1; 2037db96d56Sopenharmony_ci } 2047db96d56Sopenharmony_ci if (ch == 0xF0) { 2057db96d56Sopenharmony_ci if (ch2 < 0x90) 2067db96d56Sopenharmony_ci /* invalid sequence 2077db96d56Sopenharmony_ci \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ 2087db96d56Sopenharmony_ci goto InvalidContinuation1; 2097db96d56Sopenharmony_ci } else if (ch == 0xF4 && ch2 >= 0x90) { 2107db96d56Sopenharmony_ci /* invalid sequence 2117db96d56Sopenharmony_ci \xF4\x90\x80\x80- -- 110000- overflow */ 2127db96d56Sopenharmony_ci goto InvalidContinuation1; 2137db96d56Sopenharmony_ci } 2147db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch3)) { 2157db96d56Sopenharmony_ci /* invalid continuation byte */ 2167db96d56Sopenharmony_ci goto InvalidContinuation2; 2177db96d56Sopenharmony_ci } 2187db96d56Sopenharmony_ci if (!IS_CONTINUATION_BYTE(ch4)) { 2197db96d56Sopenharmony_ci /* invalid continuation byte */ 2207db96d56Sopenharmony_ci goto InvalidContinuation3; 2217db96d56Sopenharmony_ci } 2227db96d56Sopenharmony_ci ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - 2237db96d56Sopenharmony_ci ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); 2247db96d56Sopenharmony_ci assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); 
2257db96d56Sopenharmony_ci s += 4; 2267db96d56Sopenharmony_ci if (STRINGLIB_MAX_CHAR <= 0xFFFF || 2277db96d56Sopenharmony_ci (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) 2287db96d56Sopenharmony_ci /* Out-of-range */ 2297db96d56Sopenharmony_ci goto Return; 2307db96d56Sopenharmony_ci *p++ = ch; 2317db96d56Sopenharmony_ci continue; 2327db96d56Sopenharmony_ci } 2337db96d56Sopenharmony_ci goto InvalidStart; 2347db96d56Sopenharmony_ci } 2357db96d56Sopenharmony_ci ch = 0; 2367db96d56Sopenharmony_ciReturn: 2377db96d56Sopenharmony_ci *inptr = s; 2387db96d56Sopenharmony_ci *outpos = p - dest; 2397db96d56Sopenharmony_ci return ch; 2407db96d56Sopenharmony_ciInvalidStart: 2417db96d56Sopenharmony_ci ch = 1; 2427db96d56Sopenharmony_ci goto Return; 2437db96d56Sopenharmony_ciInvalidContinuation1: 2447db96d56Sopenharmony_ci ch = 2; 2457db96d56Sopenharmony_ci goto Return; 2467db96d56Sopenharmony_ciInvalidContinuation2: 2477db96d56Sopenharmony_ci ch = 3; 2487db96d56Sopenharmony_ci goto Return; 2497db96d56Sopenharmony_ciInvalidContinuation3: 2507db96d56Sopenharmony_ci ch = 4; 2517db96d56Sopenharmony_ci goto Return; 2527db96d56Sopenharmony_ci} 2537db96d56Sopenharmony_ci 2547db96d56Sopenharmony_ci#undef ASCII_CHAR_MASK 2557db96d56Sopenharmony_ci 2567db96d56Sopenharmony_ci 2577db96d56Sopenharmony_ci/* UTF-8 encoder specialized for a Unicode kind to avoid the slow 2587db96d56Sopenharmony_ci PyUnicode_READ() macro. Delete some parts of the code depending on the kind: 2597db96d56Sopenharmony_ci UCS-1 strings don't need to handle surrogates for example. 
*/
/* Encode the 'size' code units at 'data' as UTF-8 into 'writer'.

   writer         bytes writer; it is initialized here with
                  _PyBytesWriter_Init() and pre-sized for the worst case
                  (size * max_char_size bytes).
   unicode        string object handed to the error-handling helpers
                  (backslashreplace(), xmlcharrefreplace(),
                  unicode_encode_call_errorhandler(),
                  raise_encode_exception()) together with positions into
                  'data' -- presumably the object that owns 'data'; confirm
                  against the caller.
   data, size     code units to encode and their count (size >= 0).
   error_handler  how to treat surrogates; _Py_ERROR_UNKNOWN means "look it
                  up from 'errors'" the first time a surrogate is met.
   errors         error handler name, forwarded to _Py_GetErrorHandler() and
                  unicode_encode_call_errorhandler().

   Returns the position of the next free byte in the writer's buffer, or
   NULL on failure.  The surrogate handling is compiled only for kinds wider
   than one byte. */
Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
                        PyObject *unicode,
                        const STRINGLIB_CHAR *data,
                        Py_ssize_t size,
                        _Py_error_handler error_handler,
                        const char *errors)
{
    Py_ssize_t i;                /* index into data of next input character */
    char *p;                     /* next free byte in output buffer */
#if STRINGLIB_SIZEOF_CHAR > 1
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
#endif
    /* Worst-case number of output bytes per input code unit for this kind. */
#if STRINGLIB_SIZEOF_CHAR == 1
    const Py_ssize_t max_char_size = 2;
#elif STRINGLIB_SIZEOF_CHAR == 2
    const Py_ssize_t max_char_size = 3;
#else /*  STRINGLIB_SIZEOF_CHAR == 4 */
    const Py_ssize_t max_char_size = 4;
#endif

    assert(size >= 0);
    /* Make sure size * max_char_size below cannot overflow. */
    if (size > PY_SSIZE_T_MAX / max_char_size) {
        /* integer overflow */
        PyErr_NoMemory();
        return NULL;
    }

    _PyBytesWriter_Init(writer);
    p = _PyBytesWriter_Alloc(writer, size * max_char_size);
    if (p == NULL)
        return NULL;

    for (i = 0; i < size;) {
        Py_UCS4 ch = data[i++];

        if (ch < 0x80) {
            /* Encode ASCII */
            *p++ = (char) ch;

        }
        else
        /* For 1-byte kinds the test below is compiled out: every non-ASCII
           character takes the two-byte branch. */
#if STRINGLIB_SIZEOF_CHAR > 1
        if (ch < 0x0800)
#endif
        {
            /* Encode Latin-1 */
            /* Two-byte sequence: 110xxxxx 10xxxxxx */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 1
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            Py_ssize_t startpos, endpos, newpos;
            Py_ssize_t k;
            /* Resolve the handler lazily, only once a surrogate shows up. */
            if (error_handler == _Py_ERROR_UNKNOWN) {
                error_handler = _Py_GetErrorHandler(errors);
            }

            /* i was already advanced past ch, so ch sits at i-1. */
            startpos = i-1;
            endpos = startpos+1;

            /* Extend [startpos, endpos) over the whole run of consecutive
               surrogates so the handler is invoked once per run. */
            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
                endpos++;

            /* Only overallocate the buffer if it's not the last write */
            writer->overallocate = (endpos < size);

            switch (error_handler)
            {
            case _Py_ERROR_REPLACE:
                /* One '?' per surrogate; this fits in the bytes that were
                   preallocated for the run. */
                memset(p, '?', endpos - startpos);
                p += (endpos - startpos);
                /* fall through */
            case _Py_ERROR_IGNORE:
                /* Move i from startpos+1 to endpos. */
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_SURROGATEPASS:
                /* Emit each surrogate as a plain three-byte sequence. */
                for (k=startpos; k<endpos; k++) {
                    ch = data[k];
                    *p++ = (char)(0xe0 | (ch >> 12));
                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                    *p++ = (char)(0x80 | (ch & 0x3f));
                }
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer->min_size -= max_char_size * (endpos - startpos);
                p = backslashreplace(writer, p,
                                     unicode, startpos, endpos);
                if (p == NULL)
                    goto error;
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer->min_size -= max_char_size * (endpos - startpos);
                p = xmlcharrefreplace(writer, p,
                                      unicode, startpos, endpos);
                if (p == NULL)
                    goto error;
                i += (endpos - startpos - 1);
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Surrogates in DC80-DCFF are written as the single byte
                   (ch & 0xff); stop at the first one outside that range. */
                for (k=startpos; k<endpos; k++) {
                    ch = data[k];
                    if (!(0xDC80 <= ch && ch <= 0xDCFF))
                        break;
                    *p++ = (char)(ch & 0xff);
                }
                if (k >= endpos) {
                    /* The whole run was escapable. */
                    i += (endpos - startpos - 1);
                    break;
                }
                /* Hand the remaining part of the run to the generic
                   handler below. */
                startpos = k;
                assert(startpos < endpos);
                /* fall through */
            default:
                rep = unicode_encode_call_errorhandler(
                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
                      unicode, &exc, startpos, endpos, &newpos);
                if (!rep)
                    goto error;

                if (newpos < startpos) {
                    /* The handler asked to resume before the run: the
                       characters in [newpos, startpos) will be encoded
                       again, so reserve room for them. */
                    writer->overallocate = 1;
                    p = _PyBytesWriter_Prepare(writer, p,
                                               max_char_size * (startpos - newpos));
                    if (p == NULL)
                        goto error;
                }
                else {
                    /* subtract preallocated bytes */
                    writer->min_size -= max_char_size * (newpos - startpos);
                    /* Only overallocate the buffer if it's not the last write */
                    writer->overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Bytes replacement: copied verbatim. */
                    p = _PyBytesWriter_WriteBytes(writer, p,
                                                  PyBytes_AS_STRING(rep),
                                                  PyBytes_GET_SIZE(rep));
                }
                else {
                    /* rep is unicode */
                    if (PyUnicode_READY(rep) < 0)
                        goto error;

                    /* Only an ASCII replacement string is accepted; its
                       data is then copied as-is. */
                    if (!PyUnicode_IS_ASCII(rep)) {
                        raise_encode_exception(&exc, "utf-8", unicode,
                                               startpos, endpos,
                                               "surrogates not allowed");
                        goto error;
                    }

                    p = _PyBytesWriter_WriteBytes(writer, p,
                                                  PyUnicode_DATA(rep),
                                                  PyUnicode_GET_LENGTH(rep));
                }

                if (p == NULL)
                    goto error;
                /* Release rep and reset it to NULL so that the error path
                   does not release it a second time. */
                Py_CLEAR(rep);

                i = newpos;
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer->overallocate || i == size);
        }
        else
        /* For 2-byte kinds the test below is compiled out: every remaining
           character takes the three-byte branch. */
#if STRINGLIB_SIZEOF_CHAR > 2
        if (ch < 0x10000)
#endif
        {
            /* Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 2
        else /* ch >= 0x10000 */
        {
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
    }

#if STRINGLIB_SIZEOF_CHAR > 1
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
#endif
    return p;

#if STRINGLIB_SIZEOF_CHAR > 1
 error:
    /* Shared failure exit: release whatever the error handling acquired. */
    Py_XDECREF(rep);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
#endif
}

/* The pattern for constructing UCS2-repeated masks. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif

/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains a
   non-ASCII or non-Latin1 UTF16-encoded characters. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping.
*/ 4987db96d56Sopenharmony_ci#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) 4997db96d56Sopenharmony_ci/* Swap bytes. */ 5007db96d56Sopenharmony_ci#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ 5017db96d56Sopenharmony_ci (((value) & STRIPPED_MASK) << 8)) 5027db96d56Sopenharmony_ci 5037db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_UCS4) 5047db96d56Sopenharmony_ciSTRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, 5057db96d56Sopenharmony_ci STRINGLIB_CHAR *dest, Py_ssize_t *outpos, 5067db96d56Sopenharmony_ci int native_ordering) 5077db96d56Sopenharmony_ci{ 5087db96d56Sopenharmony_ci Py_UCS4 ch; 5097db96d56Sopenharmony_ci const unsigned char *q = *inptr; 5107db96d56Sopenharmony_ci STRINGLIB_CHAR *p = dest + *outpos; 5117db96d56Sopenharmony_ci /* Offsets from q for retrieving byte pairs in the right order. */ 5127db96d56Sopenharmony_ci#if PY_LITTLE_ENDIAN 5137db96d56Sopenharmony_ci int ihi = !!native_ordering, ilo = !native_ordering; 5147db96d56Sopenharmony_ci#else 5157db96d56Sopenharmony_ci int ihi = !native_ordering, ilo = !!native_ordering; 5167db96d56Sopenharmony_ci#endif 5177db96d56Sopenharmony_ci --e; 5187db96d56Sopenharmony_ci 5197db96d56Sopenharmony_ci while (q < e) { 5207db96d56Sopenharmony_ci Py_UCS4 ch2; 5217db96d56Sopenharmony_ci /* First check for possible aligned read of a C 'long'. Unaligned 5227db96d56Sopenharmony_ci reads are more expensive, better to defer to another iteration. */ 5237db96d56Sopenharmony_ci if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { 5247db96d56Sopenharmony_ci /* Fast path for runs of in-range non-surrogate chars. 
*/ 5257db96d56Sopenharmony_ci const unsigned char *_q = q; 5267db96d56Sopenharmony_ci while (_q + SIZEOF_LONG <= e) { 5277db96d56Sopenharmony_ci unsigned long block = * (const unsigned long *) _q; 5287db96d56Sopenharmony_ci if (native_ordering) { 5297db96d56Sopenharmony_ci /* Can use buffer directly */ 5307db96d56Sopenharmony_ci if (block & FAST_CHAR_MASK) 5317db96d56Sopenharmony_ci break; 5327db96d56Sopenharmony_ci } 5337db96d56Sopenharmony_ci else { 5347db96d56Sopenharmony_ci /* Need to byte-swap */ 5357db96d56Sopenharmony_ci if (block & SWAB(FAST_CHAR_MASK)) 5367db96d56Sopenharmony_ci break; 5377db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1 5387db96d56Sopenharmony_ci block >>= 8; 5397db96d56Sopenharmony_ci#else 5407db96d56Sopenharmony_ci block = SWAB(block); 5417db96d56Sopenharmony_ci#endif 5427db96d56Sopenharmony_ci } 5437db96d56Sopenharmony_ci#if PY_LITTLE_ENDIAN 5447db96d56Sopenharmony_ci# if SIZEOF_LONG == 4 5457db96d56Sopenharmony_ci p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); 5467db96d56Sopenharmony_ci p[1] = (STRINGLIB_CHAR)(block >> 16); 5477db96d56Sopenharmony_ci# elif SIZEOF_LONG == 8 5487db96d56Sopenharmony_ci p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); 5497db96d56Sopenharmony_ci p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); 5507db96d56Sopenharmony_ci p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); 5517db96d56Sopenharmony_ci p[3] = (STRINGLIB_CHAR)(block >> 48); 5527db96d56Sopenharmony_ci# endif 5537db96d56Sopenharmony_ci#else 5547db96d56Sopenharmony_ci# if SIZEOF_LONG == 4 5557db96d56Sopenharmony_ci p[0] = (STRINGLIB_CHAR)(block >> 16); 5567db96d56Sopenharmony_ci p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); 5577db96d56Sopenharmony_ci# elif SIZEOF_LONG == 8 5587db96d56Sopenharmony_ci p[0] = (STRINGLIB_CHAR)(block >> 48); 5597db96d56Sopenharmony_ci p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); 5607db96d56Sopenharmony_ci p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); 5617db96d56Sopenharmony_ci p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); 
5627db96d56Sopenharmony_ci# endif 5637db96d56Sopenharmony_ci#endif 5647db96d56Sopenharmony_ci _q += SIZEOF_LONG; 5657db96d56Sopenharmony_ci p += SIZEOF_LONG / 2; 5667db96d56Sopenharmony_ci } 5677db96d56Sopenharmony_ci q = _q; 5687db96d56Sopenharmony_ci if (q >= e) 5697db96d56Sopenharmony_ci break; 5707db96d56Sopenharmony_ci } 5717db96d56Sopenharmony_ci 5727db96d56Sopenharmony_ci ch = (q[ihi] << 8) | q[ilo]; 5737db96d56Sopenharmony_ci q += 2; 5747db96d56Sopenharmony_ci if (!Py_UNICODE_IS_SURROGATE(ch)) { 5757db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR < 2 5767db96d56Sopenharmony_ci if (ch > STRINGLIB_MAX_CHAR) 5777db96d56Sopenharmony_ci /* Out-of-range */ 5787db96d56Sopenharmony_ci goto Return; 5797db96d56Sopenharmony_ci#endif 5807db96d56Sopenharmony_ci *p++ = (STRINGLIB_CHAR)ch; 5817db96d56Sopenharmony_ci continue; 5827db96d56Sopenharmony_ci } 5837db96d56Sopenharmony_ci 5847db96d56Sopenharmony_ci /* UTF-16 code pair: */ 5857db96d56Sopenharmony_ci if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) 5867db96d56Sopenharmony_ci goto IllegalEncoding; 5877db96d56Sopenharmony_ci if (q >= e) 5887db96d56Sopenharmony_ci goto UnexpectedEnd; 5897db96d56Sopenharmony_ci ch2 = (q[ihi] << 8) | q[ilo]; 5907db96d56Sopenharmony_ci q += 2; 5917db96d56Sopenharmony_ci if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) 5927db96d56Sopenharmony_ci goto IllegalSurrogate; 5937db96d56Sopenharmony_ci ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); 5947db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR < 4 5957db96d56Sopenharmony_ci /* Out-of-range */ 5967db96d56Sopenharmony_ci goto Return; 5977db96d56Sopenharmony_ci#else 5987db96d56Sopenharmony_ci *p++ = (STRINGLIB_CHAR)ch; 5997db96d56Sopenharmony_ci#endif 6007db96d56Sopenharmony_ci } 6017db96d56Sopenharmony_ci ch = 0; 6027db96d56Sopenharmony_ciReturn: 6037db96d56Sopenharmony_ci *inptr = q; 6047db96d56Sopenharmony_ci *outpos = p - dest; 6057db96d56Sopenharmony_ci return ch; 6067db96d56Sopenharmony_ciUnexpectedEnd: 6077db96d56Sopenharmony_ci ch = 1; 
6087db96d56Sopenharmony_ci goto Return; 6097db96d56Sopenharmony_ciIllegalEncoding: 6107db96d56Sopenharmony_ci ch = 2; 6117db96d56Sopenharmony_ci goto Return; 6127db96d56Sopenharmony_ciIllegalSurrogate: 6137db96d56Sopenharmony_ci ch = 3; 6147db96d56Sopenharmony_ci goto Return; 6157db96d56Sopenharmony_ci} 6167db96d56Sopenharmony_ci#undef UCS2_REPEAT_MASK 6177db96d56Sopenharmony_ci#undef FAST_CHAR_MASK 6187db96d56Sopenharmony_ci#undef STRIPPED_MASK 6197db96d56Sopenharmony_ci#undef SWAB 6207db96d56Sopenharmony_ci 6217db96d56Sopenharmony_ci 6227db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR >= 0x80 6237db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_ssize_t) 6247db96d56Sopenharmony_ciSTRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, 6257db96d56Sopenharmony_ci Py_ssize_t len, 6267db96d56Sopenharmony_ci unsigned short **outptr, 6277db96d56Sopenharmony_ci int native_ordering) 6287db96d56Sopenharmony_ci{ 6297db96d56Sopenharmony_ci unsigned short *out = *outptr; 6307db96d56Sopenharmony_ci const STRINGLIB_CHAR *end = in + len; 6317db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1 6327db96d56Sopenharmony_ci if (native_ordering) { 6337db96d56Sopenharmony_ci const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 6347db96d56Sopenharmony_ci while (in < unrolled_end) { 6357db96d56Sopenharmony_ci out[0] = in[0]; 6367db96d56Sopenharmony_ci out[1] = in[1]; 6377db96d56Sopenharmony_ci out[2] = in[2]; 6387db96d56Sopenharmony_ci out[3] = in[3]; 6397db96d56Sopenharmony_ci in += 4; out += 4; 6407db96d56Sopenharmony_ci } 6417db96d56Sopenharmony_ci while (in < end) { 6427db96d56Sopenharmony_ci *out++ = *in++; 6437db96d56Sopenharmony_ci } 6447db96d56Sopenharmony_ci } else { 6457db96d56Sopenharmony_ci# define SWAB2(CH) ((CH) << 8) /* high byte is zero */ 6467db96d56Sopenharmony_ci const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 6477db96d56Sopenharmony_ci while (in < unrolled_end) { 6487db96d56Sopenharmony_ci out[0] = SWAB2(in[0]); 6497db96d56Sopenharmony_ci 
out[1] = SWAB2(in[1]); 6507db96d56Sopenharmony_ci out[2] = SWAB2(in[2]); 6517db96d56Sopenharmony_ci out[3] = SWAB2(in[3]); 6527db96d56Sopenharmony_ci in += 4; out += 4; 6537db96d56Sopenharmony_ci } 6547db96d56Sopenharmony_ci while (in < end) { 6557db96d56Sopenharmony_ci Py_UCS4 ch = *in++; 6567db96d56Sopenharmony_ci *out++ = SWAB2((Py_UCS2)ch); 6577db96d56Sopenharmony_ci } 6587db96d56Sopenharmony_ci#undef SWAB2 6597db96d56Sopenharmony_ci } 6607db96d56Sopenharmony_ci *outptr = out; 6617db96d56Sopenharmony_ci return len; 6627db96d56Sopenharmony_ci#else 6637db96d56Sopenharmony_ci if (native_ordering) { 6647db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR < 0x10000 6657db96d56Sopenharmony_ci const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 6667db96d56Sopenharmony_ci while (in < unrolled_end) { 6677db96d56Sopenharmony_ci /* check if any character is a surrogate character */ 6687db96d56Sopenharmony_ci if (((in[0] ^ 0xd800) & 6697db96d56Sopenharmony_ci (in[1] ^ 0xd800) & 6707db96d56Sopenharmony_ci (in[2] ^ 0xd800) & 6717db96d56Sopenharmony_ci (in[3] ^ 0xd800) & 0xf800) == 0) 6727db96d56Sopenharmony_ci break; 6737db96d56Sopenharmony_ci out[0] = in[0]; 6747db96d56Sopenharmony_ci out[1] = in[1]; 6757db96d56Sopenharmony_ci out[2] = in[2]; 6767db96d56Sopenharmony_ci out[3] = in[3]; 6777db96d56Sopenharmony_ci in += 4; out += 4; 6787db96d56Sopenharmony_ci } 6797db96d56Sopenharmony_ci#endif 6807db96d56Sopenharmony_ci while (in < end) { 6817db96d56Sopenharmony_ci Py_UCS4 ch; 6827db96d56Sopenharmony_ci ch = *in++; 6837db96d56Sopenharmony_ci if (ch < 0xd800) 6847db96d56Sopenharmony_ci *out++ = ch; 6857db96d56Sopenharmony_ci else if (ch < 0xe000) 6867db96d56Sopenharmony_ci /* reject surrogate characters (U+D800-U+DFFF) */ 6877db96d56Sopenharmony_ci goto fail; 6887db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR >= 0x10000 6897db96d56Sopenharmony_ci else if (ch >= 0x10000) { 6907db96d56Sopenharmony_ci out[0] = Py_UNICODE_HIGH_SURROGATE(ch); 6917db96d56Sopenharmony_ci 
out[1] = Py_UNICODE_LOW_SURROGATE(ch); 6927db96d56Sopenharmony_ci out += 2; 6937db96d56Sopenharmony_ci } 6947db96d56Sopenharmony_ci#endif 6957db96d56Sopenharmony_ci else 6967db96d56Sopenharmony_ci *out++ = ch; 6977db96d56Sopenharmony_ci } 6987db96d56Sopenharmony_ci } else { 6997db96d56Sopenharmony_ci#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) 7007db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR < 0x10000 7017db96d56Sopenharmony_ci const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 7027db96d56Sopenharmony_ci while (in < unrolled_end) { 7037db96d56Sopenharmony_ci /* check if any character is a surrogate character */ 7047db96d56Sopenharmony_ci if (((in[0] ^ 0xd800) & 7057db96d56Sopenharmony_ci (in[1] ^ 0xd800) & 7067db96d56Sopenharmony_ci (in[2] ^ 0xd800) & 7077db96d56Sopenharmony_ci (in[3] ^ 0xd800) & 0xf800) == 0) 7087db96d56Sopenharmony_ci break; 7097db96d56Sopenharmony_ci out[0] = SWAB2(in[0]); 7107db96d56Sopenharmony_ci out[1] = SWAB2(in[1]); 7117db96d56Sopenharmony_ci out[2] = SWAB2(in[2]); 7127db96d56Sopenharmony_ci out[3] = SWAB2(in[3]); 7137db96d56Sopenharmony_ci in += 4; out += 4; 7147db96d56Sopenharmony_ci } 7157db96d56Sopenharmony_ci#endif 7167db96d56Sopenharmony_ci while (in < end) { 7177db96d56Sopenharmony_ci Py_UCS4 ch = *in++; 7187db96d56Sopenharmony_ci if (ch < 0xd800) 7197db96d56Sopenharmony_ci *out++ = SWAB2((Py_UCS2)ch); 7207db96d56Sopenharmony_ci else if (ch < 0xe000) 7217db96d56Sopenharmony_ci /* reject surrogate characters (U+D800-U+DFFF) */ 7227db96d56Sopenharmony_ci goto fail; 7237db96d56Sopenharmony_ci#if STRINGLIB_MAX_CHAR >= 0x10000 7247db96d56Sopenharmony_ci else if (ch >= 0x10000) { 7257db96d56Sopenharmony_ci Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); 7267db96d56Sopenharmony_ci Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); 7277db96d56Sopenharmony_ci out[0] = SWAB2(ch1); 7287db96d56Sopenharmony_ci out[1] = SWAB2(ch2); 7297db96d56Sopenharmony_ci out += 2; 7307db96d56Sopenharmony_ci } 7317db96d56Sopenharmony_ci#endif 
7327db96d56Sopenharmony_ci else 7337db96d56Sopenharmony_ci *out++ = SWAB2((Py_UCS2)ch); 7347db96d56Sopenharmony_ci } 7357db96d56Sopenharmony_ci#undef SWAB2 7367db96d56Sopenharmony_ci } 7377db96d56Sopenharmony_ci *outptr = out; 7387db96d56Sopenharmony_ci return len; 7397db96d56Sopenharmony_ci fail: 7407db96d56Sopenharmony_ci *outptr = out; 7417db96d56Sopenharmony_ci return len - (end - in + 1); 7427db96d56Sopenharmony_ci#endif 7437db96d56Sopenharmony_ci} 7447db96d56Sopenharmony_ci 7457db96d56Sopenharmony_cistatic inline uint32_t 7467db96d56Sopenharmony_ciSTRINGLIB(SWAB4)(STRINGLIB_CHAR ch) 7477db96d56Sopenharmony_ci{ 7487db96d56Sopenharmony_ci uint32_t word = ch; 7497db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR == 1 7507db96d56Sopenharmony_ci /* high bytes are zero */ 7517db96d56Sopenharmony_ci return (word << 24); 7527db96d56Sopenharmony_ci#elif STRINGLIB_SIZEOF_CHAR == 2 7537db96d56Sopenharmony_ci /* high bytes are zero */ 7547db96d56Sopenharmony_ci return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); 7557db96d56Sopenharmony_ci#else 7567db96d56Sopenharmony_ci return _Py_bswap32(word); 7577db96d56Sopenharmony_ci#endif 7587db96d56Sopenharmony_ci} 7597db96d56Sopenharmony_ci 7607db96d56Sopenharmony_ciPy_LOCAL_INLINE(Py_ssize_t) 7617db96d56Sopenharmony_ciSTRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, 7627db96d56Sopenharmony_ci Py_ssize_t len, 7637db96d56Sopenharmony_ci uint32_t **outptr, 7647db96d56Sopenharmony_ci int native_ordering) 7657db96d56Sopenharmony_ci{ 7667db96d56Sopenharmony_ci uint32_t *out = *outptr; 7677db96d56Sopenharmony_ci const STRINGLIB_CHAR *end = in + len; 7687db96d56Sopenharmony_ci if (native_ordering) { 7697db96d56Sopenharmony_ci const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 7707db96d56Sopenharmony_ci while (in < unrolled_end) { 7717db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1 7727db96d56Sopenharmony_ci /* check if any character is a surrogate character */ 7737db96d56Sopenharmony_ci if (((in[0] ^ 
0xd800) & 7747db96d56Sopenharmony_ci (in[1] ^ 0xd800) & 7757db96d56Sopenharmony_ci (in[2] ^ 0xd800) & 7767db96d56Sopenharmony_ci (in[3] ^ 0xd800) & 0xf800) == 0) 7777db96d56Sopenharmony_ci break; 7787db96d56Sopenharmony_ci#endif 7797db96d56Sopenharmony_ci out[0] = in[0]; 7807db96d56Sopenharmony_ci out[1] = in[1]; 7817db96d56Sopenharmony_ci out[2] = in[2]; 7827db96d56Sopenharmony_ci out[3] = in[3]; 7837db96d56Sopenharmony_ci in += 4; out += 4; 7847db96d56Sopenharmony_ci } 7857db96d56Sopenharmony_ci while (in < end) { 7867db96d56Sopenharmony_ci Py_UCS4 ch; 7877db96d56Sopenharmony_ci ch = *in++; 7887db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1 7897db96d56Sopenharmony_ci if (Py_UNICODE_IS_SURROGATE(ch)) { 7907db96d56Sopenharmony_ci /* reject surrogate characters (U+D800-U+DFFF) */ 7917db96d56Sopenharmony_ci goto fail; 7927db96d56Sopenharmony_ci } 7937db96d56Sopenharmony_ci#endif 7947db96d56Sopenharmony_ci *out++ = ch; 7957db96d56Sopenharmony_ci } 7967db96d56Sopenharmony_ci } else { 7977db96d56Sopenharmony_ci const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); 7987db96d56Sopenharmony_ci while (in < unrolled_end) { 7997db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1 8007db96d56Sopenharmony_ci /* check if any character is a surrogate character */ 8017db96d56Sopenharmony_ci if (((in[0] ^ 0xd800) & 8027db96d56Sopenharmony_ci (in[1] ^ 0xd800) & 8037db96d56Sopenharmony_ci (in[2] ^ 0xd800) & 8047db96d56Sopenharmony_ci (in[3] ^ 0xd800) & 0xf800) == 0) 8057db96d56Sopenharmony_ci break; 8067db96d56Sopenharmony_ci#endif 8077db96d56Sopenharmony_ci out[0] = STRINGLIB(SWAB4)(in[0]); 8087db96d56Sopenharmony_ci out[1] = STRINGLIB(SWAB4)(in[1]); 8097db96d56Sopenharmony_ci out[2] = STRINGLIB(SWAB4)(in[2]); 8107db96d56Sopenharmony_ci out[3] = STRINGLIB(SWAB4)(in[3]); 8117db96d56Sopenharmony_ci in += 4; out += 4; 8127db96d56Sopenharmony_ci } 8137db96d56Sopenharmony_ci while (in < end) { 8147db96d56Sopenharmony_ci Py_UCS4 ch = *in++; 
8157db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1 8167db96d56Sopenharmony_ci if (Py_UNICODE_IS_SURROGATE(ch)) { 8177db96d56Sopenharmony_ci /* reject surrogate characters (U+D800-U+DFFF) */ 8187db96d56Sopenharmony_ci goto fail; 8197db96d56Sopenharmony_ci } 8207db96d56Sopenharmony_ci#endif 8217db96d56Sopenharmony_ci *out++ = STRINGLIB(SWAB4)(ch); 8227db96d56Sopenharmony_ci } 8237db96d56Sopenharmony_ci } 8247db96d56Sopenharmony_ci *outptr = out; 8257db96d56Sopenharmony_ci return len; 8267db96d56Sopenharmony_ci#if STRINGLIB_SIZEOF_CHAR > 1 8277db96d56Sopenharmony_ci fail: 8287db96d56Sopenharmony_ci *outptr = out; 8297db96d56Sopenharmony_ci return len - (end - in + 1); 8307db96d56Sopenharmony_ci#endif 8317db96d56Sopenharmony_ci} 8327db96d56Sopenharmony_ci 8337db96d56Sopenharmony_ci#endif 834