Lines Matching refs:unicode
169 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op)))
222 /* This dictionary holds all interned unicode strings. Note that references
238 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
246 static int unicode_is_singleton(PyObject *unicode);
340 static int unicode_modifiable(PyObject *unicode);
353 PyObject *unicode, PyObject **exceptionObject,
359 PyObject *unicode,
471 if (!interp->unicode.fs_codec.encoding) {
530 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
532 data = unicode->data.any;
619 unicode_result_wchar(PyObject *unicode)
624 len = _PyUnicode_WSTR_LENGTH(unicode);
626 Py_DECREF(unicode);
631 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
633 Py_DECREF(unicode);
638 if (_PyUnicode_Ready(unicode) < 0) {
639 Py_DECREF(unicode);
643 assert(Py_REFCNT(unicode) == 1);
647 assert(_PyUnicode_CheckConsistency(unicode, 1));
649 return unicode;
653 unicode_result_ready(PyObject *unicode)
657 length = PyUnicode_GET_LENGTH(unicode);
660 if (unicode != empty) {
661 Py_DECREF(unicode);
668 int kind = PyUnicode_KIND(unicode);
670 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
673 if (unicode != latin1_char) {
675 Py_DECREF(unicode);
681 assert(_PyUnicode_CheckConsistency(unicode, 1));
682 return unicode;
686 unicode_result(PyObject *unicode)
688 assert(_PyUnicode_CHECK(unicode));
689 if (PyUnicode_IS_READY(unicode))
690 return unicode_result_ready(unicode);
692 return unicode_result_wchar(unicode);
696 unicode_result_unchanged(PyObject *unicode)
698 if (PyUnicode_CheckExact(unicode)) {
699 if (PyUnicode_READY(unicode) == -1)
701 Py_INCREF(unicode);
702 return unicode;
705 /* Subtype -- return genuine unicode string with the same value. */
706 return _PyUnicode_Copy(unicode);
713 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
720 assert(PyUnicode_IS_READY(unicode));
721 kind = PyUnicode_KIND(unicode);
722 data = PyUnicode_DATA(unicode);
780 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
787 assert(PyUnicode_IS_READY(unicode));
788 kind = PyUnicode_KIND(unicode);
789 data = PyUnicode_DATA(unicode);
840 bits from each unicode characters as the bit index. */
879 /* calculate simple bloom-style bitmask for a given unicode string */
1008 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1010 int kind = PyUnicode_KIND(unicode);
1011 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1012 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1020 resize_compact(PyObject *unicode, Py_ssize_t length)
1028 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1031 assert(unicode_modifiable(unicode));
1032 assert(PyUnicode_IS_READY(unicode));
1033 assert(PyUnicode_IS_COMPACT(unicode));
1035 char_size = PyUnicode_KIND(unicode);
1036 if (PyUnicode_IS_ASCII(unicode))
1040 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1048 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1049 PyObject_Free(_PyUnicode_UTF8(unicode));
1050 _PyUnicode_UTF8(unicode) = NULL;
1051 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1057 _Py_ForgetReference(unicode);
1060 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1062 _Py_NewReference(unicode);
1066 unicode = new_unicode;
1067 _Py_NewReference(unicode);
1069 _PyUnicode_LENGTH(unicode) = length;
1071 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1072 if (!PyUnicode_IS_ASCII(unicode))
1073 _PyUnicode_WSTR_LENGTH(unicode) = length;
1075 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1076 PyObject_Free(_PyUnicode_WSTR(unicode));
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 if (!PyUnicode_IS_ASCII(unicode))
1079 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1082 unicode_fill_invalid(unicode, old_length);
1084 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1086 assert(_PyUnicode_CheckConsistency(unicode, 0));
1087 return unicode;
1091 resize_inplace(PyObject *unicode, Py_ssize_t length)
1095 assert(!PyUnicode_IS_COMPACT(unicode));
1096 assert(Py_REFCNT(unicode) == 1);
1098 if (PyUnicode_IS_READY(unicode)) {
1103 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1106 data = _PyUnicode_DATA_ANY(unicode);
1107 char_size = PyUnicode_KIND(unicode);
1108 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1109 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1117 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1119 PyObject_Free(_PyUnicode_UTF8(unicode));
1120 _PyUnicode_UTF8(unicode) = NULL;
1121 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1129 _PyUnicode_DATA_ANY(unicode) = data;
1131 _PyUnicode_WSTR(unicode) = data;
1132 _PyUnicode_WSTR_LENGTH(unicode) = length;
1135 _PyUnicode_UTF8(unicode) = data;
1136 _PyUnicode_UTF8_LENGTH(unicode) = length;
1138 _PyUnicode_LENGTH(unicode) = length;
1139 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1141 unicode_fill_invalid(unicode, old_length);
1143 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1144 assert(_PyUnicode_CheckConsistency(unicode, 0));
1148 assert(_PyUnicode_WSTR(unicode) != NULL);
1156 wstr = _PyUnicode_WSTR(unicode);
1162 _PyUnicode_WSTR(unicode) = wstr;
1163 _PyUnicode_WSTR(unicode)[length] = 0;
1164 _PyUnicode_WSTR_LENGTH(unicode) = length;
1165 assert(_PyUnicode_CheckConsistency(unicode, 0));
1170 resize_copy(PyObject *unicode, Py_ssize_t length)
1173 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1176 assert(PyUnicode_IS_READY(unicode));
1178 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1182 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1192 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1194 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1212 PyUnicodeObject *unicode;
1230 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1231 if (unicode == NULL)
1235 _PyUnicode_WSTR_LENGTH(unicode) = length;
1236 _PyUnicode_HASH(unicode) = -1;
1237 _PyUnicode_STATE(unicode).interned = 0;
1238 _PyUnicode_STATE(unicode).kind = 0;
1239 _PyUnicode_STATE(unicode).compact = 0;
1240 _PyUnicode_STATE(unicode).ready = 0;
1241 _PyUnicode_STATE(unicode).ascii = 0;
1242 _PyUnicode_DATA_ANY(unicode) = NULL;
1243 _PyUnicode_LENGTH(unicode) = 0;
1244 _PyUnicode_UTF8(unicode) = NULL;
1245 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1247 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1248 if (!_PyUnicode_WSTR(unicode)) {
1249 Py_DECREF(unicode);
1257 * allocated for str alive across a call to unicode_dealloc(unicode).
1261 _PyUnicode_WSTR(unicode)[0] = 0;
1262 _PyUnicode_WSTR(unicode)[length] = 0;
1264 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1265 return unicode;
1269 unicode_kind_name(PyObject *unicode)
1273 if (!PyUnicode_IS_COMPACT(unicode))
1275 if (!PyUnicode_IS_READY(unicode))
1277 switch (PyUnicode_KIND(unicode))
1280 if (PyUnicode_IS_ASCII(unicode))
1292 assert(PyUnicode_IS_READY(unicode));
1293 switch (PyUnicode_KIND(unicode)) {
1295 if (PyUnicode_IS_ASCII(unicode))
1311 PyObject *unicode = _PyObject_CAST(unicode_raw);
1312 return PyUnicode_UTF8(unicode);
1316 PyObject *unicode = _PyObject_CAST(unicode_raw);
1317 return _PyUnicode_COMPACT_DATA(unicode);
1320 PyObject *unicode = _PyObject_CAST(unicode_raw);
1321 printf("obj %p\n", (void*)unicode);
1322 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324 printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325 printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327 return PyUnicode_DATA(unicode);
1335 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1346 data = unicode->data.any;
1355 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1374 PyCompactUnicodeObject *unicode;
1431 unicode = (PyCompactUnicodeObject *)obj;
1435 data = unicode + 1;
1436 _PyUnicode_LENGTH(unicode) = size;
1437 _PyUnicode_HASH(unicode) = -1;
1438 _PyUnicode_STATE(unicode).interned = 0;
1439 _PyUnicode_STATE(unicode).kind = kind;
1440 _PyUnicode_STATE(unicode).compact = 1;
1441 _PyUnicode_STATE(unicode).ready = 1;
1442 _PyUnicode_STATE(unicode).ascii = is_ascii;
1445 _PyUnicode_WSTR(unicode) = NULL;
1449 _PyUnicode_WSTR(unicode) = NULL;
1450 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451 unicode->utf8 = NULL;
1452 unicode->utf8_length = 0;
1455 unicode->utf8 = NULL;
1456 unicode->utf8_length = 0;
1462 _PyUnicode_WSTR_LENGTH(unicode) = size;
1463 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1466 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1471 unicode_fill_invalid((PyObject*)unicode, 0);
1473 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1482 This function assumes that unicode can hold one more code point than wstr
1486 PyObject *unicode)
1491 assert(unicode != NULL);
1492 assert(_PyUnicode_CHECK(unicode));
1493 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1494 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1497 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1498 _PyUnicode_GET_LENGTH(unicode)));
1511 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1512 _PyUnicode_GET_LENGTH(unicode)));
1518 unicode_check_modifiable(PyObject *unicode)
1520 if (!unicode_modifiable(unicode)) {
1780 _PyUnicode_Ready(PyObject *unicode)
1793 assert(_PyUnicode_CHECK(unicode));
1794 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1795 assert(_PyUnicode_WSTR(unicode) != NULL);
1796 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1797 assert(_PyUnicode_UTF8(unicode) == NULL);
1799 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1801 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1802 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1807 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1808 if (!_PyUnicode_DATA_ANY(unicode)) {
1813 _PyUnicode_WSTR(unicode), end,
1814 PyUnicode_1BYTE_DATA(unicode));
1815 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1816 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1817 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1819 _PyUnicode_STATE(unicode).ascii = 1;
1820 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1821 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1824 _PyUnicode_STATE(unicode).ascii = 0;
1825 _PyUnicode_UTF8(unicode) = NULL;
1826 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1828 PyObject_Free(_PyUnicode_WSTR(unicode));
1829 _PyUnicode_WSTR(unicode) = NULL;
1830 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1833 wchar_t to 2-byte unicode. */
1840 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1841 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1843 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1844 _PyUnicode_UTF8(unicode) = NULL;
1845 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1848 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1849 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1850 if (!_PyUnicode_DATA_ANY(unicode)) {
1855 _PyUnicode_WSTR(unicode), end,
1856 PyUnicode_2BYTE_DATA(unicode));
1857 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1858 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1859 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1860 _PyUnicode_UTF8(unicode) = NULL;
1861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1862 PyObject_Free(_PyUnicode_WSTR(unicode));
1863 _PyUnicode_WSTR(unicode) = NULL;
1864 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1867 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1872 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1877 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
1878 if (!_PyUnicode_DATA_ANY(unicode)) {
1882 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1883 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1884 _PyUnicode_UTF8(unicode) = NULL;
1885 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1887 _PyUnicode_STATE(unicode).ready = 1;
1888 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1889 PyObject_Free(_PyUnicode_WSTR(unicode));
1890 _PyUnicode_WSTR(unicode) = NULL;
1891 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1896 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897 _PyUnicode_UTF8(unicode) = NULL;
1898 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1899 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1901 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1903 _PyUnicode_STATE(unicode).ready = 1;
1904 assert(_PyUnicode_CheckConsistency(unicode, 1));
1909 unicode_dealloc(PyObject *unicode)
1912 if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1917 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1927 assert(Py_REFCNT(unicode) == 0);
1928 Py_SET_REFCNT(unicode, 3);
1929 if (PyDict_DelItem(interned, unicode) != 0) {
1933 assert(Py_REFCNT(unicode) == 1);
1934 Py_SET_REFCNT(unicode, 0);
1939 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1946 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1947 PyObject_Free(_PyUnicode_WSTR(unicode));
1949 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1950 PyObject_Free(_PyUnicode_UTF8(unicode));
1952 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1953 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1956 Py_TYPE(unicode)->tp_free(unicode);
1961 unicode_is_singleton(PyObject *unicode)
1963 if (unicode == &_Py_STR(empty)) {
1967 PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1969 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1970 if (ch < 256 && LATIN1(ch) == unicode) {
1979 unicode_modifiable(PyObject *unicode)
1981 assert(_PyUnicode_CHECK(unicode));
1982 if (Py_REFCNT(unicode) != 1)
1984 if (_PyUnicode_HASH(unicode) != -1)
1986 if (PyUnicode_CHECK_INTERNED(unicode))
1988 if (!PyUnicode_CheckExact(unicode))
1992 assert(!unicode_is_singleton(unicode));
2000 PyObject *unicode;
2004 unicode = *p_unicode;
2006 assert(unicode != NULL);
2007 assert(PyUnicode_Check(unicode));
2010 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011 old_length = PyUnicode_WSTR_LENGTH(unicode);
2013 old_length = PyUnicode_GET_LENGTH(unicode);
2023 if (!unicode_modifiable(unicode)) {
2024 PyObject *copy = resize_copy(unicode, length);
2031 if (PyUnicode_IS_COMPACT(unicode)) {
2032 PyObject *new_unicode = resize_compact(unicode, length);
2038 return resize_inplace(unicode, length);
2044 PyObject *unicode;
2049 unicode = *p_unicode;
2050 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2064 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2067 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068 const void *data = PyUnicode_DATA(unicode);
2071 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2075 if (PyUnicode_IS_ASCII(unicode)) {
2092 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2102 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2119 PyObject *unicode;
2127 unicode = PyUnicode_New(1, ch);
2128 if (unicode == NULL)
2131 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2135 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2138 assert(_PyUnicode_CheckConsistency(unicode, 1));
2139 return unicode;
2167 PyObject *unicode;
2195 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2197 return unicode;
2212 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2213 if (!unicode)
2216 switch (PyUnicode_KIND(unicode)) {
2219 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2223 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2226 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2233 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2236 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2243 return unicode_result(unicode);
2285 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2364 PyObject *unicode;
2371 unicode = PyUnicode_New(size, 127);
2372 if (!unicode)
2374 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375 assert(_PyUnicode_CheckConsistency(unicode, 1));
2376 return unicode;
2492 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2497 assert(PyUnicode_IS_READY(unicode));
2499 assert(end <= PyUnicode_GET_LENGTH(unicode));
2502 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503 return PyUnicode_MAX_CHAR_VALUE(unicode);
2508 if (PyUnicode_IS_ASCII(unicode))
2511 kind = PyUnicode_KIND(unicode);
2512 startptr = PyUnicode_DATA(unicode);
2533 PyObject *unicode, *copy;
2539 unicode = *p_unicode;
2540 assert(PyUnicode_IS_READY(unicode));
2541 if (PyUnicode_IS_ASCII(unicode))
2544 len = PyUnicode_GET_LENGTH(unicode);
2545 kind = PyUnicode_KIND(unicode);
2547 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2553 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2559 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2569 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570 Py_DECREF(unicode);
2575 _PyUnicode_Copy(PyObject *unicode)
2580 if (!PyUnicode_Check(unicode)) {
2584 if (PyUnicode_READY(unicode) == -1)
2587 length = PyUnicode_GET_LENGTH(unicode);
2588 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2591 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2593 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594 length * PyUnicode_KIND(unicode));
2765 PyObject *unicode;
2777 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778 if (unicode == NULL)
2781 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782 Py_DECREF(unicode);
3158 unicode_get_widechar_size(PyObject *unicode)
3162 assert(unicode != NULL);
3163 assert(_PyUnicode_CHECK(unicode));
3166 if (_PyUnicode_WSTR(unicode) != NULL) {
3167 return PyUnicode_WSTR_LENGTH(unicode);
3170 assert(PyUnicode_IS_READY(unicode));
3172 res = _PyUnicode_LENGTH(unicode);
3174 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3188 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3190 assert(unicode != NULL);
3191 assert(_PyUnicode_CHECK(unicode));
3194 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3200 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3205 assert(PyUnicode_IS_READY(unicode));
3207 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3215 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3221 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3246 character) required to convert the unicode object. Ignore size argument.
3252 PyUnicode_AsWideChar(PyObject *unicode,
3258 if (unicode == NULL) {
3262 if (!PyUnicode_Check(unicode)) {
3267 res = unicode_get_widechar_size(unicode);
3278 unicode_copy_as_widechar(unicode, w, size);
3294 PyUnicode_AsWideCharString(PyObject *unicode,
3300 if (unicode == NULL) {
3304 if (!PyUnicode_Check(unicode)) {
3309 buflen = unicode_get_widechar_size(unicode);
3315 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3551 PyObject *buffer = NULL, *unicode;
3615 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616 if (unicode == NULL)
3618 if (!PyUnicode_Check(unicode)) {
3623 Py_TYPE(unicode)->tp_name);
3624 Py_DECREF(unicode);
3628 return unicode_result(unicode);
3636 PyUnicode_AsDecodedObject(PyObject *unicode,
3640 if (!PyUnicode_Check(unicode)) {
3654 return PyCodec_Decode(unicode, encoding, errors);
3658 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3664 if (!PyUnicode_Check(unicode)) {
3678 v = PyCodec_Decode(unicode, encoding, errors);
3686 Py_TYPE(unicode)->tp_name);
3697 PyUnicode_AsEncodedObject(PyObject *unicode,
3703 if (!PyUnicode_Check(unicode)) {
3718 v = PyCodec_Encode(unicode, encoding, errors);
3729 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3733 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3755 "locale", unicode,
3779 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3782 return unicode_encode_locale(unicode, error_handler, 1);
3786 PyUnicode_EncodeFSDefault(PyObject *unicode)
3789 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3791 return unicode_encode_utf8(unicode,
3797 return PyUnicode_AsEncodedString(unicode,
3812 return unicode_encode_utf8(unicode, errors, NULL);
3814 return unicode_encode_locale(unicode, errors, 0);
3820 PyUnicode_AsEncodedString(PyObject *unicode,
3827 if (!PyUnicode_Check(unicode)) {
3837 return _PyUnicode_AsUTF8String(unicode, errors);
3853 return _PyUnicode_AsUTF8String(unicode, errors);
3856 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3859 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3865 return _PyUnicode_AsASCIIString(unicode, errors);
3869 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3876 return _PyUnicode_AsLatin1String(unicode, errors);
3882 v = _PyCodec_EncodeText(unicode, encoding, errors);
3920 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3926 if (!PyUnicode_Check(unicode)) {
3940 v = PyCodec_Encode(unicode, encoding, errors);
3994 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3996 return unicode;
4026 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4169 static int unicode_fill_utf8(PyObject *unicode);
4172 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4174 if (!PyUnicode_Check(unicode)) {
4178 if (PyUnicode_READY(unicode) == -1)
4181 if (PyUnicode_UTF8(unicode) == NULL) {
4182 if (unicode_fill_utf8(unicode) == -1) {
4188 *psize = PyUnicode_UTF8_LENGTH(unicode);
4189 return PyUnicode_UTF8(unicode);
4193 PyUnicode_AsUTF8(PyObject *unicode)
4195 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4199 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4201 if (!PyUnicode_Check(unicode)) {
4205 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4207 /* Non-ASCII compact unicode object */
4208 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209 assert(PyUnicode_IS_READY(unicode));
4211 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4221 unicode_copy_as_widechar(unicode, w, wlen + 1);
4222 _PyUnicode_WSTR(unicode) = w;
4223 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4228 *size = PyUnicode_WSTR_LENGTH(unicode);
4238 PyUnicode_AsUnicode(PyObject *unicode)
4240 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4244 _PyUnicode_AsUnicode(PyObject *unicode)
4249 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4259 PyUnicode_GetSize(PyObject *unicode)
4261 if (!PyUnicode_Check(unicode)) {
4265 if (_PyUnicode_WSTR(unicode) == NULL) {
4266 if (PyUnicode_AsUnicode(unicode) == NULL)
4269 return PyUnicode_WSTR_LENGTH(unicode);
4278 PyUnicode_GetLength(PyObject *unicode)
4280 if (!PyUnicode_Check(unicode)) {
4284 if (PyUnicode_READY(unicode) == -1)
4286 return PyUnicode_GET_LENGTH(unicode);
4290 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4295 if (!PyUnicode_Check(unicode)) {
4299 if (PyUnicode_READY(unicode) == -1) {
4302 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4306 data = PyUnicode_DATA(unicode);
4307 kind = PyUnicode_KIND(unicode);
4312 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4314 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4318 assert(PyUnicode_IS_READY(unicode));
4319 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4323 if (unicode_check_modifiable(unicode))
4325 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4329 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4574 assumption that one byte decodes to one unicode character.
4575 If unfortunately one byte could decode to more unicode characters,
5269 wchar_t *unicode;
5294 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5295 if (!unicode) {
5305 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5307 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5315 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5316 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5325 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5337 unicode[outpos++] = ch;
5340 PyMem_RawFree(unicode );
5363 unicode[outpos] = L'\0';
5367 *wstr = unicode;
5535 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5538 if (!PyUnicode_Check(unicode)) {
5543 if (PyUnicode_READY(unicode) == -1)
5546 if (PyUnicode_UTF8(unicode))
5547 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548 PyUnicode_UTF8_LENGTH(unicode));
5550 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551 const void *data = PyUnicode_DATA(unicode);
5552 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5562 assert(!PyUnicode_IS_ASCII(unicode));
5563 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5566 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5569 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5581 unicode_fill_utf8(PyObject *unicode)
5584 assert(!PyUnicode_IS_ASCII(unicode));
5586 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587 const void *data = PyUnicode_DATA(unicode);
5588 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5597 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5601 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5605 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5624 _PyUnicode_UTF8(unicode) = cache;
5625 _PyUnicode_UTF8_LENGTH(unicode) = len;
5633 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5635 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5640 PyUnicode_AsUTF8String(PyObject *unicode)
5642 return _PyUnicode_AsUTF8String(unicode, NULL);
5923 } else /* rep is unicode */ {
5951 PyUnicode_AsUTF32String(PyObject *unicode)
5953 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6262 } else /* rep is unicode */ {
6291 PyUnicode_AsUTF16String(PyObject *unicode)
6293 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6448 /* when we get here, ch is a 32-bit unicode character */
6460 /* load the unicode data module */
6487 /* found a name. look it up in the unicode database */
6591 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6608 if (!PyUnicode_Check(unicode)) {
6612 if (PyUnicode_READY(unicode) == -1) {
6616 len = PyUnicode_GET_LENGTH(unicode);
6621 kind = PyUnicode_KIND(unicode);
6622 data = PyUnicode_DATA(unicode);
6856 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6865 if (!PyUnicode_Check(unicode)) {
6869 if (PyUnicode_READY(unicode) == -1) {
6872 kind = PyUnicode_KIND(unicode);
6873 data = PyUnicode_DATA(unicode);
6874 len = PyUnicode_GET_LENGTH(unicode);
6949 PyObject *unicode,
6956 encoding, unicode, startpos, endpos, reason);
6975 PyObject *unicode,
6980 encoding, unicode, startpos, endpos, reason);
6993 PyObject *unicode, PyObject **exceptionObject,
7008 if (PyUnicode_READY(unicode) == -1)
7010 len = PyUnicode_GET_LENGTH(unicode);
7013 encoding, unicode, startpos, endpos, reason);
7048 unicode_encode_ucs1(PyObject *unicode,
7067 if (PyUnicode_READY(unicode) == -1)
7069 size = PyUnicode_GET_LENGTH(unicode);
7070 kind = PyUnicode_KIND(unicode);
7071 data = PyUnicode_DATA(unicode);
7110 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7125 unicode, collstart, collend);
7135 unicode, collstart, collend);
7159 encoding, reason, unicode, &exc,
7195 raise_encode_exception(&exc, encoding, unicode,
7230 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7232 if (!PyUnicode_Check(unicode)) {
7236 if (PyUnicode_READY(unicode) == -1)
7240 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7241 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7242 PyUnicode_GET_LENGTH(unicode));
7245 return unicode_encode_ucs1(unicode, errors, 256);
7249 PyUnicode_AsLatin1String(PyObject *unicode)
7251 return _PyUnicode_AsLatin1String(unicode, NULL);
7356 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7358 if (!PyUnicode_Check(unicode)) {
7362 if (PyUnicode_READY(unicode) == -1)
7366 if (PyUnicode_IS_ASCII(unicode))
7367 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7368 PyUnicode_GET_LENGTH(unicode));
7369 return unicode_encode_ucs1(unicode, errors, 128);
7373 PyUnicode_AsASCIIString(PyObject *unicode)
7375 return _PyUnicode_AsASCIIString(unicode, NULL);
7425 * Decode a byte string from a Windows code page into unicode object in strict
7475 * Decode a byte string from a code page into unicode object with an error
7726 PyObject *unicode, Py_ssize_t offset, int len,
7748 substring = PyUnicode_Substring(unicode, offset, offset+len);
7842 PyObject *unicode, Py_ssize_t unicode_offset,
7873 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7915 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7948 unicode, &exc,
7996 encoding, unicode,
8026 PyObject *unicode,
8034 if (!PyUnicode_Check(unicode)) {
8039 if (PyUnicode_READY(unicode) == -1)
8041 len = PyUnicode_GET_LENGTH(unicode);
8067 unicode, offset, chunk_len,
8071 unicode, offset,
8087 PyObject *unicode,
8090 return encode_code_page(code_page, unicode, errors);
8094 PyUnicode_AsMBCSString(PyObject *unicode)
8096 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8671 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8692 if (PyUnicode_READY(unicode) == -1)
8694 size = PyUnicode_GET_LENGTH(unicode);
8699 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8707 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8725 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8735 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8749 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8755 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8765 encoding, reason, unicode, exceptionObject,
8805 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8816 _PyUnicode_EncodeCharmap(PyObject *unicode,
8833 if (PyUnicode_READY(unicode) == -1)
8835 size = PyUnicode_GET_LENGTH(unicode);
8836 data = PyUnicode_DATA(unicode);
8837 kind = PyUnicode_KIND(unicode);
8841 return unicode_encode_ucs1(unicode, errors, 256);
8858 if (charmap_encoding_error(unicode, &inpos, mapping,
8887 PyUnicode_AsCharmapString(PyObject *unicode,
8890 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8894 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8900 PyObject *unicode,
8906 unicode, startpos, endpos, reason);
8929 PyObject *unicode, PyObject **exceptionObject,
8946 unicode, startpos, endpos, reason);
8964 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8967 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9126 /* not None, NULL, long or unicode */
9321 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9323 if (!PyUnicode_Check(unicode)) {
9327 if (PyUnicode_READY(unicode) == -1)
9329 if (PyUnicode_IS_ASCII(unicode)) {
9331 Py_INCREF(unicode);
9332 return unicode;
9335 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9342 int kind = PyUnicode_KIND(unicode);
9343 const void *data = PyUnicode_DATA(unicode);
10235 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10238 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10239 void *data = PyUnicode_DATA(unicode);
10240 assert(PyUnicode_IS_READY(unicode));
10241 assert(unicode_modifiable(unicode));
10242 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10244 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10249 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10254 if (!PyUnicode_Check(unicode)) {
10258 if (PyUnicode_READY(unicode) == -1)
10260 if (unicode_check_modifiable(unicode))
10267 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10274 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10279 _PyUnicode_FastFill(unicode, start, length, fill_char);
10950 "The fill character must be a unicode character, "
11232 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11236 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11239 p = _PyUnicode_WSTR(unicode);
11250 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11253 assert(_PyUnicode_CHECK(unicode));
11260 if (PyUnicode_READY(unicode) == -1) {
11263 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11265 if (!PyUnicode_IS_ASCII(unicode))
11267 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11269 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11567 first argument is a unicode object.
12508 /* externally visible for str.strip(unicode) */
12809 old: unicode
12810 new: unicode
12835 prefix: unicode
12862 suffix: unicode
12888 unicode_repr(PyObject *unicode)
12898 if (PyUnicode_READY(unicode) == -1)
12901 isize = PyUnicode_GET_LENGTH(unicode);
12902 idata = PyUnicode_DATA(unicode);
12909 ikind = PyUnicode_KIND(unicode);
12965 unicode, 0,
13415 y: unicode=NULL
13417 z: unicode=NULL
14088 format_spec: unicode
14538 PyObject *unicode;
14539 unicode = _PyUnicode_FromASCII(buf, len);
14541 result = unicode;
15268 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15285 PyObject *unicode;
15287 unicode = unicode_new_empty();
15290 unicode = PyObject_Str(x);
15293 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15296 if (unicode != NULL && type != &PyUnicode_Type) {
15297 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15299 return unicode;
15303 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15312 assert(_PyUnicode_CHECK(unicode));
15313 if (PyUnicode_READY(unicode) == -1) {
15321 kind = PyUnicode_KIND(unicode);
15322 length = PyUnicode_GET_LENGTH(unicode);
15328 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15333 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15345 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15381 memcpy(data, PyUnicode_DATA(unicode),
15385 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15524 return _PyStatus_ERR("Can't initialize unicode types");
15626 /* Interned unicode strings are not forcibly deallocated; rather, we give
15981 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16136 struct _Py_unicode_state *state = &interp->unicode;