17db96d56Sopenharmony_ci/* ------------------------------------------------------------------------ 27db96d56Sopenharmony_ci 37db96d56Sopenharmony_ci unicodedata -- Provides access to the Unicode database. 47db96d56Sopenharmony_ci 57db96d56Sopenharmony_ci The current version number is reported in the unidata_version constant. 67db96d56Sopenharmony_ci 77db96d56Sopenharmony_ci Written by Marc-Andre Lemburg (mal@lemburg.com). 87db96d56Sopenharmony_ci Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 97db96d56Sopenharmony_ci Modified by Martin v. Löwis (martin@v.loewis.de) 107db96d56Sopenharmony_ci 117db96d56Sopenharmony_ci Copyright (c) Corporation for National Research Initiatives. 127db96d56Sopenharmony_ci 137db96d56Sopenharmony_ci ------------------------------------------------------------------------ */ 147db96d56Sopenharmony_ci 157db96d56Sopenharmony_ci#ifndef Py_BUILD_CORE_BUILTIN 167db96d56Sopenharmony_ci# define Py_BUILD_CORE_MODULE 1 177db96d56Sopenharmony_ci#endif 187db96d56Sopenharmony_ci 197db96d56Sopenharmony_ci#define PY_SSIZE_T_CLEAN 207db96d56Sopenharmony_ci 217db96d56Sopenharmony_ci#include "Python.h" 227db96d56Sopenharmony_ci#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI 237db96d56Sopenharmony_ci#include "structmember.h" // PyMemberDef 247db96d56Sopenharmony_ci 257db96d56Sopenharmony_ci#include <stdbool.h> 267db96d56Sopenharmony_ci 277db96d56Sopenharmony_ci/*[clinic input] 287db96d56Sopenharmony_cimodule unicodedata 297db96d56Sopenharmony_ciclass unicodedata.UCD 'PreviousDBVersion *' '<not used>' 307db96d56Sopenharmony_ci[clinic start generated code]*/ 317db96d56Sopenharmony_ci/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/ 327db96d56Sopenharmony_ci 337db96d56Sopenharmony_ci/* character properties */ 347db96d56Sopenharmony_ci 357db96d56Sopenharmony_citypedef struct { 367db96d56Sopenharmony_ci const unsigned char category; /* index into 377db96d56Sopenharmony_ci _PyUnicode_CategoryNames */ 387db96d56Sopenharmony_ci const unsigned char combining; /* combining class value 0 - 255 */ 397db96d56Sopenharmony_ci const unsigned char bidirectional; /* index into 407db96d56Sopenharmony_ci _PyUnicode_BidirectionalNames */ 417db96d56Sopenharmony_ci const unsigned char mirrored; /* true if mirrored in bidir mode */ 427db96d56Sopenharmony_ci const unsigned char east_asian_width; /* index into 437db96d56Sopenharmony_ci _PyUnicode_EastAsianWidth */ 447db96d56Sopenharmony_ci const unsigned char normalization_quick_check; /* see is_normalized() */ 457db96d56Sopenharmony_ci} _PyUnicode_DatabaseRecord; 467db96d56Sopenharmony_ci 477db96d56Sopenharmony_citypedef struct change_record { 487db96d56Sopenharmony_ci /* sequence of fields should be the same as in merge_old_version */ 497db96d56Sopenharmony_ci const unsigned char bidir_changed; 507db96d56Sopenharmony_ci const unsigned char category_changed; 517db96d56Sopenharmony_ci const unsigned char decimal_changed; 527db96d56Sopenharmony_ci const unsigned char mirrored_changed; 537db96d56Sopenharmony_ci const unsigned char east_asian_width_changed; 547db96d56Sopenharmony_ci const double numeric_changed; 557db96d56Sopenharmony_ci} change_record; 567db96d56Sopenharmony_ci 577db96d56Sopenharmony_ci/* data file generated by Tools/unicode/makeunicodedata.py */ 587db96d56Sopenharmony_ci#include "unicodedata_db.h" 597db96d56Sopenharmony_ci 607db96d56Sopenharmony_cistatic const _PyUnicode_DatabaseRecord* 617db96d56Sopenharmony_ci_getrecord_ex(Py_UCS4 code) 627db96d56Sopenharmony_ci{ 637db96d56Sopenharmony_ci int index; 647db96d56Sopenharmony_ci if (code >= 0x110000) 657db96d56Sopenharmony_ci index = 0; 667db96d56Sopenharmony_ci else { 677db96d56Sopenharmony_ci index = index1[(code>>SHIFT)]; 687db96d56Sopenharmony_ci index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 697db96d56Sopenharmony_ci } 707db96d56Sopenharmony_ci 717db96d56Sopenharmony_ci return &_PyUnicode_Database_Records[index]; 727db96d56Sopenharmony_ci} 737db96d56Sopenharmony_ci 747db96d56Sopenharmony_ci/* ------------- Previous-version API ------------------------------------- */ 757db96d56Sopenharmony_citypedef struct previous_version { 767db96d56Sopenharmony_ci PyObject_HEAD 777db96d56Sopenharmony_ci const char *name; 787db96d56Sopenharmony_ci const change_record* (*getrecord)(Py_UCS4); 797db96d56Sopenharmony_ci Py_UCS4 (*normalization)(Py_UCS4); 807db96d56Sopenharmony_ci} PreviousDBVersion; 817db96d56Sopenharmony_ci 827db96d56Sopenharmony_ci#include "clinic/unicodedata.c.h" 837db96d56Sopenharmony_ci 847db96d56Sopenharmony_ci#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) 857db96d56Sopenharmony_ci 867db96d56Sopenharmony_cistatic PyMemberDef DB_members[] = { 877db96d56Sopenharmony_ci {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, 887db96d56Sopenharmony_ci {NULL} 897db96d56Sopenharmony_ci}; 907db96d56Sopenharmony_ci 917db96d56Sopenharmony_ci// Check if self is an unicodedata.UCD instance. 927db96d56Sopenharmony_ci// If self is NULL (when the PyCapsule C API is used), return 0. 937db96d56Sopenharmony_ci// PyModule_Check() is used to avoid having to retrieve the ucd_type. 947db96d56Sopenharmony_ci// See unicodedata_functions comment to the rationale of this macro. 957db96d56Sopenharmony_ci#define UCD_Check(self) (self != NULL && !PyModule_Check(self)) 967db96d56Sopenharmony_ci 977db96d56Sopenharmony_cistatic PyObject* 987db96d56Sopenharmony_cinew_previous_version(PyTypeObject *ucd_type, 997db96d56Sopenharmony_ci const char*name, const change_record* (*getrecord)(Py_UCS4), 1007db96d56Sopenharmony_ci Py_UCS4 (*normalization)(Py_UCS4)) 1017db96d56Sopenharmony_ci{ 1027db96d56Sopenharmony_ci PreviousDBVersion *self; 1037db96d56Sopenharmony_ci self = PyObject_GC_New(PreviousDBVersion, ucd_type); 1047db96d56Sopenharmony_ci if (self == NULL) 1057db96d56Sopenharmony_ci return NULL; 1067db96d56Sopenharmony_ci self->name = name; 1077db96d56Sopenharmony_ci self->getrecord = getrecord; 1087db96d56Sopenharmony_ci self->normalization = normalization; 1097db96d56Sopenharmony_ci PyObject_GC_Track(self); 1107db96d56Sopenharmony_ci return (PyObject*)self; 1117db96d56Sopenharmony_ci} 1127db96d56Sopenharmony_ci 1137db96d56Sopenharmony_ci 1147db96d56Sopenharmony_ci/* --- Module API --------------------------------------------------------- */ 1157db96d56Sopenharmony_ci 1167db96d56Sopenharmony_ci/*[clinic input] 1177db96d56Sopenharmony_ciunicodedata.UCD.decimal 1187db96d56Sopenharmony_ci 1197db96d56Sopenharmony_ci self: self 1207db96d56Sopenharmony_ci chr: int(accept={str}) 1217db96d56Sopenharmony_ci default: object=NULL 1227db96d56Sopenharmony_ci / 1237db96d56Sopenharmony_ci 1247db96d56Sopenharmony_ciConverts a Unicode character into its equivalent decimal value. 1257db96d56Sopenharmony_ci 1267db96d56Sopenharmony_ciReturns the decimal value assigned to the character chr as integer. 1277db96d56Sopenharmony_ciIf no such value is defined, default is returned, or, if not given, 1287db96d56Sopenharmony_ciValueError is raised. 1297db96d56Sopenharmony_ci[clinic start generated code]*/ 1307db96d56Sopenharmony_ci 1317db96d56Sopenharmony_cistatic PyObject * 1327db96d56Sopenharmony_ciunicodedata_UCD_decimal_impl(PyObject *self, int chr, 1337db96d56Sopenharmony_ci PyObject *default_value) 1347db96d56Sopenharmony_ci/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/ 1357db96d56Sopenharmony_ci{ 1367db96d56Sopenharmony_ci int have_old = 0; 1377db96d56Sopenharmony_ci long rc; 1387db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 1397db96d56Sopenharmony_ci 1407db96d56Sopenharmony_ci if (UCD_Check(self)) { 1417db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 1427db96d56Sopenharmony_ci if (old->category_changed == 0) { 1437db96d56Sopenharmony_ci /* unassigned */ 1447db96d56Sopenharmony_ci have_old = 1; 1457db96d56Sopenharmony_ci rc = -1; 1467db96d56Sopenharmony_ci } 1477db96d56Sopenharmony_ci else if (old->decimal_changed != 0xFF) { 1487db96d56Sopenharmony_ci have_old = 1; 1497db96d56Sopenharmony_ci rc = old->decimal_changed; 1507db96d56Sopenharmony_ci } 1517db96d56Sopenharmony_ci } 1527db96d56Sopenharmony_ci 1537db96d56Sopenharmony_ci if (!have_old) 1547db96d56Sopenharmony_ci rc = Py_UNICODE_TODECIMAL(c); 1557db96d56Sopenharmony_ci if (rc < 0) { 1567db96d56Sopenharmony_ci if (default_value == NULL) { 1577db96d56Sopenharmony_ci PyErr_SetString(PyExc_ValueError, 1587db96d56Sopenharmony_ci "not a decimal"); 1597db96d56Sopenharmony_ci return NULL; 1607db96d56Sopenharmony_ci } 1617db96d56Sopenharmony_ci else { 1627db96d56Sopenharmony_ci Py_INCREF(default_value); 1637db96d56Sopenharmony_ci return default_value; 1647db96d56Sopenharmony_ci } 1657db96d56Sopenharmony_ci } 1667db96d56Sopenharmony_ci return PyLong_FromLong(rc); 1677db96d56Sopenharmony_ci} 1687db96d56Sopenharmony_ci 1697db96d56Sopenharmony_ci/*[clinic input] 1707db96d56Sopenharmony_ciunicodedata.UCD.digit 1717db96d56Sopenharmony_ci 1727db96d56Sopenharmony_ci self: self 1737db96d56Sopenharmony_ci chr: int(accept={str}) 1747db96d56Sopenharmony_ci default: object=NULL 1757db96d56Sopenharmony_ci / 1767db96d56Sopenharmony_ci 1777db96d56Sopenharmony_ciConverts a Unicode character into its equivalent digit value. 1787db96d56Sopenharmony_ci 1797db96d56Sopenharmony_ciReturns the digit value assigned to the character chr as integer. 1807db96d56Sopenharmony_ciIf no such value is defined, default is returned, or, if not given, 1817db96d56Sopenharmony_ciValueError is raised. 1827db96d56Sopenharmony_ci[clinic start generated code]*/ 1837db96d56Sopenharmony_ci 1847db96d56Sopenharmony_cistatic PyObject * 1857db96d56Sopenharmony_ciunicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value) 1867db96d56Sopenharmony_ci/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/ 1877db96d56Sopenharmony_ci{ 1887db96d56Sopenharmony_ci long rc; 1897db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 1907db96d56Sopenharmony_ci rc = Py_UNICODE_TODIGIT(c); 1917db96d56Sopenharmony_ci if (rc < 0) { 1927db96d56Sopenharmony_ci if (default_value == NULL) { 1937db96d56Sopenharmony_ci PyErr_SetString(PyExc_ValueError, "not a digit"); 1947db96d56Sopenharmony_ci return NULL; 1957db96d56Sopenharmony_ci } 1967db96d56Sopenharmony_ci else { 1977db96d56Sopenharmony_ci Py_INCREF(default_value); 1987db96d56Sopenharmony_ci return default_value; 1997db96d56Sopenharmony_ci } 2007db96d56Sopenharmony_ci } 2017db96d56Sopenharmony_ci return PyLong_FromLong(rc); 2027db96d56Sopenharmony_ci} 2037db96d56Sopenharmony_ci 2047db96d56Sopenharmony_ci/*[clinic input] 2057db96d56Sopenharmony_ciunicodedata.UCD.numeric 2067db96d56Sopenharmony_ci 2077db96d56Sopenharmony_ci self: self 2087db96d56Sopenharmony_ci chr: int(accept={str}) 2097db96d56Sopenharmony_ci default: object=NULL 2107db96d56Sopenharmony_ci / 2117db96d56Sopenharmony_ci 2127db96d56Sopenharmony_ciConverts a Unicode character into its equivalent numeric value. 2137db96d56Sopenharmony_ci 2147db96d56Sopenharmony_ciReturns the numeric value assigned to the character chr as float. 2157db96d56Sopenharmony_ciIf no such value is defined, default is returned, or, if not given, 2167db96d56Sopenharmony_ciValueError is raised. 2177db96d56Sopenharmony_ci[clinic start generated code]*/ 2187db96d56Sopenharmony_ci 2197db96d56Sopenharmony_cistatic PyObject * 2207db96d56Sopenharmony_ciunicodedata_UCD_numeric_impl(PyObject *self, int chr, 2217db96d56Sopenharmony_ci PyObject *default_value) 2227db96d56Sopenharmony_ci/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/ 2237db96d56Sopenharmony_ci{ 2247db96d56Sopenharmony_ci int have_old = 0; 2257db96d56Sopenharmony_ci double rc; 2267db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 2277db96d56Sopenharmony_ci 2287db96d56Sopenharmony_ci if (UCD_Check(self)) { 2297db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 2307db96d56Sopenharmony_ci if (old->category_changed == 0) { 2317db96d56Sopenharmony_ci /* unassigned */ 2327db96d56Sopenharmony_ci have_old = 1; 2337db96d56Sopenharmony_ci rc = -1.0; 2347db96d56Sopenharmony_ci } 2357db96d56Sopenharmony_ci else if (old->decimal_changed != 0xFF) { 2367db96d56Sopenharmony_ci have_old = 1; 2377db96d56Sopenharmony_ci rc = old->decimal_changed; 2387db96d56Sopenharmony_ci } 2397db96d56Sopenharmony_ci } 2407db96d56Sopenharmony_ci 2417db96d56Sopenharmony_ci if (!have_old) 2427db96d56Sopenharmony_ci rc = Py_UNICODE_TONUMERIC(c); 2437db96d56Sopenharmony_ci if (rc == -1.0) { 2447db96d56Sopenharmony_ci if (default_value == NULL) { 2457db96d56Sopenharmony_ci PyErr_SetString(PyExc_ValueError, "not a numeric character"); 2467db96d56Sopenharmony_ci return NULL; 2477db96d56Sopenharmony_ci } 2487db96d56Sopenharmony_ci else { 2497db96d56Sopenharmony_ci Py_INCREF(default_value); 2507db96d56Sopenharmony_ci return default_value; 2517db96d56Sopenharmony_ci } 2527db96d56Sopenharmony_ci } 2537db96d56Sopenharmony_ci return PyFloat_FromDouble(rc); 2547db96d56Sopenharmony_ci} 2557db96d56Sopenharmony_ci 2567db96d56Sopenharmony_ci/*[clinic input] 2577db96d56Sopenharmony_ciunicodedata.UCD.category 2587db96d56Sopenharmony_ci 2597db96d56Sopenharmony_ci self: self 2607db96d56Sopenharmony_ci chr: int(accept={str}) 2617db96d56Sopenharmony_ci / 2627db96d56Sopenharmony_ci 2637db96d56Sopenharmony_ciReturns the general category assigned to the character chr as string. 2647db96d56Sopenharmony_ci[clinic start generated code]*/ 2657db96d56Sopenharmony_ci 2667db96d56Sopenharmony_cistatic PyObject * 2677db96d56Sopenharmony_ciunicodedata_UCD_category_impl(PyObject *self, int chr) 2687db96d56Sopenharmony_ci/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/ 2697db96d56Sopenharmony_ci{ 2707db96d56Sopenharmony_ci int index; 2717db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 2727db96d56Sopenharmony_ci index = (int) _getrecord_ex(c)->category; 2737db96d56Sopenharmony_ci if (UCD_Check(self)) { 2747db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 2757db96d56Sopenharmony_ci if (old->category_changed != 0xFF) 2767db96d56Sopenharmony_ci index = old->category_changed; 2777db96d56Sopenharmony_ci } 2787db96d56Sopenharmony_ci return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); 2797db96d56Sopenharmony_ci} 2807db96d56Sopenharmony_ci 2817db96d56Sopenharmony_ci/*[clinic input] 2827db96d56Sopenharmony_ciunicodedata.UCD.bidirectional 2837db96d56Sopenharmony_ci 2847db96d56Sopenharmony_ci self: self 2857db96d56Sopenharmony_ci chr: int(accept={str}) 2867db96d56Sopenharmony_ci / 2877db96d56Sopenharmony_ci 2887db96d56Sopenharmony_ciReturns the bidirectional class assigned to the character chr as string. 2897db96d56Sopenharmony_ci 2907db96d56Sopenharmony_ciIf no such value is defined, an empty string is returned. 2917db96d56Sopenharmony_ci[clinic start generated code]*/ 2927db96d56Sopenharmony_ci 2937db96d56Sopenharmony_cistatic PyObject * 2947db96d56Sopenharmony_ciunicodedata_UCD_bidirectional_impl(PyObject *self, int chr) 2957db96d56Sopenharmony_ci/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/ 2967db96d56Sopenharmony_ci{ 2977db96d56Sopenharmony_ci int index; 2987db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 2997db96d56Sopenharmony_ci index = (int) _getrecord_ex(c)->bidirectional; 3007db96d56Sopenharmony_ci if (UCD_Check(self)) { 3017db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 3027db96d56Sopenharmony_ci if (old->category_changed == 0) 3037db96d56Sopenharmony_ci index = 0; /* unassigned */ 3047db96d56Sopenharmony_ci else if (old->bidir_changed != 0xFF) 3057db96d56Sopenharmony_ci index = old->bidir_changed; 3067db96d56Sopenharmony_ci } 3077db96d56Sopenharmony_ci return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); 3087db96d56Sopenharmony_ci} 3097db96d56Sopenharmony_ci 3107db96d56Sopenharmony_ci/*[clinic input] 3117db96d56Sopenharmony_ciunicodedata.UCD.combining -> int 3127db96d56Sopenharmony_ci 3137db96d56Sopenharmony_ci self: self 3147db96d56Sopenharmony_ci chr: int(accept={str}) 3157db96d56Sopenharmony_ci / 3167db96d56Sopenharmony_ci 3177db96d56Sopenharmony_ciReturns the canonical combining class assigned to the character chr as integer. 3187db96d56Sopenharmony_ci 3197db96d56Sopenharmony_ciReturns 0 if no combining class is defined. 3207db96d56Sopenharmony_ci[clinic start generated code]*/ 3217db96d56Sopenharmony_ci 3227db96d56Sopenharmony_cistatic int 3237db96d56Sopenharmony_ciunicodedata_UCD_combining_impl(PyObject *self, int chr) 3247db96d56Sopenharmony_ci/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/ 3257db96d56Sopenharmony_ci{ 3267db96d56Sopenharmony_ci int index; 3277db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 3287db96d56Sopenharmony_ci index = (int) _getrecord_ex(c)->combining; 3297db96d56Sopenharmony_ci if (UCD_Check(self)) { 3307db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 3317db96d56Sopenharmony_ci if (old->category_changed == 0) 3327db96d56Sopenharmony_ci index = 0; /* unassigned */ 3337db96d56Sopenharmony_ci } 3347db96d56Sopenharmony_ci return index; 3357db96d56Sopenharmony_ci} 3367db96d56Sopenharmony_ci 3377db96d56Sopenharmony_ci/*[clinic input] 3387db96d56Sopenharmony_ciunicodedata.UCD.mirrored -> int 3397db96d56Sopenharmony_ci 3407db96d56Sopenharmony_ci self: self 3417db96d56Sopenharmony_ci chr: int(accept={str}) 3427db96d56Sopenharmony_ci / 3437db96d56Sopenharmony_ci 3447db96d56Sopenharmony_ciReturns the mirrored property assigned to the character chr as integer. 3457db96d56Sopenharmony_ci 3467db96d56Sopenharmony_ciReturns 1 if the character has been identified as a "mirrored" 3477db96d56Sopenharmony_cicharacter in bidirectional text, 0 otherwise. 3487db96d56Sopenharmony_ci[clinic start generated code]*/ 3497db96d56Sopenharmony_ci 3507db96d56Sopenharmony_cistatic int 3517db96d56Sopenharmony_ciunicodedata_UCD_mirrored_impl(PyObject *self, int chr) 3527db96d56Sopenharmony_ci/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/ 3537db96d56Sopenharmony_ci{ 3547db96d56Sopenharmony_ci int index; 3557db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 3567db96d56Sopenharmony_ci index = (int) _getrecord_ex(c)->mirrored; 3577db96d56Sopenharmony_ci if (UCD_Check(self)) { 3587db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 3597db96d56Sopenharmony_ci if (old->category_changed == 0) 3607db96d56Sopenharmony_ci index = 0; /* unassigned */ 3617db96d56Sopenharmony_ci else if (old->mirrored_changed != 0xFF) 3627db96d56Sopenharmony_ci index = old->mirrored_changed; 3637db96d56Sopenharmony_ci } 3647db96d56Sopenharmony_ci return index; 3657db96d56Sopenharmony_ci} 3667db96d56Sopenharmony_ci 3677db96d56Sopenharmony_ci/*[clinic input] 3687db96d56Sopenharmony_ciunicodedata.UCD.east_asian_width 3697db96d56Sopenharmony_ci 3707db96d56Sopenharmony_ci self: self 3717db96d56Sopenharmony_ci chr: int(accept={str}) 3727db96d56Sopenharmony_ci / 3737db96d56Sopenharmony_ci 3747db96d56Sopenharmony_ciReturns the east asian width assigned to the character chr as string. 3757db96d56Sopenharmony_ci[clinic start generated code]*/ 3767db96d56Sopenharmony_ci 3777db96d56Sopenharmony_cistatic PyObject * 3787db96d56Sopenharmony_ciunicodedata_UCD_east_asian_width_impl(PyObject *self, int chr) 3797db96d56Sopenharmony_ci/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/ 3807db96d56Sopenharmony_ci{ 3817db96d56Sopenharmony_ci int index; 3827db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 3837db96d56Sopenharmony_ci index = (int) _getrecord_ex(c)->east_asian_width; 3847db96d56Sopenharmony_ci if (UCD_Check(self)) { 3857db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 3867db96d56Sopenharmony_ci if (old->category_changed == 0) 3877db96d56Sopenharmony_ci index = 0; /* unassigned */ 3887db96d56Sopenharmony_ci else if (old->east_asian_width_changed != 0xFF) 3897db96d56Sopenharmony_ci index = old->east_asian_width_changed; 3907db96d56Sopenharmony_ci } 3917db96d56Sopenharmony_ci return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); 3927db96d56Sopenharmony_ci} 3937db96d56Sopenharmony_ci 3947db96d56Sopenharmony_ci/*[clinic input] 3957db96d56Sopenharmony_ciunicodedata.UCD.decomposition 3967db96d56Sopenharmony_ci 3977db96d56Sopenharmony_ci self: self 3987db96d56Sopenharmony_ci chr: int(accept={str}) 3997db96d56Sopenharmony_ci / 4007db96d56Sopenharmony_ci 4017db96d56Sopenharmony_ciReturns the character decomposition mapping assigned to the character chr as string. 4027db96d56Sopenharmony_ci 4037db96d56Sopenharmony_ciAn empty string is returned in case no such mapping is defined. 4047db96d56Sopenharmony_ci[clinic start generated code]*/ 4057db96d56Sopenharmony_ci 4067db96d56Sopenharmony_cistatic PyObject * 4077db96d56Sopenharmony_ciunicodedata_UCD_decomposition_impl(PyObject *self, int chr) 4087db96d56Sopenharmony_ci/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/ 4097db96d56Sopenharmony_ci{ 4107db96d56Sopenharmony_ci char decomp[256]; 4117db96d56Sopenharmony_ci int code, index, count; 4127db96d56Sopenharmony_ci size_t i; 4137db96d56Sopenharmony_ci unsigned int prefix_index; 4147db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 4157db96d56Sopenharmony_ci 4167db96d56Sopenharmony_ci code = (int)c; 4177db96d56Sopenharmony_ci 4187db96d56Sopenharmony_ci if (UCD_Check(self)) { 4197db96d56Sopenharmony_ci const change_record *old = get_old_record(self, c); 4207db96d56Sopenharmony_ci if (old->category_changed == 0) 4217db96d56Sopenharmony_ci return PyUnicode_FromString(""); /* unassigned */ 4227db96d56Sopenharmony_ci } 4237db96d56Sopenharmony_ci 4247db96d56Sopenharmony_ci if (code < 0 || code >= 0x110000) 4257db96d56Sopenharmony_ci index = 0; 4267db96d56Sopenharmony_ci else { 4277db96d56Sopenharmony_ci index = decomp_index1[(code>>DECOMP_SHIFT)]; 4287db96d56Sopenharmony_ci index = decomp_index2[(index<<DECOMP_SHIFT)+ 4297db96d56Sopenharmony_ci (code&((1<<DECOMP_SHIFT)-1))]; 4307db96d56Sopenharmony_ci } 4317db96d56Sopenharmony_ci 4327db96d56Sopenharmony_ci /* high byte is number of hex bytes (usually one or two), low byte 4337db96d56Sopenharmony_ci is prefix code (from*/ 4347db96d56Sopenharmony_ci count = decomp_data[index] >> 8; 4357db96d56Sopenharmony_ci 4367db96d56Sopenharmony_ci /* XXX: could allocate the PyString up front instead 4377db96d56Sopenharmony_ci (strlen(prefix) + 5 * count + 1 bytes) */ 4387db96d56Sopenharmony_ci 4397db96d56Sopenharmony_ci /* Based on how index is calculated above and decomp_data is generated 4407db96d56Sopenharmony_ci from Tools/unicode/makeunicodedata.py, it should not be possible 4417db96d56Sopenharmony_ci to overflow decomp_prefix. */ 4427db96d56Sopenharmony_ci prefix_index = decomp_data[index] & 255; 4437db96d56Sopenharmony_ci assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix)); 4447db96d56Sopenharmony_ci 4457db96d56Sopenharmony_ci /* copy prefix */ 4467db96d56Sopenharmony_ci i = strlen(decomp_prefix[prefix_index]); 4477db96d56Sopenharmony_ci memcpy(decomp, decomp_prefix[prefix_index], i); 4487db96d56Sopenharmony_ci 4497db96d56Sopenharmony_ci while (count-- > 0) { 4507db96d56Sopenharmony_ci if (i) 4517db96d56Sopenharmony_ci decomp[i++] = ' '; 4527db96d56Sopenharmony_ci assert(i < sizeof(decomp)); 4537db96d56Sopenharmony_ci PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", 4547db96d56Sopenharmony_ci decomp_data[++index]); 4557db96d56Sopenharmony_ci i += strlen(decomp + i); 4567db96d56Sopenharmony_ci } 4577db96d56Sopenharmony_ci return PyUnicode_FromStringAndSize(decomp, i); 4587db96d56Sopenharmony_ci} 4597db96d56Sopenharmony_ci 4607db96d56Sopenharmony_cistatic void 4617db96d56Sopenharmony_ciget_decomp_record(PyObject *self, Py_UCS4 code, 4627db96d56Sopenharmony_ci int *index, int *prefix, int *count) 4637db96d56Sopenharmony_ci{ 4647db96d56Sopenharmony_ci if (code >= 0x110000) { 4657db96d56Sopenharmony_ci *index = 0; 4667db96d56Sopenharmony_ci } 4677db96d56Sopenharmony_ci else if (UCD_Check(self) 4687db96d56Sopenharmony_ci && get_old_record(self, code)->category_changed==0) { 4697db96d56Sopenharmony_ci /* unassigned in old version */ 4707db96d56Sopenharmony_ci *index = 0; 4717db96d56Sopenharmony_ci } 4727db96d56Sopenharmony_ci else { 4737db96d56Sopenharmony_ci *index = decomp_index1[(code>>DECOMP_SHIFT)]; 4747db96d56Sopenharmony_ci *index = decomp_index2[(*index<<DECOMP_SHIFT)+ 4757db96d56Sopenharmony_ci (code&((1<<DECOMP_SHIFT)-1))]; 4767db96d56Sopenharmony_ci } 4777db96d56Sopenharmony_ci 4787db96d56Sopenharmony_ci /* high byte is number of hex bytes (usually one or two), low byte 4797db96d56Sopenharmony_ci is prefix code (from*/ 4807db96d56Sopenharmony_ci *count = decomp_data[*index] >> 8; 4817db96d56Sopenharmony_ci *prefix = decomp_data[*index] & 255; 4827db96d56Sopenharmony_ci 4837db96d56Sopenharmony_ci (*index)++; 4847db96d56Sopenharmony_ci} 4857db96d56Sopenharmony_ci 4867db96d56Sopenharmony_ci#define SBase 0xAC00 4877db96d56Sopenharmony_ci#define LBase 0x1100 4887db96d56Sopenharmony_ci#define VBase 0x1161 4897db96d56Sopenharmony_ci#define TBase 0x11A7 4907db96d56Sopenharmony_ci#define LCount 19 4917db96d56Sopenharmony_ci#define VCount 21 4927db96d56Sopenharmony_ci#define TCount 28 4937db96d56Sopenharmony_ci#define NCount (VCount*TCount) 4947db96d56Sopenharmony_ci#define SCount (LCount*NCount) 4957db96d56Sopenharmony_ci 4967db96d56Sopenharmony_cistatic PyObject* 4977db96d56Sopenharmony_cinfd_nfkd(PyObject *self, PyObject *input, int k) 4987db96d56Sopenharmony_ci{ 4997db96d56Sopenharmony_ci PyObject *result; 5007db96d56Sopenharmony_ci Py_UCS4 *output; 5017db96d56Sopenharmony_ci Py_ssize_t i, o, osize; 5027db96d56Sopenharmony_ci int kind; 5037db96d56Sopenharmony_ci const void *data; 5047db96d56Sopenharmony_ci /* Longest decomposition in Unicode 3.2: U+FDFA */ 5057db96d56Sopenharmony_ci Py_UCS4 stack[20]; 5067db96d56Sopenharmony_ci Py_ssize_t space, isize; 5077db96d56Sopenharmony_ci int index, prefix, count, stackptr; 5087db96d56Sopenharmony_ci unsigned char prev, cur; 5097db96d56Sopenharmony_ci 5107db96d56Sopenharmony_ci stackptr = 0; 5117db96d56Sopenharmony_ci isize = PyUnicode_GET_LENGTH(input); 5127db96d56Sopenharmony_ci space = isize; 5137db96d56Sopenharmony_ci /* Overallocate at most 10 characters. */ 5147db96d56Sopenharmony_ci if (space > 10) { 5157db96d56Sopenharmony_ci if (space <= PY_SSIZE_T_MAX - 10) 5167db96d56Sopenharmony_ci space += 10; 5177db96d56Sopenharmony_ci } 5187db96d56Sopenharmony_ci else { 5197db96d56Sopenharmony_ci space *= 2; 5207db96d56Sopenharmony_ci } 5217db96d56Sopenharmony_ci osize = space; 5227db96d56Sopenharmony_ci output = PyMem_NEW(Py_UCS4, space); 5237db96d56Sopenharmony_ci if (!output) { 5247db96d56Sopenharmony_ci PyErr_NoMemory(); 5257db96d56Sopenharmony_ci return NULL; 5267db96d56Sopenharmony_ci } 5277db96d56Sopenharmony_ci i = o = 0; 5287db96d56Sopenharmony_ci kind = PyUnicode_KIND(input); 5297db96d56Sopenharmony_ci data = PyUnicode_DATA(input); 5307db96d56Sopenharmony_ci 5317db96d56Sopenharmony_ci while (i < isize) { 5327db96d56Sopenharmony_ci stack[stackptr++] = PyUnicode_READ(kind, data, i++); 5337db96d56Sopenharmony_ci while(stackptr) { 5347db96d56Sopenharmony_ci Py_UCS4 code = stack[--stackptr]; 5357db96d56Sopenharmony_ci /* Hangul Decomposition adds three characters in 5367db96d56Sopenharmony_ci a single step, so we need at least that much room. */ 5377db96d56Sopenharmony_ci if (space < 3) { 5387db96d56Sopenharmony_ci Py_UCS4 *new_output; 5397db96d56Sopenharmony_ci osize += 10; 5407db96d56Sopenharmony_ci space += 10; 5417db96d56Sopenharmony_ci new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4)); 5427db96d56Sopenharmony_ci if (new_output == NULL) { 5437db96d56Sopenharmony_ci PyMem_Free(output); 5447db96d56Sopenharmony_ci PyErr_NoMemory(); 5457db96d56Sopenharmony_ci return NULL; 5467db96d56Sopenharmony_ci } 5477db96d56Sopenharmony_ci output = new_output; 5487db96d56Sopenharmony_ci } 5497db96d56Sopenharmony_ci /* Hangul Decomposition. */ 5507db96d56Sopenharmony_ci if (SBase <= code && code < (SBase+SCount)) { 5517db96d56Sopenharmony_ci int SIndex = code - SBase; 5527db96d56Sopenharmony_ci int L = LBase + SIndex / NCount; 5537db96d56Sopenharmony_ci int V = VBase + (SIndex % NCount) / TCount; 5547db96d56Sopenharmony_ci int T = TBase + SIndex % TCount; 5557db96d56Sopenharmony_ci output[o++] = L; 5567db96d56Sopenharmony_ci output[o++] = V; 5577db96d56Sopenharmony_ci space -= 2; 5587db96d56Sopenharmony_ci if (T != TBase) { 5597db96d56Sopenharmony_ci output[o++] = T; 5607db96d56Sopenharmony_ci space --; 5617db96d56Sopenharmony_ci } 5627db96d56Sopenharmony_ci continue; 5637db96d56Sopenharmony_ci } 5647db96d56Sopenharmony_ci /* normalization changes */ 5657db96d56Sopenharmony_ci if (UCD_Check(self)) { 5667db96d56Sopenharmony_ci Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); 5677db96d56Sopenharmony_ci if (value != 0) { 5687db96d56Sopenharmony_ci stack[stackptr++] = value; 5697db96d56Sopenharmony_ci continue; 5707db96d56Sopenharmony_ci } 5717db96d56Sopenharmony_ci } 5727db96d56Sopenharmony_ci 5737db96d56Sopenharmony_ci /* Other decompositions. */ 5747db96d56Sopenharmony_ci get_decomp_record(self, code, &index, &prefix, &count); 5757db96d56Sopenharmony_ci 5767db96d56Sopenharmony_ci /* Copy character if it is not decomposable, or has a 5777db96d56Sopenharmony_ci compatibility decomposition, but we do NFD. */ 5787db96d56Sopenharmony_ci if (!count || (prefix && !k)) { 5797db96d56Sopenharmony_ci output[o++] = code; 5807db96d56Sopenharmony_ci space--; 5817db96d56Sopenharmony_ci continue; 5827db96d56Sopenharmony_ci } 5837db96d56Sopenharmony_ci /* Copy decomposition onto the stack, in reverse 5847db96d56Sopenharmony_ci order. */ 5857db96d56Sopenharmony_ci while(count) { 5867db96d56Sopenharmony_ci code = decomp_data[index + (--count)]; 5877db96d56Sopenharmony_ci stack[stackptr++] = code; 5887db96d56Sopenharmony_ci } 5897db96d56Sopenharmony_ci } 5907db96d56Sopenharmony_ci } 5917db96d56Sopenharmony_ci 5927db96d56Sopenharmony_ci result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, 5937db96d56Sopenharmony_ci output, o); 5947db96d56Sopenharmony_ci PyMem_Free(output); 5957db96d56Sopenharmony_ci if (!result) 5967db96d56Sopenharmony_ci return NULL; 5977db96d56Sopenharmony_ci /* result is guaranteed to be ready, as it is compact. */ 5987db96d56Sopenharmony_ci kind = PyUnicode_KIND(result); 5997db96d56Sopenharmony_ci data = PyUnicode_DATA(result); 6007db96d56Sopenharmony_ci 6017db96d56Sopenharmony_ci /* Sort canonically. */ 6027db96d56Sopenharmony_ci i = 0; 6037db96d56Sopenharmony_ci prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; 6047db96d56Sopenharmony_ci for (i++; i < PyUnicode_GET_LENGTH(result); i++) { 6057db96d56Sopenharmony_ci cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; 6067db96d56Sopenharmony_ci if (prev == 0 || cur == 0 || prev <= cur) { 6077db96d56Sopenharmony_ci prev = cur; 6087db96d56Sopenharmony_ci continue; 6097db96d56Sopenharmony_ci } 6107db96d56Sopenharmony_ci /* Non-canonical order. Need to switch *i with previous. */ 6117db96d56Sopenharmony_ci o = i - 1; 6127db96d56Sopenharmony_ci while (1) { 6137db96d56Sopenharmony_ci Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1); 6147db96d56Sopenharmony_ci PyUnicode_WRITE(kind, data, o+1, 6157db96d56Sopenharmony_ci PyUnicode_READ(kind, data, o)); 6167db96d56Sopenharmony_ci PyUnicode_WRITE(kind, data, o, tmp); 6177db96d56Sopenharmony_ci o--; 6187db96d56Sopenharmony_ci if (o < 0) 6197db96d56Sopenharmony_ci break; 6207db96d56Sopenharmony_ci prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining; 6217db96d56Sopenharmony_ci if (prev == 0 || prev <= cur) 6227db96d56Sopenharmony_ci break; 6237db96d56Sopenharmony_ci } 6247db96d56Sopenharmony_ci prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; 6257db96d56Sopenharmony_ci } 6267db96d56Sopenharmony_ci return result; 6277db96d56Sopenharmony_ci} 6287db96d56Sopenharmony_ci 6297db96d56Sopenharmony_cistatic int 6307db96d56Sopenharmony_cifind_nfc_index(const struct reindex* nfc, Py_UCS4 code) 6317db96d56Sopenharmony_ci{ 6327db96d56Sopenharmony_ci unsigned int index; 6337db96d56Sopenharmony_ci for (index = 0; nfc[index].start; index++) { 6347db96d56Sopenharmony_ci unsigned int start = nfc[index].start; 6357db96d56Sopenharmony_ci if (code < start) 6367db96d56Sopenharmony_ci return -1; 6377db96d56Sopenharmony_ci if (code <= start + nfc[index].count) { 6387db96d56Sopenharmony_ci unsigned int delta = code - start; 6397db96d56Sopenharmony_ci return nfc[index].index + delta; 6407db96d56Sopenharmony_ci } 6417db96d56Sopenharmony_ci } 6427db96d56Sopenharmony_ci return -1; 6437db96d56Sopenharmony_ci} 6447db96d56Sopenharmony_ci 6457db96d56Sopenharmony_cistatic PyObject* 6467db96d56Sopenharmony_cinfc_nfkc(PyObject *self, PyObject *input, int k) 6477db96d56Sopenharmony_ci{ 6487db96d56Sopenharmony_ci PyObject *result; 6497db96d56Sopenharmony_ci int kind; 6507db96d56Sopenharmony_ci const void *data; 6517db96d56Sopenharmony_ci Py_UCS4 *output; 6527db96d56Sopenharmony_ci Py_ssize_t i, i1, o, len; 6537db96d56Sopenharmony_ci int f,l,index,index1,comb; 6547db96d56Sopenharmony_ci Py_UCS4 code; 6557db96d56Sopenharmony_ci Py_ssize_t skipped[20]; 6567db96d56Sopenharmony_ci int cskipped = 0; 6577db96d56Sopenharmony_ci 6587db96d56Sopenharmony_ci result = nfd_nfkd(self, input, k); 6597db96d56Sopenharmony_ci if (!result) 6607db96d56Sopenharmony_ci return NULL; 6617db96d56Sopenharmony_ci /* result will be "ready". */ 6627db96d56Sopenharmony_ci kind = PyUnicode_KIND(result); 6637db96d56Sopenharmony_ci data = PyUnicode_DATA(result); 6647db96d56Sopenharmony_ci len = PyUnicode_GET_LENGTH(result); 6657db96d56Sopenharmony_ci 6667db96d56Sopenharmony_ci /* We allocate a buffer for the output. 6677db96d56Sopenharmony_ci If we find that we made no changes, we still return 6687db96d56Sopenharmony_ci the NFD result. */ 6697db96d56Sopenharmony_ci output = PyMem_NEW(Py_UCS4, len); 6707db96d56Sopenharmony_ci if (!output) { 6717db96d56Sopenharmony_ci PyErr_NoMemory(); 6727db96d56Sopenharmony_ci Py_DECREF(result); 6737db96d56Sopenharmony_ci return 0; 6747db96d56Sopenharmony_ci } 6757db96d56Sopenharmony_ci i = o = 0; 6767db96d56Sopenharmony_ci 6777db96d56Sopenharmony_ci again: 6787db96d56Sopenharmony_ci while (i < len) { 6797db96d56Sopenharmony_ci for (index = 0; index < cskipped; index++) { 6807db96d56Sopenharmony_ci if (skipped[index] == i) { 6817db96d56Sopenharmony_ci /* *i character is skipped. 6827db96d56Sopenharmony_ci Remove from list. */ 6837db96d56Sopenharmony_ci skipped[index] = skipped[cskipped-1]; 6847db96d56Sopenharmony_ci cskipped--; 6857db96d56Sopenharmony_ci i++; 6867db96d56Sopenharmony_ci goto again; /* continue while */ 6877db96d56Sopenharmony_ci } 6887db96d56Sopenharmony_ci } 6897db96d56Sopenharmony_ci /* Hangul Composition. We don't need to check for <LV,T> 6907db96d56Sopenharmony_ci pairs, since we always have decomposed data. */ 6917db96d56Sopenharmony_ci code = PyUnicode_READ(kind, data, i); 6927db96d56Sopenharmony_ci if (LBase <= code && code < (LBase+LCount) && 6937db96d56Sopenharmony_ci i + 1 < len && 6947db96d56Sopenharmony_ci VBase <= PyUnicode_READ(kind, data, i+1) && 6957db96d56Sopenharmony_ci PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) { 6967db96d56Sopenharmony_ci /* check L character is a modern leading consonant (0x1100 ~ 0x1112) 6977db96d56Sopenharmony_ci and V character is a modern vowel (0x1161 ~ 0x1175). */ 6987db96d56Sopenharmony_ci int LIndex, VIndex; 6997db96d56Sopenharmony_ci LIndex = code - LBase; 7007db96d56Sopenharmony_ci VIndex = PyUnicode_READ(kind, data, i+1) - VBase; 7017db96d56Sopenharmony_ci code = SBase + (LIndex*VCount+VIndex)*TCount; 7027db96d56Sopenharmony_ci i+=2; 7037db96d56Sopenharmony_ci if (i < len && 7047db96d56Sopenharmony_ci TBase < PyUnicode_READ(kind, data, i) && 7057db96d56Sopenharmony_ci PyUnicode_READ(kind, data, i) < (TBase+TCount)) { 7067db96d56Sopenharmony_ci /* check T character is a modern trailing consonant 7077db96d56Sopenharmony_ci (0x11A8 ~ 0x11C2). */ 7087db96d56Sopenharmony_ci code += PyUnicode_READ(kind, data, i)-TBase; 7097db96d56Sopenharmony_ci i++; 7107db96d56Sopenharmony_ci } 7117db96d56Sopenharmony_ci output[o++] = code; 7127db96d56Sopenharmony_ci continue; 7137db96d56Sopenharmony_ci } 7147db96d56Sopenharmony_ci 7157db96d56Sopenharmony_ci /* code is still input[i] here */ 7167db96d56Sopenharmony_ci f = find_nfc_index(nfc_first, code); 7177db96d56Sopenharmony_ci if (f == -1) { 7187db96d56Sopenharmony_ci output[o++] = code; 7197db96d56Sopenharmony_ci i++; 7207db96d56Sopenharmony_ci continue; 7217db96d56Sopenharmony_ci } 7227db96d56Sopenharmony_ci /* Find next unblocked character. */ 7237db96d56Sopenharmony_ci i1 = i+1; 7247db96d56Sopenharmony_ci comb = 0; 7257db96d56Sopenharmony_ci /* output base character for now; might be updated later. */ 7267db96d56Sopenharmony_ci output[o] = PyUnicode_READ(kind, data, i); 7277db96d56Sopenharmony_ci while (i1 < len) { 7287db96d56Sopenharmony_ci Py_UCS4 code1 = PyUnicode_READ(kind, data, i1); 7297db96d56Sopenharmony_ci int comb1 = _getrecord_ex(code1)->combining; 7307db96d56Sopenharmony_ci if (comb) { 7317db96d56Sopenharmony_ci if (comb1 == 0) 7327db96d56Sopenharmony_ci break; 7337db96d56Sopenharmony_ci if (comb >= comb1) { 7347db96d56Sopenharmony_ci /* Character is blocked. */ 7357db96d56Sopenharmony_ci i1++; 7367db96d56Sopenharmony_ci continue; 7377db96d56Sopenharmony_ci } 7387db96d56Sopenharmony_ci } 7397db96d56Sopenharmony_ci l = find_nfc_index(nfc_last, code1); 7407db96d56Sopenharmony_ci /* i1 cannot be combined with i. If i1 7417db96d56Sopenharmony_ci is a starter, we don't need to look further. 7427db96d56Sopenharmony_ci Otherwise, record the combining class. */ 7437db96d56Sopenharmony_ci if (l == -1) { 7447db96d56Sopenharmony_ci not_combinable: 7457db96d56Sopenharmony_ci if (comb1 == 0) 7467db96d56Sopenharmony_ci break; 7477db96d56Sopenharmony_ci comb = comb1; 7487db96d56Sopenharmony_ci i1++; 7497db96d56Sopenharmony_ci continue; 7507db96d56Sopenharmony_ci } 7517db96d56Sopenharmony_ci index = f*TOTAL_LAST + l; 7527db96d56Sopenharmony_ci index1 = comp_index[index >> COMP_SHIFT]; 7537db96d56Sopenharmony_ci code = comp_data[(index1<<COMP_SHIFT)+ 7547db96d56Sopenharmony_ci (index&((1<<COMP_SHIFT)-1))]; 7557db96d56Sopenharmony_ci if (code == 0) 7567db96d56Sopenharmony_ci goto not_combinable; 7577db96d56Sopenharmony_ci 7587db96d56Sopenharmony_ci /* Replace the original character. */ 7597db96d56Sopenharmony_ci output[o] = code; 7607db96d56Sopenharmony_ci /* Mark the second character unused. */ 7617db96d56Sopenharmony_ci assert(cskipped < 20); 7627db96d56Sopenharmony_ci skipped[cskipped++] = i1; 7637db96d56Sopenharmony_ci i1++; 7647db96d56Sopenharmony_ci f = find_nfc_index(nfc_first, output[o]); 7657db96d56Sopenharmony_ci if (f == -1) 7667db96d56Sopenharmony_ci break; 7677db96d56Sopenharmony_ci } 7687db96d56Sopenharmony_ci /* Output character was already written. 7697db96d56Sopenharmony_ci Just advance the indices. */ 7707db96d56Sopenharmony_ci o++; i++; 7717db96d56Sopenharmony_ci } 7727db96d56Sopenharmony_ci if (o == len) { 7737db96d56Sopenharmony_ci /* No changes. Return original string. */ 7747db96d56Sopenharmony_ci PyMem_Free(output); 7757db96d56Sopenharmony_ci return result; 7767db96d56Sopenharmony_ci } 7777db96d56Sopenharmony_ci Py_DECREF(result); 7787db96d56Sopenharmony_ci result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, 7797db96d56Sopenharmony_ci output, o); 7807db96d56Sopenharmony_ci PyMem_Free(output); 7817db96d56Sopenharmony_ci return result; 7827db96d56Sopenharmony_ci} 7837db96d56Sopenharmony_ci 7847db96d56Sopenharmony_ci// This needs to match the logic in makeunicodedata.py 7857db96d56Sopenharmony_ci// which constructs the quickcheck data. 7867db96d56Sopenharmony_citypedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; 7877db96d56Sopenharmony_ci 7887db96d56Sopenharmony_ci/* Run the Unicode normalization "quickcheck" algorithm. 7897db96d56Sopenharmony_ci * 7907db96d56Sopenharmony_ci * Return YES or NO if quickcheck determines the input is certainly 7917db96d56Sopenharmony_ci * normalized or certainly not, and MAYBE if quickcheck is unable to 7927db96d56Sopenharmony_ci * tell. 7937db96d56Sopenharmony_ci * 7947db96d56Sopenharmony_ci * If `yes_only` is true, then return MAYBE as soon as we determine 7957db96d56Sopenharmony_ci * the answer is not YES. 7967db96d56Sopenharmony_ci * 7977db96d56Sopenharmony_ci * For background and details on the algorithm, see UAX #15: 7987db96d56Sopenharmony_ci * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms 7997db96d56Sopenharmony_ci */ 8007db96d56Sopenharmony_cistatic QuickcheckResult 8017db96d56Sopenharmony_ciis_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k, 8027db96d56Sopenharmony_ci bool yes_only) 8037db96d56Sopenharmony_ci{ 8047db96d56Sopenharmony_ci /* UCD 3.2.0 is requested, quickchecks must be disabled. */ 8057db96d56Sopenharmony_ci if (UCD_Check(self)) { 8067db96d56Sopenharmony_ci return MAYBE; 8077db96d56Sopenharmony_ci } 8087db96d56Sopenharmony_ci 8097db96d56Sopenharmony_ci if (PyUnicode_IS_ASCII(input)) { 8107db96d56Sopenharmony_ci return YES; 8117db96d56Sopenharmony_ci } 8127db96d56Sopenharmony_ci 8137db96d56Sopenharmony_ci Py_ssize_t i, len; 8147db96d56Sopenharmony_ci int kind; 8157db96d56Sopenharmony_ci const void *data; 8167db96d56Sopenharmony_ci unsigned char prev_combining = 0; 8177db96d56Sopenharmony_ci 8187db96d56Sopenharmony_ci /* The two quickcheck bits at this shift have type QuickcheckResult. */ 8197db96d56Sopenharmony_ci int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); 8207db96d56Sopenharmony_ci 8217db96d56Sopenharmony_ci QuickcheckResult result = YES; /* certainly normalized, unless we find something */ 8227db96d56Sopenharmony_ci 8237db96d56Sopenharmony_ci i = 0; 8247db96d56Sopenharmony_ci kind = PyUnicode_KIND(input); 8257db96d56Sopenharmony_ci data = PyUnicode_DATA(input); 8267db96d56Sopenharmony_ci len = PyUnicode_GET_LENGTH(input); 8277db96d56Sopenharmony_ci while (i < len) { 8287db96d56Sopenharmony_ci Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 8297db96d56Sopenharmony_ci const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); 8307db96d56Sopenharmony_ci 8317db96d56Sopenharmony_ci unsigned char combining = record->combining; 8327db96d56Sopenharmony_ci if (combining && prev_combining > combining) 8337db96d56Sopenharmony_ci return NO; /* non-canonical sort order, not normalized */ 8347db96d56Sopenharmony_ci prev_combining = combining; 8357db96d56Sopenharmony_ci 8367db96d56Sopenharmony_ci unsigned char quickcheck_whole = record->normalization_quick_check; 8377db96d56Sopenharmony_ci if (yes_only) { 8387db96d56Sopenharmony_ci if (quickcheck_whole & (3 << quickcheck_shift)) 8397db96d56Sopenharmony_ci return MAYBE; 8407db96d56Sopenharmony_ci } else { 8417db96d56Sopenharmony_ci switch ((quickcheck_whole >> quickcheck_shift) & 3) { 8427db96d56Sopenharmony_ci case NO: 8437db96d56Sopenharmony_ci return NO; 8447db96d56Sopenharmony_ci case MAYBE: 8457db96d56Sopenharmony_ci result = MAYBE; /* this string might need normalization */ 8467db96d56Sopenharmony_ci } 8477db96d56Sopenharmony_ci } 8487db96d56Sopenharmony_ci } 8497db96d56Sopenharmony_ci return result; 8507db96d56Sopenharmony_ci} 8517db96d56Sopenharmony_ci 8527db96d56Sopenharmony_ci/*[clinic input] 8537db96d56Sopenharmony_ciunicodedata.UCD.is_normalized 8547db96d56Sopenharmony_ci 8557db96d56Sopenharmony_ci self: self 8567db96d56Sopenharmony_ci form: unicode 8577db96d56Sopenharmony_ci unistr as input: unicode 8587db96d56Sopenharmony_ci / 8597db96d56Sopenharmony_ci 8607db96d56Sopenharmony_ciReturn whether the Unicode string unistr is in the normal form 'form'. 8617db96d56Sopenharmony_ci 8627db96d56Sopenharmony_ciValid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 8637db96d56Sopenharmony_ci[clinic start generated code]*/ 8647db96d56Sopenharmony_ci 8657db96d56Sopenharmony_cistatic PyObject * 8667db96d56Sopenharmony_ciunicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, 8677db96d56Sopenharmony_ci PyObject *input) 8687db96d56Sopenharmony_ci/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ 8697db96d56Sopenharmony_ci{ 8707db96d56Sopenharmony_ci if (PyUnicode_READY(input) == -1) { 8717db96d56Sopenharmony_ci return NULL; 8727db96d56Sopenharmony_ci } 8737db96d56Sopenharmony_ci 8747db96d56Sopenharmony_ci if (PyUnicode_GET_LENGTH(input) == 0) { 8757db96d56Sopenharmony_ci /* special case empty input strings. */ 8767db96d56Sopenharmony_ci Py_RETURN_TRUE; 8777db96d56Sopenharmony_ci } 8787db96d56Sopenharmony_ci 8797db96d56Sopenharmony_ci PyObject *result; 8807db96d56Sopenharmony_ci bool nfc = false; 8817db96d56Sopenharmony_ci bool k = false; 8827db96d56Sopenharmony_ci QuickcheckResult m; 8837db96d56Sopenharmony_ci 8847db96d56Sopenharmony_ci PyObject *cmp; 8857db96d56Sopenharmony_ci int match = 0; 8867db96d56Sopenharmony_ci 8877db96d56Sopenharmony_ci if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) { 8887db96d56Sopenharmony_ci nfc = true; 8897db96d56Sopenharmony_ci } 8907db96d56Sopenharmony_ci else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) { 8917db96d56Sopenharmony_ci nfc = true; 8927db96d56Sopenharmony_ci k = true; 8937db96d56Sopenharmony_ci } 8947db96d56Sopenharmony_ci else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) { 8957db96d56Sopenharmony_ci /* matches default values for `nfc` and `k` */ 8967db96d56Sopenharmony_ci } 8977db96d56Sopenharmony_ci else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) { 8987db96d56Sopenharmony_ci k = true; 8997db96d56Sopenharmony_ci } 9007db96d56Sopenharmony_ci else { 9017db96d56Sopenharmony_ci PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 9027db96d56Sopenharmony_ci return NULL; 9037db96d56Sopenharmony_ci } 9047db96d56Sopenharmony_ci 9057db96d56Sopenharmony_ci m = is_normalized_quickcheck(self, input, nfc, k, false); 9067db96d56Sopenharmony_ci 9077db96d56Sopenharmony_ci if (m == MAYBE) { 9087db96d56Sopenharmony_ci cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); 9097db96d56Sopenharmony_ci if (cmp == NULL) { 9107db96d56Sopenharmony_ci return NULL; 9117db96d56Sopenharmony_ci } 9127db96d56Sopenharmony_ci match = PyUnicode_Compare(input, cmp); 9137db96d56Sopenharmony_ci Py_DECREF(cmp); 9147db96d56Sopenharmony_ci result = (match == 0) ? Py_True : Py_False; 9157db96d56Sopenharmony_ci } 9167db96d56Sopenharmony_ci else { 9177db96d56Sopenharmony_ci result = (m == YES) ? Py_True : Py_False; 9187db96d56Sopenharmony_ci } 9197db96d56Sopenharmony_ci 9207db96d56Sopenharmony_ci Py_INCREF(result); 9217db96d56Sopenharmony_ci return result; 9227db96d56Sopenharmony_ci} 9237db96d56Sopenharmony_ci 9247db96d56Sopenharmony_ci 9257db96d56Sopenharmony_ci/*[clinic input] 9267db96d56Sopenharmony_ciunicodedata.UCD.normalize 9277db96d56Sopenharmony_ci 9287db96d56Sopenharmony_ci self: self 9297db96d56Sopenharmony_ci form: unicode 9307db96d56Sopenharmony_ci unistr as input: unicode 9317db96d56Sopenharmony_ci / 9327db96d56Sopenharmony_ci 9337db96d56Sopenharmony_ciReturn the normal form 'form' for the Unicode string unistr. 9347db96d56Sopenharmony_ci 9357db96d56Sopenharmony_ciValid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 9367db96d56Sopenharmony_ci[clinic start generated code]*/ 9377db96d56Sopenharmony_ci 9387db96d56Sopenharmony_cistatic PyObject * 9397db96d56Sopenharmony_ciunicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, 9407db96d56Sopenharmony_ci PyObject *input) 9417db96d56Sopenharmony_ci/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ 9427db96d56Sopenharmony_ci{ 9437db96d56Sopenharmony_ci if (PyUnicode_GET_LENGTH(input) == 0) { 9447db96d56Sopenharmony_ci /* Special case empty input strings, since resizing 9457db96d56Sopenharmony_ci them later would cause internal errors. */ 9467db96d56Sopenharmony_ci Py_INCREF(input); 9477db96d56Sopenharmony_ci return input; 9487db96d56Sopenharmony_ci } 9497db96d56Sopenharmony_ci 9507db96d56Sopenharmony_ci if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) { 9517db96d56Sopenharmony_ci if (is_normalized_quickcheck(self, input, 9527db96d56Sopenharmony_ci true, false, true) == YES) { 9537db96d56Sopenharmony_ci Py_INCREF(input); 9547db96d56Sopenharmony_ci return input; 9557db96d56Sopenharmony_ci } 9567db96d56Sopenharmony_ci return nfc_nfkc(self, input, 0); 9577db96d56Sopenharmony_ci } 9587db96d56Sopenharmony_ci if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) { 9597db96d56Sopenharmony_ci if (is_normalized_quickcheck(self, input, 9607db96d56Sopenharmony_ci true, true, true) == YES) { 9617db96d56Sopenharmony_ci Py_INCREF(input); 9627db96d56Sopenharmony_ci return input; 9637db96d56Sopenharmony_ci } 9647db96d56Sopenharmony_ci return nfc_nfkc(self, input, 1); 9657db96d56Sopenharmony_ci } 9667db96d56Sopenharmony_ci if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) { 9677db96d56Sopenharmony_ci if (is_normalized_quickcheck(self, input, 9687db96d56Sopenharmony_ci false, false, true) == YES) { 9697db96d56Sopenharmony_ci Py_INCREF(input); 9707db96d56Sopenharmony_ci return input; 9717db96d56Sopenharmony_ci } 9727db96d56Sopenharmony_ci return nfd_nfkd(self, input, 0); 9737db96d56Sopenharmony_ci } 9747db96d56Sopenharmony_ci if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) { 9757db96d56Sopenharmony_ci if (is_normalized_quickcheck(self, input, 9767db96d56Sopenharmony_ci false, true, true) == YES) { 9777db96d56Sopenharmony_ci Py_INCREF(input); 9787db96d56Sopenharmony_ci return input; 9797db96d56Sopenharmony_ci } 9807db96d56Sopenharmony_ci return nfd_nfkd(self, input, 1); 9817db96d56Sopenharmony_ci } 9827db96d56Sopenharmony_ci PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 9837db96d56Sopenharmony_ci return NULL; 9847db96d56Sopenharmony_ci} 9857db96d56Sopenharmony_ci 9867db96d56Sopenharmony_ci/* -------------------------------------------------------------------- */ 9877db96d56Sopenharmony_ci/* unicode character name tables */ 9887db96d56Sopenharmony_ci 9897db96d56Sopenharmony_ci/* data file generated by Tools/unicode/makeunicodedata.py */ 9907db96d56Sopenharmony_ci#include "unicodename_db.h" 9917db96d56Sopenharmony_ci 9927db96d56Sopenharmony_ci/* -------------------------------------------------------------------- */ 9937db96d56Sopenharmony_ci/* database code (cut and pasted from the unidb package) */ 9947db96d56Sopenharmony_ci 9957db96d56Sopenharmony_cistatic unsigned long 9967db96d56Sopenharmony_ci_gethash(const char *s, int len, int scale) 9977db96d56Sopenharmony_ci{ 9987db96d56Sopenharmony_ci int i; 9997db96d56Sopenharmony_ci unsigned long h = 0; 10007db96d56Sopenharmony_ci unsigned long ix; 10017db96d56Sopenharmony_ci for (i = 0; i < len; i++) { 10027db96d56Sopenharmony_ci h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]); 10037db96d56Sopenharmony_ci ix = h & 0xff000000; 10047db96d56Sopenharmony_ci if (ix) 10057db96d56Sopenharmony_ci h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; 10067db96d56Sopenharmony_ci } 10077db96d56Sopenharmony_ci return h; 10087db96d56Sopenharmony_ci} 10097db96d56Sopenharmony_ci 10107db96d56Sopenharmony_cistatic const char * const hangul_syllables[][3] = { 10117db96d56Sopenharmony_ci { "G", "A", "" }, 10127db96d56Sopenharmony_ci { "GG", "AE", "G" }, 10137db96d56Sopenharmony_ci { "N", "YA", "GG" }, 10147db96d56Sopenharmony_ci { "D", "YAE", "GS" }, 10157db96d56Sopenharmony_ci { "DD", "EO", "N", }, 10167db96d56Sopenharmony_ci { "R", "E", "NJ" }, 10177db96d56Sopenharmony_ci { "M", "YEO", "NH" }, 10187db96d56Sopenharmony_ci { "B", "YE", "D" }, 10197db96d56Sopenharmony_ci { "BB", "O", "L" }, 10207db96d56Sopenharmony_ci { "S", "WA", "LG" }, 10217db96d56Sopenharmony_ci { "SS", "WAE", "LM" }, 10227db96d56Sopenharmony_ci { "", "OE", "LB" }, 10237db96d56Sopenharmony_ci { "J", "YO", "LS" }, 10247db96d56Sopenharmony_ci { "JJ", "U", "LT" }, 10257db96d56Sopenharmony_ci { "C", "WEO", "LP" }, 10267db96d56Sopenharmony_ci { "K", "WE", "LH" }, 10277db96d56Sopenharmony_ci { "T", "WI", "M" }, 10287db96d56Sopenharmony_ci { "P", "YU", "B" }, 10297db96d56Sopenharmony_ci { "H", "EU", "BS" }, 10307db96d56Sopenharmony_ci { 0, "YI", "S" }, 10317db96d56Sopenharmony_ci { 0, "I", "SS" }, 10327db96d56Sopenharmony_ci { 0, 0, "NG" }, 10337db96d56Sopenharmony_ci { 0, 0, "J" }, 10347db96d56Sopenharmony_ci { 0, 0, "C" }, 10357db96d56Sopenharmony_ci { 0, 0, "K" }, 10367db96d56Sopenharmony_ci { 0, 0, "T" }, 10377db96d56Sopenharmony_ci { 0, 0, "P" }, 10387db96d56Sopenharmony_ci { 0, 0, "H" } 10397db96d56Sopenharmony_ci}; 10407db96d56Sopenharmony_ci 10417db96d56Sopenharmony_ci/* These ranges need to match makeunicodedata.py:cjk_ranges. */ 10427db96d56Sopenharmony_cistatic int 10437db96d56Sopenharmony_ciis_unified_ideograph(Py_UCS4 code) 10447db96d56Sopenharmony_ci{ 10457db96d56Sopenharmony_ci return 10467db96d56Sopenharmony_ci (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ 10477db96d56Sopenharmony_ci (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */ 10487db96d56Sopenharmony_ci (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */ 10497db96d56Sopenharmony_ci (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */ 10507db96d56Sopenharmony_ci (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ 10517db96d56Sopenharmony_ci (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */ 10527db96d56Sopenharmony_ci (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ 10537db96d56Sopenharmony_ci (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */ 10547db96d56Sopenharmony_ci} 10557db96d56Sopenharmony_ci 10567db96d56Sopenharmony_ci/* macros used to determine if the given code point is in the PUA range that 10577db96d56Sopenharmony_ci * we are using to store aliases and named sequences */ 10587db96d56Sopenharmony_ci#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end)) 10597db96d56Sopenharmony_ci#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \ 10607db96d56Sopenharmony_ci (cp < named_sequences_end)) 10617db96d56Sopenharmony_ci 10627db96d56Sopenharmony_cistatic int 10637db96d56Sopenharmony_ci_getucname(PyObject *self, 10647db96d56Sopenharmony_ci Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq) 10657db96d56Sopenharmony_ci{ 10667db96d56Sopenharmony_ci /* Find the name associated with the given code point. 10677db96d56Sopenharmony_ci * If with_alias_and_seq is 1, check for names in the Private Use Area 15 10687db96d56Sopenharmony_ci * that we are using for aliases and named sequences. */ 10697db96d56Sopenharmony_ci int offset; 10707db96d56Sopenharmony_ci int i; 10717db96d56Sopenharmony_ci int word; 10727db96d56Sopenharmony_ci const unsigned char* w; 10737db96d56Sopenharmony_ci 10747db96d56Sopenharmony_ci if (code >= 0x110000) 10757db96d56Sopenharmony_ci return 0; 10767db96d56Sopenharmony_ci 10777db96d56Sopenharmony_ci /* XXX should we just skip all the code points in the PUAs here? */ 10787db96d56Sopenharmony_ci if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) 10797db96d56Sopenharmony_ci return 0; 10807db96d56Sopenharmony_ci 10817db96d56Sopenharmony_ci if (UCD_Check(self)) { 10827db96d56Sopenharmony_ci /* in 3.2.0 there are no aliases and named sequences */ 10837db96d56Sopenharmony_ci const change_record *old; 10847db96d56Sopenharmony_ci if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) 10857db96d56Sopenharmony_ci return 0; 10867db96d56Sopenharmony_ci old = get_old_record(self, code); 10877db96d56Sopenharmony_ci if (old->category_changed == 0) { 10887db96d56Sopenharmony_ci /* unassigned */ 10897db96d56Sopenharmony_ci return 0; 10907db96d56Sopenharmony_ci } 10917db96d56Sopenharmony_ci } 10927db96d56Sopenharmony_ci 10937db96d56Sopenharmony_ci if (SBase <= code && code < SBase+SCount) { 10947db96d56Sopenharmony_ci /* Hangul syllable. */ 10957db96d56Sopenharmony_ci int SIndex = code - SBase; 10967db96d56Sopenharmony_ci int L = SIndex / NCount; 10977db96d56Sopenharmony_ci int V = (SIndex % NCount) / TCount; 10987db96d56Sopenharmony_ci int T = SIndex % TCount; 10997db96d56Sopenharmony_ci 11007db96d56Sopenharmony_ci if (buflen < 27) 11017db96d56Sopenharmony_ci /* Worst case: HANGUL SYLLABLE <10chars>. */ 11027db96d56Sopenharmony_ci return 0; 11037db96d56Sopenharmony_ci strcpy(buffer, "HANGUL SYLLABLE "); 11047db96d56Sopenharmony_ci buffer += 16; 11057db96d56Sopenharmony_ci strcpy(buffer, hangul_syllables[L][0]); 11067db96d56Sopenharmony_ci buffer += strlen(hangul_syllables[L][0]); 11077db96d56Sopenharmony_ci strcpy(buffer, hangul_syllables[V][1]); 11087db96d56Sopenharmony_ci buffer += strlen(hangul_syllables[V][1]); 11097db96d56Sopenharmony_ci strcpy(buffer, hangul_syllables[T][2]); 11107db96d56Sopenharmony_ci buffer += strlen(hangul_syllables[T][2]); 11117db96d56Sopenharmony_ci *buffer = '\0'; 11127db96d56Sopenharmony_ci return 1; 11137db96d56Sopenharmony_ci } 11147db96d56Sopenharmony_ci 11157db96d56Sopenharmony_ci if (is_unified_ideograph(code)) { 11167db96d56Sopenharmony_ci if (buflen < 28) 11177db96d56Sopenharmony_ci /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ 11187db96d56Sopenharmony_ci return 0; 11197db96d56Sopenharmony_ci sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); 11207db96d56Sopenharmony_ci return 1; 11217db96d56Sopenharmony_ci } 11227db96d56Sopenharmony_ci 11237db96d56Sopenharmony_ci /* get offset into phrasebook */ 11247db96d56Sopenharmony_ci offset = phrasebook_offset1[(code>>phrasebook_shift)]; 11257db96d56Sopenharmony_ci offset = phrasebook_offset2[(offset<<phrasebook_shift) + 11267db96d56Sopenharmony_ci (code&((1<<phrasebook_shift)-1))]; 11277db96d56Sopenharmony_ci if (!offset) 11287db96d56Sopenharmony_ci return 0; 11297db96d56Sopenharmony_ci 11307db96d56Sopenharmony_ci i = 0; 11317db96d56Sopenharmony_ci 11327db96d56Sopenharmony_ci for (;;) { 11337db96d56Sopenharmony_ci /* get word index */ 11347db96d56Sopenharmony_ci word = phrasebook[offset] - phrasebook_short; 11357db96d56Sopenharmony_ci if (word >= 0) { 11367db96d56Sopenharmony_ci word = (word << 8) + phrasebook[offset+1]; 11377db96d56Sopenharmony_ci offset += 2; 11387db96d56Sopenharmony_ci } else 11397db96d56Sopenharmony_ci word = phrasebook[offset++]; 11407db96d56Sopenharmony_ci if (i) { 11417db96d56Sopenharmony_ci if (i > buflen) 11427db96d56Sopenharmony_ci return 0; /* buffer overflow */ 11437db96d56Sopenharmony_ci buffer[i++] = ' '; 11447db96d56Sopenharmony_ci } 11457db96d56Sopenharmony_ci /* copy word string from lexicon. the last character in the 11467db96d56Sopenharmony_ci word has bit 7 set. the last word in a string ends with 11477db96d56Sopenharmony_ci 0x80 */ 11487db96d56Sopenharmony_ci w = lexicon + lexicon_offset[word]; 11497db96d56Sopenharmony_ci while (*w < 128) { 11507db96d56Sopenharmony_ci if (i >= buflen) 11517db96d56Sopenharmony_ci return 0; /* buffer overflow */ 11527db96d56Sopenharmony_ci buffer[i++] = *w++; 11537db96d56Sopenharmony_ci } 11547db96d56Sopenharmony_ci if (i >= buflen) 11557db96d56Sopenharmony_ci return 0; /* buffer overflow */ 11567db96d56Sopenharmony_ci buffer[i++] = *w & 127; 11577db96d56Sopenharmony_ci if (*w == 128) 11587db96d56Sopenharmony_ci break; /* end of word */ 11597db96d56Sopenharmony_ci } 11607db96d56Sopenharmony_ci 11617db96d56Sopenharmony_ci return 1; 11627db96d56Sopenharmony_ci} 11637db96d56Sopenharmony_ci 11647db96d56Sopenharmony_cistatic int 11657db96d56Sopenharmony_cicapi_getucname(Py_UCS4 code, 11667db96d56Sopenharmony_ci char* buffer, int buflen, 11677db96d56Sopenharmony_ci int with_alias_and_seq) 11687db96d56Sopenharmony_ci{ 11697db96d56Sopenharmony_ci return _getucname(NULL, code, buffer, buflen, with_alias_and_seq); 11707db96d56Sopenharmony_ci 11717db96d56Sopenharmony_ci} 11727db96d56Sopenharmony_ci 11737db96d56Sopenharmony_cistatic int 11747db96d56Sopenharmony_ci_cmpname(PyObject *self, int code, const char* name, int namelen) 11757db96d56Sopenharmony_ci{ 11767db96d56Sopenharmony_ci /* check if code corresponds to the given name */ 11777db96d56Sopenharmony_ci int i; 11787db96d56Sopenharmony_ci char buffer[NAME_MAXLEN+1]; 11797db96d56Sopenharmony_ci if (!_getucname(self, code, buffer, NAME_MAXLEN, 1)) 11807db96d56Sopenharmony_ci return 0; 11817db96d56Sopenharmony_ci for (i = 0; i < namelen; i++) { 11827db96d56Sopenharmony_ci if (Py_TOUPPER(name[i]) != buffer[i]) 11837db96d56Sopenharmony_ci return 0; 11847db96d56Sopenharmony_ci } 11857db96d56Sopenharmony_ci return buffer[namelen] == '\0'; 11867db96d56Sopenharmony_ci} 11877db96d56Sopenharmony_ci 11887db96d56Sopenharmony_cistatic void 11897db96d56Sopenharmony_cifind_syllable(const char *str, int *len, int *pos, int count, int column) 11907db96d56Sopenharmony_ci{ 11917db96d56Sopenharmony_ci int i, len1; 11927db96d56Sopenharmony_ci *len = -1; 11937db96d56Sopenharmony_ci for (i = 0; i < count; i++) { 11947db96d56Sopenharmony_ci const char *s = hangul_syllables[i][column]; 11957db96d56Sopenharmony_ci len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int); 11967db96d56Sopenharmony_ci if (len1 <= *len) 11977db96d56Sopenharmony_ci continue; 11987db96d56Sopenharmony_ci if (strncmp(str, s, len1) == 0) { 11997db96d56Sopenharmony_ci *len = len1; 12007db96d56Sopenharmony_ci *pos = i; 12017db96d56Sopenharmony_ci } 12027db96d56Sopenharmony_ci } 12037db96d56Sopenharmony_ci if (*len == -1) { 12047db96d56Sopenharmony_ci *len = 0; 12057db96d56Sopenharmony_ci } 12067db96d56Sopenharmony_ci} 12077db96d56Sopenharmony_ci 12087db96d56Sopenharmony_cistatic int 12097db96d56Sopenharmony_ci_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) 12107db96d56Sopenharmony_ci{ 12117db96d56Sopenharmony_ci /* check if named sequences are allowed */ 12127db96d56Sopenharmony_ci if (!with_named_seq && IS_NAMED_SEQ(cp)) 12137db96d56Sopenharmony_ci return 0; 12147db96d56Sopenharmony_ci /* if the code point is in the PUA range that we use for aliases, 12157db96d56Sopenharmony_ci * convert it to obtain the right code point */ 12167db96d56Sopenharmony_ci if (IS_ALIAS(cp)) 12177db96d56Sopenharmony_ci *code = name_aliases[cp-aliases_start]; 12187db96d56Sopenharmony_ci else 12197db96d56Sopenharmony_ci *code = cp; 12207db96d56Sopenharmony_ci return 1; 12217db96d56Sopenharmony_ci} 12227db96d56Sopenharmony_ci 12237db96d56Sopenharmony_cistatic int 12247db96d56Sopenharmony_ci_getcode(PyObject* self, 12257db96d56Sopenharmony_ci const char* name, int namelen, Py_UCS4* code, int with_named_seq) 12267db96d56Sopenharmony_ci{ 12277db96d56Sopenharmony_ci /* Return the code point associated with the given name. 12287db96d56Sopenharmony_ci * Named aliases are resolved too (unless self != NULL (i.e. we are using 12297db96d56Sopenharmony_ci * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are 12307db96d56Sopenharmony_ci * using for the named sequence, and the caller must then convert it. */ 12317db96d56Sopenharmony_ci unsigned int h, v; 12327db96d56Sopenharmony_ci unsigned int mask = code_size-1; 12337db96d56Sopenharmony_ci unsigned int i, incr; 12347db96d56Sopenharmony_ci 12357db96d56Sopenharmony_ci /* Check for hangul syllables. */ 12367db96d56Sopenharmony_ci if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { 12377db96d56Sopenharmony_ci int len, L = -1, V = -1, T = -1; 12387db96d56Sopenharmony_ci const char *pos = name + 16; 12397db96d56Sopenharmony_ci find_syllable(pos, &len, &L, LCount, 0); 12407db96d56Sopenharmony_ci pos += len; 12417db96d56Sopenharmony_ci find_syllable(pos, &len, &V, VCount, 1); 12427db96d56Sopenharmony_ci pos += len; 12437db96d56Sopenharmony_ci find_syllable(pos, &len, &T, TCount, 2); 12447db96d56Sopenharmony_ci pos += len; 12457db96d56Sopenharmony_ci if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { 12467db96d56Sopenharmony_ci *code = SBase + (L*VCount+V)*TCount + T; 12477db96d56Sopenharmony_ci return 1; 12487db96d56Sopenharmony_ci } 12497db96d56Sopenharmony_ci /* Otherwise, it's an illegal syllable name. */ 12507db96d56Sopenharmony_ci return 0; 12517db96d56Sopenharmony_ci } 12527db96d56Sopenharmony_ci 12537db96d56Sopenharmony_ci /* Check for unified ideographs. */ 12547db96d56Sopenharmony_ci if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { 12557db96d56Sopenharmony_ci /* Four or five hexdigits must follow. */ 12567db96d56Sopenharmony_ci v = 0; 12577db96d56Sopenharmony_ci name += 22; 12587db96d56Sopenharmony_ci namelen -= 22; 12597db96d56Sopenharmony_ci if (namelen != 4 && namelen != 5) 12607db96d56Sopenharmony_ci return 0; 12617db96d56Sopenharmony_ci while (namelen--) { 12627db96d56Sopenharmony_ci v *= 16; 12637db96d56Sopenharmony_ci if (*name >= '0' && *name <= '9') 12647db96d56Sopenharmony_ci v += *name - '0'; 12657db96d56Sopenharmony_ci else if (*name >= 'A' && *name <= 'F') 12667db96d56Sopenharmony_ci v += *name - 'A' + 10; 12677db96d56Sopenharmony_ci else 12687db96d56Sopenharmony_ci return 0; 12697db96d56Sopenharmony_ci name++; 12707db96d56Sopenharmony_ci } 12717db96d56Sopenharmony_ci if (!is_unified_ideograph(v)) 12727db96d56Sopenharmony_ci return 0; 12737db96d56Sopenharmony_ci *code = v; 12747db96d56Sopenharmony_ci return 1; 12757db96d56Sopenharmony_ci } 12767db96d56Sopenharmony_ci 12777db96d56Sopenharmony_ci /* the following is the same as python's dictionary lookup, with 12787db96d56Sopenharmony_ci only minor changes. see the makeunicodedata script for more 12797db96d56Sopenharmony_ci details */ 12807db96d56Sopenharmony_ci 12817db96d56Sopenharmony_ci h = (unsigned int) _gethash(name, namelen, code_magic); 12827db96d56Sopenharmony_ci i = (~h) & mask; 12837db96d56Sopenharmony_ci v = code_hash[i]; 12847db96d56Sopenharmony_ci if (!v) 12857db96d56Sopenharmony_ci return 0; 12867db96d56Sopenharmony_ci if (_cmpname(self, v, name, namelen)) { 12877db96d56Sopenharmony_ci return _check_alias_and_seq(v, code, with_named_seq); 12887db96d56Sopenharmony_ci } 12897db96d56Sopenharmony_ci incr = (h ^ (h >> 3)) & mask; 12907db96d56Sopenharmony_ci if (!incr) 12917db96d56Sopenharmony_ci incr = mask; 12927db96d56Sopenharmony_ci for (;;) { 12937db96d56Sopenharmony_ci i = (i + incr) & mask; 12947db96d56Sopenharmony_ci v = code_hash[i]; 12957db96d56Sopenharmony_ci if (!v) 12967db96d56Sopenharmony_ci return 0; 12977db96d56Sopenharmony_ci if (_cmpname(self, v, name, namelen)) { 12987db96d56Sopenharmony_ci return _check_alias_and_seq(v, code, with_named_seq); 12997db96d56Sopenharmony_ci } 13007db96d56Sopenharmony_ci incr = incr << 1; 13017db96d56Sopenharmony_ci if (incr > mask) 13027db96d56Sopenharmony_ci incr = incr ^ code_poly; 13037db96d56Sopenharmony_ci } 13047db96d56Sopenharmony_ci} 13057db96d56Sopenharmony_ci 13067db96d56Sopenharmony_cistatic int 13077db96d56Sopenharmony_cicapi_getcode(const char* name, int namelen, Py_UCS4* code, 13087db96d56Sopenharmony_ci int with_named_seq) 13097db96d56Sopenharmony_ci{ 13107db96d56Sopenharmony_ci return _getcode(NULL, name, namelen, code, with_named_seq); 13117db96d56Sopenharmony_ci 13127db96d56Sopenharmony_ci} 13137db96d56Sopenharmony_ci 13147db96d56Sopenharmony_cistatic void 13157db96d56Sopenharmony_ciunicodedata_destroy_capi(PyObject *capsule) 13167db96d56Sopenharmony_ci{ 13177db96d56Sopenharmony_ci void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME); 13187db96d56Sopenharmony_ci PyMem_Free(capi); 13197db96d56Sopenharmony_ci} 13207db96d56Sopenharmony_ci 13217db96d56Sopenharmony_cistatic PyObject * 13227db96d56Sopenharmony_ciunicodedata_create_capi(void) 13237db96d56Sopenharmony_ci{ 13247db96d56Sopenharmony_ci _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI)); 13257db96d56Sopenharmony_ci if (capi == NULL) { 13267db96d56Sopenharmony_ci PyErr_NoMemory(); 13277db96d56Sopenharmony_ci return NULL; 13287db96d56Sopenharmony_ci } 13297db96d56Sopenharmony_ci capi->getname = capi_getucname; 13307db96d56Sopenharmony_ci capi->getcode = capi_getcode; 13317db96d56Sopenharmony_ci 13327db96d56Sopenharmony_ci PyObject *capsule = PyCapsule_New(capi, 13337db96d56Sopenharmony_ci PyUnicodeData_CAPSULE_NAME, 13347db96d56Sopenharmony_ci unicodedata_destroy_capi); 13357db96d56Sopenharmony_ci if (capsule == NULL) { 13367db96d56Sopenharmony_ci PyMem_Free(capi); 13377db96d56Sopenharmony_ci } 13387db96d56Sopenharmony_ci return capsule; 13397db96d56Sopenharmony_ci}; 13407db96d56Sopenharmony_ci 13417db96d56Sopenharmony_ci 13427db96d56Sopenharmony_ci/* -------------------------------------------------------------------- */ 13437db96d56Sopenharmony_ci/* Python bindings */ 13447db96d56Sopenharmony_ci 13457db96d56Sopenharmony_ci/*[clinic input] 13467db96d56Sopenharmony_ciunicodedata.UCD.name 13477db96d56Sopenharmony_ci 13487db96d56Sopenharmony_ci self: self 13497db96d56Sopenharmony_ci chr: int(accept={str}) 13507db96d56Sopenharmony_ci default: object=NULL 13517db96d56Sopenharmony_ci / 13527db96d56Sopenharmony_ci 13537db96d56Sopenharmony_ciReturns the name assigned to the character chr as a string. 13547db96d56Sopenharmony_ci 13557db96d56Sopenharmony_ciIf no name is defined, default is returned, or, if not given, 13567db96d56Sopenharmony_ciValueError is raised. 13577db96d56Sopenharmony_ci[clinic start generated code]*/ 13587db96d56Sopenharmony_ci 13597db96d56Sopenharmony_cistatic PyObject * 13607db96d56Sopenharmony_ciunicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) 13617db96d56Sopenharmony_ci/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/ 13627db96d56Sopenharmony_ci{ 13637db96d56Sopenharmony_ci char name[NAME_MAXLEN+1]; 13647db96d56Sopenharmony_ci Py_UCS4 c = (Py_UCS4)chr; 13657db96d56Sopenharmony_ci 13667db96d56Sopenharmony_ci if (!_getucname(self, c, name, NAME_MAXLEN, 0)) { 13677db96d56Sopenharmony_ci if (default_value == NULL) { 13687db96d56Sopenharmony_ci PyErr_SetString(PyExc_ValueError, "no such name"); 13697db96d56Sopenharmony_ci return NULL; 13707db96d56Sopenharmony_ci } 13717db96d56Sopenharmony_ci else { 13727db96d56Sopenharmony_ci Py_INCREF(default_value); 13737db96d56Sopenharmony_ci return default_value; 13747db96d56Sopenharmony_ci } 13757db96d56Sopenharmony_ci } 13767db96d56Sopenharmony_ci 13777db96d56Sopenharmony_ci return PyUnicode_FromString(name); 13787db96d56Sopenharmony_ci} 13797db96d56Sopenharmony_ci 13807db96d56Sopenharmony_ci/*[clinic input] 13817db96d56Sopenharmony_ciunicodedata.UCD.lookup 13827db96d56Sopenharmony_ci 13837db96d56Sopenharmony_ci self: self 13847db96d56Sopenharmony_ci name: str(accept={str, robuffer}, zeroes=True) 13857db96d56Sopenharmony_ci / 13867db96d56Sopenharmony_ci 13877db96d56Sopenharmony_ciLook up character by name. 13887db96d56Sopenharmony_ci 13897db96d56Sopenharmony_ciIf a character with the given name is found, return the 13907db96d56Sopenharmony_cicorresponding character. If not found, KeyError is raised. 13917db96d56Sopenharmony_ci[clinic start generated code]*/ 13927db96d56Sopenharmony_ci 13937db96d56Sopenharmony_cistatic PyObject * 13947db96d56Sopenharmony_ciunicodedata_UCD_lookup_impl(PyObject *self, const char *name, 13957db96d56Sopenharmony_ci Py_ssize_t name_length) 13967db96d56Sopenharmony_ci/*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/ 13977db96d56Sopenharmony_ci{ 13987db96d56Sopenharmony_ci Py_UCS4 code; 13997db96d56Sopenharmony_ci unsigned int index; 14007db96d56Sopenharmony_ci if (name_length > NAME_MAXLEN) { 14017db96d56Sopenharmony_ci PyErr_SetString(PyExc_KeyError, "name too long"); 14027db96d56Sopenharmony_ci return NULL; 14037db96d56Sopenharmony_ci } 14047db96d56Sopenharmony_ci 14057db96d56Sopenharmony_ci if (!_getcode(self, name, (int)name_length, &code, 1)) { 14067db96d56Sopenharmony_ci PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); 14077db96d56Sopenharmony_ci return NULL; 14087db96d56Sopenharmony_ci } 14097db96d56Sopenharmony_ci /* check if code is in the PUA range that we use for named sequences 14107db96d56Sopenharmony_ci and convert it */ 14117db96d56Sopenharmony_ci if (IS_NAMED_SEQ(code)) { 14127db96d56Sopenharmony_ci index = code-named_sequences_start; 14137db96d56Sopenharmony_ci return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, 14147db96d56Sopenharmony_ci named_sequences[index].seq, 14157db96d56Sopenharmony_ci named_sequences[index].seqlen); 14167db96d56Sopenharmony_ci } 14177db96d56Sopenharmony_ci return PyUnicode_FromOrdinal(code); 14187db96d56Sopenharmony_ci} 14197db96d56Sopenharmony_ci 14207db96d56Sopenharmony_ci// List of functions used to define module functions *AND* unicodedata.UCD 14217db96d56Sopenharmony_ci// methods. For module functions, self is the module. For UCD methods, self 14227db96d56Sopenharmony_ci// is an UCD instance. The UCD_Check() macro is used to check if self is 14237db96d56Sopenharmony_ci// an UCD instance. 14247db96d56Sopenharmony_cistatic PyMethodDef unicodedata_functions[] = { 14257db96d56Sopenharmony_ci UNICODEDATA_UCD_DECIMAL_METHODDEF 14267db96d56Sopenharmony_ci UNICODEDATA_UCD_DIGIT_METHODDEF 14277db96d56Sopenharmony_ci UNICODEDATA_UCD_NUMERIC_METHODDEF 14287db96d56Sopenharmony_ci UNICODEDATA_UCD_CATEGORY_METHODDEF 14297db96d56Sopenharmony_ci UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF 14307db96d56Sopenharmony_ci UNICODEDATA_UCD_COMBINING_METHODDEF 14317db96d56Sopenharmony_ci UNICODEDATA_UCD_MIRRORED_METHODDEF 14327db96d56Sopenharmony_ci UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF 14337db96d56Sopenharmony_ci UNICODEDATA_UCD_DECOMPOSITION_METHODDEF 14347db96d56Sopenharmony_ci UNICODEDATA_UCD_NAME_METHODDEF 14357db96d56Sopenharmony_ci UNICODEDATA_UCD_LOOKUP_METHODDEF 14367db96d56Sopenharmony_ci UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF 14377db96d56Sopenharmony_ci UNICODEDATA_UCD_NORMALIZE_METHODDEF 14387db96d56Sopenharmony_ci {NULL, NULL} /* sentinel */ 14397db96d56Sopenharmony_ci}; 14407db96d56Sopenharmony_ci 14417db96d56Sopenharmony_cistatic int 14427db96d56Sopenharmony_ciucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg) 14437db96d56Sopenharmony_ci{ 14447db96d56Sopenharmony_ci Py_VISIT(Py_TYPE(self)); 14457db96d56Sopenharmony_ci return 0; 14467db96d56Sopenharmony_ci} 14477db96d56Sopenharmony_ci 14487db96d56Sopenharmony_cistatic void 14497db96d56Sopenharmony_ciucd_dealloc(PreviousDBVersion *self) 14507db96d56Sopenharmony_ci{ 14517db96d56Sopenharmony_ci PyTypeObject *tp = Py_TYPE(self); 14527db96d56Sopenharmony_ci PyObject_GC_UnTrack(self); 14537db96d56Sopenharmony_ci PyObject_GC_Del(self); 14547db96d56Sopenharmony_ci Py_DECREF(tp); 14557db96d56Sopenharmony_ci} 14567db96d56Sopenharmony_ci 14577db96d56Sopenharmony_cistatic PyType_Slot ucd_type_slots[] = { 14587db96d56Sopenharmony_ci {Py_tp_dealloc, ucd_dealloc}, 14597db96d56Sopenharmony_ci {Py_tp_traverse, ucd_traverse}, 14607db96d56Sopenharmony_ci {Py_tp_getattro, PyObject_GenericGetAttr}, 14617db96d56Sopenharmony_ci {Py_tp_methods, unicodedata_functions}, 14627db96d56Sopenharmony_ci {Py_tp_members, DB_members}, 14637db96d56Sopenharmony_ci {0, 0} 14647db96d56Sopenharmony_ci}; 14657db96d56Sopenharmony_ci 14667db96d56Sopenharmony_cistatic PyType_Spec ucd_type_spec = { 14677db96d56Sopenharmony_ci .name = "unicodedata.UCD", 14687db96d56Sopenharmony_ci .basicsize = sizeof(PreviousDBVersion), 14697db96d56Sopenharmony_ci .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | 14707db96d56Sopenharmony_ci Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE), 14717db96d56Sopenharmony_ci .slots = ucd_type_slots 14727db96d56Sopenharmony_ci}; 14737db96d56Sopenharmony_ci 14747db96d56Sopenharmony_ciPyDoc_STRVAR(unicodedata_docstring, 14757db96d56Sopenharmony_ci"This module provides access to the Unicode Character Database which\n\ 14767db96d56Sopenharmony_cidefines character properties for all Unicode characters. The data in\n\ 14777db96d56Sopenharmony_cithis database is based on the UnicodeData.txt file version\n\ 14787db96d56Sopenharmony_ci" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\ 14797db96d56Sopenharmony_ci\n\ 14807db96d56Sopenharmony_ciThe module uses the same names and symbols as defined by the\n\ 14817db96d56Sopenharmony_ciUnicodeData File Format " UNIDATA_VERSION "."); 14827db96d56Sopenharmony_ci 14837db96d56Sopenharmony_cistatic int 14847db96d56Sopenharmony_ciunicodedata_exec(PyObject *module) 14857db96d56Sopenharmony_ci{ 14867db96d56Sopenharmony_ci if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) { 14877db96d56Sopenharmony_ci return -1; 14887db96d56Sopenharmony_ci } 14897db96d56Sopenharmony_ci 14907db96d56Sopenharmony_ci PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec); 14917db96d56Sopenharmony_ci if (ucd_type == NULL) { 14927db96d56Sopenharmony_ci return -1; 14937db96d56Sopenharmony_ci } 14947db96d56Sopenharmony_ci 14957db96d56Sopenharmony_ci if (PyModule_AddType(module, ucd_type) < 0) { 14967db96d56Sopenharmony_ci Py_DECREF(ucd_type); 14977db96d56Sopenharmony_ci return -1; 14987db96d56Sopenharmony_ci } 14997db96d56Sopenharmony_ci 15007db96d56Sopenharmony_ci // Unicode database version 3.2.0 used by the IDNA encoding 15017db96d56Sopenharmony_ci PyObject *v; 15027db96d56Sopenharmony_ci v = new_previous_version(ucd_type, "3.2.0", 15037db96d56Sopenharmony_ci get_change_3_2_0, normalization_3_2_0); 15047db96d56Sopenharmony_ci Py_DECREF(ucd_type); 15057db96d56Sopenharmony_ci if (v == NULL) { 15067db96d56Sopenharmony_ci return -1; 15077db96d56Sopenharmony_ci } 15087db96d56Sopenharmony_ci if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) { 15097db96d56Sopenharmony_ci Py_DECREF(v); 15107db96d56Sopenharmony_ci return -1; 15117db96d56Sopenharmony_ci } 15127db96d56Sopenharmony_ci 15137db96d56Sopenharmony_ci /* Export C API */ 15147db96d56Sopenharmony_ci PyObject *capsule = unicodedata_create_capi(); 15157db96d56Sopenharmony_ci if (capsule == NULL) { 15167db96d56Sopenharmony_ci return -1; 15177db96d56Sopenharmony_ci } 15187db96d56Sopenharmony_ci int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule); 15197db96d56Sopenharmony_ci Py_DECREF(capsule); 15207db96d56Sopenharmony_ci if (rc < 0) { 15217db96d56Sopenharmony_ci return -1; 15227db96d56Sopenharmony_ci } 15237db96d56Sopenharmony_ci return 0; 15247db96d56Sopenharmony_ci} 15257db96d56Sopenharmony_ci 15267db96d56Sopenharmony_cistatic PyModuleDef_Slot unicodedata_slots[] = { 15277db96d56Sopenharmony_ci {Py_mod_exec, unicodedata_exec}, 15287db96d56Sopenharmony_ci {0, NULL} 15297db96d56Sopenharmony_ci}; 15307db96d56Sopenharmony_ci 15317db96d56Sopenharmony_cistatic struct PyModuleDef unicodedata_module = { 15327db96d56Sopenharmony_ci PyModuleDef_HEAD_INIT, 15337db96d56Sopenharmony_ci .m_name = "unicodedata", 15347db96d56Sopenharmony_ci .m_doc = unicodedata_docstring, 15357db96d56Sopenharmony_ci .m_size = 0, 15367db96d56Sopenharmony_ci .m_methods = unicodedata_functions, 15377db96d56Sopenharmony_ci .m_slots = unicodedata_slots, 15387db96d56Sopenharmony_ci}; 15397db96d56Sopenharmony_ci 15407db96d56Sopenharmony_ciPyMODINIT_FUNC 15417db96d56Sopenharmony_ciPyInit_unicodedata(void) 15427db96d56Sopenharmony_ci{ 15437db96d56Sopenharmony_ci return PyModuleDef_Init(&unicodedata_module); 15447db96d56Sopenharmony_ci} 15457db96d56Sopenharmony_ci 15467db96d56Sopenharmony_ci 15477db96d56Sopenharmony_ci/* 15487db96d56Sopenharmony_ciLocal variables: 15497db96d56Sopenharmony_cic-basic-offset: 4 15507db96d56Sopenharmony_ciindent-tabs-mode: nil 15517db96d56Sopenharmony_ciEnd: 15527db96d56Sopenharmony_ci*/ 1553