xref: /third_party/python/Modules/unicodedata.c (revision 7db96d56)
17db96d56Sopenharmony_ci/* ------------------------------------------------------------------------
27db96d56Sopenharmony_ci
37db96d56Sopenharmony_ci   unicodedata -- Provides access to the Unicode database.
47db96d56Sopenharmony_ci
57db96d56Sopenharmony_ci   The current version number is reported in the unidata_version constant.
67db96d56Sopenharmony_ci
77db96d56Sopenharmony_ci   Written by Marc-Andre Lemburg (mal@lemburg.com).
87db96d56Sopenharmony_ci   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
97db96d56Sopenharmony_ci   Modified by Martin v. Löwis (martin@v.loewis.de)
107db96d56Sopenharmony_ci
117db96d56Sopenharmony_ci   Copyright (c) Corporation for National Research Initiatives.
127db96d56Sopenharmony_ci
137db96d56Sopenharmony_ci   ------------------------------------------------------------------------ */
147db96d56Sopenharmony_ci
157db96d56Sopenharmony_ci#ifndef Py_BUILD_CORE_BUILTIN
167db96d56Sopenharmony_ci#  define Py_BUILD_CORE_MODULE 1
177db96d56Sopenharmony_ci#endif
187db96d56Sopenharmony_ci
197db96d56Sopenharmony_ci#define PY_SSIZE_T_CLEAN
207db96d56Sopenharmony_ci
217db96d56Sopenharmony_ci#include "Python.h"
227db96d56Sopenharmony_ci#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
237db96d56Sopenharmony_ci#include "structmember.h"         // PyMemberDef
247db96d56Sopenharmony_ci
257db96d56Sopenharmony_ci#include <stdbool.h>
267db96d56Sopenharmony_ci
277db96d56Sopenharmony_ci/*[clinic input]
287db96d56Sopenharmony_cimodule unicodedata
297db96d56Sopenharmony_ciclass unicodedata.UCD 'PreviousDBVersion *' '<not used>'
307db96d56Sopenharmony_ci[clinic start generated code]*/
317db96d56Sopenharmony_ci/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
327db96d56Sopenharmony_ci
337db96d56Sopenharmony_ci/* character properties */
347db96d56Sopenharmony_ci
/* Per-character property record.  _getrecord_ex() returns a pointer to one
   of these out of _PyUnicode_Database_Records.  The field order must not
   change: the generated table initializes the fields positionally. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
467db96d56Sopenharmony_ci
/* Difference between the current database and a previous Unicode version
   for one code point.

   As the accessors in this file read it:
     - category_changed == 0 is treated as "unassigned in the old version";
     - 0xFF in category_changed, bidir_changed, decimal_changed,
       mirrored_changed or east_asian_width_changed means the value from
       the current database is used;
     - any other value is the old version's value for that property. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double        numeric_changed;
} change_record;
567db96d56Sopenharmony_ci
577db96d56Sopenharmony_ci/* data file generated by Tools/unicode/makeunicodedata.py */
587db96d56Sopenharmony_ci#include "unicodedata_db.h"
597db96d56Sopenharmony_ci
607db96d56Sopenharmony_cistatic const _PyUnicode_DatabaseRecord*
617db96d56Sopenharmony_ci_getrecord_ex(Py_UCS4 code)
627db96d56Sopenharmony_ci{
637db96d56Sopenharmony_ci    int index;
647db96d56Sopenharmony_ci    if (code >= 0x110000)
657db96d56Sopenharmony_ci        index = 0;
667db96d56Sopenharmony_ci    else {
677db96d56Sopenharmony_ci        index = index1[(code>>SHIFT)];
687db96d56Sopenharmony_ci        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
697db96d56Sopenharmony_ci    }
707db96d56Sopenharmony_ci
717db96d56Sopenharmony_ci    return &_PyUnicode_Database_Records[index];
727db96d56Sopenharmony_ci}
737db96d56Sopenharmony_ci
747db96d56Sopenharmony_ci/* ------------- Previous-version API ------------------------------------- */
757db96d56Sopenharmony_citypedef struct previous_version {
767db96d56Sopenharmony_ci    PyObject_HEAD
777db96d56Sopenharmony_ci    const char *name;
787db96d56Sopenharmony_ci    const change_record* (*getrecord)(Py_UCS4);
797db96d56Sopenharmony_ci    Py_UCS4 (*normalization)(Py_UCS4);
807db96d56Sopenharmony_ci} PreviousDBVersion;
817db96d56Sopenharmony_ci
827db96d56Sopenharmony_ci#include "clinic/unicodedata.c.h"
837db96d56Sopenharmony_ci
/* Fetch the change record of code point v through the getrecord callback
   of self.  self must point to a PreviousDBVersion (callers guard this with
   UCD_Check()).  The macro argument is parenthesized so that an expression
   argument is cast as a whole. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion*)(self))->getrecord)(v))
857db96d56Sopenharmony_ci
867db96d56Sopenharmony_cistatic PyMemberDef DB_members[] = {
877db96d56Sopenharmony_ci        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
887db96d56Sopenharmony_ci        {NULL}
897db96d56Sopenharmony_ci};
907db96d56Sopenharmony_ci
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// The argument is evaluated more than once, so it must have no side effects.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
967db96d56Sopenharmony_ci
977db96d56Sopenharmony_cistatic PyObject*
987db96d56Sopenharmony_cinew_previous_version(PyTypeObject *ucd_type,
997db96d56Sopenharmony_ci                     const char*name, const change_record* (*getrecord)(Py_UCS4),
1007db96d56Sopenharmony_ci                     Py_UCS4 (*normalization)(Py_UCS4))
1017db96d56Sopenharmony_ci{
1027db96d56Sopenharmony_ci    PreviousDBVersion *self;
1037db96d56Sopenharmony_ci    self = PyObject_GC_New(PreviousDBVersion, ucd_type);
1047db96d56Sopenharmony_ci    if (self == NULL)
1057db96d56Sopenharmony_ci        return NULL;
1067db96d56Sopenharmony_ci    self->name = name;
1077db96d56Sopenharmony_ci    self->getrecord = getrecord;
1087db96d56Sopenharmony_ci    self->normalization = normalization;
1097db96d56Sopenharmony_ci    PyObject_GC_Track(self);
1107db96d56Sopenharmony_ci    return (PyObject*)self;
1117db96d56Sopenharmony_ci}
1127db96d56Sopenharmony_ci
1137db96d56Sopenharmony_ci
1147db96d56Sopenharmony_ci/* --- Module API --------------------------------------------------------- */
1157db96d56Sopenharmony_ci
1167db96d56Sopenharmony_ci/*[clinic input]
1177db96d56Sopenharmony_ciunicodedata.UCD.decimal
1187db96d56Sopenharmony_ci
1197db96d56Sopenharmony_ci    self: self
1207db96d56Sopenharmony_ci    chr: int(accept={str})
1217db96d56Sopenharmony_ci    default: object=NULL
1227db96d56Sopenharmony_ci    /
1237db96d56Sopenharmony_ci
1247db96d56Sopenharmony_ciConverts a Unicode character into its equivalent decimal value.
1257db96d56Sopenharmony_ci
1267db96d56Sopenharmony_ciReturns the decimal value assigned to the character chr as integer.
1277db96d56Sopenharmony_ciIf no such value is defined, default is returned, or, if not given,
1287db96d56Sopenharmony_ciValueError is raised.
1297db96d56Sopenharmony_ci[clinic start generated code]*/
1307db96d56Sopenharmony_ci
1317db96d56Sopenharmony_cistatic PyObject *
1327db96d56Sopenharmony_ciunicodedata_UCD_decimal_impl(PyObject *self, int chr,
1337db96d56Sopenharmony_ci                             PyObject *default_value)
1347db96d56Sopenharmony_ci/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
1357db96d56Sopenharmony_ci{
1367db96d56Sopenharmony_ci    int have_old = 0;
1377db96d56Sopenharmony_ci    long rc;
1387db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
1397db96d56Sopenharmony_ci
1407db96d56Sopenharmony_ci    if (UCD_Check(self)) {
1417db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
1427db96d56Sopenharmony_ci        if (old->category_changed == 0) {
1437db96d56Sopenharmony_ci            /* unassigned */
1447db96d56Sopenharmony_ci            have_old = 1;
1457db96d56Sopenharmony_ci            rc = -1;
1467db96d56Sopenharmony_ci        }
1477db96d56Sopenharmony_ci        else if (old->decimal_changed != 0xFF) {
1487db96d56Sopenharmony_ci            have_old = 1;
1497db96d56Sopenharmony_ci            rc = old->decimal_changed;
1507db96d56Sopenharmony_ci        }
1517db96d56Sopenharmony_ci    }
1527db96d56Sopenharmony_ci
1537db96d56Sopenharmony_ci    if (!have_old)
1547db96d56Sopenharmony_ci        rc = Py_UNICODE_TODECIMAL(c);
1557db96d56Sopenharmony_ci    if (rc < 0) {
1567db96d56Sopenharmony_ci        if (default_value == NULL) {
1577db96d56Sopenharmony_ci            PyErr_SetString(PyExc_ValueError,
1587db96d56Sopenharmony_ci                            "not a decimal");
1597db96d56Sopenharmony_ci            return NULL;
1607db96d56Sopenharmony_ci        }
1617db96d56Sopenharmony_ci        else {
1627db96d56Sopenharmony_ci            Py_INCREF(default_value);
1637db96d56Sopenharmony_ci            return default_value;
1647db96d56Sopenharmony_ci        }
1657db96d56Sopenharmony_ci    }
1667db96d56Sopenharmony_ci    return PyLong_FromLong(rc);
1677db96d56Sopenharmony_ci}
1687db96d56Sopenharmony_ci
1697db96d56Sopenharmony_ci/*[clinic input]
1707db96d56Sopenharmony_ciunicodedata.UCD.digit
1717db96d56Sopenharmony_ci
1727db96d56Sopenharmony_ci    self: self
1737db96d56Sopenharmony_ci    chr: int(accept={str})
1747db96d56Sopenharmony_ci    default: object=NULL
1757db96d56Sopenharmony_ci    /
1767db96d56Sopenharmony_ci
1777db96d56Sopenharmony_ciConverts a Unicode character into its equivalent digit value.
1787db96d56Sopenharmony_ci
1797db96d56Sopenharmony_ciReturns the digit value assigned to the character chr as integer.
1807db96d56Sopenharmony_ciIf no such value is defined, default is returned, or, if not given,
1817db96d56Sopenharmony_ciValueError is raised.
1827db96d56Sopenharmony_ci[clinic start generated code]*/
1837db96d56Sopenharmony_ci
1847db96d56Sopenharmony_cistatic PyObject *
1857db96d56Sopenharmony_ciunicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
1867db96d56Sopenharmony_ci/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
1877db96d56Sopenharmony_ci{
1887db96d56Sopenharmony_ci    long rc;
1897db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
1907db96d56Sopenharmony_ci    rc = Py_UNICODE_TODIGIT(c);
1917db96d56Sopenharmony_ci    if (rc < 0) {
1927db96d56Sopenharmony_ci        if (default_value == NULL) {
1937db96d56Sopenharmony_ci            PyErr_SetString(PyExc_ValueError, "not a digit");
1947db96d56Sopenharmony_ci            return NULL;
1957db96d56Sopenharmony_ci        }
1967db96d56Sopenharmony_ci        else {
1977db96d56Sopenharmony_ci            Py_INCREF(default_value);
1987db96d56Sopenharmony_ci            return default_value;
1997db96d56Sopenharmony_ci        }
2007db96d56Sopenharmony_ci    }
2017db96d56Sopenharmony_ci    return PyLong_FromLong(rc);
2027db96d56Sopenharmony_ci}
2037db96d56Sopenharmony_ci
2047db96d56Sopenharmony_ci/*[clinic input]
2057db96d56Sopenharmony_ciunicodedata.UCD.numeric
2067db96d56Sopenharmony_ci
2077db96d56Sopenharmony_ci    self: self
2087db96d56Sopenharmony_ci    chr: int(accept={str})
2097db96d56Sopenharmony_ci    default: object=NULL
2107db96d56Sopenharmony_ci    /
2117db96d56Sopenharmony_ci
2127db96d56Sopenharmony_ciConverts a Unicode character into its equivalent numeric value.
2137db96d56Sopenharmony_ci
2147db96d56Sopenharmony_ciReturns the numeric value assigned to the character chr as float.
2157db96d56Sopenharmony_ciIf no such value is defined, default is returned, or, if not given,
2167db96d56Sopenharmony_ciValueError is raised.
2177db96d56Sopenharmony_ci[clinic start generated code]*/
2187db96d56Sopenharmony_ci
2197db96d56Sopenharmony_cistatic PyObject *
2207db96d56Sopenharmony_ciunicodedata_UCD_numeric_impl(PyObject *self, int chr,
2217db96d56Sopenharmony_ci                             PyObject *default_value)
2227db96d56Sopenharmony_ci/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
2237db96d56Sopenharmony_ci{
2247db96d56Sopenharmony_ci    int have_old = 0;
2257db96d56Sopenharmony_ci    double rc;
2267db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
2277db96d56Sopenharmony_ci
2287db96d56Sopenharmony_ci    if (UCD_Check(self)) {
2297db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
2307db96d56Sopenharmony_ci        if (old->category_changed == 0) {
2317db96d56Sopenharmony_ci            /* unassigned */
2327db96d56Sopenharmony_ci            have_old = 1;
2337db96d56Sopenharmony_ci            rc = -1.0;
2347db96d56Sopenharmony_ci        }
2357db96d56Sopenharmony_ci        else if (old->decimal_changed != 0xFF) {
2367db96d56Sopenharmony_ci            have_old = 1;
2377db96d56Sopenharmony_ci            rc = old->decimal_changed;
2387db96d56Sopenharmony_ci        }
2397db96d56Sopenharmony_ci    }
2407db96d56Sopenharmony_ci
2417db96d56Sopenharmony_ci    if (!have_old)
2427db96d56Sopenharmony_ci        rc = Py_UNICODE_TONUMERIC(c);
2437db96d56Sopenharmony_ci    if (rc == -1.0) {
2447db96d56Sopenharmony_ci        if (default_value == NULL) {
2457db96d56Sopenharmony_ci            PyErr_SetString(PyExc_ValueError, "not a numeric character");
2467db96d56Sopenharmony_ci            return NULL;
2477db96d56Sopenharmony_ci        }
2487db96d56Sopenharmony_ci        else {
2497db96d56Sopenharmony_ci            Py_INCREF(default_value);
2507db96d56Sopenharmony_ci            return default_value;
2517db96d56Sopenharmony_ci        }
2527db96d56Sopenharmony_ci    }
2537db96d56Sopenharmony_ci    return PyFloat_FromDouble(rc);
2547db96d56Sopenharmony_ci}
2557db96d56Sopenharmony_ci
2567db96d56Sopenharmony_ci/*[clinic input]
2577db96d56Sopenharmony_ciunicodedata.UCD.category
2587db96d56Sopenharmony_ci
2597db96d56Sopenharmony_ci    self: self
2607db96d56Sopenharmony_ci    chr: int(accept={str})
2617db96d56Sopenharmony_ci    /
2627db96d56Sopenharmony_ci
2637db96d56Sopenharmony_ciReturns the general category assigned to the character chr as string.
2647db96d56Sopenharmony_ci[clinic start generated code]*/
2657db96d56Sopenharmony_ci
2667db96d56Sopenharmony_cistatic PyObject *
2677db96d56Sopenharmony_ciunicodedata_UCD_category_impl(PyObject *self, int chr)
2687db96d56Sopenharmony_ci/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
2697db96d56Sopenharmony_ci{
2707db96d56Sopenharmony_ci    int index;
2717db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
2727db96d56Sopenharmony_ci    index = (int) _getrecord_ex(c)->category;
2737db96d56Sopenharmony_ci    if (UCD_Check(self)) {
2747db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
2757db96d56Sopenharmony_ci        if (old->category_changed != 0xFF)
2767db96d56Sopenharmony_ci            index = old->category_changed;
2777db96d56Sopenharmony_ci    }
2787db96d56Sopenharmony_ci    return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
2797db96d56Sopenharmony_ci}
2807db96d56Sopenharmony_ci
2817db96d56Sopenharmony_ci/*[clinic input]
2827db96d56Sopenharmony_ciunicodedata.UCD.bidirectional
2837db96d56Sopenharmony_ci
2847db96d56Sopenharmony_ci    self: self
2857db96d56Sopenharmony_ci    chr: int(accept={str})
2867db96d56Sopenharmony_ci    /
2877db96d56Sopenharmony_ci
2887db96d56Sopenharmony_ciReturns the bidirectional class assigned to the character chr as string.
2897db96d56Sopenharmony_ci
2907db96d56Sopenharmony_ciIf no such value is defined, an empty string is returned.
2917db96d56Sopenharmony_ci[clinic start generated code]*/
2927db96d56Sopenharmony_ci
2937db96d56Sopenharmony_cistatic PyObject *
2947db96d56Sopenharmony_ciunicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
2957db96d56Sopenharmony_ci/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
2967db96d56Sopenharmony_ci{
2977db96d56Sopenharmony_ci    int index;
2987db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
2997db96d56Sopenharmony_ci    index = (int) _getrecord_ex(c)->bidirectional;
3007db96d56Sopenharmony_ci    if (UCD_Check(self)) {
3017db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
3027db96d56Sopenharmony_ci        if (old->category_changed == 0)
3037db96d56Sopenharmony_ci            index = 0; /* unassigned */
3047db96d56Sopenharmony_ci        else if (old->bidir_changed != 0xFF)
3057db96d56Sopenharmony_ci            index = old->bidir_changed;
3067db96d56Sopenharmony_ci    }
3077db96d56Sopenharmony_ci    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
3087db96d56Sopenharmony_ci}
3097db96d56Sopenharmony_ci
3107db96d56Sopenharmony_ci/*[clinic input]
3117db96d56Sopenharmony_ciunicodedata.UCD.combining -> int
3127db96d56Sopenharmony_ci
3137db96d56Sopenharmony_ci    self: self
3147db96d56Sopenharmony_ci    chr: int(accept={str})
3157db96d56Sopenharmony_ci    /
3167db96d56Sopenharmony_ci
3177db96d56Sopenharmony_ciReturns the canonical combining class assigned to the character chr as integer.
3187db96d56Sopenharmony_ci
3197db96d56Sopenharmony_ciReturns 0 if no combining class is defined.
3207db96d56Sopenharmony_ci[clinic start generated code]*/
3217db96d56Sopenharmony_ci
3227db96d56Sopenharmony_cistatic int
3237db96d56Sopenharmony_ciunicodedata_UCD_combining_impl(PyObject *self, int chr)
3247db96d56Sopenharmony_ci/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
3257db96d56Sopenharmony_ci{
3267db96d56Sopenharmony_ci    int index;
3277db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
3287db96d56Sopenharmony_ci    index = (int) _getrecord_ex(c)->combining;
3297db96d56Sopenharmony_ci    if (UCD_Check(self)) {
3307db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
3317db96d56Sopenharmony_ci        if (old->category_changed == 0)
3327db96d56Sopenharmony_ci            index = 0; /* unassigned */
3337db96d56Sopenharmony_ci    }
3347db96d56Sopenharmony_ci    return index;
3357db96d56Sopenharmony_ci}
3367db96d56Sopenharmony_ci
3377db96d56Sopenharmony_ci/*[clinic input]
3387db96d56Sopenharmony_ciunicodedata.UCD.mirrored -> int
3397db96d56Sopenharmony_ci
3407db96d56Sopenharmony_ci    self: self
3417db96d56Sopenharmony_ci    chr: int(accept={str})
3427db96d56Sopenharmony_ci    /
3437db96d56Sopenharmony_ci
3447db96d56Sopenharmony_ciReturns the mirrored property assigned to the character chr as integer.
3457db96d56Sopenharmony_ci
3467db96d56Sopenharmony_ciReturns 1 if the character has been identified as a "mirrored"
3477db96d56Sopenharmony_cicharacter in bidirectional text, 0 otherwise.
3487db96d56Sopenharmony_ci[clinic start generated code]*/
3497db96d56Sopenharmony_ci
3507db96d56Sopenharmony_cistatic int
3517db96d56Sopenharmony_ciunicodedata_UCD_mirrored_impl(PyObject *self, int chr)
3527db96d56Sopenharmony_ci/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
3537db96d56Sopenharmony_ci{
3547db96d56Sopenharmony_ci    int index;
3557db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
3567db96d56Sopenharmony_ci    index = (int) _getrecord_ex(c)->mirrored;
3577db96d56Sopenharmony_ci    if (UCD_Check(self)) {
3587db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
3597db96d56Sopenharmony_ci        if (old->category_changed == 0)
3607db96d56Sopenharmony_ci            index = 0; /* unassigned */
3617db96d56Sopenharmony_ci        else if (old->mirrored_changed != 0xFF)
3627db96d56Sopenharmony_ci            index = old->mirrored_changed;
3637db96d56Sopenharmony_ci    }
3647db96d56Sopenharmony_ci    return index;
3657db96d56Sopenharmony_ci}
3667db96d56Sopenharmony_ci
3677db96d56Sopenharmony_ci/*[clinic input]
3687db96d56Sopenharmony_ciunicodedata.UCD.east_asian_width
3697db96d56Sopenharmony_ci
3707db96d56Sopenharmony_ci    self: self
3717db96d56Sopenharmony_ci    chr: int(accept={str})
3727db96d56Sopenharmony_ci    /
3737db96d56Sopenharmony_ci
3747db96d56Sopenharmony_ciReturns the east asian width assigned to the character chr as string.
3757db96d56Sopenharmony_ci[clinic start generated code]*/
3767db96d56Sopenharmony_ci
3777db96d56Sopenharmony_cistatic PyObject *
3787db96d56Sopenharmony_ciunicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
3797db96d56Sopenharmony_ci/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
3807db96d56Sopenharmony_ci{
3817db96d56Sopenharmony_ci    int index;
3827db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
3837db96d56Sopenharmony_ci    index = (int) _getrecord_ex(c)->east_asian_width;
3847db96d56Sopenharmony_ci    if (UCD_Check(self)) {
3857db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
3867db96d56Sopenharmony_ci        if (old->category_changed == 0)
3877db96d56Sopenharmony_ci            index = 0; /* unassigned */
3887db96d56Sopenharmony_ci        else if (old->east_asian_width_changed != 0xFF)
3897db96d56Sopenharmony_ci            index = old->east_asian_width_changed;
3907db96d56Sopenharmony_ci    }
3917db96d56Sopenharmony_ci    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
3927db96d56Sopenharmony_ci}
3937db96d56Sopenharmony_ci
3947db96d56Sopenharmony_ci/*[clinic input]
3957db96d56Sopenharmony_ciunicodedata.UCD.decomposition
3967db96d56Sopenharmony_ci
3977db96d56Sopenharmony_ci    self: self
3987db96d56Sopenharmony_ci    chr: int(accept={str})
3997db96d56Sopenharmony_ci    /
4007db96d56Sopenharmony_ci
4017db96d56Sopenharmony_ciReturns the character decomposition mapping assigned to the character chr as string.
4027db96d56Sopenharmony_ci
4037db96d56Sopenharmony_ciAn empty string is returned in case no such mapping is defined.
4047db96d56Sopenharmony_ci[clinic start generated code]*/
4057db96d56Sopenharmony_ci
4067db96d56Sopenharmony_cistatic PyObject *
4077db96d56Sopenharmony_ciunicodedata_UCD_decomposition_impl(PyObject *self, int chr)
4087db96d56Sopenharmony_ci/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
4097db96d56Sopenharmony_ci{
4107db96d56Sopenharmony_ci    char decomp[256];
4117db96d56Sopenharmony_ci    int code, index, count;
4127db96d56Sopenharmony_ci    size_t i;
4137db96d56Sopenharmony_ci    unsigned int prefix_index;
4147db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
4157db96d56Sopenharmony_ci
4167db96d56Sopenharmony_ci    code = (int)c;
4177db96d56Sopenharmony_ci
4187db96d56Sopenharmony_ci    if (UCD_Check(self)) {
4197db96d56Sopenharmony_ci        const change_record *old = get_old_record(self, c);
4207db96d56Sopenharmony_ci        if (old->category_changed == 0)
4217db96d56Sopenharmony_ci            return PyUnicode_FromString(""); /* unassigned */
4227db96d56Sopenharmony_ci    }
4237db96d56Sopenharmony_ci
4247db96d56Sopenharmony_ci    if (code < 0 || code >= 0x110000)
4257db96d56Sopenharmony_ci        index = 0;
4267db96d56Sopenharmony_ci    else {
4277db96d56Sopenharmony_ci        index = decomp_index1[(code>>DECOMP_SHIFT)];
4287db96d56Sopenharmony_ci        index = decomp_index2[(index<<DECOMP_SHIFT)+
4297db96d56Sopenharmony_ci                             (code&((1<<DECOMP_SHIFT)-1))];
4307db96d56Sopenharmony_ci    }
4317db96d56Sopenharmony_ci
4327db96d56Sopenharmony_ci    /* high byte is number of hex bytes (usually one or two), low byte
4337db96d56Sopenharmony_ci       is prefix code (from*/
4347db96d56Sopenharmony_ci    count = decomp_data[index] >> 8;
4357db96d56Sopenharmony_ci
4367db96d56Sopenharmony_ci    /* XXX: could allocate the PyString up front instead
4377db96d56Sopenharmony_ci       (strlen(prefix) + 5 * count + 1 bytes) */
4387db96d56Sopenharmony_ci
4397db96d56Sopenharmony_ci    /* Based on how index is calculated above and decomp_data is generated
4407db96d56Sopenharmony_ci       from Tools/unicode/makeunicodedata.py, it should not be possible
4417db96d56Sopenharmony_ci       to overflow decomp_prefix. */
4427db96d56Sopenharmony_ci    prefix_index = decomp_data[index] & 255;
4437db96d56Sopenharmony_ci    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
4447db96d56Sopenharmony_ci
4457db96d56Sopenharmony_ci    /* copy prefix */
4467db96d56Sopenharmony_ci    i = strlen(decomp_prefix[prefix_index]);
4477db96d56Sopenharmony_ci    memcpy(decomp, decomp_prefix[prefix_index], i);
4487db96d56Sopenharmony_ci
4497db96d56Sopenharmony_ci    while (count-- > 0) {
4507db96d56Sopenharmony_ci        if (i)
4517db96d56Sopenharmony_ci            decomp[i++] = ' ';
4527db96d56Sopenharmony_ci        assert(i < sizeof(decomp));
4537db96d56Sopenharmony_ci        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
4547db96d56Sopenharmony_ci                      decomp_data[++index]);
4557db96d56Sopenharmony_ci        i += strlen(decomp + i);
4567db96d56Sopenharmony_ci    }
4577db96d56Sopenharmony_ci    return PyUnicode_FromStringAndSize(decomp, i);
4587db96d56Sopenharmony_ci}
4597db96d56Sopenharmony_ci
4607db96d56Sopenharmony_cistatic void
4617db96d56Sopenharmony_ciget_decomp_record(PyObject *self, Py_UCS4 code,
4627db96d56Sopenharmony_ci                  int *index, int *prefix, int *count)
4637db96d56Sopenharmony_ci{
4647db96d56Sopenharmony_ci    if (code >= 0x110000) {
4657db96d56Sopenharmony_ci        *index = 0;
4667db96d56Sopenharmony_ci    }
4677db96d56Sopenharmony_ci    else if (UCD_Check(self)
4687db96d56Sopenharmony_ci             && get_old_record(self, code)->category_changed==0) {
4697db96d56Sopenharmony_ci        /* unassigned in old version */
4707db96d56Sopenharmony_ci        *index = 0;
4717db96d56Sopenharmony_ci    }
4727db96d56Sopenharmony_ci    else {
4737db96d56Sopenharmony_ci        *index = decomp_index1[(code>>DECOMP_SHIFT)];
4747db96d56Sopenharmony_ci        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
4757db96d56Sopenharmony_ci                               (code&((1<<DECOMP_SHIFT)-1))];
4767db96d56Sopenharmony_ci    }
4777db96d56Sopenharmony_ci
4787db96d56Sopenharmony_ci    /* high byte is number of hex bytes (usually one or two), low byte
4797db96d56Sopenharmony_ci       is prefix code (from*/
4807db96d56Sopenharmony_ci    *count = decomp_data[*index] >> 8;
4817db96d56Sopenharmony_ci    *prefix = decomp_data[*index] & 255;
4827db96d56Sopenharmony_ci
4837db96d56Sopenharmony_ci    (*index)++;
4847db96d56Sopenharmony_ci}
4857db96d56Sopenharmony_ci
/* Constants for the arithmetic Hangul syllable decomposition.  A syllable
   S in [SBase, SBase+SCount) is split into L, V and (optionally) T parts
   computed from SIndex = S - SBase. */
#define SBase   0xAC00              /* first syllable code point */
#define LBase   0x1100              /* base of the L part */
#define VBase   0x1161              /* base of the V part */
#define TBase   0x11A7              /* base of the T part; T == TBase means none */
#define LCount  19                  /* number of L values */
#define VCount  21                  /* number of V values */
#define TCount  28                  /* number of T values, including "none" */
#define NCount  (VCount*TCount)     /* syllables per L value */
#define SCount  (LCount*NCount)     /* total number of syllables */
4957db96d56Sopenharmony_ci
4967db96d56Sopenharmony_cistatic PyObject*
4977db96d56Sopenharmony_cinfd_nfkd(PyObject *self, PyObject *input, int k)
4987db96d56Sopenharmony_ci{
4997db96d56Sopenharmony_ci    PyObject *result;
5007db96d56Sopenharmony_ci    Py_UCS4 *output;
5017db96d56Sopenharmony_ci    Py_ssize_t i, o, osize;
5027db96d56Sopenharmony_ci    int kind;
5037db96d56Sopenharmony_ci    const void *data;
5047db96d56Sopenharmony_ci    /* Longest decomposition in Unicode 3.2: U+FDFA */
5057db96d56Sopenharmony_ci    Py_UCS4 stack[20];
5067db96d56Sopenharmony_ci    Py_ssize_t space, isize;
5077db96d56Sopenharmony_ci    int index, prefix, count, stackptr;
5087db96d56Sopenharmony_ci    unsigned char prev, cur;
5097db96d56Sopenharmony_ci
5107db96d56Sopenharmony_ci    stackptr = 0;
5117db96d56Sopenharmony_ci    isize = PyUnicode_GET_LENGTH(input);
5127db96d56Sopenharmony_ci    space = isize;
5137db96d56Sopenharmony_ci    /* Overallocate at most 10 characters. */
5147db96d56Sopenharmony_ci    if (space > 10) {
5157db96d56Sopenharmony_ci        if (space <= PY_SSIZE_T_MAX - 10)
5167db96d56Sopenharmony_ci            space += 10;
5177db96d56Sopenharmony_ci    }
5187db96d56Sopenharmony_ci    else {
5197db96d56Sopenharmony_ci        space *= 2;
5207db96d56Sopenharmony_ci    }
5217db96d56Sopenharmony_ci    osize = space;
5227db96d56Sopenharmony_ci    output = PyMem_NEW(Py_UCS4, space);
5237db96d56Sopenharmony_ci    if (!output) {
5247db96d56Sopenharmony_ci        PyErr_NoMemory();
5257db96d56Sopenharmony_ci        return NULL;
5267db96d56Sopenharmony_ci    }
5277db96d56Sopenharmony_ci    i = o = 0;
5287db96d56Sopenharmony_ci    kind = PyUnicode_KIND(input);
5297db96d56Sopenharmony_ci    data = PyUnicode_DATA(input);
5307db96d56Sopenharmony_ci
5317db96d56Sopenharmony_ci    while (i < isize) {
5327db96d56Sopenharmony_ci        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
5337db96d56Sopenharmony_ci        while(stackptr) {
5347db96d56Sopenharmony_ci            Py_UCS4 code = stack[--stackptr];
5357db96d56Sopenharmony_ci            /* Hangul Decomposition adds three characters in
5367db96d56Sopenharmony_ci               a single step, so we need at least that much room. */
5377db96d56Sopenharmony_ci            if (space < 3) {
5387db96d56Sopenharmony_ci                Py_UCS4 *new_output;
5397db96d56Sopenharmony_ci                osize += 10;
5407db96d56Sopenharmony_ci                space += 10;
5417db96d56Sopenharmony_ci                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
5427db96d56Sopenharmony_ci                if (new_output == NULL) {
5437db96d56Sopenharmony_ci                    PyMem_Free(output);
5447db96d56Sopenharmony_ci                    PyErr_NoMemory();
5457db96d56Sopenharmony_ci                    return NULL;
5467db96d56Sopenharmony_ci                }
5477db96d56Sopenharmony_ci                output = new_output;
5487db96d56Sopenharmony_ci            }
5497db96d56Sopenharmony_ci            /* Hangul Decomposition. */
5507db96d56Sopenharmony_ci            if (SBase <= code && code < (SBase+SCount)) {
5517db96d56Sopenharmony_ci                int SIndex = code - SBase;
5527db96d56Sopenharmony_ci                int L = LBase + SIndex / NCount;
5537db96d56Sopenharmony_ci                int V = VBase + (SIndex % NCount) / TCount;
5547db96d56Sopenharmony_ci                int T = TBase + SIndex % TCount;
5557db96d56Sopenharmony_ci                output[o++] = L;
5567db96d56Sopenharmony_ci                output[o++] = V;
5577db96d56Sopenharmony_ci                space -= 2;
5587db96d56Sopenharmony_ci                if (T != TBase) {
5597db96d56Sopenharmony_ci                    output[o++] = T;
5607db96d56Sopenharmony_ci                    space --;
5617db96d56Sopenharmony_ci                }
5627db96d56Sopenharmony_ci                continue;
5637db96d56Sopenharmony_ci            }
5647db96d56Sopenharmony_ci            /* normalization changes */
5657db96d56Sopenharmony_ci            if (UCD_Check(self)) {
5667db96d56Sopenharmony_ci                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
5677db96d56Sopenharmony_ci                if (value != 0) {
5687db96d56Sopenharmony_ci                    stack[stackptr++] = value;
5697db96d56Sopenharmony_ci                    continue;
5707db96d56Sopenharmony_ci                }
5717db96d56Sopenharmony_ci            }
5727db96d56Sopenharmony_ci
5737db96d56Sopenharmony_ci            /* Other decompositions. */
5747db96d56Sopenharmony_ci            get_decomp_record(self, code, &index, &prefix, &count);
5757db96d56Sopenharmony_ci
5767db96d56Sopenharmony_ci            /* Copy character if it is not decomposable, or has a
5777db96d56Sopenharmony_ci               compatibility decomposition, but we do NFD. */
5787db96d56Sopenharmony_ci            if (!count || (prefix && !k)) {
5797db96d56Sopenharmony_ci                output[o++] = code;
5807db96d56Sopenharmony_ci                space--;
5817db96d56Sopenharmony_ci                continue;
5827db96d56Sopenharmony_ci            }
5837db96d56Sopenharmony_ci            /* Copy decomposition onto the stack, in reverse
5847db96d56Sopenharmony_ci               order.  */
5857db96d56Sopenharmony_ci            while(count) {
5867db96d56Sopenharmony_ci                code = decomp_data[index + (--count)];
5877db96d56Sopenharmony_ci                stack[stackptr++] = code;
5887db96d56Sopenharmony_ci            }
5897db96d56Sopenharmony_ci        }
5907db96d56Sopenharmony_ci    }
5917db96d56Sopenharmony_ci
5927db96d56Sopenharmony_ci    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
5937db96d56Sopenharmony_ci                                       output, o);
5947db96d56Sopenharmony_ci    PyMem_Free(output);
5957db96d56Sopenharmony_ci    if (!result)
5967db96d56Sopenharmony_ci        return NULL;
5977db96d56Sopenharmony_ci    /* result is guaranteed to be ready, as it is compact. */
5987db96d56Sopenharmony_ci    kind = PyUnicode_KIND(result);
5997db96d56Sopenharmony_ci    data = PyUnicode_DATA(result);
6007db96d56Sopenharmony_ci
6017db96d56Sopenharmony_ci    /* Sort canonically. */
6027db96d56Sopenharmony_ci    i = 0;
6037db96d56Sopenharmony_ci    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
6047db96d56Sopenharmony_ci    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
6057db96d56Sopenharmony_ci        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
6067db96d56Sopenharmony_ci        if (prev == 0 || cur == 0 || prev <= cur) {
6077db96d56Sopenharmony_ci            prev = cur;
6087db96d56Sopenharmony_ci            continue;
6097db96d56Sopenharmony_ci        }
6107db96d56Sopenharmony_ci        /* Non-canonical order. Need to switch *i with previous. */
6117db96d56Sopenharmony_ci        o = i - 1;
6127db96d56Sopenharmony_ci        while (1) {
6137db96d56Sopenharmony_ci            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
6147db96d56Sopenharmony_ci            PyUnicode_WRITE(kind, data, o+1,
6157db96d56Sopenharmony_ci                            PyUnicode_READ(kind, data, o));
6167db96d56Sopenharmony_ci            PyUnicode_WRITE(kind, data, o, tmp);
6177db96d56Sopenharmony_ci            o--;
6187db96d56Sopenharmony_ci            if (o < 0)
6197db96d56Sopenharmony_ci                break;
6207db96d56Sopenharmony_ci            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
6217db96d56Sopenharmony_ci            if (prev == 0 || prev <= cur)
6227db96d56Sopenharmony_ci                break;
6237db96d56Sopenharmony_ci        }
6247db96d56Sopenharmony_ci        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
6257db96d56Sopenharmony_ci    }
6267db96d56Sopenharmony_ci    return result;
6277db96d56Sopenharmony_ci}
6287db96d56Sopenharmony_ci
6297db96d56Sopenharmony_cistatic int
6307db96d56Sopenharmony_cifind_nfc_index(const struct reindex* nfc, Py_UCS4 code)
6317db96d56Sopenharmony_ci{
6327db96d56Sopenharmony_ci    unsigned int index;
6337db96d56Sopenharmony_ci    for (index = 0; nfc[index].start; index++) {
6347db96d56Sopenharmony_ci        unsigned int start = nfc[index].start;
6357db96d56Sopenharmony_ci        if (code < start)
6367db96d56Sopenharmony_ci            return -1;
6377db96d56Sopenharmony_ci        if (code <= start + nfc[index].count) {
6387db96d56Sopenharmony_ci            unsigned int delta = code - start;
6397db96d56Sopenharmony_ci            return nfc[index].index + delta;
6407db96d56Sopenharmony_ci        }
6417db96d56Sopenharmony_ci    }
6427db96d56Sopenharmony_ci    return -1;
6437db96d56Sopenharmony_ci}
6447db96d56Sopenharmony_ci
6457db96d56Sopenharmony_cistatic PyObject*
6467db96d56Sopenharmony_cinfc_nfkc(PyObject *self, PyObject *input, int k)
6477db96d56Sopenharmony_ci{
6487db96d56Sopenharmony_ci    PyObject *result;
6497db96d56Sopenharmony_ci    int kind;
6507db96d56Sopenharmony_ci    const void *data;
6517db96d56Sopenharmony_ci    Py_UCS4 *output;
6527db96d56Sopenharmony_ci    Py_ssize_t i, i1, o, len;
6537db96d56Sopenharmony_ci    int f,l,index,index1,comb;
6547db96d56Sopenharmony_ci    Py_UCS4 code;
6557db96d56Sopenharmony_ci    Py_ssize_t skipped[20];
6567db96d56Sopenharmony_ci    int cskipped = 0;
6577db96d56Sopenharmony_ci
6587db96d56Sopenharmony_ci    result = nfd_nfkd(self, input, k);
6597db96d56Sopenharmony_ci    if (!result)
6607db96d56Sopenharmony_ci        return NULL;
6617db96d56Sopenharmony_ci    /* result will be "ready". */
6627db96d56Sopenharmony_ci    kind = PyUnicode_KIND(result);
6637db96d56Sopenharmony_ci    data = PyUnicode_DATA(result);
6647db96d56Sopenharmony_ci    len = PyUnicode_GET_LENGTH(result);
6657db96d56Sopenharmony_ci
6667db96d56Sopenharmony_ci    /* We allocate a buffer for the output.
6677db96d56Sopenharmony_ci       If we find that we made no changes, we still return
6687db96d56Sopenharmony_ci       the NFD result. */
6697db96d56Sopenharmony_ci    output = PyMem_NEW(Py_UCS4, len);
6707db96d56Sopenharmony_ci    if (!output) {
6717db96d56Sopenharmony_ci        PyErr_NoMemory();
6727db96d56Sopenharmony_ci        Py_DECREF(result);
6737db96d56Sopenharmony_ci        return 0;
6747db96d56Sopenharmony_ci    }
6757db96d56Sopenharmony_ci    i = o = 0;
6767db96d56Sopenharmony_ci
6777db96d56Sopenharmony_ci  again:
6787db96d56Sopenharmony_ci    while (i < len) {
6797db96d56Sopenharmony_ci      for (index = 0; index < cskipped; index++) {
6807db96d56Sopenharmony_ci          if (skipped[index] == i) {
6817db96d56Sopenharmony_ci              /* *i character is skipped.
6827db96d56Sopenharmony_ci                 Remove from list. */
6837db96d56Sopenharmony_ci              skipped[index] = skipped[cskipped-1];
6847db96d56Sopenharmony_ci              cskipped--;
6857db96d56Sopenharmony_ci              i++;
6867db96d56Sopenharmony_ci              goto again; /* continue while */
6877db96d56Sopenharmony_ci          }
6887db96d56Sopenharmony_ci      }
6897db96d56Sopenharmony_ci      /* Hangul Composition. We don't need to check for <LV,T>
6907db96d56Sopenharmony_ci         pairs, since we always have decomposed data. */
6917db96d56Sopenharmony_ci      code = PyUnicode_READ(kind, data, i);
6927db96d56Sopenharmony_ci      if (LBase <= code && code < (LBase+LCount) &&
6937db96d56Sopenharmony_ci          i + 1 < len &&
6947db96d56Sopenharmony_ci          VBase <= PyUnicode_READ(kind, data, i+1) &&
6957db96d56Sopenharmony_ci          PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
6967db96d56Sopenharmony_ci          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
6977db96d56Sopenharmony_ci             and V character is a modern vowel (0x1161 ~ 0x1175). */
6987db96d56Sopenharmony_ci          int LIndex, VIndex;
6997db96d56Sopenharmony_ci          LIndex = code - LBase;
7007db96d56Sopenharmony_ci          VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
7017db96d56Sopenharmony_ci          code = SBase + (LIndex*VCount+VIndex)*TCount;
7027db96d56Sopenharmony_ci          i+=2;
7037db96d56Sopenharmony_ci          if (i < len &&
7047db96d56Sopenharmony_ci              TBase < PyUnicode_READ(kind, data, i) &&
7057db96d56Sopenharmony_ci              PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
7067db96d56Sopenharmony_ci              /* check T character is a modern trailing consonant
7077db96d56Sopenharmony_ci                 (0x11A8 ~ 0x11C2). */
7087db96d56Sopenharmony_ci              code += PyUnicode_READ(kind, data, i)-TBase;
7097db96d56Sopenharmony_ci              i++;
7107db96d56Sopenharmony_ci          }
7117db96d56Sopenharmony_ci          output[o++] = code;
7127db96d56Sopenharmony_ci          continue;
7137db96d56Sopenharmony_ci      }
7147db96d56Sopenharmony_ci
7157db96d56Sopenharmony_ci      /* code is still input[i] here */
7167db96d56Sopenharmony_ci      f = find_nfc_index(nfc_first, code);
7177db96d56Sopenharmony_ci      if (f == -1) {
7187db96d56Sopenharmony_ci          output[o++] = code;
7197db96d56Sopenharmony_ci          i++;
7207db96d56Sopenharmony_ci          continue;
7217db96d56Sopenharmony_ci      }
7227db96d56Sopenharmony_ci      /* Find next unblocked character. */
7237db96d56Sopenharmony_ci      i1 = i+1;
7247db96d56Sopenharmony_ci      comb = 0;
7257db96d56Sopenharmony_ci      /* output base character for now; might be updated later. */
7267db96d56Sopenharmony_ci      output[o] = PyUnicode_READ(kind, data, i);
7277db96d56Sopenharmony_ci      while (i1 < len) {
7287db96d56Sopenharmony_ci          Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
7297db96d56Sopenharmony_ci          int comb1 = _getrecord_ex(code1)->combining;
7307db96d56Sopenharmony_ci          if (comb) {
7317db96d56Sopenharmony_ci              if (comb1 == 0)
7327db96d56Sopenharmony_ci                  break;
7337db96d56Sopenharmony_ci              if (comb >= comb1) {
7347db96d56Sopenharmony_ci                  /* Character is blocked. */
7357db96d56Sopenharmony_ci                  i1++;
7367db96d56Sopenharmony_ci                  continue;
7377db96d56Sopenharmony_ci              }
7387db96d56Sopenharmony_ci          }
7397db96d56Sopenharmony_ci          l = find_nfc_index(nfc_last, code1);
7407db96d56Sopenharmony_ci          /* i1 cannot be combined with i. If i1
7417db96d56Sopenharmony_ci             is a starter, we don't need to look further.
7427db96d56Sopenharmony_ci             Otherwise, record the combining class. */
7437db96d56Sopenharmony_ci          if (l == -1) {
7447db96d56Sopenharmony_ci            not_combinable:
7457db96d56Sopenharmony_ci              if (comb1 == 0)
7467db96d56Sopenharmony_ci                  break;
7477db96d56Sopenharmony_ci              comb = comb1;
7487db96d56Sopenharmony_ci              i1++;
7497db96d56Sopenharmony_ci              continue;
7507db96d56Sopenharmony_ci          }
7517db96d56Sopenharmony_ci          index = f*TOTAL_LAST + l;
7527db96d56Sopenharmony_ci          index1 = comp_index[index >> COMP_SHIFT];
7537db96d56Sopenharmony_ci          code = comp_data[(index1<<COMP_SHIFT)+
7547db96d56Sopenharmony_ci                           (index&((1<<COMP_SHIFT)-1))];
7557db96d56Sopenharmony_ci          if (code == 0)
7567db96d56Sopenharmony_ci              goto not_combinable;
7577db96d56Sopenharmony_ci
7587db96d56Sopenharmony_ci          /* Replace the original character. */
7597db96d56Sopenharmony_ci          output[o] = code;
7607db96d56Sopenharmony_ci          /* Mark the second character unused. */
7617db96d56Sopenharmony_ci          assert(cskipped < 20);
7627db96d56Sopenharmony_ci          skipped[cskipped++] = i1;
7637db96d56Sopenharmony_ci          i1++;
7647db96d56Sopenharmony_ci          f = find_nfc_index(nfc_first, output[o]);
7657db96d56Sopenharmony_ci          if (f == -1)
7667db96d56Sopenharmony_ci              break;
7677db96d56Sopenharmony_ci      }
7687db96d56Sopenharmony_ci      /* Output character was already written.
7697db96d56Sopenharmony_ci         Just advance the indices. */
7707db96d56Sopenharmony_ci      o++; i++;
7717db96d56Sopenharmony_ci    }
7727db96d56Sopenharmony_ci    if (o == len) {
7737db96d56Sopenharmony_ci        /* No changes. Return original string. */
7747db96d56Sopenharmony_ci        PyMem_Free(output);
7757db96d56Sopenharmony_ci        return result;
7767db96d56Sopenharmony_ci    }
7777db96d56Sopenharmony_ci    Py_DECREF(result);
7787db96d56Sopenharmony_ci    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
7797db96d56Sopenharmony_ci                                       output, o);
7807db96d56Sopenharmony_ci    PyMem_Free(output);
7817db96d56Sopenharmony_ci    return result;
7827db96d56Sopenharmony_ci}
7837db96d56Sopenharmony_ci
/* Tri-state outcome of a normalization quickcheck.
   The numeric values need to match the logic in makeunicodedata.py,
   which constructs the quickcheck data. */
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
7877db96d56Sopenharmony_ci
7887db96d56Sopenharmony_ci/* Run the Unicode normalization "quickcheck" algorithm.
7897db96d56Sopenharmony_ci *
7907db96d56Sopenharmony_ci * Return YES or NO if quickcheck determines the input is certainly
7917db96d56Sopenharmony_ci * normalized or certainly not, and MAYBE if quickcheck is unable to
7927db96d56Sopenharmony_ci * tell.
7937db96d56Sopenharmony_ci *
7947db96d56Sopenharmony_ci * If `yes_only` is true, then return MAYBE as soon as we determine
7957db96d56Sopenharmony_ci * the answer is not YES.
7967db96d56Sopenharmony_ci *
7977db96d56Sopenharmony_ci * For background and details on the algorithm, see UAX #15:
7987db96d56Sopenharmony_ci *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
7997db96d56Sopenharmony_ci */
8007db96d56Sopenharmony_cistatic QuickcheckResult
8017db96d56Sopenharmony_ciis_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
8027db96d56Sopenharmony_ci                         bool yes_only)
8037db96d56Sopenharmony_ci{
8047db96d56Sopenharmony_ci    /* UCD 3.2.0 is requested, quickchecks must be disabled. */
8057db96d56Sopenharmony_ci    if (UCD_Check(self)) {
8067db96d56Sopenharmony_ci        return MAYBE;
8077db96d56Sopenharmony_ci    }
8087db96d56Sopenharmony_ci
8097db96d56Sopenharmony_ci    if (PyUnicode_IS_ASCII(input)) {
8107db96d56Sopenharmony_ci        return YES;
8117db96d56Sopenharmony_ci    }
8127db96d56Sopenharmony_ci
8137db96d56Sopenharmony_ci    Py_ssize_t i, len;
8147db96d56Sopenharmony_ci    int kind;
8157db96d56Sopenharmony_ci    const void *data;
8167db96d56Sopenharmony_ci    unsigned char prev_combining = 0;
8177db96d56Sopenharmony_ci
8187db96d56Sopenharmony_ci    /* The two quickcheck bits at this shift have type QuickcheckResult. */
8197db96d56Sopenharmony_ci    int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
8207db96d56Sopenharmony_ci
8217db96d56Sopenharmony_ci    QuickcheckResult result = YES; /* certainly normalized, unless we find something */
8227db96d56Sopenharmony_ci
8237db96d56Sopenharmony_ci    i = 0;
8247db96d56Sopenharmony_ci    kind = PyUnicode_KIND(input);
8257db96d56Sopenharmony_ci    data = PyUnicode_DATA(input);
8267db96d56Sopenharmony_ci    len = PyUnicode_GET_LENGTH(input);
8277db96d56Sopenharmony_ci    while (i < len) {
8287db96d56Sopenharmony_ci        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
8297db96d56Sopenharmony_ci        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
8307db96d56Sopenharmony_ci
8317db96d56Sopenharmony_ci        unsigned char combining = record->combining;
8327db96d56Sopenharmony_ci        if (combining && prev_combining > combining)
8337db96d56Sopenharmony_ci            return NO; /* non-canonical sort order, not normalized */
8347db96d56Sopenharmony_ci        prev_combining = combining;
8357db96d56Sopenharmony_ci
8367db96d56Sopenharmony_ci        unsigned char quickcheck_whole = record->normalization_quick_check;
8377db96d56Sopenharmony_ci        if (yes_only) {
8387db96d56Sopenharmony_ci            if (quickcheck_whole & (3 << quickcheck_shift))
8397db96d56Sopenharmony_ci                return MAYBE;
8407db96d56Sopenharmony_ci        } else {
8417db96d56Sopenharmony_ci            switch ((quickcheck_whole >> quickcheck_shift) & 3) {
8427db96d56Sopenharmony_ci            case NO:
8437db96d56Sopenharmony_ci              return NO;
8447db96d56Sopenharmony_ci            case MAYBE:
8457db96d56Sopenharmony_ci              result = MAYBE; /* this string might need normalization */
8467db96d56Sopenharmony_ci            }
8477db96d56Sopenharmony_ci        }
8487db96d56Sopenharmony_ci    }
8497db96d56Sopenharmony_ci    return result;
8507db96d56Sopenharmony_ci}
8517db96d56Sopenharmony_ci
8527db96d56Sopenharmony_ci/*[clinic input]
8537db96d56Sopenharmony_ciunicodedata.UCD.is_normalized
8547db96d56Sopenharmony_ci
8557db96d56Sopenharmony_ci    self: self
8567db96d56Sopenharmony_ci    form: unicode
8577db96d56Sopenharmony_ci    unistr as input: unicode
8587db96d56Sopenharmony_ci    /
8597db96d56Sopenharmony_ci
8607db96d56Sopenharmony_ciReturn whether the Unicode string unistr is in the normal form 'form'.
8617db96d56Sopenharmony_ci
8627db96d56Sopenharmony_ciValid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
8637db96d56Sopenharmony_ci[clinic start generated code]*/
8647db96d56Sopenharmony_ci
8657db96d56Sopenharmony_cistatic PyObject *
8667db96d56Sopenharmony_ciunicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
8677db96d56Sopenharmony_ci                                   PyObject *input)
8687db96d56Sopenharmony_ci/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
8697db96d56Sopenharmony_ci{
8707db96d56Sopenharmony_ci    if (PyUnicode_READY(input) == -1) {
8717db96d56Sopenharmony_ci        return NULL;
8727db96d56Sopenharmony_ci    }
8737db96d56Sopenharmony_ci
8747db96d56Sopenharmony_ci    if (PyUnicode_GET_LENGTH(input) == 0) {
8757db96d56Sopenharmony_ci        /* special case empty input strings. */
8767db96d56Sopenharmony_ci        Py_RETURN_TRUE;
8777db96d56Sopenharmony_ci    }
8787db96d56Sopenharmony_ci
8797db96d56Sopenharmony_ci    PyObject *result;
8807db96d56Sopenharmony_ci    bool nfc = false;
8817db96d56Sopenharmony_ci    bool k = false;
8827db96d56Sopenharmony_ci    QuickcheckResult m;
8837db96d56Sopenharmony_ci
8847db96d56Sopenharmony_ci    PyObject *cmp;
8857db96d56Sopenharmony_ci    int match = 0;
8867db96d56Sopenharmony_ci
8877db96d56Sopenharmony_ci    if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
8887db96d56Sopenharmony_ci        nfc = true;
8897db96d56Sopenharmony_ci    }
8907db96d56Sopenharmony_ci    else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
8917db96d56Sopenharmony_ci        nfc = true;
8927db96d56Sopenharmony_ci        k = true;
8937db96d56Sopenharmony_ci    }
8947db96d56Sopenharmony_ci    else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
8957db96d56Sopenharmony_ci        /* matches default values for `nfc` and `k` */
8967db96d56Sopenharmony_ci    }
8977db96d56Sopenharmony_ci    else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
8987db96d56Sopenharmony_ci        k = true;
8997db96d56Sopenharmony_ci    }
9007db96d56Sopenharmony_ci    else {
9017db96d56Sopenharmony_ci        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
9027db96d56Sopenharmony_ci        return NULL;
9037db96d56Sopenharmony_ci    }
9047db96d56Sopenharmony_ci
9057db96d56Sopenharmony_ci    m = is_normalized_quickcheck(self, input, nfc, k, false);
9067db96d56Sopenharmony_ci
9077db96d56Sopenharmony_ci    if (m == MAYBE) {
9087db96d56Sopenharmony_ci        cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
9097db96d56Sopenharmony_ci        if (cmp == NULL) {
9107db96d56Sopenharmony_ci            return NULL;
9117db96d56Sopenharmony_ci        }
9127db96d56Sopenharmony_ci        match = PyUnicode_Compare(input, cmp);
9137db96d56Sopenharmony_ci        Py_DECREF(cmp);
9147db96d56Sopenharmony_ci        result = (match == 0) ? Py_True : Py_False;
9157db96d56Sopenharmony_ci    }
9167db96d56Sopenharmony_ci    else {
9177db96d56Sopenharmony_ci        result = (m == YES) ? Py_True : Py_False;
9187db96d56Sopenharmony_ci    }
9197db96d56Sopenharmony_ci
9207db96d56Sopenharmony_ci    Py_INCREF(result);
9217db96d56Sopenharmony_ci    return result;
9227db96d56Sopenharmony_ci}
9237db96d56Sopenharmony_ci
9247db96d56Sopenharmony_ci
9257db96d56Sopenharmony_ci/*[clinic input]
9267db96d56Sopenharmony_ciunicodedata.UCD.normalize
9277db96d56Sopenharmony_ci
9287db96d56Sopenharmony_ci    self: self
9297db96d56Sopenharmony_ci    form: unicode
9307db96d56Sopenharmony_ci    unistr as input: unicode
9317db96d56Sopenharmony_ci    /
9327db96d56Sopenharmony_ci
9337db96d56Sopenharmony_ciReturn the normal form 'form' for the Unicode string unistr.
9347db96d56Sopenharmony_ci
9357db96d56Sopenharmony_ciValid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
9367db96d56Sopenharmony_ci[clinic start generated code]*/
9377db96d56Sopenharmony_ci
9387db96d56Sopenharmony_cistatic PyObject *
9397db96d56Sopenharmony_ciunicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
9407db96d56Sopenharmony_ci                               PyObject *input)
9417db96d56Sopenharmony_ci/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
9427db96d56Sopenharmony_ci{
9437db96d56Sopenharmony_ci    if (PyUnicode_GET_LENGTH(input) == 0) {
9447db96d56Sopenharmony_ci        /* Special case empty input strings, since resizing
9457db96d56Sopenharmony_ci           them  later would cause internal errors. */
9467db96d56Sopenharmony_ci        Py_INCREF(input);
9477db96d56Sopenharmony_ci        return input;
9487db96d56Sopenharmony_ci    }
9497db96d56Sopenharmony_ci
9507db96d56Sopenharmony_ci    if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
9517db96d56Sopenharmony_ci        if (is_normalized_quickcheck(self, input,
9527db96d56Sopenharmony_ci                                     true,  false, true) == YES) {
9537db96d56Sopenharmony_ci            Py_INCREF(input);
9547db96d56Sopenharmony_ci            return input;
9557db96d56Sopenharmony_ci        }
9567db96d56Sopenharmony_ci        return nfc_nfkc(self, input, 0);
9577db96d56Sopenharmony_ci    }
9587db96d56Sopenharmony_ci    if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
9597db96d56Sopenharmony_ci        if (is_normalized_quickcheck(self, input,
9607db96d56Sopenharmony_ci                                     true,  true,  true) == YES) {
9617db96d56Sopenharmony_ci            Py_INCREF(input);
9627db96d56Sopenharmony_ci            return input;
9637db96d56Sopenharmony_ci        }
9647db96d56Sopenharmony_ci        return nfc_nfkc(self, input, 1);
9657db96d56Sopenharmony_ci    }
9667db96d56Sopenharmony_ci    if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
9677db96d56Sopenharmony_ci        if (is_normalized_quickcheck(self, input,
9687db96d56Sopenharmony_ci                                     false, false, true) == YES) {
9697db96d56Sopenharmony_ci            Py_INCREF(input);
9707db96d56Sopenharmony_ci            return input;
9717db96d56Sopenharmony_ci        }
9727db96d56Sopenharmony_ci        return nfd_nfkd(self, input, 0);
9737db96d56Sopenharmony_ci    }
9747db96d56Sopenharmony_ci    if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
9757db96d56Sopenharmony_ci        if (is_normalized_quickcheck(self, input,
9767db96d56Sopenharmony_ci                                     false, true,  true) == YES) {
9777db96d56Sopenharmony_ci            Py_INCREF(input);
9787db96d56Sopenharmony_ci            return input;
9797db96d56Sopenharmony_ci        }
9807db96d56Sopenharmony_ci        return nfd_nfkd(self, input, 1);
9817db96d56Sopenharmony_ci    }
9827db96d56Sopenharmony_ci    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
9837db96d56Sopenharmony_ci    return NULL;
9847db96d56Sopenharmony_ci}
9857db96d56Sopenharmony_ci
9867db96d56Sopenharmony_ci/* -------------------------------------------------------------------- */
9877db96d56Sopenharmony_ci/* unicode character name tables */
9887db96d56Sopenharmony_ci
9897db96d56Sopenharmony_ci/* data file generated by Tools/unicode/makeunicodedata.py */
9907db96d56Sopenharmony_ci#include "unicodename_db.h"
9917db96d56Sopenharmony_ci
9927db96d56Sopenharmony_ci/* -------------------------------------------------------------------- */
9937db96d56Sopenharmony_ci/* database code (cut and pasted from the unidb package) */
9947db96d56Sopenharmony_ci
/* Hash the first `len` bytes of `s`, upper-casing each byte with
   Py_TOUPPER first, using `scale` as the multiplier.  Whenever bits 24-31
   of the running value become non-zero they are XOR-folded into the low
   byte and the value is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;

    for (int pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash ^= (high >> 24) & 0xff;
            hash &= 0x00ffffff;
        }
    }
    return hash;
}
10097db96d56Sopenharmony_ci
/* Name fragments used to spell Hangul syllable names.  Each column is
   indexed independently: column 0 by the leading-consonant index, column 1
   by the vowel index and column 2 by the trailing-consonant index.  The
   columns have different lengths; NULL pads the shorter ones. */
static const char * const hangul_syllables[][3] = {
    /* leading  vowel  trailing */
    { "G",      "A",   ""   },
    { "GG",     "AE",  "G"  },
    { "N",      "YA",  "GG" },
    { "D",      "YAE", "GS" },
    { "DD",     "EO",  "N"  },
    { "R",      "E",   "NJ" },
    { "M",      "YEO", "NH" },
    { "B",      "YE",  "D"  },
    { "BB",     "O",   "L"  },
    { "S",      "WA",  "LG" },
    { "SS",     "WAE", "LM" },
    { "",       "OE",  "LB" },
    { "J",      "YO",  "LS" },
    { "JJ",     "U",   "LT" },
    { "C",      "WEO", "LP" },
    { "K",      "WE",  "LH" },
    { "T",      "WI",  "M"  },
    { "P",      "YU",  "B"  },
    { "H",      "EU",  "BS" },
    { NULL,     "YI",  "S"  },
    { NULL,     "I",   "SS" },
    { NULL,     NULL,  "NG" },
    { NULL,     NULL,  "J"  },
    { NULL,     NULL,  "C"  },
    { NULL,     NULL,  "K"  },
    { NULL,     NULL,  "T"  },
    { NULL,     NULL,  "P"  },
    { NULL,     NULL,  "H"  }
};
10407db96d56Sopenharmony_ci
10417db96d56Sopenharmony_ci/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10427db96d56Sopenharmony_cistatic int
10437db96d56Sopenharmony_ciis_unified_ideograph(Py_UCS4 code)
10447db96d56Sopenharmony_ci{
10457db96d56Sopenharmony_ci    return
10467db96d56Sopenharmony_ci        (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
10477db96d56Sopenharmony_ci        (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
10487db96d56Sopenharmony_ci        (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
10497db96d56Sopenharmony_ci        (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
10507db96d56Sopenharmony_ci        (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
10517db96d56Sopenharmony_ci        (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
10527db96d56Sopenharmony_ci        (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
10537db96d56Sopenharmony_ci        (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
10547db96d56Sopenharmony_ci}
10557db96d56Sopenharmony_ci
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that any expression (for example a
 * conditional expression) is compared as a unit.  It is still evaluated
 * twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
10617db96d56Sopenharmony_ci
10627db96d56Sopenharmony_cistatic int
10637db96d56Sopenharmony_ci_getucname(PyObject *self,
10647db96d56Sopenharmony_ci           Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
10657db96d56Sopenharmony_ci{
10667db96d56Sopenharmony_ci    /* Find the name associated with the given code point.
10677db96d56Sopenharmony_ci     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
10687db96d56Sopenharmony_ci     * that we are using for aliases and named sequences. */
10697db96d56Sopenharmony_ci    int offset;
10707db96d56Sopenharmony_ci    int i;
10717db96d56Sopenharmony_ci    int word;
10727db96d56Sopenharmony_ci    const unsigned char* w;
10737db96d56Sopenharmony_ci
10747db96d56Sopenharmony_ci    if (code >= 0x110000)
10757db96d56Sopenharmony_ci        return 0;
10767db96d56Sopenharmony_ci
10777db96d56Sopenharmony_ci    /* XXX should we just skip all the code points in the PUAs here? */
10787db96d56Sopenharmony_ci    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
10797db96d56Sopenharmony_ci        return 0;
10807db96d56Sopenharmony_ci
10817db96d56Sopenharmony_ci    if (UCD_Check(self)) {
10827db96d56Sopenharmony_ci        /* in 3.2.0 there are no aliases and named sequences */
10837db96d56Sopenharmony_ci        const change_record *old;
10847db96d56Sopenharmony_ci        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
10857db96d56Sopenharmony_ci            return 0;
10867db96d56Sopenharmony_ci        old = get_old_record(self, code);
10877db96d56Sopenharmony_ci        if (old->category_changed == 0) {
10887db96d56Sopenharmony_ci            /* unassigned */
10897db96d56Sopenharmony_ci            return 0;
10907db96d56Sopenharmony_ci        }
10917db96d56Sopenharmony_ci    }
10927db96d56Sopenharmony_ci
10937db96d56Sopenharmony_ci    if (SBase <= code && code < SBase+SCount) {
10947db96d56Sopenharmony_ci        /* Hangul syllable. */
10957db96d56Sopenharmony_ci        int SIndex = code - SBase;
10967db96d56Sopenharmony_ci        int L = SIndex / NCount;
10977db96d56Sopenharmony_ci        int V = (SIndex % NCount) / TCount;
10987db96d56Sopenharmony_ci        int T = SIndex % TCount;
10997db96d56Sopenharmony_ci
11007db96d56Sopenharmony_ci        if (buflen < 27)
11017db96d56Sopenharmony_ci            /* Worst case: HANGUL SYLLABLE <10chars>. */
11027db96d56Sopenharmony_ci            return 0;
11037db96d56Sopenharmony_ci        strcpy(buffer, "HANGUL SYLLABLE ");
11047db96d56Sopenharmony_ci        buffer += 16;
11057db96d56Sopenharmony_ci        strcpy(buffer, hangul_syllables[L][0]);
11067db96d56Sopenharmony_ci        buffer += strlen(hangul_syllables[L][0]);
11077db96d56Sopenharmony_ci        strcpy(buffer, hangul_syllables[V][1]);
11087db96d56Sopenharmony_ci        buffer += strlen(hangul_syllables[V][1]);
11097db96d56Sopenharmony_ci        strcpy(buffer, hangul_syllables[T][2]);
11107db96d56Sopenharmony_ci        buffer += strlen(hangul_syllables[T][2]);
11117db96d56Sopenharmony_ci        *buffer = '\0';
11127db96d56Sopenharmony_ci        return 1;
11137db96d56Sopenharmony_ci    }
11147db96d56Sopenharmony_ci
11157db96d56Sopenharmony_ci    if (is_unified_ideograph(code)) {
11167db96d56Sopenharmony_ci        if (buflen < 28)
11177db96d56Sopenharmony_ci            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
11187db96d56Sopenharmony_ci            return 0;
11197db96d56Sopenharmony_ci        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
11207db96d56Sopenharmony_ci        return 1;
11217db96d56Sopenharmony_ci    }
11227db96d56Sopenharmony_ci
11237db96d56Sopenharmony_ci    /* get offset into phrasebook */
11247db96d56Sopenharmony_ci    offset = phrasebook_offset1[(code>>phrasebook_shift)];
11257db96d56Sopenharmony_ci    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
11267db96d56Sopenharmony_ci                               (code&((1<<phrasebook_shift)-1))];
11277db96d56Sopenharmony_ci    if (!offset)
11287db96d56Sopenharmony_ci        return 0;
11297db96d56Sopenharmony_ci
11307db96d56Sopenharmony_ci    i = 0;
11317db96d56Sopenharmony_ci
11327db96d56Sopenharmony_ci    for (;;) {
11337db96d56Sopenharmony_ci        /* get word index */
11347db96d56Sopenharmony_ci        word = phrasebook[offset] - phrasebook_short;
11357db96d56Sopenharmony_ci        if (word >= 0) {
11367db96d56Sopenharmony_ci            word = (word << 8) + phrasebook[offset+1];
11377db96d56Sopenharmony_ci            offset += 2;
11387db96d56Sopenharmony_ci        } else
11397db96d56Sopenharmony_ci            word = phrasebook[offset++];
11407db96d56Sopenharmony_ci        if (i) {
11417db96d56Sopenharmony_ci            if (i > buflen)
11427db96d56Sopenharmony_ci                return 0; /* buffer overflow */
11437db96d56Sopenharmony_ci            buffer[i++] = ' ';
11447db96d56Sopenharmony_ci        }
11457db96d56Sopenharmony_ci        /* copy word string from lexicon.  the last character in the
11467db96d56Sopenharmony_ci           word has bit 7 set.  the last word in a string ends with
11477db96d56Sopenharmony_ci           0x80 */
11487db96d56Sopenharmony_ci        w = lexicon + lexicon_offset[word];
11497db96d56Sopenharmony_ci        while (*w < 128) {
11507db96d56Sopenharmony_ci            if (i >= buflen)
11517db96d56Sopenharmony_ci                return 0; /* buffer overflow */
11527db96d56Sopenharmony_ci            buffer[i++] = *w++;
11537db96d56Sopenharmony_ci        }
11547db96d56Sopenharmony_ci        if (i >= buflen)
11557db96d56Sopenharmony_ci            return 0; /* buffer overflow */
11567db96d56Sopenharmony_ci        buffer[i++] = *w & 127;
11577db96d56Sopenharmony_ci        if (*w == 128)
11587db96d56Sopenharmony_ci            break; /* end of word */
11597db96d56Sopenharmony_ci    }
11607db96d56Sopenharmony_ci
11617db96d56Sopenharmony_ci    return 1;
11627db96d56Sopenharmony_ci}
11637db96d56Sopenharmony_ci
11647db96d56Sopenharmony_cistatic int
11657db96d56Sopenharmony_cicapi_getucname(Py_UCS4 code,
11667db96d56Sopenharmony_ci               char* buffer, int buflen,
11677db96d56Sopenharmony_ci               int with_alias_and_seq)
11687db96d56Sopenharmony_ci{
11697db96d56Sopenharmony_ci    return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
11707db96d56Sopenharmony_ci
11717db96d56Sopenharmony_ci}
11727db96d56Sopenharmony_ci
11737db96d56Sopenharmony_cistatic int
11747db96d56Sopenharmony_ci_cmpname(PyObject *self, int code, const char* name, int namelen)
11757db96d56Sopenharmony_ci{
11767db96d56Sopenharmony_ci    /* check if code corresponds to the given name */
11777db96d56Sopenharmony_ci    int i;
11787db96d56Sopenharmony_ci    char buffer[NAME_MAXLEN+1];
11797db96d56Sopenharmony_ci    if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
11807db96d56Sopenharmony_ci        return 0;
11817db96d56Sopenharmony_ci    for (i = 0; i < namelen; i++) {
11827db96d56Sopenharmony_ci        if (Py_TOUPPER(name[i]) != buffer[i])
11837db96d56Sopenharmony_ci            return 0;
11847db96d56Sopenharmony_ci    }
11857db96d56Sopenharmony_ci    return buffer[namelen] == '\0';
11867db96d56Sopenharmony_ci}
11877db96d56Sopenharmony_ci
11887db96d56Sopenharmony_cistatic void
11897db96d56Sopenharmony_cifind_syllable(const char *str, int *len, int *pos, int count, int column)
11907db96d56Sopenharmony_ci{
11917db96d56Sopenharmony_ci    int i, len1;
11927db96d56Sopenharmony_ci    *len = -1;
11937db96d56Sopenharmony_ci    for (i = 0; i < count; i++) {
11947db96d56Sopenharmony_ci        const char *s = hangul_syllables[i][column];
11957db96d56Sopenharmony_ci        len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
11967db96d56Sopenharmony_ci        if (len1 <= *len)
11977db96d56Sopenharmony_ci            continue;
11987db96d56Sopenharmony_ci        if (strncmp(str, s, len1) == 0) {
11997db96d56Sopenharmony_ci            *len = len1;
12007db96d56Sopenharmony_ci            *pos = i;
12017db96d56Sopenharmony_ci        }
12027db96d56Sopenharmony_ci    }
12037db96d56Sopenharmony_ci    if (*len == -1) {
12047db96d56Sopenharmony_ci        *len = 0;
12057db96d56Sopenharmony_ci    }
12067db96d56Sopenharmony_ci}
12077db96d56Sopenharmony_ci
12087db96d56Sopenharmony_cistatic int
12097db96d56Sopenharmony_ci_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
12107db96d56Sopenharmony_ci{
12117db96d56Sopenharmony_ci    /* check if named sequences are allowed */
12127db96d56Sopenharmony_ci    if (!with_named_seq && IS_NAMED_SEQ(cp))
12137db96d56Sopenharmony_ci        return 0;
12147db96d56Sopenharmony_ci    /* if the code point is in the PUA range that we use for aliases,
12157db96d56Sopenharmony_ci     * convert it to obtain the right code point */
12167db96d56Sopenharmony_ci    if (IS_ALIAS(cp))
12177db96d56Sopenharmony_ci        *code = name_aliases[cp-aliases_start];
12187db96d56Sopenharmony_ci    else
12197db96d56Sopenharmony_ci        *code = cp;
12207db96d56Sopenharmony_ci    return 1;
12217db96d56Sopenharmony_ci}
12227db96d56Sopenharmony_ci
12237db96d56Sopenharmony_cistatic int
12247db96d56Sopenharmony_ci_getcode(PyObject* self,
12257db96d56Sopenharmony_ci         const char* name, int namelen, Py_UCS4* code, int with_named_seq)
12267db96d56Sopenharmony_ci{
12277db96d56Sopenharmony_ci    /* Return the code point associated with the given name.
12287db96d56Sopenharmony_ci     * Named aliases are resolved too (unless self != NULL (i.e. we are using
12297db96d56Sopenharmony_ci     * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
12307db96d56Sopenharmony_ci     * using for the named sequence, and the caller must then convert it. */
12317db96d56Sopenharmony_ci    unsigned int h, v;
12327db96d56Sopenharmony_ci    unsigned int mask = code_size-1;
12337db96d56Sopenharmony_ci    unsigned int i, incr;
12347db96d56Sopenharmony_ci
12357db96d56Sopenharmony_ci    /* Check for hangul syllables. */
12367db96d56Sopenharmony_ci    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
12377db96d56Sopenharmony_ci        int len, L = -1, V = -1, T = -1;
12387db96d56Sopenharmony_ci        const char *pos = name + 16;
12397db96d56Sopenharmony_ci        find_syllable(pos, &len, &L, LCount, 0);
12407db96d56Sopenharmony_ci        pos += len;
12417db96d56Sopenharmony_ci        find_syllable(pos, &len, &V, VCount, 1);
12427db96d56Sopenharmony_ci        pos += len;
12437db96d56Sopenharmony_ci        find_syllable(pos, &len, &T, TCount, 2);
12447db96d56Sopenharmony_ci        pos += len;
12457db96d56Sopenharmony_ci        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
12467db96d56Sopenharmony_ci            *code = SBase + (L*VCount+V)*TCount + T;
12477db96d56Sopenharmony_ci            return 1;
12487db96d56Sopenharmony_ci        }
12497db96d56Sopenharmony_ci        /* Otherwise, it's an illegal syllable name. */
12507db96d56Sopenharmony_ci        return 0;
12517db96d56Sopenharmony_ci    }
12527db96d56Sopenharmony_ci
12537db96d56Sopenharmony_ci    /* Check for unified ideographs. */
12547db96d56Sopenharmony_ci    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
12557db96d56Sopenharmony_ci        /* Four or five hexdigits must follow. */
12567db96d56Sopenharmony_ci        v = 0;
12577db96d56Sopenharmony_ci        name += 22;
12587db96d56Sopenharmony_ci        namelen -= 22;
12597db96d56Sopenharmony_ci        if (namelen != 4 && namelen != 5)
12607db96d56Sopenharmony_ci            return 0;
12617db96d56Sopenharmony_ci        while (namelen--) {
12627db96d56Sopenharmony_ci            v *= 16;
12637db96d56Sopenharmony_ci            if (*name >= '0' && *name <= '9')
12647db96d56Sopenharmony_ci                v += *name - '0';
12657db96d56Sopenharmony_ci            else if (*name >= 'A' && *name <= 'F')
12667db96d56Sopenharmony_ci                v += *name - 'A' + 10;
12677db96d56Sopenharmony_ci            else
12687db96d56Sopenharmony_ci                return 0;
12697db96d56Sopenharmony_ci            name++;
12707db96d56Sopenharmony_ci        }
12717db96d56Sopenharmony_ci        if (!is_unified_ideograph(v))
12727db96d56Sopenharmony_ci            return 0;
12737db96d56Sopenharmony_ci        *code = v;
12747db96d56Sopenharmony_ci        return 1;
12757db96d56Sopenharmony_ci    }
12767db96d56Sopenharmony_ci
12777db96d56Sopenharmony_ci    /* the following is the same as python's dictionary lookup, with
12787db96d56Sopenharmony_ci       only minor changes.  see the makeunicodedata script for more
12797db96d56Sopenharmony_ci       details */
12807db96d56Sopenharmony_ci
12817db96d56Sopenharmony_ci    h = (unsigned int) _gethash(name, namelen, code_magic);
12827db96d56Sopenharmony_ci    i = (~h) & mask;
12837db96d56Sopenharmony_ci    v = code_hash[i];
12847db96d56Sopenharmony_ci    if (!v)
12857db96d56Sopenharmony_ci        return 0;
12867db96d56Sopenharmony_ci    if (_cmpname(self, v, name, namelen)) {
12877db96d56Sopenharmony_ci        return _check_alias_and_seq(v, code, with_named_seq);
12887db96d56Sopenharmony_ci    }
12897db96d56Sopenharmony_ci    incr = (h ^ (h >> 3)) & mask;
12907db96d56Sopenharmony_ci    if (!incr)
12917db96d56Sopenharmony_ci        incr = mask;
12927db96d56Sopenharmony_ci    for (;;) {
12937db96d56Sopenharmony_ci        i = (i + incr) & mask;
12947db96d56Sopenharmony_ci        v = code_hash[i];
12957db96d56Sopenharmony_ci        if (!v)
12967db96d56Sopenharmony_ci            return 0;
12977db96d56Sopenharmony_ci        if (_cmpname(self, v, name, namelen)) {
12987db96d56Sopenharmony_ci            return _check_alias_and_seq(v, code, with_named_seq);
12997db96d56Sopenharmony_ci        }
13007db96d56Sopenharmony_ci        incr = incr << 1;
13017db96d56Sopenharmony_ci        if (incr > mask)
13027db96d56Sopenharmony_ci            incr = incr ^ code_poly;
13037db96d56Sopenharmony_ci    }
13047db96d56Sopenharmony_ci}
13057db96d56Sopenharmony_ci
13067db96d56Sopenharmony_cistatic int
13077db96d56Sopenharmony_cicapi_getcode(const char* name, int namelen, Py_UCS4* code,
13087db96d56Sopenharmony_ci             int with_named_seq)
13097db96d56Sopenharmony_ci{
13107db96d56Sopenharmony_ci    return _getcode(NULL, name, namelen, code, with_named_seq);
13117db96d56Sopenharmony_ci
13127db96d56Sopenharmony_ci}
13137db96d56Sopenharmony_ci
13147db96d56Sopenharmony_cistatic void
13157db96d56Sopenharmony_ciunicodedata_destroy_capi(PyObject *capsule)
13167db96d56Sopenharmony_ci{
13177db96d56Sopenharmony_ci    void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
13187db96d56Sopenharmony_ci    PyMem_Free(capi);
13197db96d56Sopenharmony_ci}
13207db96d56Sopenharmony_ci
13217db96d56Sopenharmony_cistatic PyObject *
13227db96d56Sopenharmony_ciunicodedata_create_capi(void)
13237db96d56Sopenharmony_ci{
13247db96d56Sopenharmony_ci    _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
13257db96d56Sopenharmony_ci    if (capi == NULL) {
13267db96d56Sopenharmony_ci        PyErr_NoMemory();
13277db96d56Sopenharmony_ci        return NULL;
13287db96d56Sopenharmony_ci    }
13297db96d56Sopenharmony_ci    capi->getname = capi_getucname;
13307db96d56Sopenharmony_ci    capi->getcode = capi_getcode;
13317db96d56Sopenharmony_ci
13327db96d56Sopenharmony_ci    PyObject *capsule = PyCapsule_New(capi,
13337db96d56Sopenharmony_ci                                      PyUnicodeData_CAPSULE_NAME,
13347db96d56Sopenharmony_ci                                      unicodedata_destroy_capi);
13357db96d56Sopenharmony_ci    if (capsule == NULL) {
13367db96d56Sopenharmony_ci        PyMem_Free(capi);
13377db96d56Sopenharmony_ci    }
13387db96d56Sopenharmony_ci    return capsule;
13397db96d56Sopenharmony_ci};
13407db96d56Sopenharmony_ci
13417db96d56Sopenharmony_ci
/* -------------------------------------------------------------------- */
/* Python bindings */
13447db96d56Sopenharmony_ci
/*[clinic input]
unicodedata.UCD.name

    self: self
    chr: int(accept={str})
    default: object=NULL
    /

Returns the name assigned to the character chr as a string.

If no name is defined, default is returned, or, if not given,
ValueError is raised.
[clinic start generated code]*/
13587db96d56Sopenharmony_ci
13597db96d56Sopenharmony_cistatic PyObject *
13607db96d56Sopenharmony_ciunicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
13617db96d56Sopenharmony_ci/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
13627db96d56Sopenharmony_ci{
13637db96d56Sopenharmony_ci    char name[NAME_MAXLEN+1];
13647db96d56Sopenharmony_ci    Py_UCS4 c = (Py_UCS4)chr;
13657db96d56Sopenharmony_ci
13667db96d56Sopenharmony_ci    if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
13677db96d56Sopenharmony_ci        if (default_value == NULL) {
13687db96d56Sopenharmony_ci            PyErr_SetString(PyExc_ValueError, "no such name");
13697db96d56Sopenharmony_ci            return NULL;
13707db96d56Sopenharmony_ci        }
13717db96d56Sopenharmony_ci        else {
13727db96d56Sopenharmony_ci            Py_INCREF(default_value);
13737db96d56Sopenharmony_ci            return default_value;
13747db96d56Sopenharmony_ci        }
13757db96d56Sopenharmony_ci    }
13767db96d56Sopenharmony_ci
13777db96d56Sopenharmony_ci    return PyUnicode_FromString(name);
13787db96d56Sopenharmony_ci}
13797db96d56Sopenharmony_ci
/*[clinic input]
unicodedata.UCD.lookup

    self: self
    name: str(accept={str, robuffer}, zeroes=True)
    /

Look up character by name.

If a character with the given name is found, return the
corresponding character.  If not found, KeyError is raised.
[clinic start generated code]*/
13927db96d56Sopenharmony_ci
13937db96d56Sopenharmony_cistatic PyObject *
13947db96d56Sopenharmony_ciunicodedata_UCD_lookup_impl(PyObject *self, const char *name,
13957db96d56Sopenharmony_ci                            Py_ssize_t name_length)
13967db96d56Sopenharmony_ci/*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
13977db96d56Sopenharmony_ci{
13987db96d56Sopenharmony_ci    Py_UCS4 code;
13997db96d56Sopenharmony_ci    unsigned int index;
14007db96d56Sopenharmony_ci    if (name_length > NAME_MAXLEN) {
14017db96d56Sopenharmony_ci        PyErr_SetString(PyExc_KeyError, "name too long");
14027db96d56Sopenharmony_ci        return NULL;
14037db96d56Sopenharmony_ci    }
14047db96d56Sopenharmony_ci
14057db96d56Sopenharmony_ci    if (!_getcode(self, name, (int)name_length, &code, 1)) {
14067db96d56Sopenharmony_ci        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
14077db96d56Sopenharmony_ci        return NULL;
14087db96d56Sopenharmony_ci    }
14097db96d56Sopenharmony_ci    /* check if code is in the PUA range that we use for named sequences
14107db96d56Sopenharmony_ci       and convert it */
14117db96d56Sopenharmony_ci    if (IS_NAMED_SEQ(code)) {
14127db96d56Sopenharmony_ci        index = code-named_sequences_start;
14137db96d56Sopenharmony_ci        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
14147db96d56Sopenharmony_ci                                         named_sequences[index].seq,
14157db96d56Sopenharmony_ci                                         named_sequences[index].seqlen);
14167db96d56Sopenharmony_ci    }
14177db96d56Sopenharmony_ci    return PyUnicode_FromOrdinal(code);
14187db96d56Sopenharmony_ci}
14197db96d56Sopenharmony_ci
14207db96d56Sopenharmony_ci// List of functions used to define module functions *AND* unicodedata.UCD
14217db96d56Sopenharmony_ci// methods. For module functions, self is the module. For UCD methods, self
14227db96d56Sopenharmony_ci// is an UCD instance. The UCD_Check() macro is used to check if self is
14237db96d56Sopenharmony_ci// an UCD instance.
14247db96d56Sopenharmony_cistatic PyMethodDef unicodedata_functions[] = {
14257db96d56Sopenharmony_ci    UNICODEDATA_UCD_DECIMAL_METHODDEF
14267db96d56Sopenharmony_ci    UNICODEDATA_UCD_DIGIT_METHODDEF
14277db96d56Sopenharmony_ci    UNICODEDATA_UCD_NUMERIC_METHODDEF
14287db96d56Sopenharmony_ci    UNICODEDATA_UCD_CATEGORY_METHODDEF
14297db96d56Sopenharmony_ci    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
14307db96d56Sopenharmony_ci    UNICODEDATA_UCD_COMBINING_METHODDEF
14317db96d56Sopenharmony_ci    UNICODEDATA_UCD_MIRRORED_METHODDEF
14327db96d56Sopenharmony_ci    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
14337db96d56Sopenharmony_ci    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
14347db96d56Sopenharmony_ci    UNICODEDATA_UCD_NAME_METHODDEF
14357db96d56Sopenharmony_ci    UNICODEDATA_UCD_LOOKUP_METHODDEF
14367db96d56Sopenharmony_ci    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
14377db96d56Sopenharmony_ci    UNICODEDATA_UCD_NORMALIZE_METHODDEF
14387db96d56Sopenharmony_ci    {NULL, NULL}                /* sentinel */
14397db96d56Sopenharmony_ci};
14407db96d56Sopenharmony_ci
14417db96d56Sopenharmony_cistatic int
14427db96d56Sopenharmony_ciucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
14437db96d56Sopenharmony_ci{
14447db96d56Sopenharmony_ci    Py_VISIT(Py_TYPE(self));
14457db96d56Sopenharmony_ci    return 0;
14467db96d56Sopenharmony_ci}
14477db96d56Sopenharmony_ci
14487db96d56Sopenharmony_cistatic void
14497db96d56Sopenharmony_ciucd_dealloc(PreviousDBVersion *self)
14507db96d56Sopenharmony_ci{
14517db96d56Sopenharmony_ci    PyTypeObject *tp = Py_TYPE(self);
14527db96d56Sopenharmony_ci    PyObject_GC_UnTrack(self);
14537db96d56Sopenharmony_ci    PyObject_GC_Del(self);
14547db96d56Sopenharmony_ci    Py_DECREF(tp);
14557db96d56Sopenharmony_ci}
14567db96d56Sopenharmony_ci
14577db96d56Sopenharmony_cistatic PyType_Slot ucd_type_slots[] = {
14587db96d56Sopenharmony_ci    {Py_tp_dealloc, ucd_dealloc},
14597db96d56Sopenharmony_ci    {Py_tp_traverse, ucd_traverse},
14607db96d56Sopenharmony_ci    {Py_tp_getattro, PyObject_GenericGetAttr},
14617db96d56Sopenharmony_ci    {Py_tp_methods, unicodedata_functions},
14627db96d56Sopenharmony_ci    {Py_tp_members, DB_members},
14637db96d56Sopenharmony_ci    {0, 0}
14647db96d56Sopenharmony_ci};
14657db96d56Sopenharmony_ci
14667db96d56Sopenharmony_cistatic PyType_Spec ucd_type_spec = {
14677db96d56Sopenharmony_ci    .name = "unicodedata.UCD",
14687db96d56Sopenharmony_ci    .basicsize = sizeof(PreviousDBVersion),
14697db96d56Sopenharmony_ci    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
14707db96d56Sopenharmony_ci              Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
14717db96d56Sopenharmony_ci    .slots = ucd_type_slots
14727db96d56Sopenharmony_ci};
14737db96d56Sopenharmony_ci
14747db96d56Sopenharmony_ciPyDoc_STRVAR(unicodedata_docstring,
14757db96d56Sopenharmony_ci"This module provides access to the Unicode Character Database which\n\
14767db96d56Sopenharmony_cidefines character properties for all Unicode characters. The data in\n\
14777db96d56Sopenharmony_cithis database is based on the UnicodeData.txt file version\n\
14787db96d56Sopenharmony_ci" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
14797db96d56Sopenharmony_ci\n\
14807db96d56Sopenharmony_ciThe module uses the same names and symbols as defined by the\n\
14817db96d56Sopenharmony_ciUnicodeData File Format " UNIDATA_VERSION ".");
14827db96d56Sopenharmony_ci
14837db96d56Sopenharmony_cistatic int
14847db96d56Sopenharmony_ciunicodedata_exec(PyObject *module)
14857db96d56Sopenharmony_ci{
14867db96d56Sopenharmony_ci    if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
14877db96d56Sopenharmony_ci        return -1;
14887db96d56Sopenharmony_ci    }
14897db96d56Sopenharmony_ci
14907db96d56Sopenharmony_ci    PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
14917db96d56Sopenharmony_ci    if (ucd_type == NULL) {
14927db96d56Sopenharmony_ci        return -1;
14937db96d56Sopenharmony_ci    }
14947db96d56Sopenharmony_ci
14957db96d56Sopenharmony_ci    if (PyModule_AddType(module, ucd_type) < 0) {
14967db96d56Sopenharmony_ci        Py_DECREF(ucd_type);
14977db96d56Sopenharmony_ci        return -1;
14987db96d56Sopenharmony_ci    }
14997db96d56Sopenharmony_ci
15007db96d56Sopenharmony_ci    // Unicode database version 3.2.0 used by the IDNA encoding
15017db96d56Sopenharmony_ci    PyObject *v;
15027db96d56Sopenharmony_ci    v = new_previous_version(ucd_type, "3.2.0",
15037db96d56Sopenharmony_ci                             get_change_3_2_0, normalization_3_2_0);
15047db96d56Sopenharmony_ci    Py_DECREF(ucd_type);
15057db96d56Sopenharmony_ci    if (v == NULL) {
15067db96d56Sopenharmony_ci        return -1;
15077db96d56Sopenharmony_ci    }
15087db96d56Sopenharmony_ci    if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
15097db96d56Sopenharmony_ci        Py_DECREF(v);
15107db96d56Sopenharmony_ci        return -1;
15117db96d56Sopenharmony_ci    }
15127db96d56Sopenharmony_ci
15137db96d56Sopenharmony_ci    /* Export C API */
15147db96d56Sopenharmony_ci    PyObject *capsule = unicodedata_create_capi();
15157db96d56Sopenharmony_ci    if (capsule == NULL) {
15167db96d56Sopenharmony_ci        return -1;
15177db96d56Sopenharmony_ci    }
15187db96d56Sopenharmony_ci    int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
15197db96d56Sopenharmony_ci    Py_DECREF(capsule);
15207db96d56Sopenharmony_ci    if (rc < 0) {
15217db96d56Sopenharmony_ci        return -1;
15227db96d56Sopenharmony_ci    }
15237db96d56Sopenharmony_ci    return 0;
15247db96d56Sopenharmony_ci}
15257db96d56Sopenharmony_ci
15267db96d56Sopenharmony_cistatic PyModuleDef_Slot unicodedata_slots[] = {
15277db96d56Sopenharmony_ci    {Py_mod_exec, unicodedata_exec},
15287db96d56Sopenharmony_ci    {0, NULL}
15297db96d56Sopenharmony_ci};
15307db96d56Sopenharmony_ci
15317db96d56Sopenharmony_cistatic struct PyModuleDef unicodedata_module = {
15327db96d56Sopenharmony_ci    PyModuleDef_HEAD_INIT,
15337db96d56Sopenharmony_ci    .m_name = "unicodedata",
15347db96d56Sopenharmony_ci    .m_doc = unicodedata_docstring,
15357db96d56Sopenharmony_ci    .m_size = 0,
15367db96d56Sopenharmony_ci    .m_methods = unicodedata_functions,
15377db96d56Sopenharmony_ci    .m_slots = unicodedata_slots,
15387db96d56Sopenharmony_ci};
15397db96d56Sopenharmony_ci
15407db96d56Sopenharmony_ciPyMODINIT_FUNC
15417db96d56Sopenharmony_ciPyInit_unicodedata(void)
15427db96d56Sopenharmony_ci{
15437db96d56Sopenharmony_ci    return PyModuleDef_Init(&unicodedata_module);
15447db96d56Sopenharmony_ci}
15457db96d56Sopenharmony_ci
15467db96d56Sopenharmony_ci
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
1553