xref: /third_party/python/Modules/unicodedata.c (revision 7db96d56)
1/* ------------------------------------------------------------------------
2
3   unicodedata -- Provides access to the Unicode database.
4
5   The current version number is reported in the unidata_version constant.
6
7   Written by Marc-Andre Lemburg (mal@lemburg.com).
8   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9   Modified by Martin v. Löwis (martin@v.loewis.de)
10
11   Copyright (c) Corporation for National Research Initiatives.
12
13   ------------------------------------------------------------------------ */
14
15#ifndef Py_BUILD_CORE_BUILTIN
16#  define Py_BUILD_CORE_MODULE 1
17#endif
18
19#define PY_SSIZE_T_CLEAN
20
21#include "Python.h"
22#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
23#include "structmember.h"         // PyMemberDef
24
25#include <stdbool.h>
26
27/*[clinic input]
28module unicodedata
29class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
30[clinic start generated code]*/
31/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
32
33/* character properties */
34
/* One entry of _PyUnicode_Database_Records: the per-code-point properties
   that _getrecord_ex() hands out.  All fields are const because the table
   comes from the generated header included below. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* four packed 2-bit
                                                      quickcheck results; see
                                                      is_normalized_quickcheck() */
} _PyUnicode_DatabaseRecord;
46
/* Differences between the current database and a previous Unicode version
   for one code point.  In the code of this file, the unsigned char fields
   are compared against 0xFF to mean "no change recorded", and
   category_changed == 0 is treated as "code point unassigned in the old
   version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* old index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char category_changed;   /* old index into
                                               _PyUnicode_CategoryNames */
    const unsigned char decimal_changed;    /* old decimal value */
    const unsigned char mirrored_changed;   /* old mirrored flag */
    const unsigned char east_asian_width_changed; /* old index into
                                               _PyUnicode_EastAsianWidthNames */
    const double numeric_changed;           /* NOTE(review): not read by any
                                               function visible in this part
                                               of the file */
} change_record;
56
57/* data file generated by Tools/unicode/makeunicodedata.py */
58#include "unicodedata_db.h"
59
60static const _PyUnicode_DatabaseRecord*
61_getrecord_ex(Py_UCS4 code)
62{
63    int index;
64    if (code >= 0x110000)
65        index = 0;
66    else {
67        index = index1[(code>>SHIFT)];
68        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69    }
70
71    return &_PyUnicode_Database_Records[index];
72}
73
74/* ------------- Previous-version API ------------------------------------- */
/* Python object giving access to a previous version of the database.
   Instances are created by new_previous_version(). */
typedef struct previous_version {
    PyObject_HEAD
    /* Version string; exposed to Python as "unidata_version" via DB_members. */
    const char *name;
    /* Returns the change_record for a code point in this old version. */
    const change_record* (*getrecord)(Py_UCS4);
    /* Returns a replacement code point to decompose instead of the argument,
       or 0 when there is none (see its use in nfd_nfkd()). */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
81
82#include "clinic/unicodedata.c.h"
83
/* Fetch the change_record for code point `v` from the previous-version
   object `self`.  `self` is cast to PreviousDBVersion* without any check,
   so callers must test UCD_Check(self) first.  The macro argument is
   parenthesized so that any pointer expression can be passed safely. */
#define get_old_record(self, v) \
    (((PreviousDBVersion *)(self))->getrecord(v))
85
/* Member table: exposes PreviousDBVersion.name to Python as the read-only
   string attribute "unidata_version".  The {NULL} entry ends the table. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};
90
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// Note: the argument is evaluated more than once, so pass a plain
// expression without side effects.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
96
97static PyObject*
98new_previous_version(PyTypeObject *ucd_type,
99                     const char*name, const change_record* (*getrecord)(Py_UCS4),
100                     Py_UCS4 (*normalization)(Py_UCS4))
101{
102    PreviousDBVersion *self;
103    self = PyObject_GC_New(PreviousDBVersion, ucd_type);
104    if (self == NULL)
105        return NULL;
106    self->name = name;
107    self->getrecord = getrecord;
108    self->normalization = normalization;
109    PyObject_GC_Track(self);
110    return (PyObject*)self;
111}
112
113
114/* --- Module API --------------------------------------------------------- */
115
116/*[clinic input]
117unicodedata.UCD.decimal
118
119    self: self
120    chr: int(accept={str})
121    default: object=NULL
122    /
123
124Converts a Unicode character into its equivalent decimal value.
125
126Returns the decimal value assigned to the character chr as integer.
127If no such value is defined, default is returned, or, if not given,
128ValueError is raised.
129[clinic start generated code]*/
130
131static PyObject *
132unicodedata_UCD_decimal_impl(PyObject *self, int chr,
133                             PyObject *default_value)
134/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
135{
136    int have_old = 0;
137    long rc;
138    Py_UCS4 c = (Py_UCS4)chr;
139
140    if (UCD_Check(self)) {
141        const change_record *old = get_old_record(self, c);
142        if (old->category_changed == 0) {
143            /* unassigned */
144            have_old = 1;
145            rc = -1;
146        }
147        else if (old->decimal_changed != 0xFF) {
148            have_old = 1;
149            rc = old->decimal_changed;
150        }
151    }
152
153    if (!have_old)
154        rc = Py_UNICODE_TODECIMAL(c);
155    if (rc < 0) {
156        if (default_value == NULL) {
157            PyErr_SetString(PyExc_ValueError,
158                            "not a decimal");
159            return NULL;
160        }
161        else {
162            Py_INCREF(default_value);
163            return default_value;
164        }
165    }
166    return PyLong_FromLong(rc);
167}
168
169/*[clinic input]
170unicodedata.UCD.digit
171
172    self: self
173    chr: int(accept={str})
174    default: object=NULL
175    /
176
177Converts a Unicode character into its equivalent digit value.
178
179Returns the digit value assigned to the character chr as integer.
180If no such value is defined, default is returned, or, if not given,
181ValueError is raised.
182[clinic start generated code]*/
183
184static PyObject *
185unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
186/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
187{
188    long rc;
189    Py_UCS4 c = (Py_UCS4)chr;
190    rc = Py_UNICODE_TODIGIT(c);
191    if (rc < 0) {
192        if (default_value == NULL) {
193            PyErr_SetString(PyExc_ValueError, "not a digit");
194            return NULL;
195        }
196        else {
197            Py_INCREF(default_value);
198            return default_value;
199        }
200    }
201    return PyLong_FromLong(rc);
202}
203
204/*[clinic input]
205unicodedata.UCD.numeric
206
207    self: self
208    chr: int(accept={str})
209    default: object=NULL
210    /
211
212Converts a Unicode character into its equivalent numeric value.
213
214Returns the numeric value assigned to the character chr as float.
215If no such value is defined, default is returned, or, if not given,
216ValueError is raised.
217[clinic start generated code]*/
218
219static PyObject *
220unicodedata_UCD_numeric_impl(PyObject *self, int chr,
221                             PyObject *default_value)
222/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
223{
224    int have_old = 0;
225    double rc;
226    Py_UCS4 c = (Py_UCS4)chr;
227
228    if (UCD_Check(self)) {
229        const change_record *old = get_old_record(self, c);
230        if (old->category_changed == 0) {
231            /* unassigned */
232            have_old = 1;
233            rc = -1.0;
234        }
235        else if (old->decimal_changed != 0xFF) {
236            have_old = 1;
237            rc = old->decimal_changed;
238        }
239    }
240
241    if (!have_old)
242        rc = Py_UNICODE_TONUMERIC(c);
243    if (rc == -1.0) {
244        if (default_value == NULL) {
245            PyErr_SetString(PyExc_ValueError, "not a numeric character");
246            return NULL;
247        }
248        else {
249            Py_INCREF(default_value);
250            return default_value;
251        }
252    }
253    return PyFloat_FromDouble(rc);
254}
255
256/*[clinic input]
257unicodedata.UCD.category
258
259    self: self
260    chr: int(accept={str})
261    /
262
263Returns the general category assigned to the character chr as string.
264[clinic start generated code]*/
265
266static PyObject *
267unicodedata_UCD_category_impl(PyObject *self, int chr)
268/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
269{
270    int index;
271    Py_UCS4 c = (Py_UCS4)chr;
272    index = (int) _getrecord_ex(c)->category;
273    if (UCD_Check(self)) {
274        const change_record *old = get_old_record(self, c);
275        if (old->category_changed != 0xFF)
276            index = old->category_changed;
277    }
278    return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
279}
280
281/*[clinic input]
282unicodedata.UCD.bidirectional
283
284    self: self
285    chr: int(accept={str})
286    /
287
288Returns the bidirectional class assigned to the character chr as string.
289
290If no such value is defined, an empty string is returned.
291[clinic start generated code]*/
292
293static PyObject *
294unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
295/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
296{
297    int index;
298    Py_UCS4 c = (Py_UCS4)chr;
299    index = (int) _getrecord_ex(c)->bidirectional;
300    if (UCD_Check(self)) {
301        const change_record *old = get_old_record(self, c);
302        if (old->category_changed == 0)
303            index = 0; /* unassigned */
304        else if (old->bidir_changed != 0xFF)
305            index = old->bidir_changed;
306    }
307    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
308}
309
310/*[clinic input]
311unicodedata.UCD.combining -> int
312
313    self: self
314    chr: int(accept={str})
315    /
316
317Returns the canonical combining class assigned to the character chr as integer.
318
319Returns 0 if no combining class is defined.
320[clinic start generated code]*/
321
322static int
323unicodedata_UCD_combining_impl(PyObject *self, int chr)
324/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
325{
326    int index;
327    Py_UCS4 c = (Py_UCS4)chr;
328    index = (int) _getrecord_ex(c)->combining;
329    if (UCD_Check(self)) {
330        const change_record *old = get_old_record(self, c);
331        if (old->category_changed == 0)
332            index = 0; /* unassigned */
333    }
334    return index;
335}
336
337/*[clinic input]
338unicodedata.UCD.mirrored -> int
339
340    self: self
341    chr: int(accept={str})
342    /
343
344Returns the mirrored property assigned to the character chr as integer.
345
346Returns 1 if the character has been identified as a "mirrored"
347character in bidirectional text, 0 otherwise.
348[clinic start generated code]*/
349
350static int
351unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
352/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
353{
354    int index;
355    Py_UCS4 c = (Py_UCS4)chr;
356    index = (int) _getrecord_ex(c)->mirrored;
357    if (UCD_Check(self)) {
358        const change_record *old = get_old_record(self, c);
359        if (old->category_changed == 0)
360            index = 0; /* unassigned */
361        else if (old->mirrored_changed != 0xFF)
362            index = old->mirrored_changed;
363    }
364    return index;
365}
366
367/*[clinic input]
368unicodedata.UCD.east_asian_width
369
370    self: self
371    chr: int(accept={str})
372    /
373
374Returns the east asian width assigned to the character chr as string.
375[clinic start generated code]*/
376
377static PyObject *
378unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
379/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
380{
381    int index;
382    Py_UCS4 c = (Py_UCS4)chr;
383    index = (int) _getrecord_ex(c)->east_asian_width;
384    if (UCD_Check(self)) {
385        const change_record *old = get_old_record(self, c);
386        if (old->category_changed == 0)
387            index = 0; /* unassigned */
388        else if (old->east_asian_width_changed != 0xFF)
389            index = old->east_asian_width_changed;
390    }
391    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
392}
393
394/*[clinic input]
395unicodedata.UCD.decomposition
396
397    self: self
398    chr: int(accept={str})
399    /
400
401Returns the character decomposition mapping assigned to the character chr as string.
402
403An empty string is returned in case no such mapping is defined.
404[clinic start generated code]*/
405
406static PyObject *
407unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
408/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
409{
410    char decomp[256];
411    int code, index, count;
412    size_t i;
413    unsigned int prefix_index;
414    Py_UCS4 c = (Py_UCS4)chr;
415
416    code = (int)c;
417
418    if (UCD_Check(self)) {
419        const change_record *old = get_old_record(self, c);
420        if (old->category_changed == 0)
421            return PyUnicode_FromString(""); /* unassigned */
422    }
423
424    if (code < 0 || code >= 0x110000)
425        index = 0;
426    else {
427        index = decomp_index1[(code>>DECOMP_SHIFT)];
428        index = decomp_index2[(index<<DECOMP_SHIFT)+
429                             (code&((1<<DECOMP_SHIFT)-1))];
430    }
431
432    /* high byte is number of hex bytes (usually one or two), low byte
433       is prefix code (from*/
434    count = decomp_data[index] >> 8;
435
436    /* XXX: could allocate the PyString up front instead
437       (strlen(prefix) + 5 * count + 1 bytes) */
438
439    /* Based on how index is calculated above and decomp_data is generated
440       from Tools/unicode/makeunicodedata.py, it should not be possible
441       to overflow decomp_prefix. */
442    prefix_index = decomp_data[index] & 255;
443    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
444
445    /* copy prefix */
446    i = strlen(decomp_prefix[prefix_index]);
447    memcpy(decomp, decomp_prefix[prefix_index], i);
448
449    while (count-- > 0) {
450        if (i)
451            decomp[i++] = ' ';
452        assert(i < sizeof(decomp));
453        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454                      decomp_data[++index]);
455        i += strlen(decomp + i);
456    }
457    return PyUnicode_FromStringAndSize(decomp, i);
458}
459
460static void
461get_decomp_record(PyObject *self, Py_UCS4 code,
462                  int *index, int *prefix, int *count)
463{
464    if (code >= 0x110000) {
465        *index = 0;
466    }
467    else if (UCD_Check(self)
468             && get_old_record(self, code)->category_changed==0) {
469        /* unassigned in old version */
470        *index = 0;
471    }
472    else {
473        *index = decomp_index1[(code>>DECOMP_SHIFT)];
474        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475                               (code&((1<<DECOMP_SHIFT)-1))];
476    }
477
478    /* high byte is number of hex bytes (usually one or two), low byte
479       is prefix code (from*/
480    *count = decomp_data[*index] >> 8;
481    *prefix = decomp_data[*index] & 255;
482
483    (*index)++;
484}
485
/* Constants for algorithmic Hangul syllable (de)composition, as used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00  /* first precomposed syllable */
#define LBase   0x1100  /* first leading consonant */
#define VBase   0x1161  /* first vowel */
#define TBase   0x11A7  /* one below the first trailing consonant (0x11A8);
                           an offset of 0 means "no trailing consonant" */
#define LCount  19      /* number of leading consonants */
#define VCount  21      /* number of vowels */
#define TCount  28      /* number of trailing values, including "none" */
#define NCount  (VCount*TCount) /* syllables per leading consonant */
#define SCount  (LCount*NCount) /* number of precomposed syllables */
495
/* Return a new string holding the decomposition of `input`.

   With k == 0 only decompositions without a prefix code are applied; with
   k != 0 prefixed ("compatibility") decompositions are applied as well.
   Hangul syllables are decomposed arithmetically.  When `self` is a
   previous-version object, its normalization() hook and change records are
   consulted.  After decomposition the combining characters are reordered
   by combining class.

   Returns NULL with MemoryError set if a buffer allocation fails, or NULL
   if creating the result string fails. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    const void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    /* `space` counts the free slots left in `output`; `osize` is its
       allocated capacity. */
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        /* Seed the work stack with the next input character, then process
           the stack until it is empty again. */
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    /* The old buffer is still valid on failure; free it. */
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                /* T == TBase means the syllable has no trailing consonant. */
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (UCD_Check(self)) {
                /* A non-zero result replaces `code`; it is pushed back so
                   that it is decomposed in turn. */
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    /* The result string is modified in place.  Combining classes are taken
       from the current database via _getrecord_ex(), also when `self` is a
       previous-version object. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        /* Move the character at i towards the front, one swap at a time,
           until the character before it has class 0 or a class <= cur. */
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        /* Position i now holds a different character; reload its class. */
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
628
629static int
630find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
631{
632    unsigned int index;
633    for (index = 0; nfc[index].start; index++) {
634        unsigned int start = nfc[index].start;
635        if (code < start)
636            return -1;
637        if (code <= start + nfc[index].count) {
638            unsigned int delta = code - start;
639            return nfc[index].index + delta;
640        }
641    }
642    return -1;
643}
644
/* Return the composed form of `input`: decompose it with
   nfd_nfkd(self, input, k), then recombine Hangul jamo arithmetically and
   other pairs through the nfc_first/nfc_last/comp_data tables.

   If composition changes nothing, the decomposed string itself is
   returned.  Returns NULL if the decomposition fails or if the output
   buffer cannot be allocated (MemoryError set). */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    const void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions in the decomposed string that were already merged into an
       earlier character and must be skipped when the main loop reaches
       them; `cskipped` is the number of valid entries. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data. */
      code = PyUnicode_READ(kind, data, i);
      if (LBase <= code && code < (LBase+LCount) &&
          i + 1 < len &&
          VBase <= PyUnicode_READ(kind, data, i+1) &&
          PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
          /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
             and V character is a modern vowel (0x1161 ~ 0x1175). */
          int LIndex, VIndex;
          LIndex = code - LBase;
          VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          if (i < len &&
              TBase < PyUnicode_READ(kind, data, i) &&
              PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
              /* check T character is a modern trailing consonant
                 (0x11A8 ~ 0x11C2). */
              code += PyUnicode_READ(kind, data, i)-TBase;
              i++;
          }
          output[o++] = code;
          continue;
      }

      /* code is still input[i] here */
      /* f is the row of `code` as a possible first character of a pair;
         -1 means it never starts a composition, so copy it through. */
      f = find_nfc_index(nfc_first, code);
      if (f == -1) {
          output[o++] = code;
          i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      /* comb is the combining class of the last non-combinable character
         seen after i, or 0 if there was none yet. */
      comb = 0;
      /* output base character for now; might be updated later. */
      output[o] = PyUnicode_READ(kind, data, i);
      while (i1 < len) {
          Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
          int comb1 = _getrecord_ex(code1)->combining;
          if (comb) {
              if (comb1 == 0)
                  break;
              if (comb >= comb1) {
                  /* Character is blocked. */
                  i1++;
                  continue;
              }
          }
          l = find_nfc_index(nfc_last, code1);
          /* i1 cannot be combined with i. If i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            /* Also entered by goto from below, when the pair has no
               composition in comp_data. */
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          /* Look up the composition of the pair (f, l) through the
             two-level comp_index/comp_data table; 0 means "none". */
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<<COMP_SHIFT)+
                           (index&((1<<COMP_SHIFT)-1))];
          if (code == 0)
              goto not_combinable;

          /* Replace the original character. */
          output[o] = code;
          /* Mark the second character unused. */
          assert(cskipped < 20);
          skipped[cskipped++] = i1;
          i1++;
          /* The composed character may itself start another pair. */
          f = find_nfc_index(nfc_first, output[o]);
          if (f == -1)
              break;
      }
      /* Output character was already written.
         Just advance the indices. */
      o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
783
// Outcome of the normalization quickcheck for one character or a whole
// string: certainly normalized (YES), undecided (MAYBE), or certainly not
// normalized (NO).  The numeric values are also the 2-bit codes stored in
// _PyUnicode_DatabaseRecord.normalization_quick_check.
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
787
788/* Run the Unicode normalization "quickcheck" algorithm.
789 *
790 * Return YES or NO if quickcheck determines the input is certainly
791 * normalized or certainly not, and MAYBE if quickcheck is unable to
792 * tell.
793 *
794 * If `yes_only` is true, then return MAYBE as soon as we determine
795 * the answer is not YES.
796 *
797 * For background and details on the algorithm, see UAX #15:
798 *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
799 */
800static QuickcheckResult
801is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
802                         bool yes_only)
803{
804    /* UCD 3.2.0 is requested, quickchecks must be disabled. */
805    if (UCD_Check(self)) {
806        return MAYBE;
807    }
808
809    if (PyUnicode_IS_ASCII(input)) {
810        return YES;
811    }
812
813    Py_ssize_t i, len;
814    int kind;
815    const void *data;
816    unsigned char prev_combining = 0;
817
818    /* The two quickcheck bits at this shift have type QuickcheckResult. */
819    int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
820
821    QuickcheckResult result = YES; /* certainly normalized, unless we find something */
822
823    i = 0;
824    kind = PyUnicode_KIND(input);
825    data = PyUnicode_DATA(input);
826    len = PyUnicode_GET_LENGTH(input);
827    while (i < len) {
828        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
829        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
830
831        unsigned char combining = record->combining;
832        if (combining && prev_combining > combining)
833            return NO; /* non-canonical sort order, not normalized */
834        prev_combining = combining;
835
836        unsigned char quickcheck_whole = record->normalization_quick_check;
837        if (yes_only) {
838            if (quickcheck_whole & (3 << quickcheck_shift))
839                return MAYBE;
840        } else {
841            switch ((quickcheck_whole >> quickcheck_shift) & 3) {
842            case NO:
843              return NO;
844            case MAYBE:
845              result = MAYBE; /* this string might need normalization */
846            }
847        }
848    }
849    return result;
850}
851
852/*[clinic input]
853unicodedata.UCD.is_normalized
854
855    self: self
856    form: unicode
857    unistr as input: unicode
858    /
859
860Return whether the Unicode string unistr is in the normal form 'form'.
861
862Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
863[clinic start generated code]*/
864
865static PyObject *
866unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
867                                   PyObject *input)
868/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
869{
870    if (PyUnicode_READY(input) == -1) {
871        return NULL;
872    }
873
874    if (PyUnicode_GET_LENGTH(input) == 0) {
875        /* special case empty input strings. */
876        Py_RETURN_TRUE;
877    }
878
879    PyObject *result;
880    bool nfc = false;
881    bool k = false;
882    QuickcheckResult m;
883
884    PyObject *cmp;
885    int match = 0;
886
887    if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
888        nfc = true;
889    }
890    else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
891        nfc = true;
892        k = true;
893    }
894    else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
895        /* matches default values for `nfc` and `k` */
896    }
897    else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
898        k = true;
899    }
900    else {
901        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
902        return NULL;
903    }
904
905    m = is_normalized_quickcheck(self, input, nfc, k, false);
906
907    if (m == MAYBE) {
908        cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
909        if (cmp == NULL) {
910            return NULL;
911        }
912        match = PyUnicode_Compare(input, cmp);
913        Py_DECREF(cmp);
914        result = (match == 0) ? Py_True : Py_False;
915    }
916    else {
917        result = (m == YES) ? Py_True : Py_False;
918    }
919
920    Py_INCREF(result);
921    return result;
922}
923
924
925/*[clinic input]
926unicodedata.UCD.normalize
927
928    self: self
929    form: unicode
930    unistr as input: unicode
931    /
932
933Return the normal form 'form' for the Unicode string unistr.
934
935Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
936[clinic start generated code]*/
937
938static PyObject *
939unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
940                               PyObject *input)
941/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
942{
943    if (PyUnicode_GET_LENGTH(input) == 0) {
944        /* Special case empty input strings, since resizing
945           them  later would cause internal errors. */
946        Py_INCREF(input);
947        return input;
948    }
949
950    if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
951        if (is_normalized_quickcheck(self, input,
952                                     true,  false, true) == YES) {
953            Py_INCREF(input);
954            return input;
955        }
956        return nfc_nfkc(self, input, 0);
957    }
958    if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
959        if (is_normalized_quickcheck(self, input,
960                                     true,  true,  true) == YES) {
961            Py_INCREF(input);
962            return input;
963        }
964        return nfc_nfkc(self, input, 1);
965    }
966    if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
967        if (is_normalized_quickcheck(self, input,
968                                     false, false, true) == YES) {
969            Py_INCREF(input);
970            return input;
971        }
972        return nfd_nfkd(self, input, 0);
973    }
974    if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
975        if (is_normalized_quickcheck(self, input,
976                                     false, true,  true) == YES) {
977            Py_INCREF(input);
978            return input;
979        }
980        return nfd_nfkd(self, input, 1);
981    }
982    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
983    return NULL;
984}
985
986/* -------------------------------------------------------------------- */
987/* unicode character name tables */
988
989/* data file generated by Tools/unicode/makeunicodedata.py */
990#include "unicodename_db.h"
991
992/* -------------------------------------------------------------------- */
993/* database code (cut and pasted from the unidb package) */
994
static unsigned long
_gethash(const char *s, int len, int scale)
{
    /* Case-insensitive hash of the first `len` bytes of `s`.  Each byte is
       upper-cased before being mixed in with multiplier `scale`; whenever
       bits 24-31 become set they are folded back into the low byte so the
       running value is reduced to 24 bits. */
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1009
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
 * Row i holds the fragment for index i of each component:
 *   column 0: leading part  (indexed by L),
 *   column 1: vowel part    (indexed by V),
 *   column 2: trailing part (indexed by T).
 * Columns 0 and 1 have fewer entries than column 2 and are padded
 * with NULL. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
1040
1041/* These ranges need to match makeunicodedata.py:cjk_ranges. */
1042static int
1043is_unified_ideograph(Py_UCS4 code)
1044{
1045    return
1046        (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
1047        (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
1048        (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1049        (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
1050        (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1051        (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1052        (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1053        (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
1054}
1055
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: [start, end).  The argument is parenthesized so that any
 * expression can be passed, but it is evaluated twice and therefore must
 * not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1061
1062static int
1063_getucname(PyObject *self,
1064           Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1065{
1066    /* Find the name associated with the given code point.
1067     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1068     * that we are using for aliases and named sequences. */
1069    int offset;
1070    int i;
1071    int word;
1072    const unsigned char* w;
1073
1074    if (code >= 0x110000)
1075        return 0;
1076
1077    /* XXX should we just skip all the code points in the PUAs here? */
1078    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1079        return 0;
1080
1081    if (UCD_Check(self)) {
1082        /* in 3.2.0 there are no aliases and named sequences */
1083        const change_record *old;
1084        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1085            return 0;
1086        old = get_old_record(self, code);
1087        if (old->category_changed == 0) {
1088            /* unassigned */
1089            return 0;
1090        }
1091    }
1092
1093    if (SBase <= code && code < SBase+SCount) {
1094        /* Hangul syllable. */
1095        int SIndex = code - SBase;
1096        int L = SIndex / NCount;
1097        int V = (SIndex % NCount) / TCount;
1098        int T = SIndex % TCount;
1099
1100        if (buflen < 27)
1101            /* Worst case: HANGUL SYLLABLE <10chars>. */
1102            return 0;
1103        strcpy(buffer, "HANGUL SYLLABLE ");
1104        buffer += 16;
1105        strcpy(buffer, hangul_syllables[L][0]);
1106        buffer += strlen(hangul_syllables[L][0]);
1107        strcpy(buffer, hangul_syllables[V][1]);
1108        buffer += strlen(hangul_syllables[V][1]);
1109        strcpy(buffer, hangul_syllables[T][2]);
1110        buffer += strlen(hangul_syllables[T][2]);
1111        *buffer = '\0';
1112        return 1;
1113    }
1114
1115    if (is_unified_ideograph(code)) {
1116        if (buflen < 28)
1117            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1118            return 0;
1119        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1120        return 1;
1121    }
1122
1123    /* get offset into phrasebook */
1124    offset = phrasebook_offset1[(code>>phrasebook_shift)];
1125    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1126                               (code&((1<<phrasebook_shift)-1))];
1127    if (!offset)
1128        return 0;
1129
1130    i = 0;
1131
1132    for (;;) {
1133        /* get word index */
1134        word = phrasebook[offset] - phrasebook_short;
1135        if (word >= 0) {
1136            word = (word << 8) + phrasebook[offset+1];
1137            offset += 2;
1138        } else
1139            word = phrasebook[offset++];
1140        if (i) {
1141            if (i > buflen)
1142                return 0; /* buffer overflow */
1143            buffer[i++] = ' ';
1144        }
1145        /* copy word string from lexicon.  the last character in the
1146           word has bit 7 set.  the last word in a string ends with
1147           0x80 */
1148        w = lexicon + lexicon_offset[word];
1149        while (*w < 128) {
1150            if (i >= buflen)
1151                return 0; /* buffer overflow */
1152            buffer[i++] = *w++;
1153        }
1154        if (i >= buflen)
1155            return 0; /* buffer overflow */
1156        buffer[i++] = *w & 127;
1157        if (*w == 128)
1158            break; /* end of word */
1159    }
1160
1161    return 1;
1162}
1163
/* Name lookup exported through the C API capsule (stored as the `getname`
   member by unicodedata_create_capi()).  Forwards to _getucname() with
   self == NULL, i.e. without a unicodedata.UCD instance, and returns its
   result unchanged. */
static int
capi_getucname(Py_UCS4 code,
               char* buffer, int buflen,
               int with_alias_and_seq)
{
    return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);

}
1172
1173static int
1174_cmpname(PyObject *self, int code, const char* name, int namelen)
1175{
1176    /* check if code corresponds to the given name */
1177    int i;
1178    char buffer[NAME_MAXLEN+1];
1179    if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1180        return 0;
1181    for (i = 0; i < namelen; i++) {
1182        if (Py_TOUPPER(name[i]) != buffer[i])
1183            return 0;
1184    }
1185    return buffer[namelen] == '\0';
1186}
1187
1188static void
1189find_syllable(const char *str, int *len, int *pos, int count, int column)
1190{
1191    int i, len1;
1192    *len = -1;
1193    for (i = 0; i < count; i++) {
1194        const char *s = hangul_syllables[i][column];
1195        len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1196        if (len1 <= *len)
1197            continue;
1198        if (strncmp(str, s, len1) == 0) {
1199            *len = len1;
1200            *pos = i;
1201        }
1202    }
1203    if (*len == -1) {
1204        *len = 0;
1205    }
1206}
1207
1208static int
1209_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1210{
1211    /* check if named sequences are allowed */
1212    if (!with_named_seq && IS_NAMED_SEQ(cp))
1213        return 0;
1214    /* if the code point is in the PUA range that we use for aliases,
1215     * convert it to obtain the right code point */
1216    if (IS_ALIAS(cp))
1217        *code = name_aliases[cp-aliases_start];
1218    else
1219        *code = cp;
1220    return 1;
1221}
1222
1223static int
1224_getcode(PyObject* self,
1225         const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1226{
1227    /* Return the code point associated with the given name.
1228     * Named aliases are resolved too (unless self != NULL (i.e. we are using
1229     * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
1230     * using for the named sequence, and the caller must then convert it. */
1231    unsigned int h, v;
1232    unsigned int mask = code_size-1;
1233    unsigned int i, incr;
1234
1235    /* Check for hangul syllables. */
1236    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1237        int len, L = -1, V = -1, T = -1;
1238        const char *pos = name + 16;
1239        find_syllable(pos, &len, &L, LCount, 0);
1240        pos += len;
1241        find_syllable(pos, &len, &V, VCount, 1);
1242        pos += len;
1243        find_syllable(pos, &len, &T, TCount, 2);
1244        pos += len;
1245        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1246            *code = SBase + (L*VCount+V)*TCount + T;
1247            return 1;
1248        }
1249        /* Otherwise, it's an illegal syllable name. */
1250        return 0;
1251    }
1252
1253    /* Check for unified ideographs. */
1254    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1255        /* Four or five hexdigits must follow. */
1256        v = 0;
1257        name += 22;
1258        namelen -= 22;
1259        if (namelen != 4 && namelen != 5)
1260            return 0;
1261        while (namelen--) {
1262            v *= 16;
1263            if (*name >= '0' && *name <= '9')
1264                v += *name - '0';
1265            else if (*name >= 'A' && *name <= 'F')
1266                v += *name - 'A' + 10;
1267            else
1268                return 0;
1269            name++;
1270        }
1271        if (!is_unified_ideograph(v))
1272            return 0;
1273        *code = v;
1274        return 1;
1275    }
1276
1277    /* the following is the same as python's dictionary lookup, with
1278       only minor changes.  see the makeunicodedata script for more
1279       details */
1280
1281    h = (unsigned int) _gethash(name, namelen, code_magic);
1282    i = (~h) & mask;
1283    v = code_hash[i];
1284    if (!v)
1285        return 0;
1286    if (_cmpname(self, v, name, namelen)) {
1287        return _check_alias_and_seq(v, code, with_named_seq);
1288    }
1289    incr = (h ^ (h >> 3)) & mask;
1290    if (!incr)
1291        incr = mask;
1292    for (;;) {
1293        i = (i + incr) & mask;
1294        v = code_hash[i];
1295        if (!v)
1296            return 0;
1297        if (_cmpname(self, v, name, namelen)) {
1298            return _check_alias_and_seq(v, code, with_named_seq);
1299        }
1300        incr = incr << 1;
1301        if (incr > mask)
1302            incr = incr ^ code_poly;
1303    }
1304}
1305
/* Code point lookup exported through the C API capsule (stored as the
   `getcode` member by unicodedata_create_capi()).  Forwards to _getcode()
   with self == NULL, i.e. without a unicodedata.UCD instance, and returns
   its result unchanged. */
static int
capi_getcode(const char* name, int namelen, Py_UCS4* code,
             int with_named_seq)
{
    return _getcode(NULL, name, namelen, code, with_named_seq);

}
1313
1314static void
1315unicodedata_destroy_capi(PyObject *capsule)
1316{
1317    void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1318    PyMem_Free(capi);
1319}
1320
1321static PyObject *
1322unicodedata_create_capi(void)
1323{
1324    _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1325    if (capi == NULL) {
1326        PyErr_NoMemory();
1327        return NULL;
1328    }
1329    capi->getname = capi_getucname;
1330    capi->getcode = capi_getcode;
1331
1332    PyObject *capsule = PyCapsule_New(capi,
1333                                      PyUnicodeData_CAPSULE_NAME,
1334                                      unicodedata_destroy_capi);
1335    if (capsule == NULL) {
1336        PyMem_Free(capi);
1337    }
1338    return capsule;
1339};
1340
1341
1342/* -------------------------------------------------------------------- */
1343/* Python bindings */
1344
1345/*[clinic input]
1346unicodedata.UCD.name
1347
1348    self: self
1349    chr: int(accept={str})
1350    default: object=NULL
1351    /
1352
1353Returns the name assigned to the character chr as a string.
1354
1355If no name is defined, default is returned, or, if not given,
1356ValueError is raised.
1357[clinic start generated code]*/
1358
1359static PyObject *
1360unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1361/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1362{
1363    char name[NAME_MAXLEN+1];
1364    Py_UCS4 c = (Py_UCS4)chr;
1365
1366    if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1367        if (default_value == NULL) {
1368            PyErr_SetString(PyExc_ValueError, "no such name");
1369            return NULL;
1370        }
1371        else {
1372            Py_INCREF(default_value);
1373            return default_value;
1374        }
1375    }
1376
1377    return PyUnicode_FromString(name);
1378}
1379
1380/*[clinic input]
1381unicodedata.UCD.lookup
1382
1383    self: self
1384    name: str(accept={str, robuffer}, zeroes=True)
1385    /
1386
1387Look up character by name.
1388
1389If a character with the given name is found, return the
1390corresponding character.  If not found, KeyError is raised.
1391[clinic start generated code]*/
1392
1393static PyObject *
1394unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1395                            Py_ssize_t name_length)
1396/*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1397{
1398    Py_UCS4 code;
1399    unsigned int index;
1400    if (name_length > NAME_MAXLEN) {
1401        PyErr_SetString(PyExc_KeyError, "name too long");
1402        return NULL;
1403    }
1404
1405    if (!_getcode(self, name, (int)name_length, &code, 1)) {
1406        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1407        return NULL;
1408    }
1409    /* check if code is in the PUA range that we use for named sequences
1410       and convert it */
1411    if (IS_NAMED_SEQ(code)) {
1412        index = code-named_sequences_start;
1413        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1414                                         named_sequences[index].seq,
1415                                         named_sequences[index].seqlen);
1416    }
1417    return PyUnicode_FromOrdinal(code);
1418}
1419
// List of functions used to define module functions *AND* unicodedata.UCD
// methods: the same table is referenced by the module definition
// (m_methods) and by the UCD type slots (Py_tp_methods).
// For module functions, self is the module. For UCD methods, self is a UCD
// instance. The UCD_Check() macro is used to check if self is a UCD
// instance.
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1440
/* tp_traverse slot of unicodedata.UCD: the only object reported to the
   garbage collector is the instance's type. */
static int
ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    return 0;
}
1447
/* tp_dealloc slot of unicodedata.UCD. */
static void
ucd_dealloc(PreviousDBVersion *self)
{
    /* Fetch the type first: it is still needed after self is freed. */
    PyTypeObject *tp = Py_TYPE(self);
    PyObject_GC_UnTrack(self);
    PyObject_GC_Del(self);
    /* Drop the reference to the type, which is created at runtime with
       PyType_FromSpec() (see unicodedata_exec). */
    Py_DECREF(tp);
}
1456
/* Slots of the unicodedata.UCD type (referenced by ucd_type_spec).  The
   methods are the same table that defines the module-level functions. */
static PyType_Slot ucd_type_slots[] = {
    {Py_tp_dealloc, ucd_dealloc},
    {Py_tp_traverse, ucd_traverse},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_methods, unicodedata_functions},
    {Py_tp_members, DB_members},
    {0, 0}      /* sentinel */
};
1465
/* Specification passed to PyType_FromSpec() in unicodedata_exec() to create
   the unicodedata.UCD type.  The flags make the type GC-aware, immutable
   and not instantiable from Python code. */
static PyType_Spec ucd_type_spec = {
    .name = "unicodedata.UCD",
    .basicsize = sizeof(PreviousDBVersion),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
              Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = ucd_type_slots
};
1473
/* Module docstring (used as m_doc of unicodedata_module).  The
   UNIDATA_VERSION string is inserted by string literal concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
1482
1483static int
1484unicodedata_exec(PyObject *module)
1485{
1486    if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1487        return -1;
1488    }
1489
1490    PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1491    if (ucd_type == NULL) {
1492        return -1;
1493    }
1494
1495    if (PyModule_AddType(module, ucd_type) < 0) {
1496        Py_DECREF(ucd_type);
1497        return -1;
1498    }
1499
1500    // Unicode database version 3.2.0 used by the IDNA encoding
1501    PyObject *v;
1502    v = new_previous_version(ucd_type, "3.2.0",
1503                             get_change_3_2_0, normalization_3_2_0);
1504    Py_DECREF(ucd_type);
1505    if (v == NULL) {
1506        return -1;
1507    }
1508    if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1509        Py_DECREF(v);
1510        return -1;
1511    }
1512
1513    /* Export C API */
1514    PyObject *capsule = unicodedata_create_capi();
1515    if (capsule == NULL) {
1516        return -1;
1517    }
1518    int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1519    Py_DECREF(capsule);
1520    if (rc < 0) {
1521        return -1;
1522    }
1523    return 0;
1524}
1525
/* Module slots: unicodedata_exec() fills in the module contents. */
static PyModuleDef_Slot unicodedata_slots[] = {
    {Py_mod_exec, unicodedata_exec},
    {0, NULL}   /* sentinel */
};
1530
/* Module definition.  The functions in unicodedata_functions become the
   module-level functions; the rest of the module is set up through
   unicodedata_slots. */
static struct PyModuleDef unicodedata_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "unicodedata",
    .m_doc = unicodedata_docstring,
    .m_size = 0,
    .m_methods = unicodedata_functions,
    .m_slots = unicodedata_slots,
};
1539
/* Module initialization function: hands the module definition to
   PyModuleDef_Init() and returns its result. */
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    return PyModuleDef_Init(&unicodedata_module);
}
1545
1546
1547/*
1548Local variables:
1549c-basic-offset: 4
1550indent-tabs-mode: nil
1551End:
1552*/
1553