1/* ------------------------------------------------------------------------ 2 3 unicodedata -- Provides access to the Unicode database. 4 5 The current version number is reported in the unidata_version constant. 6 7 Written by Marc-Andre Lemburg (mal@lemburg.com). 8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 9 Modified by Martin v. Löwis (martin@v.loewis.de) 10 11 Copyright (c) Corporation for National Research Initiatives. 12 13 ------------------------------------------------------------------------ */ 14 15#ifndef Py_BUILD_CORE_BUILTIN 16# define Py_BUILD_CORE_MODULE 1 17#endif 18 19#define PY_SSIZE_T_CLEAN 20 21#include "Python.h" 22#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI 23#include "structmember.h" // PyMemberDef 24 25#include <stdbool.h> 26 27/*[clinic input] 28module unicodedata 29class unicodedata.UCD 'PreviousDBVersion *' '<not used>' 30[clinic start generated code]*/ 31/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/ 32 33/* character properties */ 34 35typedef struct { 36 const unsigned char category; /* index into 37 _PyUnicode_CategoryNames */ 38 const unsigned char combining; /* combining class value 0 - 255 */ 39 const unsigned char bidirectional; /* index into 40 _PyUnicode_BidirectionalNames */ 41 const unsigned char mirrored; /* true if mirrored in bidir mode */ 42 const unsigned char east_asian_width; /* index into 43 _PyUnicode_EastAsianWidth */ 44 const unsigned char normalization_quick_check; /* see is_normalized() */ 45} _PyUnicode_DatabaseRecord; 46 47typedef struct change_record { 48 /* sequence of fields should be the same as in merge_old_version */ 49 const unsigned char bidir_changed; 50 const unsigned char category_changed; 51 const unsigned char decimal_changed; 52 const unsigned char mirrored_changed; 53 const unsigned char east_asian_width_changed; 54 const double numeric_changed; 55} change_record; 56 57/* data file generated by Tools/unicode/makeunicodedata.py */ 58#include "unicodedata_db.h" 59 60static const _PyUnicode_DatabaseRecord* 61_getrecord_ex(Py_UCS4 code) 62{ 63 int index; 64 if (code >= 0x110000) 65 index = 0; 66 else { 67 index = index1[(code>>SHIFT)]; 68 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 69 } 70 71 return &_PyUnicode_Database_Records[index]; 72} 73 74/* ------------- Previous-version API ------------------------------------- */ 75typedef struct previous_version { 76 PyObject_HEAD 77 const char *name; 78 const change_record* (*getrecord)(Py_UCS4); 79 Py_UCS4 (*normalization)(Py_UCS4); 80} PreviousDBVersion; 81 82#include "clinic/unicodedata.c.h" 83 84#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) 85 86static PyMemberDef DB_members[] = { 87 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, 88 {NULL} 89}; 90 91// Check if self is an unicodedata.UCD instance. 92// If self is NULL (when the PyCapsule C API is used), return 0. 93// PyModule_Check() is used to avoid having to retrieve the ucd_type. 94// See unicodedata_functions comment to the rationale of this macro. 95#define UCD_Check(self) (self != NULL && !PyModule_Check(self)) 96 97static PyObject* 98new_previous_version(PyTypeObject *ucd_type, 99 const char*name, const change_record* (*getrecord)(Py_UCS4), 100 Py_UCS4 (*normalization)(Py_UCS4)) 101{ 102 PreviousDBVersion *self; 103 self = PyObject_GC_New(PreviousDBVersion, ucd_type); 104 if (self == NULL) 105 return NULL; 106 self->name = name; 107 self->getrecord = getrecord; 108 self->normalization = normalization; 109 PyObject_GC_Track(self); 110 return (PyObject*)self; 111} 112 113 114/* --- Module API --------------------------------------------------------- */ 115 116/*[clinic input] 117unicodedata.UCD.decimal 118 119 self: self 120 chr: int(accept={str}) 121 default: object=NULL 122 / 123 124Converts a Unicode character into its equivalent decimal value. 125 126Returns the decimal value assigned to the character chr as integer. 127If no such value is defined, default is returned, or, if not given, 128ValueError is raised. 129[clinic start generated code]*/ 130 131static PyObject * 132unicodedata_UCD_decimal_impl(PyObject *self, int chr, 133 PyObject *default_value) 134/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/ 135{ 136 int have_old = 0; 137 long rc; 138 Py_UCS4 c = (Py_UCS4)chr; 139 140 if (UCD_Check(self)) { 141 const change_record *old = get_old_record(self, c); 142 if (old->category_changed == 0) { 143 /* unassigned */ 144 have_old = 1; 145 rc = -1; 146 } 147 else if (old->decimal_changed != 0xFF) { 148 have_old = 1; 149 rc = old->decimal_changed; 150 } 151 } 152 153 if (!have_old) 154 rc = Py_UNICODE_TODECIMAL(c); 155 if (rc < 0) { 156 if (default_value == NULL) { 157 PyErr_SetString(PyExc_ValueError, 158 "not a decimal"); 159 return NULL; 160 } 161 else { 162 Py_INCREF(default_value); 163 return default_value; 164 } 165 } 166 return PyLong_FromLong(rc); 167} 168 169/*[clinic input] 170unicodedata.UCD.digit 171 172 self: self 173 chr: int(accept={str}) 174 default: object=NULL 175 / 176 177Converts a Unicode character into its equivalent digit value. 178 179Returns the digit value assigned to the character chr as integer. 180If no such value is defined, default is returned, or, if not given, 181ValueError is raised. 182[clinic start generated code]*/ 183 184static PyObject * 185unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value) 186/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/ 187{ 188 long rc; 189 Py_UCS4 c = (Py_UCS4)chr; 190 rc = Py_UNICODE_TODIGIT(c); 191 if (rc < 0) { 192 if (default_value == NULL) { 193 PyErr_SetString(PyExc_ValueError, "not a digit"); 194 return NULL; 195 } 196 else { 197 Py_INCREF(default_value); 198 return default_value; 199 } 200 } 201 return PyLong_FromLong(rc); 202} 203 204/*[clinic input] 205unicodedata.UCD.numeric 206 207 self: self 208 chr: int(accept={str}) 209 default: object=NULL 210 / 211 212Converts a Unicode character into its equivalent numeric value. 213 214Returns the numeric value assigned to the character chr as float. 215If no such value is defined, default is returned, or, if not given, 216ValueError is raised. 217[clinic start generated code]*/ 218 219static PyObject * 220unicodedata_UCD_numeric_impl(PyObject *self, int chr, 221 PyObject *default_value) 222/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/ 223{ 224 int have_old = 0; 225 double rc; 226 Py_UCS4 c = (Py_UCS4)chr; 227 228 if (UCD_Check(self)) { 229 const change_record *old = get_old_record(self, c); 230 if (old->category_changed == 0) { 231 /* unassigned */ 232 have_old = 1; 233 rc = -1.0; 234 } 235 else if (old->decimal_changed != 0xFF) { 236 have_old = 1; 237 rc = old->decimal_changed; 238 } 239 } 240 241 if (!have_old) 242 rc = Py_UNICODE_TONUMERIC(c); 243 if (rc == -1.0) { 244 if (default_value == NULL) { 245 PyErr_SetString(PyExc_ValueError, "not a numeric character"); 246 return NULL; 247 } 248 else { 249 Py_INCREF(default_value); 250 return default_value; 251 } 252 } 253 return PyFloat_FromDouble(rc); 254} 255 256/*[clinic input] 257unicodedata.UCD.category 258 259 self: self 260 chr: int(accept={str}) 261 / 262 263Returns the general category assigned to the character chr as string. 264[clinic start generated code]*/ 265 266static PyObject * 267unicodedata_UCD_category_impl(PyObject *self, int chr) 268/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/ 269{ 270 int index; 271 Py_UCS4 c = (Py_UCS4)chr; 272 index = (int) _getrecord_ex(c)->category; 273 if (UCD_Check(self)) { 274 const change_record *old = get_old_record(self, c); 275 if (old->category_changed != 0xFF) 276 index = old->category_changed; 277 } 278 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); 279} 280 281/*[clinic input] 282unicodedata.UCD.bidirectional 283 284 self: self 285 chr: int(accept={str}) 286 / 287 288Returns the bidirectional class assigned to the character chr as string. 289 290If no such value is defined, an empty string is returned. 291[clinic start generated code]*/ 292 293static PyObject * 294unicodedata_UCD_bidirectional_impl(PyObject *self, int chr) 295/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/ 296{ 297 int index; 298 Py_UCS4 c = (Py_UCS4)chr; 299 index = (int) _getrecord_ex(c)->bidirectional; 300 if (UCD_Check(self)) { 301 const change_record *old = get_old_record(self, c); 302 if (old->category_changed == 0) 303 index = 0; /* unassigned */ 304 else if (old->bidir_changed != 0xFF) 305 index = old->bidir_changed; 306 } 307 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); 308} 309 310/*[clinic input] 311unicodedata.UCD.combining -> int 312 313 self: self 314 chr: int(accept={str}) 315 / 316 317Returns the canonical combining class assigned to the character chr as integer. 318 319Returns 0 if no combining class is defined. 320[clinic start generated code]*/ 321 322static int 323unicodedata_UCD_combining_impl(PyObject *self, int chr) 324/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/ 325{ 326 int index; 327 Py_UCS4 c = (Py_UCS4)chr; 328 index = (int) _getrecord_ex(c)->combining; 329 if (UCD_Check(self)) { 330 const change_record *old = get_old_record(self, c); 331 if (old->category_changed == 0) 332 index = 0; /* unassigned */ 333 } 334 return index; 335} 336 337/*[clinic input] 338unicodedata.UCD.mirrored -> int 339 340 self: self 341 chr: int(accept={str}) 342 / 343 344Returns the mirrored property assigned to the character chr as integer. 345 346Returns 1 if the character has been identified as a "mirrored" 347character in bidirectional text, 0 otherwise. 348[clinic start generated code]*/ 349 350static int 351unicodedata_UCD_mirrored_impl(PyObject *self, int chr) 352/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/ 353{ 354 int index; 355 Py_UCS4 c = (Py_UCS4)chr; 356 index = (int) _getrecord_ex(c)->mirrored; 357 if (UCD_Check(self)) { 358 const change_record *old = get_old_record(self, c); 359 if (old->category_changed == 0) 360 index = 0; /* unassigned */ 361 else if (old->mirrored_changed != 0xFF) 362 index = old->mirrored_changed; 363 } 364 return index; 365} 366 367/*[clinic input] 368unicodedata.UCD.east_asian_width 369 370 self: self 371 chr: int(accept={str}) 372 / 373 374Returns the east asian width assigned to the character chr as string. 375[clinic start generated code]*/ 376 377static PyObject * 378unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr) 379/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/ 380{ 381 int index; 382 Py_UCS4 c = (Py_UCS4)chr; 383 index = (int) _getrecord_ex(c)->east_asian_width; 384 if (UCD_Check(self)) { 385 const change_record *old = get_old_record(self, c); 386 if (old->category_changed == 0) 387 index = 0; /* unassigned */ 388 else if (old->east_asian_width_changed != 0xFF) 389 index = old->east_asian_width_changed; 390 } 391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); 392} 393 394/*[clinic input] 395unicodedata.UCD.decomposition 396 397 self: self 398 chr: int(accept={str}) 399 / 400 401Returns the character decomposition mapping assigned to the character chr as string. 402 403An empty string is returned in case no such mapping is defined. 404[clinic start generated code]*/ 405 406static PyObject * 407unicodedata_UCD_decomposition_impl(PyObject *self, int chr) 408/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/ 409{ 410 char decomp[256]; 411 int code, index, count; 412 size_t i; 413 unsigned int prefix_index; 414 Py_UCS4 c = (Py_UCS4)chr; 415 416 code = (int)c; 417 418 if (UCD_Check(self)) { 419 const change_record *old = get_old_record(self, c); 420 if (old->category_changed == 0) 421 return PyUnicode_FromString(""); /* unassigned */ 422 } 423 424 if (code < 0 || code >= 0x110000) 425 index = 0; 426 else { 427 index = decomp_index1[(code>>DECOMP_SHIFT)]; 428 index = decomp_index2[(index<<DECOMP_SHIFT)+ 429 (code&((1<<DECOMP_SHIFT)-1))]; 430 } 431 432 /* high byte is number of hex bytes (usually one or two), low byte 433 is prefix code (from*/ 434 count = decomp_data[index] >> 8; 435 436 /* XXX: could allocate the PyString up front instead 437 (strlen(prefix) + 5 * count + 1 bytes) */ 438 439 /* Based on how index is calculated above and decomp_data is generated 440 from Tools/unicode/makeunicodedata.py, it should not be possible 441 to overflow decomp_prefix. */ 442 prefix_index = decomp_data[index] & 255; 443 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix)); 444 445 /* copy prefix */ 446 i = strlen(decomp_prefix[prefix_index]); 447 memcpy(decomp, decomp_prefix[prefix_index], i); 448 449 while (count-- > 0) { 450 if (i) 451 decomp[i++] = ' '; 452 assert(i < sizeof(decomp)); 453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", 454 decomp_data[++index]); 455 i += strlen(decomp + i); 456 } 457 return PyUnicode_FromStringAndSize(decomp, i); 458} 459 460static void 461get_decomp_record(PyObject *self, Py_UCS4 code, 462 int *index, int *prefix, int *count) 463{ 464 if (code >= 0x110000) { 465 *index = 0; 466 } 467 else if (UCD_Check(self) 468 && get_old_record(self, code)->category_changed==0) { 469 /* unassigned in old version */ 470 *index = 0; 471 } 472 else { 473 *index = decomp_index1[(code>>DECOMP_SHIFT)]; 474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+ 475 (code&((1<<DECOMP_SHIFT)-1))]; 476 } 477 478 /* high byte is number of hex bytes (usually one or two), low byte 479 is prefix code (from*/ 480 *count = decomp_data[*index] >> 8; 481 *prefix = decomp_data[*index] & 255; 482 483 (*index)++; 484} 485 486#define SBase 0xAC00 487#define LBase 0x1100 488#define VBase 0x1161 489#define TBase 0x11A7 490#define LCount 19 491#define VCount 21 492#define TCount 28 493#define NCount (VCount*TCount) 494#define SCount (LCount*NCount) 495 496static PyObject* 497nfd_nfkd(PyObject *self, PyObject *input, int k) 498{ 499 PyObject *result; 500 Py_UCS4 *output; 501 Py_ssize_t i, o, osize; 502 int kind; 503 const void *data; 504 /* Longest decomposition in Unicode 3.2: U+FDFA */ 505 Py_UCS4 stack[20]; 506 Py_ssize_t space, isize; 507 int index, prefix, count, stackptr; 508 unsigned char prev, cur; 509 510 stackptr = 0; 511 isize = PyUnicode_GET_LENGTH(input); 512 space = isize; 513 /* Overallocate at most 10 characters. */ 514 if (space > 10) { 515 if (space <= PY_SSIZE_T_MAX - 10) 516 space += 10; 517 } 518 else { 519 space *= 2; 520 } 521 osize = space; 522 output = PyMem_NEW(Py_UCS4, space); 523 if (!output) { 524 PyErr_NoMemory(); 525 return NULL; 526 } 527 i = o = 0; 528 kind = PyUnicode_KIND(input); 529 data = PyUnicode_DATA(input); 530 531 while (i < isize) { 532 stack[stackptr++] = PyUnicode_READ(kind, data, i++); 533 while(stackptr) { 534 Py_UCS4 code = stack[--stackptr]; 535 /* Hangul Decomposition adds three characters in 536 a single step, so we need at least that much room. */ 537 if (space < 3) { 538 Py_UCS4 *new_output; 539 osize += 10; 540 space += 10; 541 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4)); 542 if (new_output == NULL) { 543 PyMem_Free(output); 544 PyErr_NoMemory(); 545 return NULL; 546 } 547 output = new_output; 548 } 549 /* Hangul Decomposition. */ 550 if (SBase <= code && code < (SBase+SCount)) { 551 int SIndex = code - SBase; 552 int L = LBase + SIndex / NCount; 553 int V = VBase + (SIndex % NCount) / TCount; 554 int T = TBase + SIndex % TCount; 555 output[o++] = L; 556 output[o++] = V; 557 space -= 2; 558 if (T != TBase) { 559 output[o++] = T; 560 space --; 561 } 562 continue; 563 } 564 /* normalization changes */ 565 if (UCD_Check(self)) { 566 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); 567 if (value != 0) { 568 stack[stackptr++] = value; 569 continue; 570 } 571 } 572 573 /* Other decompositions. */ 574 get_decomp_record(self, code, &index, &prefix, &count); 575 576 /* Copy character if it is not decomposable, or has a 577 compatibility decomposition, but we do NFD. */ 578 if (!count || (prefix && !k)) { 579 output[o++] = code; 580 space--; 581 continue; 582 } 583 /* Copy decomposition onto the stack, in reverse 584 order. */ 585 while(count) { 586 code = decomp_data[index + (--count)]; 587 stack[stackptr++] = code; 588 } 589 } 590 } 591 592 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, 593 output, o); 594 PyMem_Free(output); 595 if (!result) 596 return NULL; 597 /* result is guaranteed to be ready, as it is compact. */ 598 kind = PyUnicode_KIND(result); 599 data = PyUnicode_DATA(result); 600 601 /* Sort canonically. */ 602 i = 0; 603 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; 604 for (i++; i < PyUnicode_GET_LENGTH(result); i++) { 605 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; 606 if (prev == 0 || cur == 0 || prev <= cur) { 607 prev = cur; 608 continue; 609 } 610 /* Non-canonical order. Need to switch *i with previous. */ 611 o = i - 1; 612 while (1) { 613 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1); 614 PyUnicode_WRITE(kind, data, o+1, 615 PyUnicode_READ(kind, data, o)); 616 PyUnicode_WRITE(kind, data, o, tmp); 617 o--; 618 if (o < 0) 619 break; 620 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining; 621 if (prev == 0 || prev <= cur) 622 break; 623 } 624 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; 625 } 626 return result; 627} 628 629static int 630find_nfc_index(const struct reindex* nfc, Py_UCS4 code) 631{ 632 unsigned int index; 633 for (index = 0; nfc[index].start; index++) { 634 unsigned int start = nfc[index].start; 635 if (code < start) 636 return -1; 637 if (code <= start + nfc[index].count) { 638 unsigned int delta = code - start; 639 return nfc[index].index + delta; 640 } 641 } 642 return -1; 643} 644 645static PyObject* 646nfc_nfkc(PyObject *self, PyObject *input, int k) 647{ 648 PyObject *result; 649 int kind; 650 const void *data; 651 Py_UCS4 *output; 652 Py_ssize_t i, i1, o, len; 653 int f,l,index,index1,comb; 654 Py_UCS4 code; 655 Py_ssize_t skipped[20]; 656 int cskipped = 0; 657 658 result = nfd_nfkd(self, input, k); 659 if (!result) 660 return NULL; 661 /* result will be "ready". */ 662 kind = PyUnicode_KIND(result); 663 data = PyUnicode_DATA(result); 664 len = PyUnicode_GET_LENGTH(result); 665 666 /* We allocate a buffer for the output. 667 If we find that we made no changes, we still return 668 the NFD result. */ 669 output = PyMem_NEW(Py_UCS4, len); 670 if (!output) { 671 PyErr_NoMemory(); 672 Py_DECREF(result); 673 return 0; 674 } 675 i = o = 0; 676 677 again: 678 while (i < len) { 679 for (index = 0; index < cskipped; index++) { 680 if (skipped[index] == i) { 681 /* *i character is skipped. 682 Remove from list. */ 683 skipped[index] = skipped[cskipped-1]; 684 cskipped--; 685 i++; 686 goto again; /* continue while */ 687 } 688 } 689 /* Hangul Composition. We don't need to check for <LV,T> 690 pairs, since we always have decomposed data. */ 691 code = PyUnicode_READ(kind, data, i); 692 if (LBase <= code && code < (LBase+LCount) && 693 i + 1 < len && 694 VBase <= PyUnicode_READ(kind, data, i+1) && 695 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) { 696 /* check L character is a modern leading consonant (0x1100 ~ 0x1112) 697 and V character is a modern vowel (0x1161 ~ 0x1175). */ 698 int LIndex, VIndex; 699 LIndex = code - LBase; 700 VIndex = PyUnicode_READ(kind, data, i+1) - VBase; 701 code = SBase + (LIndex*VCount+VIndex)*TCount; 702 i+=2; 703 if (i < len && 704 TBase < PyUnicode_READ(kind, data, i) && 705 PyUnicode_READ(kind, data, i) < (TBase+TCount)) { 706 /* check T character is a modern trailing consonant 707 (0x11A8 ~ 0x11C2). */ 708 code += PyUnicode_READ(kind, data, i)-TBase; 709 i++; 710 } 711 output[o++] = code; 712 continue; 713 } 714 715 /* code is still input[i] here */ 716 f = find_nfc_index(nfc_first, code); 717 if (f == -1) { 718 output[o++] = code; 719 i++; 720 continue; 721 } 722 /* Find next unblocked character. */ 723 i1 = i+1; 724 comb = 0; 725 /* output base character for now; might be updated later. */ 726 output[o] = PyUnicode_READ(kind, data, i); 727 while (i1 < len) { 728 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1); 729 int comb1 = _getrecord_ex(code1)->combining; 730 if (comb) { 731 if (comb1 == 0) 732 break; 733 if (comb >= comb1) { 734 /* Character is blocked. */ 735 i1++; 736 continue; 737 } 738 } 739 l = find_nfc_index(nfc_last, code1); 740 /* i1 cannot be combined with i. If i1 741 is a starter, we don't need to look further. 742 Otherwise, record the combining class. */ 743 if (l == -1) { 744 not_combinable: 745 if (comb1 == 0) 746 break; 747 comb = comb1; 748 i1++; 749 continue; 750 } 751 index = f*TOTAL_LAST + l; 752 index1 = comp_index[index >> COMP_SHIFT]; 753 code = comp_data[(index1<<COMP_SHIFT)+ 754 (index&((1<<COMP_SHIFT)-1))]; 755 if (code == 0) 756 goto not_combinable; 757 758 /* Replace the original character. */ 759 output[o] = code; 760 /* Mark the second character unused. */ 761 assert(cskipped < 20); 762 skipped[cskipped++] = i1; 763 i1++; 764 f = find_nfc_index(nfc_first, output[o]); 765 if (f == -1) 766 break; 767 } 768 /* Output character was already written. 769 Just advance the indices. */ 770 o++; i++; 771 } 772 if (o == len) { 773 /* No changes. Return original string. */ 774 PyMem_Free(output); 775 return result; 776 } 777 Py_DECREF(result); 778 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, 779 output, o); 780 PyMem_Free(output); 781 return result; 782} 783 784// This needs to match the logic in makeunicodedata.py 785// which constructs the quickcheck data. 786typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; 787 788/* Run the Unicode normalization "quickcheck" algorithm. 789 * 790 * Return YES or NO if quickcheck determines the input is certainly 791 * normalized or certainly not, and MAYBE if quickcheck is unable to 792 * tell. 793 * 794 * If `yes_only` is true, then return MAYBE as soon as we determine 795 * the answer is not YES. 796 * 797 * For background and details on the algorithm, see UAX #15: 798 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms 799 */ 800static QuickcheckResult 801is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k, 802 bool yes_only) 803{ 804 /* UCD 3.2.0 is requested, quickchecks must be disabled. */ 805 if (UCD_Check(self)) { 806 return MAYBE; 807 } 808 809 if (PyUnicode_IS_ASCII(input)) { 810 return YES; 811 } 812 813 Py_ssize_t i, len; 814 int kind; 815 const void *data; 816 unsigned char prev_combining = 0; 817 818 /* The two quickcheck bits at this shift have type QuickcheckResult. */ 819 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); 820 821 QuickcheckResult result = YES; /* certainly normalized, unless we find something */ 822 823 i = 0; 824 kind = PyUnicode_KIND(input); 825 data = PyUnicode_DATA(input); 826 len = PyUnicode_GET_LENGTH(input); 827 while (i < len) { 828 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 829 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); 830 831 unsigned char combining = record->combining; 832 if (combining && prev_combining > combining) 833 return NO; /* non-canonical sort order, not normalized */ 834 prev_combining = combining; 835 836 unsigned char quickcheck_whole = record->normalization_quick_check; 837 if (yes_only) { 838 if (quickcheck_whole & (3 << quickcheck_shift)) 839 return MAYBE; 840 } else { 841 switch ((quickcheck_whole >> quickcheck_shift) & 3) { 842 case NO: 843 return NO; 844 case MAYBE: 845 result = MAYBE; /* this string might need normalization */ 846 } 847 } 848 } 849 return result; 850} 851 852/*[clinic input] 853unicodedata.UCD.is_normalized 854 855 self: self 856 form: unicode 857 unistr as input: unicode 858 / 859 860Return whether the Unicode string unistr is in the normal form 'form'. 861 862Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 863[clinic start generated code]*/ 864 865static PyObject * 866unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, 867 PyObject *input) 868/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ 869{ 870 if (PyUnicode_READY(input) == -1) { 871 return NULL; 872 } 873 874 if (PyUnicode_GET_LENGTH(input) == 0) { 875 /* special case empty input strings. */ 876 Py_RETURN_TRUE; 877 } 878 879 PyObject *result; 880 bool nfc = false; 881 bool k = false; 882 QuickcheckResult m; 883 884 PyObject *cmp; 885 int match = 0; 886 887 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) { 888 nfc = true; 889 } 890 else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) { 891 nfc = true; 892 k = true; 893 } 894 else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) { 895 /* matches default values for `nfc` and `k` */ 896 } 897 else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) { 898 k = true; 899 } 900 else { 901 PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 902 return NULL; 903 } 904 905 m = is_normalized_quickcheck(self, input, nfc, k, false); 906 907 if (m == MAYBE) { 908 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); 909 if (cmp == NULL) { 910 return NULL; 911 } 912 match = PyUnicode_Compare(input, cmp); 913 Py_DECREF(cmp); 914 result = (match == 0) ? Py_True : Py_False; 915 } 916 else { 917 result = (m == YES) ? Py_True : Py_False; 918 } 919 920 Py_INCREF(result); 921 return result; 922} 923 924 925/*[clinic input] 926unicodedata.UCD.normalize 927 928 self: self 929 form: unicode 930 unistr as input: unicode 931 / 932 933Return the normal form 'form' for the Unicode string unistr. 934 935Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 936[clinic start generated code]*/ 937 938static PyObject * 939unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, 940 PyObject *input) 941/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ 942{ 943 if (PyUnicode_GET_LENGTH(input) == 0) { 944 /* Special case empty input strings, since resizing 945 them later would cause internal errors. */ 946 Py_INCREF(input); 947 return input; 948 } 949 950 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) { 951 if (is_normalized_quickcheck(self, input, 952 true, false, true) == YES) { 953 Py_INCREF(input); 954 return input; 955 } 956 return nfc_nfkc(self, input, 0); 957 } 958 if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) { 959 if (is_normalized_quickcheck(self, input, 960 true, true, true) == YES) { 961 Py_INCREF(input); 962 return input; 963 } 964 return nfc_nfkc(self, input, 1); 965 } 966 if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) { 967 if (is_normalized_quickcheck(self, input, 968 false, false, true) == YES) { 969 Py_INCREF(input); 970 return input; 971 } 972 return nfd_nfkd(self, input, 0); 973 } 974 if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) { 975 if (is_normalized_quickcheck(self, input, 976 false, true, true) == YES) { 977 Py_INCREF(input); 978 return input; 979 } 980 return nfd_nfkd(self, input, 1); 981 } 982 PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 983 return NULL; 984} 985 986/* -------------------------------------------------------------------- */ 987/* unicode character name tables */ 988 989/* data file generated by Tools/unicode/makeunicodedata.py */ 990#include "unicodename_db.h" 991 992/* -------------------------------------------------------------------- */ 993/* database code (cut and pasted from the unidb package) */ 994 995static unsigned long 996_gethash(const char *s, int len, int scale) 997{ 998 int i; 999 unsigned long h = 0; 1000 unsigned long ix; 1001 for (i = 0; i < len; i++) { 1002 h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]); 1003 ix = h & 0xff000000; 1004 if (ix) 1005 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; 1006 } 1007 return h; 1008} 1009 1010static const char * const hangul_syllables[][3] = { 1011 { "G", "A", "" }, 1012 { "GG", "AE", "G" }, 1013 { "N", "YA", "GG" }, 1014 { "D", "YAE", "GS" }, 1015 { "DD", "EO", "N", }, 1016 { "R", "E", "NJ" }, 1017 { "M", "YEO", "NH" }, 1018 { "B", "YE", "D" }, 1019 { "BB", "O", "L" }, 1020 { "S", "WA", "LG" }, 1021 { "SS", "WAE", "LM" }, 1022 { "", "OE", "LB" }, 1023 { "J", "YO", "LS" }, 1024 { "JJ", "U", "LT" }, 1025 { "C", "WEO", "LP" }, 1026 { "K", "WE", "LH" }, 1027 { "T", "WI", "M" }, 1028 { "P", "YU", "B" }, 1029 { "H", "EU", "BS" }, 1030 { 0, "YI", "S" }, 1031 { 0, "I", "SS" }, 1032 { 0, 0, "NG" }, 1033 { 0, 0, "J" }, 1034 { 0, 0, "C" }, 1035 { 0, 0, "K" }, 1036 { 0, 0, "T" }, 1037 { 0, 0, "P" }, 1038 { 0, 0, "H" } 1039}; 1040 1041/* These ranges need to match makeunicodedata.py:cjk_ranges. */ 1042static int 1043is_unified_ideograph(Py_UCS4 code) 1044{ 1045 return 1046 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ 1047 (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */ 1048 (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */ 1049 (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */ 1050 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ 1051 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */ 1052 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ 1053 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */ 1054} 1055 1056/* macros used to determine if the given code point is in the PUA range that 1057 * we are using to store aliases and named sequences */ 1058#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end)) 1059#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \ 1060 (cp < named_sequences_end)) 1061 1062static int 1063_getucname(PyObject *self, 1064 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq) 1065{ 1066 /* Find the name associated with the given code point. 1067 * If with_alias_and_seq is 1, check for names in the Private Use Area 15 1068 * that we are using for aliases and named sequences. */ 1069 int offset; 1070 int i; 1071 int word; 1072 const unsigned char* w; 1073 1074 if (code >= 0x110000) 1075 return 0; 1076 1077 /* XXX should we just skip all the code points in the PUAs here? */ 1078 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) 1079 return 0; 1080 1081 if (UCD_Check(self)) { 1082 /* in 3.2.0 there are no aliases and named sequences */ 1083 const change_record *old; 1084 if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) 1085 return 0; 1086 old = get_old_record(self, code); 1087 if (old->category_changed == 0) { 1088 /* unassigned */ 1089 return 0; 1090 } 1091 } 1092 1093 if (SBase <= code && code < SBase+SCount) { 1094 /* Hangul syllable. */ 1095 int SIndex = code - SBase; 1096 int L = SIndex / NCount; 1097 int V = (SIndex % NCount) / TCount; 1098 int T = SIndex % TCount; 1099 1100 if (buflen < 27) 1101 /* Worst case: HANGUL SYLLABLE <10chars>. */ 1102 return 0; 1103 strcpy(buffer, "HANGUL SYLLABLE "); 1104 buffer += 16; 1105 strcpy(buffer, hangul_syllables[L][0]); 1106 buffer += strlen(hangul_syllables[L][0]); 1107 strcpy(buffer, hangul_syllables[V][1]); 1108 buffer += strlen(hangul_syllables[V][1]); 1109 strcpy(buffer, hangul_syllables[T][2]); 1110 buffer += strlen(hangul_syllables[T][2]); 1111 *buffer = '\0'; 1112 return 1; 1113 } 1114 1115 if (is_unified_ideograph(code)) { 1116 if (buflen < 28) 1117 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ 1118 return 0; 1119 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); 1120 return 1; 1121 } 1122 1123 /* get offset into phrasebook */ 1124 offset = phrasebook_offset1[(code>>phrasebook_shift)]; 1125 offset = phrasebook_offset2[(offset<<phrasebook_shift) + 1126 (code&((1<<phrasebook_shift)-1))]; 1127 if (!offset) 1128 return 0; 1129 1130 i = 0; 1131 1132 for (;;) { 1133 /* get word index */ 1134 word = phrasebook[offset] - phrasebook_short; 1135 if (word >= 0) { 1136 word = (word << 8) + phrasebook[offset+1]; 1137 offset += 2; 1138 } else 1139 word = phrasebook[offset++]; 1140 if (i) { 1141 if (i > buflen) 1142 return 0; /* buffer overflow */ 1143 buffer[i++] = ' '; 1144 } 1145 /* copy word string from lexicon. the last character in the 1146 word has bit 7 set. the last word in a string ends with 1147 0x80 */ 1148 w = lexicon + lexicon_offset[word]; 1149 while (*w < 128) { 1150 if (i >= buflen) 1151 return 0; /* buffer overflow */ 1152 buffer[i++] = *w++; 1153 } 1154 if (i >= buflen) 1155 return 0; /* buffer overflow */ 1156 buffer[i++] = *w & 127; 1157 if (*w == 128) 1158 break; /* end of word */ 1159 } 1160 1161 return 1; 1162} 1163 1164static int 1165capi_getucname(Py_UCS4 code, 1166 char* buffer, int buflen, 1167 int with_alias_and_seq) 1168{ 1169 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq); 1170 1171} 1172 1173static int 1174_cmpname(PyObject *self, int code, const char* name, int namelen) 1175{ 1176 /* check if code corresponds to the given name */ 1177 int i; 1178 char buffer[NAME_MAXLEN+1]; 1179 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1)) 1180 return 0; 1181 for (i = 0; i < namelen; i++) { 1182 if (Py_TOUPPER(name[i]) != buffer[i]) 1183 return 0; 1184 } 1185 return buffer[namelen] == '\0'; 1186} 1187 1188static void 1189find_syllable(const char *str, int *len, int *pos, int count, int column) 1190{ 1191 int i, len1; 1192 *len = -1; 1193 for (i = 0; i < count; i++) { 1194 const char *s = hangul_syllables[i][column]; 1195 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int); 1196 if (len1 <= *len) 1197 continue; 1198 if (strncmp(str, s, len1) == 0) { 1199 *len = len1; 1200 *pos = i; 1201 } 1202 } 1203 if (*len == -1) { 1204 *len = 0; 1205 } 1206} 1207 1208static int 1209_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) 1210{ 1211 /* check if named sequences are allowed */ 1212 if (!with_named_seq && IS_NAMED_SEQ(cp)) 1213 return 0; 1214 /* if the code point is in the PUA range that we use for aliases, 1215 * convert it to obtain the right code point */ 1216 if (IS_ALIAS(cp)) 1217 *code = name_aliases[cp-aliases_start]; 1218 else 1219 *code = cp; 1220 return 1; 1221} 1222 1223static int 1224_getcode(PyObject* self, 1225 const char* name, int namelen, Py_UCS4* code, int with_named_seq) 1226{ 1227 /* Return the code point associated with the given name. 1228 * Named aliases are resolved too (unless self != NULL (i.e. we are using 1229 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are 1230 * using for the named sequence, and the caller must then convert it. */ 1231 unsigned int h, v; 1232 unsigned int mask = code_size-1; 1233 unsigned int i, incr; 1234 1235 /* Check for hangul syllables. */ 1236 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { 1237 int len, L = -1, V = -1, T = -1; 1238 const char *pos = name + 16; 1239 find_syllable(pos, &len, &L, LCount, 0); 1240 pos += len; 1241 find_syllable(pos, &len, &V, VCount, 1); 1242 pos += len; 1243 find_syllable(pos, &len, &T, TCount, 2); 1244 pos += len; 1245 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { 1246 *code = SBase + (L*VCount+V)*TCount + T; 1247 return 1; 1248 } 1249 /* Otherwise, it's an illegal syllable name. */ 1250 return 0; 1251 } 1252 1253 /* Check for unified ideographs. */ 1254 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { 1255 /* Four or five hexdigits must follow. */ 1256 v = 0; 1257 name += 22; 1258 namelen -= 22; 1259 if (namelen != 4 && namelen != 5) 1260 return 0; 1261 while (namelen--) { 1262 v *= 16; 1263 if (*name >= '0' && *name <= '9') 1264 v += *name - '0'; 1265 else if (*name >= 'A' && *name <= 'F') 1266 v += *name - 'A' + 10; 1267 else 1268 return 0; 1269 name++; 1270 } 1271 if (!is_unified_ideograph(v)) 1272 return 0; 1273 *code = v; 1274 return 1; 1275 } 1276 1277 /* the following is the same as python's dictionary lookup, with 1278 only minor changes. see the makeunicodedata script for more 1279 details */ 1280 1281 h = (unsigned int) _gethash(name, namelen, code_magic); 1282 i = (~h) & mask; 1283 v = code_hash[i]; 1284 if (!v) 1285 return 0; 1286 if (_cmpname(self, v, name, namelen)) { 1287 return _check_alias_and_seq(v, code, with_named_seq); 1288 } 1289 incr = (h ^ (h >> 3)) & mask; 1290 if (!incr) 1291 incr = mask; 1292 for (;;) { 1293 i = (i + incr) & mask; 1294 v = code_hash[i]; 1295 if (!v) 1296 return 0; 1297 if (_cmpname(self, v, name, namelen)) { 1298 return _check_alias_and_seq(v, code, with_named_seq); 1299 } 1300 incr = incr << 1; 1301 if (incr > mask) 1302 incr = incr ^ code_poly; 1303 } 1304} 1305 1306static int 1307capi_getcode(const char* name, int namelen, Py_UCS4* code, 1308 int with_named_seq) 1309{ 1310 return _getcode(NULL, name, namelen, code, with_named_seq); 1311 1312} 1313 1314static void 1315unicodedata_destroy_capi(PyObject *capsule) 1316{ 1317 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME); 1318 PyMem_Free(capi); 1319} 1320 1321static PyObject * 1322unicodedata_create_capi(void) 1323{ 1324 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI)); 1325 if (capi == NULL) { 1326 PyErr_NoMemory(); 1327 return NULL; 1328 } 1329 capi->getname = capi_getucname; 1330 capi->getcode = capi_getcode; 1331 1332 PyObject *capsule = PyCapsule_New(capi, 1333 PyUnicodeData_CAPSULE_NAME, 1334 unicodedata_destroy_capi); 1335 if (capsule == NULL) { 1336 PyMem_Free(capi); 1337 } 1338 return capsule; 1339}; 1340 1341 1342/* -------------------------------------------------------------------- */ 1343/* Python bindings */ 1344 1345/*[clinic input] 1346unicodedata.UCD.name 1347 1348 self: self 1349 chr: int(accept={str}) 1350 default: object=NULL 1351 / 1352 1353Returns the name assigned to the character chr as a string. 1354 1355If no name is defined, default is returned, or, if not given, 1356ValueError is raised. 1357[clinic start generated code]*/ 1358 1359static PyObject * 1360unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) 1361/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/ 1362{ 1363 char name[NAME_MAXLEN+1]; 1364 Py_UCS4 c = (Py_UCS4)chr; 1365 1366 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) { 1367 if (default_value == NULL) { 1368 PyErr_SetString(PyExc_ValueError, "no such name"); 1369 return NULL; 1370 } 1371 else { 1372 Py_INCREF(default_value); 1373 return default_value; 1374 } 1375 } 1376 1377 return PyUnicode_FromString(name); 1378} 1379 1380/*[clinic input] 1381unicodedata.UCD.lookup 1382 1383 self: self 1384 name: str(accept={str, robuffer}, zeroes=True) 1385 / 1386 1387Look up character by name. 1388 1389If a character with the given name is found, return the 1390corresponding character. If not found, KeyError is raised. 1391[clinic start generated code]*/ 1392 1393static PyObject * 1394unicodedata_UCD_lookup_impl(PyObject *self, const char *name, 1395 Py_ssize_t name_length) 1396/*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/ 1397{ 1398 Py_UCS4 code; 1399 unsigned int index; 1400 if (name_length > NAME_MAXLEN) { 1401 PyErr_SetString(PyExc_KeyError, "name too long"); 1402 return NULL; 1403 } 1404 1405 if (!_getcode(self, name, (int)name_length, &code, 1)) { 1406 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); 1407 return NULL; 1408 } 1409 /* check if code is in the PUA range that we use for named sequences 1410 and convert it */ 1411 if (IS_NAMED_SEQ(code)) { 1412 index = code-named_sequences_start; 1413 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, 1414 named_sequences[index].seq, 1415 named_sequences[index].seqlen); 1416 } 1417 return PyUnicode_FromOrdinal(code); 1418} 1419 1420// List of functions used to define module functions *AND* unicodedata.UCD 1421// methods. For module functions, self is the module. For UCD methods, self 1422// is an UCD instance. The UCD_Check() macro is used to check if self is 1423// an UCD instance. 1424static PyMethodDef unicodedata_functions[] = { 1425 UNICODEDATA_UCD_DECIMAL_METHODDEF 1426 UNICODEDATA_UCD_DIGIT_METHODDEF 1427 UNICODEDATA_UCD_NUMERIC_METHODDEF 1428 UNICODEDATA_UCD_CATEGORY_METHODDEF 1429 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF 1430 UNICODEDATA_UCD_COMBINING_METHODDEF 1431 UNICODEDATA_UCD_MIRRORED_METHODDEF 1432 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF 1433 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF 1434 UNICODEDATA_UCD_NAME_METHODDEF 1435 UNICODEDATA_UCD_LOOKUP_METHODDEF 1436 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF 1437 UNICODEDATA_UCD_NORMALIZE_METHODDEF 1438 {NULL, NULL} /* sentinel */ 1439}; 1440 1441static int 1442ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg) 1443{ 1444 Py_VISIT(Py_TYPE(self)); 1445 return 0; 1446} 1447 1448static void 1449ucd_dealloc(PreviousDBVersion *self) 1450{ 1451 PyTypeObject *tp = Py_TYPE(self); 1452 PyObject_GC_UnTrack(self); 1453 PyObject_GC_Del(self); 1454 Py_DECREF(tp); 1455} 1456 1457static PyType_Slot ucd_type_slots[] = { 1458 {Py_tp_dealloc, ucd_dealloc}, 1459 {Py_tp_traverse, ucd_traverse}, 1460 {Py_tp_getattro, PyObject_GenericGetAttr}, 1461 {Py_tp_methods, unicodedata_functions}, 1462 {Py_tp_members, DB_members}, 1463 {0, 0} 1464}; 1465 1466static PyType_Spec ucd_type_spec = { 1467 .name = "unicodedata.UCD", 1468 .basicsize = sizeof(PreviousDBVersion), 1469 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | 1470 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE), 1471 .slots = ucd_type_slots 1472}; 1473 1474PyDoc_STRVAR(unicodedata_docstring, 1475"This module provides access to the Unicode Character Database which\n\ 1476defines character properties for all Unicode characters. The data in\n\ 1477this database is based on the UnicodeData.txt file version\n\ 1478" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\ 1479\n\ 1480The module uses the same names and symbols as defined by the\n\ 1481UnicodeData File Format " UNIDATA_VERSION "."); 1482 1483static int 1484unicodedata_exec(PyObject *module) 1485{ 1486 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) { 1487 return -1; 1488 } 1489 1490 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec); 1491 if (ucd_type == NULL) { 1492 return -1; 1493 } 1494 1495 if (PyModule_AddType(module, ucd_type) < 0) { 1496 Py_DECREF(ucd_type); 1497 return -1; 1498 } 1499 1500 // Unicode database version 3.2.0 used by the IDNA encoding 1501 PyObject *v; 1502 v = new_previous_version(ucd_type, "3.2.0", 1503 get_change_3_2_0, normalization_3_2_0); 1504 Py_DECREF(ucd_type); 1505 if (v == NULL) { 1506 return -1; 1507 } 1508 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) { 1509 Py_DECREF(v); 1510 return -1; 1511 } 1512 1513 /* Export C API */ 1514 PyObject *capsule = unicodedata_create_capi(); 1515 if (capsule == NULL) { 1516 return -1; 1517 } 1518 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule); 1519 Py_DECREF(capsule); 1520 if (rc < 0) { 1521 return -1; 1522 } 1523 return 0; 1524} 1525 1526static PyModuleDef_Slot unicodedata_slots[] = { 1527 {Py_mod_exec, unicodedata_exec}, 1528 {0, NULL} 1529}; 1530 1531static struct PyModuleDef unicodedata_module = { 1532 PyModuleDef_HEAD_INIT, 1533 .m_name = "unicodedata", 1534 .m_doc = unicodedata_docstring, 1535 .m_size = 0, 1536 .m_methods = unicodedata_functions, 1537 .m_slots = unicodedata_slots, 1538}; 1539 1540PyMODINIT_FUNC 1541PyInit_unicodedata(void) 1542{ 1543 return PyModuleDef_Init(&unicodedata_module); 1544} 1545 1546 1547/* 1548Local variables: 1549c-basic-offset: 4 1550indent-tabs-mode: nil 1551End: 1552*/ 1553