1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 2000-2016, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: genuca.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created at the end of XX century 16* created by: Vladimir Weinstein, 17* modified in 2013-2014 by Markus Scherer 18* 19* This program reads the Fractional UCA table and generates 20* internal format for UCA table as well as inverse UCA table. 21* It then writes the ucadata.icu binary file containing the data. 22*/ 23 24#define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 25 26#include <stdio.h> 27#include <stdint.h> 28#include "unicode/utypes.h" 29#include "unicode/errorcode.h" 30#include "unicode/localpointer.h" 31#include "unicode/ucol.h" 32#include "unicode/uscript.h" 33#include "unicode/utf8.h" 34#include "charstr.h" 35#include "cmemory.h" 36#include "collation.h" 37#include "collationbasedatabuilder.h" 38#include "collationdata.h" 39#include "collationdatabuilder.h" 40#include "collationdatareader.h" 41#include "collationdatawriter.h" 42#include "collationinfo.h" 43#include "collationrootelements.h" 44#include "collationruleparser.h" 45#include "collationtailoring.h" 46#include "cstring.h" 47#include "normalizer2impl.h" 48#include "toolutil.h" 49#include "unewdata.h" 50#include "uoptions.h" 51#include "uparse.h" 52#include "writesrc.h" 53 54#if UCONFIG_NO_COLLATION 55 56extern "C" int 57main(int argc, char* argv[]) { 58 (void)argc; 59 (void)argv; 60 return 1; 61} 62 63#else 64 65U_NAMESPACE_USE 66 67enum HanOrderValue { 68 HAN_NO_ORDER = -1, 69 HAN_IMPLICIT, 70 HAN_RADICAL_STROKE 71}; 72 73static UBool beVerbose=false, withCopyright=true, icu4xMode=false; 74 75static HanOrderValue hanOrder = HAN_NO_ORDER; 76 77static UVersionInfo UCAVersion={ 0, 0, 0, 0 }; 78 79static UDataInfo ucaDataInfo={ 80 sizeof(UDataInfo), 81 0, 82 83 U_IS_BIG_ENDIAN, 84 U_CHARSET_FAMILY, 85 U_SIZEOF_UCHAR, 86 0, 87 88 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" 89 { 5, 0, 0, 0 }, // formatVersion 90 { 6, 3, 0, 0 } // dataVersion 91}; 92 93static char *skipWhiteSpace(char *s) { 94 while(*s == ' ' || *s == '\t') { ++s; } 95 return s; 96} 97 98static int32_t hex2num(char hex) { 99 if(hex>='0' && hex <='9') { 100 return hex-'0'; 101 } else if(hex>='a' && hex<='f') { 102 return hex-'a'+10; 103 } else if(hex>='A' && hex<='F') { 104 return hex-'A'+10; 105 } else { 106 return -1; 107 } 108} 109 110static uint32_t parseWeight(char *&s, const char *separators, 111 int32_t maxBytes, UErrorCode &errorCode) { 112 if(U_FAILURE(errorCode)) { return 0; } 113 uint32_t weight = 0; 114 int32_t numBytes = 0; 115 for(;;) { 116 // Check one character after another, so that we don't just run over a 00. 117 int32_t nibble1, nibble2; 118 if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) { 119 // Stop when we find something other than a pair of hex digits. 120 break; 121 } 122 if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) { 123 // Too many bytes, or a 00 or 01 byte which is illegal inside a weight. 124 errorCode = U_INVALID_FORMAT_ERROR; 125 return 0; 126 } 127 weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2; 128 ++numBytes; 129 s += 2; 130 if(*s != ' ') { 131 break; 132 } 133 ++s; 134 } 135 char c = *s; 136 if(c == 0 || strchr(separators, c) == NULL) { 137 errorCode = U_INVALID_FORMAT_ERROR; 138 return 0; 139 } 140 // numBytes==0 is ok, for example in [,,] or [, 82, 05] 141 // Left-align the weight. 142 while(numBytes < 4) { 143 weight <<= 8; 144 ++numBytes; 145 } 146 return weight; 147} 148 149/** 150 * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10]. 151 * Stop with an error, or else with the pointer s after the closing bracket. 152 */ 153static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) { 154 if(U_FAILURE(errorCode)) { return 0; } 155 ++s; // skip over the '[' 156 if(s[0] == 'U' && s[1] == '+') { 157 // Read a code point and look up its CE. 158 // We use this especially for implicit primary weights, 159 // so that we can use different algorithms in the FractionalUCA.txt 160 // generator and the parser. 161 // The generator may not even need to compute any implicit primaries at all. 162 s += 2; 163 char *end; 164 unsigned long longCp = uprv_strtoul(s, &end, 16); 165 if(end == s || longCp > 0x10ffff) { 166 errorCode = U_INVALID_FORMAT_ERROR; 167 return 0; 168 } 169 UChar32 c = (UChar32)longCp; 170 int64_t ce = builder.getSingleCE(c, errorCode); 171 if(U_FAILURE(errorCode)) { return 0; } 172 s = end; 173 if(*s == ']') { // [U+4E00] 174 ++s; 175 return ce; 176 } 177 if(*s != ',') { 178 errorCode = U_INVALID_FORMAT_ERROR; 179 return 0; 180 } 181 // Parse the following, secondary or tertiary weight. 182 s = skipWhiteSpace(s + 1); 183 uint32_t w = parseWeight(s, ",]", 2, errorCode); 184 if(U_FAILURE(errorCode)) { return 0; } 185 if(*s == ']') { // [U+4E00, 10] 186 ++s; 187 // Set the tertiary weight to w. 188 return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16); 189 } 190 // Set the secondary weight to w: [U+9F9C, 70, 20] 191 ce = (ce & INT64_C(0xffffffff00000000)) | w; 192 // Parse and set the tertiary weight. 193 s = skipWhiteSpace(s + 1); 194 w = parseWeight(s, "]", 2, errorCode); 195 ++s; 196 return ce | (w >> 16); 197 } else { 198 uint32_t p = parseWeight(s, ",", 4, errorCode); 199 if(U_FAILURE(errorCode)) { return 0; } 200 int64_t ce = (int64_t)p << 32; 201 s = skipWhiteSpace(s + 1); 202 uint32_t w = parseWeight(s, ",", 2, errorCode); 203 if(U_FAILURE(errorCode)) { return 0; } 204 ce |= w; 205 s = skipWhiteSpace(s + 1); 206 w = parseWeight(s, "]", 2, errorCode); 207 ++s; 208 return ce | (w >> 16); 209 } 210} 211 212namespace { 213 214// Cached, lazy-init mapping from scripts to sample characters. 215UChar32 sampleChars[USCRIPT_CODE_LIMIT] = { U_SENTINEL }; 216 217} 218 219// Hardcoded mapping from script sample characters to script codes. 220// Pro: Available without complete and updated UCD scripts data, 221// easy to add non-script codes specific to collation. 222// Con: Needs manual update for each new script or change in sample character. 223static const struct { 224 UChar32 sampleChar; 225 int32_t script; 226} sampleCharsToScripts[] = { 227 { 0x00A0, UCOL_REORDER_CODE_SPACE }, 228 { 0x201C, UCOL_REORDER_CODE_PUNCTUATION }, 229 { 0x263A, UCOL_REORDER_CODE_SYMBOL }, 230 { 0x20AC, UCOL_REORDER_CODE_CURRENCY }, 231 { 0x0034, UCOL_REORDER_CODE_DIGIT }, 232 { 0x004C, USCRIPT_LATIN }, 233 { 0x03A9, USCRIPT_GREEK }, 234 { 0x03E2, USCRIPT_COPTIC }, 235 { 0x042F, USCRIPT_CYRILLIC }, 236 { 0x2C00, USCRIPT_GLAGOLITIC }, 237 { 0x1036B, USCRIPT_OLD_PERMIC }, 238 { 0x10D3, USCRIPT_GEORGIAN }, 239 { 0x0531, USCRIPT_ARMENIAN }, 240 { 0x05D0, USCRIPT_HEBREW }, 241 { 0x10900, USCRIPT_PHOENICIAN }, 242 { 0x0800, USCRIPT_SAMARITAN }, 243 { 0x0628, USCRIPT_ARABIC }, 244 { 0x0710, USCRIPT_SYRIAC }, 245 { 0x0840, USCRIPT_MANDAIC }, 246 { 0x078C, USCRIPT_THAANA }, 247 { 0x07CA, USCRIPT_NKO }, 248 { 0x07D8, USCRIPT_NKO }, 249 { 0x2D30, USCRIPT_TIFINAGH }, 250 { 0x2D5E, USCRIPT_TIFINAGH }, 251 { 0x12A0, USCRIPT_ETHIOPIC }, 252 { 0x0905, USCRIPT_DEVANAGARI }, 253 { 0x0995, USCRIPT_BENGALI }, 254 { 0x0A15, USCRIPT_GURMUKHI }, 255 { 0x0A95, USCRIPT_GUJARATI }, 256 { 0x0B15, USCRIPT_ORIYA }, 257 { 0x0B95, USCRIPT_TAMIL }, 258 { 0x0C15, USCRIPT_TELUGU }, 259 { 0x0C95, USCRIPT_KANNADA }, 260 { 0x0D15, USCRIPT_MALAYALAM }, 261 { 0x0D85, USCRIPT_SINHALA }, 262 { 0xABC0, USCRIPT_MEITEI_MAYEK }, 263 { 0xA800, USCRIPT_SYLOTI_NAGRI }, 264 { 0xA882, USCRIPT_SAURASHTRA }, 265 { 0x11083, USCRIPT_KAITHI }, 266 { 0x11152, USCRIPT_MAHAJANI }, 267 { 0x11183, USCRIPT_SHARADA }, 268 { 0x11208, USCRIPT_KHOJKI }, 269 { 0x112BE, USCRIPT_KHUDAWADI }, 270 { 0x1128F, USCRIPT_MULTANI }, 271 { 0x11315, USCRIPT_GRANTHA }, 272 { 0x11412, USCRIPT_NEWA }, 273 { 0x11484, USCRIPT_TIRHUTA }, 274 { 0x1158E, USCRIPT_SIDDHAM }, 275 { 0x1160E, USCRIPT_MODI }, 276 { 0x11680, USCRIPT_TAKRI }, 277 { 0x1180B, USCRIPT_DOGRA }, 278 { 0x11717, USCRIPT_AHOM }, 279 { 0x11D71, USCRIPT_GUNJALA_GONDI }, 280 { 0x1B83, USCRIPT_SUNDANESE }, 281 { 0x11005, USCRIPT_BRAHMI }, 282 { 0x10A00, USCRIPT_KHAROSHTHI }, 283 { 0x11C0E, USCRIPT_BHAIKSUKI }, 284 { 0x0E17, USCRIPT_THAI }, 285 { 0x0EA5, USCRIPT_LAO }, 286 { 0xAA80, USCRIPT_TAI_VIET }, 287 { 0x0F40, USCRIPT_TIBETAN }, 288 { 0x11C72, USCRIPT_MARCHEN }, 289 { 0x1C00, USCRIPT_LEPCHA }, 290 { 0xA840, USCRIPT_PHAGS_PA }, 291 { 0x1900, USCRIPT_LIMBU }, 292 { 0x1703, USCRIPT_TAGALOG }, 293 { 0x1723, USCRIPT_HANUNOO }, 294 { 0x1743, USCRIPT_BUHID }, 295 { 0x1763, USCRIPT_TAGBANWA }, 296 { 0x1A00, USCRIPT_BUGINESE }, 297 { 0x11EE5, USCRIPT_MAKASAR }, 298 { 0x1BC0, USCRIPT_BATAK }, 299 { 0xA930, USCRIPT_REJANG }, 300 { 0xA90A, USCRIPT_KAYAH_LI }, 301 { 0x1000, USCRIPT_MYANMAR }, 302 { 0x10D12, USCRIPT_HANIFI_ROHINGYA }, 303 { 0x11103, USCRIPT_CHAKMA }, 304 { 0x1780, USCRIPT_KHMER }, 305 { 0x1950, USCRIPT_TAI_LE }, 306 { 0x1980, USCRIPT_NEW_TAI_LUE }, 307 { 0x1A20, USCRIPT_LANNA }, 308 { 0xAA00, USCRIPT_CHAM }, 309 { 0x1B05, USCRIPT_BALINESE }, 310 { 0xA984, USCRIPT_JAVANESE }, 311 { 0x1826, USCRIPT_MONGOLIAN }, 312 { 0x1C5A, USCRIPT_OL_CHIKI }, 313 { 0x13C4, USCRIPT_CHEROKEE }, 314 { 0x104B5, USCRIPT_OSAGE }, 315 { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL }, 316 { 0x168F, USCRIPT_OGHAM }, 317 { 0x16A0, USCRIPT_RUNIC }, 318 { 0x10CA1, USCRIPT_OLD_HUNGARIAN }, 319 { 0x10C00, USCRIPT_ORKHON }, 320 { 0xA549, USCRIPT_VAI }, 321 { 0xA6A0, USCRIPT_BAMUM }, 322 { 0x16AE6, USCRIPT_BASSA_VAH }, 323 { 0x1E802, USCRIPT_MENDE }, 324 { 0x16E40, USCRIPT_MEDEFAIDRIN }, 325 { 0x1E909, USCRIPT_ADLAM, }, 326 { 0xAC00, USCRIPT_HANGUL }, 327 { 0x304B, USCRIPT_HIRAGANA }, 328 { 0x30AB, USCRIPT_KATAKANA }, 329 { 0x3105, USCRIPT_BOPOMOFO }, 330 { 0xA288, USCRIPT_YI }, 331 { 0xA4D0, USCRIPT_LISU }, 332 { 0xA4E8, USCRIPT_LISU }, 333 { 0x16F00, USCRIPT_MIAO }, 334 { 0x118B4, USCRIPT_WARANG_CITI }, 335 { 0x11AC0, USCRIPT_PAU_CIN_HAU }, 336 { 0x16B1C, USCRIPT_PAHAWH_HMONG }, 337 { 0x10280, USCRIPT_LYCIAN }, 338 { 0x102A0, USCRIPT_CARIAN }, 339 { 0x102B7, USCRIPT_CARIAN }, 340 { 0x10920, USCRIPT_LYDIAN }, 341 { 0x10300, USCRIPT_OLD_ITALIC }, 342 { 0x10308, USCRIPT_OLD_ITALIC }, 343 { 0x10330, USCRIPT_GOTHIC }, 344 { 0x10414, USCRIPT_DESERET }, 345 { 0x10450, USCRIPT_SHAVIAN }, 346 { 0x1BC20, USCRIPT_DUPLOYAN }, 347 { 0x10480, USCRIPT_OSMANYA }, 348 { 0x10500, USCRIPT_ELBASAN }, 349 { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN }, 350 { 0x110D0, USCRIPT_SORA_SOMPENG }, 351 { 0x16A4F, USCRIPT_MRO }, 352 { 0x10000, USCRIPT_LINEAR_B }, 353 { 0x10647, USCRIPT_LINEAR_A }, 354 { 0x10800, USCRIPT_CYPRIOT }, 355 { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN }, 356 { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN }, 357 { 0x10B00, USCRIPT_AVESTAN }, 358 { 0x10873, USCRIPT_PALMYRENE }, 359 { 0x10896, USCRIPT_NABATAEAN }, 360 { 0x108F4, USCRIPT_HATRAN }, 361 { 0x10840, USCRIPT_IMPERIAL_ARAMAIC }, 362 { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN }, 363 { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI }, 364 { 0x10B8F, USCRIPT_PSALTER_PAHLAVI }, 365 { 0x10AC1, USCRIPT_MANICHAEAN }, 366 { 0x10AD8, USCRIPT_MANICHAEAN }, 367 { 0x10F19, USCRIPT_OLD_SOGDIAN }, 368 { 0x10F42, USCRIPT_SOGDIAN }, 369 { 0x10380, USCRIPT_UGARITIC }, 370 { 0x103A0, USCRIPT_OLD_PERSIAN }, 371 { 0x12000, USCRIPT_CUNEIFORM }, 372 { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS }, 373 { 0x109A0, USCRIPT_MEROITIC_CURSIVE }, 374 { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS }, 375 { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS }, 376 { 0x18229, USCRIPT_TANGUT }, 377 { 0x5B57, USCRIPT_HAN }, 378 { 0x11D10, USCRIPT_MASARAM_GONDI }, 379 { 0x11A0B, USCRIPT_ZANABAZAR_SQUARE }, 380 { 0x11A5C, USCRIPT_SOYOMBO }, 381 { 0x1B1C4, USCRIPT_NUSHU }, 382 { 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights 383}; 384 385static int32_t getCharScript(UChar32 c) { 386 if (sampleChars[0] < 0) { 387 // Lazy-init the script->sample cache. 388 for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) { 389 UnicodeString sample = uscript_getSampleUnicodeString((UScriptCode)script); 390 if (sample.isEmpty() || sample.hasMoreChar32Than(0, INT32_MAX, 1)) { 391 sampleChars[script] = U_SENTINEL; 392 } else { 393 sampleChars[script] = sample.char32At(0); 394 } 395 } 396 } 397 for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) { 398 if (c == sampleChars[script]) { 399 return script; 400 } 401 } 402 for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) { 403 if(c == sampleCharsToScripts[i].sampleChar) { 404 return sampleCharsToScripts[i].script; 405 } 406 } 407 return USCRIPT_INVALID_CODE; // -1 408} 409 410/** 411 * Maps Unified_Ideograph's to primary CEs in the given order of ranges. 412 */ 413class HanOrder { 414public: 415 HanOrder(UErrorCode &errorCode) : ranges(errorCode), set(), done(false) {} 416 417 void addRange(UChar32 start, UChar32 end, UErrorCode &errorCode) { 418 int32_t length = ranges.size(); 419 if(length > 0 && (ranges.elementAti(length - 1) + 1) == start) { 420 // The previous range end is just before this range start: Merge adjacent ranges. 421 ranges.setElementAt(end, length - 1); 422 } else { 423 ranges.addElement(start, errorCode); 424 ranges.addElement(end, errorCode); 425 } 426 set.add(start, end); 427 } 428 429 void setBuilderHanOrder(CollationBaseDataBuilder &builder, UErrorCode &errorCode) { 430 if(U_FAILURE(errorCode)) { return; } 431 builder.initHanRanges(ranges.getBuffer(), ranges.size(), errorCode); 432 done = true; 433 } 434 435 void setDone() { 436 done = true; 437 } 438 439 UBool isDone() { return done; } 440 441 const UnicodeSet &getSet() { return set; } 442 443private: 444 UVector32 ranges; 445 UnicodeSet set; 446 UBool done; 447}; 448 449static HanOrder *implicitHanOrder = NULL; 450static HanOrder *radicalStrokeOrder = NULL; 451 452enum ActionType { 453 READCE, 454 READPRIMARY, 455 READBYTE, 456 READUNIFIEDIDEOGRAPH, 457 READRADICAL, 458 READUCAVERSION, 459 READLEADBYTETOSCRIPTS, 460 IGNORE 461}; 462 463static struct { 464 const char *const name; 465 int64_t value; 466 const ActionType what_to_do; 467} vt[] = { 468 {"[first tertiary ignorable", 0, IGNORE}, 469 {"[last tertiary ignorable", 0, IGNORE}, 470 {"[first secondary ignorable", 0, READCE}, 471 {"[last secondary ignorable", 0, READCE}, 472 {"[first primary ignorable", 0, READCE}, 473 {"[last primary ignorable", 0, READCE}, 474 {"[first variable", 0, READCE}, 475 {"[last variable", 0, READCE}, 476 {"[first regular", 0, READCE}, 477 {"[last regular", 0, READCE}, 478 {"[first implicit", 0, READCE}, 479 {"[last implicit", 0, READCE}, 480 {"[first trailing", 0, READCE}, 481 {"[last trailing", 0, READCE}, 482 483 {"[Unified_Ideograph", 0, READUNIFIEDIDEOGRAPH}, 484 {"[radical", 0, READRADICAL}, 485 486 {"[fixed first implicit byte", 0, IGNORE}, 487 {"[fixed last implicit byte", 0, IGNORE}, 488 {"[fixed first trail byte", 0, IGNORE}, 489 {"[fixed last trail byte", 0, IGNORE}, 490 {"[fixed first special byte", 0, IGNORE}, 491 {"[fixed last special byte", 0, IGNORE}, 492 {"[fixed secondary common byte", 0, READBYTE}, 493 {"[fixed last secondary common byte", 0, READBYTE}, 494 {"[fixed first ignorable secondary byte", 0, READBYTE}, 495 {"[fixed tertiary common byte", 0, READBYTE}, 496 {"[fixed first ignorable tertiary byte", 0, READBYTE}, 497 {"[variable top = ", 0, IGNORE}, 498 {"[UCA version = ", 0, READUCAVERSION}, 499 {"[top_byte", 0, READLEADBYTETOSCRIPTS}, 500 {"[reorderingTokens", 0, IGNORE}, 501 {"[categories", 0, IGNORE}, 502 {"[first tertiary in secondary non-ignorable", 0, IGNORE}, 503 {"[last tertiary in secondary non-ignorable", 0, IGNORE}, 504 {"[first secondary in primary non-ignorable", 0, IGNORE}, 505 {"[last secondary in primary non-ignorable", 0, IGNORE}, 506}; 507 508static int64_t getOptionValue(const char *name) { 509 for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) { 510 if(uprv_strcmp(name, vt[i].name) == 0) { 511 return vt[i].value; 512 } 513 } 514 return 0; 515} 516 517static void readAnOption( 518 CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) { 519 for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) { 520 int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name); 521 if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) { 522 ActionType what_to_do = vt[cnt].what_to_do; 523 char *pointer = skipWhiteSpace(buffer + vtLen); 524 if (what_to_do == IGNORE) { //vt[cnt].what_to_do == IGNORE 525 return; 526 } else if (what_to_do == READCE) { 527 vt[cnt].value = parseCE(builder, pointer, *status); 528 if(U_SUCCESS(*status) && *pointer != ']') { 529 *status = U_INVALID_FORMAT_ERROR; 530 } 531 if(U_FAILURE(*status)) { 532 fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer); 533 return; 534 } 535 } else if(what_to_do == READPRIMARY) { 536 vt[cnt].value = parseWeight(pointer, "]", 4, *status); 537 if(U_FAILURE(*status)) { 538 fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer); 539 return; 540 } 541 } else if(what_to_do == READBYTE) { 542 vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24; 543 if(U_FAILURE(*status)) { 544 fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer); 545 return; 546 } 547 } else if(what_to_do == READUNIFIEDIDEOGRAPH) { 548 if(implicitHanOrder != NULL) { 549 fprintf(stderr, "duplicate [Unified_Ideograph] lines\n"); 550 *status = U_INVALID_FORMAT_ERROR; 551 return; 552 } 553 implicitHanOrder = new HanOrder(*status); 554 if(U_FAILURE(*status)) { return; } 555 for(;;) { 556 if(*pointer == ']') { break; } 557 if(*pointer == 0) { 558 // Missing ] after ranges. 559 *status = U_INVALID_FORMAT_ERROR; 560 return; 561 } 562 char *s = pointer; 563 while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; } 564 char c = *s; 565 *s = 0; 566 uint32_t start, end; 567 u_parseCodePointRange(pointer, &start, &end, status); 568 *s = c; 569 if(U_FAILURE(*status)) { 570 fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer); 571 *status = U_INVALID_FORMAT_ERROR; 572 return; 573 } 574 implicitHanOrder->addRange((UChar32)start, (UChar32)end, *status); 575 pointer = skipWhiteSpace(s); 576 } 577 if(hanOrder == HAN_IMPLICIT) { 578 implicitHanOrder->setBuilderHanOrder(builder, *status); 579 } 580 implicitHanOrder->setDone(); 581 } else if(what_to_do == READRADICAL) { 582 if(radicalStrokeOrder == NULL) { 583 if(implicitHanOrder == NULL) { 584 fprintf(stderr, "[radical] section before [Unified_Ideograph] line\n"); 585 *status = U_INVALID_FORMAT_ERROR; 586 return; 587 } 588 radicalStrokeOrder = new HanOrder(*status); 589 if(U_FAILURE(*status)) { return; } 590 } else if(radicalStrokeOrder->isDone()) { 591 fprintf(stderr, "duplicate [radical] sections\n"); 592 *status = U_INVALID_FORMAT_ERROR; 593 return; 594 } 595 if(uprv_strcmp(pointer, "end]") == 0) { 596 if(radicalStrokeOrder->getSet() != implicitHanOrder->getSet()) { 597 fprintf(stderr, "[radical end]: " 598 "some of [Unified_Ideograph] missing from [radical] lines\n"); 599 *status = U_INVALID_FORMAT_ERROR; 600 return; 601 } 602 if(hanOrder == HAN_RADICAL_STROKE) { 603 radicalStrokeOrder->setBuilderHanOrder(builder, *status); 604 } 605 radicalStrokeOrder->setDone(); 606 } else { 607 // Read Han characters and ranges between : and ]. 608 // Ignore the radical data before the :. 609 char *startPointer = uprv_strchr(pointer, ':'); 610 char *limitPointer = uprv_strchr(pointer, ']'); 611 if(startPointer == NULL || limitPointer == NULL || 612 (startPointer + 1) >= limitPointer) { 613 fprintf(stderr, "[radical]: no Han characters listed between : and ]\n"); 614 *status = U_INVALID_FORMAT_ERROR; 615 return; 616 } 617 pointer = startPointer + 1; 618 int32_t length = (int32_t)(limitPointer - pointer); 619 for(int32_t i = 0; i < length;) { 620 UChar32 start; 621 U8_NEXT(pointer, i, length, start); 622 UChar32 end; 623 if(pointer[i] == '-') { 624 ++i; 625 U8_NEXT(pointer, i, length, end); 626 } else { 627 end = start; 628 } 629 if(radicalStrokeOrder->getSet().containsSome(start, end)) { 630 fprintf(stderr, "[radical]: some of U+%04x..U+%04x occur " 631 "multiple times in the radical-stroke order\n", 632 start, end); 633 *status = U_INVALID_FORMAT_ERROR; 634 return; 635 } 636 if(!implicitHanOrder->getSet().contains(start, end)) { 637 fprintf(stderr, "[radical]: some of U+%04x..U+%04x are " 638 "not Unified_Ideograph\n", 639 start, end); 640 *status = U_INVALID_FORMAT_ERROR; 641 return; 642 } 643 radicalStrokeOrder->addRange(start, end, *status); 644 } 645 } 646 } else if (what_to_do == READUCAVERSION) { 647 u_versionFromString(UCAVersion, pointer); 648 if(beVerbose) { 649 char uca[U_MAX_VERSION_STRING_LENGTH]; 650 u_versionToString(UCAVersion, uca); 651 printf("UCA version %s\n", uca); 652 } 653 UVersionInfo UCDVersion; 654 u_getUnicodeVersion(UCDVersion); 655 if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) { 656 char uca[U_MAX_VERSION_STRING_LENGTH]; 657 char ucd[U_MAX_VERSION_STRING_LENGTH]; 658 u_versionToString(UCAVersion, uca); 659 u_versionToString(UCDVersion, ucd); 660 // Warning, not error, to permit bootstrapping during a version upgrade. 661 fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd); 662 } 663 } else if (what_to_do == READLEADBYTETOSCRIPTS) { 664 if (strstr(pointer, "COMPRESS") != NULL) { 665 uint16_t leadByte = (hex2num(*pointer++) * 16); 666 leadByte += hex2num(*pointer++); 667 builder.setCompressibleLeadByte(leadByte); 668 } 669 // We do not need the list of scripts on this line. 670 } 671 return; 672 } 673 } 674 fprintf(stderr, "Warning: unrecognized option: %s\n", buffer); 675} 676 677static UBool 678readAnElement(char *line, 679 CollationBaseDataBuilder &builder, 680 UnicodeString &prefix, UnicodeString &s, 681 int64_t ces[32], int32_t &cesLength, 682 UErrorCode *status) { 683 if(U_FAILURE(*status)) { 684 return false; 685 } 686 int32_t lineLength = (int32_t)uprv_strlen(line); 687 while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) { 688 line[--lineLength] = 0; 689 } 690 691 if(lineLength >= 3 && line[0] == (char)0xef && 692 line[1] == (char)0xbb && line[2] == (char)0xbf) { 693 // U+FEFF UTF-8 signature byte sequence. 694 // Ignore, assuming it is at the start of the file. 695 line += 3; 696 lineLength -= 3; 697 } 698 if(line[0] == 0 || line[0] == '#') { 699 return false; // just a comment, skip whole line 700 } 701 702 // Directives. 703 if(line[0] == '[') { 704 readAnOption(builder, line, status); 705 return false; 706 } 707 708 CharString input; 709 char *startCodePoint = line; 710 char *endCodePoint = strchr(startCodePoint, ';'); 711 if(endCodePoint == NULL) { 712 fprintf(stderr, "error - line with no code point:\n%s\n", line); 713 *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */ 714 return false; 715 } 716 717 char *pipePointer = strchr(line, '|'); 718 if (pipePointer != NULL) { 719 // Read the prefix string which precedes the actual string. 720 input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status); 721 UChar *prefixChars = prefix.getBuffer(32); 722 int32_t prefixSize = 723 u_parseString(input.data(), 724 prefixChars, prefix.getCapacity(), 725 NULL, status); 726 if(U_FAILURE(*status)) { 727 prefix.releaseBuffer(0); 728 fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n", 729 input.data(), line, u_errorName(*status)); 730 *status = U_INVALID_FORMAT_ERROR; 731 return false; 732 } 733 prefix.releaseBuffer(prefixSize); 734 startCodePoint = pipePointer + 1; 735 input.clear(); 736 } 737 738 // Read the string which gets the CE(s) assigned. 739 input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status); 740 UChar *uchars = s.getBuffer(32); 741 int32_t cSize = 742 u_parseString(input.data(), 743 uchars, s.getCapacity(), 744 NULL, status); 745 if(U_FAILURE(*status)) { 746 s.releaseBuffer(0); 747 fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n", 748 input.data(), line, u_errorName(*status)); 749 *status = U_INVALID_FORMAT_ERROR; 750 return false; 751 } 752 s.releaseBuffer(cSize); 753 754 char *pointer = endCodePoint + 1; 755 756 char *commentStart = strchr(pointer, '#'); 757 if(commentStart == NULL) { 758 commentStart = strchr(pointer, 0); 759 } 760 761 cesLength = 0; 762 for(;;) { 763 pointer = skipWhiteSpace(pointer); 764 if(pointer == commentStart) { 765 break; 766 } 767 if(cesLength >= 31) { 768 fprintf(stderr, "Error: Too many CEs on line '%s'\n", line); 769 *status = U_INVALID_FORMAT_ERROR; 770 return false; 771 } 772 ces[cesLength++] = parseCE(builder, pointer, *status); 773 if(U_FAILURE(*status)) { 774 fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n", 775 line, u_errorName(*status)); 776 return false; 777 } 778 } 779 780 if(s.length() == 1 && s[0] == 0xfffe) { 781 // UCA 6.0 gives U+FFFE a special minimum weight using the 782 // byte 02 which is the merge-sort-key separator and illegal for any 783 // other characters. 784 } else { 785 // Rudimentary check for valid bytes in CE weights. 786 // For a more comprehensive check see CollationTest::TestRootElements(), 787 // intltest collate/CollationTest/TestRootElements 788 for (int32_t i = 0; i < cesLength; ++i) { 789 int64_t ce = ces[i]; 790 UBool isCompressible = false; 791 for (int j = 7; j >= 0; --j) { 792 uint8_t b = (uint8_t)(ce >> (j * 8)); 793 if(j <= 1) { b &= 0x3f; } // tertiary bytes use 6 bits 794 if (b == 1) { 795 fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line); 796 return false; 797 } 798 if (j == 7 && b == 2) { 799 fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line); 800 return false; 801 } 802 if (j == 7) { 803 isCompressible = builder.isCompressibleLeadByte(b); 804 } else if (j == 6) { 805 // Primary second bytes 03 and FF are compression terminators. 806 // 02, 03 and FF are usable when the lead byte is not compressible. 807 // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible. 808 if (isCompressible && (b <= 3 || b == 0xff)) { 809 fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n", 810 b, line); 811 return false; 812 } 813 } 814 } 815 } 816 } 817 818 return true; 819} 820 821static void 822parseFractionalUCA(const char *filename, 823 CollationBaseDataBuilder &builder, 824 UErrorCode *status) 825{ 826 if(U_FAILURE(*status)) { return; } 827 FILE *data = fopen(filename, "r"); 828 if(data == NULL) { 829 fprintf(stderr, "Couldn't open file: %s\n", filename); 830 *status = U_FILE_ACCESS_ERROR; 831 return; 832 } 833 int32_t lineNumber = 0; 834 char buffer[30000]; 835 836 const Normalizer2* norm = nullptr; 837 if (icu4xMode) { 838 norm = Normalizer2::getNFDInstance(*status); 839 } 840 841 UChar32 maxCodePoint = 0; 842 while(!feof(data)) { 843 if(U_FAILURE(*status)) { 844 fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", 845 *status, u_errorName(*status), (int)lineNumber, filename); 846 exit(*status); 847 } 848 849 lineNumber++; 850 char *line = fgets(buffer, sizeof(buffer), data); 851 if(line == NULL) { 852 if(feof(data)) { 853 break; 854 } else { 855 fprintf(stderr, "no more input line and also no EOF!\n"); 856 *status = U_INVALID_FORMAT_ERROR; 857 return; 858 } 859 } 860 861 UnicodeString prefix; 862 UnicodeString s; 863 int64_t ces[32]; 864 int32_t cesLength = 0; 865 if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) { 866 // we have read the line, now do something sensible with the read data! 867 uint32_t p = (uint32_t)(ces[0] >> 32); 868 869 if(s.length() > 1 && s[0] == 0xFDD0) { 870 // FractionalUCA.txt contractions starting with U+FDD0 871 // are only entered into the inverse table, 872 // not into the normal collation data. 873 builder.addRootElements(ces, cesLength, *status); 874 if(s.length() == 2 && cesLength == 1) { 875 switch(s[1]) { 876 case 0x34: 877 // Lead byte for numeric sorting. 878 builder.setNumericPrimary(p); 879 break; 880 case 0xFF21: 881 builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p); 882 break; 883 case 0xFF3A: 884 builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p); 885 break; 886 default: 887 break; 888 } 889 } 890 } else { 891 UChar32 c = s.char32At(0); 892 if(c > maxCodePoint) { maxCodePoint = c; } 893 894 // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary. 895 // CollationBaseDataBuilder::init() maps them to special CEs. 896 // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt. 897 if(0xfffd <= c && c <= 0xffff) { continue; } 898 if (icu4xMode) { 899 if (c >= 0xAC00 && c <= 0xD7A3) { 900 // Hangul syllable 901 continue; 902 } 903 if (c >= 0xD800 && c < 0xE000) { 904 // Surrogate 905 continue; 906 } 907 UnicodeString src; 908 UnicodeString dst; 909 src.append(c); 910 norm->normalize(src, dst, *status); 911 if (src != dst) { 912 // c decomposed, skip it 913 continue; 914 } 915 } 916 if(s.length() >= 2 && c == 0xFDD1) { 917 UChar32 c2 = s.char32At(1); 918 int32_t script = getCharScript(c2); 919 if(script < 0) { 920 fprintf(stderr, 921 "Error: Unknown script for first-primary sample character " 922 "U+%04X on line %u of %s:\n" 923 "%s\n" 924 " (add the character to genuca.cpp sampleCharsToScripts[])\n", 925 c2, (int)lineNumber, filename, line); 926 exit(U_INVALID_FORMAT_ERROR); 927 } 928 if(script == USCRIPT_UNKNOWN) { 929 // FDD1 FDD0, first unassigned-implicit primary 930 builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY); 931 continue; 932 } 933 builder.addScriptStart(script, p); 934 if(script == USCRIPT_HIRAGANA) { 935 builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p); 936 } else if(script == USCRIPT_HAN) { 937 builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p); 938 builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p); 939 } 940 } 941 942 if(0xe0000000 <= p && p < 0xf0000000) { 943 fprintf(stderr, 944 "Error: Unexpected mapping to an implicit or trailing primary" 945 " on line %u of %s:\n" 946 "%s\n", 947 (int)lineNumber, filename, line); 948 exit(U_INVALID_FORMAT_ERROR); 949 } 950 builder.add(prefix, s, ces, cesLength, *status); 951 } 952 } 953 } 954 955 int32_t numRanges = 0; 956 int32_t numRangeCodePoints = 0; 957 UChar32 rangeFirst = U_SENTINEL; 958 UChar32 rangeLast = U_SENTINEL; 959 uint32_t rangeFirstPrimary = 0; 960 uint32_t rangeLastPrimary = 0; 961 int32_t rangeStep = -1; 962 963 // Detect ranges of characters in primary code point order, 964 // with 3-byte primaries and 965 // with consistent "step" differences between adjacent primaries. 966 // This relies on the FractionalUCA generator using the same primary-weight incrementation. 967 // Start at U+0180: No ranges for common Latin characters. 968 // Go one beyond maxCodePoint in case a range ends there. 969 for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) { 970 UBool action; 971 uint32_t p = builder.getLongPrimaryIfSingleCE(c); 972 if(p != 0) { 973 // p is a "long" (three-byte) primary. 974 if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) { 975 // Find the offset between the two primaries. 976 int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries( 977 rangeLastPrimary, p, builder.isCompressiblePrimary(p)); 978 if(rangeFirst == rangeLast && step >= 2) { 979 // c == rangeFirst + 1, store the "step" between range primaries. 980 rangeStep = step; 981 rangeLast = c; 982 rangeLastPrimary = p; 983 action = 0; // continue range 984 } else if(rangeStep == step) { 985 // Continue the range with the same "step" difference. 986 rangeLast = c; 987 rangeLastPrimary = p; 988 action = 0; // continue range 989 } else { 990 action = 1; // maybe finish range, start a new one 991 } 992 } else { 993 action = 1; // maybe finish range, start a new one 994 } 995 } else { 996 action = -1; // maybe finish range, do not start a new one 997 } 998 if(action != 0 && rangeFirst >= 0) { 999 // Finish a range. 1000 // Set offset CE32s for a long range, leave single CEs for a short range. 1001 UBool didSetRange = builder.maybeSetPrimaryRange( 1002 rangeFirst, rangeLast, 1003 rangeFirstPrimary, rangeStep, *status); 1004 if(U_FAILURE(*status)) { 1005 fprintf(stderr, 1006 "failure setting code point order range U+%04lx..U+%04lx " 1007 "%08lx..%08lx step %d - %s\n", 1008 (long)rangeFirst, (long)rangeLast, 1009 (long)rangeFirstPrimary, (long)rangeLastPrimary, 1010 (int)rangeStep, u_errorName(*status)); 1011 } else if(didSetRange) { 1012 int32_t rangeLength = rangeLast - rangeFirst + 1; 1013 if(beVerbose) { 1014 printf("* set code point order range U+%04lx..U+%04lx [%d] " 1015 "%08lx..%08lx step %d\n", 1016 (long)rangeFirst, (long)rangeLast, 1017 (int)rangeLength, 1018 (long)rangeFirstPrimary, (long)rangeLastPrimary, 1019 (int)rangeStep); 1020 } 1021 ++numRanges; 1022 numRangeCodePoints += rangeLength; 1023 } 1024 rangeFirst = U_SENTINEL; 1025 rangeStep = -1; 1026 } 1027 if(action > 0) { 1028 // Start a new range. 1029 rangeFirst = rangeLast = c; 1030 rangeFirstPrimary = rangeLastPrimary = p; 1031 } 1032 } 1033 printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints); 1034 1035 // Idea: Probably best to work in two passes. 1036 // Pass 1 for reading all data, setting isCompressible flags (and reordering groups) 1037 // and finding ranges. 1038 // Then set the ranges in a newly initialized builder 1039 // for optimal compression (makes sure that adjacent blocks can overlap easily). 1040 // Then set all mappings outside the ranges. 1041 // 1042 // In the first pass, we could store mappings in a simple list, 1043 // with single-character/single-long-primary-CE mappings in a UTrie2; 1044 // or store the mappings in a temporary builder; 1045 // or we could just parse the input file again in the second pass. 1046 // 1047 // Ideally set/copy U+0000..U+017F before setting anything else, 1048 // then set default Han/Hangul, then set the ranges, then copy non-range mappings. 1049 // It should be easy to copy mappings from an un-built builder to a new one. 1050 // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions. 1051 1052 if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) { 1053 fprintf(stderr, "UCA version not specified. Cannot create data file!\n"); 1054 fclose(data); 1055 return; 1056 } 1057 1058 if (beVerbose) { 1059 printf("\nLines read: %u\n", (int)lineNumber); 1060 } 1061 1062 fclose(data); 1063 1064 return; 1065} 1066 1067static void 1068buildAndWriteBaseData(CollationBaseDataBuilder &builder, 1069 const char *path, UErrorCode &errorCode) { 1070 if(U_FAILURE(errorCode)) { return; } 1071 1072 if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) { 1073 fprintf(stderr, "error: unexpected [fixed secondary common byte]"); 1074 errorCode = U_INVALID_FORMAT_ERROR; 1075 return; 1076 } 1077 if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) { 1078 fprintf(stderr, "error: unexpected [fixed tertiary common byte]"); 1079 errorCode = U_INVALID_FORMAT_ERROR; 1080 return; 1081 } 1082 1083 CollationData data(*Normalizer2Factory::getNFCImpl(errorCode)); 1084 builder.enableFastLatin(); 1085 builder.build(data, errorCode); 1086 if(U_FAILURE(errorCode)) { 1087 fprintf(stderr, "builder.build() failed: %s\n", 1088 u_errorName(errorCode)); 1089 return; 1090 } 1091 1092 // The CollationSettings constructor gives us the properly encoded 1093 // default options, so that we need not duplicate them here. 1094 CollationSettings settings; 1095 1096 UVector32 rootElements(errorCode); 1097 for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) { 1098 rootElements.addElement(0, errorCode); 1099 } 1100 builder.buildRootElementsTable(rootElements, errorCode); 1101 if(U_FAILURE(errorCode)) { 1102 fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n", 1103 u_errorName(errorCode)); 1104 return; 1105 } 1106 int32_t index = CollationRootElements::IX_COUNT; 1107 rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX); 1108 1109 while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; } 1110 rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX); 1111 1112 while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { 1113 ++index; 1114 } 1115 rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX); 1116 1117 rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE, 1118 CollationRootElements::IX_COMMON_SEC_AND_TER_CE); 1119 1120 int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24; 1121 secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16; 1122 secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte"); 1123 rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES); 1124 1125 LocalMemory<uint8_t> buffer; 1126 int32_t capacity = 1000000; 1127 uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); 1128 if(dest == NULL) { 1129 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", 1130 (long)capacity); 1131 errorCode = U_MEMORY_ALLOCATION_ERROR; 1132 return; 1133 } 1134 int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; 1135 int32_t totalSize = CollationDataWriter::writeBase( 1136 data, settings, 1137 rootElements.getBuffer(), rootElements.size(), 1138 indexes, dest, capacity, 1139 errorCode); 1140 if(U_FAILURE(errorCode)) { 1141 fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n", 1142 (long)capacity, u_errorName(errorCode)); 1143 return; 1144 } 1145 printf("*** CLDR root collation part sizes ***\n"); 1146 CollationInfo::printSizes(totalSize, indexes); 1147 printf("*** CLDR root collation size: %6ld (with file header but no copyright string)\n", 1148 (long)totalSize + 32); // 32 bytes = DataHeader rounded up to 16-byte boundary 1149 1150 CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion); 1151 const char *dataName = 1152 hanOrder == HAN_IMPLICIT ? 1153 (icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") : 1154 (icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan"); 1155 UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo, 1156 withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 1157 if(U_FAILURE(errorCode)) { 1158 fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n", 1159 path, u_errorName(errorCode)); 1160 return; 1161 } 1162 1163 udata_writeBlock(pData, dest, totalSize); 1164 long dataLength = udata_finish(pData, &errorCode); 1165 if(U_FAILURE(errorCode)) { 1166 fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode)); 1167 return; 1168 } 1169 1170 if(dataLength != (long)totalSize) { 1171 fprintf(stderr, 1172 "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n", 1173 dataLength, (long)totalSize); 1174 errorCode=U_INTERNAL_PROGRAM_ERROR; 1175 } 1176} 1177 1178/** 1179 * Adds each lead surrogate to the bmp set if any of the 1024 1180 * associated supplementary code points is in the supp set. 1181 * These can be one and the same set. 1182 */ 1183static void 1184setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) { 1185 UChar32 c = 0x10000; 1186 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 1187 if(supp.containsSome(c, c + 0x3ff)) { 1188 bmp.add(lead); 1189 } 1190 } 1191} 1192 1193static int32_t 1194makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256], 1195 UErrorCode &errorCode) { 1196 if(U_FAILURE(errorCode)) { return 0; } 1197 bits[0] = 0; // no bits set 1198 bits[1] = 0xffffffff; // all bits set 1199 int32_t bitsLength = 2; 1200 int32_t i = 0; 1201 for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) { 1202 if(set.containsNone(c, c + 0x1f)) { 1203 index[i] = 0; 1204 } else if(set.contains(c, c + 0x1f)) { 1205 index[i] = 1; 1206 } else { 1207 uint32_t b = 0; 1208 for(int32_t j = 0; j <= 0x1f; ++j) { 1209 if(set.contains(c + j)) { 1210 b |= (uint32_t)1 << j; 1211 } 1212 } 1213 int32_t k; 1214 for(k = 2;; ++k) { 1215 if(k == bitsLength) { 1216 // new bit combination 1217 if(bitsLength == 256) { 1218 errorCode = U_BUFFER_OVERFLOW_ERROR; 1219 return 0; 1220 } 1221 bits[bitsLength++] = b; 1222 break; 1223 } 1224 if(bits[k] == b) { 1225 // duplicate bit combination 1226 break; 1227 } 1228 } 1229 index[i] = k; 1230 } 1231 } 1232 return bitsLength; 1233} 1234 1235// TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values, 1236// use that rather than properties APIs. 1237// Then consider moving related logic for the unsafeBwdSet back from the loader into this builder. 1238 1239/** 1240 * Builds data for the FCD check fast path. 1241 * For details see the CollationFCD class comments. 1242 */ 1243static void 1244buildAndWriteFCDData(const char *path, UErrorCode &errorCode) { 1245 UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode); 1246 UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode); 1247 if(U_FAILURE(errorCode)) { return; } 1248 setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet); 1249 // The following supp(lccc)->lead(tccc) should be unnecessary 1250 // after the previous supp(tccc)->lead(tccc) 1251 // because there should not be any characters with lccc!=0 and tccc=0. 1252 // It is safe and harmless. 1253 setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet); 1254 setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet); 1255 uint8_t lcccIndex[0x800], tcccIndex[0x800]; 1256 uint32_t lcccBits[256], tcccBits[256]; 1257 int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode); 1258 int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode); 1259 printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4); 1260 printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4); 1261 1262 if(U_FAILURE(errorCode)) { return; } 1263 1264 FILE *f=usrc_create(path, "collationfcd.cpp", 2016, 1265 "icu/tools/unicode/c/genuca/genuca.cpp"); 1266 if(f==NULL) { 1267 errorCode=U_FILE_ACCESS_ERROR; 1268 return; 1269 } 1270 fputs("#include \"unicode/utypes.h\"\n\n", f); 1271 fputs("#if !UCONFIG_NO_COLLATION\n\n", f); 1272 fputs("#include \"collationfcd.h\"\n\n", f); 1273 fputs("U_NAMESPACE_BEGIN\n\n", f); 1274 usrc_writeArray(f, 1275 "const uint8_t CollationFCD::lcccIndex[%ld]={\n", 1276 lcccIndex, 8, 0x800, 1277 "", "\n};\n\n"); 1278 usrc_writeArray(f, 1279 "const uint32_t CollationFCD::lcccBits[%ld]={\n", 1280 lcccBits, 32, lcccBitsLength, 1281 "", "\n};\n\n"); 1282 usrc_writeArray(f, 1283 "const uint8_t CollationFCD::tcccIndex[%ld]={\n", 1284 tcccIndex, 8, 0x800, 1285 "", "\n};\n\n"); 1286 usrc_writeArray(f, 1287 "const uint32_t CollationFCD::tcccBits[%ld]={\n", 1288 tcccBits, 32, tcccBitsLength, 1289 "", "\n};\n\n"); 1290 fputs("U_NAMESPACE_END\n\n", f); 1291 fputs("#endif // !UCONFIG_NO_COLLATION\n", f); 1292 fclose(f); 1293} 1294 1295static void 1296parseAndWriteCollationRootData( 1297 const char *fracUCAPath, 1298 const char *binaryDataPath, 1299 const char *sourceCodePath, 1300 UErrorCode &errorCode) { 1301 if(U_FAILURE(errorCode)) { return; } 1302 CollationBaseDataBuilder builder(icu4xMode, errorCode); 1303 builder.init(errorCode); 1304 parseFractionalUCA(fracUCAPath, builder, &errorCode); 1305 buildAndWriteBaseData(builder, binaryDataPath, errorCode); 1306 buildAndWriteFCDData(sourceCodePath, errorCode); 1307} 1308 1309// ------------------------------------------------------------------------- *** 1310 1311enum { 1312 HELP_H, 1313 HELP_QUESTION_MARK, 1314 VERBOSE, 1315 COPYRIGHT, 1316 HAN_ORDER, 1317 ICU4X 1318}; 1319 1320static UOption options[]={ 1321 UOPTION_HELP_H, 1322 UOPTION_HELP_QUESTION_MARK, 1323 UOPTION_VERBOSE, 1324 UOPTION_COPYRIGHT, 1325 UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG), 1326 UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG) 1327}; 1328 1329extern "C" int 1330main(int argc, char* argv[]) { 1331 U_MAIN_INIT_ARGS(argc, argv); 1332 1333 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); 1334 1335 /* error handling, printing usage message */ 1336 if(argc<0) { 1337 fprintf(stderr, 1338 "error in command line argument \"%s\"\n", 1339 argv[-argc]); 1340 } 1341 if(options[HAN_ORDER].doesOccur) { 1342 const char *order = options[HAN_ORDER].value; 1343 if(uprv_strcmp(order, "implicit") == 0) { 1344 hanOrder = HAN_IMPLICIT; 1345 } else if(uprv_strcmp(order, "radical-stroke") == 0) { 1346 hanOrder = HAN_RADICAL_STROKE; 1347 } 1348 } 1349 if(hanOrder == HAN_NO_ORDER) { 1350 argc = -1; 1351 } 1352 if( argc<2 || 1353 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur 1354 ) { 1355 /* 1356 * Broken into chunks because the C89 standard says the minimum 1357 * required supported string length is 509 bytes. 1358 */ 1359 fprintf(stderr, 1360 "Usage: %s [-options] --hanOrder (implicit|radical-stroke) path/to/ICU/src/root\n" 1361 "\n" 1362 "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n" 1363 "writes source and binary data files with the collation root data.\n" 1364 "\n", 1365 argv[0]); 1366 fprintf(stderr, 1367 "Options:\n" 1368 "\t-h or -? or --help this usage text\n" 1369 "\t-v or --verbose verbose output\n" 1370 "\t-c or --copyright include a copyright notice\n" 1371 "\t --hanOrder implicit or radical-stroke\n"); 1372 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 1373 } 1374 1375 beVerbose=options[VERBOSE].doesOccur; 1376 withCopyright=options[COPYRIGHT].doesOccur; 1377 icu4xMode=options[ICU4X].doesOccur; 1378 1379 IcuToolErrorCode errorCode("genuca"); 1380 1381 CharString icuSrcRoot(argv[1], errorCode); 1382 1383 CharString icuSource(icuSrcRoot, errorCode); 1384 icuSource.appendPathPart("source", errorCode); 1385 1386 CharString icuSourceData(icuSource, errorCode); 1387 icuSourceData.appendPathPart("data", errorCode); 1388 1389 CharString fracUCAPath(icuSourceData, errorCode); 1390 fracUCAPath.appendPathPart("unidata", errorCode); 1391 fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode); 1392 1393 CharString sourceDataInColl(icuSourceData, errorCode); 1394 sourceDataInColl.appendPathPart("in", errorCode); 1395 sourceDataInColl.appendPathPart("coll", errorCode); 1396 1397 CharString sourceI18n(icuSource, errorCode); 1398 sourceI18n.appendPathPart("i18n", errorCode); 1399 1400 errorCode.assertSuccess(); 1401 1402 parseAndWriteCollationRootData( 1403 fracUCAPath.data(), 1404 sourceDataInColl.data(), 1405 sourceI18n.data(), 1406 errorCode); 1407 1408 return errorCode; 1409} 1410 1411#endif // UCONFIG_NO_COLLATION 1412