1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2000-2016, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  genuca.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created at the end of XX century
16*   created by: Vladimir Weinstein,
17*   modified in 2013-2014 by Markus Scherer
18*
19*   This program reads the Fractional UCA table and generates
20*   internal format for UCA table as well as inverse UCA table.
21*   It then writes the ucadata.icu binary file containing the data.
22*/
23
24#define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
25
26#include <stdio.h>
27#include <stdint.h>
28#include "unicode/utypes.h"
29#include "unicode/errorcode.h"
30#include "unicode/localpointer.h"
31#include "unicode/ucol.h"
32#include "unicode/uscript.h"
33#include "unicode/utf8.h"
34#include "charstr.h"
35#include "cmemory.h"
36#include "collation.h"
37#include "collationbasedatabuilder.h"
38#include "collationdata.h"
39#include "collationdatabuilder.h"
40#include "collationdatareader.h"
41#include "collationdatawriter.h"
42#include "collationinfo.h"
43#include "collationrootelements.h"
44#include "collationruleparser.h"
45#include "collationtailoring.h"
46#include "cstring.h"
47#include "normalizer2impl.h"
48#include "toolutil.h"
49#include "unewdata.h"
50#include "uoptions.h"
51#include "uparse.h"
52#include "writesrc.h"
53
54#if UCONFIG_NO_COLLATION
55
// Stub entry point used when collation is compiled out (UCONFIG_NO_COLLATION):
// there is nothing to generate, so the arguments are ignored and the tool
// always exits with status 1.
extern "C" int
main(int /* argc */, char * /* argv */ []) {
    return 1;
}
62
63#else
64
65U_NAMESPACE_USE
66
// Selects which Han character order readAnOption() hands to the builder:
// HAN_IMPLICIT is checked when the [Unified_Ideograph] line is read,
// HAN_RADICAL_STROKE when the [radical end] line is read.
enum HanOrderValue {
    HAN_NO_ORDER       = -1,
    HAN_IMPLICIT       = 0,
    HAN_RADICAL_STROKE = 1
};
72
73static UBool beVerbose=false, withCopyright=true, icu4xMode=false;
74
75static HanOrderValue hanOrder = HAN_NO_ORDER;
76
77static UVersionInfo UCAVersion={ 0, 0, 0, 0 };
78
79static UDataInfo ucaDataInfo={
80    sizeof(UDataInfo),
81    0,
82
83    U_IS_BIG_ENDIAN,
84    U_CHARSET_FAMILY,
85    U_SIZEOF_UCHAR,
86    0,
87
88    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
89    { 5, 0, 0, 0 },                     // formatVersion
90    { 6, 3, 0, 0 }                      // dataVersion
91};
92
// Returns a pointer to the first character at or after s that is neither
// a space nor a tab. The terminating NUL stops the scan.
static char *skipWhiteSpace(char *s) {
    for(;;) {
        const char c = *s;
        if(c != ' ' && c != '\t') {
            return s;
        }
        ++s;
    }
}
97
// Converts one ASCII hex digit (0-9, a-f, A-F) to its value 0..15.
// Returns -1 for any other character, including NUL.
static int32_t hex2num(char hex) {
    if('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if('a' <= hex && hex <= 'f') {
        return hex - 'a' + 10;
    }
    if('A' <= hex && hex <= 'F') {
        return hex - 'A' + 10;
    }
    return -1;
}
109
110static uint32_t parseWeight(char *&s, const char *separators,
111                            int32_t maxBytes, UErrorCode &errorCode) {
112    if(U_FAILURE(errorCode)) { return 0; }
113    uint32_t weight = 0;
114    int32_t numBytes = 0;
115    for(;;) {
116        // Check one character after another, so that we don't just run over a 00.
117        int32_t nibble1, nibble2;
118        if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) {
119            // Stop when we find something other than a pair of hex digits.
120            break;
121        }
122        if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) {
123            // Too many bytes, or a 00 or 01 byte which is illegal inside a weight.
124            errorCode = U_INVALID_FORMAT_ERROR;
125            return 0;
126        }
127        weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2;
128        ++numBytes;
129        s += 2;
130        if(*s != ' ') {
131            break;
132        }
133        ++s;
134    }
135    char c = *s;
136    if(c == 0 || strchr(separators, c) == NULL) {
137        errorCode = U_INVALID_FORMAT_ERROR;
138        return 0;
139    }
140    // numBytes==0 is ok, for example in [,,] or [, 82, 05]
141    // Left-align the weight.
142    while(numBytes < 4) {
143        weight <<= 8;
144        ++numBytes;
145    }
146    return weight;
147}
148
149/**
150 * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10].
151 * Stop with an error, or else with the pointer s after the closing bracket.
152 */
153static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) {
154    if(U_FAILURE(errorCode)) { return 0; }
155    ++s;  // skip over the '['
156    if(s[0] == 'U' && s[1] == '+') {
157        // Read a code point and look up its CE.
158        // We use this especially for implicit primary weights,
159        // so that we can use different algorithms in the FractionalUCA.txt
160        // generator and the parser.
161        // The generator may not even need to compute any implicit primaries at all.
162        s += 2;
163        char *end;
164        unsigned long longCp = uprv_strtoul(s, &end, 16);
165        if(end == s || longCp > 0x10ffff) {
166            errorCode = U_INVALID_FORMAT_ERROR;
167            return 0;
168        }
169        UChar32 c = (UChar32)longCp;
170        int64_t ce = builder.getSingleCE(c, errorCode);
171        if(U_FAILURE(errorCode)) { return 0; }
172        s = end;
173        if(*s == ']') {  // [U+4E00]
174            ++s;
175            return ce;
176        }
177        if(*s != ',') {
178            errorCode = U_INVALID_FORMAT_ERROR;
179            return 0;
180        }
181        // Parse the following, secondary or tertiary weight.
182        s = skipWhiteSpace(s + 1);
183        uint32_t w = parseWeight(s, ",]", 2, errorCode);
184        if(U_FAILURE(errorCode)) { return 0; }
185        if(*s == ']') {  // [U+4E00, 10]
186            ++s;
187            // Set the tertiary weight to w.
188            return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16);
189        }
190        // Set the secondary weight to w: [U+9F9C, 70, 20]
191        ce = (ce & INT64_C(0xffffffff00000000)) | w;
192        // Parse and set the tertiary weight.
193        s = skipWhiteSpace(s + 1);
194        w = parseWeight(s, "]", 2, errorCode);
195        ++s;
196        return ce | (w >> 16);
197    } else {
198        uint32_t p = parseWeight(s, ",", 4, errorCode);
199        if(U_FAILURE(errorCode)) { return 0; }
200        int64_t ce = (int64_t)p << 32;
201        s = skipWhiteSpace(s + 1);
202        uint32_t w = parseWeight(s, ",", 2, errorCode);
203        if(U_FAILURE(errorCode)) { return 0; }
204        ce |= w;
205        s = skipWhiteSpace(s + 1);
206        w = parseWeight(s, "]", 2, errorCode);
207        ++s;
208        return ce | (w >> 16);
209    }
210}
211
212namespace {
213
214// Cached, lazy-init mapping from scripts to sample characters.
215UChar32 sampleChars[USCRIPT_CODE_LIMIT] = { U_SENTINEL };
216
217}
218
219// Hardcoded mapping from script sample characters to script codes.
220// Pro: Available without complete and updated UCD scripts data,
221//      easy to add non-script codes specific to collation.
222// Con: Needs manual update for each new script or change in sample character.
223static const struct {
224    UChar32 sampleChar;
225    int32_t script;
226} sampleCharsToScripts[] = {
227    { 0x00A0, UCOL_REORDER_CODE_SPACE },
228    { 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
229    { 0x263A, UCOL_REORDER_CODE_SYMBOL },
230    { 0x20AC, UCOL_REORDER_CODE_CURRENCY },
231    { 0x0034, UCOL_REORDER_CODE_DIGIT },
232    { 0x004C, USCRIPT_LATIN },
233    { 0x03A9, USCRIPT_GREEK },
234    { 0x03E2, USCRIPT_COPTIC },
235    { 0x042F, USCRIPT_CYRILLIC },
236    { 0x2C00, USCRIPT_GLAGOLITIC },
237    { 0x1036B, USCRIPT_OLD_PERMIC },
238    { 0x10D3, USCRIPT_GEORGIAN },
239    { 0x0531, USCRIPT_ARMENIAN },
240    { 0x05D0, USCRIPT_HEBREW },
241    { 0x10900, USCRIPT_PHOENICIAN },
242    { 0x0800, USCRIPT_SAMARITAN },
243    { 0x0628, USCRIPT_ARABIC },
244    { 0x0710, USCRIPT_SYRIAC },
245    { 0x0840, USCRIPT_MANDAIC },
246    { 0x078C, USCRIPT_THAANA },
247    { 0x07CA, USCRIPT_NKO },
248    { 0x07D8, USCRIPT_NKO },
249    { 0x2D30, USCRIPT_TIFINAGH },
250    { 0x2D5E, USCRIPT_TIFINAGH },
251    { 0x12A0, USCRIPT_ETHIOPIC },
252    { 0x0905, USCRIPT_DEVANAGARI },
253    { 0x0995, USCRIPT_BENGALI },
254    { 0x0A15, USCRIPT_GURMUKHI },
255    { 0x0A95, USCRIPT_GUJARATI },
256    { 0x0B15, USCRIPT_ORIYA },
257    { 0x0B95, USCRIPT_TAMIL },
258    { 0x0C15, USCRIPT_TELUGU },
259    { 0x0C95, USCRIPT_KANNADA },
260    { 0x0D15, USCRIPT_MALAYALAM },
261    { 0x0D85, USCRIPT_SINHALA },
262    { 0xABC0, USCRIPT_MEITEI_MAYEK },
263    { 0xA800, USCRIPT_SYLOTI_NAGRI },
264    { 0xA882, USCRIPT_SAURASHTRA },
265    { 0x11083, USCRIPT_KAITHI },
266    { 0x11152, USCRIPT_MAHAJANI },
267    { 0x11183, USCRIPT_SHARADA },
268    { 0x11208, USCRIPT_KHOJKI },
269    { 0x112BE, USCRIPT_KHUDAWADI },
270    { 0x1128F, USCRIPT_MULTANI },
271    { 0x11315, USCRIPT_GRANTHA },
272    { 0x11412, USCRIPT_NEWA },
273    { 0x11484, USCRIPT_TIRHUTA },
274    { 0x1158E, USCRIPT_SIDDHAM },
275    { 0x1160E, USCRIPT_MODI },
276    { 0x11680, USCRIPT_TAKRI },
277    { 0x1180B, USCRIPT_DOGRA },
278    { 0x11717, USCRIPT_AHOM },
279    { 0x11D71, USCRIPT_GUNJALA_GONDI },
280    { 0x1B83, USCRIPT_SUNDANESE },
281    { 0x11005, USCRIPT_BRAHMI },
282    { 0x10A00, USCRIPT_KHAROSHTHI },
283    { 0x11C0E, USCRIPT_BHAIKSUKI },
284    { 0x0E17, USCRIPT_THAI },
285    { 0x0EA5, USCRIPT_LAO },
286    { 0xAA80, USCRIPT_TAI_VIET },
287    { 0x0F40, USCRIPT_TIBETAN },
288    { 0x11C72, USCRIPT_MARCHEN },
289    { 0x1C00, USCRIPT_LEPCHA },
290    { 0xA840, USCRIPT_PHAGS_PA },
291    { 0x1900, USCRIPT_LIMBU },
292    { 0x1703, USCRIPT_TAGALOG },
293    { 0x1723, USCRIPT_HANUNOO },
294    { 0x1743, USCRIPT_BUHID },
295    { 0x1763, USCRIPT_TAGBANWA },
296    { 0x1A00, USCRIPT_BUGINESE },
297    { 0x11EE5, USCRIPT_MAKASAR },
298    { 0x1BC0, USCRIPT_BATAK },
299    { 0xA930, USCRIPT_REJANG },
300    { 0xA90A, USCRIPT_KAYAH_LI },
301    { 0x1000, USCRIPT_MYANMAR },
302    { 0x10D12, USCRIPT_HANIFI_ROHINGYA },
303    { 0x11103, USCRIPT_CHAKMA },
304    { 0x1780, USCRIPT_KHMER },
305    { 0x1950, USCRIPT_TAI_LE },
306    { 0x1980, USCRIPT_NEW_TAI_LUE },
307    { 0x1A20, USCRIPT_LANNA },
308    { 0xAA00, USCRIPT_CHAM },
309    { 0x1B05, USCRIPT_BALINESE },
310    { 0xA984, USCRIPT_JAVANESE },
311    { 0x1826, USCRIPT_MONGOLIAN },
312    { 0x1C5A, USCRIPT_OL_CHIKI },
313    { 0x13C4, USCRIPT_CHEROKEE },
314    { 0x104B5, USCRIPT_OSAGE },
315    { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
316    { 0x168F, USCRIPT_OGHAM },
317    { 0x16A0, USCRIPT_RUNIC },
318    { 0x10CA1, USCRIPT_OLD_HUNGARIAN },
319    { 0x10C00, USCRIPT_ORKHON },
320    { 0xA549, USCRIPT_VAI },
321    { 0xA6A0, USCRIPT_BAMUM },
322    { 0x16AE6, USCRIPT_BASSA_VAH },
323    { 0x1E802, USCRIPT_MENDE },
324    { 0x16E40, USCRIPT_MEDEFAIDRIN },
325    { 0x1E909, USCRIPT_ADLAM, },
326    { 0xAC00, USCRIPT_HANGUL },
327    { 0x304B, USCRIPT_HIRAGANA },
328    { 0x30AB, USCRIPT_KATAKANA },
329    { 0x3105, USCRIPT_BOPOMOFO },
330    { 0xA288, USCRIPT_YI },
331    { 0xA4D0, USCRIPT_LISU },
332    { 0xA4E8, USCRIPT_LISU },
333    { 0x16F00, USCRIPT_MIAO },
334    { 0x118B4, USCRIPT_WARANG_CITI },
335    { 0x11AC0, USCRIPT_PAU_CIN_HAU },
336    { 0x16B1C, USCRIPT_PAHAWH_HMONG },
337    { 0x10280, USCRIPT_LYCIAN },
338    { 0x102A0, USCRIPT_CARIAN },
339    { 0x102B7, USCRIPT_CARIAN },
340    { 0x10920, USCRIPT_LYDIAN },
341    { 0x10300, USCRIPT_OLD_ITALIC },
342    { 0x10308, USCRIPT_OLD_ITALIC },
343    { 0x10330, USCRIPT_GOTHIC },
344    { 0x10414, USCRIPT_DESERET },
345    { 0x10450, USCRIPT_SHAVIAN },
346    { 0x1BC20, USCRIPT_DUPLOYAN },
347    { 0x10480, USCRIPT_OSMANYA },
348    { 0x10500, USCRIPT_ELBASAN },
349    { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
350    { 0x110D0, USCRIPT_SORA_SOMPENG },
351    { 0x16A4F, USCRIPT_MRO },
352    { 0x10000, USCRIPT_LINEAR_B },
353    { 0x10647, USCRIPT_LINEAR_A },
354    { 0x10800, USCRIPT_CYPRIOT },
355    { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
356    { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
357    { 0x10B00, USCRIPT_AVESTAN },
358    { 0x10873, USCRIPT_PALMYRENE },
359    { 0x10896, USCRIPT_NABATAEAN },
360    { 0x108F4, USCRIPT_HATRAN },
361    { 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
362    { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
363    { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
364    { 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
365    { 0x10AC1, USCRIPT_MANICHAEAN },
366    { 0x10AD8, USCRIPT_MANICHAEAN },
367    { 0x10F19, USCRIPT_OLD_SOGDIAN },
368    { 0x10F42, USCRIPT_SOGDIAN },
369    { 0x10380, USCRIPT_UGARITIC },
370    { 0x103A0, USCRIPT_OLD_PERSIAN },
371    { 0x12000, USCRIPT_CUNEIFORM },
372    { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
373    { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
374    { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
375    { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS },
376    { 0x18229, USCRIPT_TANGUT },
377    { 0x5B57, USCRIPT_HAN },
378    { 0x11D10, USCRIPT_MASARAM_GONDI },
379    { 0x11A0B, USCRIPT_ZANABAZAR_SQUARE },
380    { 0x11A5C, USCRIPT_SOYOMBO },
381    { 0x1B1C4, USCRIPT_NUSHU },
382    { 0xFDD0, USCRIPT_UNKNOWN }  // unassigned-implicit primary weights
383};
384
385static int32_t getCharScript(UChar32 c) {
386    if (sampleChars[0] < 0) {
387        // Lazy-init the script->sample cache.
388        for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
389            UnicodeString sample = uscript_getSampleUnicodeString((UScriptCode)script);
390            if (sample.isEmpty() || sample.hasMoreChar32Than(0, INT32_MAX, 1)) {
391                sampleChars[script] = U_SENTINEL;
392            } else {
393                sampleChars[script] = sample.char32At(0);
394            }
395        }
396    }
397    for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
398        if (c == sampleChars[script]) {
399            return script;
400        }
401    }
402    for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
403        if(c == sampleCharsToScripts[i].sampleChar) {
404            return sampleCharsToScripts[i].script;
405        }
406    }
407    return USCRIPT_INVALID_CODE;  // -1
408}
409
410/**
411 * Maps Unified_Ideograph's to primary CEs in the given order of ranges.
412 */
413class HanOrder {
414public:
415    HanOrder(UErrorCode &errorCode) : ranges(errorCode), set(), done(false) {}
416
417    void addRange(UChar32 start, UChar32 end, UErrorCode &errorCode) {
418        int32_t length = ranges.size();
419        if(length > 0 && (ranges.elementAti(length - 1) + 1) == start) {
420            // The previous range end is just before this range start: Merge adjacent ranges.
421            ranges.setElementAt(end, length - 1);
422        } else {
423            ranges.addElement(start, errorCode);
424            ranges.addElement(end, errorCode);
425        }
426        set.add(start, end);
427    }
428
429    void setBuilderHanOrder(CollationBaseDataBuilder &builder, UErrorCode &errorCode) {
430        if(U_FAILURE(errorCode)) { return; }
431        builder.initHanRanges(ranges.getBuffer(), ranges.size(), errorCode);
432        done = true;
433    }
434
435    void setDone() {
436        done = true;
437    }
438
439    UBool isDone() { return done; }
440
441    const UnicodeSet &getSet() { return set; }
442
443private:
444    UVector32 ranges;
445    UnicodeSet set;
446    UBool done;
447};
448
449static HanOrder *implicitHanOrder = NULL;
450static HanOrder *radicalStrokeOrder = NULL;
451
// What readAnOption() does with the text that follows a directive name.
enum ActionType {
    READCE = 0,                 // parse a CE with parseCE()
    READPRIMARY = 1,            // parse a weight of up to 4 bytes (no vt[] entry uses this)
    READBYTE = 2,               // parse a single weight byte
    READUNIFIEDIDEOGRAPH = 3,   // parse the list of Unified_Ideograph ranges
    READRADICAL = 4,            // parse one line of the radical-stroke order
    READUCAVERSION = 5,         // parse the UCA version number
    READLEADBYTETOSCRIPTS = 6,  // note whether a primary lead byte is compressible
    IGNORE = 7                  // recognized, but nothing to do
};
462
463static struct {
464    const char *const name;
465    int64_t value;
466    const ActionType what_to_do;
467} vt[]  = {
468    {"[first tertiary ignorable",     0, IGNORE},
469    {"[last tertiary ignorable",      0, IGNORE},
470    {"[first secondary ignorable",    0, READCE},
471    {"[last secondary ignorable",     0, READCE},
472    {"[first primary ignorable",      0, READCE},
473    {"[last primary ignorable",       0, READCE},
474    {"[first variable",               0, READCE},
475    {"[last variable",                0, READCE},
476    {"[first regular",                0, READCE},
477    {"[last regular",                 0, READCE},
478    {"[first implicit",               0, READCE},
479    {"[last implicit",                0, READCE},
480    {"[first trailing",               0, READCE},
481    {"[last trailing",                0, READCE},
482
483    {"[Unified_Ideograph",            0, READUNIFIEDIDEOGRAPH},
484    {"[radical",                      0, READRADICAL},
485
486    {"[fixed first implicit byte",    0, IGNORE},
487    {"[fixed last implicit byte",     0, IGNORE},
488    {"[fixed first trail byte",       0, IGNORE},
489    {"[fixed last trail byte",        0, IGNORE},
490    {"[fixed first special byte",     0, IGNORE},
491    {"[fixed last special byte",      0, IGNORE},
492    {"[fixed secondary common byte",                  0, READBYTE},
493    {"[fixed last secondary common byte",             0, READBYTE},
494    {"[fixed first ignorable secondary byte",         0, READBYTE},
495    {"[fixed tertiary common byte",                   0, READBYTE},
496    {"[fixed first ignorable tertiary byte",          0, READBYTE},
497    {"[variable top = ",              0, IGNORE},
498    {"[UCA version = ",               0, READUCAVERSION},
499    {"[top_byte",                     0, READLEADBYTETOSCRIPTS},
500    {"[reorderingTokens",             0, IGNORE},
501    {"[categories",                   0, IGNORE},
502    {"[first tertiary in secondary non-ignorable",    0, IGNORE},
503    {"[last tertiary in secondary non-ignorable",     0, IGNORE},
504    {"[first secondary in primary non-ignorable",     0, IGNORE},
505    {"[last secondary in primary non-ignorable",      0, IGNORE},
506};
507
508static int64_t getOptionValue(const char *name) {
509    for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
510        if(uprv_strcmp(name, vt[i].name) == 0) {
511            return vt[i].value;
512        }
513    }
514    return 0;
515}
516
517static void readAnOption(
518        CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
519    for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
520        int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
521        if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
522            ActionType what_to_do = vt[cnt].what_to_do;
523            char *pointer = skipWhiteSpace(buffer + vtLen);
524            if (what_to_do == IGNORE) { //vt[cnt].what_to_do == IGNORE
525                return;
526            } else if (what_to_do == READCE) {
527                vt[cnt].value = parseCE(builder, pointer, *status);
528                if(U_SUCCESS(*status) && *pointer != ']') {
529                    *status = U_INVALID_FORMAT_ERROR;
530                }
531                if(U_FAILURE(*status)) {
532                    fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer);
533                    return;
534                }
535            } else if(what_to_do == READPRIMARY) {
536                vt[cnt].value = parseWeight(pointer, "]", 4, *status);
537                if(U_FAILURE(*status)) {
538                    fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer);
539                    return;
540                }
541            } else if(what_to_do == READBYTE) {
542                vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24;
543                if(U_FAILURE(*status)) {
544                    fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer);
545                    return;
546                }
547            } else if(what_to_do == READUNIFIEDIDEOGRAPH) {
548                if(implicitHanOrder != NULL) {
549                    fprintf(stderr, "duplicate [Unified_Ideograph] lines\n");
550                    *status = U_INVALID_FORMAT_ERROR;
551                    return;
552                }
553                implicitHanOrder = new HanOrder(*status);
554                if(U_FAILURE(*status)) { return; }
555                for(;;) {
556                    if(*pointer == ']') { break; }
557                    if(*pointer == 0) {
558                        // Missing ] after ranges.
559                        *status = U_INVALID_FORMAT_ERROR;
560                        return;
561                    }
562                    char *s = pointer;
563                    while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; }
564                    char c = *s;
565                    *s = 0;
566                    uint32_t start, end;
567                    u_parseCodePointRange(pointer, &start, &end, status);
568                    *s = c;
569                    if(U_FAILURE(*status)) {
570                        fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer);
571                        *status = U_INVALID_FORMAT_ERROR;
572                        return;
573                    }
574                    implicitHanOrder->addRange((UChar32)start, (UChar32)end, *status);
575                    pointer = skipWhiteSpace(s);
576                }
577                if(hanOrder == HAN_IMPLICIT) {
578                    implicitHanOrder->setBuilderHanOrder(builder, *status);
579                }
580                implicitHanOrder->setDone();
581            } else if(what_to_do == READRADICAL) {
582                if(radicalStrokeOrder == NULL) {
583                    if(implicitHanOrder == NULL) {
584                        fprintf(stderr, "[radical] section before [Unified_Ideograph] line\n");
585                        *status = U_INVALID_FORMAT_ERROR;
586                        return;
587                    }
588                    radicalStrokeOrder = new HanOrder(*status);
589                    if(U_FAILURE(*status)) { return; }
590                } else if(radicalStrokeOrder->isDone()) {
591                    fprintf(stderr, "duplicate [radical] sections\n");
592                    *status = U_INVALID_FORMAT_ERROR;
593                    return;
594                }
595                if(uprv_strcmp(pointer, "end]") == 0) {
596                    if(radicalStrokeOrder->getSet() != implicitHanOrder->getSet()) {
597                        fprintf(stderr, "[radical end]: "
598                                "some of [Unified_Ideograph] missing from [radical] lines\n");
599                        *status = U_INVALID_FORMAT_ERROR;
600                        return;
601                    }
602                    if(hanOrder == HAN_RADICAL_STROKE) {
603                        radicalStrokeOrder->setBuilderHanOrder(builder, *status);
604                    }
605                    radicalStrokeOrder->setDone();
606                } else {
607                    // Read Han characters and ranges between : and ].
608                    // Ignore the radical data before the :.
609                    char *startPointer = uprv_strchr(pointer, ':');
610                    char *limitPointer = uprv_strchr(pointer, ']');
611                    if(startPointer == NULL || limitPointer == NULL ||
612                            (startPointer + 1) >= limitPointer) {
613                        fprintf(stderr, "[radical]: no Han characters listed between : and ]\n");
614                        *status = U_INVALID_FORMAT_ERROR;
615                        return;
616                    }
617                    pointer = startPointer + 1;
618                    int32_t length = (int32_t)(limitPointer - pointer);
619                    for(int32_t i = 0; i < length;) {
620                        UChar32 start;
621                        U8_NEXT(pointer, i, length, start);
622                        UChar32 end;
623                        if(pointer[i] == '-') {
624                            ++i;
625                            U8_NEXT(pointer, i, length, end);
626                        } else {
627                            end = start;
628                        }
629                        if(radicalStrokeOrder->getSet().containsSome(start, end)) {
630                            fprintf(stderr, "[radical]: some of U+%04x..U+%04x occur "
631                                    "multiple times in the radical-stroke order\n",
632                                    start, end);
633                            *status = U_INVALID_FORMAT_ERROR;
634                            return;
635                        }
636                        if(!implicitHanOrder->getSet().contains(start, end)) {
637                            fprintf(stderr, "[radical]: some of U+%04x..U+%04x are "
638                                    "not Unified_Ideograph\n",
639                                    start, end);
640                            *status = U_INVALID_FORMAT_ERROR;
641                            return;
642                        }
643                        radicalStrokeOrder->addRange(start, end, *status);
644                    }
645                }
646            } else if (what_to_do == READUCAVERSION) {
647                u_versionFromString(UCAVersion, pointer);
648                if(beVerbose) {
649                    char uca[U_MAX_VERSION_STRING_LENGTH];
650                    u_versionToString(UCAVersion, uca);
651                    printf("UCA version %s\n", uca);
652                }
653                UVersionInfo UCDVersion;
654                u_getUnicodeVersion(UCDVersion);
655                if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) {
656                    char uca[U_MAX_VERSION_STRING_LENGTH];
657                    char ucd[U_MAX_VERSION_STRING_LENGTH];
658                    u_versionToString(UCAVersion, uca);
659                    u_versionToString(UCDVersion, ucd);
660                    // Warning, not error, to permit bootstrapping during a version upgrade.
661                    fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
662                }
663            } else if (what_to_do == READLEADBYTETOSCRIPTS) {
664                if (strstr(pointer, "COMPRESS") != NULL) {
665                    uint16_t leadByte = (hex2num(*pointer++) * 16);
666                    leadByte += hex2num(*pointer++);
667                    builder.setCompressibleLeadByte(leadByte);
668                }
669                // We do not need the list of scripts on this line.
670            }
671            return;
672        }
673    }
674    fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
675}
676
677static UBool
678readAnElement(char *line,
679        CollationBaseDataBuilder &builder,
680        UnicodeString &prefix, UnicodeString &s,
681        int64_t ces[32], int32_t &cesLength,
682        UErrorCode *status) {
683    if(U_FAILURE(*status)) {
684        return false;
685    }
686    int32_t lineLength = (int32_t)uprv_strlen(line);
687    while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) {
688      line[--lineLength] = 0;
689    }
690
691    if(lineLength >= 3 && line[0] == (char)0xef &&
692            line[1] == (char)0xbb && line[2] == (char)0xbf) {
693        // U+FEFF UTF-8 signature byte sequence.
694        // Ignore, assuming it is at the start of the file.
695        line += 3;
696        lineLength -= 3;
697    }
698    if(line[0] == 0 || line[0] == '#') {
699        return false; // just a comment, skip whole line
700    }
701
702    // Directives.
703    if(line[0] == '[') {
704        readAnOption(builder, line, status);
705        return false;
706    }
707
708    CharString input;
709    char *startCodePoint = line;
710    char *endCodePoint = strchr(startCodePoint, ';');
711    if(endCodePoint == NULL) {
712        fprintf(stderr, "error - line with no code point:\n%s\n", line);
713        *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
714        return false;
715    }
716
717    char *pipePointer = strchr(line, '|');
718    if (pipePointer != NULL) {
719        // Read the prefix string which precedes the actual string.
720        input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status);
721        UChar *prefixChars = prefix.getBuffer(32);
722        int32_t prefixSize =
723            u_parseString(input.data(),
724                          prefixChars, prefix.getCapacity(),
725                          NULL, status);
726        if(U_FAILURE(*status)) {
727            prefix.releaseBuffer(0);
728            fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n",
729                    input.data(), line, u_errorName(*status));
730            *status = U_INVALID_FORMAT_ERROR;
731            return false;
732        }
733        prefix.releaseBuffer(prefixSize);
734        startCodePoint = pipePointer + 1;
735        input.clear();
736    }
737
738    // Read the string which gets the CE(s) assigned.
739    input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status);
740    UChar *uchars = s.getBuffer(32);
741    int32_t cSize =
742        u_parseString(input.data(),
743                      uchars, s.getCapacity(),
744                      NULL, status);
745    if(U_FAILURE(*status)) {
746        s.releaseBuffer(0);
747        fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n",
748                input.data(), line, u_errorName(*status));
749        *status = U_INVALID_FORMAT_ERROR;
750        return false;
751    }
752    s.releaseBuffer(cSize);
753
754    char *pointer = endCodePoint + 1;
755
756    char *commentStart = strchr(pointer, '#');
757    if(commentStart == NULL) {
758        commentStart = strchr(pointer, 0);
759    }
760
761    cesLength = 0;
762    for(;;) {
763        pointer = skipWhiteSpace(pointer);
764        if(pointer == commentStart) {
765            break;
766        }
767        if(cesLength >= 31) {
768            fprintf(stderr, "Error: Too many CEs on line '%s'\n", line);
769            *status = U_INVALID_FORMAT_ERROR;
770            return false;
771        }
772        ces[cesLength++] = parseCE(builder, pointer, *status);
773        if(U_FAILURE(*status)) {
774            fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
775                    line, u_errorName(*status));
776            return false;
777        }
778    }
779
780    if(s.length() == 1 && s[0] == 0xfffe) {
781        // UCA 6.0 gives U+FFFE a special minimum weight using the
782        // byte 02 which is the merge-sort-key separator and illegal for any
783        // other characters.
784    } else {
785        // Rudimentary check for valid bytes in CE weights.
786        // For a more comprehensive check see CollationTest::TestRootElements(),
787        // intltest collate/CollationTest/TestRootElements
788        for (int32_t i = 0; i < cesLength; ++i) {
789            int64_t ce = ces[i];
790            UBool isCompressible = false;
791            for (int j = 7; j >= 0; --j) {
792                uint8_t b = (uint8_t)(ce >> (j * 8));
793                if(j <= 1) { b &= 0x3f; }  // tertiary bytes use 6 bits
794                if (b == 1) {
795                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line);
796                    return false;
797                }
798                if (j == 7 && b == 2) {
799                    fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line);
800                    return false;
801                }
802                if (j == 7) {
803                    isCompressible = builder.isCompressibleLeadByte(b);
804                } else if (j == 6) {
805                    // Primary second bytes 03 and FF are compression terminators.
806                    // 02, 03 and FF are usable when the lead byte is not compressible.
807                    // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
808                    if (isCompressible && (b <= 3 || b == 0xff)) {
809                        fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
810                                b, line);
811                        return false;
812                    }
813                }
814            }
815        }
816    }
817
818    return true;
819}
820
821static void
822parseFractionalUCA(const char *filename,
823                   CollationBaseDataBuilder &builder,
824                   UErrorCode *status)
825{
826    if(U_FAILURE(*status)) { return; }
827    FILE *data = fopen(filename, "r");
828    if(data == NULL) {
829        fprintf(stderr, "Couldn't open file: %s\n", filename);
830        *status = U_FILE_ACCESS_ERROR;
831        return;
832    }
833    int32_t lineNumber = 0;
834    char buffer[30000];
835
836    const Normalizer2* norm = nullptr;
837    if (icu4xMode) {
838        norm = Normalizer2::getNFDInstance(*status);
839    }
840
841    UChar32 maxCodePoint = 0;
842    while(!feof(data)) {
843        if(U_FAILURE(*status)) {
844            fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
845                *status, u_errorName(*status), (int)lineNumber, filename);
846            exit(*status);
847        }
848
849        lineNumber++;
850        char *line = fgets(buffer, sizeof(buffer), data);
851        if(line == NULL) {
852            if(feof(data)) {
853                break;
854            } else {
855                fprintf(stderr, "no more input line and also no EOF!\n");
856                *status = U_INVALID_FORMAT_ERROR;
857                return;
858            }
859        }
860
861        UnicodeString prefix;
862        UnicodeString s;
863        int64_t ces[32];
864        int32_t cesLength = 0;
865        if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) {
866            // we have read the line, now do something sensible with the read data!
867            uint32_t p = (uint32_t)(ces[0] >> 32);
868
869            if(s.length() > 1 && s[0] == 0xFDD0) {
870                // FractionalUCA.txt contractions starting with U+FDD0
871                // are only entered into the inverse table,
872                // not into the normal collation data.
873                builder.addRootElements(ces, cesLength, *status);
874                if(s.length() == 2 && cesLength == 1) {
875                    switch(s[1]) {
876                    case 0x34:
877                        // Lead byte for numeric sorting.
878                        builder.setNumericPrimary(p);
879                        break;
880                    case 0xFF21:
881                        builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
882                        break;
883                    case 0xFF3A:
884                        builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
885                        break;
886                    default:
887                        break;
888                    }
889                }
890            } else {
891                UChar32 c = s.char32At(0);
892                if(c > maxCodePoint) { maxCodePoint = c; }
893
894                // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary.
895                // CollationBaseDataBuilder::init() maps them to special CEs.
896                // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
897                if(0xfffd <= c && c <= 0xffff) { continue; }
898                if (icu4xMode) {
899                    if (c >= 0xAC00 && c <= 0xD7A3) {
900                        // Hangul syllable
901                        continue;
902                    }
903                    if (c >= 0xD800 && c < 0xE000) {
904                        // Surrogate
905                        continue;
906                    }
907                    UnicodeString src;
908                    UnicodeString dst;
909                    src.append(c);
910                    norm->normalize(src, dst, *status);
911                    if (src != dst) {
912                        // c decomposed, skip it
913                        continue;
914                    }
915                }
916                if(s.length() >= 2 && c == 0xFDD1) {
917                    UChar32 c2 = s.char32At(1);
918                    int32_t script = getCharScript(c2);
919                    if(script < 0) {
920                        fprintf(stderr,
921                                "Error: Unknown script for first-primary sample character "
922                                "U+%04X on line %u of %s:\n"
923                                "%s\n"
924                                "    (add the character to genuca.cpp sampleCharsToScripts[])\n",
925                                c2, (int)lineNumber, filename, line);
926                        exit(U_INVALID_FORMAT_ERROR);
927                    }
928                    if(script == USCRIPT_UNKNOWN) {
929                        // FDD1 FDD0, first unassigned-implicit primary
930                        builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
931                        continue;
932                    }
933                    builder.addScriptStart(script, p);
934                    if(script == USCRIPT_HIRAGANA) {
935                        builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
936                    } else if(script == USCRIPT_HAN) {
937                        builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
938                        builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
939                    }
940                }
941
942                if(0xe0000000 <= p && p < 0xf0000000) {
943                    fprintf(stderr,
944                            "Error: Unexpected mapping to an implicit or trailing primary"
945                            " on line %u of %s:\n"
946                            "%s\n",
947                            (int)lineNumber, filename, line);
948                    exit(U_INVALID_FORMAT_ERROR);
949                }
950                builder.add(prefix, s, ces, cesLength, *status);
951            }
952        }
953    }
954
955    int32_t numRanges = 0;
956    int32_t numRangeCodePoints = 0;
957    UChar32 rangeFirst = U_SENTINEL;
958    UChar32 rangeLast = U_SENTINEL;
959    uint32_t rangeFirstPrimary = 0;
960    uint32_t rangeLastPrimary = 0;
961    int32_t rangeStep = -1;
962
963    // Detect ranges of characters in primary code point order,
964    // with 3-byte primaries and
965    // with consistent "step" differences between adjacent primaries.
966    // This relies on the FractionalUCA generator using the same primary-weight incrementation.
967    // Start at U+0180: No ranges for common Latin characters.
968    // Go one beyond maxCodePoint in case a range ends there.
969    for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) {
970        UBool action;
971        uint32_t p = builder.getLongPrimaryIfSingleCE(c);
972        if(p != 0) {
973            // p is a "long" (three-byte) primary.
974            if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) {
975                // Find the offset between the two primaries.
976                int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries(
977                    rangeLastPrimary, p, builder.isCompressiblePrimary(p));
978                if(rangeFirst == rangeLast && step >= 2) {
979                    // c == rangeFirst + 1, store the "step" between range primaries.
980                    rangeStep = step;
981                    rangeLast = c;
982                    rangeLastPrimary = p;
983                    action = 0;  // continue range
984                } else if(rangeStep == step) {
985                    // Continue the range with the same "step" difference.
986                    rangeLast = c;
987                    rangeLastPrimary = p;
988                    action = 0;  // continue range
989                } else {
990                    action = 1;  // maybe finish range, start a new one
991                }
992            } else {
993                action = 1;  // maybe finish range, start a new one
994            }
995        } else {
996            action = -1;  // maybe finish range, do not start a new one
997        }
998        if(action != 0 && rangeFirst >= 0) {
999            // Finish a range.
1000            // Set offset CE32s for a long range, leave single CEs for a short range.
1001            UBool didSetRange = builder.maybeSetPrimaryRange(
1002                rangeFirst, rangeLast,
1003                rangeFirstPrimary, rangeStep, *status);
1004            if(U_FAILURE(*status)) {
1005                fprintf(stderr,
1006                        "failure setting code point order range U+%04lx..U+%04lx "
1007                        "%08lx..%08lx step %d - %s\n",
1008                        (long)rangeFirst, (long)rangeLast,
1009                        (long)rangeFirstPrimary, (long)rangeLastPrimary,
1010                        (int)rangeStep, u_errorName(*status));
1011            } else if(didSetRange) {
1012                int32_t rangeLength = rangeLast - rangeFirst + 1;
1013                if(beVerbose) {
1014                    printf("* set code point order range U+%04lx..U+%04lx [%d] "
1015                            "%08lx..%08lx step %d\n",
1016                            (long)rangeFirst, (long)rangeLast,
1017                            (int)rangeLength,
1018                            (long)rangeFirstPrimary, (long)rangeLastPrimary,
1019                            (int)rangeStep);
1020                }
1021                ++numRanges;
1022                numRangeCodePoints += rangeLength;
1023            }
1024            rangeFirst = U_SENTINEL;
1025            rangeStep = -1;
1026        }
1027        if(action > 0) {
1028            // Start a new range.
1029            rangeFirst = rangeLast = c;
1030            rangeFirstPrimary = rangeLastPrimary = p;
1031        }
1032    }
1033    printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints);
1034
1035    // Idea: Probably best to work in two passes.
1036    // Pass 1 for reading all data, setting isCompressible flags (and reordering groups)
1037    // and finding ranges.
1038    // Then set the ranges in a newly initialized builder
1039    // for optimal compression (makes sure that adjacent blocks can overlap easily).
1040    // Then set all mappings outside the ranges.
1041    //
1042    // In the first pass, we could store mappings in a simple list,
1043    // with single-character/single-long-primary-CE mappings in a UTrie2;
1044    // or store the mappings in a temporary builder;
1045    // or we could just parse the input file again in the second pass.
1046    //
1047    // Ideally set/copy U+0000..U+017F before setting anything else,
1048    // then set default Han/Hangul, then set the ranges, then copy non-range mappings.
1049    // It should be easy to copy mappings from an un-built builder to a new one.
1050    // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions.
1051
1052    if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
1053        fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
1054        fclose(data);
1055        return;
1056    }
1057
1058    if (beVerbose) {
1059        printf("\nLines read: %u\n", (int)lineNumber);
1060    }
1061
1062    fclose(data);
1063
1064    return;
1065}
1066
1067static void
1068buildAndWriteBaseData(CollationBaseDataBuilder &builder,
1069                      const char *path, UErrorCode &errorCode) {
1070    if(U_FAILURE(errorCode)) { return; }
1071
1072    if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) {
1073        fprintf(stderr, "error: unexpected [fixed secondary common byte]");
1074        errorCode = U_INVALID_FORMAT_ERROR;
1075        return;
1076    }
1077    if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) {
1078        fprintf(stderr, "error: unexpected [fixed tertiary common byte]");
1079        errorCode = U_INVALID_FORMAT_ERROR;
1080        return;
1081    }
1082
1083    CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
1084    builder.enableFastLatin();
1085    builder.build(data, errorCode);
1086    if(U_FAILURE(errorCode)) {
1087        fprintf(stderr, "builder.build() failed: %s\n",
1088                u_errorName(errorCode));
1089        return;
1090    }
1091
1092    // The CollationSettings constructor gives us the properly encoded
1093    // default options, so that we need not duplicate them here.
1094    CollationSettings settings;
1095
1096    UVector32 rootElements(errorCode);
1097    for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) {
1098        rootElements.addElement(0, errorCode);
1099    }
1100    builder.buildRootElementsTable(rootElements, errorCode);
1101    if(U_FAILURE(errorCode)) {
1102        fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n",
1103                u_errorName(errorCode));
1104        return;
1105    }
1106    int32_t index = CollationRootElements::IX_COUNT;
1107    rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX);
1108
1109    while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; }
1110    rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX);
1111
1112    while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
1113        ++index;
1114    }
1115    rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX);
1116
1117    rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE,
1118                              CollationRootElements::IX_COMMON_SEC_AND_TER_CE);
1119
1120    int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24;
1121    secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16;
1122    secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte");
1123    rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES);
1124
1125    LocalMemory<uint8_t> buffer;
1126    int32_t capacity = 1000000;
1127    uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
1128    if(dest == NULL) {
1129        fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1130                (long)capacity);
1131        errorCode = U_MEMORY_ALLOCATION_ERROR;
1132        return;
1133    }
1134    int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
1135    int32_t totalSize = CollationDataWriter::writeBase(
1136            data, settings,
1137            rootElements.getBuffer(), rootElements.size(),
1138            indexes, dest, capacity,
1139            errorCode);
1140    if(U_FAILURE(errorCode)) {
1141        fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n",
1142                (long)capacity, u_errorName(errorCode));
1143        return;
1144    }
1145    printf("*** CLDR root collation part sizes ***\n");
1146    CollationInfo::printSizes(totalSize, indexes);
1147    printf("*** CLDR root collation size:   %6ld (with file header but no copyright string)\n",
1148           (long)totalSize + 32);  // 32 bytes = DataHeader rounded up to 16-byte boundary
1149
1150    CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
1151    const char *dataName =
1152        hanOrder == HAN_IMPLICIT ?
1153            (icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") :
1154            (icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan");
1155    UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
1156                                       withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1157    if(U_FAILURE(errorCode)) {
1158        fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n",
1159                path, u_errorName(errorCode));
1160        return;
1161    }
1162
1163    udata_writeBlock(pData, dest, totalSize);
1164    long dataLength = udata_finish(pData, &errorCode);
1165    if(U_FAILURE(errorCode)) {
1166        fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode));
1167        return;
1168    }
1169
1170    if(dataLength != (long)totalSize) {
1171        fprintf(stderr,
1172                "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n",
1173                dataLength, (long)totalSize);
1174        errorCode=U_INTERNAL_PROGRAM_ERROR;
1175    }
1176}
1177
1178/**
1179 * Adds each lead surrogate to the bmp set if any of the 1024
1180 * associated supplementary code points is in the supp set.
1181 * These can be one and the same set.
1182 */
1183static void
1184setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) {
1185    UChar32 c = 0x10000;
1186    for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1187        if(supp.containsSome(c, c + 0x3ff)) {
1188            bmp.add(lead);
1189        }
1190    }
1191}
1192
1193static int32_t
1194makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256],
1195                    UErrorCode &errorCode) {
1196    if(U_FAILURE(errorCode)) { return 0; }
1197    bits[0] = 0;  // no bits set
1198    bits[1] = 0xffffffff;  // all bits set
1199    int32_t bitsLength = 2;
1200    int32_t i = 0;
1201    for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) {
1202        if(set.containsNone(c, c + 0x1f)) {
1203            index[i] = 0;
1204        } else if(set.contains(c, c + 0x1f)) {
1205            index[i] = 1;
1206        } else {
1207            uint32_t b = 0;
1208            for(int32_t j = 0; j <= 0x1f; ++j) {
1209                if(set.contains(c + j)) {
1210                    b |= (uint32_t)1 << j;
1211                }
1212            }
1213            int32_t k;
1214            for(k = 2;; ++k) {
1215                if(k == bitsLength) {
1216                    // new bit combination
1217                    if(bitsLength == 256) {
1218                        errorCode = U_BUFFER_OVERFLOW_ERROR;
1219                        return 0;
1220                    }
1221                    bits[bitsLength++] = b;
1222                    break;
1223                }
1224                if(bits[k] == b) {
1225                    // duplicate bit combination
1226                    break;
1227                }
1228            }
1229            index[i] = k;
1230        }
1231    }
1232    return bitsLength;
1233}
1234
1235// TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values,
1236// use that rather than properties APIs.
1237// Then consider moving related logic for the unsafeBwdSet back from the loader into this builder.
1238
1239/**
1240 * Builds data for the FCD check fast path.
1241 * For details see the CollationFCD class comments.
1242 */
1243static void
1244buildAndWriteFCDData(const char *path, UErrorCode &errorCode) {
1245    UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode);
1246    UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode);
1247    if(U_FAILURE(errorCode)) { return; }
1248    setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet);
1249    // The following supp(lccc)->lead(tccc) should be unnecessary
1250    // after the previous supp(tccc)->lead(tccc)
1251    // because there should not be any characters with lccc!=0 and tccc=0.
1252    // It is safe and harmless.
1253    setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet);
1254    setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet);
1255    uint8_t lcccIndex[0x800], tcccIndex[0x800];
1256    uint32_t lcccBits[256], tcccBits[256];
1257    int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode);
1258    int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode);
1259    printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4);
1260    printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4);
1261
1262    if(U_FAILURE(errorCode)) { return; }
1263
1264    FILE *f=usrc_create(path, "collationfcd.cpp", 2016,
1265                        "icu/tools/unicode/c/genuca/genuca.cpp");
1266    if(f==NULL) {
1267        errorCode=U_FILE_ACCESS_ERROR;
1268        return;
1269    }
1270    fputs("#include \"unicode/utypes.h\"\n\n", f);
1271    fputs("#if !UCONFIG_NO_COLLATION\n\n", f);
1272    fputs("#include \"collationfcd.h\"\n\n", f);
1273    fputs("U_NAMESPACE_BEGIN\n\n", f);
1274    usrc_writeArray(f,
1275        "const uint8_t CollationFCD::lcccIndex[%ld]={\n",
1276        lcccIndex, 8, 0x800,
1277        "", "\n};\n\n");
1278    usrc_writeArray(f,
1279        "const uint32_t CollationFCD::lcccBits[%ld]={\n",
1280        lcccBits, 32, lcccBitsLength,
1281        "", "\n};\n\n");
1282    usrc_writeArray(f,
1283        "const uint8_t CollationFCD::tcccIndex[%ld]={\n",
1284        tcccIndex, 8, 0x800,
1285        "", "\n};\n\n");
1286    usrc_writeArray(f,
1287        "const uint32_t CollationFCD::tcccBits[%ld]={\n",
1288        tcccBits, 32, tcccBitsLength,
1289        "", "\n};\n\n");
1290    fputs("U_NAMESPACE_END\n\n", f);
1291    fputs("#endif  // !UCONFIG_NO_COLLATION\n", f);
1292    fclose(f);
1293}
1294
/**
 * Runs the whole generation pipeline: creates and initializes a builder,
 * parses the FractionalUCA file into it, writes the binary base data,
 * and writes the FCD source data.
 *
 * @param fracUCAPath    path of the FractionalUCA.txt input file
 * @param binaryDataPath output directory passed to buildAndWriteBaseData()
 * @param sourceCodePath output directory passed to buildAndWriteFCDData()
 * @param errorCode      in/out ICU error code; nothing is done if it already
 *                       indicates a failure
 */
static void
parseAndWriteCollationRootData(
        const char *fracUCAPath,
        const char *binaryDataPath,
        const char *sourceCodePath,
        UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    CollationBaseDataBuilder builder(icu4xMode, errorCode);
    builder.init(errorCode);
    // Each step receives the shared error code, so the steps run in this order
    // and each one sees any failure recorded by an earlier step.
    parseFractionalUCA(fracUCAPath, builder, &errorCode);
    buildAndWriteBaseData(builder, binaryDataPath, errorCode);
    buildAndWriteFCDData(sourceCodePath, errorCode);
}
1308
1309// ------------------------------------------------------------------------- ***
1310
// Indexes into the options[] array below; main() uses them as options[HAN_ORDER] etc.
// The order of these constants must match the order of the options[] entries.
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    HAN_ORDER,
    ICU4X
};
1319
// Command-line option table passed to u_parseArgs() in main().
// The entries are accessed by the enum constants above, so the order must match.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    // --hanOrder takes a value; main() accepts "implicit" and "radical-stroke".
    // NOTE(review): '\x01' presumably means there is no usable short form -- confirm.
    UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG),
    // --icu4x / -X is a flag without a value; it sets icu4xMode in main().
    UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG)
};
1328
1329extern "C" int
1330main(int argc, char* argv[]) {
1331    U_MAIN_INIT_ARGS(argc, argv);
1332
1333    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1334
1335    /* error handling, printing usage message */
1336    if(argc<0) {
1337        fprintf(stderr,
1338            "error in command line argument \"%s\"\n",
1339            argv[-argc]);
1340    }
1341    if(options[HAN_ORDER].doesOccur) {
1342        const char *order = options[HAN_ORDER].value;
1343        if(uprv_strcmp(order, "implicit") == 0) {
1344            hanOrder = HAN_IMPLICIT;
1345        } else if(uprv_strcmp(order, "radical-stroke") == 0) {
1346            hanOrder = HAN_RADICAL_STROKE;
1347        }
1348    }
1349    if(hanOrder == HAN_NO_ORDER) {
1350        argc = -1;
1351    }
1352    if( argc<2 ||
1353        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
1354    ) {
1355        /*
1356         * Broken into chunks because the C89 standard says the minimum
1357         * required supported string length is 509 bytes.
1358         */
1359        fprintf(stderr,
1360            "Usage: %s [-options] --hanOrder (implicit|radical-stroke) path/to/ICU/src/root\n"
1361            "\n"
1362            "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n"
1363            "writes source and binary data files with the collation root data.\n"
1364            "\n",
1365            argv[0]);
1366        fprintf(stderr,
1367            "Options:\n"
1368            "\t-h or -? or --help  this usage text\n"
1369            "\t-v or --verbose     verbose output\n"
1370            "\t-c or --copyright   include a copyright notice\n"
1371            "\t      --hanOrder    implicit or radical-stroke\n");
1372        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1373    }
1374
1375    beVerbose=options[VERBOSE].doesOccur;
1376    withCopyright=options[COPYRIGHT].doesOccur;
1377    icu4xMode=options[ICU4X].doesOccur;
1378
1379    IcuToolErrorCode errorCode("genuca");
1380
1381    CharString icuSrcRoot(argv[1], errorCode);
1382
1383    CharString icuSource(icuSrcRoot, errorCode);
1384    icuSource.appendPathPart("source", errorCode);
1385
1386    CharString icuSourceData(icuSource, errorCode);
1387    icuSourceData.appendPathPart("data", errorCode);
1388
1389    CharString fracUCAPath(icuSourceData, errorCode);
1390    fracUCAPath.appendPathPart("unidata", errorCode);
1391    fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode);
1392
1393    CharString sourceDataInColl(icuSourceData, errorCode);
1394    sourceDataInColl.appendPathPart("in", errorCode);
1395    sourceDataInColl.appendPathPart("coll", errorCode);
1396
1397    CharString sourceI18n(icuSource, errorCode);
1398    sourceI18n.appendPathPart("i18n", errorCode);
1399
1400    errorCode.assertSuccess();
1401
1402    parseAndWriteCollationRootData(
1403        fracUCAPath.data(),
1404        sourceDataInColl.data(),
1405        sourceI18n.data(),
1406        errorCode);
1407
1408    return errorCode;
1409}
1410
1411#endif  // UCONFIG_NO_COLLATION
1412