12e5b6d6dSopenharmony_ci// © 2017 and later: Unicode, Inc. and others.
22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
32e5b6d6dSopenharmony_ci/*
42e5b6d6dSopenharmony_ci*******************************************************************************
52e5b6d6dSopenharmony_ci* Copyright (C) 2012-2015, International Business Machines
62e5b6d6dSopenharmony_ci* Corporation and others.  All Rights Reserved.
72e5b6d6dSopenharmony_ci*******************************************************************************
82e5b6d6dSopenharmony_ci* collationbasedatabuilder.cpp
92e5b6d6dSopenharmony_ci*
102e5b6d6dSopenharmony_ci* created on: 2012aug11
112e5b6d6dSopenharmony_ci* created by: Markus W. Scherer
122e5b6d6dSopenharmony_ci*/
132e5b6d6dSopenharmony_ci
142e5b6d6dSopenharmony_ci#include "unicode/utypes.h"
152e5b6d6dSopenharmony_ci
162e5b6d6dSopenharmony_ci#if !UCONFIG_NO_COLLATION
172e5b6d6dSopenharmony_ci
182e5b6d6dSopenharmony_ci#include "unicode/localpointer.h"
192e5b6d6dSopenharmony_ci#include "unicode/ucharstriebuilder.h"
202e5b6d6dSopenharmony_ci#include "unicode/uniset.h"
212e5b6d6dSopenharmony_ci#include "unicode/unistr.h"
222e5b6d6dSopenharmony_ci#include "unicode/utf16.h"
232e5b6d6dSopenharmony_ci#include "collation.h"
242e5b6d6dSopenharmony_ci#include "collationbasedatabuilder.h"
252e5b6d6dSopenharmony_ci#include "collationdata.h"
262e5b6d6dSopenharmony_ci#include "collationdatabuilder.h"
272e5b6d6dSopenharmony_ci#include "collationrootelements.h"
282e5b6d6dSopenharmony_ci#include "normalizer2impl.h"
292e5b6d6dSopenharmony_ci#include "uassert.h"
302e5b6d6dSopenharmony_ci#include "utrie2.h"
312e5b6d6dSopenharmony_ci#include "uvectr32.h"
322e5b6d6dSopenharmony_ci#include "uvectr64.h"
332e5b6d6dSopenharmony_ci#include "uvector.h"
342e5b6d6dSopenharmony_ci
352e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN
362e5b6d6dSopenharmony_ci
372e5b6d6dSopenharmony_cinamespace {
382e5b6d6dSopenharmony_ci
392e5b6d6dSopenharmony_ci/**
402e5b6d6dSopenharmony_ci * Compare two signed int64_t values as if they were unsigned.
412e5b6d6dSopenharmony_ci */
422e5b6d6dSopenharmony_ciint32_t
432e5b6d6dSopenharmony_cicompareInt64AsUnsigned(int64_t a, int64_t b) {
442e5b6d6dSopenharmony_ci    if((uint64_t)a < (uint64_t)b) {
452e5b6d6dSopenharmony_ci        return -1;
462e5b6d6dSopenharmony_ci    } else if((uint64_t)a > (uint64_t)b) {
472e5b6d6dSopenharmony_ci        return 1;
482e5b6d6dSopenharmony_ci    } else {
492e5b6d6dSopenharmony_ci        return 0;
502e5b6d6dSopenharmony_ci    }
512e5b6d6dSopenharmony_ci}
522e5b6d6dSopenharmony_ci
532e5b6d6dSopenharmony_ci// TODO: Try to merge this with the binarySearch in alphaindex.cpp.
542e5b6d6dSopenharmony_ci/**
552e5b6d6dSopenharmony_ci * Like Java Collections.binarySearch(List, String, Comparator).
562e5b6d6dSopenharmony_ci *
572e5b6d6dSopenharmony_ci * @return the index>=0 where the item was found,
582e5b6d6dSopenharmony_ci *         or the index<0 for inserting the string at ~index in sorted order
592e5b6d6dSopenharmony_ci */
602e5b6d6dSopenharmony_ciint32_t
612e5b6d6dSopenharmony_cibinarySearch(const UVector64 &list, int64_t ce) {
622e5b6d6dSopenharmony_ci    if (list.size() == 0) { return ~0; }
632e5b6d6dSopenharmony_ci    int32_t start = 0;
642e5b6d6dSopenharmony_ci    int32_t limit = list.size();
652e5b6d6dSopenharmony_ci    for (;;) {
662e5b6d6dSopenharmony_ci        int32_t i = (start + limit) / 2;
672e5b6d6dSopenharmony_ci        int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
682e5b6d6dSopenharmony_ci        if (cmp == 0) {
692e5b6d6dSopenharmony_ci            return i;
702e5b6d6dSopenharmony_ci        } else if (cmp < 0) {
712e5b6d6dSopenharmony_ci            if (i == start) {
722e5b6d6dSopenharmony_ci                return ~start;  // insert ce before i
732e5b6d6dSopenharmony_ci            }
742e5b6d6dSopenharmony_ci            limit = i;
752e5b6d6dSopenharmony_ci        } else {
762e5b6d6dSopenharmony_ci            if (i == start) {
772e5b6d6dSopenharmony_ci                return ~(start + 1);  // insert ce after i
782e5b6d6dSopenharmony_ci            }
792e5b6d6dSopenharmony_ci            start = i;
802e5b6d6dSopenharmony_ci        }
812e5b6d6dSopenharmony_ci    }
822e5b6d6dSopenharmony_ci}
832e5b6d6dSopenharmony_ci
842e5b6d6dSopenharmony_ci}  // namespace
852e5b6d6dSopenharmony_ci
862e5b6d6dSopenharmony_ciCollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
872e5b6d6dSopenharmony_ci        : CollationDataBuilder(icu4xMode, errorCode),
882e5b6d6dSopenharmony_ci          numericPrimary(0x12000000),
892e5b6d6dSopenharmony_ci          firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
902e5b6d6dSopenharmony_ci          rootElements(errorCode),
912e5b6d6dSopenharmony_ci          scriptStartsLength(1) {
922e5b6d6dSopenharmony_ci    uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
932e5b6d6dSopenharmony_ci    uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
942e5b6d6dSopenharmony_ci    this->icu4xMode = icu4xMode;
952e5b6d6dSopenharmony_ci}
962e5b6d6dSopenharmony_ci
972e5b6d6dSopenharmony_ciCollationBaseDataBuilder::~CollationBaseDataBuilder() {
982e5b6d6dSopenharmony_ci}
992e5b6d6dSopenharmony_ci
1002e5b6d6dSopenharmony_civoid
1012e5b6d6dSopenharmony_ciCollationBaseDataBuilder::init(UErrorCode &errorCode) {
1022e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return; }
1032e5b6d6dSopenharmony_ci    if(trie != NULL) {
1042e5b6d6dSopenharmony_ci        errorCode = U_INVALID_STATE_ERROR;
1052e5b6d6dSopenharmony_ci        return;
1062e5b6d6dSopenharmony_ci    }
1072e5b6d6dSopenharmony_ci
1082e5b6d6dSopenharmony_ci    // Not compressible:
1092e5b6d6dSopenharmony_ci    // - digits
1102e5b6d6dSopenharmony_ci    // - Latin
1112e5b6d6dSopenharmony_ci    // - Hani
1122e5b6d6dSopenharmony_ci    // - trail weights
1132e5b6d6dSopenharmony_ci    // Some scripts are compressible, some are not.
1142e5b6d6dSopenharmony_ci    uprv_memset(compressibleBytes, false, 256);
1152e5b6d6dSopenharmony_ci    compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = true;
1162e5b6d6dSopenharmony_ci
1172e5b6d6dSopenharmony_ci    // For a base, the default is to compute an unassigned-character implicit CE.
1182e5b6d6dSopenharmony_ci    // This includes surrogate code points; see the last option in
1192e5b6d6dSopenharmony_ci    // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
1202e5b6d6dSopenharmony_ci    trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
1212e5b6d6dSopenharmony_ci
1222e5b6d6dSopenharmony_ci    // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
1232e5b6d6dSopenharmony_ci    // In the ICU4X case, only preallocate ASCII, because we don't store CE32s for
1242e5b6d6dSopenharmony_ci    // precomposed characters.
1252e5b6d6dSopenharmony_ci    for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) {
1262e5b6d6dSopenharmony_ci        utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
1272e5b6d6dSopenharmony_ci    }
1282e5b6d6dSopenharmony_ci
1292e5b6d6dSopenharmony_ci    utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
1302e5b6d6dSopenharmony_ci    // No root element for the merge separator which has 02 weights.
1312e5b6d6dSopenharmony_ci    // Some code assumes that the root first primary CE is the "space first primary"
1322e5b6d6dSopenharmony_ci    // from FractionalUCA.txt.
1332e5b6d6dSopenharmony_ci
1342e5b6d6dSopenharmony_ci    if (!icu4xMode) {
1352e5b6d6dSopenharmony_ci        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
1362e5b6d6dSopenharmony_ci        utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
1372e5b6d6dSopenharmony_ci    }
1382e5b6d6dSopenharmony_ci
1392e5b6d6dSopenharmony_ci    // Add a mapping for the first-unassigned boundary,
1402e5b6d6dSopenharmony_ci    // which is the AlphabeticIndex overflow boundary.
1412e5b6d6dSopenharmony_ci    UnicodeString s((UChar)0xfdd1);  // Script boundary contractions start with U+FDD1.
1422e5b6d6dSopenharmony_ci    s.append((UChar)0xfdd0);  // Zzzz script sample character U+FDD0.
1432e5b6d6dSopenharmony_ci    int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
1442e5b6d6dSopenharmony_ci    add(UnicodeString(), s, &ce, 1, errorCode);
1452e5b6d6dSopenharmony_ci
1462e5b6d6dSopenharmony_ci    // Add a tailoring boundary, but not a mapping, for [first trailing].
1472e5b6d6dSopenharmony_ci    ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
1482e5b6d6dSopenharmony_ci    rootElements.addElement(ce, errorCode);
1492e5b6d6dSopenharmony_ci
1502e5b6d6dSopenharmony_ci    // U+FFFD maps to a CE with the third-highest primary weight,
1512e5b6d6dSopenharmony_ci    // for predictable handling of ill-formed UTF-8.
1522e5b6d6dSopenharmony_ci    uint32_t ce32 = Collation::FFFD_CE32;
1532e5b6d6dSopenharmony_ci    utrie2_set32(trie, 0xfffd, ce32, &errorCode);
1542e5b6d6dSopenharmony_ci    addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
1552e5b6d6dSopenharmony_ci
1562e5b6d6dSopenharmony_ci    // U+FFFF maps to a CE with the highest primary weight.
1572e5b6d6dSopenharmony_ci    ce32 = Collation::MAX_REGULAR_CE32;
1582e5b6d6dSopenharmony_ci    utrie2_set32(trie, 0xffff, ce32, &errorCode);
1592e5b6d6dSopenharmony_ci    addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
1602e5b6d6dSopenharmony_ci}
1612e5b6d6dSopenharmony_ci
1622e5b6d6dSopenharmony_civoid
1632e5b6d6dSopenharmony_ciCollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
1642e5b6d6dSopenharmony_ci                                        UErrorCode &errorCode) {
1652e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode) || length == 0) { return; }
1662e5b6d6dSopenharmony_ci    if((length & 1) != 0) {  // incomplete start/end pairs
1672e5b6d6dSopenharmony_ci        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1682e5b6d6dSopenharmony_ci        return;
1692e5b6d6dSopenharmony_ci    }
1702e5b6d6dSopenharmony_ci    if(isAssigned(0x4e00)) {  // already set
1712e5b6d6dSopenharmony_ci        errorCode = U_INVALID_STATE_ERROR;
1722e5b6d6dSopenharmony_ci        return;
1732e5b6d6dSopenharmony_ci    }
1742e5b6d6dSopenharmony_ci    int32_t numHanCodePoints = 0;
1752e5b6d6dSopenharmony_ci    for(int32_t i = 0; i < length; i += 2) {
1762e5b6d6dSopenharmony_ci        UChar32 start = ranges[i];
1772e5b6d6dSopenharmony_ci        UChar32 end = ranges[i + 1];
1782e5b6d6dSopenharmony_ci        numHanCodePoints += end - start + 1;
1792e5b6d6dSopenharmony_ci    }
1802e5b6d6dSopenharmony_ci    // Multiply the number of code points by (gap+1).
1812e5b6d6dSopenharmony_ci    // Add hanStep+2 for tailoring after the last Han character.
1822e5b6d6dSopenharmony_ci    int32_t gap = 1;
1832e5b6d6dSopenharmony_ci    hanStep = gap + 1;
1842e5b6d6dSopenharmony_ci    int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
1852e5b6d6dSopenharmony_ci    // Numbers of Han primaries per lead byte determined by
1862e5b6d6dSopenharmony_ci    // numbers of 2nd (not compressible) times 3rd primary byte values.
1872e5b6d6dSopenharmony_ci    int32_t numHanPerLeadByte = 254 * 254;
1882e5b6d6dSopenharmony_ci    int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
1892e5b6d6dSopenharmony_ci    uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
1902e5b6d6dSopenharmony_ci    hanPrimary |= 0x20200;
1912e5b6d6dSopenharmony_ci    firstHanPrimary = hanPrimary;
1922e5b6d6dSopenharmony_ci    for(int32_t i = 0; i < length; i += 2) {
1932e5b6d6dSopenharmony_ci        UChar32 start = ranges[i];
1942e5b6d6dSopenharmony_ci        UChar32 end = ranges[i + 1];
1952e5b6d6dSopenharmony_ci        hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
1962e5b6d6dSopenharmony_ci    }
1972e5b6d6dSopenharmony_ci    // One past the actual last one, but that is harmless for tailoring.
1982e5b6d6dSopenharmony_ci    // It saves us from subtracting "hanStep" and handling underflows.
1992e5b6d6dSopenharmony_ci    lastHanPrimary = hanPrimary;
2002e5b6d6dSopenharmony_ci}
2012e5b6d6dSopenharmony_ci
2022e5b6d6dSopenharmony_ciUBool
2032e5b6d6dSopenharmony_ciCollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
2042e5b6d6dSopenharmony_ci    return compressibleBytes[b];
2052e5b6d6dSopenharmony_ci}
2062e5b6d6dSopenharmony_ci
2072e5b6d6dSopenharmony_civoid
2082e5b6d6dSopenharmony_ciCollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
2092e5b6d6dSopenharmony_ci    compressibleBytes[b] = true;
2102e5b6d6dSopenharmony_ci}
2112e5b6d6dSopenharmony_ci
2122e5b6d6dSopenharmony_ciint32_t
2132e5b6d6dSopenharmony_ciCollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
2142e5b6d6dSopenharmony_ci    if((p1 & 0xff000000) == (p2 & 0xff000000)) {
2152e5b6d6dSopenharmony_ci        // Same lead bytes.
2162e5b6d6dSopenharmony_ci        return (int32_t)(p2 - p1) >> 16;
2172e5b6d6dSopenharmony_ci    } else {
2182e5b6d6dSopenharmony_ci        int32_t linear1;
2192e5b6d6dSopenharmony_ci        int32_t linear2;
2202e5b6d6dSopenharmony_ci        int32_t factor;
2212e5b6d6dSopenharmony_ci        if(isCompressible) {
2222e5b6d6dSopenharmony_ci            // Second byte for compressible lead byte: 251 bytes 04..FE
2232e5b6d6dSopenharmony_ci            linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
2242e5b6d6dSopenharmony_ci            linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
2252e5b6d6dSopenharmony_ci            factor = 251;
2262e5b6d6dSopenharmony_ci        } else {
2272e5b6d6dSopenharmony_ci            // Second byte for incompressible lead byte: 254 bytes 02..FF
2282e5b6d6dSopenharmony_ci            linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
2292e5b6d6dSopenharmony_ci            linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
2302e5b6d6dSopenharmony_ci            factor = 254;
2312e5b6d6dSopenharmony_ci        }
2322e5b6d6dSopenharmony_ci        linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
2332e5b6d6dSopenharmony_ci        linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
2342e5b6d6dSopenharmony_ci        return linear2 - linear1;
2352e5b6d6dSopenharmony_ci    }
2362e5b6d6dSopenharmony_ci}
2372e5b6d6dSopenharmony_ci
2382e5b6d6dSopenharmony_ciint32_t
2392e5b6d6dSopenharmony_ciCollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
2402e5b6d6dSopenharmony_ci    if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
2412e5b6d6dSopenharmony_ci        // Same first two bytes.
2422e5b6d6dSopenharmony_ci        return (int32_t)(p2 - p1) >> 8;
2432e5b6d6dSopenharmony_ci    } else {
2442e5b6d6dSopenharmony_ci        // Third byte: 254 bytes 02..FF
2452e5b6d6dSopenharmony_ci        int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
2462e5b6d6dSopenharmony_ci        int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
2472e5b6d6dSopenharmony_ci        int32_t factor;
2482e5b6d6dSopenharmony_ci        if(isCompressible) {
2492e5b6d6dSopenharmony_ci            // Second byte for compressible lead byte: 251 bytes 04..FE
2502e5b6d6dSopenharmony_ci            linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
2512e5b6d6dSopenharmony_ci            linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
2522e5b6d6dSopenharmony_ci            factor = 251 * 254;
2532e5b6d6dSopenharmony_ci        } else {
2542e5b6d6dSopenharmony_ci            // Second byte for incompressible lead byte: 254 bytes 02..FF
2552e5b6d6dSopenharmony_ci            linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
2562e5b6d6dSopenharmony_ci            linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
2572e5b6d6dSopenharmony_ci            factor = 254 * 254;
2582e5b6d6dSopenharmony_ci        }
2592e5b6d6dSopenharmony_ci        linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
2602e5b6d6dSopenharmony_ci        linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
2612e5b6d6dSopenharmony_ci        return linear2 - linear1;
2622e5b6d6dSopenharmony_ci    }
2632e5b6d6dSopenharmony_ci}
2642e5b6d6dSopenharmony_ci
2652e5b6d6dSopenharmony_ciuint32_t
2662e5b6d6dSopenharmony_ciCollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
2672e5b6d6dSopenharmony_ci    addRootElements(ces, cesLength, errorCode);
2682e5b6d6dSopenharmony_ci    return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
2692e5b6d6dSopenharmony_ci}
2702e5b6d6dSopenharmony_ci
2712e5b6d6dSopenharmony_civoid
2722e5b6d6dSopenharmony_ciCollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
2732e5b6d6dSopenharmony_ci                                          UErrorCode &errorCode) {
2742e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return; }
2752e5b6d6dSopenharmony_ci    for(int32_t i = 0; i < cesLength; ++i) {
2762e5b6d6dSopenharmony_ci        addRootElement(ces[i], errorCode);
2772e5b6d6dSopenharmony_ci    }
2782e5b6d6dSopenharmony_ci}
2792e5b6d6dSopenharmony_ci
2802e5b6d6dSopenharmony_civoid
2812e5b6d6dSopenharmony_ciCollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
2822e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode) || ce == 0) { return; }
2832e5b6d6dSopenharmony_ci    // Remove case bits.
2842e5b6d6dSopenharmony_ci    ce &= INT64_C(0xffffffffffff3fff);
2852e5b6d6dSopenharmony_ci    U_ASSERT((ce & 0xc0) == 0);  // quaternary==0
2862e5b6d6dSopenharmony_ci    // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
2872e5b6d6dSopenharmony_ci    // We will add it later, as part of the Han ranges.
2882e5b6d6dSopenharmony_ci    uint32_t p = (uint32_t)(ce >> 32);
2892e5b6d6dSopenharmony_ci    uint32_t secTer = (uint32_t)ce;
2902e5b6d6dSopenharmony_ci    if(firstHanPrimary <= p && p <= lastHanPrimary) {
2912e5b6d6dSopenharmony_ci        if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
2922e5b6d6dSopenharmony_ci            // buildRootElementsTable() does not currently handle this case.
2932e5b6d6dSopenharmony_ci            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
2942e5b6d6dSopenharmony_ci            return;
2952e5b6d6dSopenharmony_ci        }
2962e5b6d6dSopenharmony_ci        if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
2972e5b6d6dSopenharmony_ci            return;
2982e5b6d6dSopenharmony_ci        }
2992e5b6d6dSopenharmony_ci    }
3002e5b6d6dSopenharmony_ci    if(secTer != Collation::COMMON_SEC_AND_TER_CE) {  // minor optimization
3012e5b6d6dSopenharmony_ci        // Check that secondary and tertiary weights are > 01.
3022e5b6d6dSopenharmony_ci        uint32_t s = secTer >> 16;
3032e5b6d6dSopenharmony_ci        uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
3042e5b6d6dSopenharmony_ci        if((s != 0 && s <= Collation::BEFORE_WEIGHT16) ||
3052e5b6d6dSopenharmony_ci                (t != 0 && t <= Collation::BEFORE_WEIGHT16)) {
3062e5b6d6dSopenharmony_ci            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
3072e5b6d6dSopenharmony_ci            return;
3082e5b6d6dSopenharmony_ci        }
3092e5b6d6dSopenharmony_ci    }
3102e5b6d6dSopenharmony_ci    // Check that primaries have at most 3 bytes.
3112e5b6d6dSopenharmony_ci    if((p & 0xff) != 0) {
3122e5b6d6dSopenharmony_ci        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
3132e5b6d6dSopenharmony_ci        return;
3142e5b6d6dSopenharmony_ci    }
3152e5b6d6dSopenharmony_ci    int32_t i = binarySearch(rootElements, ce);
3162e5b6d6dSopenharmony_ci    if(i < 0) {
3172e5b6d6dSopenharmony_ci        rootElements.insertElementAt(ce, ~i, errorCode);
3182e5b6d6dSopenharmony_ci    }
3192e5b6d6dSopenharmony_ci}
3202e5b6d6dSopenharmony_ci
3212e5b6d6dSopenharmony_civoid
3222e5b6d6dSopenharmony_ciCollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
3232e5b6d6dSopenharmony_ci    // The primary weight must be the lowest possible for a two-byte prefix.
3242e5b6d6dSopenharmony_ci    // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
3252e5b6d6dSopenharmony_ci    U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
3262e5b6d6dSopenharmony_ci    p >>= 8;
3272e5b6d6dSopenharmony_ci    U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
3282e5b6d6dSopenharmony_ci    p >>= 8;
3292e5b6d6dSopenharmony_ci    uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
3302e5b6d6dSopenharmony_ci    if((p & 0xff) == lowestP2) {
3312e5b6d6dSopenharmony_ci        // The script really starts on a lead byte boundary. Round down to that.
3322e5b6d6dSopenharmony_ci        p &= 0xff00;
3332e5b6d6dSopenharmony_ci    }
3342e5b6d6dSopenharmony_ci    // Script starts should be added in ascending order, otherwise we would need to sort them.
3352e5b6d6dSopenharmony_ci    if(script < UCOL_REORDER_CODE_FIRST) {
3362e5b6d6dSopenharmony_ci        U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
3372e5b6d6dSopenharmony_ci    } else {
3382e5b6d6dSopenharmony_ci        U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
3392e5b6d6dSopenharmony_ci        script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
3402e5b6d6dSopenharmony_ci    }
3412e5b6d6dSopenharmony_ci    if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
3422e5b6d6dSopenharmony_ci        // Two scripts share a range (e.g., Hira & Kana).
3432e5b6d6dSopenharmony_ci        scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
3442e5b6d6dSopenharmony_ci    } else {
3452e5b6d6dSopenharmony_ci        U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
3462e5b6d6dSopenharmony_ci        U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
3472e5b6d6dSopenharmony_ci        scriptsIndex[script] = (uint16_t)scriptStartsLength;
3482e5b6d6dSopenharmony_ci        scriptStarts[scriptStartsLength++] = (uint16_t)p;
3492e5b6d6dSopenharmony_ci    }
3502e5b6d6dSopenharmony_ci    if(script == USCRIPT_UNKNOWN) {
3512e5b6d6dSopenharmony_ci        // The last script start is for unassigned code points
3522e5b6d6dSopenharmony_ci        // (with high implicit primary weights).
3532e5b6d6dSopenharmony_ci        // Add one more entry with the limit of this range,
3542e5b6d6dSopenharmony_ci        // which is the start of the trailing-weights range.
3552e5b6d6dSopenharmony_ci        U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
3562e5b6d6dSopenharmony_ci        scriptStarts[scriptStartsLength++] =
3572e5b6d6dSopenharmony_ci                (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
3582e5b6d6dSopenharmony_ci    }
3592e5b6d6dSopenharmony_ci}
3602e5b6d6dSopenharmony_ci
3612e5b6d6dSopenharmony_civoid
3622e5b6d6dSopenharmony_ciCollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
3632e5b6d6dSopenharmony_ci    buildMappings(data, errorCode);
3642e5b6d6dSopenharmony_ci    data.numericPrimary = numericPrimary;
3652e5b6d6dSopenharmony_ci    data.compressibleBytes = compressibleBytes;
3662e5b6d6dSopenharmony_ci
3672e5b6d6dSopenharmony_ci    int32_t numScripts = USCRIPT_CODE_LIMIT;
3682e5b6d6dSopenharmony_ci    while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
3692e5b6d6dSopenharmony_ci    // Move the 16 special groups (not all used)
3702e5b6d6dSopenharmony_ci    // down for contiguous storage of the script and special-group indexes.
3712e5b6d6dSopenharmony_ci    for(int32_t i = 0; i < 16; ++i) {
3722e5b6d6dSopenharmony_ci        scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
3732e5b6d6dSopenharmony_ci    }
3742e5b6d6dSopenharmony_ci    data.numScripts = numScripts;
3752e5b6d6dSopenharmony_ci    data.scriptsIndex = scriptsIndex;
3762e5b6d6dSopenharmony_ci    data.scriptStarts = scriptStarts;
3772e5b6d6dSopenharmony_ci    data.scriptStartsLength = scriptStartsLength;
3782e5b6d6dSopenharmony_ci    buildFastLatinTable(data, errorCode);
3792e5b6d6dSopenharmony_ci}
3802e5b6d6dSopenharmony_ci
3812e5b6d6dSopenharmony_civoid
3822e5b6d6dSopenharmony_ciCollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
3832e5b6d6dSopenharmony_ci    // Limit sentinel for root elements.
3842e5b6d6dSopenharmony_ci    // This allows us to reduce range checks at runtime.
3852e5b6d6dSopenharmony_ci    rootElements.addElement(Collation::makeCE(CollationRootElements::PRIMARY_SENTINEL), errorCode);
3862e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return; }
3872e5b6d6dSopenharmony_ci    uint32_t nextHanPrimary = firstHanPrimary;  // Set to 0xffffffff after the last Han range.
3882e5b6d6dSopenharmony_ci    uint32_t prevPrimary = 0;  // Start with primary ignorable CEs.
3892e5b6d6dSopenharmony_ci    UBool needCommonSecTerUnit = false;
3902e5b6d6dSopenharmony_ci    UBool hasDeltaUnit = false;
3912e5b6d6dSopenharmony_ci    for(int32_t i = 0; i < rootElements.size(); ++i) {
3922e5b6d6dSopenharmony_ci        int64_t ce = rootElements.elementAti(i);
3932e5b6d6dSopenharmony_ci        uint32_t p = (uint32_t)(ce >> 32);
3942e5b6d6dSopenharmony_ci        uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
3952e5b6d6dSopenharmony_ci        if((p != prevPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE) && needCommonSecTerUnit) {
3962e5b6d6dSopenharmony_ci            // The last primary had low sec/ter weights but no common sec/ter combination.
3972e5b6d6dSopenharmony_ci            // The next unit is either a new primary or an above-common sec/ter unit.
3982e5b6d6dSopenharmony_ci            // Insert a common sec/ter unit so that the builder will reliably
3992e5b6d6dSopenharmony_ci            // tailor to either before or after a common weight but not across it.
4002e5b6d6dSopenharmony_ci            table.addElement((int32_t)Collation::COMMON_SEC_AND_TER_CE |
4012e5b6d6dSopenharmony_ci                            CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
4022e5b6d6dSopenharmony_ci        }
4032e5b6d6dSopenharmony_ci        if(p != prevPrimary) {
4042e5b6d6dSopenharmony_ci            U_ASSERT((p & 0xff) == 0);
4052e5b6d6dSopenharmony_ci            int32_t end;
4062e5b6d6dSopenharmony_ci            if(p >= nextHanPrimary) {
4072e5b6d6dSopenharmony_ci                // Add a Han primary weight or range.
4082e5b6d6dSopenharmony_ci                // We omitted them initially, and omitted all CEs with Han primaries
4092e5b6d6dSopenharmony_ci                // and common secondary/tertiary weights.
4102e5b6d6dSopenharmony_ci                U_ASSERT(p > lastHanPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE);
4112e5b6d6dSopenharmony_ci                if(p == nextHanPrimary) {
4122e5b6d6dSopenharmony_ci                    // One single Han primary with non-common secondary/tertiary weights.
4132e5b6d6dSopenharmony_ci                    table.addElement((int32_t)p, errorCode);
4142e5b6d6dSopenharmony_ci                    if(p < lastHanPrimary) {
4152e5b6d6dSopenharmony_ci                        // Prepare for the next Han range.
4162e5b6d6dSopenharmony_ci                        nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep);
4172e5b6d6dSopenharmony_ci                    } else {
4182e5b6d6dSopenharmony_ci                        // p is the last Han primary.
4192e5b6d6dSopenharmony_ci                        nextHanPrimary = 0xffffffff;
4202e5b6d6dSopenharmony_ci                    }
4212e5b6d6dSopenharmony_ci                } else {
4222e5b6d6dSopenharmony_ci                    // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
4232e5b6d6dSopenharmony_ci                    table.addElement((int32_t)nextHanPrimary, errorCode);
4242e5b6d6dSopenharmony_ci                    if(nextHanPrimary == lastHanPrimary) {
4252e5b6d6dSopenharmony_ci                        // nextHanPrimary == lastHanPrimary < p
4262e5b6d6dSopenharmony_ci                        // We just wrote the single last Han primary.
4272e5b6d6dSopenharmony_ci                        nextHanPrimary = 0xffffffff;
4282e5b6d6dSopenharmony_ci                        table.addElement((int32_t)p, errorCode);
4292e5b6d6dSopenharmony_ci                    } else if(p < lastHanPrimary) {
4302e5b6d6dSopenharmony_ci                        // nextHanPrimary < p < lastHanPrimary
4312e5b6d6dSopenharmony_ci                        // End the Han range on p, prepare for the next range.
4322e5b6d6dSopenharmony_ci                        table.addElement((int32_t)p | hanStep, errorCode);
4332e5b6d6dSopenharmony_ci                        nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep);
4342e5b6d6dSopenharmony_ci                    } else if(p == lastHanPrimary) {
4352e5b6d6dSopenharmony_ci                        // nextHanPrimary < p == lastHanPrimary
4362e5b6d6dSopenharmony_ci                        // End the last Han range on p.
4372e5b6d6dSopenharmony_ci                        table.addElement((int32_t)p | hanStep, errorCode);
4382e5b6d6dSopenharmony_ci                        nextHanPrimary = 0xffffffff;
4392e5b6d6dSopenharmony_ci                    } else {
4402e5b6d6dSopenharmony_ci                        // nextHanPrimary < lastHanPrimary < p
4412e5b6d6dSopenharmony_ci                        // End the last Han range, then write p.
4422e5b6d6dSopenharmony_ci                        table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
4432e5b6d6dSopenharmony_ci                        nextHanPrimary = 0xffffffff;
4442e5b6d6dSopenharmony_ci                        table.addElement((int32_t)p, errorCode);
4452e5b6d6dSopenharmony_ci                    }
4462e5b6d6dSopenharmony_ci                }
4472e5b6d6dSopenharmony_ci            } else if(prevPrimary != 0 &&
4482e5b6d6dSopenharmony_ci                    // If there has not been an intervening delta unit,
4492e5b6d6dSopenharmony_ci                    // then we will try to combine the previous primary and
4502e5b6d6dSopenharmony_ci                    // the next several primaries into a range.
4512e5b6d6dSopenharmony_ci                    !hasDeltaUnit &&
4522e5b6d6dSopenharmony_ci                    // Might get a range with more than two primaries if the current CE
4532e5b6d6dSopenharmony_ci                    // has common sec/ter weights.
4542e5b6d6dSopenharmony_ci                    secTer == Collation::COMMON_SEC_AND_TER_CE &&
4552e5b6d6dSopenharmony_ci                    (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
4562e5b6d6dSopenharmony_ci                // Multiple CEs with only common secondary/tertiary weights were
4572e5b6d6dSopenharmony_ci                // combined into a primary range.
4582e5b6d6dSopenharmony_ci                // The range end was written, ending with the primary of rootElements[end].
4592e5b6d6dSopenharmony_ci                ce = rootElements.elementAti(end);
4602e5b6d6dSopenharmony_ci                p = (uint32_t)(ce >> 32);
4612e5b6d6dSopenharmony_ci                secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
4622e5b6d6dSopenharmony_ci                i = end;
4632e5b6d6dSopenharmony_ci            } else {
4642e5b6d6dSopenharmony_ci                // Write the primary weight of a normal CE.
4652e5b6d6dSopenharmony_ci                table.addElement((int32_t)p, errorCode);
4662e5b6d6dSopenharmony_ci            }
4672e5b6d6dSopenharmony_ci            prevPrimary = p;
4682e5b6d6dSopenharmony_ci            needCommonSecTerUnit = false;
4692e5b6d6dSopenharmony_ci            hasDeltaUnit = false;
4702e5b6d6dSopenharmony_ci        }
4712e5b6d6dSopenharmony_ci        if(secTer == Collation::COMMON_SEC_AND_TER_CE && !needCommonSecTerUnit) {
4722e5b6d6dSopenharmony_ci            // The common secondar/tertiary weights are implied in the primary unit.
4732e5b6d6dSopenharmony_ci        } else {
4742e5b6d6dSopenharmony_ci            if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
4752e5b6d6dSopenharmony_ci                // Remember to not suppress a common sec/ter unit if p!=0.
4762e5b6d6dSopenharmony_ci                needCommonSecTerUnit = p != 0;
4772e5b6d6dSopenharmony_ci            } else if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
4782e5b6d6dSopenharmony_ci                // Real common sec/ter unit, no need to insert an artificial one.
4792e5b6d6dSopenharmony_ci                needCommonSecTerUnit = false;
4802e5b6d6dSopenharmony_ci            }
4812e5b6d6dSopenharmony_ci            // For each new set of secondary/tertiary weights we write a delta unit.
4822e5b6d6dSopenharmony_ci            table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
4832e5b6d6dSopenharmony_ci            hasDeltaUnit = true;
4842e5b6d6dSopenharmony_ci        }
4852e5b6d6dSopenharmony_ci    }
4862e5b6d6dSopenharmony_ci}
4872e5b6d6dSopenharmony_ci
4882e5b6d6dSopenharmony_ciint32_t
4892e5b6d6dSopenharmony_ciCollationBaseDataBuilder::writeRootElementsRange(
4902e5b6d6dSopenharmony_ci        uint32_t prevPrimary, uint32_t p, int32_t i,
4912e5b6d6dSopenharmony_ci        UVector32 &table, UErrorCode &errorCode) {
4922e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
4932e5b6d6dSopenharmony_ci    U_ASSERT(prevPrimary < p);
4942e5b6d6dSopenharmony_ci    // No ranges of single-byte primaries.
4952e5b6d6dSopenharmony_ci    if((p & prevPrimary & 0xff0000) == 0) { return 0; }
4962e5b6d6dSopenharmony_ci    // Lead bytes of compressible primaries must match.
4972e5b6d6dSopenharmony_ci    UBool isCompressible = isCompressiblePrimary(p);
4982e5b6d6dSopenharmony_ci    if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
4992e5b6d6dSopenharmony_ci            (p & 0xff000000) != (prevPrimary & 0xff000000)) {
5002e5b6d6dSopenharmony_ci        return 0;
5012e5b6d6dSopenharmony_ci    }
5022e5b6d6dSopenharmony_ci    // Number of bytes in the primaries.
5032e5b6d6dSopenharmony_ci    UBool twoBytes;
5042e5b6d6dSopenharmony_ci    // Number of primaries from prevPrimary to p.
5052e5b6d6dSopenharmony_ci    int32_t step;
5062e5b6d6dSopenharmony_ci    if((p & 0xff00) == 0) {
5072e5b6d6dSopenharmony_ci        // 2-byte primary
5082e5b6d6dSopenharmony_ci        if((prevPrimary & 0xff00) != 0) { return 0; }  // length mismatch
5092e5b6d6dSopenharmony_ci        twoBytes = true;
5102e5b6d6dSopenharmony_ci        step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
5112e5b6d6dSopenharmony_ci    } else {
5122e5b6d6dSopenharmony_ci        // 3-byte primary
5132e5b6d6dSopenharmony_ci        if((prevPrimary & 0xff00) == 0) { return 0; }  // length mismatch
5142e5b6d6dSopenharmony_ci        twoBytes = false;
5152e5b6d6dSopenharmony_ci        step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
5162e5b6d6dSopenharmony_ci    }
5172e5b6d6dSopenharmony_ci    if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
5182e5b6d6dSopenharmony_ci    // See if there are more than two CEs with primaries increasing by "step"
5192e5b6d6dSopenharmony_ci    // and with only common secondary/tertiary weights on all but the last one.
5202e5b6d6dSopenharmony_ci    int32_t end = 0;  // Initially 0: No range for just two primaries.
5212e5b6d6dSopenharmony_ci    for(;;) {
5222e5b6d6dSopenharmony_ci        prevPrimary = p;
5232e5b6d6dSopenharmony_ci        // Calculate which primary we expect next.
5242e5b6d6dSopenharmony_ci        uint32_t nextPrimary;  // = p + step
5252e5b6d6dSopenharmony_ci        if(twoBytes) {
5262e5b6d6dSopenharmony_ci            nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
5272e5b6d6dSopenharmony_ci        } else {
5282e5b6d6dSopenharmony_ci            nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
5292e5b6d6dSopenharmony_ci        }
5302e5b6d6dSopenharmony_ci        // Fetch the actual next CE.
5312e5b6d6dSopenharmony_ci        int64_t ce = rootElements.elementAti(i);
5322e5b6d6dSopenharmony_ci        p = (uint32_t)(ce >> 32);
5332e5b6d6dSopenharmony_ci        uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
5342e5b6d6dSopenharmony_ci        // Does this primary increase by "step" from the last one?
5352e5b6d6dSopenharmony_ci        if(p != nextPrimary ||
5362e5b6d6dSopenharmony_ci                // Do not cross into a new lead byte if either is compressible.
5372e5b6d6dSopenharmony_ci                ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
5382e5b6d6dSopenharmony_ci                    (isCompressible || isCompressiblePrimary(p)))) {
5392e5b6d6dSopenharmony_ci            // The range ends with the previous CE.
5402e5b6d6dSopenharmony_ci            p = prevPrimary;
5412e5b6d6dSopenharmony_ci            break;
5422e5b6d6dSopenharmony_ci        }
5432e5b6d6dSopenharmony_ci        // Extend the range to include this primary.
5442e5b6d6dSopenharmony_ci        end = i++;
5452e5b6d6dSopenharmony_ci        // This primary is the last in the range if it has non-common weights
5462e5b6d6dSopenharmony_ci        // or if we are at the end of the list.
5472e5b6d6dSopenharmony_ci        if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
5482e5b6d6dSopenharmony_ci    }
5492e5b6d6dSopenharmony_ci    if(end != 0) {
5502e5b6d6dSopenharmony_ci        table.addElement((int32_t)p | step, errorCode);
5512e5b6d6dSopenharmony_ci    }
5522e5b6d6dSopenharmony_ci    return end;
5532e5b6d6dSopenharmony_ci}
5542e5b6d6dSopenharmony_ci
5552e5b6d6dSopenharmony_ciU_NAMESPACE_END
5562e5b6d6dSopenharmony_ci
5572e5b6d6dSopenharmony_ci#endif  // !UCONFIG_NO_COLLATION
558