12e5b6d6dSopenharmony_ci// © 2017 and later: Unicode, Inc. and others. 22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 32e5b6d6dSopenharmony_ci/* 42e5b6d6dSopenharmony_ci******************************************************************************* 52e5b6d6dSopenharmony_ci* Copyright (C) 2012-2015, International Business Machines 62e5b6d6dSopenharmony_ci* Corporation and others. All Rights Reserved. 72e5b6d6dSopenharmony_ci******************************************************************************* 82e5b6d6dSopenharmony_ci* collationbasedatabuilder.cpp 92e5b6d6dSopenharmony_ci* 102e5b6d6dSopenharmony_ci* created on: 2012aug11 112e5b6d6dSopenharmony_ci* created by: Markus W. Scherer 122e5b6d6dSopenharmony_ci*/ 132e5b6d6dSopenharmony_ci 142e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 152e5b6d6dSopenharmony_ci 162e5b6d6dSopenharmony_ci#if !UCONFIG_NO_COLLATION 172e5b6d6dSopenharmony_ci 182e5b6d6dSopenharmony_ci#include "unicode/localpointer.h" 192e5b6d6dSopenharmony_ci#include "unicode/ucharstriebuilder.h" 202e5b6d6dSopenharmony_ci#include "unicode/uniset.h" 212e5b6d6dSopenharmony_ci#include "unicode/unistr.h" 222e5b6d6dSopenharmony_ci#include "unicode/utf16.h" 232e5b6d6dSopenharmony_ci#include "collation.h" 242e5b6d6dSopenharmony_ci#include "collationbasedatabuilder.h" 252e5b6d6dSopenharmony_ci#include "collationdata.h" 262e5b6d6dSopenharmony_ci#include "collationdatabuilder.h" 272e5b6d6dSopenharmony_ci#include "collationrootelements.h" 282e5b6d6dSopenharmony_ci#include "normalizer2impl.h" 292e5b6d6dSopenharmony_ci#include "uassert.h" 302e5b6d6dSopenharmony_ci#include "utrie2.h" 312e5b6d6dSopenharmony_ci#include "uvectr32.h" 322e5b6d6dSopenharmony_ci#include "uvectr64.h" 332e5b6d6dSopenharmony_ci#include "uvector.h" 342e5b6d6dSopenharmony_ci 352e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN 362e5b6d6dSopenharmony_ci 372e5b6d6dSopenharmony_cinamespace { 382e5b6d6dSopenharmony_ci 392e5b6d6dSopenharmony_ci/** 402e5b6d6dSopenharmony_ci * Compare two signed int64_t values as if they were unsigned. 412e5b6d6dSopenharmony_ci */ 422e5b6d6dSopenharmony_ciint32_t 432e5b6d6dSopenharmony_cicompareInt64AsUnsigned(int64_t a, int64_t b) { 442e5b6d6dSopenharmony_ci if((uint64_t)a < (uint64_t)b) { 452e5b6d6dSopenharmony_ci return -1; 462e5b6d6dSopenharmony_ci } else if((uint64_t)a > (uint64_t)b) { 472e5b6d6dSopenharmony_ci return 1; 482e5b6d6dSopenharmony_ci } else { 492e5b6d6dSopenharmony_ci return 0; 502e5b6d6dSopenharmony_ci } 512e5b6d6dSopenharmony_ci} 522e5b6d6dSopenharmony_ci 532e5b6d6dSopenharmony_ci// TODO: Try to merge this with the binarySearch in alphaindex.cpp. 542e5b6d6dSopenharmony_ci/** 552e5b6d6dSopenharmony_ci * Like Java Collections.binarySearch(List, String, Comparator). 562e5b6d6dSopenharmony_ci * 572e5b6d6dSopenharmony_ci * @return the index>=0 where the item was found, 582e5b6d6dSopenharmony_ci * or the index<0 for inserting the string at ~index in sorted order 592e5b6d6dSopenharmony_ci */ 602e5b6d6dSopenharmony_ciint32_t 612e5b6d6dSopenharmony_cibinarySearch(const UVector64 &list, int64_t ce) { 622e5b6d6dSopenharmony_ci if (list.size() == 0) { return ~0; } 632e5b6d6dSopenharmony_ci int32_t start = 0; 642e5b6d6dSopenharmony_ci int32_t limit = list.size(); 652e5b6d6dSopenharmony_ci for (;;) { 662e5b6d6dSopenharmony_ci int32_t i = (start + limit) / 2; 672e5b6d6dSopenharmony_ci int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i)); 682e5b6d6dSopenharmony_ci if (cmp == 0) { 692e5b6d6dSopenharmony_ci return i; 702e5b6d6dSopenharmony_ci } else if (cmp < 0) { 712e5b6d6dSopenharmony_ci if (i == start) { 722e5b6d6dSopenharmony_ci return ~start; // insert ce before i 732e5b6d6dSopenharmony_ci } 742e5b6d6dSopenharmony_ci limit = i; 752e5b6d6dSopenharmony_ci } else { 762e5b6d6dSopenharmony_ci if (i == start) { 772e5b6d6dSopenharmony_ci return ~(start + 1); // insert ce after i 782e5b6d6dSopenharmony_ci } 792e5b6d6dSopenharmony_ci start = i; 802e5b6d6dSopenharmony_ci } 812e5b6d6dSopenharmony_ci } 822e5b6d6dSopenharmony_ci} 832e5b6d6dSopenharmony_ci 842e5b6d6dSopenharmony_ci} // namespace 852e5b6d6dSopenharmony_ci 862e5b6d6dSopenharmony_ciCollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode) 872e5b6d6dSopenharmony_ci : CollationDataBuilder(icu4xMode, errorCode), 882e5b6d6dSopenharmony_ci numericPrimary(0x12000000), 892e5b6d6dSopenharmony_ci firstHanPrimary(0), lastHanPrimary(0), hanStep(2), 902e5b6d6dSopenharmony_ci rootElements(errorCode), 912e5b6d6dSopenharmony_ci scriptStartsLength(1) { 922e5b6d6dSopenharmony_ci uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex)); 932e5b6d6dSopenharmony_ci uprv_memset(scriptStarts, 0, sizeof(scriptStarts)); 942e5b6d6dSopenharmony_ci this->icu4xMode = icu4xMode; 952e5b6d6dSopenharmony_ci} 962e5b6d6dSopenharmony_ci 972e5b6d6dSopenharmony_ciCollationBaseDataBuilder::~CollationBaseDataBuilder() { 982e5b6d6dSopenharmony_ci} 992e5b6d6dSopenharmony_ci 1002e5b6d6dSopenharmony_civoid 1012e5b6d6dSopenharmony_ciCollationBaseDataBuilder::init(UErrorCode &errorCode) { 1022e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return; } 1032e5b6d6dSopenharmony_ci if(trie != NULL) { 1042e5b6d6dSopenharmony_ci errorCode = U_INVALID_STATE_ERROR; 1052e5b6d6dSopenharmony_ci return; 1062e5b6d6dSopenharmony_ci } 1072e5b6d6dSopenharmony_ci 1082e5b6d6dSopenharmony_ci // Not compressible: 1092e5b6d6dSopenharmony_ci // - digits 1102e5b6d6dSopenharmony_ci // - Latin 1112e5b6d6dSopenharmony_ci // - Hani 1122e5b6d6dSopenharmony_ci // - trail weights 1132e5b6d6dSopenharmony_ci // Some scripts are compressible, some are not. 1142e5b6d6dSopenharmony_ci uprv_memset(compressibleBytes, false, 256); 1152e5b6d6dSopenharmony_ci compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = true; 1162e5b6d6dSopenharmony_ci 1172e5b6d6dSopenharmony_ci // For a base, the default is to compute an unassigned-character implicit CE. 1182e5b6d6dSopenharmony_ci // This includes surrogate code points; see the last option in 1192e5b6d6dSopenharmony_ci // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences. 1202e5b6d6dSopenharmony_ci trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode); 1212e5b6d6dSopenharmony_ci 1222e5b6d6dSopenharmony_ci // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches. 1232e5b6d6dSopenharmony_ci // In the ICU4X case, only preallocate ASCII, because we don't store CE32s for 1242e5b6d6dSopenharmony_ci // precomposed characters. 1252e5b6d6dSopenharmony_ci for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) { 1262e5b6d6dSopenharmony_ci utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode); 1272e5b6d6dSopenharmony_ci } 1282e5b6d6dSopenharmony_ci 1292e5b6d6dSopenharmony_ci utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode); 1302e5b6d6dSopenharmony_ci // No root element for the merge separator which has 02 weights. 1312e5b6d6dSopenharmony_ci // Some code assumes that the root first primary CE is the "space first primary" 1322e5b6d6dSopenharmony_ci // from FractionalUCA.txt. 1332e5b6d6dSopenharmony_ci 1342e5b6d6dSopenharmony_ci if (!icu4xMode) { 1352e5b6d6dSopenharmony_ci uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); 1362e5b6d6dSopenharmony_ci utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode); 1372e5b6d6dSopenharmony_ci } 1382e5b6d6dSopenharmony_ci 1392e5b6d6dSopenharmony_ci // Add a mapping for the first-unassigned boundary, 1402e5b6d6dSopenharmony_ci // which is the AlphabeticIndex overflow boundary. 1412e5b6d6dSopenharmony_ci UnicodeString s((UChar)0xfdd1); // Script boundary contractions start with U+FDD1. 1422e5b6d6dSopenharmony_ci s.append((UChar)0xfdd0); // Zzzz script sample character U+FDD0. 1432e5b6d6dSopenharmony_ci int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY); 1442e5b6d6dSopenharmony_ci add(UnicodeString(), s, &ce, 1, errorCode); 1452e5b6d6dSopenharmony_ci 1462e5b6d6dSopenharmony_ci // Add a tailoring boundary, but not a mapping, for [first trailing]. 1472e5b6d6dSopenharmony_ci ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 1482e5b6d6dSopenharmony_ci rootElements.addElement(ce, errorCode); 1492e5b6d6dSopenharmony_ci 1502e5b6d6dSopenharmony_ci // U+FFFD maps to a CE with the third-highest primary weight, 1512e5b6d6dSopenharmony_ci // for predictable handling of ill-formed UTF-8. 1522e5b6d6dSopenharmony_ci uint32_t ce32 = Collation::FFFD_CE32; 1532e5b6d6dSopenharmony_ci utrie2_set32(trie, 0xfffd, ce32, &errorCode); 1542e5b6d6dSopenharmony_ci addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode); 1552e5b6d6dSopenharmony_ci 1562e5b6d6dSopenharmony_ci // U+FFFF maps to a CE with the highest primary weight. 1572e5b6d6dSopenharmony_ci ce32 = Collation::MAX_REGULAR_CE32; 1582e5b6d6dSopenharmony_ci utrie2_set32(trie, 0xffff, ce32, &errorCode); 1592e5b6d6dSopenharmony_ci addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode); 1602e5b6d6dSopenharmony_ci} 1612e5b6d6dSopenharmony_ci 1622e5b6d6dSopenharmony_civoid 1632e5b6d6dSopenharmony_ciCollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length, 1642e5b6d6dSopenharmony_ci UErrorCode &errorCode) { 1652e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode) || length == 0) { return; } 1662e5b6d6dSopenharmony_ci if((length & 1) != 0) { // incomplete start/end pairs 1672e5b6d6dSopenharmony_ci errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1682e5b6d6dSopenharmony_ci return; 1692e5b6d6dSopenharmony_ci } 1702e5b6d6dSopenharmony_ci if(isAssigned(0x4e00)) { // already set 1712e5b6d6dSopenharmony_ci errorCode = U_INVALID_STATE_ERROR; 1722e5b6d6dSopenharmony_ci return; 1732e5b6d6dSopenharmony_ci } 1742e5b6d6dSopenharmony_ci int32_t numHanCodePoints = 0; 1752e5b6d6dSopenharmony_ci for(int32_t i = 0; i < length; i += 2) { 1762e5b6d6dSopenharmony_ci UChar32 start = ranges[i]; 1772e5b6d6dSopenharmony_ci UChar32 end = ranges[i + 1]; 1782e5b6d6dSopenharmony_ci numHanCodePoints += end - start + 1; 1792e5b6d6dSopenharmony_ci } 1802e5b6d6dSopenharmony_ci // Multiply the number of code points by (gap+1). 1812e5b6d6dSopenharmony_ci // Add hanStep+2 for tailoring after the last Han character. 1822e5b6d6dSopenharmony_ci int32_t gap = 1; 1832e5b6d6dSopenharmony_ci hanStep = gap + 1; 1842e5b6d6dSopenharmony_ci int32_t numHan = numHanCodePoints * hanStep + hanStep + 2; 1852e5b6d6dSopenharmony_ci // Numbers of Han primaries per lead byte determined by 1862e5b6d6dSopenharmony_ci // numbers of 2nd (not compressible) times 3rd primary byte values. 1872e5b6d6dSopenharmony_ci int32_t numHanPerLeadByte = 254 * 254; 1882e5b6d6dSopenharmony_ci int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte; 1892e5b6d6dSopenharmony_ci uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24; 1902e5b6d6dSopenharmony_ci hanPrimary |= 0x20200; 1912e5b6d6dSopenharmony_ci firstHanPrimary = hanPrimary; 1922e5b6d6dSopenharmony_ci for(int32_t i = 0; i < length; i += 2) { 1932e5b6d6dSopenharmony_ci UChar32 start = ranges[i]; 1942e5b6d6dSopenharmony_ci UChar32 end = ranges[i + 1]; 1952e5b6d6dSopenharmony_ci hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode); 1962e5b6d6dSopenharmony_ci } 1972e5b6d6dSopenharmony_ci // One past the actual last one, but that is harmless for tailoring. 1982e5b6d6dSopenharmony_ci // It saves us from subtracting "hanStep" and handling underflows. 1992e5b6d6dSopenharmony_ci lastHanPrimary = hanPrimary; 2002e5b6d6dSopenharmony_ci} 2012e5b6d6dSopenharmony_ci 2022e5b6d6dSopenharmony_ciUBool 2032e5b6d6dSopenharmony_ciCollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const { 2042e5b6d6dSopenharmony_ci return compressibleBytes[b]; 2052e5b6d6dSopenharmony_ci} 2062e5b6d6dSopenharmony_ci 2072e5b6d6dSopenharmony_civoid 2082e5b6d6dSopenharmony_ciCollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) { 2092e5b6d6dSopenharmony_ci compressibleBytes[b] = true; 2102e5b6d6dSopenharmony_ci} 2112e5b6d6dSopenharmony_ci 2122e5b6d6dSopenharmony_ciint32_t 2132e5b6d6dSopenharmony_ciCollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) { 2142e5b6d6dSopenharmony_ci if((p1 & 0xff000000) == (p2 & 0xff000000)) { 2152e5b6d6dSopenharmony_ci // Same lead bytes. 2162e5b6d6dSopenharmony_ci return (int32_t)(p2 - p1) >> 16; 2172e5b6d6dSopenharmony_ci } else { 2182e5b6d6dSopenharmony_ci int32_t linear1; 2192e5b6d6dSopenharmony_ci int32_t linear2; 2202e5b6d6dSopenharmony_ci int32_t factor; 2212e5b6d6dSopenharmony_ci if(isCompressible) { 2222e5b6d6dSopenharmony_ci // Second byte for compressible lead byte: 251 bytes 04..FE 2232e5b6d6dSopenharmony_ci linear1 = (int32_t)((p1 >> 16) & 0xff) - 4; 2242e5b6d6dSopenharmony_ci linear2 = (int32_t)((p2 >> 16) & 0xff) - 4; 2252e5b6d6dSopenharmony_ci factor = 251; 2262e5b6d6dSopenharmony_ci } else { 2272e5b6d6dSopenharmony_ci // Second byte for incompressible lead byte: 254 bytes 02..FF 2282e5b6d6dSopenharmony_ci linear1 = (int32_t)((p1 >> 16) & 0xff) - 2; 2292e5b6d6dSopenharmony_ci linear2 = (int32_t)((p2 >> 16) & 0xff) - 2; 2302e5b6d6dSopenharmony_ci factor = 254; 2312e5b6d6dSopenharmony_ci } 2322e5b6d6dSopenharmony_ci linear1 += factor * (int32_t)((p1 >> 24) & 0xff); 2332e5b6d6dSopenharmony_ci linear2 += factor * (int32_t)((p2 >> 24) & 0xff); 2342e5b6d6dSopenharmony_ci return linear2 - linear1; 2352e5b6d6dSopenharmony_ci } 2362e5b6d6dSopenharmony_ci} 2372e5b6d6dSopenharmony_ci 2382e5b6d6dSopenharmony_ciint32_t 2392e5b6d6dSopenharmony_ciCollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) { 2402e5b6d6dSopenharmony_ci if((p1 & 0xffff0000) == (p2 & 0xffff0000)) { 2412e5b6d6dSopenharmony_ci // Same first two bytes. 2422e5b6d6dSopenharmony_ci return (int32_t)(p2 - p1) >> 8; 2432e5b6d6dSopenharmony_ci } else { 2442e5b6d6dSopenharmony_ci // Third byte: 254 bytes 02..FF 2452e5b6d6dSopenharmony_ci int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2; 2462e5b6d6dSopenharmony_ci int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2; 2472e5b6d6dSopenharmony_ci int32_t factor; 2482e5b6d6dSopenharmony_ci if(isCompressible) { 2492e5b6d6dSopenharmony_ci // Second byte for compressible lead byte: 251 bytes 04..FE 2502e5b6d6dSopenharmony_ci linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4); 2512e5b6d6dSopenharmony_ci linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4); 2522e5b6d6dSopenharmony_ci factor = 251 * 254; 2532e5b6d6dSopenharmony_ci } else { 2542e5b6d6dSopenharmony_ci // Second byte for incompressible lead byte: 254 bytes 02..FF 2552e5b6d6dSopenharmony_ci linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2); 2562e5b6d6dSopenharmony_ci linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2); 2572e5b6d6dSopenharmony_ci factor = 254 * 254; 2582e5b6d6dSopenharmony_ci } 2592e5b6d6dSopenharmony_ci linear1 += factor * (int32_t)((p1 >> 24) & 0xff); 2602e5b6d6dSopenharmony_ci linear2 += factor * (int32_t)((p2 >> 24) & 0xff); 2612e5b6d6dSopenharmony_ci return linear2 - linear1; 2622e5b6d6dSopenharmony_ci } 2632e5b6d6dSopenharmony_ci} 2642e5b6d6dSopenharmony_ci 2652e5b6d6dSopenharmony_ciuint32_t 2662e5b6d6dSopenharmony_ciCollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) { 2672e5b6d6dSopenharmony_ci addRootElements(ces, cesLength, errorCode); 2682e5b6d6dSopenharmony_ci return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode); 2692e5b6d6dSopenharmony_ci} 2702e5b6d6dSopenharmony_ci 2712e5b6d6dSopenharmony_civoid 2722e5b6d6dSopenharmony_ciCollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength, 2732e5b6d6dSopenharmony_ci UErrorCode &errorCode) { 2742e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return; } 2752e5b6d6dSopenharmony_ci for(int32_t i = 0; i < cesLength; ++i) { 2762e5b6d6dSopenharmony_ci addRootElement(ces[i], errorCode); 2772e5b6d6dSopenharmony_ci } 2782e5b6d6dSopenharmony_ci} 2792e5b6d6dSopenharmony_ci 2802e5b6d6dSopenharmony_civoid 2812e5b6d6dSopenharmony_ciCollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) { 2822e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode) || ce == 0) { return; } 2832e5b6d6dSopenharmony_ci // Remove case bits. 2842e5b6d6dSopenharmony_ci ce &= INT64_C(0xffffffffffff3fff); 2852e5b6d6dSopenharmony_ci U_ASSERT((ce & 0xc0) == 0); // quaternary==0 2862e5b6d6dSopenharmony_ci // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights. 2872e5b6d6dSopenharmony_ci // We will add it later, as part of the Han ranges. 2882e5b6d6dSopenharmony_ci uint32_t p = (uint32_t)(ce >> 32); 2892e5b6d6dSopenharmony_ci uint32_t secTer = (uint32_t)ce; 2902e5b6d6dSopenharmony_ci if(firstHanPrimary <= p && p <= lastHanPrimary) { 2912e5b6d6dSopenharmony_ci if(secTer < Collation::COMMON_SEC_AND_TER_CE) { 2922e5b6d6dSopenharmony_ci // buildRootElementsTable() does not currently handle this case. 2932e5b6d6dSopenharmony_ci errorCode = U_ILLEGAL_ARGUMENT_ERROR; 2942e5b6d6dSopenharmony_ci return; 2952e5b6d6dSopenharmony_ci } 2962e5b6d6dSopenharmony_ci if(secTer == Collation::COMMON_SEC_AND_TER_CE) { 2972e5b6d6dSopenharmony_ci return; 2982e5b6d6dSopenharmony_ci } 2992e5b6d6dSopenharmony_ci } 3002e5b6d6dSopenharmony_ci if(secTer != Collation::COMMON_SEC_AND_TER_CE) { // minor optimization 3012e5b6d6dSopenharmony_ci // Check that secondary and tertiary weights are > 01. 3022e5b6d6dSopenharmony_ci uint32_t s = secTer >> 16; 3032e5b6d6dSopenharmony_ci uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK; 3042e5b6d6dSopenharmony_ci if((s != 0 && s <= Collation::BEFORE_WEIGHT16) || 3052e5b6d6dSopenharmony_ci (t != 0 && t <= Collation::BEFORE_WEIGHT16)) { 3062e5b6d6dSopenharmony_ci errorCode = U_ILLEGAL_ARGUMENT_ERROR; 3072e5b6d6dSopenharmony_ci return; 3082e5b6d6dSopenharmony_ci } 3092e5b6d6dSopenharmony_ci } 3102e5b6d6dSopenharmony_ci // Check that primaries have at most 3 bytes. 3112e5b6d6dSopenharmony_ci if((p & 0xff) != 0) { 3122e5b6d6dSopenharmony_ci errorCode = U_ILLEGAL_ARGUMENT_ERROR; 3132e5b6d6dSopenharmony_ci return; 3142e5b6d6dSopenharmony_ci } 3152e5b6d6dSopenharmony_ci int32_t i = binarySearch(rootElements, ce); 3162e5b6d6dSopenharmony_ci if(i < 0) { 3172e5b6d6dSopenharmony_ci rootElements.insertElementAt(ce, ~i, errorCode); 3182e5b6d6dSopenharmony_ci } 3192e5b6d6dSopenharmony_ci} 3202e5b6d6dSopenharmony_ci 3212e5b6d6dSopenharmony_civoid 3222e5b6d6dSopenharmony_ciCollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) { 3232e5b6d6dSopenharmony_ci // The primary weight must be the lowest possible for a two-byte prefix. 3242e5b6d6dSopenharmony_ci // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary. 3252e5b6d6dSopenharmony_ci U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2); 3262e5b6d6dSopenharmony_ci p >>= 8; 3272e5b6d6dSopenharmony_ci U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2); 3282e5b6d6dSopenharmony_ci p >>= 8; 3292e5b6d6dSopenharmony_ci uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2; 3302e5b6d6dSopenharmony_ci if((p & 0xff) == lowestP2) { 3312e5b6d6dSopenharmony_ci // The script really starts on a lead byte boundary. Round down to that. 3322e5b6d6dSopenharmony_ci p &= 0xff00; 3332e5b6d6dSopenharmony_ci } 3342e5b6d6dSopenharmony_ci // Script starts should be added in ascending order, otherwise we would need to sort them. 3352e5b6d6dSopenharmony_ci if(script < UCOL_REORDER_CODE_FIRST) { 3362e5b6d6dSopenharmony_ci U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT); 3372e5b6d6dSopenharmony_ci } else { 3382e5b6d6dSopenharmony_ci U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15)); 3392e5b6d6dSopenharmony_ci script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST; 3402e5b6d6dSopenharmony_ci } 3412e5b6d6dSopenharmony_ci if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) { 3422e5b6d6dSopenharmony_ci // Two scripts share a range (e.g., Hira & Kana). 3432e5b6d6dSopenharmony_ci scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1); 3442e5b6d6dSopenharmony_ci } else { 3452e5b6d6dSopenharmony_ci U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p); 3462e5b6d6dSopenharmony_ci U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts)); 3472e5b6d6dSopenharmony_ci scriptsIndex[script] = (uint16_t)scriptStartsLength; 3482e5b6d6dSopenharmony_ci scriptStarts[scriptStartsLength++] = (uint16_t)p; 3492e5b6d6dSopenharmony_ci } 3502e5b6d6dSopenharmony_ci if(script == USCRIPT_UNKNOWN) { 3512e5b6d6dSopenharmony_ci // The last script start is for unassigned code points 3522e5b6d6dSopenharmony_ci // (with high implicit primary weights). 3532e5b6d6dSopenharmony_ci // Add one more entry with the limit of this range, 3542e5b6d6dSopenharmony_ci // which is the start of the trailing-weights range. 3552e5b6d6dSopenharmony_ci U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts)); 3562e5b6d6dSopenharmony_ci scriptStarts[scriptStartsLength++] = 3572e5b6d6dSopenharmony_ci (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00); 3582e5b6d6dSopenharmony_ci } 3592e5b6d6dSopenharmony_ci} 3602e5b6d6dSopenharmony_ci 3612e5b6d6dSopenharmony_civoid 3622e5b6d6dSopenharmony_ciCollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) { 3632e5b6d6dSopenharmony_ci buildMappings(data, errorCode); 3642e5b6d6dSopenharmony_ci data.numericPrimary = numericPrimary; 3652e5b6d6dSopenharmony_ci data.compressibleBytes = compressibleBytes; 3662e5b6d6dSopenharmony_ci 3672e5b6d6dSopenharmony_ci int32_t numScripts = USCRIPT_CODE_LIMIT; 3682e5b6d6dSopenharmony_ci while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; } 3692e5b6d6dSopenharmony_ci // Move the 16 special groups (not all used) 3702e5b6d6dSopenharmony_ci // down for contiguous storage of the script and special-group indexes. 3712e5b6d6dSopenharmony_ci for(int32_t i = 0; i < 16; ++i) { 3722e5b6d6dSopenharmony_ci scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i]; 3732e5b6d6dSopenharmony_ci } 3742e5b6d6dSopenharmony_ci data.numScripts = numScripts; 3752e5b6d6dSopenharmony_ci data.scriptsIndex = scriptsIndex; 3762e5b6d6dSopenharmony_ci data.scriptStarts = scriptStarts; 3772e5b6d6dSopenharmony_ci data.scriptStartsLength = scriptStartsLength; 3782e5b6d6dSopenharmony_ci buildFastLatinTable(data, errorCode); 3792e5b6d6dSopenharmony_ci} 3802e5b6d6dSopenharmony_ci 3812e5b6d6dSopenharmony_civoid 3822e5b6d6dSopenharmony_ciCollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) { 3832e5b6d6dSopenharmony_ci // Limit sentinel for root elements. 3842e5b6d6dSopenharmony_ci // This allows us to reduce range checks at runtime. 3852e5b6d6dSopenharmony_ci rootElements.addElement(Collation::makeCE(CollationRootElements::PRIMARY_SENTINEL), errorCode); 3862e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return; } 3872e5b6d6dSopenharmony_ci uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the last Han range. 3882e5b6d6dSopenharmony_ci uint32_t prevPrimary = 0; // Start with primary ignorable CEs. 3892e5b6d6dSopenharmony_ci UBool needCommonSecTerUnit = false; 3902e5b6d6dSopenharmony_ci UBool hasDeltaUnit = false; 3912e5b6d6dSopenharmony_ci for(int32_t i = 0; i < rootElements.size(); ++i) { 3922e5b6d6dSopenharmony_ci int64_t ce = rootElements.elementAti(i); 3932e5b6d6dSopenharmony_ci uint32_t p = (uint32_t)(ce >> 32); 3942e5b6d6dSopenharmony_ci uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; 3952e5b6d6dSopenharmony_ci if((p != prevPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE) && needCommonSecTerUnit) { 3962e5b6d6dSopenharmony_ci // The last primary had low sec/ter weights but no common sec/ter combination. 3972e5b6d6dSopenharmony_ci // The next unit is either a new primary or an above-common sec/ter unit. 3982e5b6d6dSopenharmony_ci // Insert a common sec/ter unit so that the builder will reliably 3992e5b6d6dSopenharmony_ci // tailor to either before or after a common weight but not across it. 4002e5b6d6dSopenharmony_ci table.addElement((int32_t)Collation::COMMON_SEC_AND_TER_CE | 4012e5b6d6dSopenharmony_ci CollationRootElements::SEC_TER_DELTA_FLAG, errorCode); 4022e5b6d6dSopenharmony_ci } 4032e5b6d6dSopenharmony_ci if(p != prevPrimary) { 4042e5b6d6dSopenharmony_ci U_ASSERT((p & 0xff) == 0); 4052e5b6d6dSopenharmony_ci int32_t end; 4062e5b6d6dSopenharmony_ci if(p >= nextHanPrimary) { 4072e5b6d6dSopenharmony_ci // Add a Han primary weight or range. 4082e5b6d6dSopenharmony_ci // We omitted them initially, and omitted all CEs with Han primaries 4092e5b6d6dSopenharmony_ci // and common secondary/tertiary weights. 4102e5b6d6dSopenharmony_ci U_ASSERT(p > lastHanPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE); 4112e5b6d6dSopenharmony_ci if(p == nextHanPrimary) { 4122e5b6d6dSopenharmony_ci // One single Han primary with non-common secondary/tertiary weights. 4132e5b6d6dSopenharmony_ci table.addElement((int32_t)p, errorCode); 4142e5b6d6dSopenharmony_ci if(p < lastHanPrimary) { 4152e5b6d6dSopenharmony_ci // Prepare for the next Han range. 4162e5b6d6dSopenharmony_ci nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep); 4172e5b6d6dSopenharmony_ci } else { 4182e5b6d6dSopenharmony_ci // p is the last Han primary. 4192e5b6d6dSopenharmony_ci nextHanPrimary = 0xffffffff; 4202e5b6d6dSopenharmony_ci } 4212e5b6d6dSopenharmony_ci } else { 4222e5b6d6dSopenharmony_ci // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary. 4232e5b6d6dSopenharmony_ci table.addElement((int32_t)nextHanPrimary, errorCode); 4242e5b6d6dSopenharmony_ci if(nextHanPrimary == lastHanPrimary) { 4252e5b6d6dSopenharmony_ci // nextHanPrimary == lastHanPrimary < p 4262e5b6d6dSopenharmony_ci // We just wrote the single last Han primary. 4272e5b6d6dSopenharmony_ci nextHanPrimary = 0xffffffff; 4282e5b6d6dSopenharmony_ci table.addElement((int32_t)p, errorCode); 4292e5b6d6dSopenharmony_ci } else if(p < lastHanPrimary) { 4302e5b6d6dSopenharmony_ci // nextHanPrimary < p < lastHanPrimary 4312e5b6d6dSopenharmony_ci // End the Han range on p, prepare for the next range. 4322e5b6d6dSopenharmony_ci table.addElement((int32_t)p | hanStep, errorCode); 4332e5b6d6dSopenharmony_ci nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep); 4342e5b6d6dSopenharmony_ci } else if(p == lastHanPrimary) { 4352e5b6d6dSopenharmony_ci // nextHanPrimary < p == lastHanPrimary 4362e5b6d6dSopenharmony_ci // End the last Han range on p. 4372e5b6d6dSopenharmony_ci table.addElement((int32_t)p | hanStep, errorCode); 4382e5b6d6dSopenharmony_ci nextHanPrimary = 0xffffffff; 4392e5b6d6dSopenharmony_ci } else { 4402e5b6d6dSopenharmony_ci // nextHanPrimary < lastHanPrimary < p 4412e5b6d6dSopenharmony_ci // End the last Han range, then write p. 4422e5b6d6dSopenharmony_ci table.addElement((int32_t)lastHanPrimary | hanStep, errorCode); 4432e5b6d6dSopenharmony_ci nextHanPrimary = 0xffffffff; 4442e5b6d6dSopenharmony_ci table.addElement((int32_t)p, errorCode); 4452e5b6d6dSopenharmony_ci } 4462e5b6d6dSopenharmony_ci } 4472e5b6d6dSopenharmony_ci } else if(prevPrimary != 0 && 4482e5b6d6dSopenharmony_ci // If there has not been an intervening delta unit, 4492e5b6d6dSopenharmony_ci // then we will try to combine the previous primary and 4502e5b6d6dSopenharmony_ci // the next several primaries into a range. 4512e5b6d6dSopenharmony_ci !hasDeltaUnit && 4522e5b6d6dSopenharmony_ci // Might get a range with more than two primaries if the current CE 4532e5b6d6dSopenharmony_ci // has common sec/ter weights. 4542e5b6d6dSopenharmony_ci secTer == Collation::COMMON_SEC_AND_TER_CE && 4552e5b6d6dSopenharmony_ci (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) { 4562e5b6d6dSopenharmony_ci // Multiple CEs with only common secondary/tertiary weights were 4572e5b6d6dSopenharmony_ci // combined into a primary range. 4582e5b6d6dSopenharmony_ci // The range end was written, ending with the primary of rootElements[end]. 4592e5b6d6dSopenharmony_ci ce = rootElements.elementAti(end); 4602e5b6d6dSopenharmony_ci p = (uint32_t)(ce >> 32); 4612e5b6d6dSopenharmony_ci secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; 4622e5b6d6dSopenharmony_ci i = end; 4632e5b6d6dSopenharmony_ci } else { 4642e5b6d6dSopenharmony_ci // Write the primary weight of a normal CE. 4652e5b6d6dSopenharmony_ci table.addElement((int32_t)p, errorCode); 4662e5b6d6dSopenharmony_ci } 4672e5b6d6dSopenharmony_ci prevPrimary = p; 4682e5b6d6dSopenharmony_ci needCommonSecTerUnit = false; 4692e5b6d6dSopenharmony_ci hasDeltaUnit = false; 4702e5b6d6dSopenharmony_ci } 4712e5b6d6dSopenharmony_ci if(secTer == Collation::COMMON_SEC_AND_TER_CE && !needCommonSecTerUnit) { 4722e5b6d6dSopenharmony_ci // The common secondar/tertiary weights are implied in the primary unit. 4732e5b6d6dSopenharmony_ci } else { 4742e5b6d6dSopenharmony_ci if(secTer < Collation::COMMON_SEC_AND_TER_CE) { 4752e5b6d6dSopenharmony_ci // Remember to not suppress a common sec/ter unit if p!=0. 4762e5b6d6dSopenharmony_ci needCommonSecTerUnit = p != 0; 4772e5b6d6dSopenharmony_ci } else if(secTer == Collation::COMMON_SEC_AND_TER_CE) { 4782e5b6d6dSopenharmony_ci // Real common sec/ter unit, no need to insert an artificial one. 4792e5b6d6dSopenharmony_ci needCommonSecTerUnit = false; 4802e5b6d6dSopenharmony_ci } 4812e5b6d6dSopenharmony_ci // For each new set of secondary/tertiary weights we write a delta unit. 4822e5b6d6dSopenharmony_ci table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode); 4832e5b6d6dSopenharmony_ci hasDeltaUnit = true; 4842e5b6d6dSopenharmony_ci } 4852e5b6d6dSopenharmony_ci } 4862e5b6d6dSopenharmony_ci} 4872e5b6d6dSopenharmony_ci 4882e5b6d6dSopenharmony_ciint32_t 4892e5b6d6dSopenharmony_ciCollationBaseDataBuilder::writeRootElementsRange( 4902e5b6d6dSopenharmony_ci uint32_t prevPrimary, uint32_t p, int32_t i, 4912e5b6d6dSopenharmony_ci UVector32 &table, UErrorCode &errorCode) { 4922e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; } 4932e5b6d6dSopenharmony_ci U_ASSERT(prevPrimary < p); 4942e5b6d6dSopenharmony_ci // No ranges of single-byte primaries. 4952e5b6d6dSopenharmony_ci if((p & prevPrimary & 0xff0000) == 0) { return 0; } 4962e5b6d6dSopenharmony_ci // Lead bytes of compressible primaries must match. 4972e5b6d6dSopenharmony_ci UBool isCompressible = isCompressiblePrimary(p); 4982e5b6d6dSopenharmony_ci if((isCompressible || isCompressiblePrimary(prevPrimary)) && 4992e5b6d6dSopenharmony_ci (p & 0xff000000) != (prevPrimary & 0xff000000)) { 5002e5b6d6dSopenharmony_ci return 0; 5012e5b6d6dSopenharmony_ci } 5022e5b6d6dSopenharmony_ci // Number of bytes in the primaries. 5032e5b6d6dSopenharmony_ci UBool twoBytes; 5042e5b6d6dSopenharmony_ci // Number of primaries from prevPrimary to p. 5052e5b6d6dSopenharmony_ci int32_t step; 5062e5b6d6dSopenharmony_ci if((p & 0xff00) == 0) { 5072e5b6d6dSopenharmony_ci // 2-byte primary 5082e5b6d6dSopenharmony_ci if((prevPrimary & 0xff00) != 0) { return 0; } // length mismatch 5092e5b6d6dSopenharmony_ci twoBytes = true; 5102e5b6d6dSopenharmony_ci step = diffTwoBytePrimaries(prevPrimary, p, isCompressible); 5112e5b6d6dSopenharmony_ci } else { 5122e5b6d6dSopenharmony_ci // 3-byte primary 5132e5b6d6dSopenharmony_ci if((prevPrimary & 0xff00) == 0) { return 0; } // length mismatch 5142e5b6d6dSopenharmony_ci twoBytes = false; 5152e5b6d6dSopenharmony_ci step = diffThreeBytePrimaries(prevPrimary, p, isCompressible); 5162e5b6d6dSopenharmony_ci } 5172e5b6d6dSopenharmony_ci if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; } 5182e5b6d6dSopenharmony_ci // See if there are more than two CEs with primaries increasing by "step" 5192e5b6d6dSopenharmony_ci // and with only common secondary/tertiary weights on all but the last one. 5202e5b6d6dSopenharmony_ci int32_t end = 0; // Initially 0: No range for just two primaries. 5212e5b6d6dSopenharmony_ci for(;;) { 5222e5b6d6dSopenharmony_ci prevPrimary = p; 5232e5b6d6dSopenharmony_ci // Calculate which primary we expect next. 5242e5b6d6dSopenharmony_ci uint32_t nextPrimary; // = p + step 5252e5b6d6dSopenharmony_ci if(twoBytes) { 5262e5b6d6dSopenharmony_ci nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step); 5272e5b6d6dSopenharmony_ci } else { 5282e5b6d6dSopenharmony_ci nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step); 5292e5b6d6dSopenharmony_ci } 5302e5b6d6dSopenharmony_ci // Fetch the actual next CE. 5312e5b6d6dSopenharmony_ci int64_t ce = rootElements.elementAti(i); 5322e5b6d6dSopenharmony_ci p = (uint32_t)(ce >> 32); 5332e5b6d6dSopenharmony_ci uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; 5342e5b6d6dSopenharmony_ci // Does this primary increase by "step" from the last one? 5352e5b6d6dSopenharmony_ci if(p != nextPrimary || 5362e5b6d6dSopenharmony_ci // Do not cross into a new lead byte if either is compressible. 5372e5b6d6dSopenharmony_ci ((p & 0xff000000) != (prevPrimary & 0xff000000) && 5382e5b6d6dSopenharmony_ci (isCompressible || isCompressiblePrimary(p)))) { 5392e5b6d6dSopenharmony_ci // The range ends with the previous CE. 5402e5b6d6dSopenharmony_ci p = prevPrimary; 5412e5b6d6dSopenharmony_ci break; 5422e5b6d6dSopenharmony_ci } 5432e5b6d6dSopenharmony_ci // Extend the range to include this primary. 5442e5b6d6dSopenharmony_ci end = i++; 5452e5b6d6dSopenharmony_ci // This primary is the last in the range if it has non-common weights 5462e5b6d6dSopenharmony_ci // or if we are at the end of the list. 5472e5b6d6dSopenharmony_ci if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; } 5482e5b6d6dSopenharmony_ci } 5492e5b6d6dSopenharmony_ci if(end != 0) { 5502e5b6d6dSopenharmony_ci table.addElement((int32_t)p | step, errorCode); 5512e5b6d6dSopenharmony_ci } 5522e5b6d6dSopenharmony_ci return end; 5532e5b6d6dSopenharmony_ci} 5542e5b6d6dSopenharmony_ci 5552e5b6d6dSopenharmony_ciU_NAMESPACE_END 5562e5b6d6dSopenharmony_ci 5572e5b6d6dSopenharmony_ci#endif // !UCONFIG_NO_COLLATION 558