12e5b6d6dSopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 32e5b6d6dSopenharmony_ci/* 42e5b6d6dSopenharmony_ci******************************************************************************* 52e5b6d6dSopenharmony_ci* 62e5b6d6dSopenharmony_ci* Copyright (C) 2009-2014, International Business Machines 72e5b6d6dSopenharmony_ci* Corporation and others. All Rights Reserved. 82e5b6d6dSopenharmony_ci* 92e5b6d6dSopenharmony_ci******************************************************************************* 102e5b6d6dSopenharmony_ci* file name: normalizer2impl.h 112e5b6d6dSopenharmony_ci* encoding: UTF-8 122e5b6d6dSopenharmony_ci* tab size: 8 (not used) 132e5b6d6dSopenharmony_ci* indentation:4 142e5b6d6dSopenharmony_ci* 152e5b6d6dSopenharmony_ci* created on: 2009nov22 162e5b6d6dSopenharmony_ci* created by: Markus W. Scherer 172e5b6d6dSopenharmony_ci*/ 182e5b6d6dSopenharmony_ci 192e5b6d6dSopenharmony_ci#ifndef __NORMALIZER2IMPL_H__ 202e5b6d6dSopenharmony_ci#define __NORMALIZER2IMPL_H__ 212e5b6d6dSopenharmony_ci 222e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 232e5b6d6dSopenharmony_ci 242e5b6d6dSopenharmony_ci#if !UCONFIG_NO_NORMALIZATION 252e5b6d6dSopenharmony_ci 262e5b6d6dSopenharmony_ci#include "unicode/normalizer2.h" 272e5b6d6dSopenharmony_ci#include "unicode/ucptrie.h" 282e5b6d6dSopenharmony_ci#include "unicode/unistr.h" 292e5b6d6dSopenharmony_ci#include "unicode/unorm.h" 302e5b6d6dSopenharmony_ci#include "unicode/utf.h" 312e5b6d6dSopenharmony_ci#include "unicode/utf16.h" 322e5b6d6dSopenharmony_ci#include "mutex.h" 332e5b6d6dSopenharmony_ci#include "udataswp.h" 342e5b6d6dSopenharmony_ci#include "uset_imp.h" 352e5b6d6dSopenharmony_ci 362e5b6d6dSopenharmony_ci// When the nfc.nrm data is *not* hardcoded into the common library 372e5b6d6dSopenharmony_ci// (with this constant set to 0), 382e5b6d6dSopenharmony_ci// then it needs to be built into the data package: 392e5b6d6dSopenharmony_ci// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT 402e5b6d6dSopenharmony_ci#define NORM2_HARDCODE_NFC_DATA 1 412e5b6d6dSopenharmony_ci 422e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN 432e5b6d6dSopenharmony_ci 442e5b6d6dSopenharmony_cistruct CanonIterData; 452e5b6d6dSopenharmony_ci 462e5b6d6dSopenharmony_ciclass ByteSink; 472e5b6d6dSopenharmony_ciclass Edits; 482e5b6d6dSopenharmony_ciclass InitCanonIterData; 492e5b6d6dSopenharmony_ciclass LcccContext; 502e5b6d6dSopenharmony_ci 512e5b6d6dSopenharmony_ciclass U_COMMON_API Hangul { 522e5b6d6dSopenharmony_cipublic: 532e5b6d6dSopenharmony_ci /* Korean Hangul and Jamo constants */ 542e5b6d6dSopenharmony_ci enum { 552e5b6d6dSopenharmony_ci JAMO_L_BASE=0x1100, /* "lead" jamo */ 562e5b6d6dSopenharmony_ci JAMO_L_END=0x1112, 572e5b6d6dSopenharmony_ci JAMO_V_BASE=0x1161, /* "vowel" jamo */ 582e5b6d6dSopenharmony_ci JAMO_V_END=0x1175, 592e5b6d6dSopenharmony_ci JAMO_T_BASE=0x11a7, /* "trail" jamo */ 602e5b6d6dSopenharmony_ci JAMO_T_END=0x11c2, 612e5b6d6dSopenharmony_ci 622e5b6d6dSopenharmony_ci HANGUL_BASE=0xac00, 632e5b6d6dSopenharmony_ci HANGUL_END=0xd7a3, 642e5b6d6dSopenharmony_ci 652e5b6d6dSopenharmony_ci JAMO_L_COUNT=19, 662e5b6d6dSopenharmony_ci JAMO_V_COUNT=21, 672e5b6d6dSopenharmony_ci JAMO_T_COUNT=28, 682e5b6d6dSopenharmony_ci 692e5b6d6dSopenharmony_ci JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, 702e5b6d6dSopenharmony_ci 712e5b6d6dSopenharmony_ci HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, 722e5b6d6dSopenharmony_ci HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT 732e5b6d6dSopenharmony_ci }; 742e5b6d6dSopenharmony_ci 752e5b6d6dSopenharmony_ci static inline UBool isHangul(UChar32 c) { 762e5b6d6dSopenharmony_ci return HANGUL_BASE<=c && c<HANGUL_LIMIT; 772e5b6d6dSopenharmony_ci } 782e5b6d6dSopenharmony_ci static inline UBool 792e5b6d6dSopenharmony_ci isHangulLV(UChar32 c) { 802e5b6d6dSopenharmony_ci c-=HANGUL_BASE; 812e5b6d6dSopenharmony_ci return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 822e5b6d6dSopenharmony_ci } 832e5b6d6dSopenharmony_ci static inline UBool isJamoL(UChar32 c) { 842e5b6d6dSopenharmony_ci return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT; 852e5b6d6dSopenharmony_ci } 862e5b6d6dSopenharmony_ci static inline UBool isJamoV(UChar32 c) { 872e5b6d6dSopenharmony_ci return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT; 882e5b6d6dSopenharmony_ci } 892e5b6d6dSopenharmony_ci static inline UBool isJamoT(UChar32 c) { 902e5b6d6dSopenharmony_ci int32_t t=c-JAMO_T_BASE; 912e5b6d6dSopenharmony_ci return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself 922e5b6d6dSopenharmony_ci } 932e5b6d6dSopenharmony_ci static UBool isJamo(UChar32 c) { 942e5b6d6dSopenharmony_ci return JAMO_L_BASE<=c && c<=JAMO_T_END && 952e5b6d6dSopenharmony_ci (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c); 962e5b6d6dSopenharmony_ci } 972e5b6d6dSopenharmony_ci 982e5b6d6dSopenharmony_ci /** 992e5b6d6dSopenharmony_ci * Decomposes c, which must be a Hangul syllable, into buffer 1002e5b6d6dSopenharmony_ci * and returns the length of the decomposition (2 or 3). 1012e5b6d6dSopenharmony_ci */ 1022e5b6d6dSopenharmony_ci static inline int32_t decompose(UChar32 c, UChar buffer[3]) { 1032e5b6d6dSopenharmony_ci c-=HANGUL_BASE; 1042e5b6d6dSopenharmony_ci UChar32 c2=c%JAMO_T_COUNT; 1052e5b6d6dSopenharmony_ci c/=JAMO_T_COUNT; 1062e5b6d6dSopenharmony_ci buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 1072e5b6d6dSopenharmony_ci buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 1082e5b6d6dSopenharmony_ci if(c2==0) { 1092e5b6d6dSopenharmony_ci return 2; 1102e5b6d6dSopenharmony_ci } else { 1112e5b6d6dSopenharmony_ci buffer[2]=(UChar)(JAMO_T_BASE+c2); 1122e5b6d6dSopenharmony_ci return 3; 1132e5b6d6dSopenharmony_ci } 1142e5b6d6dSopenharmony_ci } 1152e5b6d6dSopenharmony_ci 1162e5b6d6dSopenharmony_ci /** 1172e5b6d6dSopenharmony_ci * Decomposes c, which must be a Hangul syllable, into buffer. 1182e5b6d6dSopenharmony_ci * This is the raw, not recursive, decomposition. Its length is always 2. 1192e5b6d6dSopenharmony_ci */ 1202e5b6d6dSopenharmony_ci static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) { 1212e5b6d6dSopenharmony_ci UChar32 orig=c; 1222e5b6d6dSopenharmony_ci c-=HANGUL_BASE; 1232e5b6d6dSopenharmony_ci UChar32 c2=c%JAMO_T_COUNT; 1242e5b6d6dSopenharmony_ci if(c2==0) { 1252e5b6d6dSopenharmony_ci c/=JAMO_T_COUNT; 1262e5b6d6dSopenharmony_ci buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 1272e5b6d6dSopenharmony_ci buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 1282e5b6d6dSopenharmony_ci } else { 1292e5b6d6dSopenharmony_ci buffer[0]=(UChar)(orig-c2); // LV syllable 1302e5b6d6dSopenharmony_ci buffer[1]=(UChar)(JAMO_T_BASE+c2); 1312e5b6d6dSopenharmony_ci } 1322e5b6d6dSopenharmony_ci } 1332e5b6d6dSopenharmony_ciprivate: 1342e5b6d6dSopenharmony_ci Hangul() = delete; // no instantiation 1352e5b6d6dSopenharmony_ci}; 1362e5b6d6dSopenharmony_ci 1372e5b6d6dSopenharmony_ciclass Normalizer2Impl; 1382e5b6d6dSopenharmony_ci 1392e5b6d6dSopenharmony_ciclass U_COMMON_API ReorderingBuffer : public UMemory { 1402e5b6d6dSopenharmony_cipublic: 1412e5b6d6dSopenharmony_ci /** Constructs only; init() should be called. */ 1422e5b6d6dSopenharmony_ci ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : 1432e5b6d6dSopenharmony_ci impl(ni), str(dest), 1442e5b6d6dSopenharmony_ci start(NULL), reorderStart(NULL), limit(NULL), 1452e5b6d6dSopenharmony_ci remainingCapacity(0), lastCC(0) {} 1462e5b6d6dSopenharmony_ci /** Constructs, removes the string contents, and initializes for a small initial capacity. */ 1472e5b6d6dSopenharmony_ci ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode); 1482e5b6d6dSopenharmony_ci ~ReorderingBuffer() { 1492e5b6d6dSopenharmony_ci if(start!=NULL) { 1502e5b6d6dSopenharmony_ci str.releaseBuffer((int32_t)(limit-start)); 1512e5b6d6dSopenharmony_ci } 1522e5b6d6dSopenharmony_ci } 1532e5b6d6dSopenharmony_ci UBool init(int32_t destCapacity, UErrorCode &errorCode); 1542e5b6d6dSopenharmony_ci 1552e5b6d6dSopenharmony_ci UBool isEmpty() const { return start==limit; } 1562e5b6d6dSopenharmony_ci int32_t length() const { return (int32_t)(limit-start); } 1572e5b6d6dSopenharmony_ci UChar *getStart() { return start; } 1582e5b6d6dSopenharmony_ci UChar *getLimit() { return limit; } 1592e5b6d6dSopenharmony_ci uint8_t getLastCC() const { return lastCC; } 1602e5b6d6dSopenharmony_ci 1612e5b6d6dSopenharmony_ci UBool equals(const UChar *start, const UChar *limit) const; 1622e5b6d6dSopenharmony_ci UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const; 1632e5b6d6dSopenharmony_ci 1642e5b6d6dSopenharmony_ci UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 1652e5b6d6dSopenharmony_ci return (c<=0xffff) ? 1662e5b6d6dSopenharmony_ci appendBMP((UChar)c, cc, errorCode) : 1672e5b6d6dSopenharmony_ci appendSupplementary(c, cc, errorCode); 1682e5b6d6dSopenharmony_ci } 1692e5b6d6dSopenharmony_ci UBool append(const UChar *s, int32_t length, UBool isNFD, 1702e5b6d6dSopenharmony_ci uint8_t leadCC, uint8_t trailCC, 1712e5b6d6dSopenharmony_ci UErrorCode &errorCode); 1722e5b6d6dSopenharmony_ci UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { 1732e5b6d6dSopenharmony_ci if(remainingCapacity==0 && !resize(1, errorCode)) { 1742e5b6d6dSopenharmony_ci return false; 1752e5b6d6dSopenharmony_ci } 1762e5b6d6dSopenharmony_ci if(lastCC<=cc || cc==0) { 1772e5b6d6dSopenharmony_ci *limit++=c; 1782e5b6d6dSopenharmony_ci lastCC=cc; 1792e5b6d6dSopenharmony_ci if(cc<=1) { 1802e5b6d6dSopenharmony_ci reorderStart=limit; 1812e5b6d6dSopenharmony_ci } 1822e5b6d6dSopenharmony_ci } else { 1832e5b6d6dSopenharmony_ci insert(c, cc); 1842e5b6d6dSopenharmony_ci } 1852e5b6d6dSopenharmony_ci --remainingCapacity; 1862e5b6d6dSopenharmony_ci return true; 1872e5b6d6dSopenharmony_ci } 1882e5b6d6dSopenharmony_ci UBool appendZeroCC(UChar32 c, UErrorCode &errorCode); 1892e5b6d6dSopenharmony_ci UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode); 1902e5b6d6dSopenharmony_ci void remove(); 1912e5b6d6dSopenharmony_ci void removeSuffix(int32_t suffixLength); 1922e5b6d6dSopenharmony_ci void setReorderingLimit(UChar *newLimit) { 1932e5b6d6dSopenharmony_ci remainingCapacity+=(int32_t)(limit-newLimit); 1942e5b6d6dSopenharmony_ci reorderStart=limit=newLimit; 1952e5b6d6dSopenharmony_ci lastCC=0; 1962e5b6d6dSopenharmony_ci } 1972e5b6d6dSopenharmony_ci void copyReorderableSuffixTo(UnicodeString &s) const { 1982e5b6d6dSopenharmony_ci s.setTo(ConstChar16Ptr(reorderStart), (int32_t)(limit-reorderStart)); 1992e5b6d6dSopenharmony_ci } 2002e5b6d6dSopenharmony_ciprivate: 2012e5b6d6dSopenharmony_ci /* 2022e5b6d6dSopenharmony_ci * TODO: Revisit whether it makes sense to track reorderStart. 2032e5b6d6dSopenharmony_ci * It is set to after the last known character with cc<=1, 2042e5b6d6dSopenharmony_ci * which stops previousCC() before it reads that character and looks up its cc. 2052e5b6d6dSopenharmony_ci * previousCC() is normally only called from insert(). 2062e5b6d6dSopenharmony_ci * In other words, reorderStart speeds up the insertion of a combining mark 2072e5b6d6dSopenharmony_ci * into a multi-combining mark sequence where it does not belong at the end. 2082e5b6d6dSopenharmony_ci * This might not be worth the trouble. 2092e5b6d6dSopenharmony_ci * On the other hand, it's not a huge amount of trouble. 2102e5b6d6dSopenharmony_ci * 2112e5b6d6dSopenharmony_ci * We probably need it for UNORM_SIMPLE_APPEND. 2122e5b6d6dSopenharmony_ci */ 2132e5b6d6dSopenharmony_ci 2142e5b6d6dSopenharmony_ci UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode); 2152e5b6d6dSopenharmony_ci void insert(UChar32 c, uint8_t cc); 2162e5b6d6dSopenharmony_ci static void writeCodePoint(UChar *p, UChar32 c) { 2172e5b6d6dSopenharmony_ci if(c<=0xffff) { 2182e5b6d6dSopenharmony_ci *p=(UChar)c; 2192e5b6d6dSopenharmony_ci } else { 2202e5b6d6dSopenharmony_ci p[0]=U16_LEAD(c); 2212e5b6d6dSopenharmony_ci p[1]=U16_TRAIL(c); 2222e5b6d6dSopenharmony_ci } 2232e5b6d6dSopenharmony_ci } 2242e5b6d6dSopenharmony_ci UBool resize(int32_t appendLength, UErrorCode &errorCode); 2252e5b6d6dSopenharmony_ci 2262e5b6d6dSopenharmony_ci const Normalizer2Impl &impl; 2272e5b6d6dSopenharmony_ci UnicodeString &str; 2282e5b6d6dSopenharmony_ci UChar *start, *reorderStart, *limit; 2292e5b6d6dSopenharmony_ci int32_t remainingCapacity; 2302e5b6d6dSopenharmony_ci uint8_t lastCC; 2312e5b6d6dSopenharmony_ci 2322e5b6d6dSopenharmony_ci // private backward iterator 2332e5b6d6dSopenharmony_ci void setIterator() { codePointStart=limit; } 2342e5b6d6dSopenharmony_ci void skipPrevious(); // Requires start<codePointStart. 2352e5b6d6dSopenharmony_ci uint8_t previousCC(); // Returns 0 if there is no previous character. 2362e5b6d6dSopenharmony_ci 2372e5b6d6dSopenharmony_ci UChar *codePointStart, *codePointLimit; 2382e5b6d6dSopenharmony_ci}; 2392e5b6d6dSopenharmony_ci 2402e5b6d6dSopenharmony_ci/** 2412e5b6d6dSopenharmony_ci * Low-level implementation of the Unicode Normalization Algorithm. 2422e5b6d6dSopenharmony_ci * For the data structure and details see the documentation at the end of 2432e5b6d6dSopenharmony_ci * this normalizer2impl.h and in the design doc at 2442e5b6d6dSopenharmony_ci * https://icu.unicode.org/design/normalization/custom 2452e5b6d6dSopenharmony_ci */ 2462e5b6d6dSopenharmony_ciclass U_COMMON_API Normalizer2Impl : public UObject { 2472e5b6d6dSopenharmony_cipublic: 2482e5b6d6dSopenharmony_ci Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { } 2492e5b6d6dSopenharmony_ci virtual ~Normalizer2Impl(); 2502e5b6d6dSopenharmony_ci 2512e5b6d6dSopenharmony_ci void init(const int32_t *inIndexes, const UCPTrie *inTrie, 2522e5b6d6dSopenharmony_ci const uint16_t *inExtraData, const uint8_t *inSmallFCD); 2532e5b6d6dSopenharmony_ci 2542e5b6d6dSopenharmony_ci void addLcccChars(UnicodeSet &set) const; 2552e5b6d6dSopenharmony_ci void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 2562e5b6d6dSopenharmony_ci void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 2572e5b6d6dSopenharmony_ci 2582e5b6d6dSopenharmony_ci // low-level properties ------------------------------------------------ *** 2592e5b6d6dSopenharmony_ci 2602e5b6d6dSopenharmony_ci UBool ensureCanonIterData(UErrorCode &errorCode) const; 2612e5b6d6dSopenharmony_ci 2622e5b6d6dSopenharmony_ci // The trie stores values for lead surrogate code *units*. 2632e5b6d6dSopenharmony_ci // Surrogate code *points* are inert. 2642e5b6d6dSopenharmony_ci uint16_t getNorm16(UChar32 c) const { 2652e5b6d6dSopenharmony_ci return U_IS_LEAD(c) ? 2662e5b6d6dSopenharmony_ci static_cast<uint16_t>(INERT) : 2672e5b6d6dSopenharmony_ci UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); 2682e5b6d6dSopenharmony_ci } 2692e5b6d6dSopenharmony_ci uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); } 2702e5b6d6dSopenharmony_ci 2712e5b6d6dSopenharmony_ci UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { 2722e5b6d6dSopenharmony_ci if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 2732e5b6d6dSopenharmony_ci return UNORM_YES; 2742e5b6d6dSopenharmony_ci } else if(minMaybeYes<=norm16) { 2752e5b6d6dSopenharmony_ci return UNORM_MAYBE; 2762e5b6d6dSopenharmony_ci } else { 2772e5b6d6dSopenharmony_ci return UNORM_NO; 2782e5b6d6dSopenharmony_ci } 2792e5b6d6dSopenharmony_ci } 2802e5b6d6dSopenharmony_ci UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeYes; } 2812e5b6d6dSopenharmony_ci UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; } 2822e5b6d6dSopenharmony_ci UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; } 2832e5b6d6dSopenharmony_ci 2842e5b6d6dSopenharmony_ci uint8_t getCC(uint16_t norm16) const { 2852e5b6d6dSopenharmony_ci if(norm16>=MIN_NORMAL_MAYBE_YES) { 2862e5b6d6dSopenharmony_ci return getCCFromNormalYesOrMaybe(norm16); 2872e5b6d6dSopenharmony_ci } 2882e5b6d6dSopenharmony_ci if(norm16<minNoNo || limitNoNo<=norm16) { 2892e5b6d6dSopenharmony_ci return 0; 2902e5b6d6dSopenharmony_ci } 2912e5b6d6dSopenharmony_ci return getCCFromNoNo(norm16); 2922e5b6d6dSopenharmony_ci } 2932e5b6d6dSopenharmony_ci static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) { 2942e5b6d6dSopenharmony_ci return (uint8_t)(norm16 >> OFFSET_SHIFT); 2952e5b6d6dSopenharmony_ci } 2962e5b6d6dSopenharmony_ci static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { 2972e5b6d6dSopenharmony_ci return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 2982e5b6d6dSopenharmony_ci } 2992e5b6d6dSopenharmony_ci uint8_t getCCFromYesOrMaybeCP(UChar32 c) const { 3002e5b6d6dSopenharmony_ci if (c < minCompNoMaybeCP) { return 0; } 3012e5b6d6dSopenharmony_ci return getCCFromYesOrMaybe(getNorm16(c)); 3022e5b6d6dSopenharmony_ci } 3032e5b6d6dSopenharmony_ci 3042e5b6d6dSopenharmony_ci /** 3052e5b6d6dSopenharmony_ci * Returns the FCD data for code point c. 3062e5b6d6dSopenharmony_ci * @param c A Unicode code point. 3072e5b6d6dSopenharmony_ci * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 3082e5b6d6dSopenharmony_ci */ 3092e5b6d6dSopenharmony_ci uint16_t getFCD16(UChar32 c) const { 3102e5b6d6dSopenharmony_ci if(c<minDecompNoCP) { 3112e5b6d6dSopenharmony_ci return 0; 3122e5b6d6dSopenharmony_ci } else if(c<=0xffff) { 3132e5b6d6dSopenharmony_ci if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 3142e5b6d6dSopenharmony_ci } 3152e5b6d6dSopenharmony_ci return getFCD16FromNormData(c); 3162e5b6d6dSopenharmony_ci } 3172e5b6d6dSopenharmony_ci /** 3182e5b6d6dSopenharmony_ci * Returns the FCD data for the next code point (post-increment). 3192e5b6d6dSopenharmony_ci * Might skip only a lead surrogate rather than the whole surrogate pair if none of 3202e5b6d6dSopenharmony_ci * the supplementary code points associated with the lead surrogate have non-zero FCD data. 3212e5b6d6dSopenharmony_ci * @param s A valid pointer into a string. Requires s!=limit. 3222e5b6d6dSopenharmony_ci * @param limit The end of the string, or NULL. 3232e5b6d6dSopenharmony_ci * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 3242e5b6d6dSopenharmony_ci */ 3252e5b6d6dSopenharmony_ci uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { 3262e5b6d6dSopenharmony_ci UChar32 c=*s++; 3272e5b6d6dSopenharmony_ci if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) { 3282e5b6d6dSopenharmony_ci return 0; 3292e5b6d6dSopenharmony_ci } 3302e5b6d6dSopenharmony_ci UChar c2; 3312e5b6d6dSopenharmony_ci if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { 3322e5b6d6dSopenharmony_ci c=U16_GET_SUPPLEMENTARY(c, c2); 3332e5b6d6dSopenharmony_ci ++s; 3342e5b6d6dSopenharmony_ci } 3352e5b6d6dSopenharmony_ci return getFCD16FromNormData(c); 3362e5b6d6dSopenharmony_ci } 3372e5b6d6dSopenharmony_ci /** 3382e5b6d6dSopenharmony_ci * Returns the FCD data for the previous code point (pre-decrement). 3392e5b6d6dSopenharmony_ci * @param start The start of the string. 3402e5b6d6dSopenharmony_ci * @param s A valid pointer into a string. Requires start<s. 3412e5b6d6dSopenharmony_ci * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 3422e5b6d6dSopenharmony_ci */ 3432e5b6d6dSopenharmony_ci uint16_t previousFCD16(const UChar *start, const UChar *&s) const { 3442e5b6d6dSopenharmony_ci UChar32 c=*--s; 3452e5b6d6dSopenharmony_ci if(c<minDecompNoCP) { 3462e5b6d6dSopenharmony_ci return 0; 3472e5b6d6dSopenharmony_ci } 3482e5b6d6dSopenharmony_ci if(!U16_IS_TRAIL(c)) { 3492e5b6d6dSopenharmony_ci if(!singleLeadMightHaveNonZeroFCD16(c)) { 3502e5b6d6dSopenharmony_ci return 0; 3512e5b6d6dSopenharmony_ci } 3522e5b6d6dSopenharmony_ci } else { 3532e5b6d6dSopenharmony_ci UChar c2; 3542e5b6d6dSopenharmony_ci if(start<s && U16_IS_LEAD(c2=*(s-1))) { 3552e5b6d6dSopenharmony_ci c=U16_GET_SUPPLEMENTARY(c2, c); 3562e5b6d6dSopenharmony_ci --s; 3572e5b6d6dSopenharmony_ci } 3582e5b6d6dSopenharmony_ci } 3592e5b6d6dSopenharmony_ci return getFCD16FromNormData(c); 3602e5b6d6dSopenharmony_ci } 3612e5b6d6dSopenharmony_ci 3622e5b6d6dSopenharmony_ci /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ 3632e5b6d6dSopenharmony_ci UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const { 3642e5b6d6dSopenharmony_ci // 0<=lead<=0xffff 3652e5b6d6dSopenharmony_ci uint8_t bits=smallFCD[lead>>8]; 3662e5b6d6dSopenharmony_ci if(bits==0) { return false; } 3672e5b6d6dSopenharmony_ci return (UBool)((bits>>((lead>>5)&7))&1); 3682e5b6d6dSopenharmony_ci } 3692e5b6d6dSopenharmony_ci /** Returns the FCD value from the regular normalization data. */ 3702e5b6d6dSopenharmony_ci uint16_t getFCD16FromNormData(UChar32 c) const; 3712e5b6d6dSopenharmony_ci 3722e5b6d6dSopenharmony_ci /** 3732e5b6d6dSopenharmony_ci * Gets the decomposition for one code point. 3742e5b6d6dSopenharmony_ci * @param c code point 3752e5b6d6dSopenharmony_ci * @param buffer out-only buffer for algorithmic decompositions 3762e5b6d6dSopenharmony_ci * @param length out-only, takes the length of the decomposition, if any 3772e5b6d6dSopenharmony_ci * @return pointer to the decomposition, or NULL if none 3782e5b6d6dSopenharmony_ci */ 3792e5b6d6dSopenharmony_ci const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; 3802e5b6d6dSopenharmony_ci 3812e5b6d6dSopenharmony_ci /** 3822e5b6d6dSopenharmony_ci * Gets the raw decomposition for one code point. 3832e5b6d6dSopenharmony_ci * @param c code point 3842e5b6d6dSopenharmony_ci * @param buffer out-only buffer for algorithmic decompositions 3852e5b6d6dSopenharmony_ci * @param length out-only, takes the length of the decomposition, if any 3862e5b6d6dSopenharmony_ci * @return pointer to the decomposition, or NULL if none 3872e5b6d6dSopenharmony_ci */ 3882e5b6d6dSopenharmony_ci const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; 3892e5b6d6dSopenharmony_ci 3902e5b6d6dSopenharmony_ci UChar32 composePair(UChar32 a, UChar32 b) const; 3912e5b6d6dSopenharmony_ci 3922e5b6d6dSopenharmony_ci UBool isCanonSegmentStarter(UChar32 c) const; 3932e5b6d6dSopenharmony_ci UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; 3942e5b6d6dSopenharmony_ci 3952e5b6d6dSopenharmony_ci enum { 3962e5b6d6dSopenharmony_ci // Fixed norm16 values. 3972e5b6d6dSopenharmony_ci MIN_YES_YES_WITH_CC=0xfe02, 3982e5b6d6dSopenharmony_ci JAMO_VT=0xfe00, 3992e5b6d6dSopenharmony_ci MIN_NORMAL_MAYBE_YES=0xfc00, 4002e5b6d6dSopenharmony_ci JAMO_L=2, // offset=1 hasCompBoundaryAfter=false 4012e5b6d6dSopenharmony_ci INERT=1, // offset=0 hasCompBoundaryAfter=true 4022e5b6d6dSopenharmony_ci 4032e5b6d6dSopenharmony_ci // norm16 bit 0 is comp-boundary-after. 4042e5b6d6dSopenharmony_ci HAS_COMP_BOUNDARY_AFTER=1, 4052e5b6d6dSopenharmony_ci OFFSET_SHIFT=1, 4062e5b6d6dSopenharmony_ci 4072e5b6d6dSopenharmony_ci // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 4082e5b6d6dSopenharmony_ci // tccc (0, 1, >1) for quick FCC boundary-after tests. 4092e5b6d6dSopenharmony_ci DELTA_TCCC_0=0, 4102e5b6d6dSopenharmony_ci DELTA_TCCC_1=2, 4112e5b6d6dSopenharmony_ci DELTA_TCCC_GT_1=4, 4122e5b6d6dSopenharmony_ci DELTA_TCCC_MASK=6, 4132e5b6d6dSopenharmony_ci DELTA_SHIFT=3, 4142e5b6d6dSopenharmony_ci 4152e5b6d6dSopenharmony_ci MAX_DELTA=0x40 4162e5b6d6dSopenharmony_ci }; 4172e5b6d6dSopenharmony_ci 4182e5b6d6dSopenharmony_ci enum { 4192e5b6d6dSopenharmony_ci // Byte offsets from the start of the data, after the generic header. 4202e5b6d6dSopenharmony_ci IX_NORM_TRIE_OFFSET, 4212e5b6d6dSopenharmony_ci IX_EXTRA_DATA_OFFSET, 4222e5b6d6dSopenharmony_ci IX_SMALL_FCD_OFFSET, 4232e5b6d6dSopenharmony_ci IX_RESERVED3_OFFSET, 4242e5b6d6dSopenharmony_ci IX_RESERVED4_OFFSET, 4252e5b6d6dSopenharmony_ci IX_RESERVED5_OFFSET, 4262e5b6d6dSopenharmony_ci IX_RESERVED6_OFFSET, 4272e5b6d6dSopenharmony_ci IX_TOTAL_SIZE, 4282e5b6d6dSopenharmony_ci 4292e5b6d6dSopenharmony_ci // Code point thresholds for quick check codes. 4302e5b6d6dSopenharmony_ci IX_MIN_DECOMP_NO_CP, 4312e5b6d6dSopenharmony_ci IX_MIN_COMP_NO_MAYBE_CP, 4322e5b6d6dSopenharmony_ci 4332e5b6d6dSopenharmony_ci // Norm16 value thresholds for quick check combinations and types of extra data. 4342e5b6d6dSopenharmony_ci 4352e5b6d6dSopenharmony_ci /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 4362e5b6d6dSopenharmony_ci IX_MIN_YES_NO, 4372e5b6d6dSopenharmony_ci /** Mappings are comp-normalized. */ 4382e5b6d6dSopenharmony_ci IX_MIN_NO_NO, 4392e5b6d6dSopenharmony_ci IX_LIMIT_NO_NO, 4402e5b6d6dSopenharmony_ci IX_MIN_MAYBE_YES, 4412e5b6d6dSopenharmony_ci 4422e5b6d6dSopenharmony_ci /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 4432e5b6d6dSopenharmony_ci IX_MIN_YES_NO_MAPPINGS_ONLY, 4442e5b6d6dSopenharmony_ci /** Mappings are not comp-normalized but have a comp boundary before. */ 4452e5b6d6dSopenharmony_ci IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE, 4462e5b6d6dSopenharmony_ci /** Mappings do not have a comp boundary before. */ 4472e5b6d6dSopenharmony_ci IX_MIN_NO_NO_COMP_NO_MAYBE_CC, 4482e5b6d6dSopenharmony_ci /** Mappings to the empty string. */ 4492e5b6d6dSopenharmony_ci IX_MIN_NO_NO_EMPTY, 4502e5b6d6dSopenharmony_ci 4512e5b6d6dSopenharmony_ci IX_MIN_LCCC_CP, 4522e5b6d6dSopenharmony_ci IX_RESERVED19, 4532e5b6d6dSopenharmony_ci IX_COUNT 4542e5b6d6dSopenharmony_ci }; 4552e5b6d6dSopenharmony_ci 4562e5b6d6dSopenharmony_ci enum { 4572e5b6d6dSopenharmony_ci MAPPING_HAS_CCC_LCCC_WORD=0x80, 4582e5b6d6dSopenharmony_ci MAPPING_HAS_RAW_MAPPING=0x40, 4592e5b6d6dSopenharmony_ci // unused bit 0x20, 4602e5b6d6dSopenharmony_ci MAPPING_LENGTH_MASK=0x1f 4612e5b6d6dSopenharmony_ci }; 4622e5b6d6dSopenharmony_ci 4632e5b6d6dSopenharmony_ci enum { 4642e5b6d6dSopenharmony_ci COMP_1_LAST_TUPLE=0x8000, 4652e5b6d6dSopenharmony_ci COMP_1_TRIPLE=1, 4662e5b6d6dSopenharmony_ci COMP_1_TRAIL_LIMIT=0x3400, 4672e5b6d6dSopenharmony_ci COMP_1_TRAIL_MASK=0x7ffe, 4682e5b6d6dSopenharmony_ci COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit 4692e5b6d6dSopenharmony_ci COMP_2_TRAIL_SHIFT=6, 4702e5b6d6dSopenharmony_ci COMP_2_TRAIL_MASK=0xffc0 4712e5b6d6dSopenharmony_ci }; 4722e5b6d6dSopenharmony_ci 4732e5b6d6dSopenharmony_ci // higher-level functionality ------------------------------------------ *** 4742e5b6d6dSopenharmony_ci 4752e5b6d6dSopenharmony_ci // NFD without an NFD Normalizer2 instance. 4762e5b6d6dSopenharmony_ci UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, 4772e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 4782e5b6d6dSopenharmony_ci /** 4792e5b6d6dSopenharmony_ci * Decomposes [src, limit[ and writes the result to dest. 4802e5b6d6dSopenharmony_ci * limit can be NULL if src is NUL-terminated. 4812e5b6d6dSopenharmony_ci * destLengthEstimate is the initial dest buffer capacity and can be -1. 4822e5b6d6dSopenharmony_ci */ 4832e5b6d6dSopenharmony_ci void decompose(const UChar *src, const UChar *limit, 4842e5b6d6dSopenharmony_ci UnicodeString &dest, int32_t destLengthEstimate, 4852e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 4862e5b6d6dSopenharmony_ci 4872e5b6d6dSopenharmony_ci const UChar *decompose(const UChar *src, const UChar *limit, 4882e5b6d6dSopenharmony_ci ReorderingBuffer *buffer, UErrorCode &errorCode) const; 4892e5b6d6dSopenharmony_ci void decomposeAndAppend(const UChar *src, const UChar *limit, 4902e5b6d6dSopenharmony_ci UBool doDecompose, 4912e5b6d6dSopenharmony_ci UnicodeString &safeMiddle, 4922e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, 4932e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 4942e5b6d6dSopenharmony_ci 4952e5b6d6dSopenharmony_ci /** sink==nullptr: isNormalized()/spanQuickCheckYes() */ 4962e5b6d6dSopenharmony_ci const uint8_t *decomposeUTF8(uint32_t options, 4972e5b6d6dSopenharmony_ci const uint8_t *src, const uint8_t *limit, 4982e5b6d6dSopenharmony_ci ByteSink *sink, Edits *edits, UErrorCode &errorCode) const; 4992e5b6d6dSopenharmony_ci 5002e5b6d6dSopenharmony_ci UBool compose(const UChar *src, const UChar *limit, 5012e5b6d6dSopenharmony_ci UBool onlyContiguous, 5022e5b6d6dSopenharmony_ci UBool doCompose, 5032e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, 5042e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 5052e5b6d6dSopenharmony_ci const UChar *composeQuickCheck(const UChar *src, const UChar *limit, 5062e5b6d6dSopenharmony_ci UBool onlyContiguous, 5072e5b6d6dSopenharmony_ci UNormalizationCheckResult *pQCResult) const; 5082e5b6d6dSopenharmony_ci void composeAndAppend(const UChar *src, const UChar *limit, 5092e5b6d6dSopenharmony_ci UBool doCompose, 5102e5b6d6dSopenharmony_ci UBool onlyContiguous, 5112e5b6d6dSopenharmony_ci UnicodeString &safeMiddle, 5122e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, 5132e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 5142e5b6d6dSopenharmony_ci 5152e5b6d6dSopenharmony_ci /** sink==nullptr: isNormalized() */ 5162e5b6d6dSopenharmony_ci UBool composeUTF8(uint32_t options, UBool onlyContiguous, 5172e5b6d6dSopenharmony_ci const uint8_t *src, const uint8_t *limit, 5182e5b6d6dSopenharmony_ci ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const; 5192e5b6d6dSopenharmony_ci 5202e5b6d6dSopenharmony_ci const UChar *makeFCD(const UChar *src, const UChar *limit, 5212e5b6d6dSopenharmony_ci ReorderingBuffer *buffer, UErrorCode &errorCode) const; 5222e5b6d6dSopenharmony_ci void makeFCDAndAppend(const UChar *src, const UChar *limit, 5232e5b6d6dSopenharmony_ci UBool doMakeFCD, 5242e5b6d6dSopenharmony_ci UnicodeString &safeMiddle, 5252e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, 5262e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 5272e5b6d6dSopenharmony_ci 5282e5b6d6dSopenharmony_ci UBool hasDecompBoundaryBefore(UChar32 c) const; 5292e5b6d6dSopenharmony_ci UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const; 5302e5b6d6dSopenharmony_ci UBool hasDecompBoundaryAfter(UChar32 c) const; 5312e5b6d6dSopenharmony_ci UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const; 5322e5b6d6dSopenharmony_ci UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } 5332e5b6d6dSopenharmony_ci 5342e5b6d6dSopenharmony_ci UBool hasCompBoundaryBefore(UChar32 c) const { 5352e5b6d6dSopenharmony_ci return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 5362e5b6d6dSopenharmony_ci } 5372e5b6d6dSopenharmony_ci UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const { 5382e5b6d6dSopenharmony_ci return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 5392e5b6d6dSopenharmony_ci } 5402e5b6d6dSopenharmony_ci UBool isCompInert(UChar32 c, UBool onlyContiguous) const { 5412e5b6d6dSopenharmony_ci uint16_t norm16=getNorm16(c); 5422e5b6d6dSopenharmony_ci return isCompYesAndZeroCC(norm16) && 5432e5b6d6dSopenharmony_ci (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 5442e5b6d6dSopenharmony_ci (!onlyContiguous || isInert(norm16) || *getMapping(norm16) <= 0x1ff); 5452e5b6d6dSopenharmony_ci } 5462e5b6d6dSopenharmony_ci 5472e5b6d6dSopenharmony_ci UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); } 5482e5b6d6dSopenharmony_ci UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); } 5492e5b6d6dSopenharmony_ci UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } 5502e5b6d6dSopenharmony_ciprivate: 5512e5b6d6dSopenharmony_ci friend class InitCanonIterData; 5522e5b6d6dSopenharmony_ci friend class LcccContext; 5532e5b6d6dSopenharmony_ci 5542e5b6d6dSopenharmony_ci UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 5552e5b6d6dSopenharmony_ci UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } 5562e5b6d6dSopenharmony_ci static UBool isInert(uint16_t norm16) { return norm16==INERT; } 5572e5b6d6dSopenharmony_ci static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; } 5582e5b6d6dSopenharmony_ci static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } 5592e5b6d6dSopenharmony_ci uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } 5602e5b6d6dSopenharmony_ci UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; } 5612e5b6d6dSopenharmony_ci UBool isHangulLVT(uint16_t norm16) const { 5622e5b6d6dSopenharmony_ci return norm16==hangulLVT(); 5632e5b6d6dSopenharmony_ci } 5642e5b6d6dSopenharmony_ci UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } 5652e5b6d6dSopenharmony_ci // UBool isCompYes(uint16_t norm16) const { 5662e5b6d6dSopenharmony_ci // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 5672e5b6d6dSopenharmony_ci // } 5682e5b6d6dSopenharmony_ci // UBool isCompYesOrMaybe(uint16_t norm16) const { 5692e5b6d6dSopenharmony_ci // return norm16<minNoNo || minMaybeYes<=norm16; 5702e5b6d6dSopenharmony_ci // } 5712e5b6d6dSopenharmony_ci // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { 5722e5b6d6dSopenharmony_ci // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 5732e5b6d6dSopenharmony_ci // } 5742e5b6d6dSopenharmony_ci UBool isDecompYesAndZeroCC(uint16_t norm16) const { 5752e5b6d6dSopenharmony_ci return norm16<minYesNo || 5762e5b6d6dSopenharmony_ci norm16==JAMO_VT || 5772e5b6d6dSopenharmony_ci (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 5782e5b6d6dSopenharmony_ci } 5792e5b6d6dSopenharmony_ci /** 5802e5b6d6dSopenharmony_ci * A little faster and simpler than isDecompYesAndZeroCC() but does not include 5812e5b6d6dSopenharmony_ci * the MaybeYes which combine-forward and have ccc=0. 5822e5b6d6dSopenharmony_ci * (Standard Unicode 10 normalization does not have such characters.) 5832e5b6d6dSopenharmony_ci */ 5842e5b6d6dSopenharmony_ci UBool isMostDecompYesAndZeroCC(uint16_t norm16) const { 5852e5b6d6dSopenharmony_ci return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 5862e5b6d6dSopenharmony_ci } 5872e5b6d6dSopenharmony_ci UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; } 5882e5b6d6dSopenharmony_ci 5892e5b6d6dSopenharmony_ci // For use with isCompYes(). 5902e5b6d6dSopenharmony_ci // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 5912e5b6d6dSopenharmony_ci // static uint8_t getCCFromYes(uint16_t norm16) { 5922e5b6d6dSopenharmony_ci // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 5932e5b6d6dSopenharmony_ci // } 5942e5b6d6dSopenharmony_ci uint8_t getCCFromNoNo(uint16_t norm16) const { 5952e5b6d6dSopenharmony_ci const uint16_t *mapping=getMapping(norm16); 5962e5b6d6dSopenharmony_ci if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { 5972e5b6d6dSopenharmony_ci return (uint8_t)*(mapping-1); 5982e5b6d6dSopenharmony_ci } else { 5992e5b6d6dSopenharmony_ci return 0; 6002e5b6d6dSopenharmony_ci } 6012e5b6d6dSopenharmony_ci } 6022e5b6d6dSopenharmony_ci // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 6032e5b6d6dSopenharmony_ci uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const { 6042e5b6d6dSopenharmony_ci if(norm16<=minYesNo) { 6052e5b6d6dSopenharmony_ci return 0; // yesYes and Hangul LV have ccc=tccc=0 6062e5b6d6dSopenharmony_ci } else { 6072e5b6d6dSopenharmony_ci // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 6082e5b6d6dSopenharmony_ci return (uint8_t)(*getMapping(norm16)>>8); // tccc from yesNo 6092e5b6d6dSopenharmony_ci } 6102e5b6d6dSopenharmony_ci } 6112e5b6d6dSopenharmony_ci uint8_t getPreviousTrailCC(const UChar *start, const UChar *p) const; 6122e5b6d6dSopenharmony_ci uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const; 6132e5b6d6dSopenharmony_ci 6142e5b6d6dSopenharmony_ci // Requires algorithmic-NoNo. 6152e5b6d6dSopenharmony_ci UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { 6162e5b6d6dSopenharmony_ci return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 6172e5b6d6dSopenharmony_ci } 6182e5b6d6dSopenharmony_ci UChar32 getAlgorithmicDelta(uint16_t norm16) const { 6192e5b6d6dSopenharmony_ci return (norm16>>DELTA_SHIFT)-centerNoNoDelta; 6202e5b6d6dSopenharmony_ci } 6212e5b6d6dSopenharmony_ci 6222e5b6d6dSopenharmony_ci // Requires minYesNo<norm16<limitNoNo. 6232e5b6d6dSopenharmony_ci const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); } 6242e5b6d6dSopenharmony_ci const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { 6252e5b6d6dSopenharmony_ci if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 6262e5b6d6dSopenharmony_ci return NULL; 6272e5b6d6dSopenharmony_ci } else if(norm16<minMaybeYes) { 6282e5b6d6dSopenharmony_ci return getMapping(norm16); // for yesYes; if Jamo L: harmless empty list 6292e5b6d6dSopenharmony_ci } else { 6302e5b6d6dSopenharmony_ci return maybeYesCompositions+norm16-minMaybeYes; 6312e5b6d6dSopenharmony_ci } 6322e5b6d6dSopenharmony_ci } 6332e5b6d6dSopenharmony_ci const uint16_t *getCompositionsListForComposite(uint16_t norm16) const { 6342e5b6d6dSopenharmony_ci // A composite has both mapping & compositions list. 6352e5b6d6dSopenharmony_ci const uint16_t *list=getMapping(norm16); 6362e5b6d6dSopenharmony_ci return list+ // mapping pointer 6372e5b6d6dSopenharmony_ci 1+ // +1 to skip the first unit with the mapping length 6382e5b6d6dSopenharmony_ci (*list&MAPPING_LENGTH_MASK); // + mapping length 6392e5b6d6dSopenharmony_ci } 6402e5b6d6dSopenharmony_ci const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const { 6412e5b6d6dSopenharmony_ci // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES 6422e5b6d6dSopenharmony_ci return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT); 6432e5b6d6dSopenharmony_ci } 6442e5b6d6dSopenharmony_ci /** 6452e5b6d6dSopenharmony_ci * @param c code point must have compositions 6462e5b6d6dSopenharmony_ci * @return compositions list pointer 6472e5b6d6dSopenharmony_ci */ 6482e5b6d6dSopenharmony_ci const uint16_t *getCompositionsList(uint16_t norm16) const { 6492e5b6d6dSopenharmony_ci return isDecompYes(norm16) ? 6502e5b6d6dSopenharmony_ci getCompositionsListForDecompYes(norm16) : 6512e5b6d6dSopenharmony_ci getCompositionsListForComposite(norm16); 6522e5b6d6dSopenharmony_ci } 6532e5b6d6dSopenharmony_ci 6542e5b6d6dSopenharmony_ci const UChar *copyLowPrefixFromNulTerminated(const UChar *src, 6552e5b6d6dSopenharmony_ci UChar32 minNeedDataCP, 6562e5b6d6dSopenharmony_ci ReorderingBuffer *buffer, 6572e5b6d6dSopenharmony_ci UErrorCode &errorCode) const; 6582e5b6d6dSopenharmony_ci 6592e5b6d6dSopenharmony_ci enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY }; 6602e5b6d6dSopenharmony_ci 6612e5b6d6dSopenharmony_ci const UChar *decomposeShort(const UChar *src, const UChar *limit, 6622e5b6d6dSopenharmony_ci UBool stopAtCompBoundary, UBool onlyContiguous, 6632e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, UErrorCode &errorCode) const; 6642e5b6d6dSopenharmony_ci UBool decompose(UChar32 c, uint16_t norm16, 6652e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, UErrorCode &errorCode) const; 6662e5b6d6dSopenharmony_ci 6672e5b6d6dSopenharmony_ci const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit, 6682e5b6d6dSopenharmony_ci StopAt stopAt, UBool onlyContiguous, 6692e5b6d6dSopenharmony_ci ReorderingBuffer &buffer, UErrorCode &errorCode) const; 6702e5b6d6dSopenharmony_ci 6712e5b6d6dSopenharmony_ci static int32_t combine(const uint16_t *list, UChar32 trail); 6722e5b6d6dSopenharmony_ci void addComposites(const uint16_t *list, UnicodeSet &set) const; 6732e5b6d6dSopenharmony_ci void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 6742e5b6d6dSopenharmony_ci UBool onlyContiguous) const; 6752e5b6d6dSopenharmony_ci 6762e5b6d6dSopenharmony_ci UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 6772e5b6d6dSopenharmony_ci return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 6782e5b6d6dSopenharmony_ci } 6792e5b6d6dSopenharmony_ci UBool norm16HasCompBoundaryBefore(uint16_t norm16) const { 6802e5b6d6dSopenharmony_ci return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 6812e5b6d6dSopenharmony_ci } 6822e5b6d6dSopenharmony_ci UBool hasCompBoundaryBefore(const UChar *src, const UChar *limit) const; 6832e5b6d6dSopenharmony_ci UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const; 6842e5b6d6dSopenharmony_ci UBool hasCompBoundaryAfter(const UChar *start, const UChar *p, 6852e5b6d6dSopenharmony_ci UBool onlyContiguous) const; 6862e5b6d6dSopenharmony_ci UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p, 6872e5b6d6dSopenharmony_ci UBool onlyContiguous) const; 6882e5b6d6dSopenharmony_ci UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const { 6892e5b6d6dSopenharmony_ci return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 6902e5b6d6dSopenharmony_ci (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 6912e5b6d6dSopenharmony_ci } 6922e5b6d6dSopenharmony_ci /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ 6932e5b6d6dSopenharmony_ci UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const { 6942e5b6d6dSopenharmony_ci return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 6952e5b6d6dSopenharmony_ci (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff); 6962e5b6d6dSopenharmony_ci } 6972e5b6d6dSopenharmony_ci 6982e5b6d6dSopenharmony_ci const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p, UBool onlyContiguous) const; 6992e5b6d6dSopenharmony_ci const UChar *findNextCompBoundary(const UChar *p, const UChar *limit, UBool onlyContiguous) const; 7002e5b6d6dSopenharmony_ci 7012e5b6d6dSopenharmony_ci const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; 7022e5b6d6dSopenharmony_ci const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; 7032e5b6d6dSopenharmony_ci 7042e5b6d6dSopenharmony_ci void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16, 7052e5b6d6dSopenharmony_ci CanonIterData &newData, UErrorCode &errorCode) const; 7062e5b6d6dSopenharmony_ci 7072e5b6d6dSopenharmony_ci int32_t getCanonValue(UChar32 c) const; 7082e5b6d6dSopenharmony_ci const UnicodeSet &getCanonStartSet(int32_t n) const; 7092e5b6d6dSopenharmony_ci 7102e5b6d6dSopenharmony_ci // UVersionInfo dataVersion; 7112e5b6d6dSopenharmony_ci 7122e5b6d6dSopenharmony_ci // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 7132e5b6d6dSopenharmony_ci UChar minDecompNoCP; 7142e5b6d6dSopenharmony_ci UChar minCompNoMaybeCP; 7152e5b6d6dSopenharmony_ci UChar minLcccCP; 7162e5b6d6dSopenharmony_ci 7172e5b6d6dSopenharmony_ci // Norm16 value thresholds for quick check combinations and types of extra data. 7182e5b6d6dSopenharmony_ci uint16_t minYesNo; 7192e5b6d6dSopenharmony_ci uint16_t minYesNoMappingsOnly; 7202e5b6d6dSopenharmony_ci uint16_t minNoNo; 7212e5b6d6dSopenharmony_ci uint16_t minNoNoCompBoundaryBefore; 7222e5b6d6dSopenharmony_ci uint16_t minNoNoCompNoMaybeCC; 7232e5b6d6dSopenharmony_ci uint16_t minNoNoEmpty; 7242e5b6d6dSopenharmony_ci uint16_t limitNoNo; 7252e5b6d6dSopenharmony_ci uint16_t centerNoNoDelta; 7262e5b6d6dSopenharmony_ci uint16_t minMaybeYes; 7272e5b6d6dSopenharmony_ci 7282e5b6d6dSopenharmony_ci const UCPTrie *normTrie; 7292e5b6d6dSopenharmony_ci const uint16_t *maybeYesCompositions; 7302e5b6d6dSopenharmony_ci const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 7312e5b6d6dSopenharmony_ci const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 7322e5b6d6dSopenharmony_ci 7332e5b6d6dSopenharmony_ci UInitOnce fCanonIterDataInitOnce {}; 7342e5b6d6dSopenharmony_ci CanonIterData *fCanonIterData; 7352e5b6d6dSopenharmony_ci}; 7362e5b6d6dSopenharmony_ci 7372e5b6d6dSopenharmony_ci// bits in canonIterData 7382e5b6d6dSopenharmony_ci#define CANON_NOT_SEGMENT_STARTER 0x80000000 7392e5b6d6dSopenharmony_ci#define CANON_HAS_COMPOSITIONS 0x40000000 7402e5b6d6dSopenharmony_ci#define CANON_HAS_SET 0x200000 7412e5b6d6dSopenharmony_ci#define CANON_VALUE_MASK 0x1fffff 7422e5b6d6dSopenharmony_ci 7432e5b6d6dSopenharmony_ci/** 7442e5b6d6dSopenharmony_ci * ICU-internal shortcut for quick access to standard Unicode normalization. 7452e5b6d6dSopenharmony_ci */ 7462e5b6d6dSopenharmony_ciclass U_COMMON_API Normalizer2Factory { 7472e5b6d6dSopenharmony_cipublic: 7482e5b6d6dSopenharmony_ci static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); 7492e5b6d6dSopenharmony_ci static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); 7502e5b6d6dSopenharmony_ci static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); 7512e5b6d6dSopenharmony_ci 7522e5b6d6dSopenharmony_ci static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode); 7532e5b6d6dSopenharmony_ci 7542e5b6d6dSopenharmony_ci static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); 7552e5b6d6dSopenharmony_ci static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); 7562e5b6d6dSopenharmony_ci static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); 7572e5b6d6dSopenharmony_ci 7582e5b6d6dSopenharmony_ci // Get the Impl instance of the Normalizer2. 7592e5b6d6dSopenharmony_ci // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. 7602e5b6d6dSopenharmony_ci static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); 7612e5b6d6dSopenharmony_ciprivate: 7622e5b6d6dSopenharmony_ci Normalizer2Factory() = delete; // No instantiation. 7632e5b6d6dSopenharmony_ci}; 7642e5b6d6dSopenharmony_ci 7652e5b6d6dSopenharmony_ciU_NAMESPACE_END 7662e5b6d6dSopenharmony_ci 7672e5b6d6dSopenharmony_ciU_CAPI int32_t U_EXPORT2 7682e5b6d6dSopenharmony_ciunorm2_swap(const UDataSwapper *ds, 7692e5b6d6dSopenharmony_ci const void *inData, int32_t length, void *outData, 7702e5b6d6dSopenharmony_ci UErrorCode *pErrorCode); 7712e5b6d6dSopenharmony_ci 7722e5b6d6dSopenharmony_ci/** 7732e5b6d6dSopenharmony_ci * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 7742e5b6d6dSopenharmony_ci * @internal 7752e5b6d6dSopenharmony_ci */ 7762e5b6d6dSopenharmony_ciU_CFUNC UNormalizationCheckResult 7772e5b6d6dSopenharmony_ciunorm_getQuickCheck(UChar32 c, UNormalizationMode mode); 7782e5b6d6dSopenharmony_ci 7792e5b6d6dSopenharmony_ci/** 7802e5b6d6dSopenharmony_ci * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue(). 7812e5b6d6dSopenharmony_ci * @internal 7822e5b6d6dSopenharmony_ci */ 7832e5b6d6dSopenharmony_ciU_CFUNC uint16_t 7842e5b6d6dSopenharmony_ciunorm_getFCD16(UChar32 c); 7852e5b6d6dSopenharmony_ci 7862e5b6d6dSopenharmony_ci/** 7872e5b6d6dSopenharmony_ci * Format of Normalizer2 .nrm data files. 7882e5b6d6dSopenharmony_ci * Format version 4.0. 7892e5b6d6dSopenharmony_ci * 7902e5b6d6dSopenharmony_ci * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms. 7912e5b6d6dSopenharmony_ci * ICU ships with data files for standard Unicode Normalization Forms 7922e5b6d6dSopenharmony_ci * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm). 7932e5b6d6dSopenharmony_ci * Custom (application-specific) data can be built into additional .nrm files 7942e5b6d6dSopenharmony_ci * with the gennorm2 build tool. 7952e5b6d6dSopenharmony_ci * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46. 7962e5b6d6dSopenharmony_ci * 7972e5b6d6dSopenharmony_ci * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been 7982e5b6d6dSopenharmony_ci * cached already. Internally, Normalizer2Impl.load() reads the .nrm file. 7992e5b6d6dSopenharmony_ci * 8002e5b6d6dSopenharmony_ci * A .nrm file begins with a standard ICU data file header 8012e5b6d6dSopenharmony_ci * (DataHeader, see ucmndata.h and unicode/udata.h). 8022e5b6d6dSopenharmony_ci * The UDataInfo.dataVersion field usually contains the Unicode version 8032e5b6d6dSopenharmony_ci * for which the data was generated. 8042e5b6d6dSopenharmony_ci * 8052e5b6d6dSopenharmony_ci * After the header, the file contains the following parts. 8062e5b6d6dSopenharmony_ci * Constants are defined as enum values of the Normalizer2Impl class. 8072e5b6d6dSopenharmony_ci * 8082e5b6d6dSopenharmony_ci * Many details of the data structures are described in the design doc 8092e5b6d6dSopenharmony_ci * which is at https://icu.unicode.org/design/normalization/custom 8102e5b6d6dSopenharmony_ci * 8112e5b6d6dSopenharmony_ci * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4; 8122e5b6d6dSopenharmony_ci * 8132e5b6d6dSopenharmony_ci * The first eight indexes are byte offsets in ascending order. 8142e5b6d6dSopenharmony_ci * Each byte offset marks the start of the next part in the data file, 8152e5b6d6dSopenharmony_ci * and the end of the previous one. 8162e5b6d6dSopenharmony_ci * When two consecutive byte offsets are the same, then the corresponding part is empty. 8172e5b6d6dSopenharmony_ci * Byte offsets are offsets from after the header, 8182e5b6d6dSopenharmony_ci * that is, from the beginning of the indexes[]. 8192e5b6d6dSopenharmony_ci * Each part starts at an offset with proper alignment for its data. 8202e5b6d6dSopenharmony_ci * If necessary, the previous part may include padding bytes to achieve this alignment. 8212e5b6d6dSopenharmony_ci * 8222e5b6d6dSopenharmony_ci * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point 8232e5b6d6dSopenharmony_ci * with a decomposition mapping, that is, with NF*D_QC=No. 8242e5b6d6dSopenharmony_ci * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point 8252e5b6d6dSopenharmony_ci * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward). 8262e5b6d6dSopenharmony_ci * minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3) 8272e5b6d6dSopenharmony_ci * is the lowest code point with lccc!=0. 8282e5b6d6dSopenharmony_ci * 8292e5b6d6dSopenharmony_ci * The next eight indexes are thresholds of 16-bit trie values for ranges of 8302e5b6d6dSopenharmony_ci * values indicating multiple normalization properties. 8312e5b6d6dSopenharmony_ci * They are listed here in threshold order, not in the order they are stored in the indexes. 8322e5b6d6dSopenharmony_ci * minYesNo=indexes[IX_MIN_YES_NO]; 8332e5b6d6dSopenharmony_ci * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 8342e5b6d6dSopenharmony_ci * minNoNo=indexes[IX_MIN_NO_NO]; 8352e5b6d6dSopenharmony_ci * minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 8362e5b6d6dSopenharmony_ci * minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 8372e5b6d6dSopenharmony_ci * minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY]; 8382e5b6d6dSopenharmony_ci * limitNoNo=indexes[IX_LIMIT_NO_NO]; 8392e5b6d6dSopenharmony_ci * minMaybeYes=indexes[IX_MIN_MAYBE_YES]; 8402e5b6d6dSopenharmony_ci * See the normTrie description below and the design doc for details. 8412e5b6d6dSopenharmony_ci * 8422e5b6d6dSopenharmony_ci * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie 8432e5b6d6dSopenharmony_ci * 8442e5b6d6dSopenharmony_ci * The trie holds the main normalization data. Each code point is mapped to a 16-bit value. 8452e5b6d6dSopenharmony_ci * Rather than using independent bits in the value (which would require more than 16 bits), 8462e5b6d6dSopenharmony_ci * information is extracted primarily via range checks. 8472e5b6d6dSopenharmony_ci * Except, format version 3 uses bit 0 for hasCompBoundaryAfter(). 8482e5b6d6dSopenharmony_ci * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo 8492e5b6d6dSopenharmony_ci * means that the character has NF*C_QC=Yes and NF*D_QC=No properties, 8502e5b6d6dSopenharmony_ci * which means it has a two-way (round-trip) decomposition mapping. 8512e5b6d6dSopenharmony_ci * Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData 8522e5b6d6dSopenharmony_ci * pointing to mappings, compositions lists, or both. 8532e5b6d6dSopenharmony_ci * Value norm16==INERT (0 in versions 1 & 2, 1 in version 3) 8542e5b6d6dSopenharmony_ci * means that the character is normalization-inert, that is, 8552e5b6d6dSopenharmony_ci * it does not have a mapping, does not participate in composition, has a zero 8562e5b6d6dSopenharmony_ci * canonical combining class, and forms a boundary where text before it and after it 8572e5b6d6dSopenharmony_ci * can be normalized independently. 8582e5b6d6dSopenharmony_ci * For details about how multiple properties are encoded in 16-bit values 8592e5b6d6dSopenharmony_ci * see the design doc. 8602e5b6d6dSopenharmony_ci * Note that the encoding cannot express all combinations of the properties involved; 8612e5b6d6dSopenharmony_ci * it only supports those combinations that are allowed by 8622e5b6d6dSopenharmony_ci * the Unicode Normalization algorithms. Details are in the design doc as well. 8632e5b6d6dSopenharmony_ci * The gennorm2 tool only builds .nrm files for data that conforms to the limitations. 8642e5b6d6dSopenharmony_ci * 8652e5b6d6dSopenharmony_ci * The trie has a value for each lead surrogate code unit representing the "worst case" 8662e5b6d6dSopenharmony_ci * properties of the 1024 supplementary characters whose UTF-16 form starts with 8672e5b6d6dSopenharmony_ci * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert, 8682e5b6d6dSopenharmony_ci * then their lead surrogate code unit has the trie value INERT. 8692e5b6d6dSopenharmony_ci * When the lead surrogate unit's value exceeds the quick check minimum during processing, 8702e5b6d6dSopenharmony_ci * the properties for the full supplementary code point need to be looked up. 8712e5b6d6dSopenharmony_ci * 8722e5b6d6dSopenharmony_ci * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]; 8732e5b6d6dSopenharmony_ci * uint16_t extraData[]; 8742e5b6d6dSopenharmony_ci * 8752e5b6d6dSopenharmony_ci * There is only one byte offset for the end of these two arrays. 8762e5b6d6dSopenharmony_ci * The split between them is given by the constant and variable mentioned above. 8772e5b6d6dSopenharmony_ci * In version 3, the difference must be shifted right by OFFSET_SHIFT. 8782e5b6d6dSopenharmony_ci * 8792e5b6d6dSopenharmony_ci * The maybeYesCompositions array contains compositions lists for characters that 8802e5b6d6dSopenharmony_ci * combine both forward (as starters in composition pairs) 8812e5b6d6dSopenharmony_ci * and backward (as trailing characters in composition pairs). 8822e5b6d6dSopenharmony_ci * Such characters do not occur in Unicode 5.2 but are allowed by 8832e5b6d6dSopenharmony_ci * the Unicode Normalization algorithms. 8842e5b6d6dSopenharmony_ci * If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES 8852e5b6d6dSopenharmony_ci * and the maybeYesCompositions array is empty. 8862e5b6d6dSopenharmony_ci * If there are such characters, then minMaybeYes is subtracted from their norm16 values 8872e5b6d6dSopenharmony_ci * to get the index into this array. 8882e5b6d6dSopenharmony_ci * 8892e5b6d6dSopenharmony_ci * The extraData array contains compositions lists for "YesYes" characters, 8902e5b6d6dSopenharmony_ci * followed by mappings and optional compositions lists for "YesNo" characters, 8912e5b6d6dSopenharmony_ci * followed by only mappings for "NoNo" characters. 8922e5b6d6dSopenharmony_ci * (Referring to pairs of NFC/NFD quick check values.) 8932e5b6d6dSopenharmony_ci * The norm16 values of those characters are directly indexes into the extraData array. 8942e5b6d6dSopenharmony_ci * In version 3, the norm16 values must be shifted right by OFFSET_SHIFT 8952e5b6d6dSopenharmony_ci * for accessing extraData. 8962e5b6d6dSopenharmony_ci * 8972e5b6d6dSopenharmony_ci * The data structures for compositions lists and mappings are described in the design doc. 8982e5b6d6dSopenharmony_ci * 8992e5b6d6dSopenharmony_ci * uint8_t smallFCD[0x100]; -- new in format version 2 9002e5b6d6dSopenharmony_ci * 9012e5b6d6dSopenharmony_ci * This is a bit set to help speed up FCD value lookups in the absence of a full 9022e5b6d6dSopenharmony_ci * UTrie2 or other large data structure with the full FCD value mapping. 9032e5b6d6dSopenharmony_ci * 9042e5b6d6dSopenharmony_ci * Each smallFCD bit is set if any of the corresponding 32 BMP code points 9052e5b6d6dSopenharmony_ci * has a non-zero FCD value (lccc!=0 or tccc!=0). 9062e5b6d6dSopenharmony_ci * Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF. 9072e5b6d6dSopenharmony_ci * A bit for 32 lead surrogates is set if any of the 32k corresponding 9082e5b6d6dSopenharmony_ci * _supplementary_ code points has a non-zero FCD value. 9092e5b6d6dSopenharmony_ci * 9102e5b6d6dSopenharmony_ci * This bit set is most useful for the large blocks of CJK characters with FCD=0. 9112e5b6d6dSopenharmony_ci * 9122e5b6d6dSopenharmony_ci * Changes from format version 1 to format version 2 --------------------------- 9132e5b6d6dSopenharmony_ci * 9142e5b6d6dSopenharmony_ci * - Addition of data for raw (not recursively decomposed) mappings. 9152e5b6d6dSopenharmony_ci * + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when 9162e5b6d6dSopenharmony_ci * the mapping is to an empty string or when the character combines-forward. 9172e5b6d6dSopenharmony_ci * This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which 9182e5b6d6dSopenharmony_ci * is then repurposed for the MAPPING_HAS_RAW_MAPPING bit. 9192e5b6d6dSopenharmony_ci * + For details see the design doc. 9202e5b6d6dSopenharmony_ci * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into 9212e5b6d6dSopenharmony_ci * distinct ranges (combines-forward vs. not) 9222e5b6d6dSopenharmony_ci * so that a range check can be used to find out if there is a compositions list. 9232e5b6d6dSopenharmony_ci * This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag. 9242e5b6d6dSopenharmony_ci * It is needed for the new (in ICU 49) composePair(), not for other normalization. 9252e5b6d6dSopenharmony_ci * - Addition of the smallFCD[] bit set. 9262e5b6d6dSopenharmony_ci * 9272e5b6d6dSopenharmony_ci * Changes from format version 2 to format version 3 (ICU 60) ------------------ 9282e5b6d6dSopenharmony_ci * 9292e5b6d6dSopenharmony_ci * - norm16 bit 0 indicates hasCompBoundaryAfter(), 9302e5b6d6dSopenharmony_ci * except that for contiguous composition (FCC) the tccc must be checked as well. 9312e5b6d6dSopenharmony_ci * Data indexes and ccc values are shifted left by one (OFFSET_SHIFT). 9322e5b6d6dSopenharmony_ci * Thresholds like minNoNo are tested before shifting. 9332e5b6d6dSopenharmony_ci * 9342e5b6d6dSopenharmony_ci * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT), 9352e5b6d6dSopenharmony_ci * to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater. 9362e5b6d6dSopenharmony_ci * See DELTA_TCCC_MASK etc. 9372e5b6d6dSopenharmony_ci * This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter(). 9382e5b6d6dSopenharmony_ci * minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly. 9392e5b6d6dSopenharmony_ci * 9402e5b6d6dSopenharmony_ci * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters, 9412e5b6d6dSopenharmony_ci * and ASCII characters are mapped algorithmically only to other ASCII characters. 9422e5b6d6dSopenharmony_ci * This helps with hasCompBoundaryBefore() and compose() fast paths. 9432e5b6d6dSopenharmony_ci * It is never necessary any more to loop for algorithmic mappings. 9442e5b6d6dSopenharmony_ci * 9452e5b6d6dSopenharmony_ci * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE], 9462e5b6d6dSopenharmony_ci * indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY], 9472e5b6d6dSopenharmony_ci * and separation of the noNo extraData into distinct ranges. 9482e5b6d6dSopenharmony_ci * With this, the noNo norm16 value indicates whether the mapping is 9492e5b6d6dSopenharmony_ci * compose-normalized, not normalized but hasCompBoundaryBefore(), 9502e5b6d6dSopenharmony_ci * not even that, or maps to an empty string. 9512e5b6d6dSopenharmony_ci * hasCompBoundaryBefore() can be determined solely from the norm16 value. 9522e5b6d6dSopenharmony_ci * 9532e5b6d6dSopenharmony_ci * - The norm16 value for Hangul LVT is now different from that for Hangul LV, 9542e5b6d6dSopenharmony_ci * so that hasCompBoundaryAfter() need not check for the syllable type. 9552e5b6d6dSopenharmony_ci * For Hangul LV, minYesNo continues to be used (no comp-boundary-after). 9562e5b6d6dSopenharmony_ci * For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used. 9572e5b6d6dSopenharmony_ci * The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively, 9582e5b6d6dSopenharmony_ci * to simplify some code. 9592e5b6d6dSopenharmony_ci * 9602e5b6d6dSopenharmony_ci * - The extraData firstUnit bit 5 is no longer necessary 9612e5b6d6dSopenharmony_ci * (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER), 9622e5b6d6dSopenharmony_ci * is reserved again, and always set to 0. 9632e5b6d6dSopenharmony_ci * 9642e5b6d6dSopenharmony_ci * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0. 9652e5b6d6dSopenharmony_ci * This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower: 9662e5b6d6dSopenharmony_ci * U+00AD Soft Hyphen maps to an empty string, 9672e5b6d6dSopenharmony_ci * which is artificially assigned "worst case" values lccc=1 and tccc=255. 9682e5b6d6dSopenharmony_ci * 9692e5b6d6dSopenharmony_ci * - A mapping to an empty string has explicit lccc=1 and tccc=255 values. 9702e5b6d6dSopenharmony_ci * 9712e5b6d6dSopenharmony_ci * Changes from format version 3 to format version 4 (ICU 63) ------------------ 9722e5b6d6dSopenharmony_ci * 9732e5b6d6dSopenharmony_ci * Switched from UTrie2 to UCPTrie/CodePointTrie. 9742e5b6d6dSopenharmony_ci * 9752e5b6d6dSopenharmony_ci * The new trie no longer stores different values for surrogate code *units* vs. 9762e5b6d6dSopenharmony_ci * surrogate code *points*. 9772e5b6d6dSopenharmony_ci * Lead surrogates still have values for optimized UTF-16 string processing. 9782e5b6d6dSopenharmony_ci * When looking up code point properties, the code now checks for lead surrogates and 9792e5b6d6dSopenharmony_ci * treats them as inert. 9802e5b6d6dSopenharmony_ci * 9812e5b6d6dSopenharmony_ci * gennorm2 now has to reject mappings for surrogate code points. 9822e5b6d6dSopenharmony_ci * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its 9832e5b6d6dSopenharmony_ci * custom normalization data file. 9842e5b6d6dSopenharmony_ci */ 9852e5b6d6dSopenharmony_ci 9862e5b6d6dSopenharmony_ci#endif /* !UCONFIG_NO_NORMALIZATION */ 9872e5b6d6dSopenharmony_ci#endif /* __NORMALIZER2IMPL_H__ */ 988