11cb0ef41Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 31cb0ef41Sopenharmony_ci/* 41cb0ef41Sopenharmony_ci******************************************************************************* 51cb0ef41Sopenharmony_ci* 61cb0ef41Sopenharmony_ci* Copyright (C) 2009-2014, International Business Machines 71cb0ef41Sopenharmony_ci* Corporation and others. All Rights Reserved. 81cb0ef41Sopenharmony_ci* 91cb0ef41Sopenharmony_ci******************************************************************************* 101cb0ef41Sopenharmony_ci* file name: normalizer2impl.cpp 111cb0ef41Sopenharmony_ci* encoding: UTF-8 121cb0ef41Sopenharmony_ci* tab size: 8 (not used) 131cb0ef41Sopenharmony_ci* indentation:4 141cb0ef41Sopenharmony_ci* 151cb0ef41Sopenharmony_ci* created on: 2009nov22 161cb0ef41Sopenharmony_ci* created by: Markus W. Scherer 171cb0ef41Sopenharmony_ci*/ 181cb0ef41Sopenharmony_ci 191cb0ef41Sopenharmony_ci// #define UCPTRIE_DEBUG 201cb0ef41Sopenharmony_ci 211cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ci#if !UCONFIG_NO_NORMALIZATION 241cb0ef41Sopenharmony_ci 251cb0ef41Sopenharmony_ci#include "unicode/bytestream.h" 261cb0ef41Sopenharmony_ci#include "unicode/edits.h" 271cb0ef41Sopenharmony_ci#include "unicode/normalizer2.h" 281cb0ef41Sopenharmony_ci#include "unicode/stringoptions.h" 291cb0ef41Sopenharmony_ci#include "unicode/ucptrie.h" 301cb0ef41Sopenharmony_ci#include "unicode/udata.h" 311cb0ef41Sopenharmony_ci#include "unicode/umutablecptrie.h" 321cb0ef41Sopenharmony_ci#include "unicode/ustring.h" 331cb0ef41Sopenharmony_ci#include "unicode/utf16.h" 341cb0ef41Sopenharmony_ci#include "unicode/utf8.h" 351cb0ef41Sopenharmony_ci#include "bytesinkutil.h" 361cb0ef41Sopenharmony_ci#include "cmemory.h" 371cb0ef41Sopenharmony_ci#include "mutex.h" 381cb0ef41Sopenharmony_ci#include "normalizer2impl.h" 391cb0ef41Sopenharmony_ci#include 
"putilimp.h" 401cb0ef41Sopenharmony_ci#include "uassert.h" 411cb0ef41Sopenharmony_ci#include "ucptrie_impl.h" 421cb0ef41Sopenharmony_ci#include "uset_imp.h" 431cb0ef41Sopenharmony_ci#include "uvector.h" 441cb0ef41Sopenharmony_ci 451cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN 461cb0ef41Sopenharmony_ci 471cb0ef41Sopenharmony_cinamespace { 481cb0ef41Sopenharmony_ci 491cb0ef41Sopenharmony_ci/** 501cb0ef41Sopenharmony_ci * UTF-8 lead byte for minNoMaybeCP. 511cb0ef41Sopenharmony_ci * Can be lower than the actual lead byte for c. 521cb0ef41Sopenharmony_ci * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold. 531cb0ef41Sopenharmony_ci */ 541cb0ef41Sopenharmony_ciinline uint8_t leadByteForCP(UChar32 c) { 551cb0ef41Sopenharmony_ci if (c <= 0x7f) { 561cb0ef41Sopenharmony_ci return (uint8_t)c; 571cb0ef41Sopenharmony_ci } else if (c <= 0x7ff) { 581cb0ef41Sopenharmony_ci return (uint8_t)(0xc0+(c>>6)); 591cb0ef41Sopenharmony_ci } else { 601cb0ef41Sopenharmony_ci // Should not occur because ccc(U+0300)!=0. 611cb0ef41Sopenharmony_ci return 0xe0; 621cb0ef41Sopenharmony_ci } 631cb0ef41Sopenharmony_ci} 641cb0ef41Sopenharmony_ci 651cb0ef41Sopenharmony_ci/** 661cb0ef41Sopenharmony_ci * Returns the code point from one single well-formed UTF-8 byte sequence 671cb0ef41Sopenharmony_ci * between cpStart and cpLimit. 681cb0ef41Sopenharmony_ci * 691cb0ef41Sopenharmony_ci * Trie UTF-8 macros do not assemble whole code points (for efficiency). 701cb0ef41Sopenharmony_ci * When we do need the code point, we call this function. 711cb0ef41Sopenharmony_ci * We should not need it for normalization-inert data (norm16==0). 721cb0ef41Sopenharmony_ci * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points. 731cb0ef41Sopenharmony_ci */ 741cb0ef41Sopenharmony_ciUChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) { 751cb0ef41Sopenharmony_ci // Similar to U8_NEXT_UNSAFE(s, i, c). 
761cb0ef41Sopenharmony_ci U_ASSERT(cpStart < cpLimit); 771cb0ef41Sopenharmony_ci uint8_t c = *cpStart; 781cb0ef41Sopenharmony_ci switch(cpLimit-cpStart) { 791cb0ef41Sopenharmony_ci case 1: 801cb0ef41Sopenharmony_ci return c; 811cb0ef41Sopenharmony_ci case 2: 821cb0ef41Sopenharmony_ci return ((c&0x1f)<<6) | (cpStart[1]&0x3f); 831cb0ef41Sopenharmony_ci case 3: 841cb0ef41Sopenharmony_ci // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) 851cb0ef41Sopenharmony_ci return (char16_t)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f)); 861cb0ef41Sopenharmony_ci case 4: 871cb0ef41Sopenharmony_ci return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f); 881cb0ef41Sopenharmony_ci default: 891cb0ef41Sopenharmony_ci UPRV_UNREACHABLE_EXIT; // Should not occur. 901cb0ef41Sopenharmony_ci } 911cb0ef41Sopenharmony_ci} 921cb0ef41Sopenharmony_ci 931cb0ef41Sopenharmony_ci/** 941cb0ef41Sopenharmony_ci * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF. 951cb0ef41Sopenharmony_ci * Otherwise returns a negative value. 
961cb0ef41Sopenharmony_ci */ 971cb0ef41Sopenharmony_ciUChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) { 981cb0ef41Sopenharmony_ci if ((p - start) >= 3) { 991cb0ef41Sopenharmony_ci p -= 3; 1001cb0ef41Sopenharmony_ci uint8_t l = *p; 1011cb0ef41Sopenharmony_ci uint8_t t1, t2; 1021cb0ef41Sopenharmony_ci if (0xe1 <= l && l <= 0xed && 1031cb0ef41Sopenharmony_ci (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f && 1041cb0ef41Sopenharmony_ci (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f && 1051cb0ef41Sopenharmony_ci (l < 0xed || t1 <= 0x1f)) { 1061cb0ef41Sopenharmony_ci return ((l & 0xf) << 12) | (t1 << 6) | t2; 1071cb0ef41Sopenharmony_ci } 1081cb0ef41Sopenharmony_ci } 1091cb0ef41Sopenharmony_ci return U_SENTINEL; 1101cb0ef41Sopenharmony_ci} 1111cb0ef41Sopenharmony_ci 1121cb0ef41Sopenharmony_ci/** 1131cb0ef41Sopenharmony_ci * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point. 1141cb0ef41Sopenharmony_ci * Otherwise returns a negative value. 1151cb0ef41Sopenharmony_ci */ 1161cb0ef41Sopenharmony_ciint32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) { 1171cb0ef41Sopenharmony_ci // Jamo T: E1 86 A8..E1 87 82 1181cb0ef41Sopenharmony_ci if ((limit - src) >= 3 && *src == 0xe1) { 1191cb0ef41Sopenharmony_ci if (src[1] == 0x86) { 1201cb0ef41Sopenharmony_ci uint8_t t = src[2]; 1211cb0ef41Sopenharmony_ci // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7. 1221cb0ef41Sopenharmony_ci // Offset 0 does not correspond to any conjoining Jamo. 
1231cb0ef41Sopenharmony_ci if (0xa8 <= t && t <= 0xbf) { 1241cb0ef41Sopenharmony_ci return t - 0xa7; 1251cb0ef41Sopenharmony_ci } 1261cb0ef41Sopenharmony_ci } else if (src[1] == 0x87) { 1271cb0ef41Sopenharmony_ci uint8_t t = src[2]; 1281cb0ef41Sopenharmony_ci if ((int8_t)t <= (int8_t)0x82u) { 1291cb0ef41Sopenharmony_ci return t - (0xa7 - 0x40); 1301cb0ef41Sopenharmony_ci } 1311cb0ef41Sopenharmony_ci } 1321cb0ef41Sopenharmony_ci } 1331cb0ef41Sopenharmony_ci return -1; 1341cb0ef41Sopenharmony_ci} 1351cb0ef41Sopenharmony_ci 1361cb0ef41Sopenharmony_civoid 1371cb0ef41Sopenharmony_ciappendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta, 1381cb0ef41Sopenharmony_ci ByteSink &sink, Edits *edits) { 1391cb0ef41Sopenharmony_ci char buffer[U8_MAX_LENGTH]; 1401cb0ef41Sopenharmony_ci int32_t length; 1411cb0ef41Sopenharmony_ci int32_t cpLength = (int32_t)(cpLimit - cpStart); 1421cb0ef41Sopenharmony_ci if (cpLength == 1) { 1431cb0ef41Sopenharmony_ci // The builder makes ASCII map to ASCII. 1441cb0ef41Sopenharmony_ci buffer[0] = (uint8_t)(*cpStart + delta); 1451cb0ef41Sopenharmony_ci length = 1; 1461cb0ef41Sopenharmony_ci } else { 1471cb0ef41Sopenharmony_ci int32_t trail = *(cpLimit-1) + delta; 1481cb0ef41Sopenharmony_ci if (0x80 <= trail && trail <= 0xbf) { 1491cb0ef41Sopenharmony_ci // The delta only changes the last trail byte. 1501cb0ef41Sopenharmony_ci --cpLimit; 1511cb0ef41Sopenharmony_ci length = 0; 1521cb0ef41Sopenharmony_ci do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit); 1531cb0ef41Sopenharmony_ci buffer[length++] = (uint8_t)trail; 1541cb0ef41Sopenharmony_ci } else { 1551cb0ef41Sopenharmony_ci // Decode the code point, add the delta, re-encode. 
1561cb0ef41Sopenharmony_ci UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta; 1571cb0ef41Sopenharmony_ci length = 0; 1581cb0ef41Sopenharmony_ci U8_APPEND_UNSAFE(buffer, length, c); 1591cb0ef41Sopenharmony_ci } 1601cb0ef41Sopenharmony_ci } 1611cb0ef41Sopenharmony_ci if (edits != nullptr) { 1621cb0ef41Sopenharmony_ci edits->addReplace(cpLength, length); 1631cb0ef41Sopenharmony_ci } 1641cb0ef41Sopenharmony_ci sink.Append(buffer, length); 1651cb0ef41Sopenharmony_ci} 1661cb0ef41Sopenharmony_ci 1671cb0ef41Sopenharmony_ci} // namespace 1681cb0ef41Sopenharmony_ci 1691cb0ef41Sopenharmony_ci// ReorderingBuffer -------------------------------------------------------- *** 1701cb0ef41Sopenharmony_ci 1711cb0ef41Sopenharmony_ciReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, 1721cb0ef41Sopenharmony_ci UErrorCode &errorCode) : 1731cb0ef41Sopenharmony_ci impl(ni), str(dest), 1741cb0ef41Sopenharmony_ci start(str.getBuffer(8)), reorderStart(start), limit(start), 1751cb0ef41Sopenharmony_ci remainingCapacity(str.getCapacity()), lastCC(0) { 1761cb0ef41Sopenharmony_ci if (start == nullptr && U_SUCCESS(errorCode)) { 1771cb0ef41Sopenharmony_ci // getBuffer() already did str.setToBogus() 1781cb0ef41Sopenharmony_ci errorCode = U_MEMORY_ALLOCATION_ERROR; 1791cb0ef41Sopenharmony_ci } 1801cb0ef41Sopenharmony_ci} 1811cb0ef41Sopenharmony_ci 1821cb0ef41Sopenharmony_ciUBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 1831cb0ef41Sopenharmony_ci int32_t length=str.length(); 1841cb0ef41Sopenharmony_ci start=str.getBuffer(destCapacity); 1851cb0ef41Sopenharmony_ci if(start==nullptr) { 1861cb0ef41Sopenharmony_ci // getBuffer() already did str.setToBogus() 1871cb0ef41Sopenharmony_ci errorCode=U_MEMORY_ALLOCATION_ERROR; 1881cb0ef41Sopenharmony_ci return false; 1891cb0ef41Sopenharmony_ci } 1901cb0ef41Sopenharmony_ci limit=start+length; 1911cb0ef41Sopenharmony_ci remainingCapacity=str.getCapacity()-length; 1921cb0ef41Sopenharmony_ci 
reorderStart=start; 1931cb0ef41Sopenharmony_ci if(start==limit) { 1941cb0ef41Sopenharmony_ci lastCC=0; 1951cb0ef41Sopenharmony_ci } else { 1961cb0ef41Sopenharmony_ci setIterator(); 1971cb0ef41Sopenharmony_ci lastCC=previousCC(); 1981cb0ef41Sopenharmony_ci // Set reorderStart after the last code point with cc<=1 if there is one. 1991cb0ef41Sopenharmony_ci if(lastCC>1) { 2001cb0ef41Sopenharmony_ci while(previousCC()>1) {} 2011cb0ef41Sopenharmony_ci } 2021cb0ef41Sopenharmony_ci reorderStart=codePointLimit; 2031cb0ef41Sopenharmony_ci } 2041cb0ef41Sopenharmony_ci return true; 2051cb0ef41Sopenharmony_ci} 2061cb0ef41Sopenharmony_ci 2071cb0ef41Sopenharmony_ciUBool ReorderingBuffer::equals(const char16_t *otherStart, const char16_t *otherLimit) const { 2081cb0ef41Sopenharmony_ci int32_t length=(int32_t)(limit-start); 2091cb0ef41Sopenharmony_ci return 2101cb0ef41Sopenharmony_ci length==(int32_t)(otherLimit-otherStart) && 2111cb0ef41Sopenharmony_ci 0==u_memcmp(start, otherStart, length); 2121cb0ef41Sopenharmony_ci} 2131cb0ef41Sopenharmony_ci 2141cb0ef41Sopenharmony_ciUBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const { 2151cb0ef41Sopenharmony_ci U_ASSERT((otherLimit - otherStart) <= INT32_MAX); // ensured by caller 2161cb0ef41Sopenharmony_ci int32_t length = (int32_t)(limit - start); 2171cb0ef41Sopenharmony_ci int32_t otherLength = (int32_t)(otherLimit - otherStart); 2181cb0ef41Sopenharmony_ci // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long. 2191cb0ef41Sopenharmony_ci if (otherLength < length || (otherLength / 3) > length) { 2201cb0ef41Sopenharmony_ci return false; 2211cb0ef41Sopenharmony_ci } 2221cb0ef41Sopenharmony_ci // Compare valid strings from between normalization boundaries. 2231cb0ef41Sopenharmony_ci // (Invalid sequences are normalization-inert.) 
2241cb0ef41Sopenharmony_ci for (int32_t i = 0, j = 0;;) { 2251cb0ef41Sopenharmony_ci if (i >= length) { 2261cb0ef41Sopenharmony_ci return j >= otherLength; 2271cb0ef41Sopenharmony_ci } else if (j >= otherLength) { 2281cb0ef41Sopenharmony_ci return false; 2291cb0ef41Sopenharmony_ci } 2301cb0ef41Sopenharmony_ci // Not at the end of either string yet. 2311cb0ef41Sopenharmony_ci UChar32 c, other; 2321cb0ef41Sopenharmony_ci U16_NEXT_UNSAFE(start, i, c); 2331cb0ef41Sopenharmony_ci U8_NEXT_UNSAFE(otherStart, j, other); 2341cb0ef41Sopenharmony_ci if (c != other) { 2351cb0ef41Sopenharmony_ci return false; 2361cb0ef41Sopenharmony_ci } 2371cb0ef41Sopenharmony_ci } 2381cb0ef41Sopenharmony_ci} 2391cb0ef41Sopenharmony_ci 2401cb0ef41Sopenharmony_ciUBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 2411cb0ef41Sopenharmony_ci if(remainingCapacity<2 && !resize(2, errorCode)) { 2421cb0ef41Sopenharmony_ci return false; 2431cb0ef41Sopenharmony_ci } 2441cb0ef41Sopenharmony_ci if(lastCC<=cc || cc==0) { 2451cb0ef41Sopenharmony_ci limit[0]=U16_LEAD(c); 2461cb0ef41Sopenharmony_ci limit[1]=U16_TRAIL(c); 2471cb0ef41Sopenharmony_ci limit+=2; 2481cb0ef41Sopenharmony_ci lastCC=cc; 2491cb0ef41Sopenharmony_ci if(cc<=1) { 2501cb0ef41Sopenharmony_ci reorderStart=limit; 2511cb0ef41Sopenharmony_ci } 2521cb0ef41Sopenharmony_ci } else { 2531cb0ef41Sopenharmony_ci insert(c, cc); 2541cb0ef41Sopenharmony_ci } 2551cb0ef41Sopenharmony_ci remainingCapacity-=2; 2561cb0ef41Sopenharmony_ci return true; 2571cb0ef41Sopenharmony_ci} 2581cb0ef41Sopenharmony_ci 2591cb0ef41Sopenharmony_ciUBool ReorderingBuffer::append(const char16_t *s, int32_t length, UBool isNFD, 2601cb0ef41Sopenharmony_ci uint8_t leadCC, uint8_t trailCC, 2611cb0ef41Sopenharmony_ci UErrorCode &errorCode) { 2621cb0ef41Sopenharmony_ci if(length==0) { 2631cb0ef41Sopenharmony_ci return true; 2641cb0ef41Sopenharmony_ci } 2651cb0ef41Sopenharmony_ci if(remainingCapacity<length && !resize(length, errorCode)) { 
2661cb0ef41Sopenharmony_ci return false; 2671cb0ef41Sopenharmony_ci } 2681cb0ef41Sopenharmony_ci remainingCapacity-=length; 2691cb0ef41Sopenharmony_ci if(lastCC<=leadCC || leadCC==0) { 2701cb0ef41Sopenharmony_ci if(trailCC<=1) { 2711cb0ef41Sopenharmony_ci reorderStart=limit+length; 2721cb0ef41Sopenharmony_ci } else if(leadCC<=1) { 2731cb0ef41Sopenharmony_ci reorderStart=limit+1; // Ok if not a code point boundary. 2741cb0ef41Sopenharmony_ci } 2751cb0ef41Sopenharmony_ci const char16_t *sLimit=s+length; 2761cb0ef41Sopenharmony_ci do { *limit++=*s++; } while(s!=sLimit); 2771cb0ef41Sopenharmony_ci lastCC=trailCC; 2781cb0ef41Sopenharmony_ci } else { 2791cb0ef41Sopenharmony_ci int32_t i=0; 2801cb0ef41Sopenharmony_ci UChar32 c; 2811cb0ef41Sopenharmony_ci U16_NEXT(s, i, length, c); 2821cb0ef41Sopenharmony_ci insert(c, leadCC); // insert first code point 2831cb0ef41Sopenharmony_ci while(i<length) { 2841cb0ef41Sopenharmony_ci U16_NEXT(s, i, length, c); 2851cb0ef41Sopenharmony_ci if(i<length) { 2861cb0ef41Sopenharmony_ci if (isNFD) { 2871cb0ef41Sopenharmony_ci leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c)); 2881cb0ef41Sopenharmony_ci } else { 2891cb0ef41Sopenharmony_ci leadCC = impl.getCC(impl.getNorm16(c)); 2901cb0ef41Sopenharmony_ci } 2911cb0ef41Sopenharmony_ci } else { 2921cb0ef41Sopenharmony_ci leadCC=trailCC; 2931cb0ef41Sopenharmony_ci } 2941cb0ef41Sopenharmony_ci append(c, leadCC, errorCode); 2951cb0ef41Sopenharmony_ci } 2961cb0ef41Sopenharmony_ci } 2971cb0ef41Sopenharmony_ci return true; 2981cb0ef41Sopenharmony_ci} 2991cb0ef41Sopenharmony_ci 3001cb0ef41Sopenharmony_ciUBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 3011cb0ef41Sopenharmony_ci int32_t cpLength=U16_LENGTH(c); 3021cb0ef41Sopenharmony_ci if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 3031cb0ef41Sopenharmony_ci return false; 3041cb0ef41Sopenharmony_ci } 3051cb0ef41Sopenharmony_ci remainingCapacity-=cpLength; 3061cb0ef41Sopenharmony_ci 
if(cpLength==1) { 3071cb0ef41Sopenharmony_ci *limit++=(char16_t)c; 3081cb0ef41Sopenharmony_ci } else { 3091cb0ef41Sopenharmony_ci limit[0]=U16_LEAD(c); 3101cb0ef41Sopenharmony_ci limit[1]=U16_TRAIL(c); 3111cb0ef41Sopenharmony_ci limit+=2; 3121cb0ef41Sopenharmony_ci } 3131cb0ef41Sopenharmony_ci lastCC=0; 3141cb0ef41Sopenharmony_ci reorderStart=limit; 3151cb0ef41Sopenharmony_ci return true; 3161cb0ef41Sopenharmony_ci} 3171cb0ef41Sopenharmony_ci 3181cb0ef41Sopenharmony_ciUBool ReorderingBuffer::appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode) { 3191cb0ef41Sopenharmony_ci if(s==sLimit) { 3201cb0ef41Sopenharmony_ci return true; 3211cb0ef41Sopenharmony_ci } 3221cb0ef41Sopenharmony_ci int32_t length=(int32_t)(sLimit-s); 3231cb0ef41Sopenharmony_ci if(remainingCapacity<length && !resize(length, errorCode)) { 3241cb0ef41Sopenharmony_ci return false; 3251cb0ef41Sopenharmony_ci } 3261cb0ef41Sopenharmony_ci u_memcpy(limit, s, length); 3271cb0ef41Sopenharmony_ci limit+=length; 3281cb0ef41Sopenharmony_ci remainingCapacity-=length; 3291cb0ef41Sopenharmony_ci lastCC=0; 3301cb0ef41Sopenharmony_ci reorderStart=limit; 3311cb0ef41Sopenharmony_ci return true; 3321cb0ef41Sopenharmony_ci} 3331cb0ef41Sopenharmony_ci 3341cb0ef41Sopenharmony_civoid ReorderingBuffer::remove() { 3351cb0ef41Sopenharmony_ci reorderStart=limit=start; 3361cb0ef41Sopenharmony_ci remainingCapacity=str.getCapacity(); 3371cb0ef41Sopenharmony_ci lastCC=0; 3381cb0ef41Sopenharmony_ci} 3391cb0ef41Sopenharmony_ci 3401cb0ef41Sopenharmony_civoid ReorderingBuffer::removeSuffix(int32_t suffixLength) { 3411cb0ef41Sopenharmony_ci if(suffixLength<(limit-start)) { 3421cb0ef41Sopenharmony_ci limit-=suffixLength; 3431cb0ef41Sopenharmony_ci remainingCapacity+=suffixLength; 3441cb0ef41Sopenharmony_ci } else { 3451cb0ef41Sopenharmony_ci limit=start; 3461cb0ef41Sopenharmony_ci remainingCapacity=str.getCapacity(); 3471cb0ef41Sopenharmony_ci } 3481cb0ef41Sopenharmony_ci lastCC=0; 3491cb0ef41Sopenharmony_ci 
reorderStart=limit; 3501cb0ef41Sopenharmony_ci} 3511cb0ef41Sopenharmony_ci 3521cb0ef41Sopenharmony_ciUBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 3531cb0ef41Sopenharmony_ci int32_t reorderStartIndex=(int32_t)(reorderStart-start); 3541cb0ef41Sopenharmony_ci int32_t length=(int32_t)(limit-start); 3551cb0ef41Sopenharmony_ci str.releaseBuffer(length); 3561cb0ef41Sopenharmony_ci int32_t newCapacity=length+appendLength; 3571cb0ef41Sopenharmony_ci int32_t doubleCapacity=2*str.getCapacity(); 3581cb0ef41Sopenharmony_ci if(newCapacity<doubleCapacity) { 3591cb0ef41Sopenharmony_ci newCapacity=doubleCapacity; 3601cb0ef41Sopenharmony_ci } 3611cb0ef41Sopenharmony_ci if(newCapacity<256) { 3621cb0ef41Sopenharmony_ci newCapacity=256; 3631cb0ef41Sopenharmony_ci } 3641cb0ef41Sopenharmony_ci start=str.getBuffer(newCapacity); 3651cb0ef41Sopenharmony_ci if(start==nullptr) { 3661cb0ef41Sopenharmony_ci // getBuffer() already did str.setToBogus() 3671cb0ef41Sopenharmony_ci errorCode=U_MEMORY_ALLOCATION_ERROR; 3681cb0ef41Sopenharmony_ci return false; 3691cb0ef41Sopenharmony_ci } 3701cb0ef41Sopenharmony_ci reorderStart=start+reorderStartIndex; 3711cb0ef41Sopenharmony_ci limit=start+length; 3721cb0ef41Sopenharmony_ci remainingCapacity=str.getCapacity()-length; 3731cb0ef41Sopenharmony_ci return true; 3741cb0ef41Sopenharmony_ci} 3751cb0ef41Sopenharmony_ci 3761cb0ef41Sopenharmony_civoid ReorderingBuffer::skipPrevious() { 3771cb0ef41Sopenharmony_ci codePointLimit=codePointStart; 3781cb0ef41Sopenharmony_ci char16_t c=*--codePointStart; 3791cb0ef41Sopenharmony_ci if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 3801cb0ef41Sopenharmony_ci --codePointStart; 3811cb0ef41Sopenharmony_ci } 3821cb0ef41Sopenharmony_ci} 3831cb0ef41Sopenharmony_ci 3841cb0ef41Sopenharmony_ciuint8_t ReorderingBuffer::previousCC() { 3851cb0ef41Sopenharmony_ci codePointLimit=codePointStart; 3861cb0ef41Sopenharmony_ci if(reorderStart>=codePointStart) { 
3871cb0ef41Sopenharmony_ci return 0; 3881cb0ef41Sopenharmony_ci } 3891cb0ef41Sopenharmony_ci UChar32 c=*--codePointStart; 3901cb0ef41Sopenharmony_ci char16_t c2; 3911cb0ef41Sopenharmony_ci if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 3921cb0ef41Sopenharmony_ci --codePointStart; 3931cb0ef41Sopenharmony_ci c=U16_GET_SUPPLEMENTARY(c2, c); 3941cb0ef41Sopenharmony_ci } 3951cb0ef41Sopenharmony_ci return impl.getCCFromYesOrMaybeCP(c); 3961cb0ef41Sopenharmony_ci} 3971cb0ef41Sopenharmony_ci 3981cb0ef41Sopenharmony_ci// Inserts c somewhere before the last character. 3991cb0ef41Sopenharmony_ci// Requires 0<cc<lastCC which implies reorderStart<limit. 4001cb0ef41Sopenharmony_civoid ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 4011cb0ef41Sopenharmony_ci for(setIterator(), skipPrevious(); previousCC()>cc;) {} 4021cb0ef41Sopenharmony_ci // insert c at codePointLimit, after the character with prevCC<=cc 4031cb0ef41Sopenharmony_ci char16_t *q=limit; 4041cb0ef41Sopenharmony_ci char16_t *r=limit+=U16_LENGTH(c); 4051cb0ef41Sopenharmony_ci do { 4061cb0ef41Sopenharmony_ci *--r=*--q; 4071cb0ef41Sopenharmony_ci } while(codePointLimit!=q); 4081cb0ef41Sopenharmony_ci writeCodePoint(q, c); 4091cb0ef41Sopenharmony_ci if(cc<=1) { 4101cb0ef41Sopenharmony_ci reorderStart=r; 4111cb0ef41Sopenharmony_ci } 4121cb0ef41Sopenharmony_ci} 4131cb0ef41Sopenharmony_ci 4141cb0ef41Sopenharmony_ci// Normalizer2Impl --------------------------------------------------------- *** 4151cb0ef41Sopenharmony_ci 4161cb0ef41Sopenharmony_cistruct CanonIterData : public UMemory { 4171cb0ef41Sopenharmony_ci CanonIterData(UErrorCode &errorCode); 4181cb0ef41Sopenharmony_ci ~CanonIterData(); 4191cb0ef41Sopenharmony_ci void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 4201cb0ef41Sopenharmony_ci UMutableCPTrie *mutableTrie; 4211cb0ef41Sopenharmony_ci UCPTrie *trie; 4221cb0ef41Sopenharmony_ci UVector canonStartSets; // contains UnicodeSet * 
4231cb0ef41Sopenharmony_ci}; 4241cb0ef41Sopenharmony_ci 4251cb0ef41Sopenharmony_ciNormalizer2Impl::~Normalizer2Impl() { 4261cb0ef41Sopenharmony_ci delete fCanonIterData; 4271cb0ef41Sopenharmony_ci} 4281cb0ef41Sopenharmony_ci 4291cb0ef41Sopenharmony_civoid 4301cb0ef41Sopenharmony_ciNormalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie, 4311cb0ef41Sopenharmony_ci const uint16_t *inExtraData, const uint8_t *inSmallFCD) { 4321cb0ef41Sopenharmony_ci minDecompNoCP = static_cast<char16_t>(inIndexes[IX_MIN_DECOMP_NO_CP]); 4331cb0ef41Sopenharmony_ci minCompNoMaybeCP = static_cast<char16_t>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]); 4341cb0ef41Sopenharmony_ci minLcccCP = static_cast<char16_t>(inIndexes[IX_MIN_LCCC_CP]); 4351cb0ef41Sopenharmony_ci 4361cb0ef41Sopenharmony_ci minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]); 4371cb0ef41Sopenharmony_ci minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]); 4381cb0ef41Sopenharmony_ci minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]); 4391cb0ef41Sopenharmony_ci minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]); 4401cb0ef41Sopenharmony_ci minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); 4411cb0ef41Sopenharmony_ci minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]); 4421cb0ef41Sopenharmony_ci limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]); 4431cb0ef41Sopenharmony_ci minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]); 4441cb0ef41Sopenharmony_ci U_ASSERT((minMaybeYes & 7) == 0); // 8-aligned for noNoDelta bit fields 4451cb0ef41Sopenharmony_ci centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1; 4461cb0ef41Sopenharmony_ci 4471cb0ef41Sopenharmony_ci normTrie=inTrie; 4481cb0ef41Sopenharmony_ci 4491cb0ef41Sopenharmony_ci maybeYesCompositions=inExtraData; 4501cb0ef41Sopenharmony_ci 
extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 4511cb0ef41Sopenharmony_ci 4521cb0ef41Sopenharmony_ci smallFCD=inSmallFCD; 4531cb0ef41Sopenharmony_ci} 4541cb0ef41Sopenharmony_ci 4551cb0ef41Sopenharmony_ciU_CDECL_BEGIN 4561cb0ef41Sopenharmony_ci 4571cb0ef41Sopenharmony_cistatic uint32_t U_CALLCONV 4581cb0ef41Sopenharmony_cisegmentStarterMapper(const void * /*context*/, uint32_t value) { 4591cb0ef41Sopenharmony_ci return value&CANON_NOT_SEGMENT_STARTER; 4601cb0ef41Sopenharmony_ci} 4611cb0ef41Sopenharmony_ci 4621cb0ef41Sopenharmony_ciU_CDECL_END 4631cb0ef41Sopenharmony_ci 4641cb0ef41Sopenharmony_civoid 4651cb0ef41Sopenharmony_ciNormalizer2Impl::addLcccChars(UnicodeSet &set) const { 4661cb0ef41Sopenharmony_ci UChar32 start = 0, end; 4671cb0ef41Sopenharmony_ci uint32_t norm16; 4681cb0ef41Sopenharmony_ci while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, 4691cb0ef41Sopenharmony_ci nullptr, nullptr, &norm16)) >= 0) { 4701cb0ef41Sopenharmony_ci if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES && 4711cb0ef41Sopenharmony_ci norm16 != Normalizer2Impl::JAMO_VT) { 4721cb0ef41Sopenharmony_ci set.add(start, end); 4731cb0ef41Sopenharmony_ci } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { 4741cb0ef41Sopenharmony_ci uint16_t fcd16 = getFCD16(start); 4751cb0ef41Sopenharmony_ci if (fcd16 > 0xff) { set.add(start, end); } 4761cb0ef41Sopenharmony_ci } 4771cb0ef41Sopenharmony_ci start = end + 1; 4781cb0ef41Sopenharmony_ci } 4791cb0ef41Sopenharmony_ci} 4801cb0ef41Sopenharmony_ci 4811cb0ef41Sopenharmony_civoid 4821cb0ef41Sopenharmony_ciNormalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { 4831cb0ef41Sopenharmony_ci // Add the start code point of each same-value range of the trie. 
4841cb0ef41Sopenharmony_ci UChar32 start = 0, end; 4851cb0ef41Sopenharmony_ci uint32_t value; 4861cb0ef41Sopenharmony_ci while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, 4871cb0ef41Sopenharmony_ci nullptr, nullptr, &value)) >= 0) { 4881cb0ef41Sopenharmony_ci sa->add(sa->set, start); 4891cb0ef41Sopenharmony_ci if (start != end && isAlgorithmicNoNo((uint16_t)value) && 4901cb0ef41Sopenharmony_ci (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) { 4911cb0ef41Sopenharmony_ci // Range of code points with same-norm16-value algorithmic decompositions. 4921cb0ef41Sopenharmony_ci // They might have different non-zero FCD16 values. 4931cb0ef41Sopenharmony_ci uint16_t prevFCD16 = getFCD16(start); 4941cb0ef41Sopenharmony_ci while (++start <= end) { 4951cb0ef41Sopenharmony_ci uint16_t fcd16 = getFCD16(start); 4961cb0ef41Sopenharmony_ci if (fcd16 != prevFCD16) { 4971cb0ef41Sopenharmony_ci sa->add(sa->set, start); 4981cb0ef41Sopenharmony_ci prevFCD16 = fcd16; 4991cb0ef41Sopenharmony_ci } 5001cb0ef41Sopenharmony_ci } 5011cb0ef41Sopenharmony_ci } 5021cb0ef41Sopenharmony_ci start = end + 1; 5031cb0ef41Sopenharmony_ci } 5041cb0ef41Sopenharmony_ci 5051cb0ef41Sopenharmony_ci /* add Hangul LV syllables and LV+1 because of skippables */ 5061cb0ef41Sopenharmony_ci for(char16_t c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 5071cb0ef41Sopenharmony_ci sa->add(sa->set, c); 5081cb0ef41Sopenharmony_ci sa->add(sa->set, c+1); 5091cb0ef41Sopenharmony_ci } 5101cb0ef41Sopenharmony_ci sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 5111cb0ef41Sopenharmony_ci} 5121cb0ef41Sopenharmony_ci 5131cb0ef41Sopenharmony_civoid 5141cb0ef41Sopenharmony_ciNormalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 5151cb0ef41Sopenharmony_ci // Add the start code point of each same-value range of the canonical iterator data trie. 
5161cb0ef41Sopenharmony_ci if (!ensureCanonIterData(errorCode)) { return; } 5171cb0ef41Sopenharmony_ci // Currently only used for the SEGMENT_STARTER property. 5181cb0ef41Sopenharmony_ci UChar32 start = 0, end; 5191cb0ef41Sopenharmony_ci uint32_t value; 5201cb0ef41Sopenharmony_ci while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0, 5211cb0ef41Sopenharmony_ci segmentStarterMapper, nullptr, &value)) >= 0) { 5221cb0ef41Sopenharmony_ci sa->add(sa->set, start); 5231cb0ef41Sopenharmony_ci start = end + 1; 5241cb0ef41Sopenharmony_ci } 5251cb0ef41Sopenharmony_ci} 5261cb0ef41Sopenharmony_ci 5271cb0ef41Sopenharmony_ciconst char16_t * 5281cb0ef41Sopenharmony_ciNormalizer2Impl::copyLowPrefixFromNulTerminated(const char16_t *src, 5291cb0ef41Sopenharmony_ci UChar32 minNeedDataCP, 5301cb0ef41Sopenharmony_ci ReorderingBuffer *buffer, 5311cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 5321cb0ef41Sopenharmony_ci // Make some effort to support NUL-terminated strings reasonably. 5331cb0ef41Sopenharmony_ci // Take the part of the fast quick check loop that does not look up 5341cb0ef41Sopenharmony_ci // data and check the first part of the string. 5351cb0ef41Sopenharmony_ci // After this prefix, determine the string length to simplify the rest 5361cb0ef41Sopenharmony_ci // of the code. 5371cb0ef41Sopenharmony_ci const char16_t *prevSrc=src; 5381cb0ef41Sopenharmony_ci char16_t c; 5391cb0ef41Sopenharmony_ci while((c=*src++)<minNeedDataCP && c!=0) {} 5401cb0ef41Sopenharmony_ci // Back out the last character for full processing. 5411cb0ef41Sopenharmony_ci // Copy this prefix. 
5421cb0ef41Sopenharmony_ci if(--src!=prevSrc) { 5431cb0ef41Sopenharmony_ci if(buffer!=nullptr) { 5441cb0ef41Sopenharmony_ci buffer->appendZeroCC(prevSrc, src, errorCode); 5451cb0ef41Sopenharmony_ci } 5461cb0ef41Sopenharmony_ci } 5471cb0ef41Sopenharmony_ci return src; 5481cb0ef41Sopenharmony_ci} 5491cb0ef41Sopenharmony_ci 5501cb0ef41Sopenharmony_ciUnicodeString & 5511cb0ef41Sopenharmony_ciNormalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, 5521cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 5531cb0ef41Sopenharmony_ci if(U_FAILURE(errorCode)) { 5541cb0ef41Sopenharmony_ci dest.setToBogus(); 5551cb0ef41Sopenharmony_ci return dest; 5561cb0ef41Sopenharmony_ci } 5571cb0ef41Sopenharmony_ci const char16_t *sArray=src.getBuffer(); 5581cb0ef41Sopenharmony_ci if(&dest==&src || sArray==nullptr) { 5591cb0ef41Sopenharmony_ci errorCode=U_ILLEGAL_ARGUMENT_ERROR; 5601cb0ef41Sopenharmony_ci dest.setToBogus(); 5611cb0ef41Sopenharmony_ci return dest; 5621cb0ef41Sopenharmony_ci } 5631cb0ef41Sopenharmony_ci decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); 5641cb0ef41Sopenharmony_ci return dest; 5651cb0ef41Sopenharmony_ci} 5661cb0ef41Sopenharmony_ci 5671cb0ef41Sopenharmony_civoid 5681cb0ef41Sopenharmony_ciNormalizer2Impl::decompose(const char16_t *src, const char16_t *limit, 5691cb0ef41Sopenharmony_ci UnicodeString &dest, 5701cb0ef41Sopenharmony_ci int32_t destLengthEstimate, 5711cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 5721cb0ef41Sopenharmony_ci if(destLengthEstimate<0 && limit!=nullptr) { 5731cb0ef41Sopenharmony_ci destLengthEstimate=(int32_t)(limit-src); 5741cb0ef41Sopenharmony_ci } 5751cb0ef41Sopenharmony_ci dest.remove(); 5761cb0ef41Sopenharmony_ci ReorderingBuffer buffer(*this, dest); 5771cb0ef41Sopenharmony_ci if(buffer.init(destLengthEstimate, errorCode)) { 5781cb0ef41Sopenharmony_ci decompose(src, limit, &buffer, errorCode); 5791cb0ef41Sopenharmony_ci } 5801cb0ef41Sopenharmony_ci} 5811cb0ef41Sopenharmony_ci 
// Dual functionality:
// buffer!=nullptr: normalize
// buffer==nullptr: isNormalized/spanQuickCheckYes
//
// If limit==nullptr, the input is treated as NUL-terminated: the low prefix is handled by
// copyLowPrefixFromNulTerminated() and limit is then located with u_strchr().
//
// Return value:
// - buffer!=nullptr: the position where the loop stopped; this is limit unless
//   an append to the buffer failed (or errorCode failed in the NUL-terminated prefix step).
// - buffer==nullptr: limit if every code point passed the quick check, otherwise
//   prevBoundary, the last position recorded before the code point that is
//   not "decomp yes" or whose cc is out of order.
const char16_t *
Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit,
                           ReorderingBuffer *buffer,
                           UErrorCode &errorCode) const {
    UChar32 minNoCP=minDecompNoCP;
    if(limit==nullptr) {
        src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        limit=u_strchr(src, 0);
    }

    // c and norm16 are set by the inner scan loop and consumed after it,
    // so they must outlive a single iteration of that loop.
    const char16_t *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for quick check
    const char16_t *prevBoundary=src;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoCP ||
                isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else if(!U16_IS_LEAD(c)) {
                // A relevant BMP code unit: leave the scan with c/norm16 describing it.
                break;
            } else {
                char16_t c2;
                if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                    // Surrogate pair: look up the supplementary code point instead.
                    c=U16_GET_SUPPLEMENTARY(c, c2);
                    norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                    if(isMostDecompYesAndZeroCC(norm16)) {
                        src+=2;
                    } else {
                        break;
                    }
                } else {
                    ++src;  // unpaired lead surrogate: inert
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=nullptr) {
                if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                // Quick check: the skipped run resets the cc state and moves the boundary.
                prevCC=0;
                prevBoundary=src;
            }
        }
        if(src==limit) {
            break;
        }

        // Check one above-minimum, relevant code point.
        src+=U16_LENGTH(c);
        if(buffer!=nullptr) {
            if(!decompose(c, norm16, *buffer, errorCode)) {
                break;
            }
        } else {
            if(isDecompYes(norm16)) {
                uint8_t cc=getCCFromYesOrMaybe(norm16);
                if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(cc<=1) {
                        prevBoundary=src;
                    }
                    continue;
                }
            }
            return prevBoundary;  // "no" or cc out of order
        }
    }
    return src;
}

// Decompose a short piece of text which is likely to contain characters that
// fail the quick check loop and/or where the quick check loop's overhead
// is unlikely to be amortized.
// Called by the compose() and makeFCD() implementations.
// Decomposes [src, limit[ code point by code point into the buffer.
// Returns nullptr if errorCode already indicates a failure on entry or if
// decompose() fails for a code point. Otherwise returns the position
// where processing stopped: limit, or, when stopAtCompBoundary is set, the position of
// the first composition boundary found (before or after a code point).
const char16_t *
Normalizer2Impl::decomposeShort(const char16_t *src, const char16_t *limit,
                                UBool stopAtCompBoundary, UBool onlyContiguous,
                                ReorderingBuffer &buffer, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    while(src<limit) {
        // Cheap test on the code unit before doing the trie lookup.
        if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
            return src;
        }
        const char16_t *prevSrc = src;
        UChar32 c;
        uint16_t norm16;
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
        if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
            return prevSrc;
        }
        if(!decompose(c, norm16, buffer, errorCode)) {
            return nullptr;
        }
        if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return src;
        }
    }
    return src;
}

// Appends the decomposition of the single code point c (with trie value norm16)
// to the buffer and returns the result of the buffer append call that was used.
UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
                                 ReorderingBuffer &buffer,
                                 UErrorCode &errorCode) const {
    // get the decomposition and the lead and trail cc's
    if (norm16 >= limitNoNo) {
        if (isMaybeOrNonZeroCC(norm16)) {
            return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
        }
        // Maps to an isCompYesAndZeroCC.
        c=mapAlgorithmic(c, norm16);
        norm16=getRawNorm16(c);
    }
    if (norm16 < minYesNo) {
        // c does not decompose
        return buffer.append(c, 0, errorCode);
    } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
        // Hangul syllable: decompose algorithmically
        char16_t jamos[3];
        return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getMapping(norm16);
    uint16_t firstUnit=*mapping;
    // The low bits of the first unit hold the mapping length, its high byte the trail cc.
    int32_t length=firstUnit&MAPPING_LENGTH_MASK;
    uint8_t leadCC, trailCC;
    trailCC=(uint8_t)(firstUnit>>8);
    if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
        // The optional word just before the mapping carries the lead cc in its high byte.
        leadCC=(uint8_t)(*(mapping-1)>>8);
    } else {
        leadCC=0;
    }
    return buffer.append((const char16_t *)mapping+1, length, true, leadCC, trailCC, errorCode);
}

// Dual functionality:
// sink != nullptr: normalize
// sink == nullptr: isNormalized/spanQuickCheckYes
//
// limit must not be nullptr (asserted below).
// Return value:
// - limit when the whole input was processed.
// - sink == nullptr: prevBoundary when a character decomposes or has its cc out of order.
// - sink != nullptr: the current src if writing to the sink or the slow path failed.
const uint8_t *
Normalizer2Impl::decomposeUTF8(uint32_t options,
                               const uint8_t *src, const uint8_t *limit,
                               ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
    U_ASSERT(limit != nullptr);
    // Backing string handed to each ReorderingBuffer created in the slow path.
    UnicodeString s16;
    uint8_t minNoLead = leadByteForCP(minDecompNoCP);

    // Start of the input that has not yet been written to the sink.
    const uint8_t *prevBoundary = src;
    // only for quick check
    uint8_t prevCC = 0;

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no" code point,
        // or with (decompYes && ccc==0) properties.
        const uint8_t *fastStart = src;
        const uint8_t *prevSrc;
        uint16_t norm16 = 0;

        for (;;) {
            if (src == limit) {
                // End of input: flush the pending unchanged tail, if any, and finish.
                if (prevBoundary != limit && sink != nullptr) {
                    ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                  *sink, options, edits, errorCode);
                }
                return src;
            }
            if (*src < minNoLead) {
                ++src;
            } else {
                prevSrc = src;
                UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
                if (!isMostDecompYesAndZeroCC(norm16)) {
                    break;
                }
            }
        }
        // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
        // and the current character at [prevSrc..src[ is not a common case with cc=0
        // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
        // It could still be a maybeYes with cc=0.
        if (prevSrc != fastStart) {
            // The fast path looped over yes/0 characters before the current one.
            if (sink != nullptr &&
                    !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                   *sink, options, edits, errorCode)) {
                break;
            }
            prevBoundary = prevSrc;
            prevCC = 0;
        }

        // Medium-fast path: Quick check.
        if (isMaybeOrNonZeroCC(norm16)) {
            // Does not decompose.
            uint8_t cc = getCCFromYesOrMaybe(norm16);
            if (prevCC <= cc || cc == 0) {
                prevCC = cc;
                if (cc <= 1) {
                    if (sink != nullptr &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, src,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                }
                continue;
            }
        }
        if (sink == nullptr) {
            return prevBoundary;  // quick check: "no" or cc out of order
        }

        // Slow path
        // Decompose up to and including the current character.
        if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
            if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                               *sink, options, edits, errorCode)) {
                break;
            }
            prevBoundary = prevSrc;
        }
        ReorderingBuffer buffer(*this, s16, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        // The return value is not needed here; failures are detected via errorCode below.
        decomposeShort(prevBoundary, src, STOP_AT_LIMIT, false /* onlyContiguous */,
                       buffer, errorCode);
        // Decompose until the next boundary.
        if (buffer.getLastCC() > 1) {
            src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, false /* onlyContiguous */,
                                 buffer, errorCode);
        }
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            break;
        }
        // We already know there was a change if the original character decomposed;
        // otherwise compare.
        if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
            if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
                                               *sink, options, edits, errorCode)) {
                break;
            }
        } else {
            if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
                                            *sink, edits, errorCode)) {
                break;
            }
        }
        prevBoundary = src;
        prevCC = 0;
    }
    return src;
}

// UTF-8 counterpart of the short-text decomposition loop.
// Decomposes [src, limit[ into the buffer, one character at a time.
// Returns nullptr if errorCode already indicates a failure on entry or if an append to
// the buffer fails. Otherwise returns the position where processing stopped:
// - limit when the whole range was consumed,
// - prevSrc (the start of the current character) when stopping before that character,
// - src (the position after the current character) when stopping after it.
// Which boundaries stop the loop is selected by stopAt.
const uint8_t *
Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
                                StopAt stopAt, UBool onlyContiguous,
                                ReorderingBuffer &buffer, UErrorCode &errorCode) const {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    while (src < limit) {
        const uint8_t *prevSrc = src;
        uint16_t norm16;
        UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
        // Get the decomposition and the lead and trail cc's.
        // c stays negative until the code point is actually needed.
        UChar32 c = U_SENTINEL;
        if (norm16 >= limitNoNo) {
            if (isMaybeOrNonZeroCC(norm16)) {
                // No comp boundaries around this character.
                uint8_t cc = getCCFromYesOrMaybe(norm16);
                if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
                    return prevSrc;
                }
                c = codePointFromValidUTF8(prevSrc, src);
                if (!buffer.append(c, cc, errorCode)) {
                    return nullptr;
                }
                if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
                    return src;
                }
                continue;
            }
            // Maps to an isCompYesAndZeroCC.
            if (stopAt != STOP_AT_LIMIT) {
                return prevSrc;
            }
            c = codePointFromValidUTF8(prevSrc, src);
            c = mapAlgorithmic(c, norm16);
            norm16 = getRawNorm16(c);
        } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
            return prevSrc;
        }
        // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
        // We do not see invalid UTF-8 here because
        // its norm16==INERT is normalization-inert,
        // so it gets copied unchanged in the fast path,
        // and we stop the slow path where invalid UTF-8 begins.
        // c >= 0 is the result of an algorithmic mapping.
        U_ASSERT(c >= 0 || norm16 != INERT);
        if (norm16 < minYesNo) {
            if (c < 0) {
                c = codePointFromValidUTF8(prevSrc, src);
            }
            // does not decompose
            if (!buffer.append(c, 0, errorCode)) {
                return nullptr;
            }
        } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
            // Hangul syllable: decompose algorithmically
            if (c < 0) {
                c = codePointFromValidUTF8(prevSrc, src);
            }
            char16_t jamos[3];
            if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
                return nullptr;
            }
        } else {
            // The character decomposes, get everything from the variable-length extra data.
            const uint16_t *mapping = getMapping(norm16);
            uint16_t firstUnit = *mapping;
            int32_t length = firstUnit & MAPPING_LENGTH_MASK;
            uint8_t trailCC = (uint8_t)(firstUnit >> 8);
            uint8_t leadCC;
            if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
                leadCC = (uint8_t)(*(mapping-1) >> 8);
            } else {
                leadCC = 0;
            }
            if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
                return prevSrc;
            }
            if (!buffer.append((const char16_t *)mapping+1, length, true, leadCC, trailCC, errorCode)) {
                return nullptr;
            }
        }
        if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
                (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
            return src;
        }
    }
    return src;
}

// Returns the decomposition of c and sets length, or returns nullptr if c does not decompose
// (length is left untouched in that case).
// The result points either into the caller's buffer (algorithmic mapping, Hangul)
// or into the mapping data returned by getMapping().
const char16_t *
Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const {
    uint16_t norm16;
    if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
        // c does not decompose
        return nullptr;
    }
    const char16_t *decomp = nullptr;
    if(isDecompNoAlgorithmic(norm16)) {
        // Maps to an isCompYesAndZeroCC.
        c=mapAlgorithmic(c, norm16);
        decomp=buffer;
        length=0;
        U16_APPEND_UNSAFE(buffer, length, c);
        // The mapping might decompose further.
        norm16 = getRawNorm16(c);
    }
    if (norm16 < minYesNo) {
        // Either nullptr (no decomposition) or the algorithmic mapping stored in buffer.
        return decomp;
    } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
        // Hangul syllable: decompose algorithmically
        length=Hangul::decompose(c, buffer);
        return buffer;
    }
    // c decomposes, get everything from the variable-length extra data
    const uint16_t *mapping=getMapping(norm16);
    length=*mapping&MAPPING_LENGTH_MASK;
    return (const char16_t *)mapping+1;
}

// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
// so that a raw mapping fits that consists of one unit ("rm0")
// plus all but the first two code units of the normal mapping.
// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
9801cb0ef41Sopenharmony_ciconst char16_t * 9811cb0ef41Sopenharmony_ciNormalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const { 9821cb0ef41Sopenharmony_ci uint16_t norm16; 9831cb0ef41Sopenharmony_ci if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 9841cb0ef41Sopenharmony_ci // c does not decompose 9851cb0ef41Sopenharmony_ci return nullptr; 9861cb0ef41Sopenharmony_ci } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 9871cb0ef41Sopenharmony_ci // Hangul syllable: decompose algorithmically 9881cb0ef41Sopenharmony_ci Hangul::getRawDecomposition(c, buffer); 9891cb0ef41Sopenharmony_ci length=2; 9901cb0ef41Sopenharmony_ci return buffer; 9911cb0ef41Sopenharmony_ci } else if(isDecompNoAlgorithmic(norm16)) { 9921cb0ef41Sopenharmony_ci c=mapAlgorithmic(c, norm16); 9931cb0ef41Sopenharmony_ci length=0; 9941cb0ef41Sopenharmony_ci U16_APPEND_UNSAFE(buffer, length, c); 9951cb0ef41Sopenharmony_ci return buffer; 9961cb0ef41Sopenharmony_ci } 9971cb0ef41Sopenharmony_ci // c decomposes, get everything from the variable-length extra data 9981cb0ef41Sopenharmony_ci const uint16_t *mapping=getMapping(norm16); 9991cb0ef41Sopenharmony_ci uint16_t firstUnit=*mapping; 10001cb0ef41Sopenharmony_ci int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 10011cb0ef41Sopenharmony_ci if(firstUnit&MAPPING_HAS_RAW_MAPPING) { 10021cb0ef41Sopenharmony_ci // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 
10031cb0ef41Sopenharmony_ci // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 10041cb0ef41Sopenharmony_ci const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; 10051cb0ef41Sopenharmony_ci uint16_t rm0=*rawMapping; 10061cb0ef41Sopenharmony_ci if(rm0<=MAPPING_LENGTH_MASK) { 10071cb0ef41Sopenharmony_ci length=rm0; 10081cb0ef41Sopenharmony_ci return (const char16_t *)rawMapping-rm0; 10091cb0ef41Sopenharmony_ci } else { 10101cb0ef41Sopenharmony_ci // Copy the normal mapping and replace its first two code units with rm0. 10111cb0ef41Sopenharmony_ci buffer[0]=(char16_t)rm0; 10121cb0ef41Sopenharmony_ci u_memcpy(buffer+1, (const char16_t *)mapping+1+2, mLength-2); 10131cb0ef41Sopenharmony_ci length=mLength-1; 10141cb0ef41Sopenharmony_ci return buffer; 10151cb0ef41Sopenharmony_ci } 10161cb0ef41Sopenharmony_ci } else { 10171cb0ef41Sopenharmony_ci length=mLength; 10181cb0ef41Sopenharmony_ci return (const char16_t *)mapping+1; 10191cb0ef41Sopenharmony_ci } 10201cb0ef41Sopenharmony_ci} 10211cb0ef41Sopenharmony_ci 10221cb0ef41Sopenharmony_civoid Normalizer2Impl::decomposeAndAppend(const char16_t *src, const char16_t *limit, 10231cb0ef41Sopenharmony_ci UBool doDecompose, 10241cb0ef41Sopenharmony_ci UnicodeString &safeMiddle, 10251cb0ef41Sopenharmony_ci ReorderingBuffer &buffer, 10261cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 10271cb0ef41Sopenharmony_ci buffer.copyReorderableSuffixTo(safeMiddle); 10281cb0ef41Sopenharmony_ci if(doDecompose) { 10291cb0ef41Sopenharmony_ci decompose(src, limit, &buffer, errorCode); 10301cb0ef41Sopenharmony_ci return; 10311cb0ef41Sopenharmony_ci } 10321cb0ef41Sopenharmony_ci // Just merge the strings at the boundary. 
10331cb0ef41Sopenharmony_ci bool isFirst = true; 10341cb0ef41Sopenharmony_ci uint8_t firstCC = 0, prevCC = 0, cc; 10351cb0ef41Sopenharmony_ci const char16_t *p = src; 10361cb0ef41Sopenharmony_ci while (p != limit) { 10371cb0ef41Sopenharmony_ci const char16_t *codePointStart = p; 10381cb0ef41Sopenharmony_ci UChar32 c; 10391cb0ef41Sopenharmony_ci uint16_t norm16; 10401cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); 10411cb0ef41Sopenharmony_ci if ((cc = getCC(norm16)) == 0) { 10421cb0ef41Sopenharmony_ci p = codePointStart; 10431cb0ef41Sopenharmony_ci break; 10441cb0ef41Sopenharmony_ci } 10451cb0ef41Sopenharmony_ci if (isFirst) { 10461cb0ef41Sopenharmony_ci firstCC = cc; 10471cb0ef41Sopenharmony_ci isFirst = false; 10481cb0ef41Sopenharmony_ci } 10491cb0ef41Sopenharmony_ci prevCC = cc; 10501cb0ef41Sopenharmony_ci } 10511cb0ef41Sopenharmony_ci if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr 10521cb0ef41Sopenharmony_ci limit=u_strchr(p, 0); 10531cb0ef41Sopenharmony_ci } 10541cb0ef41Sopenharmony_ci 10551cb0ef41Sopenharmony_ci if (buffer.append(src, (int32_t)(p - src), false, firstCC, prevCC, errorCode)) { 10561cb0ef41Sopenharmony_ci buffer.appendZeroCC(p, limit, errorCode); 10571cb0ef41Sopenharmony_ci } 10581cb0ef41Sopenharmony_ci} 10591cb0ef41Sopenharmony_ci 10601cb0ef41Sopenharmony_ciUBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const { 10611cb0ef41Sopenharmony_ci return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 10621cb0ef41Sopenharmony_ci norm16HasDecompBoundaryBefore(getNorm16(c)); 10631cb0ef41Sopenharmony_ci} 10641cb0ef41Sopenharmony_ci 10651cb0ef41Sopenharmony_ciUBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const { 10661cb0ef41Sopenharmony_ci if (norm16 < minNoNoCompNoMaybeCC) { 10671cb0ef41Sopenharmony_ci return true; 10681cb0ef41Sopenharmony_ci } 10691cb0ef41Sopenharmony_ci if (norm16 >= limitNoNo) { 10701cb0ef41Sopenharmony_ci return norm16 <= 
MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 10711cb0ef41Sopenharmony_ci } 10721cb0ef41Sopenharmony_ci // c decomposes, get everything from the variable-length extra data 10731cb0ef41Sopenharmony_ci const uint16_t *mapping=getMapping(norm16); 10741cb0ef41Sopenharmony_ci uint16_t firstUnit=*mapping; 10751cb0ef41Sopenharmony_ci // true if leadCC==0 (hasFCDBoundaryBefore()) 10761cb0ef41Sopenharmony_ci return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 10771cb0ef41Sopenharmony_ci} 10781cb0ef41Sopenharmony_ci 10791cb0ef41Sopenharmony_ciUBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const { 10801cb0ef41Sopenharmony_ci if (c < minDecompNoCP) { 10811cb0ef41Sopenharmony_ci return true; 10821cb0ef41Sopenharmony_ci } 10831cb0ef41Sopenharmony_ci if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 10841cb0ef41Sopenharmony_ci return true; 10851cb0ef41Sopenharmony_ci } 10861cb0ef41Sopenharmony_ci return norm16HasDecompBoundaryAfter(getNorm16(c)); 10871cb0ef41Sopenharmony_ci} 10881cb0ef41Sopenharmony_ci 10891cb0ef41Sopenharmony_ciUBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const { 10901cb0ef41Sopenharmony_ci if(norm16 <= minYesNo || isHangulLVT(norm16)) { 10911cb0ef41Sopenharmony_ci return true; 10921cb0ef41Sopenharmony_ci } 10931cb0ef41Sopenharmony_ci if (norm16 >= limitNoNo) { 10941cb0ef41Sopenharmony_ci if (isMaybeOrNonZeroCC(norm16)) { 10951cb0ef41Sopenharmony_ci return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 10961cb0ef41Sopenharmony_ci } 10971cb0ef41Sopenharmony_ci // Maps to an isCompYesAndZeroCC. 
10981cb0ef41Sopenharmony_ci return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 10991cb0ef41Sopenharmony_ci } 11001cb0ef41Sopenharmony_ci // c decomposes, get everything from the variable-length extra data 11011cb0ef41Sopenharmony_ci const uint16_t *mapping=getMapping(norm16); 11021cb0ef41Sopenharmony_ci uint16_t firstUnit=*mapping; 11031cb0ef41Sopenharmony_ci // decomp after-boundary: same as hasFCDBoundaryAfter(), 11041cb0ef41Sopenharmony_ci // fcd16<=1 || trailCC==0 11051cb0ef41Sopenharmony_ci if(firstUnit>0x1ff) { 11061cb0ef41Sopenharmony_ci return false; // trailCC>1 11071cb0ef41Sopenharmony_ci } 11081cb0ef41Sopenharmony_ci if(firstUnit<=0xff) { 11091cb0ef41Sopenharmony_ci return true; // trailCC==0 11101cb0ef41Sopenharmony_ci } 11111cb0ef41Sopenharmony_ci // if(trailCC==1) test leadCC==0, same as checking for before-boundary 11121cb0ef41Sopenharmony_ci // true if leadCC==0 (hasFCDBoundaryBefore()) 11131cb0ef41Sopenharmony_ci return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 11141cb0ef41Sopenharmony_ci} 11151cb0ef41Sopenharmony_ci 11161cb0ef41Sopenharmony_ci/* 11171cb0ef41Sopenharmony_ci * Finds the recomposition result for 11181cb0ef41Sopenharmony_ci * a forward-combining "lead" character, 11191cb0ef41Sopenharmony_ci * specified with a pointer to its compositions list, 11201cb0ef41Sopenharmony_ci * and a backward-combining "trail" character. 11211cb0ef41Sopenharmony_ci * 11221cb0ef41Sopenharmony_ci * If the lead and trail characters combine, then this function returns 11231cb0ef41Sopenharmony_ci * the following "compositeAndFwd" value: 11241cb0ef41Sopenharmony_ci * Bits 21..1 composite character 11251cb0ef41Sopenharmony_ci * Bit 0 set if the composite is a forward-combining starter 11261cb0ef41Sopenharmony_ci * otherwise it returns -1. 
11271cb0ef41Sopenharmony_ci * 11281cb0ef41Sopenharmony_ci * The compositions list has (trail, compositeAndFwd) pair entries, 11291cb0ef41Sopenharmony_ci * encoded as either pairs or triples of 16-bit units. 11301cb0ef41Sopenharmony_ci * The last entry has the high bit of its first unit set. 11311cb0ef41Sopenharmony_ci * 11321cb0ef41Sopenharmony_ci * The list is sorted by ascending trail characters (there are no duplicates). 11331cb0ef41Sopenharmony_ci * A linear search is used. 11341cb0ef41Sopenharmony_ci * 11351cb0ef41Sopenharmony_ci * See normalizer2impl.h for a more detailed description 11361cb0ef41Sopenharmony_ci * of the compositions list format. 11371cb0ef41Sopenharmony_ci */ 11381cb0ef41Sopenharmony_ciint32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 11391cb0ef41Sopenharmony_ci uint16_t key1, firstUnit; 11401cb0ef41Sopenharmony_ci if(trail<COMP_1_TRAIL_LIMIT) { 11411cb0ef41Sopenharmony_ci // trail character is 0..33FF 11421cb0ef41Sopenharmony_ci // result entry may have 2 or 3 units 11431cb0ef41Sopenharmony_ci key1=(uint16_t)(trail<<1); 11441cb0ef41Sopenharmony_ci while(key1>(firstUnit=*list)) { 11451cb0ef41Sopenharmony_ci list+=2+(firstUnit&COMP_1_TRIPLE); 11461cb0ef41Sopenharmony_ci } 11471cb0ef41Sopenharmony_ci if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 11481cb0ef41Sopenharmony_ci if(firstUnit&COMP_1_TRIPLE) { 11491cb0ef41Sopenharmony_ci return ((int32_t)list[1]<<16)|list[2]; 11501cb0ef41Sopenharmony_ci } else { 11511cb0ef41Sopenharmony_ci return list[1]; 11521cb0ef41Sopenharmony_ci } 11531cb0ef41Sopenharmony_ci } 11541cb0ef41Sopenharmony_ci } else { 11551cb0ef41Sopenharmony_ci // trail character is 3400..10FFFF 11561cb0ef41Sopenharmony_ci // result entry has 3 units 11571cb0ef41Sopenharmony_ci key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 11581cb0ef41Sopenharmony_ci (((trail>>COMP_1_TRAIL_SHIFT))& 11591cb0ef41Sopenharmony_ci ~COMP_1_TRIPLE)); 11601cb0ef41Sopenharmony_ci uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 
11611cb0ef41Sopenharmony_ci uint16_t secondUnit; 11621cb0ef41Sopenharmony_ci for(;;) { 11631cb0ef41Sopenharmony_ci if(key1>(firstUnit=*list)) { 11641cb0ef41Sopenharmony_ci list+=2+(firstUnit&COMP_1_TRIPLE); 11651cb0ef41Sopenharmony_ci } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 11661cb0ef41Sopenharmony_ci if(key2>(secondUnit=list[1])) { 11671cb0ef41Sopenharmony_ci if(firstUnit&COMP_1_LAST_TUPLE) { 11681cb0ef41Sopenharmony_ci break; 11691cb0ef41Sopenharmony_ci } else { 11701cb0ef41Sopenharmony_ci list+=3; 11711cb0ef41Sopenharmony_ci } 11721cb0ef41Sopenharmony_ci } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 11731cb0ef41Sopenharmony_ci return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 11741cb0ef41Sopenharmony_ci } else { 11751cb0ef41Sopenharmony_ci break; 11761cb0ef41Sopenharmony_ci } 11771cb0ef41Sopenharmony_ci } else { 11781cb0ef41Sopenharmony_ci break; 11791cb0ef41Sopenharmony_ci } 11801cb0ef41Sopenharmony_ci } 11811cb0ef41Sopenharmony_ci } 11821cb0ef41Sopenharmony_ci return -1; 11831cb0ef41Sopenharmony_ci} 11841cb0ef41Sopenharmony_ci 11851cb0ef41Sopenharmony_ci/** 11861cb0ef41Sopenharmony_ci * @param list some character's compositions list 11871cb0ef41Sopenharmony_ci * @param set recursively receives the composites from these compositions 11881cb0ef41Sopenharmony_ci */ 11891cb0ef41Sopenharmony_civoid Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 11901cb0ef41Sopenharmony_ci uint16_t firstUnit; 11911cb0ef41Sopenharmony_ci int32_t compositeAndFwd; 11921cb0ef41Sopenharmony_ci do { 11931cb0ef41Sopenharmony_ci firstUnit=*list; 11941cb0ef41Sopenharmony_ci if((firstUnit&COMP_1_TRIPLE)==0) { 11951cb0ef41Sopenharmony_ci compositeAndFwd=list[1]; 11961cb0ef41Sopenharmony_ci list+=2; 11971cb0ef41Sopenharmony_ci } else { 11981cb0ef41Sopenharmony_ci compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 11991cb0ef41Sopenharmony_ci list+=3; 12001cb0ef41Sopenharmony_ci } 12011cb0ef41Sopenharmony_ci UChar32 
composite=compositeAndFwd>>1; 12021cb0ef41Sopenharmony_ci if((compositeAndFwd&1)!=0) { 12031cb0ef41Sopenharmony_ci addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set); 12041cb0ef41Sopenharmony_ci } 12051cb0ef41Sopenharmony_ci set.add(composite); 12061cb0ef41Sopenharmony_ci } while((firstUnit&COMP_1_LAST_TUPLE)==0); 12071cb0ef41Sopenharmony_ci} 12081cb0ef41Sopenharmony_ci 12091cb0ef41Sopenharmony_ci/* 12101cb0ef41Sopenharmony_ci * Recomposes the buffer text starting at recomposeStartIndex 12111cb0ef41Sopenharmony_ci * (which is in NFD - decomposed and canonically ordered), 12121cb0ef41Sopenharmony_ci * and truncates the buffer contents. 12131cb0ef41Sopenharmony_ci * 12141cb0ef41Sopenharmony_ci * Note that recomposition never lengthens the text: 12151cb0ef41Sopenharmony_ci * Any character consists of either one or two code units; 12161cb0ef41Sopenharmony_ci * a composition may contain at most one more code unit than the original starter, 12171cb0ef41Sopenharmony_ci * while the combining mark that is removed has at least one code unit. 
12181cb0ef41Sopenharmony_ci */ 12191cb0ef41Sopenharmony_civoid Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 12201cb0ef41Sopenharmony_ci UBool onlyContiguous) const { 12211cb0ef41Sopenharmony_ci char16_t *p=buffer.getStart()+recomposeStartIndex; 12221cb0ef41Sopenharmony_ci char16_t *limit=buffer.getLimit(); 12231cb0ef41Sopenharmony_ci if(p==limit) { 12241cb0ef41Sopenharmony_ci return; 12251cb0ef41Sopenharmony_ci } 12261cb0ef41Sopenharmony_ci 12271cb0ef41Sopenharmony_ci char16_t *starter, *pRemove, *q, *r; 12281cb0ef41Sopenharmony_ci const uint16_t *compositionsList; 12291cb0ef41Sopenharmony_ci UChar32 c, compositeAndFwd; 12301cb0ef41Sopenharmony_ci uint16_t norm16; 12311cb0ef41Sopenharmony_ci uint8_t cc, prevCC; 12321cb0ef41Sopenharmony_ci UBool starterIsSupplementary; 12331cb0ef41Sopenharmony_ci 12341cb0ef41Sopenharmony_ci // Some of the following variables are not used until we have a forward-combining starter 12351cb0ef41Sopenharmony_ci // and are only initialized now to avoid compiler warnings. 
12361cb0ef41Sopenharmony_ci compositionsList=nullptr; // used as indicator for whether we have a forward-combining starter 12371cb0ef41Sopenharmony_ci starter=nullptr; 12381cb0ef41Sopenharmony_ci starterIsSupplementary=false; 12391cb0ef41Sopenharmony_ci prevCC=0; 12401cb0ef41Sopenharmony_ci 12411cb0ef41Sopenharmony_ci for(;;) { 12421cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); 12431cb0ef41Sopenharmony_ci cc=getCCFromYesOrMaybe(norm16); 12441cb0ef41Sopenharmony_ci if( // this character combines backward and 12451cb0ef41Sopenharmony_ci isMaybe(norm16) && 12461cb0ef41Sopenharmony_ci // we have seen a starter that combines forward and 12471cb0ef41Sopenharmony_ci compositionsList!=nullptr && 12481cb0ef41Sopenharmony_ci // the backward-combining character is not blocked 12491cb0ef41Sopenharmony_ci (prevCC<cc || prevCC==0) 12501cb0ef41Sopenharmony_ci ) { 12511cb0ef41Sopenharmony_ci if(isJamoVT(norm16)) { 12521cb0ef41Sopenharmony_ci // c is a Jamo V/T, see if we can compose it with the previous character. 12531cb0ef41Sopenharmony_ci if(c<Hangul::JAMO_T_BASE) { 12541cb0ef41Sopenharmony_ci // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 12551cb0ef41Sopenharmony_ci char16_t prev=(char16_t)(*starter-Hangul::JAMO_L_BASE); 12561cb0ef41Sopenharmony_ci if(prev<Hangul::JAMO_L_COUNT) { 12571cb0ef41Sopenharmony_ci pRemove=p-1; 12581cb0ef41Sopenharmony_ci char16_t syllable=(char16_t) 12591cb0ef41Sopenharmony_ci (Hangul::HANGUL_BASE+ 12601cb0ef41Sopenharmony_ci (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 12611cb0ef41Sopenharmony_ci Hangul::JAMO_T_COUNT); 12621cb0ef41Sopenharmony_ci char16_t t; 12631cb0ef41Sopenharmony_ci if(p!=limit && (t=(char16_t)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 12641cb0ef41Sopenharmony_ci ++p; 12651cb0ef41Sopenharmony_ci syllable+=t; // The next character was a Jamo T. 
12661cb0ef41Sopenharmony_ci } 12671cb0ef41Sopenharmony_ci *starter=syllable; 12681cb0ef41Sopenharmony_ci // remove the Jamo V/T 12691cb0ef41Sopenharmony_ci q=pRemove; 12701cb0ef41Sopenharmony_ci r=p; 12711cb0ef41Sopenharmony_ci while(r<limit) { 12721cb0ef41Sopenharmony_ci *q++=*r++; 12731cb0ef41Sopenharmony_ci } 12741cb0ef41Sopenharmony_ci limit=q; 12751cb0ef41Sopenharmony_ci p=pRemove; 12761cb0ef41Sopenharmony_ci } 12771cb0ef41Sopenharmony_ci } 12781cb0ef41Sopenharmony_ci /* 12791cb0ef41Sopenharmony_ci * No "else" for Jamo T: 12801cb0ef41Sopenharmony_ci * Since the input is in NFD, there are no Hangul LV syllables that 12811cb0ef41Sopenharmony_ci * a Jamo T could combine with. 12821cb0ef41Sopenharmony_ci * All Jamo Ts are combined above when handling Jamo Vs. 12831cb0ef41Sopenharmony_ci */ 12841cb0ef41Sopenharmony_ci if(p==limit) { 12851cb0ef41Sopenharmony_ci break; 12861cb0ef41Sopenharmony_ci } 12871cb0ef41Sopenharmony_ci compositionsList=nullptr; 12881cb0ef41Sopenharmony_ci continue; 12891cb0ef41Sopenharmony_ci } else if((compositeAndFwd=combine(compositionsList, c))>=0) { 12901cb0ef41Sopenharmony_ci // The starter and the combining mark (c) do combine. 12911cb0ef41Sopenharmony_ci UChar32 composite=compositeAndFwd>>1; 12921cb0ef41Sopenharmony_ci 12931cb0ef41Sopenharmony_ci // Replace the starter with the composite, remove the combining mark. 
12941cb0ef41Sopenharmony_ci pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark 12951cb0ef41Sopenharmony_ci if(starterIsSupplementary) { 12961cb0ef41Sopenharmony_ci if(U_IS_SUPPLEMENTARY(composite)) { 12971cb0ef41Sopenharmony_ci // both are supplementary 12981cb0ef41Sopenharmony_ci starter[0]=U16_LEAD(composite); 12991cb0ef41Sopenharmony_ci starter[1]=U16_TRAIL(composite); 13001cb0ef41Sopenharmony_ci } else { 13011cb0ef41Sopenharmony_ci *starter=(char16_t)composite; 13021cb0ef41Sopenharmony_ci // The composite is shorter than the starter, 13031cb0ef41Sopenharmony_ci // move the intermediate characters forward one. 13041cb0ef41Sopenharmony_ci starterIsSupplementary=false; 13051cb0ef41Sopenharmony_ci q=starter+1; 13061cb0ef41Sopenharmony_ci r=q+1; 13071cb0ef41Sopenharmony_ci while(r<pRemove) { 13081cb0ef41Sopenharmony_ci *q++=*r++; 13091cb0ef41Sopenharmony_ci } 13101cb0ef41Sopenharmony_ci --pRemove; 13111cb0ef41Sopenharmony_ci } 13121cb0ef41Sopenharmony_ci } else if(U_IS_SUPPLEMENTARY(composite)) { 13131cb0ef41Sopenharmony_ci // The composite is longer than the starter, 13141cb0ef41Sopenharmony_ci // move the intermediate characters back one. 
13151cb0ef41Sopenharmony_ci starterIsSupplementary=true; 13161cb0ef41Sopenharmony_ci ++starter; // temporarily increment for the loop boundary 13171cb0ef41Sopenharmony_ci q=pRemove; 13181cb0ef41Sopenharmony_ci r=++pRemove; 13191cb0ef41Sopenharmony_ci while(starter<q) { 13201cb0ef41Sopenharmony_ci *--r=*--q; 13211cb0ef41Sopenharmony_ci } 13221cb0ef41Sopenharmony_ci *starter=U16_TRAIL(composite); 13231cb0ef41Sopenharmony_ci *--starter=U16_LEAD(composite); // undo the temporary increment 13241cb0ef41Sopenharmony_ci } else { 13251cb0ef41Sopenharmony_ci // both are on the BMP 13261cb0ef41Sopenharmony_ci *starter=(char16_t)composite; 13271cb0ef41Sopenharmony_ci } 13281cb0ef41Sopenharmony_ci 13291cb0ef41Sopenharmony_ci /* remove the combining mark by moving the following text over it */ 13301cb0ef41Sopenharmony_ci if(pRemove<p) { 13311cb0ef41Sopenharmony_ci q=pRemove; 13321cb0ef41Sopenharmony_ci r=p; 13331cb0ef41Sopenharmony_ci while(r<limit) { 13341cb0ef41Sopenharmony_ci *q++=*r++; 13351cb0ef41Sopenharmony_ci } 13361cb0ef41Sopenharmony_ci limit=q; 13371cb0ef41Sopenharmony_ci p=pRemove; 13381cb0ef41Sopenharmony_ci } 13391cb0ef41Sopenharmony_ci // Keep prevCC because we removed the combining mark. 13401cb0ef41Sopenharmony_ci 13411cb0ef41Sopenharmony_ci if(p==limit) { 13421cb0ef41Sopenharmony_ci break; 13431cb0ef41Sopenharmony_ci } 13441cb0ef41Sopenharmony_ci // Is the composite a starter that combines forward? 13451cb0ef41Sopenharmony_ci if(compositeAndFwd&1) { 13461cb0ef41Sopenharmony_ci compositionsList= 13471cb0ef41Sopenharmony_ci getCompositionsListForComposite(getRawNorm16(composite)); 13481cb0ef41Sopenharmony_ci } else { 13491cb0ef41Sopenharmony_ci compositionsList=nullptr; 13501cb0ef41Sopenharmony_ci } 13511cb0ef41Sopenharmony_ci 13521cb0ef41Sopenharmony_ci // We combined; continue with looking for compositions. 
13531cb0ef41Sopenharmony_ci continue; 13541cb0ef41Sopenharmony_ci } 13551cb0ef41Sopenharmony_ci } 13561cb0ef41Sopenharmony_ci 13571cb0ef41Sopenharmony_ci // no combination this time 13581cb0ef41Sopenharmony_ci prevCC=cc; 13591cb0ef41Sopenharmony_ci if(p==limit) { 13601cb0ef41Sopenharmony_ci break; 13611cb0ef41Sopenharmony_ci } 13621cb0ef41Sopenharmony_ci 13631cb0ef41Sopenharmony_ci // If c did not combine, then check if it is a starter. 13641cb0ef41Sopenharmony_ci if(cc==0) { 13651cb0ef41Sopenharmony_ci // Found a new starter. 13661cb0ef41Sopenharmony_ci if((compositionsList=getCompositionsListForDecompYes(norm16))!=nullptr) { 13671cb0ef41Sopenharmony_ci // It may combine with something, prepare for it. 13681cb0ef41Sopenharmony_ci if(U_IS_BMP(c)) { 13691cb0ef41Sopenharmony_ci starterIsSupplementary=false; 13701cb0ef41Sopenharmony_ci starter=p-1; 13711cb0ef41Sopenharmony_ci } else { 13721cb0ef41Sopenharmony_ci starterIsSupplementary=true; 13731cb0ef41Sopenharmony_ci starter=p-2; 13741cb0ef41Sopenharmony_ci } 13751cb0ef41Sopenharmony_ci } 13761cb0ef41Sopenharmony_ci } else if(onlyContiguous) { 13771cb0ef41Sopenharmony_ci // FCC: no discontiguous compositions; any intervening character blocks. 13781cb0ef41Sopenharmony_ci compositionsList=nullptr; 13791cb0ef41Sopenharmony_ci } 13801cb0ef41Sopenharmony_ci } 13811cb0ef41Sopenharmony_ci buffer.setReorderingLimit(limit); 13821cb0ef41Sopenharmony_ci} 13831cb0ef41Sopenharmony_ci 13841cb0ef41Sopenharmony_ciUChar32 13851cb0ef41Sopenharmony_ciNormalizer2Impl::composePair(UChar32 a, UChar32 b) const { 13861cb0ef41Sopenharmony_ci uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16 13871cb0ef41Sopenharmony_ci const uint16_t *list; 13881cb0ef41Sopenharmony_ci if(isInert(norm16)) { 13891cb0ef41Sopenharmony_ci return U_SENTINEL; 13901cb0ef41Sopenharmony_ci } else if(norm16<minYesNoMappingsOnly) { 13911cb0ef41Sopenharmony_ci // a combines forward. 
13921cb0ef41Sopenharmony_ci if(isJamoL(norm16)) { 13931cb0ef41Sopenharmony_ci b-=Hangul::JAMO_V_BASE; 13941cb0ef41Sopenharmony_ci if(0<=b && b<Hangul::JAMO_V_COUNT) { 13951cb0ef41Sopenharmony_ci return 13961cb0ef41Sopenharmony_ci (Hangul::HANGUL_BASE+ 13971cb0ef41Sopenharmony_ci ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)* 13981cb0ef41Sopenharmony_ci Hangul::JAMO_T_COUNT); 13991cb0ef41Sopenharmony_ci } else { 14001cb0ef41Sopenharmony_ci return U_SENTINEL; 14011cb0ef41Sopenharmony_ci } 14021cb0ef41Sopenharmony_ci } else if(isHangulLV(norm16)) { 14031cb0ef41Sopenharmony_ci b-=Hangul::JAMO_T_BASE; 14041cb0ef41Sopenharmony_ci if(0<b && b<Hangul::JAMO_T_COUNT) { // not b==0! 14051cb0ef41Sopenharmony_ci return a+b; 14061cb0ef41Sopenharmony_ci } else { 14071cb0ef41Sopenharmony_ci return U_SENTINEL; 14081cb0ef41Sopenharmony_ci } 14091cb0ef41Sopenharmony_ci } else { 14101cb0ef41Sopenharmony_ci // 'a' has a compositions list in extraData 14111cb0ef41Sopenharmony_ci list=getMapping(norm16); 14121cb0ef41Sopenharmony_ci if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 14131cb0ef41Sopenharmony_ci list+= // mapping pointer 14141cb0ef41Sopenharmony_ci 1+ // +1 to skip the first unit with the mapping length 14151cb0ef41Sopenharmony_ci (*list&MAPPING_LENGTH_MASK); // + mapping length 14161cb0ef41Sopenharmony_ci } 14171cb0ef41Sopenharmony_ci } 14181cb0ef41Sopenharmony_ci } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 14191cb0ef41Sopenharmony_ci return U_SENTINEL; 14201cb0ef41Sopenharmony_ci } else { 14211cb0ef41Sopenharmony_ci list=getCompositionsListForMaybe(norm16); 14221cb0ef41Sopenharmony_ci } 14231cb0ef41Sopenharmony_ci if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 14241cb0ef41Sopenharmony_ci return U_SENTINEL; 14251cb0ef41Sopenharmony_ci } 14261cb0ef41Sopenharmony_ci#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 14271cb0ef41Sopenharmony_ci return combine(list, b)>>1; 14281cb0ef41Sopenharmony_ci#else 
14291cb0ef41Sopenharmony_ci int32_t compositeAndFwd=combine(list, b); 14301cb0ef41Sopenharmony_ci return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; 14311cb0ef41Sopenharmony_ci#endif 14321cb0ef41Sopenharmony_ci} 14331cb0ef41Sopenharmony_ci 14341cb0ef41Sopenharmony_ci// Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 14351cb0ef41Sopenharmony_ci// doCompose: normalize 14361cb0ef41Sopenharmony_ci// !doCompose: isNormalized (buffer must be empty and initialized) 14371cb0ef41Sopenharmony_ciUBool 14381cb0ef41Sopenharmony_ciNormalizer2Impl::compose(const char16_t *src, const char16_t *limit, 14391cb0ef41Sopenharmony_ci UBool onlyContiguous, 14401cb0ef41Sopenharmony_ci UBool doCompose, 14411cb0ef41Sopenharmony_ci ReorderingBuffer &buffer, 14421cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 14431cb0ef41Sopenharmony_ci const char16_t *prevBoundary=src; 14441cb0ef41Sopenharmony_ci UChar32 minNoMaybeCP=minCompNoMaybeCP; 14451cb0ef41Sopenharmony_ci if(limit==nullptr) { 14461cb0ef41Sopenharmony_ci src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, 14471cb0ef41Sopenharmony_ci doCompose ? 
&buffer : nullptr, 14481cb0ef41Sopenharmony_ci errorCode); 14491cb0ef41Sopenharmony_ci if(U_FAILURE(errorCode)) { 14501cb0ef41Sopenharmony_ci return false; 14511cb0ef41Sopenharmony_ci } 14521cb0ef41Sopenharmony_ci limit=u_strchr(src, 0); 14531cb0ef41Sopenharmony_ci if (prevBoundary != src) { 14541cb0ef41Sopenharmony_ci if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) { 14551cb0ef41Sopenharmony_ci prevBoundary = src; 14561cb0ef41Sopenharmony_ci } else { 14571cb0ef41Sopenharmony_ci buffer.removeSuffix(1); 14581cb0ef41Sopenharmony_ci prevBoundary = --src; 14591cb0ef41Sopenharmony_ci } 14601cb0ef41Sopenharmony_ci } 14611cb0ef41Sopenharmony_ci } 14621cb0ef41Sopenharmony_ci 14631cb0ef41Sopenharmony_ci for (;;) { 14641cb0ef41Sopenharmony_ci // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 14651cb0ef41Sopenharmony_ci // or with (compYes && ccc==0) properties. 14661cb0ef41Sopenharmony_ci const char16_t *prevSrc; 14671cb0ef41Sopenharmony_ci UChar32 c = 0; 14681cb0ef41Sopenharmony_ci uint16_t norm16 = 0; 14691cb0ef41Sopenharmony_ci for (;;) { 14701cb0ef41Sopenharmony_ci if (src == limit) { 14711cb0ef41Sopenharmony_ci if (prevBoundary != limit && doCompose) { 14721cb0ef41Sopenharmony_ci buffer.appendZeroCC(prevBoundary, limit, errorCode); 14731cb0ef41Sopenharmony_ci } 14741cb0ef41Sopenharmony_ci return true; 14751cb0ef41Sopenharmony_ci } 14761cb0ef41Sopenharmony_ci if( (c=*src)<minNoMaybeCP || 14771cb0ef41Sopenharmony_ci isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)) 14781cb0ef41Sopenharmony_ci ) { 14791cb0ef41Sopenharmony_ci ++src; 14801cb0ef41Sopenharmony_ci } else { 14811cb0ef41Sopenharmony_ci prevSrc = src++; 14821cb0ef41Sopenharmony_ci if(!U16_IS_LEAD(c)) { 14831cb0ef41Sopenharmony_ci break; 14841cb0ef41Sopenharmony_ci } else { 14851cb0ef41Sopenharmony_ci char16_t c2; 14861cb0ef41Sopenharmony_ci if(src!=limit && U16_IS_TRAIL(c2=*src)) { 14871cb0ef41Sopenharmony_ci ++src; 14881cb0ef41Sopenharmony_ci 
c=U16_GET_SUPPLEMENTARY(c, c2); 14891cb0ef41Sopenharmony_ci norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c); 14901cb0ef41Sopenharmony_ci if(!isCompYesAndZeroCC(norm16)) { 14911cb0ef41Sopenharmony_ci break; 14921cb0ef41Sopenharmony_ci } 14931cb0ef41Sopenharmony_ci } 14941cb0ef41Sopenharmony_ci } 14951cb0ef41Sopenharmony_ci } 14961cb0ef41Sopenharmony_ci } 14971cb0ef41Sopenharmony_ci // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 14981cb0ef41Sopenharmony_ci // The current character is either a "noNo" (has a mapping) 14991cb0ef41Sopenharmony_ci // or a "maybeYes" (combines backward) 15001cb0ef41Sopenharmony_ci // or a "yesYes" with ccc!=0. 15011cb0ef41Sopenharmony_ci // It is not a Hangul syllable or Jamo L because those have "yes" properties. 15021cb0ef41Sopenharmony_ci 15031cb0ef41Sopenharmony_ci // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 15041cb0ef41Sopenharmony_ci if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 15051cb0ef41Sopenharmony_ci if (!doCompose) { 15061cb0ef41Sopenharmony_ci return false; 15071cb0ef41Sopenharmony_ci } 15081cb0ef41Sopenharmony_ci // Fast path for mapping a character that is immediately surrounded by boundaries. 15091cb0ef41Sopenharmony_ci // In this case, we need not decompose around the current character. 15101cb0ef41Sopenharmony_ci if (isDecompNoAlgorithmic(norm16)) { 15111cb0ef41Sopenharmony_ci // Maps to a single isCompYesAndZeroCC character 15121cb0ef41Sopenharmony_ci // which also implies hasCompBoundaryBefore. 
15131cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 15141cb0ef41Sopenharmony_ci hasCompBoundaryBefore(src, limit)) { 15151cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { 15161cb0ef41Sopenharmony_ci break; 15171cb0ef41Sopenharmony_ci } 15181cb0ef41Sopenharmony_ci if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) { 15191cb0ef41Sopenharmony_ci break; 15201cb0ef41Sopenharmony_ci } 15211cb0ef41Sopenharmony_ci prevBoundary = src; 15221cb0ef41Sopenharmony_ci continue; 15231cb0ef41Sopenharmony_ci } 15241cb0ef41Sopenharmony_ci } else if (norm16 < minNoNoCompBoundaryBefore) { 15251cb0ef41Sopenharmony_ci // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 15261cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 15271cb0ef41Sopenharmony_ci hasCompBoundaryBefore(src, limit)) { 15281cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { 15291cb0ef41Sopenharmony_ci break; 15301cb0ef41Sopenharmony_ci } 15311cb0ef41Sopenharmony_ci const char16_t *mapping = reinterpret_cast<const char16_t *>(getMapping(norm16)); 15321cb0ef41Sopenharmony_ci int32_t length = *mapping++ & MAPPING_LENGTH_MASK; 15331cb0ef41Sopenharmony_ci if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) { 15341cb0ef41Sopenharmony_ci break; 15351cb0ef41Sopenharmony_ci } 15361cb0ef41Sopenharmony_ci prevBoundary = src; 15371cb0ef41Sopenharmony_ci continue; 15381cb0ef41Sopenharmony_ci } 15391cb0ef41Sopenharmony_ci } else if (norm16 >= minNoNoEmpty) { 15401cb0ef41Sopenharmony_ci // The current character maps to nothing. 15411cb0ef41Sopenharmony_ci // Simply omit it from the output if there is a boundary before _or_ after it. 15421cb0ef41Sopenharmony_ci // The character itself implies no boundaries. 
15431cb0ef41Sopenharmony_ci if (hasCompBoundaryBefore(src, limit) || 15441cb0ef41Sopenharmony_ci hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) { 15451cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { 15461cb0ef41Sopenharmony_ci break; 15471cb0ef41Sopenharmony_ci } 15481cb0ef41Sopenharmony_ci prevBoundary = src; 15491cb0ef41Sopenharmony_ci continue; 15501cb0ef41Sopenharmony_ci } 15511cb0ef41Sopenharmony_ci } 15521cb0ef41Sopenharmony_ci // Other "noNo" type, or need to examine more text around this character: 15531cb0ef41Sopenharmony_ci // Fall through to the slow path. 15541cb0ef41Sopenharmony_ci } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 15551cb0ef41Sopenharmony_ci char16_t prev=*(prevSrc-1); 15561cb0ef41Sopenharmony_ci if(c<Hangul::JAMO_T_BASE) { 15571cb0ef41Sopenharmony_ci // The current character is a Jamo Vowel, 15581cb0ef41Sopenharmony_ci // compose with previous Jamo L and following Jamo T. 15591cb0ef41Sopenharmony_ci char16_t l = (char16_t)(prev-Hangul::JAMO_L_BASE); 15601cb0ef41Sopenharmony_ci if(l<Hangul::JAMO_L_COUNT) { 15611cb0ef41Sopenharmony_ci if (!doCompose) { 15621cb0ef41Sopenharmony_ci return false; 15631cb0ef41Sopenharmony_ci } 15641cb0ef41Sopenharmony_ci int32_t t; 15651cb0ef41Sopenharmony_ci if (src != limit && 15661cb0ef41Sopenharmony_ci 0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) && 15671cb0ef41Sopenharmony_ci t < Hangul::JAMO_T_COUNT) { 15681cb0ef41Sopenharmony_ci // The next character is a Jamo T. 15691cb0ef41Sopenharmony_ci ++src; 15701cb0ef41Sopenharmony_ci } else if (hasCompBoundaryBefore(src, limit)) { 15711cb0ef41Sopenharmony_ci // No Jamo T follows, not even via decomposition. 
15721cb0ef41Sopenharmony_ci t = 0; 15731cb0ef41Sopenharmony_ci } else { 15741cb0ef41Sopenharmony_ci t = -1; 15751cb0ef41Sopenharmony_ci } 15761cb0ef41Sopenharmony_ci if (t >= 0) { 15771cb0ef41Sopenharmony_ci UChar32 syllable = Hangul::HANGUL_BASE + 15781cb0ef41Sopenharmony_ci (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) * 15791cb0ef41Sopenharmony_ci Hangul::JAMO_T_COUNT + t; 15801cb0ef41Sopenharmony_ci --prevSrc; // Replace the Jamo L as well. 15811cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { 15821cb0ef41Sopenharmony_ci break; 15831cb0ef41Sopenharmony_ci } 15841cb0ef41Sopenharmony_ci if(!buffer.appendBMP((char16_t)syllable, 0, errorCode)) { 15851cb0ef41Sopenharmony_ci break; 15861cb0ef41Sopenharmony_ci } 15871cb0ef41Sopenharmony_ci prevBoundary = src; 15881cb0ef41Sopenharmony_ci continue; 15891cb0ef41Sopenharmony_ci } 15901cb0ef41Sopenharmony_ci // If we see L+V+x where x!=T then we drop to the slow path, 15911cb0ef41Sopenharmony_ci // decompose and recompose. 15921cb0ef41Sopenharmony_ci // This is to deal with NFKC finding normal L and V but a 15931cb0ef41Sopenharmony_ci // compatibility variant of a T. 15941cb0ef41Sopenharmony_ci // We need to either fully compose that combination here 15951cb0ef41Sopenharmony_ci // (which would complicate the code and may not work with strange custom data) 15961cb0ef41Sopenharmony_ci // or use the slow path. 15971cb0ef41Sopenharmony_ci } 15981cb0ef41Sopenharmony_ci } else if (Hangul::isHangulLV(prev)) { 15991cb0ef41Sopenharmony_ci // The current character is a Jamo Trailing consonant, 16001cb0ef41Sopenharmony_ci // compose with previous Hangul LV that does not contain a Jamo T. 16011cb0ef41Sopenharmony_ci if (!doCompose) { 16021cb0ef41Sopenharmony_ci return false; 16031cb0ef41Sopenharmony_ci } 16041cb0ef41Sopenharmony_ci UChar32 syllable = prev + c - Hangul::JAMO_T_BASE; 16051cb0ef41Sopenharmony_ci --prevSrc; // Replace the Hangul LV as well. 
16061cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { 16071cb0ef41Sopenharmony_ci break; 16081cb0ef41Sopenharmony_ci } 16091cb0ef41Sopenharmony_ci if(!buffer.appendBMP((char16_t)syllable, 0, errorCode)) { 16101cb0ef41Sopenharmony_ci break; 16111cb0ef41Sopenharmony_ci } 16121cb0ef41Sopenharmony_ci prevBoundary = src; 16131cb0ef41Sopenharmony_ci continue; 16141cb0ef41Sopenharmony_ci } 16151cb0ef41Sopenharmony_ci // No matching context, or may need to decompose surrounding text first: 16161cb0ef41Sopenharmony_ci // Fall through to the slow path. 16171cb0ef41Sopenharmony_ci } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 16181cb0ef41Sopenharmony_ci // One or more combining marks that do not combine-back: 16191cb0ef41Sopenharmony_ci // Check for canonical order, copy unchanged if ok and 16201cb0ef41Sopenharmony_ci // if followed by a character with a boundary-before. 16211cb0ef41Sopenharmony_ci uint8_t cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 16221cb0ef41Sopenharmony_ci if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) { 16231cb0ef41Sopenharmony_ci // Fails FCD test, need to decompose and contiguously recompose. 16241cb0ef41Sopenharmony_ci if (!doCompose) { 16251cb0ef41Sopenharmony_ci return false; 16261cb0ef41Sopenharmony_ci } 16271cb0ef41Sopenharmony_ci } else { 16281cb0ef41Sopenharmony_ci // If !onlyContiguous (not FCC), then we ignore the tccc of 16291cb0ef41Sopenharmony_ci // the previous character which passed the quick check "yes && ccc==0" test. 
16301cb0ef41Sopenharmony_ci const char16_t *nextSrc; 16311cb0ef41Sopenharmony_ci uint16_t n16; 16321cb0ef41Sopenharmony_ci for (;;) { 16331cb0ef41Sopenharmony_ci if (src == limit) { 16341cb0ef41Sopenharmony_ci if (doCompose) { 16351cb0ef41Sopenharmony_ci buffer.appendZeroCC(prevBoundary, limit, errorCode); 16361cb0ef41Sopenharmony_ci } 16371cb0ef41Sopenharmony_ci return true; 16381cb0ef41Sopenharmony_ci } 16391cb0ef41Sopenharmony_ci uint8_t prevCC = cc; 16401cb0ef41Sopenharmony_ci nextSrc = src; 16411cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16); 16421cb0ef41Sopenharmony_ci if (n16 >= MIN_YES_YES_WITH_CC) { 16431cb0ef41Sopenharmony_ci cc = getCCFromNormalYesOrMaybe(n16); 16441cb0ef41Sopenharmony_ci if (prevCC > cc) { 16451cb0ef41Sopenharmony_ci if (!doCompose) { 16461cb0ef41Sopenharmony_ci return false; 16471cb0ef41Sopenharmony_ci } 16481cb0ef41Sopenharmony_ci break; 16491cb0ef41Sopenharmony_ci } 16501cb0ef41Sopenharmony_ci } else { 16511cb0ef41Sopenharmony_ci break; 16521cb0ef41Sopenharmony_ci } 16531cb0ef41Sopenharmony_ci src = nextSrc; 16541cb0ef41Sopenharmony_ci } 16551cb0ef41Sopenharmony_ci // src is after the last in-order combining mark. 16561cb0ef41Sopenharmony_ci // If there is a boundary here, then we continue with no change. 16571cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryBefore(n16)) { 16581cb0ef41Sopenharmony_ci if (isCompYesAndZeroCC(n16)) { 16591cb0ef41Sopenharmony_ci src = nextSrc; 16601cb0ef41Sopenharmony_ci } 16611cb0ef41Sopenharmony_ci continue; 16621cb0ef41Sopenharmony_ci } 16631cb0ef41Sopenharmony_ci // Use the slow path. There is no boundary in [prevSrc, src[. 16641cb0ef41Sopenharmony_ci } 16651cb0ef41Sopenharmony_ci } 16661cb0ef41Sopenharmony_ci 16671cb0ef41Sopenharmony_ci // Slow path: Find the nearest boundaries around the current character, 16681cb0ef41Sopenharmony_ci // decompose and recompose. 
16691cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 16701cb0ef41Sopenharmony_ci const char16_t *p = prevSrc; 16711cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16); 16721cb0ef41Sopenharmony_ci if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 16731cb0ef41Sopenharmony_ci prevSrc = p; 16741cb0ef41Sopenharmony_ci } 16751cb0ef41Sopenharmony_ci } 16761cb0ef41Sopenharmony_ci if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { 16771cb0ef41Sopenharmony_ci break; 16781cb0ef41Sopenharmony_ci } 16791cb0ef41Sopenharmony_ci int32_t recomposeStartIndex=buffer.length(); 16801cb0ef41Sopenharmony_ci // We know there is not a boundary here. 16811cb0ef41Sopenharmony_ci decomposeShort(prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 16821cb0ef41Sopenharmony_ci buffer, errorCode); 16831cb0ef41Sopenharmony_ci // Decompose until the next boundary. 
16841cb0ef41Sopenharmony_ci src = decomposeShort(src, limit, true /* stopAtCompBoundary */, onlyContiguous, 16851cb0ef41Sopenharmony_ci buffer, errorCode); 16861cb0ef41Sopenharmony_ci if (U_FAILURE(errorCode)) { 16871cb0ef41Sopenharmony_ci break; 16881cb0ef41Sopenharmony_ci } 16891cb0ef41Sopenharmony_ci if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals() 16901cb0ef41Sopenharmony_ci errorCode = U_INDEX_OUTOFBOUNDS_ERROR; 16911cb0ef41Sopenharmony_ci return true; 16921cb0ef41Sopenharmony_ci } 16931cb0ef41Sopenharmony_ci recompose(buffer, recomposeStartIndex, onlyContiguous); 16941cb0ef41Sopenharmony_ci if(!doCompose) { 16951cb0ef41Sopenharmony_ci if(!buffer.equals(prevSrc, src)) { 16961cb0ef41Sopenharmony_ci return false; 16971cb0ef41Sopenharmony_ci } 16981cb0ef41Sopenharmony_ci buffer.remove(); 16991cb0ef41Sopenharmony_ci } 17001cb0ef41Sopenharmony_ci prevBoundary=src; 17011cb0ef41Sopenharmony_ci } 17021cb0ef41Sopenharmony_ci return true; 17031cb0ef41Sopenharmony_ci} 17041cb0ef41Sopenharmony_ci 17051cb0ef41Sopenharmony_ci// Very similar to compose(): Make the same changes in both places if relevant. 
17061cb0ef41Sopenharmony_ci// pQCResult==nullptr: spanQuickCheckYes 17071cb0ef41Sopenharmony_ci// pQCResult!=nullptr: quickCheck (*pQCResult must be UNORM_YES) 17081cb0ef41Sopenharmony_ciconst char16_t * 17091cb0ef41Sopenharmony_ciNormalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit, 17101cb0ef41Sopenharmony_ci UBool onlyContiguous, 17111cb0ef41Sopenharmony_ci UNormalizationCheckResult *pQCResult) const { 17121cb0ef41Sopenharmony_ci const char16_t *prevBoundary=src; 17131cb0ef41Sopenharmony_ci UChar32 minNoMaybeCP=minCompNoMaybeCP; 17141cb0ef41Sopenharmony_ci if(limit==nullptr) { 17151cb0ef41Sopenharmony_ci UErrorCode errorCode=U_ZERO_ERROR; 17161cb0ef41Sopenharmony_ci src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, nullptr, errorCode); 17171cb0ef41Sopenharmony_ci limit=u_strchr(src, 0); 17181cb0ef41Sopenharmony_ci if (prevBoundary != src) { 17191cb0ef41Sopenharmony_ci if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) { 17201cb0ef41Sopenharmony_ci prevBoundary = src; 17211cb0ef41Sopenharmony_ci } else { 17221cb0ef41Sopenharmony_ci prevBoundary = --src; 17231cb0ef41Sopenharmony_ci } 17241cb0ef41Sopenharmony_ci } 17251cb0ef41Sopenharmony_ci } 17261cb0ef41Sopenharmony_ci 17271cb0ef41Sopenharmony_ci for(;;) { 17281cb0ef41Sopenharmony_ci // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 17291cb0ef41Sopenharmony_ci // or with (compYes && ccc==0) properties. 
17301cb0ef41Sopenharmony_ci const char16_t *prevSrc; 17311cb0ef41Sopenharmony_ci UChar32 c = 0; 17321cb0ef41Sopenharmony_ci uint16_t norm16 = 0; 17331cb0ef41Sopenharmony_ci for (;;) { 17341cb0ef41Sopenharmony_ci if(src==limit) { 17351cb0ef41Sopenharmony_ci return src; 17361cb0ef41Sopenharmony_ci } 17371cb0ef41Sopenharmony_ci if( (c=*src)<minNoMaybeCP || 17381cb0ef41Sopenharmony_ci isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c)) 17391cb0ef41Sopenharmony_ci ) { 17401cb0ef41Sopenharmony_ci ++src; 17411cb0ef41Sopenharmony_ci } else { 17421cb0ef41Sopenharmony_ci prevSrc = src++; 17431cb0ef41Sopenharmony_ci if(!U16_IS_LEAD(c)) { 17441cb0ef41Sopenharmony_ci break; 17451cb0ef41Sopenharmony_ci } else { 17461cb0ef41Sopenharmony_ci char16_t c2; 17471cb0ef41Sopenharmony_ci if(src!=limit && U16_IS_TRAIL(c2=*src)) { 17481cb0ef41Sopenharmony_ci ++src; 17491cb0ef41Sopenharmony_ci c=U16_GET_SUPPLEMENTARY(c, c2); 17501cb0ef41Sopenharmony_ci norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c); 17511cb0ef41Sopenharmony_ci if(!isCompYesAndZeroCC(norm16)) { 17521cb0ef41Sopenharmony_ci break; 17531cb0ef41Sopenharmony_ci } 17541cb0ef41Sopenharmony_ci } 17551cb0ef41Sopenharmony_ci } 17561cb0ef41Sopenharmony_ci } 17571cb0ef41Sopenharmony_ci } 17581cb0ef41Sopenharmony_ci // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 17591cb0ef41Sopenharmony_ci // The current character is either a "noNo" (has a mapping) 17601cb0ef41Sopenharmony_ci // or a "maybeYes" (combines backward) 17611cb0ef41Sopenharmony_ci // or a "yesYes" with ccc!=0. 17621cb0ef41Sopenharmony_ci // It is not a Hangul syllable or Jamo L because those have "yes" properties. 
17631cb0ef41Sopenharmony_ci 17641cb0ef41Sopenharmony_ci uint16_t prevNorm16 = INERT; 17651cb0ef41Sopenharmony_ci if (prevBoundary != prevSrc) { 17661cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryBefore(norm16)) { 17671cb0ef41Sopenharmony_ci prevBoundary = prevSrc; 17681cb0ef41Sopenharmony_ci } else { 17691cb0ef41Sopenharmony_ci const char16_t *p = prevSrc; 17701cb0ef41Sopenharmony_ci uint16_t n16; 17711cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16); 17721cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 17731cb0ef41Sopenharmony_ci prevBoundary = prevSrc; 17741cb0ef41Sopenharmony_ci } else { 17751cb0ef41Sopenharmony_ci prevBoundary = p; 17761cb0ef41Sopenharmony_ci prevNorm16 = n16; 17771cb0ef41Sopenharmony_ci } 17781cb0ef41Sopenharmony_ci } 17791cb0ef41Sopenharmony_ci } 17801cb0ef41Sopenharmony_ci 17811cb0ef41Sopenharmony_ci if(isMaybeOrNonZeroCC(norm16)) { 17821cb0ef41Sopenharmony_ci uint8_t cc=getCCFromYesOrMaybe(norm16); 17831cb0ef41Sopenharmony_ci if (onlyContiguous /* FCC */ && cc != 0 && 17841cb0ef41Sopenharmony_ci getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 17851cb0ef41Sopenharmony_ci // The [prevBoundary..prevSrc[ character 17861cb0ef41Sopenharmony_ci // passed the quick check "yes && ccc==0" test 17871cb0ef41Sopenharmony_ci // but is out of canonical order with the current combining mark. 17881cb0ef41Sopenharmony_ci } else { 17891cb0ef41Sopenharmony_ci // If !onlyContiguous (not FCC), then we ignore the tccc of 17901cb0ef41Sopenharmony_ci // the previous character which passed the quick check "yes && ccc==0" test. 
17911cb0ef41Sopenharmony_ci const char16_t *nextSrc; 17921cb0ef41Sopenharmony_ci for (;;) { 17931cb0ef41Sopenharmony_ci if (norm16 < MIN_YES_YES_WITH_CC) { 17941cb0ef41Sopenharmony_ci if (pQCResult != nullptr) { 17951cb0ef41Sopenharmony_ci *pQCResult = UNORM_MAYBE; 17961cb0ef41Sopenharmony_ci } else { 17971cb0ef41Sopenharmony_ci return prevBoundary; 17981cb0ef41Sopenharmony_ci } 17991cb0ef41Sopenharmony_ci } 18001cb0ef41Sopenharmony_ci if (src == limit) { 18011cb0ef41Sopenharmony_ci return src; 18021cb0ef41Sopenharmony_ci } 18031cb0ef41Sopenharmony_ci uint8_t prevCC = cc; 18041cb0ef41Sopenharmony_ci nextSrc = src; 18051cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16); 18061cb0ef41Sopenharmony_ci if (isMaybeOrNonZeroCC(norm16)) { 18071cb0ef41Sopenharmony_ci cc = getCCFromYesOrMaybe(norm16); 18081cb0ef41Sopenharmony_ci if (!(prevCC <= cc || cc == 0)) { 18091cb0ef41Sopenharmony_ci break; 18101cb0ef41Sopenharmony_ci } 18111cb0ef41Sopenharmony_ci } else { 18121cb0ef41Sopenharmony_ci break; 18131cb0ef41Sopenharmony_ci } 18141cb0ef41Sopenharmony_ci src = nextSrc; 18151cb0ef41Sopenharmony_ci } 18161cb0ef41Sopenharmony_ci // src is after the last in-order combining mark. 
18171cb0ef41Sopenharmony_ci if (isCompYesAndZeroCC(norm16)) { 18181cb0ef41Sopenharmony_ci prevBoundary = src; 18191cb0ef41Sopenharmony_ci src = nextSrc; 18201cb0ef41Sopenharmony_ci continue; 18211cb0ef41Sopenharmony_ci } 18221cb0ef41Sopenharmony_ci } 18231cb0ef41Sopenharmony_ci } 18241cb0ef41Sopenharmony_ci if(pQCResult!=nullptr) { 18251cb0ef41Sopenharmony_ci *pQCResult=UNORM_NO; 18261cb0ef41Sopenharmony_ci } 18271cb0ef41Sopenharmony_ci return prevBoundary; 18281cb0ef41Sopenharmony_ci } 18291cb0ef41Sopenharmony_ci} 18301cb0ef41Sopenharmony_ci 18311cb0ef41Sopenharmony_civoid Normalizer2Impl::composeAndAppend(const char16_t *src, const char16_t *limit, 18321cb0ef41Sopenharmony_ci UBool doCompose, 18331cb0ef41Sopenharmony_ci UBool onlyContiguous, 18341cb0ef41Sopenharmony_ci UnicodeString &safeMiddle, 18351cb0ef41Sopenharmony_ci ReorderingBuffer &buffer, 18361cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 18371cb0ef41Sopenharmony_ci if(!buffer.isEmpty()) { 18381cb0ef41Sopenharmony_ci const char16_t *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous); 18391cb0ef41Sopenharmony_ci if(src!=firstStarterInSrc) { 18401cb0ef41Sopenharmony_ci const char16_t *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 18411cb0ef41Sopenharmony_ci buffer.getLimit(), onlyContiguous); 18421cb0ef41Sopenharmony_ci int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); 18431cb0ef41Sopenharmony_ci UnicodeString middle(lastStarterInDest, destSuffixLength); 18441cb0ef41Sopenharmony_ci buffer.removeSuffix(destSuffixLength); 18451cb0ef41Sopenharmony_ci safeMiddle=middle; 18461cb0ef41Sopenharmony_ci middle.append(src, (int32_t)(firstStarterInSrc-src)); 18471cb0ef41Sopenharmony_ci const char16_t *middleStart=middle.getBuffer(); 18481cb0ef41Sopenharmony_ci compose(middleStart, middleStart+middle.length(), onlyContiguous, 18491cb0ef41Sopenharmony_ci true, buffer, errorCode); 18501cb0ef41Sopenharmony_ci if(U_FAILURE(errorCode)) { 
18511cb0ef41Sopenharmony_ci return; 18521cb0ef41Sopenharmony_ci } 18531cb0ef41Sopenharmony_ci src=firstStarterInSrc; 18541cb0ef41Sopenharmony_ci } 18551cb0ef41Sopenharmony_ci } 18561cb0ef41Sopenharmony_ci if(doCompose) { 18571cb0ef41Sopenharmony_ci compose(src, limit, onlyContiguous, true, buffer, errorCode); 18581cb0ef41Sopenharmony_ci } else { 18591cb0ef41Sopenharmony_ci if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr 18601cb0ef41Sopenharmony_ci limit=u_strchr(src, 0); 18611cb0ef41Sopenharmony_ci } 18621cb0ef41Sopenharmony_ci buffer.appendZeroCC(src, limit, errorCode); 18631cb0ef41Sopenharmony_ci } 18641cb0ef41Sopenharmony_ci} 18651cb0ef41Sopenharmony_ci 18661cb0ef41Sopenharmony_ciUBool 18671cb0ef41Sopenharmony_ciNormalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, 18681cb0ef41Sopenharmony_ci const uint8_t *src, const uint8_t *limit, 18691cb0ef41Sopenharmony_ci ByteSink *sink, Edits *edits, UErrorCode &errorCode) const { 18701cb0ef41Sopenharmony_ci U_ASSERT(limit != nullptr); 18711cb0ef41Sopenharmony_ci UnicodeString s16; 18721cb0ef41Sopenharmony_ci uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP); 18731cb0ef41Sopenharmony_ci const uint8_t *prevBoundary = src; 18741cb0ef41Sopenharmony_ci 18751cb0ef41Sopenharmony_ci for (;;) { 18761cb0ef41Sopenharmony_ci // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 18771cb0ef41Sopenharmony_ci // or with (compYes && ccc==0) properties. 
/**
 * Composes UTF-8 text in [src, limit[ and writes the result to a ByteSink,
 * or, when sink is nullptr, only tests whether the text would change.
 *
 * The function walks the input once. Text that needs no change is not copied
 * immediately; prevBoundary marks the start of the pending unchanged span,
 * which is flushed with ByteSinkUtil::appendUnchanged() right before a change
 * is written or when the end of the input is reached.
 *
 * @param options        Passed through to ByteSinkUtil::appendUnchanged().
 * @param onlyContiguous Passed to the boundary tests, decomposeShort() and
 *                       recompose(); the code comments label this mode "FCC".
 * @param src            Start of the UTF-8 input.
 * @param limit          End of the UTF-8 input; must not be nullptr (asserted).
 * @param sink           Output sink, or nullptr for check-only mode.
 * @param edits          Optional edits record, passed to the ByteSinkUtil
 *                       helpers; may be nullptr.
 * @param errorCode      In/out ICU error code.
 * @return false only in check-only mode (sink==nullptr), as soon as a change
 *         to the input would be required. Otherwise true, including when the
 *         loop is left early because a helper failed or errorCode was set, so
 *         callers must inspect errorCode as well.
 */
UBool
Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
                             const uint8_t *src, const uint8_t *limit,
                             ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
    U_ASSERT(limit != nullptr);
    // Backing string for the ReorderingBuffer that the slow path creates in each iteration.
    UnicodeString s16;
    // Bytes below this value are skipped without a trie lookup.
    uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
    // Start of the text that has been accepted as unchanged but not yet written to the sink.
    const uint8_t *prevBoundary = src;

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const uint8_t *prevSrc;
        uint16_t norm16 = 0;
        for (;;) {
            if (src == limit) {
                // End of input: flush whatever unchanged text is still pending.
                if (prevBoundary != limit && sink != nullptr) {
                    ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                  *sink, options, edits, errorCode);
                }
                return true;
            }
            if (*src < minNoMaybeLead) {
                ++src;
            } else {
                // prevSrc = start of the current character; the macro advances src past it.
                prevSrc = src;
                UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
                if (!isCompYesAndZeroCC(norm16)) {
                    break;
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
        if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
            if (sink == nullptr) {
                // Check-only mode: stop as soon as a "noNo" character is seen.
                return false;
            }
            // Fast path for mapping a character that is immediately surrounded by boundaries.
            // In this case, we need not decompose around the current character.
            if (isDecompNoAlgorithmic(norm16)) {
                // Maps to a single isCompYesAndZeroCC character
                // which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 < minNoNoCompBoundaryBefore) {
                // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    // The first unit of the mapping holds its length in the low bits;
                    // the mapped text follows it.
                    const uint16_t *mapping = getMapping(norm16);
                    int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
                    if (!ByteSinkUtil::appendChange(prevSrc, src, (const char16_t *)mapping, length,
                                                    *sink, edits, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 >= minNoNoEmpty) {
                // The current character maps to nothing.
                // Simply omit it from the output if there is a boundary before _or_ after it.
                // The character itself implies no boundaries.
                if (hasCompBoundaryBefore(src, limit) ||
                        hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
                    if (prevBoundary != prevSrc &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    // Nothing is written to the sink; only the deletion is recorded.
                    if (edits != nullptr) {
                        edits->addReplace((int32_t)(src - prevSrc), 0);
                    }
                    prevBoundary = src;
                    continue;
                }
            }
            // Other "noNo" type, or need to examine more text around this character:
            // Fall through to the slow path.
        } else if (isJamoVT(norm16)) {
            // Jamo L: E1 84 80..92
            // Jamo V: E1 85 A1..B5
            // Jamo T: E1 86 A8..E1 87 82
            U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
            UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
            if (prevSrc[1] == 0x85) {
                // The current character is a Jamo Vowel,
                // compose with previous Jamo L and following Jamo T.
                UChar32 l = prev - Hangul::JAMO_L_BASE;
                // The unsigned comparison also rejects prev values below JAMO_L_BASE.
                if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
                    if (sink == nullptr) {
                        return false;
                    }
                    int32_t t = getJamoTMinusBase(src, limit);
                    if (t >= 0) {
                        // The next character is a Jamo T.
                        src += 3;
                    } else if (hasCompBoundaryBefore(src, limit)) {
                        // No Jamo T follows, not even via decomposition.
                        t = 0;
                    }
                    if (t >= 0) {
                        UChar32 syllable = Hangul::HANGUL_BASE +
                            (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
                            Hangul::JAMO_T_COUNT + t;
                        prevSrc -= 3;  // Replace the Jamo L as well.
                        if (prevBoundary != prevSrc &&
                                !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                               *sink, options, edits, errorCode)) {
                            break;
                        }
                        ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
                        prevBoundary = src;
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T.
                    // We need to either fully compose that combination here
                    // (which would complicate the code and may not work with strange custom data)
                    // or use the slow path.
                }
            } else if (Hangul::isHangulLV(prev)) {
                // The current character is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if (sink == nullptr) {
                    return false;
                }
                UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
                prevSrc -= 3;  // Replace the Hangul LV as well.
                if (prevBoundary != prevSrc &&
                        !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                       *sink, options, edits, errorCode)) {
                    break;
                }
                ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
                prevBoundary = src;
                continue;
            }
            // No matching context, or may need to decompose surrounding text first:
            // Fall through to the slow path.
        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
            // One or more combining marks that do not combine-back:
            // Check for canonical order, copy unchanged if ok and
            // if followed by a character with a boundary-before.
            uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
            if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if (sink == nullptr) {
                    return false;
                }
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const uint8_t *nextSrc;
                uint16_t n16;
                for (;;) {
                    if (src == limit) {
                        if (sink != nullptr) {
                            ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                          *sink, options, edits, errorCode);
                        }
                        return true;
                    }
                    uint8_t prevCC = cc;
                    // Look ahead without committing: src only advances once the
                    // next character is accepted as an in-order combining mark.
                    nextSrc = src;
                    UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
                    if (n16 >= MIN_YES_YES_WITH_CC) {
                        cc = getCCFromNormalYesOrMaybe(n16);
                        if (prevCC > cc) {
                            if (sink == nullptr) {
                                return false;
                            }
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                // If there is a boundary here, then we continue with no change.
                if (norm16HasCompBoundaryBefore(n16)) {
                    if (isCompYesAndZeroCC(n16)) {
                        src = nextSrc;
                    }
                    continue;
                }
                // Use the slow path. There is no boundary in [prevSrc, src[.
            }
        }

        // Slow path: Find the nearest boundaries around the current character,
        // decompose and recompose.
        if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
            // Step back one character (norm16 is overwritten with that character's
            // value) and include it unless there is a boundary after it.
            const uint8_t *p = prevSrc;
            UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
            if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                prevSrc = p;
            }
        }
        ReorderingBuffer buffer(*this, s16, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        // We know there is not a boundary here.
        decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return true;
        }
        recompose(buffer, 0, onlyContiguous);
        // Only emit a change if recomposition actually produced different text;
        // otherwise the span stays part of the pending unchanged text.
        if (!buffer.equals(prevSrc, src)) {
            if (sink == nullptr) {
                return false;
            }
            if (prevBoundary != prevSrc &&
                    !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                   *sink, options, edits, errorCode)) {
                break;
            }
            if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
                                            *sink, edits, errorCode)) {
                break;
            }
            prevBoundary = src;
        }
    }
    return true;
}
(!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(), 21141cb0ef41Sopenharmony_ci *sink, edits, errorCode)) { 21151cb0ef41Sopenharmony_ci break; 21161cb0ef41Sopenharmony_ci } 21171cb0ef41Sopenharmony_ci prevBoundary = src; 21181cb0ef41Sopenharmony_ci } 21191cb0ef41Sopenharmony_ci } 21201cb0ef41Sopenharmony_ci return true; 21211cb0ef41Sopenharmony_ci} 21221cb0ef41Sopenharmony_ci 21231cb0ef41Sopenharmony_ciUBool Normalizer2Impl::hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const { 21241cb0ef41Sopenharmony_ci if (src == limit || *src < minCompNoMaybeCP) { 21251cb0ef41Sopenharmony_ci return true; 21261cb0ef41Sopenharmony_ci } 21271cb0ef41Sopenharmony_ci UChar32 c; 21281cb0ef41Sopenharmony_ci uint16_t norm16; 21291cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16); 21301cb0ef41Sopenharmony_ci return norm16HasCompBoundaryBefore(norm16); 21311cb0ef41Sopenharmony_ci} 21321cb0ef41Sopenharmony_ci 21331cb0ef41Sopenharmony_ciUBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const { 21341cb0ef41Sopenharmony_ci if (src == limit) { 21351cb0ef41Sopenharmony_ci return true; 21361cb0ef41Sopenharmony_ci } 21371cb0ef41Sopenharmony_ci uint16_t norm16; 21381cb0ef41Sopenharmony_ci UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16); 21391cb0ef41Sopenharmony_ci return norm16HasCompBoundaryBefore(norm16); 21401cb0ef41Sopenharmony_ci} 21411cb0ef41Sopenharmony_ci 21421cb0ef41Sopenharmony_ciUBool Normalizer2Impl::hasCompBoundaryAfter(const char16_t *start, const char16_t *p, 21431cb0ef41Sopenharmony_ci UBool onlyContiguous) const { 21441cb0ef41Sopenharmony_ci if (start == p) { 21451cb0ef41Sopenharmony_ci return true; 21461cb0ef41Sopenharmony_ci } 21471cb0ef41Sopenharmony_ci UChar32 c; 21481cb0ef41Sopenharmony_ci uint16_t norm16; 21491cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16); 21501cb0ef41Sopenharmony_ci return 
norm16HasCompBoundaryAfter(norm16, onlyContiguous); 21511cb0ef41Sopenharmony_ci} 21521cb0ef41Sopenharmony_ci 21531cb0ef41Sopenharmony_ciUBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p, 21541cb0ef41Sopenharmony_ci UBool onlyContiguous) const { 21551cb0ef41Sopenharmony_ci if (start == p) { 21561cb0ef41Sopenharmony_ci return true; 21571cb0ef41Sopenharmony_ci } 21581cb0ef41Sopenharmony_ci uint16_t norm16; 21591cb0ef41Sopenharmony_ci UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16); 21601cb0ef41Sopenharmony_ci return norm16HasCompBoundaryAfter(norm16, onlyContiguous); 21611cb0ef41Sopenharmony_ci} 21621cb0ef41Sopenharmony_ci 21631cb0ef41Sopenharmony_ciconst char16_t *Normalizer2Impl::findPreviousCompBoundary(const char16_t *start, const char16_t *p, 21641cb0ef41Sopenharmony_ci UBool onlyContiguous) const { 21651cb0ef41Sopenharmony_ci while (p != start) { 21661cb0ef41Sopenharmony_ci const char16_t *codePointLimit = p; 21671cb0ef41Sopenharmony_ci UChar32 c; 21681cb0ef41Sopenharmony_ci uint16_t norm16; 21691cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16); 21701cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 21711cb0ef41Sopenharmony_ci return codePointLimit; 21721cb0ef41Sopenharmony_ci } 21731cb0ef41Sopenharmony_ci if (hasCompBoundaryBefore(c, norm16)) { 21741cb0ef41Sopenharmony_ci return p; 21751cb0ef41Sopenharmony_ci } 21761cb0ef41Sopenharmony_ci } 21771cb0ef41Sopenharmony_ci return p; 21781cb0ef41Sopenharmony_ci} 21791cb0ef41Sopenharmony_ci 21801cb0ef41Sopenharmony_ciconst char16_t *Normalizer2Impl::findNextCompBoundary(const char16_t *p, const char16_t *limit, 21811cb0ef41Sopenharmony_ci UBool onlyContiguous) const { 21821cb0ef41Sopenharmony_ci while (p != limit) { 21831cb0ef41Sopenharmony_ci const char16_t *codePointStart = p; 21841cb0ef41Sopenharmony_ci UChar32 c; 21851cb0ef41Sopenharmony_ci uint16_t norm16; 21861cb0ef41Sopenharmony_ci 
UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); 21871cb0ef41Sopenharmony_ci if (hasCompBoundaryBefore(c, norm16)) { 21881cb0ef41Sopenharmony_ci return codePointStart; 21891cb0ef41Sopenharmony_ci } 21901cb0ef41Sopenharmony_ci if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 21911cb0ef41Sopenharmony_ci return p; 21921cb0ef41Sopenharmony_ci } 21931cb0ef41Sopenharmony_ci } 21941cb0ef41Sopenharmony_ci return p; 21951cb0ef41Sopenharmony_ci} 21961cb0ef41Sopenharmony_ci 21971cb0ef41Sopenharmony_ciuint8_t Normalizer2Impl::getPreviousTrailCC(const char16_t *start, const char16_t *p) const { 21981cb0ef41Sopenharmony_ci if (start == p) { 21991cb0ef41Sopenharmony_ci return 0; 22001cb0ef41Sopenharmony_ci } 22011cb0ef41Sopenharmony_ci int32_t i = (int32_t)(p - start); 22021cb0ef41Sopenharmony_ci UChar32 c; 22031cb0ef41Sopenharmony_ci U16_PREV(start, 0, i, c); 22041cb0ef41Sopenharmony_ci return (uint8_t)getFCD16(c); 22051cb0ef41Sopenharmony_ci} 22061cb0ef41Sopenharmony_ci 22071cb0ef41Sopenharmony_ciuint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const { 22081cb0ef41Sopenharmony_ci if (start == p) { 22091cb0ef41Sopenharmony_ci return 0; 22101cb0ef41Sopenharmony_ci } 22111cb0ef41Sopenharmony_ci int32_t i = (int32_t)(p - start); 22121cb0ef41Sopenharmony_ci UChar32 c; 22131cb0ef41Sopenharmony_ci U8_PREV(start, 0, i, c); 22141cb0ef41Sopenharmony_ci return (uint8_t)getFCD16(c); 22151cb0ef41Sopenharmony_ci} 22161cb0ef41Sopenharmony_ci 22171cb0ef41Sopenharmony_ci// Note: normalizer2impl.cpp r30982 (2011-nov-27) 22181cb0ef41Sopenharmony_ci// still had getFCDTrie() which built and cached an FCD trie. 22191cb0ef41Sopenharmony_ci// That provided faster access to FCD data than getFCD16FromNormData() 22201cb0ef41Sopenharmony_ci// but required synchronization and consumed some 10kB of heap memory 22211cb0ef41Sopenharmony_ci// in any process that uses FCD (e.g., via collation). 22221cb0ef41Sopenharmony_ci// minDecompNoCP etc. 
and smallFCD[] are intended to help with any loss of performance, 22231cb0ef41Sopenharmony_ci// at least for ASCII & CJK. 22241cb0ef41Sopenharmony_ci 22251cb0ef41Sopenharmony_ci// Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this 22261cb0ef41Sopenharmony_ci// function on Windows ARM64. As a work-around, we disable optimizations for this function. 22271cb0ef41Sopenharmony_ci// This work-around could/should be removed once the following versions of Visual Studio are no 22281cb0ef41Sopenharmony_ci// longer supported: All versions of VS2017, and versions of VS2019 below 16.4. 22291cb0ef41Sopenharmony_ci#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924)) 22301cb0ef41Sopenharmony_ci#pragma optimize( "", off ) 22311cb0ef41Sopenharmony_ci#endif 22321cb0ef41Sopenharmony_ci// Gets the FCD value from the regular normalization data. 22331cb0ef41Sopenharmony_ciuint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { 22341cb0ef41Sopenharmony_ci uint16_t norm16=getNorm16(c); 22351cb0ef41Sopenharmony_ci if (norm16 >= limitNoNo) { 22361cb0ef41Sopenharmony_ci if(norm16>=MIN_NORMAL_MAYBE_YES) { 22371cb0ef41Sopenharmony_ci // combining mark 22381cb0ef41Sopenharmony_ci norm16=getCCFromNormalYesOrMaybe(norm16); 22391cb0ef41Sopenharmony_ci return norm16|(norm16<<8); 22401cb0ef41Sopenharmony_ci } else if(norm16>=minMaybeYes) { 22411cb0ef41Sopenharmony_ci return 0; 22421cb0ef41Sopenharmony_ci } else { // isDecompNoAlgorithmic(norm16) 22431cb0ef41Sopenharmony_ci uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK; 22441cb0ef41Sopenharmony_ci if (deltaTrailCC <= DELTA_TCCC_1) { 22451cb0ef41Sopenharmony_ci return deltaTrailCC >> OFFSET_SHIFT; 22461cb0ef41Sopenharmony_ci } 22471cb0ef41Sopenharmony_ci // Maps to an isCompYesAndZeroCC. 
22481cb0ef41Sopenharmony_ci c=mapAlgorithmic(c, norm16); 22491cb0ef41Sopenharmony_ci norm16=getRawNorm16(c); 22501cb0ef41Sopenharmony_ci } 22511cb0ef41Sopenharmony_ci } 22521cb0ef41Sopenharmony_ci if(norm16<=minYesNo || isHangulLVT(norm16)) { 22531cb0ef41Sopenharmony_ci // no decomposition or Hangul syllable, all zeros 22541cb0ef41Sopenharmony_ci return 0; 22551cb0ef41Sopenharmony_ci } 22561cb0ef41Sopenharmony_ci // c decomposes, get everything from the variable-length extra data 22571cb0ef41Sopenharmony_ci const uint16_t *mapping=getMapping(norm16); 22581cb0ef41Sopenharmony_ci uint16_t firstUnit=*mapping; 22591cb0ef41Sopenharmony_ci norm16=firstUnit>>8; // tccc 22601cb0ef41Sopenharmony_ci if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 22611cb0ef41Sopenharmony_ci norm16|=*(mapping-1)&0xff00; // lccc 22621cb0ef41Sopenharmony_ci } 22631cb0ef41Sopenharmony_ci return norm16; 22641cb0ef41Sopenharmony_ci} 22651cb0ef41Sopenharmony_ci#if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924)) 22661cb0ef41Sopenharmony_ci#pragma optimize( "", on ) 22671cb0ef41Sopenharmony_ci#endif 22681cb0ef41Sopenharmony_ci 22691cb0ef41Sopenharmony_ci// Dual functionality: 22701cb0ef41Sopenharmony_ci// buffer!=nullptr: normalize 22711cb0ef41Sopenharmony_ci// buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes 22721cb0ef41Sopenharmony_ciconst char16_t * 22731cb0ef41Sopenharmony_ciNormalizer2Impl::makeFCD(const char16_t *src, const char16_t *limit, 22741cb0ef41Sopenharmony_ci ReorderingBuffer *buffer, 22751cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 22761cb0ef41Sopenharmony_ci // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 22771cb0ef41Sopenharmony_ci // Similar to the prevBoundary in the compose() implementation. 
// Dual functionality:
// buffer!=nullptr: normalize
// buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes
/**
 * Processes UTF-16 text so that it satisfies the FCD condition checked below
 * (previous tccc <= current lccc), or only checks that condition.
 *
 * FCD values are 16 bits: the lead combining class (lccc) is in the high
 * byte and the trail combining class (tccc) in the low byte.
 *
 * @param src       Start of the input.
 * @param limit     End of the input, or nullptr if the input is NUL-terminated
 *                  (the real limit is then located with u_strchr).
 * @param buffer    Destination for the processed text, or nullptr for
 *                  check-only mode.
 * @param errorCode In/out ICU error code.
 * @return In check-only mode, the last FCD-safe boundary (prevBoundary) as
 *         soon as an out-of-order character is found. Otherwise the current
 *         src position when the loop ends, which is the limit on success and
 *         may be earlier if an append or decomposition failed.
 */
const char16_t *
Normalizer2Impl::makeFCD(const char16_t *src, const char16_t *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const char16_t *prevBoundary=src;
    // FCD value of the previous character. A negative value means the lookup
    // was deferred and the value is the bitwise complement (~c) of that code point.
    int32_t prevFCD16=0;
    if(limit==nullptr) {
        src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const char16_t *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<minLcccCP) {
                // Defer the FCD lookup: remember the code point itself as ~c (negative).
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_LEAD(c)) {
                    char16_t c2;
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                }
                // fcd16<=0xff means the high byte (lccc) is zero.
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=nullptr && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
                UChar32 prev=~prevFCD16;
                if(prev<minDecompNoCP) {
                    prevFCD16=0;
                } else {
                    prevFCD16=getFCD16FromNormData(prev);
                    if(prevFCD16>1) {
                        --prevBoundary;
                    }
                }
            } else {
                const char16_t *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=nullptr && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==nullptr) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            decomposeShort(prevBoundary, src, false, false, *buffer, errorCode);
            if (U_FAILURE(errorCode)) {
                break;
            }
            // Everything up to src has now been decomposed into the buffer.
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
if(doMakeFCD) { 24371cb0ef41Sopenharmony_ci makeFCD(src, limit, &buffer, errorCode); 24381cb0ef41Sopenharmony_ci } else { 24391cb0ef41Sopenharmony_ci if(limit==nullptr) { // appendZeroCC() needs limit!=nullptr 24401cb0ef41Sopenharmony_ci limit=u_strchr(src, 0); 24411cb0ef41Sopenharmony_ci } 24421cb0ef41Sopenharmony_ci buffer.appendZeroCC(src, limit, errorCode); 24431cb0ef41Sopenharmony_ci } 24441cb0ef41Sopenharmony_ci} 24451cb0ef41Sopenharmony_ci 24461cb0ef41Sopenharmony_ciconst char16_t *Normalizer2Impl::findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const { 24471cb0ef41Sopenharmony_ci while(start<p) { 24481cb0ef41Sopenharmony_ci const char16_t *codePointLimit = p; 24491cb0ef41Sopenharmony_ci UChar32 c; 24501cb0ef41Sopenharmony_ci uint16_t norm16; 24511cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16); 24521cb0ef41Sopenharmony_ci if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) { 24531cb0ef41Sopenharmony_ci return codePointLimit; 24541cb0ef41Sopenharmony_ci } 24551cb0ef41Sopenharmony_ci if (norm16HasDecompBoundaryBefore(norm16)) { 24561cb0ef41Sopenharmony_ci return p; 24571cb0ef41Sopenharmony_ci } 24581cb0ef41Sopenharmony_ci } 24591cb0ef41Sopenharmony_ci return p; 24601cb0ef41Sopenharmony_ci} 24611cb0ef41Sopenharmony_ci 24621cb0ef41Sopenharmony_ciconst char16_t *Normalizer2Impl::findNextFCDBoundary(const char16_t *p, const char16_t *limit) const { 24631cb0ef41Sopenharmony_ci while(p<limit) { 24641cb0ef41Sopenharmony_ci const char16_t *codePointStart=p; 24651cb0ef41Sopenharmony_ci UChar32 c; 24661cb0ef41Sopenharmony_ci uint16_t norm16; 24671cb0ef41Sopenharmony_ci UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); 24681cb0ef41Sopenharmony_ci if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) { 24691cb0ef41Sopenharmony_ci return codePointStart; 24701cb0ef41Sopenharmony_ci } 24711cb0ef41Sopenharmony_ci if (norm16HasDecompBoundaryAfter(norm16)) { 24721cb0ef41Sopenharmony_ci 
return p; 24731cb0ef41Sopenharmony_ci } 24741cb0ef41Sopenharmony_ci } 24751cb0ef41Sopenharmony_ci return p; 24761cb0ef41Sopenharmony_ci} 24771cb0ef41Sopenharmony_ci 24781cb0ef41Sopenharmony_ci// CanonicalIterator data -------------------------------------------------- *** 24791cb0ef41Sopenharmony_ci 24801cb0ef41Sopenharmony_ciCanonIterData::CanonIterData(UErrorCode &errorCode) : 24811cb0ef41Sopenharmony_ci mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr), 24821cb0ef41Sopenharmony_ci canonStartSets(uprv_deleteUObject, nullptr, errorCode) {} 24831cb0ef41Sopenharmony_ci 24841cb0ef41Sopenharmony_ciCanonIterData::~CanonIterData() { 24851cb0ef41Sopenharmony_ci umutablecptrie_close(mutableTrie); 24861cb0ef41Sopenharmony_ci ucptrie_close(trie); 24871cb0ef41Sopenharmony_ci} 24881cb0ef41Sopenharmony_ci 24891cb0ef41Sopenharmony_civoid CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 24901cb0ef41Sopenharmony_ci uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead); 24911cb0ef41Sopenharmony_ci if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 24921cb0ef41Sopenharmony_ci // origin is the first character whose decomposition starts with 24931cb0ef41Sopenharmony_ci // the character for which we are setting the value. 24941cb0ef41Sopenharmony_ci umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode); 24951cb0ef41Sopenharmony_ci } else { 24961cb0ef41Sopenharmony_ci // origin is not the first character, or it is U+0000. 
24971cb0ef41Sopenharmony_ci UnicodeSet *set; 24981cb0ef41Sopenharmony_ci if((canonValue&CANON_HAS_SET)==0) { 24991cb0ef41Sopenharmony_ci LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode); 25001cb0ef41Sopenharmony_ci set=lpSet.getAlias(); 25011cb0ef41Sopenharmony_ci if(U_FAILURE(errorCode)) { 25021cb0ef41Sopenharmony_ci return; 25031cb0ef41Sopenharmony_ci } 25041cb0ef41Sopenharmony_ci UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); 25051cb0ef41Sopenharmony_ci canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); 25061cb0ef41Sopenharmony_ci umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode); 25071cb0ef41Sopenharmony_ci canonStartSets.adoptElement(lpSet.orphan(), errorCode); 25081cb0ef41Sopenharmony_ci if (U_FAILURE(errorCode)) { 25091cb0ef41Sopenharmony_ci return; 25101cb0ef41Sopenharmony_ci } 25111cb0ef41Sopenharmony_ci if(firstOrigin!=0) { 25121cb0ef41Sopenharmony_ci set->add(firstOrigin); 25131cb0ef41Sopenharmony_ci } 25141cb0ef41Sopenharmony_ci } else { 25151cb0ef41Sopenharmony_ci set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; 25161cb0ef41Sopenharmony_ci } 25171cb0ef41Sopenharmony_ci set->add(origin); 25181cb0ef41Sopenharmony_ci } 25191cb0ef41Sopenharmony_ci} 25201cb0ef41Sopenharmony_ci 25211cb0ef41Sopenharmony_ci// C++ class for friend access to private Normalizer2Impl members. 
25221cb0ef41Sopenharmony_ciclass InitCanonIterData { 25231cb0ef41Sopenharmony_cipublic: 25241cb0ef41Sopenharmony_ci static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode); 25251cb0ef41Sopenharmony_ci}; 25261cb0ef41Sopenharmony_ci 25271cb0ef41Sopenharmony_ciU_CDECL_BEGIN 25281cb0ef41Sopenharmony_ci 25291cb0ef41Sopenharmony_ci// UInitOnce instantiation function for CanonIterData 25301cb0ef41Sopenharmony_cistatic void U_CALLCONV 25311cb0ef41Sopenharmony_ciinitCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 25321cb0ef41Sopenharmony_ci InitCanonIterData::doInit(impl, errorCode); 25331cb0ef41Sopenharmony_ci} 25341cb0ef41Sopenharmony_ci 25351cb0ef41Sopenharmony_ciU_CDECL_END 25361cb0ef41Sopenharmony_ci 25371cb0ef41Sopenharmony_civoid InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) { 25381cb0ef41Sopenharmony_ci U_ASSERT(impl->fCanonIterData == nullptr); 25391cb0ef41Sopenharmony_ci impl->fCanonIterData = new CanonIterData(errorCode); 25401cb0ef41Sopenharmony_ci if (impl->fCanonIterData == nullptr) { 25411cb0ef41Sopenharmony_ci errorCode=U_MEMORY_ALLOCATION_ERROR; 25421cb0ef41Sopenharmony_ci } 25431cb0ef41Sopenharmony_ci if (U_SUCCESS(errorCode)) { 25441cb0ef41Sopenharmony_ci UChar32 start = 0, end; 25451cb0ef41Sopenharmony_ci uint32_t value; 25461cb0ef41Sopenharmony_ci while ((end = ucptrie_getRange(impl->normTrie, start, 25471cb0ef41Sopenharmony_ci UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT, 25481cb0ef41Sopenharmony_ci nullptr, nullptr, &value)) >= 0) { 25491cb0ef41Sopenharmony_ci // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 
25501cb0ef41Sopenharmony_ci if (value != Normalizer2Impl::INERT) { 25511cb0ef41Sopenharmony_ci impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode); 25521cb0ef41Sopenharmony_ci } 25531cb0ef41Sopenharmony_ci start = end + 1; 25541cb0ef41Sopenharmony_ci } 25551cb0ef41Sopenharmony_ci#ifdef UCPTRIE_DEBUG 25561cb0ef41Sopenharmony_ci umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData"); 25571cb0ef41Sopenharmony_ci#endif 25581cb0ef41Sopenharmony_ci impl->fCanonIterData->trie = umutablecptrie_buildImmutable( 25591cb0ef41Sopenharmony_ci impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode); 25601cb0ef41Sopenharmony_ci umutablecptrie_close(impl->fCanonIterData->mutableTrie); 25611cb0ef41Sopenharmony_ci impl->fCanonIterData->mutableTrie = nullptr; 25621cb0ef41Sopenharmony_ci } 25631cb0ef41Sopenharmony_ci if (U_FAILURE(errorCode)) { 25641cb0ef41Sopenharmony_ci delete impl->fCanonIterData; 25651cb0ef41Sopenharmony_ci impl->fCanonIterData = nullptr; 25661cb0ef41Sopenharmony_ci } 25671cb0ef41Sopenharmony_ci} 25681cb0ef41Sopenharmony_ci 25691cb0ef41Sopenharmony_civoid Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16, 25701cb0ef41Sopenharmony_ci CanonIterData &newData, 25711cb0ef41Sopenharmony_ci UErrorCode &errorCode) const { 25721cb0ef41Sopenharmony_ci if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) { 25731cb0ef41Sopenharmony_ci // Inert, or 2-way mapping (including Hangul syllable). 25741cb0ef41Sopenharmony_ci // We do not write a canonStartSet for any yesNo character. 25751cb0ef41Sopenharmony_ci // Composites from 2-way mappings are added at runtime from the 25761cb0ef41Sopenharmony_ci // starter's compositions list, and the other characters in 25771cb0ef41Sopenharmony_ci // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 25781cb0ef41Sopenharmony_ci // "maybe" characters. 
25791cb0ef41Sopenharmony_ci return; 25801cb0ef41Sopenharmony_ci } 25811cb0ef41Sopenharmony_ci for(UChar32 c=start; c<=end; ++c) { 25821cb0ef41Sopenharmony_ci uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c); 25831cb0ef41Sopenharmony_ci uint32_t newValue=oldValue; 25841cb0ef41Sopenharmony_ci if(isMaybeOrNonZeroCC(norm16)) { 25851cb0ef41Sopenharmony_ci // not a segment starter if it occurs in a decomposition or has cc!=0 25861cb0ef41Sopenharmony_ci newValue|=CANON_NOT_SEGMENT_STARTER; 25871cb0ef41Sopenharmony_ci if(norm16<MIN_NORMAL_MAYBE_YES) { 25881cb0ef41Sopenharmony_ci newValue|=CANON_HAS_COMPOSITIONS; 25891cb0ef41Sopenharmony_ci } 25901cb0ef41Sopenharmony_ci } else if(norm16<minYesNo) { 25911cb0ef41Sopenharmony_ci newValue|=CANON_HAS_COMPOSITIONS; 25921cb0ef41Sopenharmony_ci } else { 25931cb0ef41Sopenharmony_ci // c has a one-way decomposition 25941cb0ef41Sopenharmony_ci UChar32 c2=c; 25951cb0ef41Sopenharmony_ci // Do not modify the whole-range norm16 value. 25961cb0ef41Sopenharmony_ci uint16_t norm16_2=norm16; 25971cb0ef41Sopenharmony_ci if (isDecompNoAlgorithmic(norm16_2)) { 25981cb0ef41Sopenharmony_ci // Maps to an isCompYesAndZeroCC. 25991cb0ef41Sopenharmony_ci c2 = mapAlgorithmic(c2, norm16_2); 26001cb0ef41Sopenharmony_ci norm16_2 = getRawNorm16(c2); 26011cb0ef41Sopenharmony_ci // No compatibility mappings for the CanonicalIterator. 
26021cb0ef41Sopenharmony_ci U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))); 26031cb0ef41Sopenharmony_ci } 26041cb0ef41Sopenharmony_ci if (norm16_2 > minYesNo) { 26051cb0ef41Sopenharmony_ci // c decomposes, get everything from the variable-length extra data 26061cb0ef41Sopenharmony_ci const uint16_t *mapping=getMapping(norm16_2); 26071cb0ef41Sopenharmony_ci uint16_t firstUnit=*mapping; 26081cb0ef41Sopenharmony_ci int32_t length=firstUnit&MAPPING_LENGTH_MASK; 26091cb0ef41Sopenharmony_ci if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 26101cb0ef41Sopenharmony_ci if(c==c2 && (*(mapping-1)&0xff)!=0) { 26111cb0ef41Sopenharmony_ci newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 26121cb0ef41Sopenharmony_ci } 26131cb0ef41Sopenharmony_ci } 26141cb0ef41Sopenharmony_ci // Skip empty mappings (no characters in the decomposition). 26151cb0ef41Sopenharmony_ci if(length!=0) { 26161cb0ef41Sopenharmony_ci ++mapping; // skip over the firstUnit 26171cb0ef41Sopenharmony_ci // add c to first code point's start set 26181cb0ef41Sopenharmony_ci int32_t i=0; 26191cb0ef41Sopenharmony_ci U16_NEXT_UNSAFE(mapping, i, c2); 26201cb0ef41Sopenharmony_ci newData.addToStartSet(c, c2, errorCode); 26211cb0ef41Sopenharmony_ci // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 26221cb0ef41Sopenharmony_ci // one-way mapping. A 2-way mapping is possible here after 26231cb0ef41Sopenharmony_ci // intermediate algorithmic mapping. 
26241cb0ef41Sopenharmony_ci if(norm16_2>=minNoNo) { 26251cb0ef41Sopenharmony_ci while(i<length) { 26261cb0ef41Sopenharmony_ci U16_NEXT_UNSAFE(mapping, i, c2); 26271cb0ef41Sopenharmony_ci uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2); 26281cb0ef41Sopenharmony_ci if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 26291cb0ef41Sopenharmony_ci umutablecptrie_set(newData.mutableTrie, c2, 26301cb0ef41Sopenharmony_ci c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode); 26311cb0ef41Sopenharmony_ci } 26321cb0ef41Sopenharmony_ci } 26331cb0ef41Sopenharmony_ci } 26341cb0ef41Sopenharmony_ci } 26351cb0ef41Sopenharmony_ci } else { 26361cb0ef41Sopenharmony_ci // c decomposed to c2 algorithmically; c has cc==0 26371cb0ef41Sopenharmony_ci newData.addToStartSet(c, c2, errorCode); 26381cb0ef41Sopenharmony_ci } 26391cb0ef41Sopenharmony_ci } 26401cb0ef41Sopenharmony_ci if(newValue!=oldValue) { 26411cb0ef41Sopenharmony_ci umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode); 26421cb0ef41Sopenharmony_ci } 26431cb0ef41Sopenharmony_ci } 26441cb0ef41Sopenharmony_ci} 26451cb0ef41Sopenharmony_ci 26461cb0ef41Sopenharmony_ciUBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 26471cb0ef41Sopenharmony_ci // Logically const: Synchronized instantiation. 
26481cb0ef41Sopenharmony_ci Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 26491cb0ef41Sopenharmony_ci umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 26501cb0ef41Sopenharmony_ci return U_SUCCESS(errorCode); 26511cb0ef41Sopenharmony_ci} 26521cb0ef41Sopenharmony_ci 26531cb0ef41Sopenharmony_ciint32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 26541cb0ef41Sopenharmony_ci return (int32_t)ucptrie_get(fCanonIterData->trie, c); 26551cb0ef41Sopenharmony_ci} 26561cb0ef41Sopenharmony_ci 26571cb0ef41Sopenharmony_ciconst UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 26581cb0ef41Sopenharmony_ci return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; 26591cb0ef41Sopenharmony_ci} 26601cb0ef41Sopenharmony_ci 26611cb0ef41Sopenharmony_ciUBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 26621cb0ef41Sopenharmony_ci return getCanonValue(c)>=0; 26631cb0ef41Sopenharmony_ci} 26641cb0ef41Sopenharmony_ci 26651cb0ef41Sopenharmony_ciUBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 26661cb0ef41Sopenharmony_ci int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 26671cb0ef41Sopenharmony_ci if(canonValue==0) { 26681cb0ef41Sopenharmony_ci return false; 26691cb0ef41Sopenharmony_ci } 26701cb0ef41Sopenharmony_ci set.clear(); 26711cb0ef41Sopenharmony_ci int32_t value=canonValue&CANON_VALUE_MASK; 26721cb0ef41Sopenharmony_ci if((canonValue&CANON_HAS_SET)!=0) { 26731cb0ef41Sopenharmony_ci set.addAll(getCanonStartSet(value)); 26741cb0ef41Sopenharmony_ci } else if(value!=0) { 26751cb0ef41Sopenharmony_ci set.add(value); 26761cb0ef41Sopenharmony_ci } 26771cb0ef41Sopenharmony_ci if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 26781cb0ef41Sopenharmony_ci uint16_t norm16=getRawNorm16(c); 26791cb0ef41Sopenharmony_ci if(norm16==JAMO_L) { 26801cb0ef41Sopenharmony_ci UChar32 syllable= 26811cb0ef41Sopenharmony_ci (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 
26821cb0ef41Sopenharmony_ci set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 26831cb0ef41Sopenharmony_ci } else { 26841cb0ef41Sopenharmony_ci addComposites(getCompositionsList(norm16), set); 26851cb0ef41Sopenharmony_ci } 26861cb0ef41Sopenharmony_ci } 26871cb0ef41Sopenharmony_ci return true; 26881cb0ef41Sopenharmony_ci} 26891cb0ef41Sopenharmony_ci 26901cb0ef41Sopenharmony_ciU_NAMESPACE_END 26911cb0ef41Sopenharmony_ci 26921cb0ef41Sopenharmony_ci// Normalizer2 data swapping ----------------------------------------------- *** 26931cb0ef41Sopenharmony_ci 26941cb0ef41Sopenharmony_ciU_NAMESPACE_USE 26951cb0ef41Sopenharmony_ci 26961cb0ef41Sopenharmony_ciU_CAPI int32_t U_EXPORT2 26971cb0ef41Sopenharmony_ciunorm2_swap(const UDataSwapper *ds, 26981cb0ef41Sopenharmony_ci const void *inData, int32_t length, void *outData, 26991cb0ef41Sopenharmony_ci UErrorCode *pErrorCode) { 27001cb0ef41Sopenharmony_ci const UDataInfo *pInfo; 27011cb0ef41Sopenharmony_ci int32_t headerSize; 27021cb0ef41Sopenharmony_ci 27031cb0ef41Sopenharmony_ci const uint8_t *inBytes; 27041cb0ef41Sopenharmony_ci uint8_t *outBytes; 27051cb0ef41Sopenharmony_ci 27061cb0ef41Sopenharmony_ci const int32_t *inIndexes; 27071cb0ef41Sopenharmony_ci int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1]; 27081cb0ef41Sopenharmony_ci 27091cb0ef41Sopenharmony_ci int32_t i, offset, nextOffset, size; 27101cb0ef41Sopenharmony_ci 27111cb0ef41Sopenharmony_ci /* udata_swapDataHeader checks the arguments */ 27121cb0ef41Sopenharmony_ci headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 27131cb0ef41Sopenharmony_ci if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 27141cb0ef41Sopenharmony_ci return 0; 27151cb0ef41Sopenharmony_ci } 27161cb0ef41Sopenharmony_ci 27171cb0ef41Sopenharmony_ci /* check data format and format version */ 27181cb0ef41Sopenharmony_ci pInfo=(const UDataInfo *)((const char *)inData+4); 27191cb0ef41Sopenharmony_ci uint8_t formatVersion0=pInfo->formatVersion[0]; 27201cb0ef41Sopenharmony_ci 
if(!( 27211cb0ef41Sopenharmony_ci pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 27221cb0ef41Sopenharmony_ci pInfo->dataFormat[1]==0x72 && 27231cb0ef41Sopenharmony_ci pInfo->dataFormat[2]==0x6d && 27241cb0ef41Sopenharmony_ci pInfo->dataFormat[3]==0x32 && 27251cb0ef41Sopenharmony_ci (1<=formatVersion0 && formatVersion0<=4) 27261cb0ef41Sopenharmony_ci )) { 27271cb0ef41Sopenharmony_ci udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 27281cb0ef41Sopenharmony_ci pInfo->dataFormat[0], pInfo->dataFormat[1], 27291cb0ef41Sopenharmony_ci pInfo->dataFormat[2], pInfo->dataFormat[3], 27301cb0ef41Sopenharmony_ci pInfo->formatVersion[0]); 27311cb0ef41Sopenharmony_ci *pErrorCode=U_UNSUPPORTED_ERROR; 27321cb0ef41Sopenharmony_ci return 0; 27331cb0ef41Sopenharmony_ci } 27341cb0ef41Sopenharmony_ci 27351cb0ef41Sopenharmony_ci inBytes=(const uint8_t *)inData+headerSize; 27361cb0ef41Sopenharmony_ci outBytes=(outData == nullptr) ? 
nullptr : (uint8_t *)outData+headerSize; 27371cb0ef41Sopenharmony_ci 27381cb0ef41Sopenharmony_ci inIndexes=(const int32_t *)inBytes; 27391cb0ef41Sopenharmony_ci int32_t minIndexesLength; 27401cb0ef41Sopenharmony_ci if(formatVersion0==1) { 27411cb0ef41Sopenharmony_ci minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1; 27421cb0ef41Sopenharmony_ci } else if(formatVersion0==2) { 27431cb0ef41Sopenharmony_ci minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1; 27441cb0ef41Sopenharmony_ci } else { 27451cb0ef41Sopenharmony_ci minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1; 27461cb0ef41Sopenharmony_ci } 27471cb0ef41Sopenharmony_ci 27481cb0ef41Sopenharmony_ci if(length>=0) { 27491cb0ef41Sopenharmony_ci length-=headerSize; 27501cb0ef41Sopenharmony_ci if(length<minIndexesLength*4) { 27511cb0ef41Sopenharmony_ci udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 27521cb0ef41Sopenharmony_ci length); 27531cb0ef41Sopenharmony_ci *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 27541cb0ef41Sopenharmony_ci return 0; 27551cb0ef41Sopenharmony_ci } 27561cb0ef41Sopenharmony_ci } 27571cb0ef41Sopenharmony_ci 27581cb0ef41Sopenharmony_ci /* read the first few indexes */ 27591cb0ef41Sopenharmony_ci for(i=0; i<UPRV_LENGTHOF(indexes); ++i) { 27601cb0ef41Sopenharmony_ci indexes[i]=udata_readInt32(ds, inIndexes[i]); 27611cb0ef41Sopenharmony_ci } 27621cb0ef41Sopenharmony_ci 27631cb0ef41Sopenharmony_ci /* get the total length of the data */ 27641cb0ef41Sopenharmony_ci size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 27651cb0ef41Sopenharmony_ci 27661cb0ef41Sopenharmony_ci if(length>=0) { 27671cb0ef41Sopenharmony_ci if(length<size) { 27681cb0ef41Sopenharmony_ci udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 27691cb0ef41Sopenharmony_ci length); 27701cb0ef41Sopenharmony_ci *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 27711cb0ef41Sopenharmony_ci return 0; 27721cb0ef41Sopenharmony_ci } 
27731cb0ef41Sopenharmony_ci 27741cb0ef41Sopenharmony_ci /* copy the data for inaccessible bytes */ 27751cb0ef41Sopenharmony_ci if(inBytes!=outBytes) { 27761cb0ef41Sopenharmony_ci uprv_memcpy(outBytes, inBytes, size); 27771cb0ef41Sopenharmony_ci } 27781cb0ef41Sopenharmony_ci 27791cb0ef41Sopenharmony_ci offset=0; 27801cb0ef41Sopenharmony_ci 27811cb0ef41Sopenharmony_ci /* swap the int32_t indexes[] */ 27821cb0ef41Sopenharmony_ci nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 27831cb0ef41Sopenharmony_ci ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 27841cb0ef41Sopenharmony_ci offset=nextOffset; 27851cb0ef41Sopenharmony_ci 27861cb0ef41Sopenharmony_ci /* swap the trie */ 27871cb0ef41Sopenharmony_ci nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 27881cb0ef41Sopenharmony_ci utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 27891cb0ef41Sopenharmony_ci offset=nextOffset; 27901cb0ef41Sopenharmony_ci 27911cb0ef41Sopenharmony_ci /* swap the uint16_t extraData[] */ 27921cb0ef41Sopenharmony_ci nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 27931cb0ef41Sopenharmony_ci ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 27941cb0ef41Sopenharmony_ci offset=nextOffset; 27951cb0ef41Sopenharmony_ci 27961cb0ef41Sopenharmony_ci /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 27971cb0ef41Sopenharmony_ci nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 27981cb0ef41Sopenharmony_ci offset=nextOffset; 27991cb0ef41Sopenharmony_ci 28001cb0ef41Sopenharmony_ci U_ASSERT(offset==size); 28011cb0ef41Sopenharmony_ci } 28021cb0ef41Sopenharmony_ci 28031cb0ef41Sopenharmony_ci return headerSize+size; 28041cb0ef41Sopenharmony_ci} 28051cb0ef41Sopenharmony_ci 28061cb0ef41Sopenharmony_ci#endif // !UCONFIG_NO_NORMALIZATION 2807