11cb0ef41Sopenharmony_ci// © 2018 and later: Unicode, Inc. and others. 21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 31cb0ef41Sopenharmony_ci 41cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 51cb0ef41Sopenharmony_ci 61cb0ef41Sopenharmony_ci#if !UCONFIG_NO_FORMATTING 71cb0ef41Sopenharmony_ci 81cb0ef41Sopenharmony_ci// Allow implicit conversion from char16_t* to UnicodeString for this file: 91cb0ef41Sopenharmony_ci// Helpful in toString methods and elsewhere. 101cb0ef41Sopenharmony_ci#define UNISTR_FROM_STRING_EXPLICIT 111cb0ef41Sopenharmony_ci 121cb0ef41Sopenharmony_ci#include "static_unicode_sets.h" 131cb0ef41Sopenharmony_ci#include "umutex.h" 141cb0ef41Sopenharmony_ci#include "ucln_cmn.h" 151cb0ef41Sopenharmony_ci#include "unicode/uniset.h" 161cb0ef41Sopenharmony_ci#include "uresimp.h" 171cb0ef41Sopenharmony_ci#include "cstring.h" 181cb0ef41Sopenharmony_ci#include "uassert.h" 191cb0ef41Sopenharmony_ci 201cb0ef41Sopenharmony_ciusing namespace icu; 211cb0ef41Sopenharmony_ciusing namespace icu::unisets; 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ci 241cb0ef41Sopenharmony_cinamespace { 251cb0ef41Sopenharmony_ci 261cb0ef41Sopenharmony_ciUnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {}; 271cb0ef41Sopenharmony_ci 281cb0ef41Sopenharmony_ci// Save the empty instance in static memory to have well-defined behavior if a 291cb0ef41Sopenharmony_ci// regular UnicodeSet cannot be allocated. 301cb0ef41Sopenharmony_cialignas(UnicodeSet) 311cb0ef41Sopenharmony_cichar gEmptyUnicodeSet[sizeof(UnicodeSet)]; 321cb0ef41Sopenharmony_ci 331cb0ef41Sopenharmony_ci// Whether the gEmptyUnicodeSet is initialized and ready to use. 341cb0ef41Sopenharmony_ciUBool gEmptyUnicodeSetInitialized = false; 351cb0ef41Sopenharmony_ci 361cb0ef41Sopenharmony_ciinline UnicodeSet* getImpl(Key key) { 371cb0ef41Sopenharmony_ci UnicodeSet* candidate = gUnicodeSets[key]; 381cb0ef41Sopenharmony_ci if (candidate == nullptr) { 391cb0ef41Sopenharmony_ci return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); 401cb0ef41Sopenharmony_ci } 411cb0ef41Sopenharmony_ci return candidate; 421cb0ef41Sopenharmony_ci} 431cb0ef41Sopenharmony_ci 441cb0ef41Sopenharmony_ciUnicodeSet* computeUnion(Key k1, Key k2) { 451cb0ef41Sopenharmony_ci UnicodeSet* result = new UnicodeSet(); 461cb0ef41Sopenharmony_ci if (result == nullptr) { 471cb0ef41Sopenharmony_ci return nullptr; 481cb0ef41Sopenharmony_ci } 491cb0ef41Sopenharmony_ci result->addAll(*getImpl(k1)); 501cb0ef41Sopenharmony_ci result->addAll(*getImpl(k2)); 511cb0ef41Sopenharmony_ci result->freeze(); 521cb0ef41Sopenharmony_ci return result; 531cb0ef41Sopenharmony_ci} 541cb0ef41Sopenharmony_ci 551cb0ef41Sopenharmony_ciUnicodeSet* computeUnion(Key k1, Key k2, Key k3) { 561cb0ef41Sopenharmony_ci UnicodeSet* result = new UnicodeSet(); 571cb0ef41Sopenharmony_ci if (result == nullptr) { 581cb0ef41Sopenharmony_ci return nullptr; 591cb0ef41Sopenharmony_ci } 601cb0ef41Sopenharmony_ci result->addAll(*getImpl(k1)); 611cb0ef41Sopenharmony_ci result->addAll(*getImpl(k2)); 621cb0ef41Sopenharmony_ci result->addAll(*getImpl(k3)); 631cb0ef41Sopenharmony_ci result->freeze(); 641cb0ef41Sopenharmony_ci return result; 651cb0ef41Sopenharmony_ci} 661cb0ef41Sopenharmony_ci 671cb0ef41Sopenharmony_ci 681cb0ef41Sopenharmony_civoid saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) { 691cb0ef41Sopenharmony_ci // assert unicodeSets.get(key) == null; 701cb0ef41Sopenharmony_ci gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status); 711cb0ef41Sopenharmony_ci} 721cb0ef41Sopenharmony_ci 731cb0ef41Sopenharmony_ciclass ParseDataSink : public ResourceSink { 741cb0ef41Sopenharmony_ci public: 751cb0ef41Sopenharmony_ci void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override { 761cb0ef41Sopenharmony_ci ResourceTable contextsTable = value.getTable(status); 771cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 781cb0ef41Sopenharmony_ci for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { 791cb0ef41Sopenharmony_ci if (uprv_strcmp(key, "date") == 0) { 801cb0ef41Sopenharmony_ci // ignore 811cb0ef41Sopenharmony_ci } else { 821cb0ef41Sopenharmony_ci ResourceTable strictnessTable = value.getTable(status); 831cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 841cb0ef41Sopenharmony_ci for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { 851cb0ef41Sopenharmony_ci bool isLenient = (uprv_strcmp(key, "lenient") == 0); 861cb0ef41Sopenharmony_ci ResourceArray array = value.getArray(status); 871cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 881cb0ef41Sopenharmony_ci for (int k = 0; k < array.getSize(); k++) { 891cb0ef41Sopenharmony_ci array.getValue(k, value); 901cb0ef41Sopenharmony_ci UnicodeString str = value.getUnicodeString(status); 911cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 921cb0ef41Sopenharmony_ci // There is both lenient and strict data for comma/period, 931cb0ef41Sopenharmony_ci // but not for any of the other symbols. 941cb0ef41Sopenharmony_ci if (str.indexOf(u'.') != -1) { 951cb0ef41Sopenharmony_ci saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status); 961cb0ef41Sopenharmony_ci } else if (str.indexOf(u',') != -1) { 971cb0ef41Sopenharmony_ci saveSet(isLenient ? COMMA : STRICT_COMMA, str, status); 981cb0ef41Sopenharmony_ci } else if (str.indexOf(u'+') != -1) { 991cb0ef41Sopenharmony_ci saveSet(PLUS_SIGN, str, status); 1001cb0ef41Sopenharmony_ci } else if (str.indexOf(u'-') != -1) { 1011cb0ef41Sopenharmony_ci saveSet(MINUS_SIGN, str, status); 1021cb0ef41Sopenharmony_ci } else if (str.indexOf(u'$') != -1) { 1031cb0ef41Sopenharmony_ci saveSet(DOLLAR_SIGN, str, status); 1041cb0ef41Sopenharmony_ci } else if (str.indexOf(u'£') != -1) { 1051cb0ef41Sopenharmony_ci saveSet(POUND_SIGN, str, status); 1061cb0ef41Sopenharmony_ci } else if (str.indexOf(u'₹') != -1) { 1071cb0ef41Sopenharmony_ci saveSet(RUPEE_SIGN, str, status); 1081cb0ef41Sopenharmony_ci } else if (str.indexOf(u'¥') != -1) { 1091cb0ef41Sopenharmony_ci saveSet(YEN_SIGN, str, status); 1101cb0ef41Sopenharmony_ci } else if (str.indexOf(u'₩') != -1) { 1111cb0ef41Sopenharmony_ci saveSet(WON_SIGN, str, status); 1121cb0ef41Sopenharmony_ci } else if (str.indexOf(u'%') != -1) { 1131cb0ef41Sopenharmony_ci saveSet(PERCENT_SIGN, str, status); 1141cb0ef41Sopenharmony_ci } else if (str.indexOf(u'‰') != -1) { 1151cb0ef41Sopenharmony_ci saveSet(PERMILLE_SIGN, str, status); 1161cb0ef41Sopenharmony_ci } else if (str.indexOf(u'’') != -1) { 1171cb0ef41Sopenharmony_ci saveSet(APOSTROPHE_SIGN, str, status); 1181cb0ef41Sopenharmony_ci } else { 1191cb0ef41Sopenharmony_ci // Unknown class of parse lenients 1201cb0ef41Sopenharmony_ci // TODO(ICU-20428): Make ICU automatically accept new classes? 1211cb0ef41Sopenharmony_ci U_ASSERT(false); 1221cb0ef41Sopenharmony_ci } 1231cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 1241cb0ef41Sopenharmony_ci } 1251cb0ef41Sopenharmony_ci } 1261cb0ef41Sopenharmony_ci } 1271cb0ef41Sopenharmony_ci } 1281cb0ef41Sopenharmony_ci } 1291cb0ef41Sopenharmony_ci}; 1301cb0ef41Sopenharmony_ci 1311cb0ef41Sopenharmony_ci 1321cb0ef41Sopenharmony_ciicu::UInitOnce gNumberParseUniSetsInitOnce {}; 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_ciUBool U_CALLCONV cleanupNumberParseUniSets() { 1351cb0ef41Sopenharmony_ci if (gEmptyUnicodeSetInitialized) { 1361cb0ef41Sopenharmony_ci reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet(); 1371cb0ef41Sopenharmony_ci gEmptyUnicodeSetInitialized = false; 1381cb0ef41Sopenharmony_ci } 1391cb0ef41Sopenharmony_ci for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { 1401cb0ef41Sopenharmony_ci delete gUnicodeSets[i]; 1411cb0ef41Sopenharmony_ci gUnicodeSets[i] = nullptr; 1421cb0ef41Sopenharmony_ci } 1431cb0ef41Sopenharmony_ci gNumberParseUniSetsInitOnce.reset(); 1441cb0ef41Sopenharmony_ci return true; 1451cb0ef41Sopenharmony_ci} 1461cb0ef41Sopenharmony_ci 1471cb0ef41Sopenharmony_civoid U_CALLCONV initNumberParseUniSets(UErrorCode& status) { 1481cb0ef41Sopenharmony_ci ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets); 1491cb0ef41Sopenharmony_ci 1501cb0ef41Sopenharmony_ci // Initialize the empty instance for well-defined fallback behavior 1511cb0ef41Sopenharmony_ci new(gEmptyUnicodeSet) UnicodeSet(); 1521cb0ef41Sopenharmony_ci reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze(); 1531cb0ef41Sopenharmony_ci gEmptyUnicodeSetInitialized = true; 1541cb0ef41Sopenharmony_ci 1551cb0ef41Sopenharmony_ci // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309. 1561cb0ef41Sopenharmony_ci // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). 1571cb0ef41Sopenharmony_ci gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet( 1581cb0ef41Sopenharmony_ci u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status); 1591cb0ef41Sopenharmony_ci gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status); 1601cb0ef41Sopenharmony_ci 1611cb0ef41Sopenharmony_ci LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status)); 1621cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 1631cb0ef41Sopenharmony_ci ParseDataSink sink; 1641cb0ef41Sopenharmony_ci ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status); 1651cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 1661cb0ef41Sopenharmony_ci 1671cb0ef41Sopenharmony_ci // NOTE: It is OK for these assertions to fail if there was a no-data build. 1681cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[COMMA] != nullptr); 1691cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr); 1701cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[PERIOD] != nullptr); 1711cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr); 1721cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr); 1731cb0ef41Sopenharmony_ci 1741cb0ef41Sopenharmony_ci LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet( 1751cb0ef41Sopenharmony_ci u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", 1761cb0ef41Sopenharmony_ci status 1771cb0ef41Sopenharmony_ci ), status); 1781cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 1791cb0ef41Sopenharmony_ci otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]); 1801cb0ef41Sopenharmony_ci gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan(); 1811cb0ef41Sopenharmony_ci gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); 1821cb0ef41Sopenharmony_ci gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( 1831cb0ef41Sopenharmony_ci STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); 1841cb0ef41Sopenharmony_ci 1851cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr); 1861cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr); 1871cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr); 1881cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr); 1891cb0ef41Sopenharmony_ci 1901cb0ef41Sopenharmony_ci gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status); 1911cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 1921cb0ef41Sopenharmony_ci 1931cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr); 1941cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr); 1951cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr); 1961cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr); 1971cb0ef41Sopenharmony_ci U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr); 1981cb0ef41Sopenharmony_ci 1991cb0ef41Sopenharmony_ci gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); 2001cb0ef41Sopenharmony_ci if (U_FAILURE(status)) { return; } 2011cb0ef41Sopenharmony_ci gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); 2021cb0ef41Sopenharmony_ci gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); 2031cb0ef41Sopenharmony_ci 2041cb0ef41Sopenharmony_ci for (auto* uniset : gUnicodeSets) { 2051cb0ef41Sopenharmony_ci if (uniset != nullptr) { 2061cb0ef41Sopenharmony_ci uniset->freeze(); 2071cb0ef41Sopenharmony_ci } 2081cb0ef41Sopenharmony_ci } 2091cb0ef41Sopenharmony_ci} 2101cb0ef41Sopenharmony_ci 2111cb0ef41Sopenharmony_ci} 2121cb0ef41Sopenharmony_ci 2131cb0ef41Sopenharmony_ciconst UnicodeSet* unisets::get(Key key) { 2141cb0ef41Sopenharmony_ci UErrorCode localStatus = U_ZERO_ERROR; 2151cb0ef41Sopenharmony_ci umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus); 2161cb0ef41Sopenharmony_ci if (U_FAILURE(localStatus)) { 2171cb0ef41Sopenharmony_ci return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); 2181cb0ef41Sopenharmony_ci } 2191cb0ef41Sopenharmony_ci return getImpl(key); 2201cb0ef41Sopenharmony_ci} 2211cb0ef41Sopenharmony_ci 2221cb0ef41Sopenharmony_ciKey unisets::chooseFrom(UnicodeString str, Key key1) { 2231cb0ef41Sopenharmony_ci return get(key1)->contains(str) ? key1 : NONE; 2241cb0ef41Sopenharmony_ci} 2251cb0ef41Sopenharmony_ci 2261cb0ef41Sopenharmony_ciKey unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { 2271cb0ef41Sopenharmony_ci return get(key1)->contains(str) ? key1 : chooseFrom(str, key2); 2281cb0ef41Sopenharmony_ci} 2291cb0ef41Sopenharmony_ci 2301cb0ef41Sopenharmony_ci//Key unisets::chooseCurrency(UnicodeString str) { 2311cb0ef41Sopenharmony_ci// if (get(DOLLAR_SIGN)->contains(str)) { 2321cb0ef41Sopenharmony_ci// return DOLLAR_SIGN; 2331cb0ef41Sopenharmony_ci// } else if (get(POUND_SIGN)->contains(str)) { 2341cb0ef41Sopenharmony_ci// return POUND_SIGN; 2351cb0ef41Sopenharmony_ci// } else if (get(RUPEE_SIGN)->contains(str)) { 2361cb0ef41Sopenharmony_ci// return RUPEE_SIGN; 2371cb0ef41Sopenharmony_ci// } else if (get(YEN_SIGN)->contains(str)) { 2381cb0ef41Sopenharmony_ci// return YEN_SIGN; 2391cb0ef41Sopenharmony_ci// } else { 2401cb0ef41Sopenharmony_ci// return NONE; 2411cb0ef41Sopenharmony_ci// } 2421cb0ef41Sopenharmony_ci//} 2431cb0ef41Sopenharmony_ci 2441cb0ef41Sopenharmony_ci 2451cb0ef41Sopenharmony_ci#endif /* #if !UCONFIG_NO_FORMATTING */ 246