12e5b6d6dSopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 32e5b6d6dSopenharmony_ci/* 42e5b6d6dSopenharmony_ci******************************************************************************* 52e5b6d6dSopenharmony_ci* 62e5b6d6dSopenharmony_ci* Copyright (C) 1999-2014, International Business Machines 72e5b6d6dSopenharmony_ci* Corporation and others. All Rights Reserved. 82e5b6d6dSopenharmony_ci* 92e5b6d6dSopenharmony_ci******************************************************************************* 102e5b6d6dSopenharmony_ci* file name: uniset_props.cpp 112e5b6d6dSopenharmony_ci* encoding: UTF-8 122e5b6d6dSopenharmony_ci* tab size: 8 (not used) 132e5b6d6dSopenharmony_ci* indentation:4 142e5b6d6dSopenharmony_ci* 152e5b6d6dSopenharmony_ci* created on: 2004aug25 162e5b6d6dSopenharmony_ci* created by: Markus W. Scherer 172e5b6d6dSopenharmony_ci* 182e5b6d6dSopenharmony_ci* Character property dependent functions moved here from uniset.cpp 192e5b6d6dSopenharmony_ci*/ 202e5b6d6dSopenharmony_ci 212e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 222e5b6d6dSopenharmony_ci#include "unicode/uniset.h" 232e5b6d6dSopenharmony_ci#include "unicode/parsepos.h" 242e5b6d6dSopenharmony_ci#include "unicode/uchar.h" 252e5b6d6dSopenharmony_ci#include "unicode/uscript.h" 262e5b6d6dSopenharmony_ci#include "unicode/symtable.h" 272e5b6d6dSopenharmony_ci#include "unicode/uset.h" 282e5b6d6dSopenharmony_ci#include "unicode/locid.h" 292e5b6d6dSopenharmony_ci#include "unicode/brkiter.h" 302e5b6d6dSopenharmony_ci#include "uset_imp.h" 312e5b6d6dSopenharmony_ci#include "ruleiter.h" 322e5b6d6dSopenharmony_ci#include "cmemory.h" 332e5b6d6dSopenharmony_ci#include "ucln_cmn.h" 342e5b6d6dSopenharmony_ci#include "util.h" 352e5b6d6dSopenharmony_ci#include "uvector.h" 362e5b6d6dSopenharmony_ci#include "uprops.h" 372e5b6d6dSopenharmony_ci#include "propname.h" 382e5b6d6dSopenharmony_ci#include "normalizer2impl.h" 392e5b6d6dSopenharmony_ci#include "uinvchar.h" 402e5b6d6dSopenharmony_ci#include "uprops.h" 412e5b6d6dSopenharmony_ci#include "charstr.h" 422e5b6d6dSopenharmony_ci#include "cstring.h" 432e5b6d6dSopenharmony_ci#include "mutex.h" 442e5b6d6dSopenharmony_ci#include "umutex.h" 452e5b6d6dSopenharmony_ci#include "uassert.h" 462e5b6d6dSopenharmony_ci#include "hash.h" 472e5b6d6dSopenharmony_ci 482e5b6d6dSopenharmony_ciU_NAMESPACE_USE 492e5b6d6dSopenharmony_ci 502e5b6d6dSopenharmony_ci// Special property set IDs 512e5b6d6dSopenharmony_cistatic const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 522e5b6d6dSopenharmony_cistatic const char ASCII[] = "ASCII"; // [\u0000-\u007F] 532e5b6d6dSopenharmony_cistatic const char ASSIGNED[] = "Assigned"; // [:^Cn:] 542e5b6d6dSopenharmony_ci 552e5b6d6dSopenharmony_ci// Unicode name property alias 562e5b6d6dSopenharmony_ci#define NAME_PROP "na" 572e5b6d6dSopenharmony_ci#define NAME_PROP_LENGTH 2 582e5b6d6dSopenharmony_ci 592e5b6d6dSopenharmony_ci// Cached sets ------------------------------------------------------------- *** 602e5b6d6dSopenharmony_ci 612e5b6d6dSopenharmony_ciU_CDECL_BEGIN 622e5b6d6dSopenharmony_cistatic UBool U_CALLCONV uset_cleanup(); 632e5b6d6dSopenharmony_ci 642e5b6d6dSopenharmony_cistatic UnicodeSet *uni32Singleton; 652e5b6d6dSopenharmony_cistatic icu::UInitOnce uni32InitOnce {}; 662e5b6d6dSopenharmony_ci 672e5b6d6dSopenharmony_ci/** 682e5b6d6dSopenharmony_ci * Cleanup function for UnicodeSet 692e5b6d6dSopenharmony_ci */ 702e5b6d6dSopenharmony_cistatic UBool U_CALLCONV uset_cleanup(void) { 712e5b6d6dSopenharmony_ci delete uni32Singleton; 722e5b6d6dSopenharmony_ci uni32Singleton = NULL; 732e5b6d6dSopenharmony_ci uni32InitOnce.reset(); 742e5b6d6dSopenharmony_ci return true; 752e5b6d6dSopenharmony_ci} 762e5b6d6dSopenharmony_ci 772e5b6d6dSopenharmony_ciU_CDECL_END 782e5b6d6dSopenharmony_ci 792e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN 802e5b6d6dSopenharmony_ci 812e5b6d6dSopenharmony_cinamespace { 822e5b6d6dSopenharmony_ci 832e5b6d6dSopenharmony_ci// Cache some sets for other services -------------------------------------- *** 842e5b6d6dSopenharmony_civoid U_CALLCONV createUni32Set(UErrorCode &errorCode) { 852e5b6d6dSopenharmony_ci U_ASSERT(uni32Singleton == NULL); 862e5b6d6dSopenharmony_ci uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); 872e5b6d6dSopenharmony_ci if(uni32Singleton==NULL) { 882e5b6d6dSopenharmony_ci errorCode=U_MEMORY_ALLOCATION_ERROR; 892e5b6d6dSopenharmony_ci } else { 902e5b6d6dSopenharmony_ci uni32Singleton->freeze(); 912e5b6d6dSopenharmony_ci } 922e5b6d6dSopenharmony_ci ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 932e5b6d6dSopenharmony_ci} 942e5b6d6dSopenharmony_ci 952e5b6d6dSopenharmony_ci 962e5b6d6dSopenharmony_ciU_CFUNC UnicodeSet * 972e5b6d6dSopenharmony_ciuniset_getUnicode32Instance(UErrorCode &errorCode) { 982e5b6d6dSopenharmony_ci umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); 992e5b6d6dSopenharmony_ci return uni32Singleton; 1002e5b6d6dSopenharmony_ci} 1012e5b6d6dSopenharmony_ci 1022e5b6d6dSopenharmony_ci// helper functions for matching of pattern syntax pieces ------------------ *** 1032e5b6d6dSopenharmony_ci// these functions are parallel to the PERL_OPEN etc. strings above 1042e5b6d6dSopenharmony_ci 1052e5b6d6dSopenharmony_ci// using these functions is not only faster than UnicodeString::compare() and 1062e5b6d6dSopenharmony_ci// caseCompare(), but they also make UnicodeSet work for simple patterns when 1072e5b6d6dSopenharmony_ci// no Unicode properties data is available - when caseCompare() fails 1082e5b6d6dSopenharmony_ci 1092e5b6d6dSopenharmony_cistatic inline UBool 1102e5b6d6dSopenharmony_ciisPerlOpen(const UnicodeString &pattern, int32_t pos) { 1112e5b6d6dSopenharmony_ci UChar c; 1122e5b6d6dSopenharmony_ci return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P'); 1132e5b6d6dSopenharmony_ci} 1142e5b6d6dSopenharmony_ci 1152e5b6d6dSopenharmony_ci/*static inline UBool 1162e5b6d6dSopenharmony_ciisPerlClose(const UnicodeString &pattern, int32_t pos) { 1172e5b6d6dSopenharmony_ci return pattern.charAt(pos)==u'}'; 1182e5b6d6dSopenharmony_ci}*/ 1192e5b6d6dSopenharmony_ci 1202e5b6d6dSopenharmony_cistatic inline UBool 1212e5b6d6dSopenharmony_ciisNameOpen(const UnicodeString &pattern, int32_t pos) { 1222e5b6d6dSopenharmony_ci return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N'; 1232e5b6d6dSopenharmony_ci} 1242e5b6d6dSopenharmony_ci 1252e5b6d6dSopenharmony_cistatic inline UBool 1262e5b6d6dSopenharmony_ciisPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 1272e5b6d6dSopenharmony_ci return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':'; 1282e5b6d6dSopenharmony_ci} 1292e5b6d6dSopenharmony_ci 1302e5b6d6dSopenharmony_ci/*static inline UBool 1312e5b6d6dSopenharmony_ciisPOSIXClose(const UnicodeString &pattern, int32_t pos) { 1322e5b6d6dSopenharmony_ci return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']'; 1332e5b6d6dSopenharmony_ci}*/ 1342e5b6d6dSopenharmony_ci 1352e5b6d6dSopenharmony_ci// TODO memory debugging provided inside uniset.cpp 1362e5b6d6dSopenharmony_ci// could be made available here but probably obsolete with use of modern 1372e5b6d6dSopenharmony_ci// memory leak checker tools 1382e5b6d6dSopenharmony_ci#define _dbgct(me) 1392e5b6d6dSopenharmony_ci 1402e5b6d6dSopenharmony_ci} // namespace 1412e5b6d6dSopenharmony_ci 1422e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 1432e5b6d6dSopenharmony_ci// Constructors &c 1442e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 1452e5b6d6dSopenharmony_ci 1462e5b6d6dSopenharmony_ci/** 1472e5b6d6dSopenharmony_ci * Constructs a set from the given pattern, optionally ignoring 1482e5b6d6dSopenharmony_ci * white space. See the class description for the syntax of the 1492e5b6d6dSopenharmony_ci * pattern language. 1502e5b6d6dSopenharmony_ci * @param pattern a string specifying what characters are in the set 1512e5b6d6dSopenharmony_ci */ 1522e5b6d6dSopenharmony_ciUnicodeSet::UnicodeSet(const UnicodeString& pattern, 1532e5b6d6dSopenharmony_ci UErrorCode& status) { 1542e5b6d6dSopenharmony_ci applyPattern(pattern, status); 1552e5b6d6dSopenharmony_ci _dbgct(this); 1562e5b6d6dSopenharmony_ci} 1572e5b6d6dSopenharmony_ci 1582e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 1592e5b6d6dSopenharmony_ci// Public API 1602e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 1612e5b6d6dSopenharmony_ci 1622e5b6d6dSopenharmony_ciUnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 1632e5b6d6dSopenharmony_ci UErrorCode& status) { 1642e5b6d6dSopenharmony_ci // Equivalent to 1652e5b6d6dSopenharmony_ci // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 1662e5b6d6dSopenharmony_ci // but without dependency on closeOver(). 1672e5b6d6dSopenharmony_ci ParsePosition pos(0); 1682e5b6d6dSopenharmony_ci applyPatternIgnoreSpace(pattern, pos, NULL, status); 1692e5b6d6dSopenharmony_ci if (U_FAILURE(status)) return *this; 1702e5b6d6dSopenharmony_ci 1712e5b6d6dSopenharmony_ci int32_t i = pos.getIndex(); 1722e5b6d6dSopenharmony_ci // Skip over trailing whitespace 1732e5b6d6dSopenharmony_ci ICU_Utility::skipWhitespace(pattern, i, true); 1742e5b6d6dSopenharmony_ci if (i != pattern.length()) { 1752e5b6d6dSopenharmony_ci status = U_ILLEGAL_ARGUMENT_ERROR; 1762e5b6d6dSopenharmony_ci } 1772e5b6d6dSopenharmony_ci return *this; 1782e5b6d6dSopenharmony_ci} 1792e5b6d6dSopenharmony_ci 1802e5b6d6dSopenharmony_civoid 1812e5b6d6dSopenharmony_ciUnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, 1822e5b6d6dSopenharmony_ci ParsePosition& pos, 1832e5b6d6dSopenharmony_ci const SymbolTable* symbols, 1842e5b6d6dSopenharmony_ci UErrorCode& status) { 1852e5b6d6dSopenharmony_ci if (U_FAILURE(status)) { 1862e5b6d6dSopenharmony_ci return; 1872e5b6d6dSopenharmony_ci } 1882e5b6d6dSopenharmony_ci if (isFrozen()) { 1892e5b6d6dSopenharmony_ci status = U_NO_WRITE_PERMISSION; 1902e5b6d6dSopenharmony_ci return; 1912e5b6d6dSopenharmony_ci } 1922e5b6d6dSopenharmony_ci // Need to build the pattern in a temporary string because 1932e5b6d6dSopenharmony_ci // _applyPattern calls add() etc., which set pat to empty. 1942e5b6d6dSopenharmony_ci UnicodeString rebuiltPat; 1952e5b6d6dSopenharmony_ci RuleCharacterIterator chars(pattern, symbols, pos); 1962e5b6d6dSopenharmony_ci applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status); 1972e5b6d6dSopenharmony_ci if (U_FAILURE(status)) return; 1982e5b6d6dSopenharmony_ci if (chars.inVariable()) { 1992e5b6d6dSopenharmony_ci // syntaxError(chars, "Extra chars in variable value"); 2002e5b6d6dSopenharmony_ci status = U_MALFORMED_SET; 2012e5b6d6dSopenharmony_ci return; 2022e5b6d6dSopenharmony_ci } 2032e5b6d6dSopenharmony_ci setPattern(rebuiltPat); 2042e5b6d6dSopenharmony_ci} 2052e5b6d6dSopenharmony_ci 2062e5b6d6dSopenharmony_ci/** 2072e5b6d6dSopenharmony_ci * Return true if the given position, in the given pattern, appears 2082e5b6d6dSopenharmony_ci * to be the start of a UnicodeSet pattern. 2092e5b6d6dSopenharmony_ci */ 2102e5b6d6dSopenharmony_ciUBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 2112e5b6d6dSopenharmony_ci return ((pos+1) < pattern.length() && 2122e5b6d6dSopenharmony_ci pattern.charAt(pos) == (UChar)91/*[*/) || 2132e5b6d6dSopenharmony_ci resemblesPropertyPattern(pattern, pos); 2142e5b6d6dSopenharmony_ci} 2152e5b6d6dSopenharmony_ci 2162e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 2172e5b6d6dSopenharmony_ci// Implementation: Pattern parsing 2182e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 2192e5b6d6dSopenharmony_ci 2202e5b6d6dSopenharmony_cinamespace { 2212e5b6d6dSopenharmony_ci 2222e5b6d6dSopenharmony_ci/** 2232e5b6d6dSopenharmony_ci * A small all-inline class to manage a UnicodeSet pointer. Add 2242e5b6d6dSopenharmony_ci * operator->() etc. as needed. 2252e5b6d6dSopenharmony_ci */ 2262e5b6d6dSopenharmony_ciclass UnicodeSetPointer { 2272e5b6d6dSopenharmony_ci UnicodeSet* p; 2282e5b6d6dSopenharmony_cipublic: 2292e5b6d6dSopenharmony_ci inline UnicodeSetPointer() : p(0) {} 2302e5b6d6dSopenharmony_ci inline ~UnicodeSetPointer() { delete p; } 2312e5b6d6dSopenharmony_ci inline UnicodeSet* pointer() { return p; } 2322e5b6d6dSopenharmony_ci inline UBool allocate() { 2332e5b6d6dSopenharmony_ci if (p == 0) { 2342e5b6d6dSopenharmony_ci p = new UnicodeSet(); 2352e5b6d6dSopenharmony_ci } 2362e5b6d6dSopenharmony_ci return p != 0; 2372e5b6d6dSopenharmony_ci } 2382e5b6d6dSopenharmony_ci}; 2392e5b6d6dSopenharmony_ci 2402e5b6d6dSopenharmony_ciconstexpr int32_t MAX_DEPTH = 100; 2412e5b6d6dSopenharmony_ci 2422e5b6d6dSopenharmony_ci} // namespace 2432e5b6d6dSopenharmony_ci 2442e5b6d6dSopenharmony_ci/** 2452e5b6d6dSopenharmony_ci * Parse the pattern from the given RuleCharacterIterator. The 2462e5b6d6dSopenharmony_ci * iterator is advanced over the parsed pattern. 2472e5b6d6dSopenharmony_ci * @param chars iterator over the pattern characters. Upon return 2482e5b6d6dSopenharmony_ci * it will be advanced to the first character after the parsed 2492e5b6d6dSopenharmony_ci * pattern, or the end of the iteration if all characters are 2502e5b6d6dSopenharmony_ci * parsed. 2512e5b6d6dSopenharmony_ci * @param symbols symbol table to use to parse and dereference 2522e5b6d6dSopenharmony_ci * variables, or null if none. 2532e5b6d6dSopenharmony_ci * @param rebuiltPat the pattern that was parsed, rebuilt or 2542e5b6d6dSopenharmony_ci * copied from the input pattern, as appropriate. 2552e5b6d6dSopenharmony_ci * @param options a bit mask of zero or more of the following: 2562e5b6d6dSopenharmony_ci * IGNORE_SPACE, CASE. 2572e5b6d6dSopenharmony_ci */ 2582e5b6d6dSopenharmony_civoid UnicodeSet::applyPattern(RuleCharacterIterator& chars, 2592e5b6d6dSopenharmony_ci const SymbolTable* symbols, 2602e5b6d6dSopenharmony_ci UnicodeString& rebuiltPat, 2612e5b6d6dSopenharmony_ci uint32_t options, 2622e5b6d6dSopenharmony_ci UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 2632e5b6d6dSopenharmony_ci int32_t depth, 2642e5b6d6dSopenharmony_ci UErrorCode& ec) { 2652e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 2662e5b6d6dSopenharmony_ci if (depth > MAX_DEPTH) { 2672e5b6d6dSopenharmony_ci ec = U_ILLEGAL_ARGUMENT_ERROR; 2682e5b6d6dSopenharmony_ci return; 2692e5b6d6dSopenharmony_ci } 2702e5b6d6dSopenharmony_ci 2712e5b6d6dSopenharmony_ci // Syntax characters: [ ] ^ - & { } 2722e5b6d6dSopenharmony_ci 2732e5b6d6dSopenharmony_ci // Recognized special forms for chars, sets: c-c s-s s&s 2742e5b6d6dSopenharmony_ci 2752e5b6d6dSopenharmony_ci int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | 2762e5b6d6dSopenharmony_ci RuleCharacterIterator::PARSE_ESCAPES; 2772e5b6d6dSopenharmony_ci if ((options & USET_IGNORE_SPACE) != 0) { 2782e5b6d6dSopenharmony_ci opts |= RuleCharacterIterator::SKIP_WHITESPACE; 2792e5b6d6dSopenharmony_ci } 2802e5b6d6dSopenharmony_ci 2812e5b6d6dSopenharmony_ci UnicodeString patLocal, buf; 2822e5b6d6dSopenharmony_ci UBool usePat = false; 2832e5b6d6dSopenharmony_ci UnicodeSetPointer scratch; 2842e5b6d6dSopenharmony_ci RuleCharacterIterator::Pos backup; 2852e5b6d6dSopenharmony_ci 2862e5b6d6dSopenharmony_ci // mode: 0=before [, 1=between [...], 2=after ] 2872e5b6d6dSopenharmony_ci // lastItem: 0=none, 1=char, 2=set 2882e5b6d6dSopenharmony_ci int8_t lastItem = 0, mode = 0; 2892e5b6d6dSopenharmony_ci UChar32 lastChar = 0; 2902e5b6d6dSopenharmony_ci UChar op = 0; 2912e5b6d6dSopenharmony_ci 2922e5b6d6dSopenharmony_ci UBool invert = false; 2932e5b6d6dSopenharmony_ci 2942e5b6d6dSopenharmony_ci clear(); 2952e5b6d6dSopenharmony_ci 2962e5b6d6dSopenharmony_ci while (mode != 2 && !chars.atEnd()) { 2972e5b6d6dSopenharmony_ci U_ASSERT((lastItem == 0 && op == 0) || 2982e5b6d6dSopenharmony_ci (lastItem == 1 && (op == 0 || op == u'-')) || 2992e5b6d6dSopenharmony_ci (lastItem == 2 && (op == 0 || op == u'-' || op == u'&'))); 3002e5b6d6dSopenharmony_ci 3012e5b6d6dSopenharmony_ci UChar32 c = 0; 3022e5b6d6dSopenharmony_ci UBool literal = false; 3032e5b6d6dSopenharmony_ci UnicodeSet* nested = 0; // alias - do not delete 3042e5b6d6dSopenharmony_ci 3052e5b6d6dSopenharmony_ci // -------- Check for property pattern 3062e5b6d6dSopenharmony_ci 3072e5b6d6dSopenharmony_ci // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 3082e5b6d6dSopenharmony_ci int8_t setMode = 0; 3092e5b6d6dSopenharmony_ci if (resemblesPropertyPattern(chars, opts)) { 3102e5b6d6dSopenharmony_ci setMode = 2; 3112e5b6d6dSopenharmony_ci } 3122e5b6d6dSopenharmony_ci 3132e5b6d6dSopenharmony_ci // -------- Parse '[' of opening delimiter OR nested set. 3142e5b6d6dSopenharmony_ci // If there is a nested set, use `setMode' to define how 3152e5b6d6dSopenharmony_ci // the set should be parsed. If the '[' is part of the 3162e5b6d6dSopenharmony_ci // opening delimiter for this pattern, parse special 3172e5b6d6dSopenharmony_ci // strings "[", "[^", "[-", and "[^-". Check for stand-in 3182e5b6d6dSopenharmony_ci // characters representing a nested set in the symbol 3192e5b6d6dSopenharmony_ci // table. 3202e5b6d6dSopenharmony_ci 3212e5b6d6dSopenharmony_ci else { 3222e5b6d6dSopenharmony_ci // Prepare to backup if necessary 3232e5b6d6dSopenharmony_ci chars.getPos(backup); 3242e5b6d6dSopenharmony_ci c = chars.next(opts, literal, ec); 3252e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 3262e5b6d6dSopenharmony_ci 3272e5b6d6dSopenharmony_ci if (c == u'[' && !literal) { 3282e5b6d6dSopenharmony_ci if (mode == 1) { 3292e5b6d6dSopenharmony_ci chars.setPos(backup); // backup 3302e5b6d6dSopenharmony_ci setMode = 1; 3312e5b6d6dSopenharmony_ci } else { 3322e5b6d6dSopenharmony_ci // Handle opening '[' delimiter 3332e5b6d6dSopenharmony_ci mode = 1; 3342e5b6d6dSopenharmony_ci patLocal.append(u'['); 3352e5b6d6dSopenharmony_ci chars.getPos(backup); // prepare to backup 3362e5b6d6dSopenharmony_ci c = chars.next(opts, literal, ec); 3372e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 3382e5b6d6dSopenharmony_ci if (c == u'^' && !literal) { 3392e5b6d6dSopenharmony_ci invert = true; 3402e5b6d6dSopenharmony_ci patLocal.append(u'^'); 3412e5b6d6dSopenharmony_ci chars.getPos(backup); // prepare to backup 3422e5b6d6dSopenharmony_ci c = chars.next(opts, literal, ec); 3432e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 3442e5b6d6dSopenharmony_ci } 3452e5b6d6dSopenharmony_ci // Fall through to handle special leading '-'; 3462e5b6d6dSopenharmony_ci // otherwise restart loop for nested [], \p{}, etc. 3472e5b6d6dSopenharmony_ci if (c == u'-') { 3482e5b6d6dSopenharmony_ci literal = true; 3492e5b6d6dSopenharmony_ci // Fall through to handle literal '-' below 3502e5b6d6dSopenharmony_ci } else { 3512e5b6d6dSopenharmony_ci chars.setPos(backup); // backup 3522e5b6d6dSopenharmony_ci continue; 3532e5b6d6dSopenharmony_ci } 3542e5b6d6dSopenharmony_ci } 3552e5b6d6dSopenharmony_ci } else if (symbols != 0) { 3562e5b6d6dSopenharmony_ci const UnicodeFunctor *m = symbols->lookupMatcher(c); 3572e5b6d6dSopenharmony_ci if (m != 0) { 3582e5b6d6dSopenharmony_ci const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); 3592e5b6d6dSopenharmony_ci if (ms == NULL) { 3602e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 3612e5b6d6dSopenharmony_ci return; 3622e5b6d6dSopenharmony_ci } 3632e5b6d6dSopenharmony_ci // casting away const, but `nested' won't be modified 3642e5b6d6dSopenharmony_ci // (important not to modify stored set) 3652e5b6d6dSopenharmony_ci nested = const_cast<UnicodeSet*>(ms); 3662e5b6d6dSopenharmony_ci setMode = 3; 3672e5b6d6dSopenharmony_ci } 3682e5b6d6dSopenharmony_ci } 3692e5b6d6dSopenharmony_ci } 3702e5b6d6dSopenharmony_ci 3712e5b6d6dSopenharmony_ci // -------- Handle a nested set. This either is inline in 3722e5b6d6dSopenharmony_ci // the pattern or represented by a stand-in that has 3732e5b6d6dSopenharmony_ci // previously been parsed and was looked up in the symbol 3742e5b6d6dSopenharmony_ci // table. 3752e5b6d6dSopenharmony_ci 3762e5b6d6dSopenharmony_ci if (setMode != 0) { 3772e5b6d6dSopenharmony_ci if (lastItem == 1) { 3782e5b6d6dSopenharmony_ci if (op != 0) { 3792e5b6d6dSopenharmony_ci // syntaxError(chars, "Char expected after operator"); 3802e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 3812e5b6d6dSopenharmony_ci return; 3822e5b6d6dSopenharmony_ci } 3832e5b6d6dSopenharmony_ci add(lastChar, lastChar); 3842e5b6d6dSopenharmony_ci _appendToPat(patLocal, lastChar, false); 3852e5b6d6dSopenharmony_ci lastItem = 0; 3862e5b6d6dSopenharmony_ci op = 0; 3872e5b6d6dSopenharmony_ci } 3882e5b6d6dSopenharmony_ci 3892e5b6d6dSopenharmony_ci if (op == u'-' || op == u'&') { 3902e5b6d6dSopenharmony_ci patLocal.append(op); 3912e5b6d6dSopenharmony_ci } 3922e5b6d6dSopenharmony_ci 3932e5b6d6dSopenharmony_ci if (nested == 0) { 3942e5b6d6dSopenharmony_ci // lazy allocation 3952e5b6d6dSopenharmony_ci if (!scratch.allocate()) { 3962e5b6d6dSopenharmony_ci ec = U_MEMORY_ALLOCATION_ERROR; 3972e5b6d6dSopenharmony_ci return; 3982e5b6d6dSopenharmony_ci } 3992e5b6d6dSopenharmony_ci nested = scratch.pointer(); 4002e5b6d6dSopenharmony_ci } 4012e5b6d6dSopenharmony_ci switch (setMode) { 4022e5b6d6dSopenharmony_ci case 1: 4032e5b6d6dSopenharmony_ci nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec); 4042e5b6d6dSopenharmony_ci break; 4052e5b6d6dSopenharmony_ci case 2: 4062e5b6d6dSopenharmony_ci chars.skipIgnored(opts); 4072e5b6d6dSopenharmony_ci nested->applyPropertyPattern(chars, patLocal, ec); 4082e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 4092e5b6d6dSopenharmony_ci break; 4102e5b6d6dSopenharmony_ci case 3: // `nested' already parsed 4112e5b6d6dSopenharmony_ci nested->_toPattern(patLocal, false); 4122e5b6d6dSopenharmony_ci break; 4132e5b6d6dSopenharmony_ci } 4142e5b6d6dSopenharmony_ci 4152e5b6d6dSopenharmony_ci usePat = true; 4162e5b6d6dSopenharmony_ci 4172e5b6d6dSopenharmony_ci if (mode == 0) { 4182e5b6d6dSopenharmony_ci // Entire pattern is a category; leave parse loop 4192e5b6d6dSopenharmony_ci *this = *nested; 4202e5b6d6dSopenharmony_ci mode = 2; 4212e5b6d6dSopenharmony_ci break; 4222e5b6d6dSopenharmony_ci } 4232e5b6d6dSopenharmony_ci 4242e5b6d6dSopenharmony_ci switch (op) { 4252e5b6d6dSopenharmony_ci case u'-': 4262e5b6d6dSopenharmony_ci removeAll(*nested); 4272e5b6d6dSopenharmony_ci break; 4282e5b6d6dSopenharmony_ci case u'&': 4292e5b6d6dSopenharmony_ci retainAll(*nested); 4302e5b6d6dSopenharmony_ci break; 4312e5b6d6dSopenharmony_ci case 0: 4322e5b6d6dSopenharmony_ci addAll(*nested); 4332e5b6d6dSopenharmony_ci break; 4342e5b6d6dSopenharmony_ci } 4352e5b6d6dSopenharmony_ci 4362e5b6d6dSopenharmony_ci op = 0; 4372e5b6d6dSopenharmony_ci lastItem = 2; 4382e5b6d6dSopenharmony_ci 4392e5b6d6dSopenharmony_ci continue; 4402e5b6d6dSopenharmony_ci } 4412e5b6d6dSopenharmony_ci 4422e5b6d6dSopenharmony_ci if (mode == 0) { 4432e5b6d6dSopenharmony_ci // syntaxError(chars, "Missing '['"); 4442e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 4452e5b6d6dSopenharmony_ci return; 4462e5b6d6dSopenharmony_ci } 4472e5b6d6dSopenharmony_ci 4482e5b6d6dSopenharmony_ci // -------- Parse special (syntax) characters. If the 4492e5b6d6dSopenharmony_ci // current character is not special, or if it is escaped, 4502e5b6d6dSopenharmony_ci // then fall through and handle it below. 4512e5b6d6dSopenharmony_ci 4522e5b6d6dSopenharmony_ci if (!literal) { 4532e5b6d6dSopenharmony_ci switch (c) { 4542e5b6d6dSopenharmony_ci case u']': 4552e5b6d6dSopenharmony_ci if (lastItem == 1) { 4562e5b6d6dSopenharmony_ci add(lastChar, lastChar); 4572e5b6d6dSopenharmony_ci _appendToPat(patLocal, lastChar, false); 4582e5b6d6dSopenharmony_ci } 4592e5b6d6dSopenharmony_ci // Treat final trailing '-' as a literal 4602e5b6d6dSopenharmony_ci if (op == u'-') { 4612e5b6d6dSopenharmony_ci add(op, op); 4622e5b6d6dSopenharmony_ci patLocal.append(op); 4632e5b6d6dSopenharmony_ci } else if (op == u'&') { 4642e5b6d6dSopenharmony_ci // syntaxError(chars, "Trailing '&'"); 4652e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 4662e5b6d6dSopenharmony_ci return; 4672e5b6d6dSopenharmony_ci } 4682e5b6d6dSopenharmony_ci patLocal.append(u']'); 4692e5b6d6dSopenharmony_ci mode = 2; 4702e5b6d6dSopenharmony_ci continue; 4712e5b6d6dSopenharmony_ci case u'-': 4722e5b6d6dSopenharmony_ci if (op == 0) { 4732e5b6d6dSopenharmony_ci if (lastItem != 0) { 4742e5b6d6dSopenharmony_ci op = (UChar) c; 4752e5b6d6dSopenharmony_ci continue; 4762e5b6d6dSopenharmony_ci } else { 4772e5b6d6dSopenharmony_ci // Treat final trailing '-' as a literal 4782e5b6d6dSopenharmony_ci add(c, c); 4792e5b6d6dSopenharmony_ci c = chars.next(opts, literal, ec); 4802e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 4812e5b6d6dSopenharmony_ci if (c == u']' && !literal) { 4822e5b6d6dSopenharmony_ci patLocal.append(u"-]", 2); 4832e5b6d6dSopenharmony_ci mode = 2; 4842e5b6d6dSopenharmony_ci continue; 4852e5b6d6dSopenharmony_ci } 4862e5b6d6dSopenharmony_ci } 4872e5b6d6dSopenharmony_ci } 4882e5b6d6dSopenharmony_ci // syntaxError(chars, "'-' not after char or set"); 4892e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 4902e5b6d6dSopenharmony_ci return; 4912e5b6d6dSopenharmony_ci case u'&': 4922e5b6d6dSopenharmony_ci if (lastItem == 2 && op == 0) { 4932e5b6d6dSopenharmony_ci op = (UChar) c; 4942e5b6d6dSopenharmony_ci continue; 4952e5b6d6dSopenharmony_ci } 4962e5b6d6dSopenharmony_ci // syntaxError(chars, "'&' not after set"); 4972e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 4982e5b6d6dSopenharmony_ci return; 4992e5b6d6dSopenharmony_ci case u'^': 5002e5b6d6dSopenharmony_ci // syntaxError(chars, "'^' not after '['"); 5012e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 5022e5b6d6dSopenharmony_ci return; 5032e5b6d6dSopenharmony_ci case u'{': 5042e5b6d6dSopenharmony_ci if (op != 0) { 5052e5b6d6dSopenharmony_ci // syntaxError(chars, "Missing operand after operator"); 5062e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 5072e5b6d6dSopenharmony_ci return; 5082e5b6d6dSopenharmony_ci } 5092e5b6d6dSopenharmony_ci if (lastItem == 1) { 5102e5b6d6dSopenharmony_ci add(lastChar, lastChar); 5112e5b6d6dSopenharmony_ci _appendToPat(patLocal, lastChar, false); 5122e5b6d6dSopenharmony_ci } 5132e5b6d6dSopenharmony_ci lastItem = 0; 5142e5b6d6dSopenharmony_ci buf.truncate(0); 5152e5b6d6dSopenharmony_ci { 5162e5b6d6dSopenharmony_ci UBool ok = false; 5172e5b6d6dSopenharmony_ci while (!chars.atEnd()) { 5182e5b6d6dSopenharmony_ci c = chars.next(opts, literal, ec); 5192e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 5202e5b6d6dSopenharmony_ci if (c == u'}' && !literal) { 5212e5b6d6dSopenharmony_ci ok = true; 5222e5b6d6dSopenharmony_ci break; 5232e5b6d6dSopenharmony_ci } 5242e5b6d6dSopenharmony_ci buf.append(c); 5252e5b6d6dSopenharmony_ci } 5262e5b6d6dSopenharmony_ci if (!ok) { 5272e5b6d6dSopenharmony_ci // syntaxError(chars, "Invalid multicharacter string"); 5282e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 5292e5b6d6dSopenharmony_ci return; 5302e5b6d6dSopenharmony_ci } 5312e5b6d6dSopenharmony_ci } 5322e5b6d6dSopenharmony_ci // We have new string. Add it to set and continue; 5332e5b6d6dSopenharmony_ci // we don't need to drop through to the further 5342e5b6d6dSopenharmony_ci // processing 5352e5b6d6dSopenharmony_ci add(buf); 5362e5b6d6dSopenharmony_ci patLocal.append(u'{'); 5372e5b6d6dSopenharmony_ci _appendToPat(patLocal, buf, false); 5382e5b6d6dSopenharmony_ci patLocal.append(u'}'); 5392e5b6d6dSopenharmony_ci continue; 5402e5b6d6dSopenharmony_ci case SymbolTable::SYMBOL_REF: 5412e5b6d6dSopenharmony_ci // symbols nosymbols 5422e5b6d6dSopenharmony_ci // [a-$] error error (ambiguous) 5432e5b6d6dSopenharmony_ci // [a$] anchor anchor 5442e5b6d6dSopenharmony_ci // [a-$x] var "x"* literal '$' 5452e5b6d6dSopenharmony_ci // [a-$.] error literal '$' 5462e5b6d6dSopenharmony_ci // *We won't get here in the case of var "x" 5472e5b6d6dSopenharmony_ci { 5482e5b6d6dSopenharmony_ci chars.getPos(backup); 5492e5b6d6dSopenharmony_ci c = chars.next(opts, literal, ec); 5502e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 5512e5b6d6dSopenharmony_ci UBool anchor = (c == u']' && !literal); 5522e5b6d6dSopenharmony_ci if (symbols == 0 && !anchor) { 5532e5b6d6dSopenharmony_ci c = SymbolTable::SYMBOL_REF; 5542e5b6d6dSopenharmony_ci chars.setPos(backup); 5552e5b6d6dSopenharmony_ci break; // literal '$' 5562e5b6d6dSopenharmony_ci } 5572e5b6d6dSopenharmony_ci if (anchor && op == 0) { 5582e5b6d6dSopenharmony_ci if (lastItem == 1) { 5592e5b6d6dSopenharmony_ci add(lastChar, lastChar); 5602e5b6d6dSopenharmony_ci _appendToPat(patLocal, lastChar, false); 5612e5b6d6dSopenharmony_ci } 5622e5b6d6dSopenharmony_ci add(U_ETHER); 5632e5b6d6dSopenharmony_ci usePat = true; 5642e5b6d6dSopenharmony_ci patLocal.append((UChar) SymbolTable::SYMBOL_REF); 5652e5b6d6dSopenharmony_ci patLocal.append(u']'); 5662e5b6d6dSopenharmony_ci mode = 2; 5672e5b6d6dSopenharmony_ci continue; 5682e5b6d6dSopenharmony_ci } 5692e5b6d6dSopenharmony_ci // syntaxError(chars, "Unquoted '$'"); 5702e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 5712e5b6d6dSopenharmony_ci return; 5722e5b6d6dSopenharmony_ci } 5732e5b6d6dSopenharmony_ci default: 5742e5b6d6dSopenharmony_ci break; 5752e5b6d6dSopenharmony_ci } 5762e5b6d6dSopenharmony_ci } 5772e5b6d6dSopenharmony_ci 5782e5b6d6dSopenharmony_ci // -------- Parse literal characters. This includes both 5792e5b6d6dSopenharmony_ci // escaped chars ("\u4E01") and non-syntax characters 5802e5b6d6dSopenharmony_ci // ("a"). 5812e5b6d6dSopenharmony_ci 5822e5b6d6dSopenharmony_ci switch (lastItem) { 5832e5b6d6dSopenharmony_ci case 0: 5842e5b6d6dSopenharmony_ci lastItem = 1; 5852e5b6d6dSopenharmony_ci lastChar = c; 5862e5b6d6dSopenharmony_ci break; 5872e5b6d6dSopenharmony_ci case 1: 5882e5b6d6dSopenharmony_ci if (op == u'-') { 5892e5b6d6dSopenharmony_ci if (lastChar >= c) { 5902e5b6d6dSopenharmony_ci // Don't allow redundant (a-a) or empty (b-a) ranges; 5912e5b6d6dSopenharmony_ci // these are most likely typos. 5922e5b6d6dSopenharmony_ci // syntaxError(chars, "Invalid range"); 5932e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 5942e5b6d6dSopenharmony_ci return; 5952e5b6d6dSopenharmony_ci } 5962e5b6d6dSopenharmony_ci add(lastChar, c); 5972e5b6d6dSopenharmony_ci _appendToPat(patLocal, lastChar, false); 5982e5b6d6dSopenharmony_ci patLocal.append(op); 5992e5b6d6dSopenharmony_ci _appendToPat(patLocal, c, false); 6002e5b6d6dSopenharmony_ci lastItem = 0; 6012e5b6d6dSopenharmony_ci op = 0; 6022e5b6d6dSopenharmony_ci } else { 6032e5b6d6dSopenharmony_ci add(lastChar, lastChar); 6042e5b6d6dSopenharmony_ci _appendToPat(patLocal, lastChar, false); 6052e5b6d6dSopenharmony_ci lastChar = c; 6062e5b6d6dSopenharmony_ci } 6072e5b6d6dSopenharmony_ci break; 6082e5b6d6dSopenharmony_ci case 2: 6092e5b6d6dSopenharmony_ci if (op != 0) { 6102e5b6d6dSopenharmony_ci // syntaxError(chars, "Set expected after operator"); 6112e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 6122e5b6d6dSopenharmony_ci return; 6132e5b6d6dSopenharmony_ci } 6142e5b6d6dSopenharmony_ci lastChar = c; 6152e5b6d6dSopenharmony_ci lastItem = 1; 6162e5b6d6dSopenharmony_ci break; 6172e5b6d6dSopenharmony_ci } 6182e5b6d6dSopenharmony_ci } 6192e5b6d6dSopenharmony_ci 6202e5b6d6dSopenharmony_ci if (mode != 2) { 6212e5b6d6dSopenharmony_ci // syntaxError(chars, "Missing ']'"); 6222e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 6232e5b6d6dSopenharmony_ci return; 6242e5b6d6dSopenharmony_ci } 6252e5b6d6dSopenharmony_ci 6262e5b6d6dSopenharmony_ci chars.skipIgnored(opts); 6272e5b6d6dSopenharmony_ci 6282e5b6d6dSopenharmony_ci /** 6292e5b6d6dSopenharmony_ci * Handle global flags (invert, case insensitivity). If this 6302e5b6d6dSopenharmony_ci * pattern should be compiled case-insensitive, then we need 6312e5b6d6dSopenharmony_ci * to close over case BEFORE COMPLEMENTING. This makes 6322e5b6d6dSopenharmony_ci * patterns like /[^abc]/i work. 6332e5b6d6dSopenharmony_ci */ 6342e5b6d6dSopenharmony_ci if ((options & USET_CASE_INSENSITIVE) != 0) { 6352e5b6d6dSopenharmony_ci (this->*caseClosure)(USET_CASE_INSENSITIVE); 6362e5b6d6dSopenharmony_ci } 6372e5b6d6dSopenharmony_ci else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { 6382e5b6d6dSopenharmony_ci (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); 6392e5b6d6dSopenharmony_ci } 6402e5b6d6dSopenharmony_ci if (invert) { 6412e5b6d6dSopenharmony_ci complement().removeAllStrings(); // code point complement 6422e5b6d6dSopenharmony_ci } 6432e5b6d6dSopenharmony_ci 6442e5b6d6dSopenharmony_ci // Use the rebuilt pattern (patLocal) only if necessary. Prefer the 6452e5b6d6dSopenharmony_ci // generated pattern. 6462e5b6d6dSopenharmony_ci if (usePat) { 6472e5b6d6dSopenharmony_ci rebuiltPat.append(patLocal); 6482e5b6d6dSopenharmony_ci } else { 6492e5b6d6dSopenharmony_ci _generatePattern(rebuiltPat, false); 6502e5b6d6dSopenharmony_ci } 6512e5b6d6dSopenharmony_ci if (isBogus() && U_SUCCESS(ec)) { 6522e5b6d6dSopenharmony_ci // We likely ran out of memory. AHHH! 6532e5b6d6dSopenharmony_ci ec = U_MEMORY_ALLOCATION_ERROR; 6542e5b6d6dSopenharmony_ci } 6552e5b6d6dSopenharmony_ci} 6562e5b6d6dSopenharmony_ci 6572e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 6582e5b6d6dSopenharmony_ci// Property set implementation 6592e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 6602e5b6d6dSopenharmony_ci 6612e5b6d6dSopenharmony_cinamespace { 6622e5b6d6dSopenharmony_ci 6632e5b6d6dSopenharmony_cistatic UBool numericValueFilter(UChar32 ch, void* context) { 6642e5b6d6dSopenharmony_ci return u_getNumericValue(ch) == *(double*)context; 6652e5b6d6dSopenharmony_ci} 6662e5b6d6dSopenharmony_ci 6672e5b6d6dSopenharmony_cistatic UBool generalCategoryMaskFilter(UChar32 ch, void* context) { 6682e5b6d6dSopenharmony_ci int32_t value = *(int32_t*)context; 6692e5b6d6dSopenharmony_ci return (U_GET_GC_MASK((UChar32) ch) & value) != 0; 6702e5b6d6dSopenharmony_ci} 6712e5b6d6dSopenharmony_ci 6722e5b6d6dSopenharmony_cistatic UBool versionFilter(UChar32 ch, void* context) { 6732e5b6d6dSopenharmony_ci static const UVersionInfo none = { 0, 0, 0, 0 }; 6742e5b6d6dSopenharmony_ci UVersionInfo v; 6752e5b6d6dSopenharmony_ci u_charAge(ch, v); 6762e5b6d6dSopenharmony_ci UVersionInfo* version = (UVersionInfo*)context; 6772e5b6d6dSopenharmony_ci return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; 6782e5b6d6dSopenharmony_ci} 6792e5b6d6dSopenharmony_ci 6802e5b6d6dSopenharmony_citypedef struct { 6812e5b6d6dSopenharmony_ci UProperty prop; 6822e5b6d6dSopenharmony_ci int32_t value; 6832e5b6d6dSopenharmony_ci} IntPropertyContext; 6842e5b6d6dSopenharmony_ci 6852e5b6d6dSopenharmony_cistatic UBool intPropertyFilter(UChar32 ch, void* context) { 6862e5b6d6dSopenharmony_ci IntPropertyContext* c = (IntPropertyContext*)context; 6872e5b6d6dSopenharmony_ci return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; 6882e5b6d6dSopenharmony_ci} 6892e5b6d6dSopenharmony_ci 6902e5b6d6dSopenharmony_cistatic UBool scriptExtensionsFilter(UChar32 ch, void* context) { 6912e5b6d6dSopenharmony_ci return uscript_hasScript(ch, *(UScriptCode*)context); 6922e5b6d6dSopenharmony_ci} 6932e5b6d6dSopenharmony_ci 6942e5b6d6dSopenharmony_ci} // namespace 6952e5b6d6dSopenharmony_ci 6962e5b6d6dSopenharmony_ci/** 6972e5b6d6dSopenharmony_ci * Generic filter-based scanning code for UCD property UnicodeSets. 6982e5b6d6dSopenharmony_ci */ 6992e5b6d6dSopenharmony_civoid UnicodeSet::applyFilter(UnicodeSet::Filter filter, 7002e5b6d6dSopenharmony_ci void* context, 7012e5b6d6dSopenharmony_ci const UnicodeSet* inclusions, 7022e5b6d6dSopenharmony_ci UErrorCode &status) { 7032e5b6d6dSopenharmony_ci if (U_FAILURE(status)) return; 7042e5b6d6dSopenharmony_ci 7052e5b6d6dSopenharmony_ci // Logically, walk through all Unicode characters, noting the start 7062e5b6d6dSopenharmony_ci // and end of each range for which filter.contain(c) is 7072e5b6d6dSopenharmony_ci // true. Add each range to a set. 7082e5b6d6dSopenharmony_ci // 7092e5b6d6dSopenharmony_ci // To improve performance, use an inclusions set which 7102e5b6d6dSopenharmony_ci // encodes information about character ranges that are known 7112e5b6d6dSopenharmony_ci // to have identical properties. 7122e5b6d6dSopenharmony_ci // inclusions contains the first characters of 7132e5b6d6dSopenharmony_ci // same-value ranges for the given property. 7142e5b6d6dSopenharmony_ci 7152e5b6d6dSopenharmony_ci clear(); 7162e5b6d6dSopenharmony_ci 7172e5b6d6dSopenharmony_ci UChar32 startHasProperty = -1; 7182e5b6d6dSopenharmony_ci int32_t limitRange = inclusions->getRangeCount(); 7192e5b6d6dSopenharmony_ci 7202e5b6d6dSopenharmony_ci for (int j=0; j<limitRange; ++j) { 7212e5b6d6dSopenharmony_ci // get current range 7222e5b6d6dSopenharmony_ci UChar32 start = inclusions->getRangeStart(j); 7232e5b6d6dSopenharmony_ci UChar32 end = inclusions->getRangeEnd(j); 7242e5b6d6dSopenharmony_ci 7252e5b6d6dSopenharmony_ci // for all the code points in the range, process 7262e5b6d6dSopenharmony_ci for (UChar32 ch = start; ch <= end; ++ch) { 7272e5b6d6dSopenharmony_ci // only add to this UnicodeSet on inflection points -- 7282e5b6d6dSopenharmony_ci // where the hasProperty value changes to false 7292e5b6d6dSopenharmony_ci if ((*filter)(ch, context)) { 7302e5b6d6dSopenharmony_ci if (startHasProperty < 0) { 7312e5b6d6dSopenharmony_ci startHasProperty = ch; 7322e5b6d6dSopenharmony_ci } 7332e5b6d6dSopenharmony_ci } else if (startHasProperty >= 0) { 7342e5b6d6dSopenharmony_ci add(startHasProperty, ch-1); 7352e5b6d6dSopenharmony_ci startHasProperty = -1; 7362e5b6d6dSopenharmony_ci } 7372e5b6d6dSopenharmony_ci } 7382e5b6d6dSopenharmony_ci } 7392e5b6d6dSopenharmony_ci if (startHasProperty >= 0) { 7402e5b6d6dSopenharmony_ci add((UChar32)startHasProperty, (UChar32)0x10FFFF); 7412e5b6d6dSopenharmony_ci } 7422e5b6d6dSopenharmony_ci if (isBogus() && U_SUCCESS(status)) { 7432e5b6d6dSopenharmony_ci // We likely ran out of memory. AHHH! 7442e5b6d6dSopenharmony_ci status = U_MEMORY_ALLOCATION_ERROR; 7452e5b6d6dSopenharmony_ci } 7462e5b6d6dSopenharmony_ci} 7472e5b6d6dSopenharmony_ci 7482e5b6d6dSopenharmony_cinamespace { 7492e5b6d6dSopenharmony_ci 7502e5b6d6dSopenharmony_cistatic UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 7512e5b6d6dSopenharmony_ci /* Note: we use ' ' in compiler code page */ 7522e5b6d6dSopenharmony_ci int32_t j = 0; 7532e5b6d6dSopenharmony_ci char ch; 7542e5b6d6dSopenharmony_ci --dstCapacity; /* make room for term. zero */ 7552e5b6d6dSopenharmony_ci while ((ch = *src++) != 0) { 7562e5b6d6dSopenharmony_ci if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { 7572e5b6d6dSopenharmony_ci continue; 7582e5b6d6dSopenharmony_ci } 7592e5b6d6dSopenharmony_ci if (j >= dstCapacity) return false; 7602e5b6d6dSopenharmony_ci dst[j++] = ch; 7612e5b6d6dSopenharmony_ci } 7622e5b6d6dSopenharmony_ci if (j > 0 && dst[j-1] == ' ') --j; 7632e5b6d6dSopenharmony_ci dst[j] = 0; 7642e5b6d6dSopenharmony_ci return true; 7652e5b6d6dSopenharmony_ci} 7662e5b6d6dSopenharmony_ci 7672e5b6d6dSopenharmony_ci} // namespace 7682e5b6d6dSopenharmony_ci 7692e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 7702e5b6d6dSopenharmony_ci// Property set API 7712e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 7722e5b6d6dSopenharmony_ci 7732e5b6d6dSopenharmony_ci#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \ 7742e5b6d6dSopenharmony_ci ec=U_ILLEGAL_ARGUMENT_ERROR; \ 7752e5b6d6dSopenharmony_ci return *this; \ 7762e5b6d6dSopenharmony_ci} UPRV_BLOCK_MACRO_END 7772e5b6d6dSopenharmony_ci 7782e5b6d6dSopenharmony_ciUnicodeSet& 7792e5b6d6dSopenharmony_ciUnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 7802e5b6d6dSopenharmony_ci if (U_FAILURE(ec) || isFrozen()) { return *this; } 7812e5b6d6dSopenharmony_ci if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 7822e5b6d6dSopenharmony_ci const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); 7832e5b6d6dSopenharmony_ci applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); 7842e5b6d6dSopenharmony_ci } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { 7852e5b6d6dSopenharmony_ci const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); 7862e5b6d6dSopenharmony_ci UScriptCode script = (UScriptCode)value; 7872e5b6d6dSopenharmony_ci applyFilter(scriptExtensionsFilter, &script, inclusions, ec); 7882e5b6d6dSopenharmony_ci } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) { 7892e5b6d6dSopenharmony_ci if (value == 0 || value == 1) { 7902e5b6d6dSopenharmony_ci const USet *set = u_getBinaryPropertySet(prop, &ec); 7912e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) { return *this; } 7922e5b6d6dSopenharmony_ci copyFrom(*UnicodeSet::fromUSet(set), true); 7932e5b6d6dSopenharmony_ci if (value == 0) { 7942e5b6d6dSopenharmony_ci complement().removeAllStrings(); // code point complement 7952e5b6d6dSopenharmony_ci } 7962e5b6d6dSopenharmony_ci } else { 7972e5b6d6dSopenharmony_ci clear(); 7982e5b6d6dSopenharmony_ci } 7992e5b6d6dSopenharmony_ci } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { 8002e5b6d6dSopenharmony_ci const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); 8012e5b6d6dSopenharmony_ci IntPropertyContext c = {prop, value}; 8022e5b6d6dSopenharmony_ci applyFilter(intPropertyFilter, &c, inclusions, ec); 8032e5b6d6dSopenharmony_ci } else { 8042e5b6d6dSopenharmony_ci ec = U_ILLEGAL_ARGUMENT_ERROR; 8052e5b6d6dSopenharmony_ci } 8062e5b6d6dSopenharmony_ci return *this; 8072e5b6d6dSopenharmony_ci} 8082e5b6d6dSopenharmony_ci 8092e5b6d6dSopenharmony_ciUnicodeSet& 8102e5b6d6dSopenharmony_ciUnicodeSet::applyPropertyAlias(const UnicodeString& prop, 8112e5b6d6dSopenharmony_ci const UnicodeString& value, 8122e5b6d6dSopenharmony_ci UErrorCode& ec) { 8132e5b6d6dSopenharmony_ci if (U_FAILURE(ec) || isFrozen()) return *this; 8142e5b6d6dSopenharmony_ci 8152e5b6d6dSopenharmony_ci // prop and value used to be converted to char * using the default 8162e5b6d6dSopenharmony_ci // converter instead of the invariant conversion. 8172e5b6d6dSopenharmony_ci // This should not be necessary because all Unicode property and value 8182e5b6d6dSopenharmony_ci // names use only invariant characters. 8192e5b6d6dSopenharmony_ci // If there are any variant characters, then we won't find them anyway. 8202e5b6d6dSopenharmony_ci // Checking first avoids assertion failures in the conversion. 8212e5b6d6dSopenharmony_ci if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 8222e5b6d6dSopenharmony_ci !uprv_isInvariantUString(value.getBuffer(), value.length()) 8232e5b6d6dSopenharmony_ci ) { 8242e5b6d6dSopenharmony_ci FAIL(ec); 8252e5b6d6dSopenharmony_ci } 8262e5b6d6dSopenharmony_ci CharString pname, vname; 8272e5b6d6dSopenharmony_ci pname.appendInvariantChars(prop, ec); 8282e5b6d6dSopenharmony_ci vname.appendInvariantChars(value, ec); 8292e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return *this; 8302e5b6d6dSopenharmony_ci 8312e5b6d6dSopenharmony_ci UProperty p; 8322e5b6d6dSopenharmony_ci int32_t v; 8332e5b6d6dSopenharmony_ci UBool invert = false; 8342e5b6d6dSopenharmony_ci 8352e5b6d6dSopenharmony_ci if (value.length() > 0) { 8362e5b6d6dSopenharmony_ci p = u_getPropertyEnum(pname.data()); 8372e5b6d6dSopenharmony_ci if (p == UCHAR_INVALID_CODE) FAIL(ec); 8382e5b6d6dSopenharmony_ci 8392e5b6d6dSopenharmony_ci // Treat gc as gcm 8402e5b6d6dSopenharmony_ci if (p == UCHAR_GENERAL_CATEGORY) { 8412e5b6d6dSopenharmony_ci p = UCHAR_GENERAL_CATEGORY_MASK; 8422e5b6d6dSopenharmony_ci } 8432e5b6d6dSopenharmony_ci 8442e5b6d6dSopenharmony_ci if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 8452e5b6d6dSopenharmony_ci (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 8462e5b6d6dSopenharmony_ci (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 8472e5b6d6dSopenharmony_ci v = u_getPropertyValueEnum(p, vname.data()); 8482e5b6d6dSopenharmony_ci if (v == UCHAR_INVALID_CODE) { 8492e5b6d6dSopenharmony_ci // Handle numeric CCC 8502e5b6d6dSopenharmony_ci if (p == UCHAR_CANONICAL_COMBINING_CLASS || 8512e5b6d6dSopenharmony_ci p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 8522e5b6d6dSopenharmony_ci p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 8532e5b6d6dSopenharmony_ci char* end; 8542e5b6d6dSopenharmony_ci double val = uprv_strtod(vname.data(), &end); 8552e5b6d6dSopenharmony_ci // Anything between 0 and 255 is valid even if unused. 8562e5b6d6dSopenharmony_ci // Cast double->int only after range check. 8572e5b6d6dSopenharmony_ci // We catch NaN here because comparing it with both 0 and 255 will be false 8582e5b6d6dSopenharmony_ci // (as are all comparisons with NaN). 8592e5b6d6dSopenharmony_ci if (*end != 0 || !(0 <= val && val <= 255) || 8602e5b6d6dSopenharmony_ci (v = (int32_t)val) != val) { 8612e5b6d6dSopenharmony_ci // non-integral value or outside 0..255, or trailing junk 8622e5b6d6dSopenharmony_ci FAIL(ec); 8632e5b6d6dSopenharmony_ci } 8642e5b6d6dSopenharmony_ci } else { 8652e5b6d6dSopenharmony_ci FAIL(ec); 8662e5b6d6dSopenharmony_ci } 8672e5b6d6dSopenharmony_ci } 8682e5b6d6dSopenharmony_ci } 8692e5b6d6dSopenharmony_ci 8702e5b6d6dSopenharmony_ci else { 8712e5b6d6dSopenharmony_ci 8722e5b6d6dSopenharmony_ci switch (p) { 8732e5b6d6dSopenharmony_ci case UCHAR_NUMERIC_VALUE: 8742e5b6d6dSopenharmony_ci { 8752e5b6d6dSopenharmony_ci char* end; 8762e5b6d6dSopenharmony_ci double val = uprv_strtod(vname.data(), &end); 8772e5b6d6dSopenharmony_ci if (*end != 0) { 8782e5b6d6dSopenharmony_ci FAIL(ec); 8792e5b6d6dSopenharmony_ci } 8802e5b6d6dSopenharmony_ci applyFilter(numericValueFilter, &val, 8812e5b6d6dSopenharmony_ci CharacterProperties::getInclusionsForProperty(p, ec), ec); 8822e5b6d6dSopenharmony_ci return *this; 8832e5b6d6dSopenharmony_ci } 8842e5b6d6dSopenharmony_ci case UCHAR_NAME: 8852e5b6d6dSopenharmony_ci { 8862e5b6d6dSopenharmony_ci // Must munge name, since u_charFromName() does not do 8872e5b6d6dSopenharmony_ci // 'loose' matching. 8882e5b6d6dSopenharmony_ci char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 8892e5b6d6dSopenharmony_ci if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 8902e5b6d6dSopenharmony_ci UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); 8912e5b6d6dSopenharmony_ci if (U_SUCCESS(ec)) { 8922e5b6d6dSopenharmony_ci clear(); 8932e5b6d6dSopenharmony_ci add(ch); 8942e5b6d6dSopenharmony_ci return *this; 8952e5b6d6dSopenharmony_ci } else { 8962e5b6d6dSopenharmony_ci FAIL(ec); 8972e5b6d6dSopenharmony_ci } 8982e5b6d6dSopenharmony_ci } 8992e5b6d6dSopenharmony_ci case UCHAR_UNICODE_1_NAME: 9002e5b6d6dSopenharmony_ci // ICU 49 deprecates the Unicode_1_Name property APIs. 9012e5b6d6dSopenharmony_ci FAIL(ec); 9022e5b6d6dSopenharmony_ci case UCHAR_AGE: 9032e5b6d6dSopenharmony_ci { 9042e5b6d6dSopenharmony_ci // Must munge name, since u_versionFromString() does not do 9052e5b6d6dSopenharmony_ci // 'loose' matching. 9062e5b6d6dSopenharmony_ci char buf[128]; 9072e5b6d6dSopenharmony_ci if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 9082e5b6d6dSopenharmony_ci UVersionInfo version; 9092e5b6d6dSopenharmony_ci u_versionFromString(version, buf); 9102e5b6d6dSopenharmony_ci applyFilter(versionFilter, &version, 9112e5b6d6dSopenharmony_ci CharacterProperties::getInclusionsForProperty(p, ec), ec); 9122e5b6d6dSopenharmony_ci return *this; 9132e5b6d6dSopenharmony_ci } 9142e5b6d6dSopenharmony_ci case UCHAR_SCRIPT_EXTENSIONS: 9152e5b6d6dSopenharmony_ci v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); 9162e5b6d6dSopenharmony_ci if (v == UCHAR_INVALID_CODE) { 9172e5b6d6dSopenharmony_ci FAIL(ec); 9182e5b6d6dSopenharmony_ci } 9192e5b6d6dSopenharmony_ci // fall through to calling applyIntPropertyValue() 9202e5b6d6dSopenharmony_ci break; 9212e5b6d6dSopenharmony_ci default: 9222e5b6d6dSopenharmony_ci // p is a non-binary, non-enumerated property that we 9232e5b6d6dSopenharmony_ci // don't support (yet). 9242e5b6d6dSopenharmony_ci FAIL(ec); 9252e5b6d6dSopenharmony_ci } 9262e5b6d6dSopenharmony_ci } 9272e5b6d6dSopenharmony_ci } 9282e5b6d6dSopenharmony_ci 9292e5b6d6dSopenharmony_ci else { 9302e5b6d6dSopenharmony_ci // value is empty. Interpret as General Category, Script, or 9312e5b6d6dSopenharmony_ci // Binary property. 9322e5b6d6dSopenharmony_ci p = UCHAR_GENERAL_CATEGORY_MASK; 9332e5b6d6dSopenharmony_ci v = u_getPropertyValueEnum(p, pname.data()); 9342e5b6d6dSopenharmony_ci if (v == UCHAR_INVALID_CODE) { 9352e5b6d6dSopenharmony_ci p = UCHAR_SCRIPT; 9362e5b6d6dSopenharmony_ci v = u_getPropertyValueEnum(p, pname.data()); 9372e5b6d6dSopenharmony_ci if (v == UCHAR_INVALID_CODE) { 9382e5b6d6dSopenharmony_ci p = u_getPropertyEnum(pname.data()); 9392e5b6d6dSopenharmony_ci if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 9402e5b6d6dSopenharmony_ci v = 1; 9412e5b6d6dSopenharmony_ci } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { 9422e5b6d6dSopenharmony_ci set(MIN_VALUE, MAX_VALUE); 9432e5b6d6dSopenharmony_ci return *this; 9442e5b6d6dSopenharmony_ci } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { 9452e5b6d6dSopenharmony_ci set(0, 0x7F); 9462e5b6d6dSopenharmony_ci return *this; 9472e5b6d6dSopenharmony_ci } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { 9482e5b6d6dSopenharmony_ci // [:Assigned:]=[:^Cn:] 9492e5b6d6dSopenharmony_ci p = UCHAR_GENERAL_CATEGORY_MASK; 9502e5b6d6dSopenharmony_ci v = U_GC_CN_MASK; 9512e5b6d6dSopenharmony_ci invert = true; 9522e5b6d6dSopenharmony_ci } else { 9532e5b6d6dSopenharmony_ci FAIL(ec); 9542e5b6d6dSopenharmony_ci } 9552e5b6d6dSopenharmony_ci } 9562e5b6d6dSopenharmony_ci } 9572e5b6d6dSopenharmony_ci } 9582e5b6d6dSopenharmony_ci 9592e5b6d6dSopenharmony_ci applyIntPropertyValue(p, v, ec); 9602e5b6d6dSopenharmony_ci if(invert) { 9612e5b6d6dSopenharmony_ci complement().removeAllStrings(); // code point complement 9622e5b6d6dSopenharmony_ci } 9632e5b6d6dSopenharmony_ci 9642e5b6d6dSopenharmony_ci if (isBogus() && U_SUCCESS(ec)) { 9652e5b6d6dSopenharmony_ci // We likely ran out of memory. AHHH! 9662e5b6d6dSopenharmony_ci ec = U_MEMORY_ALLOCATION_ERROR; 9672e5b6d6dSopenharmony_ci } 9682e5b6d6dSopenharmony_ci return *this; 9692e5b6d6dSopenharmony_ci} 9702e5b6d6dSopenharmony_ci 9712e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 9722e5b6d6dSopenharmony_ci// Property set patterns 9732e5b6d6dSopenharmony_ci//---------------------------------------------------------------- 9742e5b6d6dSopenharmony_ci 9752e5b6d6dSopenharmony_ci/** 9762e5b6d6dSopenharmony_ci * Return true if the given position, in the given pattern, appears 9772e5b6d6dSopenharmony_ci * to be the start of a property set pattern. 9782e5b6d6dSopenharmony_ci */ 9792e5b6d6dSopenharmony_ciUBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 9802e5b6d6dSopenharmony_ci int32_t pos) { 9812e5b6d6dSopenharmony_ci // Patterns are at least 5 characters long 9822e5b6d6dSopenharmony_ci if ((pos+5) > pattern.length()) { 9832e5b6d6dSopenharmony_ci return false; 9842e5b6d6dSopenharmony_ci } 9852e5b6d6dSopenharmony_ci 9862e5b6d6dSopenharmony_ci // Look for an opening [:, [:^, \p, or \P 9872e5b6d6dSopenharmony_ci return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 9882e5b6d6dSopenharmony_ci} 9892e5b6d6dSopenharmony_ci 9902e5b6d6dSopenharmony_ci/** 9912e5b6d6dSopenharmony_ci * Return true if the given iterator appears to point at a 9922e5b6d6dSopenharmony_ci * property pattern. Regardless of the result, return with the 9932e5b6d6dSopenharmony_ci * iterator unchanged. 9942e5b6d6dSopenharmony_ci * @param chars iterator over the pattern characters. Upon return 9952e5b6d6dSopenharmony_ci * it will be unchanged. 9962e5b6d6dSopenharmony_ci * @param iterOpts RuleCharacterIterator options 9972e5b6d6dSopenharmony_ci */ 9982e5b6d6dSopenharmony_ciUBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 9992e5b6d6dSopenharmony_ci int32_t iterOpts) { 10002e5b6d6dSopenharmony_ci // NOTE: literal will always be false, because we don't parse escapes. 10012e5b6d6dSopenharmony_ci UBool result = false, literal; 10022e5b6d6dSopenharmony_ci UErrorCode ec = U_ZERO_ERROR; 10032e5b6d6dSopenharmony_ci iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 10042e5b6d6dSopenharmony_ci RuleCharacterIterator::Pos pos; 10052e5b6d6dSopenharmony_ci chars.getPos(pos); 10062e5b6d6dSopenharmony_ci UChar32 c = chars.next(iterOpts, literal, ec); 10072e5b6d6dSopenharmony_ci if (c == u'[' || c == u'\\') { 10082e5b6d6dSopenharmony_ci UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 10092e5b6d6dSopenharmony_ci literal, ec); 10102e5b6d6dSopenharmony_ci result = (c == u'[') ? (d == u':') : 10112e5b6d6dSopenharmony_ci (d == u'N' || d == u'p' || d == u'P'); 10122e5b6d6dSopenharmony_ci } 10132e5b6d6dSopenharmony_ci chars.setPos(pos); 10142e5b6d6dSopenharmony_ci return result && U_SUCCESS(ec); 10152e5b6d6dSopenharmony_ci} 10162e5b6d6dSopenharmony_ci 10172e5b6d6dSopenharmony_ci/** 10182e5b6d6dSopenharmony_ci * Parse the given property pattern at the given parse position. 10192e5b6d6dSopenharmony_ci */ 10202e5b6d6dSopenharmony_ciUnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 10212e5b6d6dSopenharmony_ci ParsePosition& ppos, 10222e5b6d6dSopenharmony_ci UErrorCode &ec) { 10232e5b6d6dSopenharmony_ci int32_t pos = ppos.getIndex(); 10242e5b6d6dSopenharmony_ci 10252e5b6d6dSopenharmony_ci UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 10262e5b6d6dSopenharmony_ci UBool isName = false; // true for \N{pat}, o/w false 10272e5b6d6dSopenharmony_ci UBool invert = false; 10282e5b6d6dSopenharmony_ci 10292e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return *this; 10302e5b6d6dSopenharmony_ci 10312e5b6d6dSopenharmony_ci // Minimum length is 5 characters, e.g. \p{L} 10322e5b6d6dSopenharmony_ci if ((pos+5) > pattern.length()) { 10332e5b6d6dSopenharmony_ci FAIL(ec); 10342e5b6d6dSopenharmony_ci } 10352e5b6d6dSopenharmony_ci 10362e5b6d6dSopenharmony_ci // On entry, ppos should point to one of the following locations: 10372e5b6d6dSopenharmony_ci // Look for an opening [:, [:^, \p, or \P 10382e5b6d6dSopenharmony_ci if (isPOSIXOpen(pattern, pos)) { 10392e5b6d6dSopenharmony_ci posix = true; 10402e5b6d6dSopenharmony_ci pos += 2; 10412e5b6d6dSopenharmony_ci pos = ICU_Utility::skipWhitespace(pattern, pos); 10422e5b6d6dSopenharmony_ci if (pos < pattern.length() && pattern.charAt(pos) == u'^') { 10432e5b6d6dSopenharmony_ci ++pos; 10442e5b6d6dSopenharmony_ci invert = true; 10452e5b6d6dSopenharmony_ci } 10462e5b6d6dSopenharmony_ci } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 10472e5b6d6dSopenharmony_ci UChar c = pattern.charAt(pos+1); 10482e5b6d6dSopenharmony_ci invert = (c == u'P'); 10492e5b6d6dSopenharmony_ci isName = (c == u'N'); 10502e5b6d6dSopenharmony_ci pos += 2; 10512e5b6d6dSopenharmony_ci pos = ICU_Utility::skipWhitespace(pattern, pos); 10522e5b6d6dSopenharmony_ci if (pos == pattern.length() || pattern.charAt(pos++) != u'{') { 10532e5b6d6dSopenharmony_ci // Syntax error; "\p" or "\P" not followed by "{" 10542e5b6d6dSopenharmony_ci FAIL(ec); 10552e5b6d6dSopenharmony_ci } 10562e5b6d6dSopenharmony_ci } else { 10572e5b6d6dSopenharmony_ci // Open delimiter not seen 10582e5b6d6dSopenharmony_ci FAIL(ec); 10592e5b6d6dSopenharmony_ci } 10602e5b6d6dSopenharmony_ci 10612e5b6d6dSopenharmony_ci // Look for the matching close delimiter, either :] or } 10622e5b6d6dSopenharmony_ci int32_t close; 10632e5b6d6dSopenharmony_ci if (posix) { 10642e5b6d6dSopenharmony_ci close = pattern.indexOf(u":]", 2, pos); 10652e5b6d6dSopenharmony_ci } else { 10662e5b6d6dSopenharmony_ci close = pattern.indexOf(u'}', pos); 10672e5b6d6dSopenharmony_ci } 10682e5b6d6dSopenharmony_ci if (close < 0) { 10692e5b6d6dSopenharmony_ci // Syntax error; close delimiter missing 10702e5b6d6dSopenharmony_ci FAIL(ec); 10712e5b6d6dSopenharmony_ci } 10722e5b6d6dSopenharmony_ci 10732e5b6d6dSopenharmony_ci // Look for an '=' sign. If this is present, we will parse a 10742e5b6d6dSopenharmony_ci // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 10752e5b6d6dSopenharmony_ci // pattern. 10762e5b6d6dSopenharmony_ci int32_t equals = pattern.indexOf(u'=', pos); 10772e5b6d6dSopenharmony_ci UnicodeString propName, valueName; 10782e5b6d6dSopenharmony_ci if (equals >= 0 && equals < close && !isName) { 10792e5b6d6dSopenharmony_ci // Equals seen; parse medium/long pattern 10802e5b6d6dSopenharmony_ci pattern.extractBetween(pos, equals, propName); 10812e5b6d6dSopenharmony_ci pattern.extractBetween(equals+1, close, valueName); 10822e5b6d6dSopenharmony_ci } 10832e5b6d6dSopenharmony_ci 10842e5b6d6dSopenharmony_ci else { 10852e5b6d6dSopenharmony_ci // Handle case where no '=' is seen, and \N{} 10862e5b6d6dSopenharmony_ci pattern.extractBetween(pos, close, propName); 10872e5b6d6dSopenharmony_ci 10882e5b6d6dSopenharmony_ci // Handle \N{name} 10892e5b6d6dSopenharmony_ci if (isName) { 10902e5b6d6dSopenharmony_ci // This is a little inefficient since it means we have to 10912e5b6d6dSopenharmony_ci // parse NAME_PROP back to UCHAR_NAME even though we already 10922e5b6d6dSopenharmony_ci // know it's UCHAR_NAME. If we refactor the API to 10932e5b6d6dSopenharmony_ci // support args of (UProperty, char*) then we can remove 10942e5b6d6dSopenharmony_ci // NAME_PROP and make this a little more efficient. 10952e5b6d6dSopenharmony_ci valueName = propName; 10962e5b6d6dSopenharmony_ci propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 10972e5b6d6dSopenharmony_ci } 10982e5b6d6dSopenharmony_ci } 10992e5b6d6dSopenharmony_ci 11002e5b6d6dSopenharmony_ci applyPropertyAlias(propName, valueName, ec); 11012e5b6d6dSopenharmony_ci 11022e5b6d6dSopenharmony_ci if (U_SUCCESS(ec)) { 11032e5b6d6dSopenharmony_ci if (invert) { 11042e5b6d6dSopenharmony_ci complement().removeAllStrings(); // code point complement 11052e5b6d6dSopenharmony_ci } 11062e5b6d6dSopenharmony_ci 11072e5b6d6dSopenharmony_ci // Move to the limit position after the close delimiter if the 11082e5b6d6dSopenharmony_ci // parse succeeded. 11092e5b6d6dSopenharmony_ci ppos.setIndex(close + (posix ? 2 : 1)); 11102e5b6d6dSopenharmony_ci } 11112e5b6d6dSopenharmony_ci 11122e5b6d6dSopenharmony_ci return *this; 11132e5b6d6dSopenharmony_ci} 11142e5b6d6dSopenharmony_ci 11152e5b6d6dSopenharmony_ci/** 11162e5b6d6dSopenharmony_ci * Parse a property pattern. 11172e5b6d6dSopenharmony_ci * @param chars iterator over the pattern characters. Upon return 11182e5b6d6dSopenharmony_ci * it will be advanced to the first character after the parsed 11192e5b6d6dSopenharmony_ci * pattern, or the end of the iteration if all characters are 11202e5b6d6dSopenharmony_ci * parsed. 11212e5b6d6dSopenharmony_ci * @param rebuiltPat the pattern that was parsed, rebuilt or 11222e5b6d6dSopenharmony_ci * copied from the input pattern, as appropriate. 11232e5b6d6dSopenharmony_ci */ 11242e5b6d6dSopenharmony_civoid UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 11252e5b6d6dSopenharmony_ci UnicodeString& rebuiltPat, 11262e5b6d6dSopenharmony_ci UErrorCode& ec) { 11272e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 11282e5b6d6dSopenharmony_ci UnicodeString pattern; 11292e5b6d6dSopenharmony_ci chars.lookahead(pattern); 11302e5b6d6dSopenharmony_ci ParsePosition pos(0); 11312e5b6d6dSopenharmony_ci applyPropertyPattern(pattern, pos, ec); 11322e5b6d6dSopenharmony_ci if (U_FAILURE(ec)) return; 11332e5b6d6dSopenharmony_ci if (pos.getIndex() == 0) { 11342e5b6d6dSopenharmony_ci // syntaxError(chars, "Invalid property pattern"); 11352e5b6d6dSopenharmony_ci ec = U_MALFORMED_SET; 11362e5b6d6dSopenharmony_ci return; 11372e5b6d6dSopenharmony_ci } 11382e5b6d6dSopenharmony_ci chars.jumpahead(pos.getIndex()); 11392e5b6d6dSopenharmony_ci rebuiltPat.append(pattern, 0, pos.getIndex()); 11402e5b6d6dSopenharmony_ci} 11412e5b6d6dSopenharmony_ci 11422e5b6d6dSopenharmony_ciU_NAMESPACE_END 1143