12e5b6d6dSopenharmony_ci// © 2016 and later: Unicode, Inc. and others.
22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
32e5b6d6dSopenharmony_ci/*
42e5b6d6dSopenharmony_ci*******************************************************************************
52e5b6d6dSopenharmony_ci*
62e5b6d6dSopenharmony_ci*   Copyright (C) 1999-2014, International Business Machines
72e5b6d6dSopenharmony_ci*   Corporation and others.  All Rights Reserved.
82e5b6d6dSopenharmony_ci*
92e5b6d6dSopenharmony_ci*******************************************************************************
102e5b6d6dSopenharmony_ci*   file name:  uniset_props.cpp
112e5b6d6dSopenharmony_ci*   encoding:   UTF-8
122e5b6d6dSopenharmony_ci*   tab size:   8 (not used)
132e5b6d6dSopenharmony_ci*   indentation:4
142e5b6d6dSopenharmony_ci*
152e5b6d6dSopenharmony_ci*   created on: 2004aug25
162e5b6d6dSopenharmony_ci*   created by: Markus W. Scherer
172e5b6d6dSopenharmony_ci*
182e5b6d6dSopenharmony_ci*   Character property dependent functions moved here from uniset.cpp
192e5b6d6dSopenharmony_ci*/
202e5b6d6dSopenharmony_ci
212e5b6d6dSopenharmony_ci#include "unicode/utypes.h"
222e5b6d6dSopenharmony_ci#include "unicode/uniset.h"
232e5b6d6dSopenharmony_ci#include "unicode/parsepos.h"
242e5b6d6dSopenharmony_ci#include "unicode/uchar.h"
252e5b6d6dSopenharmony_ci#include "unicode/uscript.h"
262e5b6d6dSopenharmony_ci#include "unicode/symtable.h"
272e5b6d6dSopenharmony_ci#include "unicode/uset.h"
282e5b6d6dSopenharmony_ci#include "unicode/locid.h"
292e5b6d6dSopenharmony_ci#include "unicode/brkiter.h"
302e5b6d6dSopenharmony_ci#include "uset_imp.h"
312e5b6d6dSopenharmony_ci#include "ruleiter.h"
322e5b6d6dSopenharmony_ci#include "cmemory.h"
332e5b6d6dSopenharmony_ci#include "ucln_cmn.h"
342e5b6d6dSopenharmony_ci#include "util.h"
352e5b6d6dSopenharmony_ci#include "uvector.h"
362e5b6d6dSopenharmony_ci#include "uprops.h"
372e5b6d6dSopenharmony_ci#include "propname.h"
382e5b6d6dSopenharmony_ci#include "normalizer2impl.h"
392e5b6d6dSopenharmony_ci#include "uinvchar.h"
402e5b6d6dSopenharmony_ci#include "uprops.h"
412e5b6d6dSopenharmony_ci#include "charstr.h"
422e5b6d6dSopenharmony_ci#include "cstring.h"
432e5b6d6dSopenharmony_ci#include "mutex.h"
442e5b6d6dSopenharmony_ci#include "umutex.h"
452e5b6d6dSopenharmony_ci#include "uassert.h"
462e5b6d6dSopenharmony_ci#include "hash.h"
472e5b6d6dSopenharmony_ci
482e5b6d6dSopenharmony_ciU_NAMESPACE_USE
492e5b6d6dSopenharmony_ci
// Names of the special property sets, each followed by the set pattern it
// stands for.
static constexpr char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static constexpr char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Short alias of the Unicode name property, and the number of characters
// in that alias.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
582e5b6d6dSopenharmony_ci
592e5b6d6dSopenharmony_ci// Cached sets ------------------------------------------------------------- ***
602e5b6d6dSopenharmony_ci
612e5b6d6dSopenharmony_ciU_CDECL_BEGIN
622e5b6d6dSopenharmony_cistatic UBool U_CALLCONV uset_cleanup();
632e5b6d6dSopenharmony_ci
642e5b6d6dSopenharmony_cistatic UnicodeSet *uni32Singleton;
652e5b6d6dSopenharmony_cistatic icu::UInitOnce uni32InitOnce {};
662e5b6d6dSopenharmony_ci
672e5b6d6dSopenharmony_ci/**
682e5b6d6dSopenharmony_ci * Cleanup function for UnicodeSet
692e5b6d6dSopenharmony_ci */
702e5b6d6dSopenharmony_cistatic UBool U_CALLCONV uset_cleanup(void) {
712e5b6d6dSopenharmony_ci    delete uni32Singleton;
722e5b6d6dSopenharmony_ci    uni32Singleton = NULL;
732e5b6d6dSopenharmony_ci    uni32InitOnce.reset();
742e5b6d6dSopenharmony_ci    return true;
752e5b6d6dSopenharmony_ci}
762e5b6d6dSopenharmony_ci
772e5b6d6dSopenharmony_ciU_CDECL_END
782e5b6d6dSopenharmony_ci
792e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN
802e5b6d6dSopenharmony_ci
812e5b6d6dSopenharmony_cinamespace {
822e5b6d6dSopenharmony_ci
832e5b6d6dSopenharmony_ci// Cache some sets for other services -------------------------------------- ***
842e5b6d6dSopenharmony_civoid U_CALLCONV createUni32Set(UErrorCode &errorCode) {
852e5b6d6dSopenharmony_ci    U_ASSERT(uni32Singleton == NULL);
862e5b6d6dSopenharmony_ci    uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
872e5b6d6dSopenharmony_ci    if(uni32Singleton==NULL) {
882e5b6d6dSopenharmony_ci        errorCode=U_MEMORY_ALLOCATION_ERROR;
892e5b6d6dSopenharmony_ci    } else {
902e5b6d6dSopenharmony_ci        uni32Singleton->freeze();
912e5b6d6dSopenharmony_ci    }
922e5b6d6dSopenharmony_ci    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
932e5b6d6dSopenharmony_ci}
942e5b6d6dSopenharmony_ci
952e5b6d6dSopenharmony_ci
962e5b6d6dSopenharmony_ciU_CFUNC UnicodeSet *
972e5b6d6dSopenharmony_ciuniset_getUnicode32Instance(UErrorCode &errorCode) {
982e5b6d6dSopenharmony_ci    umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
992e5b6d6dSopenharmony_ci    return uni32Singleton;
1002e5b6d6dSopenharmony_ci}
1012e5b6d6dSopenharmony_ci
1022e5b6d6dSopenharmony_ci// helper functions for matching of pattern syntax pieces ------------------ ***
// these functions match the opening delimiters \p, \P, \N and [: directly,
// code unit by code unit (no PERL_OPEN etc. strings are defined above)
1042e5b6d6dSopenharmony_ci
1052e5b6d6dSopenharmony_ci// using these functions is not only faster than UnicodeString::compare() and
1062e5b6d6dSopenharmony_ci// caseCompare(), but they also make UnicodeSet work for simple patterns when
1072e5b6d6dSopenharmony_ci// no Unicode properties data is available - when caseCompare() fails
1082e5b6d6dSopenharmony_ci
1092e5b6d6dSopenharmony_cistatic inline UBool
1102e5b6d6dSopenharmony_ciisPerlOpen(const UnicodeString &pattern, int32_t pos) {
1112e5b6d6dSopenharmony_ci    UChar c;
1122e5b6d6dSopenharmony_ci    return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
1132e5b6d6dSopenharmony_ci}
1142e5b6d6dSopenharmony_ci
1152e5b6d6dSopenharmony_ci/*static inline UBool
1162e5b6d6dSopenharmony_ciisPerlClose(const UnicodeString &pattern, int32_t pos) {
1172e5b6d6dSopenharmony_ci    return pattern.charAt(pos)==u'}';
1182e5b6d6dSopenharmony_ci}*/
1192e5b6d6dSopenharmony_ci
1202e5b6d6dSopenharmony_cistatic inline UBool
1212e5b6d6dSopenharmony_ciisNameOpen(const UnicodeString &pattern, int32_t pos) {
1222e5b6d6dSopenharmony_ci    return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
1232e5b6d6dSopenharmony_ci}
1242e5b6d6dSopenharmony_ci
1252e5b6d6dSopenharmony_cistatic inline UBool
1262e5b6d6dSopenharmony_ciisPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
1272e5b6d6dSopenharmony_ci    return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
1282e5b6d6dSopenharmony_ci}
1292e5b6d6dSopenharmony_ci
1302e5b6d6dSopenharmony_ci/*static inline UBool
1312e5b6d6dSopenharmony_ciisPOSIXClose(const UnicodeString &pattern, int32_t pos) {
1322e5b6d6dSopenharmony_ci    return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
1332e5b6d6dSopenharmony_ci}*/
1342e5b6d6dSopenharmony_ci
// TODO: uniset.cpp provides memory debugging that could also be made
// available here; it is probably obsolete now that modern memory leak
// checker tools exist.
// Until then this hook is a no-op: the argument is discarded.
#define _dbgct(me) ((void)0)
1392e5b6d6dSopenharmony_ci
1402e5b6d6dSopenharmony_ci}  // namespace
1412e5b6d6dSopenharmony_ci
1422e5b6d6dSopenharmony_ci//----------------------------------------------------------------
1432e5b6d6dSopenharmony_ci// Constructors &c
1442e5b6d6dSopenharmony_ci//----------------------------------------------------------------
1452e5b6d6dSopenharmony_ci
1462e5b6d6dSopenharmony_ci/**
1472e5b6d6dSopenharmony_ci * Constructs a set from the given pattern, optionally ignoring
1482e5b6d6dSopenharmony_ci * white space.  See the class description for the syntax of the
1492e5b6d6dSopenharmony_ci * pattern language.
1502e5b6d6dSopenharmony_ci * @param pattern a string specifying what characters are in the set
1512e5b6d6dSopenharmony_ci */
1522e5b6d6dSopenharmony_ciUnicodeSet::UnicodeSet(const UnicodeString& pattern,
1532e5b6d6dSopenharmony_ci                       UErrorCode& status) {
1542e5b6d6dSopenharmony_ci    applyPattern(pattern, status);
1552e5b6d6dSopenharmony_ci    _dbgct(this);
1562e5b6d6dSopenharmony_ci}
1572e5b6d6dSopenharmony_ci
1582e5b6d6dSopenharmony_ci//----------------------------------------------------------------
1592e5b6d6dSopenharmony_ci// Public API
1602e5b6d6dSopenharmony_ci//----------------------------------------------------------------
1612e5b6d6dSopenharmony_ci
1622e5b6d6dSopenharmony_ciUnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
1632e5b6d6dSopenharmony_ci                                     UErrorCode& status) {
1642e5b6d6dSopenharmony_ci    // Equivalent to
1652e5b6d6dSopenharmony_ci    //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
1662e5b6d6dSopenharmony_ci    // but without dependency on closeOver().
1672e5b6d6dSopenharmony_ci    ParsePosition pos(0);
1682e5b6d6dSopenharmony_ci    applyPatternIgnoreSpace(pattern, pos, NULL, status);
1692e5b6d6dSopenharmony_ci    if (U_FAILURE(status)) return *this;
1702e5b6d6dSopenharmony_ci
1712e5b6d6dSopenharmony_ci    int32_t i = pos.getIndex();
1722e5b6d6dSopenharmony_ci    // Skip over trailing whitespace
1732e5b6d6dSopenharmony_ci    ICU_Utility::skipWhitespace(pattern, i, true);
1742e5b6d6dSopenharmony_ci    if (i != pattern.length()) {
1752e5b6d6dSopenharmony_ci        status = U_ILLEGAL_ARGUMENT_ERROR;
1762e5b6d6dSopenharmony_ci    }
1772e5b6d6dSopenharmony_ci    return *this;
1782e5b6d6dSopenharmony_ci}
1792e5b6d6dSopenharmony_ci
1802e5b6d6dSopenharmony_civoid
1812e5b6d6dSopenharmony_ciUnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
1822e5b6d6dSopenharmony_ci                                    ParsePosition& pos,
1832e5b6d6dSopenharmony_ci                                    const SymbolTable* symbols,
1842e5b6d6dSopenharmony_ci                                    UErrorCode& status) {
1852e5b6d6dSopenharmony_ci    if (U_FAILURE(status)) {
1862e5b6d6dSopenharmony_ci        return;
1872e5b6d6dSopenharmony_ci    }
1882e5b6d6dSopenharmony_ci    if (isFrozen()) {
1892e5b6d6dSopenharmony_ci        status = U_NO_WRITE_PERMISSION;
1902e5b6d6dSopenharmony_ci        return;
1912e5b6d6dSopenharmony_ci    }
1922e5b6d6dSopenharmony_ci    // Need to build the pattern in a temporary string because
1932e5b6d6dSopenharmony_ci    // _applyPattern calls add() etc., which set pat to empty.
1942e5b6d6dSopenharmony_ci    UnicodeString rebuiltPat;
1952e5b6d6dSopenharmony_ci    RuleCharacterIterator chars(pattern, symbols, pos);
1962e5b6d6dSopenharmony_ci    applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
1972e5b6d6dSopenharmony_ci    if (U_FAILURE(status)) return;
1982e5b6d6dSopenharmony_ci    if (chars.inVariable()) {
1992e5b6d6dSopenharmony_ci        // syntaxError(chars, "Extra chars in variable value");
2002e5b6d6dSopenharmony_ci        status = U_MALFORMED_SET;
2012e5b6d6dSopenharmony_ci        return;
2022e5b6d6dSopenharmony_ci    }
2032e5b6d6dSopenharmony_ci    setPattern(rebuiltPat);
2042e5b6d6dSopenharmony_ci}
2052e5b6d6dSopenharmony_ci
2062e5b6d6dSopenharmony_ci/**
2072e5b6d6dSopenharmony_ci * Return true if the given position, in the given pattern, appears
2082e5b6d6dSopenharmony_ci * to be the start of a UnicodeSet pattern.
2092e5b6d6dSopenharmony_ci */
2102e5b6d6dSopenharmony_ciUBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
2112e5b6d6dSopenharmony_ci    return ((pos+1) < pattern.length() &&
2122e5b6d6dSopenharmony_ci            pattern.charAt(pos) == (UChar)91/*[*/) ||
2132e5b6d6dSopenharmony_ci        resemblesPropertyPattern(pattern, pos);
2142e5b6d6dSopenharmony_ci}
2152e5b6d6dSopenharmony_ci
2162e5b6d6dSopenharmony_ci//----------------------------------------------------------------
2172e5b6d6dSopenharmony_ci// Implementation: Pattern parsing
2182e5b6d6dSopenharmony_ci//----------------------------------------------------------------
2192e5b6d6dSopenharmony_ci
2202e5b6d6dSopenharmony_cinamespace {
2212e5b6d6dSopenharmony_ci
2222e5b6d6dSopenharmony_ci/**
2232e5b6d6dSopenharmony_ci * A small all-inline class to manage a UnicodeSet pointer.  Add
2242e5b6d6dSopenharmony_ci * operator->() etc. as needed.
2252e5b6d6dSopenharmony_ci */
2262e5b6d6dSopenharmony_ciclass UnicodeSetPointer {
2272e5b6d6dSopenharmony_ci    UnicodeSet* p;
2282e5b6d6dSopenharmony_cipublic:
2292e5b6d6dSopenharmony_ci    inline UnicodeSetPointer() : p(0) {}
2302e5b6d6dSopenharmony_ci    inline ~UnicodeSetPointer() { delete p; }
2312e5b6d6dSopenharmony_ci    inline UnicodeSet* pointer() { return p; }
2322e5b6d6dSopenharmony_ci    inline UBool allocate() {
2332e5b6d6dSopenharmony_ci        if (p == 0) {
2342e5b6d6dSopenharmony_ci            p = new UnicodeSet();
2352e5b6d6dSopenharmony_ci        }
2362e5b6d6dSopenharmony_ci        return p != 0;
2372e5b6d6dSopenharmony_ci    }
2382e5b6d6dSopenharmony_ci};
2392e5b6d6dSopenharmony_ci
// Deepest nesting level accepted by the recursive pattern parser
// (UnicodeSet::applyPattern); a greater depth is rejected with
// U_ILLEGAL_ARGUMENT_ERROR.
constexpr int32_t MAX_DEPTH{100};
2412e5b6d6dSopenharmony_ci
2422e5b6d6dSopenharmony_ci}  // namespace
2432e5b6d6dSopenharmony_ci
2442e5b6d6dSopenharmony_ci/**
2452e5b6d6dSopenharmony_ci * Parse the pattern from the given RuleCharacterIterator.  The
2462e5b6d6dSopenharmony_ci * iterator is advanced over the parsed pattern.
2472e5b6d6dSopenharmony_ci * @param chars iterator over the pattern characters.  Upon return
2482e5b6d6dSopenharmony_ci * it will be advanced to the first character after the parsed
2492e5b6d6dSopenharmony_ci * pattern, or the end of the iteration if all characters are
2502e5b6d6dSopenharmony_ci * parsed.
2512e5b6d6dSopenharmony_ci * @param symbols symbol table to use to parse and dereference
2522e5b6d6dSopenharmony_ci * variables, or null if none.
2532e5b6d6dSopenharmony_ci * @param rebuiltPat the pattern that was parsed, rebuilt or
2542e5b6d6dSopenharmony_ci * copied from the input pattern, as appropriate.
2552e5b6d6dSopenharmony_ci * @param options a bit mask of zero or more of the following:
2562e5b6d6dSopenharmony_ci * IGNORE_SPACE, CASE.
2572e5b6d6dSopenharmony_ci */
2582e5b6d6dSopenharmony_civoid UnicodeSet::applyPattern(RuleCharacterIterator& chars,
2592e5b6d6dSopenharmony_ci                              const SymbolTable* symbols,
2602e5b6d6dSopenharmony_ci                              UnicodeString& rebuiltPat,
2612e5b6d6dSopenharmony_ci                              uint32_t options,
2622e5b6d6dSopenharmony_ci                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
2632e5b6d6dSopenharmony_ci                              int32_t depth,
2642e5b6d6dSopenharmony_ci                              UErrorCode& ec) {
2652e5b6d6dSopenharmony_ci    if (U_FAILURE(ec)) return;
2662e5b6d6dSopenharmony_ci    if (depth > MAX_DEPTH) {
2672e5b6d6dSopenharmony_ci        ec = U_ILLEGAL_ARGUMENT_ERROR;
2682e5b6d6dSopenharmony_ci        return;
2692e5b6d6dSopenharmony_ci    }
2702e5b6d6dSopenharmony_ci
2712e5b6d6dSopenharmony_ci    // Syntax characters: [ ] ^ - & { }
2722e5b6d6dSopenharmony_ci
2732e5b6d6dSopenharmony_ci    // Recognized special forms for chars, sets: c-c s-s s&s
2742e5b6d6dSopenharmony_ci
2752e5b6d6dSopenharmony_ci    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
2762e5b6d6dSopenharmony_ci                   RuleCharacterIterator::PARSE_ESCAPES;
2772e5b6d6dSopenharmony_ci    if ((options & USET_IGNORE_SPACE) != 0) {
2782e5b6d6dSopenharmony_ci        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
2792e5b6d6dSopenharmony_ci    }
2802e5b6d6dSopenharmony_ci
2812e5b6d6dSopenharmony_ci    UnicodeString patLocal, buf;
2822e5b6d6dSopenharmony_ci    UBool usePat = false;
2832e5b6d6dSopenharmony_ci    UnicodeSetPointer scratch;
2842e5b6d6dSopenharmony_ci    RuleCharacterIterator::Pos backup;
2852e5b6d6dSopenharmony_ci
2862e5b6d6dSopenharmony_ci    // mode: 0=before [, 1=between [...], 2=after ]
2872e5b6d6dSopenharmony_ci    // lastItem: 0=none, 1=char, 2=set
2882e5b6d6dSopenharmony_ci    int8_t lastItem = 0, mode = 0;
2892e5b6d6dSopenharmony_ci    UChar32 lastChar = 0;
2902e5b6d6dSopenharmony_ci    UChar op = 0;
2912e5b6d6dSopenharmony_ci
2922e5b6d6dSopenharmony_ci    UBool invert = false;
2932e5b6d6dSopenharmony_ci
2942e5b6d6dSopenharmony_ci    clear();
2952e5b6d6dSopenharmony_ci
2962e5b6d6dSopenharmony_ci    while (mode != 2 && !chars.atEnd()) {
2972e5b6d6dSopenharmony_ci        U_ASSERT((lastItem == 0 && op == 0) ||
2982e5b6d6dSopenharmony_ci                 (lastItem == 1 && (op == 0 || op == u'-')) ||
2992e5b6d6dSopenharmony_ci                 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
3002e5b6d6dSopenharmony_ci
3012e5b6d6dSopenharmony_ci        UChar32 c = 0;
3022e5b6d6dSopenharmony_ci        UBool literal = false;
3032e5b6d6dSopenharmony_ci        UnicodeSet* nested = 0; // alias - do not delete
3042e5b6d6dSopenharmony_ci
3052e5b6d6dSopenharmony_ci        // -------- Check for property pattern
3062e5b6d6dSopenharmony_ci
3072e5b6d6dSopenharmony_ci        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
3082e5b6d6dSopenharmony_ci        int8_t setMode = 0;
3092e5b6d6dSopenharmony_ci        if (resemblesPropertyPattern(chars, opts)) {
3102e5b6d6dSopenharmony_ci            setMode = 2;
3112e5b6d6dSopenharmony_ci        }
3122e5b6d6dSopenharmony_ci
3132e5b6d6dSopenharmony_ci        // -------- Parse '[' of opening delimiter OR nested set.
3142e5b6d6dSopenharmony_ci        // If there is a nested set, use `setMode' to define how
3152e5b6d6dSopenharmony_ci        // the set should be parsed.  If the '[' is part of the
3162e5b6d6dSopenharmony_ci        // opening delimiter for this pattern, parse special
3172e5b6d6dSopenharmony_ci        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
3182e5b6d6dSopenharmony_ci        // characters representing a nested set in the symbol
3192e5b6d6dSopenharmony_ci        // table.
3202e5b6d6dSopenharmony_ci
3212e5b6d6dSopenharmony_ci        else {
3222e5b6d6dSopenharmony_ci            // Prepare to backup if necessary
3232e5b6d6dSopenharmony_ci            chars.getPos(backup);
3242e5b6d6dSopenharmony_ci            c = chars.next(opts, literal, ec);
3252e5b6d6dSopenharmony_ci            if (U_FAILURE(ec)) return;
3262e5b6d6dSopenharmony_ci
3272e5b6d6dSopenharmony_ci            if (c == u'[' && !literal) {
3282e5b6d6dSopenharmony_ci                if (mode == 1) {
3292e5b6d6dSopenharmony_ci                    chars.setPos(backup); // backup
3302e5b6d6dSopenharmony_ci                    setMode = 1;
3312e5b6d6dSopenharmony_ci                } else {
3322e5b6d6dSopenharmony_ci                    // Handle opening '[' delimiter
3332e5b6d6dSopenharmony_ci                    mode = 1;
3342e5b6d6dSopenharmony_ci                    patLocal.append(u'[');
3352e5b6d6dSopenharmony_ci                    chars.getPos(backup); // prepare to backup
3362e5b6d6dSopenharmony_ci                    c = chars.next(opts, literal, ec);
3372e5b6d6dSopenharmony_ci                    if (U_FAILURE(ec)) return;
3382e5b6d6dSopenharmony_ci                    if (c == u'^' && !literal) {
3392e5b6d6dSopenharmony_ci                        invert = true;
3402e5b6d6dSopenharmony_ci                        patLocal.append(u'^');
3412e5b6d6dSopenharmony_ci                        chars.getPos(backup); // prepare to backup
3422e5b6d6dSopenharmony_ci                        c = chars.next(opts, literal, ec);
3432e5b6d6dSopenharmony_ci                        if (U_FAILURE(ec)) return;
3442e5b6d6dSopenharmony_ci                    }
3452e5b6d6dSopenharmony_ci                    // Fall through to handle special leading '-';
3462e5b6d6dSopenharmony_ci                    // otherwise restart loop for nested [], \p{}, etc.
3472e5b6d6dSopenharmony_ci                    if (c == u'-') {
3482e5b6d6dSopenharmony_ci                        literal = true;
3492e5b6d6dSopenharmony_ci                        // Fall through to handle literal '-' below
3502e5b6d6dSopenharmony_ci                    } else {
3512e5b6d6dSopenharmony_ci                        chars.setPos(backup); // backup
3522e5b6d6dSopenharmony_ci                        continue;
3532e5b6d6dSopenharmony_ci                    }
3542e5b6d6dSopenharmony_ci                }
3552e5b6d6dSopenharmony_ci            } else if (symbols != 0) {
3562e5b6d6dSopenharmony_ci                const UnicodeFunctor *m = symbols->lookupMatcher(c);
3572e5b6d6dSopenharmony_ci                if (m != 0) {
3582e5b6d6dSopenharmony_ci                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
3592e5b6d6dSopenharmony_ci                    if (ms == NULL) {
3602e5b6d6dSopenharmony_ci                        ec = U_MALFORMED_SET;
3612e5b6d6dSopenharmony_ci                        return;
3622e5b6d6dSopenharmony_ci                    }
3632e5b6d6dSopenharmony_ci                    // casting away const, but `nested' won't be modified
3642e5b6d6dSopenharmony_ci                    // (important not to modify stored set)
3652e5b6d6dSopenharmony_ci                    nested = const_cast<UnicodeSet*>(ms);
3662e5b6d6dSopenharmony_ci                    setMode = 3;
3672e5b6d6dSopenharmony_ci                }
3682e5b6d6dSopenharmony_ci            }
3692e5b6d6dSopenharmony_ci        }
3702e5b6d6dSopenharmony_ci
3712e5b6d6dSopenharmony_ci        // -------- Handle a nested set.  This either is inline in
3722e5b6d6dSopenharmony_ci        // the pattern or represented by a stand-in that has
3732e5b6d6dSopenharmony_ci        // previously been parsed and was looked up in the symbol
3742e5b6d6dSopenharmony_ci        // table.
3752e5b6d6dSopenharmony_ci
3762e5b6d6dSopenharmony_ci        if (setMode != 0) {
3772e5b6d6dSopenharmony_ci            if (lastItem == 1) {
3782e5b6d6dSopenharmony_ci                if (op != 0) {
3792e5b6d6dSopenharmony_ci                    // syntaxError(chars, "Char expected after operator");
3802e5b6d6dSopenharmony_ci                    ec = U_MALFORMED_SET;
3812e5b6d6dSopenharmony_ci                    return;
3822e5b6d6dSopenharmony_ci                }
3832e5b6d6dSopenharmony_ci                add(lastChar, lastChar);
3842e5b6d6dSopenharmony_ci                _appendToPat(patLocal, lastChar, false);
3852e5b6d6dSopenharmony_ci                lastItem = 0;
3862e5b6d6dSopenharmony_ci                op = 0;
3872e5b6d6dSopenharmony_ci            }
3882e5b6d6dSopenharmony_ci
3892e5b6d6dSopenharmony_ci            if (op == u'-' || op == u'&') {
3902e5b6d6dSopenharmony_ci                patLocal.append(op);
3912e5b6d6dSopenharmony_ci            }
3922e5b6d6dSopenharmony_ci
3932e5b6d6dSopenharmony_ci            if (nested == 0) {
3942e5b6d6dSopenharmony_ci                // lazy allocation
3952e5b6d6dSopenharmony_ci                if (!scratch.allocate()) {
3962e5b6d6dSopenharmony_ci                    ec = U_MEMORY_ALLOCATION_ERROR;
3972e5b6d6dSopenharmony_ci                    return;
3982e5b6d6dSopenharmony_ci                }
3992e5b6d6dSopenharmony_ci                nested = scratch.pointer();
4002e5b6d6dSopenharmony_ci            }
4012e5b6d6dSopenharmony_ci            switch (setMode) {
4022e5b6d6dSopenharmony_ci            case 1:
4032e5b6d6dSopenharmony_ci                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
4042e5b6d6dSopenharmony_ci                break;
4052e5b6d6dSopenharmony_ci            case 2:
4062e5b6d6dSopenharmony_ci                chars.skipIgnored(opts);
4072e5b6d6dSopenharmony_ci                nested->applyPropertyPattern(chars, patLocal, ec);
4082e5b6d6dSopenharmony_ci                if (U_FAILURE(ec)) return;
4092e5b6d6dSopenharmony_ci                break;
4102e5b6d6dSopenharmony_ci            case 3: // `nested' already parsed
4112e5b6d6dSopenharmony_ci                nested->_toPattern(patLocal, false);
4122e5b6d6dSopenharmony_ci                break;
4132e5b6d6dSopenharmony_ci            }
4142e5b6d6dSopenharmony_ci
4152e5b6d6dSopenharmony_ci            usePat = true;
4162e5b6d6dSopenharmony_ci
4172e5b6d6dSopenharmony_ci            if (mode == 0) {
4182e5b6d6dSopenharmony_ci                // Entire pattern is a category; leave parse loop
4192e5b6d6dSopenharmony_ci                *this = *nested;
4202e5b6d6dSopenharmony_ci                mode = 2;
4212e5b6d6dSopenharmony_ci                break;
4222e5b6d6dSopenharmony_ci            }
4232e5b6d6dSopenharmony_ci
4242e5b6d6dSopenharmony_ci            switch (op) {
4252e5b6d6dSopenharmony_ci            case u'-':
4262e5b6d6dSopenharmony_ci                removeAll(*nested);
4272e5b6d6dSopenharmony_ci                break;
4282e5b6d6dSopenharmony_ci            case u'&':
4292e5b6d6dSopenharmony_ci                retainAll(*nested);
4302e5b6d6dSopenharmony_ci                break;
4312e5b6d6dSopenharmony_ci            case 0:
4322e5b6d6dSopenharmony_ci                addAll(*nested);
4332e5b6d6dSopenharmony_ci                break;
4342e5b6d6dSopenharmony_ci            }
4352e5b6d6dSopenharmony_ci
4362e5b6d6dSopenharmony_ci            op = 0;
4372e5b6d6dSopenharmony_ci            lastItem = 2;
4382e5b6d6dSopenharmony_ci
4392e5b6d6dSopenharmony_ci            continue;
4402e5b6d6dSopenharmony_ci        }
4412e5b6d6dSopenharmony_ci
4422e5b6d6dSopenharmony_ci        if (mode == 0) {
4432e5b6d6dSopenharmony_ci            // syntaxError(chars, "Missing '['");
4442e5b6d6dSopenharmony_ci            ec = U_MALFORMED_SET;
4452e5b6d6dSopenharmony_ci            return;
4462e5b6d6dSopenharmony_ci        }
4472e5b6d6dSopenharmony_ci
4482e5b6d6dSopenharmony_ci        // -------- Parse special (syntax) characters.  If the
4492e5b6d6dSopenharmony_ci        // current character is not special, or if it is escaped,
4502e5b6d6dSopenharmony_ci        // then fall through and handle it below.
4512e5b6d6dSopenharmony_ci
4522e5b6d6dSopenharmony_ci        if (!literal) {
4532e5b6d6dSopenharmony_ci            switch (c) {
4542e5b6d6dSopenharmony_ci            case u']':
4552e5b6d6dSopenharmony_ci                if (lastItem == 1) {
4562e5b6d6dSopenharmony_ci                    add(lastChar, lastChar);
4572e5b6d6dSopenharmony_ci                    _appendToPat(patLocal, lastChar, false);
4582e5b6d6dSopenharmony_ci                }
4592e5b6d6dSopenharmony_ci                // Treat final trailing '-' as a literal
4602e5b6d6dSopenharmony_ci                if (op == u'-') {
4612e5b6d6dSopenharmony_ci                    add(op, op);
4622e5b6d6dSopenharmony_ci                    patLocal.append(op);
4632e5b6d6dSopenharmony_ci                } else if (op == u'&') {
4642e5b6d6dSopenharmony_ci                    // syntaxError(chars, "Trailing '&'");
4652e5b6d6dSopenharmony_ci                    ec = U_MALFORMED_SET;
4662e5b6d6dSopenharmony_ci                    return;
4672e5b6d6dSopenharmony_ci                }
4682e5b6d6dSopenharmony_ci                patLocal.append(u']');
4692e5b6d6dSopenharmony_ci                mode = 2;
4702e5b6d6dSopenharmony_ci                continue;
4712e5b6d6dSopenharmony_ci            case u'-':
4722e5b6d6dSopenharmony_ci                if (op == 0) {
4732e5b6d6dSopenharmony_ci                    if (lastItem != 0) {
4742e5b6d6dSopenharmony_ci                        op = (UChar) c;
4752e5b6d6dSopenharmony_ci                        continue;
4762e5b6d6dSopenharmony_ci                    } else {
4772e5b6d6dSopenharmony_ci                        // Treat final trailing '-' as a literal
4782e5b6d6dSopenharmony_ci                        add(c, c);
4792e5b6d6dSopenharmony_ci                        c = chars.next(opts, literal, ec);
4802e5b6d6dSopenharmony_ci                        if (U_FAILURE(ec)) return;
4812e5b6d6dSopenharmony_ci                        if (c == u']' && !literal) {
4822e5b6d6dSopenharmony_ci                            patLocal.append(u"-]", 2);
4832e5b6d6dSopenharmony_ci                            mode = 2;
4842e5b6d6dSopenharmony_ci                            continue;
4852e5b6d6dSopenharmony_ci                        }
4862e5b6d6dSopenharmony_ci                    }
4872e5b6d6dSopenharmony_ci                }
4882e5b6d6dSopenharmony_ci                // syntaxError(chars, "'-' not after char or set");
4892e5b6d6dSopenharmony_ci                ec = U_MALFORMED_SET;
4902e5b6d6dSopenharmony_ci                return;
4912e5b6d6dSopenharmony_ci            case u'&':
4922e5b6d6dSopenharmony_ci                if (lastItem == 2 && op == 0) {
4932e5b6d6dSopenharmony_ci                    op = (UChar) c;
4942e5b6d6dSopenharmony_ci                    continue;
4952e5b6d6dSopenharmony_ci                }
4962e5b6d6dSopenharmony_ci                // syntaxError(chars, "'&' not after set");
4972e5b6d6dSopenharmony_ci                ec = U_MALFORMED_SET;
4982e5b6d6dSopenharmony_ci                return;
4992e5b6d6dSopenharmony_ci            case u'^':
5002e5b6d6dSopenharmony_ci                // syntaxError(chars, "'^' not after '['");
5012e5b6d6dSopenharmony_ci                ec = U_MALFORMED_SET;
5022e5b6d6dSopenharmony_ci                return;
5032e5b6d6dSopenharmony_ci            case u'{':
5042e5b6d6dSopenharmony_ci                if (op != 0) {
5052e5b6d6dSopenharmony_ci                    // syntaxError(chars, "Missing operand after operator");
5062e5b6d6dSopenharmony_ci                    ec = U_MALFORMED_SET;
5072e5b6d6dSopenharmony_ci                    return;
5082e5b6d6dSopenharmony_ci                }
5092e5b6d6dSopenharmony_ci                if (lastItem == 1) {
5102e5b6d6dSopenharmony_ci                    add(lastChar, lastChar);
5112e5b6d6dSopenharmony_ci                    _appendToPat(patLocal, lastChar, false);
5122e5b6d6dSopenharmony_ci                }
5132e5b6d6dSopenharmony_ci                lastItem = 0;
5142e5b6d6dSopenharmony_ci                buf.truncate(0);
5152e5b6d6dSopenharmony_ci                {
5162e5b6d6dSopenharmony_ci                    UBool ok = false;
5172e5b6d6dSopenharmony_ci                    while (!chars.atEnd()) {
5182e5b6d6dSopenharmony_ci                        c = chars.next(opts, literal, ec);
5192e5b6d6dSopenharmony_ci                        if (U_FAILURE(ec)) return;
5202e5b6d6dSopenharmony_ci                        if (c == u'}' && !literal) {
5212e5b6d6dSopenharmony_ci                            ok = true;
5222e5b6d6dSopenharmony_ci                            break;
5232e5b6d6dSopenharmony_ci                        }
5242e5b6d6dSopenharmony_ci                        buf.append(c);
5252e5b6d6dSopenharmony_ci                    }
5262e5b6d6dSopenharmony_ci                    if (!ok) {
5272e5b6d6dSopenharmony_ci                        // syntaxError(chars, "Invalid multicharacter string");
5282e5b6d6dSopenharmony_ci                        ec = U_MALFORMED_SET;
5292e5b6d6dSopenharmony_ci                        return;
5302e5b6d6dSopenharmony_ci                    }
5312e5b6d6dSopenharmony_ci                }
5322e5b6d6dSopenharmony_ci                // We have new string. Add it to set and continue;
5332e5b6d6dSopenharmony_ci                // we don't need to drop through to the further
5342e5b6d6dSopenharmony_ci                // processing
5352e5b6d6dSopenharmony_ci                add(buf);
5362e5b6d6dSopenharmony_ci                patLocal.append(u'{');
5372e5b6d6dSopenharmony_ci                _appendToPat(patLocal, buf, false);
5382e5b6d6dSopenharmony_ci                patLocal.append(u'}');
5392e5b6d6dSopenharmony_ci                continue;
5402e5b6d6dSopenharmony_ci            case SymbolTable::SYMBOL_REF:
5412e5b6d6dSopenharmony_ci                //         symbols  nosymbols
5422e5b6d6dSopenharmony_ci                // [a-$]   error    error (ambiguous)
5432e5b6d6dSopenharmony_ci                // [a$]    anchor   anchor
5442e5b6d6dSopenharmony_ci                // [a-$x]  var "x"* literal '$'
5452e5b6d6dSopenharmony_ci                // [a-$.]  error    literal '$'
5462e5b6d6dSopenharmony_ci                // *We won't get here in the case of var "x"
5472e5b6d6dSopenharmony_ci                {
5482e5b6d6dSopenharmony_ci                    chars.getPos(backup);
5492e5b6d6dSopenharmony_ci                    c = chars.next(opts, literal, ec);
5502e5b6d6dSopenharmony_ci                    if (U_FAILURE(ec)) return;
5512e5b6d6dSopenharmony_ci                    UBool anchor = (c == u']' && !literal);
5522e5b6d6dSopenharmony_ci                    if (symbols == 0 && !anchor) {
5532e5b6d6dSopenharmony_ci                        c = SymbolTable::SYMBOL_REF;
5542e5b6d6dSopenharmony_ci                        chars.setPos(backup);
5552e5b6d6dSopenharmony_ci                        break; // literal '$'
5562e5b6d6dSopenharmony_ci                    }
5572e5b6d6dSopenharmony_ci                    if (anchor && op == 0) {
5582e5b6d6dSopenharmony_ci                        if (lastItem == 1) {
5592e5b6d6dSopenharmony_ci                            add(lastChar, lastChar);
5602e5b6d6dSopenharmony_ci                            _appendToPat(patLocal, lastChar, false);
5612e5b6d6dSopenharmony_ci                        }
5622e5b6d6dSopenharmony_ci                        add(U_ETHER);
5632e5b6d6dSopenharmony_ci                        usePat = true;
5642e5b6d6dSopenharmony_ci                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
5652e5b6d6dSopenharmony_ci                        patLocal.append(u']');
5662e5b6d6dSopenharmony_ci                        mode = 2;
5672e5b6d6dSopenharmony_ci                        continue;
5682e5b6d6dSopenharmony_ci                    }
5692e5b6d6dSopenharmony_ci                    // syntaxError(chars, "Unquoted '$'");
5702e5b6d6dSopenharmony_ci                    ec = U_MALFORMED_SET;
5712e5b6d6dSopenharmony_ci                    return;
5722e5b6d6dSopenharmony_ci                }
5732e5b6d6dSopenharmony_ci            default:
5742e5b6d6dSopenharmony_ci                break;
5752e5b6d6dSopenharmony_ci            }
5762e5b6d6dSopenharmony_ci        }
5772e5b6d6dSopenharmony_ci
5782e5b6d6dSopenharmony_ci        // -------- Parse literal characters.  This includes both
5792e5b6d6dSopenharmony_ci        // escaped chars ("\u4E01") and non-syntax characters
5802e5b6d6dSopenharmony_ci        // ("a").
5812e5b6d6dSopenharmony_ci
5822e5b6d6dSopenharmony_ci        switch (lastItem) {
5832e5b6d6dSopenharmony_ci        case 0:
5842e5b6d6dSopenharmony_ci            lastItem = 1;
5852e5b6d6dSopenharmony_ci            lastChar = c;
5862e5b6d6dSopenharmony_ci            break;
5872e5b6d6dSopenharmony_ci        case 1:
5882e5b6d6dSopenharmony_ci            if (op == u'-') {
5892e5b6d6dSopenharmony_ci                if (lastChar >= c) {
5902e5b6d6dSopenharmony_ci                    // Don't allow redundant (a-a) or empty (b-a) ranges;
5912e5b6d6dSopenharmony_ci                    // these are most likely typos.
5922e5b6d6dSopenharmony_ci                    // syntaxError(chars, "Invalid range");
5932e5b6d6dSopenharmony_ci                    ec = U_MALFORMED_SET;
5942e5b6d6dSopenharmony_ci                    return;
5952e5b6d6dSopenharmony_ci                }
5962e5b6d6dSopenharmony_ci                add(lastChar, c);
5972e5b6d6dSopenharmony_ci                _appendToPat(patLocal, lastChar, false);
5982e5b6d6dSopenharmony_ci                patLocal.append(op);
5992e5b6d6dSopenharmony_ci                _appendToPat(patLocal, c, false);
6002e5b6d6dSopenharmony_ci                lastItem = 0;
6012e5b6d6dSopenharmony_ci                op = 0;
6022e5b6d6dSopenharmony_ci            } else {
6032e5b6d6dSopenharmony_ci                add(lastChar, lastChar);
6042e5b6d6dSopenharmony_ci                _appendToPat(patLocal, lastChar, false);
6052e5b6d6dSopenharmony_ci                lastChar = c;
6062e5b6d6dSopenharmony_ci            }
6072e5b6d6dSopenharmony_ci            break;
6082e5b6d6dSopenharmony_ci        case 2:
6092e5b6d6dSopenharmony_ci            if (op != 0) {
6102e5b6d6dSopenharmony_ci                // syntaxError(chars, "Set expected after operator");
6112e5b6d6dSopenharmony_ci                ec = U_MALFORMED_SET;
6122e5b6d6dSopenharmony_ci                return;
6132e5b6d6dSopenharmony_ci            }
6142e5b6d6dSopenharmony_ci            lastChar = c;
6152e5b6d6dSopenharmony_ci            lastItem = 1;
6162e5b6d6dSopenharmony_ci            break;
6172e5b6d6dSopenharmony_ci        }
6182e5b6d6dSopenharmony_ci    }
6192e5b6d6dSopenharmony_ci
6202e5b6d6dSopenharmony_ci    if (mode != 2) {
6212e5b6d6dSopenharmony_ci        // syntaxError(chars, "Missing ']'");
6222e5b6d6dSopenharmony_ci        ec = U_MALFORMED_SET;
6232e5b6d6dSopenharmony_ci        return;
6242e5b6d6dSopenharmony_ci    }
6252e5b6d6dSopenharmony_ci
6262e5b6d6dSopenharmony_ci    chars.skipIgnored(opts);
6272e5b6d6dSopenharmony_ci
6282e5b6d6dSopenharmony_ci    /**
6292e5b6d6dSopenharmony_ci     * Handle global flags (invert, case insensitivity).  If this
6302e5b6d6dSopenharmony_ci     * pattern should be compiled case-insensitive, then we need
6312e5b6d6dSopenharmony_ci     * to close over case BEFORE COMPLEMENTING.  This makes
6322e5b6d6dSopenharmony_ci     * patterns like /[^abc]/i work.
6332e5b6d6dSopenharmony_ci     */
6342e5b6d6dSopenharmony_ci    if ((options & USET_CASE_INSENSITIVE) != 0) {
6352e5b6d6dSopenharmony_ci        (this->*caseClosure)(USET_CASE_INSENSITIVE);
6362e5b6d6dSopenharmony_ci    }
6372e5b6d6dSopenharmony_ci    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
6382e5b6d6dSopenharmony_ci        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
6392e5b6d6dSopenharmony_ci    }
6402e5b6d6dSopenharmony_ci    if (invert) {
6412e5b6d6dSopenharmony_ci        complement().removeAllStrings();  // code point complement
6422e5b6d6dSopenharmony_ci    }
6432e5b6d6dSopenharmony_ci
6442e5b6d6dSopenharmony_ci    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
6452e5b6d6dSopenharmony_ci    // generated pattern.
6462e5b6d6dSopenharmony_ci    if (usePat) {
6472e5b6d6dSopenharmony_ci        rebuiltPat.append(patLocal);
6482e5b6d6dSopenharmony_ci    } else {
6492e5b6d6dSopenharmony_ci        _generatePattern(rebuiltPat, false);
6502e5b6d6dSopenharmony_ci    }
6512e5b6d6dSopenharmony_ci    if (isBogus() && U_SUCCESS(ec)) {
6522e5b6d6dSopenharmony_ci        // We likely ran out of memory. AHHH!
6532e5b6d6dSopenharmony_ci        ec = U_MEMORY_ALLOCATION_ERROR;
6542e5b6d6dSopenharmony_ci    }
6552e5b6d6dSopenharmony_ci}
6562e5b6d6dSopenharmony_ci
6572e5b6d6dSopenharmony_ci//----------------------------------------------------------------
6582e5b6d6dSopenharmony_ci// Property set implementation
6592e5b6d6dSopenharmony_ci//----------------------------------------------------------------
6602e5b6d6dSopenharmony_ci
6612e5b6d6dSopenharmony_cinamespace {
6622e5b6d6dSopenharmony_ci
6632e5b6d6dSopenharmony_cistatic UBool numericValueFilter(UChar32 ch, void* context) {
6642e5b6d6dSopenharmony_ci    return u_getNumericValue(ch) == *(double*)context;
6652e5b6d6dSopenharmony_ci}
6662e5b6d6dSopenharmony_ci
6672e5b6d6dSopenharmony_cistatic UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
6682e5b6d6dSopenharmony_ci    int32_t value = *(int32_t*)context;
6692e5b6d6dSopenharmony_ci    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
6702e5b6d6dSopenharmony_ci}
6712e5b6d6dSopenharmony_ci
6722e5b6d6dSopenharmony_cistatic UBool versionFilter(UChar32 ch, void* context) {
6732e5b6d6dSopenharmony_ci    static const UVersionInfo none = { 0, 0, 0, 0 };
6742e5b6d6dSopenharmony_ci    UVersionInfo v;
6752e5b6d6dSopenharmony_ci    u_charAge(ch, v);
6762e5b6d6dSopenharmony_ci    UVersionInfo* version = (UVersionInfo*)context;
6772e5b6d6dSopenharmony_ci    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
6782e5b6d6dSopenharmony_ci}
6792e5b6d6dSopenharmony_ci
6802e5b6d6dSopenharmony_citypedef struct {
6812e5b6d6dSopenharmony_ci    UProperty prop;
6822e5b6d6dSopenharmony_ci    int32_t value;
6832e5b6d6dSopenharmony_ci} IntPropertyContext;
6842e5b6d6dSopenharmony_ci
6852e5b6d6dSopenharmony_cistatic UBool intPropertyFilter(UChar32 ch, void* context) {
6862e5b6d6dSopenharmony_ci    IntPropertyContext* c = (IntPropertyContext*)context;
6872e5b6d6dSopenharmony_ci    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
6882e5b6d6dSopenharmony_ci}
6892e5b6d6dSopenharmony_ci
6902e5b6d6dSopenharmony_cistatic UBool scriptExtensionsFilter(UChar32 ch, void* context) {
6912e5b6d6dSopenharmony_ci    return uscript_hasScript(ch, *(UScriptCode*)context);
6922e5b6d6dSopenharmony_ci}
6932e5b6d6dSopenharmony_ci
6942e5b6d6dSopenharmony_ci}  // namespace
6952e5b6d6dSopenharmony_ci
6962e5b6d6dSopenharmony_ci/**
6972e5b6d6dSopenharmony_ci * Generic filter-based scanning code for UCD property UnicodeSets.
6982e5b6d6dSopenharmony_ci */
6992e5b6d6dSopenharmony_civoid UnicodeSet::applyFilter(UnicodeSet::Filter filter,
7002e5b6d6dSopenharmony_ci                             void* context,
7012e5b6d6dSopenharmony_ci                             const UnicodeSet* inclusions,
7022e5b6d6dSopenharmony_ci                             UErrorCode &status) {
7032e5b6d6dSopenharmony_ci    if (U_FAILURE(status)) return;
7042e5b6d6dSopenharmony_ci
7052e5b6d6dSopenharmony_ci    // Logically, walk through all Unicode characters, noting the start
7062e5b6d6dSopenharmony_ci    // and end of each range for which filter.contain(c) is
7072e5b6d6dSopenharmony_ci    // true.  Add each range to a set.
7082e5b6d6dSopenharmony_ci    //
7092e5b6d6dSopenharmony_ci    // To improve performance, use an inclusions set which
7102e5b6d6dSopenharmony_ci    // encodes information about character ranges that are known
7112e5b6d6dSopenharmony_ci    // to have identical properties.
7122e5b6d6dSopenharmony_ci    // inclusions contains the first characters of
7132e5b6d6dSopenharmony_ci    // same-value ranges for the given property.
7142e5b6d6dSopenharmony_ci
7152e5b6d6dSopenharmony_ci    clear();
7162e5b6d6dSopenharmony_ci
7172e5b6d6dSopenharmony_ci    UChar32 startHasProperty = -1;
7182e5b6d6dSopenharmony_ci    int32_t limitRange = inclusions->getRangeCount();
7192e5b6d6dSopenharmony_ci
7202e5b6d6dSopenharmony_ci    for (int j=0; j<limitRange; ++j) {
7212e5b6d6dSopenharmony_ci        // get current range
7222e5b6d6dSopenharmony_ci        UChar32 start = inclusions->getRangeStart(j);
7232e5b6d6dSopenharmony_ci        UChar32 end = inclusions->getRangeEnd(j);
7242e5b6d6dSopenharmony_ci
7252e5b6d6dSopenharmony_ci        // for all the code points in the range, process
7262e5b6d6dSopenharmony_ci        for (UChar32 ch = start; ch <= end; ++ch) {
7272e5b6d6dSopenharmony_ci            // only add to this UnicodeSet on inflection points --
7282e5b6d6dSopenharmony_ci            // where the hasProperty value changes to false
7292e5b6d6dSopenharmony_ci            if ((*filter)(ch, context)) {
7302e5b6d6dSopenharmony_ci                if (startHasProperty < 0) {
7312e5b6d6dSopenharmony_ci                    startHasProperty = ch;
7322e5b6d6dSopenharmony_ci                }
7332e5b6d6dSopenharmony_ci            } else if (startHasProperty >= 0) {
7342e5b6d6dSopenharmony_ci                add(startHasProperty, ch-1);
7352e5b6d6dSopenharmony_ci                startHasProperty = -1;
7362e5b6d6dSopenharmony_ci            }
7372e5b6d6dSopenharmony_ci        }
7382e5b6d6dSopenharmony_ci    }
7392e5b6d6dSopenharmony_ci    if (startHasProperty >= 0) {
7402e5b6d6dSopenharmony_ci        add((UChar32)startHasProperty, (UChar32)0x10FFFF);
7412e5b6d6dSopenharmony_ci    }
7422e5b6d6dSopenharmony_ci    if (isBogus() && U_SUCCESS(status)) {
7432e5b6d6dSopenharmony_ci        // We likely ran out of memory. AHHH!
7442e5b6d6dSopenharmony_ci        status = U_MEMORY_ALLOCATION_ERROR;
7452e5b6d6dSopenharmony_ci    }
7462e5b6d6dSopenharmony_ci}
7472e5b6d6dSopenharmony_ci
7482e5b6d6dSopenharmony_cinamespace {
7492e5b6d6dSopenharmony_ci
7502e5b6d6dSopenharmony_cistatic UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
7512e5b6d6dSopenharmony_ci    /* Note: we use ' ' in compiler code page */
7522e5b6d6dSopenharmony_ci    int32_t j = 0;
7532e5b6d6dSopenharmony_ci    char ch;
7542e5b6d6dSopenharmony_ci    --dstCapacity; /* make room for term. zero */
7552e5b6d6dSopenharmony_ci    while ((ch = *src++) != 0) {
7562e5b6d6dSopenharmony_ci        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
7572e5b6d6dSopenharmony_ci            continue;
7582e5b6d6dSopenharmony_ci        }
7592e5b6d6dSopenharmony_ci        if (j >= dstCapacity) return false;
7602e5b6d6dSopenharmony_ci        dst[j++] = ch;
7612e5b6d6dSopenharmony_ci    }
7622e5b6d6dSopenharmony_ci    if (j > 0 && dst[j-1] == ' ') --j;
7632e5b6d6dSopenharmony_ci    dst[j] = 0;
7642e5b6d6dSopenharmony_ci    return true;
7652e5b6d6dSopenharmony_ci}
7662e5b6d6dSopenharmony_ci
7672e5b6d6dSopenharmony_ci}  // namespace
7682e5b6d6dSopenharmony_ci
7692e5b6d6dSopenharmony_ci//----------------------------------------------------------------
7702e5b6d6dSopenharmony_ci// Property set API
7712e5b6d6dSopenharmony_ci//----------------------------------------------------------------
7722e5b6d6dSopenharmony_ci
// Sets the given error code lvalue to U_ILLEGAL_ARGUMENT_ERROR and returns
// *this from the enclosing member function.  The argument is parenthesized
// so that any lvalue expression (not just a plain identifier) is assigned
// as a whole.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    (ec) = U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
7772e5b6d6dSopenharmony_ci
7782e5b6d6dSopenharmony_ciUnicodeSet&
7792e5b6d6dSopenharmony_ciUnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
7802e5b6d6dSopenharmony_ci    if (U_FAILURE(ec) || isFrozen()) { return *this; }
7812e5b6d6dSopenharmony_ci    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
7822e5b6d6dSopenharmony_ci        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
7832e5b6d6dSopenharmony_ci        applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
7842e5b6d6dSopenharmony_ci    } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
7852e5b6d6dSopenharmony_ci        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
7862e5b6d6dSopenharmony_ci        UScriptCode script = (UScriptCode)value;
7872e5b6d6dSopenharmony_ci        applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
7882e5b6d6dSopenharmony_ci    } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
7892e5b6d6dSopenharmony_ci        if (value == 0 || value == 1) {
7902e5b6d6dSopenharmony_ci            const USet *set = u_getBinaryPropertySet(prop, &ec);
7912e5b6d6dSopenharmony_ci            if (U_FAILURE(ec)) { return *this; }
7922e5b6d6dSopenharmony_ci            copyFrom(*UnicodeSet::fromUSet(set), true);
7932e5b6d6dSopenharmony_ci            if (value == 0) {
7942e5b6d6dSopenharmony_ci                complement().removeAllStrings();  // code point complement
7952e5b6d6dSopenharmony_ci            }
7962e5b6d6dSopenharmony_ci        } else {
7972e5b6d6dSopenharmony_ci            clear();
7982e5b6d6dSopenharmony_ci        }
7992e5b6d6dSopenharmony_ci    } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
8002e5b6d6dSopenharmony_ci        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
8012e5b6d6dSopenharmony_ci        IntPropertyContext c = {prop, value};
8022e5b6d6dSopenharmony_ci        applyFilter(intPropertyFilter, &c, inclusions, ec);
8032e5b6d6dSopenharmony_ci    } else {
8042e5b6d6dSopenharmony_ci        ec = U_ILLEGAL_ARGUMENT_ERROR;
8052e5b6d6dSopenharmony_ci    }
8062e5b6d6dSopenharmony_ci    return *this;
8072e5b6d6dSopenharmony_ci}
8082e5b6d6dSopenharmony_ci
8092e5b6d6dSopenharmony_ciUnicodeSet&
8102e5b6d6dSopenharmony_ciUnicodeSet::applyPropertyAlias(const UnicodeString& prop,
8112e5b6d6dSopenharmony_ci                               const UnicodeString& value,
8122e5b6d6dSopenharmony_ci                               UErrorCode& ec) {
8132e5b6d6dSopenharmony_ci    if (U_FAILURE(ec) || isFrozen()) return *this;
8142e5b6d6dSopenharmony_ci
8152e5b6d6dSopenharmony_ci    // prop and value used to be converted to char * using the default
8162e5b6d6dSopenharmony_ci    // converter instead of the invariant conversion.
8172e5b6d6dSopenharmony_ci    // This should not be necessary because all Unicode property and value
8182e5b6d6dSopenharmony_ci    // names use only invariant characters.
8192e5b6d6dSopenharmony_ci    // If there are any variant characters, then we won't find them anyway.
8202e5b6d6dSopenharmony_ci    // Checking first avoids assertion failures in the conversion.
8212e5b6d6dSopenharmony_ci    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
8222e5b6d6dSopenharmony_ci        !uprv_isInvariantUString(value.getBuffer(), value.length())
8232e5b6d6dSopenharmony_ci    ) {
8242e5b6d6dSopenharmony_ci        FAIL(ec);
8252e5b6d6dSopenharmony_ci    }
8262e5b6d6dSopenharmony_ci    CharString pname, vname;
8272e5b6d6dSopenharmony_ci    pname.appendInvariantChars(prop, ec);
8282e5b6d6dSopenharmony_ci    vname.appendInvariantChars(value, ec);
8292e5b6d6dSopenharmony_ci    if (U_FAILURE(ec)) return *this;
8302e5b6d6dSopenharmony_ci
8312e5b6d6dSopenharmony_ci    UProperty p;
8322e5b6d6dSopenharmony_ci    int32_t v;
8332e5b6d6dSopenharmony_ci    UBool invert = false;
8342e5b6d6dSopenharmony_ci
8352e5b6d6dSopenharmony_ci    if (value.length() > 0) {
8362e5b6d6dSopenharmony_ci        p = u_getPropertyEnum(pname.data());
8372e5b6d6dSopenharmony_ci        if (p == UCHAR_INVALID_CODE) FAIL(ec);
8382e5b6d6dSopenharmony_ci
8392e5b6d6dSopenharmony_ci        // Treat gc as gcm
8402e5b6d6dSopenharmony_ci        if (p == UCHAR_GENERAL_CATEGORY) {
8412e5b6d6dSopenharmony_ci            p = UCHAR_GENERAL_CATEGORY_MASK;
8422e5b6d6dSopenharmony_ci        }
8432e5b6d6dSopenharmony_ci
8442e5b6d6dSopenharmony_ci        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
8452e5b6d6dSopenharmony_ci            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
8462e5b6d6dSopenharmony_ci            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
8472e5b6d6dSopenharmony_ci            v = u_getPropertyValueEnum(p, vname.data());
8482e5b6d6dSopenharmony_ci            if (v == UCHAR_INVALID_CODE) {
8492e5b6d6dSopenharmony_ci                // Handle numeric CCC
8502e5b6d6dSopenharmony_ci                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
8512e5b6d6dSopenharmony_ci                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
8522e5b6d6dSopenharmony_ci                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
8532e5b6d6dSopenharmony_ci                    char* end;
8542e5b6d6dSopenharmony_ci                    double val = uprv_strtod(vname.data(), &end);
8552e5b6d6dSopenharmony_ci                    // Anything between 0 and 255 is valid even if unused.
8562e5b6d6dSopenharmony_ci                    // Cast double->int only after range check.
8572e5b6d6dSopenharmony_ci                    // We catch NaN here because comparing it with both 0 and 255 will be false
8582e5b6d6dSopenharmony_ci                    // (as are all comparisons with NaN).
8592e5b6d6dSopenharmony_ci                    if (*end != 0 || !(0 <= val && val <= 255) ||
8602e5b6d6dSopenharmony_ci                            (v = (int32_t)val) != val) {
8612e5b6d6dSopenharmony_ci                        // non-integral value or outside 0..255, or trailing junk
8622e5b6d6dSopenharmony_ci                        FAIL(ec);
8632e5b6d6dSopenharmony_ci                    }
8642e5b6d6dSopenharmony_ci                } else {
8652e5b6d6dSopenharmony_ci                    FAIL(ec);
8662e5b6d6dSopenharmony_ci                }
8672e5b6d6dSopenharmony_ci            }
8682e5b6d6dSopenharmony_ci        }
8692e5b6d6dSopenharmony_ci
8702e5b6d6dSopenharmony_ci        else {
8712e5b6d6dSopenharmony_ci
8722e5b6d6dSopenharmony_ci            switch (p) {
8732e5b6d6dSopenharmony_ci            case UCHAR_NUMERIC_VALUE:
8742e5b6d6dSopenharmony_ci                {
8752e5b6d6dSopenharmony_ci                    char* end;
8762e5b6d6dSopenharmony_ci                    double val = uprv_strtod(vname.data(), &end);
8772e5b6d6dSopenharmony_ci                    if (*end != 0) {
8782e5b6d6dSopenharmony_ci                        FAIL(ec);
8792e5b6d6dSopenharmony_ci                    }
8802e5b6d6dSopenharmony_ci                    applyFilter(numericValueFilter, &val,
8812e5b6d6dSopenharmony_ci                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
8822e5b6d6dSopenharmony_ci                    return *this;
8832e5b6d6dSopenharmony_ci                }
8842e5b6d6dSopenharmony_ci            case UCHAR_NAME:
8852e5b6d6dSopenharmony_ci                {
8862e5b6d6dSopenharmony_ci                    // Must munge name, since u_charFromName() does not do
8872e5b6d6dSopenharmony_ci                    // 'loose' matching.
8882e5b6d6dSopenharmony_ci                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
8892e5b6d6dSopenharmony_ci                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
8902e5b6d6dSopenharmony_ci                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
8912e5b6d6dSopenharmony_ci                    if (U_SUCCESS(ec)) {
8922e5b6d6dSopenharmony_ci                        clear();
8932e5b6d6dSopenharmony_ci                        add(ch);
8942e5b6d6dSopenharmony_ci                        return *this;
8952e5b6d6dSopenharmony_ci                    } else {
8962e5b6d6dSopenharmony_ci                        FAIL(ec);
8972e5b6d6dSopenharmony_ci                    }
8982e5b6d6dSopenharmony_ci                }
8992e5b6d6dSopenharmony_ci            case UCHAR_UNICODE_1_NAME:
9002e5b6d6dSopenharmony_ci                // ICU 49 deprecates the Unicode_1_Name property APIs.
9012e5b6d6dSopenharmony_ci                FAIL(ec);
9022e5b6d6dSopenharmony_ci            case UCHAR_AGE:
9032e5b6d6dSopenharmony_ci                {
9042e5b6d6dSopenharmony_ci                    // Must munge name, since u_versionFromString() does not do
9052e5b6d6dSopenharmony_ci                    // 'loose' matching.
9062e5b6d6dSopenharmony_ci                    char buf[128];
9072e5b6d6dSopenharmony_ci                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
9082e5b6d6dSopenharmony_ci                    UVersionInfo version;
9092e5b6d6dSopenharmony_ci                    u_versionFromString(version, buf);
9102e5b6d6dSopenharmony_ci                    applyFilter(versionFilter, &version,
9112e5b6d6dSopenharmony_ci                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
9122e5b6d6dSopenharmony_ci                    return *this;
9132e5b6d6dSopenharmony_ci                }
9142e5b6d6dSopenharmony_ci            case UCHAR_SCRIPT_EXTENSIONS:
9152e5b6d6dSopenharmony_ci                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
9162e5b6d6dSopenharmony_ci                if (v == UCHAR_INVALID_CODE) {
9172e5b6d6dSopenharmony_ci                    FAIL(ec);
9182e5b6d6dSopenharmony_ci                }
9192e5b6d6dSopenharmony_ci                // fall through to calling applyIntPropertyValue()
9202e5b6d6dSopenharmony_ci                break;
9212e5b6d6dSopenharmony_ci            default:
9222e5b6d6dSopenharmony_ci                // p is a non-binary, non-enumerated property that we
9232e5b6d6dSopenharmony_ci                // don't support (yet).
9242e5b6d6dSopenharmony_ci                FAIL(ec);
9252e5b6d6dSopenharmony_ci            }
9262e5b6d6dSopenharmony_ci        }
9272e5b6d6dSopenharmony_ci    }
9282e5b6d6dSopenharmony_ci
9292e5b6d6dSopenharmony_ci    else {
9302e5b6d6dSopenharmony_ci        // value is empty.  Interpret as General Category, Script, or
9312e5b6d6dSopenharmony_ci        // Binary property.
9322e5b6d6dSopenharmony_ci        p = UCHAR_GENERAL_CATEGORY_MASK;
9332e5b6d6dSopenharmony_ci        v = u_getPropertyValueEnum(p, pname.data());
9342e5b6d6dSopenharmony_ci        if (v == UCHAR_INVALID_CODE) {
9352e5b6d6dSopenharmony_ci            p = UCHAR_SCRIPT;
9362e5b6d6dSopenharmony_ci            v = u_getPropertyValueEnum(p, pname.data());
9372e5b6d6dSopenharmony_ci            if (v == UCHAR_INVALID_CODE) {
9382e5b6d6dSopenharmony_ci                p = u_getPropertyEnum(pname.data());
9392e5b6d6dSopenharmony_ci                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
9402e5b6d6dSopenharmony_ci                    v = 1;
9412e5b6d6dSopenharmony_ci                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
9422e5b6d6dSopenharmony_ci                    set(MIN_VALUE, MAX_VALUE);
9432e5b6d6dSopenharmony_ci                    return *this;
9442e5b6d6dSopenharmony_ci                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
9452e5b6d6dSopenharmony_ci                    set(0, 0x7F);
9462e5b6d6dSopenharmony_ci                    return *this;
9472e5b6d6dSopenharmony_ci                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
9482e5b6d6dSopenharmony_ci                    // [:Assigned:]=[:^Cn:]
9492e5b6d6dSopenharmony_ci                    p = UCHAR_GENERAL_CATEGORY_MASK;
9502e5b6d6dSopenharmony_ci                    v = U_GC_CN_MASK;
9512e5b6d6dSopenharmony_ci                    invert = true;
9522e5b6d6dSopenharmony_ci                } else {
9532e5b6d6dSopenharmony_ci                    FAIL(ec);
9542e5b6d6dSopenharmony_ci                }
9552e5b6d6dSopenharmony_ci            }
9562e5b6d6dSopenharmony_ci        }
9572e5b6d6dSopenharmony_ci    }
9582e5b6d6dSopenharmony_ci
9592e5b6d6dSopenharmony_ci    applyIntPropertyValue(p, v, ec);
9602e5b6d6dSopenharmony_ci    if(invert) {
9612e5b6d6dSopenharmony_ci        complement().removeAllStrings();  // code point complement
9622e5b6d6dSopenharmony_ci    }
9632e5b6d6dSopenharmony_ci
9642e5b6d6dSopenharmony_ci    if (isBogus() && U_SUCCESS(ec)) {
9652e5b6d6dSopenharmony_ci        // We likely ran out of memory. AHHH!
9662e5b6d6dSopenharmony_ci        ec = U_MEMORY_ALLOCATION_ERROR;
9672e5b6d6dSopenharmony_ci    }
9682e5b6d6dSopenharmony_ci    return *this;
9692e5b6d6dSopenharmony_ci}
9702e5b6d6dSopenharmony_ci
9712e5b6d6dSopenharmony_ci//----------------------------------------------------------------
9722e5b6d6dSopenharmony_ci// Property set patterns
9732e5b6d6dSopenharmony_ci//----------------------------------------------------------------
9742e5b6d6dSopenharmony_ci
9752e5b6d6dSopenharmony_ci/**
9762e5b6d6dSopenharmony_ci * Return true if the given position, in the given pattern, appears
9772e5b6d6dSopenharmony_ci * to be the start of a property set pattern.
9782e5b6d6dSopenharmony_ci */
9792e5b6d6dSopenharmony_ciUBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
9802e5b6d6dSopenharmony_ci                                           int32_t pos) {
9812e5b6d6dSopenharmony_ci    // Patterns are at least 5 characters long
9822e5b6d6dSopenharmony_ci    if ((pos+5) > pattern.length()) {
9832e5b6d6dSopenharmony_ci        return false;
9842e5b6d6dSopenharmony_ci    }
9852e5b6d6dSopenharmony_ci
9862e5b6d6dSopenharmony_ci    // Look for an opening [:, [:^, \p, or \P
9872e5b6d6dSopenharmony_ci    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
9882e5b6d6dSopenharmony_ci}
9892e5b6d6dSopenharmony_ci
9902e5b6d6dSopenharmony_ci/**
9912e5b6d6dSopenharmony_ci * Return true if the given iterator appears to point at a
9922e5b6d6dSopenharmony_ci * property pattern.  Regardless of the result, return with the
9932e5b6d6dSopenharmony_ci * iterator unchanged.
9942e5b6d6dSopenharmony_ci * @param chars iterator over the pattern characters.  Upon return
9952e5b6d6dSopenharmony_ci * it will be unchanged.
9962e5b6d6dSopenharmony_ci * @param iterOpts RuleCharacterIterator options
9972e5b6d6dSopenharmony_ci */
9982e5b6d6dSopenharmony_ciUBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
9992e5b6d6dSopenharmony_ci                                           int32_t iterOpts) {
10002e5b6d6dSopenharmony_ci    // NOTE: literal will always be false, because we don't parse escapes.
10012e5b6d6dSopenharmony_ci    UBool result = false, literal;
10022e5b6d6dSopenharmony_ci    UErrorCode ec = U_ZERO_ERROR;
10032e5b6d6dSopenharmony_ci    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
10042e5b6d6dSopenharmony_ci    RuleCharacterIterator::Pos pos;
10052e5b6d6dSopenharmony_ci    chars.getPos(pos);
10062e5b6d6dSopenharmony_ci    UChar32 c = chars.next(iterOpts, literal, ec);
10072e5b6d6dSopenharmony_ci    if (c == u'[' || c == u'\\') {
10082e5b6d6dSopenharmony_ci        UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
10092e5b6d6dSopenharmony_ci                               literal, ec);
10102e5b6d6dSopenharmony_ci        result = (c == u'[') ? (d == u':') :
10112e5b6d6dSopenharmony_ci                               (d == u'N' || d == u'p' || d == u'P');
10122e5b6d6dSopenharmony_ci    }
10132e5b6d6dSopenharmony_ci    chars.setPos(pos);
10142e5b6d6dSopenharmony_ci    return result && U_SUCCESS(ec);
10152e5b6d6dSopenharmony_ci}
10162e5b6d6dSopenharmony_ci
10172e5b6d6dSopenharmony_ci/**
10182e5b6d6dSopenharmony_ci * Parse the given property pattern at the given parse position.
10192e5b6d6dSopenharmony_ci */
10202e5b6d6dSopenharmony_ciUnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
10212e5b6d6dSopenharmony_ci                                             ParsePosition& ppos,
10222e5b6d6dSopenharmony_ci                                             UErrorCode &ec) {
10232e5b6d6dSopenharmony_ci    int32_t pos = ppos.getIndex();
10242e5b6d6dSopenharmony_ci
10252e5b6d6dSopenharmony_ci    UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
10262e5b6d6dSopenharmony_ci    UBool isName = false; // true for \N{pat}, o/w false
10272e5b6d6dSopenharmony_ci    UBool invert = false;
10282e5b6d6dSopenharmony_ci
10292e5b6d6dSopenharmony_ci    if (U_FAILURE(ec)) return *this;
10302e5b6d6dSopenharmony_ci
10312e5b6d6dSopenharmony_ci    // Minimum length is 5 characters, e.g. \p{L}
10322e5b6d6dSopenharmony_ci    if ((pos+5) > pattern.length()) {
10332e5b6d6dSopenharmony_ci        FAIL(ec);
10342e5b6d6dSopenharmony_ci    }
10352e5b6d6dSopenharmony_ci
10362e5b6d6dSopenharmony_ci    // On entry, ppos should point to one of the following locations:
10372e5b6d6dSopenharmony_ci    // Look for an opening [:, [:^, \p, or \P
10382e5b6d6dSopenharmony_ci    if (isPOSIXOpen(pattern, pos)) {
10392e5b6d6dSopenharmony_ci        posix = true;
10402e5b6d6dSopenharmony_ci        pos += 2;
10412e5b6d6dSopenharmony_ci        pos = ICU_Utility::skipWhitespace(pattern, pos);
10422e5b6d6dSopenharmony_ci        if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
10432e5b6d6dSopenharmony_ci            ++pos;
10442e5b6d6dSopenharmony_ci            invert = true;
10452e5b6d6dSopenharmony_ci        }
10462e5b6d6dSopenharmony_ci    } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
10472e5b6d6dSopenharmony_ci        UChar c = pattern.charAt(pos+1);
10482e5b6d6dSopenharmony_ci        invert = (c == u'P');
10492e5b6d6dSopenharmony_ci        isName = (c == u'N');
10502e5b6d6dSopenharmony_ci        pos += 2;
10512e5b6d6dSopenharmony_ci        pos = ICU_Utility::skipWhitespace(pattern, pos);
10522e5b6d6dSopenharmony_ci        if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
10532e5b6d6dSopenharmony_ci            // Syntax error; "\p" or "\P" not followed by "{"
10542e5b6d6dSopenharmony_ci            FAIL(ec);
10552e5b6d6dSopenharmony_ci        }
10562e5b6d6dSopenharmony_ci    } else {
10572e5b6d6dSopenharmony_ci        // Open delimiter not seen
10582e5b6d6dSopenharmony_ci        FAIL(ec);
10592e5b6d6dSopenharmony_ci    }
10602e5b6d6dSopenharmony_ci
10612e5b6d6dSopenharmony_ci    // Look for the matching close delimiter, either :] or }
10622e5b6d6dSopenharmony_ci    int32_t close;
10632e5b6d6dSopenharmony_ci    if (posix) {
10642e5b6d6dSopenharmony_ci      close = pattern.indexOf(u":]", 2, pos);
10652e5b6d6dSopenharmony_ci    } else {
10662e5b6d6dSopenharmony_ci      close = pattern.indexOf(u'}', pos);
10672e5b6d6dSopenharmony_ci    }
10682e5b6d6dSopenharmony_ci    if (close < 0) {
10692e5b6d6dSopenharmony_ci        // Syntax error; close delimiter missing
10702e5b6d6dSopenharmony_ci        FAIL(ec);
10712e5b6d6dSopenharmony_ci    }
10722e5b6d6dSopenharmony_ci
10732e5b6d6dSopenharmony_ci    // Look for an '=' sign.  If this is present, we will parse a
10742e5b6d6dSopenharmony_ci    // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
10752e5b6d6dSopenharmony_ci    // pattern.
10762e5b6d6dSopenharmony_ci    int32_t equals = pattern.indexOf(u'=', pos);
10772e5b6d6dSopenharmony_ci    UnicodeString propName, valueName;
10782e5b6d6dSopenharmony_ci    if (equals >= 0 && equals < close && !isName) {
10792e5b6d6dSopenharmony_ci        // Equals seen; parse medium/long pattern
10802e5b6d6dSopenharmony_ci        pattern.extractBetween(pos, equals, propName);
10812e5b6d6dSopenharmony_ci        pattern.extractBetween(equals+1, close, valueName);
10822e5b6d6dSopenharmony_ci    }
10832e5b6d6dSopenharmony_ci
10842e5b6d6dSopenharmony_ci    else {
10852e5b6d6dSopenharmony_ci        // Handle case where no '=' is seen, and \N{}
10862e5b6d6dSopenharmony_ci        pattern.extractBetween(pos, close, propName);
10872e5b6d6dSopenharmony_ci
10882e5b6d6dSopenharmony_ci        // Handle \N{name}
10892e5b6d6dSopenharmony_ci        if (isName) {
10902e5b6d6dSopenharmony_ci            // This is a little inefficient since it means we have to
10912e5b6d6dSopenharmony_ci            // parse NAME_PROP back to UCHAR_NAME even though we already
10922e5b6d6dSopenharmony_ci            // know it's UCHAR_NAME.  If we refactor the API to
10932e5b6d6dSopenharmony_ci            // support args of (UProperty, char*) then we can remove
10942e5b6d6dSopenharmony_ci            // NAME_PROP and make this a little more efficient.
10952e5b6d6dSopenharmony_ci            valueName = propName;
10962e5b6d6dSopenharmony_ci            propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
10972e5b6d6dSopenharmony_ci        }
10982e5b6d6dSopenharmony_ci    }
10992e5b6d6dSopenharmony_ci
11002e5b6d6dSopenharmony_ci    applyPropertyAlias(propName, valueName, ec);
11012e5b6d6dSopenharmony_ci
11022e5b6d6dSopenharmony_ci    if (U_SUCCESS(ec)) {
11032e5b6d6dSopenharmony_ci        if (invert) {
11042e5b6d6dSopenharmony_ci            complement().removeAllStrings();  // code point complement
11052e5b6d6dSopenharmony_ci        }
11062e5b6d6dSopenharmony_ci
11072e5b6d6dSopenharmony_ci        // Move to the limit position after the close delimiter if the
11082e5b6d6dSopenharmony_ci        // parse succeeded.
11092e5b6d6dSopenharmony_ci        ppos.setIndex(close + (posix ? 2 : 1));
11102e5b6d6dSopenharmony_ci    }
11112e5b6d6dSopenharmony_ci
11122e5b6d6dSopenharmony_ci    return *this;
11132e5b6d6dSopenharmony_ci}
11142e5b6d6dSopenharmony_ci
11152e5b6d6dSopenharmony_ci/**
11162e5b6d6dSopenharmony_ci * Parse a property pattern.
11172e5b6d6dSopenharmony_ci * @param chars iterator over the pattern characters.  Upon return
11182e5b6d6dSopenharmony_ci * it will be advanced to the first character after the parsed
11192e5b6d6dSopenharmony_ci * pattern, or the end of the iteration if all characters are
11202e5b6d6dSopenharmony_ci * parsed.
11212e5b6d6dSopenharmony_ci * @param rebuiltPat the pattern that was parsed, rebuilt or
11222e5b6d6dSopenharmony_ci * copied from the input pattern, as appropriate.
11232e5b6d6dSopenharmony_ci */
11242e5b6d6dSopenharmony_civoid UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
11252e5b6d6dSopenharmony_ci                                      UnicodeString& rebuiltPat,
11262e5b6d6dSopenharmony_ci                                      UErrorCode& ec) {
11272e5b6d6dSopenharmony_ci    if (U_FAILURE(ec)) return;
11282e5b6d6dSopenharmony_ci    UnicodeString pattern;
11292e5b6d6dSopenharmony_ci    chars.lookahead(pattern);
11302e5b6d6dSopenharmony_ci    ParsePosition pos(0);
11312e5b6d6dSopenharmony_ci    applyPropertyPattern(pattern, pos, ec);
11322e5b6d6dSopenharmony_ci    if (U_FAILURE(ec)) return;
11332e5b6d6dSopenharmony_ci    if (pos.getIndex() == 0) {
11342e5b6d6dSopenharmony_ci        // syntaxError(chars, "Invalid property pattern");
11352e5b6d6dSopenharmony_ci        ec = U_MALFORMED_SET;
11362e5b6d6dSopenharmony_ci        return;
11372e5b6d6dSopenharmony_ci    }
11382e5b6d6dSopenharmony_ci    chars.jumpahead(pos.getIndex());
11392e5b6d6dSopenharmony_ci    rebuiltPat.append(pattern, 0, pos.getIndex());
11402e5b6d6dSopenharmony_ci}
11412e5b6d6dSopenharmony_ci
11422e5b6d6dSopenharmony_ciU_NAMESPACE_END
1143