// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  file:  rbbiscan.cpp
//
//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains the Rule Based Break Iterator Rule Builder functions for
//                   scanning the rules and assembling a parse tree.  This is the first phase
//                   of compiling the rules.
//
//  The overall parsing of the rules is managed by class RBBIRuleBuilder, which will
//  create and use an instance of this class as part of the process.
//

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "cmemory.h"
#include "cstring.h"

#include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
                       //   generated by a Perl script.
322e5b6d6dSopenharmony_ci#include "rbbirb.h" 332e5b6d6dSopenharmony_ci#include "rbbinode.h" 342e5b6d6dSopenharmony_ci#include "rbbiscan.h" 352e5b6d6dSopenharmony_ci#include "rbbitblb.h" 362e5b6d6dSopenharmony_ci 372e5b6d6dSopenharmony_ci#include "uassert.h" 382e5b6d6dSopenharmony_ci 392e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 402e5b6d6dSopenharmony_ci// 412e5b6d6dSopenharmony_ci// Unicode Set init strings for each of the character classes needed for parsing a rule file. 422e5b6d6dSopenharmony_ci// (Initialized with hex values for portability to EBCDIC based machines. 432e5b6d6dSopenharmony_ci// Really ugly, but there's no good way to avoid it.) 442e5b6d6dSopenharmony_ci// 452e5b6d6dSopenharmony_ci// The sets are referred to by name in the rbbirpt.txt, which is the 462e5b6d6dSopenharmony_ci// source form of the state transition table for the RBBI rule parser. 472e5b6d6dSopenharmony_ci// 482e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 492e5b6d6dSopenharmony_cistatic const UChar gRuleSet_rule_char_pattern[] = { 502e5b6d6dSopenharmony_ci // Characters that may appear as literals in patterns without escaping or quoting. 
512e5b6d6dSopenharmony_ci // [ ^ [ \ p { Z } \ u 0 0 2 0 522e5b6d6dSopenharmony_ci 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 532e5b6d6dSopenharmony_ci // - \ u 0 0 7 f ] - [ \ p 542e5b6d6dSopenharmony_ci 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 552e5b6d6dSopenharmony_ci // { L } ] - [ \ p { N } ] ] 562e5b6d6dSopenharmony_ci 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 572e5b6d6dSopenharmony_ci 582e5b6d6dSopenharmony_cistatic const UChar gRuleSet_name_char_pattern[] = { 592e5b6d6dSopenharmony_ci// [ _ \ p { L } \ p { N } ] 602e5b6d6dSopenharmony_ci 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 612e5b6d6dSopenharmony_ci 622e5b6d6dSopenharmony_cistatic const UChar gRuleSet_digit_char_pattern[] = { 632e5b6d6dSopenharmony_ci// [ 0 - 9 ] 642e5b6d6dSopenharmony_ci 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 652e5b6d6dSopenharmony_ci 662e5b6d6dSopenharmony_cistatic const UChar gRuleSet_name_start_char_pattern[] = { 672e5b6d6dSopenharmony_ci// [ _ \ p { L } ] 682e5b6d6dSopenharmony_ci 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 692e5b6d6dSopenharmony_ci 702e5b6d6dSopenharmony_cistatic const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 712e5b6d6dSopenharmony_ci 722e5b6d6dSopenharmony_ci 732e5b6d6dSopenharmony_ciU_CDECL_BEGIN 742e5b6d6dSopenharmony_cistatic void U_CALLCONV RBBISetTable_deleter(void *p) { 752e5b6d6dSopenharmony_ci icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; 762e5b6d6dSopenharmony_ci delete px->key; 772e5b6d6dSopenharmony_ci // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 782e5b6d6dSopenharmony_ci // Don't delete the value nodes here. 
792e5b6d6dSopenharmony_ci uprv_free(px); 802e5b6d6dSopenharmony_ci} 812e5b6d6dSopenharmony_ciU_CDECL_END 822e5b6d6dSopenharmony_ci 832e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN 842e5b6d6dSopenharmony_ci 852e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 862e5b6d6dSopenharmony_ci// 872e5b6d6dSopenharmony_ci// Constructor. 882e5b6d6dSopenharmony_ci// 892e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 902e5b6d6dSopenharmony_ciRBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 912e5b6d6dSopenharmony_ci{ 922e5b6d6dSopenharmony_ci fRB = rb; 932e5b6d6dSopenharmony_ci fScanIndex = 0; 942e5b6d6dSopenharmony_ci fNextIndex = 0; 952e5b6d6dSopenharmony_ci fQuoteMode = false; 962e5b6d6dSopenharmony_ci fLineNum = 1; 972e5b6d6dSopenharmony_ci fCharNum = 0; 982e5b6d6dSopenharmony_ci fLastChar = 0; 992e5b6d6dSopenharmony_ci 1002e5b6d6dSopenharmony_ci fStateTable = NULL; 1012e5b6d6dSopenharmony_ci fStack[0] = 0; 1022e5b6d6dSopenharmony_ci fStackPtr = 0; 1032e5b6d6dSopenharmony_ci fNodeStack[0] = NULL; 1042e5b6d6dSopenharmony_ci fNodeStackPtr = 0; 1052e5b6d6dSopenharmony_ci 1062e5b6d6dSopenharmony_ci fReverseRule = false; 1072e5b6d6dSopenharmony_ci fLookAheadRule = false; 1082e5b6d6dSopenharmony_ci fNoChainInRule = false; 1092e5b6d6dSopenharmony_ci 1102e5b6d6dSopenharmony_ci fSymbolTable = NULL; 1112e5b6d6dSopenharmony_ci fSetTable = NULL; 1122e5b6d6dSopenharmony_ci fRuleNum = 0; 1132e5b6d6dSopenharmony_ci fOptionStart = 0; 1142e5b6d6dSopenharmony_ci 1152e5b6d6dSopenharmony_ci // Do not check status until after all critical fields are sufficiently initialized 1162e5b6d6dSopenharmony_ci // that the destructor can run cleanly. 1172e5b6d6dSopenharmony_ci if (U_FAILURE(*rb->fStatus)) { 1182e5b6d6dSopenharmony_ci return; 1192e5b6d6dSopenharmony_ci } 1202e5b6d6dSopenharmony_ci 1212e5b6d6dSopenharmony_ci // 1222e5b6d6dSopenharmony_ci // Set up the constant Unicode Sets. 
1232e5b6d6dSopenharmony_ci // Note: These could be made static, lazily initialized, and shared among 1242e5b6d6dSopenharmony_ci // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 1252e5b6d6dSopenharmony_ci // and the time to build these few sets should be small compared to a 1262e5b6d6dSopenharmony_ci // full break iterator build. 1272e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_rule_char-128] 1282e5b6d6dSopenharmony_ci = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); 1292e5b6d6dSopenharmony_ci // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] 1302e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_white_space-128]. 1312e5b6d6dSopenharmony_ci add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 1322e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_name_char-128] 1332e5b6d6dSopenharmony_ci = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus); 1342e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_name_start_char-128] 1352e5b6d6dSopenharmony_ci = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); 1362e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_digit_char-128] 1372e5b6d6dSopenharmony_ci = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); 1382e5b6d6dSopenharmony_ci if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 1392e5b6d6dSopenharmony_ci // This case happens if ICU's data is missing. UnicodeSet tries to look up property 1402e5b6d6dSopenharmony_ci // names from the init string, can't find them, and claims an illegal argument. 1412e5b6d6dSopenharmony_ci // Change the error so that the actual problem will be clearer to users. 
1422e5b6d6dSopenharmony_ci *rb->fStatus = U_BRK_INIT_ERROR; 1432e5b6d6dSopenharmony_ci } 1442e5b6d6dSopenharmony_ci if (U_FAILURE(*rb->fStatus)) { 1452e5b6d6dSopenharmony_ci return; 1462e5b6d6dSopenharmony_ci } 1472e5b6d6dSopenharmony_ci 1482e5b6d6dSopenharmony_ci fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 1492e5b6d6dSopenharmony_ci if (fSymbolTable == NULL) { 1502e5b6d6dSopenharmony_ci *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 1512e5b6d6dSopenharmony_ci return; 1522e5b6d6dSopenharmony_ci } 1532e5b6d6dSopenharmony_ci fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); 1542e5b6d6dSopenharmony_ci if (U_FAILURE(*rb->fStatus)) { 1552e5b6d6dSopenharmony_ci return; 1562e5b6d6dSopenharmony_ci } 1572e5b6d6dSopenharmony_ci uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 1582e5b6d6dSopenharmony_ci} 1592e5b6d6dSopenharmony_ci 1602e5b6d6dSopenharmony_ci 1612e5b6d6dSopenharmony_ci 1622e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 1632e5b6d6dSopenharmony_ci// 1642e5b6d6dSopenharmony_ci// Destructor 1652e5b6d6dSopenharmony_ci// 1662e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 1672e5b6d6dSopenharmony_ciRBBIRuleScanner::~RBBIRuleScanner() { 1682e5b6d6dSopenharmony_ci delete fSymbolTable; 1692e5b6d6dSopenharmony_ci if (fSetTable != NULL) { 1702e5b6d6dSopenharmony_ci uhash_close(fSetTable); 1712e5b6d6dSopenharmony_ci fSetTable = NULL; 1722e5b6d6dSopenharmony_ci 1732e5b6d6dSopenharmony_ci } 1742e5b6d6dSopenharmony_ci 1752e5b6d6dSopenharmony_ci 1762e5b6d6dSopenharmony_ci // Node Stack. 1772e5b6d6dSopenharmony_ci // Normally has one entry, which is the entire parse tree for the rules. 1782e5b6d6dSopenharmony_ci // If errors occurred, there may be additional subtrees left on the stack. 
1792e5b6d6dSopenharmony_ci while (fNodeStackPtr > 0) { 1802e5b6d6dSopenharmony_ci delete fNodeStack[fNodeStackPtr]; 1812e5b6d6dSopenharmony_ci fNodeStackPtr--; 1822e5b6d6dSopenharmony_ci } 1832e5b6d6dSopenharmony_ci 1842e5b6d6dSopenharmony_ci} 1852e5b6d6dSopenharmony_ci 1862e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 1872e5b6d6dSopenharmony_ci// 1882e5b6d6dSopenharmony_ci// doParseAction Do some action during rule parsing. 1892e5b6d6dSopenharmony_ci// Called by the parse state machine. 1902e5b6d6dSopenharmony_ci// Actions build the parse tree and Unicode Sets, 1912e5b6d6dSopenharmony_ci// and maintain the parse stack for nested expressions. 1922e5b6d6dSopenharmony_ci// 1932e5b6d6dSopenharmony_ci// TODO: unify EParseAction and RBBI_RuleParseAction enum types. 1942e5b6d6dSopenharmony_ci// They represent exactly the same thing. They're separate 1952e5b6d6dSopenharmony_ci// only to work around enum forward declaration restrictions 1962e5b6d6dSopenharmony_ci// in some compilers, while at the same time avoiding multiple 1972e5b6d6dSopenharmony_ci// definitions problems. I'm sure that there's a better way. 1982e5b6d6dSopenharmony_ci// 1992e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 2002e5b6d6dSopenharmony_ciUBool RBBIRuleScanner::doParseActions(int32_t action) 2012e5b6d6dSopenharmony_ci{ 2022e5b6d6dSopenharmony_ci RBBINode *n = NULL; 2032e5b6d6dSopenharmony_ci 2042e5b6d6dSopenharmony_ci UBool returnVal = true; 2052e5b6d6dSopenharmony_ci 2062e5b6d6dSopenharmony_ci switch (action) { 2072e5b6d6dSopenharmony_ci 2082e5b6d6dSopenharmony_ci case doExprStart: 2092e5b6d6dSopenharmony_ci pushNewNode(RBBINode::opStart); 2102e5b6d6dSopenharmony_ci fRuleNum++; 2112e5b6d6dSopenharmony_ci break; 2122e5b6d6dSopenharmony_ci 2132e5b6d6dSopenharmony_ci 2142e5b6d6dSopenharmony_ci case doNoChain: 2152e5b6d6dSopenharmony_ci // Scanned a '^' while on the rule start state. 
2162e5b6d6dSopenharmony_ci fNoChainInRule = true; 2172e5b6d6dSopenharmony_ci break; 2182e5b6d6dSopenharmony_ci 2192e5b6d6dSopenharmony_ci 2202e5b6d6dSopenharmony_ci case doExprOrOperator: 2212e5b6d6dSopenharmony_ci { 2222e5b6d6dSopenharmony_ci fixOpStack(RBBINode::precOpCat); 2232e5b6d6dSopenharmony_ci RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 2242e5b6d6dSopenharmony_ci RBBINode *orNode = pushNewNode(RBBINode::opOr); 2252e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 2262e5b6d6dSopenharmony_ci break; 2272e5b6d6dSopenharmony_ci } 2282e5b6d6dSopenharmony_ci orNode->fLeftChild = operandNode; 2292e5b6d6dSopenharmony_ci operandNode->fParent = orNode; 2302e5b6d6dSopenharmony_ci } 2312e5b6d6dSopenharmony_ci break; 2322e5b6d6dSopenharmony_ci 2332e5b6d6dSopenharmony_ci case doExprCatOperator: 2342e5b6d6dSopenharmony_ci // concatenation operator. 2352e5b6d6dSopenharmony_ci // For the implicit concatenation of adjacent terms in an expression that are 2362e5b6d6dSopenharmony_ci // not separated by any other operator. Action is invoked between the 2372e5b6d6dSopenharmony_ci // actions for the two terms. 2382e5b6d6dSopenharmony_ci { 2392e5b6d6dSopenharmony_ci fixOpStack(RBBINode::precOpCat); 2402e5b6d6dSopenharmony_ci RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 2412e5b6d6dSopenharmony_ci RBBINode *catNode = pushNewNode(RBBINode::opCat); 2422e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 2432e5b6d6dSopenharmony_ci break; 2442e5b6d6dSopenharmony_ci } 2452e5b6d6dSopenharmony_ci catNode->fLeftChild = operandNode; 2462e5b6d6dSopenharmony_ci operandNode->fParent = catNode; 2472e5b6d6dSopenharmony_ci } 2482e5b6d6dSopenharmony_ci break; 2492e5b6d6dSopenharmony_ci 2502e5b6d6dSopenharmony_ci case doLParen: 2512e5b6d6dSopenharmony_ci // Open Paren. 
2522e5b6d6dSopenharmony_ci // The openParen node is a dummy operation type with a low precedence, 2532e5b6d6dSopenharmony_ci // which has the affect of ensuring that any real binary op that 2542e5b6d6dSopenharmony_ci // follows within the parens binds more tightly to the operands than 2552e5b6d6dSopenharmony_ci // stuff outside of the parens. 2562e5b6d6dSopenharmony_ci pushNewNode(RBBINode::opLParen); 2572e5b6d6dSopenharmony_ci break; 2582e5b6d6dSopenharmony_ci 2592e5b6d6dSopenharmony_ci case doExprRParen: 2602e5b6d6dSopenharmony_ci fixOpStack(RBBINode::precLParen); 2612e5b6d6dSopenharmony_ci break; 2622e5b6d6dSopenharmony_ci 2632e5b6d6dSopenharmony_ci case doNOP: 2642e5b6d6dSopenharmony_ci break; 2652e5b6d6dSopenharmony_ci 2662e5b6d6dSopenharmony_ci case doStartAssign: 2672e5b6d6dSopenharmony_ci // We've just scanned "$variable = " 2682e5b6d6dSopenharmony_ci // The top of the node stack has the $variable ref node. 2692e5b6d6dSopenharmony_ci 2702e5b6d6dSopenharmony_ci // Save the start position of the RHS text in the StartExpression node 2712e5b6d6dSopenharmony_ci // that precedes the $variableReference node on the stack. 2722e5b6d6dSopenharmony_ci // This will eventually be used when saving the full $variable replacement 2732e5b6d6dSopenharmony_ci // text as a string. 2742e5b6d6dSopenharmony_ci n = fNodeStack[fNodeStackPtr-1]; 2752e5b6d6dSopenharmony_ci n->fFirstPos = fNextIndex; // move past the '=' 2762e5b6d6dSopenharmony_ci 2772e5b6d6dSopenharmony_ci // Push a new start-of-expression node; needed to keep parse of the 2782e5b6d6dSopenharmony_ci // RHS expression happy. 2792e5b6d6dSopenharmony_ci pushNewNode(RBBINode::opStart); 2802e5b6d6dSopenharmony_ci break; 2812e5b6d6dSopenharmony_ci 2822e5b6d6dSopenharmony_ci 2832e5b6d6dSopenharmony_ci 2842e5b6d6dSopenharmony_ci 2852e5b6d6dSopenharmony_ci case doEndAssign: 2862e5b6d6dSopenharmony_ci { 2872e5b6d6dSopenharmony_ci // We have reached the end of an assignment statement. 
2882e5b6d6dSopenharmony_ci // Current scan char is the ';' that terminates the assignment. 2892e5b6d6dSopenharmony_ci 2902e5b6d6dSopenharmony_ci // Terminate expression, leaves expression parse tree rooted in TOS node. 2912e5b6d6dSopenharmony_ci fixOpStack(RBBINode::precStart); 2922e5b6d6dSopenharmony_ci 2932e5b6d6dSopenharmony_ci RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 2942e5b6d6dSopenharmony_ci RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 2952e5b6d6dSopenharmony_ci RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 2962e5b6d6dSopenharmony_ci 2972e5b6d6dSopenharmony_ci // Save original text of right side of assignment, excluding the terminating ';' 2982e5b6d6dSopenharmony_ci // in the root of the node for the right-hand-side expression. 2992e5b6d6dSopenharmony_ci RHSExprNode->fFirstPos = startExprNode->fFirstPos; 3002e5b6d6dSopenharmony_ci RHSExprNode->fLastPos = fScanIndex; 3012e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 3022e5b6d6dSopenharmony_ci 3032e5b6d6dSopenharmony_ci // Expression parse tree becomes l. child of the $variable reference node. 3042e5b6d6dSopenharmony_ci varRefNode->fLeftChild = RHSExprNode; 3052e5b6d6dSopenharmony_ci RHSExprNode->fParent = varRefNode; 3062e5b6d6dSopenharmony_ci 3072e5b6d6dSopenharmony_ci // Make a symbol table entry for the $variableRef node. 3082e5b6d6dSopenharmony_ci fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 3092e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 3102e5b6d6dSopenharmony_ci // This is a round-about way to get the parse position set 3112e5b6d6dSopenharmony_ci // so that duplicate symbols error messages include a line number. 3122e5b6d6dSopenharmony_ci UErrorCode t = *fRB->fStatus; 3132e5b6d6dSopenharmony_ci *fRB->fStatus = U_ZERO_ERROR; 3142e5b6d6dSopenharmony_ci error(t); 3152e5b6d6dSopenharmony_ci } 3162e5b6d6dSopenharmony_ci 3172e5b6d6dSopenharmony_ci // Clean up the stack. 
3182e5b6d6dSopenharmony_ci delete startExprNode; 3192e5b6d6dSopenharmony_ci fNodeStackPtr-=3; 3202e5b6d6dSopenharmony_ci break; 3212e5b6d6dSopenharmony_ci } 3222e5b6d6dSopenharmony_ci 3232e5b6d6dSopenharmony_ci case doEndOfRule: 3242e5b6d6dSopenharmony_ci { 3252e5b6d6dSopenharmony_ci fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 3262e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 3272e5b6d6dSopenharmony_ci break; 3282e5b6d6dSopenharmony_ci } 3292e5b6d6dSopenharmony_ci#ifdef RBBI_DEBUG 3302e5b6d6dSopenharmony_ci if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 3312e5b6d6dSopenharmony_ci#endif 3322e5b6d6dSopenharmony_ci U_ASSERT(fNodeStackPtr == 1); 3332e5b6d6dSopenharmony_ci RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 3342e5b6d6dSopenharmony_ci 3352e5b6d6dSopenharmony_ci // If this rule includes a look-ahead '/', add a endMark node to the 3362e5b6d6dSopenharmony_ci // expression tree. 3372e5b6d6dSopenharmony_ci if (fLookAheadRule) { 3382e5b6d6dSopenharmony_ci RBBINode *endNode = pushNewNode(RBBINode::endMark); 3392e5b6d6dSopenharmony_ci RBBINode *catNode = pushNewNode(RBBINode::opCat); 3402e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 3412e5b6d6dSopenharmony_ci break; 3422e5b6d6dSopenharmony_ci } 3432e5b6d6dSopenharmony_ci fNodeStackPtr -= 2; 3442e5b6d6dSopenharmony_ci catNode->fLeftChild = thisRule; 3452e5b6d6dSopenharmony_ci catNode->fRightChild = endNode; 3462e5b6d6dSopenharmony_ci fNodeStack[fNodeStackPtr] = catNode; 3472e5b6d6dSopenharmony_ci endNode->fVal = fRuleNum; 3482e5b6d6dSopenharmony_ci endNode->fLookAheadEnd = true; 3492e5b6d6dSopenharmony_ci thisRule = catNode; 3502e5b6d6dSopenharmony_ci 3512e5b6d6dSopenharmony_ci // TODO: Disable chaining out of look-ahead (hard break) rules. 
3522e5b6d6dSopenharmony_ci // The break on rule match is forced, so there is no point in building up 3532e5b6d6dSopenharmony_ci // the state table to chain into another rule for a longer match. 3542e5b6d6dSopenharmony_ci } 3552e5b6d6dSopenharmony_ci 3562e5b6d6dSopenharmony_ci // Mark this node as being the root of a rule. 3572e5b6d6dSopenharmony_ci thisRule->fRuleRoot = true; 3582e5b6d6dSopenharmony_ci 3592e5b6d6dSopenharmony_ci // Flag if chaining into this rule is wanted. 3602e5b6d6dSopenharmony_ci // 3612e5b6d6dSopenharmony_ci if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain 3622e5b6d6dSopenharmony_ci !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 3632e5b6d6dSopenharmony_ci thisRule->fChainIn = true; 3642e5b6d6dSopenharmony_ci } 3652e5b6d6dSopenharmony_ci 3662e5b6d6dSopenharmony_ci 3672e5b6d6dSopenharmony_ci // All rule expressions are ORed together. 3682e5b6d6dSopenharmony_ci // The ';' that terminates an expression really just functions as a '|' with 3692e5b6d6dSopenharmony_ci // a low operator prededence. 3702e5b6d6dSopenharmony_ci // 3712e5b6d6dSopenharmony_ci // Each of the four sets of rules are collected separately. 3722e5b6d6dSopenharmony_ci // (forward, reverse, safe_forward, safe_reverse) 3732e5b6d6dSopenharmony_ci // OR this rule into the appropriate group of them. 3742e5b6d6dSopenharmony_ci // 3752e5b6d6dSopenharmony_ci RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree); 3762e5b6d6dSopenharmony_ci 3772e5b6d6dSopenharmony_ci if (*destRules != NULL) { 3782e5b6d6dSopenharmony_ci // This is not the first rule encountered. 
3792e5b6d6dSopenharmony_ci // OR previous stuff (from *destRules) 3802e5b6d6dSopenharmony_ci // with the current rule expression (on the Node Stack) 3812e5b6d6dSopenharmony_ci // with the resulting OR expression going to *destRules 3822e5b6d6dSopenharmony_ci // 3832e5b6d6dSopenharmony_ci thisRule = fNodeStack[fNodeStackPtr]; 3842e5b6d6dSopenharmony_ci RBBINode *prevRules = *destRules; 3852e5b6d6dSopenharmony_ci RBBINode *orNode = pushNewNode(RBBINode::opOr); 3862e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 3872e5b6d6dSopenharmony_ci break; 3882e5b6d6dSopenharmony_ci } 3892e5b6d6dSopenharmony_ci orNode->fLeftChild = prevRules; 3902e5b6d6dSopenharmony_ci prevRules->fParent = orNode; 3912e5b6d6dSopenharmony_ci orNode->fRightChild = thisRule; 3922e5b6d6dSopenharmony_ci thisRule->fParent = orNode; 3932e5b6d6dSopenharmony_ci *destRules = orNode; 3942e5b6d6dSopenharmony_ci } 3952e5b6d6dSopenharmony_ci else 3962e5b6d6dSopenharmony_ci { 3972e5b6d6dSopenharmony_ci // This is the first rule encountered (for this direction). 3982e5b6d6dSopenharmony_ci // Just move its parse tree from the stack to *destRules. 3992e5b6d6dSopenharmony_ci *destRules = fNodeStack[fNodeStackPtr]; 4002e5b6d6dSopenharmony_ci } 4012e5b6d6dSopenharmony_ci fReverseRule = false; // in preparation for the next rule. 
4022e5b6d6dSopenharmony_ci fLookAheadRule = false; 4032e5b6d6dSopenharmony_ci fNoChainInRule = false; 4042e5b6d6dSopenharmony_ci fNodeStackPtr = 0; 4052e5b6d6dSopenharmony_ci } 4062e5b6d6dSopenharmony_ci break; 4072e5b6d6dSopenharmony_ci 4082e5b6d6dSopenharmony_ci 4092e5b6d6dSopenharmony_ci case doRuleError: 4102e5b6d6dSopenharmony_ci error(U_BRK_RULE_SYNTAX); 4112e5b6d6dSopenharmony_ci returnVal = false; 4122e5b6d6dSopenharmony_ci break; 4132e5b6d6dSopenharmony_ci 4142e5b6d6dSopenharmony_ci 4152e5b6d6dSopenharmony_ci case doVariableNameExpectedErr: 4162e5b6d6dSopenharmony_ci error(U_BRK_RULE_SYNTAX); 4172e5b6d6dSopenharmony_ci break; 4182e5b6d6dSopenharmony_ci 4192e5b6d6dSopenharmony_ci 4202e5b6d6dSopenharmony_ci // 4212e5b6d6dSopenharmony_ci // Unary operands + ? * 4222e5b6d6dSopenharmony_ci // These all appear after the operand to which they apply. 4232e5b6d6dSopenharmony_ci // When we hit one, the operand (may be a whole sub expression) 4242e5b6d6dSopenharmony_ci // will be on the top of the stack. 4252e5b6d6dSopenharmony_ci // Unary Operator becomes TOS, with the old TOS as its one child. 
4262e5b6d6dSopenharmony_ci case doUnaryOpPlus: 4272e5b6d6dSopenharmony_ci { 4282e5b6d6dSopenharmony_ci RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 4292e5b6d6dSopenharmony_ci RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 4302e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 4312e5b6d6dSopenharmony_ci break; 4322e5b6d6dSopenharmony_ci } 4332e5b6d6dSopenharmony_ci plusNode->fLeftChild = operandNode; 4342e5b6d6dSopenharmony_ci operandNode->fParent = plusNode; 4352e5b6d6dSopenharmony_ci } 4362e5b6d6dSopenharmony_ci break; 4372e5b6d6dSopenharmony_ci 4382e5b6d6dSopenharmony_ci case doUnaryOpQuestion: 4392e5b6d6dSopenharmony_ci { 4402e5b6d6dSopenharmony_ci RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 4412e5b6d6dSopenharmony_ci RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 4422e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 4432e5b6d6dSopenharmony_ci break; 4442e5b6d6dSopenharmony_ci } 4452e5b6d6dSopenharmony_ci qNode->fLeftChild = operandNode; 4462e5b6d6dSopenharmony_ci operandNode->fParent = qNode; 4472e5b6d6dSopenharmony_ci } 4482e5b6d6dSopenharmony_ci break; 4492e5b6d6dSopenharmony_ci 4502e5b6d6dSopenharmony_ci case doUnaryOpStar: 4512e5b6d6dSopenharmony_ci { 4522e5b6d6dSopenharmony_ci RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 4532e5b6d6dSopenharmony_ci RBBINode *starNode = pushNewNode(RBBINode::opStar); 4542e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 4552e5b6d6dSopenharmony_ci break; 4562e5b6d6dSopenharmony_ci } 4572e5b6d6dSopenharmony_ci starNode->fLeftChild = operandNode; 4582e5b6d6dSopenharmony_ci operandNode->fParent = starNode; 4592e5b6d6dSopenharmony_ci } 4602e5b6d6dSopenharmony_ci break; 4612e5b6d6dSopenharmony_ci 4622e5b6d6dSopenharmony_ci case doRuleChar: 4632e5b6d6dSopenharmony_ci // A "Rule Character" is any single character that is a literal part 4642e5b6d6dSopenharmony_ci // of the regular expression. 
Like a, b and c in the expression "(abc*) | [:L:]" 4652e5b6d6dSopenharmony_ci // These are pretty uncommon in break rules; the terms are more commonly 4662e5b6d6dSopenharmony_ci // sets. To keep things uniform, treat these characters like as 4672e5b6d6dSopenharmony_ci // sets that just happen to contain only one character. 4682e5b6d6dSopenharmony_ci { 4692e5b6d6dSopenharmony_ci n = pushNewNode(RBBINode::setRef); 4702e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 4712e5b6d6dSopenharmony_ci break; 4722e5b6d6dSopenharmony_ci } 4732e5b6d6dSopenharmony_ci findSetFor(UnicodeString(fC.fChar), n); 4742e5b6d6dSopenharmony_ci n->fFirstPos = fScanIndex; 4752e5b6d6dSopenharmony_ci n->fLastPos = fNextIndex; 4762e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 4772e5b6d6dSopenharmony_ci break; 4782e5b6d6dSopenharmony_ci } 4792e5b6d6dSopenharmony_ci 4802e5b6d6dSopenharmony_ci case doDotAny: 4812e5b6d6dSopenharmony_ci // scanned a ".", meaning match any single character. 4822e5b6d6dSopenharmony_ci { 4832e5b6d6dSopenharmony_ci n = pushNewNode(RBBINode::setRef); 4842e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 4852e5b6d6dSopenharmony_ci break; 4862e5b6d6dSopenharmony_ci } 4872e5b6d6dSopenharmony_ci findSetFor(UnicodeString(true, kAny, 3), n); 4882e5b6d6dSopenharmony_ci n->fFirstPos = fScanIndex; 4892e5b6d6dSopenharmony_ci n->fLastPos = fNextIndex; 4902e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 4912e5b6d6dSopenharmony_ci break; 4922e5b6d6dSopenharmony_ci } 4932e5b6d6dSopenharmony_ci 4942e5b6d6dSopenharmony_ci case doSlash: 4952e5b6d6dSopenharmony_ci // Scanned a '/', which identifies a look-ahead break position in a rule. 
4962e5b6d6dSopenharmony_ci n = pushNewNode(RBBINode::lookAhead); 4972e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 4982e5b6d6dSopenharmony_ci break; 4992e5b6d6dSopenharmony_ci } 5002e5b6d6dSopenharmony_ci n->fVal = fRuleNum; 5012e5b6d6dSopenharmony_ci n->fFirstPos = fScanIndex; 5022e5b6d6dSopenharmony_ci n->fLastPos = fNextIndex; 5032e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 5042e5b6d6dSopenharmony_ci fLookAheadRule = true; 5052e5b6d6dSopenharmony_ci break; 5062e5b6d6dSopenharmony_ci 5072e5b6d6dSopenharmony_ci 5082e5b6d6dSopenharmony_ci case doStartTagValue: 5092e5b6d6dSopenharmony_ci // Scanned a '{', the opening delimiter for a tag value within a rule. 5102e5b6d6dSopenharmony_ci n = pushNewNode(RBBINode::tag); 5112e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 5122e5b6d6dSopenharmony_ci break; 5132e5b6d6dSopenharmony_ci } 5142e5b6d6dSopenharmony_ci n->fVal = 0; 5152e5b6d6dSopenharmony_ci n->fFirstPos = fScanIndex; 5162e5b6d6dSopenharmony_ci n->fLastPos = fNextIndex; 5172e5b6d6dSopenharmony_ci break; 5182e5b6d6dSopenharmony_ci 5192e5b6d6dSopenharmony_ci case doTagDigit: 5202e5b6d6dSopenharmony_ci // Just scanned a decimal digit that's part of a tag value 5212e5b6d6dSopenharmony_ci { 5222e5b6d6dSopenharmony_ci n = fNodeStack[fNodeStackPtr]; 5232e5b6d6dSopenharmony_ci uint32_t v = u_charDigitValue(fC.fChar); 5242e5b6d6dSopenharmony_ci U_ASSERT(v < 10); 5252e5b6d6dSopenharmony_ci n->fVal = n->fVal*10 + v; 5262e5b6d6dSopenharmony_ci break; 5272e5b6d6dSopenharmony_ci } 5282e5b6d6dSopenharmony_ci 5292e5b6d6dSopenharmony_ci case doTagValue: 5302e5b6d6dSopenharmony_ci n = fNodeStack[fNodeStackPtr]; 5312e5b6d6dSopenharmony_ci n->fLastPos = fNextIndex; 5322e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 5332e5b6d6dSopenharmony_ci break; 5342e5b6d6dSopenharmony_ci 5352e5b6d6dSopenharmony_ci case doTagExpectedError: 5362e5b6d6dSopenharmony_ci error(U_BRK_MALFORMED_RULE_TAG); 
5372e5b6d6dSopenharmony_ci returnVal = false; 5382e5b6d6dSopenharmony_ci break; 5392e5b6d6dSopenharmony_ci 5402e5b6d6dSopenharmony_ci case doOptionStart: 5412e5b6d6dSopenharmony_ci // Scanning a !!option. At the start of string. 5422e5b6d6dSopenharmony_ci fOptionStart = fScanIndex; 5432e5b6d6dSopenharmony_ci break; 5442e5b6d6dSopenharmony_ci 5452e5b6d6dSopenharmony_ci case doOptionEnd: 5462e5b6d6dSopenharmony_ci { 5472e5b6d6dSopenharmony_ci UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 5482e5b6d6dSopenharmony_ci if (opt == UNICODE_STRING("chain", 5)) { 5492e5b6d6dSopenharmony_ci fRB->fChainRules = true; 5502e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { 5512e5b6d6dSopenharmony_ci fRB->fLBCMNoChain = true; 5522e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("forward", 7)) { 5532e5b6d6dSopenharmony_ci fRB->fDefaultTree = &fRB->fForwardTree; 5542e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("reverse", 7)) { 5552e5b6d6dSopenharmony_ci fRB->fDefaultTree = &fRB->fReverseTree; 5562e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("safe_forward", 12)) { 5572e5b6d6dSopenharmony_ci fRB->fDefaultTree = &fRB->fSafeFwdTree; 5582e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 5592e5b6d6dSopenharmony_ci fRB->fDefaultTree = &fRB->fSafeRevTree; 5602e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 5612e5b6d6dSopenharmony_ci fRB->fLookAheadHardBreak = true; 5622e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) { 5632e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_rule_char-128].clear(); 5642e5b6d6dSopenharmony_ci } else if (opt == UNICODE_STRING("unquoted_literals", 17)) { 5652e5b6d6dSopenharmony_ci fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus); 5662e5b6d6dSopenharmony_ci } else { 5672e5b6d6dSopenharmony_ci error(U_BRK_UNRECOGNIZED_OPTION); 
5682e5b6d6dSopenharmony_ci } 5692e5b6d6dSopenharmony_ci } 5702e5b6d6dSopenharmony_ci break; 5712e5b6d6dSopenharmony_ci 5722e5b6d6dSopenharmony_ci case doReverseDir: 5732e5b6d6dSopenharmony_ci fReverseRule = true; 5742e5b6d6dSopenharmony_ci break; 5752e5b6d6dSopenharmony_ci 5762e5b6d6dSopenharmony_ci case doStartVariableName: 5772e5b6d6dSopenharmony_ci n = pushNewNode(RBBINode::varRef); 5782e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 5792e5b6d6dSopenharmony_ci break; 5802e5b6d6dSopenharmony_ci } 5812e5b6d6dSopenharmony_ci n->fFirstPos = fScanIndex; 5822e5b6d6dSopenharmony_ci break; 5832e5b6d6dSopenharmony_ci 5842e5b6d6dSopenharmony_ci case doEndVariableName: 5852e5b6d6dSopenharmony_ci n = fNodeStack[fNodeStackPtr]; 5862e5b6d6dSopenharmony_ci if (n==NULL || n->fType != RBBINode::varRef) { 5872e5b6d6dSopenharmony_ci error(U_BRK_INTERNAL_ERROR); 5882e5b6d6dSopenharmony_ci break; 5892e5b6d6dSopenharmony_ci } 5902e5b6d6dSopenharmony_ci n->fLastPos = fScanIndex; 5912e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 5922e5b6d6dSopenharmony_ci // Look the newly scanned name up in the symbol table 5932e5b6d6dSopenharmony_ci // If there's an entry, set the l. child of the var ref to the replacement expression. 5942e5b6d6dSopenharmony_ci // (We also pass through here when scanning assignments, but no harm is done, other 5952e5b6d6dSopenharmony_ci // than a slight wasted effort that seems hard to avoid. 
Lookup will be null) 5962e5b6d6dSopenharmony_ci n->fLeftChild = fSymbolTable->lookupNode(n->fText); 5972e5b6d6dSopenharmony_ci break; 5982e5b6d6dSopenharmony_ci 5992e5b6d6dSopenharmony_ci case doCheckVarDef: 6002e5b6d6dSopenharmony_ci n = fNodeStack[fNodeStackPtr]; 6012e5b6d6dSopenharmony_ci if (n->fLeftChild == NULL) { 6022e5b6d6dSopenharmony_ci error(U_BRK_UNDEFINED_VARIABLE); 6032e5b6d6dSopenharmony_ci returnVal = false; 6042e5b6d6dSopenharmony_ci } 6052e5b6d6dSopenharmony_ci break; 6062e5b6d6dSopenharmony_ci 6072e5b6d6dSopenharmony_ci case doExprFinished: 6082e5b6d6dSopenharmony_ci break; 6092e5b6d6dSopenharmony_ci 6102e5b6d6dSopenharmony_ci case doRuleErrorAssignExpr: 6112e5b6d6dSopenharmony_ci error(U_BRK_ASSIGN_ERROR); 6122e5b6d6dSopenharmony_ci returnVal = false; 6132e5b6d6dSopenharmony_ci break; 6142e5b6d6dSopenharmony_ci 6152e5b6d6dSopenharmony_ci case doExit: 6162e5b6d6dSopenharmony_ci returnVal = false; 6172e5b6d6dSopenharmony_ci break; 6182e5b6d6dSopenharmony_ci 6192e5b6d6dSopenharmony_ci case doScanUnicodeSet: 6202e5b6d6dSopenharmony_ci scanSet(); 6212e5b6d6dSopenharmony_ci break; 6222e5b6d6dSopenharmony_ci 6232e5b6d6dSopenharmony_ci default: 6242e5b6d6dSopenharmony_ci error(U_BRK_INTERNAL_ERROR); 6252e5b6d6dSopenharmony_ci returnVal = false; 6262e5b6d6dSopenharmony_ci break; 6272e5b6d6dSopenharmony_ci } 6282e5b6d6dSopenharmony_ci return returnVal && U_SUCCESS(*fRB->fStatus); 6292e5b6d6dSopenharmony_ci} 6302e5b6d6dSopenharmony_ci 6312e5b6d6dSopenharmony_ci 6322e5b6d6dSopenharmony_ci 6332e5b6d6dSopenharmony_ci 6342e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 6352e5b6d6dSopenharmony_ci// 6362e5b6d6dSopenharmony_ci// Error Report a rule parse error. 6372e5b6d6dSopenharmony_ci// Only report it if no previous error has been recorded. 
6382e5b6d6dSopenharmony_ci// 6392e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 6402e5b6d6dSopenharmony_civoid RBBIRuleScanner::error(UErrorCode e) { 6412e5b6d6dSopenharmony_ci if (U_SUCCESS(*fRB->fStatus)) { 6422e5b6d6dSopenharmony_ci *fRB->fStatus = e; 6432e5b6d6dSopenharmony_ci if (fRB->fParseError) { 6442e5b6d6dSopenharmony_ci fRB->fParseError->line = fLineNum; 6452e5b6d6dSopenharmony_ci fRB->fParseError->offset = fCharNum; 6462e5b6d6dSopenharmony_ci fRB->fParseError->preContext[0] = 0; 6472e5b6d6dSopenharmony_ci fRB->fParseError->postContext[0] = 0; 6482e5b6d6dSopenharmony_ci } 6492e5b6d6dSopenharmony_ci } 6502e5b6d6dSopenharmony_ci} 6512e5b6d6dSopenharmony_ci 6522e5b6d6dSopenharmony_ci 6532e5b6d6dSopenharmony_ci 6542e5b6d6dSopenharmony_ci 6552e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 6562e5b6d6dSopenharmony_ci// 6572e5b6d6dSopenharmony_ci// fixOpStack The parse stack holds partially assembled chunks of the parse tree. 6582e5b6d6dSopenharmony_ci// An entry on the stack may be as small as a single setRef node, 6592e5b6d6dSopenharmony_ci// or as large as the parse tree 6602e5b6d6dSopenharmony_ci// for an entire expression (this will be the one item left on the stack 6612e5b6d6dSopenharmony_ci// when the parsing of an RBBI rule completes. 6622e5b6d6dSopenharmony_ci// 6632e5b6d6dSopenharmony_ci// This function is called when a binary operator is encountered. 6642e5b6d6dSopenharmony_ci// It looks back up the stack for operators that are not yet associated 6652e5b6d6dSopenharmony_ci// with a right operand, and if the precedence of the stacked operator >= 6662e5b6d6dSopenharmony_ci// the precedence of the current operator, binds the operand left, 6672e5b6d6dSopenharmony_ci// to the previously encountered operator. 
6682e5b6d6dSopenharmony_ci// 6692e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 6702e5b6d6dSopenharmony_civoid RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 6712e5b6d6dSopenharmony_ci RBBINode *n; 6722e5b6d6dSopenharmony_ci // printNodeStack("entering fixOpStack()"); 6732e5b6d6dSopenharmony_ci for (;;) { 6742e5b6d6dSopenharmony_ci n = fNodeStack[fNodeStackPtr-1]; // an operator node 6752e5b6d6dSopenharmony_ci if (n->fPrecedence == 0) { 6762e5b6d6dSopenharmony_ci RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 6772e5b6d6dSopenharmony_ci error(U_BRK_INTERNAL_ERROR); 6782e5b6d6dSopenharmony_ci return; 6792e5b6d6dSopenharmony_ci } 6802e5b6d6dSopenharmony_ci 6812e5b6d6dSopenharmony_ci if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 6822e5b6d6dSopenharmony_ci // The most recent operand goes with the current operator, 6832e5b6d6dSopenharmony_ci // not with the previously stacked one. 6842e5b6d6dSopenharmony_ci break; 6852e5b6d6dSopenharmony_ci } 6862e5b6d6dSopenharmony_ci // Stack operator is a binary op ( '|' or concatenation) 6872e5b6d6dSopenharmony_ci // TOS operand becomes right child of this operator. 6882e5b6d6dSopenharmony_ci // Resulting subexpression becomes the TOS operand. 6892e5b6d6dSopenharmony_ci n->fRightChild = fNodeStack[fNodeStackPtr]; 6902e5b6d6dSopenharmony_ci fNodeStack[fNodeStackPtr]->fParent = n; 6912e5b6d6dSopenharmony_ci fNodeStackPtr--; 6922e5b6d6dSopenharmony_ci // printNodeStack("looping in fixOpStack() "); 6932e5b6d6dSopenharmony_ci } 6942e5b6d6dSopenharmony_ci 6952e5b6d6dSopenharmony_ci if (p <= RBBINode::precLParen) { 6962e5b6d6dSopenharmony_ci // Scan is at a right paren or end of expression. 6972e5b6d6dSopenharmony_ci // The scanned item must match the stack, or else there was an error. 
6982e5b6d6dSopenharmony_ci // Discard the left paren (or start expr) node from the stack, 6992e5b6d6dSopenharmony_ci // leaving the completed (sub)expression as TOS. 7002e5b6d6dSopenharmony_ci if (n->fPrecedence != p) { 7012e5b6d6dSopenharmony_ci // Right paren encountered matched start of expression node, or 7022e5b6d6dSopenharmony_ci // end of expression matched with a left paren node. 7032e5b6d6dSopenharmony_ci error(U_BRK_MISMATCHED_PAREN); 7042e5b6d6dSopenharmony_ci } 7052e5b6d6dSopenharmony_ci fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 7062e5b6d6dSopenharmony_ci fNodeStackPtr--; 7072e5b6d6dSopenharmony_ci // Delete the now-discarded LParen or Start node. 7082e5b6d6dSopenharmony_ci delete n; 7092e5b6d6dSopenharmony_ci } 7102e5b6d6dSopenharmony_ci // printNodeStack("leaving fixOpStack()"); 7112e5b6d6dSopenharmony_ci} 7122e5b6d6dSopenharmony_ci 7132e5b6d6dSopenharmony_ci 7142e5b6d6dSopenharmony_ci 7152e5b6d6dSopenharmony_ci 7162e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 7172e5b6d6dSopenharmony_ci// 7182e5b6d6dSopenharmony_ci// findSetFor given a UnicodeString, 7192e5b6d6dSopenharmony_ci// - find the corresponding Unicode Set (uset node) 7202e5b6d6dSopenharmony_ci// (create one if necessary) 7212e5b6d6dSopenharmony_ci// - Set fLeftChild of the caller's node (should be a setRef node) 7222e5b6d6dSopenharmony_ci// to the uset node 7232e5b6d6dSopenharmony_ci// Maintain a hash table of uset nodes, so the same one is always used 7242e5b6d6dSopenharmony_ci// for the same string. 7252e5b6d6dSopenharmony_ci// If a "to adopt" set is provided and we haven't seen this key before, 7262e5b6d6dSopenharmony_ci// add the provided set to the hash table. 7272e5b6d6dSopenharmony_ci// If the string is one (32 bit) char in length, the set contains 7282e5b6d6dSopenharmony_ci// just one element which is the char in question. 7292e5b6d6dSopenharmony_ci// If the string is "any", return a set containing all chars. 
7302e5b6d6dSopenharmony_ci// 7312e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 7322e5b6d6dSopenharmony_civoid RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 7332e5b6d6dSopenharmony_ci 7342e5b6d6dSopenharmony_ci RBBISetTableEl *el; 7352e5b6d6dSopenharmony_ci 7362e5b6d6dSopenharmony_ci // First check whether we've already cached a set for this string. 7372e5b6d6dSopenharmony_ci // If so, just use the cached set in the new node. 7382e5b6d6dSopenharmony_ci // delete any set provided by the caller, since we own it. 7392e5b6d6dSopenharmony_ci el = (RBBISetTableEl *)uhash_get(fSetTable, &s); 7402e5b6d6dSopenharmony_ci if (el != NULL) { 7412e5b6d6dSopenharmony_ci delete setToAdopt; 7422e5b6d6dSopenharmony_ci node->fLeftChild = el->val; 7432e5b6d6dSopenharmony_ci U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 7442e5b6d6dSopenharmony_ci return; 7452e5b6d6dSopenharmony_ci } 7462e5b6d6dSopenharmony_ci 7472e5b6d6dSopenharmony_ci // Haven't seen this set before. 7482e5b6d6dSopenharmony_ci // If the caller didn't provide us with a prebuilt set, 7492e5b6d6dSopenharmony_ci // create a new UnicodeSet now. 7502e5b6d6dSopenharmony_ci if (setToAdopt == NULL) { 7512e5b6d6dSopenharmony_ci if (s.compare(kAny, -1) == 0) { 7522e5b6d6dSopenharmony_ci setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 7532e5b6d6dSopenharmony_ci } else { 7542e5b6d6dSopenharmony_ci UChar32 c; 7552e5b6d6dSopenharmony_ci c = s.char32At(0); 7562e5b6d6dSopenharmony_ci setToAdopt = new UnicodeSet(c, c); 7572e5b6d6dSopenharmony_ci } 7582e5b6d6dSopenharmony_ci } 7592e5b6d6dSopenharmony_ci 7602e5b6d6dSopenharmony_ci // 7612e5b6d6dSopenharmony_ci // Make a new uset node to refer to this UnicodeSet 7622e5b6d6dSopenharmony_ci // This new uset node becomes the child of the caller's setReference node. 
7632e5b6d6dSopenharmony_ci // 7642e5b6d6dSopenharmony_ci RBBINode *usetNode = new RBBINode(RBBINode::uset); 7652e5b6d6dSopenharmony_ci if (usetNode == NULL) { 7662e5b6d6dSopenharmony_ci error(U_MEMORY_ALLOCATION_ERROR); 7672e5b6d6dSopenharmony_ci return; 7682e5b6d6dSopenharmony_ci } 7692e5b6d6dSopenharmony_ci usetNode->fInputSet = setToAdopt; 7702e5b6d6dSopenharmony_ci usetNode->fParent = node; 7712e5b6d6dSopenharmony_ci node->fLeftChild = usetNode; 7722e5b6d6dSopenharmony_ci usetNode->fText = s; 7732e5b6d6dSopenharmony_ci 7742e5b6d6dSopenharmony_ci 7752e5b6d6dSopenharmony_ci // 7762e5b6d6dSopenharmony_ci // Add the new uset node to the list of all uset nodes. 7772e5b6d6dSopenharmony_ci // 7782e5b6d6dSopenharmony_ci fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 7792e5b6d6dSopenharmony_ci 7802e5b6d6dSopenharmony_ci 7812e5b6d6dSopenharmony_ci // 7822e5b6d6dSopenharmony_ci // Add the new set to the set hash table. 7832e5b6d6dSopenharmony_ci // 7842e5b6d6dSopenharmony_ci el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); 7852e5b6d6dSopenharmony_ci UnicodeString *tkey = new UnicodeString(s); 7862e5b6d6dSopenharmony_ci if (tkey == NULL || el == NULL || setToAdopt == NULL) { 7872e5b6d6dSopenharmony_ci // Delete to avoid memory leak 7882e5b6d6dSopenharmony_ci delete tkey; 7892e5b6d6dSopenharmony_ci tkey = NULL; 7902e5b6d6dSopenharmony_ci uprv_free(el); 7912e5b6d6dSopenharmony_ci el = NULL; 7922e5b6d6dSopenharmony_ci delete setToAdopt; 7932e5b6d6dSopenharmony_ci setToAdopt = NULL; 7942e5b6d6dSopenharmony_ci 7952e5b6d6dSopenharmony_ci error(U_MEMORY_ALLOCATION_ERROR); 7962e5b6d6dSopenharmony_ci return; 7972e5b6d6dSopenharmony_ci } 7982e5b6d6dSopenharmony_ci el->key = tkey; 7992e5b6d6dSopenharmony_ci el->val = usetNode; 8002e5b6d6dSopenharmony_ci uhash_put(fSetTable, el->key, el, fRB->fStatus); 8012e5b6d6dSopenharmony_ci 8022e5b6d6dSopenharmony_ci return; 8032e5b6d6dSopenharmony_ci} 8042e5b6d6dSopenharmony_ci 8052e5b6d6dSopenharmony_ci 
8062e5b6d6dSopenharmony_ci 8072e5b6d6dSopenharmony_ci// 8082e5b6d6dSopenharmony_ci// Assorted Unicode character constants. 8092e5b6d6dSopenharmony_ci// Numeric because there is no portable way to enter them as literals. 8102e5b6d6dSopenharmony_ci// (Think EBCDIC). 8112e5b6d6dSopenharmony_ci// 8122e5b6d6dSopenharmony_cistatic const UChar chCR = 0x0d; // New lines, for terminating comments. 8132e5b6d6dSopenharmony_cistatic const UChar chLF = 0x0a; 8142e5b6d6dSopenharmony_cistatic const UChar chNEL = 0x85; // NEL newline variant 8152e5b6d6dSopenharmony_cistatic const UChar chLS = 0x2028; // Unicode Line Separator 8162e5b6d6dSopenharmony_cistatic const UChar chApos = 0x27; // single quote, for quoted chars. 8172e5b6d6dSopenharmony_cistatic const UChar chPound = 0x23; // '#', introduces a comment. 8182e5b6d6dSopenharmony_cistatic const UChar chBackSlash = 0x5c; // '\' introduces a char escape 8192e5b6d6dSopenharmony_cistatic const UChar chLParen = 0x28; 8202e5b6d6dSopenharmony_cistatic const UChar chRParen = 0x29; 8212e5b6d6dSopenharmony_ci 8222e5b6d6dSopenharmony_ci 8232e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 8242e5b6d6dSopenharmony_ci// 8252e5b6d6dSopenharmony_ci// stripRules Return a rules string without extra spaces. 8262e5b6d6dSopenharmony_ci// (Comments are removed separately, during rule parsing.) 
8272e5b6d6dSopenharmony_ci// 8282e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 8292e5b6d6dSopenharmony_ciUnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 8302e5b6d6dSopenharmony_ci UnicodeString strippedRules; 8312e5b6d6dSopenharmony_ci int32_t rulesLength = rules.length(); 8322e5b6d6dSopenharmony_ci 8332e5b6d6dSopenharmony_ci for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) { 8342e5b6d6dSopenharmony_ci UChar32 cp = rules.char32At(idx); 8352e5b6d6dSopenharmony_ci bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE); 8362e5b6d6dSopenharmony_ci if (whiteSpace) { 8372e5b6d6dSopenharmony_ci continue; 8382e5b6d6dSopenharmony_ci } 8392e5b6d6dSopenharmony_ci strippedRules.append(cp); 8402e5b6d6dSopenharmony_ci } 8412e5b6d6dSopenharmony_ci return strippedRules; 8422e5b6d6dSopenharmony_ci} 8432e5b6d6dSopenharmony_ci 8442e5b6d6dSopenharmony_ci 8452e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 8462e5b6d6dSopenharmony_ci// 8472e5b6d6dSopenharmony_ci// nextCharLL Low Level Next Char from rule input source. 8482e5b6d6dSopenharmony_ci// Get a char from the input character iterator, 8492e5b6d6dSopenharmony_ci// keep track of input position for error reporting. 
8502e5b6d6dSopenharmony_ci// 8512e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 8522e5b6d6dSopenharmony_ciUChar32 RBBIRuleScanner::nextCharLL() { 8532e5b6d6dSopenharmony_ci UChar32 ch; 8542e5b6d6dSopenharmony_ci 8552e5b6d6dSopenharmony_ci if (fNextIndex >= fRB->fRules.length()) { 8562e5b6d6dSopenharmony_ci return (UChar32)-1; 8572e5b6d6dSopenharmony_ci } 8582e5b6d6dSopenharmony_ci ch = fRB->fRules.char32At(fNextIndex); 8592e5b6d6dSopenharmony_ci if (U_IS_SURROGATE(ch)) { 8602e5b6d6dSopenharmony_ci error(U_ILLEGAL_CHAR_FOUND); 8612e5b6d6dSopenharmony_ci return U_SENTINEL; 8622e5b6d6dSopenharmony_ci } 8632e5b6d6dSopenharmony_ci fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 8642e5b6d6dSopenharmony_ci 8652e5b6d6dSopenharmony_ci if (ch == chCR || 8662e5b6d6dSopenharmony_ci ch == chNEL || 8672e5b6d6dSopenharmony_ci ch == chLS || 8682e5b6d6dSopenharmony_ci (ch == chLF && fLastChar != chCR)) { 8692e5b6d6dSopenharmony_ci // Character is starting a new line. Bump up the line number, and 8702e5b6d6dSopenharmony_ci // reset the column to 0. 8712e5b6d6dSopenharmony_ci fLineNum++; 8722e5b6d6dSopenharmony_ci fCharNum=0; 8732e5b6d6dSopenharmony_ci if (fQuoteMode) { 8742e5b6d6dSopenharmony_ci error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 8752e5b6d6dSopenharmony_ci fQuoteMode = false; 8762e5b6d6dSopenharmony_ci } 8772e5b6d6dSopenharmony_ci } 8782e5b6d6dSopenharmony_ci else { 8792e5b6d6dSopenharmony_ci // Character is not starting a new line. Except in the case of a 8802e5b6d6dSopenharmony_ci // LF following a CR, increment the column position. 
8812e5b6d6dSopenharmony_ci if (ch != chLF) { 8822e5b6d6dSopenharmony_ci fCharNum++; 8832e5b6d6dSopenharmony_ci } 8842e5b6d6dSopenharmony_ci } 8852e5b6d6dSopenharmony_ci fLastChar = ch; 8862e5b6d6dSopenharmony_ci return ch; 8872e5b6d6dSopenharmony_ci} 8882e5b6d6dSopenharmony_ci 8892e5b6d6dSopenharmony_ci 8902e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 8912e5b6d6dSopenharmony_ci// 8922e5b6d6dSopenharmony_ci// nextChar for rules scanning. At this level, we handle stripping 8932e5b6d6dSopenharmony_ci// out comments and processing backslash character escapes. 8942e5b6d6dSopenharmony_ci// The rest of the rules grammar is handled at the next level up. 8952e5b6d6dSopenharmony_ci// 8962e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 8972e5b6d6dSopenharmony_civoid RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 8982e5b6d6dSopenharmony_ci 8992e5b6d6dSopenharmony_ci // Unicode Character constants needed for the processing done by nextChar(), 9002e5b6d6dSopenharmony_ci // in hex because literals wont work on EBCDIC machines. 9012e5b6d6dSopenharmony_ci 9022e5b6d6dSopenharmony_ci fScanIndex = fNextIndex; 9032e5b6d6dSopenharmony_ci c.fChar = nextCharLL(); 9042e5b6d6dSopenharmony_ci c.fEscaped = false; 9052e5b6d6dSopenharmony_ci 9062e5b6d6dSopenharmony_ci // 9072e5b6d6dSopenharmony_ci // check for '' sequence. 9082e5b6d6dSopenharmony_ci // These are recognized in all contexts, whether in quoted text or not. 9092e5b6d6dSopenharmony_ci // 9102e5b6d6dSopenharmony_ci if (c.fChar == chApos) { 9112e5b6d6dSopenharmony_ci if (fRB->fRules.char32At(fNextIndex) == chApos) { 9122e5b6d6dSopenharmony_ci c.fChar = nextCharLL(); // get nextChar officially so character counts 9132e5b6d6dSopenharmony_ci c.fEscaped = true; // stay correct. 9142e5b6d6dSopenharmony_ci } 9152e5b6d6dSopenharmony_ci else 9162e5b6d6dSopenharmony_ci { 9172e5b6d6dSopenharmony_ci // Single quote, by itself. 
9182e5b6d6dSopenharmony_ci // Toggle quoting mode. 9192e5b6d6dSopenharmony_ci // Return either '(' or ')', because quotes cause a grouping of the quoted text. 9202e5b6d6dSopenharmony_ci fQuoteMode = !fQuoteMode; 9212e5b6d6dSopenharmony_ci if (fQuoteMode == true) { 9222e5b6d6dSopenharmony_ci c.fChar = chLParen; 9232e5b6d6dSopenharmony_ci } else { 9242e5b6d6dSopenharmony_ci c.fChar = chRParen; 9252e5b6d6dSopenharmony_ci } 9262e5b6d6dSopenharmony_ci c.fEscaped = false; // The paren that we return is not escaped. 9272e5b6d6dSopenharmony_ci return; 9282e5b6d6dSopenharmony_ci } 9292e5b6d6dSopenharmony_ci } 9302e5b6d6dSopenharmony_ci 9312e5b6d6dSopenharmony_ci if (fQuoteMode) { 9322e5b6d6dSopenharmony_ci c.fEscaped = true; 9332e5b6d6dSopenharmony_ci } 9342e5b6d6dSopenharmony_ci else 9352e5b6d6dSopenharmony_ci { 9362e5b6d6dSopenharmony_ci // We are not in a 'quoted region' of the source. 9372e5b6d6dSopenharmony_ci // 9382e5b6d6dSopenharmony_ci if (c.fChar == chPound) { 9392e5b6d6dSopenharmony_ci // Start of a comment. Consume the rest of it. 9402e5b6d6dSopenharmony_ci // The new-line char that terminates the comment is always returned. 9412e5b6d6dSopenharmony_ci // It will be treated as white-space, and serves to break up anything 9422e5b6d6dSopenharmony_ci // that might otherwise incorrectly clump together with a comment in 9432e5b6d6dSopenharmony_ci // the middle (a variable name, for example.) 
9442e5b6d6dSopenharmony_ci int32_t commentStart = fScanIndex; 9452e5b6d6dSopenharmony_ci for (;;) { 9462e5b6d6dSopenharmony_ci c.fChar = nextCharLL(); 9472e5b6d6dSopenharmony_ci if (c.fChar == (UChar32)-1 || // EOF 9482e5b6d6dSopenharmony_ci c.fChar == chCR || 9492e5b6d6dSopenharmony_ci c.fChar == chLF || 9502e5b6d6dSopenharmony_ci c.fChar == chNEL || 9512e5b6d6dSopenharmony_ci c.fChar == chLS) {break;} 9522e5b6d6dSopenharmony_ci } 9532e5b6d6dSopenharmony_ci for (int32_t i=commentStart; i<fNextIndex-1; ++i) { 9542e5b6d6dSopenharmony_ci fRB->fStrippedRules.setCharAt(i, u' '); 9552e5b6d6dSopenharmony_ci } 9562e5b6d6dSopenharmony_ci } 9572e5b6d6dSopenharmony_ci if (c.fChar == (UChar32)-1) { 9582e5b6d6dSopenharmony_ci return; 9592e5b6d6dSopenharmony_ci } 9602e5b6d6dSopenharmony_ci 9612e5b6d6dSopenharmony_ci // 9622e5b6d6dSopenharmony_ci // check for backslash escaped characters. 9632e5b6d6dSopenharmony_ci // Use UnicodeString::unescapeAt() to handle them. 9642e5b6d6dSopenharmony_ci // 9652e5b6d6dSopenharmony_ci if (c.fChar == chBackSlash) { 9662e5b6d6dSopenharmony_ci c.fEscaped = true; 9672e5b6d6dSopenharmony_ci int32_t startX = fNextIndex; 9682e5b6d6dSopenharmony_ci c.fChar = fRB->fRules.unescapeAt(fNextIndex); 9692e5b6d6dSopenharmony_ci if (fNextIndex == startX) { 9702e5b6d6dSopenharmony_ci error(U_BRK_HEX_DIGITS_EXPECTED); 9712e5b6d6dSopenharmony_ci } 9722e5b6d6dSopenharmony_ci fCharNum += fNextIndex-startX; 9732e5b6d6dSopenharmony_ci } 9742e5b6d6dSopenharmony_ci } 9752e5b6d6dSopenharmony_ci // putc(c.fChar, stdout); 9762e5b6d6dSopenharmony_ci} 9772e5b6d6dSopenharmony_ci 9782e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 9792e5b6d6dSopenharmony_ci// 9802e5b6d6dSopenharmony_ci// Parse RBBI rules. The state machine for rules parsing is here. 
9812e5b6d6dSopenharmony_ci// The state tables are hand-written in the file rbbirpt.txt, 9822e5b6d6dSopenharmony_ci// and converted to the form used here by a perl 9832e5b6d6dSopenharmony_ci// script rbbicst.pl 9842e5b6d6dSopenharmony_ci// 9852e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 9862e5b6d6dSopenharmony_civoid RBBIRuleScanner::parse() { 9872e5b6d6dSopenharmony_ci uint16_t state; 9882e5b6d6dSopenharmony_ci const RBBIRuleTableEl *tableEl; 9892e5b6d6dSopenharmony_ci 9902e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 9912e5b6d6dSopenharmony_ci return; 9922e5b6d6dSopenharmony_ci } 9932e5b6d6dSopenharmony_ci 9942e5b6d6dSopenharmony_ci state = 1; 9952e5b6d6dSopenharmony_ci nextChar(fC); 9962e5b6d6dSopenharmony_ci // 9972e5b6d6dSopenharmony_ci // Main loop for the rule parsing state machine. 9982e5b6d6dSopenharmony_ci // Runs once per state transition. 9992e5b6d6dSopenharmony_ci // Each time through optionally performs, depending on the state table, 10002e5b6d6dSopenharmony_ci // - an advance to the the next input char 10012e5b6d6dSopenharmony_ci // - an action to be performed. 10022e5b6d6dSopenharmony_ci // - pushing or popping a state to/from the local state return stack. 10032e5b6d6dSopenharmony_ci // 10042e5b6d6dSopenharmony_ci for (;;) { 10052e5b6d6dSopenharmony_ci // Bail out if anything has gone wrong. 10062e5b6d6dSopenharmony_ci // RBBI rule file parsing stops on the first error encountered. 10072e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 10082e5b6d6dSopenharmony_ci break; 10092e5b6d6dSopenharmony_ci } 10102e5b6d6dSopenharmony_ci 10112e5b6d6dSopenharmony_ci // Quit if state == 0. This is the normal way to exit the state machine. 
10122e5b6d6dSopenharmony_ci // 10132e5b6d6dSopenharmony_ci if (state == 0) { 10142e5b6d6dSopenharmony_ci break; 10152e5b6d6dSopenharmony_ci } 10162e5b6d6dSopenharmony_ci 10172e5b6d6dSopenharmony_ci // Find the state table element that matches the input char from the rule, or the 10182e5b6d6dSopenharmony_ci // class of the input character. Start with the first table row for this 10192e5b6d6dSopenharmony_ci // state, then linearly scan forward until we find a row that matches the 10202e5b6d6dSopenharmony_ci // character. The last row for each state always matches all characters, so 10212e5b6d6dSopenharmony_ci // the search will stop there, if not before. 10222e5b6d6dSopenharmony_ci // 10232e5b6d6dSopenharmony_ci tableEl = &gRuleParseStateTable[state]; 10242e5b6d6dSopenharmony_ci #ifdef RBBI_DEBUG 10252e5b6d6dSopenharmony_ci if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 10262e5b6d6dSopenharmony_ci RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 10272e5b6d6dSopenharmony_ci fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 10282e5b6d6dSopenharmony_ci } 10292e5b6d6dSopenharmony_ci #endif 10302e5b6d6dSopenharmony_ci 10312e5b6d6dSopenharmony_ci for (;;) { 10322e5b6d6dSopenharmony_ci #ifdef RBBI_DEBUG 10332e5b6d6dSopenharmony_ci if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);} 10342e5b6d6dSopenharmony_ci #endif 10352e5b6d6dSopenharmony_ci if (tableEl->fCharClass < 127 && fC.fEscaped == false && tableEl->fCharClass == fC.fChar) { 10362e5b6d6dSopenharmony_ci // Table row specified an individual character, not a set, and 10372e5b6d6dSopenharmony_ci // the input character is not escaped, and 10382e5b6d6dSopenharmony_ci // the input character matched it. 10392e5b6d6dSopenharmony_ci break; 10402e5b6d6dSopenharmony_ci } 10412e5b6d6dSopenharmony_ci if (tableEl->fCharClass == 255) { 10422e5b6d6dSopenharmony_ci // Table row specified default, match anything character class. 
10432e5b6d6dSopenharmony_ci break; 10442e5b6d6dSopenharmony_ci } 10452e5b6d6dSopenharmony_ci if (tableEl->fCharClass == 254 && fC.fEscaped) { 10462e5b6d6dSopenharmony_ci // Table row specified "escaped" and the char was escaped. 10472e5b6d6dSopenharmony_ci break; 10482e5b6d6dSopenharmony_ci } 10492e5b6d6dSopenharmony_ci if (tableEl->fCharClass == 253 && fC.fEscaped && 10502e5b6d6dSopenharmony_ci (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 10512e5b6d6dSopenharmony_ci // Table row specified "escaped P" and the char is either 'p' or 'P'. 10522e5b6d6dSopenharmony_ci break; 10532e5b6d6dSopenharmony_ci } 10542e5b6d6dSopenharmony_ci if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { 10552e5b6d6dSopenharmony_ci // Table row specified eof and we hit eof on the input. 10562e5b6d6dSopenharmony_ci break; 10572e5b6d6dSopenharmony_ci } 10582e5b6d6dSopenharmony_ci 10592e5b6d6dSopenharmony_ci if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 10602e5b6d6dSopenharmony_ci fC.fEscaped == false && // char is not escaped && 10612e5b6d6dSopenharmony_ci fC.fChar != (UChar32)-1) { // char is not EOF 10622e5b6d6dSopenharmony_ci U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets)); 10632e5b6d6dSopenharmony_ci if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 10642e5b6d6dSopenharmony_ci // Table row specified a character class, or set of characters, 10652e5b6d6dSopenharmony_ci // and the current char matches it. 
10662e5b6d6dSopenharmony_ci break; 10672e5b6d6dSopenharmony_ci } 10682e5b6d6dSopenharmony_ci } 10692e5b6d6dSopenharmony_ci 10702e5b6d6dSopenharmony_ci // No match on this row, advance to the next row for this state, 10712e5b6d6dSopenharmony_ci tableEl++; 10722e5b6d6dSopenharmony_ci } 10732e5b6d6dSopenharmony_ci if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 10742e5b6d6dSopenharmony_ci 10752e5b6d6dSopenharmony_ci // 10762e5b6d6dSopenharmony_ci // We've found the row of the state table that matches the current input 10772e5b6d6dSopenharmony_ci // character from the rules string. 10782e5b6d6dSopenharmony_ci // Perform any action specified by this row in the state table. 10792e5b6d6dSopenharmony_ci if (doParseActions((int32_t)tableEl->fAction) == false) { 10802e5b6d6dSopenharmony_ci // Break out of the state machine loop if the 10812e5b6d6dSopenharmony_ci // the action signalled some kind of error, or 10822e5b6d6dSopenharmony_ci // the action was to exit, occurs on normal end-of-rules-input. 10832e5b6d6dSopenharmony_ci break; 10842e5b6d6dSopenharmony_ci } 10852e5b6d6dSopenharmony_ci 10862e5b6d6dSopenharmony_ci if (tableEl->fPushState != 0) { 10872e5b6d6dSopenharmony_ci fStackPtr++; 10882e5b6d6dSopenharmony_ci if (fStackPtr >= kStackSize) { 10892e5b6d6dSopenharmony_ci error(U_BRK_INTERNAL_ERROR); 10902e5b6d6dSopenharmony_ci RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 10912e5b6d6dSopenharmony_ci fStackPtr--; 10922e5b6d6dSopenharmony_ci } 10932e5b6d6dSopenharmony_ci fStack[fStackPtr] = tableEl->fPushState; 10942e5b6d6dSopenharmony_ci } 10952e5b6d6dSopenharmony_ci 10962e5b6d6dSopenharmony_ci if (tableEl->fNextChar) { 10972e5b6d6dSopenharmony_ci nextChar(fC); 10982e5b6d6dSopenharmony_ci } 10992e5b6d6dSopenharmony_ci 11002e5b6d6dSopenharmony_ci // Get the next state from the table entry, or from the 11012e5b6d6dSopenharmony_ci // state stack if the next state was specified as "pop". 
11022e5b6d6dSopenharmony_ci if (tableEl->fNextState != 255) { 11032e5b6d6dSopenharmony_ci state = tableEl->fNextState; 11042e5b6d6dSopenharmony_ci } else { 11052e5b6d6dSopenharmony_ci state = fStack[fStackPtr]; 11062e5b6d6dSopenharmony_ci fStackPtr--; 11072e5b6d6dSopenharmony_ci if (fStackPtr < 0) { 11082e5b6d6dSopenharmony_ci error(U_BRK_INTERNAL_ERROR); 11092e5b6d6dSopenharmony_ci RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 11102e5b6d6dSopenharmony_ci fStackPtr++; 11112e5b6d6dSopenharmony_ci } 11122e5b6d6dSopenharmony_ci } 11132e5b6d6dSopenharmony_ci 11142e5b6d6dSopenharmony_ci } 11152e5b6d6dSopenharmony_ci 11162e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 11172e5b6d6dSopenharmony_ci return; 11182e5b6d6dSopenharmony_ci } 11192e5b6d6dSopenharmony_ci 11202e5b6d6dSopenharmony_ci // If there are no forward rules set an error. 11212e5b6d6dSopenharmony_ci // 11222e5b6d6dSopenharmony_ci if (fRB->fForwardTree == NULL) { 11232e5b6d6dSopenharmony_ci error(U_BRK_RULE_SYNTAX); 11242e5b6d6dSopenharmony_ci return; 11252e5b6d6dSopenharmony_ci } 11262e5b6d6dSopenharmony_ci 11272e5b6d6dSopenharmony_ci // 11282e5b6d6dSopenharmony_ci // Parsing of the input RBBI rules is complete. 11292e5b6d6dSopenharmony_ci // We now have a parse tree for the rule expressions 11302e5b6d6dSopenharmony_ci // and a list of all UnicodeSets that are referenced. 
11312e5b6d6dSopenharmony_ci // 11322e5b6d6dSopenharmony_ci#ifdef RBBI_DEBUG 11332e5b6d6dSopenharmony_ci if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 11342e5b6d6dSopenharmony_ci if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) { 11352e5b6d6dSopenharmony_ci RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 11362e5b6d6dSopenharmony_ci RBBINode::printTree(fRB->fForwardTree, true); 11372e5b6d6dSopenharmony_ci RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 11382e5b6d6dSopenharmony_ci RBBINode::printTree(fRB->fReverseTree, true); 11392e5b6d6dSopenharmony_ci RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 11402e5b6d6dSopenharmony_ci RBBINode::printTree(fRB->fSafeFwdTree, true); 11412e5b6d6dSopenharmony_ci RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 11422e5b6d6dSopenharmony_ci RBBINode::printTree(fRB->fSafeRevTree, true); 11432e5b6d6dSopenharmony_ci } 11442e5b6d6dSopenharmony_ci#endif 11452e5b6d6dSopenharmony_ci} 11462e5b6d6dSopenharmony_ci 11472e5b6d6dSopenharmony_ci 11482e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 11492e5b6d6dSopenharmony_ci// 11502e5b6d6dSopenharmony_ci// printNodeStack for debugging... 11512e5b6d6dSopenharmony_ci// 11522e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 11532e5b6d6dSopenharmony_ci#ifdef RBBI_DEBUG 11542e5b6d6dSopenharmony_civoid RBBIRuleScanner::printNodeStack(const char *title) { 11552e5b6d6dSopenharmony_ci int i; 11562e5b6d6dSopenharmony_ci RBBIDebugPrintf("%s. 
Dumping node stack...\n", title); 11572e5b6d6dSopenharmony_ci for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], true);} 11582e5b6d6dSopenharmony_ci} 11592e5b6d6dSopenharmony_ci#endif 11602e5b6d6dSopenharmony_ci 11612e5b6d6dSopenharmony_ci 11622e5b6d6dSopenharmony_ci 11632e5b6d6dSopenharmony_ci 11642e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 11652e5b6d6dSopenharmony_ci// 11662e5b6d6dSopenharmony_ci// pushNewNode create a new RBBINode of the specified type and push it 11672e5b6d6dSopenharmony_ci// onto the stack of nodes. 11682e5b6d6dSopenharmony_ci// 11692e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 11702e5b6d6dSopenharmony_ciRBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 11712e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 11722e5b6d6dSopenharmony_ci return NULL; 11732e5b6d6dSopenharmony_ci } 11742e5b6d6dSopenharmony_ci if (fNodeStackPtr >= kStackSize - 1) { 11752e5b6d6dSopenharmony_ci error(U_BRK_RULE_SYNTAX); 11762e5b6d6dSopenharmony_ci RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 11772e5b6d6dSopenharmony_ci return NULL; 11782e5b6d6dSopenharmony_ci } 11792e5b6d6dSopenharmony_ci fNodeStackPtr++; 11802e5b6d6dSopenharmony_ci fNodeStack[fNodeStackPtr] = new RBBINode(t); 11812e5b6d6dSopenharmony_ci if (fNodeStack[fNodeStackPtr] == NULL) { 11822e5b6d6dSopenharmony_ci *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 11832e5b6d6dSopenharmony_ci } 11842e5b6d6dSopenharmony_ci return fNodeStack[fNodeStackPtr]; 11852e5b6d6dSopenharmony_ci} 11862e5b6d6dSopenharmony_ci 11872e5b6d6dSopenharmony_ci 11882e5b6d6dSopenharmony_ci 11892e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 11902e5b6d6dSopenharmony_ci// 11912e5b6d6dSopenharmony_ci// scanSet Construct a UnicodeSet from the text at the current scan 11922e5b6d6dSopenharmony_ci// position. 
Advance the scan position to the first character 11932e5b6d6dSopenharmony_ci// after the set. 11942e5b6d6dSopenharmony_ci// 11952e5b6d6dSopenharmony_ci// A new RBBI setref node referring to the set is pushed onto the node 11962e5b6d6dSopenharmony_ci// stack. 11972e5b6d6dSopenharmony_ci// 11982e5b6d6dSopenharmony_ci// The scan position is normally under the control of the state machine 11992e5b6d6dSopenharmony_ci// that controls rule parsing. UnicodeSets, however, are parsed by 12002e5b6d6dSopenharmony_ci// the UnicodeSet constructor, not by the RBBI rule parser. 12012e5b6d6dSopenharmony_ci// 12022e5b6d6dSopenharmony_ci//------------------------------------------------------------------------------ 12032e5b6d6dSopenharmony_civoid RBBIRuleScanner::scanSet() { 12042e5b6d6dSopenharmony_ci UnicodeSet *uset; 12052e5b6d6dSopenharmony_ci ParsePosition pos; 12062e5b6d6dSopenharmony_ci int startPos; 12072e5b6d6dSopenharmony_ci int i; 12082e5b6d6dSopenharmony_ci 12092e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 12102e5b6d6dSopenharmony_ci return; 12112e5b6d6dSopenharmony_ci } 12122e5b6d6dSopenharmony_ci 12132e5b6d6dSopenharmony_ci pos.setIndex(fScanIndex); 12142e5b6d6dSopenharmony_ci startPos = fScanIndex; 12152e5b6d6dSopenharmony_ci UErrorCode localStatus = U_ZERO_ERROR; 12162e5b6d6dSopenharmony_ci uset = new UnicodeSet(); 12172e5b6d6dSopenharmony_ci if (uset == NULL) { 12182e5b6d6dSopenharmony_ci localStatus = U_MEMORY_ALLOCATION_ERROR; 12192e5b6d6dSopenharmony_ci } else { 12202e5b6d6dSopenharmony_ci uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); 12212e5b6d6dSopenharmony_ci } 12222e5b6d6dSopenharmony_ci if (U_FAILURE(localStatus)) { 12232e5b6d6dSopenharmony_ci // TODO: Get more accurate position of the error from UnicodeSet's return info. 12242e5b6d6dSopenharmony_ci // UnicodeSet appears to not be reporting correctly at this time. 
12252e5b6d6dSopenharmony_ci #ifdef RBBI_DEBUG 12262e5b6d6dSopenharmony_ci RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex()); 12272e5b6d6dSopenharmony_ci #endif 12282e5b6d6dSopenharmony_ci error(localStatus); 12292e5b6d6dSopenharmony_ci delete uset; 12302e5b6d6dSopenharmony_ci return; 12312e5b6d6dSopenharmony_ci } 12322e5b6d6dSopenharmony_ci 12332e5b6d6dSopenharmony_ci // Verify that the set contains at least one code point. 12342e5b6d6dSopenharmony_ci // 12352e5b6d6dSopenharmony_ci U_ASSERT(uset!=NULL); 12362e5b6d6dSopenharmony_ci if (uset->isEmpty()) { 12372e5b6d6dSopenharmony_ci // This set is empty. 12382e5b6d6dSopenharmony_ci // Make it an error, because it almost certainly is not what the user wanted. 12392e5b6d6dSopenharmony_ci // Also, avoids having to think about corner cases in the tree manipulation code 12402e5b6d6dSopenharmony_ci // that occurs later on. 12412e5b6d6dSopenharmony_ci error(U_BRK_RULE_EMPTY_SET); 12422e5b6d6dSopenharmony_ci delete uset; 12432e5b6d6dSopenharmony_ci return; 12442e5b6d6dSopenharmony_ci } 12452e5b6d6dSopenharmony_ci 12462e5b6d6dSopenharmony_ci 12472e5b6d6dSopenharmony_ci // Advance the RBBI parse position over the UnicodeSet pattern. 12482e5b6d6dSopenharmony_ci // Don't just set fScanIndex because the line/char positions maintained 12492e5b6d6dSopenharmony_ci // for error reporting would be thrown off. 
12502e5b6d6dSopenharmony_ci i = pos.getIndex(); 12512e5b6d6dSopenharmony_ci for (;;) { 12522e5b6d6dSopenharmony_ci if (fNextIndex >= i) { 12532e5b6d6dSopenharmony_ci break; 12542e5b6d6dSopenharmony_ci } 12552e5b6d6dSopenharmony_ci nextCharLL(); 12562e5b6d6dSopenharmony_ci } 12572e5b6d6dSopenharmony_ci 12582e5b6d6dSopenharmony_ci if (U_SUCCESS(*fRB->fStatus)) { 12592e5b6d6dSopenharmony_ci RBBINode *n; 12602e5b6d6dSopenharmony_ci 12612e5b6d6dSopenharmony_ci n = pushNewNode(RBBINode::setRef); 12622e5b6d6dSopenharmony_ci if (U_FAILURE(*fRB->fStatus)) { 12632e5b6d6dSopenharmony_ci return; 12642e5b6d6dSopenharmony_ci } 12652e5b6d6dSopenharmony_ci n->fFirstPos = startPos; 12662e5b6d6dSopenharmony_ci n->fLastPos = fNextIndex; 12672e5b6d6dSopenharmony_ci fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 12682e5b6d6dSopenharmony_ci // findSetFor() serves several purposes here: 12692e5b6d6dSopenharmony_ci // - Adopts storage for the UnicodeSet, will be responsible for deleting. 12702e5b6d6dSopenharmony_ci // - Maintains collection of all sets in use, needed later for establishing 12712e5b6d6dSopenharmony_ci // character categories for run time engine. 12722e5b6d6dSopenharmony_ci // - Eliminates mulitiple instances of the same set. 12732e5b6d6dSopenharmony_ci // - Creates a new uset node if necessary (if this isn't a duplicate.) 12742e5b6d6dSopenharmony_ci findSetFor(n->fText, n, uset); 12752e5b6d6dSopenharmony_ci } 12762e5b6d6dSopenharmony_ci 12772e5b6d6dSopenharmony_ci} 12782e5b6d6dSopenharmony_ci 12792e5b6d6dSopenharmony_ciint32_t RBBIRuleScanner::numRules() { 12802e5b6d6dSopenharmony_ci return fRuleNum; 12812e5b6d6dSopenharmony_ci} 12822e5b6d6dSopenharmony_ci 12832e5b6d6dSopenharmony_ciU_NAMESPACE_END 12842e5b6d6dSopenharmony_ci 12852e5b6d6dSopenharmony_ci#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1286