1cb93a386Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 2cb93a386Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 3cb93a386Sopenharmony_ci// 4cb93a386Sopenharmony_ci// rbbiscan.h 5cb93a386Sopenharmony_ci// 6cb93a386Sopenharmony_ci// Copyright (C) 2002-2016, International Business Machines Corporation and others. 7cb93a386Sopenharmony_ci// All Rights Reserved. 8cb93a386Sopenharmony_ci// 9cb93a386Sopenharmony_ci// This file contains declarations for class RBBIRuleScanner 10cb93a386Sopenharmony_ci// 11cb93a386Sopenharmony_ci 12cb93a386Sopenharmony_ci 13cb93a386Sopenharmony_ci#ifndef RBBISCAN_H 14cb93a386Sopenharmony_ci#define RBBISCAN_H 15cb93a386Sopenharmony_ci 16cb93a386Sopenharmony_ci#include "unicode/utypes.h" 17cb93a386Sopenharmony_ci#include "unicode/uobject.h" 18cb93a386Sopenharmony_ci#include "unicode/rbbi.h" 19cb93a386Sopenharmony_ci#include "unicode/uniset.h" 20cb93a386Sopenharmony_ci#include "unicode/parseerr.h" 21cb93a386Sopenharmony_ci#include "uhash.h" 22cb93a386Sopenharmony_ci#include "uvector.h" 23cb93a386Sopenharmony_ci#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 24cb93a386Sopenharmony_ci // looks up references to $variables within a set. 25cb93a386Sopenharmony_ci#include "rbbinode.h" 26cb93a386Sopenharmony_ci#include "rbbirpt.h" 27cb93a386Sopenharmony_ci 28cb93a386Sopenharmony_ciU_NAMESPACE_BEGIN 29cb93a386Sopenharmony_ci 30cb93a386Sopenharmony_ciclass RBBIRuleBuilder; 31cb93a386Sopenharmony_ciclass RBBISymbolTable; 32cb93a386Sopenharmony_ci 33cb93a386Sopenharmony_ci 34cb93a386Sopenharmony_ci//-------------------------------------------------------------------------------- 35cb93a386Sopenharmony_ci// 36cb93a386Sopenharmony_ci// class RBBIRuleScanner does the lowest level, character-at-a-time 37cb93a386Sopenharmony_ci// scanning of break iterator rules. 38cb93a386Sopenharmony_ci// 39cb93a386Sopenharmony_ci// The output of the scanner is parse trees for 40cb93a386Sopenharmony_ci// the rule expressions and a list of all Unicode Sets 41cb93a386Sopenharmony_ci// encountered. 42cb93a386Sopenharmony_ci// 43cb93a386Sopenharmony_ci//-------------------------------------------------------------------------------- 44cb93a386Sopenharmony_ci 45cb93a386Sopenharmony_ciclass RBBIRuleScanner : public UMemory { 46cb93a386Sopenharmony_cipublic: 47cb93a386Sopenharmony_ci 48cb93a386Sopenharmony_ci enum { 49cb93a386Sopenharmony_ci kStackSize = 100 // The size of the state stack for 50cb93a386Sopenharmony_ci }; // rules parsing. Corresponds roughly 51cb93a386Sopenharmony_ci // to the depth of parentheses nesting 52cb93a386Sopenharmony_ci // that is allowed in the rules. 53cb93a386Sopenharmony_ci 54cb93a386Sopenharmony_ci struct RBBIRuleChar { 55cb93a386Sopenharmony_ci UChar32 fChar; 56cb93a386Sopenharmony_ci UBool fEscaped; 57cb93a386Sopenharmony_ci RBBIRuleChar() : fChar(0), fEscaped(false) {} 58cb93a386Sopenharmony_ci }; 59cb93a386Sopenharmony_ci 60cb93a386Sopenharmony_ci RBBIRuleScanner(RBBIRuleBuilder *rb); 61cb93a386Sopenharmony_ci 62cb93a386Sopenharmony_ci 63cb93a386Sopenharmony_ci virtual ~RBBIRuleScanner(); 64cb93a386Sopenharmony_ci 65cb93a386Sopenharmony_ci void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. 66cb93a386Sopenharmony_ci // Return false if at end. 67cb93a386Sopenharmony_ci 68cb93a386Sopenharmony_ci UBool push(const RBBIRuleChar &c); // Push (unget) one character. 69cb93a386Sopenharmony_ci // Only a single character may be pushed. 70cb93a386Sopenharmony_ci 71cb93a386Sopenharmony_ci void parse(); // Parse the rules, generating two parse 72cb93a386Sopenharmony_ci // trees, one each for the forward and 73cb93a386Sopenharmony_ci // reverse rules, 74cb93a386Sopenharmony_ci // and a list of UnicodeSets encountered. 75cb93a386Sopenharmony_ci 76cb93a386Sopenharmony_ci int32_t numRules(); // Return the number of rules that have been seen. 77cb93a386Sopenharmony_ci 78cb93a386Sopenharmony_ci /** 79cb93a386Sopenharmony_ci * Return a rules string without unnecessary 80cb93a386Sopenharmony_ci * characters. 81cb93a386Sopenharmony_ci */ 82cb93a386Sopenharmony_ci static UnicodeString stripRules(const UnicodeString &rules); 83cb93a386Sopenharmony_ciprivate: 84cb93a386Sopenharmony_ci 85cb93a386Sopenharmony_ci UBool doParseActions(int32_t a); 86cb93a386Sopenharmony_ci void error(UErrorCode e); // error reporting convenience function. 87cb93a386Sopenharmony_ci void fixOpStack(RBBINode::OpPrecedence p); 88cb93a386Sopenharmony_ci // a character. 89cb93a386Sopenharmony_ci void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); 90cb93a386Sopenharmony_ci 91cb93a386Sopenharmony_ci UChar32 nextCharLL(); 92cb93a386Sopenharmony_ci#ifdef RBBI_DEBUG 93cb93a386Sopenharmony_ci void printNodeStack(const char *title); 94cb93a386Sopenharmony_ci#endif 95cb93a386Sopenharmony_ci RBBINode *pushNewNode(RBBINode::NodeType t); 96cb93a386Sopenharmony_ci void scanSet(); 97cb93a386Sopenharmony_ci 98cb93a386Sopenharmony_ci 99cb93a386Sopenharmony_ci RBBIRuleBuilder *fRB; // The rule builder that we are part of. 100cb93a386Sopenharmony_ci 101cb93a386Sopenharmony_ci int32_t fScanIndex; // Index of current character being processed 102cb93a386Sopenharmony_ci // in the rule input string. 103cb93a386Sopenharmony_ci int32_t fNextIndex; // Index of the next character, which 104cb93a386Sopenharmony_ci // is the first character not yet scanned. 105cb93a386Sopenharmony_ci UBool fQuoteMode; // Scan is in a 'quoted region' 106cb93a386Sopenharmony_ci int32_t fLineNum; // Line number in input file. 107cb93a386Sopenharmony_ci int32_t fCharNum; // Char position within the line. 108cb93a386Sopenharmony_ci UChar32 fLastChar; // Previous char, needed to count CR-LF 109cb93a386Sopenharmony_ci // as a single line, not two. 110cb93a386Sopenharmony_ci 111cb93a386Sopenharmony_ci RBBIRuleChar fC; // Current char for parse state machine 112cb93a386Sopenharmony_ci // processing. 113cb93a386Sopenharmony_ci UnicodeString fVarName; // $variableName, valid when we've just 114cb93a386Sopenharmony_ci // scanned one. 115cb93a386Sopenharmony_ci 116cb93a386Sopenharmony_ci RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule 117cb93a386Sopenharmony_ci // parsing. index by p[state][char-class] 118cb93a386Sopenharmony_ci 119cb93a386Sopenharmony_ci uint16_t fStack[kStackSize]; // State stack, holds state pushes 120cb93a386Sopenharmony_ci int32_t fStackPtr; // and pops as specified in the state 121cb93a386Sopenharmony_ci // transition rules. 122cb93a386Sopenharmony_ci 123cb93a386Sopenharmony_ci RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created 124cb93a386Sopenharmony_ci // during the parse of a rule 125cb93a386Sopenharmony_ci int32_t fNodeStackPtr; 126cb93a386Sopenharmony_ci 127cb93a386Sopenharmony_ci 128cb93a386Sopenharmony_ci UBool fReverseRule; // True if the rule currently being scanned 129cb93a386Sopenharmony_ci // is a reverse direction rule (if it 130cb93a386Sopenharmony_ci // starts with a '!') 131cb93a386Sopenharmony_ci 132cb93a386Sopenharmony_ci UBool fLookAheadRule; // True if the rule includes a '/' 133cb93a386Sopenharmony_ci // somewhere within it. 134cb93a386Sopenharmony_ci 135cb93a386Sopenharmony_ci UBool fNoChainInRule; // True if the current rule starts with a '^'. 136cb93a386Sopenharmony_ci 137cb93a386Sopenharmony_ci RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of 138cb93a386Sopenharmony_ci // $variable symbols. 139cb93a386Sopenharmony_ci 140cb93a386Sopenharmony_ci UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to 141cb93a386Sopenharmony_ci // the sets created while parsing rules. 142cb93a386Sopenharmony_ci // The key is the string used for creating 143cb93a386Sopenharmony_ci // the set. 144cb93a386Sopenharmony_ci 145cb93a386Sopenharmony_ci UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during 146cb93a386Sopenharmony_ci // the scanning of RBBI rules. The 147cb93a386Sopenharmony_ci // indicies for these are assigned by the 148cb93a386Sopenharmony_ci // perl script that builds the state tables. 149cb93a386Sopenharmony_ci // See rbbirpt.h. 150cb93a386Sopenharmony_ci 151cb93a386Sopenharmony_ci int32_t fRuleNum; // Counts each rule as it is scanned. 152cb93a386Sopenharmony_ci 153cb93a386Sopenharmony_ci int32_t fOptionStart; // Input index of start of a !!option 154cb93a386Sopenharmony_ci // keyword, while being scanned. 155cb93a386Sopenharmony_ci 156cb93a386Sopenharmony_ci UnicodeSet *gRuleSet_rule_char; 157cb93a386Sopenharmony_ci UnicodeSet *gRuleSet_white_space; 158cb93a386Sopenharmony_ci UnicodeSet *gRuleSet_name_char; 159cb93a386Sopenharmony_ci UnicodeSet *gRuleSet_name_start_char; 160cb93a386Sopenharmony_ci 161cb93a386Sopenharmony_ci RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class 162cb93a386Sopenharmony_ci RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class 163cb93a386Sopenharmony_ci}; 164cb93a386Sopenharmony_ci 165cb93a386Sopenharmony_ciU_NAMESPACE_END 166cb93a386Sopenharmony_ci 167cb93a386Sopenharmony_ci#endif 168