1cb93a386Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others.
2cb93a386Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
3cb93a386Sopenharmony_ci//
4cb93a386Sopenharmony_ci//  rbbiscan.h
5cb93a386Sopenharmony_ci//
6cb93a386Sopenharmony_ci//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
7cb93a386Sopenharmony_ci//  All Rights Reserved.
8cb93a386Sopenharmony_ci//
9cb93a386Sopenharmony_ci//  This file contains declarations for class RBBIRuleScanner
10cb93a386Sopenharmony_ci//
11cb93a386Sopenharmony_ci
12cb93a386Sopenharmony_ci
13cb93a386Sopenharmony_ci#ifndef RBBISCAN_H
14cb93a386Sopenharmony_ci#define RBBISCAN_H
15cb93a386Sopenharmony_ci
16cb93a386Sopenharmony_ci#include "unicode/utypes.h"
17cb93a386Sopenharmony_ci#include "unicode/uobject.h"
18cb93a386Sopenharmony_ci#include "unicode/rbbi.h"
19cb93a386Sopenharmony_ci#include "unicode/uniset.h"
20cb93a386Sopenharmony_ci#include "unicode/parseerr.h"
21cb93a386Sopenharmony_ci#include "uhash.h"
22cb93a386Sopenharmony_ci#include "uvector.h"
23cb93a386Sopenharmony_ci#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24cb93a386Sopenharmony_ci                          //    looks up references to $variables within a set.
25cb93a386Sopenharmony_ci#include "rbbinode.h"
26cb93a386Sopenharmony_ci#include "rbbirpt.h"
27cb93a386Sopenharmony_ci
28cb93a386Sopenharmony_ciU_NAMESPACE_BEGIN
29cb93a386Sopenharmony_ci
30cb93a386Sopenharmony_ciclass   RBBIRuleBuilder;     // Forward declaration; used in this header only through pointers.
31cb93a386Sopenharmony_ciclass   RBBISymbolTable;     // Forward declaration; used in this header only through pointers.
32cb93a386Sopenharmony_ci
33cb93a386Sopenharmony_ci
34cb93a386Sopenharmony_ci//--------------------------------------------------------------------------------
35cb93a386Sopenharmony_ci//
36cb93a386Sopenharmony_ci//  class RBBIRuleScanner does the lowest level, character-at-a-time
37cb93a386Sopenharmony_ci//                        scanning of break iterator rules.
38cb93a386Sopenharmony_ci//
39cb93a386Sopenharmony_ci//                        The output of the scanner is parse trees for
40cb93a386Sopenharmony_ci//                        the rule expressions and a list of all Unicode Sets
41cb93a386Sopenharmony_ci//                        encountered.
42cb93a386Sopenharmony_ci//
43cb93a386Sopenharmony_ci//--------------------------------------------------------------------------------
44cb93a386Sopenharmony_ci
45cb93a386Sopenharmony_ciclass RBBIRuleScanner : public UMemory {
46cb93a386Sopenharmony_cipublic:
47cb93a386Sopenharmony_ci
48cb93a386Sopenharmony_ci    enum {
49cb93a386Sopenharmony_ci        kStackSize = 100            // The size of the state stack (fStack) and
50cb93a386Sopenharmony_ci    };                              //   node stack (fNodeStack) for rules parsing.
51cb93a386Sopenharmony_ci                                    //   Corresponds roughly to the depth of
52cb93a386Sopenharmony_ci                                    //   parentheses nesting allowed in the rules.
53cb93a386Sopenharmony_ci
54cb93a386Sopenharmony_ci    struct RBBIRuleChar {                           // One character as delivered by nextChar().
55cb93a386Sopenharmony_ci        UChar32             fChar;                  // The code point.
56cb93a386Sopenharmony_ci        UBool               fEscaped;               // NOTE(review): presumably set when the char was escaped in the rule text; confirm.
57cb93a386Sopenharmony_ci        RBBIRuleChar() : fChar(0), fEscaped(false) {}   // Starts as code point 0, not escaped.
58cb93a386Sopenharmony_ci    };
59cb93a386Sopenharmony_ci
60cb93a386Sopenharmony_ci    RBBIRuleScanner(RBBIRuleBuilder  *rb);          // rb: the rule builder this scanner works for
61cb93a386Sopenharmony_ci                                                    //   (presumably kept in fRB; confirm in the implementation).
62cb93a386Sopenharmony_ci
63cb93a386Sopenharmony_ci    virtual    ~RBBIRuleScanner();
64cb93a386Sopenharmony_ci
65cb93a386Sopenharmony_ci    void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.
66cb93a386Sopenharmony_ci                                                    //   The result is written to c; nothing is returned.
67cb93a386Sopenharmony_ci                                                    //   NOTE(review): how end of input is reported is not visible here.
68cb93a386Sopenharmony_ci    UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
69cb93a386Sopenharmony_ci                                                    //   Only a single character may be pushed.
70cb93a386Sopenharmony_ci
71cb93a386Sopenharmony_ci    void        parse();                            // Parse the rules, generating two parse
72cb93a386Sopenharmony_ci                                                    //   trees, one each for the forward and
73cb93a386Sopenharmony_ci                                                    //   reverse rules,
74cb93a386Sopenharmony_ci                                                    //   and a list of UnicodeSets encountered.
75cb93a386Sopenharmony_ci
76cb93a386Sopenharmony_ci    int32_t     numRules();                         // Return the number of rules that have been seen.
77cb93a386Sopenharmony_ci
78cb93a386Sopenharmony_ci    /**
79cb93a386Sopenharmony_ci     * Return a rules string without unnecessary characters.
80cb93a386Sopenharmony_ci     * Static: does not use the state of any scanner instance.
81cb93a386Sopenharmony_ci     */
82cb93a386Sopenharmony_ci    static UnicodeString stripRules(const UnicodeString &rules);
83cb93a386Sopenharmony_ciprivate:
84cb93a386Sopenharmony_ci
85cb93a386Sopenharmony_ci    UBool       doParseActions(int32_t a);             // NOTE(review): a is presumably a parse action code from rbbirpt.h; confirm.
86cb93a386Sopenharmony_ci    void        error(UErrorCode e);                   // error reporting convenience function.
87cb93a386Sopenharmony_ci    void        fixOpStack(RBBINode::OpPrecedence p);
88cb93a386Sopenharmony_ci                                                       //   NOTE(review): presumably resolves pending operators on the node stack for precedence p; confirm.
89cb93a386Sopenharmony_ci    void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
90cb93a386Sopenharmony_ci                                                       //   setToAdopt is optional (defaults to NULL). NOTE(review): name suggests ownership is taken; confirm.
91cb93a386Sopenharmony_ci    UChar32     nextCharLL();                          // Returns a bare UChar32 rather than an RBBIRuleChar.
92cb93a386Sopenharmony_ci#ifdef RBBI_DEBUG
93cb93a386Sopenharmony_ci    void        printNodeStack(const char *title);     // Declared only when RBBI_DEBUG is defined.
94cb93a386Sopenharmony_ci#endif
95cb93a386Sopenharmony_ci    RBBINode    *pushNewNode(RBBINode::NodeType  t);   // NOTE(review): presumably creates a node of type t and pushes it on fNodeStack; confirm.
96cb93a386Sopenharmony_ci    void        scanSet();
97cb93a386Sopenharmony_ci
98cb93a386Sopenharmony_ci
99cb93a386Sopenharmony_ci    RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
100cb93a386Sopenharmony_ci
101cb93a386Sopenharmony_ci    int32_t                       fScanIndex;        // Index of current character being processed
102cb93a386Sopenharmony_ci                                                     //   in the rule input string.
103cb93a386Sopenharmony_ci    int32_t                       fNextIndex;        // Index of the next character, which
104cb93a386Sopenharmony_ci                                                     //   is the first character not yet scanned.
105cb93a386Sopenharmony_ci    UBool                         fQuoteMode;        // Scan is in a 'quoted region'
106cb93a386Sopenharmony_ci    int32_t                       fLineNum;          // Line number in input file.
107cb93a386Sopenharmony_ci    int32_t                       fCharNum;          // Char position within the line.
108cb93a386Sopenharmony_ci    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
109cb93a386Sopenharmony_ci                                                     //   as a single line, not two.
110cb93a386Sopenharmony_ci
111cb93a386Sopenharmony_ci    RBBIRuleChar                  fC;                // Current char for parse state machine
112cb93a386Sopenharmony_ci                                                     //   processing.
113cb93a386Sopenharmony_ci    UnicodeString                 fVarName;          // $variableName, valid when we've just
114cb93a386Sopenharmony_ci                                                     //   scanned one.
115cb93a386Sopenharmony_ci
116cb93a386Sopenharmony_ci    RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
117cb93a386Sopenharmony_ci                                                     //   parsing.  index by p[state][char-class]
118cb93a386Sopenharmony_ci
119cb93a386Sopenharmony_ci    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
120cb93a386Sopenharmony_ci    int32_t                       fStackPtr;           //  and pops as specified in the state
121cb93a386Sopenharmony_ci                                                       //  transition rules.
122cb93a386Sopenharmony_ci
123cb93a386Sopenharmony_ci    RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
124cb93a386Sopenharmony_ci                                                           //  during the parse of a rule
125cb93a386Sopenharmony_ci    int32_t                        fNodeStackPtr;          // NOTE(review): presumably the index of the top of fNodeStack; confirm.
126cb93a386Sopenharmony_ci
127cb93a386Sopenharmony_ci
128cb93a386Sopenharmony_ci    UBool                          fReverseRule;     // True if the rule currently being scanned
129cb93a386Sopenharmony_ci                                                     //  is a reverse direction rule (if it
130cb93a386Sopenharmony_ci                                                     //  starts with a '!')
131cb93a386Sopenharmony_ci
132cb93a386Sopenharmony_ci    UBool                          fLookAheadRule;   // True if the rule includes a '/'
133cb93a386Sopenharmony_ci                                                     //   somewhere within it.
134cb93a386Sopenharmony_ci
135cb93a386Sopenharmony_ci    UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.
136cb93a386Sopenharmony_ci
137cb93a386Sopenharmony_ci    RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
138cb93a386Sopenharmony_ci                                                     //   $variable symbols.
139cb93a386Sopenharmony_ci
140cb93a386Sopenharmony_ci    UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
141cb93a386Sopenharmony_ci                                                     //   the sets created while parsing rules.
142cb93a386Sopenharmony_ci                                                     //   The key is the string used for creating
143cb93a386Sopenharmony_ci                                                     //   the set.
144cb93a386Sopenharmony_ci
145cb93a386Sopenharmony_ci    UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
146cb93a386Sopenharmony_ci                                                     //  the scanning of RBBI rules.  The
147cb93a386Sopenharmony_ci                                                     //  indices for these are assigned by the
148cb93a386Sopenharmony_ci                                                     //  perl script that builds the state tables.
149cb93a386Sopenharmony_ci                                                     //  See rbbirpt.h.
150cb93a386Sopenharmony_ci                                                     //  NOTE(review): the size 10 is hard-coded here; presumably it must cover those indices.
151cb93a386Sopenharmony_ci    int32_t                        fRuleNum;         // Counts each rule as it is scanned.
152cb93a386Sopenharmony_ci
153cb93a386Sopenharmony_ci    int32_t                        fOptionStart;     // Input index of start of a !!option
154cb93a386Sopenharmony_ci                                                     //   keyword, while being scanned.
155cb93a386Sopenharmony_ci
156cb93a386Sopenharmony_ci    UnicodeSet *gRuleSet_rule_char;                  // Non-static, per-instance pointers despite the g prefix.
157cb93a386Sopenharmony_ci    UnicodeSet *gRuleSet_white_space;                //   NOTE(review): presumably point at entries of fRuleSets;
158cb93a386Sopenharmony_ci    UnicodeSet *gRuleSet_name_char;                  //   confirm in the constructor.
159cb93a386Sopenharmony_ci    UnicodeSet *gRuleSet_name_start_char;
160cb93a386Sopenharmony_ci
161cb93a386Sopenharmony_ci    RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
162cb93a386Sopenharmony_ci    RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
163cb93a386Sopenharmony_ci};
164cb93a386Sopenharmony_ci
165cb93a386Sopenharmony_ciU_NAMESPACE_END
166cb93a386Sopenharmony_ci
167cb93a386Sopenharmony_ci#endif
168