1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4//  file:  rbbirb.cpp
5//
6//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7//  All Rights Reserved.
8//
9//  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10//    building (compiling) break rules into the tables required by the runtime
11//    RBBI engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/brkiter.h"
19#include "unicode/rbbi.h"
20#include "unicode/ubrk.h"
21#include "unicode/unistr.h"
22#include "unicode/uniset.h"
23#include "unicode/uchar.h"
24#include "unicode/uchriter.h"
25#include "unicode/ustring.h"
26#include "unicode/parsepos.h"
27#include "unicode/parseerr.h"
28
29#include "cmemory.h"
30#include "cstring.h"
31#include "rbbirb.h"
32#include "rbbinode.h"
33#include "rbbiscan.h"
34#include "rbbisetb.h"
35#include "rbbitblb.h"
36#include "rbbidata.h"
37#include "uassert.h"
38
39
40U_NAMESPACE_BEGIN
41
42
43//----------------------------------------------------------------------------------------
44//
45//  Constructor.
46//
47//----------------------------------------------------------------------------------------
48RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
49                                       UParseError     *parseErr,
50                                       UErrorCode      &status)
51 : fRules(rules), fStrippedRules(rules)
52{
53    fStatus = &status; // status is checked below
54    fParseError = parseErr;
55    fDebugEnv   = nullptr;
56#ifdef RBBI_DEBUG
57    fDebugEnv   = getenv("U_RBBIDEBUG");
58#endif
59
60
61    fForwardTree        = nullptr;
62    fReverseTree        = nullptr;
63    fSafeFwdTree        = nullptr;
64    fSafeRevTree        = nullptr;
65    fDefaultTree        = &fForwardTree;
66    fForwardTable       = nullptr;
67    fRuleStatusVals     = nullptr;
68    fChainRules         = false;
69    fLookAheadHardBreak = false;
70    fUSetNodes          = nullptr;
71    fRuleStatusVals     = nullptr;
72    fScanner            = nullptr;
73    fSetBuilder         = nullptr;
74    if (parseErr) {
75        uprv_memset(parseErr, 0, sizeof(UParseError));
76    }
77
78    if (U_FAILURE(status)) {
79        return;
80    }
81
82    fUSetNodes          = new UVector(status); // bcos status gets overwritten here
83    fRuleStatusVals     = new UVector(status);
84    fScanner            = new RBBIRuleScanner(this);
85    fSetBuilder         = new RBBISetBuilder(this);
86    if (U_FAILURE(status)) {
87        return;
88    }
89    if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
90        status = U_MEMORY_ALLOCATION_ERROR;
91    }
92}
93
94
95
96//----------------------------------------------------------------------------------------
97//
98//  Destructor
99//
100//----------------------------------------------------------------------------------------
101RBBIRuleBuilder::~RBBIRuleBuilder() {
102
103    int        i;
104    for (i=0; ; i++) {
105        RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
106        if (n==nullptr) {
107            break;
108        }
109        delete n;
110    }
111
112    delete fUSetNodes;
113    delete fSetBuilder;
114    delete fForwardTable;
115    delete fForwardTree;
116    delete fReverseTree;
117    delete fSafeFwdTree;
118    delete fSafeRevTree;
119    delete fScanner;
120    delete fRuleStatusVals;
121}
122
123
124
125
126
127//----------------------------------------------------------------------------------------
128//
129//   flattenData() -  Collect up the compiled RBBI rule data and put it into
130//                    the format for saving in ICU data files,
131//                    which is also the format needed by the RBBI runtime engine.
132//
133//----------------------------------------------------------------------------------------
134static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
135
136RBBIDataHeader *RBBIRuleBuilder::flattenData() {
137    int32_t    i;
138
139    if (U_FAILURE(*fStatus)) {
140        return nullptr;
141    }
142
143    // Remove whitespace from the rules to make it smaller.
144    // The rule parser has already removed comments.
145    fStrippedRules = fScanner->stripRules(fStrippedRules);
146
147    // Calculate the size of each section in the data.
148    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
149    //   Sections sizes actually stored in the header are for the actual data
150    //     without the padding.
151    //
152    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
153    int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
154    int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
155    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
156    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
157
158    int32_t rulesLengthInUTF8 = 0;
159    u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
160                       fStrippedRules.getBuffer(), fStrippedRules.length(),
161                       0xfffd, nullptr, fStatus);
162    *fStatus = U_ZERO_ERROR;
163
164    int32_t rulesSize         = align8((rulesLengthInUTF8+1));
165
166    int32_t         totalSize = headerSize
167                                + forwardTableSize
168                                + reverseTableSize
169                                + statusTableSize + trieSize + rulesSize;
170
171#ifdef RBBI_DEBUG
172    if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
173        RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
174        RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
175        RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
176        RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
177        RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
178        RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
179        RBBIDebugPrintf("-----------------------------\n");
180        RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
181    }
182#endif
183
184    RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
185    if (data == nullptr) {
186        *fStatus = U_MEMORY_ALLOCATION_ERROR;
187        return nullptr;
188    }
189    uprv_memset(data, 0, totalSize);
190
191
192    data->fMagic            = 0xb1a0;
193    data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
194    data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
195    data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
196    data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
197    data->fLength           = totalSize;
198    data->fCatCount         = fSetBuilder->getNumCharCategories();
199
200    data->fFTable        = headerSize;
201    data->fFTableLen     = forwardTableSize;
202
203    data->fRTable        = data->fFTable  + data->fFTableLen;
204    data->fRTableLen     = reverseTableSize;
205
206    data->fTrie          = data->fRTable + data->fRTableLen;
207    data->fTrieLen       = trieSize;
208    data->fStatusTable   = data->fTrie    + data->fTrieLen;
209    data->fStatusTableLen= statusTableSize;
210    data->fRuleSource    = data->fStatusTable + statusTableSize;
211    data->fRuleSourceLen = rulesLengthInUTF8;
212
213    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
214
215    fForwardTable->exportTable((uint8_t *)data + data->fFTable);
216    fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
217    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
218
219    int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
220    for (i=0; i<fRuleStatusVals->size(); i++) {
221        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
222    }
223
224    u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
225                       fStrippedRules.getBuffer(), fStrippedRules.length(),
226                       0xfffd, nullptr, fStatus);
227    if (U_FAILURE(*fStatus)) {
228        return nullptr;
229    }
230
231    return data;
232}
233
234
235//----------------------------------------------------------------------------------------
236//
237//  createRuleBasedBreakIterator    construct from source rules that are passed in
238//                                  in a UnicodeString
239//
240//----------------------------------------------------------------------------------------
241BreakIterator *
242RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
243                                    UParseError      *parseError,
244                                    UErrorCode       &status)
245{
246    //
247    // Read the input rules, generate a parse tree, symbol table,
248    // and list of all Unicode Sets referenced by the rules.
249    //
250    RBBIRuleBuilder  builder(rules, parseError, status);
251    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
252        return nullptr;
253    }
254
255    RBBIDataHeader *data = builder.build(status);
256
257    if (U_FAILURE(status)) {
258        return nullptr;
259    }
260
261    //
262    //  Create a break iterator from the compiled rules.
263    //     (Identical to creation from stored pre-compiled rules)
264    //
265    // status is checked after init in construction.
266    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
267    if (U_FAILURE(status)) {
268        delete This;
269        This = nullptr;
270    }
271    else if(This == nullptr) { // test for nullptr
272        status = U_MEMORY_ALLOCATION_ERROR;
273    }
274    return This;
275}
276
277RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
278    if (U_FAILURE(status)) {
279        return nullptr;
280    }
281
282    fScanner->parse();
283    if (U_FAILURE(status)) {
284        return nullptr;
285    }
286
287    //
288    // UnicodeSet processing.
289    //    Munge the Unicode Sets to create an initial set of character categories.
290    //
291    fSetBuilder->buildRanges();
292
293    //
294    //   Generate the DFA state transition table.
295    //
296    fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
297    if (fForwardTable == nullptr) {
298        status = U_MEMORY_ALLOCATION_ERROR;
299        return nullptr;
300    }
301
302    fForwardTable->buildForwardTable();
303
304    // State table and character category optimization.
305    // Merge equivalent rows and columns.
306    // Note that this process alters the initial set of character categories,
307    // causing the representation of UnicodeSets in the parse tree to become invalid.
308
309    optimizeTables();
310    fForwardTable->buildSafeReverseTable(status);
311
312
313#ifdef RBBI_DEBUG
314    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
315        fForwardTable->printStates();
316        fForwardTable->printRuleStatusTable();
317        fForwardTable->printReverseTable();
318    }
319#endif
320
321    //    Generate the mapping tables (TRIE) from input code points to
322    //    the character categories.
323    //
324    fSetBuilder->buildTrie();
325
326    //
327    //   Package up the compiled data into a memory image
328    //      in the run-time format.
329    //
330    RBBIDataHeader *data = flattenData(); // returns nullptr if error
331    if (U_FAILURE(status)) {
332        return nullptr;
333    }
334    return data;
335}
336
337void RBBIRuleBuilder::optimizeTables() {
338    bool didSomething;
339    do {
340        didSomething = false;
341
342        // Begin looking for duplicates with char class 3.
343        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
344        // and should not have other categories merged into them.
345        IntPair duplPair = {3, 0};
346        while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
347            fSetBuilder->mergeCategories(duplPair);
348            fForwardTable->removeColumn(duplPair.second);
349            didSomething = true;
350        }
351
352        while (fForwardTable->removeDuplicateStates() > 0) {
353            didSomething = true;
354        }
355    } while (didSomething);
356}
357
358U_NAMESPACE_END
359
360#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
361