1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3// 4// file: rbbirb.cpp 5// 6// Copyright (C) 2002-2011, International Business Machines Corporation and others. 7// All Rights Reserved. 8// 9// This file contains the RBBIRuleBuilder class implementation. This is the main class for 10// building (compiling) break rules into the tables required by the runtime 11// RBBI engine. 12// 13 14#include "unicode/utypes.h" 15 16#if !UCONFIG_NO_BREAK_ITERATION 17 18#include "unicode/brkiter.h" 19#include "unicode/rbbi.h" 20#include "unicode/ubrk.h" 21#include "unicode/unistr.h" 22#include "unicode/uniset.h" 23#include "unicode/uchar.h" 24#include "unicode/uchriter.h" 25#include "unicode/ustring.h" 26#include "unicode/parsepos.h" 27#include "unicode/parseerr.h" 28 29#include "cmemory.h" 30#include "cstring.h" 31#include "rbbirb.h" 32#include "rbbinode.h" 33#include "rbbiscan.h" 34#include "rbbisetb.h" 35#include "rbbitblb.h" 36#include "rbbidata.h" 37#include "uassert.h" 38 39 40U_NAMESPACE_BEGIN 41 42 43//---------------------------------------------------------------------------------------- 44// 45// Constructor. 46// 47//---------------------------------------------------------------------------------------- 48RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, 49 UParseError *parseErr, 50 UErrorCode &status) 51 : fRules(rules), fStrippedRules(rules) 52{ 53 fStatus = &status; // status is checked below 54 fParseError = parseErr; 55 fDebugEnv = nullptr; 56#ifdef RBBI_DEBUG 57 fDebugEnv = getenv("U_RBBIDEBUG"); 58#endif 59 60 61 fForwardTree = nullptr; 62 fReverseTree = nullptr; 63 fSafeFwdTree = nullptr; 64 fSafeRevTree = nullptr; 65 fDefaultTree = &fForwardTree; 66 fForwardTable = nullptr; 67 fRuleStatusVals = nullptr; 68 fChainRules = false; 69 fLookAheadHardBreak = false; 70 fUSetNodes = nullptr; 71 fRuleStatusVals = nullptr; 72 fScanner = nullptr; 73 fSetBuilder = nullptr; 74 if (parseErr) { 75 uprv_memset(parseErr, 0, sizeof(UParseError)); 76 } 77 78 if (U_FAILURE(status)) { 79 return; 80 } 81 82 fUSetNodes = new UVector(status); // bcos status gets overwritten here 83 fRuleStatusVals = new UVector(status); 84 fScanner = new RBBIRuleScanner(this); 85 fSetBuilder = new RBBISetBuilder(this); 86 if (U_FAILURE(status)) { 87 return; 88 } 89 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { 90 status = U_MEMORY_ALLOCATION_ERROR; 91 } 92} 93 94 95 96//---------------------------------------------------------------------------------------- 97// 98// Destructor 99// 100//---------------------------------------------------------------------------------------- 101RBBIRuleBuilder::~RBBIRuleBuilder() { 102 103 int i; 104 for (i=0; ; i++) { 105 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); 106 if (n==nullptr) { 107 break; 108 } 109 delete n; 110 } 111 112 delete fUSetNodes; 113 delete fSetBuilder; 114 delete fForwardTable; 115 delete fForwardTree; 116 delete fReverseTree; 117 delete fSafeFwdTree; 118 delete fSafeRevTree; 119 delete fScanner; 120 delete fRuleStatusVals; 121} 122 123 124 125 126 127//---------------------------------------------------------------------------------------- 128// 129// flattenData() - Collect up the compiled RBBI rule data and put it into 130// the format for saving in ICU data files, 131// which is also the format needed by the RBBI runtime engine. 132// 133//---------------------------------------------------------------------------------------- 134static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} 135 136RBBIDataHeader *RBBIRuleBuilder::flattenData() { 137 int32_t i; 138 139 if (U_FAILURE(*fStatus)) { 140 return nullptr; 141 } 142 143 // Remove whitespace from the rules to make it smaller. 144 // The rule parser has already removed comments. 145 fStrippedRules = fScanner->stripRules(fStrippedRules); 146 147 // Calculate the size of each section in the data. 148 // Sizes here are padded up to a multiple of 8 for better memory alignment. 149 // Sections sizes actually stored in the header are for the actual data 150 // without the padding. 151 // 152 int32_t headerSize = align8(sizeof(RBBIDataHeader)); 153 int32_t forwardTableSize = align8(fForwardTable->getTableSize()); 154 int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); 155 int32_t trieSize = align8(fSetBuilder->getTrieSize()); 156 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); 157 158 int32_t rulesLengthInUTF8 = 0; 159 u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8, 160 fStrippedRules.getBuffer(), fStrippedRules.length(), 161 0xfffd, nullptr, fStatus); 162 *fStatus = U_ZERO_ERROR; 163 164 int32_t rulesSize = align8((rulesLengthInUTF8+1)); 165 166 int32_t totalSize = headerSize 167 + forwardTableSize 168 + reverseTableSize 169 + statusTableSize + trieSize + rulesSize; 170 171#ifdef RBBI_DEBUG 172 if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) { 173 RBBIDebugPrintf("Header Size: %8d\n", headerSize); 174 RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize); 175 RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize); 176 RBBIDebugPrintf("Trie Size: %8d\n", trieSize); 177 RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize); 178 RBBIDebugPrintf("Rules Size: %8d\n", rulesSize); 179 RBBIDebugPrintf("-----------------------------\n"); 180 RBBIDebugPrintf("Total Size: %8d\n", totalSize); 181 } 182#endif 183 184 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); 185 if (data == nullptr) { 186 *fStatus = U_MEMORY_ALLOCATION_ERROR; 187 return nullptr; 188 } 189 uprv_memset(data, 0, totalSize); 190 191 192 data->fMagic = 0xb1a0; 193 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; 194 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; 195 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; 196 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; 197 data->fLength = totalSize; 198 data->fCatCount = fSetBuilder->getNumCharCategories(); 199 200 data->fFTable = headerSize; 201 data->fFTableLen = forwardTableSize; 202 203 data->fRTable = data->fFTable + data->fFTableLen; 204 data->fRTableLen = reverseTableSize; 205 206 data->fTrie = data->fRTable + data->fRTableLen; 207 data->fTrieLen = trieSize; 208 data->fStatusTable = data->fTrie + data->fTrieLen; 209 data->fStatusTableLen= statusTableSize; 210 data->fRuleSource = data->fStatusTable + statusTableSize; 211 data->fRuleSourceLen = rulesLengthInUTF8; 212 213 uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); 214 215 fForwardTable->exportTable((uint8_t *)data + data->fFTable); 216 fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable); 217 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); 218 219 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); 220 for (i=0; i<fRuleStatusVals->size(); i++) { 221 ruleStatusTable[i] = fRuleStatusVals->elementAti(i); 222 } 223 224 u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8, 225 fStrippedRules.getBuffer(), fStrippedRules.length(), 226 0xfffd, nullptr, fStatus); 227 if (U_FAILURE(*fStatus)) { 228 return nullptr; 229 } 230 231 return data; 232} 233 234 235//---------------------------------------------------------------------------------------- 236// 237// createRuleBasedBreakIterator construct from source rules that are passed in 238// in a UnicodeString 239// 240//---------------------------------------------------------------------------------------- 241BreakIterator * 242RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, 243 UParseError *parseError, 244 UErrorCode &status) 245{ 246 // 247 // Read the input rules, generate a parse tree, symbol table, 248 // and list of all Unicode Sets referenced by the rules. 249 // 250 RBBIRuleBuilder builder(rules, parseError, status); 251 if (U_FAILURE(status)) { // status checked here bcos build below doesn't 252 return nullptr; 253 } 254 255 RBBIDataHeader *data = builder.build(status); 256 257 if (U_FAILURE(status)) { 258 return nullptr; 259 } 260 261 // 262 // Create a break iterator from the compiled rules. 263 // (Identical to creation from stored pre-compiled rules) 264 // 265 // status is checked after init in construction. 266 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); 267 if (U_FAILURE(status)) { 268 delete This; 269 This = nullptr; 270 } 271 else if(This == nullptr) { // test for nullptr 272 status = U_MEMORY_ALLOCATION_ERROR; 273 } 274 return This; 275} 276 277RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { 278 if (U_FAILURE(status)) { 279 return nullptr; 280 } 281 282 fScanner->parse(); 283 if (U_FAILURE(status)) { 284 return nullptr; 285 } 286 287 // 288 // UnicodeSet processing. 289 // Munge the Unicode Sets to create an initial set of character categories. 290 // 291 fSetBuilder->buildRanges(); 292 293 // 294 // Generate the DFA state transition table. 295 // 296 fForwardTable = new RBBITableBuilder(this, &fForwardTree, status); 297 if (fForwardTable == nullptr) { 298 status = U_MEMORY_ALLOCATION_ERROR; 299 return nullptr; 300 } 301 302 fForwardTable->buildForwardTable(); 303 304 // State table and character category optimization. 305 // Merge equivalent rows and columns. 306 // Note that this process alters the initial set of character categories, 307 // causing the representation of UnicodeSets in the parse tree to become invalid. 308 309 optimizeTables(); 310 fForwardTable->buildSafeReverseTable(status); 311 312 313#ifdef RBBI_DEBUG 314 if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { 315 fForwardTable->printStates(); 316 fForwardTable->printRuleStatusTable(); 317 fForwardTable->printReverseTable(); 318 } 319#endif 320 321 // Generate the mapping tables (TRIE) from input code points to 322 // the character categories. 323 // 324 fSetBuilder->buildTrie(); 325 326 // 327 // Package up the compiled data into a memory image 328 // in the run-time format. 329 // 330 RBBIDataHeader *data = flattenData(); // returns nullptr if error 331 if (U_FAILURE(status)) { 332 return nullptr; 333 } 334 return data; 335} 336 337void RBBIRuleBuilder::optimizeTables() { 338 bool didSomething; 339 do { 340 didSomething = false; 341 342 // Begin looking for duplicates with char class 3. 343 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, 344 // and should not have other categories merged into them. 345 IntPair duplPair = {3, 0}; 346 while (fForwardTable->findDuplCharClassFrom(&duplPair)) { 347 fSetBuilder->mergeCategories(duplPair); 348 fForwardTable->removeColumn(duplPair.second); 349 didSomething = true; 350 } 351 352 while (fForwardTable->removeDuplicateStates() > 0) { 353 didSomething = true; 354 } 355 } while (didSomething); 356} 357 358U_NAMESPACE_END 359 360#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 361