11cb0ef41Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 31cb0ef41Sopenharmony_ci/* 41cb0ef41Sopenharmony_ci******************************************************************************* 51cb0ef41Sopenharmony_ci* 61cb0ef41Sopenharmony_ci* Copyright (C) 1999-2014 International Business Machines 71cb0ef41Sopenharmony_ci* Corporation and others. All Rights Reserved. 81cb0ef41Sopenharmony_ci* 91cb0ef41Sopenharmony_ci******************************************************************************* 101cb0ef41Sopenharmony_ci* file name: rbbidata.h 111cb0ef41Sopenharmony_ci* encoding: UTF-8 121cb0ef41Sopenharmony_ci* tab size: 8 (not used) 131cb0ef41Sopenharmony_ci* indentation:4 141cb0ef41Sopenharmony_ci* 151cb0ef41Sopenharmony_ci* RBBI data formats Includes 161cb0ef41Sopenharmony_ci* 171cb0ef41Sopenharmony_ci* Structs that describes the format of the Binary RBBI data, 181cb0ef41Sopenharmony_ci* as it is stored in ICU's data file. 191cb0ef41Sopenharmony_ci* 201cb0ef41Sopenharmony_ci* RBBIDataWrapper - Instances of this class sit between the 211cb0ef41Sopenharmony_ci* raw data structs and the RulesBasedBreakIterator objects 221cb0ef41Sopenharmony_ci* that are created by applications. The wrapper class 231cb0ef41Sopenharmony_ci* provides reference counting for the underlying data, 241cb0ef41Sopenharmony_ci* and direct pointers to data that would not otherwise 251cb0ef41Sopenharmony_ci* be accessible without ugly pointer arithmetic. The 261cb0ef41Sopenharmony_ci* wrapper does not attempt to provide any higher level 271cb0ef41Sopenharmony_ci* abstractions for the data itself. 281cb0ef41Sopenharmony_ci* 291cb0ef41Sopenharmony_ci* There will be only one instance of RBBIDataWrapper for any 301cb0ef41Sopenharmony_ci* set of RBBI run time data being shared by instances 311cb0ef41Sopenharmony_ci* (clones) of RulesBasedBreakIterator. 321cb0ef41Sopenharmony_ci*/ 331cb0ef41Sopenharmony_ci 341cb0ef41Sopenharmony_ci#ifndef __RBBIDATA_H__ 351cb0ef41Sopenharmony_ci#define __RBBIDATA_H__ 361cb0ef41Sopenharmony_ci 371cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 381cb0ef41Sopenharmony_ci#include "unicode/udata.h" 391cb0ef41Sopenharmony_ci#include "udataswp.h" 401cb0ef41Sopenharmony_ci 411cb0ef41Sopenharmony_ci/** 421cb0ef41Sopenharmony_ci * Swap RBBI data. See udataswp.h. 431cb0ef41Sopenharmony_ci * @internal 441cb0ef41Sopenharmony_ci */ 451cb0ef41Sopenharmony_ciU_CAPI int32_t U_EXPORT2 461cb0ef41Sopenharmony_ciubrk_swap(const UDataSwapper *ds, 471cb0ef41Sopenharmony_ci const void *inData, int32_t length, void *outData, 481cb0ef41Sopenharmony_ci UErrorCode *pErrorCode); 491cb0ef41Sopenharmony_ci 501cb0ef41Sopenharmony_ci#ifdef __cplusplus 511cb0ef41Sopenharmony_ci 521cb0ef41Sopenharmony_ci#include "unicode/ucptrie.h" 531cb0ef41Sopenharmony_ci#include "unicode/uobject.h" 541cb0ef41Sopenharmony_ci#include "unicode/unistr.h" 551cb0ef41Sopenharmony_ci#include "unicode/uversion.h" 561cb0ef41Sopenharmony_ci#include "umutex.h" 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_ci 591cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN 601cb0ef41Sopenharmony_ci 611cb0ef41Sopenharmony_ci// The current RBBI data format version. 621cb0ef41Sopenharmony_cistatic const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; 631cb0ef41Sopenharmony_ci 641cb0ef41Sopenharmony_ci/* 651cb0ef41Sopenharmony_ci * The following structs map exactly onto the raw data from ICU common data file. 661cb0ef41Sopenharmony_ci */ 671cb0ef41Sopenharmony_cistruct RBBIDataHeader { 681cb0ef41Sopenharmony_ci uint32_t fMagic; /* == 0xbla0 */ 691cb0ef41Sopenharmony_ci UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ 701cb0ef41Sopenharmony_ci /* if there is one associated with this data. */ 711cb0ef41Sopenharmony_ci /* (version originates in rbbi, is copied to UDataInfo) */ 721cb0ef41Sopenharmony_ci uint32_t fLength; /* Total length in bytes of this RBBI Data, */ 731cb0ef41Sopenharmony_ci /* including all sections, not just the header. */ 741cb0ef41Sopenharmony_ci uint32_t fCatCount; /* Number of character categories. */ 751cb0ef41Sopenharmony_ci 761cb0ef41Sopenharmony_ci /* */ 771cb0ef41Sopenharmony_ci /* Offsets and sizes of each of the subsections within the RBBI data. */ 781cb0ef41Sopenharmony_ci /* All offsets are bytes from the start of the RBBIDataHeader. */ 791cb0ef41Sopenharmony_ci /* All sizes are in bytes. */ 801cb0ef41Sopenharmony_ci /* */ 811cb0ef41Sopenharmony_ci uint32_t fFTable; /* forward state transition table. */ 821cb0ef41Sopenharmony_ci uint32_t fFTableLen; 831cb0ef41Sopenharmony_ci uint32_t fRTable; /* Offset to the reverse state transition table. */ 841cb0ef41Sopenharmony_ci uint32_t fRTableLen; 851cb0ef41Sopenharmony_ci uint32_t fTrie; /* Offset to Trie data for character categories */ 861cb0ef41Sopenharmony_ci uint32_t fTrieLen; 871cb0ef41Sopenharmony_ci uint32_t fRuleSource; /* Offset to the source for for the break */ 881cb0ef41Sopenharmony_ci uint32_t fRuleSourceLen; /* rules. Stored char16_t *. */ 891cb0ef41Sopenharmony_ci uint32_t fStatusTable; /* Offset to the table of rule status values */ 901cb0ef41Sopenharmony_ci uint32_t fStatusTableLen; 911cb0ef41Sopenharmony_ci 921cb0ef41Sopenharmony_ci uint32_t fReserved[6]; /* Reserved for expansion */ 931cb0ef41Sopenharmony_ci 941cb0ef41Sopenharmony_ci}; 951cb0ef41Sopenharmony_ci 961cb0ef41Sopenharmony_ci 971cb0ef41Sopenharmony_ci 981cb0ef41Sopenharmony_citemplate <typename T> 991cb0ef41Sopenharmony_cistruct RBBIStateTableRowT { 1001cb0ef41Sopenharmony_ci T fAccepting; // Non-zero if this row is for an accepting state. 1011cb0ef41Sopenharmony_ci // Value 0: not an accepting state. 1021cb0ef41Sopenharmony_ci // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. 1031cb0ef41Sopenharmony_ci // >1: Look-ahead match has completed. 1041cb0ef41Sopenharmony_ci // Actual boundary position happened earlier. 1051cb0ef41Sopenharmony_ci // Value here == fLookAhead in earlier 1061cb0ef41Sopenharmony_ci // state, at actual boundary pos. 1071cb0ef41Sopenharmony_ci T fLookAhead; // Non-zero if this row is for a state that 1081cb0ef41Sopenharmony_ci // corresponds to a '/' in the rule source. 1091cb0ef41Sopenharmony_ci // Value is the same as the fAccepting 1101cb0ef41Sopenharmony_ci // value for the rule (which will appear 1111cb0ef41Sopenharmony_ci // in a different state. 1121cb0ef41Sopenharmony_ci T fTagsIdx; // Non-zero if this row covers a {tagged} position 1131cb0ef41Sopenharmony_ci // from a rule. Value is the index in the 1141cb0ef41Sopenharmony_ci // StatusTable of the set of matching 1151cb0ef41Sopenharmony_ci // tags (rule status values) 1161cb0ef41Sopenharmony_ci T fNextState[1]; // Next State, indexed by char category. 1171cb0ef41Sopenharmony_ci // Variable-length array declared with length 1 1181cb0ef41Sopenharmony_ci // to disable bounds checkers. 1191cb0ef41Sopenharmony_ci // Array Size is actually fData->fHeader->fCatCount 1201cb0ef41Sopenharmony_ci // CAUTION: see RBBITableBuilder::getTableSize() 1211cb0ef41Sopenharmony_ci // before changing anything here. 1221cb0ef41Sopenharmony_ci}; 1231cb0ef41Sopenharmony_ci 1241cb0ef41Sopenharmony_citypedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8; 1251cb0ef41Sopenharmony_citypedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16; 1261cb0ef41Sopenharmony_ci 1271cb0ef41Sopenharmony_ciconstexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting 1281cb0ef41Sopenharmony_ci 1291cb0ef41Sopenharmony_ciunion RBBIStateTableRow { 1301cb0ef41Sopenharmony_ci RBBIStateTableRow16 r16; 1311cb0ef41Sopenharmony_ci RBBIStateTableRow8 r8; 1321cb0ef41Sopenharmony_ci}; 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_cistruct RBBIStateTable { 1351cb0ef41Sopenharmony_ci uint32_t fNumStates; // Number of states. 1361cb0ef41Sopenharmony_ci uint32_t fRowLen; // Length of a state table row, in bytes. 1371cb0ef41Sopenharmony_ci uint32_t fDictCategoriesStart; // Char category number of the first dictionary 1381cb0ef41Sopenharmony_ci // char class, or the the largest category number + 1 1391cb0ef41Sopenharmony_ci // if there are no dictionary categories. 1401cb0ef41Sopenharmony_ci uint32_t fLookAheadResultsSize; // Size of run-time array required for holding 1411cb0ef41Sopenharmony_ci // look-ahead results. Indexed by row.fLookAhead. 1421cb0ef41Sopenharmony_ci uint32_t fFlags; // Option Flags for this state table. 1431cb0ef41Sopenharmony_ci char fTableData[1]; // First RBBIStateTableRow begins here. 1441cb0ef41Sopenharmony_ci // Variable-length array declared with length 1 1451cb0ef41Sopenharmony_ci // to disable bounds checkers. 1461cb0ef41Sopenharmony_ci // (making it char[] simplifies ugly address 1471cb0ef41Sopenharmony_ci // arithmetic for indexing variable length rows.) 1481cb0ef41Sopenharmony_ci}; 1491cb0ef41Sopenharmony_ci 1501cb0ef41Sopenharmony_ciconstexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; 1511cb0ef41Sopenharmony_ciconstexpr uint32_t RBBI_BOF_REQUIRED = 2; 1521cb0ef41Sopenharmony_ciconstexpr uint32_t RBBI_8BITS_ROWS = 4; 1531cb0ef41Sopenharmony_ci 1541cb0ef41Sopenharmony_ci 1551cb0ef41Sopenharmony_ci/* */ 1561cb0ef41Sopenharmony_ci/* The reference counting wrapper class */ 1571cb0ef41Sopenharmony_ci/* */ 1581cb0ef41Sopenharmony_ciclass RBBIDataWrapper : public UMemory { 1591cb0ef41Sopenharmony_cipublic: 1601cb0ef41Sopenharmony_ci enum EDontAdopt { 1611cb0ef41Sopenharmony_ci kDontAdopt 1621cb0ef41Sopenharmony_ci }; 1631cb0ef41Sopenharmony_ci RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); 1641cb0ef41Sopenharmony_ci RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); 1651cb0ef41Sopenharmony_ci RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); 1661cb0ef41Sopenharmony_ci ~RBBIDataWrapper(); 1671cb0ef41Sopenharmony_ci 1681cb0ef41Sopenharmony_ci static UBool isDataVersionAcceptable(const UVersionInfo version); 1691cb0ef41Sopenharmony_ci 1701cb0ef41Sopenharmony_ci void init0(); 1711cb0ef41Sopenharmony_ci void init(const RBBIDataHeader *data, UErrorCode &status); 1721cb0ef41Sopenharmony_ci RBBIDataWrapper *addReference(); 1731cb0ef41Sopenharmony_ci void removeReference(); 1741cb0ef41Sopenharmony_ci bool operator ==(const RBBIDataWrapper &other) const; 1751cb0ef41Sopenharmony_ci int32_t hashCode(); 1761cb0ef41Sopenharmony_ci const UnicodeString &getRuleSourceString() const; 1771cb0ef41Sopenharmony_ci void printData(); 1781cb0ef41Sopenharmony_ci void printTable(const char *heading, const RBBIStateTable *table); 1791cb0ef41Sopenharmony_ci 1801cb0ef41Sopenharmony_ci /* */ 1811cb0ef41Sopenharmony_ci /* Pointers to items within the data */ 1821cb0ef41Sopenharmony_ci /* */ 1831cb0ef41Sopenharmony_ci const RBBIDataHeader *fHeader; 1841cb0ef41Sopenharmony_ci const RBBIStateTable *fForwardTable; 1851cb0ef41Sopenharmony_ci const RBBIStateTable *fReverseTable; 1861cb0ef41Sopenharmony_ci const char *fRuleSource; 1871cb0ef41Sopenharmony_ci const int32_t *fRuleStatusTable; 1881cb0ef41Sopenharmony_ci 1891cb0ef41Sopenharmony_ci /* number of int32_t values in the rule status table. Used to sanity check indexing */ 1901cb0ef41Sopenharmony_ci int32_t fStatusMaxIdx; 1911cb0ef41Sopenharmony_ci 1921cb0ef41Sopenharmony_ci UCPTrie *fTrie; 1931cb0ef41Sopenharmony_ci 1941cb0ef41Sopenharmony_ciprivate: 1951cb0ef41Sopenharmony_ci u_atomic_int32_t fRefCount; 1961cb0ef41Sopenharmony_ci UDataMemory *fUDataMem; 1971cb0ef41Sopenharmony_ci UnicodeString fRuleString; 1981cb0ef41Sopenharmony_ci UBool fDontFreeData; 1991cb0ef41Sopenharmony_ci 2001cb0ef41Sopenharmony_ci RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 2011cb0ef41Sopenharmony_ci RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 2021cb0ef41Sopenharmony_ci}; 2031cb0ef41Sopenharmony_ci 2041cb0ef41Sopenharmony_ci 2051cb0ef41Sopenharmony_ci 2061cb0ef41Sopenharmony_ciU_NAMESPACE_END 2071cb0ef41Sopenharmony_ci 2081cb0ef41Sopenharmony_ciU_CFUNC UBool rbbi_cleanup(); 2091cb0ef41Sopenharmony_ci 2101cb0ef41Sopenharmony_ci#endif /* C++ */ 2111cb0ef41Sopenharmony_ci 2121cb0ef41Sopenharmony_ci#endif 213