12e5b6d6dSopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 32e5b6d6dSopenharmony_ci/* 42e5b6d6dSopenharmony_ci******************************************************************************* 52e5b6d6dSopenharmony_ci* 62e5b6d6dSopenharmony_ci* Copyright (C) 1999-2014 International Business Machines 72e5b6d6dSopenharmony_ci* Corporation and others. All Rights Reserved. 82e5b6d6dSopenharmony_ci* 92e5b6d6dSopenharmony_ci******************************************************************************* 102e5b6d6dSopenharmony_ci* file name: rbbidata.h 112e5b6d6dSopenharmony_ci* encoding: UTF-8 122e5b6d6dSopenharmony_ci* tab size: 8 (not used) 132e5b6d6dSopenharmony_ci* indentation:4 142e5b6d6dSopenharmony_ci* 152e5b6d6dSopenharmony_ci* RBBI data formats Includes 162e5b6d6dSopenharmony_ci* 172e5b6d6dSopenharmony_ci* Structs that describes the format of the Binary RBBI data, 182e5b6d6dSopenharmony_ci* as it is stored in ICU's data file. 192e5b6d6dSopenharmony_ci* 202e5b6d6dSopenharmony_ci* RBBIDataWrapper - Instances of this class sit between the 212e5b6d6dSopenharmony_ci* raw data structs and the RulesBasedBreakIterator objects 222e5b6d6dSopenharmony_ci* that are created by applications. The wrapper class 232e5b6d6dSopenharmony_ci* provides reference counting for the underlying data, 242e5b6d6dSopenharmony_ci* and direct pointers to data that would not otherwise 252e5b6d6dSopenharmony_ci* be accessible without ugly pointer arithmetic. The 262e5b6d6dSopenharmony_ci* wrapper does not attempt to provide any higher level 272e5b6d6dSopenharmony_ci* abstractions for the data itself. 282e5b6d6dSopenharmony_ci* 292e5b6d6dSopenharmony_ci* There will be only one instance of RBBIDataWrapper for any 302e5b6d6dSopenharmony_ci* set of RBBI run time data being shared by instances 312e5b6d6dSopenharmony_ci* (clones) of RulesBasedBreakIterator. 322e5b6d6dSopenharmony_ci*/ 332e5b6d6dSopenharmony_ci 342e5b6d6dSopenharmony_ci#ifndef __RBBIDATA_H__ 352e5b6d6dSopenharmony_ci#define __RBBIDATA_H__ 362e5b6d6dSopenharmony_ci 372e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 382e5b6d6dSopenharmony_ci#include "unicode/udata.h" 392e5b6d6dSopenharmony_ci#include "udataswp.h" 402e5b6d6dSopenharmony_ci 412e5b6d6dSopenharmony_ci/** 422e5b6d6dSopenharmony_ci * Swap RBBI data. See udataswp.h. 432e5b6d6dSopenharmony_ci * @internal 442e5b6d6dSopenharmony_ci */ 452e5b6d6dSopenharmony_ciU_CAPI int32_t U_EXPORT2 462e5b6d6dSopenharmony_ciubrk_swap(const UDataSwapper *ds, 472e5b6d6dSopenharmony_ci const void *inData, int32_t length, void *outData, 482e5b6d6dSopenharmony_ci UErrorCode *pErrorCode); 492e5b6d6dSopenharmony_ci 502e5b6d6dSopenharmony_ci#ifdef __cplusplus 512e5b6d6dSopenharmony_ci 522e5b6d6dSopenharmony_ci#include "unicode/ucptrie.h" 532e5b6d6dSopenharmony_ci#include "unicode/uobject.h" 542e5b6d6dSopenharmony_ci#include "unicode/unistr.h" 552e5b6d6dSopenharmony_ci#include "unicode/uversion.h" 562e5b6d6dSopenharmony_ci#include "umutex.h" 572e5b6d6dSopenharmony_ci 582e5b6d6dSopenharmony_ci 592e5b6d6dSopenharmony_ciU_NAMESPACE_BEGIN 602e5b6d6dSopenharmony_ci 612e5b6d6dSopenharmony_ci// The current RBBI data format version. 622e5b6d6dSopenharmony_cistatic const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; 632e5b6d6dSopenharmony_ci 642e5b6d6dSopenharmony_ci/* 652e5b6d6dSopenharmony_ci * The following structs map exactly onto the raw data from ICU common data file. 662e5b6d6dSopenharmony_ci */ 672e5b6d6dSopenharmony_cistruct RBBIDataHeader { 682e5b6d6dSopenharmony_ci uint32_t fMagic; /* == 0xbla0 */ 692e5b6d6dSopenharmony_ci UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ 702e5b6d6dSopenharmony_ci /* if there is one associated with this data. */ 712e5b6d6dSopenharmony_ci /* (version originates in rbbi, is copied to UDataInfo) */ 722e5b6d6dSopenharmony_ci uint32_t fLength; /* Total length in bytes of this RBBI Data, */ 732e5b6d6dSopenharmony_ci /* including all sections, not just the header. */ 742e5b6d6dSopenharmony_ci uint32_t fCatCount; /* Number of character categories. */ 752e5b6d6dSopenharmony_ci 762e5b6d6dSopenharmony_ci /* */ 772e5b6d6dSopenharmony_ci /* Offsets and sizes of each of the subsections within the RBBI data. */ 782e5b6d6dSopenharmony_ci /* All offsets are bytes from the start of the RBBIDataHeader. */ 792e5b6d6dSopenharmony_ci /* All sizes are in bytes. */ 802e5b6d6dSopenharmony_ci /* */ 812e5b6d6dSopenharmony_ci uint32_t fFTable; /* forward state transition table. */ 822e5b6d6dSopenharmony_ci uint32_t fFTableLen; 832e5b6d6dSopenharmony_ci uint32_t fRTable; /* Offset to the reverse state transition table. */ 842e5b6d6dSopenharmony_ci uint32_t fRTableLen; 852e5b6d6dSopenharmony_ci uint32_t fTrie; /* Offset to Trie data for character categories */ 862e5b6d6dSopenharmony_ci uint32_t fTrieLen; 872e5b6d6dSopenharmony_ci uint32_t fRuleSource; /* Offset to the source for for the break */ 882e5b6d6dSopenharmony_ci uint32_t fRuleSourceLen; /* rules. Stored UChar *. */ 892e5b6d6dSopenharmony_ci uint32_t fStatusTable; /* Offset to the table of rule status values */ 902e5b6d6dSopenharmony_ci uint32_t fStatusTableLen; 912e5b6d6dSopenharmony_ci 922e5b6d6dSopenharmony_ci uint32_t fReserved[6]; /* Reserved for expansion */ 932e5b6d6dSopenharmony_ci 942e5b6d6dSopenharmony_ci}; 952e5b6d6dSopenharmony_ci 962e5b6d6dSopenharmony_ci 972e5b6d6dSopenharmony_ci 982e5b6d6dSopenharmony_citemplate <typename T> 992e5b6d6dSopenharmony_cistruct RBBIStateTableRowT { 1002e5b6d6dSopenharmony_ci T fAccepting; // Non-zero if this row is for an accepting state. 1012e5b6d6dSopenharmony_ci // Value 0: not an accepting state. 1022e5b6d6dSopenharmony_ci // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. 1032e5b6d6dSopenharmony_ci // >1: Look-ahead match has completed. 1042e5b6d6dSopenharmony_ci // Actual boundary position happened earlier. 1052e5b6d6dSopenharmony_ci // Value here == fLookAhead in earlier 1062e5b6d6dSopenharmony_ci // state, at actual boundary pos. 1072e5b6d6dSopenharmony_ci T fLookAhead; // Non-zero if this row is for a state that 1082e5b6d6dSopenharmony_ci // corresponds to a '/' in the rule source. 1092e5b6d6dSopenharmony_ci // Value is the same as the fAccepting 1102e5b6d6dSopenharmony_ci // value for the rule (which will appear 1112e5b6d6dSopenharmony_ci // in a different state. 1122e5b6d6dSopenharmony_ci T fTagsIdx; // Non-zero if this row covers a {tagged} position 1132e5b6d6dSopenharmony_ci // from a rule. Value is the index in the 1142e5b6d6dSopenharmony_ci // StatusTable of the set of matching 1152e5b6d6dSopenharmony_ci // tags (rule status values) 1162e5b6d6dSopenharmony_ci T fNextState[1]; // Next State, indexed by char category. 1172e5b6d6dSopenharmony_ci // Variable-length array declared with length 1 1182e5b6d6dSopenharmony_ci // to disable bounds checkers. 1192e5b6d6dSopenharmony_ci // Array Size is actually fData->fHeader->fCatCount 1202e5b6d6dSopenharmony_ci // CAUTION: see RBBITableBuilder::getTableSize() 1212e5b6d6dSopenharmony_ci // before changing anything here. 1222e5b6d6dSopenharmony_ci}; 1232e5b6d6dSopenharmony_ci 1242e5b6d6dSopenharmony_citypedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8; 1252e5b6d6dSopenharmony_citypedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16; 1262e5b6d6dSopenharmony_ci 1272e5b6d6dSopenharmony_ciconstexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting 1282e5b6d6dSopenharmony_ci 1292e5b6d6dSopenharmony_ciunion RBBIStateTableRow { 1302e5b6d6dSopenharmony_ci RBBIStateTableRow16 r16; 1312e5b6d6dSopenharmony_ci RBBIStateTableRow8 r8; 1322e5b6d6dSopenharmony_ci}; 1332e5b6d6dSopenharmony_ci 1342e5b6d6dSopenharmony_cistruct RBBIStateTable { 1352e5b6d6dSopenharmony_ci uint32_t fNumStates; // Number of states. 1362e5b6d6dSopenharmony_ci uint32_t fRowLen; // Length of a state table row, in bytes. 1372e5b6d6dSopenharmony_ci uint32_t fDictCategoriesStart; // Char category number of the first dictionary 1382e5b6d6dSopenharmony_ci // char class, or the the largest category number + 1 1392e5b6d6dSopenharmony_ci // if there are no dictionary categories. 1402e5b6d6dSopenharmony_ci uint32_t fLookAheadResultsSize; // Size of run-time array required for holding 1412e5b6d6dSopenharmony_ci // look-ahead results. Indexed by row.fLookAhead. 1422e5b6d6dSopenharmony_ci uint32_t fFlags; // Option Flags for this state table. 1432e5b6d6dSopenharmony_ci char fTableData[1]; // First RBBIStateTableRow begins here. 1442e5b6d6dSopenharmony_ci // Variable-length array declared with length 1 1452e5b6d6dSopenharmony_ci // to disable bounds checkers. 1462e5b6d6dSopenharmony_ci // (making it char[] simplifies ugly address 1472e5b6d6dSopenharmony_ci // arithmetic for indexing variable length rows.) 1482e5b6d6dSopenharmony_ci}; 1492e5b6d6dSopenharmony_ci 1502e5b6d6dSopenharmony_ciconstexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; 1512e5b6d6dSopenharmony_ciconstexpr uint32_t RBBI_BOF_REQUIRED = 2; 1522e5b6d6dSopenharmony_ciconstexpr uint32_t RBBI_8BITS_ROWS = 4; 1532e5b6d6dSopenharmony_ci 1542e5b6d6dSopenharmony_ci 1552e5b6d6dSopenharmony_ci/* */ 1562e5b6d6dSopenharmony_ci/* The reference counting wrapper class */ 1572e5b6d6dSopenharmony_ci/* */ 1582e5b6d6dSopenharmony_ciclass RBBIDataWrapper : public UMemory { 1592e5b6d6dSopenharmony_cipublic: 1602e5b6d6dSopenharmony_ci enum EDontAdopt { 1612e5b6d6dSopenharmony_ci kDontAdopt 1622e5b6d6dSopenharmony_ci }; 1632e5b6d6dSopenharmony_ci RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); 1642e5b6d6dSopenharmony_ci RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); 1652e5b6d6dSopenharmony_ci RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); 1662e5b6d6dSopenharmony_ci ~RBBIDataWrapper(); 1672e5b6d6dSopenharmony_ci 1682e5b6d6dSopenharmony_ci static UBool isDataVersionAcceptable(const UVersionInfo version); 1692e5b6d6dSopenharmony_ci 1702e5b6d6dSopenharmony_ci void init0(); 1712e5b6d6dSopenharmony_ci void init(const RBBIDataHeader *data, UErrorCode &status); 1722e5b6d6dSopenharmony_ci RBBIDataWrapper *addReference(); 1732e5b6d6dSopenharmony_ci void removeReference(); 1742e5b6d6dSopenharmony_ci bool operator ==(const RBBIDataWrapper &other) const; 1752e5b6d6dSopenharmony_ci int32_t hashCode(); 1762e5b6d6dSopenharmony_ci const UnicodeString &getRuleSourceString() const; 1772e5b6d6dSopenharmony_ci void printData(); 1782e5b6d6dSopenharmony_ci void printTable(const char *heading, const RBBIStateTable *table); 1792e5b6d6dSopenharmony_ci 1802e5b6d6dSopenharmony_ci /* */ 1812e5b6d6dSopenharmony_ci /* Pointers to items within the data */ 1822e5b6d6dSopenharmony_ci /* */ 1832e5b6d6dSopenharmony_ci const RBBIDataHeader *fHeader; 1842e5b6d6dSopenharmony_ci const RBBIStateTable *fForwardTable; 1852e5b6d6dSopenharmony_ci const RBBIStateTable *fReverseTable; 1862e5b6d6dSopenharmony_ci const char *fRuleSource; 1872e5b6d6dSopenharmony_ci const int32_t *fRuleStatusTable; 1882e5b6d6dSopenharmony_ci 1892e5b6d6dSopenharmony_ci /* number of int32_t values in the rule status table. Used to sanity check indexing */ 1902e5b6d6dSopenharmony_ci int32_t fStatusMaxIdx; 1912e5b6d6dSopenharmony_ci 1922e5b6d6dSopenharmony_ci UCPTrie *fTrie; 1932e5b6d6dSopenharmony_ci 1942e5b6d6dSopenharmony_ciprivate: 1952e5b6d6dSopenharmony_ci u_atomic_int32_t fRefCount; 1962e5b6d6dSopenharmony_ci UDataMemory *fUDataMem; 1972e5b6d6dSopenharmony_ci UnicodeString fRuleString; 1982e5b6d6dSopenharmony_ci UBool fDontFreeData; 1992e5b6d6dSopenharmony_ci 2002e5b6d6dSopenharmony_ci RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 2012e5b6d6dSopenharmony_ci RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ 2022e5b6d6dSopenharmony_ci}; 2032e5b6d6dSopenharmony_ci 2042e5b6d6dSopenharmony_ci 2052e5b6d6dSopenharmony_ci 2062e5b6d6dSopenharmony_ciU_NAMESPACE_END 2072e5b6d6dSopenharmony_ci 2082e5b6d6dSopenharmony_ciU_CFUNC UBool rbbi_cleanup(void); 2092e5b6d6dSopenharmony_ci 2102e5b6d6dSopenharmony_ci#endif /* C++ */ 2112e5b6d6dSopenharmony_ci 2122e5b6d6dSopenharmony_ci#endif 213