11cb0ef41Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others.
21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
31cb0ef41Sopenharmony_ci/*
41cb0ef41Sopenharmony_ci*******************************************************************************
51cb0ef41Sopenharmony_ci*
61cb0ef41Sopenharmony_ci*   Copyright (C) 1999-2014 International Business Machines
71cb0ef41Sopenharmony_ci*   Corporation and others.  All Rights Reserved.
81cb0ef41Sopenharmony_ci*
91cb0ef41Sopenharmony_ci*******************************************************************************
101cb0ef41Sopenharmony_ci*   file name:  rbbidata.h
111cb0ef41Sopenharmony_ci*   encoding:   UTF-8
121cb0ef41Sopenharmony_ci*   tab size:   8 (not used)
131cb0ef41Sopenharmony_ci*   indentation:4
141cb0ef41Sopenharmony_ci*
151cb0ef41Sopenharmony_ci*   RBBI data formats  Includes
161cb0ef41Sopenharmony_ci*
171cb0ef41Sopenharmony_ci*                          Structs that describe the format of the binary RBBI data,
181cb0ef41Sopenharmony_ci*                          as it is stored in ICU's data file.
191cb0ef41Sopenharmony_ci*
201cb0ef41Sopenharmony_ci*      RBBIDataWrapper  -  Instances of this class sit between the
211cb0ef41Sopenharmony_ci*                          raw data structs and the RulesBasedBreakIterator objects
221cb0ef41Sopenharmony_ci*                          that are created by applications.  The wrapper class
231cb0ef41Sopenharmony_ci*                          provides reference counting for the underlying data,
241cb0ef41Sopenharmony_ci*                          and direct pointers to data that would not otherwise
251cb0ef41Sopenharmony_ci*                          be accessible without ugly pointer arithmetic.  The
261cb0ef41Sopenharmony_ci*                          wrapper does not attempt to provide any higher level
271cb0ef41Sopenharmony_ci*                          abstractions for the data itself.
281cb0ef41Sopenharmony_ci*
291cb0ef41Sopenharmony_ci*                          There will be only one instance of RBBIDataWrapper for any
301cb0ef41Sopenharmony_ci*                          set of RBBI run time data being shared by instances
311cb0ef41Sopenharmony_ci*                          (clones) of RulesBasedBreakIterator.
321cb0ef41Sopenharmony_ci*/
331cb0ef41Sopenharmony_ci
341cb0ef41Sopenharmony_ci#ifndef __RBBIDATA_H__
351cb0ef41Sopenharmony_ci#define __RBBIDATA_H__
361cb0ef41Sopenharmony_ci
371cb0ef41Sopenharmony_ci#include "unicode/utypes.h"
381cb0ef41Sopenharmony_ci#include "unicode/udata.h"
391cb0ef41Sopenharmony_ci#include "udataswp.h"
401cb0ef41Sopenharmony_ci
411cb0ef41Sopenharmony_ci/**
421cb0ef41Sopenharmony_ci * Swap RBBI data. See udataswp.h.
 *
 * NOTE(review): the parameter descriptions below are inferred from the
 * parameter names and the pointer to udataswp.h; confirm them against the
 * swap-function contract documented there.
 * @param ds          swapper describing the conversion to apply
 * @param inData      the RBBI data to be swapped
 * @param length      length of inData (presumably in bytes)
 * @param outData     destination for the swapped data
 * @param pErrorCode  ICU error code (in/out)
 * @return presumably the number of bytes processed -- TODO confirm
431cb0ef41Sopenharmony_ci * @internal
441cb0ef41Sopenharmony_ci */
451cb0ef41Sopenharmony_ciU_CAPI int32_t U_EXPORT2
461cb0ef41Sopenharmony_ciubrk_swap(const UDataSwapper *ds,
471cb0ef41Sopenharmony_ci          const void *inData, int32_t length, void *outData,
481cb0ef41Sopenharmony_ci          UErrorCode *pErrorCode);
491cb0ef41Sopenharmony_ci
501cb0ef41Sopenharmony_ci#ifdef __cplusplus
511cb0ef41Sopenharmony_ci
521cb0ef41Sopenharmony_ci#include "unicode/ucptrie.h"
531cb0ef41Sopenharmony_ci#include "unicode/uobject.h"
541cb0ef41Sopenharmony_ci#include "unicode/unistr.h"
551cb0ef41Sopenharmony_ci#include "unicode/uversion.h"
561cb0ef41Sopenharmony_ci#include "umutex.h"
571cb0ef41Sopenharmony_ci
581cb0ef41Sopenharmony_ci
591cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN
601cb0ef41Sopenharmony_ci
611cb0ef41Sopenharmony_ci// The current RBBI data format version, as four version bytes.
// Declared static in a header, so each translation unit that includes this
// file gets its own private copy of the array.
// NOTE(review): presumably the value that RBBIDataHeader::fFormatVersion is
// checked against (see RBBIDataWrapper::isDataVersionAcceptable) -- confirm
// in the implementation file.
621cb0ef41Sopenharmony_cistatic const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
631cb0ef41Sopenharmony_ci
641cb0ef41Sopenharmony_ci/*
651cb0ef41Sopenharmony_ci *   The following structs map exactly onto the raw data from the ICU common data file.
661cb0ef41Sopenharmony_ci */
/*
 *   Header of a block of binary RBBI data: magic number, format version,
 *   total length and category count, followed by the offset and length of
 *   each subsection.  Field order and sizes are part of the stored data
 *   format, so they must not be changed without changing the data as well.
 */
671cb0ef41Sopenharmony_cistruct RBBIDataHeader {
681cb0ef41Sopenharmony_ci    uint32_t         fMagic;           /*  == 0xb1a0.  NOTE(review): this comment used to read     */
                                       /*     "0xbla0", which is not valid hex; 0xb1a0 is the      */
                                       /*     presumed intent -- confirm against the loader.       */
691cb0ef41Sopenharmony_ci    UVersionInfo     fFormatVersion;   /* Data Format.  Same as the value in struct UDataInfo      */
701cb0ef41Sopenharmony_ci                                       /*   if there is one associated with this data.             */
711cb0ef41Sopenharmony_ci                                       /*     (version originates in rbbi, is copied to UDataInfo) */
721cb0ef41Sopenharmony_ci    uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
731cb0ef41Sopenharmony_ci                                       /*      including all sections, not just the header.        */
741cb0ef41Sopenharmony_ci    uint32_t         fCatCount;        /*  Number of character categories.                         */
751cb0ef41Sopenharmony_ci
761cb0ef41Sopenharmony_ci    /*                                                                        */
771cb0ef41Sopenharmony_ci    /*  Offsets and sizes of each of the subsections within the RBBI data.    */
781cb0ef41Sopenharmony_ci    /*  All offsets are bytes from the start of the RBBIDataHeader.           */
791cb0ef41Sopenharmony_ci    /*  All sizes are in bytes.                                               */
801cb0ef41Sopenharmony_ci    /*                                                                        */
811cb0ef41Sopenharmony_ci    uint32_t         fFTable;         /*  Offset to the forward state transition table. */
821cb0ef41Sopenharmony_ci    uint32_t         fFTableLen;
831cb0ef41Sopenharmony_ci    uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
841cb0ef41Sopenharmony_ci    uint32_t         fRTableLen;
851cb0ef41Sopenharmony_ci    uint32_t         fTrie;           /*  Offset to Trie data for character categories */
861cb0ef41Sopenharmony_ci    uint32_t         fTrieLen;
871cb0ef41Sopenharmony_ci    uint32_t         fRuleSource;     /*  Offset to the source for the break */
881cb0ef41Sopenharmony_ci    uint32_t         fRuleSourceLen;  /*    rules.  Stored char16_t *. */
                                      /*    NOTE(review): RBBIDataWrapper::fRuleSource is declared  */
                                      /*    const char *, so "char16_t *" may be stale -- confirm.  */
891cb0ef41Sopenharmony_ci    uint32_t         fStatusTable;    /* Offset to the table of rule status values */
901cb0ef41Sopenharmony_ci    uint32_t         fStatusTableLen;
911cb0ef41Sopenharmony_ci
921cb0ef41Sopenharmony_ci    uint32_t         fReserved[6];    /*  Reserved for expansion */
931cb0ef41Sopenharmony_ci
941cb0ef41Sopenharmony_ci};
951cb0ef41Sopenharmony_ci
961cb0ef41Sopenharmony_ci
971cb0ef41Sopenharmony_ci
// One row of a state table.  T is the integer type of every cell in the row;
// this header instantiates it with uint8_t and uint16_t.  The three leading
// fields are followed by the next-state cells, one per character category.
981cb0ef41Sopenharmony_citemplate <typename T>
991cb0ef41Sopenharmony_cistruct RBBIStateTableRowT {
1001cb0ef41Sopenharmony_ci    T               fAccepting;    //  Non-zero if this row is for an accepting state.
1011cb0ef41Sopenharmony_ci                                   //  Value 0: not an accepting state.
1021cb0ef41Sopenharmony_ci                                   //        1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
1031cb0ef41Sopenharmony_ci                                   //       >1: Look-ahead match has completed.
1041cb0ef41Sopenharmony_ci                                   //           Actual boundary position happened earlier.
1051cb0ef41Sopenharmony_ci                                   //           Value here == fLookAhead in earlier
1061cb0ef41Sopenharmony_ci                                   //           state, at actual boundary pos.
1071cb0ef41Sopenharmony_ci    T               fLookAhead;    //  Non-zero if this row is for a state that
1081cb0ef41Sopenharmony_ci                                   //    corresponds to a '/' in the rule source.
1091cb0ef41Sopenharmony_ci                                   //    Value is the same as the fAccepting
1101cb0ef41Sopenharmony_ci                                   //    value for the rule (which will appear
1111cb0ef41Sopenharmony_ci                                   //    in a different state).
1121cb0ef41Sopenharmony_ci    T               fTagsIdx;      //  Non-zero if this row covers a {tagged} position
1131cb0ef41Sopenharmony_ci                                   //    from a rule.  Value is the index in the
1141cb0ef41Sopenharmony_ci                                   //    StatusTable of the set of matching
1151cb0ef41Sopenharmony_ci                                   //    tags (rule status values)
1161cb0ef41Sopenharmony_ci    T               fNextState[1]; //  Next State, indexed by char category.
1171cb0ef41Sopenharmony_ci                                   //    Variable-length array declared with length 1
1181cb0ef41Sopenharmony_ci                                   //    to disable bounds checkers.
1191cb0ef41Sopenharmony_ci                                   //    Array Size is actually fData->fHeader->fCatCount
1201cb0ef41Sopenharmony_ci                                   //    CAUTION:  see RBBITableBuilder::getTableSize()
1211cb0ef41Sopenharmony_ci                                   //              before changing anything here.
1221cb0ef41Sopenharmony_ci};
1231cb0ef41Sopenharmony_ci
// Concrete row layouts: state table cells stored as 8-bit or as 16-bit values.
1241cb0ef41Sopenharmony_citypedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
1251cb0ef41Sopenharmony_citypedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
1261cb0ef41Sopenharmony_ci
1271cb0ef41Sopenharmony_ciconstexpr uint16_t ACCEPTING_UNCONDITIONAL = 1;   // fAccepting value of a row that is an unconditional accepting state.
1281cb0ef41Sopenharmony_ci
// A state table row viewed through either cell width (r16 or r8).
// NOTE(review): which member is valid for a given table is presumably
// selected by the RBBI_8BITS_ROWS bit of RBBIStateTable::fFlags -- confirm
// against the code that walks the tables.
1291cb0ef41Sopenharmony_ciunion RBBIStateTableRow {
1301cb0ef41Sopenharmony_ci  RBBIStateTableRow16 r16;
1311cb0ef41Sopenharmony_ci  RBBIStateTableRow8 r8;
1321cb0ef41Sopenharmony_ci};
1331cb0ef41Sopenharmony_ci
// A state table: five fixed 32-bit fields followed directly by the rows.
// Rows are variable length (fRowLen bytes each), so the row storage is exposed
// as raw chars and rows are located by byte arithmetic.
1341cb0ef41Sopenharmony_cistruct RBBIStateTable {
1351cb0ef41Sopenharmony_ci    uint32_t         fNumStates;            // Number of states.
1361cb0ef41Sopenharmony_ci    uint32_t         fRowLen;               // Length of a state table row, in bytes.
1371cb0ef41Sopenharmony_ci    uint32_t         fDictCategoriesStart;  // Char category number of the first dictionary
1381cb0ef41Sopenharmony_ci                                            //   char class, or the largest category number + 1
1391cb0ef41Sopenharmony_ci                                            //   if there are no dictionary categories.
1401cb0ef41Sopenharmony_ci    uint32_t         fLookAheadResultsSize; // Size of run-time array required for holding
1411cb0ef41Sopenharmony_ci                                            //   look-ahead results. Indexed by row.fLookAhead.
1421cb0ef41Sopenharmony_ci    uint32_t         fFlags;                // Option Flags for this state table.
                                            //   NOTE(review): presumably a combination of the
                                            //   RBBI_* constants declared after this struct -- confirm.
1431cb0ef41Sopenharmony_ci    char             fTableData[1];         // First RBBIStateTableRow begins here.
1441cb0ef41Sopenharmony_ci                                            //   Variable-length array declared with length 1
1451cb0ef41Sopenharmony_ci                                            //   to disable bounds checkers.
1461cb0ef41Sopenharmony_ci                                            //   (making it char[] simplifies ugly address
1471cb0ef41Sopenharmony_ci                                            //   arithmetic for indexing variable length rows.)
1481cb0ef41Sopenharmony_ci};
1491cb0ef41Sopenharmony_ci
// Single-bit values (1, 2, 4), so they can be OR-ed together.
// NOTE(review): presumably the option bits stored in RBBIStateTable::fFlags;
// the meanings below are read off the names only -- confirm against the
// table builder and the break iterator before relying on them.
1501cb0ef41Sopenharmony_ciconstexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
1511cb0ef41Sopenharmony_ciconstexpr uint32_t RBBI_BOF_REQUIRED = 2;
1521cb0ef41Sopenharmony_ciconstexpr uint32_t RBBI_8BITS_ROWS = 4;  // presumably: rows use RBBIStateTableRow8 rather than RBBIStateTableRow16
1531cb0ef41Sopenharmony_ci
1541cb0ef41Sopenharmony_ci
1551cb0ef41Sopenharmony_ci/*                                        */
1561cb0ef41Sopenharmony_ci/*   The reference counting wrapper class */
1571cb0ef41Sopenharmony_ci/*                                        */
/*   Holds direct pointers into one block of RBBI data (header, state tables,  */
/*   rule source, rule status table, trie) together with a reference count.    */
/*   Copying is disabled: the copy constructor and copy assignment operator    */
/*   are deleted at the end of the class.                                      */
1581cb0ef41Sopenharmony_ciclass RBBIDataWrapper : public UMemory {
1591cb0ef41Sopenharmony_cipublic:
    // Tag type used only to select the constructor overload that takes it.
    // NOTE(review): by its name, that overload does not take ownership of
    // the data it is given -- confirm in the implementation.
1601cb0ef41Sopenharmony_ci    enum EDontAdopt {
1611cb0ef41Sopenharmony_ci        kDontAdopt
1621cb0ef41Sopenharmony_ci    };
    // Construct from a raw header pointer, from a raw header pointer with the
    // kDontAdopt tag, or from a UDataMemory.  Each reports failure via status.
1631cb0ef41Sopenharmony_ci    RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
1641cb0ef41Sopenharmony_ci    RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
1651cb0ef41Sopenharmony_ci    RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
1661cb0ef41Sopenharmony_ci    ~RBBIDataWrapper();
1671cb0ef41Sopenharmony_ci
    // NOTE(review): presumably checks a data format version against
    // RBBI_DATA_FORMAT_VERSION -- confirm in the implementation.
1681cb0ef41Sopenharmony_ci    static UBool          isDataVersionAcceptable(const UVersionInfo version);
1691cb0ef41Sopenharmony_ci
    // Initialization helpers; their exact division of work is defined in the
    // implementation file and is not visible from this header.
1701cb0ef41Sopenharmony_ci    void                  init0();
1711cb0ef41Sopenharmony_ci    void                  init(const RBBIDataHeader *data, UErrorCode &status);
    // Reference counting interface (the count itself is the private fRefCount).
    // NOTE(review): whether removeReference() destroys the object when the
    // count reaches zero is not visible here -- confirm before relying on it.
1721cb0ef41Sopenharmony_ci    RBBIDataWrapper      *addReference();
1731cb0ef41Sopenharmony_ci    void                  removeReference();
1741cb0ef41Sopenharmony_ci    bool                  operator ==(const RBBIDataWrapper &other) const;
1751cb0ef41Sopenharmony_ci    int32_t               hashCode();
1761cb0ef41Sopenharmony_ci    const UnicodeString  &getRuleSourceString() const;
    // NOTE(review): presumably debugging dumps of the data and of one state
    // table -- confirm.
1771cb0ef41Sopenharmony_ci    void                  printData();
1781cb0ef41Sopenharmony_ci    void                  printTable(const char *heading, const RBBIStateTable *table);
1791cb0ef41Sopenharmony_ci
1801cb0ef41Sopenharmony_ci    /*                                     */
1811cb0ef41Sopenharmony_ci    /*   Pointers to items within the data */
1821cb0ef41Sopenharmony_ci    /*                                     */
1831cb0ef41Sopenharmony_ci    const RBBIDataHeader     *fHeader;
1841cb0ef41Sopenharmony_ci    const RBBIStateTable     *fForwardTable;
1851cb0ef41Sopenharmony_ci    const RBBIStateTable     *fReverseTable;
1861cb0ef41Sopenharmony_ci    const char               *fRuleSource;
1871cb0ef41Sopenharmony_ci    const int32_t            *fRuleStatusTable;
1881cb0ef41Sopenharmony_ci
1891cb0ef41Sopenharmony_ci    /* number of int32_t values in the rule status table.   Used to sanity check indexing */
1901cb0ef41Sopenharmony_ci    int32_t             fStatusMaxIdx;
1911cb0ef41Sopenharmony_ci
    /* Trie for character categories (cf. RBBIDataHeader::fTrie).  Unlike the  */
    /* pointers above, this one is not const-qualified.                        */
1921cb0ef41Sopenharmony_ci    UCPTrie             *fTrie;
1931cb0ef41Sopenharmony_ci
1941cb0ef41Sopenharmony_ciprivate:
1951cb0ef41Sopenharmony_ci    u_atomic_int32_t    fRefCount;      /* reference count, held in an atomic integer type */
1961cb0ef41Sopenharmony_ci    UDataMemory        *fUDataMem;      /* presumably the UDataMemory passed to the constructor, if any -- confirm */
1971cb0ef41Sopenharmony_ci    UnicodeString       fRuleString;    /* presumably what getRuleSourceString() returns -- confirm */
1981cb0ef41Sopenharmony_ci    UBool               fDontFreeData;  /* presumably set by the kDontAdopt constructor -- confirm */
1991cb0ef41Sopenharmony_ci
2001cb0ef41Sopenharmony_ci    RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */
2011cb0ef41Sopenharmony_ci    RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */
2021cb0ef41Sopenharmony_ci};
2031cb0ef41Sopenharmony_ci
2041cb0ef41Sopenharmony_ci
2051cb0ef41Sopenharmony_ci
2061cb0ef41Sopenharmony_ciU_NAMESPACE_END
2071cb0ef41Sopenharmony_ci
// Declared with U_CFUNC, after the ICU namespace has been closed.
// NOTE(review): presumably the cleanup hook that releases RBBI-related global
// state; the UBool return value's meaning is not visible here -- confirm in
// the implementation.
2081cb0ef41Sopenharmony_ciU_CFUNC UBool rbbi_cleanup();
2091cb0ef41Sopenharmony_ci
2101cb0ef41Sopenharmony_ci#endif /* C++ */
2111cb0ef41Sopenharmony_ci
2121cb0ef41Sopenharmony_ci#endif
213