// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

101cb0ef41Sopenharmony_ci#ifndef DICTBE_H
111cb0ef41Sopenharmony_ci#define DICTBE_H
121cb0ef41Sopenharmony_ci
131cb0ef41Sopenharmony_ci#include "unicode/utypes.h"
141cb0ef41Sopenharmony_ci#include "unicode/uniset.h"
151cb0ef41Sopenharmony_ci#include "unicode/utext.h"
161cb0ef41Sopenharmony_ci
171cb0ef41Sopenharmony_ci#include "brkeng.h"
181cb0ef41Sopenharmony_ci#include "hash.h"
191cb0ef41Sopenharmony_ci#include "mlbe.h"
201cb0ef41Sopenharmony_ci#include "uvectr32.h"
211cb0ef41Sopenharmony_ci
221cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN
231cb0ef41Sopenharmony_ci
241cb0ef41Sopenharmony_ciclass DictionaryMatcher;
251cb0ef41Sopenharmony_ciclass MlBreakEngine;
261cb0ef41Sopenharmony_ciclass Normalizer2;
271cb0ef41Sopenharmony_ci
281cb0ef41Sopenharmony_ci/*******************************************************************
291cb0ef41Sopenharmony_ci * DictionaryBreakEngine
301cb0ef41Sopenharmony_ci */
311cb0ef41Sopenharmony_ci
321cb0ef41Sopenharmony_ci/**
331cb0ef41Sopenharmony_ci * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
341cb0ef41Sopenharmony_ci * dictionary to determine language-specific breaks.</p>
351cb0ef41Sopenharmony_ci *
361cb0ef41Sopenharmony_ci * <p>After it is constructed a DictionaryBreakEngine may be shared between
371cb0ef41Sopenharmony_ci * threads without synchronization.</p>
381cb0ef41Sopenharmony_ci */
391cb0ef41Sopenharmony_ciclass DictionaryBreakEngine : public LanguageBreakEngine {
401cb0ef41Sopenharmony_ci private:
411cb0ef41Sopenharmony_ci    /**
421cb0ef41Sopenharmony_ci     * The set of characters handled by this engine
431cb0ef41Sopenharmony_ci     * @internal
441cb0ef41Sopenharmony_ci     */
451cb0ef41Sopenharmony_ci
461cb0ef41Sopenharmony_ci  UnicodeSet    fSet;
471cb0ef41Sopenharmony_ci
481cb0ef41Sopenharmony_ci public:
491cb0ef41Sopenharmony_ci
501cb0ef41Sopenharmony_ci  /**
511cb0ef41Sopenharmony_ci   * <p>Constructor </p>
521cb0ef41Sopenharmony_ci   */
531cb0ef41Sopenharmony_ci  DictionaryBreakEngine();
541cb0ef41Sopenharmony_ci
551cb0ef41Sopenharmony_ci  /**
561cb0ef41Sopenharmony_ci   * <p>Virtual destructor.</p>
571cb0ef41Sopenharmony_ci   */
581cb0ef41Sopenharmony_ci  virtual ~DictionaryBreakEngine();
591cb0ef41Sopenharmony_ci
601cb0ef41Sopenharmony_ci  /**
611cb0ef41Sopenharmony_ci   * <p>Indicate whether this engine handles a particular character for
621cb0ef41Sopenharmony_ci   * a particular kind of break.</p>
631cb0ef41Sopenharmony_ci   *
641cb0ef41Sopenharmony_ci   * @param c A character which begins a run that the engine might handle
651cb0ef41Sopenharmony_ci   * @param locale The locale.
661cb0ef41Sopenharmony_ci   * @return true if this engine handles the particular character and break
671cb0ef41Sopenharmony_ci   * type.
681cb0ef41Sopenharmony_ci   */
691cb0ef41Sopenharmony_ci  virtual UBool handles(UChar32 c, const char* locale) const override;
701cb0ef41Sopenharmony_ci
711cb0ef41Sopenharmony_ci  /**
721cb0ef41Sopenharmony_ci   * <p>Find any breaks within a run in the supplied text.</p>
731cb0ef41Sopenharmony_ci   *
741cb0ef41Sopenharmony_ci   * @param text A UText representing the text. The iterator is left at
751cb0ef41Sopenharmony_ci   * the end of the run of characters which the engine is capable of handling
761cb0ef41Sopenharmony_ci   * that starts from the first character in the range.
771cb0ef41Sopenharmony_ci   * @param startPos The start of the run within the supplied text.
781cb0ef41Sopenharmony_ci   * @param endPos The end of the run within the supplied text.
791cb0ef41Sopenharmony_ci   * @param foundBreaks vector of int32_t to receive the break positions
801cb0ef41Sopenharmony_ci   * @param status Information on any errors encountered.
811cb0ef41Sopenharmony_ci   * @return The number of breaks found.
821cb0ef41Sopenharmony_ci   */
831cb0ef41Sopenharmony_ci  virtual int32_t findBreaks( UText *text,
841cb0ef41Sopenharmony_ci                              int32_t startPos,
851cb0ef41Sopenharmony_ci                              int32_t endPos,
861cb0ef41Sopenharmony_ci                              UVector32 &foundBreaks,
871cb0ef41Sopenharmony_ci                              UBool isPhraseBreaking,
881cb0ef41Sopenharmony_ci                              UErrorCode& status ) const override;
891cb0ef41Sopenharmony_ci
901cb0ef41Sopenharmony_ci protected:
911cb0ef41Sopenharmony_ci
921cb0ef41Sopenharmony_ci /**
931cb0ef41Sopenharmony_ci  * <p>Set the character set handled by this engine.</p>
941cb0ef41Sopenharmony_ci  *
951cb0ef41Sopenharmony_ci  * @param set A UnicodeSet of the set of characters handled by the engine
961cb0ef41Sopenharmony_ci  */
971cb0ef41Sopenharmony_ci  virtual void setCharacters( const UnicodeSet &set );
981cb0ef41Sopenharmony_ci
991cb0ef41Sopenharmony_ci /**
1001cb0ef41Sopenharmony_ci  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
1011cb0ef41Sopenharmony_ci  *
1021cb0ef41Sopenharmony_ci  * @param text A UText representing the text
1031cb0ef41Sopenharmony_ci  * @param rangeStart The start of the range of dictionary characters
1041cb0ef41Sopenharmony_ci  * @param rangeEnd The end of the range of dictionary characters
1051cb0ef41Sopenharmony_ci  * @param foundBreaks Output of C array of int32_t break positions, or 0
1061cb0ef41Sopenharmony_ci  * @param status Information on any errors encountered.
1071cb0ef41Sopenharmony_ci  * @return The number of breaks found
1081cb0ef41Sopenharmony_ci  */
1091cb0ef41Sopenharmony_ci  virtual int32_t divideUpDictionaryRange( UText *text,
1101cb0ef41Sopenharmony_ci                                           int32_t rangeStart,
1111cb0ef41Sopenharmony_ci                                           int32_t rangeEnd,
1121cb0ef41Sopenharmony_ci                                           UVector32 &foundBreaks,
1131cb0ef41Sopenharmony_ci                                           UBool isPhraseBreaking,
1141cb0ef41Sopenharmony_ci                                           UErrorCode& status) const = 0;
1151cb0ef41Sopenharmony_ci
1161cb0ef41Sopenharmony_ci};
1171cb0ef41Sopenharmony_ci
1181cb0ef41Sopenharmony_ci/*******************************************************************
1191cb0ef41Sopenharmony_ci * ThaiBreakEngine
1201cb0ef41Sopenharmony_ci */
1211cb0ef41Sopenharmony_ci
1221cb0ef41Sopenharmony_ci/**
1231cb0ef41Sopenharmony_ci * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
1241cb0ef41Sopenharmony_ci * dictionary and heuristics to determine Thai-specific breaks.</p>
1251cb0ef41Sopenharmony_ci *
1261cb0ef41Sopenharmony_ci * <p>After it is constructed a ThaiBreakEngine may be shared between
1271cb0ef41Sopenharmony_ci * threads without synchronization.</p>
1281cb0ef41Sopenharmony_ci */
1291cb0ef41Sopenharmony_ciclass ThaiBreakEngine : public DictionaryBreakEngine {
1301cb0ef41Sopenharmony_ci private:
1311cb0ef41Sopenharmony_ci    /**
1321cb0ef41Sopenharmony_ci     * The set of characters handled by this engine
1331cb0ef41Sopenharmony_ci     * @internal
1341cb0ef41Sopenharmony_ci     */
1351cb0ef41Sopenharmony_ci
1361cb0ef41Sopenharmony_ci  UnicodeSet                fEndWordSet;
1371cb0ef41Sopenharmony_ci  UnicodeSet                fBeginWordSet;
1381cb0ef41Sopenharmony_ci  UnicodeSet                fSuffixSet;
1391cb0ef41Sopenharmony_ci  UnicodeSet                fMarkSet;
1401cb0ef41Sopenharmony_ci  DictionaryMatcher  *fDictionary;
1411cb0ef41Sopenharmony_ci
1421cb0ef41Sopenharmony_ci public:
1431cb0ef41Sopenharmony_ci
1441cb0ef41Sopenharmony_ci  /**
1451cb0ef41Sopenharmony_ci   * <p>Default constructor.</p>
1461cb0ef41Sopenharmony_ci   *
1471cb0ef41Sopenharmony_ci   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
1481cb0ef41Sopenharmony_ci   * engine is deleted.
1491cb0ef41Sopenharmony_ci   */
1501cb0ef41Sopenharmony_ci  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
1511cb0ef41Sopenharmony_ci
1521cb0ef41Sopenharmony_ci  /**
1531cb0ef41Sopenharmony_ci   * <p>Virtual destructor.</p>
1541cb0ef41Sopenharmony_ci   */
1551cb0ef41Sopenharmony_ci  virtual ~ThaiBreakEngine();
1561cb0ef41Sopenharmony_ci
1571cb0ef41Sopenharmony_ci protected:
1581cb0ef41Sopenharmony_ci /**
1591cb0ef41Sopenharmony_ci  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
1601cb0ef41Sopenharmony_ci  *
1611cb0ef41Sopenharmony_ci  * @param text A UText representing the text
1621cb0ef41Sopenharmony_ci  * @param rangeStart The start of the range of dictionary characters
1631cb0ef41Sopenharmony_ci  * @param rangeEnd The end of the range of dictionary characters
1641cb0ef41Sopenharmony_ci  * @param foundBreaks Output of C array of int32_t break positions, or 0
1651cb0ef41Sopenharmony_ci  * @param status Information on any errors encountered.
1661cb0ef41Sopenharmony_ci  * @return The number of breaks found
1671cb0ef41Sopenharmony_ci  */
1681cb0ef41Sopenharmony_ci  virtual int32_t divideUpDictionaryRange( UText *text,
1691cb0ef41Sopenharmony_ci                                           int32_t rangeStart,
1701cb0ef41Sopenharmony_ci                                           int32_t rangeEnd,
1711cb0ef41Sopenharmony_ci                                           UVector32 &foundBreaks,
1721cb0ef41Sopenharmony_ci                                           UBool isPhraseBreaking,
1731cb0ef41Sopenharmony_ci                                           UErrorCode& status) const override;
1741cb0ef41Sopenharmony_ci
1751cb0ef41Sopenharmony_ci};
1761cb0ef41Sopenharmony_ci
1771cb0ef41Sopenharmony_ci/*******************************************************************
1781cb0ef41Sopenharmony_ci * LaoBreakEngine
1791cb0ef41Sopenharmony_ci */
1801cb0ef41Sopenharmony_ci
1811cb0ef41Sopenharmony_ci/**
1821cb0ef41Sopenharmony_ci * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
1831cb0ef41Sopenharmony_ci * dictionary and heuristics to determine Lao-specific breaks.</p>
1841cb0ef41Sopenharmony_ci *
1851cb0ef41Sopenharmony_ci * <p>After it is constructed a LaoBreakEngine may be shared between
1861cb0ef41Sopenharmony_ci * threads without synchronization.</p>
1871cb0ef41Sopenharmony_ci */
1881cb0ef41Sopenharmony_ciclass LaoBreakEngine : public DictionaryBreakEngine {
1891cb0ef41Sopenharmony_ci private:
1901cb0ef41Sopenharmony_ci    /**
1911cb0ef41Sopenharmony_ci     * The set of characters handled by this engine
1921cb0ef41Sopenharmony_ci     * @internal
1931cb0ef41Sopenharmony_ci     */
1941cb0ef41Sopenharmony_ci
1951cb0ef41Sopenharmony_ci  UnicodeSet                fEndWordSet;
1961cb0ef41Sopenharmony_ci  UnicodeSet                fBeginWordSet;
1971cb0ef41Sopenharmony_ci  UnicodeSet                fMarkSet;
1981cb0ef41Sopenharmony_ci  DictionaryMatcher  *fDictionary;
1991cb0ef41Sopenharmony_ci
2001cb0ef41Sopenharmony_ci public:
2011cb0ef41Sopenharmony_ci
2021cb0ef41Sopenharmony_ci  /**
2031cb0ef41Sopenharmony_ci   * <p>Default constructor.</p>
2041cb0ef41Sopenharmony_ci   *
2051cb0ef41Sopenharmony_ci   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
2061cb0ef41Sopenharmony_ci   * engine is deleted.
2071cb0ef41Sopenharmony_ci   */
2081cb0ef41Sopenharmony_ci  LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
2091cb0ef41Sopenharmony_ci
2101cb0ef41Sopenharmony_ci  /**
2111cb0ef41Sopenharmony_ci   * <p>Virtual destructor.</p>
2121cb0ef41Sopenharmony_ci   */
2131cb0ef41Sopenharmony_ci  virtual ~LaoBreakEngine();
2141cb0ef41Sopenharmony_ci
2151cb0ef41Sopenharmony_ci protected:
2161cb0ef41Sopenharmony_ci /**
2171cb0ef41Sopenharmony_ci  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
2181cb0ef41Sopenharmony_ci  *
2191cb0ef41Sopenharmony_ci  * @param text A UText representing the text
2201cb0ef41Sopenharmony_ci  * @param rangeStart The start of the range of dictionary characters
2211cb0ef41Sopenharmony_ci  * @param rangeEnd The end of the range of dictionary characters
2221cb0ef41Sopenharmony_ci  * @param foundBreaks Output of C array of int32_t break positions, or 0
2231cb0ef41Sopenharmony_ci  * @param status Information on any errors encountered.
2241cb0ef41Sopenharmony_ci  * @return The number of breaks found
2251cb0ef41Sopenharmony_ci  */
2261cb0ef41Sopenharmony_ci  virtual int32_t divideUpDictionaryRange( UText *text,
2271cb0ef41Sopenharmony_ci                                           int32_t rangeStart,
2281cb0ef41Sopenharmony_ci                                           int32_t rangeEnd,
2291cb0ef41Sopenharmony_ci                                           UVector32 &foundBreaks,
2301cb0ef41Sopenharmony_ci                                           UBool isPhraseBreaking,
2311cb0ef41Sopenharmony_ci                                           UErrorCode& status) const override;
2321cb0ef41Sopenharmony_ci
2331cb0ef41Sopenharmony_ci};
2341cb0ef41Sopenharmony_ci
2351cb0ef41Sopenharmony_ci/*******************************************************************
2361cb0ef41Sopenharmony_ci * BurmeseBreakEngine
2371cb0ef41Sopenharmony_ci */
2381cb0ef41Sopenharmony_ci
2391cb0ef41Sopenharmony_ci/**
2401cb0ef41Sopenharmony_ci * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
2411cb0ef41Sopenharmony_ci * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
2421cb0ef41Sopenharmony_ci *
2431cb0ef41Sopenharmony_ci * <p>After it is constructed a BurmeseBreakEngine may be shared between
2441cb0ef41Sopenharmony_ci * threads without synchronization.</p>
2451cb0ef41Sopenharmony_ci */
2461cb0ef41Sopenharmony_ciclass BurmeseBreakEngine : public DictionaryBreakEngine {
2471cb0ef41Sopenharmony_ci private:
2481cb0ef41Sopenharmony_ci    /**
2491cb0ef41Sopenharmony_ci     * The set of characters handled by this engine
2501cb0ef41Sopenharmony_ci     * @internal
2511cb0ef41Sopenharmony_ci     */
2521cb0ef41Sopenharmony_ci
2531cb0ef41Sopenharmony_ci  UnicodeSet                fEndWordSet;
2541cb0ef41Sopenharmony_ci  UnicodeSet                fBeginWordSet;
2551cb0ef41Sopenharmony_ci  UnicodeSet                fMarkSet;
2561cb0ef41Sopenharmony_ci  DictionaryMatcher  *fDictionary;
2571cb0ef41Sopenharmony_ci
2581cb0ef41Sopenharmony_ci public:
2591cb0ef41Sopenharmony_ci
2601cb0ef41Sopenharmony_ci  /**
2611cb0ef41Sopenharmony_ci   * <p>Default constructor.</p>
2621cb0ef41Sopenharmony_ci   *
2631cb0ef41Sopenharmony_ci   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
2641cb0ef41Sopenharmony_ci   * engine is deleted.
2651cb0ef41Sopenharmony_ci   */
2661cb0ef41Sopenharmony_ci  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
2671cb0ef41Sopenharmony_ci
2681cb0ef41Sopenharmony_ci  /**
2691cb0ef41Sopenharmony_ci   * <p>Virtual destructor.</p>
2701cb0ef41Sopenharmony_ci   */
2711cb0ef41Sopenharmony_ci  virtual ~BurmeseBreakEngine();
2721cb0ef41Sopenharmony_ci
2731cb0ef41Sopenharmony_ci protected:
2741cb0ef41Sopenharmony_ci /**
2751cb0ef41Sopenharmony_ci  * <p>Divide up a range of known dictionary characters.</p>
2761cb0ef41Sopenharmony_ci  *
2771cb0ef41Sopenharmony_ci  * @param text A UText representing the text
2781cb0ef41Sopenharmony_ci  * @param rangeStart The start of the range of dictionary characters
2791cb0ef41Sopenharmony_ci  * @param rangeEnd The end of the range of dictionary characters
2801cb0ef41Sopenharmony_ci  * @param foundBreaks Output of C array of int32_t break positions, or 0
2811cb0ef41Sopenharmony_ci  * @param status Information on any errors encountered.
2821cb0ef41Sopenharmony_ci  * @return The number of breaks found
2831cb0ef41Sopenharmony_ci  */
2841cb0ef41Sopenharmony_ci  virtual int32_t divideUpDictionaryRange( UText *text,
2851cb0ef41Sopenharmony_ci                                           int32_t rangeStart,
2861cb0ef41Sopenharmony_ci                                           int32_t rangeEnd,
2871cb0ef41Sopenharmony_ci                                           UVector32 &foundBreaks,
2881cb0ef41Sopenharmony_ci                                           UBool isPhraseBreaking,
2891cb0ef41Sopenharmony_ci                                           UErrorCode& status) const override;
2901cb0ef41Sopenharmony_ci
2911cb0ef41Sopenharmony_ci};
2921cb0ef41Sopenharmony_ci
2931cb0ef41Sopenharmony_ci/*******************************************************************
2941cb0ef41Sopenharmony_ci * KhmerBreakEngine
2951cb0ef41Sopenharmony_ci */
2961cb0ef41Sopenharmony_ci
2971cb0ef41Sopenharmony_ci/**
2981cb0ef41Sopenharmony_ci * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
2991cb0ef41Sopenharmony_ci * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
3001cb0ef41Sopenharmony_ci *
3011cb0ef41Sopenharmony_ci * <p>After it is constructed a KhmerBreakEngine may be shared between
3021cb0ef41Sopenharmony_ci * threads without synchronization.</p>
3031cb0ef41Sopenharmony_ci */
3041cb0ef41Sopenharmony_ciclass KhmerBreakEngine : public DictionaryBreakEngine {
3051cb0ef41Sopenharmony_ci private:
3061cb0ef41Sopenharmony_ci    /**
3071cb0ef41Sopenharmony_ci     * The set of characters handled by this engine
3081cb0ef41Sopenharmony_ci     * @internal
3091cb0ef41Sopenharmony_ci     */
3101cb0ef41Sopenharmony_ci
3111cb0ef41Sopenharmony_ci  UnicodeSet                fEndWordSet;
3121cb0ef41Sopenharmony_ci  UnicodeSet                fBeginWordSet;
3131cb0ef41Sopenharmony_ci  UnicodeSet                fMarkSet;
3141cb0ef41Sopenharmony_ci  DictionaryMatcher  *fDictionary;
3151cb0ef41Sopenharmony_ci
3161cb0ef41Sopenharmony_ci public:
3171cb0ef41Sopenharmony_ci
3181cb0ef41Sopenharmony_ci  /**
3191cb0ef41Sopenharmony_ci   * <p>Default constructor.</p>
3201cb0ef41Sopenharmony_ci   *
3211cb0ef41Sopenharmony_ci   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
3221cb0ef41Sopenharmony_ci   * engine is deleted.
3231cb0ef41Sopenharmony_ci   */
3241cb0ef41Sopenharmony_ci  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
3251cb0ef41Sopenharmony_ci
3261cb0ef41Sopenharmony_ci  /**
3271cb0ef41Sopenharmony_ci   * <p>Virtual destructor.</p>
3281cb0ef41Sopenharmony_ci   */
3291cb0ef41Sopenharmony_ci  virtual ~KhmerBreakEngine();
3301cb0ef41Sopenharmony_ci
3311cb0ef41Sopenharmony_ci protected:
3321cb0ef41Sopenharmony_ci /**
3331cb0ef41Sopenharmony_ci  * <p>Divide up a range of known dictionary characters.</p>
3341cb0ef41Sopenharmony_ci  *
3351cb0ef41Sopenharmony_ci  * @param text A UText representing the text
3361cb0ef41Sopenharmony_ci  * @param rangeStart The start of the range of dictionary characters
3371cb0ef41Sopenharmony_ci  * @param rangeEnd The end of the range of dictionary characters
3381cb0ef41Sopenharmony_ci  * @param foundBreaks Output of C array of int32_t break positions, or 0
3391cb0ef41Sopenharmony_ci  * @param status Information on any errors encountered.
3401cb0ef41Sopenharmony_ci  * @return The number of breaks found
3411cb0ef41Sopenharmony_ci  */
3421cb0ef41Sopenharmony_ci  virtual int32_t divideUpDictionaryRange( UText *text,
3431cb0ef41Sopenharmony_ci                                           int32_t rangeStart,
3441cb0ef41Sopenharmony_ci                                           int32_t rangeEnd,
3451cb0ef41Sopenharmony_ci                                           UVector32 &foundBreaks,
3461cb0ef41Sopenharmony_ci                                           UBool isPhraseBreaking,
3471cb0ef41Sopenharmony_ci                                           UErrorCode& status) const override;
3481cb0ef41Sopenharmony_ci
3491cb0ef41Sopenharmony_ci};
3501cb0ef41Sopenharmony_ci
3511cb0ef41Sopenharmony_ci#if !UCONFIG_NO_NORMALIZATION
3521cb0ef41Sopenharmony_ci
3531cb0ef41Sopenharmony_ci/*******************************************************************
3541cb0ef41Sopenharmony_ci * CjkBreakEngine
3551cb0ef41Sopenharmony_ci */
3561cb0ef41Sopenharmony_ci
3571cb0ef41Sopenharmony_ci//indicates language/script that the CjkBreakEngine will handle
3581cb0ef41Sopenharmony_cienum LanguageType {
3591cb0ef41Sopenharmony_ci    kKorean,
3601cb0ef41Sopenharmony_ci    kChineseJapanese
3611cb0ef41Sopenharmony_ci};
3621cb0ef41Sopenharmony_ci
3631cb0ef41Sopenharmony_ci/**
3641cb0ef41Sopenharmony_ci * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
3651cb0ef41Sopenharmony_ci * dictionary with costs associated with each word and
3661cb0ef41Sopenharmony_ci * Viterbi decoding to determine CJK-specific breaks.</p>
3671cb0ef41Sopenharmony_ci */
3681cb0ef41Sopenharmony_ciclass CjkBreakEngine : public DictionaryBreakEngine {
3691cb0ef41Sopenharmony_ci protected:
3701cb0ef41Sopenharmony_ci    /**
3711cb0ef41Sopenharmony_ci     * The set of characters handled by this engine
3721cb0ef41Sopenharmony_ci     * @internal
3731cb0ef41Sopenharmony_ci     */
3741cb0ef41Sopenharmony_ci  UnicodeSet                fHangulWordSet;
3751cb0ef41Sopenharmony_ci  UnicodeSet                fDigitOrOpenPunctuationOrAlphabetSet;
3761cb0ef41Sopenharmony_ci  UnicodeSet                fClosePunctuationSet;
3771cb0ef41Sopenharmony_ci
3781cb0ef41Sopenharmony_ci  DictionaryMatcher        *fDictionary;
3791cb0ef41Sopenharmony_ci  const Normalizer2        *nfkcNorm2;
3801cb0ef41Sopenharmony_ci  MlBreakEngine            *fMlBreakEngine;
3811cb0ef41Sopenharmony_ci  bool                      isCj;
3821cb0ef41Sopenharmony_ci
3831cb0ef41Sopenharmony_ci private:
3841cb0ef41Sopenharmony_ci  // Load Japanese extensions.
3851cb0ef41Sopenharmony_ci  void loadJapaneseExtensions(UErrorCode& error);
3861cb0ef41Sopenharmony_ci  // Load Japanese Hiragana.
3871cb0ef41Sopenharmony_ci  void loadHiragana(UErrorCode& error);
3881cb0ef41Sopenharmony_ci  // Initialize fSkipSet by loading Japanese Hiragana and extensions.
3891cb0ef41Sopenharmony_ci  void initJapanesePhraseParameter(UErrorCode& error);
3901cb0ef41Sopenharmony_ci
3911cb0ef41Sopenharmony_ci  Hashtable fSkipSet;
3921cb0ef41Sopenharmony_ci
3931cb0ef41Sopenharmony_ci public:
3941cb0ef41Sopenharmony_ci
3951cb0ef41Sopenharmony_ci    /**
3961cb0ef41Sopenharmony_ci     * <p>Default constructor.</p>
3971cb0ef41Sopenharmony_ci     *
3981cb0ef41Sopenharmony_ci     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
3991cb0ef41Sopenharmony_ci     * engine is deleted. The DictionaryMatcher must contain costs for each word
4001cb0ef41Sopenharmony_ci     * in order for the dictionary to work properly.
4011cb0ef41Sopenharmony_ci     */
4021cb0ef41Sopenharmony_ci  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
4031cb0ef41Sopenharmony_ci
4041cb0ef41Sopenharmony_ci    /**
4051cb0ef41Sopenharmony_ci     * <p>Virtual destructor.</p>
4061cb0ef41Sopenharmony_ci     */
4071cb0ef41Sopenharmony_ci  virtual ~CjkBreakEngine();
4081cb0ef41Sopenharmony_ci
4091cb0ef41Sopenharmony_ci protected:
4101cb0ef41Sopenharmony_ci    /**
4111cb0ef41Sopenharmony_ci     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
4121cb0ef41Sopenharmony_ci     *
4131cb0ef41Sopenharmony_ci     * @param text A UText representing the text
4141cb0ef41Sopenharmony_ci     * @param rangeStart The start of the range of dictionary characters
4151cb0ef41Sopenharmony_ci     * @param rangeEnd The end of the range of dictionary characters
4161cb0ef41Sopenharmony_ci     * @param foundBreaks Output of C array of int32_t break positions, or 0
4171cb0ef41Sopenharmony_ci     * @param status Information on any errors encountered.
4181cb0ef41Sopenharmony_ci     * @return The number of breaks found
4191cb0ef41Sopenharmony_ci     */
4201cb0ef41Sopenharmony_ci  virtual int32_t divideUpDictionaryRange( UText *text,
4211cb0ef41Sopenharmony_ci          int32_t rangeStart,
4221cb0ef41Sopenharmony_ci          int32_t rangeEnd,
4231cb0ef41Sopenharmony_ci          UVector32 &foundBreaks,
4241cb0ef41Sopenharmony_ci          UBool isPhraseBreaking,
4251cb0ef41Sopenharmony_ci          UErrorCode& status) const override;
4261cb0ef41Sopenharmony_ci
4271cb0ef41Sopenharmony_ci};
4281cb0ef41Sopenharmony_ci
4291cb0ef41Sopenharmony_ci#endif
4301cb0ef41Sopenharmony_ci
4311cb0ef41Sopenharmony_ciU_NAMESPACE_END
4321cb0ef41Sopenharmony_ci
4331cb0ef41Sopenharmony_ci    /* DICTBE_H */
4341cb0ef41Sopenharmony_ci#endif
435