11cb0ef41Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 21cb0ef41Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 31cb0ef41Sopenharmony_ci/** 41cb0ef41Sopenharmony_ci ******************************************************************************* 51cb0ef41Sopenharmony_ci * Copyright (C) 2006-2014, International Business Machines Corporation * 61cb0ef41Sopenharmony_ci * and others. All Rights Reserved. * 71cb0ef41Sopenharmony_ci ******************************************************************************* 81cb0ef41Sopenharmony_ci */ 91cb0ef41Sopenharmony_ci 101cb0ef41Sopenharmony_ci#ifndef DICTBE_H 111cb0ef41Sopenharmony_ci#define DICTBE_H 121cb0ef41Sopenharmony_ci 131cb0ef41Sopenharmony_ci#include "unicode/utypes.h" 141cb0ef41Sopenharmony_ci#include "unicode/uniset.h" 151cb0ef41Sopenharmony_ci#include "unicode/utext.h" 161cb0ef41Sopenharmony_ci 171cb0ef41Sopenharmony_ci#include "brkeng.h" 181cb0ef41Sopenharmony_ci#include "hash.h" 191cb0ef41Sopenharmony_ci#include "mlbe.h" 201cb0ef41Sopenharmony_ci#include "uvectr32.h" 211cb0ef41Sopenharmony_ci 221cb0ef41Sopenharmony_ciU_NAMESPACE_BEGIN 231cb0ef41Sopenharmony_ci 241cb0ef41Sopenharmony_ciclass DictionaryMatcher; 251cb0ef41Sopenharmony_ciclass MlBreakEngine; 261cb0ef41Sopenharmony_ciclass Normalizer2; 271cb0ef41Sopenharmony_ci 281cb0ef41Sopenharmony_ci/******************************************************************* 291cb0ef41Sopenharmony_ci * DictionaryBreakEngine 301cb0ef41Sopenharmony_ci */ 311cb0ef41Sopenharmony_ci 321cb0ef41Sopenharmony_ci/** 331cb0ef41Sopenharmony_ci * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 341cb0ef41Sopenharmony_ci * dictionary to determine language-specific breaks.</p> 351cb0ef41Sopenharmony_ci * 361cb0ef41Sopenharmony_ci * <p>After it is constructed a DictionaryBreakEngine may be shared between 371cb0ef41Sopenharmony_ci * threads without synchronization.</p> 381cb0ef41Sopenharmony_ci */ 391cb0ef41Sopenharmony_ciclass DictionaryBreakEngine : public LanguageBreakEngine { 401cb0ef41Sopenharmony_ci private: 411cb0ef41Sopenharmony_ci /** 421cb0ef41Sopenharmony_ci * The set of characters handled by this engine 431cb0ef41Sopenharmony_ci * @internal 441cb0ef41Sopenharmony_ci */ 451cb0ef41Sopenharmony_ci 461cb0ef41Sopenharmony_ci UnicodeSet fSet; 471cb0ef41Sopenharmony_ci 481cb0ef41Sopenharmony_ci public: 491cb0ef41Sopenharmony_ci 501cb0ef41Sopenharmony_ci /** 511cb0ef41Sopenharmony_ci * <p>Constructor </p> 521cb0ef41Sopenharmony_ci */ 531cb0ef41Sopenharmony_ci DictionaryBreakEngine(); 541cb0ef41Sopenharmony_ci 551cb0ef41Sopenharmony_ci /** 561cb0ef41Sopenharmony_ci * <p>Virtual destructor.</p> 571cb0ef41Sopenharmony_ci */ 581cb0ef41Sopenharmony_ci virtual ~DictionaryBreakEngine(); 591cb0ef41Sopenharmony_ci 601cb0ef41Sopenharmony_ci /** 611cb0ef41Sopenharmony_ci * <p>Indicate whether this engine handles a particular character for 621cb0ef41Sopenharmony_ci * a particular kind of break.</p> 631cb0ef41Sopenharmony_ci * 641cb0ef41Sopenharmony_ci * @param c A character which begins a run that the engine might handle 651cb0ef41Sopenharmony_ci * @param locale The locale. 661cb0ef41Sopenharmony_ci * @return true if this engine handles the particular character and break 671cb0ef41Sopenharmony_ci * type. 681cb0ef41Sopenharmony_ci */ 691cb0ef41Sopenharmony_ci virtual UBool handles(UChar32 c, const char* locale) const override; 701cb0ef41Sopenharmony_ci 711cb0ef41Sopenharmony_ci /** 721cb0ef41Sopenharmony_ci * <p>Find any breaks within a run in the supplied text.</p> 731cb0ef41Sopenharmony_ci * 741cb0ef41Sopenharmony_ci * @param text A UText representing the text. The iterator is left at 751cb0ef41Sopenharmony_ci * the end of the run of characters which the engine is capable of handling 761cb0ef41Sopenharmony_ci * that starts from the first character in the range. 771cb0ef41Sopenharmony_ci * @param startPos The start of the run within the supplied text. 781cb0ef41Sopenharmony_ci * @param endPos The end of the run within the supplied text. 791cb0ef41Sopenharmony_ci * @param foundBreaks vector of int32_t to receive the break positions 801cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 811cb0ef41Sopenharmony_ci * @return The number of breaks found. 821cb0ef41Sopenharmony_ci */ 831cb0ef41Sopenharmony_ci virtual int32_t findBreaks( UText *text, 841cb0ef41Sopenharmony_ci int32_t startPos, 851cb0ef41Sopenharmony_ci int32_t endPos, 861cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 871cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 881cb0ef41Sopenharmony_ci UErrorCode& status ) const override; 891cb0ef41Sopenharmony_ci 901cb0ef41Sopenharmony_ci protected: 911cb0ef41Sopenharmony_ci 921cb0ef41Sopenharmony_ci /** 931cb0ef41Sopenharmony_ci * <p>Set the character set handled by this engine.</p> 941cb0ef41Sopenharmony_ci * 951cb0ef41Sopenharmony_ci * @param set A UnicodeSet of the set of characters handled by the engine 961cb0ef41Sopenharmony_ci */ 971cb0ef41Sopenharmony_ci virtual void setCharacters( const UnicodeSet &set ); 981cb0ef41Sopenharmony_ci 991cb0ef41Sopenharmony_ci /** 1001cb0ef41Sopenharmony_ci * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 1011cb0ef41Sopenharmony_ci * 1021cb0ef41Sopenharmony_ci * @param text A UText representing the text 1031cb0ef41Sopenharmony_ci * @param rangeStart The start of the range of dictionary characters 1041cb0ef41Sopenharmony_ci * @param rangeEnd The end of the range of dictionary characters 1051cb0ef41Sopenharmony_ci * @param foundBreaks Output of C array of int32_t break positions, or 0 1061cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 1071cb0ef41Sopenharmony_ci * @return The number of breaks found 1081cb0ef41Sopenharmony_ci */ 1091cb0ef41Sopenharmony_ci virtual int32_t divideUpDictionaryRange( UText *text, 1101cb0ef41Sopenharmony_ci int32_t rangeStart, 1111cb0ef41Sopenharmony_ci int32_t rangeEnd, 1121cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 1131cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 1141cb0ef41Sopenharmony_ci UErrorCode& status) const = 0; 1151cb0ef41Sopenharmony_ci 1161cb0ef41Sopenharmony_ci}; 1171cb0ef41Sopenharmony_ci 1181cb0ef41Sopenharmony_ci/******************************************************************* 1191cb0ef41Sopenharmony_ci * ThaiBreakEngine 1201cb0ef41Sopenharmony_ci */ 1211cb0ef41Sopenharmony_ci 1221cb0ef41Sopenharmony_ci/** 1231cb0ef41Sopenharmony_ci * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 1241cb0ef41Sopenharmony_ci * dictionary and heuristics to determine Thai-specific breaks.</p> 1251cb0ef41Sopenharmony_ci * 1261cb0ef41Sopenharmony_ci * <p>After it is constructed a ThaiBreakEngine may be shared between 1271cb0ef41Sopenharmony_ci * threads without synchronization.</p> 1281cb0ef41Sopenharmony_ci */ 1291cb0ef41Sopenharmony_ciclass ThaiBreakEngine : public DictionaryBreakEngine { 1301cb0ef41Sopenharmony_ci private: 1311cb0ef41Sopenharmony_ci /** 1321cb0ef41Sopenharmony_ci * The set of characters handled by this engine 1331cb0ef41Sopenharmony_ci * @internal 1341cb0ef41Sopenharmony_ci */ 1351cb0ef41Sopenharmony_ci 1361cb0ef41Sopenharmony_ci UnicodeSet fEndWordSet; 1371cb0ef41Sopenharmony_ci UnicodeSet fBeginWordSet; 1381cb0ef41Sopenharmony_ci UnicodeSet fSuffixSet; 1391cb0ef41Sopenharmony_ci UnicodeSet fMarkSet; 1401cb0ef41Sopenharmony_ci DictionaryMatcher *fDictionary; 1411cb0ef41Sopenharmony_ci 1421cb0ef41Sopenharmony_ci public: 1431cb0ef41Sopenharmony_ci 1441cb0ef41Sopenharmony_ci /** 1451cb0ef41Sopenharmony_ci * <p>Default constructor.</p> 1461cb0ef41Sopenharmony_ci * 1471cb0ef41Sopenharmony_ci * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 1481cb0ef41Sopenharmony_ci * engine is deleted. 1491cb0ef41Sopenharmony_ci */ 1501cb0ef41Sopenharmony_ci ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 1511cb0ef41Sopenharmony_ci 1521cb0ef41Sopenharmony_ci /** 1531cb0ef41Sopenharmony_ci * <p>Virtual destructor.</p> 1541cb0ef41Sopenharmony_ci */ 1551cb0ef41Sopenharmony_ci virtual ~ThaiBreakEngine(); 1561cb0ef41Sopenharmony_ci 1571cb0ef41Sopenharmony_ci protected: 1581cb0ef41Sopenharmony_ci /** 1591cb0ef41Sopenharmony_ci * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 1601cb0ef41Sopenharmony_ci * 1611cb0ef41Sopenharmony_ci * @param text A UText representing the text 1621cb0ef41Sopenharmony_ci * @param rangeStart The start of the range of dictionary characters 1631cb0ef41Sopenharmony_ci * @param rangeEnd The end of the range of dictionary characters 1641cb0ef41Sopenharmony_ci * @param foundBreaks Output of C array of int32_t break positions, or 0 1651cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 1661cb0ef41Sopenharmony_ci * @return The number of breaks found 1671cb0ef41Sopenharmony_ci */ 1681cb0ef41Sopenharmony_ci virtual int32_t divideUpDictionaryRange( UText *text, 1691cb0ef41Sopenharmony_ci int32_t rangeStart, 1701cb0ef41Sopenharmony_ci int32_t rangeEnd, 1711cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 1721cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 1731cb0ef41Sopenharmony_ci UErrorCode& status) const override; 1741cb0ef41Sopenharmony_ci 1751cb0ef41Sopenharmony_ci}; 1761cb0ef41Sopenharmony_ci 1771cb0ef41Sopenharmony_ci/******************************************************************* 1781cb0ef41Sopenharmony_ci * LaoBreakEngine 1791cb0ef41Sopenharmony_ci */ 1801cb0ef41Sopenharmony_ci 1811cb0ef41Sopenharmony_ci/** 1821cb0ef41Sopenharmony_ci * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 1831cb0ef41Sopenharmony_ci * dictionary and heuristics to determine Lao-specific breaks.</p> 1841cb0ef41Sopenharmony_ci * 1851cb0ef41Sopenharmony_ci * <p>After it is constructed a LaoBreakEngine may be shared between 1861cb0ef41Sopenharmony_ci * threads without synchronization.</p> 1871cb0ef41Sopenharmony_ci */ 1881cb0ef41Sopenharmony_ciclass LaoBreakEngine : public DictionaryBreakEngine { 1891cb0ef41Sopenharmony_ci private: 1901cb0ef41Sopenharmony_ci /** 1911cb0ef41Sopenharmony_ci * The set of characters handled by this engine 1921cb0ef41Sopenharmony_ci * @internal 1931cb0ef41Sopenharmony_ci */ 1941cb0ef41Sopenharmony_ci 1951cb0ef41Sopenharmony_ci UnicodeSet fEndWordSet; 1961cb0ef41Sopenharmony_ci UnicodeSet fBeginWordSet; 1971cb0ef41Sopenharmony_ci UnicodeSet fMarkSet; 1981cb0ef41Sopenharmony_ci DictionaryMatcher *fDictionary; 1991cb0ef41Sopenharmony_ci 2001cb0ef41Sopenharmony_ci public: 2011cb0ef41Sopenharmony_ci 2021cb0ef41Sopenharmony_ci /** 2031cb0ef41Sopenharmony_ci * <p>Default constructor.</p> 2041cb0ef41Sopenharmony_ci * 2051cb0ef41Sopenharmony_ci * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 2061cb0ef41Sopenharmony_ci * engine is deleted. 2071cb0ef41Sopenharmony_ci */ 2081cb0ef41Sopenharmony_ci LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 2091cb0ef41Sopenharmony_ci 2101cb0ef41Sopenharmony_ci /** 2111cb0ef41Sopenharmony_ci * <p>Virtual destructor.</p> 2121cb0ef41Sopenharmony_ci */ 2131cb0ef41Sopenharmony_ci virtual ~LaoBreakEngine(); 2141cb0ef41Sopenharmony_ci 2151cb0ef41Sopenharmony_ci protected: 2161cb0ef41Sopenharmony_ci /** 2171cb0ef41Sopenharmony_ci * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 2181cb0ef41Sopenharmony_ci * 2191cb0ef41Sopenharmony_ci * @param text A UText representing the text 2201cb0ef41Sopenharmony_ci * @param rangeStart The start of the range of dictionary characters 2211cb0ef41Sopenharmony_ci * @param rangeEnd The end of the range of dictionary characters 2221cb0ef41Sopenharmony_ci * @param foundBreaks Output of C array of int32_t break positions, or 0 2231cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 2241cb0ef41Sopenharmony_ci * @return The number of breaks found 2251cb0ef41Sopenharmony_ci */ 2261cb0ef41Sopenharmony_ci virtual int32_t divideUpDictionaryRange( UText *text, 2271cb0ef41Sopenharmony_ci int32_t rangeStart, 2281cb0ef41Sopenharmony_ci int32_t rangeEnd, 2291cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 2301cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 2311cb0ef41Sopenharmony_ci UErrorCode& status) const override; 2321cb0ef41Sopenharmony_ci 2331cb0ef41Sopenharmony_ci}; 2341cb0ef41Sopenharmony_ci 2351cb0ef41Sopenharmony_ci/******************************************************************* 2361cb0ef41Sopenharmony_ci * BurmeseBreakEngine 2371cb0ef41Sopenharmony_ci */ 2381cb0ef41Sopenharmony_ci 2391cb0ef41Sopenharmony_ci/** 2401cb0ef41Sopenharmony_ci * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a 2411cb0ef41Sopenharmony_ci * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> 2421cb0ef41Sopenharmony_ci * 2431cb0ef41Sopenharmony_ci * <p>After it is constructed a BurmeseBreakEngine may be shared between 2441cb0ef41Sopenharmony_ci * threads without synchronization.</p> 2451cb0ef41Sopenharmony_ci */ 2461cb0ef41Sopenharmony_ciclass BurmeseBreakEngine : public DictionaryBreakEngine { 2471cb0ef41Sopenharmony_ci private: 2481cb0ef41Sopenharmony_ci /** 2491cb0ef41Sopenharmony_ci * The set of characters handled by this engine 2501cb0ef41Sopenharmony_ci * @internal 2511cb0ef41Sopenharmony_ci */ 2521cb0ef41Sopenharmony_ci 2531cb0ef41Sopenharmony_ci UnicodeSet fEndWordSet; 2541cb0ef41Sopenharmony_ci UnicodeSet fBeginWordSet; 2551cb0ef41Sopenharmony_ci UnicodeSet fMarkSet; 2561cb0ef41Sopenharmony_ci DictionaryMatcher *fDictionary; 2571cb0ef41Sopenharmony_ci 2581cb0ef41Sopenharmony_ci public: 2591cb0ef41Sopenharmony_ci 2601cb0ef41Sopenharmony_ci /** 2611cb0ef41Sopenharmony_ci * <p>Default constructor.</p> 2621cb0ef41Sopenharmony_ci * 2631cb0ef41Sopenharmony_ci * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 2641cb0ef41Sopenharmony_ci * engine is deleted. 2651cb0ef41Sopenharmony_ci */ 2661cb0ef41Sopenharmony_ci BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 2671cb0ef41Sopenharmony_ci 2681cb0ef41Sopenharmony_ci /** 2691cb0ef41Sopenharmony_ci * <p>Virtual destructor.</p> 2701cb0ef41Sopenharmony_ci */ 2711cb0ef41Sopenharmony_ci virtual ~BurmeseBreakEngine(); 2721cb0ef41Sopenharmony_ci 2731cb0ef41Sopenharmony_ci protected: 2741cb0ef41Sopenharmony_ci /** 2751cb0ef41Sopenharmony_ci * <p>Divide up a range of known dictionary characters.</p> 2761cb0ef41Sopenharmony_ci * 2771cb0ef41Sopenharmony_ci * @param text A UText representing the text 2781cb0ef41Sopenharmony_ci * @param rangeStart The start of the range of dictionary characters 2791cb0ef41Sopenharmony_ci * @param rangeEnd The end of the range of dictionary characters 2801cb0ef41Sopenharmony_ci * @param foundBreaks Output of C array of int32_t break positions, or 0 2811cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 2821cb0ef41Sopenharmony_ci * @return The number of breaks found 2831cb0ef41Sopenharmony_ci */ 2841cb0ef41Sopenharmony_ci virtual int32_t divideUpDictionaryRange( UText *text, 2851cb0ef41Sopenharmony_ci int32_t rangeStart, 2861cb0ef41Sopenharmony_ci int32_t rangeEnd, 2871cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 2881cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 2891cb0ef41Sopenharmony_ci UErrorCode& status) const override; 2901cb0ef41Sopenharmony_ci 2911cb0ef41Sopenharmony_ci}; 2921cb0ef41Sopenharmony_ci 2931cb0ef41Sopenharmony_ci/******************************************************************* 2941cb0ef41Sopenharmony_ci * KhmerBreakEngine 2951cb0ef41Sopenharmony_ci */ 2961cb0ef41Sopenharmony_ci 2971cb0ef41Sopenharmony_ci/** 2981cb0ef41Sopenharmony_ci * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 2991cb0ef41Sopenharmony_ci * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 3001cb0ef41Sopenharmony_ci * 3011cb0ef41Sopenharmony_ci * <p>After it is constructed a KhmerBreakEngine may be shared between 3021cb0ef41Sopenharmony_ci * threads without synchronization.</p> 3031cb0ef41Sopenharmony_ci */ 3041cb0ef41Sopenharmony_ciclass KhmerBreakEngine : public DictionaryBreakEngine { 3051cb0ef41Sopenharmony_ci private: 3061cb0ef41Sopenharmony_ci /** 3071cb0ef41Sopenharmony_ci * The set of characters handled by this engine 3081cb0ef41Sopenharmony_ci * @internal 3091cb0ef41Sopenharmony_ci */ 3101cb0ef41Sopenharmony_ci 3111cb0ef41Sopenharmony_ci UnicodeSet fEndWordSet; 3121cb0ef41Sopenharmony_ci UnicodeSet fBeginWordSet; 3131cb0ef41Sopenharmony_ci UnicodeSet fMarkSet; 3141cb0ef41Sopenharmony_ci DictionaryMatcher *fDictionary; 3151cb0ef41Sopenharmony_ci 3161cb0ef41Sopenharmony_ci public: 3171cb0ef41Sopenharmony_ci 3181cb0ef41Sopenharmony_ci /** 3191cb0ef41Sopenharmony_ci * <p>Default constructor.</p> 3201cb0ef41Sopenharmony_ci * 3211cb0ef41Sopenharmony_ci * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 3221cb0ef41Sopenharmony_ci * engine is deleted. 3231cb0ef41Sopenharmony_ci */ 3241cb0ef41Sopenharmony_ci KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 3251cb0ef41Sopenharmony_ci 3261cb0ef41Sopenharmony_ci /** 3271cb0ef41Sopenharmony_ci * <p>Virtual destructor.</p> 3281cb0ef41Sopenharmony_ci */ 3291cb0ef41Sopenharmony_ci virtual ~KhmerBreakEngine(); 3301cb0ef41Sopenharmony_ci 3311cb0ef41Sopenharmony_ci protected: 3321cb0ef41Sopenharmony_ci /** 3331cb0ef41Sopenharmony_ci * <p>Divide up a range of known dictionary characters.</p> 3341cb0ef41Sopenharmony_ci * 3351cb0ef41Sopenharmony_ci * @param text A UText representing the text 3361cb0ef41Sopenharmony_ci * @param rangeStart The start of the range of dictionary characters 3371cb0ef41Sopenharmony_ci * @param rangeEnd The end of the range of dictionary characters 3381cb0ef41Sopenharmony_ci * @param foundBreaks Output of C array of int32_t break positions, or 0 3391cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 3401cb0ef41Sopenharmony_ci * @return The number of breaks found 3411cb0ef41Sopenharmony_ci */ 3421cb0ef41Sopenharmony_ci virtual int32_t divideUpDictionaryRange( UText *text, 3431cb0ef41Sopenharmony_ci int32_t rangeStart, 3441cb0ef41Sopenharmony_ci int32_t rangeEnd, 3451cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 3461cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 3471cb0ef41Sopenharmony_ci UErrorCode& status) const override; 3481cb0ef41Sopenharmony_ci 3491cb0ef41Sopenharmony_ci}; 3501cb0ef41Sopenharmony_ci 3511cb0ef41Sopenharmony_ci#if !UCONFIG_NO_NORMALIZATION 3521cb0ef41Sopenharmony_ci 3531cb0ef41Sopenharmony_ci/******************************************************************* 3541cb0ef41Sopenharmony_ci * CjkBreakEngine 3551cb0ef41Sopenharmony_ci */ 3561cb0ef41Sopenharmony_ci 3571cb0ef41Sopenharmony_ci//indicates language/script that the CjkBreakEngine will handle 3581cb0ef41Sopenharmony_cienum LanguageType { 3591cb0ef41Sopenharmony_ci kKorean, 3601cb0ef41Sopenharmony_ci kChineseJapanese 3611cb0ef41Sopenharmony_ci}; 3621cb0ef41Sopenharmony_ci 3631cb0ef41Sopenharmony_ci/** 3641cb0ef41Sopenharmony_ci * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 3651cb0ef41Sopenharmony_ci * dictionary with costs associated with each word and 3661cb0ef41Sopenharmony_ci * Viterbi decoding to determine CJK-specific breaks.</p> 3671cb0ef41Sopenharmony_ci */ 3681cb0ef41Sopenharmony_ciclass CjkBreakEngine : public DictionaryBreakEngine { 3691cb0ef41Sopenharmony_ci protected: 3701cb0ef41Sopenharmony_ci /** 3711cb0ef41Sopenharmony_ci * The set of characters handled by this engine 3721cb0ef41Sopenharmony_ci * @internal 3731cb0ef41Sopenharmony_ci */ 3741cb0ef41Sopenharmony_ci UnicodeSet fHangulWordSet; 3751cb0ef41Sopenharmony_ci UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; 3761cb0ef41Sopenharmony_ci UnicodeSet fClosePunctuationSet; 3771cb0ef41Sopenharmony_ci 3781cb0ef41Sopenharmony_ci DictionaryMatcher *fDictionary; 3791cb0ef41Sopenharmony_ci const Normalizer2 *nfkcNorm2; 3801cb0ef41Sopenharmony_ci MlBreakEngine *fMlBreakEngine; 3811cb0ef41Sopenharmony_ci bool isCj; 3821cb0ef41Sopenharmony_ci 3831cb0ef41Sopenharmony_ci private: 3841cb0ef41Sopenharmony_ci // Load Japanese extensions. 3851cb0ef41Sopenharmony_ci void loadJapaneseExtensions(UErrorCode& error); 3861cb0ef41Sopenharmony_ci // Load Japanese Hiragana. 3871cb0ef41Sopenharmony_ci void loadHiragana(UErrorCode& error); 3881cb0ef41Sopenharmony_ci // Initialize fSkipSet by loading Japanese Hiragana and extensions. 3891cb0ef41Sopenharmony_ci void initJapanesePhraseParameter(UErrorCode& error); 3901cb0ef41Sopenharmony_ci 3911cb0ef41Sopenharmony_ci Hashtable fSkipSet; 3921cb0ef41Sopenharmony_ci 3931cb0ef41Sopenharmony_ci public: 3941cb0ef41Sopenharmony_ci 3951cb0ef41Sopenharmony_ci /** 3961cb0ef41Sopenharmony_ci * <p>Default constructor.</p> 3971cb0ef41Sopenharmony_ci * 3981cb0ef41Sopenharmony_ci * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 3991cb0ef41Sopenharmony_ci * engine is deleted. The DictionaryMatcher must contain costs for each word 4001cb0ef41Sopenharmony_ci * in order for the dictionary to work properly. 4011cb0ef41Sopenharmony_ci */ 4021cb0ef41Sopenharmony_ci CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 4031cb0ef41Sopenharmony_ci 4041cb0ef41Sopenharmony_ci /** 4051cb0ef41Sopenharmony_ci * <p>Virtual destructor.</p> 4061cb0ef41Sopenharmony_ci */ 4071cb0ef41Sopenharmony_ci virtual ~CjkBreakEngine(); 4081cb0ef41Sopenharmony_ci 4091cb0ef41Sopenharmony_ci protected: 4101cb0ef41Sopenharmony_ci /** 4111cb0ef41Sopenharmony_ci * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 4121cb0ef41Sopenharmony_ci * 4131cb0ef41Sopenharmony_ci * @param text A UText representing the text 4141cb0ef41Sopenharmony_ci * @param rangeStart The start of the range of dictionary characters 4151cb0ef41Sopenharmony_ci * @param rangeEnd The end of the range of dictionary characters 4161cb0ef41Sopenharmony_ci * @param foundBreaks Output of C array of int32_t break positions, or 0 4171cb0ef41Sopenharmony_ci * @param status Information on any errors encountered. 4181cb0ef41Sopenharmony_ci * @return The number of breaks found 4191cb0ef41Sopenharmony_ci */ 4201cb0ef41Sopenharmony_ci virtual int32_t divideUpDictionaryRange( UText *text, 4211cb0ef41Sopenharmony_ci int32_t rangeStart, 4221cb0ef41Sopenharmony_ci int32_t rangeEnd, 4231cb0ef41Sopenharmony_ci UVector32 &foundBreaks, 4241cb0ef41Sopenharmony_ci UBool isPhraseBreaking, 4251cb0ef41Sopenharmony_ci UErrorCode& status) const override; 4261cb0ef41Sopenharmony_ci 4271cb0ef41Sopenharmony_ci}; 4281cb0ef41Sopenharmony_ci 4291cb0ef41Sopenharmony_ci#endif 4301cb0ef41Sopenharmony_ci 4311cb0ef41Sopenharmony_ciU_NAMESPACE_END 4321cb0ef41Sopenharmony_ci 4331cb0ef41Sopenharmony_ci /* DICTBE_H */ 4341cb0ef41Sopenharmony_ci#endif 435