17777dab0Sopenharmony_ci// © 2016 and later: Unicode, Inc. and others. 27777dab0Sopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html 37777dab0Sopenharmony_ci/* 47777dab0Sopenharmony_ci****************************************************************************** 57777dab0Sopenharmony_ci* Copyright (C) 1996-2015, International Business Machines Corporation and others. 67777dab0Sopenharmony_ci* All Rights Reserved. 77777dab0Sopenharmony_ci****************************************************************************** 87777dab0Sopenharmony_ci*/ 97777dab0Sopenharmony_ci 107777dab0Sopenharmony_ci#ifndef UBRK_H 117777dab0Sopenharmony_ci#define UBRK_H 127777dab0Sopenharmony_ci 137777dab0Sopenharmony_ci#include "unicode/utypes.h" 147777dab0Sopenharmony_ci#include "unicode/uloc.h" 157777dab0Sopenharmony_ci 167777dab0Sopenharmony_ci#if U_SHOW_CPLUSPLUS_API 177777dab0Sopenharmony_ci#include "unicode/localpointer.h" 187777dab0Sopenharmony_ci#endif // U_SHOW_CPLUSPLUS_API 197777dab0Sopenharmony_ci 207777dab0Sopenharmony_ci/** 217777dab0Sopenharmony_ci * A text-break iterator. 227777dab0Sopenharmony_ci * For usage in C programs. 237777dab0Sopenharmony_ci */ 247777dab0Sopenharmony_ci#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR 257777dab0Sopenharmony_ci# define UBRK_TYPEDEF_UBREAK_ITERATOR 267777dab0Sopenharmony_ci /** 277777dab0Sopenharmony_ci * Opaque type representing an ICU Break iterator object. 287777dab0Sopenharmony_ci * @stable ICU 2.0 297777dab0Sopenharmony_ci */ 307777dab0Sopenharmony_ci typedef struct UBreakIterator UBreakIterator; 317777dab0Sopenharmony_ci#endif 327777dab0Sopenharmony_ci 337777dab0Sopenharmony_ci#include "unicode/parseerr.h" 347777dab0Sopenharmony_ci 357777dab0Sopenharmony_ci#if !UCONFIG_NO_BREAK_ITERATION 367777dab0Sopenharmony_ci/** 377777dab0Sopenharmony_ci * \file 387777dab0Sopenharmony_ci * \brief C API: BreakIterator 397777dab0Sopenharmony_ci * 407777dab0Sopenharmony_ci * <h2> BreakIterator C API </h2> 417777dab0Sopenharmony_ci * 427777dab0Sopenharmony_ci * The BreakIterator C API defines methods for finding the location 437777dab0Sopenharmony_ci * of boundaries in text. Pointer to a UBreakIterator maintain a 447777dab0Sopenharmony_ci * current position and scan over text returning the index of characters 457777dab0Sopenharmony_ci * where boundaries occur. 467777dab0Sopenharmony_ci * <p> 477777dab0Sopenharmony_ci * Line boundary analysis determines where a text string can be broken 487777dab0Sopenharmony_ci * when line-wrapping. The mechanism correctly handles punctuation and 497777dab0Sopenharmony_ci * hyphenated words. 507777dab0Sopenharmony_ci * <p> 517777dab0Sopenharmony_ci * Note: The locale keyword "lb" can be used to modify line break 527777dab0Sopenharmony_ci * behavior according to the CSS level 3 line-break options, see 537777dab0Sopenharmony_ci * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example: 547777dab0Sopenharmony_ci * "ja@lb=strict", "zh@lb=loose". 557777dab0Sopenharmony_ci * <p> 567777dab0Sopenharmony_ci * Sentence boundary analysis allows selection with correct 577777dab0Sopenharmony_ci * interpretation of periods within numbers and abbreviations, and 587777dab0Sopenharmony_ci * trailing punctuation marks such as quotation marks and parentheses. 597777dab0Sopenharmony_ci * <p> 607777dab0Sopenharmony_ci * Note: The locale keyword "ss" can be used to enable use of 617777dab0Sopenharmony_ci * segmentation suppression data (preventing breaks in English after 627777dab0Sopenharmony_ci * abbreviations such as "Mr." or "Est.", for example), as follows: 637777dab0Sopenharmony_ci * "en@ss=standard". 647777dab0Sopenharmony_ci * <p> 657777dab0Sopenharmony_ci * Word boundary analysis is used by search and replace functions, as 667777dab0Sopenharmony_ci * well as within text editing applications that allow the user to 677777dab0Sopenharmony_ci * select words with a double click. Word selection provides correct 687777dab0Sopenharmony_ci * interpretation of punctuation marks within and following 697777dab0Sopenharmony_ci * words. Characters that are not part of a word, such as symbols or 707777dab0Sopenharmony_ci * punctuation marks, have word-breaks on both sides. 717777dab0Sopenharmony_ci * <p> 727777dab0Sopenharmony_ci * Character boundary analysis identifies the boundaries of 737777dab0Sopenharmony_ci * "Extended Grapheme Clusters", which are groupings of codepoints 747777dab0Sopenharmony_ci * that should be treated as character-like units for many text operations. 757777dab0Sopenharmony_ci * Please see Unicode Standard Annex #29, Unicode Text Segmentation, 767777dab0Sopenharmony_ci * http://www.unicode.org/reports/tr29/ for additional information 777777dab0Sopenharmony_ci * on grapheme clusters and guidelines on their use. 787777dab0Sopenharmony_ci * <p> 797777dab0Sopenharmony_ci * Title boundary analysis locates all positions, 807777dab0Sopenharmony_ci * typically starts of words, that should be set to Title Case 817777dab0Sopenharmony_ci * when title casing the text. 827777dab0Sopenharmony_ci * <p> 837777dab0Sopenharmony_ci * The text boundary positions are found according to the rules 847777dab0Sopenharmony_ci * described in Unicode Standard Annex #29, Text Boundaries, and 857777dab0Sopenharmony_ci * Unicode Standard Annex #14, Line Breaking Properties. These 867777dab0Sopenharmony_ci * are available at http://www.unicode.org/reports/tr14/ and 877777dab0Sopenharmony_ci * http://www.unicode.org/reports/tr29/. 887777dab0Sopenharmony_ci * <p> 897777dab0Sopenharmony_ci * In addition to the plain C API defined in this header file, an 907777dab0Sopenharmony_ci * object oriented C++ API with equivalent functionality is defined in the 917777dab0Sopenharmony_ci * file brkiter.h. 927777dab0Sopenharmony_ci * <p> 937777dab0Sopenharmony_ci * Code snippets illustrating the use of the Break Iterator APIs 947777dab0Sopenharmony_ci * are available in the ICU User Guide, 957777dab0Sopenharmony_ci * https://unicode-org.github.io/icu/userguide/boundaryanalysis/ 967777dab0Sopenharmony_ci * and in the sample program icu/source/samples/break/break.cpp 977777dab0Sopenharmony_ci */ 987777dab0Sopenharmony_ci 997777dab0Sopenharmony_ci/** The possible types of text boundaries. @stable ICU 2.0 */ 1007777dab0Sopenharmony_citypedef enum UBreakIteratorType { 1017777dab0Sopenharmony_ci /** Character breaks @stable ICU 2.0 */ 1027777dab0Sopenharmony_ci UBRK_CHARACTER = 0, 1037777dab0Sopenharmony_ci /** Word breaks @stable ICU 2.0 */ 1047777dab0Sopenharmony_ci UBRK_WORD = 1, 1057777dab0Sopenharmony_ci /** Line breaks @stable ICU 2.0 */ 1067777dab0Sopenharmony_ci UBRK_LINE = 2, 1077777dab0Sopenharmony_ci /** Sentence breaks @stable ICU 2.0 */ 1087777dab0Sopenharmony_ci UBRK_SENTENCE = 3, 1097777dab0Sopenharmony_ci} UBreakIteratorType; 1107777dab0Sopenharmony_ci 1117777dab0Sopenharmony_ci/** Value indicating all text boundaries have been returned. 1127777dab0Sopenharmony_ci * @stable ICU 2.0 1137777dab0Sopenharmony_ci */ 1147777dab0Sopenharmony_ci#define UBRK_DONE ((int32_t) -1) 1157777dab0Sopenharmony_ci 1167777dab0Sopenharmony_ci 1177777dab0Sopenharmony_ci/** 1187777dab0Sopenharmony_ci * Enum constants for the word break tags returned by 1197777dab0Sopenharmony_ci * getRuleStatus(). A range of values is defined for each category of 1207777dab0Sopenharmony_ci * word, to allow for further subdivisions of a category in future releases. 1217777dab0Sopenharmony_ci * Applications should check for tag values falling within the range, rather 1227777dab0Sopenharmony_ci * than for single individual values. 1237777dab0Sopenharmony_ci * 1247777dab0Sopenharmony_ci * The numeric values of all of these constants are stable (will not change). 1257777dab0Sopenharmony_ci * 1267777dab0Sopenharmony_ci * @stable ICU 2.2 1277777dab0Sopenharmony_ci*/ 1287777dab0Sopenharmony_citypedef enum UWordBreak { 1297777dab0Sopenharmony_ci /** Tag value for "words" that do not fit into any of other categories. 1307777dab0Sopenharmony_ci * Includes spaces and most punctuation. */ 1317777dab0Sopenharmony_ci UBRK_WORD_NONE = 0, 1327777dab0Sopenharmony_ci /** Upper bound for tags for uncategorized words. */ 1337777dab0Sopenharmony_ci UBRK_WORD_NONE_LIMIT = 100, 1347777dab0Sopenharmony_ci /** Tag value for words that appear to be numbers, lower limit. */ 1357777dab0Sopenharmony_ci UBRK_WORD_NUMBER = 100, 1367777dab0Sopenharmony_ci /** Tag value for words that appear to be numbers, upper limit. */ 1377777dab0Sopenharmony_ci UBRK_WORD_NUMBER_LIMIT = 200, 1387777dab0Sopenharmony_ci /** Tag value for words that contain letters, excluding 1397777dab0Sopenharmony_ci * hiragana, katakana or ideographic characters, lower limit. */ 1407777dab0Sopenharmony_ci UBRK_WORD_LETTER = 200, 1417777dab0Sopenharmony_ci /** Tag value for words containing letters, upper limit */ 1427777dab0Sopenharmony_ci UBRK_WORD_LETTER_LIMIT = 300, 1437777dab0Sopenharmony_ci /** Tag value for words containing kana characters, lower limit */ 1447777dab0Sopenharmony_ci UBRK_WORD_KANA = 300, 1457777dab0Sopenharmony_ci /** Tag value for words containing kana characters, upper limit */ 1467777dab0Sopenharmony_ci UBRK_WORD_KANA_LIMIT = 400, 1477777dab0Sopenharmony_ci /** Tag value for words containing ideographic characters, lower limit */ 1487777dab0Sopenharmony_ci UBRK_WORD_IDEO = 400, 1497777dab0Sopenharmony_ci /** Tag value for words containing ideographic characters, upper limit */ 1507777dab0Sopenharmony_ci UBRK_WORD_IDEO_LIMIT = 500 1517777dab0Sopenharmony_ci} UWordBreak; 1527777dab0Sopenharmony_ci 1537777dab0Sopenharmony_ci/** 1547777dab0Sopenharmony_ci * Enum constants for the line break tags returned by getRuleStatus(). 1557777dab0Sopenharmony_ci * A range of values is defined for each category of 1567777dab0Sopenharmony_ci * word, to allow for further subdivisions of a category in future releases. 1577777dab0Sopenharmony_ci * Applications should check for tag values falling within the range, rather 1587777dab0Sopenharmony_ci * than for single individual values. 1597777dab0Sopenharmony_ci * 1607777dab0Sopenharmony_ci * The numeric values of all of these constants are stable (will not change). 1617777dab0Sopenharmony_ci * 1627777dab0Sopenharmony_ci * @stable ICU 2.8 1637777dab0Sopenharmony_ci*/ 1647777dab0Sopenharmony_citypedef enum ULineBreakTag { 1657777dab0Sopenharmony_ci /** Tag value for soft line breaks, positions at which a line break 1667777dab0Sopenharmony_ci * is acceptable but not required */ 1677777dab0Sopenharmony_ci UBRK_LINE_SOFT = 0, 1687777dab0Sopenharmony_ci /** Upper bound for soft line breaks. */ 1697777dab0Sopenharmony_ci UBRK_LINE_SOFT_LIMIT = 100, 1707777dab0Sopenharmony_ci /** Tag value for a hard, or mandatory line break */ 1717777dab0Sopenharmony_ci UBRK_LINE_HARD = 100, 1727777dab0Sopenharmony_ci /** Upper bound for hard line breaks. */ 1737777dab0Sopenharmony_ci UBRK_LINE_HARD_LIMIT = 200 1747777dab0Sopenharmony_ci} ULineBreakTag; 1757777dab0Sopenharmony_ci 1767777dab0Sopenharmony_ci 1777777dab0Sopenharmony_ci 1787777dab0Sopenharmony_ci/** 1797777dab0Sopenharmony_ci * Enum constants for the sentence break tags returned by getRuleStatus(). 1807777dab0Sopenharmony_ci * A range of values is defined for each category of 1817777dab0Sopenharmony_ci * sentence, to allow for further subdivisions of a category in future releases. 1827777dab0Sopenharmony_ci * Applications should check for tag values falling within the range, rather 1837777dab0Sopenharmony_ci * than for single individual values. 1847777dab0Sopenharmony_ci * 1857777dab0Sopenharmony_ci * The numeric values of all of these constants are stable (will not change). 1867777dab0Sopenharmony_ci * 1877777dab0Sopenharmony_ci * @stable ICU 2.8 1887777dab0Sopenharmony_ci*/ 1897777dab0Sopenharmony_citypedef enum USentenceBreakTag { 1907777dab0Sopenharmony_ci /** Tag value for for sentences ending with a sentence terminator 1917777dab0Sopenharmony_ci * ('.', '?', '!', etc.) character, possibly followed by a 1927777dab0Sopenharmony_ci * hard separator (CR, LF, PS, etc.) 1937777dab0Sopenharmony_ci */ 1947777dab0Sopenharmony_ci UBRK_SENTENCE_TERM = 0, 1957777dab0Sopenharmony_ci /** Upper bound for tags for sentences ended by sentence terminators. */ 1967777dab0Sopenharmony_ci UBRK_SENTENCE_TERM_LIMIT = 100, 1977777dab0Sopenharmony_ci /** Tag value for for sentences that do not contain an ending 1987777dab0Sopenharmony_ci * sentence terminator ('.', '?', '!', etc.) character, but 1997777dab0Sopenharmony_ci * are ended only by a hard separator (CR, LF, PS, etc.) or end of input. 2007777dab0Sopenharmony_ci */ 2017777dab0Sopenharmony_ci UBRK_SENTENCE_SEP = 100, 2027777dab0Sopenharmony_ci /** Upper bound for tags for sentences ended by a separator. */ 2037777dab0Sopenharmony_ci UBRK_SENTENCE_SEP_LIMIT = 200 2047777dab0Sopenharmony_ci /** Tag value for a hard, or mandatory line break */ 2057777dab0Sopenharmony_ci} USentenceBreakTag; 2067777dab0Sopenharmony_ci 2077777dab0Sopenharmony_ci 2087777dab0Sopenharmony_ci/** 2097777dab0Sopenharmony_ci * Open a new UBreakIterator for locating text boundaries for a specified locale. 2107777dab0Sopenharmony_ci * A UBreakIterator may be used for detecting character, line, word, 2117777dab0Sopenharmony_ci * and sentence breaks in text. 2127777dab0Sopenharmony_ci * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, 2137777dab0Sopenharmony_ci * UBRK_LINE, UBRK_SENTENCE 2147777dab0Sopenharmony_ci * @param locale The locale specifying the text-breaking conventions. Note that 2157777dab0Sopenharmony_ci * locale keys such as "lb" and "ss" may be used to modify text break behavior, 2167777dab0Sopenharmony_ci * see general discussion of BreakIterator C API. 2177777dab0Sopenharmony_ci * @param text The text to be iterated over. May be null, in which case ubrk_setText() is 2187777dab0Sopenharmony_ci * used to specify the text to be iterated. 2197777dab0Sopenharmony_ci * @param textLength The number of characters in text, or -1 if null-terminated. 2207777dab0Sopenharmony_ci * @param status A UErrorCode to receive any errors. 2217777dab0Sopenharmony_ci * @return A UBreakIterator for the specified locale. 2227777dab0Sopenharmony_ci * @see ubrk_openRules 2237777dab0Sopenharmony_ci * @stable ICU 2.0 2247777dab0Sopenharmony_ci */ 2257777dab0Sopenharmony_ciU_CAPI UBreakIterator* U_EXPORT2 2267777dab0Sopenharmony_ciubrk_open(UBreakIteratorType type, 2277777dab0Sopenharmony_ci const char *locale, 2287777dab0Sopenharmony_ci const UChar *text, 2297777dab0Sopenharmony_ci int32_t textLength, 2307777dab0Sopenharmony_ci UErrorCode *status); 2317777dab0Sopenharmony_ci 2327777dab0Sopenharmony_ci/** 2337777dab0Sopenharmony_ci * Open a new UBreakIterator for locating text boundaries using specified breaking rules. 2347777dab0Sopenharmony_ci * The rule syntax is ... (TBD) 2357777dab0Sopenharmony_ci * @param rules A set of rules specifying the text breaking conventions. 2367777dab0Sopenharmony_ci * @param rulesLength The number of characters in rules, or -1 if null-terminated. 2377777dab0Sopenharmony_ci * @param text The text to be iterated over. May be null, in which case ubrk_setText() is 2387777dab0Sopenharmony_ci * used to specify the text to be iterated. 2397777dab0Sopenharmony_ci * @param textLength The number of characters in text, or -1 if null-terminated. 2407777dab0Sopenharmony_ci * @param parseErr Receives position and context information for any syntax errors 2417777dab0Sopenharmony_ci * detected while parsing the rules. 2427777dab0Sopenharmony_ci * @param status A UErrorCode to receive any errors. 2437777dab0Sopenharmony_ci * @return A UBreakIterator for the specified rules. 2447777dab0Sopenharmony_ci * @see ubrk_open 2457777dab0Sopenharmony_ci * @stable ICU 2.2 2467777dab0Sopenharmony_ci */ 2477777dab0Sopenharmony_ciU_CAPI UBreakIterator* U_EXPORT2 2487777dab0Sopenharmony_ciubrk_openRules(const UChar *rules, 2497777dab0Sopenharmony_ci int32_t rulesLength, 2507777dab0Sopenharmony_ci const UChar *text, 2517777dab0Sopenharmony_ci int32_t textLength, 2527777dab0Sopenharmony_ci UParseError *parseErr, 2537777dab0Sopenharmony_ci UErrorCode *status); 2547777dab0Sopenharmony_ci 2557777dab0Sopenharmony_ci/** 2567777dab0Sopenharmony_ci * Open a new UBreakIterator for locating text boundaries using precompiled binary rules. 2577777dab0Sopenharmony_ci * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules. 2587777dab0Sopenharmony_ci * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not 2597777dab0Sopenharmony_ci * compatible across different major versions of ICU, nor across platforms of different 2607777dab0Sopenharmony_ci * endianness or different base character set family (ASCII vs EBCDIC). 2617777dab0Sopenharmony_ci * @param binaryRules A set of compiled binary rules specifying the text breaking 2627777dab0Sopenharmony_ci * conventions. Ownership of the storage containing the compiled 2637777dab0Sopenharmony_ci * rules remains with the caller of this function. The compiled 2647777dab0Sopenharmony_ci * rules must not be modified or deleted during the life of the 2657777dab0Sopenharmony_ci * break iterator. 2667777dab0Sopenharmony_ci * @param rulesLength The length of binaryRules in bytes; must be >= 0. 2677777dab0Sopenharmony_ci * @param text The text to be iterated over. May be null, in which case 2687777dab0Sopenharmony_ci * ubrk_setText() is used to specify the text to be iterated. 2697777dab0Sopenharmony_ci * @param textLength The number of characters in text, or -1 if null-terminated. 2707777dab0Sopenharmony_ci * @param status Pointer to UErrorCode to receive any errors. 2717777dab0Sopenharmony_ci * @return UBreakIterator for the specified rules. 2727777dab0Sopenharmony_ci * @see ubrk_getBinaryRules 2737777dab0Sopenharmony_ci * @stable ICU 59 2747777dab0Sopenharmony_ci */ 2757777dab0Sopenharmony_ciU_CAPI UBreakIterator* U_EXPORT2 2767777dab0Sopenharmony_ciubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, 2777777dab0Sopenharmony_ci const UChar * text, int32_t textLength, 2787777dab0Sopenharmony_ci UErrorCode * status); 2797777dab0Sopenharmony_ci 2807777dab0Sopenharmony_ci#ifndef U_HIDE_DEPRECATED_API 2817777dab0Sopenharmony_ci 2827777dab0Sopenharmony_ci#endif /* U_HIDE_DEPRECATED_API */ 2837777dab0Sopenharmony_ci 2847777dab0Sopenharmony_ci/** 2857777dab0Sopenharmony_ci * Thread safe cloning operation. 2867777dab0Sopenharmony_ci * @param bi iterator to be cloned 2877777dab0Sopenharmony_ci * @param status to indicate whether the operation went on smoothly or there were errors 2887777dab0Sopenharmony_ci * @return pointer to the new clone 2897777dab0Sopenharmony_ci * @stable ICU 69 2907777dab0Sopenharmony_ci */ 2917777dab0Sopenharmony_ciU_CAPI UBreakIterator * U_EXPORT2 2927777dab0Sopenharmony_ciubrk_clone(const UBreakIterator *bi, 2937777dab0Sopenharmony_ci UErrorCode *status); 2947777dab0Sopenharmony_ci 2957777dab0Sopenharmony_ci/** 2967777dab0Sopenharmony_ci* Close a UBreakIterator. 2977777dab0Sopenharmony_ci* Once closed, a UBreakIterator may no longer be used. 2987777dab0Sopenharmony_ci* @param bi The break iterator to close. 2997777dab0Sopenharmony_ci * @stable ICU 2.0 3007777dab0Sopenharmony_ci*/ 3017777dab0Sopenharmony_ciU_CAPI void U_EXPORT2 3027777dab0Sopenharmony_ciubrk_close(UBreakIterator *bi); 3037777dab0Sopenharmony_ci 3047777dab0Sopenharmony_ci#if U_SHOW_CPLUSPLUS_API 3057777dab0Sopenharmony_ci 3067777dab0Sopenharmony_ciU_NAMESPACE_BEGIN 3077777dab0Sopenharmony_ci 3087777dab0Sopenharmony_ci/** 3097777dab0Sopenharmony_ci * \class LocalUBreakIteratorPointer 3107777dab0Sopenharmony_ci * "Smart pointer" class, closes a UBreakIterator via ubrk_close(). 3117777dab0Sopenharmony_ci * For most methods see the LocalPointerBase base class. 3127777dab0Sopenharmony_ci * 3137777dab0Sopenharmony_ci * @see LocalPointerBase 3147777dab0Sopenharmony_ci * @see LocalPointer 3157777dab0Sopenharmony_ci * @stable ICU 4.4 3167777dab0Sopenharmony_ci */ 3177777dab0Sopenharmony_ciU_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close); 3187777dab0Sopenharmony_ci 3197777dab0Sopenharmony_ciU_NAMESPACE_END 3207777dab0Sopenharmony_ci 3217777dab0Sopenharmony_ci#endif 3227777dab0Sopenharmony_ci 3237777dab0Sopenharmony_ci/** 3247777dab0Sopenharmony_ci * Sets an existing iterator to point to a new piece of text. 3257777dab0Sopenharmony_ci * The break iterator retains a pointer to the supplied text. 3267777dab0Sopenharmony_ci * The caller must not modify or delete the text while the BreakIterator 3277777dab0Sopenharmony_ci * retains the reference. 3287777dab0Sopenharmony_ci * 3297777dab0Sopenharmony_ci * @param bi The iterator to use 3307777dab0Sopenharmony_ci * @param text The text to be set 3317777dab0Sopenharmony_ci * @param textLength The length of the text 3327777dab0Sopenharmony_ci * @param status The error code 3337777dab0Sopenharmony_ci * @stable ICU 2.0 3347777dab0Sopenharmony_ci */ 3357777dab0Sopenharmony_ciU_CAPI void U_EXPORT2 3367777dab0Sopenharmony_ciubrk_setText(UBreakIterator* bi, 3377777dab0Sopenharmony_ci const UChar* text, 3387777dab0Sopenharmony_ci int32_t textLength, 3397777dab0Sopenharmony_ci UErrorCode* status); 3407777dab0Sopenharmony_ci 3417777dab0Sopenharmony_ci/** 3427777dab0Sopenharmony_ci * Determine the most recently-returned text boundary. 3437777dab0Sopenharmony_ci * 3447777dab0Sopenharmony_ci * @param bi The break iterator to use. 3457777dab0Sopenharmony_ci * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous, 3467777dab0Sopenharmony_ci * \ref ubrk_first, or \ref ubrk_last. 3477777dab0Sopenharmony_ci * @stable ICU 2.0 3487777dab0Sopenharmony_ci */ 3497777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 3507777dab0Sopenharmony_ciubrk_current(const UBreakIterator *bi); 3517777dab0Sopenharmony_ci 3527777dab0Sopenharmony_ci/** 3537777dab0Sopenharmony_ci * Advance the iterator to the boundary following the current boundary. 3547777dab0Sopenharmony_ci * 3557777dab0Sopenharmony_ci * @param bi The break iterator to use. 3567777dab0Sopenharmony_ci * @return The character index of the next text boundary, or UBRK_DONE 3577777dab0Sopenharmony_ci * if all text boundaries have been returned. 3587777dab0Sopenharmony_ci * @see ubrk_previous 3597777dab0Sopenharmony_ci * @stable ICU 2.0 3607777dab0Sopenharmony_ci */ 3617777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 3627777dab0Sopenharmony_ciubrk_next(UBreakIterator *bi); 3637777dab0Sopenharmony_ci 3647777dab0Sopenharmony_ci/** 3657777dab0Sopenharmony_ci * Set the iterator position to the boundary preceding the current boundary. 3667777dab0Sopenharmony_ci * 3677777dab0Sopenharmony_ci * @param bi The break iterator to use. 3687777dab0Sopenharmony_ci * @return The character index of the preceding text boundary, or UBRK_DONE 3697777dab0Sopenharmony_ci * if all text boundaries have been returned. 3707777dab0Sopenharmony_ci * @see ubrk_next 3717777dab0Sopenharmony_ci * @stable ICU 2.0 3727777dab0Sopenharmony_ci */ 3737777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 3747777dab0Sopenharmony_ciubrk_previous(UBreakIterator *bi); 3757777dab0Sopenharmony_ci 3767777dab0Sopenharmony_ci/** 3777777dab0Sopenharmony_ci * Set the iterator position to zero, the start of the text being scanned. 3787777dab0Sopenharmony_ci * @param bi The break iterator to use. 3797777dab0Sopenharmony_ci * @return The new iterator position (zero). 3807777dab0Sopenharmony_ci * @see ubrk_last 3817777dab0Sopenharmony_ci * @stable ICU 2.0 3827777dab0Sopenharmony_ci */ 3837777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 3847777dab0Sopenharmony_ciubrk_first(UBreakIterator *bi); 3857777dab0Sopenharmony_ci 3867777dab0Sopenharmony_ci/** 3877777dab0Sopenharmony_ci * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned. 3887777dab0Sopenharmony_ci * This is not the same as the last character. 3897777dab0Sopenharmony_ci * @param bi The break iterator to use. 3907777dab0Sopenharmony_ci * @return The character offset immediately <EM>beyond</EM> the last character in the 3917777dab0Sopenharmony_ci * text being scanned. 3927777dab0Sopenharmony_ci * @see ubrk_first 3937777dab0Sopenharmony_ci * @stable ICU 2.0 3947777dab0Sopenharmony_ci */ 3957777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 3967777dab0Sopenharmony_ciubrk_last(UBreakIterator *bi); 3977777dab0Sopenharmony_ci 3987777dab0Sopenharmony_ci/** 3997777dab0Sopenharmony_ci * Set the iterator position to the first boundary preceding the specified offset. 4007777dab0Sopenharmony_ci * The new position is always smaller than offset, or UBRK_DONE. 4017777dab0Sopenharmony_ci * @param bi The break iterator to use. 4027777dab0Sopenharmony_ci * @param offset The offset to begin scanning. 4037777dab0Sopenharmony_ci * @return The text boundary preceding offset, or UBRK_DONE. 4047777dab0Sopenharmony_ci * @see ubrk_following 4057777dab0Sopenharmony_ci * @stable ICU 2.0 4067777dab0Sopenharmony_ci */ 4077777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 4087777dab0Sopenharmony_ciubrk_preceding(UBreakIterator *bi, 4097777dab0Sopenharmony_ci int32_t offset); 4107777dab0Sopenharmony_ci 4117777dab0Sopenharmony_ci/** 4127777dab0Sopenharmony_ci * Advance the iterator to the first boundary following the specified offset. 4137777dab0Sopenharmony_ci * The value returned is always greater than offset, or UBRK_DONE. 4147777dab0Sopenharmony_ci * @param bi The break iterator to use. 4157777dab0Sopenharmony_ci * @param offset The offset to begin scanning. 4167777dab0Sopenharmony_ci * @return The text boundary following offset, or UBRK_DONE. 4177777dab0Sopenharmony_ci * @see ubrk_preceding 4187777dab0Sopenharmony_ci * @stable ICU 2.0 4197777dab0Sopenharmony_ci */ 4207777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 4217777dab0Sopenharmony_ciubrk_following(UBreakIterator *bi, 4227777dab0Sopenharmony_ci int32_t offset); 4237777dab0Sopenharmony_ci 4247777dab0Sopenharmony_ci/** 4257777dab0Sopenharmony_ci* Get a locale for which text breaking information is available. 4267777dab0Sopenharmony_ci* A UBreakIterator in a locale returned by this function will perform the correct 4277777dab0Sopenharmony_ci* text breaking for the locale. 4287777dab0Sopenharmony_ci* @param index The index of the desired locale. 4297777dab0Sopenharmony_ci* @return A locale for which number text breaking information is available, or 0 if none. 4307777dab0Sopenharmony_ci* @see ubrk_countAvailable 4317777dab0Sopenharmony_ci* @stable ICU 2.0 4327777dab0Sopenharmony_ci*/ 4337777dab0Sopenharmony_ciU_CAPI const char* U_EXPORT2 4347777dab0Sopenharmony_ciubrk_getAvailable(int32_t index); 4357777dab0Sopenharmony_ci 4367777dab0Sopenharmony_ci/** 4377777dab0Sopenharmony_ci* Determine how many locales have text breaking information available. 4387777dab0Sopenharmony_ci* This function is most useful as determining the loop ending condition for 4397777dab0Sopenharmony_ci* calls to \ref ubrk_getAvailable. 4407777dab0Sopenharmony_ci* @return The number of locales for which text breaking information is available. 4417777dab0Sopenharmony_ci* @see ubrk_getAvailable 4427777dab0Sopenharmony_ci* @stable ICU 2.0 4437777dab0Sopenharmony_ci*/ 4447777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 4457777dab0Sopenharmony_ciubrk_countAvailable(void); 4467777dab0Sopenharmony_ci 4477777dab0Sopenharmony_ci 4487777dab0Sopenharmony_ci/** 4497777dab0Sopenharmony_ci* Returns true if the specified position is a boundary position. As a side 4507777dab0Sopenharmony_ci* effect, leaves the iterator pointing to the first boundary position at 4517777dab0Sopenharmony_ci* or after "offset". 4527777dab0Sopenharmony_ci* @param bi The break iterator to use. 4537777dab0Sopenharmony_ci* @param offset the offset to check. 4547777dab0Sopenharmony_ci* @return True if "offset" is a boundary position. 4557777dab0Sopenharmony_ci* @stable ICU 2.0 4567777dab0Sopenharmony_ci*/ 4577777dab0Sopenharmony_ciU_CAPI UBool U_EXPORT2 4587777dab0Sopenharmony_ciubrk_isBoundary(UBreakIterator *bi, int32_t offset); 4597777dab0Sopenharmony_ci 4607777dab0Sopenharmony_ci/** 4617777dab0Sopenharmony_ci * Return the status from the break rule that determined the most recently 4627777dab0Sopenharmony_ci * returned break position. The values appear in the rule source 4637777dab0Sopenharmony_ci * within brackets, {123}, for example. For rules that do not specify a 4647777dab0Sopenharmony_ci * status, a default value of 0 is returned. 4657777dab0Sopenharmony_ci * <p> 4667777dab0Sopenharmony_ci * For word break iterators, the possible values are defined in enum UWordBreak. 4677777dab0Sopenharmony_ci * @stable ICU 2.2 4687777dab0Sopenharmony_ci */ 4697777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 4707777dab0Sopenharmony_ciubrk_getRuleStatus(UBreakIterator *bi); 4717777dab0Sopenharmony_ci 4727777dab0Sopenharmony_ci/** 4737777dab0Sopenharmony_ci * Get the statuses from the break rules that determined the most recently 4747777dab0Sopenharmony_ci * returned break position. The values appear in the rule source 4757777dab0Sopenharmony_ci * within brackets, {123}, for example. The default status value for rules 4767777dab0Sopenharmony_ci * that do not explicitly provide one is zero. 4777777dab0Sopenharmony_ci * <p> 4787777dab0Sopenharmony_ci * For word break iterators, the possible values are defined in enum UWordBreak. 4797777dab0Sopenharmony_ci * @param bi The break iterator to use 4807777dab0Sopenharmony_ci * @param fillInVec an array to be filled in with the status values. 4817777dab0Sopenharmony_ci * @param capacity the length of the supplied vector. A length of zero causes 4827777dab0Sopenharmony_ci * the function to return the number of status values, in the 4837777dab0Sopenharmony_ci * normal way, without attempting to store any values. 4847777dab0Sopenharmony_ci * @param status receives error codes. 4857777dab0Sopenharmony_ci * @return The number of rule status values from rules that determined 4867777dab0Sopenharmony_ci * the most recent boundary returned by the break iterator. 4877777dab0Sopenharmony_ci * @stable ICU 3.0 4887777dab0Sopenharmony_ci */ 4897777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 4907777dab0Sopenharmony_ciubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); 4917777dab0Sopenharmony_ci 4927777dab0Sopenharmony_ci/** 4937777dab0Sopenharmony_ci * Return the locale of the break iterator. You can choose between the valid and 4947777dab0Sopenharmony_ci * the actual locale. 4957777dab0Sopenharmony_ci * @param bi break iterator 4967777dab0Sopenharmony_ci * @param type locale type (valid or actual) 4977777dab0Sopenharmony_ci * @param status error code 4987777dab0Sopenharmony_ci * @return locale string 4997777dab0Sopenharmony_ci * @stable ICU 2.8 5007777dab0Sopenharmony_ci */ 5017777dab0Sopenharmony_ciU_CAPI const char* U_EXPORT2 5027777dab0Sopenharmony_ciubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status); 5037777dab0Sopenharmony_ci 5047777dab0Sopenharmony_ci/** 5057777dab0Sopenharmony_ci * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator. 5067777dab0Sopenharmony_ci * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator 5077777dab0Sopenharmony_ci * more quickly than using ubrk_openRules. The compiled rules are not compatible across 5087777dab0Sopenharmony_ci * different major versions of ICU, nor across platforms of different endianness or 5097777dab0Sopenharmony_ci * different base character set family (ASCII vs EBCDIC). Supports preflighting (with 5107777dab0Sopenharmony_ci * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to 5117777dab0Sopenharmony_ci * the binaryRules buffer. However, whether preflighting or not, if the actual length 5127777dab0Sopenharmony_ci * is greater than INT32_MAX, then the function returns 0 and sets *status to 5137777dab0Sopenharmony_ci * U_INDEX_OUTOFBOUNDS_ERROR. 5147777dab0Sopenharmony_ci 5157777dab0Sopenharmony_ci * @param bi The break iterator to use. 5167777dab0Sopenharmony_ci * @param binaryRules Buffer to receive the compiled binary rules; set to NULL for 5177777dab0Sopenharmony_ci * preflighting. 5187777dab0Sopenharmony_ci * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for 5197777dab0Sopenharmony_ci * preflighting. Must be >= 0. 5207777dab0Sopenharmony_ci * @param status Pointer to UErrorCode to receive any errors, such as 5217777dab0Sopenharmony_ci * U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or 5227777dab0Sopenharmony_ci * U_ILLEGAL_ARGUMENT_ERROR. 5237777dab0Sopenharmony_ci * @return The actual byte length of the binary rules, if <= INT32_MAX; 5247777dab0Sopenharmony_ci * otherwise 0. If not preflighting and this is larger than 5257777dab0Sopenharmony_ci * rulesCapacity, *status will be set to an error. 5267777dab0Sopenharmony_ci * @see ubrk_openBinaryRules 5277777dab0Sopenharmony_ci * @stable ICU 59 5287777dab0Sopenharmony_ci */ 5297777dab0Sopenharmony_ciU_CAPI int32_t U_EXPORT2 5307777dab0Sopenharmony_ciubrk_getBinaryRules(UBreakIterator *bi, 5317777dab0Sopenharmony_ci uint8_t * binaryRules, int32_t rulesCapacity, 5327777dab0Sopenharmony_ci UErrorCode * status); 5337777dab0Sopenharmony_ci 5347777dab0Sopenharmony_ci#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 5357777dab0Sopenharmony_ci 5367777dab0Sopenharmony_ci#endif 537