1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (c) 1996-2015, International Business Machines Corporation and others. 6 * All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #ifndef UCOL_H 11 #define UCOL_H 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_COLLATION 16 17 #include "unicode/parseerr.h" 18 #include "unicode/uloc.h" 19 #include "unicode/uset.h" 20 #include "unicode/uscript.h" 21 22 #if U_SHOW_CPLUSPLUS_API 23 #include "unicode/localpointer.h" 24 #endif // U_SHOW_CPLUSPLUS_API 25 26 /** 27 * \file 28 * \brief C API: Collator 29 * 30 * <h2> Collator C API </h2> 31 * 32 * The C API for Collator performs locale-sensitive 33 * string comparison. You use this service to build 34 * searching and sorting routines for natural language text. 35 * <p> 36 * For more information about the collation service see 37 * <a href="https://unicode-org.github.io/icu/userguide/collation">the User Guide</a>. 38 * <p> 39 * The collation service provides correct sorting orders for most locales supported in ICU. 40 * If specific data for a locale is not available, the order eventually falls back 41 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>. 42 * <p> 43 * Sort ordering may be customized by providing your own set of rules. For more on 44 * this subject see the <a href="https://unicode-org.github.io/icu/userguide/collation/customization"> 45 * Collation Customization</a> section of the User Guide. 46 * <p> 47 * @see UCollationResult 48 * @see UNormalizationMode 49 * @see UCollationStrength 50 * @see UCollationElements 51 */ 52 53 /** A collator. 54 * For usage in C programs. 
55 */ 56 struct UCollator; 57 /** Structure representing a collator object instance. 58 * @stable ICU 2.0 59 */ 60 typedef struct UCollator UCollator; 61 62 63 /** 64 * UCOL_LESS is returned if the source string compares less than the target 65 * string in the ucol_strcoll() function. 66 * UCOL_EQUAL is returned if the source string compares equal to the target 67 * string in the ucol_strcoll() function. 68 * UCOL_GREATER is returned if the source string compares greater than the 69 * target string in the ucol_strcoll() function. 70 * @see ucol_strcoll() 71 * <p> 72 * Possible values for a comparison result. 73 * @stable ICU 2.0 74 */ 75 typedef enum { 76 /** source string == target string */ 77 UCOL_EQUAL = 0, 78 /** source string > target string */ 79 UCOL_GREATER = 1, 80 /** source string < target string */ 81 UCOL_LESS = -1 82 } UCollationResult ; 83 84 85 /** Enum containing attribute values for controlling collation behavior. 86 * Here are all the allowable values. Not every attribute can take every value. The only 87 * universal value is UCOL_DEFAULT, which resets the attribute value to the predefined 88 * value for that locale. 89 * @stable ICU 2.0 90 */ 91 typedef enum { 92 /** Accepted by all attributes; resets the attribute to the predefined value for the locale */ 93 UCOL_DEFAULT = -1, 94 95 /** Primary collation strength */ 96 UCOL_PRIMARY = 0, 97 /** Secondary collation strength */ 98 UCOL_SECONDARY = 1, 99 /** Tertiary collation strength */ 100 UCOL_TERTIARY = 2, 101 /** Default collation strength */ 102 UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY, 103 /** Limit marker: auto-assigned the value after UCOL_DEFAULT_STRENGTH, i.e. 3 (the same numeric value as UCOL_QUATERNARY) */ UCOL_CE_STRENGTH_LIMIT, 104 /** Quaternary collation strength */ 105 UCOL_QUATERNARY=3, 106 /** Identical collation strength */ 107 UCOL_IDENTICAL=15, 108 /** Limit marker: auto-assigned the value after UCOL_IDENTICAL, i.e. 16 (the same numeric value as UCOL_OFF) */ UCOL_STRENGTH_LIMIT, 109 110 /** Turn the feature off - works for UCOL_FRENCH_COLLATION, 111 UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE, UCOL_NUMERIC_COLLATION 112 & UCOL_DECOMPOSITION_MODE (alias of UCOL_NORMALIZATION_MODE). Also the default value of UCOL_CASE_FIRST. */ 113 UCOL_OFF = 16, 114 /** Turn the feature on - works for UCOL_FRENCH_COLLATION, 115 UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE, UCOL_NUMERIC_COLLATION 116 & UCOL_DECOMPOSITION_MODE (alias of UCOL_NORMALIZATION_MODE). */ 117 UCOL_ON = 
17, 118 119 /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be shifted: code points with primary weights equal to or below the variable top are ignored on the primary level and moved to the quaternary level */ 120 UCOL_SHIFTED = 20, 121 /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be non ignorable (the default for that attribute) */ 122 UCOL_NON_IGNORABLE = 21, 123 124 /** Valid for UCOL_CASE_FIRST - 125 lower case sorts before upper case */ 126 UCOL_LOWER_FIRST = 24, 127 /** Valid for UCOL_CASE_FIRST - upper case sorts before lower case */ 128 UCOL_UPPER_FIRST = 25 129 } UColAttributeValue; 130 131 /** 132 * Enum containing the codes for reordering segments of the collation table that are not script 133 * codes. These reordering codes are to be used in conjunction with the script codes. 134 * @see ucol_getReorderCodes 135 * @see ucol_setReorderCodes 136 * @see ucol_getEquivalentReorderCodes 137 * @see UScriptCode 138 * @stable ICU 4.8 139 */ 140 typedef enum { 141 /** 142 * A special reordering code that is used to specify the default 143 * reordering codes for a locale. 144 * @stable ICU 4.8 145 */ 146 UCOL_REORDER_CODE_DEFAULT = -1, 147 /** 148 * A special reordering code that is used to specify no reordering codes. 149 * @stable ICU 4.8 150 */ 151 UCOL_REORDER_CODE_NONE = USCRIPT_UNKNOWN, 152 /** 153 * A special reordering code that is used to specify all other codes used for 154 * reordering except for the codes listed as UColReorderCode values and those 155 * listed explicitly in a reordering. 156 * Note: this enumerator is defined with the same value (USCRIPT_UNKNOWN) as UCOL_REORDER_CODE_NONE. 157 * @stable ICU 4.8 */ 158 UCOL_REORDER_CODE_OTHERS = USCRIPT_UNKNOWN, 159 /** 160 * Characters with the space property. 161 * This is equivalent to the rule value "space". 162 * @stable ICU 4.8 163 */ 164 UCOL_REORDER_CODE_SPACE = 0x1000, 165 /** 166 * The first entry in the enumeration of reordering groups. This is intended for use in 167 * range checking and enumeration of the reorder codes. 168 * @stable ICU 4.8 169 */ 170 UCOL_REORDER_CODE_FIRST = UCOL_REORDER_CODE_SPACE, 171 /** 172 * Characters with the punctuation property. 173 * This is equivalent to the rule value "punct". 
174 * @stable ICU 4.8 175 */ 176 UCOL_REORDER_CODE_PUNCTUATION = 0x1001, 177 /** 178 * Characters with the symbol property. 179 * This is equivalent to the rule value "symbol". 180 * @stable ICU 4.8 181 */ 182 UCOL_REORDER_CODE_SYMBOL = 0x1002, 183 /** 184 * Characters with the currency property. 185 * This is equivalent to the rule value "currency". 186 * @stable ICU 4.8 187 */ 188 UCOL_REORDER_CODE_CURRENCY = 0x1003, 189 /** 190 * Characters with the digit property. 191 * This is equivalent to the rule value "digit". 192 * @stable ICU 4.8 193 */ 194 UCOL_REORDER_CODE_DIGIT = 0x1004 195 } UColReorderCode; 196 197 /** 198 * Base letter represents a primary difference. Set comparison 199 * level to UCOL_PRIMARY to ignore secondary and tertiary differences. 200 * Use this to set the strength of a Collator object. 201 * Example of primary difference, "abc" < "abd". 202 * 203 * Diacritical differences on the same base letter represent a secondary 204 * difference. Set comparison level to UCOL_SECONDARY to ignore tertiary 205 * differences. Use this to set the strength of a Collator object. 206 * Example of secondary difference, "ä" >> "a". 207 * 208 * Uppercase and lowercase versions of the same character represent a 209 * tertiary difference. Set comparison level to UCOL_TERTIARY to include 210 * all comparison differences. Use this to set the strength of a Collator 211 * object. 212 * Example of tertiary difference, "abc" <<< "ABC". 213 * 214 * Two characters are considered "identical" when they have the same 215 * Unicode spellings (UCOL_IDENTICAL). 216 * For example, "ä" == "ä". 217 * 218 * UCollationStrength is also used to determine the strength of sort keys 219 * generated from UCollator objects. 220 * These values can now be found in the UColAttributeValue enum. 221 * @stable ICU 2.0 222 **/ 223 typedef UColAttributeValue UCollationStrength; 224 225 /** Attributes that collation service understands. 
All the attributes can take UCOL_DEFAULT 226 * value, as well as the values specific to each one. 227 * @stable ICU 2.0 228 */ 229 typedef enum { 230 /** Attribute for direction of secondary weights - used in Canadian French. 231 * Acceptable values are UCOL_ON, which results in secondary weights 232 * being considered backwards, and UCOL_OFF, which treats secondary 233 * weights in the order they appear. 234 * @stable ICU 2.0 235 */ 236 UCOL_FRENCH_COLLATION, 237 /** Attribute for handling variable elements. 238 * Acceptable values are UCOL_NON_IGNORABLE (default) 239 * which treats all the codepoints with non-ignorable 240 * primary weights in the same way, 241 * and UCOL_SHIFTED which causes codepoints with primary 242 * weights that are equal to or below the variable top value 243 * to be ignored on primary level and moved to the quaternary 244 * level. 245 * @stable ICU 2.0 246 */ 247 UCOL_ALTERNATE_HANDLING, 248 /** Controls the ordering of upper and lower case letters. 249 * Acceptable values are UCOL_OFF (default), which orders 250 * upper and lower case letters in accordance with their tertiary 251 * weights, UCOL_UPPER_FIRST which forces upper case letters to 252 * sort before lower case letters, and UCOL_LOWER_FIRST which does 253 * the opposite. 254 * @stable ICU 2.0 255 */ 256 UCOL_CASE_FIRST, 257 /** Controls whether an extra case level (positioned before the third 258 * level) is generated or not. Acceptable values are UCOL_OFF (default), 259 * when the case level is not generated, and UCOL_ON which causes the case 260 * level to be generated. Contents of the case level are affected by 261 * the value of the UCOL_CASE_FIRST attribute. A simple way to ignore 262 * accent differences in a string is to set the strength to UCOL_PRIMARY 263 * and enable the case level. 264 * @stable ICU 2.0 265 */ 266 UCOL_CASE_LEVEL, 267 /** Controls whether the normalization check and necessary normalizations 268 * are performed. 
When set to UCOL_OFF (default) no normalization check 269 * is performed. The correctness of the result is guaranteed only if the 270 * input data is in so-called FCD form (see the User Guide for more information). 271 * When set to UCOL_ON, an incremental check is performed to see whether 272 * the input data is in the FCD form. If the data is not in the FCD form, 273 * incremental NFD normalization is performed. 274 * @stable ICU 2.0 275 */ 276 UCOL_NORMALIZATION_MODE, 277 /** An alias for the UCOL_NORMALIZATION_MODE attribute. 278 * @stable ICU 2.0 279 */ 280 UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE, 281 /** The strength attribute. Can be either UCOL_PRIMARY, UCOL_SECONDARY, 282 * UCOL_TERTIARY, UCOL_QUATERNARY or UCOL_IDENTICAL. The usual strength 283 * for most locales (except Japanese) is tertiary. 284 * 285 * Quaternary strength 286 * is useful when combined with the UCOL_SHIFTED setting of the UCOL_ALTERNATE_HANDLING 287 * attribute and for JIS X 4061 collation, when it is used to distinguish 288 * between Katakana and Hiragana. 289 * Otherwise, quaternary level 290 * is affected only by the number of non-ignorable code points in 291 * the string. 292 * 293 * Identical strength is rarely useful, as it amounts 294 * to comparing the code points of the NFD form of the string. 295 * @stable ICU 2.0 296 */ 297 UCOL_STRENGTH, 298 /** 299 * When turned on (UCOL_ON), this attribute makes 300 * substrings of digits sort according to their numeric values. 301 * 302 * This is a way to get '100' to sort AFTER '2'. Note that the longest 303 * digit substring that can be treated as a single unit is 304 * 254 digits (not counting leading zeros). If a digit substring is 305 * longer than that, the digits beyond the limit will be treated as a 306 * separate digit substring. 307 * 308 * A "digit" in this sense is a code point with General_Category=Nd, 309 * which does not include circled numbers, roman numerals, etc. 
310 * Only a contiguous digit substring is considered, that is, 311 * non-negative integers without separators. 312 * There is no support for plus/minus signs, decimals, exponents, etc. 313 * Note: defined as UCOL_STRENGTH + 2; the value UCOL_STRENGTH + 1 is skipped in this listing. 314 * @stable ICU 2.8 315 */ 316 UCOL_NUMERIC_COLLATION = UCOL_STRENGTH + 2 317 } UColAttribute; 318 319 /** Options for retrieving the rule string. 320 * @stable ICU 2.0 321 */ 322 typedef enum { 323 /** 324 * Retrieves the tailoring rules only. 325 * Same as calling ucol_getRules(), which takes no UColRuleOption. 326 * @stable ICU 2.0 327 */ 328 UCOL_TAILORING_ONLY, 329 /** 330 * Retrieves the "UCA rules" concatenated with the tailoring rules. 331 * The "UCA rules" are an <i>approximation</i> of the root collator's sort order. 332 * They are almost never used or useful at runtime and can be removed from the data. 333 * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales 334 * @stable ICU 2.0 335 */ 336 UCOL_FULL_RULES 337 } UColRuleOption ; 338 339 /** 340 * Open a UCollator for comparing strings. 341 * 342 * For some languages, multiple collation types are available; 343 * for example, "de@collation=phonebook". 344 * Starting with ICU 54, collation attributes can be specified via locale keywords as well, 345 * in the old locale extension syntax ("el@colCaseFirst=upper") 346 * or in language tag syntax ("el-u-kf-upper"). 347 * See <a href="https://unicode-org.github.io/icu/userguide/collation/api">User Guide: Collation API</a>. 348 * 349 * The UCollator pointer is used in all the calls to the Collation 350 * service. When finished, the collator must be disposed of by calling 351 * {@link #ucol_close }. 352 * @param loc The locale containing the required collation rules. 353 * Special values for locales can be passed in - 354 * if NULL is passed for the locale, the default locale 355 * collation rules will be used. If an empty string ("") or 356 * "root" is passed, the root collator will be returned. 
357 * @param status A pointer to a UErrorCode to receive any errors 358 * @return A pointer to a UCollator, or 0 if an error occurred. 359 * @see ucol_openRules 360 * @see ucol_clone 361 * @see ucol_close 362 * @stable ICU 2.0 363 */ 364 U_CAPI UCollator* U_EXPORT2 365 ucol_open(const char *loc, UErrorCode *status); 366 367 /** 368 * Produce a UCollator instance according to the rules supplied. 369 * The rules are used to change the default ordering, defined in the 370 * UCA, in a process called tailoring. The resulting UCollator pointer 371 * can be used in the same way as the one obtained by {@link #ucol_open }. 372 * @param rules A string describing the collation rules. For the syntax 373 * of the rules please see the User Guide. 374 * @param rulesLength The length of rules, or -1 if null-terminated. 375 * @param normalizationMode The normalization mode: One of 376 * UCOL_OFF (expect the text to not need normalization), 377 * UCOL_ON (normalize), or 378 * UCOL_DEFAULT (set the mode according to the rules) 379 * @param strength The default collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY, 380 * UCOL_TERTIARY, UCOL_IDENTICAL, UCOL_DEFAULT_STRENGTH - can also be set in the rules. 381 * @param parseError A pointer to UParseError to receive information about errors 382 * that occurred during parsing. This argument can currently be set 383 * to NULL, but at the user's own risk. Please provide a real structure. 384 * @param status A pointer to a UErrorCode to receive any errors 385 * @return A pointer to a UCollator. It is not guaranteed that NULL will be returned in case 386 * of error - please use the status argument to check for errors. 
387 * @see ucol_open 388 * @see ucol_clone 389 * @see ucol_close 390 * @stable ICU 2.0 391 */ 392 U_CAPI UCollator* U_EXPORT2 393 ucol_openRules( const UChar *rules, 394 int32_t rulesLength, 395 UColAttributeValue normalizationMode, 396 UCollationStrength strength, 397 UParseError *parseError, 398 UErrorCode *status); 399 400 /** 401 * Get the set of contractions and/or the set of expansions defined by the collator 402 * (a set passed as NULL is skipped). The expansions set includes both the root collator's expansions and the expansions defined by the tailoring. 403 * @param coll the collator to query 404 * @param contractions if not NULL, the set to hold the contractions 405 * @param expansions if not NULL, the set to hold the expansions 406 * @param addPrefixes add the prefix contextual elements to contractions 407 * @param status to hold the error code 408 * 409 * @stable ICU 3.4 410 */ 411 U_CAPI void U_EXPORT2 412 ucol_getContractionsAndExpansions( const UCollator *coll, 413 USet *contractions, USet *expansions, 414 UBool addPrefixes, UErrorCode *status); 415 416 /** 417 * Close a UCollator. 418 * Once closed, a UCollator should not be used. Every open collator should 419 * be closed; otherwise, a memory leak will result. 420 * @param coll The UCollator to close. 421 * @see ucol_open 422 * @see ucol_openRules 423 * @see ucol_clone 424 * @stable ICU 2.0 425 */ 426 U_CAPI void U_EXPORT2 427 ucol_close(UCollator *coll); 428 429 #if U_SHOW_CPLUSPLUS_API 430 431 U_NAMESPACE_BEGIN 432 433 /** 434 * \class LocalUCollatorPointer 435 * "Smart pointer" class that closes a UCollator via ucol_close(). 436 * For most methods see the LocalPointerBase base class. 437 * 438 * @see LocalPointerBase 439 * @see LocalPointer 440 * @stable ICU 4.4 441 */ 442 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCollatorPointer, UCollator, ucol_close); 443 444 U_NAMESPACE_END 445 446 #endif 447 448 /** 449 * Compare two strings. 450 * The strings will be compared using the options already specified. 451 * @param coll The UCollator containing the comparison rules. 
452 * @param source The source string. 453 * @param sourceLength The length of source, or -1 if null-terminated. 454 * @param target The target string. 455 * @param targetLength The length of target, or -1 if null-terminated. 456 * @return The result of comparing the strings; one of UCOL_EQUAL, 457 * UCOL_GREATER, UCOL_LESS 458 * @see ucol_greater 459 * @see ucol_greaterOrEqual 460 * @see ucol_equal 461 * @stable ICU 2.0 462 */ 463 U_CAPI UCollationResult U_EXPORT2 464 ucol_strcoll( const UCollator *coll, 465 const UChar *source, 466 int32_t sourceLength, 467 const UChar *target, 468 int32_t targetLength); 469 470 /** 471 * Compare two strings in UTF-8. 472 * The strings will be compared using the options already specified. 473 * Note: When an input string contains a malformed UTF-8 byte sequence, 474 * this function treats these bytes as REPLACEMENT CHARACTER (U+FFFD). 475 * @param coll The UCollator containing the comparison rules. 476 * @param source The source UTF-8 string. 477 * @param sourceLength The length of source, or -1 if null-terminated. 478 * @param target The target UTF-8 string. 479 * @param targetLength The length of target, or -1 if null-terminated. 480 * @param status A pointer to a UErrorCode to receive any errors. 481 * @return The result of comparing the strings; one of UCOL_EQUAL, 482 * UCOL_GREATER, UCOL_LESS. 483 * @see ucol_greater 484 * @see ucol_greaterOrEqual 485 * @see ucol_equal 486 * @stable ICU 50 487 */ 488 U_CAPI UCollationResult U_EXPORT2 489 ucol_strcollUTF8( 490 const UCollator *coll, 491 const char *source, 492 int32_t sourceLength, 493 const char *target, 494 int32_t targetLength, 495 UErrorCode *status); 496 497 /** 498 * Determine if one string is greater than another. 499 * This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER. 500 * @param coll The UCollator containing the comparison rules. 501 * @param source The source string. 502 * @param sourceLength The length of source, or -1 if null-terminated. 
503 * @param target The target string. 504 * @param targetLength The length of target, or -1 if null-terminated. 505 * @return true if source is greater than target, false otherwise. 506 * @see ucol_strcoll 507 * @see ucol_greaterOrEqual 508 * @see ucol_equal 509 * @stable ICU 2.0 510 */ 511 U_CAPI UBool U_EXPORT2 512 ucol_greater(const UCollator *coll, 513 const UChar *source, int32_t sourceLength, 514 const UChar *target, int32_t targetLength); 515 516 /** 517 * Determine if one string is greater than or equal to another. 518 * This function is equivalent to {@link #ucol_strcoll } != UCOL_LESS, i.e. the comparison result is UCOL_GREATER or UCOL_EQUAL. 519 * @param coll The UCollator containing the comparison rules. 520 * @param source The source string. 521 * @param sourceLength The length of source, or -1 if null-terminated. 522 * @param target The target string. 523 * @param targetLength The length of target, or -1 if null-terminated. 524 * @return true if source is greater than or equal to target, false otherwise. 525 * @see ucol_strcoll 526 * @see ucol_greater 527 * @see ucol_equal 528 * @stable ICU 2.0 529 */ 530 U_CAPI UBool U_EXPORT2 531 ucol_greaterOrEqual(const UCollator *coll, 532 const UChar *source, int32_t sourceLength, 533 const UChar *target, int32_t targetLength); 534 535 /** 536 * Compare two strings for equality under the collator's comparison rules. 537 * This function is equivalent to {@link #ucol_strcoll } == UCOL_EQUAL. 538 * @param coll The UCollator containing the comparison rules. 539 * @param source The source string. 540 * @param sourceLength The length of source, or -1 if null-terminated. 541 * @param target The target string. 542 * @param targetLength The length of target, or -1 if null-terminated. 
543 * @return true if source is equal to target, false otherwise 544 * @see ucol_strcoll 545 * @see ucol_greater 546 * @see ucol_greaterOrEqual 547 * @stable ICU 2.0 548 */ 549 U_CAPI UBool U_EXPORT2 550 ucol_equal(const UCollator *coll, 551 const UChar *source, int32_t sourceLength, 552 const UChar *target, int32_t targetLength); 553 554 555 /** 556 * Get the collation strength used in a UCollator. 557 * The strength influences how strings are compared. 558 * @param coll The UCollator to query. 559 * @return The collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY, 560 * UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL. 561 * @see ucol_setStrength 562 * @stable ICU 2.0 563 */ 564 U_CAPI UCollationStrength U_EXPORT2 565 ucol_getStrength(const UCollator *coll); 566 567 /** 568 * Set the collation strength used in a UCollator. 569 * The strength influences how strings are compared. 570 * @param coll The UCollator to modify. 571 * @param strength The desired collation strength; one of UCOL_PRIMARY, 572 * UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL, or UCOL_DEFAULT (which resets the strength to the predefined value for the locale). 573 * @see ucol_getStrength 574 * @stable ICU 2.0 575 */ 576 U_CAPI void U_EXPORT2 577 ucol_setStrength(UCollator *coll, 578 UCollationStrength strength); 579 580 /** 581 * Retrieves the reordering codes for this collator. 582 * These reordering codes are a combination of UScript codes and UColReorderCode entries. 583 * @param coll The UCollator to query. 584 * @param dest The array to fill with the reordering codes. 585 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function 586 * will only return the length of the result without writing any codes (pre-flighting). 587 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a 588 * failure before the function call. 589 * @return The number of reordering codes written to the dest array. 
590 * @see ucol_setReorderCodes 591 * @see ucol_getEquivalentReorderCodes 592 * @see UScriptCode 593 * @see UColReorderCode 594 * @stable ICU 4.8 595 */ 596 U_CAPI int32_t U_EXPORT2 597 ucol_getReorderCodes(const UCollator* coll, 598 int32_t* dest, 599 int32_t destCapacity, 600 UErrorCode *pErrorCode); 601 /** 602 * Sets the reordering codes for this collator. 603 * Collation reordering allows scripts and some other groups of characters 604 * to be moved relative to each other. This reordering is done on top of 605 * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed 606 * at the start and/or the end of the collation order. These groups are specified using 607 * UScript codes and UColReorderCode entries. 608 * 609 * <p>By default, reordering codes specified for the start of the order are placed in the 610 * order given after several special non-script blocks. These special groups of characters 611 * are space, punctuation, symbol, currency, and digit. These special groups are represented with 612 * UColReorderCode entries (UCOL_REORDER_CODE_SPACE through UCOL_REORDER_CODE_DIGIT). Script groups can be intermingled with 613 * these special non-script groups if those special groups are explicitly specified in the reordering. 614 * 615 * <p>The special code OTHERS (UCOL_REORDER_CODE_OTHERS) stands for any script that is not explicitly 616 * mentioned in the list of reordering codes given. Anything that is after OTHERS 617 * will go at the very end of the reordering in the order given. 618 * 619 * <p>The special reorder code DEFAULT (UCOL_REORDER_CODE_DEFAULT) will reset the reordering for this collator 620 * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that 621 * was specified when this collator was created from resource data or from rules. The 622 * DEFAULT code <b>must</b> be the sole code supplied when it is used. 623 * If not, then U_ILLEGAL_ARGUMENT_ERROR will be set. 624 * 625 * <p>The special reorder code NONE (UCOL_REORDER_CODE_NONE) will remove any reordering for this collator. 
626 * The result of setting no reordering will be to have the DUCET/CLDR ordering used. The 627 * NONE code <b>must</b> be the sole code supplied when it is used. 628 * 629 * @param coll The UCollator to set. 630 * @param reorderCodes An array of script codes in the new order. This can be NULL if the 631 * length is also set to 0. An empty array will clear any reordering codes on the collator. 632 * @param reorderCodesLength The length of reorderCodes. 633 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a 634 * failure before the function call. 635 * @see ucol_getReorderCodes 636 * @see ucol_getEquivalentReorderCodes 637 * @see UScriptCode 638 * @see UColReorderCode 639 * @stable ICU 4.8 640 */ 641 U_CAPI void U_EXPORT2 642 ucol_setReorderCodes(UCollator* coll, 643 const int32_t* reorderCodes, 644 int32_t reorderCodesLength, 645 UErrorCode *pErrorCode); 646 647 /** 648 * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder 649 * codes will be grouped and must reorder together. 650 * Beginning with ICU 55, scripts only reorder together if they are primary-equal, 651 * for example Hiragana and Katakana. 652 * 653 * @param reorderCode The reorder code to determine equivalence for (see UScriptCode and UColReorderCode). 654 * @param dest The array to fill with the equivalent reorder codes. 655 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function 656 * will only return the length of the result without writing any codes (pre-flighting). 657 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate 658 * a failure before the function call. 659 * @return The number of reordering codes written to the dest array. 
660 * @see ucol_setReorderCodes 661 * @see ucol_getReorderCodes 662 * @see UScriptCode 663 * @see UColReorderCode 664 * @stable ICU 4.8 665 */ 666 U_CAPI int32_t U_EXPORT2 667 ucol_getEquivalentReorderCodes(int32_t reorderCode, 668 int32_t* dest, 669 int32_t destCapacity, 670 UErrorCode *pErrorCode); 671 672 /** 673 * Get the display name for the collator of a given locale. 674 * The display name is suitable for presentation to a user. 675 * @param objLoc The locale of the collator in question. 676 * @param dispLoc The locale for display. 677 * @param result A pointer to a buffer to receive the display name. 678 * @param resultLength The maximum size of result. 679 * @param status A pointer to a UErrorCode to receive any errors. 680 * @return The total buffer size needed; if greater than resultLength, 681 * the output was truncated. 682 * @stable ICU 2.0 683 */ 684 U_CAPI int32_t U_EXPORT2 685 ucol_getDisplayName( const char *objLoc, 686 const char *dispLoc, 687 UChar *result, 688 int32_t resultLength, 689 UErrorCode *status); 690 691 /** 692 * Get a locale for which collation rules are available. 693 * A UCollator in a locale returned by this function will perform the correct 694 * collation for the locale. 695 * @param localeIndex The index of the desired locale. 696 * @return A locale for which collation rules are available, or NULL (0) if none. 697 * @see ucol_countAvailable 698 * @stable ICU 2.0 699 */ 700 U_CAPI const char* U_EXPORT2 701 ucol_getAvailable(int32_t localeIndex); 702 703 /** 704 * Determine how many locales have collation rules available. 705 * This function is most useful for determining the loop ending condition for 706 * calls to {@link #ucol_getAvailable }. 707 * @return The number of locales for which collation rules are available. 
708 * @see ucol_getAvailable 709 * @stable ICU 2.0 710 */ 711 U_CAPI int32_t U_EXPORT2 712 ucol_countAvailable(void); 713 714 #if !UCONFIG_NO_SERVICE 715 /** 716 * Create a string enumerator of all locales for which a valid 717 * collator may be opened. 718 * @param status input-output error code 719 * @return a string enumeration over locale strings. The caller is 720 * responsible for closing the result. 721 * @stable ICU 3.0 722 */ 723 U_CAPI UEnumeration* U_EXPORT2 724 ucol_openAvailableLocales(UErrorCode *status); 725 #endif 726 727 /** 728 * Create a string enumerator of all possible keywords that are relevant to 729 * collation. At this point, the only recognized keyword for this 730 * service is "collation". 731 * @param status input-output error code 732 * @return a string enumeration over keyword strings. The caller is 733 * responsible for closing the result. 734 * @stable ICU 3.0 735 */ 736 U_CAPI UEnumeration* U_EXPORT2 737 ucol_getKeywords(UErrorCode *status); 738 739 /** 740 * Given a keyword, create a string enumeration of all values 741 * for that keyword that are currently in use. 742 * @param keyword a particular keyword as enumerated by 743 * ucol_getKeywords. If any other keyword is passed in, *status is set 744 * to U_ILLEGAL_ARGUMENT_ERROR. 745 * @param status input-output error code 746 * @return a string enumeration over collation keyword values, or NULL 747 * upon error. The caller is responsible for closing the result. 748 * @stable ICU 3.0 749 */ 750 U_CAPI UEnumeration* U_EXPORT2 751 ucol_getKeywordValues(const char *keyword, UErrorCode *status); 752 753 /** 754 * Given a key and a locale, returns an enumeration of string values in a preferred 755 * order that would make a difference. These are all and only those values where 756 * the open (creation) of the service with the locale formed from the input locale 757 * plus input keyword and that value has different behavior than creation with the 758 * input locale alone. 
759 * @param key one of the keys supported by this service. For now, only 760 * "collation" is supported. 761 * @param locale the locale 762 * @param commonlyUsed if set to true it will return only commonly used values 763 * with the given locale in preferred order. Otherwise, 764 * it will return all the available values for the locale. 765 * @param status error status 766 * @return a string enumeration over keyword values for the given key and the locale. 767 * @stable ICU 4.2 768 */ 769 U_CAPI UEnumeration* U_EXPORT2 770 ucol_getKeywordValuesForLocale(const char* key, 771 const char* locale, 772 UBool commonlyUsed, 773 UErrorCode* status); 774 775 /** 776 * Return the functionally equivalent locale for the specified 777 * input locale, with respect to the given keyword, for the 778 * collation service. If two different input locale + keyword 779 * combinations produce the same result locale, then collators 780 * instantiated for these two different input locales will behave 781 * equivalently. The converse is not always true; two collators 782 * may in fact be equivalent, but return different results, due to 783 * internal details. The return result has no other meaning than 784 * that stated above, and implies nothing as to the relationship 785 * between the two locales. This is intended for use by 786 * applications that wish to cache collators, or otherwise reuse 787 * collators when possible. The functional equivalent may change 788 * over time. For more information, please see the <a 789 * href="https://unicode-org.github.io/icu/userguide/locale#locales-and-services"> 790 * Locales and Services</a> section of the ICU User Guide. 791 * @param result fill-in buffer for the functionally equivalent result locale 792 * @param resultCapacity capacity of the fill-in buffer 793 * @param keyword a particular keyword as enumerated by 794 * ucol_getKeywords. 
795 * @param locale the specified input locale 796 * @param isAvailable if non-NULL, pointer to a fillin parameter that 797 * on return indicates whether the specified input locale was 'available' 798 * to the collation service. A locale is defined as 'available' if it 799 * physically exists within the collation locale data. 800 * @param status pointer to input-output error code 801 * @return the actual buffer size needed for the locale. If greater 802 * than resultCapacity, the returned full name will be truncated and 803 * an error code will be returned. 804 * @stable ICU 3.0 805 */ 806 U_CAPI int32_t U_EXPORT2 807 ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity, 808 const char* keyword, const char* locale, 809 UBool* isAvailable, UErrorCode* status); 810 811 /** 812 * Get the collation tailoring rules from a UCollator. 813 * The rules will follow the rule syntax. 814 * @param coll The UCollator to query. 815 * @param length 816 * @return The collation tailoring rules. 817 * @stable ICU 2.0 818 */ 819 U_CAPI const UChar* U_EXPORT2 820 ucol_getRules( const UCollator *coll, 821 int32_t *length); 822 823 /** 824 * Get a sort key for a string from a UCollator. 825 * Sort keys may be compared using <TT>strcmp</TT>. 826 * 827 * Note that sort keys are often less efficient than simply doing comparison. 828 * For more details, see the ICU User Guide. 829 * 830 * Like ICU functions that write to an output buffer, the buffer contents 831 * is undefined if the buffer capacity (resultLength parameter) is too small. 832 * Unlike ICU functions that write a string to an output buffer, 833 * the terminating zero byte is counted in the sort key length. 834 * @param coll The UCollator containing the collation rules. 835 * @param source The string to transform. 836 * @param sourceLength The length of source, or -1 if null-terminated. 837 * @param result A pointer to a buffer to receive the attribute. 838 * @param resultLength The maximum size of result. 
839 * @return The size needed to fully store the sort key. 840 * If there was an internal error generating the sort key, 841 * a zero value is returned. 842 * @see ucol_keyHashCode 843 * @stable ICU 2.0 844 */ 845 U_CAPI int32_t U_EXPORT2 846 ucol_getSortKey(const UCollator *coll, 847 const UChar *source, 848 int32_t sourceLength, 849 uint8_t *result, 850 int32_t resultLength); 851 852 /** enum that is taken by ucol_getBound API 853 * See below for explanation 854 * do not change the values assigned to the 855 * members of this enum. Underlying code 856 * depends on them having these numbers 857 * @stable ICU 2.0 858 */ 859 typedef enum { 860 /** lower bound */ 861 UCOL_BOUND_LOWER = 0, 862 /** upper bound that will match strings of exact size */ 863 UCOL_BOUND_UPPER = 1, 864 /** upper bound that will match all the strings that have the same initial substring as the given string */ 865 UCOL_BOUND_UPPER_LONG = 2 866 } UColBoundMode; 867 868 /** 869 * Produce a bound for a given sortkey and a number of levels. 870 * Return value is always the number of bytes needed, regardless of 871 * whether the result buffer was big enough or even valid.<br> 872 * Resulting bounds can be used to produce a range of strings that are 873 * between upper and lower bounds. For example, if bounds are produced 874 * for a sortkey of string "smith", strings between upper and lower 875 * bounds with one level would include "Smith", "SMITH", "sMiTh".<br> 876 * There are two upper bounds that can be produced. If UCOL_BOUND_UPPER 877 * is produced, strings matched would be as above. However, if bound 878 * produced using UCOL_BOUND_UPPER_LONG is used, the above example will 879 * also match "Smithsonian" and similar.<br> 880 * For more on usage, see example in cintltst/capitst.c in procedure 881 * TestBounds. 882 * Sort keys may be compared using <TT>strcmp</TT>. 883 * @param source The source sortkey. 884 * @param sourceLength The length of source, or -1 if null-terminated. 
885 * (If an unmodified sortkey is passed, it is always null 886 * terminated). 887 * @param boundType Type of bound required. It can be UCOL_BOUND_LOWER, which 888 * produces a lower inclusive bound, UCOL_BOUND_UPPER, that 889 * produces upper bound that matches strings of the same length 890 * or UCOL_BOUND_UPPER_LONG that matches strings that have the 891 * same starting substring as the source string. 892 * @param noOfLevels Number of levels required in the resulting bound (for most 893 * uses, the recommended value is 1). See users guide for 894 * explanation on number of levels a sortkey can have. 895 * @param result A pointer to a buffer to receive the resulting sortkey. 896 * @param resultLength The maximum size of result. 897 * @param status Used for returning error code if something went wrong. If the 898 * number of levels requested is higher than the number of levels 899 * in the source key, a warning (U_SORT_KEY_TOO_SHORT_WARNING) is 900 * issued. 901 * @return The size needed to fully store the bound. 902 * @see ucol_keyHashCode 903 * @stable ICU 2.1 904 */ 905 U_CAPI int32_t U_EXPORT2 906 ucol_getBound(const uint8_t *source, 907 int32_t sourceLength, 908 UColBoundMode boundType, 909 uint32_t noOfLevels, 910 uint8_t *result, 911 int32_t resultLength, 912 UErrorCode *status); 913 914 915 /** 916 * Merges two sort keys. The levels are merged with their corresponding counterparts 917 * (primaries with primaries, secondaries with secondaries etc.). Between the values 918 * from the same level a separator is inserted. 919 * 920 * This is useful, for example, for combining sort keys from first and last names 921 * to sort such pairs. 922 * See http://www.unicode.org/reports/tr10/#Merging_Sort_Keys 923 * 924 * The recommended way to achieve "merged" sorting is by 925 * concatenating strings with U+FFFE between them.
926 * The concatenation has the same sort order as the merged sort keys, 927 * but merge(getSortKey(str1), getSortKey(str2)) may differ from getSortKey(str1 + '\\uFFFE' + str2). 928 * Using strings with U+FFFE may yield shorter sort keys. 929 * 930 * For details about Sort Key Features see 931 * https://unicode-org.github.io/icu/userguide/collation/api#sort-key-features 932 * 933 * It is possible to merge multiple sort keys by consecutively merging 934 * another one with the intermediate result. 935 * 936 * The length of the merge result is the sum of the lengths of the input sort keys. 937 * 938 * Example (uncompressed): 939 * <pre>191B1D 01 050505 01 910505 00 940 * 1F2123 01 050505 01 910505 00</pre> 941 * will be merged as 942 * <pre>191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00</pre> 943 * 944 * If the destination buffer is not big enough, then its contents are undefined. 945 * If any of the source lengths are zero or any of the source pointers are NULL/undefined, 946 * the result is of size zero.
947 * 948 * @param src1 the first sort key 949 * @param src1Length the length of the first sort key, including the zero byte at the end; 950 * can be -1 if the function is to find the length 951 * @param src2 the second sort key 952 * @param src2Length the length of the second sort key, including the zero byte at the end; 953 * can be -1 if the function is to find the length 954 * @param dest the buffer where the merged sort key is written, 955 * can be NULL if destCapacity==0 956 * @param destCapacity the number of bytes in the dest buffer 957 * @return the length of the merged sort key, src1Length+src2Length; 958 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), 959 * in which cases the contents of dest are undefined 960 * @stable ICU 2.0 961 */ 962 U_CAPI int32_t U_EXPORT2 963 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 964 const uint8_t *src2, int32_t src2Length, 965 uint8_t *dest, int32_t destCapacity); 966 967 /** 968 * Universal attribute setter 969 * @param coll collator whose attributes are to be changed 970 * @param attr attribute type 971 * @param value attribute value 972 * @param status to indicate whether the operation went on smoothly or there were errors 973 * @see UColAttribute 974 * @see UColAttributeValue 975 * @see ucol_getAttribute 976 * @stable ICU 2.0 977 */ 978 U_CAPI void U_EXPORT2 979 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status); 980 981 /** 982 * Universal attribute getter 983 * @param coll collator whose attributes are to be queried 984 * @param attr attribute type 985 * @param status to indicate whether the operation went on smoothly or there were errors 986 * @return attribute value 987 * @see UColAttribute 988 * @see UColAttributeValue 989 * @see ucol_setAttribute 990 * @stable ICU 2.0 991 */ 992 U_CAPI UColAttributeValue U_EXPORT2 993 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status); 994 995
/** 996 * Sets the variable top to the top of the specified reordering group. 997 * The variable top determines the highest-sorting character 998 * which is affected by UCOL_ALTERNATE_HANDLING. 999 * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. 1000 * @param coll the collator 1001 * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, 1002 * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; 1003 * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group 1004 * @param pErrorCode Standard ICU error code. Its input value must 1005 * pass the U_SUCCESS() test, or else the function returns 1006 * immediately. Check for U_FAILURE() on output or use with 1007 * function chaining. (See User Guide for details.) 1008 * @see ucol_getMaxVariable 1009 * @stable ICU 53 1010 */ 1011 U_CAPI void U_EXPORT2 1012 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode); 1013 1014 /** 1015 * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. 1016 * @param coll the collator 1017 * @return the maximum variable reordering group. 1018 * @see ucol_setMaxVariable 1019 * @stable ICU 53 1020 */ 1021 U_CAPI UColReorderCode U_EXPORT2 1022 ucol_getMaxVariable(const UCollator *coll); 1023 1024 /** 1025 * Gets the variable top value of a Collator. 1026 * @param coll collator whose variable top is to be retrieved 1027 * @param status error code (not changed by function). If error code is set, 1028 * the return value is undefined. 1029 * @return the variable top primary weight 1030 * @see ucol_getMaxVariable 1031 * @see ucol_setVariableTop 1032 * @see ucol_restoreVariableTop 1033 * @stable ICU 2.0 1034 */ 1035 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status); 1036 1037 /** 1038 * Thread safe cloning operation. The result is a clone of a given collator.
1039 * @param coll collator to be cloned 1040 * @param status to indicate whether the operation went on smoothly or there were errors 1041 * @return pointer to the new clone 1042 * @see ucol_open 1043 * @see ucol_openRules 1044 * @see ucol_close 1045 * @stable ICU 71 1046 */ 1047 U_CAPI UCollator* U_EXPORT2 ucol_clone(const UCollator *coll, UErrorCode *status); 1048 1049 /** 1050 * Returns current rules. Delta defines whether full rules are returned or just the tailoring. 1051 * Returns number of UChars needed to store rules. If buffer is NULL or bufferLen is not enough 1052 * to store rules, will store up to available space. 1053 * 1054 * ucol_getRules() should normally be used instead. 1055 * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales 1056 * @param coll collator to get the rules from 1057 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. 1058 * @param buffer buffer to store the result in. If NULL, you'll get no rules. 1059 * @param bufferLen length of buffer to store rules in. If less than needed you'll get only the part that fits in. 1060 * @return the number of UChars needed to store the rules 1061 * @stable ICU 2.0 1062 * @see UCOL_FULL_RULES 1063 */ 1064 U_CAPI int32_t U_EXPORT2 1065 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen); 1066 1067 /** 1068 * Gets the locale name of the collator. If the collator 1069 * is instantiated from the rules, then this function returns 1070 * NULL. 1071 * @param coll The UCollator for which the locale is needed 1072 * @param type You can choose between requested, valid and actual 1073 * locale. For description see the definition of 1074 * ULocDataLocaleType in uloc.h 1075 * @param status error code of the operation 1076 * @return real locale name from which the collation data comes. 1077 * If the collator was instantiated from rules, returns 1078 * NULL.
1079 * @stable ICU 2.8 1080 */ 1081 U_CAPI const char * U_EXPORT2 1082 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status); 1083 1084 /** 1085 * Get a Unicode set that contains all the characters and sequences tailored in 1086 * this collator. The result must be disposed of by using uset_close. 1087 * @param coll The UCollator for which we want to get tailored chars 1088 * @param status error code of the operation 1089 * @return a pointer to newly created USet. Must be disposed of by using uset_close 1090 * @see ucol_openRules 1091 * @see uset_close 1092 * @stable ICU 2.4 1093 */ 1094 U_CAPI USet * U_EXPORT2 1095 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status); 1096 1097 /** Creates a binary image of a collator. This binary image can be stored and 1098 * later used to instantiate a collator using ucol_openBinary. 1099 * This API supports preflighting. 1100 * @param coll Collator 1101 * @param buffer a fill-in buffer to receive the binary image 1102 * @param capacity capacity of the destination buffer 1103 * @param status for catching errors 1104 * @return size of the image 1105 * @see ucol_openBinary 1106 * @stable ICU 3.2 1107 */ 1108 U_CAPI int32_t U_EXPORT2 1109 ucol_cloneBinary(const UCollator *coll, 1110 uint8_t *buffer, int32_t capacity, 1111 UErrorCode *status); 1112 1113 /** Opens a collator from a collator binary image created using 1114 * ucol_cloneBinary. Binary image used in instantiation of the 1115 * collator remains owned by the user and should stay around for 1116 * the lifetime of the collator. The API also takes a base collator 1117 * which must be the root collator. 1118 * @param bin binary image owned by the user and required through the 1119 * lifetime of the collator 1120 * @param length size of the image. If negative, the API will try to 1121 * figure out the length of the image 1122 * @param base Base collator, for lookup of untailored characters.
1123 * Must be the root collator, must not be NULL. 1124 * The base is required to be present through the lifetime of the collator. 1125 * @param status for catching errors 1126 * @return newly created collator 1127 * @see ucol_cloneBinary 1128 * @stable ICU 3.2 1129 */ 1130 U_CAPI UCollator* U_EXPORT2 1131 ucol_openBinary(const uint8_t *bin, int32_t length, 1132 const UCollator *base, 1133 UErrorCode *status); 1134 1135 1136 #endif /* #if !UCONFIG_NO_COLLATION */ 1137 1138 #endif /* UCOL_H */ 1139