1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (c) 1996-2015, International Business Machines Corporation and others.
6 * All Rights Reserved.
7 *******************************************************************************
8 */
9 
10 #ifndef UCOL_H
11 #define UCOL_H
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_COLLATION
16 
17 #include "unicode/parseerr.h"
18 #include "unicode/uloc.h"
19 #include "unicode/uset.h"
20 #include "unicode/uscript.h"
21 
22 #if U_SHOW_CPLUSPLUS_API
23 #include "unicode/localpointer.h"
24 #endif   // U_SHOW_CPLUSPLUS_API
25 
26 /**
27  * \file
28  * \brief C API: Collator
29  *
30  * <h2> Collator C API </h2>
31  *
32  * The C API for Collator performs locale-sensitive
33  * string comparison. You use this service to build
34  * searching and sorting routines for natural language text.
35  * <p>
36  * For more information about the collation service see
37  * <a href="https://unicode-org.github.io/icu/userguide/collation">the User Guide</a>.
38  * <p>
 * The collation service provides correct sort orders for most locales supported in ICU.
 * If specific data for a locale is not available, the order eventually falls back
 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
42  * <p>
43  * Sort ordering may be customized by providing your own set of rules. For more on
44  * this subject see the <a href="https://unicode-org.github.io/icu/userguide/collation/customization">
45  * Collation Customization</a> section of the User Guide.
46  * <p>
47  * @see         UCollationResult
48  * @see         UNormalizationMode
49  * @see         UCollationStrength
50  * @see         UCollationElements
51  */
52 
/**
 * A collator, for usage in C programs.
 * Only the forward declaration is given in this header; the functions
 * declared here operate on pointers to UCollator.
 */
struct UCollator;
/**
 * Structure representing a collator object instance.
 * @stable ICU 2.0
 */
typedef struct UCollator UCollator;
61 
62 
/**
 * Possible values for a comparison result, as returned by ucol_strcoll().
 * <ul>
 * <li>UCOL_LESS: the source string compares as less than the target string.</li>
 * <li>UCOL_EQUAL: the source string compares as equal to the target string.</li>
 * <li>UCOL_GREATER: the source string compares as greater than the target string.</li>
 * </ul>
 * @see ucol_strcoll()
 * @stable ICU 2.0
 */
typedef enum {
  /** string a < string b */
  UCOL_LESS = -1,
  /** string a == string b */
  UCOL_EQUAL = 0,
  /** string a > string b */
  UCOL_GREATER = 1
} UCollationResult;
83 
84 
/**
 * Enum containing attribute values for controlling collation behavior.
 * All allowable values are listed here, but not every attribute can take
 * every value. The only universal value is UCOL_DEFAULT, which resets the
 * attribute value to the predefined value for that locale.
 * @stable ICU 2.0
 */
typedef enum {
  /** accepted by most attributes */
  UCOL_DEFAULT = -1,

  /** Primary collation strength */
  UCOL_PRIMARY = 0,
  /** Secondary collation strength */
  UCOL_SECONDARY = 1,
  /** Tertiary collation strength */
  UCOL_TERTIARY = 2,
  /** Default collation strength */
  UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
  /* One past UCOL_DEFAULT_STRENGTH (numerically 3, the same value as UCOL_QUATERNARY). */
  UCOL_CE_STRENGTH_LIMIT = UCOL_DEFAULT_STRENGTH + 1,
  /** Quaternary collation strength */
  UCOL_QUATERNARY = 3,
  /** Identical collation strength */
  UCOL_IDENTICAL = 15,
  /* One past UCOL_IDENTICAL (numerically 16, the same value as UCOL_OFF). */
  UCOL_STRENGTH_LIMIT = UCOL_IDENTICAL + 1,

  /** Turn the feature off - works for UCOL_FRENCH_COLLATION,
      UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE
      & UCOL_DECOMPOSITION_MODE */
  UCOL_OFF = 16,
  /** Turn the feature on - works for UCOL_FRENCH_COLLATION,
      UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE
      & UCOL_DECOMPOSITION_MODE */
  UCOL_ON = 17,

  /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be shifted */
  UCOL_SHIFTED = 20,
  /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be non ignorable */
  UCOL_NON_IGNORABLE = 21,

  /** Valid for UCOL_CASE_FIRST -
      lower case sorts before upper case */
  UCOL_LOWER_FIRST = 24,
  /** Valid for UCOL_CASE_FIRST -
      upper case sorts before lower case */
  UCOL_UPPER_FIRST = 25
} UColAttributeValue;
130 
131 /**
132  * Enum containing the codes for reordering segments of the collation table that are not script
133  * codes. These reordering codes are to be used in conjunction with the script codes.
134  * @see ucol_getReorderCodes
135  * @see ucol_setReorderCodes
136  * @see ucol_getEquivalentReorderCodes
137  * @see UScriptCode
138  * @stable ICU 4.8
139  */
140  typedef enum {
141    /**
142     * A special reordering code that is used to specify the default
143     * reordering codes for a locale.
144     * @stable ICU 4.8
145     */
146     UCOL_REORDER_CODE_DEFAULT       = -1,
147    /**
148     * A special reordering code that is used to specify no reordering codes.
149     * @stable ICU 4.8
150     */
151     UCOL_REORDER_CODE_NONE          = USCRIPT_UNKNOWN,
152    /**
153     * A special reordering code that is used to specify all other codes used for
154     * reordering except for the codes lised as UColReorderCode values and those
155     * listed explicitly in a reordering.
156     * @stable ICU 4.8
157     */
158     UCOL_REORDER_CODE_OTHERS        = USCRIPT_UNKNOWN,
159    /**
160     * Characters with the space property.
161     * This is equivalent to the rule value "space".
162     * @stable ICU 4.8
163     */
164     UCOL_REORDER_CODE_SPACE         = 0x1000,
165    /**
166     * The first entry in the enumeration of reordering groups. This is intended for use in
167     * range checking and enumeration of the reorder codes.
168     * @stable ICU 4.8
169     */
170     UCOL_REORDER_CODE_FIRST         = UCOL_REORDER_CODE_SPACE,
171    /**
172     * Characters with the punctuation property.
173     * This is equivalent to the rule value "punct".
174     * @stable ICU 4.8
175     */
176     UCOL_REORDER_CODE_PUNCTUATION   = 0x1001,
177    /**
178     * Characters with the symbol property.
179     * This is equivalent to the rule value "symbol".
180     * @stable ICU 4.8
181     */
182     UCOL_REORDER_CODE_SYMBOL        = 0x1002,
183    /**
184     * Characters with the currency property.
185     * This is equivalent to the rule value "currency".
186     * @stable ICU 4.8
187     */
188     UCOL_REORDER_CODE_CURRENCY      = 0x1003,
189    /**
190     * Characters with the digit property.
191     * This is equivalent to the rule value "digit".
192     * @stable ICU 4.8
193     */
194     UCOL_REORDER_CODE_DIGIT         = 0x1004
195 } UColReorderCode;
196 
/**
 * Collation strength levels, used to set the strength of a Collator object.
 *
 * Base letter represents a primary difference.  Set comparison
 * level to UCOL_PRIMARY to ignore secondary and tertiary differences.
 * Example of primary difference, "abc" &lt; "abd"
 *
 * Diacritical differences on the same base letter represent a secondary
 * difference.  Set comparison level to UCOL_SECONDARY to ignore tertiary
 * differences.
 * Example of secondary difference, "&auml;" &gt;&gt; "a".
 *
 * Uppercase and lowercase versions of the same character represent a
 * tertiary difference.  Set comparison level to UCOL_TERTIARY to include
 * all comparison differences.
 * Example of tertiary difference, "abc" &lt;&lt;&lt; "ABC".
 *
 * Two characters are considered "identical" when they have the same
 * Unicode spellings (UCOL_IDENTICAL).
 * For example, "&auml;" == "&auml;".
 *
 * UCollationStrength is also used to determine the strength of sort keys
 * generated from UCollator objects.
 * The values can now be found in the UColAttributeValue enum, of which
 * this type is an alias.
 * @stable ICU 2.0
 **/
typedef UColAttributeValue UCollationStrength;
224 
/**
 * Attributes that the collation service understands. Every attribute accepts
 * UCOL_DEFAULT in addition to the values specific to it.
 * @stable ICU 2.0
 */
typedef enum {
    /**
     * Attribute for direction of secondary weights - used in Canadian French.
     * Acceptable values are UCOL_ON, which results in secondary weights
     * being considered backwards, and UCOL_OFF, which treats secondary
     * weights in the order they appear.
     * @stable ICU 2.0
     */
    UCOL_FRENCH_COLLATION = 0,
    /**
     * Attribute for handling variable elements.
     * Acceptable values are UCOL_NON_IGNORABLE (default), which treats all
     * the codepoints with non-ignorable primary weights in the same way,
     * and UCOL_SHIFTED, which causes codepoints with primary weights that
     * are equal or below the variable top value to be ignored on primary
     * level and moved to the quaternary level.
     * @stable ICU 2.0
     */
    UCOL_ALTERNATE_HANDLING = 1,
    /**
     * Controls the ordering of upper and lower case letters.
     * Acceptable values are UCOL_OFF (default), which orders upper and
     * lower case letters in accordance to their tertiary weights,
     * UCOL_UPPER_FIRST, which forces upper case letters to sort before
     * lower case letters, and UCOL_LOWER_FIRST, which does the opposite.
     * @stable ICU 2.0
     */
    UCOL_CASE_FIRST = 2,
    /**
     * Controls whether an extra case level (positioned before the third
     * level) is generated or not. Acceptable values are UCOL_OFF (default),
     * when case level is not generated, and UCOL_ON, which causes the case
     * level to be generated. Contents of the case level are affected by
     * the value of the UCOL_CASE_FIRST attribute. A simple way to ignore
     * accent differences in a string is to set the strength to UCOL_PRIMARY
     * and enable case level.
     * @stable ICU 2.0
     */
    UCOL_CASE_LEVEL = 3,
    /**
     * Controls whether the normalization check and necessary normalizations
     * are performed. When set to UCOL_OFF (default) no normalization check
     * is performed. The correctness of the result is guaranteed only if the
     * input data is in so-called FCD form (see users manual for more info).
     * When set to UCOL_ON, an incremental check is performed to see whether
     * the input data is in the FCD form. If the data is not in the FCD form,
     * incremental NFD normalization is performed.
     * @stable ICU 2.0
     */
    UCOL_NORMALIZATION_MODE = 4,
    /**
     * An alias for the UCOL_NORMALIZATION_MODE attribute.
     * @stable ICU 2.0
     */
    UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE,
    /**
     * The strength attribute. Can be either UCOL_PRIMARY, UCOL_SECONDARY,
     * UCOL_TERTIARY, UCOL_QUATERNARY or UCOL_IDENTICAL. The usual strength
     * for most locales (except Japanese) is tertiary.
     *
     * Quaternary strength is useful when combined with shifted setting for
     * alternate handling attribute and for JIS X 4061 collation, when it is
     * used to distinguish between Katakana and Hiragana.
     * Otherwise, quaternary level is affected only by the number of
     * non-ignorable code points in the string.
     *
     * Identical strength is rarely useful, as it amounts
     * to codepoints of the NFD form of the string.
     * @stable ICU 2.0
     */
    UCOL_STRENGTH = 5,
    /**
     * When turned on, this attribute makes
     * substrings of digits sort according to their numeric values.
     *
     * This is a way to get '100' to sort AFTER '2'. Note that the longest
     * digit substring that can be treated as a single unit is
     * 254 digits (not counting leading zeros). If a digit substring is
     * longer than that, the digits beyond the limit will be treated as a
     * separate digit substring.
     *
     * A "digit" in this sense is a code point with General_Category=Nd,
     * which does not include circled numbers, roman numerals, etc.
     * Only a contiguous digit substring is considered, that is,
     * non-negative integers without separators.
     * There is no support for plus/minus signs, decimals, exponents, etc.
     *
     * The value UCOL_STRENGTH + 1 is deliberately skipped by this definition.
     *
     * @stable ICU 2.8
     */
    UCOL_NUMERIC_COLLATION = UCOL_STRENGTH + 2
} UColAttribute;
318 
/**
 * Options for retrieving the rule string.
 * @stable ICU 2.0
 */
typedef enum {
    /**
     * Retrieves the tailoring rules only.
     * Same as calling the version of getRules() without UColRuleOption.
     * @stable ICU 2.0
     */
    UCOL_TAILORING_ONLY = 0,
    /**
     * Retrieves the "UCA rules" concatenated with the tailoring rules.
     * The "UCA rules" are an <i>approximation</i> of the root collator's sort order.
     * They are almost never used or useful at runtime and can be removed from the data.
     * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales
     * @stable ICU 2.0
     */
    UCOL_FULL_RULES = 1
} UColRuleOption;
338 
/**
 * Open a UCollator for comparing strings.
 *
 * For some languages, multiple collation types are available;
 * for example, "de@collation=phonebook".
 * Starting with ICU 54, collation attributes can be specified via locale keywords as well,
 * in the old locale extension syntax ("el@colCaseFirst=upper")
 * or in language tag syntax ("el-u-kf-upper").
 * See <a href="https://unicode-org.github.io/icu/userguide/collation/api">User Guide: Collation API</a>.
 *
 * The UCollator pointer is used in all the calls to the Collation
 * service. When it is no longer needed, the collator must be disposed of
 * by calling {@link #ucol_close }.
 * @param loc The locale containing the required collation rules.
 *            Special values for locales can be passed in -
 *            if NULL is passed for the locale, the default locale
 *            collation rules will be used. If empty string ("") or
 *            "root" is passed, the root collator will be returned.
 * @param status A pointer to a UErrorCode to receive any errors
 * @return A pointer to a UCollator, or 0 if an error occurred.
 * @see ucol_openRules
 * @see ucol_clone
 * @see ucol_close
 * @stable ICU 2.0
 */
U_CAPI UCollator* U_EXPORT2
ucol_open(const char *loc, UErrorCode *status);
366 
/**
 * Produce a UCollator instance according to the rules supplied.
 * The rules are used to change the default ordering, defined in the
 * UCA in a process called tailoring. The resulting UCollator pointer
 * can be used in the same way as the one obtained by {@link #ucol_open }.
 * @param rules A string describing the collation rules. For the syntax
 *              of the rules please see users guide.
 * @param rulesLength The length of rules, or -1 if null-terminated.
 * @param normalizationMode The normalization mode: One of
 *             UCOL_OFF     (expect the text to not need normalization),
 *             UCOL_ON      (normalize), or
 *             UCOL_DEFAULT (set the mode according to the rules)
 * @param strength The default collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY,
 *                 UCOL_TERTIARY, UCOL_IDENTICAL, UCOL_DEFAULT_STRENGTH - can also be
 *                 set in the rules.
 * @param parseError  A pointer to UParseError to receive information about errors
 *                    that occurred during parsing. This argument can currently be set
 *                    to NULL, but at the user's own risk. Please provide a real structure.
 * @param status A pointer to a UErrorCode to receive any errors
 * @return A pointer to a UCollator. It is not guaranteed that NULL will be returned
 *         in case of error - please use the status argument to check for errors.
 * @see ucol_open
 * @see ucol_clone
 * @see ucol_close
 * @stable ICU 2.0
 */
U_CAPI UCollator* U_EXPORT2
ucol_openRules( const UChar        *rules,
                int32_t            rulesLength,
                UColAttributeValue normalizationMode,
                UCollationStrength strength,
                UParseError        *parseError,
                UErrorCode         *status);
399 
/**
 * Get the contractions and expansions defined by the collator.
 * The expansions include both the root collator's expansions and the
 * expansions defined by the tailoring.
 * @param coll collator
 * @param contractions if not NULL, the set to hold the contractions
 * @param expansions if not NULL, the set to hold the expansions
 * @param addPrefixes add the prefix contextual elements to contractions
 * @param status to hold the error code
 *
 * @stable ICU 3.4
 */
U_CAPI void U_EXPORT2
ucol_getContractionsAndExpansions( const UCollator *coll,
                  USet *contractions, USet *expansions,
                  UBool addPrefixes, UErrorCode *status);
415 
/**
 * Close a UCollator.
 * Once closed, a UCollator must not be used again. Every open collator
 * should be closed; otherwise, a memory leak will result.
 * @param coll The UCollator to close.
 * @see ucol_open
 * @see ucol_openRules
 * @see ucol_clone
 * @stable ICU 2.0
 */
U_CAPI void U_EXPORT2
ucol_close(UCollator *coll);
428 
#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUCollatorPointer
 * "Smart pointer" class, closes a UCollator via ucol_close().
 * For most methods see the LocalPointerBase base class.
 * This section is only compiled when U_SHOW_CPLUSPLUS_API is set.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCollatorPointer, UCollator, ucol_close);

U_NAMESPACE_END

#endif
447 
/**
 * Compare two strings.
 * The strings will be compared using the options already specified
 * on the collator.
 * @param coll The UCollator containing the comparison rules.
 * @param source The source string.
 * @param sourceLength The length of source, or -1 if null-terminated.
 * @param target The target string.
 * @param targetLength The length of target, or -1 if null-terminated.
 * @return The result of comparing the strings; one of UCOL_EQUAL,
 * UCOL_GREATER, UCOL_LESS
 * @see ucol_greater
 * @see ucol_greaterOrEqual
 * @see ucol_equal
 * @stable ICU 2.0
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(    const    UCollator    *coll,
        const    UChar        *source,
        int32_t            sourceLength,
        const    UChar        *target,
        int32_t            targetLength);
469 
/**
 * Compare two strings in UTF-8.
 * The strings will be compared using the options already specified
 * on the collator.
 * Note: When an input string contains a malformed UTF-8 byte sequence,
 * this function treats these bytes as REPLACEMENT CHARACTER (U+FFFD).
 * @param coll The UCollator containing the comparison rules.
 * @param source The source UTF-8 string.
 * @param sourceLength The length of source, or -1 if null-terminated.
 * @param target The target UTF-8 string.
 * @param targetLength The length of target, or -1 if null-terminated.
 * @param status A pointer to a UErrorCode to receive any errors
 * @return The result of comparing the strings; one of UCOL_EQUAL,
 * UCOL_GREATER, UCOL_LESS
 * @see ucol_greater
 * @see ucol_greaterOrEqual
 * @see ucol_equal
 * @stable ICU 50
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollUTF8(
        const UCollator *coll,
        const char      *source,
        int32_t         sourceLength,
        const char      *target,
        int32_t         targetLength,
        UErrorCode      *status);
496 
/**
 * Determine if one string is greater than another.
 * This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER.
 * @param coll The UCollator containing the comparison rules.
 * @param source The source string.
 * @param sourceLength The length of source, or -1 if null-terminated.
 * @param target The target string.
 * @param targetLength The length of target, or -1 if null-terminated.
 * @return true if source is greater than target, false otherwise.
 * @see ucol_strcoll
 * @see ucol_greaterOrEqual
 * @see ucol_equal
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator *coll,
             const UChar     *source, int32_t sourceLength,
             const UChar     *target, int32_t targetLength);
515 
/**
 * Determine if one string is greater than or equal to another.
 * This function is equivalent to {@link #ucol_strcoll } != UCOL_LESS.
 * @param coll The UCollator containing the comparison rules.
 * @param source The source string.
 * @param sourceLength The length of source, or -1 if null-terminated.
 * @param target The target string.
 * @param targetLength The length of target, or -1 if null-terminated.
 * @return true if source is greater than or equal to target, false otherwise.
 * @see ucol_strcoll
 * @see ucol_greater
 * @see ucol_equal
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator *coll,
                    const UChar     *source, int32_t sourceLength,
                    const UChar     *target, int32_t targetLength);
534 
/**
 * Compare two strings for equality.
 * This function is equivalent to {@link #ucol_strcoll } == UCOL_EQUAL.
 * @param coll The UCollator containing the comparison rules.
 * @param source The source string.
 * @param sourceLength The length of source, or -1 if null-terminated.
 * @param target The target string.
 * @param targetLength The length of target, or -1 if null-terminated.
 * @return true if source is equal to target, false otherwise.
 * @see ucol_strcoll
 * @see ucol_greater
 * @see ucol_greaterOrEqual
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator *coll,
           const UChar     *source, int32_t sourceLength,
           const UChar     *target, int32_t targetLength);
553 
554 
/**
 * Get the collation strength used in a UCollator.
 * The strength influences how strings are compared.
 * @param coll The UCollator to query.
 * @return The collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY,
 * UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL
 * @see ucol_setStrength
 * @stable ICU 2.0
 */
U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator *coll);
566 
/**
 * Set the collation strength used in a UCollator.
 * The strength influences how strings are compared.
 * @param coll The UCollator to set.
 * @param strength The desired collation strength; one of UCOL_PRIMARY,
 * UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL, UCOL_DEFAULT
 * @see ucol_getStrength
 * @stable ICU 2.0
 */
U_CAPI void U_EXPORT2
ucol_setStrength(UCollator *coll,
                 UCollationStrength strength);
579 
/**
 * Retrieves the reordering codes for this collator.
 * These reordering codes are a combination of UScript codes and UColReorderCode entries.
 * @param coll The UCollator to query.
 * @param dest The array to fill with the script ordering.
 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
 * will only return the length of the result without writing any codes (pre-flighting).
 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
 * failure before the function call.
 * @return The number of reordering codes written to the dest array
 * (when pre-flighting, the length of the result).
 * @see ucol_setReorderCodes
 * @see ucol_getEquivalentReorderCodes
 * @see UScriptCode
 * @see UColReorderCode
 * @stable ICU 4.8
 */
U_CAPI int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator* coll,
                    int32_t* dest,
                    int32_t destCapacity,
                    UErrorCode *pErrorCode);
/**
 * Sets the reordering codes for this collator.
 * Collation reordering allows scripts and some other groups of characters
 * to be moved relative to each other. This reordering is done on top of
 * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
 * at the start and/or the end of the collation order. These groups are specified using
 * UScript codes and UColReorderCode entries.
 *
 * <p>By default, reordering codes specified for the start of the order are placed in the
 * order given after several special non-script blocks. These special groups of characters
 * are space, punctuation, symbol, currency, and digit. These special groups are represented with
 * UColReorderCode entries. Script groups can be intermingled with
 * these special non-script groups if those special groups are explicitly specified in the reordering.
 *
 * <p>The special code OTHERS stands for any script that is not explicitly
 * mentioned in the list of reordering codes given. Anything that is after OTHERS
 * will go at the very end of the reordering in the order given.
 *
 * <p>The special reorder code DEFAULT will reset the reordering for this collator
 * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
 * was specified when this collator was created from resource data or from rules. The
 * DEFAULT code <b>must</b> be the sole code supplied when it is used.
 * If not, then U_ILLEGAL_ARGUMENT_ERROR will be set.
 *
 * <p>The special reorder code NONE will remove any reordering for this collator.
 * The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
 * NONE code <b>must</b> be the sole code supplied when it is used.
 *
 * @param coll The UCollator to set.
 * @param reorderCodes An array of script codes in the new order. This can be NULL if the
 * length is also set to 0. An empty array will clear any reordering codes on the collator.
 * @param reorderCodesLength The length of reorderCodes.
 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
 * failure before the function call.
 * @see ucol_getReorderCodes
 * @see ucol_getEquivalentReorderCodes
 * @see UScriptCode
 * @see UColReorderCode
 * @stable ICU 4.8
 */
U_CAPI void U_EXPORT2
ucol_setReorderCodes(UCollator* coll,
                    const int32_t* reorderCodes,
                    int32_t reorderCodesLength,
                    UErrorCode *pErrorCode);
646 
/**
 * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
 * codes will be grouped and must reorder together.
 * Beginning with ICU 55, scripts only reorder together if they are primary-equal,
 * for example Hiragana and Katakana.
 *
 * @param reorderCode The reorder code to determine equivalence for.
 * @param dest The array to fill with the script ordering.
 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
 * will only return the length of the result without writing any codes (pre-flighting).
 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate
 * a failure before the function call.
 * @return The number of reordering codes written to the dest array
 * (when pre-flighting, the length of the result).
 * @see ucol_setReorderCodes
 * @see ucol_getReorderCodes
 * @see UScriptCode
 * @see UColReorderCode
 * @stable ICU 4.8
 */
U_CAPI int32_t U_EXPORT2
ucol_getEquivalentReorderCodes(int32_t reorderCode,
                    int32_t* dest,
                    int32_t destCapacity,
                    UErrorCode *pErrorCode);
671 
/**
 * Get the display name for a UCollator.
 * The display name is suitable for presentation to a user.
 * @param objLoc The locale of the collator in question.
 * @param dispLoc The locale for display.
 * @param result A pointer to a buffer to receive the display name.
 * @param resultLength The maximum size of result.
 * @param status A pointer to a UErrorCode to receive any errors
 * @return The total buffer size needed; if greater than resultLength,
 * the output was truncated.
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ucol_getDisplayName(    const    char        *objLoc,
            const    char        *dispLoc,
            UChar             *result,
            int32_t         resultLength,
            UErrorCode        *status);
690 
/**
 * Get a locale for which collation rules are available.
 * A UCollator in a locale returned by this function will perform the correct
 * collation for the locale.
 * @param localeIndex The index of the desired locale; see
 * ucol_countAvailable for the number of available locales.
 * @return A locale for which collation rules are available, or 0 if none.
 * @see ucol_countAvailable
 * @stable ICU 2.0
 */
U_CAPI const char* U_EXPORT2
ucol_getAvailable(int32_t localeIndex);
702 
/**
 * Determine how many locales have collation rules available.
 * This function is most useful for determining the loop ending condition for
 * calls to {@link #ucol_getAvailable }.
 * @return The number of locales for which collation rules are available.
 * @see ucol_getAvailable
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ucol_countAvailable(void);
713 
#if !UCONFIG_NO_SERVICE
/**
 * Create a string enumerator of all locales for which a valid
 * collator may be opened.
 * This declaration is only available when UCONFIG_NO_SERVICE is not set.
 * @param status input-output error code
 * @return a string enumeration over locale strings. The caller is
 * responsible for closing the result.
 * @stable ICU 3.0
 */
U_CAPI UEnumeration* U_EXPORT2
ucol_openAvailableLocales(UErrorCode *status);
#endif
726 
/**
 * Create a string enumerator of all possible keywords that are relevant to
 * collation. At this point, the only recognized keyword for this
 * service is "collation".
 * @param status input-output error code
 * @return a string enumeration over the keyword strings. The caller is
 * responsible for closing the result.
 * @stable ICU 3.0
 */
U_CAPI UEnumeration* U_EXPORT2
ucol_getKeywords(UErrorCode *status);
738 
/**
 * Given a keyword, create a string enumeration of all values
 * for that keyword that are currently in use.
 * @param keyword a particular keyword as enumerated by
 * ucol_getKeywords. If any other keyword is passed in, *status is set
 * to U_ILLEGAL_ARGUMENT_ERROR.
 * @param status input-output error code
 * @return a string enumeration over collation keyword values, or NULL
 * upon error. The caller is responsible for closing the result.
 * @see ucol_getKeywords
 * @stable ICU 3.0
 */
U_CAPI UEnumeration* U_EXPORT2
ucol_getKeywordValues(const char *keyword, UErrorCode *status);
752 
753 /**
754  * Given a key and a locale, returns an array of string values in a preferred
755  * order that would make a difference. These are all and only those values where
756  * the open (creation) of the service with the locale formed from the input locale
757  * plus input keyword and that value has different behavior than creation with the
758  * input locale alone.
759  * @param key           one of the keys supported by this service.  For now, only
760  *                      "collation" is supported.
761  * @param locale        the locale
762  * @param commonlyUsed  if set to true it will return only commonly used values
763  *                      with the given locale in preferred order.  Otherwise,
764  *                      it will return all the available values for the locale.
765  * @param status error status
766  * @return a string enumeration over keyword values for the given key and the locale.
767  * @stable ICU 4.2
768  */
769 U_CAPI UEnumeration* U_EXPORT2
770 ucol_getKeywordValuesForLocale(const char* key,
771                                const char* locale,
772                                UBool commonlyUsed,
773                                UErrorCode* status);
774 
775 /**
776  * Return the functionally equivalent locale for the specified
777  * input locale, with respect to given keyword, for the
778  * collation service. If two different input locale + keyword
779  * combinations produce the same result locale, then collators
780  * instantiated for these two different input locales will behave
781  * equivalently. The converse is not always true; two collators
782  * may in fact be equivalent, but return different results, due to
783  * internal details. The return result has no other meaning than
784  * that stated above, and implies nothing as to the relationship
785  * between the two locales. This is intended for use by
786  * applications who wish to cache collators, or otherwise reuse
787  * collators when possible. The functional equivalent may change
788  * over time. For more information, please see the <a
789  * href="https://unicode-org.github.io/icu/userguide/locale#locales-and-services">
790  * Locales and Services</a> section of the ICU User Guide.
791  * @param result fillin for the functionally equivalent result locale
792  * @param resultCapacity capacity of the fillin buffer
793  * @param keyword a particular keyword as enumerated by
794  * ucol_getKeywords.
795  * @param locale the specified input locale
796  * @param isAvailable if non-NULL, pointer to a fillin parameter that
797  * on return indicates whether the specified input locale was 'available'
798  * to the collation service. A locale is defined as 'available' if it
799  * physically exists within the collation locale data.
800  * @param status pointer to input-output error code
801  * @return the actual buffer size needed for the locale. If greater
802  * than resultCapacity, the returned full name will be truncated and
803  * an error code will be returned.
804  * @stable ICU 3.0
805  */
806 U_CAPI int32_t U_EXPORT2
807 ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity,
808                              const char* keyword, const char* locale,
809                              UBool* isAvailable, UErrorCode* status);
810 
811 /**
812  * Get the collation tailoring rules from a UCollator.
813  * The rules will follow the rule syntax.
814  * @param coll The UCollator to query.
815  * @param length
816  * @return The collation tailoring rules.
817  * @stable ICU 2.0
818  */
819 U_CAPI const UChar* U_EXPORT2
820 ucol_getRules(    const    UCollator    *coll,
821         int32_t            *length);
822 
823 /**
824  * Get a sort key for a string from a UCollator.
825  * Sort keys may be compared using <TT>strcmp</TT>.
826  *
827  * Note that sort keys are often less efficient than simply doing comparison.
828  * For more details, see the ICU User Guide.
829  *
830  * Like ICU functions that write to an output buffer, the buffer contents
831  * is undefined if the buffer capacity (resultLength parameter) is too small.
832  * Unlike ICU functions that write a string to an output buffer,
833  * the terminating zero byte is counted in the sort key length.
834  * @param coll The UCollator containing the collation rules.
835  * @param source The string to transform.
836  * @param sourceLength The length of source, or -1 if null-terminated.
837  * @param result A pointer to a buffer to receive the attribute.
838  * @param resultLength The maximum size of result.
839  * @return The size needed to fully store the sort key.
840  *      If there was an internal error generating the sort key,
841  *      a zero value is returned.
842  * @see ucol_keyHashCode
843  * @stable ICU 2.0
844  */
845 U_CAPI int32_t U_EXPORT2
846 ucol_getSortKey(const    UCollator    *coll,
847         const    UChar        *source,
848         int32_t        sourceLength,
849         uint8_t        *result,
850         int32_t        resultLength);
851 
/**
 * Bound type selector taken by the ucol_getBound API
 * (see ucol_getBound for an explanation of each mode).
 * Do not change the values assigned to the members of this enum:
 * the underlying code depends on them having these numbers.
 * @see ucol_getBound
 * @stable ICU 2.0
 */
typedef enum {
  /** lower bound */
  UCOL_BOUND_LOWER = 0,
  /** upper bound that will match strings of exact size */
  UCOL_BOUND_UPPER = 1,
  /** upper bound that will match all the strings that have the same initial substring as the given string */
  UCOL_BOUND_UPPER_LONG = 2
} UColBoundMode;
867 
868 /**
869  * Produce a bound for a given sortkey and a number of levels.
870  * Return value is always the number of bytes needed, regardless of
871  * whether the result buffer was big enough or even valid.<br>
872  * Resulting bounds can be used to produce a range of strings that are
873  * between upper and lower bounds. For example, if bounds are produced
874  * for a sortkey of string "smith", strings between upper and lower
875  * bounds with one level would include "Smith", "SMITH", "sMiTh".<br>
876  * There are two upper bounds that can be produced. If UCOL_BOUND_UPPER
877  * is produced, strings matched would be as above. However, if bound
878  * produced using UCOL_BOUND_UPPER_LONG is used, the above example will
879  * also match "Smithsonian" and similar.<br>
880  * For more on usage, see example in cintltst/capitst.c in procedure
881  * TestBounds.
882  * Sort keys may be compared using <TT>strcmp</TT>.
883  * @param source The source sortkey.
884  * @param sourceLength The length of source, or -1 if null-terminated.
885  *                     (If an unmodified sortkey is passed, it is always null
886  *                      terminated).
887  * @param boundType Type of bound required. It can be UCOL_BOUND_LOWER, which
888  *                  produces a lower inclusive bound, UCOL_BOUND_UPPER, that
889  *                  produces upper bound that matches strings of the same length
890  *                  or UCOL_BOUND_UPPER_LONG that matches strings that have the
891  *                  same starting substring as the source string.
892  * @param noOfLevels  Number of levels required in the resulting bound (for most
893  *                    uses, the recommended value is 1). See users guide for
894  *                    explanation on number of levels a sortkey can have.
895  * @param result A pointer to a buffer to receive the resulting sortkey.
896  * @param resultLength The maximum size of result.
897  * @param status Used for returning error code if something went wrong. If the
898  *               number of levels requested is higher than the number of levels
899  *               in the source key, a warning (U_SORT_KEY_TOO_SHORT_WARNING) is
900  *               issued.
901  * @return The size needed to fully store the bound.
902  * @see ucol_keyHashCode
903  * @stable ICU 2.1
904  */
905 U_CAPI int32_t U_EXPORT2
906 ucol_getBound(const uint8_t       *source,
907         int32_t             sourceLength,
908         UColBoundMode       boundType,
909         uint32_t            noOfLevels,
910         uint8_t             *result,
911         int32_t             resultLength,
912         UErrorCode          *status);
913 
914 
915 /**
916  * Merges two sort keys. The levels are merged with their corresponding counterparts
917  * (primaries with primaries, secondaries with secondaries etc.). Between the values
918  * from the same level a separator is inserted.
919  *
920  * This is useful, for example, for combining sort keys from first and last names
921  * to sort such pairs.
922  * See http://www.unicode.org/reports/tr10/#Merging_Sort_Keys
923  *
924  * The recommended way to achieve "merged" sorting is by
925  * concatenating strings with U+FFFE between them.
926  * The concatenation has the same sort order as the merged sort keys,
927  * but merge(getSortKey(str1), getSortKey(str2)) may differ from getSortKey(str1 + '\\uFFFE' + str2).
928  * Using strings with U+FFFE may yield shorter sort keys.
929  *
930  * For details about Sort Key Features see
931  * https://unicode-org.github.io/icu/userguide/collation/api#sort-key-features
932  *
933  * It is possible to merge multiple sort keys by consecutively merging
934  * another one with the intermediate result.
935  *
936  * The length of the merge result is the sum of the lengths of the input sort keys.
937  *
938  * Example (uncompressed):
939  * <pre>191B1D 01 050505 01 910505 00
940  * 1F2123 01 050505 01 910505 00</pre>
941  * will be merged as
942  * <pre>191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00</pre>
943  *
944  * If the destination buffer is not big enough, then its contents are undefined.
945  * If any of source lengths are zero or any of the source pointers are NULL/undefined,
946  * the result is of size zero.
947  *
948  * @param src1 the first sort key
949  * @param src1Length the length of the first sort key, including the zero byte at the end;
950  *        can be -1 if the function is to find the length
951  * @param src2 the second sort key
952  * @param src2Length the length of the second sort key, including the zero byte at the end;
953  *        can be -1 if the function is to find the length
954  * @param dest the buffer where the merged sort key is written,
955  *        can be NULL if destCapacity==0
956  * @param destCapacity the number of bytes in the dest buffer
957  * @return the length of the merged sort key, src1Length+src2Length;
958  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
959  *         in which cases the contents of dest is undefined
960  * @stable ICU 2.0
961  */
962 U_CAPI int32_t U_EXPORT2
963 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
964                    const uint8_t *src2, int32_t src2Length,
965                    uint8_t *dest, int32_t destCapacity);
966 
967 /**
968  * Universal attribute setter
969  * @param coll collator which attributes are to be changed
970  * @param attr attribute type
971  * @param value attribute value
972  * @param status to indicate whether the operation went on smoothly or there were errors
973  * @see UColAttribute
974  * @see UColAttributeValue
975  * @see ucol_getAttribute
976  * @stable ICU 2.0
977  */
978 U_CAPI void U_EXPORT2
979 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status);
980 
981 /**
982  * Universal attribute getter
983  * @param coll collator which attributes are to be changed
984  * @param attr attribute type
985  * @return attribute value
986  * @param status to indicate whether the operation went on smoothly or there were errors
987  * @see UColAttribute
988  * @see UColAttributeValue
989  * @see ucol_setAttribute
990  * @stable ICU 2.0
991  */
992 U_CAPI UColAttributeValue  U_EXPORT2
993 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status);
994 
995 /**
996  * Sets the variable top to the top of the specified reordering group.
997  * The variable top determines the highest-sorting character
998  * which is affected by UCOL_ALTERNATE_HANDLING.
999  * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
1000  * @param coll the collator
1001  * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
1002  *              UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
1003  *              or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
1004  * @param pErrorCode Standard ICU error code. Its input value must
1005  *                   pass the U_SUCCESS() test, or else the function returns
1006  *                   immediately. Check for U_FAILURE() on output or use with
1007  *                   function chaining. (See User Guide for details.)
1008  * @see ucol_getMaxVariable
1009  * @stable ICU 53
1010  */
1011 U_CAPI void U_EXPORT2
1012 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode);
1013 
1014 /**
1015  * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
1016  * @param coll the collator
1017  * @return the maximum variable reordering group.
1018  * @see ucol_setMaxVariable
1019  * @stable ICU 53
1020  */
1021 U_CAPI UColReorderCode U_EXPORT2
1022 ucol_getMaxVariable(const UCollator *coll);
1023 
1024 /**
1025  * Gets the variable top value of a Collator.
1026  * @param coll collator which variable top needs to be retrieved
1027  * @param status error code (not changed by function). If error code is set,
1028  *               the return value is undefined.
1029  * @return the variable top primary weight
1030  * @see ucol_getMaxVariable
1031  * @see ucol_setVariableTop
1032  * @see ucol_restoreVariableTop
1033  * @stable ICU 2.0
1034  */
1035 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status);
1036 
1037 /**
1038  * Thread safe cloning operation. The result is a clone of a given collator.
1039  * @param coll collator to be cloned
1040  * @param status to indicate whether the operation went on smoothly or there were errors
1041  * @return pointer to the new clone
1042  * @see ucol_open
1043  * @see ucol_openRules
1044  * @see ucol_close
1045  * @stable ICU 71
1046  */
1047 U_CAPI UCollator* U_EXPORT2 ucol_clone(const UCollator *coll, UErrorCode *status);
1048 
1049 /**
1050  * Returns current rules. Delta defines whether full rules are returned or just the tailoring.
1051  * Returns number of UChars needed to store rules. If buffer is NULL or bufferLen is not enough
1052  * to store rules, will store up to available space.
1053  *
1054  * ucol_getRules() should normally be used instead.
1055  * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales
1056  * @param coll collator to get the rules from
1057  * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
1058  * @param buffer buffer to store the result in. If NULL, you'll get no rules.
1059  * @param bufferLen length of buffer to store rules in. If less than needed you'll get only the part that fits in.
1060  * @return current rules
1061  * @stable ICU 2.0
1062  * @see UCOL_FULL_RULES
1063  */
1064 U_CAPI int32_t U_EXPORT2
1065 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen);
1066 
1067 /**
1068  * gets the locale name of the collator. If the collator
1069  * is instantiated from the rules, then this function returns
1070  * NULL.
1071  * @param coll The UCollator for which the locale is needed
1072  * @param type You can choose between requested, valid and actual
1073  *             locale. For description see the definition of
1074  *             ULocDataLocaleType in uloc.h
1075  * @param status error code of the operation
1076  * @return real locale name from which the collation data comes.
1077  *         If the collator was instantiated from rules, returns
1078  *         NULL.
1079  * @stable ICU 2.8
1080  */
1081 U_CAPI const char * U_EXPORT2
1082 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status);
1083 
1084 /**
1085  * Get a Unicode set that contains all the characters and sequences tailored in
1086  * this collator. The result must be disposed of by using uset_close.
1087  * @param coll        The UCollator for which we want to get tailored chars
1088  * @param status      error code of the operation
1089  * @return a pointer to newly created USet. Must be be disposed by using uset_close
1090  * @see ucol_openRules
1091  * @see uset_close
1092  * @stable ICU 2.4
1093  */
1094 U_CAPI USet * U_EXPORT2
1095 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status);
1096 
/** Creates a binary image of a collator. This binary image can be stored and
 *  later used to instantiate a collator using ucol_openBinary.
 *  This API supports preflighting.
 *  @param coll Collator of which the binary image is created
 *  @param buffer a fill-in buffer to receive the binary image
 *  @param capacity capacity of the destination buffer
 *  @param status for catching errors
 *  @return size of the image
 *  @see ucol_openBinary
 *  @stable ICU 3.2
 */
U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator *coll,
                 uint8_t *buffer, int32_t capacity,
                 UErrorCode *status);
1112 
/** Opens a collator from a collator binary image created using
 *  ucol_cloneBinary. The binary image used in the instantiation of the
 *  collator remains owned by the user and should stay around for
 *  the lifetime of the collator. The API also takes a base collator
 *  which must be the root collator.
 *  @param bin binary image owned by the user and required through the
 *             lifetime of the collator
 *  @param length size of the image. If negative, the API will try to
 *                figure out the length of the image
 *  @param base Base collator, for lookup of untailored characters.
 *              Must be the root collator, must not be NULL.
 *              The base is required to be present through the lifetime of the collator.
 *  @param status for catching errors
 *  @return newly created collator
 *  @see ucol_cloneBinary
 *  @stable ICU 3.2
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status);
1134 
1135 
1136 #endif /* #if !UCONFIG_NO_COLLATION */
1137 
1138 #endif
1139