1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (c) 1996-2015, International Business Machines Corporation and others.
6* All Rights Reserved.
7*******************************************************************************
8*/
9
10#ifndef UCOL_H
11#define UCOL_H
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_COLLATION
16
17#include "unicode/parseerr.h"
18#include "unicode/uloc.h"
19#include "unicode/uset.h"
20#include "unicode/uscript.h"
21
22#if U_SHOW_CPLUSPLUS_API
23#include "unicode/localpointer.h"
24#endif   // U_SHOW_CPLUSPLUS_API
25
26/**
27 * \file
28 * \brief C API: Collator
29 *
30 * <h2> Collator C API </h2>
31 *
32 * The C API for Collator performs locale-sensitive
33 * string comparison. You use this service to build
34 * searching and sorting routines for natural language text.
35 * <p>
36 * For more information about the collation service see
37 * <a href="https://unicode-org.github.io/icu/userguide/collation">the User Guide</a>.
38 * <p>
39 * Collation service provides correct sorting orders for most locales supported in ICU.
40 * If specific data for a locale is not available, the order eventually falls back
41 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
42 * <p>
43 * Sort ordering may be customized by providing your own set of rules. For more on
44 * this subject see the <a href="https://unicode-org.github.io/icu/userguide/collation/customization">
45 * Collation Customization</a> section of the User Guide.
46 * <p>
47 * @see         UCollationResult
48 * @see         UNormalizationMode
49 * @see         UCollationStrength
50 * @see         UCollationElements
51 */
52
53/** A collator. Only forward-declared in this header, so it is used through pointers.
54*  For usage in C programs.
55*/
56struct UCollator;
57/** Structure representing a collator object instance; see ucol_open(), ucol_openRules() and ucol_close().
58 * @stable ICU 2.0
59 */
60typedef struct UCollator UCollator;
61
62
63/**
64 * Possible values for a comparison result.
65 * <p>
66 * UCOL_LESS is returned if source string is compared to be less than target
67 * string in the ucol_strcoll() method.
68 * UCOL_EQUAL is returned if source string is compared to be equal to target
69 * string in the ucol_strcoll() method.
70 * UCOL_GREATER is returned if source string is compared to be greater than
71 * target string in the ucol_strcoll() method.
72 * @see ucol_strcoll()
73 * @stable ICU 2.0
74 */
75typedef enum {
76  /** source string == target string */
77  UCOL_EQUAL    = 0,
78  /** source string > target string */
79  UCOL_GREATER    = 1,
80  /** source string < target string */
81  UCOL_LESS    = -1
82} UCollationResult ;
83
84
85/** Enum containing attribute values for controlling collation behavior.
86 * Here are all the allowable values. Not every attribute can take every value. The only
87 * universal value is UCOL_DEFAULT, which resets the attribute value to the predefined
88 * value for that locale.
89 * @stable ICU 2.0
90 */
91typedef enum {
92  /** accepted by most attributes */
93  UCOL_DEFAULT = -1,
94
95  /** Primary collation strength */
96  UCOL_PRIMARY = 0,
97  /** Secondary collation strength */
98  UCOL_SECONDARY = 1,
99  /** Tertiary collation strength */
100  UCOL_TERTIARY = 2,
101  /** Default collation strength */
102  UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
103  UCOL_CE_STRENGTH_LIMIT, /**< No explicit value: one more than UCOL_DEFAULT_STRENGTH, i.e. 3, the same value as UCOL_QUATERNARY */
104  /** Quaternary collation strength */
105  UCOL_QUATERNARY=3,
106  /** Identical collation strength */
107  UCOL_IDENTICAL=15,
108  UCOL_STRENGTH_LIMIT, /**< No explicit value: one more than UCOL_IDENTICAL, i.e. 16, the same value as UCOL_OFF */
109
110  /** Turn the feature off - works for UCOL_FRENCH_COLLATION, UCOL_CASE_FIRST,
111      UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE
112      & UCOL_DECOMPOSITION_MODE*/
113  UCOL_OFF = 16,
114  /** Turn the feature on - works for UCOL_FRENCH_COLLATION,
115      UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE
116      & UCOL_DECOMPOSITION_MODE*/
117  UCOL_ON = 17,
118
119  /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be shifted */
120  UCOL_SHIFTED = 20,
121  /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be non ignorable */
122  UCOL_NON_IGNORABLE = 21,
123
124  /** Valid for UCOL_CASE_FIRST -
125      lower case sorts before upper case */
126  UCOL_LOWER_FIRST = 24,
127  /** Valid for UCOL_CASE_FIRST - upper case sorts before lower case */
128  UCOL_UPPER_FIRST = 25
129} UColAttributeValue;
130
131/**
132 * Enum containing the codes for reordering segments of the collation table that are not script
133 * codes. These reordering codes are to be used in conjunction with the script codes.
134 * @see ucol_getReorderCodes
135 * @see ucol_setReorderCodes
136 * @see ucol_getEquivalentReorderCodes
137 * @see UScriptCode
138 * @stable ICU 4.8
139 */
140 typedef enum {
141   /**
142    * A special reordering code that is used to specify the default
143    * reordering codes for a locale.
144    * @stable ICU 4.8
145    */
146    UCOL_REORDER_CODE_DEFAULT       = -1,
147   /**
148    * A special reordering code that is used to specify no reordering codes.
149    * @stable ICU 4.8
150    */
151    UCOL_REORDER_CODE_NONE          = USCRIPT_UNKNOWN,
152   /**
153    * A special reordering code that is used to specify all other codes used for
154    * reordering except for the codes listed as UColReorderCode values and those
155    * listed explicitly in a reordering. Same numeric value as UCOL_REORDER_CODE_NONE.
156    * @stable ICU 4.8
157    */
158    UCOL_REORDER_CODE_OTHERS        = USCRIPT_UNKNOWN,
159   /**
160    * Characters with the space property.
161    * This is equivalent to the rule value "space".
162    * @stable ICU 4.8
163    */
164    UCOL_REORDER_CODE_SPACE         = 0x1000,
165   /**
166    * The first entry in the enumeration of reordering groups. This is intended for use in
167    * range checking and enumeration of the reorder codes.
168    * @stable ICU 4.8
169    */
170    UCOL_REORDER_CODE_FIRST         = UCOL_REORDER_CODE_SPACE,
171   /**
172    * Characters with the punctuation property.
173    * This is equivalent to the rule value "punct".
174    * @stable ICU 4.8
175    */
176    UCOL_REORDER_CODE_PUNCTUATION   = 0x1001,
177   /**
178    * Characters with the symbol property.
179    * This is equivalent to the rule value "symbol".
180    * @stable ICU 4.8
181    */
182    UCOL_REORDER_CODE_SYMBOL        = 0x1002,
183   /**
184    * Characters with the currency property.
185    * This is equivalent to the rule value "currency".
186    * @stable ICU 4.8
187    */
188    UCOL_REORDER_CODE_CURRENCY      = 0x1003,
189   /**
190    * Characters with the digit property.
191    * This is equivalent to the rule value "digit".
192    * @stable ICU 4.8
193    */
194    UCOL_REORDER_CODE_DIGIT         = 0x1004
195} UColReorderCode;
196
197/**
198 * Base letter represents a primary difference.  Set comparison
199 * level to UCOL_PRIMARY to ignore secondary and tertiary differences.
200 * Use this to set the strength of a Collator object.
201 * Example of primary difference, "abc" &lt; "abd"
202 *
203 * Diacritical differences on the same base letter represent a secondary
204 * difference.  Set comparison level to UCOL_SECONDARY to ignore tertiary
205 * differences. Use this to set the strength of a Collator object.
206 * Example of secondary difference, "&auml;" &gt;&gt; "a".
207 *
208 * Uppercase and lowercase versions of the same character represent a
209 * tertiary difference.  Set comparison level to UCOL_TERTIARY to include
210 * all comparison differences. Use this to set the strength of a Collator
211 * object.
212 * Example of tertiary difference, "abc" &lt;&lt;&lt; "ABC".
213 *
214 * Two characters are considered "identical" when they have the same
215 * unicode spellings (UCOL_IDENTICAL).
216 * For example, "&auml;" == "&auml;".
217 *
218 * UCollationStrength is also used to determine the strength of sort keys
219 * generated from UCollator objects.
220 * These values can now be found in the UColAttributeValue enum.
221 * @stable ICU 2.0
222 **/
223typedef UColAttributeValue UCollationStrength;
224
225/** Attributes that collation service understands. All the attributes can take UCOL_DEFAULT
226 * value, as well as the values specific to each one.
227 * @stable ICU 2.0
228 */
229typedef enum {
230     /** Attribute for direction of secondary weights - used in Canadian French.
231      * Acceptable values are UCOL_ON, which results in secondary weights
232      * being considered backwards and UCOL_OFF which treats secondary
233      * weights in the order they appear.
234      * @stable ICU 2.0
235      */
236     UCOL_FRENCH_COLLATION,
237     /** Attribute for handling variable elements.
238      * Acceptable values are UCOL_NON_IGNORABLE (default)
239      * which treats all the codepoints with non-ignorable
240      * primary weights in the same way,
241      * and UCOL_SHIFTED which causes codepoints with primary
242      * weights that are equal or below the variable top value
243      * to be ignored on primary level and moved to the quaternary
244      * level.
245      * @stable ICU 2.0
246      */
247     UCOL_ALTERNATE_HANDLING,
248     /** Controls the ordering of upper and lower case letters.
249      * Acceptable values are UCOL_OFF (default), which orders
250      * upper and lower case letters in accordance to their tertiary
251      * weights, UCOL_UPPER_FIRST which forces upper case letters to
252      * sort before lower case letters, and UCOL_LOWER_FIRST which does
253      * the opposite.
254      * @stable ICU 2.0
255      */
256     UCOL_CASE_FIRST,
257     /** Controls whether an extra case level (positioned before the third
258      * level) is generated or not. Acceptable values are UCOL_OFF (default),
259      * when case level is not generated, and UCOL_ON which causes the case
260      * level to be generated. Contents of the case level are affected by
261      * the value of UCOL_CASE_FIRST attribute. A simple way to ignore
262      * accent differences in a string is to set the strength to UCOL_PRIMARY
263      * and enable case level.
264      * @stable ICU 2.0
265      */
266     UCOL_CASE_LEVEL,
267     /** Controls whether the normalization check and necessary normalizations
268      * are performed. When set to UCOL_OFF (default) no normalization check
269      * is performed. The correctness of the result is guaranteed only if the
270      * input data is in so-called FCD form (see the User Guide for more info).
271      * When set to UCOL_ON, an incremental check is performed to see whether
272      * the input data is in the FCD form. If the data is not in the FCD form,
273      * incremental NFD normalization is performed.
274      * @stable ICU 2.0
275      */
276     UCOL_NORMALIZATION_MODE,
277     /** An alias for UCOL_NORMALIZATION_MODE attribute.
278      * @stable ICU 2.0
279      */
280     UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE,
281     /** The strength attribute. Can be either UCOL_PRIMARY, UCOL_SECONDARY,
282      * UCOL_TERTIARY, UCOL_QUATERNARY or UCOL_IDENTICAL. The usual strength
283      * for most locales (except Japanese) is tertiary.
284      *
285      * Quaternary strength
286      * is useful when combined with shifted setting for alternate handling
287      * attribute and for JIS X 4061 collation, when it is used to distinguish
288      * between Katakana and Hiragana.
289      * Otherwise, quaternary level
290      * is affected only by the number of non-ignorable code points in
291      * the string.
292      *
293      * Identical strength is rarely useful, as it amounts
294      * to codepoints of the NFD form of the string.
295      * @stable ICU 2.0
296      */
297     UCOL_STRENGTH,
298     /**
299      * When turned on, this attribute makes
300      * substrings of digits sort according to their numeric values.
301      *
302      * This is a way to get '100' to sort AFTER '2'. Note that the longest
303      * digit substring that can be treated as a single unit is
304      * 254 digits (not counting leading zeros). If a digit substring is
305      * longer than that, the digits beyond the limit will be treated as a
306      * separate digit substring.
307      *
308      * A "digit" in this sense is a code point with General_Category=Nd,
309      * which does not include circled numbers, roman numerals, etc.
310      * Only a contiguous digit substring is considered, that is,
311      * non-negative integers without separators.
312      * There is no support for plus/minus signs, decimals, exponents, etc.
313      * Note: the value UCOL_STRENGTH + 1 is skipped by the enumerators listed in this enum.
314      * @stable ICU 2.8
315      */
316     UCOL_NUMERIC_COLLATION = UCOL_STRENGTH + 2
317} UColAttribute;
318
319/** Options for retrieving the rule string.
320 *  @stable ICU 2.0
321 */
322typedef enum {
323  /**
324   * Retrieves the tailoring rules only.
325   * Same as calling the version of getRules() without UColRuleOption.
326   * @stable ICU 2.0
327   */
328  UCOL_TAILORING_ONLY,
329  /**
330   * Retrieves the "UCA rules" concatenated with the tailoring rules.
331   * The "UCA rules" are an <i>approximation</i> of the root collator's sort order.
332   * They are almost never used or useful at runtime and can be removed from the data.
333   * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales
334   * @stable ICU 2.0
335   */
336  UCOL_FULL_RULES
337} UColRuleOption ;
338
339/**
340 * Open a UCollator for comparing strings.
341 *
342 * For some languages, multiple collation types are available;
343 * for example, "de@collation=phonebook".
344 * Starting with ICU 54, collation attributes can be specified via locale keywords as well,
345 * in the old locale extension syntax ("el@colCaseFirst=upper")
346 * or in language tag syntax ("el-u-kf-upper").
347 * See <a href="https://unicode-org.github.io/icu/userguide/collation/api">User Guide: Collation API</a>.
348 *
349 * The UCollator pointer is used in all the calls to the Collation
350 * service. When finished, the collator must be disposed of by calling
351 * {@link #ucol_close }.
352 * @param loc The locale containing the required collation rules.
353 *            Special values for locales can be passed in -
354 *            if NULL is passed for the locale, the default locale
355 *            collation rules will be used. If empty string ("") or
356 *            "root" are passed, the root collator will be returned.
357 * @param status A pointer to a UErrorCode to receive any errors
358 * @return A pointer to a UCollator, or NULL if an error occurred.
359 * @see ucol_openRules
360 * @see ucol_clone
361 * @see ucol_close
362 * @stable ICU 2.0
363 */
364U_CAPI UCollator* U_EXPORT2
365ucol_open(const char *loc, UErrorCode *status);
366
367/**
368 * Produce a UCollator instance according to the rules supplied.
369 * The rules are used to change the default ordering, defined in the
370 * UCA in a process called tailoring. The resulting UCollator pointer
371 * can be used in the same way as the one obtained by {@link #ucol_open }.
372 * @param rules A string describing the collation rules. For the syntax
373 *              of the rules please see the User Guide.
374 * @param rulesLength The length of rules, or -1 if null-terminated.
375 * @param normalizationMode The normalization mode: One of
376 *             UCOL_OFF     (expect the text to not need normalization),
377 *             UCOL_ON      (normalize), or
378 *             UCOL_DEFAULT (set the mode according to the rules)
379 * @param strength The default collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY,
380 * UCOL_TERTIARY, UCOL_IDENTICAL, UCOL_DEFAULT_STRENGTH - can be also set in the rules.
381 * @param parseError  A pointer to UParseError to receive information about errors
382 *                    that occurred during parsing. This argument can currently be set
383 *                    to NULL, but at the user's own risk. Please provide a real structure.
384 * @param status A pointer to a UErrorCode to receive any errors
385 * @return A pointer to a UCollator. It is not guaranteed that NULL will be returned in case
386 *         of error - please use status argument to check for errors.
387 * @see ucol_open
388 * @see ucol_clone
389 * @see ucol_close
390 * @stable ICU 2.0
391 */
392U_CAPI UCollator* U_EXPORT2
393ucol_openRules( const UChar        *rules,
394                int32_t            rulesLength,
395                UColAttributeValue normalizationMode,
396                UCollationStrength strength,
397                UParseError        *parseError,
398                UErrorCode         *status);
399
400/**
401 * Get sets containing the contractions and/or expansions defined by the collator. The expansions set
402 * includes both the root collator's expansions and the expansions defined by the tailoring.
403 * @param coll collator
404 * @param contractions if not NULL, the set to hold the contractions
405 * @param expansions if not NULL, the set to hold the expansions
406 * @param addPrefixes add the prefix contextual elements to contractions
407 * @param status to hold the error code
408 *
409 * @stable ICU 3.4
410 */
411U_CAPI void U_EXPORT2
412ucol_getContractionsAndExpansions( const UCollator *coll,
413                  USet *contractions, USet *expansions,
414                  UBool addPrefixes, UErrorCode *status);
415
416/**
417 * Close a UCollator.
418 * Once closed, a UCollator should not be used. Every open collator should
419 * be closed; otherwise, a memory leak will result.
420 * @param coll The UCollator to close.
421 * @see ucol_open
422 * @see ucol_openRules
423 * @see ucol_clone
424 * @stable ICU 2.0
425 */
426U_CAPI void U_EXPORT2
427ucol_close(UCollator *coll);
428
429#if U_SHOW_CPLUSPLUS_API
430
431U_NAMESPACE_BEGIN
432
433/**
434 * \class LocalUCollatorPointer
435 * "Smart pointer" class, closes a UCollator via ucol_close().
436 * For most methods see the LocalPointerBase base class.
437 * Only declared when U_SHOW_CPLUSPLUS_API is enabled.
438 * @see LocalPointerBase
439 * @see LocalPointer
440 * @stable ICU 4.4
441 */
442U_DEFINE_LOCAL_OPEN_POINTER(LocalUCollatorPointer, UCollator, ucol_close);
443
444U_NAMESPACE_END
445
446#endif   // U_SHOW_CPLUSPLUS_API
447
448/**
449 * Compare two strings.
450 * The strings will be compared using the options already specified.
451 * @param coll The UCollator containing the comparison rules.
452 * @param source The source string.
453 * @param sourceLength The length of source, or -1 if null-terminated.
454 * @param target The target string.
455 * @param targetLength The length of target, or -1 if null-terminated.
456 * @return The result of comparing the strings; one of UCOL_EQUAL (source equals target),
457 * UCOL_GREATER (source is greater than target), UCOL_LESS (source is less than target)
458 * @see ucol_greater
459 * @see ucol_greaterOrEqual
460 * @see ucol_equal
461 * @stable ICU 2.0
462 */
463U_CAPI UCollationResult U_EXPORT2
464ucol_strcoll(    const    UCollator    *coll,
465        const    UChar        *source,
466        int32_t            sourceLength,
467        const    UChar        *target,
468        int32_t            targetLength);
469
470/**
471* Compare two strings in UTF-8.
472* The strings will be compared using the options already specified.
473* Note: When an input string contains a malformed UTF-8 byte sequence,
474* this function treats these bytes as REPLACEMENT CHARACTER (U+FFFD).
475* @param coll The UCollator containing the comparison rules.
476* @param source The source UTF-8 string.
477* @param sourceLength The length of source, or -1 if null-terminated.
478* @param target The target UTF-8 string.
479* @param targetLength The length of target, or -1 if null-terminated.
480* @param status A pointer to a UErrorCode to receive any errors
481* @return The result of comparing the strings; one of UCOL_EQUAL,
482* UCOL_GREATER, UCOL_LESS
483* @see ucol_greater
484* @see ucol_greaterOrEqual
485* @see ucol_equal
486* @stable ICU 50
487*/
488U_CAPI UCollationResult U_EXPORT2
489ucol_strcollUTF8(
490        const UCollator *coll,
491        const char      *source,
492        int32_t         sourceLength,
493        const char      *target,
494        int32_t         targetLength,
495        UErrorCode      *status);
496
497/**
498 * Determine if one string is greater than another.
499 * This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER.
500 * @param coll The UCollator containing the comparison rules.
501 * @param source The source string.
502 * @param sourceLength The length of source, or -1 if null-terminated.
503 * @param target The target string.
504 * @param targetLength The length of target, or -1 if null-terminated.
505 * @return true if source is greater than target, false otherwise.
506 * @see ucol_strcoll
507 * @see ucol_greaterOrEqual
508 * @see ucol_equal
509 * @stable ICU 2.0
510 */
511U_CAPI UBool U_EXPORT2
512ucol_greater(const UCollator *coll,
513             const UChar     *source, int32_t sourceLength,
514             const UChar     *target, int32_t targetLength);
515
516/**
517 * Determine if one string is greater than or equal to another.
518 * This function is equivalent to {@link #ucol_strcoll } != UCOL_LESS.
519 * @param coll The UCollator containing the comparison rules.
520 * @param source The source string.
521 * @param sourceLength The length of source, or -1 if null-terminated.
522 * @param target The target string.
523 * @param targetLength The length of target, or -1 if null-terminated.
524 * @return true if source is greater than or equal to target, false otherwise.
525 * @see ucol_strcoll
526 * @see ucol_greater
527 * @see ucol_equal
528 * @stable ICU 2.0
529 */
530U_CAPI UBool U_EXPORT2
531ucol_greaterOrEqual(const UCollator *coll,
532                    const UChar     *source, int32_t sourceLength,
533                    const UChar     *target, int32_t targetLength);
534
535/**
536 * Compare two strings for equality.
537 * This function is equivalent to {@link #ucol_strcoll } == UCOL_EQUAL.
538 * @param coll The UCollator containing the comparison rules.
539 * @param source The source string.
540 * @param sourceLength The length of source, or -1 if null-terminated.
541 * @param target The target string.
542 * @param targetLength The length of target, or -1 if null-terminated.
543 * @return true if source is equal to target, false otherwise.
544 * @see ucol_strcoll
545 * @see ucol_greater
546 * @see ucol_greaterOrEqual
547 * @stable ICU 2.0
548 */
549U_CAPI UBool U_EXPORT2
550ucol_equal(const UCollator *coll,
551           const UChar     *source, int32_t sourceLength,
552           const UChar     *target, int32_t targetLength);
553
554
555/**
556 * Get the collation strength used in a UCollator.
557 * The strength influences how strings are compared.
558 * @param coll The UCollator to query.
559 * @return The collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY,
560 * UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL.
561 * @see ucol_setStrength
562 * @stable ICU 2.0
563 */
564U_CAPI UCollationStrength U_EXPORT2
565ucol_getStrength(const UCollator *coll);
566
567/**
568 * Set the collation strength used in a UCollator.
569 * The strength influences how strings are compared.
570 * @param coll The UCollator to set.
571 * @param strength The desired collation strength; one of UCOL_PRIMARY,
572 * UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL, or UCOL_DEFAULT (the predefined value for the locale).
573 * @see ucol_getStrength
574 * @stable ICU 2.0
575 */
576U_CAPI void U_EXPORT2
577ucol_setStrength(UCollator *coll,
578                 UCollationStrength strength);
579
580/**
581 * Retrieves the reordering codes for this collator.
582 * These reordering codes are a combination of UScript codes and UColReorderCode entries.
583 * @param coll The UCollator to query.
584 * @param dest The array to fill with the reordering codes.
585 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
586 * will only return the length of the result without writing any codes (pre-flighting).
587 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
588 * failure before the function call.
589 * @return The number of reordering codes written to the dest array, or the length of the result when pre-flighting.
590 * @see ucol_setReorderCodes
591 * @see ucol_getEquivalentReorderCodes
592 * @see UScriptCode
593 * @see UColReorderCode
594 * @stable ICU 4.8
595 */
596U_CAPI int32_t U_EXPORT2
597ucol_getReorderCodes(const UCollator* coll,
598                    int32_t* dest,
599                    int32_t destCapacity,
600                    UErrorCode *pErrorCode);
601/**
602 * Sets the reordering codes for this collator.
603 * Collation reordering allows scripts and some other groups of characters
604 * to be moved relative to each other. This reordering is done on top of
605 * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
606 * at the start and/or the end of the collation order. These groups are specified using
607 * UScript codes and UColReorderCode entries.
608 *
609 * <p>By default, reordering codes specified for the start of the order are placed in the
610 * order given after several special non-script blocks. These special groups of characters
611 * are space, punctuation, symbol, currency, and digit. These special groups are represented with
612 * UColReorderCode entries. Script groups can be intermingled with
613 * these special non-script groups if those special groups are explicitly specified in the reordering.
614 *
615 * <p>The special code OTHERS stands for any script that is not explicitly
616 * mentioned in the list of reordering codes given. Anything that is after OTHERS
617 * will go at the very end of the reordering in the order given.
618 *
619 * <p>The special reorder code DEFAULT will reset the reordering for this collator
620 * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
621 * was specified when this collator was created from resource data or from rules. The
622 * DEFAULT code <b>must</b> be the sole code supplied when it is used.
623 * If not, then U_ILLEGAL_ARGUMENT_ERROR will be set.
624 *
625 * <p>The special reorder code NONE will remove any reordering for this collator.
626 * The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
627 * NONE code <b>must</b> be the sole code supplied when it is used.
628 *
629 * @param coll The UCollator to set.
630 * @param reorderCodes An array of reordering codes (UScript codes and UColReorderCode entries) in the new order. This can be NULL if the
631 * length is also set to 0. An empty array will clear any reordering codes on the collator.
632 * @param reorderCodesLength The length of reorderCodes.
633 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
634 * failure before the function call.
635 * @see ucol_getReorderCodes
636 * @see ucol_getEquivalentReorderCodes
637 * @see UScriptCode
638 * @see UColReorderCode
639 * @stable ICU 4.8
640 */
641U_CAPI void U_EXPORT2
642ucol_setReorderCodes(UCollator* coll,
643                    const int32_t* reorderCodes,
644                    int32_t reorderCodesLength,
645                    UErrorCode *pErrorCode);
646
647/**
648 * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
649 * codes will be grouped and must reorder together.
650 * Beginning with ICU 55, scripts only reorder together if they are primary-equal,
651 * for example Hiragana and Katakana.
652 *
653 * @param reorderCode The reorder code to determine equivalence for.
654 * @param dest The array to fill with the reorder codes grouped with reorderCode.
655 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
656 * will only return the length of the result without writing any codes (pre-flighting).
657 * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate
658 * a failure before the function call.
659 * @return The number of reordering codes written to the dest array, or the length of the result when pre-flighting.
660 * @see ucol_setReorderCodes
661 * @see ucol_getReorderCodes
662 * @see UScriptCode
663 * @see UColReorderCode
664 * @stable ICU 4.8
665 */
666U_CAPI int32_t U_EXPORT2
667ucol_getEquivalentReorderCodes(int32_t reorderCode,
668                    int32_t* dest,
669                    int32_t destCapacity,
670                    UErrorCode *pErrorCode);
671
672/**
673 * Get the display name for a UCollator.
674 * The display name is suitable for presentation to a user.
675 * @param objLoc The locale of the collator in question.
676 * @param dispLoc The locale for display.
677 * @param result A pointer to a buffer to receive the display name.
678 * @param resultLength The maximum size of result.
679 * @param status A pointer to a UErrorCode to receive any errors
680 * @return The total buffer size needed; if greater than resultLength,
681 * the output was truncated.
682 * @stable ICU 2.0
683 */
684U_CAPI int32_t U_EXPORT2
685ucol_getDisplayName(    const    char        *objLoc,
686            const    char        *dispLoc,
687            UChar             *result,
688            int32_t         resultLength,
689            UErrorCode        *status);
690
691/**
692 * Get a locale for which collation rules are available.
693 * A UCollator in a locale returned by this function will perform the correct
694 * collation for the locale.
695 * @param localeIndex The index of the desired locale.
696 * @return A locale for which collation rules are available, or NULL if none.
697 * @see ucol_countAvailable
698 * @stable ICU 2.0
699 */
700U_CAPI const char* U_EXPORT2
701ucol_getAvailable(int32_t localeIndex);
702
703/**
704 * Determine how many locales have collation rules available.
705 * This function is most useful for determining the loop ending condition for
706 * calls to {@link #ucol_getAvailable }.
707 * @return The number of locales for which collation rules are available.
708 * @see ucol_getAvailable
709 * @stable ICU 2.0
710 */
711U_CAPI int32_t U_EXPORT2
712ucol_countAvailable(void);
713
714#if !UCONFIG_NO_SERVICE
715/**
716 * Create a string enumerator of all locales for which a valid
717 * collator may be opened. Only declared when UCONFIG_NO_SERVICE is not set.
718 * @param status input-output error code
719 * @return a string enumeration over locale strings. The caller is
720 * responsible for closing the result.
721 * @stable ICU 3.0
722 */
723U_CAPI UEnumeration* U_EXPORT2
724ucol_openAvailableLocales(UErrorCode *status);
725#endif   // !UCONFIG_NO_SERVICE
726
727/**
728 * Create a string enumerator of all possible keywords that are relevant to
729 * collation. At this point, the only recognized keyword for this
730 * service is "collation".
731 * @param status input-output error code
732 * @return a string enumeration over keyword strings. The caller is
733 * responsible for closing the result.
734 * @stable ICU 3.0
735 */
736U_CAPI UEnumeration* U_EXPORT2
737ucol_getKeywords(UErrorCode *status);
738
739/**
740 * Given a keyword, create a string enumeration of all values
741 * for that keyword that are currently in use.
742 * @param keyword a particular keyword as enumerated by
743 * ucol_getKeywords(). If any other keyword is passed in, *status is set
744 * to U_ILLEGAL_ARGUMENT_ERROR.
745 * @param status input-output error code
746 * @return a string enumeration over collation keyword values, or NULL
747 * upon error. The caller is responsible for closing the result.
748 * @stable ICU 3.0
749 */
750U_CAPI UEnumeration* U_EXPORT2
751ucol_getKeywordValues(const char *keyword, UErrorCode *status);
752
753/**
754 * Given a key and a locale, returns an array of string values in a preferred
755 * order that would make a difference. These are all and only those values where
756 * the open (creation) of the service with the locale formed from the input locale
757 * plus input keyword and that value has different behavior than creation with the
758 * input locale alone.
759 * @param key           one of the keys supported by this service.  For now, only
760 *                      "collation" is supported.
761 * @param locale        the input locale
762 * @param commonlyUsed  if set to true it will return only commonly used values
763 *                      with the given locale in preferred order.  Otherwise,
764 *                      it will return all the available values for the locale.
765 * @param status        error status
766 * @return a string enumeration over keyword values for the given key and the locale.
767 * @stable ICU 4.2
768 */
769U_CAPI UEnumeration* U_EXPORT2
770ucol_getKeywordValuesForLocale(const char* key,
771                               const char* locale,
772                               UBool commonlyUsed,
773                               UErrorCode* status);
774
775/**
776 * Return the functionally equivalent locale for the specified
777 * input locale, with respect to given keyword, for the
778 * collation service. If two different input locale + keyword
779 * combinations produce the same result locale, then collators
780 * instantiated for these two different input locales will behave
781 * equivalently. The converse is not always true; two collators
782 * may in fact be equivalent, but return different results, due to
783 * internal details. The return result has no other meaning than
784 * that stated above, and implies nothing as to the relationship
785 * between the two locales. This is intended for use by
786 * applications that wish to cache collators, or otherwise reuse
787 * collators when possible. The functional equivalent may change
788 * over time. For more information, please see the <a
789 * href="https://unicode-org.github.io/icu/userguide/locale#locales-and-services">
790 * Locales and Services</a> section of the ICU User Guide.
791 * @param result fill-in buffer for the functionally equivalent result locale
792 * @param resultCapacity capacity of the fill-in buffer
793 * @param keyword a particular keyword as enumerated by
794 * ucol_getKeywords.
795 * @param locale the specified input locale
796 * @param isAvailable if non-NULL, pointer to a fill-in parameter that
797 * on return indicates whether the specified input locale was 'available'
798 * to the collation service. A locale is defined as 'available' if it
799 * physically exists within the collation locale data.
800 * @param status pointer to input-output error code
801 * @return the actual buffer size needed for the locale. If greater
802 * than resultCapacity, the returned full name will be truncated and
803 * an error code will be returned.
804 * @stable ICU 3.0
805 */
806U_CAPI int32_t U_EXPORT2
807ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity,
808                             const char* keyword, const char* locale,
809                             UBool* isAvailable, UErrorCode* status);
810
811/**
812 * Get the collation tailoring rules from a UCollator.
813 * The rules will follow the rule syntax.
814 * @param coll The UCollator to query.
815 * @param length Output parameter; presumably receives the length of the returned rules. NOTE(review): confirm against the implementation.
816 * @return The collation tailoring rules.
817 * @stable ICU 2.0
818 */
819U_CAPI const UChar* U_EXPORT2
820ucol_getRules(    const    UCollator    *coll,
821        int32_t            *length);
822
823/**
824 * Get a sort key for a string from a UCollator.
825 * Sort keys may be compared using <TT>strcmp</TT>.
826 *
827 * Note that sort keys are often less efficient than simply doing comparison.
828 * For more details, see the ICU User Guide.
829 *
830 * Like ICU functions that write to an output buffer, the buffer contents
831 * are undefined if the buffer capacity (resultLength parameter) is too small.
832 * Unlike ICU functions that write a string to an output buffer,
833 * the terminating zero byte is counted in the sort key length.
834 * @param coll The UCollator containing the collation rules.
835 * @param source The string to transform.
836 * @param sourceLength The length of source, or -1 if null-terminated.
837 * @param result A pointer to a buffer to receive the sort key.
838 * @param resultLength The maximum size of result.
839 * @return The size needed to fully store the sort key.
840 *      If there was an internal error generating the sort key,
841 *      a zero value is returned.
842 * @see ucol_keyHashCode
843 * @stable ICU 2.0
844 */
845U_CAPI int32_t U_EXPORT2
846ucol_getSortKey(const    UCollator    *coll,
847        const    UChar        *source,
848        int32_t        sourceLength,
849        uint8_t        *result,
850        int32_t        resultLength);
851
852/** Bound mode accepted by the ucol_getBound API;
853 * see the ucol_getBound documentation for an explanation.
854 * Do not change the values assigned to the
855 * members of this enum: underlying code
856 * depends on them having these numbers.
857 * @stable ICU 2.0
858 */
859typedef enum {
860  /** lower bound */
861  UCOL_BOUND_LOWER = 0,
862  /** upper bound that will match strings of exact size */
863  UCOL_BOUND_UPPER = 1,
864  /** upper bound that will match all the strings that have the same initial substring as the given string */
865  UCOL_BOUND_UPPER_LONG = 2
866} UColBoundMode;
867
868/**
869 * Produce a bound for a given sortkey and a number of levels.
870 * Return value is always the number of bytes needed, regardless of
871 * whether the result buffer was big enough or even valid.<br>
872 * Resulting bounds can be used to produce a range of strings that are
873 * between upper and lower bounds. For example, if bounds are produced
874 * for a sortkey of string "smith", strings between upper and lower
875 * bounds with one level would include "Smith", "SMITH", "sMiTh".<br>
876 * There are two upper bounds that can be produced. If UCOL_BOUND_UPPER
877 * is produced, strings matched would be as above. However, if a bound
878 * produced using UCOL_BOUND_UPPER_LONG is used, the above example will
879 * also match "Smithsonian" and similar.<br>
880 * For more on usage, see the example in cintltst/capitst.c in procedure
881 * TestBounds.
882 * Sort keys may be compared using <TT>strcmp</TT>.
883 * @param source The source sortkey.
884 * @param sourceLength The length of source, or -1 if null-terminated.
885 *                     (If an unmodified sortkey is passed, it is always null
886 *                      terminated).
887 * @param boundType Type of bound required. It can be UCOL_BOUND_LOWER, which
888 *                  produces a lower inclusive bound, UCOL_BOUND_UPPER, which
889 *                  produces an upper bound that matches strings of the same length,
890 *                  or UCOL_BOUND_UPPER_LONG, which matches strings that have the
891 *                  same starting substring as the source string.
892 * @param noOfLevels  Number of levels required in the resulting bound (for most
893 *                    uses, the recommended value is 1). See the User Guide for an
894 *                    explanation of the number of levels a sortkey can have.
895 * @param result A pointer to a buffer to receive the resulting sortkey.
896 * @param resultLength The maximum size of result.
897 * @param status Used for returning error code if something went wrong. If the
898 *               number of levels requested is higher than the number of levels
899 *               in the source key, a warning (U_SORT_KEY_TOO_SHORT_WARNING) is
900 *               issued.
901 * @return The size needed to fully store the bound.
902 * @see ucol_keyHashCode
903 * @stable ICU 2.1
904 */
905U_CAPI int32_t U_EXPORT2
906ucol_getBound(const uint8_t       *source,
907        int32_t             sourceLength,
908        UColBoundMode       boundType,
909        uint32_t            noOfLevels,
910        uint8_t             *result,
911        int32_t             resultLength,
912        UErrorCode          *status);
913
914
915/**
916 * Merges two sort keys. The levels are merged with their corresponding counterparts
917 * (primaries with primaries, secondaries with secondaries etc.). Between the values
918 * from the same level a separator is inserted.
919 *
920 * This is useful, for example, for combining sort keys from first and last names
921 * to sort such pairs.
922 * See http://www.unicode.org/reports/tr10/#Merging_Sort_Keys
923 *
924 * The recommended way to achieve "merged" sorting is by
925 * concatenating strings with U+FFFE between them.
926 * The concatenation has the same sort order as the merged sort keys,
927 * but merge(getSortKey(str1), getSortKey(str2)) may differ from getSortKey(str1 + '\\uFFFE' + str2).
928 * Using strings with U+FFFE may yield shorter sort keys.
929 *
930 * For details about Sort Key Features see
931 * https://unicode-org.github.io/icu/userguide/collation/api#sort-key-features
932 *
933 * It is possible to merge multiple sort keys by consecutively merging
934 * another one with the intermediate result.
935 *
936 * The length of the merge result is the sum of the lengths of the input sort keys.
937 *
938 * Example (uncompressed):
939 * <pre>191B1D 01 050505 01 910505 00
940 * 1F2123 01 050505 01 910505 00</pre>
941 * will be merged as
942 * <pre>191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00</pre>
943 *
944 * If the destination buffer is not big enough, then its contents are undefined.
945 * If any of the source lengths is zero or any of the source pointers is NULL/undefined,
946 * the result is of size zero.
947 *
948 * @param src1 the first sort key
949 * @param src1Length the length of the first sort key, including the zero byte at the end;
950 *        can be -1 if the function is to find the length
951 * @param src2 the second sort key
952 * @param src2Length the length of the second sort key, including the zero byte at the end;
953 *        can be -1 if the function is to find the length
954 * @param dest the buffer where the merged sort key is written,
955 *        can be NULL if destCapacity==0
956 * @param destCapacity the number of bytes in the dest buffer
957 * @return the length of the merged sort key, src1Length+src2Length;
958 *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
959 *         in which cases the contents of dest are undefined
960 * @stable ICU 2.0
961 */
962U_CAPI int32_t U_EXPORT2
963ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
964                   const uint8_t *src2, int32_t src2Length,
965                   uint8_t *dest, int32_t destCapacity);
966
967/**
968 * Universal attribute setter
969 * @param coll collator whose attributes are to be changed
970 * @param attr attribute type
971 * @param value attribute value
972 * @param status indicates whether the operation succeeded or there were errors
973 * @see UColAttribute
974 * @see UColAttributeValue
975 * @see ucol_getAttribute
976 * @stable ICU 2.0
977 */
978U_CAPI void U_EXPORT2
979ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status);
980
981/**
982 * Universal attribute getter
983 * @param coll collator whose attributes are to be queried
984 * @param attr attribute type
985 * @param status indicates whether the operation succeeded or there were errors
986 * @return attribute value
987 * @see UColAttribute
988 * @see UColAttributeValue
989 * @see ucol_setAttribute
990 * @stable ICU 2.0
991 */
992U_CAPI UColAttributeValue  U_EXPORT2
993ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status);
994
995/**
996 * Sets the variable top to the top of the specified reordering group.
997 * The variable top determines the highest-sorting character
998 * which is affected by UCOL_ALTERNATE_HANDLING.
999 * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
1000 * @param coll the collator to modify
1001 * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
1002 *              UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
1003 *              or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
1004 * @param pErrorCode Standard ICU error code. Its input value must
1005 *                   pass the U_SUCCESS() test, or else the function returns
1006 *                   immediately. Check for U_FAILURE() on output or use with
1007 *                   function chaining. (See User Guide for details.)
1008 * @see ucol_getMaxVariable
1009 * @stable ICU 53
1010 */
1011U_CAPI void U_EXPORT2
1012ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode);
1013
1014/**
1015 * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
1016 * @param coll the collator to query
1017 * @return the maximum variable reordering group.
1018 * @see ucol_setMaxVariable
1019 * @stable ICU 53
1020 */
1021U_CAPI UColReorderCode U_EXPORT2
1022ucol_getMaxVariable(const UCollator *coll);
1023
1024/**
1025 * Gets the variable top value of a Collator.
1026 * @param coll collator whose variable top is to be retrieved
1027 * @param status error code (not changed by function). If error code is set,
1028 *               the return value is undefined.
1029 * @return the variable top primary weight
1030 * @see ucol_getMaxVariable
1031 * @see ucol_setVariableTop
1032 * @see ucol_restoreVariableTop
1033 * @stable ICU 2.0
1034 */
1035U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status);
1036
1037/**
1038 * Thread-safe cloning operation. The result is a clone of a given collator.
1039 * @param coll collator to be cloned
1040 * @param status indicates whether the operation succeeded or there were errors
1041 * @return pointer to the new clone
1042 * @see ucol_open
1043 * @see ucol_openRules
1044 * @see ucol_close
1045 * @stable ICU 71
1046 */
1047U_CAPI UCollator* U_EXPORT2 ucol_clone(const UCollator *coll, UErrorCode *status);
1048
1049/**
1050 * Returns current rules. Delta defines whether full rules are returned or just the tailoring.
1051 * Returns number of UChars needed to store rules. If buffer is NULL or bufferLen is not enough
1052 * to store rules, will store up to available space.
1053 *
1054 * ucol_getRules() should normally be used instead.
1055 * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales
1056 * @param coll collator to get the rules from
1057 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
1058 * @param buffer buffer to store the result in. If NULL, you'll get no rules.
1059 * @param bufferLen length of buffer to store rules in. If less than needed you'll get only the part that fits in.
1060 * @return the number of UChars needed to store the rules
1061 * @stable ICU 2.0
1062 * @see UCOL_FULL_RULES
1063 */
1064U_CAPI int32_t U_EXPORT2
1065ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen);
1066
1067/**
1068 * Gets the locale name of the collator. If the collator
1069 * was instantiated from rules, then this function returns
1070 * NULL.
1071 * @param coll The UCollator for which the locale is needed
1072 * @param type You can choose between requested, valid and actual
1073 *             locale. For a description see the definition of
1074 *             ULocDataLocaleType in uloc.h
1075 * @param status error code of the operation
1076 * @return real locale name from which the collation data comes.
1077 *         If the collator was instantiated from rules, returns
1078 *         NULL.
1079 * @stable ICU 2.8
1080 */
1081U_CAPI const char * U_EXPORT2
1082ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status);
1083
1084/**
1085 * Get a Unicode set that contains all the characters and sequences tailored in
1086 * this collator. The result must be disposed of by using uset_close.
1087 * @param coll        The UCollator for which we want to get tailored chars
1088 * @param status      error code of the operation
1089 * @return a pointer to a newly created USet. Must be disposed of by using uset_close
1090 * @see ucol_openRules
1091 * @see uset_close
1092 * @stable ICU 2.4
1093 */
1094U_CAPI USet * U_EXPORT2
1095ucol_getTailoredSet(const UCollator *coll, UErrorCode *status);
1096
1097/** Creates a binary image of a collator. This binary image can be stored and
1098 *  later used to instantiate a collator using ucol_openBinary.
1099 *  This API supports preflighting.
1100 *  @param coll the collator to create a binary image of
1101 *  @param buffer a fill-in buffer to receive the binary image
1102 *  @param capacity capacity of the destination buffer
1103 *  @param status for catching errors
1104 *  @return size of the image
1105 *  @see ucol_openBinary
1106 *  @stable ICU 3.2
1107 */
1108U_CAPI int32_t U_EXPORT2
1109ucol_cloneBinary(const UCollator *coll,
1110                 uint8_t *buffer, int32_t capacity,
1111                 UErrorCode *status);
1112
1113/** Opens a collator from a collator binary image created using
1114 *  ucol_cloneBinary. The binary image used in instantiation of the
1115 *  collator remains owned by the user and should stay around for
1116 *  the lifetime of the collator. The API also takes a base collator
1117 *  which must be the root collator.
1118 *  @param bin binary image owned by the user and required through the
1119 *             lifetime of the collator
1120 *  @param length size of the image. If negative, the API will try to
1121 *                determine the length of the image
1122 *  @param base Base collator, for lookup of untailored characters.
1123 *              Must be the root collator, must not be NULL.
1124 *              The base is required to be present through the lifetime of the collator.
1125 *  @param status for catching errors
1126 *  @return newly created collator
1127 *  @see ucol_cloneBinary
1128 *  @stable ICU 3.2
1129 */
1130U_CAPI UCollator* U_EXPORT2
1131ucol_openBinary(const uint8_t *bin, int32_t length,
1132                const UCollator *base,
1133                UErrorCode *status);
1134
1135
1136#endif /* #if !UCONFIG_NO_COLLATION */
1137
1138#endif
1139