12e5b6d6dSopenharmony_ci---
22e5b6d6dSopenharmony_cilayout: default
32e5b6d6dSopenharmony_cititle: Collation Examples
42e5b6d6dSopenharmony_cinav_order: 7
52e5b6d6dSopenharmony_ciparent: Collation
62e5b6d6dSopenharmony_ci---
72e5b6d6dSopenharmony_ci<!--
82e5b6d6dSopenharmony_ci© 2020 and later: Unicode, Inc. and others.
92e5b6d6dSopenharmony_ciLicense & terms of use: http://www.unicode.org/copyright.html
102e5b6d6dSopenharmony_ci-->
112e5b6d6dSopenharmony_ci
122e5b6d6dSopenharmony_ci# Collation Examples
132e5b6d6dSopenharmony_ci{: .no_toc }
142e5b6d6dSopenharmony_ci
152e5b6d6dSopenharmony_ci## Contents
162e5b6d6dSopenharmony_ci{: .no_toc .text-delta }
172e5b6d6dSopenharmony_ci
182e5b6d6dSopenharmony_ci1. TOC
192e5b6d6dSopenharmony_ci{:toc}
202e5b6d6dSopenharmony_ci
212e5b6d6dSopenharmony_ci---
222e5b6d6dSopenharmony_ci
232e5b6d6dSopenharmony_ci## Simple Collation Sample Customization
242e5b6d6dSopenharmony_ci
The following program demonstrates how to compare strings and create sort keys
with a collator opened for a given locale.
272e5b6d6dSopenharmony_ci
282e5b6d6dSopenharmony_ciIn **C:**
292e5b6d6dSopenharmony_ci
302e5b6d6dSopenharmony_ci```c
312e5b6d6dSopenharmony_ci#include <stdio.h>
322e5b6d6dSopenharmony_ci#include <memory.h>
332e5b6d6dSopenharmony_ci#include <string.h>
342e5b6d6dSopenharmony_ci#include "unicode/ustring.h"
352e5b6d6dSopenharmony_ci#include "unicode/utypes.h"
362e5b6d6dSopenharmony_ci#include "unicode/uloc.h"
372e5b6d6dSopenharmony_ci#include "unicode/ucol.h"
382e5b6d6dSopenharmony_ci
392e5b6d6dSopenharmony_ci#define MAXBUFFERSIZE 100
402e5b6d6dSopenharmony_ci#define BIGBUFFERSIZE 5000
412e5b6d6dSopenharmony_ci
422e5b6d6dSopenharmony_ciUBool collateWithLocaleInC(const char* locale, UErrorCode *status)
432e5b6d6dSopenharmony_ci{
442e5b6d6dSopenharmony_ci    UChar         dispName    [MAXBUFFERSIZE]; 
452e5b6d6dSopenharmony_ci    int32_t       bufferLen   = 0;
462e5b6d6dSopenharmony_ci    UChar         source            [MAXBUFFERSIZE];
472e5b6d6dSopenharmony_ci    UChar         target            [MAXBUFFERSIZE];
482e5b6d6dSopenharmony_ci    UCollationResult result   = UCOL_EQUAL;
492e5b6d6dSopenharmony_ci    uint8_t             sourceKeyArray    [MAXBUFFERSIZE];
502e5b6d6dSopenharmony_ci    uint8_t             targetKeyArray    [MAXBUFFERSIZE]; 
512e5b6d6dSopenharmony_ci    int32_t       sourceKeyOut      = 0, 
522e5b6d6dSopenharmony_ci                targetKeyOut = 0;
532e5b6d6dSopenharmony_ci    UCollator     *myCollator = 0;
542e5b6d6dSopenharmony_ci    if (U_FAILURE(*status))
552e5b6d6dSopenharmony_ci    {
562e5b6d6dSopenharmony_ci        return false;
572e5b6d6dSopenharmony_ci    }
582e5b6d6dSopenharmony_ci    u_uastrcpy(source, "This is a test.");
592e5b6d6dSopenharmony_ci    u_uastrcpy(target, "THIS IS A TEST.");
602e5b6d6dSopenharmony_ci    myCollator = ucol_open(locale, status);
612e5b6d6dSopenharmony_ci    if (U_FAILURE(*status)){
622e5b6d6dSopenharmony_ci        bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status);
632e5b6d6dSopenharmony_ci        /*Report the error with display name... */
642e5b6d6dSopenharmony_ci        fprintf(stderr,
652e5b6d6dSopenharmony_ci        "Failed to create the collator for : \"%s\"\n", dispName);
662e5b6d6dSopenharmony_ci        return false;
672e5b6d6dSopenharmony_ci    }
682e5b6d6dSopenharmony_ci    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
692e5b6d6dSopenharmony_ci    /* result is 1, secondary differences only for ignorable space characters*/
702e5b6d6dSopenharmony_ci    if (result != UCOL_LESS)
712e5b6d6dSopenharmony_ci    {
722e5b6d6dSopenharmony_ci        fprintf(stderr,
732e5b6d6dSopenharmony_ci        "Comparing two strings with only secondary differences in C failed.\n");
742e5b6d6dSopenharmony_ci        return false;
752e5b6d6dSopenharmony_ci    }
762e5b6d6dSopenharmony_ci    /* To compare them with just primary differences */
772e5b6d6dSopenharmony_ci    ucol_setStrength(myCollator, UCOL_PRIMARY);
782e5b6d6dSopenharmony_ci    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
792e5b6d6dSopenharmony_ci    /* result is 0 */
802e5b6d6dSopenharmony_ci    if (result != 0)
812e5b6d6dSopenharmony_ci    {
822e5b6d6dSopenharmony_ci        fprintf(stderr,
832e5b6d6dSopenharmony_ci        "Comparing two strings with no differences in C failed.\n");
842e5b6d6dSopenharmony_ci        return false;
852e5b6d6dSopenharmony_ci    }
862e5b6d6dSopenharmony_ci
872e5b6d6dSopenharmony_ci    /* Now, do the same comparison with keys */
882e5b6d6dSopenharmony_ci    sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE);
892e5b6d6dSopenharmony_ci    targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE);
902e5b6d6dSopenharmony_ci    result = 0;
912e5b6d6dSopenharmony_ci    result = strcmp(sourceKeyArray, targetKeyArray);
922e5b6d6dSopenharmony_ci    if (result != 0)
932e5b6d6dSopenharmony_ci    {
942e5b6d6dSopenharmony_ci        fprintf(stderr,
952e5b6d6dSopenharmony_ci        "Comparing two strings with sort keys in C failed.\n");
962e5b6d6dSopenharmony_ci        return false;
972e5b6d6dSopenharmony_ci    }
982e5b6d6dSopenharmony_ci    ucol_close(myCollator);
992e5b6d6dSopenharmony_ci    return true;
1002e5b6d6dSopenharmony_ci}
1012e5b6d6dSopenharmony_ci```
1022e5b6d6dSopenharmony_ci
1032e5b6d6dSopenharmony_ciIn **C++:**
1042e5b6d6dSopenharmony_ci
1052e5b6d6dSopenharmony_ci```c++
1062e5b6d6dSopenharmony_ci#include <stdio.h>
1072e5b6d6dSopenharmony_ci#include "unicode/unistr.h"
1082e5b6d6dSopenharmony_ci#include "unicode/utypes.h"
1092e5b6d6dSopenharmony_ci#include "unicode/locid.h"
1102e5b6d6dSopenharmony_ci#include "unicode/coll.h"
1112e5b6d6dSopenharmony_ci#include "unicode/tblcoll.h"
1122e5b6d6dSopenharmony_ci#include "unicode/coleitr.h"
1132e5b6d6dSopenharmony_ci#include "unicode/sortkey.h"
1142e5b6d6dSopenharmony_ciUBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
1152e5b6d6dSopenharmony_ci{
1162e5b6d6dSopenharmony_ci    UnicodeString dispName; 
1172e5b6d6dSopenharmony_ci    UnicodeString source("This is a test.");
1182e5b6d6dSopenharmony_ci    UnicodeString target("THIS IS A TEST.");
1192e5b6d6dSopenharmony_ci    Collator::EComparisonResult result    = Collator::EQUAL;
1202e5b6d6dSopenharmony_ci    CollationKey sourceKey;
1212e5b6d6dSopenharmony_ci    CollationKey targetKey; 
1222e5b6d6dSopenharmony_ci    Collator      *myCollator = 0;
1232e5b6d6dSopenharmony_ci    if (U_FAILURE(status))
1242e5b6d6dSopenharmony_ci    {
1252e5b6d6dSopenharmony_ci        return false;
1262e5b6d6dSopenharmony_ci    }
1272e5b6d6dSopenharmony_ci    myCollator = Collator::createInstance(locale, status);
1282e5b6d6dSopenharmony_ci    if (U_FAILURE(status)){
1292e5b6d6dSopenharmony_ci        locale.getDisplayName(dispName);
1302e5b6d6dSopenharmony_ci        /*Report the error with display name... */
1312e5b6d6dSopenharmony_ci        fprintf(stderr,
1322e5b6d6dSopenharmony_ci        "%s: Failed to create the collator for : \"%s\"\n", dispName);
1332e5b6d6dSopenharmony_ci        return false;
1342e5b6d6dSopenharmony_ci    }
1352e5b6d6dSopenharmony_ci    result = myCollator->compare(source, target);
1362e5b6d6dSopenharmony_ci    /* result is 1, secondary differences only for ignorable space characters*/
1372e5b6d6dSopenharmony_ci    if (result != UCOL_LESS)
1382e5b6d6dSopenharmony_ci    {
1392e5b6d6dSopenharmony_ci        fprintf(stderr,
1402e5b6d6dSopenharmony_ci        "Comparing two strings with only secondary differences in C failed.\n");
1412e5b6d6dSopenharmony_ci        return false;
1422e5b6d6dSopenharmony_ci    }
1432e5b6d6dSopenharmony_ci    /* To compare them with just primary differences */
1442e5b6d6dSopenharmony_ci    myCollator->setStrength(Collator::PRIMARY);
1452e5b6d6dSopenharmony_ci    result = myCollator->compare(source, target);
1462e5b6d6dSopenharmony_ci    /* result is 0 */
1472e5b6d6dSopenharmony_ci    if (result != 0)
1482e5b6d6dSopenharmony_ci    {
1492e5b6d6dSopenharmony_ci        fprintf(stderr,
1502e5b6d6dSopenharmony_ci        "Comparing two strings with no differences in C failed.\n");
1512e5b6d6dSopenharmony_ci        return false;
1522e5b6d6dSopenharmony_ci    }
1532e5b6d6dSopenharmony_ci    /* Now, do the same comparison with keys */
1542e5b6d6dSopenharmony_ci    myCollator->getCollationKey(source, sourceKey, status);
1552e5b6d6dSopenharmony_ci    myCollator->getCollationKey(target, targetKey, status);
1562e5b6d6dSopenharmony_ci    result = Collator::EQUAL;
1572e5b6d6dSopenharmony_ci
1582e5b6d6dSopenharmony_ci    result = sourceKey.compareTo(targetKey);
1592e5b6d6dSopenharmony_ci    if (result != 0)
1602e5b6d6dSopenharmony_ci    {
1612e5b6d6dSopenharmony_ci        fprintf(stderr,
1622e5b6d6dSopenharmony_ci        "%s: Comparing two strings with sort keys in C failed.\n");
1632e5b6d6dSopenharmony_ci        return false;
1642e5b6d6dSopenharmony_ci    }
1652e5b6d6dSopenharmony_ci    delete myCollator;
1662e5b6d6dSopenharmony_ci    return true;
1672e5b6d6dSopenharmony_ci}
1682e5b6d6dSopenharmony_ci```
1692e5b6d6dSopenharmony_ci
1702e5b6d6dSopenharmony_ci### Main Function
1712e5b6d6dSopenharmony_ci
1722e5b6d6dSopenharmony_ci```c++
1732e5b6d6dSopenharmony_ciextern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);
1742e5b6d6dSopenharmony_ciint main()
1752e5b6d6dSopenharmony_ci{
1762e5b6d6dSopenharmony_ci   UErrorCode status = U_ZERO_ERROR;
1772e5b6d6dSopenharmony_ci   fprintf(stdout, "\n");
1782e5b6d6dSopenharmony_ci   if (collateWithLocaleInCPP(Locale("en", "US"), status) != true)
1792e5b6d6dSopenharmony_ci   {
1802e5b6d6dSopenharmony_ci        fprintf(stderr,
1812e5b6d6dSopenharmony_ci        "Collate with locale in C++ failed.\n");
1822e5b6d6dSopenharmony_ci   } else 
1832e5b6d6dSopenharmony_ci   {
1842e5b6d6dSopenharmony_ci       fprintf(stdout, "Collate with Locale C++ example worked!!\n");
1852e5b6d6dSopenharmony_ci   }
1862e5b6d6dSopenharmony_ci   status = U_ZERO_ERROR;
1872e5b6d6dSopenharmony_ci   fprintf(stdout, "\n");
1882e5b6d6dSopenharmony_ci   if (collateWithLocaleInC("en_US", &status) != true)
1892e5b6d6dSopenharmony_ci   {
1902e5b6d6dSopenharmony_ci        fprintf(stderr,
1912e5b6d6dSopenharmony_ci        "%s: Collate with locale in C failed.\n");
1922e5b6d6dSopenharmony_ci   } else 
1932e5b6d6dSopenharmony_ci   {
1942e5b6d6dSopenharmony_ci       fprintf(stdout, "Collate with Locale C example worked!!\n");
1952e5b6d6dSopenharmony_ci   }
1962e5b6d6dSopenharmony_ci   return 0;
1972e5b6d6dSopenharmony_ci}
1982e5b6d6dSopenharmony_ci```
1992e5b6d6dSopenharmony_ci
2002e5b6d6dSopenharmony_ciIn **Java:**
2012e5b6d6dSopenharmony_ci
2022e5b6d6dSopenharmony_ci```java
2032e5b6d6dSopenharmony_ciimport com.ibm.icu.text.Collator;
2042e5b6d6dSopenharmony_ciimport com.ibm.icu.text.CollationElementIterator;
2052e5b6d6dSopenharmony_ciimport com.ibm.icu.text.CollationKey;
2062e5b6d6dSopenharmony_ciimport java.util.Locale;
2072e5b6d6dSopenharmony_ci
2082e5b6d6dSopenharmony_cipublic class CollateExample
2092e5b6d6dSopenharmony_ci{
2102e5b6d6dSopenharmony_ci
2112e5b6d6dSopenharmony_ci    public static void main(String arg[]) 
2122e5b6d6dSopenharmony_ci    {
2132e5b6d6dSopenharmony_ci        CollateExample example = new CollateExample();
2142e5b6d6dSopenharmony_ci        try {
2152e5b6d6dSopenharmony_ci            if (!example.collateWithLocale(Locale.US)) {
2162e5b6d6dSopenharmony_ci                System.err.println("Collate with locale example failed.");
2172e5b6d6dSopenharmony_ci            } 
2182e5b6d6dSopenharmony_ci            else {
2192e5b6d6dSopenharmony_ci                System.out.println("Collate with Locale example worked!!");
2202e5b6d6dSopenharmony_ci            }
2212e5b6d6dSopenharmony_ci        } catch (Exception e) {
2222e5b6d6dSopenharmony_ci            System.err.println("Collating with locale failed");
2232e5b6d6dSopenharmony_ci            e.printStackTrace();
2242e5b6d6dSopenharmony_ci        }
2252e5b6d6dSopenharmony_ci    }
2262e5b6d6dSopenharmony_ci
2272e5b6d6dSopenharmony_ci    public boolean collateWithLocale(Locale locale) throws Exception
2282e5b6d6dSopenharmony_ci    {
2292e5b6d6dSopenharmony_ci        String source = "This is a test.";
2302e5b6d6dSopenharmony_ci        String target = "THIS IS A TEST.";
2312e5b6d6dSopenharmony_ci        Collator myCollator = Collator.getInstance(locale);
2322e5b6d6dSopenharmony_ci
2332e5b6d6dSopenharmony_ci        int result = myCollator.compare(source, target);
2342e5b6d6dSopenharmony_ci        // result is 1, secondary differences only for ignorable space characters
2352e5b6d6dSopenharmony_ci        if (result >= 0) {
2362e5b6d6dSopenharmony_ci            System.err.println(
2372e5b6d6dSopenharmony_ci                "Comparing two strings with only secondary differences failed.");
2382e5b6d6dSopenharmony_ci            return false;
2392e5b6d6dSopenharmony_ci        }
2402e5b6d6dSopenharmony_ci        // To compare them with just primary differences
2412e5b6d6dSopenharmony_ci        myCollator.setStrength(Collator.PRIMARY);
2422e5b6d6dSopenharmony_ci        result = myCollator.compare(source, target);
2432e5b6d6dSopenharmony_ci        // result is 0
2442e5b6d6dSopenharmony_ci        if (result != 0) {
2452e5b6d6dSopenharmony_ci            System.err.println(
2462e5b6d6dSopenharmony_ci                           "Comparing two strings with no differences failed.");
2472e5b6d6dSopenharmony_ci            return false;
2482e5b6d6dSopenharmony_ci        }
2492e5b6d6dSopenharmony_ci        // Now, do the same comparison with keys
2502e5b6d6dSopenharmony_ci        CollationKey sourceKey = myCollator.getCollationKey(source);
2512e5b6d6dSopenharmony_ci        CollationKey targetKey = myCollator.getCollationKey(target);
2522e5b6d6dSopenharmony_ci        result = sourceKey.compareTo(targetKey);
2532e5b6d6dSopenharmony_ci        if (result != 0) {
2542e5b6d6dSopenharmony_ci            System.err.println("Comparing two strings with sort keys failed.");
2552e5b6d6dSopenharmony_ci            return false;
2562e5b6d6dSopenharmony_ci        }
2572e5b6d6dSopenharmony_ci        return true;
2582e5b6d6dSopenharmony_ci    }   
2592e5b6d6dSopenharmony_ci}
2602e5b6d6dSopenharmony_ci```
2612e5b6d6dSopenharmony_ci
2622e5b6d6dSopenharmony_ci## Language-sensitive searching
2632e5b6d6dSopenharmony_ci
String searching is a well-researched area, and there are algorithms that can
optimize the searching process. Perhaps the best is the Boyer-Moore method. For a
full description of this concept, please see Laura Werner's text searching
article
(<http://icu-project.org/docs/papers/efficient_text_searching_in_java.html>).
2692e5b6d6dSopenharmony_ci
2702e5b6d6dSopenharmony_ciHowever, implementing collation-based search with the Boyer-Moore method
2712e5b6d6dSopenharmony_ciwhile getting correct results is very tricky, and ICU no longer uses this method
2722e5b6d6dSopenharmony_ci(as of ICU4C 4.0 and ICU4J 53).
2732e5b6d6dSopenharmony_ci
2742e5b6d6dSopenharmony_ciPlease see the [String Search Service](./string-search) chapter.
2752e5b6d6dSopenharmony_ci
2762e5b6d6dSopenharmony_ci## Using large buffers to manage sort keys
2772e5b6d6dSopenharmony_ci
2782e5b6d6dSopenharmony_ciA good solution for the problem of not knowing the sort key size in advance is
2792e5b6d6dSopenharmony_cito allocate a large buffer and store all the sort keys there, while keeping a
2802e5b6d6dSopenharmony_cilist of indexes or pointers to that buffer.
2812e5b6d6dSopenharmony_ci
The following sample code takes a pointer to an array of UChar pointers (the
source strings) and an array that receives the key indexes. It allocates a
buffer, fills it with the sort keys, and reports the maximum size of a sort key.
Once you have done this for your strings, you just need to allocate fields of
that maximum size and copy your sort keys from the buffer to the fields.
2872e5b6d6dSopenharmony_ci
2882e5b6d6dSopenharmony_ci```c++
2892e5b6d6dSopenharmony_ciuint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys,
2902e5b6d6dSopenharmony_ci                            uint32_t sourceSize, uint8_t **buffer,
2912e5b6d6dSopenharmony_ci                            uint32_t *maxSize, UErrorCode *status) 
2922e5b6d6dSopenharmony_ci{
2932e5b6d6dSopenharmony_ci  if(status == NULL || U_FAILURE(*status)) {
2942e5b6d6dSopenharmony_ci    return 0;
2952e5b6d6dSopenharmony_ci  }
2962e5b6d6dSopenharmony_ci
2972e5b6d6dSopenharmony_ci  uint32_t bufferSize = 16384;
2982e5b6d6dSopenharmony_ci  uint32_t increment = 16384;
2992e5b6d6dSopenharmony_ci  uint32_t currentOffset = 0;
3002e5b6d6dSopenharmony_ci  uint32_t keySize = 0;
3012e5b6d6dSopenharmony_ci  uint32_t i = 0;
3022e5b6d6dSopenharmony_ci  *maxSize = 0;
3032e5b6d6dSopenharmony_ci
3042e5b6d6dSopenharmony_ci  *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
3052e5b6d6dSopenharmony_ci  if(buffer == NULL) {
3062e5b6d6dSopenharmony_ci    *status = U_MEMORY_ALLOCATION_ERROR;
3072e5b6d6dSopenharmony_ci    return 0;
3082e5b6d6dSopenharmony_ci  }
3092e5b6d6dSopenharmony_ci
3102e5b6d6dSopenharmony_ci  for(i = 0; i < sourceSize; i++) {
3112e5b6d6dSopenharmony_ci    keys[i] = currentOffset;
3122e5b6d6dSopenharmony_ci    keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);
3132e5b6d6dSopenharmony_ci    if(keySize > bufferSize-currentOffset) {
3142e5b6d6dSopenharmony_ci      *buffer = (uint8_t *)realloc(*buffer, bufferSize+increment);
3152e5b6d6dSopenharmony_ci      if(buffer == NULL) {
3162e5b6d6dSopenharmony_ci        *status = U_MEMORY_ALLOCATION_ERROR;
3172e5b6d6dSopenharmony_ci        return 0;
3182e5b6d6dSopenharmony_ci      }
3192e5b6d6dSopenharmony_ci      bufferSize += increment;
3202e5b6d6dSopenharmony_ci      keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);
3212e5b6d6dSopenharmony_ci    }
3222e5b6d6dSopenharmony_ci    /* here you can hook code that does something interesting with the keySize - 
3232e5b6d6dSopenharmony_ci     * remembers the maximum or similar...
3242e5b6d6dSopenharmony_ci     */
3252e5b6d6dSopenharmony_ci    if(keySize > *maxSize) {
3262e5b6d6dSopenharmony_ci      *maxSize = keySize;
3272e5b6d6dSopenharmony_ci    }
3282e5b6d6dSopenharmony_ci    currentOffset += keySize;
3292e5b6d6dSopenharmony_ci  }
3302e5b6d6dSopenharmony_ci
3312e5b6d6dSopenharmony_ci  return currentOffset;
3322e5b6d6dSopenharmony_ci}
3332e5b6d6dSopenharmony_ci```
334