---
layout: default
title: Collation Examples
nav_order: 7
parent: Collation
---
<!--
© 2020 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html
-->

# Collation Examples
{: .no_toc }

## Contents
{: .no_toc .text-delta }

1. TOC
{:toc}

---

## Simple Collation Sample Customization

The following program demonstrates how to compare strings and create sort keys
with the collator for a given locale.

272e5b6d6dSopenharmony_ci 282e5b6d6dSopenharmony_ciIn **C:** 292e5b6d6dSopenharmony_ci 302e5b6d6dSopenharmony_ci```c 312e5b6d6dSopenharmony_ci#include <stdio.h> 322e5b6d6dSopenharmony_ci#include <memory.h> 332e5b6d6dSopenharmony_ci#include <string.h> 342e5b6d6dSopenharmony_ci#include "unicode/ustring.h" 352e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 362e5b6d6dSopenharmony_ci#include "unicode/uloc.h" 372e5b6d6dSopenharmony_ci#include "unicode/ucol.h" 382e5b6d6dSopenharmony_ci 392e5b6d6dSopenharmony_ci#define MAXBUFFERSIZE 100 402e5b6d6dSopenharmony_ci#define BIGBUFFERSIZE 5000 412e5b6d6dSopenharmony_ci 422e5b6d6dSopenharmony_ciUBool collateWithLocaleInC(const char* locale, UErrorCode *status) 432e5b6d6dSopenharmony_ci{ 442e5b6d6dSopenharmony_ci UChar dispName [MAXBUFFERSIZE]; 452e5b6d6dSopenharmony_ci int32_t bufferLen = 0; 462e5b6d6dSopenharmony_ci UChar source [MAXBUFFERSIZE]; 472e5b6d6dSopenharmony_ci UChar target [MAXBUFFERSIZE]; 482e5b6d6dSopenharmony_ci UCollationResult result = UCOL_EQUAL; 492e5b6d6dSopenharmony_ci uint8_t sourceKeyArray [MAXBUFFERSIZE]; 502e5b6d6dSopenharmony_ci uint8_t targetKeyArray [MAXBUFFERSIZE]; 512e5b6d6dSopenharmony_ci int32_t sourceKeyOut = 0, 522e5b6d6dSopenharmony_ci targetKeyOut = 0; 532e5b6d6dSopenharmony_ci UCollator *myCollator = 0; 542e5b6d6dSopenharmony_ci if (U_FAILURE(*status)) 552e5b6d6dSopenharmony_ci { 562e5b6d6dSopenharmony_ci return false; 572e5b6d6dSopenharmony_ci } 582e5b6d6dSopenharmony_ci u_uastrcpy(source, "This is a test."); 592e5b6d6dSopenharmony_ci u_uastrcpy(target, "THIS IS A TEST."); 602e5b6d6dSopenharmony_ci myCollator = ucol_open(locale, status); 612e5b6d6dSopenharmony_ci if (U_FAILURE(*status)){ 622e5b6d6dSopenharmony_ci bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status); 632e5b6d6dSopenharmony_ci /*Report the error with display name... 
*/ 642e5b6d6dSopenharmony_ci fprintf(stderr, 652e5b6d6dSopenharmony_ci "Failed to create the collator for : \"%s\"\n", dispName); 662e5b6d6dSopenharmony_ci return false; 672e5b6d6dSopenharmony_ci } 682e5b6d6dSopenharmony_ci result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target)); 692e5b6d6dSopenharmony_ci /* result is 1, secondary differences only for ignorable space characters*/ 702e5b6d6dSopenharmony_ci if (result != UCOL_LESS) 712e5b6d6dSopenharmony_ci { 722e5b6d6dSopenharmony_ci fprintf(stderr, 732e5b6d6dSopenharmony_ci "Comparing two strings with only secondary differences in C failed.\n"); 742e5b6d6dSopenharmony_ci return false; 752e5b6d6dSopenharmony_ci } 762e5b6d6dSopenharmony_ci /* To compare them with just primary differences */ 772e5b6d6dSopenharmony_ci ucol_setStrength(myCollator, UCOL_PRIMARY); 782e5b6d6dSopenharmony_ci result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target)); 792e5b6d6dSopenharmony_ci /* result is 0 */ 802e5b6d6dSopenharmony_ci if (result != 0) 812e5b6d6dSopenharmony_ci { 822e5b6d6dSopenharmony_ci fprintf(stderr, 832e5b6d6dSopenharmony_ci "Comparing two strings with no differences in C failed.\n"); 842e5b6d6dSopenharmony_ci return false; 852e5b6d6dSopenharmony_ci } 862e5b6d6dSopenharmony_ci 872e5b6d6dSopenharmony_ci /* Now, do the same comparison with keys */ 882e5b6d6dSopenharmony_ci sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE); 892e5b6d6dSopenharmony_ci targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE); 902e5b6d6dSopenharmony_ci result = 0; 912e5b6d6dSopenharmony_ci result = strcmp(sourceKeyArray, targetKeyArray); 922e5b6d6dSopenharmony_ci if (result != 0) 932e5b6d6dSopenharmony_ci { 942e5b6d6dSopenharmony_ci fprintf(stderr, 952e5b6d6dSopenharmony_ci "Comparing two strings with sort keys in C failed.\n"); 962e5b6d6dSopenharmony_ci return false; 972e5b6d6dSopenharmony_ci } 982e5b6d6dSopenharmony_ci 
ucol_close(myCollator); 992e5b6d6dSopenharmony_ci return true; 1002e5b6d6dSopenharmony_ci} 1012e5b6d6dSopenharmony_ci``` 1022e5b6d6dSopenharmony_ci 1032e5b6d6dSopenharmony_ciIn **C++:** 1042e5b6d6dSopenharmony_ci 1052e5b6d6dSopenharmony_ci```c++ 1062e5b6d6dSopenharmony_ci#include <stdio.h> 1072e5b6d6dSopenharmony_ci#include "unicode/unistr.h" 1082e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 1092e5b6d6dSopenharmony_ci#include "unicode/locid.h" 1102e5b6d6dSopenharmony_ci#include "unicode/coll.h" 1112e5b6d6dSopenharmony_ci#include "unicode/tblcoll.h" 1122e5b6d6dSopenharmony_ci#include "unicode/coleitr.h" 1132e5b6d6dSopenharmony_ci#include "unicode/sortkey.h" 1142e5b6d6dSopenharmony_ciUBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status) 1152e5b6d6dSopenharmony_ci{ 1162e5b6d6dSopenharmony_ci UnicodeString dispName; 1172e5b6d6dSopenharmony_ci UnicodeString source("This is a test."); 1182e5b6d6dSopenharmony_ci UnicodeString target("THIS IS A TEST."); 1192e5b6d6dSopenharmony_ci Collator::EComparisonResult result = Collator::EQUAL; 1202e5b6d6dSopenharmony_ci CollationKey sourceKey; 1212e5b6d6dSopenharmony_ci CollationKey targetKey; 1222e5b6d6dSopenharmony_ci Collator *myCollator = 0; 1232e5b6d6dSopenharmony_ci if (U_FAILURE(status)) 1242e5b6d6dSopenharmony_ci { 1252e5b6d6dSopenharmony_ci return false; 1262e5b6d6dSopenharmony_ci } 1272e5b6d6dSopenharmony_ci myCollator = Collator::createInstance(locale, status); 1282e5b6d6dSopenharmony_ci if (U_FAILURE(status)){ 1292e5b6d6dSopenharmony_ci locale.getDisplayName(dispName); 1302e5b6d6dSopenharmony_ci /*Report the error with display name... 
*/ 1312e5b6d6dSopenharmony_ci fprintf(stderr, 1322e5b6d6dSopenharmony_ci "%s: Failed to create the collator for : \"%s\"\n", dispName); 1332e5b6d6dSopenharmony_ci return false; 1342e5b6d6dSopenharmony_ci } 1352e5b6d6dSopenharmony_ci result = myCollator->compare(source, target); 1362e5b6d6dSopenharmony_ci /* result is 1, secondary differences only for ignorable space characters*/ 1372e5b6d6dSopenharmony_ci if (result != UCOL_LESS) 1382e5b6d6dSopenharmony_ci { 1392e5b6d6dSopenharmony_ci fprintf(stderr, 1402e5b6d6dSopenharmony_ci "Comparing two strings with only secondary differences in C failed.\n"); 1412e5b6d6dSopenharmony_ci return false; 1422e5b6d6dSopenharmony_ci } 1432e5b6d6dSopenharmony_ci /* To compare them with just primary differences */ 1442e5b6d6dSopenharmony_ci myCollator->setStrength(Collator::PRIMARY); 1452e5b6d6dSopenharmony_ci result = myCollator->compare(source, target); 1462e5b6d6dSopenharmony_ci /* result is 0 */ 1472e5b6d6dSopenharmony_ci if (result != 0) 1482e5b6d6dSopenharmony_ci { 1492e5b6d6dSopenharmony_ci fprintf(stderr, 1502e5b6d6dSopenharmony_ci "Comparing two strings with no differences in C failed.\n"); 1512e5b6d6dSopenharmony_ci return false; 1522e5b6d6dSopenharmony_ci } 1532e5b6d6dSopenharmony_ci /* Now, do the same comparison with keys */ 1542e5b6d6dSopenharmony_ci myCollator->getCollationKey(source, sourceKey, status); 1552e5b6d6dSopenharmony_ci myCollator->getCollationKey(target, targetKey, status); 1562e5b6d6dSopenharmony_ci result = Collator::EQUAL; 1572e5b6d6dSopenharmony_ci 1582e5b6d6dSopenharmony_ci result = sourceKey.compareTo(targetKey); 1592e5b6d6dSopenharmony_ci if (result != 0) 1602e5b6d6dSopenharmony_ci { 1612e5b6d6dSopenharmony_ci fprintf(stderr, 1622e5b6d6dSopenharmony_ci "%s: Comparing two strings with sort keys in C failed.\n"); 1632e5b6d6dSopenharmony_ci return false; 1642e5b6d6dSopenharmony_ci } 1652e5b6d6dSopenharmony_ci delete myCollator; 1662e5b6d6dSopenharmony_ci return true; 1672e5b6d6dSopenharmony_ci} 
1682e5b6d6dSopenharmony_ci``` 1692e5b6d6dSopenharmony_ci 1702e5b6d6dSopenharmony_ci### Main Function 1712e5b6d6dSopenharmony_ci 1722e5b6d6dSopenharmony_ci```c++ 1732e5b6d6dSopenharmony_ciextern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status); 1742e5b6d6dSopenharmony_ciint main() 1752e5b6d6dSopenharmony_ci{ 1762e5b6d6dSopenharmony_ci UErrorCode status = U_ZERO_ERROR; 1772e5b6d6dSopenharmony_ci fprintf(stdout, "\n"); 1782e5b6d6dSopenharmony_ci if (collateWithLocaleInCPP(Locale("en", "US"), status) != true) 1792e5b6d6dSopenharmony_ci { 1802e5b6d6dSopenharmony_ci fprintf(stderr, 1812e5b6d6dSopenharmony_ci "Collate with locale in C++ failed.\n"); 1822e5b6d6dSopenharmony_ci } else 1832e5b6d6dSopenharmony_ci { 1842e5b6d6dSopenharmony_ci fprintf(stdout, "Collate with Locale C++ example worked!!\n"); 1852e5b6d6dSopenharmony_ci } 1862e5b6d6dSopenharmony_ci status = U_ZERO_ERROR; 1872e5b6d6dSopenharmony_ci fprintf(stdout, "\n"); 1882e5b6d6dSopenharmony_ci if (collateWithLocaleInC("en_US", &status) != true) 1892e5b6d6dSopenharmony_ci { 1902e5b6d6dSopenharmony_ci fprintf(stderr, 1912e5b6d6dSopenharmony_ci "%s: Collate with locale in C failed.\n"); 1922e5b6d6dSopenharmony_ci } else 1932e5b6d6dSopenharmony_ci { 1942e5b6d6dSopenharmony_ci fprintf(stdout, "Collate with Locale C example worked!!\n"); 1952e5b6d6dSopenharmony_ci } 1962e5b6d6dSopenharmony_ci return 0; 1972e5b6d6dSopenharmony_ci} 1982e5b6d6dSopenharmony_ci``` 1992e5b6d6dSopenharmony_ci 2002e5b6d6dSopenharmony_ciIn **Java:** 2012e5b6d6dSopenharmony_ci 2022e5b6d6dSopenharmony_ci```java 2032e5b6d6dSopenharmony_ciimport com.ibm.icu.text.Collator; 2042e5b6d6dSopenharmony_ciimport com.ibm.icu.text.CollationElementIterator; 2052e5b6d6dSopenharmony_ciimport com.ibm.icu.text.CollationKey; 2062e5b6d6dSopenharmony_ciimport java.util.Locale; 2072e5b6d6dSopenharmony_ci 2082e5b6d6dSopenharmony_cipublic class CollateExample 2092e5b6d6dSopenharmony_ci{ 2102e5b6d6dSopenharmony_ci 2112e5b6d6dSopenharmony_ci public 
static void main(String arg[]) 2122e5b6d6dSopenharmony_ci { 2132e5b6d6dSopenharmony_ci CollateExample example = new CollateExample(); 2142e5b6d6dSopenharmony_ci try { 2152e5b6d6dSopenharmony_ci if (!example.collateWithLocale(Locale.US)) { 2162e5b6d6dSopenharmony_ci System.err.println("Collate with locale example failed."); 2172e5b6d6dSopenharmony_ci } 2182e5b6d6dSopenharmony_ci else { 2192e5b6d6dSopenharmony_ci System.out.println("Collate with Locale example worked!!"); 2202e5b6d6dSopenharmony_ci } 2212e5b6d6dSopenharmony_ci } catch (Exception e) { 2222e5b6d6dSopenharmony_ci System.err.println("Collating with locale failed"); 2232e5b6d6dSopenharmony_ci e.printStackTrace(); 2242e5b6d6dSopenharmony_ci } 2252e5b6d6dSopenharmony_ci } 2262e5b6d6dSopenharmony_ci 2272e5b6d6dSopenharmony_ci public boolean collateWithLocale(Locale locale) throws Exception 2282e5b6d6dSopenharmony_ci { 2292e5b6d6dSopenharmony_ci String source = "This is a test."; 2302e5b6d6dSopenharmony_ci String target = "THIS IS A TEST."; 2312e5b6d6dSopenharmony_ci Collator myCollator = Collator.getInstance(locale); 2322e5b6d6dSopenharmony_ci 2332e5b6d6dSopenharmony_ci int result = myCollator.compare(source, target); 2342e5b6d6dSopenharmony_ci // result is 1, secondary differences only for ignorable space characters 2352e5b6d6dSopenharmony_ci if (result >= 0) { 2362e5b6d6dSopenharmony_ci System.err.println( 2372e5b6d6dSopenharmony_ci "Comparing two strings with only secondary differences failed."); 2382e5b6d6dSopenharmony_ci return false; 2392e5b6d6dSopenharmony_ci } 2402e5b6d6dSopenharmony_ci // To compare them with just primary differences 2412e5b6d6dSopenharmony_ci myCollator.setStrength(Collator.PRIMARY); 2422e5b6d6dSopenharmony_ci result = myCollator.compare(source, target); 2432e5b6d6dSopenharmony_ci // result is 0 2442e5b6d6dSopenharmony_ci if (result != 0) { 2452e5b6d6dSopenharmony_ci System.err.println( 2462e5b6d6dSopenharmony_ci "Comparing two strings with no differences failed."); 
2472e5b6d6dSopenharmony_ci return false; 2482e5b6d6dSopenharmony_ci } 2492e5b6d6dSopenharmony_ci // Now, do the same comparison with keys 2502e5b6d6dSopenharmony_ci CollationKey sourceKey = myCollator.getCollationKey(source); 2512e5b6d6dSopenharmony_ci CollationKey targetKey = myCollator.getCollationKey(target); 2522e5b6d6dSopenharmony_ci result = sourceKey.compareTo(targetKey); 2532e5b6d6dSopenharmony_ci if (result != 0) { 2542e5b6d6dSopenharmony_ci System.err.println("Comparing two strings with sort keys failed."); 2552e5b6d6dSopenharmony_ci return false; 2562e5b6d6dSopenharmony_ci } 2572e5b6d6dSopenharmony_ci return true; 2582e5b6d6dSopenharmony_ci } 2592e5b6d6dSopenharmony_ci} 2602e5b6d6dSopenharmony_ci``` 2612e5b6d6dSopenharmony_ci 2622e5b6d6dSopenharmony_ci## Language-sensitive searching 2632e5b6d6dSopenharmony_ci 2642e5b6d6dSopenharmony_ciString searching is a well-researched area, and there are algorithms that can 2652e5b6d6dSopenharmony_cioptimize the searching process. Perhaps the best is the Boyer-Moore method. For a 2662e5b6d6dSopenharmony_cifull description of this concept, please see Laura 2672e5b6d6dSopenharmony_ciWerner's text searching article for more details 2682e5b6d6dSopenharmony_ci(<http://icu-project.org/docs/papers/efficient_text_searching_in_java.html>). 2692e5b6d6dSopenharmony_ci 2702e5b6d6dSopenharmony_ciHowever, implementing collation-based search with the Boyer-Moore method 2712e5b6d6dSopenharmony_ciwhile getting correct results is very tricky, and ICU no longer uses this method 2722e5b6d6dSopenharmony_ci(as of ICU4C 4.0 and ICU4J 53). 2732e5b6d6dSopenharmony_ci 2742e5b6d6dSopenharmony_ciPlease see the [String Search Service](./string-search) chapter. 
2752e5b6d6dSopenharmony_ci 2762e5b6d6dSopenharmony_ci## Using large buffers to manage sort keys 2772e5b6d6dSopenharmony_ci 2782e5b6d6dSopenharmony_ciA good solution for the problem of not knowing the sort key size in advance is 2792e5b6d6dSopenharmony_cito allocate a large buffer and store all the sort keys there, while keeping a 2802e5b6d6dSopenharmony_cilist of indexes or pointers to that buffer. 2812e5b6d6dSopenharmony_ci 2822e5b6d6dSopenharmony_ciFollowing is sample code that will take a pointer to an array of UChar pointer, 2832e5b6d6dSopenharmony_cian array of key indexes. It will allocate and fill a buffer with sort keys and 2842e5b6d6dSopenharmony_cireturn the maximum size for a sort key. Once you have done this to your string, 2852e5b6d6dSopenharmony_ciyou just need to allocate a field of maximum size and copy your sortkeys from 2862e5b6d6dSopenharmony_cithe buffer to fields. 2872e5b6d6dSopenharmony_ci 2882e5b6d6dSopenharmony_ci```c++ 2892e5b6d6dSopenharmony_ciuint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys, 2902e5b6d6dSopenharmony_ci uint32_t sourceSize, uint8_t **buffer, 2912e5b6d6dSopenharmony_ci uint32_t *maxSize, UErrorCode *status) 2922e5b6d6dSopenharmony_ci{ 2932e5b6d6dSopenharmony_ci if(status == NULL || U_FAILURE(*status)) { 2942e5b6d6dSopenharmony_ci return 0; 2952e5b6d6dSopenharmony_ci } 2962e5b6d6dSopenharmony_ci 2972e5b6d6dSopenharmony_ci uint32_t bufferSize = 16384; 2982e5b6d6dSopenharmony_ci uint32_t increment = 16384; 2992e5b6d6dSopenharmony_ci uint32_t currentOffset = 0; 3002e5b6d6dSopenharmony_ci uint32_t keySize = 0; 3012e5b6d6dSopenharmony_ci uint32_t i = 0; 3022e5b6d6dSopenharmony_ci *maxSize = 0; 3032e5b6d6dSopenharmony_ci 3042e5b6d6dSopenharmony_ci *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t)); 3052e5b6d6dSopenharmony_ci if(buffer == NULL) { 3062e5b6d6dSopenharmony_ci *status = U_MEMORY_ALLOCATION_ERROR; 3072e5b6d6dSopenharmony_ci return 0; 3082e5b6d6dSopenharmony_ci } 
3092e5b6d6dSopenharmony_ci 3102e5b6d6dSopenharmony_ci for(i = 0; i < sourceSize; i++) { 3112e5b6d6dSopenharmony_ci keys[i] = currentOffset; 3122e5b6d6dSopenharmony_ci keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset); 3132e5b6d6dSopenharmony_ci if(keySize > bufferSize-currentOffset) { 3142e5b6d6dSopenharmony_ci *buffer = (uint8_t *)realloc(*buffer, bufferSize+increment); 3152e5b6d6dSopenharmony_ci if(buffer == NULL) { 3162e5b6d6dSopenharmony_ci *status = U_MEMORY_ALLOCATION_ERROR; 3172e5b6d6dSopenharmony_ci return 0; 3182e5b6d6dSopenharmony_ci } 3192e5b6d6dSopenharmony_ci bufferSize += increment; 3202e5b6d6dSopenharmony_ci keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset); 3212e5b6d6dSopenharmony_ci } 3222e5b6d6dSopenharmony_ci /* here you can hook code that does something interesting with the keySize - 3232e5b6d6dSopenharmony_ci * remembers the maximum or similar... 3242e5b6d6dSopenharmony_ci */ 3252e5b6d6dSopenharmony_ci if(keySize > *maxSize) { 3262e5b6d6dSopenharmony_ci *maxSize = keySize; 3272e5b6d6dSopenharmony_ci } 3282e5b6d6dSopenharmony_ci currentOffset += keySize; 3292e5b6d6dSopenharmony_ci } 3302e5b6d6dSopenharmony_ci 3312e5b6d6dSopenharmony_ci return currentOffset; 3322e5b6d6dSopenharmony_ci} 3332e5b6d6dSopenharmony_ci``` 334