1/***********************************************************************
2 * © 2016 and later: Unicode, Inc. and others.
3 * License & terms of use: http://www.unicode.org/copyright.html
4 ***********************************************************************
5 ***********************************************************************
6 * COPYRIGHT:
7 * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
8 *
9 ***********************************************************************/
10/********************************************************************************
11*
12* File CALLCOLL.C
13*
14* Modification History:
15*        Name                     Description
16*     Andy Heninger             First Version
17*
18*********************************************************************************
19*/
20
21//
22//  This program tests string collation and sort key generation performance.
23//      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
24//      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
25//      and include a byte order mark.  Either LE or BE format is OK.
26//
27
// Help text listing every command line option of the tool, one option per line.
// Each option named here has a matching entry in the opts[] table.
const char gUsageString[] =
    "usage:  collperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16 format file of names.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-rules file_name           Collation rules file (overrides locale)\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services.  (ICU is default)\n"
    "-unix                      Run test using Unix strxfrm, strcoll services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-usekeys                   Run tests using sortkeys rather than strcoll\n"
    "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
    "                               under test at each call point.  For measuring test overhead.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-french                    French accent ordering\n"
    "-frenchoff                 No French accent ordering (for use with French locales.)\n"
    "-norm                      Normalizing mode on\n"
    "-shifted                   Shifted mode\n"
    "-lower                     Lower case first\n"
    "-upper                     Upper case first\n"
    "-case                      Enable separate case level\n"
    "-level n                   Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
    "-keyhist                   Produce a table sort key size vs. string length\n"
    "-binsearch                 Binary Search timing test\n"
    "-keygen                    Sort Key Generation timing test\n"
    "-qsort                     Quicksort timing test\n"
    "-iter                      Iteration Performance Test\n"
    "-dump                      Display strings, sort keys and CEs.\n"
    ;
61
62
63
64#include <stdio.h>
65#include <string.h>
66#include <stdlib.h>
67#include <math.h>
68#include <locale.h>
69#include <errno.h>
70
71#include <unicode/utypes.h>
72#include <unicode/ucol.h>
73#include <unicode/ucoleitr.h>
74#include <unicode/uloc.h>
75#include <unicode/ustring.h>
76#include <unicode/ures.h>
77#include <unicode/uchar.h>
78#include <unicode/ucnv.h>
79#include <unicode/utf8.h>
80
81#ifdef WIN32
82#include <windows.h>
83#else
84//
85//  Stubs for Windows API functions when building on UNIXes.
86//
// Minimal stand-in for the Windows DWORD type, used in the stub signatures
// and for gWinLCID when not building for WIN32.
87typedef int DWORD;
// No-op replacement for the Windows CompareStringW(): ignores all arguments and
// always returns 0.  Note that doBinarySearch() treats a 0 result as an error.
88inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
89#include <sys/time.h>
// Non-Windows replacement for timeGetTime(): returns the current wall-clock
// time from gettimeofday() expressed in milliseconds.  Callers only use the
// difference between two readings, so wrap-around of the result is acceptable.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);
    // Convert to unsigned before scaling: unsigned arithmetic wraps in a
    // well-defined way, whereas overflowing the signed tv_sec would not.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
// No-op replacement for the Windows LCMapStringW(): ignores all arguments,
// writes nothing to the destination buffer and always returns 0.
97inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
// Placeholder values for the Windows constants/macros referenced by the code;
// all of them are 0 in the non-Windows build.
98const int LCMAP_SORTKEY = 0;
99#define MAKELCID(a,b) 0
100const int SORT_DEFAULT = 0;
101#endif
102
103
104
105//
106//  Command line option variables
107//     These global variables are set according to the options specified
108//     on the command line by the user.
109char * opt_fName      = 0;         // -file: name of the input file of names.
110const char * opt_locale     = "en_US";   // -locale: ICU locale name.
111int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
112char * opt_rules      = 0;         // -rules: name of a collation rules file.
113UBool  opt_help       = false;     // -help / -?
114int    opt_loopCount  = 1;         // -loop: outer loop count, scaled by each test for file size.
115int    opt_iLoopCount = 1;         // -iloop: repeat count of the call under test at each call point.
116UBool  opt_terse      = false;     // -terse: numbers-only output.
117UBool  opt_qsort      = false;     // -qsort
118UBool  opt_binsearch  = false;     // -binsearch
119UBool  opt_icu        = true;      // Run with ICU; no command line entry in opts[] sets this.
120UBool  opt_win        = false;      // Run with Windows native functions.
121UBool  opt_unix       = false;      // Run with UNIX strcoll, strxfrm functions.
122UBool  opt_uselen     = false;     // -uselen: pass explicit string lengths instead of -1.
123UBool  opt_usekeys    = false;     // -usekeys: compare sort keys rather than strings.
124UBool  opt_strcmp     = false;     // -strcmp: use u_strcmp.
125UBool  opt_strcmpCPO  = false;     // -strcmpCPO: use u_strcmpCodePointOrder.
126UBool  opt_norm       = false;     // -norm
127UBool  opt_keygen     = false;     // -keygen
128UBool  opt_french     = false;     // -french
129UBool  opt_frenchoff  = false;     // -frenchoff
130UBool  opt_shifted    = false;     // -shifted
131UBool  opt_lower      = false;     // -lower
132UBool  opt_upper      = false;     // -upper
133UBool  opt_case       = false;     // -case
134int    opt_level      = 0;         // -level: 1 to 5 per the usage text; initial value is 0.
135UBool  opt_keyhist    = false;     // -keyhist
136UBool  opt_itertest   = false;     // -iter
137UBool  opt_dump       = false;     // -dump
138
139
140
141//
142//   Definitions for the command line options
143//
// One entry of the command line option table.
//   name  - option text as typed by the user, including the leading '-'.
//   type  - FLAG takes no value; NUM and STRING consume the following argument.
//   pVar  - address of the global that receives the value.  ProcessOptions()
//           writes through it as UBool (FLAG), int (NUM) or const char * (STRING),
//           so the pointed-to variable must have the matching type.
144struct OptSpec {
145    const char *name;
146    enum {FLAG, NUM, STRING} type;
147    void *pVar;
148};
149
// Table of all recognized options.  The final entry with a null name is the
// terminator that ProcessOptions() looks for when scanning the table.
150OptSpec opts[] = {
151    {"-file",        OptSpec::STRING, &opt_fName},
152    {"-locale",      OptSpec::STRING, &opt_locale},
153    {"-langid",      OptSpec::NUM,    &opt_langid},
154    {"-rules",       OptSpec::STRING, &opt_rules},
155    {"-qsort",       OptSpec::FLAG,   &opt_qsort},
156    {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
157    {"-iter",        OptSpec::FLAG,   &opt_itertest},
158    {"-win",         OptSpec::FLAG,   &opt_win},
159    {"-unix",        OptSpec::FLAG,   &opt_unix},
160    {"-uselen",      OptSpec::FLAG,   &opt_uselen},
161    {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
162    {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
163    {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
164    {"-norm",        OptSpec::FLAG,   &opt_norm},
165    {"-french",      OptSpec::FLAG,   &opt_french},
166    {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
167    {"-shifted",     OptSpec::FLAG,   &opt_shifted},
168    {"-lower",       OptSpec::FLAG,   &opt_lower},
169    {"-upper",       OptSpec::FLAG,   &opt_upper},
170    {"-case",        OptSpec::FLAG,   &opt_case},
171    {"-level",       OptSpec::NUM,    &opt_level},
172    {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
173    {"-keygen",      OptSpec::FLAG,   &opt_keygen},
174    {"-loop",        OptSpec::NUM,    &opt_loopCount},
175    {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
176    {"-terse",       OptSpec::FLAG,   &opt_terse},
177    {"-dump",        OptSpec::FLAG,   &opt_dump},
178    {"-help",        OptSpec::FLAG,   &opt_help},
179    {"-?",           OptSpec::FLAG,   &opt_help},
180    {0, OptSpec::FLAG, 0}
181};
182
183
184//---------------------------------------------------------------------------
185//
186//  Global variables pointing to and describing the test file
187//
188//---------------------------------------------------------------------------
189
190//
191//   struct Line
192//
193//      Each line from the source file (containing a name, presumably) gets
194//      one of these structs.
195//
196struct  Line {
197    UChar     *name;          // The line's text as UChars; handed to ucol_strcoll, u_strcmp, CompareStringW.
198    int        len;           // Length passed along with name when -uselen is in effect.
199    char      *winSortKey;    // Destination of LCMapStringW(LCMAP_SORTKEY); compared with strcmp.
200    char      *icuSortKey;    // Destination of ucol_getSortKey; compared with strcmp.
201    char      *unixSortKey;   // Destination of strxfrm; compared with strcmp.
202    char      *unixName;      // char form of the name handed to strcoll / strxfrm.
203};
204
205
206
207Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
208int            gNumFileLines;        // Number of entries in gFileLines.
209UCollator     *gCol;                 // ICU collator used by every ICU-based test.
210DWORD          gWinLCID;             // Locale ID passed to CompareStringW / LCMapStringW.
211
212Line          **gSortedLines;        // Pointers to the lines; searched by doBinarySearch().
213Line          **gRandomLines;        // Pointers to the lines; copied as the input of each doQSort() pass.
214int            gCount;               // Running count of comparisons / elements, reset by each test.
215
216
217
218//---------------------------------------------------------------------------
219//
220//  ProcessOptions()    Function to read the command line options.
221//
222//---------------------------------------------------------------------------
223UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
224{
225    int         i;
226    int         argNum;
227    const char  *pArgName;
228    OptSpec    *pOpt;
229
230    for (argNum=1; argNum<argc; argNum++) {
231        pArgName = argv[argNum];
232        for (pOpt = opts;  pOpt->name != 0; pOpt++) {
233            if (strcmp(pOpt->name, pArgName) == 0) {
234                switch (pOpt->type) {
235                case OptSpec::FLAG:
236                    *(UBool *)(pOpt->pVar) = true;
237                    break;
238                case OptSpec::STRING:
239                    argNum ++;
240                    if (argNum >= argc) {
241                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
242                        return false;
243                    }
244                    *(const char **)(pOpt->pVar)  = argv[argNum];
245                    break;
246                case OptSpec::NUM:
247                    argNum ++;
248                    if (argNum >= argc) {
249                        fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
250                        return false;
251                    }
252                    char *endp;
253                    i = strtol(argv[argNum], &endp, 0);
254                    if (endp == argv[argNum]) {
255                        fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
256                        return false;
257                    }
258                    *(int *)(pOpt->pVar) = i;
259                }
260                break;
261            }
262        }
263        if (pOpt->name == 0)
264        {
265            fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
266            return false;
267        }
268    }
269return true;
270}
271
272//---------------------------------------------------------------------------------------
273//
274//   Comparison functions for use by qsort.
275//
276//       Seven flavors: ICU or Windows, each comparing sort keys, strings with
277//           explicit lengths, or null terminated strings; plus Unix strcoll.
278//
279//---------------------------------------------------------------------------------------
280int ICUstrcmpK(const void *a, const void *b) {
281    gCount++;
282    int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
283    return t;
284}
285
286
287int ICUstrcmpL(const void *a, const void *b) {
288    gCount++;
289    UCollationResult t;
290    t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
291    if (t == UCOL_LESS) return -1;
292    if (t == UCOL_GREATER) return +1;
293    return 0;
294}
295
296
297int ICUstrcmp(const void *a, const void *b) {
298    gCount++;
299    UCollationResult t;
300    t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
301    if (t == UCOL_LESS) return -1;
302    if (t == UCOL_GREATER) return +1;
303    return 0;
304}
305
306
307int Winstrcmp(const void *a, const void *b) {
308    gCount++;
309    int t;
310    t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
311    return t-2;
312}
313
314
315int UNIXstrcmp(const void *a, const void *b) {
316    gCount++;
317    int t;
318    t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
319    return t;
320}
321
322
323int WinstrcmpL(const void *a, const void *b) {
324    gCount++;
325    int t;
326    t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
327    return t-2;
328}
329
330
331int WinstrcmpK(const void *a, const void *b) {
332    gCount++;
333    int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
334    return t;
335}
336
337
338//---------------------------------------------------------------------------------------
339//
340//   Function for sorting the names (lines) into a random order.
341//      Order is based on a hash of the  ICU Sort key for the lines
342//      The randomized order is used as input for the sorting timing tests.
343//
344//---------------------------------------------------------------------------------------
345int ICURandomCmp(const void *a, const void *b) {
346    char  *ask = (*(Line **)a)->icuSortKey;
347    char  *bsk = (*(Line **)b)->icuSortKey;
348    int   aVal = 0;
349    int   bVal = 0;
350    int   retVal;
351    while (*ask != 0) {
352        aVal += aVal*37 + *ask++;
353    }
354    while (*bsk != 0) {
355        bVal += bVal*37 + *bsk++;
356    }
357    retVal = -1;
358    if (aVal == bVal) {
359        retVal = 0;
360    }
361    else if (aVal > bVal) {
362        retVal = 1;
363    }
364    return retVal;
365}
366
367//---------------------------------------------------------------------------------------
368//
369//   doKeyGen()     Key Generation Timing Test
370//
371//---------------------------------------------------------------------------------------
372void doKeyGen()
373{
374    int  line;
375    int  loops = 0;
376    int  iLoop;
377    int  len=-1;
378
379    // Adjust loop count to compensate for file size.   Should be order n
380    double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
381    int adj_loopCount = int(dLoopCount);
382    if (adj_loopCount < 1) adj_loopCount = 1;
383
384
385    unsigned long startTime = timeGetTime();
386
387    if (opt_win) {
388        for (loops=0; loops<adj_loopCount; loops++) {
389            for (line=0; line < gNumFileLines; line++) {
390                if (opt_uselen) {
391                    len = gFileLines[line].len;
392                }
393                for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
394                    LCMapStringW(gWinLCID, LCMAP_SORTKEY,
395                        gFileLines[line].name, len,
396                        (UChar *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
397                }
398            }
399        }
400    }
401    else if (opt_icu)
402    {
403        for (loops=0; loops<adj_loopCount; loops++) {
404            for (line=0; line < gNumFileLines; line++) {
405                if (opt_uselen) {
406                    len = gFileLines[line].len;
407                }
408                for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
409                    ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
410                }
411            }
412        }
413    }
414    else if (opt_unix)
415    {
416        for (loops=0; loops<adj_loopCount; loops++) {
417            for (line=0; line < gNumFileLines; line++) {
418                for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
419                    strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
420                }
421            }
422        }
423    }
424
425    unsigned long elapsedTime = timeGetTime() - startTime;
426    int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
427
428    if (opt_terse == false) {
429        printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
430        printf("Sort Key Generation:  time per key = %d ns\n", ns);
431    }
432    else {
433        printf("%d,  ", ns);
434    }
435
436    int   totalKeyLen = 0;
437    int   totalChars  = 0;
438    for (line=0; line<gNumFileLines; line++) {
439        totalChars += u_strlen(gFileLines[line].name);
440        if (opt_win) {
441            totalKeyLen += strlen(gFileLines[line].winSortKey);
442        }
443        else if (opt_icu) {
444            totalKeyLen += strlen(gFileLines[line].icuSortKey);
445        }
446        else if (opt_unix) {
447            totalKeyLen += strlen(gFileLines[line].unixSortKey);
448        }
449
450    }
451    if (opt_terse == false) {
452        printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
453    } else {
454        printf("%f, ", (float)totalKeyLen / (float)totalChars);
455    }
456}
457
458
459
460//---------------------------------------------------------------------------------------
461//
462//    doBinarySearch()    Binary Search timing test.  Each name from the list
463//                        is looked up in the full sorted list of names.
464//
465//---------------------------------------------------------------------------------------
466void doBinarySearch()
467{
468
469    gCount = 0;
470    int  line;
471    int  loops = 0;
472    int  iLoop = 0;
473    unsigned long elapsedTime = 0;
474
475    // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
476    // Accurate timings do not depend on this being perfect.  The correction is just to try to
477    //   get total running times of about the right order, so the that user doesn't need to
478    //   manually adjust the loop count for every different file size.
479    double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
480    if (opt_usekeys) dLoopCount *= 5;
481    int adj_loopCount = int(dLoopCount);
482    if (adj_loopCount < 1) adj_loopCount = 1;
483
484
485    for (;;) {  // not really a loop, just allows "break" to work, to simplify
486                //   inadvertently running more than one test through here.
487        if (opt_strcmp || opt_strcmpCPO)
488        {
489            unsigned long startTime = timeGetTime();
490            typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
491            PF pf = u_strcmp;
492            if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
493            //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
494                                                            //   which forces the use of a cast here.
495
496            int r = 0;
497            for (loops=0; loops<adj_loopCount; loops++) {
498
499                for (line=0; line < gNumFileLines; line++) {
500                    int hi      = gNumFileLines-1;
501                    int lo      = 0;
502                    int  guess = -1;
503                    for (;;) {
504                        int newGuess = (hi + lo) / 2;
505                        if (newGuess == guess)
506                            break;
507                        guess = newGuess;
508                        for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
509                            r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
510                        }
511                        gCount++;
512                        if (r== 0)
513                            break;
514                        if (r < 0)
515                            hi = guess;
516                        else
517                            lo   = guess;
518                    }
519                }
520            }
521            elapsedTime = timeGetTime() - startTime;
522            break;
523        }
524
525
526        if (opt_icu)
527        {
528            unsigned long startTime = timeGetTime();
529            UCollationResult  r = UCOL_EQUAL;
530            for (loops=0; loops<adj_loopCount; loops++) {
531
532                for (line=0; line < gNumFileLines; line++) {
533                    int lineLen  = -1;
534                    int guessLen = -1;
535                    if (opt_uselen) {
536                        lineLen = (gSortedLines[line])->len;
537                    }
538                    int hi      = gNumFileLines-1;
539                    int lo      = 0;
540                    int  guess = -1;
541                    for (;;) {
542                        int newGuess = (hi + lo) / 2;
543                        if (newGuess == guess)
544                            break;
545                        guess = newGuess;
546                        int ri = 0;
547                        if (opt_usekeys) {
548                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
549                                ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
550                            }
551                            gCount++;
552                            r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
553                        }
554                        else
555                        {
556                            if (opt_uselen) {
557                                guessLen = (gSortedLines[guess])->len;
558                            }
559                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
560                                r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
561                            }
562                            gCount++;
563                        }
564                        if (r== UCOL_EQUAL)
565                            break;
566                        if (r == UCOL_LESS)
567                            hi = guess;
568                        else
569                            lo   = guess;
570                    }
571                }
572            }
573            elapsedTime = timeGetTime() - startTime;
574            break;
575        }
576
577        if (opt_win)
578        {
579            unsigned long startTime = timeGetTime();
580            int r = 0;
581            for (loops=0; loops<adj_loopCount; loops++) {
582
583                for (line=0; line < gNumFileLines; line++) {
584                    int lineLen  = -1;
585                    int guessLen = -1;
586                    if (opt_uselen) {
587                        lineLen = (gSortedLines[line])->len;
588                    }
589                    int hi   = gNumFileLines-1;
590                    int lo   = 0;
591                    int  guess = -1;
592                    for (;;) {
593                        int newGuess = (hi + lo) / 2;
594                        if (newGuess == guess)
595                            break;
596                        guess = newGuess;
597                        if (opt_usekeys) {
598                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
599                                r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
600                            }
601                            gCount++;
602                            r+=2;
603                        }
604                        else
605                        {
606                            if (opt_uselen) {
607                                guessLen = (gSortedLines[guess])->len;
608                            }
609                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
610                                r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
611                            }
612                            if (r == 0) {
613                                if (opt_terse == false) {
614                                    fprintf(stderr, "Error returned from Windows CompareStringW.\n");
615                                }
616                                exit(-1);
617                            }
618                            gCount++;
619                        }
620                        if (r== 2)   //  strings ==
621                            break;
622                        if (r == 1)  //  line < guess
623                            hi = guess;
624                        else         //  line > guess
625                            lo   = guess;
626                    }
627                }
628            }
629            elapsedTime = timeGetTime() - startTime;
630            break;
631        }
632
633        if (opt_unix)
634        {
635            unsigned long startTime = timeGetTime();
636            int r = 0;
637            for (loops=0; loops<adj_loopCount; loops++) {
638
639                for (line=0; line < gNumFileLines; line++) {
640                    int hi   = gNumFileLines-1;
641                    int lo   = 0;
642                    int  guess = -1;
643                    for (;;) {
644                        int newGuess = (hi + lo) / 2;
645                        if (newGuess == guess)
646                            break;
647                        guess = newGuess;
648                        if (opt_usekeys) {
649                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
650                                 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
651                            }
652                            gCount++;
653                        }
654                        else
655                        {
656                            for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
657                                r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
658                            }
659                            errno = 0;
660                            if (errno != 0) {
661                                fprintf(stderr, "Error %d returned from strcoll.\n", errno);
662                                exit(-1);
663                            }
664                            gCount++;
665                        }
666                        if (r == 0)   //  strings ==
667                            break;
668                        if (r < 0)  //  line < guess
669                            hi = guess;
670                        else         //  line > guess
671                            lo   = guess;
672                    }
673                }
674            }
675            elapsedTime = timeGetTime() - startTime;
676            break;
677        }
678        break;
679    }
680
681    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
682    if (opt_terse == false) {
683        printf("binary search:  total # of string compares = %d\n", gCount);
684        printf("binary search:  compares per loop = %d\n", gCount / loops);
685        printf("binary search:  time per compare = %d ns\n", ns);
686    } else {
687        printf("%d, ", ns);
688    }
689
690}
691
692
693
694
695//---------------------------------------------------------------------------------------
696//
697//   doQSort()    The quick sort timing test.  Uses the C library qsort function.
698//
699//---------------------------------------------------------------------------------------
700void doQSort() {
701    int i;
702    Line **sortBuf = new Line *[gNumFileLines];
703
704    // Adjust loop count to compensate for file size.   QSort should be n log(n)
705    double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
706    if (opt_usekeys) dLoopCount *= 5;
707    int adj_loopCount = int(dLoopCount);
708    if (adj_loopCount < 1) adj_loopCount = 1;
709
710
711    gCount = 0;
712    unsigned long startTime = timeGetTime();
713    if (opt_win && opt_usekeys) {
714        for (i=0; i<opt_loopCount; i++) {
715            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
716            qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
717        }
718    }
719
720    else if (opt_win && opt_uselen) {
721        for (i=0; i<adj_loopCount; i++) {
722            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
723            qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
724        }
725    }
726
727
728    else if (opt_win && !opt_uselen) {
729        for (i=0; i<adj_loopCount; i++) {
730            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
731            qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
732        }
733    }
734
735    else if (opt_icu && opt_usekeys) {
736        for (i=0; i<adj_loopCount; i++) {
737            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
738            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
739        }
740    }
741
742    else if (opt_icu && opt_uselen) {
743        for (i=0; i<adj_loopCount; i++) {
744            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
745            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
746        }
747    }
748
749
750    else if (opt_icu && !opt_uselen) {
751        for (i=0; i<adj_loopCount; i++) {
752            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
753            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
754        }
755    }
756
757    else if (opt_unix && !opt_usekeys) {
758        for (i=0; i<adj_loopCount; i++) {
759            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
760            qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
761        }
762    }
763
764    unsigned long elapsedTime = timeGetTime() - startTime;
765    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
766    if (opt_terse == false) {
767        printf("qsort:  total # of string compares = %d\n", gCount);
768        printf("qsort:  time per compare = %d ns\n", ns);
769    } else {
770        printf("%d, ", ns);
771    }
772}
773
774
775
776//---------------------------------------------------------------------------------------
777//
778//    doKeyHist()       Output a table of data for
779//                        average sort key size vs. string length.
780//
781//---------------------------------------------------------------------------------------
782void doKeyHist() {
783    int     i;
784    int     maxLen = 0;
785
786    // Find the maximum string length
787    for (i=0; i<gNumFileLines; i++) {
788        if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
789    }
790
791    // Allocate arrays to hold the histogram data
792    int *accumulatedLen  = new int[maxLen+1];
793    int *numKeysOfSize   = new int[maxLen+1];
794    for (i=0; i<=maxLen; i++) {
795        accumulatedLen[i] = 0;
796        numKeysOfSize[i] = 0;
797    }
798
799    // Fill the arrays...
800    for (i=0; i<gNumFileLines; i++) {
801        int len = gFileLines[i].len;
802        accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
803        numKeysOfSize[len] += 1;
804    }
805
806    // And write out averages
807    printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
808    for (i=1; i<=maxLen; i++) {
809        if (numKeysOfSize[i] > 0) {
810            printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
811                (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
812        }
813    }
814    delete []accumulatedLen;
815    delete []numKeysOfSize ;
816}
817
818//---------------------------------------------------------------------------------------
819//
820//    doForwardIterTest(UBool)       Forward iteration test
821//                                   argument selects strings with explicit lengths (true) or null-terminated strings (false)
822//
823//---------------------------------------------------------------------------------------
824void doForwardIterTest(UBool haslen) {
825    int count = 0;
826
827    UErrorCode error = U_ZERO_ERROR;
828    printf("\n\nPerforming forward iteration performance test with ");
829
830    if (haslen) {
831        printf("non-null terminated data -----------\n");
832    }
833    else {
834        printf("null terminated data -----------\n");
835    }
836    printf("performance test on strings from file -----------\n");
837
838    UChar dummytext[] = {0, 0};
839    UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
840    ucol_setText(iter, dummytext, 1, &error);
841
842    gCount = 0;
843    unsigned long startTime = timeGetTime();
844    while (count < opt_loopCount) {
845        int linecount = 0;
846        while (linecount < gNumFileLines) {
847            UChar *str = gFileLines[linecount].name;
848            int strlen = haslen?gFileLines[linecount].len:-1;
849            ucol_setText(iter, str, strlen, &error);
850            while (ucol_next(iter, &error) != UCOL_NULLORDER) {
851                gCount++;
852            }
853
854            linecount ++;
855        }
856        count ++;
857    }
858    unsigned long elapsedTime = timeGetTime() - startTime;
859    printf("elapsedTime %ld\n", elapsedTime);
860
861    // empty loop recalculation
862    count = 0;
863    startTime = timeGetTime();
864    while (count < opt_loopCount) {
865        int linecount = 0;
866        while (linecount < gNumFileLines) {
867            UChar *str = gFileLines[linecount].name;
868            int strlen = haslen?gFileLines[linecount].len:-1;
869            ucol_setText(iter, str, strlen, &error);
870            linecount ++;
871        }
872        count ++;
873    }
874    elapsedTime -= (timeGetTime() - startTime);
875    printf("elapsedTime %ld\n", elapsedTime);
876
877    ucol_closeElements(iter);
878
879    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
880    printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
881                                                                opt_loopCount);
882    printf("Average time per ucol_next() nano seconds %d\n", ns);
883
884    printf("performance test on skipped-5 concatenated strings from file -----------\n");
885
886    UChar *str;
887    int    strlen = 0;
888    // appending all the strings
889    int linecount = 0;
890    while (linecount < gNumFileLines) {
891        strlen += haslen?gFileLines[linecount].len:
892                                      u_strlen(gFileLines[linecount].name);
893        linecount ++;
894    }
895    str = (UChar *)malloc(sizeof(UChar) * strlen);
896    int strindex = 0;
897    linecount = 0;
898    while (strindex < strlen) {
899        int len = 0;
900        len += haslen?gFileLines[linecount].len:
901                                      u_strlen(gFileLines[linecount].name);
902        memcpy(str + strindex, gFileLines[linecount].name,
903               sizeof(UChar) * len);
904        strindex += len;
905        linecount ++;
906    }
907
908    printf("Total size of strings %d\n", strlen);
909
910    gCount = 0;
911    count  = 0;
912
913    if (!haslen) {
914        strlen = -1;
915    }
916    iter = ucol_openElements(gCol, str, strlen, &error);
917    if (!haslen) {
918        strlen = u_strlen(str);
919    }
920    strlen -= 5; // any left over characters are not iterated,
921                 // this is to ensure the backwards and forwards iterators
922                 // gets the same position
923    startTime = timeGetTime();
924    while (count < opt_loopCount) {
925        int count5 = 5;
926        strindex = 0;
927        ucol_setOffset(iter, strindex, &error);
928        while (true) {
929            if (ucol_next(iter, &error) == UCOL_NULLORDER) {
930                break;
931            }
932            gCount++;
933            count5 --;
934            if (count5 == 0) {
935                strindex += 10;
936                if (strindex > strlen) {
937                    break;
938                }
939                ucol_setOffset(iter, strindex, &error);
940                count5 = 5;
941            }
942        }
943        count ++;
944    }
945
946    elapsedTime = timeGetTime() - startTime;
947    printf("elapsedTime %ld\n", elapsedTime);
948
949    // empty loop recalculation
950    int tempgCount = 0;
951    count = 0;
952    startTime = timeGetTime();
953    while (count < opt_loopCount) {
954        int count5 = 5;
955        strindex = 0;
956        ucol_setOffset(iter, strindex, &error);
957        while (true) {
958            tempgCount ++;
959            count5 --;
960            if (count5 == 0) {
961                strindex += 10;
962                if (strindex > strlen) {
963                    break;
964                }
965                ucol_setOffset(iter, strindex, &error);
966                count5 = 5;
967            }
968        }
969        count ++;
970    }
971    elapsedTime -= (timeGetTime() - startTime);
972    printf("elapsedTime %ld\n", elapsedTime);
973
974    ucol_closeElements(iter);
975
976    printf("gCount %d\n", gCount);
977    ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
978    printf("Average time per ucol_next() nano seconds %d\n", ns);
979}
980
981//---------------------------------------------------------------------------------------
982//
//    doBackwardIterTest(UBool)      Backwards iteration test
//                                   argument: true  - pass explicit string lengths to ICU
//                                             false - pass null-terminated strings (length -1)
985//
986//---------------------------------------------------------------------------------------
987void doBackwardIterTest(UBool haslen) {
988    int count = 0;
989    UErrorCode error = U_ZERO_ERROR;
990    printf("\n\nPerforming backward iteration performance test with ");
991
992    if (haslen) {
993        printf("non-null terminated data -----------\n");
994    }
995    else {
996        printf("null terminated data -----------\n");
997    }
998
999    printf("performance test on strings from file -----------\n");
1000
1001    UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
1002    UChar dummytext[] = {0, 0};
1003    ucol_setText(iter, dummytext, 1, &error);
1004
1005    gCount = 0;
1006    unsigned long startTime = timeGetTime();
1007    while (count < opt_loopCount) {
1008        int linecount = 0;
1009        while (linecount < gNumFileLines) {
1010            UChar *str = gFileLines[linecount].name;
1011            int strlen = haslen?gFileLines[linecount].len:-1;
1012            ucol_setText(iter, str, strlen, &error);
1013            while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1014                gCount ++;
1015            }
1016
1017            linecount ++;
1018        }
1019        count ++;
1020    }
1021    unsigned long elapsedTime = timeGetTime() - startTime;
1022
1023    printf("elapsedTime %ld\n", elapsedTime);
1024
1025    // empty loop recalculation
1026    count = 0;
1027    startTime = timeGetTime();
1028    while (count < opt_loopCount) {
1029        int linecount = 0;
1030        while (linecount < gNumFileLines) {
1031            UChar *str = gFileLines[linecount].name;
1032            int strlen = haslen?gFileLines[linecount].len:-1;
1033            ucol_setText(iter, str, strlen, &error);
1034            linecount ++;
1035        }
1036        count ++;
1037    }
1038    elapsedTime -= (timeGetTime() - startTime);
1039
1040    printf("elapsedTime %ld\n", elapsedTime);
1041    ucol_closeElements(iter);
1042
1043    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1044    printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1045                                                                opt_loopCount);
1046    printf("Average time per ucol_previous() nano seconds %d\n", ns);
1047
1048    printf("performance test on skipped-5 concatenated strings from file -----------\n");
1049
1050    UChar *str;
1051    int    strlen = 0;
1052    // appending all the strings
1053    int linecount = 0;
1054    while (linecount < gNumFileLines) {
1055        strlen += haslen?gFileLines[linecount].len:
1056                                      u_strlen(gFileLines[linecount].name);
1057        linecount ++;
1058    }
1059    str = (UChar *)malloc(sizeof(UChar) * strlen);
1060    int strindex = 0;
1061    linecount = 0;
1062    while (strindex < strlen) {
1063        int len = 0;
1064        len += haslen?gFileLines[linecount].len:
1065                                      u_strlen(gFileLines[linecount].name);
1066        memcpy(str + strindex, gFileLines[linecount].name,
1067               sizeof(UChar) * len);
1068        strindex += len;
1069        linecount ++;
1070    }
1071
1072    printf("Total size of strings %d\n", strlen);
1073
1074    gCount = 0;
1075    count  = 0;
1076
1077    if (!haslen) {
1078        strlen = -1;
1079    }
1080
1081    iter = ucol_openElements(gCol, str, strlen, &error);
1082    if (!haslen) {
1083        strlen = u_strlen(str);
1084    }
1085
1086    startTime = timeGetTime();
1087    while (count < opt_loopCount) {
1088        int count5 = 5;
1089        strindex = 5;
1090        ucol_setOffset(iter, strindex, &error);
1091        while (true) {
1092            if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1093                break;
1094            }
1095             gCount ++;
1096             count5 --;
1097             if (count5 == 0) {
1098                 strindex += 10;
1099                 if (strindex > strlen) {
1100                    break;
1101                 }
1102                 ucol_setOffset(iter, strindex, &error);
1103                 count5 = 5;
1104             }
1105        }
1106        count ++;
1107    }
1108
1109    elapsedTime = timeGetTime() - startTime;
1110    printf("elapsedTime %ld\n", elapsedTime);
1111
1112    // empty loop recalculation
1113    count = 0;
1114    int tempgCount = 0;
1115    startTime = timeGetTime();
1116    while (count < opt_loopCount) {
1117        int count5 = 5;
1118        strindex = 5;
1119        ucol_setOffset(iter, strindex, &error);
1120        while (true) {
1121             tempgCount ++;
1122             count5 --;
1123             if (count5 == 0) {
1124                 strindex += 10;
1125                 if (strindex > strlen) {
1126                    break;
1127                 }
1128                 ucol_setOffset(iter, strindex, &error);
1129                 count5 = 5;
1130             }
1131        }
1132        count ++;
1133    }
1134    elapsedTime -= (timeGetTime() - startTime);
1135    printf("elapsedTime %ld\n", elapsedTime);
1136    ucol_closeElements(iter);
1137
1138    printf("gCount %d\n", gCount);
1139    ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1140    printf("Average time per ucol_previous() nano seconds %d\n", ns);
1141}
1142
1143//---------------------------------------------------------------------------------------
1144//
1145//    doIterTest()       Iteration test
1146//
1147//---------------------------------------------------------------------------------------
1148void doIterTest() {
1149    doForwardIterTest(opt_uselen);
1150    doBackwardIterTest(opt_uselen);
1151}
1152
1153
1154//----------------------------------------------------------------------------------------
1155//
1156//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
1157//                    Since it appears that Unicode support is going in the general
1158//                    direction of the use of UTF-8 locales, that is the approach
1159//                    that is used here.
1160//
1161//----------------------------------------------------------------------------------------
1162void  UnixConvert() {
1163    int    line;
1164
1165    UConverter   *cvrtr;    // An ICU code page converter.
1166    UErrorCode    status = U_ZERO_ERROR;
1167
1168
1169    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
1170    if (U_FAILURE(status)) {
1171        fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1172        exit(-1);
1173    }
1174
1175    for (line=0; line < gNumFileLines; line++) {
1176        int sizeNeeded = ucnv_fromUChars(cvrtr,
1177                                         0,            // ptr to target buffer.
1178                                         0,            // length of target buffer.
1179                                         gFileLines[line].name,
1180                                         -1,           //  source is null terminated
1181                                         &status);
1182        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1183            //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1184            //exit(-1);
1185        }
1186        status = U_ZERO_ERROR;
1187        gFileLines[line].unixName = new char[sizeNeeded+1];
1188        sizeNeeded = ucnv_fromUChars(cvrtr,
1189                                         gFileLines[line].unixName, // ptr to target buffer.
1190                                         sizeNeeded+1, // length of target buffer.
1191                                         gFileLines[line].name,
1192                                         -1,           //  source is null terminated
1193                                         &status);
1194        if (U_FAILURE(status)) {
1195            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1196            exit(-1);
1197        }
1198        gFileLines[line].unixName[sizeNeeded] = 0;
1199    };
1200    ucnv_close(cvrtr);
1201}
1202
1203
1204//----------------------------------------------------------------------------------------
1205//
1206//  class UCharFile   Class to hide all the gorp to read a file in
1207//                    and produce a stream of UChars.
1208//
1209//----------------------------------------------------------------------------------------
1210class UCharFile {
1211public:
1212    UCharFile(const char *fileName);
1213    ~UCharFile();
1214    UChar   get();
1215    UBool   eof() {return fEof;};
1216    UBool   error() {return fError;};
1217
1218private:
1219    UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
1220    UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
1221
1222    FILE         *fFile;
1223    const char   *fName;
1224    UBool        fEof;
1225    UBool        fError;
1226    UChar        fPending2ndSurrogate;
1227
1228    enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1229};
1230
1231UCharFile::UCharFile(const char * fileName) {
1232    fEof                 = false;
1233    fError               = false;
1234    fName                = fileName;
1235    fFile                = fopen(fName, "rb");
1236    fPending2ndSurrogate = 0;
1237    if (fFile == NULL) {
1238        fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1239        fError = true;
1240        return;
1241    }
1242    //
1243    //  Look for the byte order mark at the start of the file.
1244    //
1245    int BOMC1, BOMC2, BOMC3;
1246    BOMC1 = fgetc(fFile);
1247    BOMC2 = fgetc(fFile);
1248
1249    if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1250        fEncoding = UTF16LE; }
1251    else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1252        fEncoding = UTF16BE; }
1253    else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1254        fEncoding = UTF8; }
1255    else
1256    {
1257        fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
1258            "must include a BOM.\n", fileName);
1259        fError = true;
1260        return;
1261    }
1262}
1263
1264
1265UCharFile::~UCharFile() {
1266    fclose(fFile);
1267}
1268
1269
1270
1271UChar UCharFile::get() {
1272    UChar   c;
1273    switch (fEncoding) {
1274    case UTF16LE:
1275        {
1276            int  cL, cH;
1277            cL = fgetc(fFile);
1278            cH = fgetc(fFile);
1279            c  = cL  | (cH << 8);
1280            if (cH == EOF) {
1281                c   = 0;
1282                fEof = true;
1283            }
1284            break;
1285        }
1286    case UTF16BE:
1287        {
1288            int  cL, cH;
1289            cH = fgetc(fFile);
1290            cL = fgetc(fFile);
1291            c  = cL  | (cH << 8);
1292            if (cL == EOF) {
1293                c   = 0;
1294                fEof = true;
1295            }
1296            break;
1297        }
1298    case UTF8:
1299        {
1300            if (fPending2ndSurrogate != 0) {
1301                c = fPending2ndSurrogate;
1302                fPending2ndSurrogate = 0;
1303                break;
1304            }
1305
1306            int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
1307            if (ch == EOF) {
1308                c = 0;
1309                fEof = true;
1310                break;
1311            }
1312
1313            if (ch <= 0x7f) {
1314                // It's ascii.  No further utf-8 conversion.
1315                c = ch;
1316                break;
1317            }
1318
1319            // Figure out the length of the char and read the rest of the bytes
1320            //   into a temp array.
1321            int nBytes;
1322            if (ch >= 0xF0) {nBytes=4;}
1323            else if (ch >= 0xE0) {nBytes=3;}
1324            else if (ch >= 0xC0) {nBytes=2;}
1325            else {
1326                fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1327                fError = true;
1328                return 0;
1329            }
1330
1331            unsigned char  bytes[10];
1332            bytes[0] = (unsigned char)ch;
1333            int i;
1334            for (i=1; i<nBytes; i++) {
1335                bytes[i] = fgetc(fFile);
1336                if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1337                    fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1338                    fError = true;
1339                    return 0;
1340                }
1341            }
1342
1343            // Convert the bytes from the temp array to a Unicode char.
1344            i = 0;
1345            uint32_t  cp;
1346            U8_NEXT_UNSAFE(bytes, i, cp);
1347            c = (UChar)cp;
1348
1349            if (cp >= 0x10000) {
1350                // The code point needs to be broken up into a utf-16 surrogate pair.
1351                //  Process first half this time through the main loop, and
1352                //   remember the other half for the next time through.
1353                UChar utf16Buf[3];
1354                i = 0;
1355                UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1356                fPending2ndSurrogate = utf16Buf[1];
1357                c = utf16Buf[0];
1358            }
1359            break;
1360        };
1361    default:
1362        c = 0xFFFD; /* Error, unspecified codepage*/
1363        fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1364        exit(1);
1365    }
1366    return c;
1367}
1368
1369//----------------------------------------------------------------------------------------
1370//
1371//   openRulesCollator  - Command line specified a rules file.  Read it in
1372//                        and open a collator with it.
1373//
1374//----------------------------------------------------------------------------------------
1375UCollator *openRulesCollator() {
1376    UCharFile f(opt_rules);
1377    if (f.error()) {
1378        return 0;
1379    }
1380
1381    int  bufLen = 10000;
1382    UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1383    UChar *tmp;
1384    int i = 0;
1385
1386    for(;;) {
1387        buf[i] = f.get();
1388        if (f.eof()) {
1389            break;
1390        }
1391        if (f.error()) {
1392            return 0;
1393        }
1394        i++;
1395        if (i >= bufLen) {
1396            tmp = buf;
1397            bufLen += 10000;
1398            buf = (UChar *)realloc(buf, bufLen);
1399            if (buf == NULL) {
1400                free(tmp);
1401                return 0;
1402            }
1403        }
1404    }
1405    buf[i] = 0;
1406
1407    UErrorCode    status = U_ZERO_ERROR;
1408    UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1409                                         UCOL_DEFAULT_STRENGTH, NULL, &status);
1410    if (U_FAILURE(status)) {
1411        fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1412        return 0;
1413    }
1414    free(buf);
1415    return coll;
1416}
1417
1418
1419
1420
1421
1422//----------------------------------------------------------------------------------------
1423//
1424//    Main   --  process command line, read in and pre-process the test file,
1425//                 call other functions to do the actual tests.
1426//
1427//----------------------------------------------------------------------------------------
1428int main(int argc, const char** argv) {
1429    if (ProcessOptions(argc, argv, opts) != true || opt_help || opt_fName == 0) {
1430        printf(gUsageString);
1431        exit (1);
1432    }
1433
1434    // Make sure that we've only got one API selected.
1435    if (opt_unix || opt_win) opt_icu = false;
1436    if (opt_unix) opt_win = false;
1437
1438    //
1439    //  Set up an ICU collator
1440    //
1441    UErrorCode          status = U_ZERO_ERROR;
1442
1443    if (opt_rules != 0) {
1444        gCol = openRulesCollator();
1445        if (gCol == 0) {return -1;}
1446    }
1447    else {
1448        gCol = ucol_open(opt_locale, &status);
1449        if (U_FAILURE(status)) {
1450            fprintf(stderr, "Collator creation failed.: %d\n", status);
1451            return -1;
1452        }
1453    }
1454    if (status==U_USING_DEFAULT_WARNING && opt_terse==false) {
1455        fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1456    }
1457    if (status==U_USING_FALLBACK_WARNING && opt_terse==false) {
1458        fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1459    }
1460
1461    if (opt_norm) {
1462        ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1463    }
1464    if (opt_french && opt_frenchoff) {
1465        fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
1466        exit(-1);
1467    }
1468    if (opt_french) {
1469        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1470    }
1471    if (opt_frenchoff) {
1472        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1473    }
1474    if (opt_lower) {
1475        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1476    }
1477    if (opt_upper) {
1478        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1479    }
1480    if (opt_case) {
1481        ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1482    }
1483    if (opt_shifted) {
1484        ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1485    }
1486    if (opt_level != 0) {
1487        switch (opt_level) {
1488        case 1:
1489            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1490            break;
1491        case 2:
1492            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1493            break;
1494        case 3:
1495            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1496            break;
1497        case 4:
1498            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1499            break;
1500        case 5:
1501            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1502            break;
1503        default:
1504            fprintf(stderr, "-level param must be between 1 and 5\n");
1505            exit(-1);
1506        }
1507    }
1508
1509    if (U_FAILURE(status)) {
1510        fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1511        return -1;
1512    }
1513
1514
1515    //
1516    //  Set up a Windows LCID
1517    //
1518    if (opt_langid != 0) {
1519        gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1520    }
1521    else {
1522        gWinLCID = uloc_getLCID(opt_locale);
1523    }
1524
1525
1526    //
1527    //  Set the UNIX locale
1528    //
1529    if (opt_unix) {
1530        if (setlocale(LC_ALL, opt_locale) == 0) {
1531            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1532            exit(-1);
1533        }
1534    }
1535
1536    // Read in  the input file.
1537    //   File assumed to be utf-16.
1538    //   Lines go onto heap buffers.  Global index array to line starts is created.
1539    //   Lines themselves are null terminated.
1540    //
1541
1542    UCharFile f(opt_fName);
1543    if (f.error()) {
1544        exit(-1);
1545    }
1546
1547    const int MAXLINES = 100000;
1548    gFileLines = new Line[MAXLINES];
1549    UChar buf[1024];
1550    int   column = 0;
1551
1552    //  Read the file, split into lines, and save in memory.
1553    //  Loop runs once per utf-16 value from the input file,
1554    //    (The number of bytes read from file per loop iteration depends on external encoding.)
1555    for (;;) {
1556
1557        UChar c = f.get();
1558        if (f.error()){
1559            exit(-1);
1560        }
1561
1562
1563        // We now have a good UTF-16 value in c.
1564
1565        // Watch for CR, LF, EOF; these finish off a line.
1566        if (c == 0xd) {
1567            continue;
1568        }
1569
1570        if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
1571            buf[column++] = 0;
1572            if (column > 1) {
1573                gFileLines[gNumFileLines].name  = new UChar[column];
1574                gFileLines[gNumFileLines].len   = column-1;
1575                memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1576                gNumFileLines++;
1577                column = 0;
1578                if (gNumFileLines >= MAXLINES) {
1579                    fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
1580                    exit(-1);
1581                }
1582
1583            }
1584            if (c == 0xa || c == 0x2028)
1585                continue;
1586            else
1587                break;  // EOF
1588        }
1589        buf[column++] = c;
1590        if (column >= 1023)
1591        {
1592            static UBool warnFlag = true;
1593            if (warnFlag) {
1594                fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1595                warnFlag = false;
1596            }
1597            column--;
1598        }
1599    }
1600
1601    if (opt_terse == false) {
1602        printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1603    }
1604
1605
1606    // Convert the lines to the UNIX encoding.
1607    if (opt_unix) {
1608        UnixConvert();
1609    }
1610
1611    //
1612    //  Pre-compute ICU sort keys for the lines of the file.
1613    //
1614    int line;
1615    int32_t t;
1616
1617    for (line=0; line<gNumFileLines; line++) {
1618         t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1619         gFileLines[line].icuSortKey  = new char[t];
1620
1621         if (t > (int32_t)sizeof(buf)) {
1622             t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1623         }
1624         else
1625         {
1626             memcpy(gFileLines[line].icuSortKey, buf, t);
1627         }
1628    }
1629
1630
1631
1632    //
1633    //  Pre-compute Windows sort keys for the lines of the file.
1634    //
1635    for (line=0; line<gNumFileLines; line++) {
1636         t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1637         gFileLines[line].winSortKey  = new char[t];
1638         if (t > (int32_t)sizeof(buf)) {
1639             t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (UChar *)(gFileLines[line].winSortKey), t);
1640         }
1641         else
1642         {
1643             memcpy(gFileLines[line].winSortKey, buf, t);
1644         }
1645    }
1646
1647    //
1648    //  Pre-compute UNIX sort keys for the lines of the file.
1649    //
1650    if (opt_unix) {
1651        for (line=0; line<gNumFileLines; line++) {
1652            t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
1653            gFileLines[line].unixSortKey  = new char[t];
1654            if (t > (int32_t)sizeof(buf)) {
1655                t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
1656            }
1657            else
1658            {
1659                memcpy(gFileLines[line].unixSortKey, buf, t);
1660            }
1661        }
1662    }
1663
1664
1665    //
1666    //  Dump file lines, CEs, Sort Keys if requested.
1667    //
1668    if (opt_dump) {
1669        int  i;
1670        for (line=0; line<gNumFileLines; line++) {
1671            for (i=0;;i++) {
1672                UChar  c = gFileLines[line].name[i];
1673                if (c == 0)
1674                    break;
1675                if (c < 0x20 || c > 0x7e) {
1676                    printf("\\u%.4x", c);
1677                }
1678                else {
1679                    printf("%c", c);
1680                }
1681            }
1682            printf("\n");
1683
1684            printf("   CEs: ");
1685            UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1686            int32_t ce;
1687            i = 0;
1688            for (;;) {
1689                ce = ucol_next(CEiter, &status);
1690                if (ce == UCOL_NULLORDER) {
1691                    break;
1692                }
1693                printf(" %.8x", ce);
1694                if (++i > 8) {
1695                    printf("\n        ");
1696                    i = 0;
1697                }
1698            }
1699            printf("\n");
1700            ucol_closeElements(CEiter);
1701
1702
1703            printf("   ICU Sort Key: ");
1704            for (i=0; ; i++) {
1705                unsigned char c = gFileLines[line].icuSortKey[i];
1706                printf("%02x ", c);
1707                if (c == 0) {
1708                    break;
1709                }
1710                if (i > 0 && i % 20 == 0) {
1711                    printf("\n                 ");
1712                }
1713           }
1714            printf("\n");
1715        }
1716    }
1717
1718
1719    //
1720    //  Pre-sort the lines.
1721    //
1722    int i;
1723    gSortedLines = new Line *[gNumFileLines];
1724    for (i=0; i<gNumFileLines; i++) {
1725        gSortedLines[i] = &gFileLines[i];
1726    }
1727
1728    if (opt_win) {
1729        qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1730    }
1731    else if (opt_unix) {
1732        qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1733    }
1734    else   /* ICU */
1735    {
1736        qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1737    }
1738
1739
1740    //
1741    //  Make up a randomized order, will be used for sorting tests.
1742    //
1743    gRandomLines = new Line *[gNumFileLines];
1744    for (i=0; i<gNumFileLines; i++) {
1745        gRandomLines[i] = &gFileLines[i];
1746    }
1747    qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1748
1749
1750
1751
1752    //
1753    //  We've got the file read into memory.  Go do something with it.
1754    //
1755
1756    if (opt_qsort)     doQSort();
1757    if (opt_binsearch) doBinarySearch();
1758    if (opt_keygen)    doKeyGen();
1759    if (opt_keyhist)   doKeyHist();
1760    if (opt_itertest)  doIterTest();
1761
1762    return 0;
1763
1764}
1765