/***********************************************************************
 * © 2016 and later: Unicode, Inc. and others.
 * License & terms of use: http://www.unicode.org/copyright.html
 ***********************************************************************
 ***********************************************************************
 * COPYRIGHT:
 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
 *
 ***********************************************************************/
/********************************************************************************
*
* File CALLCOLL.C
*
* Modification History:
* Name Description
* Andy Heninger First Version
*
*********************************************************************************
*/

//
// This program tests string collation and sort key generation performance.
// Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
// A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
// and include a byte order mark. Either LE or BE format is OK.
//

// Help text listing every command line option understood by the program.
const char gUsageString[] =
    "usage: collperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16 format file of names.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-rules file_name Collation rules file (overrides locale)\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (ICU is default)\n"
    "-unix Run test using Unix strxfrm, strcoll services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-usekeys Run tests using sortkeys rather than strcoll\n"
    "-strcmp Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
    " under test at each call point. For measuring test overhead.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-french French accent ordering\n"
    "-frenchoff No French accent ordering (for use with French locales.)\n"
    "-norm Normalizing mode on\n"
    "-shifted Shifted mode\n"
    "-lower Lower case first\n"
    "-upper Upper case first\n"
    "-case Enable separate case level\n"
    "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
    "-keyhist Produce a table sort key size vs. string length\n"
    "-binsearch Binary Search timing test\n"
    "-keygen Sort Key Generation timing test\n"
    "-qsort Quicksort timing test\n"
    "-iter Iteration Performance Test\n"
    "-dump Display strings, sort keys and CEs.\n"
    ;



#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <locale.h>
#include <errno.h>

#include <unicode/utypes.h>
#include <unicode/ucol.h>
#include <unicode/ucoleitr.h>
#include <unicode/uloc.h>
#include <unicode/ustring.h>
#include <unicode/ures.h>
#include <unicode/uchar.h>
#include <unicode/ucnv.h>
#include <unicode/utf8.h>

#ifdef WIN32
#include <windows.h>
#else
//
// Stubs for Windows API functions when building on UNIXes.
// The stubs only let the Windows code paths compile; the comparison and
// sort key stubs do no work and always return 0.
//
typedef int DWORD;
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
#include <sys/time.h>
// Replacement for the Windows millisecond timer, built on gettimeofday().
// Only differences between two return values are used by the tests in this file.
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.
    val += t.tv_usec / 1000;
    return val;
}
inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
const int LCMAP_SORTKEY = 0;
#define MAKELCID(a,b) 0
const int SORT_DEFAULT = 0;
#endif



//
// Command line option variables
// These global variables are set according to the options specified
// on the command line by the user.
// Each one is reachable through the opts[] table below, which is how
// ProcessOptions() stores parsed values into them.
char * opt_fName = 0;
const char * opt_locale = "en_US";
int opt_langid = 0; // Defaults to value corresponding to opt_locale.
char * opt_rules = 0;
UBool opt_help = false;
int opt_loopCount = 1;
int opt_iLoopCount = 1;
UBool opt_terse = false;
UBool opt_qsort = false;
UBool opt_binsearch = false;
UBool opt_icu = true;
UBool opt_win = false; // Run with Windows native functions.
UBool opt_unix = false; // Run with UNIX strcoll, strxfrm functions.
UBool opt_uselen = false;
UBool opt_usekeys = false;
UBool opt_strcmp = false;
UBool opt_strcmpCPO = false;
UBool opt_norm = false;
UBool opt_keygen = false;
UBool opt_french = false;
UBool opt_frenchoff = false;
UBool opt_shifted = false;
UBool opt_lower = false;
UBool opt_upper = false;
UBool opt_case = false;
int opt_level = 0;
UBool opt_keyhist = false;
UBool opt_itertest = false;
UBool opt_dump = false;



//
// Definitions for the command line options
//
// One OptSpec describes one option: its spelling, what kind of value it
// carries, and the address of the variable that receives the value.
//
struct OptSpec {
    const char *name;                   // Option text as typed, e.g. "-file".
    enum {FLAG, NUM, STRING} type;      // FLAG takes no value; NUM and STRING consume the next argument.
    void *pVar;                         // Points at a UBool (FLAG), int (NUM) or char pointer (STRING).
};

// Table of all recognized options. The entry with a null name ends the table.
OptSpec opts[] = {
    {"-file", OptSpec::STRING, &opt_fName},
    {"-locale", OptSpec::STRING, &opt_locale},
    {"-langid", OptSpec::NUM, &opt_langid},
    {"-rules", OptSpec::STRING, &opt_rules},
    {"-qsort", OptSpec::FLAG, &opt_qsort},
    {"-binsearch", OptSpec::FLAG, &opt_binsearch},
    {"-iter", OptSpec::FLAG, &opt_itertest},
    {"-win", OptSpec::FLAG, &opt_win},
    {"-unix", OptSpec::FLAG, &opt_unix},
    {"-uselen", OptSpec::FLAG, &opt_uselen},
    {"-usekeys", OptSpec::FLAG, &opt_usekeys},
    {"-strcmp", OptSpec::FLAG, &opt_strcmp},
    {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
    {"-norm", OptSpec::FLAG, &opt_norm},
    {"-french", OptSpec::FLAG, &opt_french},
    {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
    {"-shifted", OptSpec::FLAG, &opt_shifted},
    {"-lower", OptSpec::FLAG, &opt_lower},
    {"-upper", OptSpec::FLAG, &opt_upper},
    {"-case", OptSpec::FLAG, &opt_case},
    {"-level", OptSpec::NUM, &opt_level},
    {"-keyhist", OptSpec::FLAG, &opt_keyhist},
    {"-keygen", OptSpec::FLAG, &opt_keygen},
    {"-loop", OptSpec::NUM, &opt_loopCount},
    {"-iloop", OptSpec::NUM, &opt_iLoopCount},
    {"-terse", OptSpec::FLAG, &opt_terse},
    {"-dump", OptSpec::FLAG, &opt_dump},
    {"-help", OptSpec::FLAG, &opt_help},
    {"-?", OptSpec::FLAG, &opt_help},
    {0, OptSpec::FLAG, 0}
};


//---------------------------------------------------------------------------
//
// Global variables pointing to and describing the test file
//
//---------------------------------------------------------------------------

//
// struct Line
//
// Each line from the source file (containing a name, presumably) gets
// one of these structs.
//
struct Line {
    UChar *name;            // The line's text as UChars.
    int len;                // Length passed to the length-taking APIs when -uselen is given.
    char *winSortKey;       // Buffer written by LCMapStringW in doKeyGen().
    char *icuSortKey;       // Buffer written by ucol_getSortKey in doKeyGen().
    char *unixSortKey;      // Buffer written by strxfrm in doKeyGen().
    char *unixName;         // UTF-8 copy of name, created by UnixConvert().
};



Line *gFileLines; // Ptr to array of Line structs, one per line in the file.
int gNumFileLines;          // Number of entries in gFileLines.
UCollator *gCol;            // Collator used by every ICU call in the tests.
DWORD gWinLCID;             // Locale ID passed to the Windows comparison and sort key calls.

Line **gSortedLines;        // Array of Line pointers that doBinarySearch() searches.
Line **gRandomLines;        // Array of Line pointers that doQSort() copies and sorts.
int gCount;                 // Running count of comparisons / iteration steps, reset by each test.



//---------------------------------------------------------------------------
//
// ProcessOptions() Function to read the command line options.
221// 222//--------------------------------------------------------------------------- 223UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) 224{ 225 int i; 226 int argNum; 227 const char *pArgName; 228 OptSpec *pOpt; 229 230 for (argNum=1; argNum<argc; argNum++) { 231 pArgName = argv[argNum]; 232 for (pOpt = opts; pOpt->name != 0; pOpt++) { 233 if (strcmp(pOpt->name, pArgName) == 0) { 234 switch (pOpt->type) { 235 case OptSpec::FLAG: 236 *(UBool *)(pOpt->pVar) = true; 237 break; 238 case OptSpec::STRING: 239 argNum ++; 240 if (argNum >= argc) { 241 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 242 return false; 243 } 244 *(const char **)(pOpt->pVar) = argv[argNum]; 245 break; 246 case OptSpec::NUM: 247 argNum ++; 248 if (argNum >= argc) { 249 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); 250 return false; 251 } 252 char *endp; 253 i = strtol(argv[argNum], &endp, 0); 254 if (endp == argv[argNum]) { 255 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); 256 return false; 257 } 258 *(int *)(pOpt->pVar) = i; 259 } 260 break; 261 } 262 } 263 if (pOpt->name == 0) 264 { 265 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); 266 return false; 267 } 268 } 269return true; 270} 271 272//--------------------------------------------------------------------------------------- 273// 274// Comparison functions for use by qsort. 275// 276// Six flavors, ICU or Windows, SortKey or String Compare, Strings with length 277// or null terminated. 
278// 279//--------------------------------------------------------------------------------------- 280int ICUstrcmpK(const void *a, const void *b) { 281 gCount++; 282 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); 283 return t; 284} 285 286 287int ICUstrcmpL(const void *a, const void *b) { 288 gCount++; 289 UCollationResult t; 290 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 291 if (t == UCOL_LESS) return -1; 292 if (t == UCOL_GREATER) return +1; 293 return 0; 294} 295 296 297int ICUstrcmp(const void *a, const void *b) { 298 gCount++; 299 UCollationResult t; 300 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 301 if (t == UCOL_LESS) return -1; 302 if (t == UCOL_GREATER) return +1; 303 return 0; 304} 305 306 307int Winstrcmp(const void *a, const void *b) { 308 gCount++; 309 int t; 310 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); 311 return t-2; 312} 313 314 315int UNIXstrcmp(const void *a, const void *b) { 316 gCount++; 317 int t; 318 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); 319 return t; 320} 321 322 323int WinstrcmpL(const void *a, const void *b) { 324 gCount++; 325 int t; 326 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); 327 return t-2; 328} 329 330 331int WinstrcmpK(const void *a, const void *b) { 332 gCount++; 333 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); 334 return t; 335} 336 337 338//--------------------------------------------------------------------------------------- 339// 340// Function for sorting the names (lines) into a random order. 341// Order is based on a hash of the ICU Sort key for the lines 342// The randomized order is used as input for the sorting timing tests. 
343// 344//--------------------------------------------------------------------------------------- 345int ICURandomCmp(const void *a, const void *b) { 346 char *ask = (*(Line **)a)->icuSortKey; 347 char *bsk = (*(Line **)b)->icuSortKey; 348 int aVal = 0; 349 int bVal = 0; 350 int retVal; 351 while (*ask != 0) { 352 aVal += aVal*37 + *ask++; 353 } 354 while (*bsk != 0) { 355 bVal += bVal*37 + *bsk++; 356 } 357 retVal = -1; 358 if (aVal == bVal) { 359 retVal = 0; 360 } 361 else if (aVal > bVal) { 362 retVal = 1; 363 } 364 return retVal; 365} 366 367//--------------------------------------------------------------------------------------- 368// 369// doKeyGen() Key Generation Timing Test 370// 371//--------------------------------------------------------------------------------------- 372void doKeyGen() 373{ 374 int line; 375 int loops = 0; 376 int iLoop; 377 int len=-1; 378 379 // Adjust loop count to compensate for file size. Should be order n 380 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)); 381 int adj_loopCount = int(dLoopCount); 382 if (adj_loopCount < 1) adj_loopCount = 1; 383 384 385 unsigned long startTime = timeGetTime(); 386 387 if (opt_win) { 388 for (loops=0; loops<adj_loopCount; loops++) { 389 for (line=0; line < gNumFileLines; line++) { 390 if (opt_uselen) { 391 len = gFileLines[line].len; 392 } 393 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 394 LCMapStringW(gWinLCID, LCMAP_SORTKEY, 395 gFileLines[line].name, len, 396 (UChar *)gFileLines[line].winSortKey, 5000); // TODO something with length. 
397 } 398 } 399 } 400 } 401 else if (opt_icu) 402 { 403 for (loops=0; loops<adj_loopCount; loops++) { 404 for (line=0; line < gNumFileLines; line++) { 405 if (opt_uselen) { 406 len = gFileLines[line].len; 407 } 408 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 409 ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000); 410 } 411 } 412 } 413 } 414 else if (opt_unix) 415 { 416 for (loops=0; loops<adj_loopCount; loops++) { 417 for (line=0; line < gNumFileLines; line++) { 418 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 419 strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000); 420 } 421 } 422 } 423 } 424 425 unsigned long elapsedTime = timeGetTime() - startTime; 426 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines)); 427 428 if (opt_terse == false) { 429 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines); 430 printf("Sort Key Generation: time per key = %d ns\n", ns); 431 } 432 else { 433 printf("%d, ", ns); 434 } 435 436 int totalKeyLen = 0; 437 int totalChars = 0; 438 for (line=0; line<gNumFileLines; line++) { 439 totalChars += u_strlen(gFileLines[line].name); 440 if (opt_win) { 441 totalKeyLen += strlen(gFileLines[line].winSortKey); 442 } 443 else if (opt_icu) { 444 totalKeyLen += strlen(gFileLines[line].icuSortKey); 445 } 446 else if (opt_unix) { 447 totalKeyLen += strlen(gFileLines[line].unixSortKey); 448 } 449 450 } 451 if (opt_terse == false) { 452 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars); 453 } else { 454 printf("%f, ", (float)totalKeyLen / (float)totalChars); 455 } 456} 457 458 459 460//--------------------------------------------------------------------------------------- 461// 462// doBinarySearch() Binary Search timing test. Each name from the list 463// is looked up in the full sorted list of names. 
464// 465//--------------------------------------------------------------------------------------- 466void doBinarySearch() 467{ 468 469 gCount = 0; 470 int line; 471 int loops = 0; 472 int iLoop = 0; 473 unsigned long elapsedTime = 0; 474 475 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup) 476 // Accurate timings do not depend on this being perfect. The correction is just to try to 477 // get total running times of about the right order, so the that user doesn't need to 478 // manually adjust the loop count for every different file size. 479 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines)); 480 if (opt_usekeys) dLoopCount *= 5; 481 int adj_loopCount = int(dLoopCount); 482 if (adj_loopCount < 1) adj_loopCount = 1; 483 484 485 for (;;) { // not really a loop, just allows "break" to work, to simplify 486 // inadvertently running more than one test through here. 487 if (opt_strcmp || opt_strcmpCPO) 488 { 489 unsigned long startTime = timeGetTime(); 490 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *); 491 PF pf = u_strcmp; 492 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;} 493 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int 494 // which forces the use of a cast here. 
495 496 int r = 0; 497 for (loops=0; loops<adj_loopCount; loops++) { 498 499 for (line=0; line < gNumFileLines; line++) { 500 int hi = gNumFileLines-1; 501 int lo = 0; 502 int guess = -1; 503 for (;;) { 504 int newGuess = (hi + lo) / 2; 505 if (newGuess == guess) 506 break; 507 guess = newGuess; 508 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 509 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name); 510 } 511 gCount++; 512 if (r== 0) 513 break; 514 if (r < 0) 515 hi = guess; 516 else 517 lo = guess; 518 } 519 } 520 } 521 elapsedTime = timeGetTime() - startTime; 522 break; 523 } 524 525 526 if (opt_icu) 527 { 528 unsigned long startTime = timeGetTime(); 529 UCollationResult r = UCOL_EQUAL; 530 for (loops=0; loops<adj_loopCount; loops++) { 531 532 for (line=0; line < gNumFileLines; line++) { 533 int lineLen = -1; 534 int guessLen = -1; 535 if (opt_uselen) { 536 lineLen = (gSortedLines[line])->len; 537 } 538 int hi = gNumFileLines-1; 539 int lo = 0; 540 int guess = -1; 541 for (;;) { 542 int newGuess = (hi + lo) / 2; 543 if (newGuess == guess) 544 break; 545 guess = newGuess; 546 int ri = 0; 547 if (opt_usekeys) { 548 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 549 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey); 550 } 551 gCount++; 552 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;} 553 } 554 else 555 { 556 if (opt_uselen) { 557 guessLen = (gSortedLines[guess])->len; 558 } 559 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 560 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 561 } 562 gCount++; 563 } 564 if (r== UCOL_EQUAL) 565 break; 566 if (r == UCOL_LESS) 567 hi = guess; 568 else 569 lo = guess; 570 } 571 } 572 } 573 elapsedTime = timeGetTime() - startTime; 574 break; 575 } 576 577 if (opt_win) 578 { 579 unsigned long startTime = timeGetTime(); 580 int r = 0; 581 for (loops=0; loops<adj_loopCount; loops++) { 582 583 for (line=0; 
line < gNumFileLines; line++) { 584 int lineLen = -1; 585 int guessLen = -1; 586 if (opt_uselen) { 587 lineLen = (gSortedLines[line])->len; 588 } 589 int hi = gNumFileLines-1; 590 int lo = 0; 591 int guess = -1; 592 for (;;) { 593 int newGuess = (hi + lo) / 2; 594 if (newGuess == guess) 595 break; 596 guess = newGuess; 597 if (opt_usekeys) { 598 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 599 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey); 600 } 601 gCount++; 602 r+=2; 603 } 604 else 605 { 606 if (opt_uselen) { 607 guessLen = (gSortedLines[guess])->len; 608 } 609 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 610 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); 611 } 612 if (r == 0) { 613 if (opt_terse == false) { 614 fprintf(stderr, "Error returned from Windows CompareStringW.\n"); 615 } 616 exit(-1); 617 } 618 gCount++; 619 } 620 if (r== 2) // strings == 621 break; 622 if (r == 1) // line < guess 623 hi = guess; 624 else // line > guess 625 lo = guess; 626 } 627 } 628 } 629 elapsedTime = timeGetTime() - startTime; 630 break; 631 } 632 633 if (opt_unix) 634 { 635 unsigned long startTime = timeGetTime(); 636 int r = 0; 637 for (loops=0; loops<adj_loopCount; loops++) { 638 639 for (line=0; line < gNumFileLines; line++) { 640 int hi = gNumFileLines-1; 641 int lo = 0; 642 int guess = -1; 643 for (;;) { 644 int newGuess = (hi + lo) / 2; 645 if (newGuess == guess) 646 break; 647 guess = newGuess; 648 if (opt_usekeys) { 649 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 650 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey); 651 } 652 gCount++; 653 } 654 else 655 { 656 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { 657 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName); 658 } 659 errno = 0; 660 if (errno != 0) { 661 fprintf(stderr, "Error %d returned from strcoll.\n", errno); 662 exit(-1); 663 } 664 gCount++; 665 
} 666 if (r == 0) // strings == 667 break; 668 if (r < 0) // line < guess 669 hi = guess; 670 else // line > guess 671 lo = guess; 672 } 673 } 674 } 675 elapsedTime = timeGetTime() - startTime; 676 break; 677 } 678 break; 679 } 680 681 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 682 if (opt_terse == false) { 683 printf("binary search: total # of string compares = %d\n", gCount); 684 printf("binary search: compares per loop = %d\n", gCount / loops); 685 printf("binary search: time per compare = %d ns\n", ns); 686 } else { 687 printf("%d, ", ns); 688 } 689 690} 691 692 693 694 695//--------------------------------------------------------------------------------------- 696// 697// doQSort() The quick sort timing test. Uses the C library qsort function. 698// 699//--------------------------------------------------------------------------------------- 700void doQSort() { 701 int i; 702 Line **sortBuf = new Line *[gNumFileLines]; 703 704 // Adjust loop count to compensate for file size. QSort should be n log(n) 705 double dLoopCount = double(opt_loopCount) * 3000. 
/ (log10((double)gNumFileLines) * double(gNumFileLines)); 706 if (opt_usekeys) dLoopCount *= 5; 707 int adj_loopCount = int(dLoopCount); 708 if (adj_loopCount < 1) adj_loopCount = 1; 709 710 711 gCount = 0; 712 unsigned long startTime = timeGetTime(); 713 if (opt_win && opt_usekeys) { 714 for (i=0; i<opt_loopCount; i++) { 715 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 716 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK); 717 } 718 } 719 720 else if (opt_win && opt_uselen) { 721 for (i=0; i<adj_loopCount; i++) { 722 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 723 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL); 724 } 725 } 726 727 728 else if (opt_win && !opt_uselen) { 729 for (i=0; i<adj_loopCount; i++) { 730 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 731 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp); 732 } 733 } 734 735 else if (opt_icu && opt_usekeys) { 736 for (i=0; i<adj_loopCount; i++) { 737 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 738 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK); 739 } 740 } 741 742 else if (opt_icu && opt_uselen) { 743 for (i=0; i<adj_loopCount; i++) { 744 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 745 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL); 746 } 747 } 748 749 750 else if (opt_icu && !opt_uselen) { 751 for (i=0; i<adj_loopCount; i++) { 752 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 753 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp); 754 } 755 } 756 757 else if (opt_unix && !opt_usekeys) { 758 for (i=0; i<adj_loopCount; i++) { 759 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); 760 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp); 761 } 762 } 763 764 unsigned long elapsedTime = timeGetTime() - startTime; 765 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 766 if (opt_terse == false) { 767 printf("qsort: 
total # of string compares = %d\n", gCount); 768 printf("qsort: time per compare = %d ns\n", ns); 769 } else { 770 printf("%d, ", ns); 771 } 772} 773 774 775 776//--------------------------------------------------------------------------------------- 777// 778// doKeyHist() Output a table of data for 779// average sort key size vs. string length. 780// 781//--------------------------------------------------------------------------------------- 782void doKeyHist() { 783 int i; 784 int maxLen = 0; 785 786 // Find the maximum string length 787 for (i=0; i<gNumFileLines; i++) { 788 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len; 789 } 790 791 // Allocate arrays to hold the histogram data 792 int *accumulatedLen = new int[maxLen+1]; 793 int *numKeysOfSize = new int[maxLen+1]; 794 for (i=0; i<=maxLen; i++) { 795 accumulatedLen[i] = 0; 796 numKeysOfSize[i] = 0; 797 } 798 799 // Fill the arrays... 800 for (i=0; i<gNumFileLines; i++) { 801 int len = gFileLines[i].len; 802 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey); 803 numKeysOfSize[len] += 1; 804 } 805 806 // And write out averages 807 printf("String Length, Avg Key Length, Avg Key Len per char\n"); 808 for (i=1; i<=maxLen; i++) { 809 if (numKeysOfSize[i] > 0) { 810 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i], 811 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); 812 } 813 } 814 delete []accumulatedLen; 815 delete []numKeysOfSize ; 816} 817 818//--------------------------------------------------------------------------------------- 819// 820// doForwardIterTest(UBool) Forward iteration test 821// argument null-terminated string used 822// 823//--------------------------------------------------------------------------------------- 824void doForwardIterTest(UBool haslen) { 825 int count = 0; 826 827 UErrorCode error = U_ZERO_ERROR; 828 printf("\n\nPerforming forward iteration performance test with "); 829 830 if (haslen) { 831 printf("non-null terminated data 
-----------\n"); 832 } 833 else { 834 printf("null terminated data -----------\n"); 835 } 836 printf("performance test on strings from file -----------\n"); 837 838 UChar dummytext[] = {0, 0}; 839 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 840 ucol_setText(iter, dummytext, 1, &error); 841 842 gCount = 0; 843 unsigned long startTime = timeGetTime(); 844 while (count < opt_loopCount) { 845 int linecount = 0; 846 while (linecount < gNumFileLines) { 847 UChar *str = gFileLines[linecount].name; 848 int strlen = haslen?gFileLines[linecount].len:-1; 849 ucol_setText(iter, str, strlen, &error); 850 while (ucol_next(iter, &error) != UCOL_NULLORDER) { 851 gCount++; 852 } 853 854 linecount ++; 855 } 856 count ++; 857 } 858 unsigned long elapsedTime = timeGetTime() - startTime; 859 printf("elapsedTime %ld\n", elapsedTime); 860 861 // empty loop recalculation 862 count = 0; 863 startTime = timeGetTime(); 864 while (count < opt_loopCount) { 865 int linecount = 0; 866 while (linecount < gNumFileLines) { 867 UChar *str = gFileLines[linecount].name; 868 int strlen = haslen?gFileLines[linecount].len:-1; 869 ucol_setText(iter, str, strlen, &error); 870 linecount ++; 871 } 872 count ++; 873 } 874 elapsedTime -= (timeGetTime() - startTime); 875 printf("elapsedTime %ld\n", elapsedTime); 876 877 ucol_closeElements(iter); 878 879 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 880 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 881 opt_loopCount); 882 printf("Average time per ucol_next() nano seconds %d\n", ns); 883 884 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 885 886 UChar *str; 887 int strlen = 0; 888 // appending all the strings 889 int linecount = 0; 890 while (linecount < gNumFileLines) { 891 strlen += haslen?gFileLines[linecount].len: 892 u_strlen(gFileLines[linecount].name); 893 linecount ++; 894 } 895 str = (UChar *)malloc(sizeof(UChar) * strlen); 896 int strindex 
= 0; 897 linecount = 0; 898 while (strindex < strlen) { 899 int len = 0; 900 len += haslen?gFileLines[linecount].len: 901 u_strlen(gFileLines[linecount].name); 902 memcpy(str + strindex, gFileLines[linecount].name, 903 sizeof(UChar) * len); 904 strindex += len; 905 linecount ++; 906 } 907 908 printf("Total size of strings %d\n", strlen); 909 910 gCount = 0; 911 count = 0; 912 913 if (!haslen) { 914 strlen = -1; 915 } 916 iter = ucol_openElements(gCol, str, strlen, &error); 917 if (!haslen) { 918 strlen = u_strlen(str); 919 } 920 strlen -= 5; // any left over characters are not iterated, 921 // this is to ensure the backwards and forwards iterators 922 // gets the same position 923 startTime = timeGetTime(); 924 while (count < opt_loopCount) { 925 int count5 = 5; 926 strindex = 0; 927 ucol_setOffset(iter, strindex, &error); 928 while (true) { 929 if (ucol_next(iter, &error) == UCOL_NULLORDER) { 930 break; 931 } 932 gCount++; 933 count5 --; 934 if (count5 == 0) { 935 strindex += 10; 936 if (strindex > strlen) { 937 break; 938 } 939 ucol_setOffset(iter, strindex, &error); 940 count5 = 5; 941 } 942 } 943 count ++; 944 } 945 946 elapsedTime = timeGetTime() - startTime; 947 printf("elapsedTime %ld\n", elapsedTime); 948 949 // empty loop recalculation 950 int tempgCount = 0; 951 count = 0; 952 startTime = timeGetTime(); 953 while (count < opt_loopCount) { 954 int count5 = 5; 955 strindex = 0; 956 ucol_setOffset(iter, strindex, &error); 957 while (true) { 958 tempgCount ++; 959 count5 --; 960 if (count5 == 0) { 961 strindex += 10; 962 if (strindex > strlen) { 963 break; 964 } 965 ucol_setOffset(iter, strindex, &error); 966 count5 = 5; 967 } 968 } 969 count ++; 970 } 971 elapsedTime -= (timeGetTime() - startTime); 972 printf("elapsedTime %ld\n", elapsedTime); 973 974 ucol_closeElements(iter); 975 976 printf("gCount %d\n", gCount); 977 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 978 printf("Average time per ucol_next() nano seconds %d\n", ns); 979} 980 
981//--------------------------------------------------------------------------------------- 982// 983// doBackwardIterTest(UBool) Backwards iteration test 984// argument null-terminated string used 985// 986//--------------------------------------------------------------------------------------- 987void doBackwardIterTest(UBool haslen) { 988 int count = 0; 989 UErrorCode error = U_ZERO_ERROR; 990 printf("\n\nPerforming backward iteration performance test with "); 991 992 if (haslen) { 993 printf("non-null terminated data -----------\n"); 994 } 995 else { 996 printf("null terminated data -----------\n"); 997 } 998 999 printf("performance test on strings from file -----------\n"); 1000 1001 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); 1002 UChar dummytext[] = {0, 0}; 1003 ucol_setText(iter, dummytext, 1, &error); 1004 1005 gCount = 0; 1006 unsigned long startTime = timeGetTime(); 1007 while (count < opt_loopCount) { 1008 int linecount = 0; 1009 while (linecount < gNumFileLines) { 1010 UChar *str = gFileLines[linecount].name; 1011 int strlen = haslen?gFileLines[linecount].len:-1; 1012 ucol_setText(iter, str, strlen, &error); 1013 while (ucol_previous(iter, &error) != UCOL_NULLORDER) { 1014 gCount ++; 1015 } 1016 1017 linecount ++; 1018 } 1019 count ++; 1020 } 1021 unsigned long elapsedTime = timeGetTime() - startTime; 1022 1023 printf("elapsedTime %ld\n", elapsedTime); 1024 1025 // empty loop recalculation 1026 count = 0; 1027 startTime = timeGetTime(); 1028 while (count < opt_loopCount) { 1029 int linecount = 0; 1030 while (linecount < gNumFileLines) { 1031 UChar *str = gFileLines[linecount].name; 1032 int strlen = haslen?gFileLines[linecount].len:-1; 1033 ucol_setText(iter, str, strlen, &error); 1034 linecount ++; 1035 } 1036 count ++; 1037 } 1038 elapsedTime -= (timeGetTime() - startTime); 1039 1040 printf("elapsedTime %ld\n", elapsedTime); 1041 ucol_closeElements(iter); 1042 1043 int ns = (int)(float(1000000) * (float)elapsedTime / 
(float)gCount); 1044 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, 1045 opt_loopCount); 1046 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1047 1048 printf("performance test on skipped-5 concatenated strings from file -----------\n"); 1049 1050 UChar *str; 1051 int strlen = 0; 1052 // appending all the strings 1053 int linecount = 0; 1054 while (linecount < gNumFileLines) { 1055 strlen += haslen?gFileLines[linecount].len: 1056 u_strlen(gFileLines[linecount].name); 1057 linecount ++; 1058 } 1059 str = (UChar *)malloc(sizeof(UChar) * strlen); 1060 int strindex = 0; 1061 linecount = 0; 1062 while (strindex < strlen) { 1063 int len = 0; 1064 len += haslen?gFileLines[linecount].len: 1065 u_strlen(gFileLines[linecount].name); 1066 memcpy(str + strindex, gFileLines[linecount].name, 1067 sizeof(UChar) * len); 1068 strindex += len; 1069 linecount ++; 1070 } 1071 1072 printf("Total size of strings %d\n", strlen); 1073 1074 gCount = 0; 1075 count = 0; 1076 1077 if (!haslen) { 1078 strlen = -1; 1079 } 1080 1081 iter = ucol_openElements(gCol, str, strlen, &error); 1082 if (!haslen) { 1083 strlen = u_strlen(str); 1084 } 1085 1086 startTime = timeGetTime(); 1087 while (count < opt_loopCount) { 1088 int count5 = 5; 1089 strindex = 5; 1090 ucol_setOffset(iter, strindex, &error); 1091 while (true) { 1092 if (ucol_previous(iter, &error) == UCOL_NULLORDER) { 1093 break; 1094 } 1095 gCount ++; 1096 count5 --; 1097 if (count5 == 0) { 1098 strindex += 10; 1099 if (strindex > strlen) { 1100 break; 1101 } 1102 ucol_setOffset(iter, strindex, &error); 1103 count5 = 5; 1104 } 1105 } 1106 count ++; 1107 } 1108 1109 elapsedTime = timeGetTime() - startTime; 1110 printf("elapsedTime %ld\n", elapsedTime); 1111 1112 // empty loop recalculation 1113 count = 0; 1114 int tempgCount = 0; 1115 startTime = timeGetTime(); 1116 while (count < opt_loopCount) { 1117 int count5 = 5; 1118 strindex = 5; 1119 ucol_setOffset(iter, strindex, &error); 1120 while 
(true) { 1121 tempgCount ++; 1122 count5 --; 1123 if (count5 == 0) { 1124 strindex += 10; 1125 if (strindex > strlen) { 1126 break; 1127 } 1128 ucol_setOffset(iter, strindex, &error); 1129 count5 = 5; 1130 } 1131 } 1132 count ++; 1133 } 1134 elapsedTime -= (timeGetTime() - startTime); 1135 printf("elapsedTime %ld\n", elapsedTime); 1136 ucol_closeElements(iter); 1137 1138 printf("gCount %d\n", gCount); 1139 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); 1140 printf("Average time per ucol_previous() nano seconds %d\n", ns); 1141} 1142 1143//--------------------------------------------------------------------------------------- 1144// 1145// doIterTest() Iteration test 1146// 1147//--------------------------------------------------------------------------------------- 1148void doIterTest() { 1149 doForwardIterTest(opt_uselen); 1150 doBackwardIterTest(opt_uselen); 1151} 1152 1153 1154//---------------------------------------------------------------------------------------- 1155// 1156// UnixConvert -- Convert the lines of the file to the encoding for UNIX 1157// Since it appears that Unicode support is going in the general 1158// direction of the use of UTF-8 locales, that is the approach 1159// that is used here. 1160// 1161//---------------------------------------------------------------------------------------- 1162void UnixConvert() { 1163 int line; 1164 1165 UConverter *cvrtr; // An ICU code page converter. 1166 UErrorCode status = U_ZERO_ERROR; 1167 1168 1169 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. 1170 if (U_FAILURE(status)) { 1171 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)); 1172 exit(-1); 1173 } 1174 1175 for (line=0; line < gNumFileLines; line++) { 1176 int sizeNeeded = ucnv_fromUChars(cvrtr, 1177 0, // ptr to target buffer. 1178 0, // length of target buffer. 
1179 gFileLines[line].name, 1180 -1, // source is null terminated 1181 &status); 1182 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { 1183 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); 1184 //exit(-1); 1185 } 1186 status = U_ZERO_ERROR; 1187 gFileLines[line].unixName = new char[sizeNeeded+1]; 1188 sizeNeeded = ucnv_fromUChars(cvrtr, 1189 gFileLines[line].unixName, // ptr to target buffer. 1190 sizeNeeded+1, // length of target buffer. 1191 gFileLines[line].name, 1192 -1, // source is null terminated 1193 &status); 1194 if (U_FAILURE(status)) { 1195 fprintf(stderr, "ICU Conversion Failed.: %d\n", status); 1196 exit(-1); 1197 } 1198 gFileLines[line].unixName[sizeNeeded] = 0; 1199 }; 1200 ucnv_close(cvrtr); 1201} 1202 1203 1204//---------------------------------------------------------------------------------------- 1205// 1206// class UCharFile Class to hide all the gorp to read a file in 1207// and produce a stream of UChars. 1208// 1209//---------------------------------------------------------------------------------------- 1210class UCharFile { 1211public: 1212 UCharFile(const char *fileName); 1213 ~UCharFile(); 1214 UChar get(); 1215 UBool eof() {return fEof;}; 1216 UBool error() {return fError;}; 1217 1218private: 1219 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor. 
1220 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op 1221 1222 FILE *fFile; 1223 const char *fName; 1224 UBool fEof; 1225 UBool fError; 1226 UChar fPending2ndSurrogate; 1227 1228 enum {UTF16LE, UTF16BE, UTF8} fEncoding; 1229}; 1230 1231UCharFile::UCharFile(const char * fileName) { 1232 fEof = false; 1233 fError = false; 1234 fName = fileName; 1235 fFile = fopen(fName, "rb"); 1236 fPending2ndSurrogate = 0; 1237 if (fFile == NULL) { 1238 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); 1239 fError = true; 1240 return; 1241 } 1242 // 1243 // Look for the byte order mark at the start of the file. 1244 // 1245 int BOMC1, BOMC2, BOMC3; 1246 BOMC1 = fgetc(fFile); 1247 BOMC2 = fgetc(fFile); 1248 1249 if (BOMC1 == 0xff && BOMC2 == 0xfe) { 1250 fEncoding = UTF16LE; } 1251 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { 1252 fEncoding = UTF16BE; } 1253 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { 1254 fEncoding = UTF8; } 1255 else 1256 { 1257 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " 1258 "must include a BOM.\n", fileName); 1259 fError = true; 1260 return; 1261 } 1262} 1263 1264 1265UCharFile::~UCharFile() { 1266 fclose(fFile); 1267} 1268 1269 1270 1271UChar UCharFile::get() { 1272 UChar c; 1273 switch (fEncoding) { 1274 case UTF16LE: 1275 { 1276 int cL, cH; 1277 cL = fgetc(fFile); 1278 cH = fgetc(fFile); 1279 c = cL | (cH << 8); 1280 if (cH == EOF) { 1281 c = 0; 1282 fEof = true; 1283 } 1284 break; 1285 } 1286 case UTF16BE: 1287 { 1288 int cL, cH; 1289 cH = fgetc(fFile); 1290 cL = fgetc(fFile); 1291 c = cL | (cH << 8); 1292 if (cL == EOF) { 1293 c = 0; 1294 fEof = true; 1295 } 1296 break; 1297 } 1298 case UTF8: 1299 { 1300 if (fPending2ndSurrogate != 0) { 1301 c = fPending2ndSurrogate; 1302 fPending2ndSurrogate = 0; 1303 break; 1304 } 1305 1306 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. 
1307 if (ch == EOF) { 1308 c = 0; 1309 fEof = true; 1310 break; 1311 } 1312 1313 if (ch <= 0x7f) { 1314 // It's ascii. No further utf-8 conversion. 1315 c = ch; 1316 break; 1317 } 1318 1319 // Figure out the length of the char and read the rest of the bytes 1320 // into a temp array. 1321 int nBytes; 1322 if (ch >= 0xF0) {nBytes=4;} 1323 else if (ch >= 0xE0) {nBytes=3;} 1324 else if (ch >= 0xC0) {nBytes=2;} 1325 else { 1326 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1327 fError = true; 1328 return 0; 1329 } 1330 1331 unsigned char bytes[10]; 1332 bytes[0] = (unsigned char)ch; 1333 int i; 1334 for (i=1; i<nBytes; i++) { 1335 bytes[i] = fgetc(fFile); 1336 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { 1337 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); 1338 fError = true; 1339 return 0; 1340 } 1341 } 1342 1343 // Convert the bytes from the temp array to a Unicode char. 1344 i = 0; 1345 uint32_t cp; 1346 U8_NEXT_UNSAFE(bytes, i, cp); 1347 c = (UChar)cp; 1348 1349 if (cp >= 0x10000) { 1350 // The code point needs to be broken up into a utf-16 surrogate pair. 1351 // Process first half this time through the main loop, and 1352 // remember the other half for the next time through. 1353 UChar utf16Buf[3]; 1354 i = 0; 1355 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); 1356 fPending2ndSurrogate = utf16Buf[1]; 1357 c = utf16Buf[0]; 1358 } 1359 break; 1360 }; 1361 default: 1362 c = 0xFFFD; /* Error, unspecified codepage*/ 1363 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); 1364 exit(1); 1365 } 1366 return c; 1367} 1368 1369//---------------------------------------------------------------------------------------- 1370// 1371// openRulesCollator - Command line specified a rules file. Read it in 1372// and open a collator with it. 
1373// 1374//---------------------------------------------------------------------------------------- 1375UCollator *openRulesCollator() { 1376 UCharFile f(opt_rules); 1377 if (f.error()) { 1378 return 0; 1379 } 1380 1381 int bufLen = 10000; 1382 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); 1383 UChar *tmp; 1384 int i = 0; 1385 1386 for(;;) { 1387 buf[i] = f.get(); 1388 if (f.eof()) { 1389 break; 1390 } 1391 if (f.error()) { 1392 return 0; 1393 } 1394 i++; 1395 if (i >= bufLen) { 1396 tmp = buf; 1397 bufLen += 10000; 1398 buf = (UChar *)realloc(buf, bufLen); 1399 if (buf == NULL) { 1400 free(tmp); 1401 return 0; 1402 } 1403 } 1404 } 1405 buf[i] = 0; 1406 1407 UErrorCode status = U_ZERO_ERROR; 1408 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, 1409 UCOL_DEFAULT_STRENGTH, NULL, &status); 1410 if (U_FAILURE(status)) { 1411 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); 1412 return 0; 1413 } 1414 free(buf); 1415 return coll; 1416} 1417 1418 1419 1420 1421 1422//---------------------------------------------------------------------------------------- 1423// 1424// Main -- process command line, read in and pre-process the test file, 1425// call other functions to do the actual tests. 1426// 1427//---------------------------------------------------------------------------------------- 1428int main(int argc, const char** argv) { 1429 if (ProcessOptions(argc, argv, opts) != true || opt_help || opt_fName == 0) { 1430 printf(gUsageString); 1431 exit (1); 1432 } 1433 1434 // Make sure that we've only got one API selected. 
1435 if (opt_unix || opt_win) opt_icu = false; 1436 if (opt_unix) opt_win = false; 1437 1438 // 1439 // Set up an ICU collator 1440 // 1441 UErrorCode status = U_ZERO_ERROR; 1442 1443 if (opt_rules != 0) { 1444 gCol = openRulesCollator(); 1445 if (gCol == 0) {return -1;} 1446 } 1447 else { 1448 gCol = ucol_open(opt_locale, &status); 1449 if (U_FAILURE(status)) { 1450 fprintf(stderr, "Collator creation failed.: %d\n", status); 1451 return -1; 1452 } 1453 } 1454 if (status==U_USING_DEFAULT_WARNING && opt_terse==false) { 1455 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); 1456 } 1457 if (status==U_USING_FALLBACK_WARNING && opt_terse==false) { 1458 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); 1459 } 1460 1461 if (opt_norm) { 1462 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 1463 } 1464 if (opt_french && opt_frenchoff) { 1465 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options."); 1466 exit(-1); 1467 } 1468 if (opt_french) { 1469 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); 1470 } 1471 if (opt_frenchoff) { 1472 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); 1473 } 1474 if (opt_lower) { 1475 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); 1476 } 1477 if (opt_upper) { 1478 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); 1479 } 1480 if (opt_case) { 1481 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); 1482 } 1483 if (opt_shifted) { 1484 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); 1485 } 1486 if (opt_level != 0) { 1487 switch (opt_level) { 1488 case 1: 1489 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); 1490 break; 1491 case 2: 1492 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); 1493 break; 1494 case 3: 1495 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); 1496 break; 1497 case 4: 1498 
ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); 1499 break; 1500 case 5: 1501 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); 1502 break; 1503 default: 1504 fprintf(stderr, "-level param must be between 1 and 5\n"); 1505 exit(-1); 1506 } 1507 } 1508 1509 if (U_FAILURE(status)) { 1510 fprintf(stderr, "Collator attribute setting failed.: %d\n", status); 1511 return -1; 1512 } 1513 1514 1515 // 1516 // Set up a Windows LCID 1517 // 1518 if (opt_langid != 0) { 1519 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); 1520 } 1521 else { 1522 gWinLCID = uloc_getLCID(opt_locale); 1523 } 1524 1525 1526 // 1527 // Set the UNIX locale 1528 // 1529 if (opt_unix) { 1530 if (setlocale(LC_ALL, opt_locale) == 0) { 1531 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); 1532 exit(-1); 1533 } 1534 } 1535 1536 // Read in the input file. 1537 // File assumed to be utf-16. 1538 // Lines go onto heap buffers. Global index array to line starts is created. 1539 // Lines themselves are null terminated. 1540 // 1541 1542 UCharFile f(opt_fName); 1543 if (f.error()) { 1544 exit(-1); 1545 } 1546 1547 const int MAXLINES = 100000; 1548 gFileLines = new Line[MAXLINES]; 1549 UChar buf[1024]; 1550 int column = 0; 1551 1552 // Read the file, split into lines, and save in memory. 1553 // Loop runs once per utf-16 value from the input file, 1554 // (The number of bytes read from file per loop iteration depends on external encoding.) 1555 for (;;) { 1556 1557 UChar c = f.get(); 1558 if (f.error()){ 1559 exit(-1); 1560 } 1561 1562 1563 // We now have a good UTF-16 value in c. 1564 1565 // Watch for CR, LF, EOF; these finish off a line. 1566 if (c == 0xd) { 1567 continue; 1568 } 1569 1570 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators! 
1571 buf[column++] = 0; 1572 if (column > 1) { 1573 gFileLines[gNumFileLines].name = new UChar[column]; 1574 gFileLines[gNumFileLines].len = column-1; 1575 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar)); 1576 gNumFileLines++; 1577 column = 0; 1578 if (gNumFileLines >= MAXLINES) { 1579 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES); 1580 exit(-1); 1581 } 1582 1583 } 1584 if (c == 0xa || c == 0x2028) 1585 continue; 1586 else 1587 break; // EOF 1588 } 1589 buf[column++] = c; 1590 if (column >= 1023) 1591 { 1592 static UBool warnFlag = true; 1593 if (warnFlag) { 1594 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n"); 1595 warnFlag = false; 1596 } 1597 column--; 1598 } 1599 } 1600 1601 if (opt_terse == false) { 1602 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); 1603 } 1604 1605 1606 // Convert the lines to the UNIX encoding. 1607 if (opt_unix) { 1608 UnixConvert(); 1609 } 1610 1611 // 1612 // Pre-compute ICU sort keys for the lines of the file. 1613 // 1614 int line; 1615 int32_t t; 1616 1617 for (line=0; line<gNumFileLines; line++) { 1618 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf)); 1619 gFileLines[line].icuSortKey = new char[t]; 1620 1621 if (t > (int32_t)sizeof(buf)) { 1622 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t); 1623 } 1624 else 1625 { 1626 memcpy(gFileLines[line].icuSortKey, buf, t); 1627 } 1628 } 1629 1630 1631 1632 // 1633 // Pre-compute Windows sort keys for the lines of the file. 
1634 // 1635 for (line=0; line<gNumFileLines; line++) { 1636 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf)); 1637 gFileLines[line].winSortKey = new char[t]; 1638 if (t > (int32_t)sizeof(buf)) { 1639 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (UChar *)(gFileLines[line].winSortKey), t); 1640 } 1641 else 1642 { 1643 memcpy(gFileLines[line].winSortKey, buf, t); 1644 } 1645 } 1646 1647 // 1648 // Pre-compute UNIX sort keys for the lines of the file. 1649 // 1650 if (opt_unix) { 1651 for (line=0; line<gNumFileLines; line++) { 1652 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf)); 1653 gFileLines[line].unixSortKey = new char[t]; 1654 if (t > (int32_t)sizeof(buf)) { 1655 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf)); 1656 } 1657 else 1658 { 1659 memcpy(gFileLines[line].unixSortKey, buf, t); 1660 } 1661 } 1662 } 1663 1664 1665 // 1666 // Dump file lines, CEs, Sort Keys if requested. 1667 // 1668 if (opt_dump) { 1669 int i; 1670 for (line=0; line<gNumFileLines; line++) { 1671 for (i=0;;i++) { 1672 UChar c = gFileLines[line].name[i]; 1673 if (c == 0) 1674 break; 1675 if (c < 0x20 || c > 0x7e) { 1676 printf("\\u%.4x", c); 1677 } 1678 else { 1679 printf("%c", c); 1680 } 1681 } 1682 printf("\n"); 1683 1684 printf(" CEs: "); 1685 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status); 1686 int32_t ce; 1687 i = 0; 1688 for (;;) { 1689 ce = ucol_next(CEiter, &status); 1690 if (ce == UCOL_NULLORDER) { 1691 break; 1692 } 1693 printf(" %.8x", ce); 1694 if (++i > 8) { 1695 printf("\n "); 1696 i = 0; 1697 } 1698 } 1699 printf("\n"); 1700 ucol_closeElements(CEiter); 1701 1702 1703 printf(" ICU Sort Key: "); 1704 for (i=0; ; i++) { 1705 unsigned char c = gFileLines[line].icuSortKey[i]; 1706 printf("%02x ", c); 1707 if (c == 0) { 1708 break; 1709 } 1710 if (i > 0 && i % 20 == 0) { 1711 printf("\n "); 1712 } 1713 } 1714 printf("\n"); 1715 } 1716 
} 1717 1718 1719 // 1720 // Pre-sort the lines. 1721 // 1722 int i; 1723 gSortedLines = new Line *[gNumFileLines]; 1724 for (i=0; i<gNumFileLines; i++) { 1725 gSortedLines[i] = &gFileLines[i]; 1726 } 1727 1728 if (opt_win) { 1729 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp); 1730 } 1731 else if (opt_unix) { 1732 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp); 1733 } 1734 else /* ICU */ 1735 { 1736 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp); 1737 } 1738 1739 1740 // 1741 // Make up a randomized order, will be used for sorting tests. 1742 // 1743 gRandomLines = new Line *[gNumFileLines]; 1744 for (i=0; i<gNumFileLines; i++) { 1745 gRandomLines[i] = &gFileLines[i]; 1746 } 1747 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp); 1748 1749 1750 1751 1752 // 1753 // We've got the file read into memory. Go do something with it. 1754 // 1755 1756 if (opt_qsort) doQSort(); 1757 if (opt_binsearch) doBinarySearch(); 1758 if (opt_keygen) doKeyGen(); 1759 if (opt_keyhist) doKeyHist(); 1760 if (opt_itertest) doIterTest(); 1761 1762 return 0; 1763 1764} 1765