1// © 2017 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3#include "sortedlines.h" 4 5static int codePointCmp(const void *a, const void *b) { 6 return u_strcmp((*(Line **)a)->name, (*(Line **)b)->name); 7} 8 9SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe, 10 UPrinter *logger, UPrinter *debug) : 11toSort(NULL), 12toSortCapacity(0), 13lines(NULL), 14size(0), 15capacity(0), 16repertoire(set), 17excludeBounds(excludeBounds), 18probe(probe), 19first(NULL), 20last(NULL), 21logger(logger), 22debug(debug), 23contractionsTable(NULL), 24duplicators(NULL), 25maxExpansionPrefixSize(0), 26wordSort(false), 27frenchSecondary(false), 28upperFirst(false), 29sortkeys(NULL), 30sortkeyOffset(0) 31{ 32 memset(UB, 0, sizeof(UB)); 33 int32_t i = 0; 34 for(i = 0; i < UCOL_OFF; i++) { 35 UB[i] = ∅ 36 } 37 init(); 38} 39 40SortedLines::~SortedLines() 41{ 42 delete[] lines; 43 if(sortkeys) { 44 delete[] sortkeys; 45 } 46 if(toSort) { 47 delete[] toSort; 48 } 49 if(contractionsTable) { 50 delete contractionsTable; 51 } 52 if(duplicators) { 53 delete duplicators; 54 } 55} 56 57void 58SortedLines::getBounds(UErrorCode &status) { 59 // first sort through the set 60 debug->log(toString(), true); 61 int32_t i = 0, j = 0; 62 UColAttributeValue strength = UCOL_OFF; 63 for(i = 0; i < size; i++) { 64 if(toSort[i]->strengthFromEmpty < strength) { 65 if(i && strength < UCOL_OFF) { 66 //u_strcpy(UB[strength], toSort[i-1]->name); 67 j = 1; 68 while(excludeBounds.contains(UnicodeString(toSort[i-j]->name, toSort[i-j]->len))) { 69 j++; 70 } 71 UB[strength] = toSort[i-j]; 72 } 73 strength = toSort[i]->strengthFromEmpty; 74 if(strength == UCOL_PRIMARY) { 75 probe.SE = toSort[i]->name[0]; 76 } 77 } 78 } 79 //u_strcpy(UB[strength], toSort[size-1]->name); 80 // a different solution for bounds: go from end and see if the guys on the top 81 // cause duplication for things 82 UChar dupch[] = { 0x0020, 0x0030, 0x0042, 0x0051, 0x0062, 0x0071, 0x0391, 0x0396, 0x03b1, 0x03b6 }; 83 j = 1; 84 Line dup; 85 Line bound; 86 int32_t dups = 0; 87 while(j < size) { 88 dups = 0; 89 for(i = 0; i < sizeof(dupch)/sizeof(dupch[0]); i++) { 90 dup.setTo(dupch[i]); 91 dup.append(dupch[i]); 92 bound.setTo(dupch[i]); 93 bound.append(toSort[size-j]->name, toSort[size-j]->len); 94 if(probe.getStrength(dup, bound) >= UCOL_IDENTICAL) { 95 dups++; 96 } 97 } 98 if(dups == 0) { 99 break; 100 } else { 101 if(!duplicators) { 102 duplicators = new Hashtable(); 103 } 104 duplicators->put(UnicodeString(toSort[size-j]->name, toSort[size-j]->len), &toSort[size-j], status); 105 debug->log(toSort[size-j]->toString()); 106 debug->log(" is not good enough to be an upper bound\n"); 107 j++; 108 } 109 } 110 if(j == size) { 111 debug->log("Oi! I'm hallucinating. Will use the first upper bound"); 112 delete duplicators; 113 duplicators = NULL; 114 j = 1; 115 } 116/* 117 j = 1; 118 while(excludeBounds.contains(UnicodeString(toSort[size-j]->name, toSort[size-j]->len))) { 119 j++; 120 } 121*/ 122 UB[strength] = toSort[size-j]; 123 for(i = 0; i < UCOL_OFF; i++) { 124 if(UB[i]) { 125 //debug->log(UB[i], true); 126 debug->log(UB[i]->toString(true), true); 127 } 128 } 129} 130 131// classifies repertoire according to the strength of their difference 132// from the empty string 133void 134SortedLines::classifyRepertoire() { 135 UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF; 136 int32_t lastChange = 0; 137 int32_t i = 0, j = 0; 138 while(i < size) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY) 139 { 140 toSort[i]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i]); 141 if(toSort[i]->strengthFromEmpty < strongestStrengthFromEmpty) { 142 strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty; 143 lastChange = i; 144 } else if (toSort[i]->strengthFromEmpty > strongestStrengthFromEmpty) { 145 // there is a problem in detection. Most probably a quaternary. 146 // why don't we try to interpolate 147 UColAttributeValue nextStrength = UCOL_OFF; 148 UColAttributeValue prevStrength = UCOL_OFF; 149 UColAttributeValue st = UCOL_OFF; 150 151 logger->log("Interpolating to get the distance from empty for Line "); 152 logger->log(toSort[i]->toString(true), true); 153 154 if(i) { 155 st = probe.getStrength(*toSort[i-1], *toSort[i]); 156 if(st == UCOL_OFF) { 157 logger->log("Cannot deduce distance from empty using previous element. Something is very wrong! Line:"); 158 logger->log(toSort[i]->toString(true), true); 159 } else if(st == UCOL_IDENTICAL || st >= toSort[i-1]->strengthFromEmpty) { 160 prevStrength = toSort[i-1]->strengthFromEmpty; 161 } else if(st < toSort[i-1]->strengthFromEmpty) { 162 prevStrength = st; 163 } 164 toSort[i]->strengthFromEmpty = prevStrength; 165 } 166 if(i < size-2) { 167 toSort[i+1]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i+1]); 168 st = probe.getStrength(*toSort[i+1], *toSort[i]); 169 if(st == UCOL_OFF) { 170 logger->log("Cannot deduce distance from empty using next element. Something is very wrong! Line:"); 171 logger->log(toSort[i]->toString(true), true); 172 } else if(st == UCOL_IDENTICAL || st < toSort[i+1]->strengthFromEmpty) { 173 nextStrength = toSort[i+1]->strengthFromEmpty; 174 } else if(st >= toSort[i+1]->strengthFromEmpty) { 175 nextStrength = st; 176 } 177 if(i) { 178 if(prevStrength != nextStrength) { 179 logger->log("Inconsistent results from interpolation! Results will most likely be wrong\n"); 180 } 181 } 182 toSort[i]->strengthFromEmpty = nextStrength; 183 } 184 /* 185 UColAttributeValue problemStrength = UCOL_PRIMARY; 186 for(j = lastChange; j < i ; j++) { 187 if(toSort[j]->strength > problemStrength) { 188 problemStrength = toSort[j]->strength; 189 } 190 } 191 for(j = lastChange; j < i ; j++) { 192 toSort[j]->strengthFromEmpty = problemStrength; 193 } 194 strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty; 195 lastChange = i; 196 debug->log("Problem detected in distances from empty. Most probably word sort is on\n"); 197 */ 198 wordSort = true; 199 } 200 i++; 201 } 202 debug->log("Distances from empty string\n"); 203 debug->log(toStringFromEmpty(), true); 204} 205 206void 207SortedLines::analyse(UErrorCode &status) { 208 frenchSecondary = probe.isFrenchSecondary(status); 209 if(U_FAILURE(status)) { 210 logger->log("Test for French secondary failed. Bailing out!\n"); 211 return; 212 } 213 logger->log("French secondary value is %i\n", frenchSecondary, frenchSecondary); 214 upperFirst = probe.isUpperFirst(status); 215 if(U_FAILURE(status)) { 216 logger->log("Test for upper first failed. Bailing out!\n"); 217 return; 218 } 219 logger->log("upper first value is %i\n", upperFirst, upperFirst); 220 sort(true, true); 221 classifyRepertoire(); 222 getBounds(status); 223 //sort(true, true); 224 addContractionsToRepertoire(status); 225 //sort(true, true); 226 debug->log("\n*** Order after detecting contractions\n\n"); 227 calculateSortKeys(); 228 debug->log(toPrettyString(false, true), true); 229 detectExpansions(); 230} 231 232void SortedLines::init() 233{ 234 size = repertoire.size(); 235 capacity = 5*size; 236 lines = new Line[capacity]; 237 init(repertoire, lines); 238} 239 240void SortedLines::init(UnicodeSet &rep, Line *lin) 241{ 242 243 UnicodeSetIterator exemplarUSetIter(rep); 244 int32_t size = 0; 245 246 while(exemplarUSetIter.next()) { 247 Line *currLine = lin+size; 248 if(exemplarUSetIter.isString()) { // process a string 249 currLine->setTo(exemplarUSetIter.getString()); 250 } else { // process code point 251 currLine->setTo(exemplarUSetIter.getCodepoint()); 252 } 253 currLine->name[currLine->len] = 0; // zero terminate, for our evil ways 254 //currLine->index = size; 255 size++; 256 } 257} 258 259void 260SortedLines::setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort) { 261 int32_t i = 0; 262 for(i = 0; i < sizeToSort; i++) { 263 sortingArray[i] = &elements[i]; 264 } 265} 266 267int32_t 268SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) { 269 int32_t size = table->count(); 270 int32_t hashIndex = -1; 271 const UHashElement *hashElement = NULL; 272 int32_t count = 0; 273 while((hashElement = table->nextElement(hashIndex)) != NULL) { 274 sortingArray[count++] = (Line *)hashElement->value.pointer; 275 } 276 return size; 277} 278 279void 280SortedLines::sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths, UBool link) { 281 int32_t i = 0; 282 int32_t equalStart = 0; 283 UColAttributeValue equalStrength = UCOL_OFF; 284 285 qsort(sortingArray, sizeToSort, sizeof(Line *), probe.comparer); 286 287 if(setStrengths) { // analyze strengths 288 for(i = 1; i < sizeToSort; i++) { 289 sortingArray[i]->strength = probe.getStrength(*sortingArray[i-1], *sortingArray[i]); 290 } 291 // for equal guys, do the code point ordering 292 293 i = 1; 294 while(i < sizeToSort) 295 { 296 if(sortingArray[i]->strength == UCOL_IDENTICAL) { 297 equalStart = i - 1; 298 equalStrength = sortingArray[equalStart]->strength; 299 sortingArray[equalStart]->strength = UCOL_IDENTICAL; 300 while(i < sizeToSort && sortingArray[i]->strength == UCOL_IDENTICAL) { 301 i++; 302 } 303 qsort(sortingArray+equalStart, i-equalStart, sizeof(Line *), codePointCmp); 304 sortingArray[equalStart]->strength = equalStrength; 305 } else { 306 i++; 307 } 308 } 309 310 } 311 312 313 314 if(link) { // do the linking 315 for(i = 0; i < sizeToSort - 1; i++) { 316 Line *curr = *(sortingArray+i); 317 curr->next = *(sortingArray+i+1); 318 (*(sortingArray+i+1))->previous = curr; 319 } 320 } 321} 322 323void 324SortedLines::sort(UBool setStrengths, UBool link) { 325 if(toSortCapacity < size || !toSort) { 326 if(toSort) { 327 delete[] toSort; 328 } 329 toSort = new Line*[size*2]; 330 toSortCapacity = size*2; 331 } 332 333 setSortingArray(toSort, lines, size); 334 sort(toSort, size, setStrengths, link); 335 336 first = last = NULL; 337 338 if(link) { // do the linking 339 first = *toSort; 340 last = *(toSort+size-1); 341 } 342} 343 344void 345SortedLines::updateBounds(UnicodeSet &set) { 346 Line line; 347 UnicodeString s1; 348 UnicodeSetIterator it1(set); 349 while(it1.next()) { 350 if(!debug->isOn()) { 351 logger->log("."); 352 } 353 if(it1.isString()) { // process a string 354 s1.setTo(it1.getString()); 355 } else { // process code point 356 s1.setTo(it1.getCodepoint()); 357 } 358 //line.setTo(s1); 359 UColAttributeValue strength = probe.distanceFromEmptyString(s1); 360 if(probe.compare(UnicodeString(UB[strength]->name), s1) < 0) { 361 // TODO: leak here - fixit! 362 UB[strength] = new Line(s1); 363 //u_strcpy(UB[strength], s1.getTerminatedBuffer()); 364 } 365 } 366 367 368 369} 370 371void SortedLines::addAll(Line* toAdd, int32_t toAddSize) 372{ 373 if(size+toAddSize > capacity) { 374 int32_t doGrowingBreakpoint = 0; 375 // we need to do growing here 376 } 377 int32_t i = 0; 378 379 for(i = 0; i < toAddSize; i++) { 380 lines[size+i] = toAdd[i]; 381 } 382 size += toAddSize; 383} 384 385void SortedLines::setDistancesFromEmpty(Line* array, int32_t arraySize) 386{ 387 int32_t i = 0; 388 for(i = 0; i < arraySize; i++) { 389 array[i].strengthFromEmpty = probe.distanceFromEmptyString(array[i]); 390 } 391} 392 393 394// adds contractions in to repertoire 395int32_t SortedLines::addContractionsToRepertoire(UErrorCode &status) 396{ 397 logger->log("\n*** Detecting contractions\n\n"); 398 contractionsTable = new Hashtable(); 399 int32_t noConts = 0; 400 int32_t allocateSize = 50*size; 401 // first check for simple contractions 402 Line* delta = new Line[allocateSize]; 403 Line** deltaSorted = new Line*[allocateSize]; 404 Line* lesserToAddTo = new Line[allocateSize]; 405 Line* newDelta = new Line[allocateSize]; 406 Line** newDeltaSorted = new Line*[allocateSize]; 407 Line* deltaP = delta; 408 Line** deltaPP = deltaSorted; 409 Line* newDeltaP = newDelta; 410 int32_t deltaSize = 0, lesserToAddToSize = 0, newDeltaSize = 0; 411 logger->log("++ Contraction detection generation 0\n"); 412 noConts = detectContractions(toSort, size, toSort, size, 413 delta, deltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status); 414 setSortingArray(deltaSorted, delta, deltaSize); 415 sort(deltaSorted, deltaSize, true); 416 417 setDistancesFromEmpty(delta, deltaSize); 418 int32_t deltaPSize = deltaSize; 419 //updateBounds(delta); 420 421 int32_t generation = 0; 422 // if we found any, we have to try multiple contractions 423 // However, we want to prevent the contractions explosion 424 // if the number of simple contractions is greater than the 425 // starting size, chances are that we either have an algorithmic 426 // contraction (like iteration marks on w2k) or something 427 // is seriosly wrong. 428 if(deltaPSize < size/2) { 429 while (deltaPSize && generation < 1) { 430 generation++; 431 logger->log("\n++ Contraction detection generation %i\n", generation, generation); 432 // find more, but avoid testing the combinations we already have 433 noConts += detectContractions(toSort, size, deltaPP, deltaPSize, 434 newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status); 435 noConts += detectContractions(deltaPP, deltaPSize, toSort, size, 436 newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status); 437 calculateSortKeys(); 438 439 addAll(deltaP, deltaPSize); 440 setSortingArray(toSort, lines, size); 441 sort(true, true); 442 setSortingArray(newDeltaSorted, newDeltaP, newDeltaSize); 443 sort(newDeltaSorted, newDeltaSize, true); 444 445 // if no new ones, bail 446 //if (newDeltaSize == 0) break; 447 448 deltaPSize = newDeltaSize; 449 newDeltaSize = 0; 450 if(deltaP == delta) { 451 deltaP = newDelta; 452 deltaPP = newDeltaSorted; 453 newDeltaP = delta; 454 } else { 455 deltaP = delta; 456 deltaPP = deltaSorted; 457 newDeltaP = newDelta; 458 } 459 setDistancesFromEmpty(deltaP, deltaPSize); 460 } 461 } 462 status = U_ZERO_ERROR; 463 // add stuff from the last batch 464 addAll(deltaP, deltaPSize); 465 466 // warning: we don't add the lesser ones in recursively, since they will 467 // infinitely loop 468 setDistancesFromEmpty(lesserToAddTo, lesserToAddToSize); 469 addAll(lesserToAddTo, lesserToAddToSize); 470 setSortingArray(toSort, lines, size); 471 sort(true, true); 472 473 delete[] deltaSorted; 474 delete[] delta; 475 delete[] lesserToAddTo; 476 delete[] newDeltaSorted; 477 delete[] newDelta; 478 return noConts; 479} 480 481 482int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize, 483 Line **secondRep, int32_t secondSize, 484 Line *toAddTo, int32_t &toAddToSize, 485 Line *lesserToAddTo, int32_t &lesserToAddToSize, 486 int32_t capacity, UErrorCode &status) 487{ 488 int32_t noConts = 0; 489 int i = 0, j = 0, k = 0; 490 Line lower, upper, trial, toAdd, helper; 491 UChar32 firstStart, firstEnd, secondStart; 492 UChar NFCTrial[256]; 493 int32_t NFCTrialLen = 0; 494 UBool thai; 495 i = -1; 496 while(i < firstSize-1 && U_SUCCESS(status)) { 497 i++; 498 if(!debug->isOn()) { 499 logger->log("\rTesting %05i/%05i. Found %05i conts.", i, firstSize, noConts); 500 } 501 U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart); 502 if(uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY) 503 { 504 continue; 505 } 506 lower = *firstRep[i]; 507 for(j = 0; j < secondSize; j++) { 508 if(noConts == capacity) { 509 return noConts; 510 } 511 U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart); 512 if(firstStart == 0x41 && secondStart == 0x308) { 513 int32_t putBreakPointHere = 0; 514 } 515 if(uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) 516 { 517 continue; 518 } 519 if(duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) { 520 debug->log("Skipping duplicator "); 521 debug->log(secondRep[j]->toString(), true); 522 continue; 523 } 524 525 if(firstRep[i]->name[0] == 0x61 && secondRep[j]->name[0] == 0x308) { 526 int32_t putBreakpointhere = 0; 527 } 528 upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]); 529 //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]); 530 toAdd.setToConcat(firstRep[i], secondRep[j]); 531 U16_GET(firstRep[i]->name, 0, firstRep[i]->len-1, firstRep[i]->len, firstEnd); 532 if((thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION))) { 533 // this means that the lower is single reordering character 534 // if we do the lower test without taking this into account, 535 // we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to 536 // end of lower and in the middle of trial, so we will have 537 // lower = Thai + UB, trial Thai + UB + x, resolving to 538 // UB + Thai vs UB + Thai + x. 539 // for upper bound, we do the similar, so we have 540 // upper = Thai + UB + UB, trial = Thai + UB + x, 541 // resolving to UB + Thai + UB vs UB + Thai + x 542 if(secondRep[j]->firstCC) { 543 UChar32 UBChar; 544 U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar); 545 if(secondRep[j]->firstCC > u_getCombiningClass(UBChar)) { 546 continue; 547 } 548 } 549 upper = *firstRep[i]; 550 upper.append(*UB[UCOL_PRIMARY]); 551 //upper.append(*UB[secondRep[j]->strengthFromEmpty]); 552 upper.append(*UB[UCOL_PRIMARY]); 553 lower.append(*UB[UCOL_PRIMARY]); 554 trial = *firstRep[i]; 555 trial.append(*UB[UCOL_PRIMARY]); 556 trial.append(*secondRep[j]); 557 } else if((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary) 558 || (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) { 559 // Skip because normalization will reorder 560 // there will be a chance to check this again, since if we 561 // try a+b, we will also try b+a 562 continue; 563 } else if(frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) { 564 continue; 565 }else if(firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) { 566 trial.setToConcat(secondRep[j], firstRep[i]); 567 } else { 568 trial.setToConcat(firstRep[i], secondRep[j]); 569 } 570 // Now let's check the trial. The problem is that when you combine characters, 571 // you can end up with concatenation that is unknown for the examined API. 572 NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status); 573 if((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) { 574 if(secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) { 575 continue; 576 } 577 } 578 UChar32 c; 579 U16_GET(NFCTrial, 0, 0, NFCTrialLen, c); 580 helper.setTo(c); 581 if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) { 582 continue; 583 } 584 if(NFCTrialLen > 1) { 585 U16_GET(NFCTrial, 0, NFCTrialLen-1, NFCTrialLen, c); 586 helper.setTo(c); 587 if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) { 588 continue; 589 } 590 } 591 592 if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial 593 // this might be ok, but I'm having doubts. Here is an additional check: 594 if(firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) { 595 // I'm basically saying that I'll add this kind of contraction for cases where I combine 596 // one letter with an accent OR when I'm combining more than one symbol with a letter. 597 noteContraction("L", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status); 598 } 599 } 600 else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper?? 601 noteContraction("U", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 602 } 603#if 0 604 else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY) 605 { 606 Line expansionLine; 607 if(getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) && 608 expansionLine.len && !(expansionLine == *secondRep[j])) { 609 noteContraction("D", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 610 } 611 } 612#endif 613 else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) { 614 noteContraction("D1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 615 } 616 else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) 617 { 618 // I have added an additional check. The checks versus upper and lower bound should be sufficient 619 // when the right side is a combining mark. There might be a reordering of combining marks, but 620 // that should be already visible in their order. 621 // compare the sequence 622 // Y- <? Y <? Y+ 623 // and 624 // XY- <? XY <? XY+ 625 Line xym, xyp, xy; 626 UBool xymIsContraction = false, toAddIsContraction = false; 627 if(j) { 628 if(((!secondRep[j-1]->firstCC || firstRep[i]->lastCC < secondRep[j-1]->firstCC) && !frenchSecondary) 629 ||((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j-1]->lastCC) && frenchSecondary)) { 630 xym.setToConcat(firstRep[i], secondRep[j-1]); 631 toAdd.strength = probe.getStrength(xym, toAdd); 632 if(secondRep[j]->strength != toAdd.strength) { 633 // there is possibility that either xym or xy are contractions 634 // There are two situations: 635 // xym > xy or xym <n xy and ym <k y but n != k 636 // if they are reordered, we are going to see if each of them 637 // is further reordered 638 if(toAdd.strength == UCOL_OFF) { 639 // check whether toAdd shifted more down 640 k = j - 2; 641 while(k>=0 && secondRep[k]->strength > secondRep[j]->strength) { 642 k--; 643 } 644 while(!toAddIsContraction && k>=0) { 645 xyp.setToConcat(firstRep[i], secondRep[k]); 646 if(contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) { 647 k--; 648 continue; 649 } 650 if(probe.compare(xyp, xym) >= 0) { 651 // xyp looks like a contraction 652 noteContraction("!1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 653 toAddIsContraction = true; 654 } else { 655 break; 656 } 657 } 658 // first let's see if xym has moved beyond 659 if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) { 660 k = j+1; 661 // ignore weaker strengths 662 while(k < secondSize && secondRep[k]->strength > secondRep[j]->strength) { 663 k++; 664 } 665 // check if we skipped the following guy 666 if(k < secondSize) { 667 xyp.setToConcat(firstRep[i], secondRep[k]); 668 if(probe.compare(xyp, xym) <= 0) { 669 // xyp looks like a contraction 670 noteContraction("!2", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status); 671 xymIsContraction = true; 672 } 673 } 674 } else { 675 xymIsContraction = true; 676 } 677 // if they have reordered, but none has moved, then we add them both 678 // and hope for the best 679 if(!xymIsContraction && !toAddIsContraction) { 680 // it is possible that there is an NFC version version of one of the 681 // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might have 682 // have a false contraction. 683 trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status); 684 //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial); 685 if(trial == toAdd) { 686 noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status); 687 noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 688 } else { 689 noteContraction("!4", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 690 } 691 } 692 } else { // only the strength has changed 693 // check whether the previous is contraction and if not, add the current 694 if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) { 695 noteContraction("!5", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status); 696 } 697 } 698 } 699 } 700 } 701 } 702 if(thai) { // restore lower 703 lower = *firstRep[i]; 704 } 705 } 706 } 707 return noConts; 708} 709 710void 711SortedLines::noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status) 712{ 713 Line toAdd; 714 toAdd.setToConcat(left, right); 715 toAdd.left = left; 716 toAdd.right = right; 717 // if we're adding an accent to an existing contraction, we want to check 718#if 0 719 Line test, trial1, trial2; 720 if(right->strengthFromEmpty > UCOL_PRIMARY) { 721 if(left->right && left->right->previous && left->right->next) { 722 test.setToConcat(left->left, left->right->previous); 723 trial1.setToConcat(&test, right); 724 725 test.setToConcat(left->left, left->right->next); 726 trial2.setToConcat(&test, right); 727 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { 728 // this means that the contraction has been broken by the newly added accent 729 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above 730 debug->log("Con -"); 731 debug->log(msg); 732 debug->log(toAdd.toString(false), true); 733 return; 734 } 735 } else { 736 if(right->previous && right->next) { 737 trial1.setToConcat(left, right->previous); 738 trial2.setToConcat(left, right->next); 739 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { 740 // this means that the contraction has been broken by the newly added accent 741 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above 742 debug->log("Con -"); 743 debug->log(msg); 744 debug->log(toAdd.toString(false), true); 745 return; 746 } 747 } 748 if(left->previous && left->next) { 749 trial1.setToConcat(left->previous, right); 750 trial2.setToConcat(left->next, right); 751 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { 752 // this means that the contraction has been broken by the newly added accent 753 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above 754 debug->log("Con -"); 755 debug->log(msg); 756 debug->log(toAdd.toString(false), true); 757 return; 758 } 759 } 760 761 } 762 } 763 if(right->right && right->right->strengthFromEmpty > UCOL_PRIMARY && right->left->previous && right->left->next) { // maybe we already had a contraction with an accent 764 test.setToConcat(right->left->previous, right->right); 765 trial1.setToConcat(left, &test); 766 test.setToConcat(right->left->next, right->right); 767 trial2.setToConcat(left, &test); 768 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) { 769 // this means that the contraction has been broken by the newly added accent 770 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above 771 debug->log("Con -"); 772 debug->log(msg); 773 debug->log(toAdd.toString(false), true); 774 return; 775 } 776 } 777#endif 778 if(contractionsTable->get(UnicodeString(toAdd.name, toAdd.len)) == NULL) { 779 if(probe.distanceFromEmptyString(toAdd) <= UCOL_TERTIARY) { 780 toAddTo[toAddToSize++] = toAdd; 781 contractionsTable->put(UnicodeString(toAdd.name, toAdd.len), &toAdd, status); 782 noConts++; 783 debug->log(msg); 784 debug->log(" Con + "); 785 debug->log(toAdd.toString(false), true); 786 787 if(!left->sortKey) { 788 calculateSortKey(*left); 789 } 790 debug->log(left->dumpSortkey()); 791 debug->log(" + "); 792 793 if(!right->sortKey) { 794 calculateSortKey(*right); 795 } 796 debug->log(right->dumpSortkey()); 797 debug->log(" = "); 798 799 calculateSortKey(toAdd); 800 debug->log(toAdd.dumpSortkey(), true); 801 if(noConts > size/2) { 802 status = U_BUFFER_OVERFLOW_ERROR; 803 } 804 } 805 } 806} 807 808 809UBool 810SortedLines::getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine) 811{ 812 int expIndexSize = 0; 813 UColAttributeValue expStrength = UCOL_OFF; 814 int32_t comparisonResult = 0; 815 int32_t i = 0, k = 0, prevK = 0; 816 Line trial; 817 UBool sequenceCompleted = false; 818 int32_t expIndexes[256]; 819 int32_t expIndexesSize = 0; 820 821 if(!sequenceCompleted) { 822 expIndexSize = 0; 823 expansionLine.clear(); 824 825 // we will start from strength between the expansion 826 // and the target (toSort[i] and toSort[j]. First we 827 // will add as many primaries as possible. Then we will 828 // try to add secondary pieces and then tertiary. 829 // found an expansion - what is the expanding sequence? 830 831 expStrength = UCOL_PRIMARY; 832 while(!sequenceCompleted) { 833 k = 0; 834 prevK = 0; 835 while(k < size) { 836 if(expansionLine.len > 15) { 837 sequenceCompleted = true; 838 break; 839 } 840 while(k < size && toSort[k]->strength != UCOL_PRIMARY) 841 { 842 k++; 843 } 844 // nothing found 845 if(k == size) { 846 break; 847 } 848 // we need to skip over reordering things. If they were worthy, they would 849 // have been detected in the previous iteration. 850 //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) { 851 //k++; 852 //continue; 853 //} 854 trial = previous; 855 trial.append(expansionLine); 856 trial.append(*toSort[k]); 857 if(toSort[k]->name[0] == 0x0067) { 858 int32_t putBreakPointHere = 0; 859 } 860 comparisonResult = probe.compare(trial, expansion); 861 if(comparisonResult == 0) { 862 expansionLine = *toSort[k]; 863 return true; 864 } else if (comparisonResult > 0) { 865 if(prevK) { 866 if(exp == *toSort[prevK]) { 867 expansionLine = exp; 868 return true; 869 } 870 i = prevK; 871 while(i < k-1) { 872 i++; 873 if(toSort[i]->strength > exp.strength) { 874 continue; 875 } 876 trial = previous; 877 trial.append(expansionLine); 878 trial.append(*toSort[i]); 879 if(probe.compare(trial, expansion) > 0) { 880 break; 881 } 882 } 883 // we got into situation where we have ch > ch+dot-below 884 // however, ch is a contraction and therefore we cannot use 885 // it properly. If we have hit on a contraction, we'll just try 886 // to continue. Probably need more logic here. 887 if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) { 888 expansionLine.append(*toSort[i-1]); 889 expIndexes[expIndexSize++] = i-1; 890 break; 891 } else { 892 int32_t putBreakPointHere = 0; 893 } 894 } else { 895 sequenceCompleted = true; 896 break; 897 } 898 //break; 899 } 900 prevK = k; 901 k++; 902 } 903 if(!prevK || k == size) { 904 break; 905 } 906 } 907 } 908 return expIndexSize > 0; 909} 910 911int32_t 912SortedLines::gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength) 913{ 914 int32_t i = expansionIndex, k = resetIndex+1, n = 0, m = 0, start = 0; 915 UBool haveChanges = false; 916 Line trial, prefix, suffix; 917 // we will first try goosing up the reset index 918 //while(toSort[k]->strength >= strength) 919 for( ; toSort[k]->strength == strength; k++) 920 { 921 //if(toSort[k]->strength > strength) { 922 //continue; 923 //} 924 trial.setToConcat(toSort[k], &expLine); 925 if(probe.compare(trial, *toSort[i]) > 0) { 926 break; 927 } 928 } 929 resetIndex = k-1; 930 931 // goose up individual characters 932 prefix = *toSort[resetIndex]; 933 for(n = 0; n < expIndexSize; n++) { 934 suffix.clear(); 935 for(m = n+1; m < expIndexSize; m++) { 936 suffix.append(*toSort[expIndexes[m]]); 937 } 938 k = expIndexes[n]+1; 939 //while(toSort[k]->strength >= strength) 940 for( ; toSort[k]->strength == strength; k++) 941 { 942 //if(toSort[k]->strength > strength) { 943 //continue; 944 //} 945 trial.setToConcat(&prefix, toSort[k]); 946 trial.append(suffix); 947 if(probe.compare(trial, *toSort[i]) > 0) { 948 break; 949 } 950 } 951 if(k > expIndexes[n]+1) { 952 haveChanges = true; 953 expIndexes[n] = k-1; 954 } 955 prefix.append(*toSort[expIndexes[n]]); 956 } 957 958 // try inserting ignorables 959 UColAttributeValue lastStr = UCOL_OFF; 960 k = 0; 961 while(toSort[k]->strengthFromEmpty > strength) { 962 k++; 963 } 964 if(toSort[k]->strengthFromEmpty == strength) { 965 start = k; 966 prefix = *toSort[resetIndex]; 967 n = 0; 968 while(n <= expIndexSize) { 969 suffix.clear(); 970 for(m = n; m < expIndexSize; m++) { 971 suffix.append(*toSort[expIndexes[m]]); 972 } 973 k = start; 974 while(toSort[k]->strengthFromEmpty == strength) { 975 trial.setToConcat(&prefix, toSort[k]); 976 trial.append(suffix); 977 lastStr = probe.getStrength(trial, *toSort[i]); 978 if(lastStr == UCOL_OFF) { // shot over - we won't find anything here 979 break; 980 } else if(lastStr > strength) { 981 for(m = expIndexSize; m > n; m--) { 982 expIndexes[m] = expIndexes[m-1]; 983 } 984 expIndexes[n] = k; 985 expIndexSize++; 986 haveChanges = true; 987 break; 988 } 989#if 0 990 if(probe.compare(trial, *toSort[i]) > 0) { 991 // if the first one skips, that means that 992 // this position doesn't work 993 if(k > start) { 994 // insert an ignorable on position n 995 for(m = expIndexSize; m > n; m--) { 996 expIndexes[m] = expIndexes[m-1]; 997 } 998 expIndexes[n] = k-1; 999 expIndexSize++; 1000 haveChanges = true; 1001 if(n == expIndexSize-1) { // added to the end of the string 1002 UColAttributeValue str = probe.getStrength(trial, *toSort[i]); 1003 int32_t putBreakHere = 0; 1004 } 1005 } 1006 break; 1007 } else { 1008 lastStr = probe.getStrength(trial, *toSort[i]); 1009 } 1010#endif 1011 k++; 1012 } 1013 prefix.append(*toSort[expIndexes[n]]); 1014 n++; 1015 } 1016 } 1017 1018 if(haveChanges) { 1019 expLine.clear(); 1020 for(m = 0; m < expIndexSize; m++) { 1021 expLine.append(*toSort[expIndexes[m]]); 1022 } 1023 } 1024 return resetIndex; 1025} 1026 1027int32_t 1028SortedLines::detectExpansions() 1029{ 1030 logger->log("\n*** Detecting expansions\n\n"); 1031 int32_t exCount = 0; 1032 int32_t i = 0, j = 0, k = 0, prevK = 0; 1033 Line *previous, trial, expansionLine; 1034 UBool foundExp = false, sequenceCompleted = false; 1035 UColAttributeValue strength = UCOL_OFF; 1036 UColAttributeValue maxStrength = UCOL_IDENTICAL; 1037 UColAttributeValue expStrength = UCOL_OFF; 1038 int32_t expIndexes[256]; 1039 int32_t expIndexSize = 0; 1040 memset(expIndexes, 0, sizeof(expIndexes)); 1041 1042 // for each element, we look back to find whether there is such a q for which 1043 // q <n x < qUBn. These are possible expansions. When going backwards we skip 1044 // over already detected expansions. 1045 i = 0; 1046 // it turns out that looking at accents as possible expansions is 1047 // quite a stupid thing to do, especially on non ICU platforms. 1048 // Previously this line skipped over identicals only, but 1049 // now we are going to skip all the way to non-ignorables. 1050 while(toSort[i]->strengthFromEmpty > UCOL_PRIMARY) { 1051 i++; 1052 } 1053 i++; 1054 for( ; i < size; i++) { 1055 if(toSort[i]->name[0]==0x0063 && toSort[i]->name[1] == 0x68) // && toSort[i]->name[1] == 0x308)0043 0043 0219 1056 { 1057 int32_t putBreakpointhere = 0; 1058 } 1059 foundExp = false; 1060 sequenceCompleted = false; 1061 strength = toSort[i]->strength; 1062 if(strength == UCOL_IDENTICAL && toSort[i-1]->isExpansion == true) { 1063 u_strcpy(toSort[i]->expansionString, toSort[i-1]->expansionString); 1064 toSort[i]->expLen = toSort[i-1]->expLen; 1065 toSort[i]->isExpansion = true; 1066 toSort[i]->expIndex = toSort[i-1]->expIndex; 1067 toSort[i]->expStrength = UCOL_IDENTICAL; 1068 //toSort[i]->expStrength = toSort[i-1]->expStrength; 1069 foundExp = true; 1070 sequenceCompleted = true; 1071 } 1072 //logger->log("%i %i\n", i, j); 1073 while(!foundExp && strength <= maxStrength) { 1074 j = i-1; 1075 while(j && (toSort[j]->isExpansion == true || toSort[j]->isRemoved == true)) { 1076 //if(toSort[j]->strength < strength) { 1077 //strength = toSort[j]->strength; 1078 //} 1079 j--; 1080 } 1081 1082 //while(j && toSort[j]->strength > strength) 1083 while(j && toSort[j]->strength > probe.getStrength(*toSort[j], *toSort[i])) 1084 { 1085 j--; 1086 } 1087 //if(toSort[j]->strength == strength) { 1088 previous = toSort[j]; 1089 if(previous->strengthFromEmpty >= UCOL_IDENTICAL || 1090 (previous->strengthFromEmpty == UCOL_SECONDARY 1091 && strength == UCOL_SECONDARY 1092 && previous->lastCC > UB[strength]->firstCC)) { 1093 break; 1094 //continue; 1095 } 1096 //trial.setToConcat(previous, UB[strength]); 1097 trial.setToConcat(previous, UB[probe.getStrength(*toSort[j], *toSort[i])]); 1098 if(probe.compare(trial, *toSort[i]) > 0) { 1099 foundExp = true; 1100 } 1101 //} 1102 if(strength == UCOL_QUATERNARY) { 1103 strength = UCOL_IDENTICAL; 1104 } else { 1105 strength = (UColAttributeValue)(strength + 1); 1106 } 1107 } 1108 // calculate the expanding sequence 1109 if(foundExp && !sequenceCompleted) { 1110 expIndexSize = 0; 1111 expansionLine.clear(); 1112 exCount++; 1113 // we will start from strength between the expansion 1114 // and the target (toSort[i] and toSort[j]. First we 1115 // will add as many primaries as possible. Then we will 1116 // try to add secondary pieces and then tertiary. 1117 // found an expansion - what is the expanding sequence? 1118 1119 expStrength = UCOL_PRIMARY; 1120 while(!sequenceCompleted) { 1121 k = 0; 1122 prevK = 0; 1123 while(k < size) { 1124 if(expansionLine.len > 15) { 1125 sequenceCompleted = true; 1126 break; 1127 } 1128 while(k < size && toSort[k]->strength != UCOL_PRIMARY) { 1129 k++; 1130 } 1131 // nothing found 1132 if(k == size) { 1133 break; 1134 } 1135 // we need to skip over reordering things. If they were worthy, they would 1136 // have been detected in the previous iteration. 1137 //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) { 1138 //k++; 1139 //continue; 1140 //} 1141 trial = *previous; 1142 trial.append(expansionLine); 1143 trial.append(*toSort[k]); 1144 if(toSort[k]->name[0] == 0x0067) { 1145 int32_t putBreakPointHere = 0; 1146 } 1147 if(probe.compare(trial, *toSort[i]) > 0) { 1148 if(prevK) { 1149 // we got into situation where we have ch > ch+dot-below 1150 // however, ch is a contraction and therefore we cannot use 1151 // it properly. If we have hit on a contraction, we'll just try 1152 // to continue. Probably need more logic here. 1153 if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) { 1154 expansionLine.append(*toSort[prevK]); 1155 expIndexes[expIndexSize++] = prevK; 1156 break; 1157 } else { 1158 int32_t putBreakPointHere = 0; 1159 } 1160 } else { 1161 sequenceCompleted = true; 1162 break; 1163 } 1164 //break; 1165 } 1166 prevK = k; 1167 k++; 1168 } 1169 if(!prevK || k == size) { 1170 break; 1171 } 1172 } 1173 // after this we have primaries lined up. 1174 // we are going to goose up with secondaries and 1175 // tertiaries 1176 trial.setToConcat(toSort[j], &expansionLine); 1177 expStrength = probe.getStrength(trial, *toSort[i]); 1178 if(expStrength > UCOL_PRIMARY) { 1179 if(expStrength == UCOL_SECONDARY || expStrength == UCOL_OFF) { 1180 j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_SECONDARY); 1181 trial.setToConcat(toSort[j], &expansionLine); 1182 expStrength = probe.getStrength(trial, *toSort[i]); 1183 if(expStrength == UCOL_TERTIARY) { 1184 j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY); 1185 } 1186 } else if(expStrength == UCOL_TERTIARY) { 1187 j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY); 1188 } 1189 } 1190 trial.setToConcat(toSort[j], &expansionLine); 1191 expStrength = probe.getStrength(trial, *toSort[i]); 1192 if(expansionLine.len) { 1193 if(expansionLine.name[0] == 0x73 && expansionLine.name[1] == 0x7a) { 1194 int32_t putBreakpointhere = 0; 1195 } 1196 UBool isExpansionLineAContraction = (contractionsTable->get(UnicodeString(expansionLine.name, expansionLine.len)) != NULL); 1197 // we have an expansion line and an expansion. There could be some expansions where 1198 // the difference between expansion line and the end of expansion sequence is less or 1199 // equal than the expansion strength. These should probably be removed. 1200 int32_t diffLen = toSort[i]->len - expansionLine.len; 1201 if(diffLen > 0) { 1202 trial.setTo(UnicodeString(toSort[i]->name + diffLen, toSort[i]->len - diffLen)); 1203 } else { 1204 trial = *toSort[i]; 1205 } 1206 UColAttributeValue s1 = probe.getStrength(trial, expansionLine); 1207 if(s1 == UCOL_OFF) { 1208 s1 = probe.getStrength(expansionLine, trial); 1209 } 1210 if((!isExpansionLineAContraction && s1 >= expStrength) || (diffLen <= 0 && s1 == UCOL_IDENTICAL)) { 1211 contractionsTable->remove(UnicodeString(toSort[i]->name, toSort[i]->len)); 1212 toSort[i]->isRemoved = true; 1213 if(toSort[i]->next && toSort[i]->previous) { 1214 toSort[i]->previous->next = toSort[i]->next; 1215 } 1216 if(toSort[i]->previous && toSort[i]->next) { 1217 toSort[i]->next->previous = toSort[i]->previous; 1218 } 1219 debug->log("Exp -N: "); 1220 debug->log(toSort[i]->toString(false)); 1221 debug->log(" / "); 1222 debug->log(expansionLine.toString(false), true); 1223 } 1224 else 1225 { 1226 u_strncat(toSort[i]->expansionString, expansionLine.name, expansionLine.len); 1227 toSort[i]->isExpansion = true; 1228 toSort[i]->expStrength = expStrength; 1229 toSort[i]->expLen = expansionLine.len; 1230 toSort[i]->expansionString[toSort[i]->expLen] = 0; 1231 toSort[i]->expIndex = j; 1232 } 1233 } 1234 } 1235 if(toSort[i]->isExpansion == true) { 1236 if(debug->isOn()) { 1237 debug->log("Exp + : &"); 1238 debug->log(toSort[j]->toString(false)); 1239 debug->log(toSort[i]->strengthToString(toSort[i]->expStrength, true)); 1240 debug->log(toSort[i]->toString(false)); 1241 debug->log(" "); 1242 if(!toSort[j]->sortKey) { 1243 calculateSortKey(*toSort[j]); 1244 } 1245 debug->log(toSort[j]->dumpSortkey()); 1246 debug->log(" ... "); 1247 if(!toSort[i]->sortKey) { 1248 calculateSortKey(*toSort[i]); 1249 } 1250 debug->log(toSort[i]->dumpSortkey()); 1251 calculateSortKey(expansionLine); 1252 debug->log("/"); 1253 debug->log(expansionLine.dumpSortkey(), true); 1254 } 1255 1256 } 1257 } 1258 // after detecting expansions, we want to position them. 1259 // it is better to position expansions after all have been detected, 1260 // since otherwise we will change the ordering. 1261 for(i = size-1; i >= 0; i--) { 1262 if(toSort[i]->isExpansion) { 1263 if(toSort[i]->name[0] == 0x2A3) { 1264 int32_t putBreakPointHere = 0; 1265 } 1266 if(i) { 1267 if(toSort[i]->previous) { 1268 toSort[i]->previous->next = toSort[i]->next; 1269 } 1270 } 1271 if(i < size-1) { 1272 if(toSort[i]->next) { 1273 toSort[i]->next->previous = toSort[i]->previous; 1274 } 1275 } 1276 j = toSort[i]->expIndex; 1277 toSort[i]->next = toSort[j]->next; 1278 toSort[i]->previous = toSort[j]; 1279 toSort[j]->next = toSort[i]; 1280 if(toSort[i]->next) { 1281 toSort[i]->next->previous = toSort[i]; 1282 } 1283 toSort[i]->strength = toSort[i]->expStrength; 1284 } 1285 } 1286 return exCount; 1287} 1288 1289 1290Line * 1291SortedLines::getFirst() { 1292 current = first; 1293 return current; 1294} 1295 1296Line * 1297SortedLines::getLast() { 1298 current = last; 1299 return current; 1300} 1301 1302void 1303SortedLines::add(Line *line, UBool linkIn) { 1304 if(size++ == capacity) { 1305 // grow 1306 } 1307 lines[size] = *line; 1308 Line *toAdd = &lines[size]; 1309 if(linkIn && first) { 1310 Line *current = first; 1311 while(current != NULL && probe.comparer(¤t, &toAdd) < 0) { 1312 current = current->next; 1313 } 1314 if(current == NULL) { 1315 toAdd->previous = last; 1316 toAdd->next = NULL; 1317 if(last != NULL) { 1318 last->next = toAdd; 1319 } 1320 last = toAdd; 1321 if(first == NULL) { 1322 first = toAdd; 1323 } 1324 } else { // current != NULL 1325 toAdd->next = current; 1326 toAdd->previous = current->previous; 1327 if(current->previous) { 1328 current->previous->next = toAdd; 1329 } else { 1330 first = toAdd; 1331 } 1332 current->previous = toAdd; 1333 } 1334 } 1335} 1336 1337 1338Line * 1339SortedLines::getNext() 1340{ 1341 if(current != NULL) { 1342 current=current->next; 1343 } 1344 return current; 1345} 1346 1347Line * 1348SortedLines::getPrevious() 1349{ 1350 if(current != NULL) { 1351 current=current->previous; 1352 } 1353 return current; 1354} 1355 1356Line * 1357SortedLines::operator[](int32_t index) 1358{ 1359 int32_t i = 0; 1360 Line *c = first; 1361 for(i = 0; i < index; i++) { 1362 if(c != NULL) { 1363 c = c->next; 1364 } 1365 } 1366 return c; 1367} 1368 1369UnicodeString 1370SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) { 1371 UnicodeString result; 1372 int32_t i = 0; 1373 1374 Line *line = NULL; 1375 Line *previous = sortedLines[0]; 1376 if(printSortKeys && !sortkeys) { 1377 printSortKeys = false; 1378 } 1379 if(previous->isReset) { 1380 result.append(" & "); 1381 result.append(previous->name, previous->len); 1382 if(pretty) { 1383 result.append(" # "); 1384 result.append(previous->stringToName(previous->name, previous->len)); 1385 result.append("\n"); 1386 } 1387 } else if(!previous->isRemoved) { 1388 result.append(previous->toString(pretty)); 1389 if(pretty) { 1390 result.append("\n"); 1391 } 1392 } 1393 i = 1; 1394 while((i < linesSize && !useLinks) || (previous->next && useLinks)) { 1395 if(useLinks) { 1396 line = previous->next; 1397 } else { 1398 line = sortedLines[i]; 1399 } 1400 if(line->isReset) { 1401 result.append(" &"); 1402 result.append(line->name, line->len); 1403 if(pretty) { 1404 result.append(" # "); 1405 result.append(line->stringToName(line->name, line->len)); 1406 result.append("\n"); 1407 } 1408 } else if(!line->isRemoved) { 1409 if(i > 0) { 1410 result.append(line->strengthToString(line->strength, pretty)); 1411 } 1412 result.append(line->toString(pretty)); 1413 if(printSortKeys) { 1414 result.append(line->dumpSortkey()); 1415 } 1416 if(pretty) { 1417 result.append("\n"); 1418 } 1419 } 1420 previous = line; 1421 i++; 1422 } 1423 return result; 1424} 1425 1426SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) : 1427toSort(NULL), 1428toSortCapacity(0), 1429lines(NULL), 1430size(0), 1431capacity(0), 1432first(NULL), 1433last(NULL), 1434logger(logger), 1435debug(debug), 1436contractionsTable(NULL), 1437duplicators(NULL), 1438maxExpansionPrefixSize(0), 1439wordSort(false), 1440frenchSecondary(false), 1441upperFirst(false), 1442sortkeys(NULL), 1443sortkeyOffset(0) 1444{ 1445 debug->log("*** loading a dump\n"); 1446 memset(UB, 0, sizeof(UB)); 1447 int32_t i = 0; 1448 for(i = 0; i < UCOL_OFF; i++) { 1449 UB[i] = ∅ 1450 } 1451 1452 int32_t newFrench, newUpperFirst; 1453 fscanf(file, "%i,%i,%i\n", &size, &newFrench, &newUpperFirst); 1454 debug->log("Read size %i, frenchSecondary %i and upperFirst %i\n", size, newFrench, newUpperFirst); 1455 frenchSecondary = (UBool)newFrench; 1456 upperFirst = (UBool)newUpperFirst; 1457 capacity = size; 1458 lines = new Line[capacity]; 1459 i = 0; 1460 1461 char buff[256]; 1462 1463 while(fgets(buff, 256, file)) { 1464 if(i % 20 == 0) { 1465 logger->log("\rLine: %04i", i, buff); 1466 } 1467 lines[i].initFromString(buff, 256, status); 1468 if(i) { 1469 lines[i].previous = &lines[i-1]; 1470 lines[i-1].next = &lines[i]; 1471 } 1472 i++; 1473 } 1474 size = i; 1475 toSort = new Line*[size]; 1476 setSortingArray(toSort, lines, size); 1477 first = &lines[0]; 1478 last = &lines[size-1]; 1479} 1480 1481void 1482SortedLines::toFile(FILE *file, UBool useLinks, UErrorCode &status) 1483{ 1484 fprintf(file, "%i,%i,%i\n", size, frenchSecondary, upperFirst); 1485 int32_t i = 1; 1486 Line *previous = toSort[0]; 1487 Line *line = NULL; 1488 char buff[256]; 1489 previous->write(buff, 256, status); 1490 fprintf(file, "%s\n", buff); 1491 fflush(file); 1492 while(previous->next) { 1493 if(useLinks) { 1494 line = previous->next; 1495 } else { 1496 line = toSort[i]; 1497 } 1498 line->write(buff, 256, status); 1499 fprintf(file, "%s\n", buff); 1500 i++; 1501 previous = line; 1502 } 1503} 1504 1505 1506 1507UnicodeString 1508SortedLines::toStringFromEmpty() { 1509 UBool useLinks = false; 1510 UBool pretty = false; 1511 UnicodeString result; 1512 int32_t i = 0; 1513 1514 Line *line = NULL; 1515 Line *previous = toSort[0]; 1516 if(previous->isReset) { 1517 result.append(" & "); 1518 if(pretty) { 1519 result.append("\n"); 1520 } 1521 result.append(previous->name, previous->len); 1522 } else if(!previous->isRemoved) { 1523 result.append(previous->toString(pretty)); 1524 if(pretty) { 1525 result.append("\n"); 1526 } 1527 } 1528 i = 1; 1529 while(i < size || previous->next) { 1530 if(useLinks) { 1531 line = previous->next; 1532 } else { 1533 line = toSort[i]; 1534 } 1535 if(line->isReset) { 1536 result.append(" &"); 1537 result.append(line->name, line->len); 1538 if(pretty) { 1539 result.append(" # "); 1540 result.append(line->stringToName(line->name, line->len)); 1541 result.append("\n"); 1542 } 1543 } else if(!line->isRemoved) { 1544 if(i > 0) { 1545 result.append(line->strengthToString(line->strengthFromEmpty, pretty)); 1546 } 1547 result.append(line->toString(pretty)); 1548 if(pretty) { 1549 result.append("\n"); 1550 } 1551 } 1552 previous = line; 1553 i++; 1554 } 1555 return result; 1556} 1557 1558UnicodeString 1559SortedLines::toString(UBool useLinks) 1560{ 1561 return arrayToString(toSort, size, false, useLinks, false); 1562} 1563 1564 1565UnicodeString 1566SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys) 1567{ 1568 return arrayToString(toSort, size, true, useLinks, printSortKeys); 1569} 1570 1571UnicodeString 1572SortedLines::toOutput(const char *format, 1573 const char *locale, const char *platform, const char *reference, 1574 UBool useLinks, UBool initialize, UBool moreToCome) { 1575 if(strcmp(format, "HTML") == 0) { 1576 return toHTML(locale, platform, reference, useLinks, initialize, moreToCome); 1577 } else if(strcmp(format, "XML") == 0) { 1578 return toXML(locale, platform, reference, useLinks, initialize, moreToCome); 1579 } else { 1580 return toBundle(locale, platform, reference, useLinks, initialize, moreToCome); 1581 } 1582} 1583 1584 1585UnicodeString 1586SortedLines::toHTML(const char *locale, 1587 const char *platform, const char *reference, 1588 UBool useLinks, UBool initialize, UBool moreToCome) 1589{ 1590 UnicodeString result; 1591 int32_t i = 0; 1592 if(initialize) { 1593 result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n"); 1594 result.append("# Collation data resource bundle generated for locale: "); 1595 result.append(locale); 1596 result.append("<br>\n# For platform "); 1597 result.append(platform); 1598 result.append(" reference platform "); 1599 result.append(reference); 1600 result.append("<br><br>\n\n\n"); 1601 1602 result.append(locale); 1603 if(platform) { 1604 result.append("_"); 1605 result.append(platform); 1606 } 1607 if(reference) { 1608 result.append("_vs_"); 1609 result.append(reference); 1610 } 1611 result.append(" {<br>\n"); 1612 1613 result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n"); 1614 } 1615 1616 if(frenchSecondary) { 1617 result.append("[backwards 2]<br>\n"); 1618 } 1619 if(upperFirst) { 1620 result.append("[casefirst upper]<br>\n"); 1621 } 1622 1623 Line *line = toSort[0]; 1624 1625 i = 0; 1626 while((i < size && !useLinks) || (line->next && useLinks)) { 1627 if(line->isReset || !line->isRemoved) { 1628 result.append(line->toHTMLString()); 1629 } 1630 i++; 1631 if(useLinks) { 1632 line = line->next; 1633 } else { 1634 line = toSort[i]; 1635 } 1636 } 1637 if(!moreToCome) { 1638 result.append(" }<br>\n }<br>\n }<br>\n}<br>\n"); 1639 1640 result.append("</html>\n"); 1641 } 1642 1643 return result; 1644} 1645 1646UnicodeString 1647SortedLines::toXML(const char *locale, 1648 const char *platform, const char *reference, 1649 UBool useLinks, UBool initialize, UBool moreToCome) 1650{ 1651 UnicodeString result; 1652 int32_t i = 0; 1653 if(initialize) { 1654 result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n"); 1655 result.append("# Collation data resource bundle generated for locale: "); 1656 result.append(locale); 1657 result.append("<br>\n# For platform "); 1658 result.append(platform); 1659 result.append(" reference platform "); 1660 result.append(reference); 1661 result.append("<br><br>\n\n\n"); 1662 1663 result.append(locale); 1664 if(platform) { 1665 result.append("_"); 1666 result.append(platform); 1667 } 1668 if(reference) { 1669 result.append("_vs_"); 1670 result.append(reference); 1671 } 1672 result.append(" {<br>\n"); 1673 1674 result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n"); 1675 } 1676 1677 if(frenchSecondary) { 1678 result.append("[backwards 2]<br>\n"); 1679 } 1680 if(upperFirst) { 1681 result.append("[casefirst upper]<br>\n"); 1682 } 1683 1684 Line *line = toSort[0]; 1685 1686 i = 0; 1687 while((i < size && !useLinks) || (line->next && useLinks)) { 1688 if(line->isReset || !line->isRemoved) { 1689 result.append(line->toHTMLString()); 1690 } 1691 i++; 1692 if(useLinks) { 1693 line = line->next; 1694 } else { 1695 line = toSort[i]; 1696 } 1697 } 1698 if(!moreToCome) { 1699 result.append(" }<br>\n }<br>\n }<br>\n}<br>\n"); 1700 1701 result.append("</html>\n"); 1702 } 1703 1704 return result; 1705} 1706 1707UnicodeString 1708SortedLines::toBundle(const char *locale, 1709 const char *platform, const char *reference, 1710 UBool useLinks, UBool initialize, UBool moreToCome) 1711{ 1712 UnicodeString result; 1713 int32_t i = 0; 1714 1715 if(initialize) { 1716 result.append("// Collation data resource bundle generated for locale: "); 1717 result.append(locale); 1718 result.append("\n// For platform "); 1719 result.append(platform); 1720 result.append(" reference platform "); 1721 result.append(reference); 1722 result.append("\n\n\n"); 1723 1724 result.append(locale); 1725 /* 1726 if(platform) { 1727 result.append("_"); 1728 result.append(platform); 1729 } 1730 if(reference) { 1731 result.append("_vs_"); 1732 result.append(reference); 1733 } 1734 */ 1735 result.append(" {\n"); 1736 1737 result.append(" collations {\n standard {\n Sequence {\n"); 1738 } 1739 1740 if(frenchSecondary) { 1741 result.append("[backwards 2]\n"); 1742 } 1743 if(upperFirst) { 1744 result.append("[casefirst upper]\n"); 1745 } 1746 1747 Line *line = toSort[0]; 1748 1749 i = 0; 1750 while((i < size && !useLinks) || (line->next && useLinks)) { 1751 if(line->isReset || !line->isRemoved) { 1752 result.append(line->toBundleString()); 1753 } 1754 i++; 1755 if(useLinks) { 1756 line = line->next; 1757 } else { 1758 line = toSort[i]; 1759 } 1760 } 1761 1762 if(!moreToCome) { 1763 result.append(" }\n }\n }\n}\n"); 1764 } 1765 1766 return result; 1767} 1768 1769 1770int32_t 1771SortedLines::getSize() const { 1772 return repertoire.size(); 1773} 1774 1775void 1776SortedLines::reduceDifference(SortedLines& reference) { 1777 UErrorCode status = U_ZERO_ERROR; 1778 if(upperFirst) { 1779 swapCase(); 1780 } 1781 // both sorted lines structures need to have established links and strengths 1782 // We walk down both structures and note differences. These 1783 // differences will modify this by removng elements, setting resets 1784 // etc... 1785 // we will prefer insertions from tailoring to reference, then deletions 1786 // there are two tables that keep seen elements. 1787 Hashtable *seenThis = new Hashtable(); 1788 Hashtable *seenReference = new Hashtable(); 1789 1790 1791 UBool found = false; 1792 UBool finished = false; 1793 const int32_t lookForward = 20; 1794 int32_t tailoringMove = 0; 1795 //int32_t referenceSize = reference.getSize(); 1796 Line *refLine = reference.getFirst(); 1797 Line *refLatestEqual = refLine; 1798 refLine = refLine->next; 1799 Line *myLine = getFirst(); 1800 Line *myLatestEqual = myLine; 1801 myLatestEqual->isRemoved = true; 1802 myLine = myLine->next; 1803 while(myLine && refLine) { 1804 found = false; 1805 while(myLine && refLine && myLine->equals(*refLine)) { 1806 myLatestEqual = myLine; 1807 myLatestEqual->isRemoved = true; 1808 myLine = myLine->next; 1809 refLatestEqual = refLine; 1810 refLine = refLine->next; 1811 if(refLine == NULL && myLine == NULL) { 1812 finished = true; 1813 } 1814 } 1815 if(myLine) { 1816 myLine->cumulativeStrength = myLine->strength; 1817 } 1818 if(refLine) { 1819 refLine->cumulativeStrength = refLine->strength; 1820 } 1821 1822 // here is the difference 1823 while(!found && !finished) { 1824 tailoringMove = 0; 1825 if(myLine && refLine) { 1826 if(myLine->cumulativeStrength > refLine->cumulativeStrength) { 1827 // tailoring z <<< x, UCA z < y 1828 while(myLine->cumulativeStrength > refLine->cumulativeStrength) { 1829 myLine = myLine->next; 1830 if(myLine) { 1831 transferCumulativeStrength(myLine->previous, myLine); 1832 } else { 1833 break; 1834 } 1835 } 1836 } else if(myLine->cumulativeStrength < refLine->cumulativeStrength) { 1837 // tailoring z < x, UCA z <<< y 1838 while(myLine->cumulativeStrength < refLine->cumulativeStrength) { 1839 seenReference->put(UnicodeString(refLine->name, refLine->len), refLine, status); 1840 refLine = refLine->next; 1841 if(refLine) { 1842 transferCumulativeStrength(refLine->previous, refLine); 1843 } else { 1844 break; 1845 } 1846 } 1847 } 1848 // this is the interesting point. Now we search for character match 1849 while(myLine && refLine && (!myLine->equals(*refLine) || myLine->strength == UCOL_IDENTICAL) 1850 && tailoringMove < lookForward) { 1851 if(seenThis->get(UnicodeString(refLine->name, refLine->len))) { 1852 // we are not interested in stuff from the reference that is already accounted 1853 // for in the tailoring. 1854 refLine = refLine->next; 1855 if(refLine) { 1856 transferCumulativeStrength(refLine->previous, refLine); 1857 } 1858 } else { 1859 myLine = myLine->next; 1860 if(myLine) { 1861 transferCumulativeStrength(myLine->previous, myLine); 1862 if(!seenReference->get(UnicodeString(myLine->name, myLine->len))) { 1863 tailoringMove++; 1864 } 1865 } 1866 } 1867 } 1868 } 1869 if(refLine == NULL) { // ran out of reference 1870 // this is the tail of tailoring - the last insertion 1871 myLine = NULL; 1872 found = true; 1873 } else if(tailoringMove == lookForward || myLine == NULL) { // run over treshold or out of tailoring 1874 tailoringMove = 0; 1875 // we didn't find insertion after all 1876 // we will try substitution next 1877 // reset the tailoring pointer 1878 myLine = myLatestEqual->next; 1879 // move the reference 1880 refLine = refLine->next; 1881 if(refLine) { 1882 transferCumulativeStrength(refLine->previous, refLine); 1883 } 1884 } else { // we found an insertion 1885 tailoringMove = 0; 1886 if(myLine->strength != refLine->strength) { 1887 while(myLine && refLine && *myLine == *refLine 1888 && (myLine->strength != refLine->strength 1889 || myLine->strength == UCOL_IDENTICAL)) { 1890 myLine = myLine->next; 1891 refLine = refLine->next; 1892 } 1893 if(*myLine != *refLine) { 1894 continue; 1895 } 1896 } 1897 if(myLine && refLine && myLine->previous->strength < myLine->strength) { 1898 myLine = myLine->next; 1899 refLine = refLine->next; 1900 if(*myLine != *refLine) { 1901 continue; 1902 } 1903 } 1904 found = true; 1905 } 1906 if(found) { 1907 if(myLatestEqual->next != myLine || refLine == NULL) { 1908 Line *myStart = NULL; 1909 // this is a reset and a sequence 1910 // myLatestEqual points at the last point that was the same 1911 // This point will be a reset 1912 if(myLine && refLine) { // if there is anything more to do - it might be worth saving it 1913 myStart = myLatestEqual; 1914 while(myStart != myLine) { 1915 seenThis->put(UnicodeString(myStart->name, myStart->len), myStart, status); 1916 myStart = myStart->next; 1917 } 1918 } 1919 // Try to weed out stuff that is not affected, like: 1920 // Tailoring: 1921 // <<<S<<\u017F<\u0161<<<\u0160<t 1922 // UCA: 1923 // <<<S<<\u0161<<<\u0160<<\u017F<t 1924 // Result: 1925 // &S<<\u017F<\u0161<<<\u0160 1926 // we have a sequence that spans from myLatestEqual to myLine (that one could be NULL, 1927 // so we have to go down from myLatestEqual. 1928 // Basically, for every element, we want to see the strongest cumulative difference 1929 // from the reset point. If the cumulative difference is the same in both the reference and 1930 // tailoring, that element could be removed. 1931 calculateCumulativeStrengths(myLatestEqual, myLine); 1932 calculateCumulativeStrengths(refLatestEqual, refLine); 1933 myStart = myLatestEqual; 1934 int32_t removed = 0; 1935 int32_t traversed = 0; 1936 while(myStart && myStart != myLine) { 1937 Line *refStart = refLatestEqual; 1938 while(refStart && refStart != refLine) { 1939 if(*myStart == *refStart) { 1940 if(myStart->cumulativeStrength == refStart->cumulativeStrength) { 1941 myStart->isRemoved = true; 1942 removed++; 1943 } 1944 } 1945 refStart = refStart->next; 1946 } 1947 myStart = myStart->next; 1948 traversed++; 1949 } 1950 if(removed < traversed) { 1951 myLatestEqual->isReset = true; 1952 myLatestEqual->isRemoved = false; 1953 } 1954 1955 myLatestEqual = myLine; 1956 } 1957 } 1958 } 1959 } 1960 1961 if(upperFirst) { 1962 //swapCase(); 1963 } 1964 1965 delete seenThis; 1966 delete seenReference; 1967 1968} 1969 1970void 1971SortedLines::transferCumulativeStrength(Line *previous, Line *that) { 1972 if(that->strength > previous->cumulativeStrength) { 1973 that->cumulativeStrength = previous->cumulativeStrength; 1974 } else { 1975 that->cumulativeStrength = that->strength; 1976 } 1977} 1978 1979void 1980SortedLines::calculateCumulativeStrengths(Line *start, Line *end) { 1981 // start is a reset - end may be NULL 1982 start = start->next; 1983 UColAttributeValue cumulativeStrength = UCOL_OFF; 1984 while(start && start != end) { 1985 if(start->strength < cumulativeStrength) { 1986 cumulativeStrength = start->strength; 1987 } 1988 start->cumulativeStrength = cumulativeStrength; 1989 start = start->next; 1990 } 1991} 1992 1993 1994void 1995SortedLines::getRepertoire(UnicodeSet &fillIn) { 1996 fillIn.clear(); 1997 fillIn.addAll(repertoire); 1998} 1999 2000 2001void 2002SortedLines::removeDecompositionsFromRepertoire() { 2003 UnicodeSetIterator repertoireIter(repertoire); 2004 UErrorCode status = U_ZERO_ERROR; 2005 UChar string[256]; 2006 UChar composed[256]; 2007 int32_t len = 0, compLen = 0; 2008 UnicodeString compString; 2009 UnicodeSet toRemove; 2010 2011 while(repertoireIter.next()) { 2012 len = 0; 2013 if(repertoireIter.isString()) { // process a string 2014 len = repertoireIter.getString().length(); 2015 u_memcpy(string, repertoireIter.getString().getBuffer(), len); 2016 } else { // process code point 2017 UBool isError = false; 2018 U16_APPEND(string, len, 25, repertoireIter.getCodepoint(), isError); 2019 } 2020 string[len] = 0; // zero terminate, for our evil ways 2021 compLen = unorm_normalize(string, len, UNORM_NFC, 0, composed, 256, &status); 2022 if(compLen != len || u_strcmp(string, composed) != 0) { 2023 compString.setTo(composed, compLen); 2024 if(repertoire.contains(compString)) { 2025 toRemove.add(UnicodeString(string, len)); 2026 } 2027 } 2028 } 2029 debug->log("\nRemoving\n"); 2030 debug->log(toRemove.toPattern(compString, true), true); 2031 repertoire.removeAll(toRemove); 2032} 2033 2034 2035void 2036SortedLines::swapCase() 2037{ 2038 int32_t i = 0; 2039 for(i = 0; i < size; i++) { 2040 toSort[i]->swapCase(); 2041 } 2042} 2043 2044void 2045SortedLines::calculateSortKey(Line &line) 2046{ 2047 if(!sortkeys) { 2048 sortkeys = new uint8_t[size*1024]; 2049 memset(sortkeys, 0, size*1024); 2050 } 2051 line.sortKey = sortkeys+sortkeyOffset; 2052 sortkeyOffset += probe.getSortKey(line, sortkeys+sortkeyOffset, size*256-sortkeyOffset); 2053} 2054 2055 2056void 2057SortedLines::calculateSortKeys() 2058{ 2059 if(sortkeys) { 2060 delete[] sortkeys; 2061 } 2062 sortkeyOffset = 0; 2063 sortkeys = new uint8_t[size*256]; 2064 memset(sortkeys, 0, size*256); 2065 int32_t i = 0; 2066 for(i = 0; i < size; i++) { 2067 calculateSortKey(*toSort[i]); 2068 } 2069} 2070