1// © 2017 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3#include "sortedlines.h"
4
5static int codePointCmp(const void *a, const void *b) {
6  return u_strcmp((*(Line **)a)->name, (*(Line **)b)->name);
7}
8
9SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe,
10                         UPrinter *logger, UPrinter *debug) :
11toSort(NULL),
12toSortCapacity(0),
13lines(NULL),
14size(0),
15capacity(0),
16repertoire(set),
17excludeBounds(excludeBounds),
18probe(probe),
19first(NULL),
20last(NULL),
21logger(logger),
22debug(debug),
23contractionsTable(NULL),
24duplicators(NULL),
25maxExpansionPrefixSize(0),
26wordSort(false),
27frenchSecondary(false),
28upperFirst(false),
29sortkeys(NULL),
30sortkeyOffset(0)
31{
32  memset(UB, 0, sizeof(UB));
33  int32_t i = 0;
34  for(i = 0; i < UCOL_OFF; i++) {
35    UB[i] = &empty;
36  }
37  init();
38}
39
40SortedLines::~SortedLines()
41{
42  delete[] lines;
43  if(sortkeys) {
44    delete[] sortkeys;
45  }
46  if(toSort) {
47    delete[] toSort;
48  }
49  if(contractionsTable) {
50    delete contractionsTable;
51  }
52  if(duplicators) {
53    delete duplicators;
54  }
55}
56
57void
58SortedLines::getBounds(UErrorCode &status) {
59  // first sort through the set
60  debug->log(toString(), true);
61  int32_t i = 0, j = 0;
62  UColAttributeValue strength = UCOL_OFF;
63  for(i = 0; i < size; i++) {
64    if(toSort[i]->strengthFromEmpty < strength) {
65      if(i && strength < UCOL_OFF) {
66        //u_strcpy(UB[strength], toSort[i-1]->name);
67        j = 1;
68        while(excludeBounds.contains(UnicodeString(toSort[i-j]->name, toSort[i-j]->len))) {
69          j++;
70        }
71        UB[strength] = toSort[i-j];
72      }
73      strength = toSort[i]->strengthFromEmpty;
74      if(strength == UCOL_PRIMARY) {
75        probe.SE = toSort[i]->name[0];
76      }
77    }
78  }
79  //u_strcpy(UB[strength], toSort[size-1]->name);
80  // a different solution for bounds: go from end and see if the guys on the top
81  // cause duplication for things
82  UChar dupch[] = { 0x0020, 0x0030, 0x0042, 0x0051, 0x0062, 0x0071, 0x0391, 0x0396, 0x03b1, 0x03b6 };
83  j = 1;
84  Line dup;
85  Line bound;
86  int32_t dups = 0;
87  while(j < size) {
88    dups = 0;
89    for(i = 0; i < sizeof(dupch)/sizeof(dupch[0]); i++) {
90      dup.setTo(dupch[i]);
91      dup.append(dupch[i]);
92      bound.setTo(dupch[i]);
93      bound.append(toSort[size-j]->name, toSort[size-j]->len);
94      if(probe.getStrength(dup, bound) >= UCOL_IDENTICAL) {
95        dups++;
96      }
97    }
98    if(dups == 0) {
99      break;
100    } else {
101      if(!duplicators) {
102        duplicators = new Hashtable();
103      }
104      duplicators->put(UnicodeString(toSort[size-j]->name, toSort[size-j]->len), &toSort[size-j], status);
105      debug->log(toSort[size-j]->toString());
106      debug->log(" is not good enough to be an upper bound\n");
107      j++;
108    }
109  }
110  if(j == size) {
111    debug->log("Oi! I'm hallucinating. Will use the first upper bound");
112    delete duplicators;
113    duplicators = NULL;
114    j = 1;
115  }
116/*
117  j = 1;
118  while(excludeBounds.contains(UnicodeString(toSort[size-j]->name, toSort[size-j]->len))) {
119    j++;
120  }
121*/
122  UB[strength] = toSort[size-j];
123  for(i = 0; i < UCOL_OFF; i++) {
124    if(UB[i]) {
125      //debug->log(UB[i], true);
126      debug->log(UB[i]->toString(true), true);
127    }
128  }
129}
130
131// classifies repertoire according to the strength of their difference
132// from the empty string
133void
134SortedLines::classifyRepertoire() {
135  UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF;
136  int32_t lastChange = 0;
137  int32_t i = 0, j = 0;
138  while(i < size) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY)
139  {
140    toSort[i]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i]);
141    if(toSort[i]->strengthFromEmpty < strongestStrengthFromEmpty) {
142      strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
143      lastChange = i;
144    } else if (toSort[i]->strengthFromEmpty > strongestStrengthFromEmpty) {
145      // there is a problem in detection. Most probably a quaternary.
146      // why don't we try to interpolate
147      UColAttributeValue nextStrength = UCOL_OFF;
148      UColAttributeValue prevStrength = UCOL_OFF;
149      UColAttributeValue st = UCOL_OFF;
150
151      logger->log("Interpolating to get the distance from empty for Line ");
152      logger->log(toSort[i]->toString(true), true);
153
154      if(i) {
155        st = probe.getStrength(*toSort[i-1], *toSort[i]);
156        if(st == UCOL_OFF) {
157          logger->log("Cannot deduce distance from empty using previous element. Something is very wrong! Line:");
158          logger->log(toSort[i]->toString(true), true);
159        } else if(st == UCOL_IDENTICAL || st >= toSort[i-1]->strengthFromEmpty) {
160          prevStrength  = toSort[i-1]->strengthFromEmpty;
161        } else if(st < toSort[i-1]->strengthFromEmpty) {
162          prevStrength = st;
163        }
164        toSort[i]->strengthFromEmpty = prevStrength;
165      }
166      if(i < size-2) {
167        toSort[i+1]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i+1]);
168        st = probe.getStrength(*toSort[i+1], *toSort[i]);
169        if(st == UCOL_OFF) {
170          logger->log("Cannot deduce distance from empty using next element. Something is very wrong! Line:");
171          logger->log(toSort[i]->toString(true), true);
172        } else if(st == UCOL_IDENTICAL || st < toSort[i+1]->strengthFromEmpty) {
173          nextStrength = toSort[i+1]->strengthFromEmpty;
174        } else if(st >= toSort[i+1]->strengthFromEmpty) {
175          nextStrength = st;
176        }
177        if(i) {
178          if(prevStrength != nextStrength) {
179            logger->log("Inconsistent results from interpolation! Results will most likely be wrong\n");
180          }
181        }
182        toSort[i]->strengthFromEmpty = nextStrength;
183      }
184      /*
185      UColAttributeValue problemStrength = UCOL_PRIMARY;
186      for(j = lastChange; j < i ; j++) {
187        if(toSort[j]->strength > problemStrength) {
188          problemStrength = toSort[j]->strength;
189        }
190      }
191      for(j = lastChange; j < i ; j++) {
192        toSort[j]->strengthFromEmpty = problemStrength;
193      }
194      strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
195      lastChange = i;
196      debug->log("Problem detected in distances from empty. Most probably word sort is on\n");
197      */
198      wordSort = true;
199    }
200    i++;
201  }
202  debug->log("Distances from empty string\n");
203  debug->log(toStringFromEmpty(), true);
204}
205
206void
207SortedLines::analyse(UErrorCode &status) {
208  frenchSecondary = probe.isFrenchSecondary(status);
209  if(U_FAILURE(status)) {
210    logger->log("Test for French secondary failed. Bailing out!\n");
211    return;
212  }
213  logger->log("French secondary value is %i\n", frenchSecondary, frenchSecondary);
214  upperFirst = probe.isUpperFirst(status);
215  if(U_FAILURE(status)) {
216    logger->log("Test for upper first failed. Bailing out!\n");
217    return;
218  }
219  logger->log("upper first value is %i\n", upperFirst, upperFirst);
220  sort(true, true);
221  classifyRepertoire();
222  getBounds(status);
223  //sort(true, true);
224  addContractionsToRepertoire(status);
225  //sort(true, true);
226  debug->log("\n*** Order after detecting contractions\n\n");
227  calculateSortKeys();
228  debug->log(toPrettyString(false, true), true);
229  detectExpansions();
230}
231
232void SortedLines::init()
233{
234  size = repertoire.size();
235  capacity = 5*size;
236  lines = new Line[capacity];
237  init(repertoire, lines);
238}
239
240void SortedLines::init(UnicodeSet &rep, Line *lin)
241{
242
243  UnicodeSetIterator exemplarUSetIter(rep);
244  int32_t size = 0;
245
246  while(exemplarUSetIter.next()) {
247    Line *currLine = lin+size;
248    if(exemplarUSetIter.isString()) { // process a string
249      currLine->setTo(exemplarUSetIter.getString());
250    } else { // process code point
251      currLine->setTo(exemplarUSetIter.getCodepoint());
252    }
253    currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
254    //currLine->index = size;
255    size++;
256  }
257}
258
259void
260SortedLines::setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort) {
261  int32_t i = 0;
262  for(i = 0; i < sizeToSort; i++) {
263    sortingArray[i] = &elements[i];
264  }
265}
266
267int32_t
268SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) {
269  int32_t size = table->count();
270  int32_t hashIndex = -1;
271  const UHashElement *hashElement = NULL;
272  int32_t count = 0;
273  while((hashElement = table->nextElement(hashIndex)) != NULL) {
274    sortingArray[count++] = (Line *)hashElement->value.pointer;
275  }
276  return size;
277}
278
279void
280SortedLines::sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths, UBool link) {
281  int32_t i = 0;
282  int32_t equalStart = 0;
283  UColAttributeValue equalStrength = UCOL_OFF;
284
285  qsort(sortingArray, sizeToSort, sizeof(Line *), probe.comparer);
286
287  if(setStrengths) { // analyze strengths
288    for(i = 1; i < sizeToSort; i++) {
289      sortingArray[i]->strength = probe.getStrength(*sortingArray[i-1], *sortingArray[i]);
290    }
291    // for equal guys, do the code point ordering
292
293    i = 1;
294    while(i < sizeToSort)
295    {
296      if(sortingArray[i]->strength == UCOL_IDENTICAL) {
297        equalStart = i - 1;
298        equalStrength = sortingArray[equalStart]->strength;
299        sortingArray[equalStart]->strength = UCOL_IDENTICAL;
300        while(i < sizeToSort && sortingArray[i]->strength == UCOL_IDENTICAL) {
301          i++;
302        }
303        qsort(sortingArray+equalStart, i-equalStart, sizeof(Line *), codePointCmp);
304        sortingArray[equalStart]->strength = equalStrength;
305      } else {
306        i++;
307      }
308    }
309
310  }
311
312
313
314  if(link) { // do the linking
315    for(i = 0; i < sizeToSort - 1; i++) {
316      Line *curr = *(sortingArray+i);
317      curr->next = *(sortingArray+i+1);
318      (*(sortingArray+i+1))->previous = curr;
319    }
320  }
321}
322
323void
324SortedLines::sort(UBool setStrengths, UBool link) {
325  if(toSortCapacity < size || !toSort) {
326    if(toSort) {
327      delete[] toSort;
328    }
329    toSort = new Line*[size*2];
330    toSortCapacity = size*2;
331  }
332
333  setSortingArray(toSort, lines, size);
334  sort(toSort, size, setStrengths, link);
335
336  first = last = NULL;
337
338  if(link) { // do the linking
339    first = *toSort;
340    last = *(toSort+size-1);
341  }
342}
343
344void
345SortedLines::updateBounds(UnicodeSet &set) {
346  Line line;
347  UnicodeString s1;
348  UnicodeSetIterator it1(set);
349  while(it1.next()) {
350    if(!debug->isOn()) {
351      logger->log(".");
352    }
353    if(it1.isString()) { // process a string
354      s1.setTo(it1.getString());
355    } else { // process code point
356      s1.setTo(it1.getCodepoint());
357    }
358    //line.setTo(s1);
359    UColAttributeValue strength = probe.distanceFromEmptyString(s1);
360    if(probe.compare(UnicodeString(UB[strength]->name), s1) < 0) {
361      // TODO: leak here - fixit!
362      UB[strength] = new Line(s1);
363      //u_strcpy(UB[strength], s1.getTerminatedBuffer());
364    }
365  }
366
367
368
369}
370
371void SortedLines::addAll(Line* toAdd, int32_t toAddSize)
372{
373  if(size+toAddSize > capacity) {
374    int32_t doGrowingBreakpoint = 0;
375    // we need to do growing here
376  }
377  int32_t i = 0;
378
379  for(i = 0; i < toAddSize; i++) {
380    lines[size+i] = toAdd[i];
381  }
382  size += toAddSize;
383}
384
385void SortedLines::setDistancesFromEmpty(Line* array, int32_t arraySize)
386{
387  int32_t i = 0;
388  for(i = 0; i < arraySize; i++) {
389    array[i].strengthFromEmpty = probe.distanceFromEmptyString(array[i]);
390  }
391}
392
393
394// adds contractions in to repertoire
395int32_t SortedLines::addContractionsToRepertoire(UErrorCode &status)
396{
397  logger->log("\n*** Detecting contractions\n\n");
398  contractionsTable = new Hashtable();
399  int32_t noConts = 0;
400  int32_t allocateSize = 50*size;
401  // first check for simple contractions
402  Line* delta = new Line[allocateSize];
403  Line** deltaSorted = new Line*[allocateSize];
404  Line* lesserToAddTo = new Line[allocateSize];
405  Line* newDelta = new Line[allocateSize];
406  Line** newDeltaSorted = new Line*[allocateSize];
407  Line* deltaP = delta;
408  Line** deltaPP = deltaSorted;
409  Line* newDeltaP = newDelta;
410  int32_t deltaSize = 0, lesserToAddToSize = 0, newDeltaSize = 0;
411  logger->log("++ Contraction detection generation 0\n");
412  noConts = detectContractions(toSort, size, toSort, size,
413			       delta, deltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
414  setSortingArray(deltaSorted, delta, deltaSize);
415  sort(deltaSorted, deltaSize, true);
416
417  setDistancesFromEmpty(delta, deltaSize);
418  int32_t deltaPSize = deltaSize;
419  //updateBounds(delta);
420
421  int32_t generation = 0;
422  // if we found any, we have to try multiple contractions
423  // However, we want to prevent the contractions explosion
424  // if the number of simple contractions is greater than the
425  // starting size, chances are that we either have an algorithmic
426  // contraction (like iteration marks on w2k) or something
427  // is seriosly wrong.
428  if(deltaPSize < size/2) {
429    while (deltaPSize && generation < 1) {
430      generation++;
431      logger->log("\n++ Contraction detection generation %i\n", generation, generation);
432      // find more, but avoid testing the combinations we already have
433      noConts += detectContractions(toSort, size, deltaPP, deltaPSize,
434				    newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
435      noConts += detectContractions(deltaPP, deltaPSize, toSort, size,
436				    newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
437      calculateSortKeys();
438
439      addAll(deltaP, deltaPSize);
440      setSortingArray(toSort, lines, size);
441      sort(true, true);
442      setSortingArray(newDeltaSorted, newDeltaP, newDeltaSize);
443      sort(newDeltaSorted, newDeltaSize, true);
444
445      // if no new ones, bail
446      //if (newDeltaSize == 0) break;
447
448      deltaPSize = newDeltaSize;
449      newDeltaSize = 0;
450      if(deltaP == delta) {
451        deltaP = newDelta;
452        deltaPP = newDeltaSorted;
453        newDeltaP = delta;
454      } else {
455        deltaP = delta;
456        deltaPP = deltaSorted;
457        newDeltaP = newDelta;
458      }
459      setDistancesFromEmpty(deltaP, deltaPSize);
460    }
461  }
462  status = U_ZERO_ERROR;
463  // add stuff from the last batch
464  addAll(deltaP, deltaPSize);
465
466  // warning: we don't add the lesser ones in recursively, since they will
467  // infinitely loop
468  setDistancesFromEmpty(lesserToAddTo, lesserToAddToSize);
469  addAll(lesserToAddTo, lesserToAddToSize);
470  setSortingArray(toSort, lines, size);
471  sort(true, true);
472
473  delete[] deltaSorted;
474  delete[] delta;
475  delete[] lesserToAddTo;
476  delete[] newDeltaSorted;
477  delete[] newDelta;
478  return noConts;
479}
480
481
482int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize,
483                                        Line **secondRep, int32_t secondSize,
484                                        Line *toAddTo, int32_t &toAddToSize,
485                                        Line *lesserToAddTo, int32_t &lesserToAddToSize,
486                                        int32_t capacity, UErrorCode &status)
487{
488  int32_t noConts = 0;
489  int i = 0, j = 0, k = 0;
490  Line lower, upper, trial, toAdd, helper;
491  UChar32 firstStart, firstEnd, secondStart;
492  UChar NFCTrial[256];
493  int32_t NFCTrialLen = 0;
494  UBool thai;
495  i = -1;
496  while(i < firstSize-1 && U_SUCCESS(status)) {
497    i++;
498    if(!debug->isOn()) {
499      logger->log("\rTesting %05i/%05i. Found %05i conts.", i, firstSize, noConts);
500    }
501    U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart);
502    if(uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY)
503      {
504        continue;
505      }
506    lower = *firstRep[i];
507    for(j = 0; j < secondSize; j++) {
508      if(noConts == capacity) {
509        return noConts;
510      }
511      U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart);
512      if(firstStart == 0x41 && secondStart == 0x308) {
513      int32_t putBreakPointHere = 0;
514    }
515      if(uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY)
516	{
517          continue;
518        }
519      	if(duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) {
520          debug->log("Skipping duplicator ");
521          debug->log(secondRep[j]->toString(), true);
522		  continue;
523		}
524
525      if(firstRep[i]->name[0] == 0x61 && secondRep[j]->name[0] == 0x308) {
526	    int32_t putBreakpointhere = 0;
527      }
528      upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]);
529      //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]);
530      toAdd.setToConcat(firstRep[i], secondRep[j]);
531      U16_GET(firstRep[i]->name, 0, firstRep[i]->len-1, firstRep[i]->len, firstEnd);
532      if((thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION))) {
533	// this means that the lower is single reordering character
534	// if we do the lower test without taking this into account,
535	// we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to
536	// end of lower and in the middle of trial, so we will have
537	// lower = Thai + UB, trial Thai + UB + x, resolving to
538	// UB + Thai vs UB + Thai + x.
539	// for upper bound, we do the similar, so we have
540	// upper = Thai + UB + UB, trial = Thai + UB + x,
541	// resolving to UB + Thai + UB vs UB + Thai + x
542	if(secondRep[j]->firstCC) {
543	  UChar32 UBChar;
544	  U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar);
545	  if(secondRep[j]->firstCC > u_getCombiningClass(UBChar)) {
546	    continue;
547	  }
548	}
549	upper = *firstRep[i];
550	upper.append(*UB[UCOL_PRIMARY]);
551	//upper.append(*UB[secondRep[j]->strengthFromEmpty]);
552	upper.append(*UB[UCOL_PRIMARY]);
553	lower.append(*UB[UCOL_PRIMARY]);
554	trial = *firstRep[i];
555	trial.append(*UB[UCOL_PRIMARY]);
556	trial.append(*secondRep[j]);
557      } else if((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary)
558		|| (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) {
559	// Skip because normalization will reorder
560	// there will be a chance to check this again, since if we
561	// try a+b, we will also try b+a
562	    continue;
563      } else if(frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) {
564	    continue;
565      }else if(firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) {
566	    trial.setToConcat(secondRep[j], firstRep[i]);
567      } else {
568    	trial.setToConcat(firstRep[i], secondRep[j]);
569      }
570      // Now let's check the trial. The problem is that when you combine characters,
571      // you can end up with concatenation that is unknown for the examined API.
572      NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status);
573      if((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) {
574	   if(secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) {
575	     continue;
576	   }
577	 }
578      UChar32 c;
579      U16_GET(NFCTrial, 0, 0, NFCTrialLen, c);
580      helper.setTo(c);
581      if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
582	    continue;
583      }
584      if(NFCTrialLen > 1) {
585	    U16_GET(NFCTrial, 0, NFCTrialLen-1, NFCTrialLen, c);
586	    helper.setTo(c);
587        if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
588	      continue;
589	    }
590      }
591
592      if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial
593        // this might be ok, but I'm having doubts. Here is an additional check:
594        if(firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) {
595          // I'm basically saying that I'll add this kind of contraction for cases where I combine
596          // one letter with an accent OR when I'm combining more than one symbol with a letter.
597          noteContraction("L", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status);
598        }
599      }
600      else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper??
601        noteContraction("U", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
602      }
603#if 0
604      else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY)
605      {
606        Line expansionLine;
607        if(getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) &&
608        expansionLine.len && !(expansionLine == *secondRep[j])) {
609          noteContraction("D", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
610        }
611      }
612#endif
613      else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) {
614        noteContraction("D1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
615      }
616      else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY)
617      {
618        // I have added an additional check. The checks versus upper and lower bound should be sufficient
619        // when the right side is a combining mark. There might be a reordering of combining marks, but
620        // that should be already visible in their order.
621	// compare the sequence
622	// Y- <? Y <? Y+
623	// and
624	// XY- <? XY <? XY+
625	Line xym, xyp, xy;
626	UBool xymIsContraction = false, toAddIsContraction = false;
627    if(j) {
628	  if(((!secondRep[j-1]->firstCC || firstRep[i]->lastCC < secondRep[j-1]->firstCC) && !frenchSecondary)
629             ||((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j-1]->lastCC) && frenchSecondary)) {
630	    xym.setToConcat(firstRep[i], secondRep[j-1]);
631	    toAdd.strength = probe.getStrength(xym, toAdd);
632	    if(secondRep[j]->strength != toAdd.strength) {
633	      // there is possibility that either xym or xy are contractions
634	      // There are two situations:
635	      // xym > xy or xym <n xy and ym <k y but n != k
636	      // if they are reordered, we are going to see if each of them
637	      // is further reordered
638	      if(toAdd.strength == UCOL_OFF) {
639		// check whether toAdd shifted more down
640		k = j - 2;
641		while(k>=0 && secondRep[k]->strength > secondRep[j]->strength) {
642		  k--;
643		}
644		while(!toAddIsContraction && k>=0) {
645		  xyp.setToConcat(firstRep[i], secondRep[k]);
646		  if(contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) {
647		    k--;
648		    continue;
649		  }
650		  if(probe.compare(xyp, xym) >= 0) {
651		    // xyp looks like a contraction
652		    noteContraction("!1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
653		    toAddIsContraction = true;
654		  } else {
655		    break;
656		  }
657		}
658          // first let's see if xym has moved beyond
659          if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
660            k = j+1;
661            // ignore weaker strengths
662            while(k < secondSize && secondRep[k]->strength > secondRep[j]->strength) {
663              k++;
664            }
665            // check if we skipped the following guy
666            if(k < secondSize) {
667              xyp.setToConcat(firstRep[i], secondRep[k]);
668              if(probe.compare(xyp, xym) <= 0) {
669	            // xyp looks like a contraction
670	            noteContraction("!2", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
671	            xymIsContraction = true;
672              }
673            }
674          } else {
675            xymIsContraction = true;
676          }
677          // if they have reordered, but none has moved, then we add them both
678          // and hope for the best
679          if(!xymIsContraction && !toAddIsContraction) {
680              // it is possible that there is an NFC version version of one of the
681              // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might have
682              // have a false contraction.
683              trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status);
684              //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial);
685              if(trial == toAdd) {
686                noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
687                noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
688              } else {
689                noteContraction("!4", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
690              }
691            }
692	      } else { // only the strength has changed
693            // check whether the previous is contraction and if not, add the current
694            if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
695              noteContraction("!5", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
696            }
697	      }
698	    }
699	  }
700	}
701      }
702      if(thai) { // restore lower
703        lower = *firstRep[i];
704      }
705    }
706  }
707  return noConts;
708}
709
710void
711SortedLines::noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status)
712{
713  Line toAdd;
714  toAdd.setToConcat(left, right);
715  toAdd.left = left;
716  toAdd.right = right;
717  // if we're adding an accent to an existing contraction, we want to check
718#if 0
719  Line test, trial1, trial2;
720  if(right->strengthFromEmpty > UCOL_PRIMARY) {
721    if(left->right && left->right->previous && left->right->next) {
722      test.setToConcat(left->left, left->right->previous);
723      trial1.setToConcat(&test, right);
724
725      test.setToConcat(left->left, left->right->next);
726      trial2.setToConcat(&test, right);
727      if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
728        // this means that the contraction has been broken by the newly added accent
729        // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
730        debug->log("Con -");
731        debug->log(msg);
732        debug->log(toAdd.toString(false), true);
733        return;
734      }
735    } else {
736      if(right->previous && right->next) {
737        trial1.setToConcat(left, right->previous);
738        trial2.setToConcat(left, right->next);
739        if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
740          // this means that the contraction has been broken by the newly added accent
741          // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
742          debug->log("Con -");
743          debug->log(msg);
744          debug->log(toAdd.toString(false), true);
745          return;
746        }
747      }
748      if(left->previous && left->next) {
749        trial1.setToConcat(left->previous, right);
750        trial2.setToConcat(left->next, right);
751        if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
752          // this means that the contraction has been broken by the newly added accent
753          // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
754          debug->log("Con -");
755          debug->log(msg);
756          debug->log(toAdd.toString(false), true);
757          return;
758        }
759      }
760
761    }
762  }
763  if(right->right && right->right->strengthFromEmpty > UCOL_PRIMARY && right->left->previous && right->left->next) { // maybe we already had a contraction with an accent
764    test.setToConcat(right->left->previous, right->right);
765    trial1.setToConcat(left, &test);
766    test.setToConcat(right->left->next, right->right);
767    trial2.setToConcat(left, &test);
768    if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
769      // this means that the contraction has been broken by the newly added accent
770      // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
771      debug->log("Con -");
772      debug->log(msg);
773      debug->log(toAdd.toString(false), true);
774      return;
775    }
776  }
777#endif
778  if(contractionsTable->get(UnicodeString(toAdd.name, toAdd.len)) == NULL) {
779    if(probe.distanceFromEmptyString(toAdd) <= UCOL_TERTIARY) {
780      toAddTo[toAddToSize++] = toAdd;
781      contractionsTable->put(UnicodeString(toAdd.name, toAdd.len), &toAdd, status);
782      noConts++;
783      debug->log(msg);
784      debug->log(" Con + ");
785      debug->log(toAdd.toString(false), true);
786
787      if(!left->sortKey) {
788        calculateSortKey(*left);
789      }
790      debug->log(left->dumpSortkey());
791      debug->log(" + ");
792
793      if(!right->sortKey) {
794        calculateSortKey(*right);
795      }
796      debug->log(right->dumpSortkey());
797      debug->log(" = ");
798
799      calculateSortKey(toAdd);
800      debug->log(toAdd.dumpSortkey(), true);
801      if(noConts > size/2) {
802        status = U_BUFFER_OVERFLOW_ERROR;
803      }
804    }
805  }
806}
807
808
809UBool
810SortedLines::getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine)
811{
812  int expIndexSize = 0;
813  UColAttributeValue expStrength = UCOL_OFF;
814  int32_t comparisonResult = 0;
815  int32_t i = 0, k = 0, prevK = 0;
816  Line trial;
817  UBool sequenceCompleted = false;
818  int32_t expIndexes[256];
819  int32_t expIndexesSize = 0;
820
821  if(!sequenceCompleted) {
822    expIndexSize = 0;
823    expansionLine.clear();
824
825    // we will start from strength between the expansion
826    // and the target (toSort[i] and toSort[j]. First we
827    // will add as many primaries as possible. Then we will
828    // try to add secondary pieces and then tertiary.
829    // found an expansion - what is the expanding sequence?
830
831    expStrength = UCOL_PRIMARY;
832    while(!sequenceCompleted) {
833      k = 0;
834      prevK = 0;
835      while(k < size) {
836        if(expansionLine.len > 15) {
837          sequenceCompleted = true;
838          break;
839        }
840        while(k < size && toSort[k]->strength != UCOL_PRIMARY)
841        {
842          k++;
843        }
844        // nothing found
845        if(k == size) {
846          break;
847        }
848        // we need to skip over reordering things. If they were worthy, they would
849        // have been detected in the previous iteration.
850        //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
851          //k++;
852          //continue;
853        //}
854        trial = previous;
855        trial.append(expansionLine);
856        trial.append(*toSort[k]);
857        if(toSort[k]->name[0] == 0x0067) {
858          int32_t putBreakPointHere = 0;
859        }
860        comparisonResult = probe.compare(trial, expansion);
861        if(comparisonResult == 0) {
862          expansionLine = *toSort[k];
863          return true;
864        } else if (comparisonResult > 0) {
865          if(prevK) {
866            if(exp == *toSort[prevK]) {
867              expansionLine = exp;
868              return true;
869            }
870            i = prevK;
871            while(i < k-1) {
872              i++;
873              if(toSort[i]->strength > exp.strength) {
874                continue;
875              }
876              trial = previous;
877              trial.append(expansionLine);
878              trial.append(*toSort[i]);
879              if(probe.compare(trial, expansion) > 0) {
880                break;
881              }
882            }
883            // we got into situation where we have ch > ch+dot-below
884            // however, ch is a contraction and therefore we cannot use
885            // it properly. If we have hit on a contraction, we'll just try
886            // to continue. Probably need more logic here.
887            if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
888              expansionLine.append(*toSort[i-1]);
889              expIndexes[expIndexSize++] = i-1;
890              break;
891            } else {
892              int32_t putBreakPointHere = 0;
893            }
894          } else {
895            sequenceCompleted = true;
896            break;
897          }
898          //break;
899        }
900        prevK = k;
901        k++;
902      }
903      if(!prevK || k == size) {
904        break;
905      }
906    }
907  }
908  return expIndexSize > 0;
909}
910
911int32_t
912SortedLines::gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength)
913{
914  int32_t i = expansionIndex, k = resetIndex+1, n = 0, m = 0, start = 0;
915  UBool haveChanges = false;
916  Line trial, prefix, suffix;
917  // we will first try goosing up the reset index
918  //while(toSort[k]->strength >= strength)
919  for( ; toSort[k]->strength == strength; k++)
920  {
921    //if(toSort[k]->strength > strength) {
922      //continue;
923    //}
924    trial.setToConcat(toSort[k], &expLine);
925    if(probe.compare(trial, *toSort[i]) > 0) {
926      break;
927    }
928  }
929  resetIndex = k-1;
930
931  // goose up individual characters
932  prefix = *toSort[resetIndex];
933  for(n = 0; n < expIndexSize; n++) {
934    suffix.clear();
935    for(m = n+1; m < expIndexSize; m++) {
936      suffix.append(*toSort[expIndexes[m]]);
937    }
938    k = expIndexes[n]+1;
939    //while(toSort[k]->strength >= strength)
940    for( ; toSort[k]->strength == strength; k++)
941    {
942      //if(toSort[k]->strength > strength) {
943        //continue;
944      //}
945      trial.setToConcat(&prefix, toSort[k]);
946      trial.append(suffix);
947      if(probe.compare(trial, *toSort[i]) > 0) {
948        break;
949      }
950    }
951    if(k > expIndexes[n]+1) {
952      haveChanges = true;
953      expIndexes[n] = k-1;
954    }
955    prefix.append(*toSort[expIndexes[n]]);
956  }
957
958  // try inserting ignorables
959  UColAttributeValue lastStr = UCOL_OFF;
960  k = 0;
961  while(toSort[k]->strengthFromEmpty > strength) {
962    k++;
963  }
964  if(toSort[k]->strengthFromEmpty == strength) {
965    start = k;
966    prefix = *toSort[resetIndex];
967    n = 0;
968    while(n <= expIndexSize) {
969      suffix.clear();
970      for(m = n; m < expIndexSize; m++) {
971        suffix.append(*toSort[expIndexes[m]]);
972      }
973      k = start;
974      while(toSort[k]->strengthFromEmpty == strength) {
975        trial.setToConcat(&prefix, toSort[k]);
976        trial.append(suffix);
977        lastStr = probe.getStrength(trial, *toSort[i]);
978        if(lastStr == UCOL_OFF) { // shot over - we won't find anything here
979          break;
980        } else if(lastStr > strength) {
981          for(m = expIndexSize; m > n; m--) {
982            expIndexes[m] = expIndexes[m-1];
983          }
984          expIndexes[n] = k;
985          expIndexSize++;
986          haveChanges = true;
987          break;
988        }
989#if 0
990        if(probe.compare(trial, *toSort[i]) > 0) {
991          // if the first one skips, that means that
992          // this position doesn't work
993          if(k > start) {
994            // insert an ignorable on position n
995            for(m = expIndexSize; m > n; m--) {
996              expIndexes[m] = expIndexes[m-1];
997            }
998            expIndexes[n] = k-1;
999            expIndexSize++;
1000            haveChanges = true;
1001            if(n == expIndexSize-1) { // added to the end of the string
1002              UColAttributeValue str = probe.getStrength(trial, *toSort[i]);
1003              int32_t putBreakHere = 0;
1004            }
1005          }
1006          break;
1007        } else {
1008          lastStr = probe.getStrength(trial, *toSort[i]);
1009        }
1010#endif
1011        k++;
1012      }
1013      prefix.append(*toSort[expIndexes[n]]);
1014      n++;
1015    }
1016  }
1017
1018  if(haveChanges) {
1019    expLine.clear();
1020    for(m = 0; m < expIndexSize; m++) {
1021      expLine.append(*toSort[expIndexes[m]]);
1022    }
1023  }
1024  return resetIndex;
1025}
1026
1027int32_t
1028SortedLines::detectExpansions()
1029{
1030  logger->log("\n*** Detecting expansions\n\n");
1031  int32_t exCount = 0;
1032  int32_t i = 0, j = 0, k = 0, prevK = 0;
1033  Line *previous, trial, expansionLine;
1034  UBool foundExp = false, sequenceCompleted = false;
1035  UColAttributeValue strength = UCOL_OFF;
1036  UColAttributeValue maxStrength = UCOL_IDENTICAL;
1037  UColAttributeValue expStrength = UCOL_OFF;
1038  int32_t expIndexes[256];
1039  int32_t expIndexSize = 0;
1040  memset(expIndexes, 0, sizeof(expIndexes));
1041
1042  // for each element, we look back to find whether there is such a q for which
1043  // q <n x < qUBn. These are possible expansions. When going backwards we skip
1044  // over already detected expansions.
1045  i = 0;
1046  // it turns out that looking at accents as possible expansions is
1047  // quite a stupid thing to do, especially on non ICU platforms.
1048  // Previously this line skipped over identicals only, but
1049  // now we are going to skip all the way to non-ignorables.
1050  while(toSort[i]->strengthFromEmpty > UCOL_PRIMARY) {
1051    i++;
1052  }
1053  i++;
1054  for( ; i < size; i++) {
1055    if(toSort[i]->name[0]==0x0063 && toSort[i]->name[1] == 0x68) // && toSort[i]->name[1] == 0x308)0043 0043 0219
1056    {
1057      int32_t putBreakpointhere = 0;
1058    }
1059    foundExp = false;
1060    sequenceCompleted = false;
1061    strength = toSort[i]->strength;
1062    if(strength == UCOL_IDENTICAL && toSort[i-1]->isExpansion == true) {
1063      u_strcpy(toSort[i]->expansionString, toSort[i-1]->expansionString);
1064      toSort[i]->expLen = toSort[i-1]->expLen;
1065      toSort[i]->isExpansion = true;
1066      toSort[i]->expIndex = toSort[i-1]->expIndex;
1067      toSort[i]->expStrength = UCOL_IDENTICAL;
1068      //toSort[i]->expStrength = toSort[i-1]->expStrength;
1069      foundExp = true;
1070      sequenceCompleted = true;
1071    }
1072    //logger->log("%i %i\n", i, j);
1073    while(!foundExp && strength <= maxStrength) {
1074      j = i-1;
1075      while(j && (toSort[j]->isExpansion == true || toSort[j]->isRemoved == true)) {
1076        //if(toSort[j]->strength < strength) {
1077          //strength = toSort[j]->strength;
1078        //}
1079        j--;
1080      }
1081
1082      //while(j && toSort[j]->strength > strength)
1083      while(j && toSort[j]->strength > probe.getStrength(*toSort[j], *toSort[i]))
1084      {
1085        j--;
1086      }
1087      //if(toSort[j]->strength == strength) {
1088        previous = toSort[j];
1089        if(previous->strengthFromEmpty >= UCOL_IDENTICAL ||
1090          (previous->strengthFromEmpty == UCOL_SECONDARY
1091          && strength == UCOL_SECONDARY
1092          && previous->lastCC > UB[strength]->firstCC)) {
1093          break;
1094          //continue;
1095        }
1096        //trial.setToConcat(previous, UB[strength]);
1097        trial.setToConcat(previous, UB[probe.getStrength(*toSort[j], *toSort[i])]);
1098        if(probe.compare(trial, *toSort[i]) > 0) {
1099          foundExp = true;
1100        }
1101      //}
1102      if(strength == UCOL_QUATERNARY) {
1103        strength = UCOL_IDENTICAL;
1104      } else {
1105        strength = (UColAttributeValue)(strength + 1);
1106      }
1107    }
1108    // calculate the expanding sequence
1109    if(foundExp && !sequenceCompleted) {
1110      expIndexSize = 0;
1111      expansionLine.clear();
1112      exCount++;
1113      // we will start from strength between the expansion
1114      // and the target (toSort[i] and toSort[j]. First we
1115      // will add as many primaries as possible. Then we will
1116      // try to add secondary pieces and then tertiary.
1117      // found an expansion - what is the expanding sequence?
1118
1119      expStrength = UCOL_PRIMARY;
1120      while(!sequenceCompleted) {
1121        k = 0;
1122        prevK = 0;
1123        while(k < size) {
1124          if(expansionLine.len > 15) {
1125            sequenceCompleted = true;
1126            break;
1127          }
1128          while(k < size && toSort[k]->strength != UCOL_PRIMARY) {
1129            k++;
1130          }
1131          // nothing found
1132          if(k == size) {
1133            break;
1134          }
1135          // we need to skip over reordering things. If they were worthy, they would
1136          // have been detected in the previous iteration.
1137          //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
1138            //k++;
1139            //continue;
1140          //}
1141          trial = *previous;
1142          trial.append(expansionLine);
1143          trial.append(*toSort[k]);
1144          if(toSort[k]->name[0] == 0x0067) {
1145            int32_t putBreakPointHere = 0;
1146          }
1147          if(probe.compare(trial, *toSort[i]) > 0) {
1148            if(prevK) {
1149              // we got into situation where we have ch > ch+dot-below
1150              // however, ch is a contraction and therefore we cannot use
1151              // it properly. If we have hit on a contraction, we'll just try
1152              // to continue. Probably need more logic here.
1153              if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
1154                expansionLine.append(*toSort[prevK]);
1155                expIndexes[expIndexSize++] = prevK;
1156                break;
1157              } else {
1158                int32_t putBreakPointHere = 0;
1159              }
1160            } else {
1161              sequenceCompleted = true;
1162              break;
1163            }
1164            //break;
1165          }
1166          prevK = k;
1167          k++;
1168        }
1169        if(!prevK || k == size) {
1170          break;
1171        }
1172      }
1173      // after this we have primaries lined up.
1174      // we are going to goose up with secondaries and
1175      // tertiaries
1176      trial.setToConcat(toSort[j], &expansionLine);
1177      expStrength = probe.getStrength(trial, *toSort[i]);
1178      if(expStrength > UCOL_PRIMARY) {
1179        if(expStrength == UCOL_SECONDARY || expStrength == UCOL_OFF) {
1180          j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_SECONDARY);
1181          trial.setToConcat(toSort[j], &expansionLine);
1182          expStrength = probe.getStrength(trial, *toSort[i]);
1183          if(expStrength == UCOL_TERTIARY) {
1184            j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
1185          }
1186        } else if(expStrength == UCOL_TERTIARY) {
1187          j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
1188        }
1189      }
1190      trial.setToConcat(toSort[j], &expansionLine);
1191      expStrength = probe.getStrength(trial, *toSort[i]);
1192      if(expansionLine.len) {
1193        if(expansionLine.name[0] == 0x73 && expansionLine.name[1] == 0x7a) {
1194          int32_t putBreakpointhere = 0;
1195        }
1196        UBool isExpansionLineAContraction = (contractionsTable->get(UnicodeString(expansionLine.name, expansionLine.len)) != NULL);
1197        // we have an expansion line and an expansion. There could be some expansions where
1198        // the difference between expansion line and the end of expansion sequence is less or
1199        // equal than the expansion strength. These should probably be removed.
1200        int32_t diffLen = toSort[i]->len - expansionLine.len;
1201        if(diffLen > 0) {
1202          trial.setTo(UnicodeString(toSort[i]->name + diffLen, toSort[i]->len - diffLen));
1203        } else {
1204          trial = *toSort[i];
1205        }
1206        UColAttributeValue s1 = probe.getStrength(trial, expansionLine);
1207        if(s1 == UCOL_OFF) {
1208          s1 = probe.getStrength(expansionLine, trial);
1209        }
1210        if((!isExpansionLineAContraction && s1 >= expStrength) || (diffLen <= 0 && s1 == UCOL_IDENTICAL)) {
1211          contractionsTable->remove(UnicodeString(toSort[i]->name, toSort[i]->len));
1212          toSort[i]->isRemoved = true;
1213          if(toSort[i]->next && toSort[i]->previous) {
1214            toSort[i]->previous->next = toSort[i]->next;
1215          }
1216          if(toSort[i]->previous && toSort[i]->next) {
1217            toSort[i]->next->previous = toSort[i]->previous;
1218          }
1219	      debug->log("Exp -N: ");
1220	      debug->log(toSort[i]->toString(false));
1221	      debug->log(" / ");
1222	      debug->log(expansionLine.toString(false), true);
1223        }
1224        else
1225        {
1226          u_strncat(toSort[i]->expansionString, expansionLine.name, expansionLine.len);
1227          toSort[i]->isExpansion = true;
1228          toSort[i]->expStrength = expStrength;
1229          toSort[i]->expLen = expansionLine.len;
1230          toSort[i]->expansionString[toSort[i]->expLen] = 0;
1231          toSort[i]->expIndex = j;
1232        }
1233      }
1234    }
1235    if(toSort[i]->isExpansion == true) {
1236      if(debug->isOn()) {
1237        debug->log("Exp + : &");
1238        debug->log(toSort[j]->toString(false));
1239        debug->log(toSort[i]->strengthToString(toSort[i]->expStrength, true));
1240        debug->log(toSort[i]->toString(false));
1241        debug->log(" ");
1242        if(!toSort[j]->sortKey) {
1243          calculateSortKey(*toSort[j]);
1244        }
1245        debug->log(toSort[j]->dumpSortkey());
1246        debug->log(" ... ");
1247        if(!toSort[i]->sortKey) {
1248          calculateSortKey(*toSort[i]);
1249        }
1250        debug->log(toSort[i]->dumpSortkey());
1251        calculateSortKey(expansionLine);
1252        debug->log("/");
1253        debug->log(expansionLine.dumpSortkey(), true);
1254      }
1255
1256    }
1257  }
1258  // after detecting expansions, we want to position them.
1259  // it is better to position expansions after all have been detected,
1260  // since otherwise we will change the ordering.
1261  for(i = size-1; i >= 0; i--) {
1262    if(toSort[i]->isExpansion) {
1263      if(toSort[i]->name[0] == 0x2A3) {
1264        int32_t putBreakPointHere = 0;
1265      }
1266      if(i) {
1267        if(toSort[i]->previous) {
1268          toSort[i]->previous->next = toSort[i]->next;
1269        }
1270      }
1271      if(i < size-1) {
1272        if(toSort[i]->next) {
1273          toSort[i]->next->previous = toSort[i]->previous;
1274        }
1275      }
1276      j = toSort[i]->expIndex;
1277      toSort[i]->next = toSort[j]->next;
1278      toSort[i]->previous = toSort[j];
1279      toSort[j]->next = toSort[i];
1280      if(toSort[i]->next) {
1281        toSort[i]->next->previous = toSort[i];
1282      }
1283      toSort[i]->strength = toSort[i]->expStrength;
1284    }
1285  }
1286  return exCount;
1287}
1288
1289
1290Line *
1291SortedLines::getFirst() {
1292  current = first;
1293  return current;
1294}
1295
1296Line *
1297SortedLines::getLast() {
1298  current = last;
1299  return current;
1300}
1301
1302void
1303SortedLines::add(Line *line, UBool linkIn) {
1304  if(size++ == capacity) {
1305    // grow
1306  }
1307  lines[size] = *line;
1308  Line *toAdd = &lines[size];
1309  if(linkIn && first) {
1310    Line *current = first;
1311    while(current != NULL && probe.comparer(&current, &toAdd) < 0) {
1312      current = current->next;
1313    }
1314    if(current == NULL) {
1315      toAdd->previous = last;
1316      toAdd->next = NULL;
1317      if(last != NULL) {
1318        last->next = toAdd;
1319      }
1320      last = toAdd;
1321      if(first == NULL) {
1322        first = toAdd;
1323      }
1324    } else { // current != NULL
1325      toAdd->next = current;
1326      toAdd->previous = current->previous;
1327      if(current->previous) {
1328        current->previous->next = toAdd;
1329      } else {
1330        first = toAdd;
1331      }
1332      current->previous = toAdd;
1333    }
1334  }
1335}
1336
1337
1338Line *
1339SortedLines::getNext()
1340{
1341  if(current != NULL) {
1342    current=current->next;
1343  }
1344  return current;
1345}
1346
1347Line *
1348SortedLines::getPrevious()
1349{
1350  if(current != NULL) {
1351    current=current->previous;
1352  }
1353  return current;
1354}
1355
1356Line *
1357SortedLines::operator[](int32_t index)
1358{
1359  int32_t i = 0;
1360  Line *c = first;
1361  for(i = 0; i < index; i++) {
1362    if(c != NULL) {
1363      c = c->next;
1364    }
1365  }
1366  return c;
1367}
1368
1369UnicodeString
1370SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) {
1371  UnicodeString result;
1372  int32_t i = 0;
1373
1374  Line *line = NULL;
1375  Line *previous = sortedLines[0];
1376  if(printSortKeys && !sortkeys) {
1377    printSortKeys = false;
1378  }
1379  if(previous->isReset) {
1380    result.append(" & ");
1381    result.append(previous->name, previous->len);
1382    if(pretty) {
1383      result.append("        # ");
1384      result.append(previous->stringToName(previous->name, previous->len));
1385      result.append("\n");
1386    }
1387  } else if(!previous->isRemoved) {
1388    result.append(previous->toString(pretty));
1389    if(pretty) {
1390      result.append("\n");
1391    }
1392  }
1393  i = 1;
1394  while((i < linesSize && !useLinks) || (previous->next && useLinks)) {
1395    if(useLinks) {
1396      line = previous->next;
1397    } else {
1398      line = sortedLines[i];
1399    }
1400    if(line->isReset) {
1401      result.append(" &");
1402      result.append(line->name, line->len);
1403      if(pretty) {
1404        result.append("        # ");
1405        result.append(line->stringToName(line->name, line->len));
1406        result.append("\n");
1407      }
1408    } else if(!line->isRemoved) {
1409      if(i > 0) {
1410        result.append(line->strengthToString(line->strength, pretty));
1411      }
1412      result.append(line->toString(pretty));
1413      if(printSortKeys) {
1414        result.append(line->dumpSortkey());
1415      }
1416      if(pretty) {
1417        result.append("\n");
1418      }
1419    }
1420    previous = line;
1421    i++;
1422  }
1423  return result;
1424}
1425
1426SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) :
1427toSort(NULL),
1428toSortCapacity(0),
1429lines(NULL),
1430size(0),
1431capacity(0),
1432first(NULL),
1433last(NULL),
1434logger(logger),
1435debug(debug),
1436contractionsTable(NULL),
1437duplicators(NULL),
1438maxExpansionPrefixSize(0),
1439wordSort(false),
1440frenchSecondary(false),
1441upperFirst(false),
1442sortkeys(NULL),
1443sortkeyOffset(0)
1444{
1445  debug->log("*** loading a dump\n");
1446  memset(UB, 0, sizeof(UB));
1447  int32_t i = 0;
1448  for(i = 0; i < UCOL_OFF; i++) {
1449    UB[i] = &empty;
1450  }
1451
1452  int32_t newFrench, newUpperFirst;
1453  fscanf(file, "%i,%i,%i\n", &size, &newFrench, &newUpperFirst);
1454  debug->log("Read size %i, frenchSecondary %i and upperFirst %i\n", size, newFrench, newUpperFirst);
1455  frenchSecondary = (UBool)newFrench;
1456  upperFirst = (UBool)newUpperFirst;
1457  capacity = size;
1458  lines = new Line[capacity];
1459  i = 0;
1460
1461  char buff[256];
1462
1463  while(fgets(buff, 256, file)) {
1464    if(i % 20 == 0) {
1465      logger->log("\rLine: %04i", i, buff);
1466    }
1467    lines[i].initFromString(buff, 256, status);
1468    if(i) {
1469      lines[i].previous = &lines[i-1];
1470      lines[i-1].next = &lines[i];
1471    }
1472    i++;
1473  }
1474  size = i;
1475  toSort = new Line*[size];
1476  setSortingArray(toSort, lines, size);
1477  first = &lines[0];
1478  last = &lines[size-1];
1479}
1480
1481void
1482SortedLines::toFile(FILE *file, UBool useLinks, UErrorCode &status)
1483{
1484  fprintf(file, "%i,%i,%i\n", size, frenchSecondary, upperFirst);
1485  int32_t i = 1;
1486  Line *previous = toSort[0];
1487  Line *line = NULL;
1488  char buff[256];
1489  previous->write(buff, 256, status);
1490  fprintf(file, "%s\n", buff);
1491  fflush(file);
1492  while(previous->next) {
1493    if(useLinks) {
1494      line = previous->next;
1495    } else {
1496      line = toSort[i];
1497    }
1498    line->write(buff, 256, status);
1499    fprintf(file, "%s\n", buff);
1500    i++;
1501    previous = line;
1502  }
1503}
1504
1505
1506
1507UnicodeString
1508SortedLines::toStringFromEmpty() {
1509  UBool useLinks = false;
1510  UBool pretty = false;
1511  UnicodeString result;
1512  int32_t i = 0;
1513
1514  Line *line = NULL;
1515  Line *previous = toSort[0];
1516  if(previous->isReset) {
1517    result.append(" & ");
1518    if(pretty) {
1519      result.append("\n");
1520    }
1521    result.append(previous->name, previous->len);
1522  } else if(!previous->isRemoved) {
1523    result.append(previous->toString(pretty));
1524    if(pretty) {
1525      result.append("\n");
1526    }
1527  }
1528  i = 1;
1529  while(i < size || previous->next) {
1530    if(useLinks) {
1531      line = previous->next;
1532    } else {
1533      line = toSort[i];
1534    }
1535    if(line->isReset) {
1536      result.append(" &");
1537      result.append(line->name, line->len);
1538      if(pretty) {
1539        result.append("        # ");
1540        result.append(line->stringToName(line->name, line->len));
1541        result.append("\n");
1542      }
1543    } else if(!line->isRemoved) {
1544      if(i > 0) {
1545        result.append(line->strengthToString(line->strengthFromEmpty, pretty));
1546      }
1547      result.append(line->toString(pretty));
1548      if(pretty) {
1549        result.append("\n");
1550      }
1551    }
1552    previous = line;
1553    i++;
1554  }
1555  return result;
1556}
1557
1558UnicodeString
1559SortedLines::toString(UBool useLinks)
1560{
1561  return arrayToString(toSort, size, false, useLinks, false);
1562}
1563
1564
1565UnicodeString
1566SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys)
1567{
1568  return arrayToString(toSort, size, true, useLinks, printSortKeys);
1569}
1570
1571UnicodeString
1572SortedLines::toOutput(const char *format,
1573                       const char *locale, const char *platform, const char *reference,
1574                       UBool useLinks, UBool initialize, UBool moreToCome) {
1575  if(strcmp(format, "HTML") == 0) {
1576    return toHTML(locale, platform, reference, useLinks, initialize, moreToCome);
1577  } else if(strcmp(format, "XML") == 0) {
1578    return toXML(locale, platform, reference, useLinks, initialize, moreToCome);
1579  } else {
1580    return toBundle(locale, platform, reference, useLinks, initialize, moreToCome);
1581  }
1582}
1583
1584
1585UnicodeString
1586SortedLines::toHTML(const char *locale,
1587                       const char *platform, const char *reference,
1588                       UBool useLinks, UBool initialize, UBool moreToCome)
1589{
1590  UnicodeString result;
1591  int32_t i = 0;
1592  if(initialize) {
1593    result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
1594    result.append("# Collation data resource bundle generated for locale: ");
1595    result.append(locale);
1596    result.append("<br>\n# For platform ");
1597    result.append(platform);
1598    result.append(" reference platform ");
1599    result.append(reference);
1600    result.append("<br><br>\n\n\n");
1601
1602    result.append(locale);
1603    if(platform) {
1604      result.append("_");
1605      result.append(platform);
1606    }
1607    if(reference) {
1608      result.append("_vs_");
1609      result.append(reference);
1610    }
1611    result.append("&nbsp;{<br>\n");
1612
1613    result.append("&nbsp;&nbsp;collations&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;standard&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sequence&nbsp;{<br>\n");
1614  }
1615
1616  if(frenchSecondary) {
1617    result.append("[backwards 2]<br>\n");
1618  }
1619  if(upperFirst) {
1620    result.append("[casefirst upper]<br>\n");
1621  }
1622
1623  Line *line = toSort[0];
1624
1625  i = 0;
1626  while((i < size && !useLinks) || (line->next && useLinks)) {
1627    if(line->isReset || !line->isRemoved) {
1628      result.append(line->toHTMLString());
1629    }
1630    i++;
1631    if(useLinks) {
1632      line = line->next;
1633    } else {
1634      line = toSort[i];
1635    }
1636  }
1637  if(!moreToCome) {
1638    result.append("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;}<br>\n}<br>\n");
1639
1640    result.append("</html>\n");
1641  }
1642
1643  return result;
1644}
1645
1646UnicodeString
1647SortedLines::toXML(const char *locale,
1648                       const char *platform, const char *reference,
1649                       UBool useLinks, UBool initialize, UBool moreToCome)
1650{
1651  UnicodeString result;
1652  int32_t i = 0;
1653  if(initialize) {
1654    result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
1655    result.append("# Collation data resource bundle generated for locale: ");
1656    result.append(locale);
1657    result.append("<br>\n# For platform ");
1658    result.append(platform);
1659    result.append(" reference platform ");
1660    result.append(reference);
1661    result.append("<br><br>\n\n\n");
1662
1663    result.append(locale);
1664    if(platform) {
1665      result.append("_");
1666      result.append(platform);
1667    }
1668    if(reference) {
1669      result.append("_vs_");
1670      result.append(reference);
1671    }
1672    result.append("&nbsp;{<br>\n");
1673
1674    result.append("&nbsp;&nbsp;collations&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;standard&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sequence&nbsp;{<br>\n");
1675  }
1676
1677  if(frenchSecondary) {
1678    result.append("[backwards 2]<br>\n");
1679  }
1680  if(upperFirst) {
1681    result.append("[casefirst upper]<br>\n");
1682  }
1683
1684  Line *line = toSort[0];
1685
1686  i = 0;
1687  while((i < size && !useLinks) || (line->next && useLinks)) {
1688    if(line->isReset || !line->isRemoved) {
1689      result.append(line->toHTMLString());
1690    }
1691    i++;
1692    if(useLinks) {
1693      line = line->next;
1694    } else {
1695      line = toSort[i];
1696    }
1697  }
1698  if(!moreToCome) {
1699    result.append("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;}<br>\n}<br>\n");
1700
1701    result.append("</html>\n");
1702  }
1703
1704  return result;
1705}
1706
1707UnicodeString
1708SortedLines::toBundle(const char *locale,
1709                       const char *platform, const char *reference,
1710                       UBool useLinks, UBool initialize, UBool moreToCome)
1711{
1712  UnicodeString result;
1713  int32_t i = 0;
1714
1715  if(initialize) {
1716    result.append("// Collation data resource bundle generated for locale: ");
1717    result.append(locale);
1718    result.append("\n// For platform ");
1719    result.append(platform);
1720    result.append(" reference platform ");
1721    result.append(reference);
1722    result.append("\n\n\n");
1723
1724    result.append(locale);
1725    /*
1726    if(platform) {
1727      result.append("_");
1728      result.append(platform);
1729    }
1730    if(reference) {
1731      result.append("_vs_");
1732      result.append(reference);
1733    }
1734    */
1735    result.append(" {\n");
1736
1737    result.append("  collations {\n   standard {\n    Sequence {\n");
1738  }
1739
1740  if(frenchSecondary) {
1741    result.append("[backwards 2]\n");
1742  }
1743  if(upperFirst) {
1744    result.append("[casefirst upper]\n");
1745  }
1746
1747  Line *line = toSort[0];
1748
1749  i = 0;
1750  while((i < size && !useLinks) || (line->next && useLinks)) {
1751    if(line->isReset || !line->isRemoved) {
1752      result.append(line->toBundleString());
1753    }
1754    i++;
1755    if(useLinks) {
1756      line = line->next;
1757    } else {
1758      line = toSort[i];
1759    }
1760  }
1761
1762  if(!moreToCome) {
1763    result.append("    }\n   }\n  }\n}\n");
1764  }
1765
1766  return result;
1767}
1768
1769
1770int32_t
1771SortedLines::getSize() const {
1772  return repertoire.size();
1773}
1774
1775void
1776SortedLines::reduceDifference(SortedLines& reference) {
1777  UErrorCode status = U_ZERO_ERROR;
1778  if(upperFirst) {
1779    swapCase();
1780  }
1781  // both sorted lines structures need to have established links and strengths
1782  // We walk down both structures and note differences. These
1783  // differences will modify this by removng elements, setting resets
1784  // etc...
1785  // we will prefer insertions from tailoring to reference, then deletions
1786  // there are two tables that keep seen elements.
1787  Hashtable *seenThis = new Hashtable();
1788  Hashtable *seenReference = new Hashtable();
1789
1790
1791  UBool found = false;
1792  UBool finished = false;
1793  const int32_t lookForward = 20;
1794  int32_t tailoringMove = 0;
1795  //int32_t referenceSize = reference.getSize();
1796  Line *refLine = reference.getFirst();
1797  Line *refLatestEqual = refLine;
1798  refLine = refLine->next;
1799  Line *myLine = getFirst();
1800  Line *myLatestEqual = myLine;
1801  myLatestEqual->isRemoved = true;
1802  myLine = myLine->next;
1803  while(myLine && refLine) {
1804    found = false;
1805    while(myLine && refLine && myLine->equals(*refLine)) {
1806      myLatestEqual = myLine;
1807      myLatestEqual->isRemoved = true;
1808      myLine = myLine->next;
1809      refLatestEqual = refLine;
1810      refLine = refLine->next;
1811      if(refLine == NULL && myLine == NULL) {
1812        finished = true;
1813      }
1814    }
1815    if(myLine) {
1816      myLine->cumulativeStrength = myLine->strength;
1817    }
1818    if(refLine) {
1819      refLine->cumulativeStrength = refLine->strength;
1820    }
1821
1822    // here is the difference
1823    while(!found && !finished) {
1824      tailoringMove = 0;
1825      if(myLine && refLine) {
1826        if(myLine->cumulativeStrength > refLine->cumulativeStrength) {
1827          // tailoring z <<< x, UCA z < y
1828          while(myLine->cumulativeStrength > refLine->cumulativeStrength) {
1829            myLine = myLine->next;
1830            if(myLine) {
1831              transferCumulativeStrength(myLine->previous, myLine);
1832            } else {
1833              break;
1834            }
1835          }
1836        } else if(myLine->cumulativeStrength < refLine->cumulativeStrength) {
1837          // tailoring z < x, UCA z <<< y
1838          while(myLine->cumulativeStrength < refLine->cumulativeStrength) {
1839            seenReference->put(UnicodeString(refLine->name, refLine->len), refLine, status);
1840            refLine = refLine->next;
1841            if(refLine) {
1842              transferCumulativeStrength(refLine->previous, refLine);
1843            } else {
1844              break;
1845            }
1846          }
1847        }
1848        // this is the interesting point. Now we search for character match
1849        while(myLine && refLine && (!myLine->equals(*refLine) || myLine->strength == UCOL_IDENTICAL)
1850          && tailoringMove < lookForward) {
1851          if(seenThis->get(UnicodeString(refLine->name, refLine->len))) {
1852            // we are not interested in stuff from the reference that is already accounted
1853            // for in the tailoring.
1854            refLine = refLine->next;
1855            if(refLine) {
1856              transferCumulativeStrength(refLine->previous, refLine);
1857            }
1858          } else {
1859            myLine = myLine->next;
1860            if(myLine) {
1861              transferCumulativeStrength(myLine->previous, myLine);
1862              if(!seenReference->get(UnicodeString(myLine->name, myLine->len))) {
1863                tailoringMove++;
1864              }
1865            }
1866          }
1867        }
1868      }
1869      if(refLine == NULL) { // ran out of reference
1870        // this is the tail of tailoring - the last insertion
1871        myLine  = NULL;
1872        found = true;
1873      } else if(tailoringMove == lookForward || myLine == NULL) { // run over treshold or out of tailoring
1874        tailoringMove = 0;
1875        // we didn't find insertion after all
1876        // we will try substitution next
1877        // reset the tailoring pointer
1878        myLine = myLatestEqual->next;
1879        // move the reference
1880        refLine = refLine->next;
1881        if(refLine) {
1882          transferCumulativeStrength(refLine->previous, refLine);
1883        }
1884      } else { // we found an insertion
1885        tailoringMove = 0;
1886        if(myLine->strength != refLine->strength) {
1887          while(myLine && refLine && *myLine == *refLine
1888            && (myLine->strength != refLine->strength
1889            || myLine->strength == UCOL_IDENTICAL)) {
1890            myLine = myLine->next;
1891            refLine = refLine->next;
1892          }
1893          if(*myLine != *refLine) {
1894            continue;
1895          }
1896        }
1897        if(myLine && refLine && myLine->previous->strength < myLine->strength) {
1898          myLine = myLine->next;
1899          refLine = refLine->next;
1900          if(*myLine != *refLine) {
1901            continue;
1902          }
1903        }
1904        found = true;
1905      }
1906      if(found) {
1907        if(myLatestEqual->next != myLine || refLine == NULL) {
1908          Line *myStart = NULL;
1909          // this is a reset and a sequence
1910          // myLatestEqual points at the last point that was the same
1911          // This point will be a reset
1912          if(myLine && refLine) { // if there is anything more to do - it might be worth saving it
1913            myStart = myLatestEqual;
1914            while(myStart != myLine) {
1915              seenThis->put(UnicodeString(myStart->name, myStart->len), myStart, status);
1916              myStart = myStart->next;
1917            }
1918          }
1919          // Try to weed out stuff that is not affected, like:
1920          // Tailoring:
1921          // <<<S<<\u017F<\u0161<<<\u0160<t
1922          // UCA:
1923          // <<<S<<\u0161<<<\u0160<<\u017F<t
1924          // Result:
1925          // &S<<\u017F<\u0161<<<\u0160
1926          // we have a sequence that spans from myLatestEqual to myLine (that one could be NULL,
1927          // so we have to go down from myLatestEqual.
1928          // Basically, for every element, we want to see the strongest cumulative difference
1929          // from the reset point. If the cumulative difference is the same in both the reference and
1930          // tailoring, that element could be removed.
1931          calculateCumulativeStrengths(myLatestEqual, myLine);
1932          calculateCumulativeStrengths(refLatestEqual, refLine);
1933          myStart = myLatestEqual;
1934          int32_t removed = 0;
1935          int32_t traversed = 0;
1936          while(myStart && myStart != myLine) {
1937            Line *refStart = refLatestEqual;
1938            while(refStart && refStart != refLine) {
1939              if(*myStart == *refStart) {
1940                if(myStart->cumulativeStrength == refStart->cumulativeStrength) {
1941                  myStart->isRemoved = true;
1942                  removed++;
1943                }
1944              }
1945              refStart = refStart->next;
1946            }
1947            myStart = myStart->next;
1948            traversed++;
1949          }
1950          if(removed < traversed) {
1951            myLatestEqual->isReset = true;
1952            myLatestEqual->isRemoved = false;
1953          }
1954
1955          myLatestEqual = myLine;
1956        }
1957      }
1958    }
1959  }
1960
1961  if(upperFirst) {
1962    //swapCase();
1963  }
1964
1965  delete seenThis;
1966  delete seenReference;
1967
1968}
1969
1970void
1971SortedLines::transferCumulativeStrength(Line *previous, Line *that) {
1972  if(that->strength > previous->cumulativeStrength) {
1973    that->cumulativeStrength = previous->cumulativeStrength;
1974  } else {
1975    that->cumulativeStrength = that->strength;
1976  }
1977}
1978
1979void
1980SortedLines::calculateCumulativeStrengths(Line *start, Line *end) {
1981  // start is a reset - end may be NULL
1982  start = start->next;
1983  UColAttributeValue cumulativeStrength = UCOL_OFF;
1984  while(start && start != end) {
1985    if(start->strength < cumulativeStrength) {
1986      cumulativeStrength = start->strength;
1987    }
1988    start->cumulativeStrength = cumulativeStrength;
1989    start = start->next;
1990  }
1991}
1992
1993
1994void
1995SortedLines::getRepertoire(UnicodeSet &fillIn) {
1996  fillIn.clear();
1997  fillIn.addAll(repertoire);
1998}
1999
2000
2001void
2002SortedLines::removeDecompositionsFromRepertoire() {
2003  UnicodeSetIterator repertoireIter(repertoire);
2004  UErrorCode status = U_ZERO_ERROR;
2005  UChar string[256];
2006  UChar composed[256];
2007  int32_t len = 0, compLen = 0;
2008  UnicodeString compString;
2009  UnicodeSet toRemove;
2010
2011  while(repertoireIter.next()) {
2012    len = 0;
2013    if(repertoireIter.isString()) { // process a string
2014      len = repertoireIter.getString().length();
2015      u_memcpy(string, repertoireIter.getString().getBuffer(), len);
2016    } else { // process code point
2017      UBool isError = false;
2018      U16_APPEND(string, len, 25, repertoireIter.getCodepoint(), isError);
2019    }
2020    string[len] = 0; // zero terminate, for our evil ways
2021    compLen = unorm_normalize(string, len, UNORM_NFC, 0, composed, 256, &status);
2022    if(compLen != len || u_strcmp(string, composed) != 0) {
2023      compString.setTo(composed, compLen);
2024      if(repertoire.contains(compString)) {
2025        toRemove.add(UnicodeString(string, len));
2026      }
2027    }
2028  }
2029  debug->log("\nRemoving\n");
2030  debug->log(toRemove.toPattern(compString, true), true);
2031  repertoire.removeAll(toRemove);
2032}
2033
2034
2035void
2036SortedLines::swapCase()
2037{
2038  int32_t i = 0;
2039  for(i = 0; i < size; i++) {
2040    toSort[i]->swapCase();
2041  }
2042}
2043
2044void
2045SortedLines::calculateSortKey(Line &line)
2046{
2047  if(!sortkeys) {
2048    sortkeys = new uint8_t[size*1024];
2049    memset(sortkeys, 0, size*1024);
2050  }
2051  line.sortKey = sortkeys+sortkeyOffset;
2052  sortkeyOffset += probe.getSortKey(line, sortkeys+sortkeyOffset, size*256-sortkeyOffset);
2053}
2054
2055
2056void
2057SortedLines::calculateSortKeys()
2058{
2059  if(sortkeys) {
2060    delete[] sortkeys;
2061  }
2062  sortkeyOffset = 0;
2063  sortkeys = new uint8_t[size*256];
2064  memset(sortkeys, 0, size*256);
2065  int32_t i = 0;
2066  for(i = 0; i < size; i++) {
2067    calculateSortKey(*toSort[i]);
2068  }
2069}
2070