1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 1996-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationcompare.cpp
9*
10* created on: 2012feb14 with new and old collation code
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/ucol.h"
19#include "cmemory.h"
20#include "collation.h"
21#include "collationcompare.h"
22#include "collationiterator.h"
23#include "collationsettings.h"
24#include "uassert.h"
25
26U_NAMESPACE_BEGIN
27
28UCollationResult
29CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
30                                        const CollationSettings &settings,
31                                        UErrorCode &errorCode) {
32    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
33
34    int32_t options = settings.options;
35    uint32_t variableTop;
36    if((options & CollationSettings::ALTERNATE_MASK) == 0) {
37        variableTop = 0;
38    } else {
39        // +1 so that we can use "<" and primary ignorables test out early.
40        variableTop = settings.variableTop + 1;
41    }
42    UBool anyVariable = false;
43
44    // Fetch CEs, compare primaries, store secondary & tertiary weights.
45    for(;;) {
46        // We fetch CEs until we get a non-ignorable primary or reach the end.
47        uint32_t leftPrimary;
48        do {
49            int64_t ce = left.nextCE(errorCode);
50            leftPrimary = (uint32_t)(ce >> 32);
51            if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
52                // Variable CE, shift it to quaternary level.
53                // Ignore all following primary ignorables, and shift further variable CEs.
54                anyVariable = true;
55                do {
56                    // Store only the primary of the variable CE.
57                    left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
58                    for(;;) {
59                        ce = left.nextCE(errorCode);
60                        leftPrimary = (uint32_t)(ce >> 32);
61                        if(leftPrimary == 0) {
62                            left.setCurrentCE(0);
63                        } else {
64                            break;
65                        }
66                    }
67                } while(leftPrimary < variableTop &&
68                        leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
69            }
70        } while(leftPrimary == 0);
71
72        uint32_t rightPrimary;
73        do {
74            int64_t ce = right.nextCE(errorCode);
75            rightPrimary = (uint32_t)(ce >> 32);
76            if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
77                // Variable CE, shift it to quaternary level.
78                // Ignore all following primary ignorables, and shift further variable CEs.
79                anyVariable = true;
80                do {
81                    // Store only the primary of the variable CE.
82                    right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
83                    for(;;) {
84                        ce = right.nextCE(errorCode);
85                        rightPrimary = (uint32_t)(ce >> 32);
86                        if(rightPrimary == 0) {
87                            right.setCurrentCE(0);
88                        } else {
89                            break;
90                        }
91                    }
92                } while(rightPrimary < variableTop &&
93                        rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
94            }
95        } while(rightPrimary == 0);
96
97        if(leftPrimary != rightPrimary) {
98            // Return the primary difference, with script reordering.
99            if(settings.hasReordering()) {
100                leftPrimary = settings.reorder(leftPrimary);
101                rightPrimary = settings.reorder(rightPrimary);
102            }
103            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104        }
105        if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
106    }
107    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
108
109    // Compare the buffered secondary & tertiary weights.
110    // We might skip the secondary level but continue with the case level
111    // which is turned on separately.
112    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113        if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114            int32_t leftIndex = 0;
115            int32_t rightIndex = 0;
116            for(;;) {
117                uint32_t leftSecondary;
118                do {
119                    leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
120                } while(leftSecondary == 0);
121
122                uint32_t rightSecondary;
123                do {
124                    rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125                } while(rightSecondary == 0);
126
127                if(leftSecondary != rightSecondary) {
128                    return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129                }
130                if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
131            }
132        } else {
133            // The backwards secondary level compares secondary weights backwards
134            // within segments separated by the merge separator (U+FFFE, weight 02).
135            int32_t leftStart = 0;
136            int32_t rightStart = 0;
137            for(;;) {
138                // Find the merge separator or the NO_CE terminator.
139                uint32_t p;
140                int32_t leftLimit = leftStart;
141                while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
142                            Collation::MERGE_SEPARATOR_PRIMARY ||
143                        p == 0) {
144                    ++leftLimit;
145                }
146                int32_t rightLimit = rightStart;
147                while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
148                            Collation::MERGE_SEPARATOR_PRIMARY ||
149                        p == 0) {
150                    ++rightLimit;
151                }
152
153                // Compare the segments.
154                int32_t leftIndex = leftLimit;
155                int32_t rightIndex = rightLimit;
156                for(;;) {
157                    int32_t leftSecondary = 0;
158                    while(leftSecondary == 0 && leftIndex > leftStart) {
159                        leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
160                    }
161
162                    int32_t rightSecondary = 0;
163                    while(rightSecondary == 0 && rightIndex > rightStart) {
164                        rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
165                    }
166
167                    if(leftSecondary != rightSecondary) {
168                        return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
169                    }
170                    if(leftSecondary == 0) { break; }
171                }
172
173                // Did we reach the end of either string?
174                // Both strings have the same number of merge separators,
175                // or else there would have been a primary-level difference.
176                U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
177                if(p == Collation::NO_CE_PRIMARY) { break; }
178                // Skip both merge separators and continue.
179                leftStart = leftLimit + 1;
180                rightStart = rightLimit + 1;
181            }
182        }
183    }
184
185    if((options & CollationSettings::CASE_LEVEL) != 0) {
186        int32_t strength = CollationSettings::getStrength(options);
187        int32_t leftIndex = 0;
188        int32_t rightIndex = 0;
189        for(;;) {
190            uint32_t leftCase, leftLower32, rightCase;
191            if(strength == UCOL_PRIMARY) {
192                // Primary+caseLevel: Ignore case level weights of primary ignorables.
193                // Otherwise we would get a-umlaut > a
194                // which is not desirable for accent-insensitive sorting.
195                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
196                // with only primary weights.
197                int64_t ce;
198                do {
199                    ce = left.getCE(leftIndex++);
200                    leftCase = (uint32_t)ce;
201                } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
202                leftLower32 = leftCase;
203                leftCase &= 0xc000;
204
205                do {
206                    ce = right.getCE(rightIndex++);
207                    rightCase = (uint32_t)ce;
208                } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
209                rightCase &= 0xc000;
210            } else {
211                // Secondary+caseLevel: By analogy with the above,
212                // ignore case level weights of secondary ignorables.
213                //
214                // Note: A tertiary CE has uppercase case bits (0.0.ut)
215                // to keep tertiary+caseFirst well-formed.
216                //
217                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
218                // Otherwise a tertiary CE's uppercase would be no greater than
219                // a primary/secondary CE's uppercase.
220                // (See UCA well-formedness condition 2.)
221                // We could construct a special case weight higher than uppercase,
222                // but it's simpler to always ignore case weights of secondary ignorables,
223                // turning 0.0.ut into 0.0.0.t.
224                // (See LDML Collation, Case Parameters.)
225                do {
226                    leftCase = (uint32_t)left.getCE(leftIndex++);
227                } while(leftCase <= 0xffff);
228                leftLower32 = leftCase;
229                leftCase &= 0xc000;
230
231                do {
232                    rightCase = (uint32_t)right.getCE(rightIndex++);
233                } while(rightCase <= 0xffff);
234                rightCase &= 0xc000;
235            }
236
237            // No need to handle NO_CE and MERGE_SEPARATOR specially:
238            // There is one case weight for each previous-level weight,
239            // so level length differences were handled there.
240            if(leftCase != rightCase) {
241                if((options & CollationSettings::UPPER_FIRST) == 0) {
242                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
243                } else {
244                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
245                }
246            }
247            if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
248        }
249    }
250    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
251
252    uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
253
254    int32_t leftIndex = 0;
255    int32_t rightIndex = 0;
256    uint32_t anyQuaternaries = 0;
257    for(;;) {
258        uint32_t leftLower32, leftTertiary;
259        do {
260            leftLower32 = (uint32_t)left.getCE(leftIndex++);
261            anyQuaternaries |= leftLower32;
262            U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
263                     (leftLower32 & 0xc0c0) == 0);
264            leftTertiary = leftLower32 & tertiaryMask;
265        } while(leftTertiary == 0);
266
267        uint32_t rightLower32, rightTertiary;
268        do {
269            rightLower32 = (uint32_t)right.getCE(rightIndex++);
270            anyQuaternaries |= rightLower32;
271            U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
272                     (rightLower32 & 0xc0c0) == 0);
273            rightTertiary = rightLower32 & tertiaryMask;
274        } while(rightTertiary == 0);
275
276        if(leftTertiary != rightTertiary) {
277            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
278                // Pass through NO_CE and keep real tertiary weights larger than that.
279                // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
280                // to keep tertiary CEs well-formed.
281                // Their case+tertiary weights must be greater than those of
282                // primary and secondary CEs.
283                if(leftTertiary > Collation::NO_CE_WEIGHT16) {
284                    if(leftLower32 > 0xffff) {
285                        leftTertiary ^= 0xc000;
286                    } else {
287                        leftTertiary += 0x4000;
288                    }
289                }
290                if(rightTertiary > Collation::NO_CE_WEIGHT16) {
291                    if(rightLower32 > 0xffff) {
292                        rightTertiary ^= 0xc000;
293                    } else {
294                        rightTertiary += 0x4000;
295                    }
296                }
297            }
298            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
299        }
300        if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
301    }
302    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
303
304    if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
305        // If there are no "variable" CEs and no non-zero quaternary weights,
306        // then there are no quaternary differences.
307        return UCOL_EQUAL;
308    }
309
310    leftIndex = 0;
311    rightIndex = 0;
312    for(;;) {
313        uint32_t leftQuaternary;
314        do {
315            int64_t ce = left.getCE(leftIndex++);
316            leftQuaternary = (uint32_t)ce & 0xffff;
317            if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
318                // Variable primary or completely ignorable or NO_CE.
319                leftQuaternary = (uint32_t)(ce >> 32);
320            } else {
321                // Regular CE, not tertiary ignorable.
322                // Preserve the quaternary weight in bits 7..6.
323                leftQuaternary |= 0xffffff3f;
324            }
325        } while(leftQuaternary == 0);
326
327        uint32_t rightQuaternary;
328        do {
329            int64_t ce = right.getCE(rightIndex++);
330            rightQuaternary = (uint32_t)ce & 0xffff;
331            if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
332                // Variable primary or completely ignorable or NO_CE.
333                rightQuaternary = (uint32_t)(ce >> 32);
334            } else {
335                // Regular CE, not tertiary ignorable.
336                // Preserve the quaternary weight in bits 7..6.
337                rightQuaternary |= 0xffffff3f;
338            }
339        } while(rightQuaternary == 0);
340
341        if(leftQuaternary != rightQuaternary) {
342            // Return the difference, with script reordering.
343            if(settings.hasReordering()) {
344                leftQuaternary = settings.reorder(leftQuaternary);
345                rightQuaternary = settings.reorder(rightQuaternary);
346            }
347            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
348        }
349        if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
350    }
351    return UCOL_EQUAL;
352}
353
354U_NAMESPACE_END
355
356#endif  // !UCONFIG_NO_COLLATION
357