1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010-2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  ucharstrie.h
9*   encoding:   UTF-8
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010nov14
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/appendable.h"
19#include "unicode/ucharstrie.h"
20#include "unicode/uobject.h"
21#include "unicode/utf16.h"
22#include "cmemory.h"
23#include "uassert.h"
24
25U_NAMESPACE_BEGIN
26
27UCharsTrie::~UCharsTrie() {
28    uprv_free(ownedArray_);
29}
30
31UStringTrieResult
32UCharsTrie::current() const {
33    const char16_t *pos=pos_;
34    if(pos==nullptr) {
35        return USTRINGTRIE_NO_MATCH;
36    } else {
37        int32_t node;
38        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
39                valueResult(node) : USTRINGTRIE_NO_VALUE;
40    }
41}
42
43UStringTrieResult
44UCharsTrie::firstForCodePoint(UChar32 cp) {
45    return cp<=0xffff ?
46        first(cp) :
47        (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
48            next(U16_TRAIL(cp)) :
49            USTRINGTRIE_NO_MATCH);
50}
51
52UStringTrieResult
53UCharsTrie::nextForCodePoint(UChar32 cp) {
54    return cp<=0xffff ?
55        next(cp) :
56        (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
57            next(U16_TRAIL(cp)) :
58            USTRINGTRIE_NO_MATCH);
59}
60
61UStringTrieResult
62UCharsTrie::branchNext(const char16_t *pos, int32_t length, int32_t uchar) {
63    // Branch according to the current unit.
64    if(length==0) {
65        length=*pos++;
66    }
67    ++length;
68    // The length of the branch is the number of units to select from.
69    // The data structure encodes a binary search.
70    while(length>kMaxBranchLinearSubNodeLength) {
71        if(uchar<*pos++) {
72            length>>=1;
73            pos=jumpByDelta(pos);
74        } else {
75            length=length-(length>>1);
76            pos=skipDelta(pos);
77        }
78    }
79    // Drop down to linear search for the last few units.
80    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
81    // and divides length by 2.
82    do {
83        if(uchar==*pos++) {
84            UStringTrieResult result;
85            int32_t node=*pos;
86            if(node&kValueIsFinal) {
87                // Leave the final value for getValue() to read.
88                result=USTRINGTRIE_FINAL_VALUE;
89            } else {
90                // Use the non-final value as the jump delta.
91                ++pos;
92                // int32_t delta=readValue(pos, node);
93                int32_t delta;
94                if(node<kMinTwoUnitValueLead) {
95                    delta=node;
96                } else if(node<kThreeUnitValueLead) {
97                    delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
98                } else {
99                    delta=(pos[0]<<16)|pos[1];
100                    pos+=2;
101                }
102                // end readValue()
103                pos+=delta;
104                node=*pos;
105                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
106            }
107            pos_=pos;
108            return result;
109        }
110        --length;
111        pos=skipValue(pos);
112    } while(length>1);
113    if(uchar==*pos++) {
114        pos_=pos;
115        int32_t node=*pos;
116        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
117    } else {
118        stop();
119        return USTRINGTRIE_NO_MATCH;
120    }
121}
122
123UStringTrieResult
124UCharsTrie::nextImpl(const char16_t *pos, int32_t uchar) {
125    int32_t node=*pos++;
126    for(;;) {
127        if(node<kMinLinearMatch) {
128            return branchNext(pos, node, uchar);
129        } else if(node<kMinValueLead) {
130            // Match the first of length+1 units.
131            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
132            if(uchar==*pos++) {
133                remainingMatchLength_=--length;
134                pos_=pos;
135                return (length<0 && (node=*pos)>=kMinValueLead) ?
136                        valueResult(node) : USTRINGTRIE_NO_VALUE;
137            } else {
138                // No match.
139                break;
140            }
141        } else if(node&kValueIsFinal) {
142            // No further matching units.
143            break;
144        } else {
145            // Skip intermediate value.
146            pos=skipNodeValue(pos, node);
147            node&=kNodeTypeMask;
148        }
149    }
150    stop();
151    return USTRINGTRIE_NO_MATCH;
152}
153
154UStringTrieResult
155UCharsTrie::next(int32_t uchar) {
156    const char16_t *pos=pos_;
157    if(pos==nullptr) {
158        return USTRINGTRIE_NO_MATCH;
159    }
160    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
161    if(length>=0) {
162        // Remaining part of a linear-match node.
163        if(uchar==*pos++) {
164            remainingMatchLength_=--length;
165            pos_=pos;
166            int32_t node;
167            return (length<0 && (node=*pos)>=kMinValueLead) ?
168                    valueResult(node) : USTRINGTRIE_NO_VALUE;
169        } else {
170            stop();
171            return USTRINGTRIE_NO_MATCH;
172        }
173    }
174    return nextImpl(pos, uchar);
175}
176
177UStringTrieResult
178UCharsTrie::next(ConstChar16Ptr ptr, int32_t sLength) {
179    const char16_t *s=ptr;
180    if(sLength<0 ? *s==0 : sLength==0) {
181        // Empty input.
182        return current();
183    }
184    const char16_t *pos=pos_;
185    if(pos==nullptr) {
186        return USTRINGTRIE_NO_MATCH;
187    }
188    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
189    for(;;) {
190        // Fetch the next input unit, if there is one.
191        // Continue a linear-match node without rechecking sLength<0.
192        int32_t uchar;
193        if(sLength<0) {
194            for(;;) {
195                if((uchar=*s++)==0) {
196                    remainingMatchLength_=length;
197                    pos_=pos;
198                    int32_t node;
199                    return (length<0 && (node=*pos)>=kMinValueLead) ?
200                            valueResult(node) : USTRINGTRIE_NO_VALUE;
201                }
202                if(length<0) {
203                    remainingMatchLength_=length;
204                    break;
205                }
206                if(uchar!=*pos) {
207                    stop();
208                    return USTRINGTRIE_NO_MATCH;
209                }
210                ++pos;
211                --length;
212            }
213        } else {
214            for(;;) {
215                if(sLength==0) {
216                    remainingMatchLength_=length;
217                    pos_=pos;
218                    int32_t node;
219                    return (length<0 && (node=*pos)>=kMinValueLead) ?
220                            valueResult(node) : USTRINGTRIE_NO_VALUE;
221                }
222                uchar=*s++;
223                --sLength;
224                if(length<0) {
225                    remainingMatchLength_=length;
226                    break;
227                }
228                if(uchar!=*pos) {
229                    stop();
230                    return USTRINGTRIE_NO_MATCH;
231                }
232                ++pos;
233                --length;
234            }
235        }
236        int32_t node=*pos++;
237        for(;;) {
238            if(node<kMinLinearMatch) {
239                UStringTrieResult result=branchNext(pos, node, uchar);
240                if(result==USTRINGTRIE_NO_MATCH) {
241                    return USTRINGTRIE_NO_MATCH;
242                }
243                // Fetch the next input unit, if there is one.
244                if(sLength<0) {
245                    if((uchar=*s++)==0) {
246                        return result;
247                    }
248                } else {
249                    if(sLength==0) {
250                        return result;
251                    }
252                    uchar=*s++;
253                    --sLength;
254                }
255                if(result==USTRINGTRIE_FINAL_VALUE) {
256                    // No further matching units.
257                    stop();
258                    return USTRINGTRIE_NO_MATCH;
259                }
260                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
261                node=*pos++;
262            } else if(node<kMinValueLead) {
263                // Match length+1 units.
264                length=node-kMinLinearMatch;  // Actual match length minus 1.
265                if(uchar!=*pos) {
266                    stop();
267                    return USTRINGTRIE_NO_MATCH;
268                }
269                ++pos;
270                --length;
271                break;
272            } else if(node&kValueIsFinal) {
273                // No further matching units.
274                stop();
275                return USTRINGTRIE_NO_MATCH;
276            } else {
277                // Skip intermediate value.
278                pos=skipNodeValue(pos, node);
279                node&=kNodeTypeMask;
280            }
281        }
282    }
283}
284
285const char16_t *
286UCharsTrie::findUniqueValueFromBranch(const char16_t *pos, int32_t length,
287                                      UBool haveUniqueValue, int32_t &uniqueValue) {
288    while(length>kMaxBranchLinearSubNodeLength) {
289        ++pos;  // ignore the comparison unit
290        if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
291            return nullptr;
292        }
293        length=length-(length>>1);
294        pos=skipDelta(pos);
295    }
296    do {
297        ++pos;  // ignore a comparison unit
298        // handle its value
299        int32_t node=*pos++;
300        UBool isFinal=(UBool)(node>>15);
301        node&=0x7fff;
302        int32_t value=readValue(pos, node);
303        pos=skipValue(pos, node);
304        if(isFinal) {
305            if(haveUniqueValue) {
306                if(value!=uniqueValue) {
307                    return nullptr;
308                }
309            } else {
310                uniqueValue=value;
311                haveUniqueValue=true;
312            }
313        } else {
314            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
315                return nullptr;
316            }
317            haveUniqueValue=true;
318        }
319    } while(--length>1);
320    return pos+1;  // ignore the last comparison unit
321}
322
323UBool
324UCharsTrie::findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
325    int32_t node=*pos++;
326    for(;;) {
327        if(node<kMinLinearMatch) {
328            if(node==0) {
329                node=*pos++;
330            }
331            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
332            if(pos==nullptr) {
333                return false;
334            }
335            haveUniqueValue=true;
336            node=*pos++;
337        } else if(node<kMinValueLead) {
338            // linear-match node
339            pos+=node-kMinLinearMatch+1;  // Ignore the match units.
340            node=*pos++;
341        } else {
342            UBool isFinal=(UBool)(node>>15);
343            int32_t value;
344            if(isFinal) {
345                value=readValue(pos, node&0x7fff);
346            } else {
347                value=readNodeValue(pos, node);
348            }
349            if(haveUniqueValue) {
350                if(value!=uniqueValue) {
351                    return false;
352                }
353            } else {
354                uniqueValue=value;
355                haveUniqueValue=true;
356            }
357            if(isFinal) {
358                return true;
359            }
360            pos=skipNodeValue(pos, node);
361            node&=kNodeTypeMask;
362        }
363    }
364}
365
366int32_t
367UCharsTrie::getNextUChars(Appendable &out) const {
368    const char16_t *pos=pos_;
369    if(pos==nullptr) {
370        return 0;
371    }
372    if(remainingMatchLength_>=0) {
373        out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
374        return 1;
375    }
376    int32_t node=*pos++;
377    if(node>=kMinValueLead) {
378        if(node&kValueIsFinal) {
379            return 0;
380        } else {
381            pos=skipNodeValue(pos, node);
382            node&=kNodeTypeMask;
383        }
384    }
385    if(node<kMinLinearMatch) {
386        if(node==0) {
387            node=*pos++;
388        }
389        out.reserveAppendCapacity(++node);
390        getNextBranchUChars(pos, node, out);
391        return node;
392    } else {
393        // First unit of the linear-match node.
394        out.appendCodeUnit(*pos);
395        return 1;
396    }
397}
398
399void
400UCharsTrie::getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out) {
401    while(length>kMaxBranchLinearSubNodeLength) {
402        ++pos;  // ignore the comparison unit
403        getNextBranchUChars(jumpByDelta(pos), length>>1, out);
404        length=length-(length>>1);
405        pos=skipDelta(pos);
406    }
407    do {
408        out.appendCodeUnit(*pos++);
409        pos=skipValue(pos);
410    } while(--length>1);
411    out.appendCodeUnit(*pos);
412}
413
414U_NAMESPACE_END
415