1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010-2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  bytestrie.cpp
9*   encoding:   UTF-8
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010sep25
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/bytestream.h"
19#include "unicode/bytestrie.h"
20#include "unicode/uobject.h"
21#include "cmemory.h"
22#include "uassert.h"
23
24U_NAMESPACE_BEGIN
25
26BytesTrie::~BytesTrie() {
27    uprv_free(ownedArray_);
28}
29
30// lead byte already shifted right by 1.
31int32_t
32BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
33    int32_t value;
34    if(leadByte<kMinTwoByteValueLead) {
35        value=leadByte-kMinOneByteValueLead;
36    } else if(leadByte<kMinThreeByteValueLead) {
37        value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
38    } else if(leadByte<kFourByteValueLead) {
39        value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
40    } else if(leadByte==kFourByteValueLead) {
41        value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
42    } else {
43        value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
44    }
45    return value;
46}
47
48const uint8_t *
49BytesTrie::jumpByDelta(const uint8_t *pos) {
50    int32_t delta=*pos++;
51    if(delta<kMinTwoByteDeltaLead) {
52        // nothing to do
53    } else if(delta<kMinThreeByteDeltaLead) {
54        delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
55    } else if(delta<kFourByteDeltaLead) {
56        delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
57        pos+=2;
58    } else if(delta==kFourByteDeltaLead) {
59        delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
60        pos+=3;
61    } else {
62        delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
63        pos+=4;
64    }
65    return pos+delta;
66}
67
68UStringTrieResult
69BytesTrie::current() const {
70    const uint8_t *pos=pos_;
71    if(pos==nullptr) {
72        return USTRINGTRIE_NO_MATCH;
73    } else {
74        int32_t node;
75        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
76                valueResult(node) : USTRINGTRIE_NO_VALUE;
77    }
78}
79
80UStringTrieResult
81BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
82    // Branch according to the current byte.
83    if(length==0) {
84        length=*pos++;
85    }
86    ++length;
87    // The length of the branch is the number of bytes to select from.
88    // The data structure encodes a binary search.
89    while(length>kMaxBranchLinearSubNodeLength) {
90        if(inByte<*pos++) {
91            length>>=1;
92            pos=jumpByDelta(pos);
93        } else {
94            length=length-(length>>1);
95            pos=skipDelta(pos);
96        }
97    }
98    // Drop down to linear search for the last few bytes.
99    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
100    // and divides length by 2.
101    do {
102        if(inByte==*pos++) {
103            UStringTrieResult result;
104            int32_t node=*pos;
105            U_ASSERT(node>=kMinValueLead);
106            if(node&kValueIsFinal) {
107                // Leave the final value for getValue() to read.
108                result=USTRINGTRIE_FINAL_VALUE;
109            } else {
110                // Use the non-final value as the jump delta.
111                ++pos;
112                // int32_t delta=readValue(pos, node>>1);
113                node>>=1;
114                int32_t delta;
115                if(node<kMinTwoByteValueLead) {
116                    delta=node-kMinOneByteValueLead;
117                } else if(node<kMinThreeByteValueLead) {
118                    delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
119                } else if(node<kFourByteValueLead) {
120                    delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
121                    pos+=2;
122                } else if(node==kFourByteValueLead) {
123                    delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
124                    pos+=3;
125                } else {
126                    delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
127                    pos+=4;
128                }
129                // end readValue()
130                pos+=delta;
131                node=*pos;
132                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
133            }
134            pos_=pos;
135            return result;
136        }
137        --length;
138        pos=skipValue(pos);
139    } while(length>1);
140    if(inByte==*pos++) {
141        pos_=pos;
142        int32_t node=*pos;
143        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
144    } else {
145        stop();
146        return USTRINGTRIE_NO_MATCH;
147    }
148}
149
150UStringTrieResult
151BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
152    for(;;) {
153        int32_t node=*pos++;
154        if(node<kMinLinearMatch) {
155            return branchNext(pos, node, inByte);
156        } else if(node<kMinValueLead) {
157            // Match the first of length+1 bytes.
158            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
159            if(inByte==*pos++) {
160                remainingMatchLength_=--length;
161                pos_=pos;
162                return (length<0 && (node=*pos)>=kMinValueLead) ?
163                        valueResult(node) : USTRINGTRIE_NO_VALUE;
164            } else {
165                // No match.
166                break;
167            }
168        } else if(node&kValueIsFinal) {
169            // No further matching bytes.
170            break;
171        } else {
172            // Skip intermediate value.
173            pos=skipValue(pos, node);
174            // The next node must not also be a value node.
175            U_ASSERT(*pos<kMinValueLead);
176        }
177    }
178    stop();
179    return USTRINGTRIE_NO_MATCH;
180}
181
182UStringTrieResult
183BytesTrie::next(int32_t inByte) {
184    const uint8_t *pos=pos_;
185    if(pos==nullptr) {
186        return USTRINGTRIE_NO_MATCH;
187    }
188    if(inByte<0) {
189        inByte+=0x100;
190    }
191    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
192    if(length>=0) {
193        // Remaining part of a linear-match node.
194        if(inByte==*pos++) {
195            remainingMatchLength_=--length;
196            pos_=pos;
197            int32_t node;
198            return (length<0 && (node=*pos)>=kMinValueLead) ?
199                    valueResult(node) : USTRINGTRIE_NO_VALUE;
200        } else {
201            stop();
202            return USTRINGTRIE_NO_MATCH;
203        }
204    }
205    return nextImpl(pos, inByte);
206}
207
208UStringTrieResult
209BytesTrie::next(const char *s, int32_t sLength) {
210    if(sLength<0 ? *s==0 : sLength==0) {
211        // Empty input.
212        return current();
213    }
214    const uint8_t *pos=pos_;
215    if(pos==nullptr) {
216        return USTRINGTRIE_NO_MATCH;
217    }
218    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
219    for(;;) {
220        // Fetch the next input byte, if there is one.
221        // Continue a linear-match node without rechecking sLength<0.
222        int32_t inByte;
223        if(sLength<0) {
224            for(;;) {
225                if((inByte=*s++)==0) {
226                    remainingMatchLength_=length;
227                    pos_=pos;
228                    int32_t node;
229                    return (length<0 && (node=*pos)>=kMinValueLead) ?
230                            valueResult(node) : USTRINGTRIE_NO_VALUE;
231                }
232                if(length<0) {
233                    remainingMatchLength_=length;
234                    break;
235                }
236                if(inByte!=*pos) {
237                    stop();
238                    return USTRINGTRIE_NO_MATCH;
239                }
240                ++pos;
241                --length;
242            }
243        } else {
244            for(;;) {
245                if(sLength==0) {
246                    remainingMatchLength_=length;
247                    pos_=pos;
248                    int32_t node;
249                    return (length<0 && (node=*pos)>=kMinValueLead) ?
250                            valueResult(node) : USTRINGTRIE_NO_VALUE;
251                }
252                inByte=*s++;
253                --sLength;
254                if(length<0) {
255                    remainingMatchLength_=length;
256                    break;
257                }
258                if(inByte!=*pos) {
259                    stop();
260                    return USTRINGTRIE_NO_MATCH;
261                }
262                ++pos;
263                --length;
264            }
265        }
266        for(;;) {
267            int32_t node=*pos++;
268            if(node<kMinLinearMatch) {
269                UStringTrieResult result=branchNext(pos, node, inByte);
270                if(result==USTRINGTRIE_NO_MATCH) {
271                    return USTRINGTRIE_NO_MATCH;
272                }
273                // Fetch the next input byte, if there is one.
274                if(sLength<0) {
275                    if((inByte=*s++)==0) {
276                        return result;
277                    }
278                } else {
279                    if(sLength==0) {
280                        return result;
281                    }
282                    inByte=*s++;
283                    --sLength;
284                }
285                if(result==USTRINGTRIE_FINAL_VALUE) {
286                    // No further matching bytes.
287                    stop();
288                    return USTRINGTRIE_NO_MATCH;
289                }
290                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
291            } else if(node<kMinValueLead) {
292                // Match length+1 bytes.
293                length=node-kMinLinearMatch;  // Actual match length minus 1.
294                if(inByte!=*pos) {
295                    stop();
296                    return USTRINGTRIE_NO_MATCH;
297                }
298                ++pos;
299                --length;
300                break;
301            } else if(node&kValueIsFinal) {
302                // No further matching bytes.
303                stop();
304                return USTRINGTRIE_NO_MATCH;
305            } else {
306                // Skip intermediate value.
307                pos=skipValue(pos, node);
308                // The next node must not also be a value node.
309                U_ASSERT(*pos<kMinValueLead);
310            }
311        }
312    }
313}
314
315const uint8_t *
316BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
317                                     UBool haveUniqueValue, int32_t &uniqueValue) {
318    while(length>kMaxBranchLinearSubNodeLength) {
319        ++pos;  // ignore the comparison byte
320        if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
321            return nullptr;
322        }
323        length=length-(length>>1);
324        pos=skipDelta(pos);
325    }
326    do {
327        ++pos;  // ignore a comparison byte
328        // handle its value
329        int32_t node=*pos++;
330        UBool isFinal=(UBool)(node&kValueIsFinal);
331        int32_t value=readValue(pos, node>>1);
332        pos=skipValue(pos, node);
333        if(isFinal) {
334            if(haveUniqueValue) {
335                if(value!=uniqueValue) {
336                    return nullptr;
337                }
338            } else {
339                uniqueValue=value;
340                haveUniqueValue=true;
341            }
342        } else {
343            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
344                return nullptr;
345            }
346            haveUniqueValue=true;
347        }
348    } while(--length>1);
349    return pos+1;  // ignore the last comparison byte
350}
351
352UBool
353BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
354    for(;;) {
355        int32_t node=*pos++;
356        if(node<kMinLinearMatch) {
357            if(node==0) {
358                node=*pos++;
359            }
360            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
361            if(pos==nullptr) {
362                return false;
363            }
364            haveUniqueValue=true;
365        } else if(node<kMinValueLead) {
366            // linear-match node
367            pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
368        } else {
369            UBool isFinal=(UBool)(node&kValueIsFinal);
370            int32_t value=readValue(pos, node>>1);
371            if(haveUniqueValue) {
372                if(value!=uniqueValue) {
373                    return false;
374                }
375            } else {
376                uniqueValue=value;
377                haveUniqueValue=true;
378            }
379            if(isFinal) {
380                return true;
381            }
382            pos=skipValue(pos, node);
383        }
384    }
385}
386
387int32_t
388BytesTrie::getNextBytes(ByteSink &out) const {
389    const uint8_t *pos=pos_;
390    if(pos==nullptr) {
391        return 0;
392    }
393    if(remainingMatchLength_>=0) {
394        append(out, *pos);  // Next byte of a pending linear-match node.
395        return 1;
396    }
397    int32_t node=*pos++;
398    if(node>=kMinValueLead) {
399        if(node&kValueIsFinal) {
400            return 0;
401        } else {
402            pos=skipValue(pos, node);
403            node=*pos++;
404            U_ASSERT(node<kMinValueLead);
405        }
406    }
407    if(node<kMinLinearMatch) {
408        if(node==0) {
409            node=*pos++;
410        }
411        getNextBranchBytes(pos, ++node, out);
412        return node;
413    } else {
414        // First byte of the linear-match node.
415        append(out, *pos);
416        return 1;
417    }
418}
419
420void
421BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
422    while(length>kMaxBranchLinearSubNodeLength) {
423        ++pos;  // ignore the comparison byte
424        getNextBranchBytes(jumpByDelta(pos), length>>1, out);
425        length=length-(length>>1);
426        pos=skipDelta(pos);
427    }
428    do {
429        append(out, *pos++);
430        pos=skipValue(pos);
431    } while(--length>1);
432    append(out, *pos);
433}
434
435void
436BytesTrie::append(ByteSink &out, int c) {
437    char ch=(char)c;
438    out.Append(&ch, 1);
439}
440
441U_NAMESPACE_END
442