1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010-2012, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  ucharstriebuilder.h
9*   encoding:   UTF-8
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010nov14
14*   created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/ucharstrie.h"
19#include "unicode/ucharstriebuilder.h"
20#include "unicode/unistr.h"
21#include "unicode/ustring.h"
22#include "cmemory.h"
23#include "uarrsort.h"
24#include "uassert.h"
25#include "uhash.h"
26#include "ustr_imp.h"
27
28U_NAMESPACE_BEGIN
29
30/*
31 * Note: This builder implementation stores (string, value) pairs with full copies
32 * of the 16-bit-unit sequences, until the UCharsTrie is built.
33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34 */
35
36class UCharsTrieElement : public UMemory {
37public:
38    // Use compiler's default constructor, initializes nothing.
39
40    void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
41
42    UnicodeString getString(const UnicodeString &strings) const {
43        int32_t length=strings[stringOffset];
44        return strings.tempSubString(stringOffset+1, length);
45    }
46    int32_t getStringLength(const UnicodeString &strings) const {
47        return strings[stringOffset];
48    }
49
50    UChar charAt(int32_t index, const UnicodeString &strings) const {
51        return strings[stringOffset+1+index];
52    }
53
54    int32_t getValue() const { return value; }
55
56    int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
57
58private:
59    // The first strings unit contains the string length.
60    // (Compared with a stringLength field here, this saves 2 bytes per string.)
61    int32_t stringOffset;
62    int32_t value;
63};
64
65void
66UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
67                         UnicodeString &strings, UErrorCode &errorCode) {
68    if(U_FAILURE(errorCode)) {
69        return;
70    }
71    int32_t length=s.length();
72    if(length>0xffff) {
73        // Too long: We store the length in 1 unit.
74        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
75        return;
76    }
77    stringOffset=strings.length();
78    strings.append((UChar)length);
79    value=val;
80    strings.append(s);
81}
82
83int32_t
84UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
85    return getString(strings).compare(other.getString(strings));
86}
87
88UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
89        : elements(NULL), elementsCapacity(0), elementsLength(0),
90          uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
91
92UCharsTrieBuilder::~UCharsTrieBuilder() {
93    delete[] elements;
94    uprv_free(uchars);
95}
96
97UCharsTrieBuilder &
98UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
99    if(U_FAILURE(errorCode)) {
100        return *this;
101    }
102    if(ucharsLength>0) {
103        // Cannot add elements after building.
104        errorCode=U_NO_WRITE_PERMISSION;
105        return *this;
106    }
107    if(elementsLength==elementsCapacity) {
108        int32_t newCapacity;
109        if(elementsCapacity==0) {
110            newCapacity=1024;
111        } else {
112            newCapacity=4*elementsCapacity;
113        }
114        UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
115        if(newElements==NULL) {
116            errorCode=U_MEMORY_ALLOCATION_ERROR;
117            return *this;
118        }
119        if(elementsLength>0) {
120            uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
121        }
122        delete[] elements;
123        elements=newElements;
124        elementsCapacity=newCapacity;
125    }
126    elements[elementsLength++].setTo(s, value, strings, errorCode);
127    if(U_SUCCESS(errorCode) && strings.isBogus()) {
128        errorCode=U_MEMORY_ALLOCATION_ERROR;
129    }
130    return *this;
131}
132
133U_CDECL_BEGIN
134
135static int32_t U_CALLCONV
136compareElementStrings(const void *context, const void *left, const void *right) {
137    const UnicodeString *strings=static_cast<const UnicodeString *>(context);
138    const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
139    const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
140    return leftElement->compareStringTo(*rightElement, *strings);
141}
142
143U_CDECL_END
144
145UCharsTrie *
146UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
147    buildUChars(buildOption, errorCode);
148    UCharsTrie *newTrie=NULL;
149    if(U_SUCCESS(errorCode)) {
150        newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
151        if(newTrie==NULL) {
152            errorCode=U_MEMORY_ALLOCATION_ERROR;
153        } else {
154            uchars=NULL;  // The new trie now owns the array.
155            ucharsCapacity=0;
156        }
157    }
158    return newTrie;
159}
160
161UnicodeString &
162UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
163                                      UErrorCode &errorCode) {
164    buildUChars(buildOption, errorCode);
165    if(U_SUCCESS(errorCode)) {
166        result.setTo(false, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
167    }
168    return result;
169}
170
171void
172UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
173    if(U_FAILURE(errorCode)) {
174        return;
175    }
176    if(uchars!=NULL && ucharsLength>0) {
177        // Already built.
178        return;
179    }
180    if(ucharsLength==0) {
181        if(elementsLength==0) {
182            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
183            return;
184        }
185        if(strings.isBogus()) {
186            errorCode=U_MEMORY_ALLOCATION_ERROR;
187            return;
188        }
189        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
190                      compareElementStrings, &strings,
191                      false,  // need not be a stable sort
192                      &errorCode);
193        if(U_FAILURE(errorCode)) {
194            return;
195        }
196        // Duplicate strings are not allowed.
197        UnicodeString prev=elements[0].getString(strings);
198        for(int32_t i=1; i<elementsLength; ++i) {
199            UnicodeString current=elements[i].getString(strings);
200            if(prev==current) {
201                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
202                return;
203            }
204            prev.fastCopyFrom(current);
205        }
206    }
207    // Create and UChar-serialize the trie for the elements.
208    ucharsLength=0;
209    int32_t capacity=strings.length();
210    if(capacity<1024) {
211        capacity=1024;
212    }
213    if(ucharsCapacity<capacity) {
214        uprv_free(uchars);
215        uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
216        if(uchars==NULL) {
217            errorCode=U_MEMORY_ALLOCATION_ERROR;
218            ucharsCapacity=0;
219            return;
220        }
221        ucharsCapacity=capacity;
222    }
223    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
224    if(uchars==NULL) {
225        errorCode=U_MEMORY_ALLOCATION_ERROR;
226    }
227}
228
229int32_t
230UCharsTrieBuilder::getElementStringLength(int32_t i) const {
231    return elements[i].getStringLength(strings);
232}
233
234UChar
235UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
236    return elements[i].charAt(unitIndex, strings);
237}
238
239int32_t
240UCharsTrieBuilder::getElementValue(int32_t i) const {
241    return elements[i].getValue();
242}
243
244int32_t
245UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
246    const UCharsTrieElement &firstElement=elements[first];
247    const UCharsTrieElement &lastElement=elements[last];
248    int32_t minStringLength=firstElement.getStringLength(strings);
249    while(++unitIndex<minStringLength &&
250            firstElement.charAt(unitIndex, strings)==
251            lastElement.charAt(unitIndex, strings)) {}
252    return unitIndex;
253}
254
255int32_t
256UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
257    int32_t length=0;  // Number of different units at unitIndex.
258    int32_t i=start;
259    do {
260        UChar unit=elements[i++].charAt(unitIndex, strings);
261        while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
262            ++i;
263        }
264        ++length;
265    } while(i<limit);
266    return length;
267}
268
269int32_t
270UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
271    do {
272        UChar unit=elements[i++].charAt(unitIndex, strings);
273        while(unit==elements[i].charAt(unitIndex, strings)) {
274            ++i;
275        }
276    } while(--count>0);
277    return i;
278}
279
280int32_t
281UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
282    while(unit==elements[i].charAt(unitIndex, strings)) {
283        ++i;
284    }
285    return i;
286}
287
288UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
289        : LinearMatchNode(len, nextNode), s(units) {
290    hash=hash*37u+ustr_hashUCharsN(units, len);
291}
292
293bool
294UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
295    if(this==&other) {
296        return true;
297    }
298    if(!LinearMatchNode::operator==(other)) {
299        return false;
300    }
301    const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
302    return 0==u_memcmp(s, o.s, length);
303}
304
305void
306UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
307    UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
308    next->write(builder);
309    b.write(s, length);
310    offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
311}
312
313StringTrieBuilder::Node *
314UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
315                                         Node *nextNode) const {
316    return new UCTLinearMatchNode(
317            elements[i].getString(strings).getBuffer()+unitIndex,
318            length,
319            nextNode);
320}
321
322UBool
323UCharsTrieBuilder::ensureCapacity(int32_t length) {
324    if(uchars==NULL) {
325        return false;  // previous memory allocation had failed
326    }
327    if(length>ucharsCapacity) {
328        int32_t newCapacity=ucharsCapacity;
329        do {
330            newCapacity*=2;
331        } while(newCapacity<=length);
332        UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
333        if(newUChars==NULL) {
334            // unable to allocate memory
335            uprv_free(uchars);
336            uchars=NULL;
337            ucharsCapacity=0;
338            return false;
339        }
340        u_memcpy(newUChars+(newCapacity-ucharsLength),
341                 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
342        uprv_free(uchars);
343        uchars=newUChars;
344        ucharsCapacity=newCapacity;
345    }
346    return true;
347}
348
349int32_t
350UCharsTrieBuilder::write(int32_t unit) {
351    int32_t newLength=ucharsLength+1;
352    if(ensureCapacity(newLength)) {
353        ucharsLength=newLength;
354        uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
355    }
356    return ucharsLength;
357}
358
359int32_t
360UCharsTrieBuilder::write(const UChar *s, int32_t length) {
361    int32_t newLength=ucharsLength+length;
362    if(ensureCapacity(newLength)) {
363        ucharsLength=newLength;
364        u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
365    }
366    return ucharsLength;
367}
368
369int32_t
370UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
371    return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
372}
373
374int32_t
375UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
376    if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
377        return write(i|(isFinal<<15));
378    }
379    UChar intUnits[3];
380    int32_t length;
381    if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
382        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
383        intUnits[1]=(UChar)((uint32_t)i>>16);
384        intUnits[2]=(UChar)i;
385        length=3;
386    // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
387    //     intUnits[0]=(UChar)(i);
388    //     length=1;
389    } else {
390        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
391        intUnits[1]=(UChar)i;
392        length=2;
393    }
394    intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
395    return write(intUnits, length);
396}
397
398int32_t
399UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
400    if(!hasValue) {
401        return write(node);
402    }
403    UChar intUnits[3];
404    int32_t length;
405    if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
406        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
407        intUnits[1]=(UChar)((uint32_t)value>>16);
408        intUnits[2]=(UChar)value;
409        length=3;
410    } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
411        intUnits[0]=(UChar)((value+1)<<6);
412        length=1;
413    } else {
414        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
415        intUnits[1]=(UChar)value;
416        length=2;
417    }
418    intUnits[0]|=(UChar)node;
419    return write(intUnits, length);
420}
421
422int32_t
423UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
424    int32_t i=ucharsLength-jumpTarget;
425    U_ASSERT(i>=0);
426    if(i<=UCharsTrie::kMaxOneUnitDelta) {
427        return write(i);
428    }
429    UChar intUnits[3];
430    int32_t length;
431    if(i<=UCharsTrie::kMaxTwoUnitDelta) {
432        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
433        length=1;
434    } else {
435        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
436        intUnits[1]=(UChar)(i>>16);
437        length=2;
438    }
439    intUnits[length++]=(UChar)i;
440    return write(intUnits, length);
441}
442
443U_NAMESPACE_END
444