1// © 2017 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationbasedatabuilder.h
9*
10* created on: 2012aug11
11* created by: Markus W. Scherer
12*/
13
14#ifndef __COLLATIONBASEDATABUILDER_H__
15#define __COLLATIONBASEDATABUILDER_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "unicode/uniset.h"
22#include "unicode/unistr.h"
23#include "unicode/uscript.h"
24#include "collation.h"
25#include "collationdata.h"
26#include "collationdatabuilder.h"
27#include "normalizer2impl.h"
28#include "utrie2.h"
29#include "uvectr32.h"
30#include "uvectr64.h"
31#include "uvector.h"
32
33U_NAMESPACE_BEGIN
34
35/**
36 * Low-level base CollationData builder.
37 */
38class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder {
39public:
40    CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
41
42    virtual ~CollationBaseDataBuilder();
43
44    void init(UErrorCode &errorCode);
45
46    /**
47     * Sets the Han ranges as ranges of offset CE32s.
48     * Note: Unihan extension A sorts after the other BMP ranges.
49     * See http://www.unicode.org/reports/tr10/#Implicit_Weights
50     *
51     * @param ranges array of ranges of [:Unified_Ideograph:] in collation order,
52     *               as (start, end) code point pairs
53     * @param length number of code points (not pairs)
54     * @param errorCode in/out error code
55     */
56    void initHanRanges(const UChar32 ranges[], int32_t length, UErrorCode &errorCode);
57
58    void setNumericPrimary(uint32_t np) { numericPrimary = np; }
59
60    virtual UBool isCompressibleLeadByte(uint32_t b) const;
61
62    void setCompressibleLeadByte(uint32_t b);
63
64    static int32_t diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible);
65    static int32_t diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible);
66
67    virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
68
69    void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
70    void addRootElement(int64_t ce, UErrorCode &errorCode);
71
72    void addScriptStart(int32_t script, uint32_t p);
73
74    virtual void build(CollationData &data, UErrorCode &errorCode);
75
76    void buildRootElementsTable(UVector32 &table, UErrorCode &errorCode);
77
78private:
79    int32_t writeRootElementsRange(
80            uint32_t prevPrimary, uint32_t p, int32_t i,
81            UVector32 &table, UErrorCode &errorCode);
82
83    // Flags for which primary-weight lead bytes are compressible.
84    UBool compressibleBytes[256];
85    uint32_t numericPrimary;
86    uint32_t firstHanPrimary;
87    uint32_t lastHanPrimary;
88    int32_t hanStep;
89    UVector64 rootElements;
90    uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16];  // need exactly this many
91    uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16];  // should be safely more than needed
92    int32_t scriptStartsLength;
93};
94
95U_NAMESPACE_END
96
97#endif  // !UCONFIG_NO_COLLATION
98#endif  // __COLLATIONBASEDATABUILDER_H__
99