1// © 2017 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2012-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* collationbasedatabuilder.h 9* 10* created on: 2012aug11 11* created by: Markus W. Scherer 12*/ 13 14#ifndef __COLLATIONBASEDATABUILDER_H__ 15#define __COLLATIONBASEDATABUILDER_H__ 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_COLLATION 20 21#include "unicode/uniset.h" 22#include "unicode/unistr.h" 23#include "unicode/uscript.h" 24#include "collation.h" 25#include "collationdata.h" 26#include "collationdatabuilder.h" 27#include "normalizer2impl.h" 28#include "utrie2.h" 29#include "uvectr32.h" 30#include "uvectr64.h" 31#include "uvector.h" 32 33U_NAMESPACE_BEGIN 34 35/** 36 * Low-level base CollationData builder. 37 */ 38class U_I18N_API CollationBaseDataBuilder : public CollationDataBuilder { 39public: 40 CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode); 41 42 virtual ~CollationBaseDataBuilder(); 43 44 void init(UErrorCode &errorCode); 45 46 /** 47 * Sets the Han ranges as ranges of offset CE32s. 48 * Note: Unihan extension A sorts after the other BMP ranges. 49 * See http://www.unicode.org/reports/tr10/#Implicit_Weights 50 * 51 * @param ranges array of ranges of [:Unified_Ideograph:] in collation order, 52 * as (start, end) code point pairs 53 * @param length number of code points (not pairs) 54 * @param errorCode in/out error code 55 */ 56 void initHanRanges(const UChar32 ranges[], int32_t length, UErrorCode &errorCode); 57 58 void setNumericPrimary(uint32_t np) { numericPrimary = np; } 59 60 virtual UBool isCompressibleLeadByte(uint32_t b) const; 61 62 void setCompressibleLeadByte(uint32_t b); 63 64 static int32_t diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible); 65 static int32_t diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible); 66 67 virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 68 69 void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); 70 void addRootElement(int64_t ce, UErrorCode &errorCode); 71 72 void addScriptStart(int32_t script, uint32_t p); 73 74 virtual void build(CollationData &data, UErrorCode &errorCode); 75 76 void buildRootElementsTable(UVector32 &table, UErrorCode &errorCode); 77 78private: 79 int32_t writeRootElementsRange( 80 uint32_t prevPrimary, uint32_t p, int32_t i, 81 UVector32 &table, UErrorCode &errorCode); 82 83 // Flags for which primary-weight lead bytes are compressible. 84 UBool compressibleBytes[256]; 85 uint32_t numericPrimary; 86 uint32_t firstHanPrimary; 87 uint32_t lastHanPrimary; 88 int32_t hanStep; 89 UVector64 rootElements; 90 uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16]; // need exactly this many 91 uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16]; // should be safely more than needed 92 int32_t scriptStartsLength; 93}; 94 95U_NAMESPACE_END 96 97#endif // !UCONFIG_NO_COLLATION 98#endif // __COLLATIONBASEDATABUILDER_H__ 99