// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  namespropsbuilder.cpp (was gennames/gennames.c)
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep30
*   created by: Markus W. Scherer
*
*   This builder reads Unicode character names and aliases,
*   tokenizes and compresses them, and builds
*   compact binary tables for random-access lookup
*   in a u_charName() API function.
*
* unames.icu file format (after UDataInfo header etc. - see udata.c)
* (all data is static const)
*
* UDataInfo fields:
*   dataFormat "unam"
*   formatVersion 1.0
*   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
*
* -- data-based names
* uint32_t tokenStringOffset,
*          groupsOffset,
*          groupStringOffset,
*          algNamesOffset;
*
* uint16_t tokenCount;
* uint16_t tokenTable[tokenCount];
*
* char     tokenStrings[]; -- padded to even count
*
* -- strings (groupStrings) are tokenized as follows:
*   for each character c
*       if(c>=tokenCount) write that character c directly
*   else
*       token=tokenTable[c];
*       if(token==0xfffe) -- lead byte of double-byte token
*           token=tokenTable[c<<8|next character];
*       if(token==-1) -- (uint16_t)-1, i.e. 0xffff
*           write c directly
*       else
*           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
*           append zero-terminated tokenString;
*
*    Different strings for a code point - normal name, 1.0 name, and ISO comment -
*    are separated by ';'.
*
* uint16_t groupCount;
* struct {
*   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
*   uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32 bit-offset
*   uint16_t offsetLow;
* } groupTable[groupCount];
*
* char     groupStrings[]; -- padded to 4-count
*
* -- The actual, tokenized group strings are not zero-terminated because
*   that would take up too much space.
*   Instead, they are preceded by their length, written in a variable-length sequence:
*   For each of the 32 group strings, one or two nibbles are stored for its length.
*   Nibbles (4-bit values, half-bytes) are read MSB first.
*   A nibble with a value of 0..11 directly indicates the length of the name string.
*   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
*   by (((n-12)<<4)|m)+12, reaching values of 12..75.
*   These lengths are stored sequentially, one for each tokenized string
*   (they are not the lengths of the de-tokenized results).
*   For the de-tokenizing, see token description above; the strings immediately follow the
*   32 lengths.
*
* -- algorithmic names
*
* typedef struct AlgorithmicRange {
*   uint32_t rangeStart, rangeEnd;
*   uint8_t algorithmType, algorithmVariant;
*   uint16_t rangeSize;
* } AlgorithmicRange;
*
* uint32_t algRangesCount; -- number of data blocks for ranges of
*               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
*
* struct {
*   AlgorithmicRange algRange;
*   uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
* -- not a real array because each part has a different size
*    of algRange.rangeSize (including AlgorithmicRange)
*
* -- algorithmic range types:
*
* 0 Names are formed from a string prefix that is stored in
*   the algRangeData (zero-terminated), followed by the Unicode code point
*   of the character in hexadecimal digits;
*   algRange.algorithmVariant digits are written
*
* 1 Names are formed by calculating modulo-factors of the code point value as follows:
*   algRange.algorithmVariant is the count of modulo factors
*   algRangeData contains
*       uint16_t factors[algRange.algorithmVariant];
*       char strings[];
*   the first zero-terminated string is written as the prefix; then:
*
*   The
rangeStart is subtracted; with the difference, here "code": 1122e5b6d6dSopenharmony_ci* for(i=algRange.algorithmVariant-1 to 0 step -1) 1132e5b6d6dSopenharmony_ci* index[i]=code%factor[i]; 1142e5b6d6dSopenharmony_ci* code/=factor[i]; 1152e5b6d6dSopenharmony_ci* 1162e5b6d6dSopenharmony_ci* The strings after the prefix are short pieces that are then appended to the result 1172e5b6d6dSopenharmony_ci* according to index[0..algRange.algorithmVariant-1]. 1182e5b6d6dSopenharmony_ci*/ 1192e5b6d6dSopenharmony_ci 1202e5b6d6dSopenharmony_ci#include <stdio.h> 1212e5b6d6dSopenharmony_ci#include "unicode/utypes.h" 1222e5b6d6dSopenharmony_ci#include "unicode/putil.h" 1232e5b6d6dSopenharmony_ci#include "unicode/udata.h" 1242e5b6d6dSopenharmony_ci#include "charstr.h" 1252e5b6d6dSopenharmony_ci#include "cmemory.h" 1262e5b6d6dSopenharmony_ci#include "cstring.h" 1272e5b6d6dSopenharmony_ci#include "genprops.h" 1282e5b6d6dSopenharmony_ci#include "ppucd.h" 1292e5b6d6dSopenharmony_ci#include "uarrsort.h" 1302e5b6d6dSopenharmony_ci#include "uassert.h" 1312e5b6d6dSopenharmony_ci#include "unewdata.h" 1322e5b6d6dSopenharmony_ci#include "uoptions.h" 1332e5b6d6dSopenharmony_ci 1342e5b6d6dSopenharmony_ci#define STRING_STORE_SIZE 2000000 1352e5b6d6dSopenharmony_ci#define GROUP_STORE_SIZE 5000 1362e5b6d6dSopenharmony_ci 1372e5b6d6dSopenharmony_ci#define GROUP_SHIFT 5 1382e5b6d6dSopenharmony_ci#define LINES_PER_GROUP (1UL<<GROUP_SHIFT) 1392e5b6d6dSopenharmony_ci#define GROUP_MASK (LINES_PER_GROUP-1) 1402e5b6d6dSopenharmony_ci 1412e5b6d6dSopenharmony_ci#define MAX_LINE_COUNT 50000 1422e5b6d6dSopenharmony_ci#define MAX_WORD_COUNT 20000 1432e5b6d6dSopenharmony_ci#define MAX_GROUP_COUNT 5000 1442e5b6d6dSopenharmony_ci 1452e5b6d6dSopenharmony_ci#define NAME_SEPARATOR_CHAR ';' 1462e5b6d6dSopenharmony_ci 1472e5b6d6dSopenharmony_ci/* generator data ----------------------------------------------------------- */ 1482e5b6d6dSopenharmony_ci 1492e5b6d6dSopenharmony_ciU_NAMESPACE_USE 1502e5b6d6dSopenharmony_ci 
1512e5b6d6dSopenharmony_ci/* UDataInfo cf. udata.h */ 1522e5b6d6dSopenharmony_cistatic UDataInfo dataInfo={ 1532e5b6d6dSopenharmony_ci sizeof(UDataInfo), 1542e5b6d6dSopenharmony_ci 0, 1552e5b6d6dSopenharmony_ci 1562e5b6d6dSopenharmony_ci U_IS_BIG_ENDIAN, 1572e5b6d6dSopenharmony_ci U_CHARSET_FAMILY, 1582e5b6d6dSopenharmony_ci sizeof(UChar), 1592e5b6d6dSopenharmony_ci 0, 1602e5b6d6dSopenharmony_ci 1612e5b6d6dSopenharmony_ci {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */ 1622e5b6d6dSopenharmony_ci {1, 0, 0, 0}, /* formatVersion */ 1632e5b6d6dSopenharmony_ci {3, 0, 0, 0} /* dataVersion */ 1642e5b6d6dSopenharmony_ci}; 1652e5b6d6dSopenharmony_ci 1662e5b6d6dSopenharmony_cistatic uint8_t stringStore[STRING_STORE_SIZE], 1672e5b6d6dSopenharmony_ci groupStore[GROUP_STORE_SIZE], 1682e5b6d6dSopenharmony_ci lineLengths[LINES_PER_GROUP]; 1692e5b6d6dSopenharmony_ci 1702e5b6d6dSopenharmony_cistatic uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop; 1712e5b6d6dSopenharmony_ci 1722e5b6d6dSopenharmony_citypedef struct { 1732e5b6d6dSopenharmony_ci uint32_t code; 1742e5b6d6dSopenharmony_ci int16_t length; 1752e5b6d6dSopenharmony_ci uint8_t *s; 1762e5b6d6dSopenharmony_ci} Line; 1772e5b6d6dSopenharmony_ci 1782e5b6d6dSopenharmony_citypedef struct { 1792e5b6d6dSopenharmony_ci int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */ 1802e5b6d6dSopenharmony_ci int16_t count; 1812e5b6d6dSopenharmony_ci int16_t length; 1822e5b6d6dSopenharmony_ci uint8_t *s; 1832e5b6d6dSopenharmony_ci} Word; 1842e5b6d6dSopenharmony_ci 1852e5b6d6dSopenharmony_cistatic Line lines[MAX_LINE_COUNT]; 1862e5b6d6dSopenharmony_cistatic Word words[MAX_WORD_COUNT]; 1872e5b6d6dSopenharmony_ci 1882e5b6d6dSopenharmony_cistatic uint32_t lineCount=0, wordCount=0; 1892e5b6d6dSopenharmony_ci 1902e5b6d6dSopenharmony_cistatic int16_t leadByteCount; 1912e5b6d6dSopenharmony_ci 1922e5b6d6dSopenharmony_ci#define LEADBYTE_LIMIT 16 1932e5b6d6dSopenharmony_ci 
1942e5b6d6dSopenharmony_cistatic int16_t tokens[LEADBYTE_LIMIT*256]; 1952e5b6d6dSopenharmony_cistatic uint32_t tokenCount; 1962e5b6d6dSopenharmony_ci 1972e5b6d6dSopenharmony_ci/* the structure for algorithmic names needs to be 4-aligned */ 1982e5b6d6dSopenharmony_cistruct AlgorithmicRange { 1992e5b6d6dSopenharmony_ci UChar32 start, end; 2002e5b6d6dSopenharmony_ci uint8_t type, variant; 2012e5b6d6dSopenharmony_ci uint16_t size; 2022e5b6d6dSopenharmony_ci}; 2032e5b6d6dSopenharmony_ci 2042e5b6d6dSopenharmony_ciclass NamesPropsBuilder : public PropsBuilder { 2052e5b6d6dSopenharmony_cipublic: 2062e5b6d6dSopenharmony_ci NamesPropsBuilder(UErrorCode &errorCode); 2072e5b6d6dSopenharmony_ci virtual ~NamesPropsBuilder(); 2082e5b6d6dSopenharmony_ci 2092e5b6d6dSopenharmony_ci virtual void setUnicodeVersion(const UVersionInfo version); 2102e5b6d6dSopenharmony_ci virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode); 2112e5b6d6dSopenharmony_ci virtual void build(UErrorCode &errorCode); 2122e5b6d6dSopenharmony_ci virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode); 2132e5b6d6dSopenharmony_ci 2142e5b6d6dSopenharmony_ciprivate: 2152e5b6d6dSopenharmony_ci virtual void setAlgNamesRange(UChar32 start, UChar32 end, 2162e5b6d6dSopenharmony_ci const char *type, const char *prefix, UErrorCode &errorCode); 2172e5b6d6dSopenharmony_ci 2182e5b6d6dSopenharmony_ci CharString algRanges; 2192e5b6d6dSopenharmony_ci int32_t countAlgRanges; 2202e5b6d6dSopenharmony_ci}; 2212e5b6d6dSopenharmony_ci 2222e5b6d6dSopenharmony_ciNamesPropsBuilder::NamesPropsBuilder(UErrorCode &errorCode) 2232e5b6d6dSopenharmony_ci : countAlgRanges(0) { 2242e5b6d6dSopenharmony_ci for(int i=0; i<256; ++i) { 2252e5b6d6dSopenharmony_ci tokens[i]=0; 2262e5b6d6dSopenharmony_ci } 2272e5b6d6dSopenharmony_ci} 2282e5b6d6dSopenharmony_ci 2292e5b6d6dSopenharmony_ciNamesPropsBuilder::~NamesPropsBuilder() { 2302e5b6d6dSopenharmony_ci} 2312e5b6d6dSopenharmony_ci 
2322e5b6d6dSopenharmony_civoid 2332e5b6d6dSopenharmony_ciNamesPropsBuilder::setUnicodeVersion(const UVersionInfo version) { 2342e5b6d6dSopenharmony_ci uprv_memcpy(dataInfo.dataVersion, version, 4); 2352e5b6d6dSopenharmony_ci} 2362e5b6d6dSopenharmony_ci 2372e5b6d6dSopenharmony_ci/* prototypes --------------------------------------------------------------- */ 2382e5b6d6dSopenharmony_ci 2392e5b6d6dSopenharmony_cistatic void 2402e5b6d6dSopenharmony_ciparseName(const char *name, int16_t length); 2412e5b6d6dSopenharmony_ci 2422e5b6d6dSopenharmony_cistatic int16_t 2432e5b6d6dSopenharmony_ciskipNoise(const char *line, int16_t start, int16_t limit); 2442e5b6d6dSopenharmony_ci 2452e5b6d6dSopenharmony_cistatic int16_t 2462e5b6d6dSopenharmony_cigetWord(const char *line, int16_t start, int16_t limit); 2472e5b6d6dSopenharmony_ci 2482e5b6d6dSopenharmony_cistatic void 2492e5b6d6dSopenharmony_cicompress(UErrorCode &errorCode); 2502e5b6d6dSopenharmony_ci 2512e5b6d6dSopenharmony_cistatic void 2522e5b6d6dSopenharmony_cicompressLines(void); 2532e5b6d6dSopenharmony_ci 2542e5b6d6dSopenharmony_cistatic int16_t 2552e5b6d6dSopenharmony_cicompressLine(uint8_t *s, int16_t length, int16_t *pGroupTop); 2562e5b6d6dSopenharmony_ci 2572e5b6d6dSopenharmony_cistatic int32_t 2582e5b6d6dSopenharmony_cicompareWords(const void *context, const void *word1, const void *word2); 2592e5b6d6dSopenharmony_ci 2602e5b6d6dSopenharmony_cistatic int16_t 2612e5b6d6dSopenharmony_cifindToken(uint8_t *s, int16_t length); 2622e5b6d6dSopenharmony_ci 2632e5b6d6dSopenharmony_cistatic Word * 2642e5b6d6dSopenharmony_cifindWord(const char *s, int16_t length); 2652e5b6d6dSopenharmony_ci 2662e5b6d6dSopenharmony_cistatic Word * 2672e5b6d6dSopenharmony_ciaddWord(const char *s, int16_t length); 2682e5b6d6dSopenharmony_ci 2692e5b6d6dSopenharmony_cistatic void 2702e5b6d6dSopenharmony_cicountWord(Word *word); 2712e5b6d6dSopenharmony_ci 2722e5b6d6dSopenharmony_cistatic void 2732e5b6d6dSopenharmony_ciaddLine(UChar32 code, const char 
*names[], int16_t lengths[], int16_t count); 2742e5b6d6dSopenharmony_ci 2752e5b6d6dSopenharmony_cistatic void 2762e5b6d6dSopenharmony_ciaddGroup(uint32_t groupMSB, uint8_t *strings, int16_t length); 2772e5b6d6dSopenharmony_ci 2782e5b6d6dSopenharmony_cistatic uint32_t 2792e5b6d6dSopenharmony_ciaddToken(uint8_t *s, int16_t length); 2802e5b6d6dSopenharmony_ci 2812e5b6d6dSopenharmony_cistatic void 2822e5b6d6dSopenharmony_ciappendLineLength(int16_t length); 2832e5b6d6dSopenharmony_ci 2842e5b6d6dSopenharmony_cistatic void 2852e5b6d6dSopenharmony_ciappendLineLengthNibble(uint8_t nibble); 2862e5b6d6dSopenharmony_ci 2872e5b6d6dSopenharmony_cistatic uint8_t * 2882e5b6d6dSopenharmony_ciallocLine(int32_t length); 2892e5b6d6dSopenharmony_ci 2902e5b6d6dSopenharmony_cistatic uint8_t * 2912e5b6d6dSopenharmony_ciallocWord(uint32_t length); 2922e5b6d6dSopenharmony_ci 2932e5b6d6dSopenharmony_ci/* parsing ------------------------------------------------------------------ */ 2942e5b6d6dSopenharmony_ci 2952e5b6d6dSopenharmony_civoid 2962e5b6d6dSopenharmony_ciNamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, 2972e5b6d6dSopenharmony_ci UErrorCode &errorCode) { 2982e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return; } 2992e5b6d6dSopenharmony_ci if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) { 3002e5b6d6dSopenharmony_ci return; 3012e5b6d6dSopenharmony_ci } 3022e5b6d6dSopenharmony_ci 3032e5b6d6dSopenharmony_ci U_ASSERT(props.start==props.end); 3042e5b6d6dSopenharmony_ci 3052e5b6d6dSopenharmony_ci const char *names[4]={ NULL, NULL, NULL, NULL }; 3062e5b6d6dSopenharmony_ci int16_t lengths[4]={ 0, 0, 0, 0 }; 3072e5b6d6dSopenharmony_ci 3082e5b6d6dSopenharmony_ci /* get the character name */ 3092e5b6d6dSopenharmony_ci if(props.name!=NULL) { 3102e5b6d6dSopenharmony_ci names[0]=props.name; 3112e5b6d6dSopenharmony_ci lengths[0]=(int16_t)uprv_strlen(props.name); 3122e5b6d6dSopenharmony_ci parseName(names[0], lengths[0]); 
3132e5b6d6dSopenharmony_ci } 3142e5b6d6dSopenharmony_ci 3152e5b6d6dSopenharmony_ci CharString buffer; 3162e5b6d6dSopenharmony_ci if(props.nameAlias!=NULL) { 3172e5b6d6dSopenharmony_ci /* 3182e5b6d6dSopenharmony_ci * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line. 3192e5b6d6dSopenharmony_ci * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character. 3202e5b6d6dSopenharmony_ci */ 3212e5b6d6dSopenharmony_ci const char *corr=uprv_strstr(props.nameAlias, "correction="); 3222e5b6d6dSopenharmony_ci if(corr!=NULL) { 3232e5b6d6dSopenharmony_ci corr+=11; // skip "correction=" 3242e5b6d6dSopenharmony_ci const char *limit=uprv_strchr(corr, ','); 3252e5b6d6dSopenharmony_ci if(limit!=NULL) { 3262e5b6d6dSopenharmony_ci buffer.append(corr, limit-corr, errorCode); 3272e5b6d6dSopenharmony_ci names[3]=buffer.data(); 3282e5b6d6dSopenharmony_ci lengths[3]=(int16_t)(limit-corr); 3292e5b6d6dSopenharmony_ci } else { 3302e5b6d6dSopenharmony_ci names[3]=corr; 3312e5b6d6dSopenharmony_ci lengths[3]=(int16_t)uprv_strlen(corr); 3322e5b6d6dSopenharmony_ci } 3332e5b6d6dSopenharmony_ci parseName(names[3], lengths[3]); 3342e5b6d6dSopenharmony_ci } 3352e5b6d6dSopenharmony_ci } 3362e5b6d6dSopenharmony_ci 3372e5b6d6dSopenharmony_ci addLine(props.start, names, lengths, LENGTHOF(names)); 3382e5b6d6dSopenharmony_ci} 3392e5b6d6dSopenharmony_ci 3402e5b6d6dSopenharmony_cistatic void 3412e5b6d6dSopenharmony_ciparseName(const char *name, int16_t length) { 3422e5b6d6dSopenharmony_ci int16_t start=0, limit, wordLength/*, prevStart=-1*/; 3432e5b6d6dSopenharmony_ci Word *word; 3442e5b6d6dSopenharmony_ci 3452e5b6d6dSopenharmony_ci while(start<length) { 3462e5b6d6dSopenharmony_ci /* skip any "noise" characters */ 3472e5b6d6dSopenharmony_ci limit=skipNoise(name, start, length); 3482e5b6d6dSopenharmony_ci if(start<limit) { 3492e5b6d6dSopenharmony_ci /*prevStart=-1;*/ 3502e5b6d6dSopenharmony_ci start=limit; 3512e5b6d6dSopenharmony_ci } 
3522e5b6d6dSopenharmony_ci if(start==length) { 3532e5b6d6dSopenharmony_ci break; 3542e5b6d6dSopenharmony_ci } 3552e5b6d6dSopenharmony_ci 3562e5b6d6dSopenharmony_ci /* get a word and add it if it is longer than 1 */ 3572e5b6d6dSopenharmony_ci limit=getWord(name, start, length); 3582e5b6d6dSopenharmony_ci wordLength=(int16_t)(limit-start); 3592e5b6d6dSopenharmony_ci if(wordLength>1) { 3602e5b6d6dSopenharmony_ci word=findWord(name+start, wordLength); 3612e5b6d6dSopenharmony_ci if(word==NULL) { 3622e5b6d6dSopenharmony_ci word=addWord(name+start, wordLength); 3632e5b6d6dSopenharmony_ci } 3642e5b6d6dSopenharmony_ci countWord(word); 3652e5b6d6dSopenharmony_ci } 3662e5b6d6dSopenharmony_ci 3672e5b6d6dSopenharmony_ci#if 0 3682e5b6d6dSopenharmony_ci /* 3692e5b6d6dSopenharmony_ci * if there was a word before this 3702e5b6d6dSopenharmony_ci * (with no noise in between), then add the pair of words, too 3712e5b6d6dSopenharmony_ci */ 3722e5b6d6dSopenharmony_ci if(prevStart!=-1) { 3732e5b6d6dSopenharmony_ci wordLength=limit-prevStart; 3742e5b6d6dSopenharmony_ci word=findWord(name+prevStart, wordLength); 3752e5b6d6dSopenharmony_ci if(word==NULL) { 3762e5b6d6dSopenharmony_ci word=addWord(name+prevStart, wordLength); 3772e5b6d6dSopenharmony_ci } 3782e5b6d6dSopenharmony_ci countWord(word); 3792e5b6d6dSopenharmony_ci } 3802e5b6d6dSopenharmony_ci#endif 3812e5b6d6dSopenharmony_ci 3822e5b6d6dSopenharmony_ci /*prevStart=start;*/ 3832e5b6d6dSopenharmony_ci start=limit; 3842e5b6d6dSopenharmony_ci } 3852e5b6d6dSopenharmony_ci} 3862e5b6d6dSopenharmony_ci 3872e5b6d6dSopenharmony_cistatic UBool 3882e5b6d6dSopenharmony_ciisWordChar(char c) { 3892e5b6d6dSopenharmony_ci return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */ 3902e5b6d6dSopenharmony_ci ('J'<=c && c<='R') || 3912e5b6d6dSopenharmony_ci ('S'<=c && c<='Z') || 3922e5b6d6dSopenharmony_ci 3932e5b6d6dSopenharmony_ci ('0'<=c && c<='9'); 3942e5b6d6dSopenharmony_ci} 3952e5b6d6dSopenharmony_ci 3962e5b6d6dSopenharmony_cistatic int16_t 
3972e5b6d6dSopenharmony_ciskipNoise(const char *line, int16_t start, int16_t limit) { 3982e5b6d6dSopenharmony_ci /* skip anything that is not part of a word in this sense */ 3992e5b6d6dSopenharmony_ci while(start<limit && !isWordChar(line[start])) { 4002e5b6d6dSopenharmony_ci ++start; 4012e5b6d6dSopenharmony_ci } 4022e5b6d6dSopenharmony_ci 4032e5b6d6dSopenharmony_ci return start; 4042e5b6d6dSopenharmony_ci} 4052e5b6d6dSopenharmony_ci 4062e5b6d6dSopenharmony_cistatic int16_t 4072e5b6d6dSopenharmony_cigetWord(const char *line, int16_t start, int16_t limit) { 4082e5b6d6dSopenharmony_ci char c=0; /* initialize to avoid a compiler warning although the code was safe */ 4092e5b6d6dSopenharmony_ci 4102e5b6d6dSopenharmony_ci /* a unicode character name word consists of A-Z0-9 */ 4112e5b6d6dSopenharmony_ci while(start<limit && isWordChar(line[start])) { 4122e5b6d6dSopenharmony_ci ++start; 4132e5b6d6dSopenharmony_ci } 4142e5b6d6dSopenharmony_ci 4152e5b6d6dSopenharmony_ci /* include a following space or dash */ 4162e5b6d6dSopenharmony_ci if(start<limit && ((c=line[start])==' ' || c=='-')) { 4172e5b6d6dSopenharmony_ci ++start; 4182e5b6d6dSopenharmony_ci } 4192e5b6d6dSopenharmony_ci 4202e5b6d6dSopenharmony_ci return start; 4212e5b6d6dSopenharmony_ci} 4222e5b6d6dSopenharmony_ci 4232e5b6d6dSopenharmony_civoid 4242e5b6d6dSopenharmony_ciNamesPropsBuilder::setAlgNamesRange(UChar32 start, UChar32 end, 4252e5b6d6dSopenharmony_ci const char *type, 4262e5b6d6dSopenharmony_ci const char *prefix, // number of hex digits 4272e5b6d6dSopenharmony_ci UErrorCode &errorCode) { 4282e5b6d6dSopenharmony_ci /* modulo factors, maximum 8 */ 4292e5b6d6dSopenharmony_ci /* 3 factors: 19, 21, 28, most-to-least-significant */ 4302e5b6d6dSopenharmony_ci static const uint16_t hangulFactors[3]={ 4312e5b6d6dSopenharmony_ci 19, 21, 28 4322e5b6d6dSopenharmony_ci }; 4332e5b6d6dSopenharmony_ci 4342e5b6d6dSopenharmony_ci static const char jamo[]= 4352e5b6d6dSopenharmony_ci "HANGUL SYLLABLE \0" 
4362e5b6d6dSopenharmony_ci 4372e5b6d6dSopenharmony_ci "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0" 4382e5b6d6dSopenharmony_ci "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0" 4392e5b6d6dSopenharmony_ci 4402e5b6d6dSopenharmony_ci "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0" 4412e5b6d6dSopenharmony_ci "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0" 4422e5b6d6dSopenharmony_ci "YU\0EU\0YI\0I\0" 4432e5b6d6dSopenharmony_ci 4442e5b6d6dSopenharmony_ci "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0" 4452e5b6d6dSopenharmony_ci "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0" 4462e5b6d6dSopenharmony_ci "S\0SS\0NG\0J\0C\0K\0T\0P\0H"; 4472e5b6d6dSopenharmony_ci 4482e5b6d6dSopenharmony_ci int32_t prefixLength=0; 4492e5b6d6dSopenharmony_ci AlgorithmicRange range; 4502e5b6d6dSopenharmony_ci uprv_memset(&range, 0, sizeof(AlgorithmicRange)); 4512e5b6d6dSopenharmony_ci int32_t rangeSize=(int32_t)sizeof(AlgorithmicRange); 4522e5b6d6dSopenharmony_ci range.start=start; 4532e5b6d6dSopenharmony_ci range.end=end; 4542e5b6d6dSopenharmony_ci if(0==uprv_strcmp(type, "han")) { 4552e5b6d6dSopenharmony_ci range.type=0; 4562e5b6d6dSopenharmony_ci range.variant= end<=0xffff ? 
4 : 5; 4572e5b6d6dSopenharmony_ci prefixLength=uprv_strlen(prefix)+1; 4582e5b6d6dSopenharmony_ci rangeSize+=prefixLength; 4592e5b6d6dSopenharmony_ci } else if(0==uprv_strcmp(type, "hangul")) { 4602e5b6d6dSopenharmony_ci range.type=1; 4612e5b6d6dSopenharmony_ci range.variant=(uint8_t)LENGTHOF(hangulFactors); 4622e5b6d6dSopenharmony_ci rangeSize+=(int32_t)sizeof(hangulFactors); 4632e5b6d6dSopenharmony_ci rangeSize+=(int32_t)sizeof(jamo); 4642e5b6d6dSopenharmony_ci } else { 4652e5b6d6dSopenharmony_ci fprintf(stderr, "genprops error: unknown algnamesrange type '%s'\n", prefix); 4662e5b6d6dSopenharmony_ci errorCode=U_ILLEGAL_ARGUMENT_ERROR; 4672e5b6d6dSopenharmony_ci return; 4682e5b6d6dSopenharmony_ci } 4692e5b6d6dSopenharmony_ci int32_t paddingLength=rangeSize&3; 4702e5b6d6dSopenharmony_ci if(paddingLength) { 4712e5b6d6dSopenharmony_ci paddingLength=4-paddingLength; 4722e5b6d6dSopenharmony_ci rangeSize+=paddingLength; 4732e5b6d6dSopenharmony_ci } 4742e5b6d6dSopenharmony_ci range.size=(uint16_t)rangeSize; 4752e5b6d6dSopenharmony_ci algRanges.append((char *)&range, (int32_t)sizeof(AlgorithmicRange), errorCode); 4762e5b6d6dSopenharmony_ci if(range.type==0) { // han 4772e5b6d6dSopenharmony_ci algRanges.append(prefix, prefixLength, errorCode); 4782e5b6d6dSopenharmony_ci } else /* type==1 */ { // hangul 4792e5b6d6dSopenharmony_ci algRanges.append((char *)hangulFactors, (int32_t)sizeof(hangulFactors), errorCode); 4802e5b6d6dSopenharmony_ci algRanges.append(jamo, (int32_t)sizeof(jamo), errorCode); 4812e5b6d6dSopenharmony_ci } 4822e5b6d6dSopenharmony_ci while(paddingLength) { 4832e5b6d6dSopenharmony_ci algRanges.append((char)0xaa, errorCode); 4842e5b6d6dSopenharmony_ci --paddingLength; 4852e5b6d6dSopenharmony_ci } 4862e5b6d6dSopenharmony_ci ++countAlgRanges; 4872e5b6d6dSopenharmony_ci} 4882e5b6d6dSopenharmony_ci 4892e5b6d6dSopenharmony_ci/* compressing -------------------------------------------------------------- */ 4902e5b6d6dSopenharmony_ci 4912e5b6d6dSopenharmony_cistatic 
void 4922e5b6d6dSopenharmony_cicompress(UErrorCode &errorCode) { 4932e5b6d6dSopenharmony_ci uint32_t i, letterCount; 4942e5b6d6dSopenharmony_ci int16_t wordNumber; 4952e5b6d6dSopenharmony_ci 4962e5b6d6dSopenharmony_ci /* sort the words in reverse order by weight */ 4972e5b6d6dSopenharmony_ci uprv_sortArray(words, wordCount, sizeof(Word), 4982e5b6d6dSopenharmony_ci compareWords, NULL, false, &errorCode); 4992e5b6d6dSopenharmony_ci 5002e5b6d6dSopenharmony_ci /* remove the words that do not save anything */ 5012e5b6d6dSopenharmony_ci while(wordCount>0 && words[wordCount-1].weight<1) { 5022e5b6d6dSopenharmony_ci --wordCount; 5032e5b6d6dSopenharmony_ci } 5042e5b6d6dSopenharmony_ci 5052e5b6d6dSopenharmony_ci /* count the letters in the token range */ 5062e5b6d6dSopenharmony_ci letterCount=0; 5072e5b6d6dSopenharmony_ci for(i=LEADBYTE_LIMIT; i<256; ++i) { 5082e5b6d6dSopenharmony_ci if(tokens[i]==-1) { 5092e5b6d6dSopenharmony_ci ++letterCount; 5102e5b6d6dSopenharmony_ci } 5112e5b6d6dSopenharmony_ci } 5122e5b6d6dSopenharmony_ci if(!beQuiet) { 5132e5b6d6dSopenharmony_ci printf("number of letters used in the names: %d\n", (int)letterCount); 5142e5b6d6dSopenharmony_ci } 5152e5b6d6dSopenharmony_ci 5162e5b6d6dSopenharmony_ci /* do we need double-byte tokens? 
*/ 5172e5b6d6dSopenharmony_ci if(wordCount+letterCount<=256) { 5182e5b6d6dSopenharmony_ci /* no, single-byte tokens are enough */ 5192e5b6d6dSopenharmony_ci leadByteCount=0; 5202e5b6d6dSopenharmony_ci for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { 5212e5b6d6dSopenharmony_ci if(tokens[i]!=-1) { 5222e5b6d6dSopenharmony_ci tokens[i]=wordNumber; 5232e5b6d6dSopenharmony_ci if(beVerbose) { 5242e5b6d6dSopenharmony_ci printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 5252e5b6d6dSopenharmony_ci (int)i, (long)words[wordNumber].weight, 5262e5b6d6dSopenharmony_ci words[wordNumber].length, words[wordNumber].s); 5272e5b6d6dSopenharmony_ci } 5282e5b6d6dSopenharmony_ci ++wordNumber; 5292e5b6d6dSopenharmony_ci } 5302e5b6d6dSopenharmony_ci } 5312e5b6d6dSopenharmony_ci tokenCount=i; 5322e5b6d6dSopenharmony_ci } else { 5332e5b6d6dSopenharmony_ci /* 5342e5b6d6dSopenharmony_ci * The tokens that need two token bytes 5352e5b6d6dSopenharmony_ci * get their weight reduced by their count 5362e5b6d6dSopenharmony_ci * because they save less. 5372e5b6d6dSopenharmony_ci */ 5382e5b6d6dSopenharmony_ci tokenCount=256-letterCount; 5392e5b6d6dSopenharmony_ci for(i=tokenCount; i<wordCount; ++i) { 5402e5b6d6dSopenharmony_ci words[i].weight-=words[i].count; 5412e5b6d6dSopenharmony_ci } 5422e5b6d6dSopenharmony_ci 5432e5b6d6dSopenharmony_ci /* sort these words in reverse order by weight */ 5442e5b6d6dSopenharmony_ci errorCode=U_ZERO_ERROR; 5452e5b6d6dSopenharmony_ci uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word), 5462e5b6d6dSopenharmony_ci compareWords, NULL, false, &errorCode); 5472e5b6d6dSopenharmony_ci 5482e5b6d6dSopenharmony_ci /* remove the words that do not save anything */ 5492e5b6d6dSopenharmony_ci while(wordCount>0 && words[wordCount-1].weight<1) { 5502e5b6d6dSopenharmony_ci --wordCount; 5512e5b6d6dSopenharmony_ci } 5522e5b6d6dSopenharmony_ci 5532e5b6d6dSopenharmony_ci /* how many tokens and lead bytes do we have now? 
*/ 5542e5b6d6dSopenharmony_ci tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1); 5552e5b6d6dSopenharmony_ci /* 5562e5b6d6dSopenharmony_ci * adjust upwards to take into account that 5572e5b6d6dSopenharmony_ci * double-byte tokens must not 5582e5b6d6dSopenharmony_ci * use NAME_SEPARATOR_CHAR as a second byte 5592e5b6d6dSopenharmony_ci */ 5602e5b6d6dSopenharmony_ci tokenCount+=(tokenCount-256+254)/255; 5612e5b6d6dSopenharmony_ci 5622e5b6d6dSopenharmony_ci leadByteCount=(int16_t)(tokenCount>>8); 5632e5b6d6dSopenharmony_ci if(leadByteCount<LEADBYTE_LIMIT) { 5642e5b6d6dSopenharmony_ci /* adjust for the real number of lead bytes */ 5652e5b6d6dSopenharmony_ci tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount; 5662e5b6d6dSopenharmony_ci } else { 5672e5b6d6dSopenharmony_ci /* limit the number of lead bytes */ 5682e5b6d6dSopenharmony_ci leadByteCount=LEADBYTE_LIMIT-1; 5692e5b6d6dSopenharmony_ci tokenCount=LEADBYTE_LIMIT*256; 5702e5b6d6dSopenharmony_ci wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1); 5712e5b6d6dSopenharmony_ci /* adjust again to skip double-byte tokens with ';' */ 5722e5b6d6dSopenharmony_ci wordCount-=(tokenCount-256+254)/255; 5732e5b6d6dSopenharmony_ci } 5742e5b6d6dSopenharmony_ci 5752e5b6d6dSopenharmony_ci /* set token 0 to word 0 */ 5762e5b6d6dSopenharmony_ci tokens[0]=0; 5772e5b6d6dSopenharmony_ci if(beVerbose) { 5782e5b6d6dSopenharmony_ci printf("tokens[0x000]: word%8ld \"%.*s\"\n", 5792e5b6d6dSopenharmony_ci (long)words[0].weight, 5802e5b6d6dSopenharmony_ci words[0].length, words[0].s); 5812e5b6d6dSopenharmony_ci } 5822e5b6d6dSopenharmony_ci wordNumber=1; 5832e5b6d6dSopenharmony_ci 5842e5b6d6dSopenharmony_ci /* set the lead byte tokens */ 5852e5b6d6dSopenharmony_ci for(i=1; (int16_t)i<=leadByteCount; ++i) { 5862e5b6d6dSopenharmony_ci tokens[i]=-2; 5872e5b6d6dSopenharmony_ci } 5882e5b6d6dSopenharmony_ci 5892e5b6d6dSopenharmony_ci /* set the tokens */ 5902e5b6d6dSopenharmony_ci for(; i<256; ++i) { 5912e5b6d6dSopenharmony_ci /* if store10Names then the 
parser set tokens[NAME_SEPARATOR_CHAR]=-1 */ 5922e5b6d6dSopenharmony_ci if(tokens[i]!=-1) { 5932e5b6d6dSopenharmony_ci tokens[i]=wordNumber; 5942e5b6d6dSopenharmony_ci if(beVerbose) { 5952e5b6d6dSopenharmony_ci printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 5962e5b6d6dSopenharmony_ci (int)i, (long)words[wordNumber].weight, 5972e5b6d6dSopenharmony_ci words[wordNumber].length, words[wordNumber].s); 5982e5b6d6dSopenharmony_ci } 5992e5b6d6dSopenharmony_ci ++wordNumber; 6002e5b6d6dSopenharmony_ci } 6012e5b6d6dSopenharmony_ci } 6022e5b6d6dSopenharmony_ci 6032e5b6d6dSopenharmony_ci /* continue above 255 where there are no letters */ 6042e5b6d6dSopenharmony_ci for(; (uint32_t)wordNumber<wordCount; ++i) { 6052e5b6d6dSopenharmony_ci if((i&0xff)==NAME_SEPARATOR_CHAR) { 6062e5b6d6dSopenharmony_ci tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ 6072e5b6d6dSopenharmony_ci } else { 6082e5b6d6dSopenharmony_ci tokens[i]=wordNumber; 6092e5b6d6dSopenharmony_ci if(beVerbose) { 6102e5b6d6dSopenharmony_ci printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", 6112e5b6d6dSopenharmony_ci (int)i, (long)words[wordNumber].weight, 6122e5b6d6dSopenharmony_ci words[wordNumber].length, words[wordNumber].s); 6132e5b6d6dSopenharmony_ci } 6142e5b6d6dSopenharmony_ci ++wordNumber; 6152e5b6d6dSopenharmony_ci } 6162e5b6d6dSopenharmony_ci } 6172e5b6d6dSopenharmony_ci tokenCount=i; /* should be already tokenCount={i or i+1} */ 6182e5b6d6dSopenharmony_ci } 6192e5b6d6dSopenharmony_ci 6202e5b6d6dSopenharmony_ci if(!beQuiet) { 6212e5b6d6dSopenharmony_ci printf("number of lead bytes: %d\n", leadByteCount); 6222e5b6d6dSopenharmony_ci printf("number of single-byte tokens: %lu\n", 6232e5b6d6dSopenharmony_ci (unsigned long)256-letterCount-leadByteCount); 6242e5b6d6dSopenharmony_ci printf("number of tokens: %lu\n", (unsigned long)tokenCount); 6252e5b6d6dSopenharmony_ci } 6262e5b6d6dSopenharmony_ci 6272e5b6d6dSopenharmony_ci compressLines(); 6282e5b6d6dSopenharmony_ci} 6292e5b6d6dSopenharmony_ci 
6302e5b6d6dSopenharmony_cistatic void 6312e5b6d6dSopenharmony_cicompressLines() { 6322e5b6d6dSopenharmony_ci Line *line=NULL; 6332e5b6d6dSopenharmony_ci uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */, 6342e5b6d6dSopenharmony_ci groupMSB=0xffff, lineCount2; 6352e5b6d6dSopenharmony_ci int16_t groupTop=0; 6362e5b6d6dSopenharmony_ci 6372e5b6d6dSopenharmony_ci /* store the groups like lines, with compressed data after raw strings */ 6382e5b6d6dSopenharmony_ci groupBottom=lineTop; 6392e5b6d6dSopenharmony_ci lineCount2=lineCount; 6402e5b6d6dSopenharmony_ci lineCount=0; 6412e5b6d6dSopenharmony_ci 6422e5b6d6dSopenharmony_ci /* loop over all lines */ 6432e5b6d6dSopenharmony_ci while(i<lineCount2) { 6442e5b6d6dSopenharmony_ci line=lines+i++; 6452e5b6d6dSopenharmony_ci inLine=line->code; 6462e5b6d6dSopenharmony_ci 6472e5b6d6dSopenharmony_ci /* segment the lines to groups of 32 */ 6482e5b6d6dSopenharmony_ci if(inLine>>GROUP_SHIFT!=groupMSB) { 6492e5b6d6dSopenharmony_ci /* finish the current group with empty lines */ 6502e5b6d6dSopenharmony_ci while((++outLine&GROUP_MASK)!=0) { 6512e5b6d6dSopenharmony_ci appendLineLength(0); 6522e5b6d6dSopenharmony_ci } 6532e5b6d6dSopenharmony_ci 6542e5b6d6dSopenharmony_ci /* store the group like a line */ 6552e5b6d6dSopenharmony_ci if(groupTop>0) { 6562e5b6d6dSopenharmony_ci if(groupTop>GROUP_STORE_SIZE) { 6572e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: group store overflow\n"); 6582e5b6d6dSopenharmony_ci exit(U_BUFFER_OVERFLOW_ERROR); 6592e5b6d6dSopenharmony_ci } 6602e5b6d6dSopenharmony_ci addGroup(groupMSB, groupStore, groupTop); 6612e5b6d6dSopenharmony_ci } 6622e5b6d6dSopenharmony_ci 6632e5b6d6dSopenharmony_ci /* start the new group */ 6642e5b6d6dSopenharmony_ci lineLengthsTop=0; 6652e5b6d6dSopenharmony_ci groupTop=0; 6662e5b6d6dSopenharmony_ci groupMSB=inLine>>GROUP_SHIFT; 6672e5b6d6dSopenharmony_ci outLine=(inLine&~GROUP_MASK)-1; 6682e5b6d6dSopenharmony_ci } 6692e5b6d6dSopenharmony_ci 6702e5b6d6dSopenharmony_ci /* 
write empty lines between the previous line in the group and this one */ 6712e5b6d6dSopenharmony_ci while(++outLine<inLine) { 6722e5b6d6dSopenharmony_ci appendLineLength(0); 6732e5b6d6dSopenharmony_ci } 6742e5b6d6dSopenharmony_ci 6752e5b6d6dSopenharmony_ci /* write characters and tokens for this line */ 6762e5b6d6dSopenharmony_ci appendLineLength(compressLine(line->s, line->length, &groupTop)); 6772e5b6d6dSopenharmony_ci } 6782e5b6d6dSopenharmony_ci 6792e5b6d6dSopenharmony_ci /* finish and store the last group */ 6802e5b6d6dSopenharmony_ci if(line && groupMSB!=0xffff) { 6812e5b6d6dSopenharmony_ci /* finish the current group with empty lines */ 6822e5b6d6dSopenharmony_ci while((++outLine&GROUP_MASK)!=0) { 6832e5b6d6dSopenharmony_ci appendLineLength(0); 6842e5b6d6dSopenharmony_ci } 6852e5b6d6dSopenharmony_ci 6862e5b6d6dSopenharmony_ci /* store the group like a line */ 6872e5b6d6dSopenharmony_ci if(groupTop>0) { 6882e5b6d6dSopenharmony_ci if(groupTop>GROUP_STORE_SIZE) { 6892e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: group store overflow\n"); 6902e5b6d6dSopenharmony_ci exit(U_BUFFER_OVERFLOW_ERROR); 6912e5b6d6dSopenharmony_ci } 6922e5b6d6dSopenharmony_ci addGroup(groupMSB, groupStore, groupTop); 6932e5b6d6dSopenharmony_ci } 6942e5b6d6dSopenharmony_ci } 6952e5b6d6dSopenharmony_ci 6962e5b6d6dSopenharmony_ci if(!beQuiet) { 6972e5b6d6dSopenharmony_ci printf("number of groups: %lu\n", (unsigned long)lineCount); 6982e5b6d6dSopenharmony_ci } 6992e5b6d6dSopenharmony_ci} 7002e5b6d6dSopenharmony_ci 7012e5b6d6dSopenharmony_cistatic int16_t 7022e5b6d6dSopenharmony_cicompressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) { 7032e5b6d6dSopenharmony_ci int16_t start, limit, token, groupTop=*pGroupTop; 7042e5b6d6dSopenharmony_ci 7052e5b6d6dSopenharmony_ci start=0; 7062e5b6d6dSopenharmony_ci do { 7072e5b6d6dSopenharmony_ci /* write any "noise" characters */ 7082e5b6d6dSopenharmony_ci limit=skipNoise((char *)s, start, length); 7092e5b6d6dSopenharmony_ci while(start<limit) { 
7102e5b6d6dSopenharmony_ci groupStore[groupTop++]=s[start++]; 7112e5b6d6dSopenharmony_ci } 7122e5b6d6dSopenharmony_ci 7132e5b6d6dSopenharmony_ci if(start==length) { 7142e5b6d6dSopenharmony_ci break; 7152e5b6d6dSopenharmony_ci } 7162e5b6d6dSopenharmony_ci 7172e5b6d6dSopenharmony_ci /* write a word, as token or directly */ 7182e5b6d6dSopenharmony_ci limit=getWord((char *)s, start, length); 7192e5b6d6dSopenharmony_ci if(limit-start==1) { 7202e5b6d6dSopenharmony_ci groupStore[groupTop++]=s[start++]; 7212e5b6d6dSopenharmony_ci } else { 7222e5b6d6dSopenharmony_ci token=findToken(s+start, (int16_t)(limit-start)); 7232e5b6d6dSopenharmony_ci if(token!=-1) { 7242e5b6d6dSopenharmony_ci if(token>0xff) { 7252e5b6d6dSopenharmony_ci groupStore[groupTop++]=(uint8_t)(token>>8); 7262e5b6d6dSopenharmony_ci } 7272e5b6d6dSopenharmony_ci groupStore[groupTop++]=(uint8_t)token; 7282e5b6d6dSopenharmony_ci start=limit; 7292e5b6d6dSopenharmony_ci } else { 7302e5b6d6dSopenharmony_ci while(start<limit) { 7312e5b6d6dSopenharmony_ci groupStore[groupTop++]=s[start++]; 7322e5b6d6dSopenharmony_ci } 7332e5b6d6dSopenharmony_ci } 7342e5b6d6dSopenharmony_ci } 7352e5b6d6dSopenharmony_ci } while(start<length); 7362e5b6d6dSopenharmony_ci 7372e5b6d6dSopenharmony_ci length=(int16_t)(groupTop-*pGroupTop); 7382e5b6d6dSopenharmony_ci *pGroupTop=groupTop; 7392e5b6d6dSopenharmony_ci return length; 7402e5b6d6dSopenharmony_ci} 7412e5b6d6dSopenharmony_ci 7422e5b6d6dSopenharmony_cistatic int32_t 7432e5b6d6dSopenharmony_cicompareWords(const void *context, const void *word1, const void *word2) { 7442e5b6d6dSopenharmony_ci /* reverse sort by word weight */ 7452e5b6d6dSopenharmony_ci return ((Word *)word2)->weight-((Word *)word1)->weight; 7462e5b6d6dSopenharmony_ci} 7472e5b6d6dSopenharmony_ci 7482e5b6d6dSopenharmony_civoid 7492e5b6d6dSopenharmony_ciNamesPropsBuilder::build(UErrorCode &errorCode) { 7502e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return; } 7512e5b6d6dSopenharmony_ci 7522e5b6d6dSopenharmony_ci 
if(!beQuiet) { 7532e5b6d6dSopenharmony_ci puts("* unames.icu stats *"); 7542e5b6d6dSopenharmony_ci printf("size of all names in the database: %lu\n", 7552e5b6d6dSopenharmony_ci (unsigned long)lineTop); 7562e5b6d6dSopenharmony_ci printf("number of named Unicode characters: %lu\n", 7572e5b6d6dSopenharmony_ci (unsigned long)lineCount); 7582e5b6d6dSopenharmony_ci printf("number of words in the dictionary from these names: %lu\n", 7592e5b6d6dSopenharmony_ci (unsigned long)wordCount); 7602e5b6d6dSopenharmony_ci } 7612e5b6d6dSopenharmony_ci compress(errorCode); 7622e5b6d6dSopenharmony_ci} 7632e5b6d6dSopenharmony_ci 7642e5b6d6dSopenharmony_ci/* generate output data ----------------------------------------------------- */ 7652e5b6d6dSopenharmony_ci 7662e5b6d6dSopenharmony_civoid 7672e5b6d6dSopenharmony_ciNamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) { 7682e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return; } 7692e5b6d6dSopenharmony_ci 7702e5b6d6dSopenharmony_ci UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo, 7712e5b6d6dSopenharmony_ci withCopyright ? 
U_COPYRIGHT_STRING : NULL, &errorCode); 7722e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { 7732e5b6d6dSopenharmony_ci fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n", 7742e5b6d6dSopenharmony_ci path, u_errorName(errorCode)); 7752e5b6d6dSopenharmony_ci return; 7762e5b6d6dSopenharmony_ci } 7772e5b6d6dSopenharmony_ci 7782e5b6d6dSopenharmony_ci uint16_t groupWords[3]; 7792e5b6d6dSopenharmony_ci uint32_t i, groupTop=lineTop, size, 7802e5b6d6dSopenharmony_ci tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; 7812e5b6d6dSopenharmony_ci long dataLength; 7822e5b6d6dSopenharmony_ci int16_t token; 7832e5b6d6dSopenharmony_ci 7842e5b6d6dSopenharmony_ci /* first, see how much space we need, and prepare the token strings */ 7852e5b6d6dSopenharmony_ci for(i=0; i<tokenCount; ++i) { 7862e5b6d6dSopenharmony_ci token=tokens[i]; 7872e5b6d6dSopenharmony_ci if(token!=-1 && token!=-2) { 7882e5b6d6dSopenharmony_ci tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop); 7892e5b6d6dSopenharmony_ci } 7902e5b6d6dSopenharmony_ci } 7912e5b6d6dSopenharmony_ci 7922e5b6d6dSopenharmony_ci /* 7932e5b6d6dSopenharmony_ci * Required padding for data swapping: 7942e5b6d6dSopenharmony_ci * The token table undergoes a permutation during data swapping when the 7952e5b6d6dSopenharmony_ci * input and output charsets are different. 7962e5b6d6dSopenharmony_ci * The token table cannot grow during swapping, so we need to make sure that 7972e5b6d6dSopenharmony_ci * the table is long enough for successful in-place permutation. 7982e5b6d6dSopenharmony_ci * 7992e5b6d6dSopenharmony_ci * We simply round up tokenCount to the next multiple of 256 to account for 8002e5b6d6dSopenharmony_ci * all possible permutations. 
8012e5b6d6dSopenharmony_ci * 8022e5b6d6dSopenharmony_ci * An optimization is possible if we only ever swap between ASCII and EBCDIC: 8032e5b6d6dSopenharmony_ci * 8042e5b6d6dSopenharmony_ci * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used 8052e5b6d6dSopenharmony_ci * and will be swapped between ASCII and EBCDIC between 8062e5b6d6dSopenharmony_ci * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon). 8072e5b6d6dSopenharmony_ci * This should be the only -1 entry in tokens[256..511] on which the data 8082e5b6d6dSopenharmony_ci * swapper bases its trail byte permutation map (trailMap[]). 8092e5b6d6dSopenharmony_ci * 8102e5b6d6dSopenharmony_ci * It would be sufficient to increase tokenCount so that its lower 8 bits 8112e5b6d6dSopenharmony_ci * are at least 0x5e+1 to make room for swapping between the two semicolons. 8122e5b6d6dSopenharmony_ci * For values higher than 0x5e, the trail byte permutation map (trailMap[]) 8132e5b6d6dSopenharmony_ci * should always be an identity map, where we do not need additional room. 
8142e5b6d6dSopenharmony_ci */ 8152e5b6d6dSopenharmony_ci i=tokenCount; 8162e5b6d6dSopenharmony_ci tokenCount=(tokenCount+0xff)&~0xff; 8172e5b6d6dSopenharmony_ci if(!beQuiet && i<tokenCount) { 8182e5b6d6dSopenharmony_ci printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i)); 8192e5b6d6dSopenharmony_ci } 8202e5b6d6dSopenharmony_ci for(; i<tokenCount; ++i) { 8212e5b6d6dSopenharmony_ci if((i&0xff)==NAME_SEPARATOR_CHAR) { 8222e5b6d6dSopenharmony_ci tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ 8232e5b6d6dSopenharmony_ci } else { 8242e5b6d6dSopenharmony_ci tokens[i]=0; /* unused token for padding */ 8252e5b6d6dSopenharmony_ci } 8262e5b6d6dSopenharmony_ci } 8272e5b6d6dSopenharmony_ci 8282e5b6d6dSopenharmony_ci /* 8292e5b6d6dSopenharmony_ci * Calculate the total size in bytes of the data including: 8302e5b6d6dSopenharmony_ci * - the offset to the token strings, uint32_t (4) 8312e5b6d6dSopenharmony_ci * - the offset to the group table, uint32_t (4) 8322e5b6d6dSopenharmony_ci * - the offset to the group strings, uint32_t (4) 8332e5b6d6dSopenharmony_ci * - the offset to the algorithmic names, uint32_t (4) 8342e5b6d6dSopenharmony_ci * 8352e5b6d6dSopenharmony_ci * - the number of tokens, uint16_t (2) 8362e5b6d6dSopenharmony_ci * - the token table, uint16_t[tokenCount] (2*tokenCount) 8372e5b6d6dSopenharmony_ci * 8382e5b6d6dSopenharmony_ci * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded 8392e5b6d6dSopenharmony_ci * 8402e5b6d6dSopenharmony_ci * - the number of groups, uint16_t (2) 8412e5b6d6dSopenharmony_ci * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount] 8422e5b6d6dSopenharmony_ci * 8432e5b6d6dSopenharmony_ci * - the group strings (groupTop-groupBottom), 2-padded 8442e5b6d6dSopenharmony_ci * 8452e5b6d6dSopenharmony_ci * - the size of the data for the algorithmic names 8462e5b6d6dSopenharmony_ci */ 
8472e5b6d6dSopenharmony_ci tokenStringOffset=4+4+4+4+2+2*tokenCount; 8482e5b6d6dSopenharmony_ci groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1; 8492e5b6d6dSopenharmony_ci groupStringOffset=groupsOffset+2+6*lineCount; 8502e5b6d6dSopenharmony_ci algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3; 8512e5b6d6dSopenharmony_ci 8522e5b6d6dSopenharmony_ci size=algNamesOffset+4+algRanges.length(); 8532e5b6d6dSopenharmony_ci 8542e5b6d6dSopenharmony_ci if(!beQuiet) { 8552e5b6d6dSopenharmony_ci printf("size of the Unicode Names data:\n" 8562e5b6d6dSopenharmony_ci "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n", 8572e5b6d6dSopenharmony_ci (unsigned long)size, (unsigned long)(lineTop-groupTop), 8582e5b6d6dSopenharmony_ci (unsigned long)(groupTop-groupBottom), (unsigned long)(4+algRanges.length())); 8592e5b6d6dSopenharmony_ci } 8602e5b6d6dSopenharmony_ci 8612e5b6d6dSopenharmony_ci /* write the data to the file */ 8622e5b6d6dSopenharmony_ci /* offsets */ 8632e5b6d6dSopenharmony_ci udata_write32(pData, tokenStringOffset); 8642e5b6d6dSopenharmony_ci udata_write32(pData, groupsOffset); 8652e5b6d6dSopenharmony_ci udata_write32(pData, groupStringOffset); 8662e5b6d6dSopenharmony_ci udata_write32(pData, algNamesOffset); 8672e5b6d6dSopenharmony_ci 8682e5b6d6dSopenharmony_ci /* token table */ 8692e5b6d6dSopenharmony_ci udata_write16(pData, (uint16_t)tokenCount); 8702e5b6d6dSopenharmony_ci udata_writeBlock(pData, tokens, 2*tokenCount); 8712e5b6d6dSopenharmony_ci 8722e5b6d6dSopenharmony_ci /* token strings */ 8732e5b6d6dSopenharmony_ci udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop); 8742e5b6d6dSopenharmony_ci if((lineTop-groupTop)&1) { 8752e5b6d6dSopenharmony_ci /* 2-padding */ 8762e5b6d6dSopenharmony_ci udata_writePadding(pData, 1); 8772e5b6d6dSopenharmony_ci } 8782e5b6d6dSopenharmony_ci 8792e5b6d6dSopenharmony_ci /* group table */ 8802e5b6d6dSopenharmony_ci udata_write16(pData, (uint16_t)lineCount); 
8812e5b6d6dSopenharmony_ci for(i=0; i<lineCount; ++i) { 8822e5b6d6dSopenharmony_ci /* groupMSB */ 8832e5b6d6dSopenharmony_ci groupWords[0]=(uint16_t)lines[i].code; 8842e5b6d6dSopenharmony_ci 8852e5b6d6dSopenharmony_ci /* offset */ 8862e5b6d6dSopenharmony_ci uint32_t offset = (uint32_t)((lines[i].s - stringStore)-groupBottom); 8872e5b6d6dSopenharmony_ci groupWords[1]=(uint16_t)(offset>>16); 8882e5b6d6dSopenharmony_ci groupWords[2]=(uint16_t)(offset); 8892e5b6d6dSopenharmony_ci udata_writeBlock(pData, groupWords, 6); 8902e5b6d6dSopenharmony_ci } 8912e5b6d6dSopenharmony_ci 8922e5b6d6dSopenharmony_ci /* group strings */ 8932e5b6d6dSopenharmony_ci udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom); 8942e5b6d6dSopenharmony_ci 8952e5b6d6dSopenharmony_ci /* 4-align the algorithmic names data */ 8962e5b6d6dSopenharmony_ci udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom))); 8972e5b6d6dSopenharmony_ci 8982e5b6d6dSopenharmony_ci udata_write32(pData, countAlgRanges); 8992e5b6d6dSopenharmony_ci udata_writeBlock(pData, algRanges.data(), algRanges.length()); 9002e5b6d6dSopenharmony_ci 9012e5b6d6dSopenharmony_ci /* finish up */ 9022e5b6d6dSopenharmony_ci dataLength=udata_finish(pData, &errorCode); 9032e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { 9042e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: error %d writing the output file\n", errorCode); 9052e5b6d6dSopenharmony_ci exit(errorCode); 9062e5b6d6dSopenharmony_ci } 9072e5b6d6dSopenharmony_ci 9082e5b6d6dSopenharmony_ci if(dataLength!=(long)size) { 9092e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: data length %ld != calculated size %lu\n", 9102e5b6d6dSopenharmony_cidataLength, (unsigned long)size); 9112e5b6d6dSopenharmony_ci exit(U_INTERNAL_PROGRAM_ERROR); 9122e5b6d6dSopenharmony_ci } 9132e5b6d6dSopenharmony_ci} 9142e5b6d6dSopenharmony_ci 9152e5b6d6dSopenharmony_ci/* helpers ------------------------------------------------------------------ */ 
9162e5b6d6dSopenharmony_ci 9172e5b6d6dSopenharmony_cistatic int16_t 9182e5b6d6dSopenharmony_cifindToken(uint8_t *s, int16_t length) { 9192e5b6d6dSopenharmony_ci int16_t i, token; 9202e5b6d6dSopenharmony_ci 9212e5b6d6dSopenharmony_ci for(i=0; i<(int16_t)tokenCount; ++i) { 9222e5b6d6dSopenharmony_ci token=tokens[i]; 9232e5b6d6dSopenharmony_ci if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) { 9242e5b6d6dSopenharmony_ci return i; 9252e5b6d6dSopenharmony_ci } 9262e5b6d6dSopenharmony_ci } 9272e5b6d6dSopenharmony_ci 9282e5b6d6dSopenharmony_ci return -1; 9292e5b6d6dSopenharmony_ci} 9302e5b6d6dSopenharmony_ci 9312e5b6d6dSopenharmony_cistatic Word * 9322e5b6d6dSopenharmony_cifindWord(const char *s, int16_t length) { 9332e5b6d6dSopenharmony_ci uint32_t i; 9342e5b6d6dSopenharmony_ci 9352e5b6d6dSopenharmony_ci for(i=0; i<wordCount; ++i) { 9362e5b6d6dSopenharmony_ci if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) { 9372e5b6d6dSopenharmony_ci return words+i; 9382e5b6d6dSopenharmony_ci } 9392e5b6d6dSopenharmony_ci } 9402e5b6d6dSopenharmony_ci 9412e5b6d6dSopenharmony_ci return NULL; 9422e5b6d6dSopenharmony_ci} 9432e5b6d6dSopenharmony_ci 9442e5b6d6dSopenharmony_cistatic Word * 9452e5b6d6dSopenharmony_ciaddWord(const char *s, int16_t length) { 9462e5b6d6dSopenharmony_ci uint8_t *stringStart; 9472e5b6d6dSopenharmony_ci Word *word; 9482e5b6d6dSopenharmony_ci 9492e5b6d6dSopenharmony_ci if(wordCount==MAX_WORD_COUNT) { 9502e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: too many words\n"); 9512e5b6d6dSopenharmony_ci exit(U_BUFFER_OVERFLOW_ERROR); 9522e5b6d6dSopenharmony_ci } 9532e5b6d6dSopenharmony_ci 9542e5b6d6dSopenharmony_ci stringStart=allocWord(length); 9552e5b6d6dSopenharmony_ci uprv_memcpy(stringStart, s, length); 9562e5b6d6dSopenharmony_ci 9572e5b6d6dSopenharmony_ci word=words+wordCount; 9582e5b6d6dSopenharmony_ci 9592e5b6d6dSopenharmony_ci /* 9602e5b6d6dSopenharmony_ci * Initialize the weight with the costs for this 
token: 9612e5b6d6dSopenharmony_ci * a zero-terminated string and a 16-bit offset. 9622e5b6d6dSopenharmony_ci */ 9632e5b6d6dSopenharmony_ci word->weight=-(length+1+2); 9642e5b6d6dSopenharmony_ci word->count=0; 9652e5b6d6dSopenharmony_ci word->length=length; 9662e5b6d6dSopenharmony_ci word->s=stringStart; 9672e5b6d6dSopenharmony_ci 9682e5b6d6dSopenharmony_ci ++wordCount; 9692e5b6d6dSopenharmony_ci 9702e5b6d6dSopenharmony_ci return word; 9712e5b6d6dSopenharmony_ci} 9722e5b6d6dSopenharmony_ci 9732e5b6d6dSopenharmony_cistatic void 9742e5b6d6dSopenharmony_cicountWord(Word *word) { 9752e5b6d6dSopenharmony_ci /* add to the weight the savings: the length of the word minus 1 byte for the token */ 9762e5b6d6dSopenharmony_ci word->weight+=word->length-1; 9772e5b6d6dSopenharmony_ci ++word->count; 9782e5b6d6dSopenharmony_ci} 9792e5b6d6dSopenharmony_ci 9802e5b6d6dSopenharmony_cistatic void 9812e5b6d6dSopenharmony_ciaddLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count) { 9822e5b6d6dSopenharmony_ci uint8_t *stringStart; 9832e5b6d6dSopenharmony_ci Line *line; 9842e5b6d6dSopenharmony_ci int16_t i, length; 9852e5b6d6dSopenharmony_ci 9862e5b6d6dSopenharmony_ci if(lineCount==MAX_LINE_COUNT) { 9872e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: too many lines\n"); 9882e5b6d6dSopenharmony_ci exit(U_BUFFER_OVERFLOW_ERROR); 9892e5b6d6dSopenharmony_ci } 9902e5b6d6dSopenharmony_ci 9912e5b6d6dSopenharmony_ci /* find the last non-empty name */ 9922e5b6d6dSopenharmony_ci while(count>0 && lengths[count-1]==0) { 9932e5b6d6dSopenharmony_ci --count; 9942e5b6d6dSopenharmony_ci } 9952e5b6d6dSopenharmony_ci if(count==0) { 9962e5b6d6dSopenharmony_ci return; /* should not occur: caller should not have called */ 9972e5b6d6dSopenharmony_ci } 9982e5b6d6dSopenharmony_ci 9992e5b6d6dSopenharmony_ci /* there will be (count-1) separator characters */ 10002e5b6d6dSopenharmony_ci i=count; 10012e5b6d6dSopenharmony_ci length=count-1; 10022e5b6d6dSopenharmony_ci 10032e5b6d6dSopenharmony_ci /* 
add lengths of strings */ 10042e5b6d6dSopenharmony_ci while(i>0) { 10052e5b6d6dSopenharmony_ci length+=lengths[--i]; 10062e5b6d6dSopenharmony_ci } 10072e5b6d6dSopenharmony_ci 10082e5b6d6dSopenharmony_ci /* allocate line memory */ 10092e5b6d6dSopenharmony_ci stringStart=allocLine(length); 10102e5b6d6dSopenharmony_ci 10112e5b6d6dSopenharmony_ci /* copy all strings into the line memory */ 10122e5b6d6dSopenharmony_ci length=0; /* number of chars copied so far */ 10132e5b6d6dSopenharmony_ci for(i=0; i<count; ++i) { 10142e5b6d6dSopenharmony_ci if(i>0) { 10152e5b6d6dSopenharmony_ci stringStart[length++]=NAME_SEPARATOR_CHAR; 10162e5b6d6dSopenharmony_ci } 10172e5b6d6dSopenharmony_ci if(lengths[i]>0) { 10182e5b6d6dSopenharmony_ci uprv_memcpy(stringStart+length, names[i], lengths[i]); 10192e5b6d6dSopenharmony_ci length+=lengths[i]; 10202e5b6d6dSopenharmony_ci } 10212e5b6d6dSopenharmony_ci } 10222e5b6d6dSopenharmony_ci 10232e5b6d6dSopenharmony_ci line=lines+lineCount; 10242e5b6d6dSopenharmony_ci 10252e5b6d6dSopenharmony_ci line->code=code; 10262e5b6d6dSopenharmony_ci line->length=length; 10272e5b6d6dSopenharmony_ci line->s=stringStart; 10282e5b6d6dSopenharmony_ci 10292e5b6d6dSopenharmony_ci ++lineCount; 10302e5b6d6dSopenharmony_ci 10312e5b6d6dSopenharmony_ci /* prevent a character value that is actually in a name from becoming a token */ 10322e5b6d6dSopenharmony_ci while(length>0) { 10332e5b6d6dSopenharmony_ci tokens[stringStart[--length]]=-1; 10342e5b6d6dSopenharmony_ci } 10352e5b6d6dSopenharmony_ci} 10362e5b6d6dSopenharmony_ci 10372e5b6d6dSopenharmony_cistatic void 10382e5b6d6dSopenharmony_ciaddGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) { 10392e5b6d6dSopenharmony_ci uint8_t *stringStart; 10402e5b6d6dSopenharmony_ci Line *line; 10412e5b6d6dSopenharmony_ci 10422e5b6d6dSopenharmony_ci if(lineCount==MAX_LINE_COUNT) { 10432e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: too many groups\n"); 10442e5b6d6dSopenharmony_ci exit(U_BUFFER_OVERFLOW_ERROR); 
10452e5b6d6dSopenharmony_ci } 10462e5b6d6dSopenharmony_ci 10472e5b6d6dSopenharmony_ci /* store the line lengths first, then the strings */ 10482e5b6d6dSopenharmony_ci lineLengthsTop=(lineLengthsTop+1)/2; 10492e5b6d6dSopenharmony_ci stringStart=allocLine(lineLengthsTop+length); 10502e5b6d6dSopenharmony_ci uprv_memcpy(stringStart, lineLengths, lineLengthsTop); 10512e5b6d6dSopenharmony_ci uprv_memcpy(stringStart+lineLengthsTop, strings, length); 10522e5b6d6dSopenharmony_ci 10532e5b6d6dSopenharmony_ci line=lines+lineCount; 10542e5b6d6dSopenharmony_ci 10552e5b6d6dSopenharmony_ci line->code=groupMSB; 10562e5b6d6dSopenharmony_ci line->length=length; 10572e5b6d6dSopenharmony_ci line->s=stringStart; 10582e5b6d6dSopenharmony_ci 10592e5b6d6dSopenharmony_ci ++lineCount; 10602e5b6d6dSopenharmony_ci} 10612e5b6d6dSopenharmony_ci 10622e5b6d6dSopenharmony_cistatic uint32_t 10632e5b6d6dSopenharmony_ciaddToken(uint8_t *s, int16_t length) { 10642e5b6d6dSopenharmony_ci uint8_t *stringStart; 10652e5b6d6dSopenharmony_ci 10662e5b6d6dSopenharmony_ci stringStart=allocLine(length+1); 10672e5b6d6dSopenharmony_ci uprv_memcpy(stringStart, s, length); 10682e5b6d6dSopenharmony_ci stringStart[length]=0; 10692e5b6d6dSopenharmony_ci 10702e5b6d6dSopenharmony_ci return (uint32_t)(stringStart - stringStore); 10712e5b6d6dSopenharmony_ci} 10722e5b6d6dSopenharmony_ci 10732e5b6d6dSopenharmony_cistatic void 10742e5b6d6dSopenharmony_ciappendLineLength(int16_t length) { 10752e5b6d6dSopenharmony_ci if(length>=76) { 10762e5b6d6dSopenharmony_ci fprintf(stderr, "gennames: compressed line too long\n"); 10772e5b6d6dSopenharmony_ci exit(U_BUFFER_OVERFLOW_ERROR); 10782e5b6d6dSopenharmony_ci } 10792e5b6d6dSopenharmony_ci if(length>=12) { 10802e5b6d6dSopenharmony_ci length-=12; 10812e5b6d6dSopenharmony_ci appendLineLengthNibble((uint8_t)((length>>4)|12)); 10822e5b6d6dSopenharmony_ci } 10832e5b6d6dSopenharmony_ci appendLineLengthNibble((uint8_t)length); 10842e5b6d6dSopenharmony_ci} 10852e5b6d6dSopenharmony_ci 
10862e5b6d6dSopenharmony_cistatic void 10872e5b6d6dSopenharmony_ciappendLineLengthNibble(uint8_t nibble) { 10882e5b6d6dSopenharmony_ci if((lineLengthsTop&1)==0) { 10892e5b6d6dSopenharmony_ci lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4); 10902e5b6d6dSopenharmony_ci } else { 10912e5b6d6dSopenharmony_ci lineLengths[lineLengthsTop/2]|=nibble&0xf; 10922e5b6d6dSopenharmony_ci } 10932e5b6d6dSopenharmony_ci ++lineLengthsTop; 10942e5b6d6dSopenharmony_ci} 10952e5b6d6dSopenharmony_ci 10962e5b6d6dSopenharmony_cistatic uint8_t * 10972e5b6d6dSopenharmony_ciallocLine(int32_t length) { 10982e5b6d6dSopenharmony_ci uint32_t top=lineTop+length; 10992e5b6d6dSopenharmony_ci uint8_t *p; 11002e5b6d6dSopenharmony_ci 11012e5b6d6dSopenharmony_ci if(top>wordBottom) { 11022e5b6d6dSopenharmony_ci fprintf(stderr, "gennames allocLine(): out of memory\n"); 11032e5b6d6dSopenharmony_ci exit(U_MEMORY_ALLOCATION_ERROR); 11042e5b6d6dSopenharmony_ci } 11052e5b6d6dSopenharmony_ci p=stringStore+lineTop; 11062e5b6d6dSopenharmony_ci lineTop=top; 11072e5b6d6dSopenharmony_ci return p; 11082e5b6d6dSopenharmony_ci} 11092e5b6d6dSopenharmony_ci 11102e5b6d6dSopenharmony_cistatic uint8_t * 11112e5b6d6dSopenharmony_ciallocWord(uint32_t length) { 11122e5b6d6dSopenharmony_ci uint32_t bottom=wordBottom-length; 11132e5b6d6dSopenharmony_ci 11142e5b6d6dSopenharmony_ci if(lineTop>bottom) { 11152e5b6d6dSopenharmony_ci fprintf(stderr, "gennames allocWord(): out of memory\n"); 11162e5b6d6dSopenharmony_ci exit(U_MEMORY_ALLOCATION_ERROR); 11172e5b6d6dSopenharmony_ci } 11182e5b6d6dSopenharmony_ci wordBottom=bottom; 11192e5b6d6dSopenharmony_ci return stringStore+bottom; 11202e5b6d6dSopenharmony_ci} 11212e5b6d6dSopenharmony_ci 11222e5b6d6dSopenharmony_ciPropsBuilder * 11232e5b6d6dSopenharmony_cicreateNamesPropsBuilder(UErrorCode &errorCode) { 11242e5b6d6dSopenharmony_ci if(U_FAILURE(errorCode)) { return NULL; } 11252e5b6d6dSopenharmony_ci PropsBuilder *pb=new NamesPropsBuilder(errorCode); 11262e5b6d6dSopenharmony_ci 
if(pb==NULL) { 11272e5b6d6dSopenharmony_ci errorCode=U_MEMORY_ALLOCATION_ERROR; 11282e5b6d6dSopenharmony_ci } 11292e5b6d6dSopenharmony_ci return pb; 11302e5b6d6dSopenharmony_ci} 11312e5b6d6dSopenharmony_ci 11322e5b6d6dSopenharmony_ci/* 11332e5b6d6dSopenharmony_ci * Hey, Emacs, please set the following: 11342e5b6d6dSopenharmony_ci * 11352e5b6d6dSopenharmony_ci * Local Variables: 11362e5b6d6dSopenharmony_ci * indent-tabs-mode: nil 11372e5b6d6dSopenharmony_ci * End: 11382e5b6d6dSopenharmony_ci * 11392e5b6d6dSopenharmony_ci */ 1140