12e5b6d6dSopenharmony_ci// © 2017 and later: Unicode, Inc. and others.
22e5b6d6dSopenharmony_ci// License & terms of use: http://www.unicode.org/copyright.html
32e5b6d6dSopenharmony_ci/*
42e5b6d6dSopenharmony_ci*******************************************************************************
52e5b6d6dSopenharmony_ci*
62e5b6d6dSopenharmony_ci*   Copyright (C) 1999-2015, International Business Machines
72e5b6d6dSopenharmony_ci*   Corporation and others.  All Rights Reserved.
82e5b6d6dSopenharmony_ci*
92e5b6d6dSopenharmony_ci*******************************************************************************
102e5b6d6dSopenharmony_ci*   file name:  namespropsbuilder.cpp (was gennames/gennames.c)
112e5b6d6dSopenharmony_ci*   encoding:   US-ASCII
122e5b6d6dSopenharmony_ci*   tab size:   8 (not used)
132e5b6d6dSopenharmony_ci*   indentation:4
142e5b6d6dSopenharmony_ci*
152e5b6d6dSopenharmony_ci*   created on: 1999sep30
162e5b6d6dSopenharmony_ci*   created by: Markus W. Scherer
172e5b6d6dSopenharmony_ci*
182e5b6d6dSopenharmony_ci*   This builder reads Unicode character names and aliases,
192e5b6d6dSopenharmony_ci*   tokenizes and compresses them, and builds
202e5b6d6dSopenharmony_ci*   compact binary tables for random-access lookup
212e5b6d6dSopenharmony_ci*   in a u_charName() API function.
222e5b6d6dSopenharmony_ci*
232e5b6d6dSopenharmony_ci* unames.icu file format (after UDataInfo header etc. - see udata.c)
242e5b6d6dSopenharmony_ci* (all data is static const)
252e5b6d6dSopenharmony_ci*
262e5b6d6dSopenharmony_ci* UDataInfo fields:
272e5b6d6dSopenharmony_ci*   dataFormat "unam"
282e5b6d6dSopenharmony_ci*   formatVersion 1.0
292e5b6d6dSopenharmony_ci*   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
302e5b6d6dSopenharmony_ci*
312e5b6d6dSopenharmony_ci* -- data-based names
322e5b6d6dSopenharmony_ci* uint32_t tokenStringOffset,
332e5b6d6dSopenharmony_ci*          groupsOffset,
342e5b6d6dSopenharmony_ci*          groupStringOffset,
352e5b6d6dSopenharmony_ci*          algNamesOffset;
362e5b6d6dSopenharmony_ci*
372e5b6d6dSopenharmony_ci* uint16_t tokenCount;
382e5b6d6dSopenharmony_ci* uint16_t tokenTable[tokenCount];
392e5b6d6dSopenharmony_ci*
402e5b6d6dSopenharmony_ci* char     tokenStrings[]; -- padded to even count
412e5b6d6dSopenharmony_ci*
422e5b6d6dSopenharmony_ci* -- strings (groupStrings) are tokenized as follows:
432e5b6d6dSopenharmony_ci*   for each character c
442e5b6d6dSopenharmony_ci*       if(c>=tokenCount) write that character c directly
452e5b6d6dSopenharmony_ci*   else
462e5b6d6dSopenharmony_ci*       token=tokenTable[c];
472e5b6d6dSopenharmony_ci*       if(token==0xfffe) -- lead byte of double-byte token
482e5b6d6dSopenharmony_ci*           token=tokenTable[c<<8|next character];
492e5b6d6dSopenharmony_ci*       if(token==-1)
502e5b6d6dSopenharmony_ci*           write c directly
512e5b6d6dSopenharmony_ci*       else
522e5b6d6dSopenharmony_ci*           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
532e5b6d6dSopenharmony_ci*           append zero-terminated tokenString;
542e5b6d6dSopenharmony_ci*
552e5b6d6dSopenharmony_ci*    Different strings for a code point - normal name, 1.0 name, and ISO comment -
562e5b6d6dSopenharmony_ci*    are separated by ';'.
572e5b6d6dSopenharmony_ci*
582e5b6d6dSopenharmony_ci* uint16_t groupCount;
592e5b6d6dSopenharmony_ci* struct {
602e5b6d6dSopenharmony_ci*   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
612e5b6d6dSopenharmony_ci*   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
622e5b6d6dSopenharmony_ci*   uint16_t offsetLow;
632e5b6d6dSopenharmony_ci* } groupTable[groupCount];
642e5b6d6dSopenharmony_ci*
652e5b6d6dSopenharmony_ci* char     groupStrings[]; -- padded to 4-count
662e5b6d6dSopenharmony_ci*
672e5b6d6dSopenharmony_ci* -- The actual, tokenized group strings are not zero-terminated because
682e5b6d6dSopenharmony_ci*   that would take up too much space.
*   Instead, they are preceded by their length, written in a variable-length sequence:
702e5b6d6dSopenharmony_ci*   For each of the 32 group strings, one or two nibbles are stored for its length.
712e5b6d6dSopenharmony_ci*   Nibbles (4-bit values, half-bytes) are read MSB first.
722e5b6d6dSopenharmony_ci*   A nibble with a value of 0..11 directly indicates the length of the name string.
732e5b6d6dSopenharmony_ci*   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
742e5b6d6dSopenharmony_ci*   by (((n-12)<<4)|m)+12, reaching values of 12..75.
752e5b6d6dSopenharmony_ci*   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
762e5b6d6dSopenharmony_ci*   For the de-tokenizing, see token description above; the strings immediately follow the
772e5b6d6dSopenharmony_ci*   32 lengths.
782e5b6d6dSopenharmony_ci*
792e5b6d6dSopenharmony_ci* -- algorithmic names
802e5b6d6dSopenharmony_ci*
812e5b6d6dSopenharmony_ci* typedef struct AlgorithmicRange {
822e5b6d6dSopenharmony_ci*     uint32_t rangeStart, rangeEnd;
832e5b6d6dSopenharmony_ci*     uint8_t algorithmType, algorithmVariant;
842e5b6d6dSopenharmony_ci*     uint16_t rangeSize;
852e5b6d6dSopenharmony_ci* } AlgorithmicRange;
862e5b6d6dSopenharmony_ci*
872e5b6d6dSopenharmony_ci* uint32_t algRangesCount; -- number of data blocks for ranges of
882e5b6d6dSopenharmony_ci*               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
892e5b6d6dSopenharmony_ci*
902e5b6d6dSopenharmony_ci* struct {
912e5b6d6dSopenharmony_ci*     AlgorithmicRange algRange;
922e5b6d6dSopenharmony_ci*     uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
942e5b6d6dSopenharmony_ci* -- not a real array because each part has a different size
952e5b6d6dSopenharmony_ci*    of algRange.rangeSize (including AlgorithmicRange)
962e5b6d6dSopenharmony_ci*
972e5b6d6dSopenharmony_ci* -- algorithmic range types:
982e5b6d6dSopenharmony_ci*
992e5b6d6dSopenharmony_ci* 0 Names are formed from a string prefix that is stored in
1002e5b6d6dSopenharmony_ci*   the algRangeData (zero-terminated), followed by the Unicode code point
1012e5b6d6dSopenharmony_ci*   of the character in hexadecimal digits;
1022e5b6d6dSopenharmony_ci*   algRange.algorithmVariant digits are written
1032e5b6d6dSopenharmony_ci*
1042e5b6d6dSopenharmony_ci* 1 Names are formed by calculating modulo-factors of the code point value as follows:
1052e5b6d6dSopenharmony_ci*   algRange.algorithmVariant is the count of modulo factors
1062e5b6d6dSopenharmony_ci*   algRangeData contains
1072e5b6d6dSopenharmony_ci*       uint16_t factors[algRange.algorithmVariant];
1082e5b6d6dSopenharmony_ci*       char strings[];
1092e5b6d6dSopenharmony_ci*   the first zero-terminated string is written as the prefix; then:
1102e5b6d6dSopenharmony_ci*
1112e5b6d6dSopenharmony_ci*   The rangeStart is subtracted; with the difference, here "code":
1122e5b6d6dSopenharmony_ci*   for(i=algRange.algorithmVariant-1 to 0 step -1)
1132e5b6d6dSopenharmony_ci*       index[i]=code%factor[i];
1142e5b6d6dSopenharmony_ci*       code/=factor[i];
1152e5b6d6dSopenharmony_ci*
1162e5b6d6dSopenharmony_ci*   The strings after the prefix are short pieces that are then appended to the result
1172e5b6d6dSopenharmony_ci*   according to index[0..algRange.algorithmVariant-1].
1182e5b6d6dSopenharmony_ci*/
1192e5b6d6dSopenharmony_ci
1202e5b6d6dSopenharmony_ci#include <stdio.h>
1212e5b6d6dSopenharmony_ci#include "unicode/utypes.h"
1222e5b6d6dSopenharmony_ci#include "unicode/putil.h"
1232e5b6d6dSopenharmony_ci#include "unicode/udata.h"
1242e5b6d6dSopenharmony_ci#include "charstr.h"
1252e5b6d6dSopenharmony_ci#include "cmemory.h"
1262e5b6d6dSopenharmony_ci#include "cstring.h"
1272e5b6d6dSopenharmony_ci#include "genprops.h"
1282e5b6d6dSopenharmony_ci#include "ppucd.h"
1292e5b6d6dSopenharmony_ci#include "uarrsort.h"
1302e5b6d6dSopenharmony_ci#include "uassert.h"
1312e5b6d6dSopenharmony_ci#include "unewdata.h"
1322e5b6d6dSopenharmony_ci#include "uoptions.h"
1332e5b6d6dSopenharmony_ci
/* capacity in bytes of the static stringStore[] buffer */
#define STRING_STORE_SIZE 2000000
/* capacity in bytes of the static groupStore[] buffer */
#define GROUP_STORE_SIZE 5000

/*
 * Names are stored in groups of 2^GROUP_SHIFT=32 consecutive code points;
 * the file format identifies a group by groupMSB=code point>>5.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)

/* capacities of the static lines[] and words[] arrays */
#define MAX_LINE_COUNT 50000
#define MAX_WORD_COUNT 20000
/* NOTE(review): presumably the maximum number of groups; its use is not visible here - confirm */
#define MAX_GROUP_COUNT 5000

/* separates the different names that are stored for one code point */
#define NAME_SEPARATOR_CHAR ';'
1462e5b6d6dSopenharmony_ci
1472e5b6d6dSopenharmony_ci/* generator data ----------------------------------------------------------- */
1482e5b6d6dSopenharmony_ci
1492e5b6d6dSopenharmony_ciU_NAMESPACE_USE
1502e5b6d6dSopenharmony_ci
/*
 * UDataInfo cf. udata.h
 *
 * Describes the generated names data: format "unam", format version 1.0.
 * The dataVersion initializer (3.0.0) is only a default;
 * NamesPropsBuilder::setUnicodeVersion() overwrites it.
 * The per-field comments below use the UDataInfo member names from udata.h.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),            /* size */
    0,                            /* reservedWord */

    U_IS_BIG_ENDIAN,              /* isBigEndian */
    U_CHARSET_FAMILY,             /* charsetFamily */
    sizeof(UChar),                /* sizeofUChar */
    0,                            /* reservedByte */

    {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
    {1, 0, 0, 0},                 /* formatVersion */
    {3, 0, 0, 0}                  /* dataVersion */
};
1652e5b6d6dSopenharmony_ci
/*
 * Static byte buffers used by the generator:
 * stringStore - STRING_STORE_SIZE bytes
 * groupStore  - GROUP_STORE_SIZE bytes
 * lineLengths - one byte per line of a group (LINES_PER_GROUP entries)
 */
static uint8_t stringStore[STRING_STORE_SIZE],
               groupStore[GROUP_STORE_SIZE],
               lineLengths[LINES_PER_GROUP];

/*
 * Cursors into the stores above.
 * lineTop starts at 0 and wordBottom starts at STRING_STORE_SIZE.
 * NOTE(review): presumably lines are allocated upwards from the start of
 * stringStore and words downwards from its end (see allocLine()/allocWord());
 * the allocators are not visible here - confirm.
 */
static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
1712e5b6d6dSopenharmony_ci
/*
 * One entry of the static lines[] array.
 * NOTE(review): presumably one entry per code point that has names,
 * filled in by addLine() - confirm against its definition.
 */
typedef struct {
    uint32_t code;      /* presumably the code point that the names belong to */
    int16_t length;     /* number of bytes at s */
    uint8_t *s;         /* the name bytes; not necessarily zero-terminated (a length is kept) */
} Line;
1772e5b6d6dSopenharmony_ci
/*
 * One candidate token: a word that was found in the character names.
 * compress() sorts these by weight and drops the ones with weight<1.
 */
typedef struct {
    int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
    int16_t count;  /* compress() subtracts this from the weight of words that need a double-byte token */
    int16_t length; /* number of bytes at s */
    uint8_t *s;     /* the word text; printed with an explicit length, so not assumed to be zero-terminated */
} Word;
1842e5b6d6dSopenharmony_ci
/* all name lines and all candidate words collected while parsing */
static Line lines[MAX_LINE_COUNT];
static Word words[MAX_WORD_COUNT];

/* number of entries in use in lines[] and words[] */
static uint32_t lineCount=0, wordCount=0;

/* number of lead bytes for double-byte tokens; set by compress(), 0 if single-byte tokens suffice */
static int16_t leadByteCount;

/* compress() never uses more than LEADBYTE_LIMIT-1 lead bytes */
#define LEADBYTE_LIMIT 16

/*
 * Token table, indexed by the token byte(s).
 * As used by compress():
 *   -1 marks a byte value that is written directly (a letter used in the names),
 *   -2 marks a lead byte of a double-byte token,
 *   any other value is an index into words[].
 */
static int16_t tokens[LEADBYTE_LIMIT*256];
static uint32_t tokenCount;
/*
 * the structure for algorithmic names needs to be 4-aligned
 *
 * This is the fixed-size header of one range of algorithmic names;
 * it is followed in the output by type-specific data
 * (see the file format description at the top of this file).
 */
struct AlgorithmicRange {
    UChar32 start, end;     /* start and end code points of the range */
    uint8_t type, variant;  /* type 0: prefix + hex digits, variant=number of digits;
                               type 1: modulo factors, variant=number of factors */
    uint16_t size;          /* size in bytes of this struct plus its data and padding */
};
2032e5b6d6dSopenharmony_ci
/**
 * PropsBuilder subclass for the character names data (unames.icu).
 * Names and words are collected in the file-static generator data;
 * the algorithmic-name ranges are collected in this object.
 */
class NamesPropsBuilder : public PropsBuilder {
public:
    NamesPropsBuilder(UErrorCode &errorCode);
    virtual ~NamesPropsBuilder();

    /** Copies the Unicode version into dataInfo.dataVersion. */
    virtual void setUnicodeVersion(const UVersionInfo version);
    /** Records the name and the "correction" name alias of a single code point. */
    virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
    virtual void build(UErrorCode &errorCode);
    virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);

private:
    /**
     * Appends one algorithmic range of type "han" or "hangul" to algRanges
     * and increments countAlgRanges.
     */
    virtual void setAlgNamesRange(UChar32 start, UChar32 end,
                                  const char *type, const char *prefix, UErrorCode &errorCode);

    /** Serialized AlgorithmicRange structs, each followed by its data and 4-byte padding. */
    CharString algRanges;
    /** Number of ranges appended to algRanges. */
    int32_t countAlgRanges;
};
2212e5b6d6dSopenharmony_ci
2222e5b6d6dSopenharmony_ciNamesPropsBuilder::NamesPropsBuilder(UErrorCode &errorCode)
2232e5b6d6dSopenharmony_ci        : countAlgRanges(0) {
2242e5b6d6dSopenharmony_ci    for(int i=0; i<256; ++i) {
2252e5b6d6dSopenharmony_ci        tokens[i]=0;
2262e5b6d6dSopenharmony_ci    }
2272e5b6d6dSopenharmony_ci}
2282e5b6d6dSopenharmony_ci
2292e5b6d6dSopenharmony_ciNamesPropsBuilder::~NamesPropsBuilder() {
2302e5b6d6dSopenharmony_ci}
2312e5b6d6dSopenharmony_ci
2322e5b6d6dSopenharmony_civoid
2332e5b6d6dSopenharmony_ciNamesPropsBuilder::setUnicodeVersion(const UVersionInfo version) {
2342e5b6d6dSopenharmony_ci    uprv_memcpy(dataInfo.dataVersion, version, 4);
2352e5b6d6dSopenharmony_ci}
2362e5b6d6dSopenharmony_ci
/* prototypes --------------------------------------------------------------- */

/*
 * Forward declarations of the file-local helper functions.
 */

/* splits a name into words and counts each word that is longer than one character */
static void
parseName(const char *name, int16_t length);

/* returns the index of the first word character at or after start, or limit if there is none */
static int16_t
skipNoise(const char *line, int16_t start, int16_t limit);

/* returns the index after the word at start, including one following space or dash */
static int16_t
getWord(const char *line, int16_t start, int16_t limit);

/* sorts the words by weight and assigns token values to the words that save space */
static void
compress(UErrorCode &errorCode);

static void
compressLines(void);

static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);

/* comparison function that compress() passes to uprv_sortArray() for the words[] */
static int32_t
compareWords(const void *context, const void *word1, const void *word2);

static int16_t
findToken(uint8_t *s, int16_t length);

/* parseName() treats a NULL result as "this word is not stored yet" */
static Word *
findWord(const char *s, int16_t length);

static Word *
addWord(const char *s, int16_t length);

static void
countWord(Word *word);

/* called by setProps() with the names of one code point and their lengths */
static void
addLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count);

static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);

static uint32_t
addToken(uint8_t *s, int16_t length);

static void
appendLineLength(int16_t length);

static void
appendLineLengthNibble(uint8_t nibble);

static uint8_t *
allocLine(int32_t length);

static uint8_t *
allocWord(uint32_t length);
2922e5b6d6dSopenharmony_ci
2932e5b6d6dSopenharmony_ci/* parsing ------------------------------------------------------------------ */
2942e5b6d6dSopenharmony_ci
2952e5b6d6dSopenharmony_civoid
2962e5b6d6dSopenharmony_ciNamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
2972e5b6d6dSopenharmony_ci                            UErrorCode &errorCode) {
2982e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return; }
2992e5b6d6dSopenharmony_ci    if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) {
3002e5b6d6dSopenharmony_ci        return;
3012e5b6d6dSopenharmony_ci    }
3022e5b6d6dSopenharmony_ci
3032e5b6d6dSopenharmony_ci    U_ASSERT(props.start==props.end);
3042e5b6d6dSopenharmony_ci
3052e5b6d6dSopenharmony_ci    const char *names[4]={ NULL, NULL, NULL, NULL };
3062e5b6d6dSopenharmony_ci    int16_t lengths[4]={ 0, 0, 0, 0 };
3072e5b6d6dSopenharmony_ci
3082e5b6d6dSopenharmony_ci    /* get the character name */
3092e5b6d6dSopenharmony_ci    if(props.name!=NULL) {
3102e5b6d6dSopenharmony_ci        names[0]=props.name;
3112e5b6d6dSopenharmony_ci        lengths[0]=(int16_t)uprv_strlen(props.name);
3122e5b6d6dSopenharmony_ci        parseName(names[0], lengths[0]);
3132e5b6d6dSopenharmony_ci    }
3142e5b6d6dSopenharmony_ci
3152e5b6d6dSopenharmony_ci    CharString buffer;
3162e5b6d6dSopenharmony_ci    if(props.nameAlias!=NULL) {
3172e5b6d6dSopenharmony_ci        /*
3182e5b6d6dSopenharmony_ci         * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
3192e5b6d6dSopenharmony_ci         * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
3202e5b6d6dSopenharmony_ci         */
3212e5b6d6dSopenharmony_ci        const char *corr=uprv_strstr(props.nameAlias, "correction=");
3222e5b6d6dSopenharmony_ci        if(corr!=NULL) {
3232e5b6d6dSopenharmony_ci            corr+=11;  // skip "correction="
3242e5b6d6dSopenharmony_ci            const char *limit=uprv_strchr(corr, ',');
3252e5b6d6dSopenharmony_ci            if(limit!=NULL) {
3262e5b6d6dSopenharmony_ci                buffer.append(corr, limit-corr, errorCode);
3272e5b6d6dSopenharmony_ci                names[3]=buffer.data();
3282e5b6d6dSopenharmony_ci                lengths[3]=(int16_t)(limit-corr);
3292e5b6d6dSopenharmony_ci            } else {
3302e5b6d6dSopenharmony_ci                names[3]=corr;
3312e5b6d6dSopenharmony_ci                lengths[3]=(int16_t)uprv_strlen(corr);
3322e5b6d6dSopenharmony_ci            }
3332e5b6d6dSopenharmony_ci            parseName(names[3], lengths[3]);
3342e5b6d6dSopenharmony_ci        }
3352e5b6d6dSopenharmony_ci    }
3362e5b6d6dSopenharmony_ci
3372e5b6d6dSopenharmony_ci    addLine(props.start, names, lengths, LENGTHOF(names));
3382e5b6d6dSopenharmony_ci}
3392e5b6d6dSopenharmony_ci
3402e5b6d6dSopenharmony_cistatic void
3412e5b6d6dSopenharmony_ciparseName(const char *name, int16_t length) {
3422e5b6d6dSopenharmony_ci    int16_t start=0, limit, wordLength/*, prevStart=-1*/;
3432e5b6d6dSopenharmony_ci    Word *word;
3442e5b6d6dSopenharmony_ci
3452e5b6d6dSopenharmony_ci    while(start<length) {
3462e5b6d6dSopenharmony_ci        /* skip any "noise" characters */
3472e5b6d6dSopenharmony_ci        limit=skipNoise(name, start, length);
3482e5b6d6dSopenharmony_ci        if(start<limit) {
3492e5b6d6dSopenharmony_ci            /*prevStart=-1;*/
3502e5b6d6dSopenharmony_ci            start=limit;
3512e5b6d6dSopenharmony_ci        }
3522e5b6d6dSopenharmony_ci        if(start==length) {
3532e5b6d6dSopenharmony_ci            break;
3542e5b6d6dSopenharmony_ci        }
3552e5b6d6dSopenharmony_ci
3562e5b6d6dSopenharmony_ci        /* get a word and add it if it is longer than 1 */
3572e5b6d6dSopenharmony_ci        limit=getWord(name, start, length);
3582e5b6d6dSopenharmony_ci        wordLength=(int16_t)(limit-start);
3592e5b6d6dSopenharmony_ci        if(wordLength>1) {
3602e5b6d6dSopenharmony_ci            word=findWord(name+start, wordLength);
3612e5b6d6dSopenharmony_ci            if(word==NULL) {
3622e5b6d6dSopenharmony_ci                word=addWord(name+start, wordLength);
3632e5b6d6dSopenharmony_ci            }
3642e5b6d6dSopenharmony_ci            countWord(word);
3652e5b6d6dSopenharmony_ci        }
3662e5b6d6dSopenharmony_ci
3672e5b6d6dSopenharmony_ci#if 0
3682e5b6d6dSopenharmony_ci        /*
3692e5b6d6dSopenharmony_ci         * if there was a word before this
3702e5b6d6dSopenharmony_ci         * (with no noise in between), then add the pair of words, too
3712e5b6d6dSopenharmony_ci         */
3722e5b6d6dSopenharmony_ci        if(prevStart!=-1) {
3732e5b6d6dSopenharmony_ci            wordLength=limit-prevStart;
3742e5b6d6dSopenharmony_ci            word=findWord(name+prevStart, wordLength);
3752e5b6d6dSopenharmony_ci            if(word==NULL) {
3762e5b6d6dSopenharmony_ci                word=addWord(name+prevStart, wordLength);
3772e5b6d6dSopenharmony_ci            }
3782e5b6d6dSopenharmony_ci            countWord(word);
3792e5b6d6dSopenharmony_ci        }
3802e5b6d6dSopenharmony_ci#endif
3812e5b6d6dSopenharmony_ci
3822e5b6d6dSopenharmony_ci        /*prevStart=start;*/
3832e5b6d6dSopenharmony_ci        start=limit;
3842e5b6d6dSopenharmony_ci    }
3852e5b6d6dSopenharmony_ci}
3862e5b6d6dSopenharmony_ci
/*
 * Tells whether c can be part of a word of a character name:
 * an uppercase letter A-Z or a digit 0-9.
 */
static UBool
isWordChar(char c) {
    /* The letters are checked in three ranges so that the test also works in EBCDIC. */
    const UBool isLetter=
        ('A'<=c && c<='I') ||
        ('J'<=c && c<='R') ||
        ('S'<=c && c<='Z');
    const UBool isDigit=('0'<=c && c<='9');

    return isLetter || isDigit;
}
3952e5b6d6dSopenharmony_ci
3962e5b6d6dSopenharmony_cistatic int16_t
3972e5b6d6dSopenharmony_ciskipNoise(const char *line, int16_t start, int16_t limit) {
3982e5b6d6dSopenharmony_ci    /* skip anything that is not part of a word in this sense */
3992e5b6d6dSopenharmony_ci    while(start<limit && !isWordChar(line[start])) {
4002e5b6d6dSopenharmony_ci        ++start;
4012e5b6d6dSopenharmony_ci    }
4022e5b6d6dSopenharmony_ci
4032e5b6d6dSopenharmony_ci    return start;
4042e5b6d6dSopenharmony_ci}
4052e5b6d6dSopenharmony_ci
4062e5b6d6dSopenharmony_cistatic int16_t
4072e5b6d6dSopenharmony_cigetWord(const char *line, int16_t start, int16_t limit) {
4082e5b6d6dSopenharmony_ci    char c=0; /* initialize to avoid a compiler warning although the code was safe */
4092e5b6d6dSopenharmony_ci
4102e5b6d6dSopenharmony_ci    /* a unicode character name word consists of A-Z0-9 */
4112e5b6d6dSopenharmony_ci    while(start<limit && isWordChar(line[start])) {
4122e5b6d6dSopenharmony_ci        ++start;
4132e5b6d6dSopenharmony_ci    }
4142e5b6d6dSopenharmony_ci
4152e5b6d6dSopenharmony_ci    /* include a following space or dash */
4162e5b6d6dSopenharmony_ci    if(start<limit && ((c=line[start])==' ' || c=='-')) {
4172e5b6d6dSopenharmony_ci        ++start;
4182e5b6d6dSopenharmony_ci    }
4192e5b6d6dSopenharmony_ci
4202e5b6d6dSopenharmony_ci    return start;
4212e5b6d6dSopenharmony_ci}
4222e5b6d6dSopenharmony_ci
4232e5b6d6dSopenharmony_civoid
4242e5b6d6dSopenharmony_ciNamesPropsBuilder::setAlgNamesRange(UChar32 start, UChar32 end,
4252e5b6d6dSopenharmony_ci                                    const char *type,
4262e5b6d6dSopenharmony_ci                                    const char *prefix,  // number of hex digits
4272e5b6d6dSopenharmony_ci                                    UErrorCode &errorCode) {
4282e5b6d6dSopenharmony_ci    /* modulo factors, maximum 8 */
4292e5b6d6dSopenharmony_ci    /* 3 factors: 19, 21, 28, most-to-least-significant */
4302e5b6d6dSopenharmony_ci    static const uint16_t hangulFactors[3]={
4312e5b6d6dSopenharmony_ci        19, 21, 28
4322e5b6d6dSopenharmony_ci    };
4332e5b6d6dSopenharmony_ci
4342e5b6d6dSopenharmony_ci    static const char jamo[]=
4352e5b6d6dSopenharmony_ci        "HANGUL SYLLABLE \0"
4362e5b6d6dSopenharmony_ci
4372e5b6d6dSopenharmony_ci        "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
4382e5b6d6dSopenharmony_ci        "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
4392e5b6d6dSopenharmony_ci
4402e5b6d6dSopenharmony_ci        "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
4412e5b6d6dSopenharmony_ci        "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
4422e5b6d6dSopenharmony_ci        "YU\0EU\0YI\0I\0"
4432e5b6d6dSopenharmony_ci
4442e5b6d6dSopenharmony_ci        "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
4452e5b6d6dSopenharmony_ci        "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
4462e5b6d6dSopenharmony_ci        "S\0SS\0NG\0J\0C\0K\0T\0P\0H";
4472e5b6d6dSopenharmony_ci
4482e5b6d6dSopenharmony_ci    int32_t prefixLength=0;
4492e5b6d6dSopenharmony_ci    AlgorithmicRange range;
4502e5b6d6dSopenharmony_ci    uprv_memset(&range, 0, sizeof(AlgorithmicRange));
4512e5b6d6dSopenharmony_ci    int32_t rangeSize=(int32_t)sizeof(AlgorithmicRange);
4522e5b6d6dSopenharmony_ci    range.start=start;
4532e5b6d6dSopenharmony_ci    range.end=end;
4542e5b6d6dSopenharmony_ci    if(0==uprv_strcmp(type, "han")) {
4552e5b6d6dSopenharmony_ci        range.type=0;
4562e5b6d6dSopenharmony_ci        range.variant= end<=0xffff ? 4 : 5;
4572e5b6d6dSopenharmony_ci        prefixLength=uprv_strlen(prefix)+1;
4582e5b6d6dSopenharmony_ci        rangeSize+=prefixLength;
4592e5b6d6dSopenharmony_ci    } else if(0==uprv_strcmp(type, "hangul")) {
4602e5b6d6dSopenharmony_ci        range.type=1;
4612e5b6d6dSopenharmony_ci        range.variant=(uint8_t)LENGTHOF(hangulFactors);
4622e5b6d6dSopenharmony_ci        rangeSize+=(int32_t)sizeof(hangulFactors);
4632e5b6d6dSopenharmony_ci        rangeSize+=(int32_t)sizeof(jamo);
4642e5b6d6dSopenharmony_ci    } else {
4652e5b6d6dSopenharmony_ci        fprintf(stderr, "genprops error: unknown algnamesrange type '%s'\n", prefix);
4662e5b6d6dSopenharmony_ci        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
4672e5b6d6dSopenharmony_ci        return;
4682e5b6d6dSopenharmony_ci    }
4692e5b6d6dSopenharmony_ci    int32_t paddingLength=rangeSize&3;
4702e5b6d6dSopenharmony_ci    if(paddingLength) {
4712e5b6d6dSopenharmony_ci        paddingLength=4-paddingLength;
4722e5b6d6dSopenharmony_ci        rangeSize+=paddingLength;
4732e5b6d6dSopenharmony_ci    }
4742e5b6d6dSopenharmony_ci    range.size=(uint16_t)rangeSize;
4752e5b6d6dSopenharmony_ci    algRanges.append((char *)&range, (int32_t)sizeof(AlgorithmicRange), errorCode);
4762e5b6d6dSopenharmony_ci    if(range.type==0) {  // han
4772e5b6d6dSopenharmony_ci        algRanges.append(prefix, prefixLength, errorCode);
4782e5b6d6dSopenharmony_ci    } else /* type==1 */ {  // hangul
4792e5b6d6dSopenharmony_ci        algRanges.append((char *)hangulFactors, (int32_t)sizeof(hangulFactors), errorCode);
4802e5b6d6dSopenharmony_ci        algRanges.append(jamo, (int32_t)sizeof(jamo), errorCode);
4812e5b6d6dSopenharmony_ci    }
4822e5b6d6dSopenharmony_ci    while(paddingLength) {
4832e5b6d6dSopenharmony_ci        algRanges.append((char)0xaa, errorCode);
4842e5b6d6dSopenharmony_ci        --paddingLength;
4852e5b6d6dSopenharmony_ci    }
4862e5b6d6dSopenharmony_ci    ++countAlgRanges;
4872e5b6d6dSopenharmony_ci}
4882e5b6d6dSopenharmony_ci
4892e5b6d6dSopenharmony_ci/* compressing -------------------------------------------------------------- */
4902e5b6d6dSopenharmony_ci
4912e5b6d6dSopenharmony_cistatic void
4922e5b6d6dSopenharmony_cicompress(UErrorCode &errorCode) {
4932e5b6d6dSopenharmony_ci    uint32_t i, letterCount;
4942e5b6d6dSopenharmony_ci    int16_t wordNumber;
4952e5b6d6dSopenharmony_ci
4962e5b6d6dSopenharmony_ci    /* sort the words in reverse order by weight */
4972e5b6d6dSopenharmony_ci    uprv_sortArray(words, wordCount, sizeof(Word),
4982e5b6d6dSopenharmony_ci                   compareWords, NULL, false, &errorCode);
4992e5b6d6dSopenharmony_ci
5002e5b6d6dSopenharmony_ci    /* remove the words that do not save anything */
5012e5b6d6dSopenharmony_ci    while(wordCount>0 && words[wordCount-1].weight<1) {
5022e5b6d6dSopenharmony_ci        --wordCount;
5032e5b6d6dSopenharmony_ci    }
5042e5b6d6dSopenharmony_ci
5052e5b6d6dSopenharmony_ci    /* count the letters in the token range */
5062e5b6d6dSopenharmony_ci    letterCount=0;
5072e5b6d6dSopenharmony_ci    for(i=LEADBYTE_LIMIT; i<256; ++i) {
5082e5b6d6dSopenharmony_ci        if(tokens[i]==-1) {
5092e5b6d6dSopenharmony_ci            ++letterCount;
5102e5b6d6dSopenharmony_ci        }
5112e5b6d6dSopenharmony_ci    }
5122e5b6d6dSopenharmony_ci    if(!beQuiet) {
5132e5b6d6dSopenharmony_ci        printf("number of letters used in the names: %d\n", (int)letterCount);
5142e5b6d6dSopenharmony_ci    }
5152e5b6d6dSopenharmony_ci
5162e5b6d6dSopenharmony_ci    /* do we need double-byte tokens? */
5172e5b6d6dSopenharmony_ci    if(wordCount+letterCount<=256) {
5182e5b6d6dSopenharmony_ci        /* no, single-byte tokens are enough */
5192e5b6d6dSopenharmony_ci        leadByteCount=0;
5202e5b6d6dSopenharmony_ci        for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
5212e5b6d6dSopenharmony_ci            if(tokens[i]!=-1) {
5222e5b6d6dSopenharmony_ci                tokens[i]=wordNumber;
5232e5b6d6dSopenharmony_ci                if(beVerbose) {
5242e5b6d6dSopenharmony_ci                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
5252e5b6d6dSopenharmony_ci                            (int)i, (long)words[wordNumber].weight,
5262e5b6d6dSopenharmony_ci                            words[wordNumber].length, words[wordNumber].s);
5272e5b6d6dSopenharmony_ci                }
5282e5b6d6dSopenharmony_ci                ++wordNumber;
5292e5b6d6dSopenharmony_ci            }
5302e5b6d6dSopenharmony_ci        }
5312e5b6d6dSopenharmony_ci        tokenCount=i;
5322e5b6d6dSopenharmony_ci    } else {
5332e5b6d6dSopenharmony_ci        /*
5342e5b6d6dSopenharmony_ci         * The tokens that need two token bytes
5352e5b6d6dSopenharmony_ci         * get their weight reduced by their count
5362e5b6d6dSopenharmony_ci         * because they save less.
5372e5b6d6dSopenharmony_ci         */
5382e5b6d6dSopenharmony_ci        tokenCount=256-letterCount;
5392e5b6d6dSopenharmony_ci        for(i=tokenCount; i<wordCount; ++i) {
5402e5b6d6dSopenharmony_ci            words[i].weight-=words[i].count;
5412e5b6d6dSopenharmony_ci        }
5422e5b6d6dSopenharmony_ci
5432e5b6d6dSopenharmony_ci        /* sort these words in reverse order by weight */
5442e5b6d6dSopenharmony_ci        errorCode=U_ZERO_ERROR;
5452e5b6d6dSopenharmony_ci        uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
5462e5b6d6dSopenharmony_ci                        compareWords, NULL, false, &errorCode);
5472e5b6d6dSopenharmony_ci
5482e5b6d6dSopenharmony_ci        /* remove the words that do not save anything */
5492e5b6d6dSopenharmony_ci        while(wordCount>0 && words[wordCount-1].weight<1) {
5502e5b6d6dSopenharmony_ci            --wordCount;
5512e5b6d6dSopenharmony_ci        }
5522e5b6d6dSopenharmony_ci
5532e5b6d6dSopenharmony_ci        /* how many tokens and lead bytes do we have now? */
5542e5b6d6dSopenharmony_ci        tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
5552e5b6d6dSopenharmony_ci        /*
5562e5b6d6dSopenharmony_ci         * adjust upwards to take into account that
5572e5b6d6dSopenharmony_ci         * double-byte tokens must not
5582e5b6d6dSopenharmony_ci         * use NAME_SEPARATOR_CHAR as a second byte
5592e5b6d6dSopenharmony_ci         */
5602e5b6d6dSopenharmony_ci        tokenCount+=(tokenCount-256+254)/255;
5612e5b6d6dSopenharmony_ci
5622e5b6d6dSopenharmony_ci        leadByteCount=(int16_t)(tokenCount>>8);
5632e5b6d6dSopenharmony_ci        if(leadByteCount<LEADBYTE_LIMIT) {
5642e5b6d6dSopenharmony_ci            /* adjust for the real number of lead bytes */
5652e5b6d6dSopenharmony_ci            tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
5662e5b6d6dSopenharmony_ci        } else {
5672e5b6d6dSopenharmony_ci            /* limit the number of lead bytes */
5682e5b6d6dSopenharmony_ci            leadByteCount=LEADBYTE_LIMIT-1;
5692e5b6d6dSopenharmony_ci            tokenCount=LEADBYTE_LIMIT*256;
5702e5b6d6dSopenharmony_ci            wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
5712e5b6d6dSopenharmony_ci            /* adjust again to skip double-byte tokens with ';' */
5722e5b6d6dSopenharmony_ci            wordCount-=(tokenCount-256+254)/255;
5732e5b6d6dSopenharmony_ci        }
5742e5b6d6dSopenharmony_ci
5752e5b6d6dSopenharmony_ci        /* set token 0 to word 0 */
5762e5b6d6dSopenharmony_ci        tokens[0]=0;
5772e5b6d6dSopenharmony_ci        if(beVerbose) {
5782e5b6d6dSopenharmony_ci            printf("tokens[0x000]: word%8ld \"%.*s\"\n",
5792e5b6d6dSopenharmony_ci                    (long)words[0].weight,
5802e5b6d6dSopenharmony_ci                    words[0].length, words[0].s);
5812e5b6d6dSopenharmony_ci        }
5822e5b6d6dSopenharmony_ci        wordNumber=1;
5832e5b6d6dSopenharmony_ci
5842e5b6d6dSopenharmony_ci        /* set the lead byte tokens */
5852e5b6d6dSopenharmony_ci        for(i=1; (int16_t)i<=leadByteCount; ++i) {
5862e5b6d6dSopenharmony_ci            tokens[i]=-2;
5872e5b6d6dSopenharmony_ci        }
5882e5b6d6dSopenharmony_ci
5892e5b6d6dSopenharmony_ci        /* set the tokens */
5902e5b6d6dSopenharmony_ci        for(; i<256; ++i) {
5912e5b6d6dSopenharmony_ci            /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
5922e5b6d6dSopenharmony_ci            if(tokens[i]!=-1) {
5932e5b6d6dSopenharmony_ci                tokens[i]=wordNumber;
5942e5b6d6dSopenharmony_ci                if(beVerbose) {
5952e5b6d6dSopenharmony_ci                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
5962e5b6d6dSopenharmony_ci                            (int)i, (long)words[wordNumber].weight,
5972e5b6d6dSopenharmony_ci                            words[wordNumber].length, words[wordNumber].s);
5982e5b6d6dSopenharmony_ci                }
5992e5b6d6dSopenharmony_ci                ++wordNumber;
6002e5b6d6dSopenharmony_ci            }
6012e5b6d6dSopenharmony_ci        }
6022e5b6d6dSopenharmony_ci
6032e5b6d6dSopenharmony_ci        /* continue above 255 where there are no letters */
6042e5b6d6dSopenharmony_ci        for(; (uint32_t)wordNumber<wordCount; ++i) {
6052e5b6d6dSopenharmony_ci            if((i&0xff)==NAME_SEPARATOR_CHAR) {
6062e5b6d6dSopenharmony_ci                tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
6072e5b6d6dSopenharmony_ci            } else {
6082e5b6d6dSopenharmony_ci                tokens[i]=wordNumber;
6092e5b6d6dSopenharmony_ci                if(beVerbose) {
6102e5b6d6dSopenharmony_ci                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
6112e5b6d6dSopenharmony_ci                            (int)i, (long)words[wordNumber].weight,
6122e5b6d6dSopenharmony_ci                            words[wordNumber].length, words[wordNumber].s);
6132e5b6d6dSopenharmony_ci                }
6142e5b6d6dSopenharmony_ci                ++wordNumber;
6152e5b6d6dSopenharmony_ci            }
6162e5b6d6dSopenharmony_ci        }
6172e5b6d6dSopenharmony_ci        tokenCount=i; /* should be already tokenCount={i or i+1} */
6182e5b6d6dSopenharmony_ci    }
6192e5b6d6dSopenharmony_ci
6202e5b6d6dSopenharmony_ci    if(!beQuiet) {
6212e5b6d6dSopenharmony_ci        printf("number of lead bytes: %d\n", leadByteCount);
6222e5b6d6dSopenharmony_ci        printf("number of single-byte tokens: %lu\n",
6232e5b6d6dSopenharmony_ci            (unsigned long)256-letterCount-leadByteCount);
6242e5b6d6dSopenharmony_ci        printf("number of tokens: %lu\n", (unsigned long)tokenCount);
6252e5b6d6dSopenharmony_ci    }
6262e5b6d6dSopenharmony_ci
6272e5b6d6dSopenharmony_ci    compressLines();
6282e5b6d6dSopenharmony_ci}
6292e5b6d6dSopenharmony_ci
6302e5b6d6dSopenharmony_cistatic void
6312e5b6d6dSopenharmony_cicompressLines() {
6322e5b6d6dSopenharmony_ci    Line *line=NULL;
6332e5b6d6dSopenharmony_ci    uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
6342e5b6d6dSopenharmony_ci             groupMSB=0xffff, lineCount2;
6352e5b6d6dSopenharmony_ci    int16_t groupTop=0;
6362e5b6d6dSopenharmony_ci
6372e5b6d6dSopenharmony_ci    /* store the groups like lines, with compressed data after raw strings */
6382e5b6d6dSopenharmony_ci    groupBottom=lineTop;
6392e5b6d6dSopenharmony_ci    lineCount2=lineCount;
6402e5b6d6dSopenharmony_ci    lineCount=0;
6412e5b6d6dSopenharmony_ci
6422e5b6d6dSopenharmony_ci    /* loop over all lines */
6432e5b6d6dSopenharmony_ci    while(i<lineCount2) {
6442e5b6d6dSopenharmony_ci        line=lines+i++;
6452e5b6d6dSopenharmony_ci        inLine=line->code;
6462e5b6d6dSopenharmony_ci
6472e5b6d6dSopenharmony_ci        /* segment the lines to groups of 32 */
6482e5b6d6dSopenharmony_ci        if(inLine>>GROUP_SHIFT!=groupMSB) {
6492e5b6d6dSopenharmony_ci            /* finish the current group with empty lines */
6502e5b6d6dSopenharmony_ci            while((++outLine&GROUP_MASK)!=0) {
6512e5b6d6dSopenharmony_ci                appendLineLength(0);
6522e5b6d6dSopenharmony_ci            }
6532e5b6d6dSopenharmony_ci
6542e5b6d6dSopenharmony_ci            /* store the group like a line */
6552e5b6d6dSopenharmony_ci            if(groupTop>0) {
6562e5b6d6dSopenharmony_ci                if(groupTop>GROUP_STORE_SIZE) {
6572e5b6d6dSopenharmony_ci                    fprintf(stderr, "gennames: group store overflow\n");
6582e5b6d6dSopenharmony_ci                    exit(U_BUFFER_OVERFLOW_ERROR);
6592e5b6d6dSopenharmony_ci                }
6602e5b6d6dSopenharmony_ci                addGroup(groupMSB, groupStore, groupTop);
6612e5b6d6dSopenharmony_ci            }
6622e5b6d6dSopenharmony_ci
6632e5b6d6dSopenharmony_ci            /* start the new group */
6642e5b6d6dSopenharmony_ci            lineLengthsTop=0;
6652e5b6d6dSopenharmony_ci            groupTop=0;
6662e5b6d6dSopenharmony_ci            groupMSB=inLine>>GROUP_SHIFT;
6672e5b6d6dSopenharmony_ci            outLine=(inLine&~GROUP_MASK)-1;
6682e5b6d6dSopenharmony_ci        }
6692e5b6d6dSopenharmony_ci
6702e5b6d6dSopenharmony_ci        /* write empty lines between the previous line in the group and this one */
6712e5b6d6dSopenharmony_ci        while(++outLine<inLine) {
6722e5b6d6dSopenharmony_ci            appendLineLength(0);
6732e5b6d6dSopenharmony_ci        }
6742e5b6d6dSopenharmony_ci
6752e5b6d6dSopenharmony_ci        /* write characters and tokens for this line */
6762e5b6d6dSopenharmony_ci        appendLineLength(compressLine(line->s, line->length, &groupTop));
6772e5b6d6dSopenharmony_ci    }
6782e5b6d6dSopenharmony_ci
6792e5b6d6dSopenharmony_ci    /* finish and store the last group */
6802e5b6d6dSopenharmony_ci    if(line && groupMSB!=0xffff) {
6812e5b6d6dSopenharmony_ci        /* finish the current group with empty lines */
6822e5b6d6dSopenharmony_ci        while((++outLine&GROUP_MASK)!=0) {
6832e5b6d6dSopenharmony_ci            appendLineLength(0);
6842e5b6d6dSopenharmony_ci        }
6852e5b6d6dSopenharmony_ci
6862e5b6d6dSopenharmony_ci        /* store the group like a line */
6872e5b6d6dSopenharmony_ci        if(groupTop>0) {
6882e5b6d6dSopenharmony_ci            if(groupTop>GROUP_STORE_SIZE) {
6892e5b6d6dSopenharmony_ci                fprintf(stderr, "gennames: group store overflow\n");
6902e5b6d6dSopenharmony_ci                exit(U_BUFFER_OVERFLOW_ERROR);
6912e5b6d6dSopenharmony_ci            }
6922e5b6d6dSopenharmony_ci            addGroup(groupMSB, groupStore, groupTop);
6932e5b6d6dSopenharmony_ci        }
6942e5b6d6dSopenharmony_ci    }
6952e5b6d6dSopenharmony_ci
6962e5b6d6dSopenharmony_ci    if(!beQuiet) {
6972e5b6d6dSopenharmony_ci        printf("number of groups: %lu\n", (unsigned long)lineCount);
6982e5b6d6dSopenharmony_ci    }
6992e5b6d6dSopenharmony_ci}
7002e5b6d6dSopenharmony_ci
7012e5b6d6dSopenharmony_cistatic int16_t
7022e5b6d6dSopenharmony_cicompressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
7032e5b6d6dSopenharmony_ci    int16_t start, limit, token, groupTop=*pGroupTop;
7042e5b6d6dSopenharmony_ci
7052e5b6d6dSopenharmony_ci    start=0;
7062e5b6d6dSopenharmony_ci    do {
7072e5b6d6dSopenharmony_ci        /* write any "noise" characters */
7082e5b6d6dSopenharmony_ci        limit=skipNoise((char *)s, start, length);
7092e5b6d6dSopenharmony_ci        while(start<limit) {
7102e5b6d6dSopenharmony_ci            groupStore[groupTop++]=s[start++];
7112e5b6d6dSopenharmony_ci        }
7122e5b6d6dSopenharmony_ci
7132e5b6d6dSopenharmony_ci        if(start==length) {
7142e5b6d6dSopenharmony_ci            break;
7152e5b6d6dSopenharmony_ci        }
7162e5b6d6dSopenharmony_ci
7172e5b6d6dSopenharmony_ci        /* write a word, as token or directly */
7182e5b6d6dSopenharmony_ci        limit=getWord((char *)s, start, length);
7192e5b6d6dSopenharmony_ci        if(limit-start==1) {
7202e5b6d6dSopenharmony_ci            groupStore[groupTop++]=s[start++];
7212e5b6d6dSopenharmony_ci        } else {
7222e5b6d6dSopenharmony_ci            token=findToken(s+start, (int16_t)(limit-start));
7232e5b6d6dSopenharmony_ci            if(token!=-1) {
7242e5b6d6dSopenharmony_ci                if(token>0xff) {
7252e5b6d6dSopenharmony_ci                    groupStore[groupTop++]=(uint8_t)(token>>8);
7262e5b6d6dSopenharmony_ci                }
7272e5b6d6dSopenharmony_ci                groupStore[groupTop++]=(uint8_t)token;
7282e5b6d6dSopenharmony_ci                start=limit;
7292e5b6d6dSopenharmony_ci            } else {
7302e5b6d6dSopenharmony_ci                while(start<limit) {
7312e5b6d6dSopenharmony_ci                    groupStore[groupTop++]=s[start++];
7322e5b6d6dSopenharmony_ci                }
7332e5b6d6dSopenharmony_ci            }
7342e5b6d6dSopenharmony_ci        }
7352e5b6d6dSopenharmony_ci    } while(start<length);
7362e5b6d6dSopenharmony_ci
7372e5b6d6dSopenharmony_ci    length=(int16_t)(groupTop-*pGroupTop);
7382e5b6d6dSopenharmony_ci    *pGroupTop=groupTop;
7392e5b6d6dSopenharmony_ci    return length;
7402e5b6d6dSopenharmony_ci}
7412e5b6d6dSopenharmony_ci
7422e5b6d6dSopenharmony_cistatic int32_t
7432e5b6d6dSopenharmony_cicompareWords(const void *context, const void *word1, const void *word2) {
7442e5b6d6dSopenharmony_ci    /* reverse sort by word weight */
7452e5b6d6dSopenharmony_ci    return ((Word *)word2)->weight-((Word *)word1)->weight;
7462e5b6d6dSopenharmony_ci}
7472e5b6d6dSopenharmony_ci
7482e5b6d6dSopenharmony_civoid
7492e5b6d6dSopenharmony_ciNamesPropsBuilder::build(UErrorCode &errorCode) {
7502e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return; }
7512e5b6d6dSopenharmony_ci
7522e5b6d6dSopenharmony_ci    if(!beQuiet) {
7532e5b6d6dSopenharmony_ci        puts("* unames.icu stats *");
7542e5b6d6dSopenharmony_ci        printf("size of all names in the database: %lu\n",
7552e5b6d6dSopenharmony_ci            (unsigned long)lineTop);
7562e5b6d6dSopenharmony_ci        printf("number of named Unicode characters: %lu\n",
7572e5b6d6dSopenharmony_ci            (unsigned long)lineCount);
7582e5b6d6dSopenharmony_ci        printf("number of words in the dictionary from these names: %lu\n",
7592e5b6d6dSopenharmony_ci            (unsigned long)wordCount);
7602e5b6d6dSopenharmony_ci    }
7612e5b6d6dSopenharmony_ci    compress(errorCode);
7622e5b6d6dSopenharmony_ci}
7632e5b6d6dSopenharmony_ci
7642e5b6d6dSopenharmony_ci/* generate output data ----------------------------------------------------- */
7652e5b6d6dSopenharmony_ci
7662e5b6d6dSopenharmony_civoid
7672e5b6d6dSopenharmony_ciNamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
7682e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return; }
7692e5b6d6dSopenharmony_ci
7702e5b6d6dSopenharmony_ci    UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo,
7712e5b6d6dSopenharmony_ci                                       withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
7722e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) {
7732e5b6d6dSopenharmony_ci        fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n",
7742e5b6d6dSopenharmony_ci                path, u_errorName(errorCode));
7752e5b6d6dSopenharmony_ci        return;
7762e5b6d6dSopenharmony_ci    }
7772e5b6d6dSopenharmony_ci
7782e5b6d6dSopenharmony_ci    uint16_t groupWords[3];
7792e5b6d6dSopenharmony_ci    uint32_t i, groupTop=lineTop, size,
7802e5b6d6dSopenharmony_ci             tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
7812e5b6d6dSopenharmony_ci    long dataLength;
7822e5b6d6dSopenharmony_ci    int16_t token;
7832e5b6d6dSopenharmony_ci
7842e5b6d6dSopenharmony_ci    /* first, see how much space we need, and prepare the token strings */
7852e5b6d6dSopenharmony_ci    for(i=0; i<tokenCount; ++i) {
7862e5b6d6dSopenharmony_ci        token=tokens[i];
7872e5b6d6dSopenharmony_ci        if(token!=-1 && token!=-2) {
7882e5b6d6dSopenharmony_ci            tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
7892e5b6d6dSopenharmony_ci        }
7902e5b6d6dSopenharmony_ci    }
7912e5b6d6dSopenharmony_ci
7922e5b6d6dSopenharmony_ci    /*
7932e5b6d6dSopenharmony_ci     * Required padding for data swapping:
7942e5b6d6dSopenharmony_ci     * The token table undergoes a permutation during data swapping when the
7952e5b6d6dSopenharmony_ci     * input and output charsets are different.
7962e5b6d6dSopenharmony_ci     * The token table cannot grow during swapping, so we need to make sure that
7972e5b6d6dSopenharmony_ci     * the table is long enough for successful in-place permutation.
7982e5b6d6dSopenharmony_ci     *
7992e5b6d6dSopenharmony_ci     * We simply round up tokenCount to the next multiple of 256 to account for
8002e5b6d6dSopenharmony_ci     * all possible permutations.
8012e5b6d6dSopenharmony_ci     *
8022e5b6d6dSopenharmony_ci     * An optimization is possible if we only ever swap between ASCII and EBCDIC:
8032e5b6d6dSopenharmony_ci     *
8042e5b6d6dSopenharmony_ci     * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
8052e5b6d6dSopenharmony_ci     * and will be swapped between ASCII and EBCDIC between
8062e5b6d6dSopenharmony_ci     * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
8072e5b6d6dSopenharmony_ci     * This should be the only -1 entry in tokens[256..511] on which the data
8082e5b6d6dSopenharmony_ci     * swapper bases its trail byte permutation map (trailMap[]).
8092e5b6d6dSopenharmony_ci     *
8102e5b6d6dSopenharmony_ci     * It would be sufficient to increase tokenCount so that its lower 8 bits
8112e5b6d6dSopenharmony_ci     * are at least 0x5e+1 to make room for swapping between the two semicolons.
8122e5b6d6dSopenharmony_ci     * For values higher than 0x5e, the trail byte permutation map (trailMap[])
8132e5b6d6dSopenharmony_ci     * should always be an identity map, where we do not need additional room.
8142e5b6d6dSopenharmony_ci     */
8152e5b6d6dSopenharmony_ci    i=tokenCount;
8162e5b6d6dSopenharmony_ci    tokenCount=(tokenCount+0xff)&~0xff;
8172e5b6d6dSopenharmony_ci    if(!beQuiet && i<tokenCount) {
8182e5b6d6dSopenharmony_ci        printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
8192e5b6d6dSopenharmony_ci    }
8202e5b6d6dSopenharmony_ci    for(; i<tokenCount; ++i) {
8212e5b6d6dSopenharmony_ci        if((i&0xff)==NAME_SEPARATOR_CHAR) {
8222e5b6d6dSopenharmony_ci            tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
8232e5b6d6dSopenharmony_ci        } else {
8242e5b6d6dSopenharmony_ci            tokens[i]=0; /* unused token for padding */
8252e5b6d6dSopenharmony_ci        }
8262e5b6d6dSopenharmony_ci    }
8272e5b6d6dSopenharmony_ci
8282e5b6d6dSopenharmony_ci    /*
8292e5b6d6dSopenharmony_ci     * Calculate the total size in bytes of the data including:
8302e5b6d6dSopenharmony_ci     * - the offset to the token strings, uint32_t (4)
8312e5b6d6dSopenharmony_ci     * - the offset to the group table, uint32_t (4)
8322e5b6d6dSopenharmony_ci     * - the offset to the group strings, uint32_t (4)
8332e5b6d6dSopenharmony_ci     * - the offset to the algorithmic names, uint32_t (4)
8342e5b6d6dSopenharmony_ci     *
8352e5b6d6dSopenharmony_ci     * - the number of tokens, uint16_t (2)
8362e5b6d6dSopenharmony_ci     * - the token table, uint16_t[tokenCount] (2*tokenCount)
8372e5b6d6dSopenharmony_ci     *
8382e5b6d6dSopenharmony_ci     * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
8392e5b6d6dSopenharmony_ci     *
8402e5b6d6dSopenharmony_ci     * - the number of groups, uint16_t (2)
8412e5b6d6dSopenharmony_ci     * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
8422e5b6d6dSopenharmony_ci     *
8432e5b6d6dSopenharmony_ci     * - the group strings (groupTop-groupBottom), 2-padded
8442e5b6d6dSopenharmony_ci     *
8452e5b6d6dSopenharmony_ci     * - the size of the data for the algorithmic names
8462e5b6d6dSopenharmony_ci     */
8472e5b6d6dSopenharmony_ci    tokenStringOffset=4+4+4+4+2+2*tokenCount;
8482e5b6d6dSopenharmony_ci    groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
8492e5b6d6dSopenharmony_ci    groupStringOffset=groupsOffset+2+6*lineCount;
8502e5b6d6dSopenharmony_ci    algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
8512e5b6d6dSopenharmony_ci
8522e5b6d6dSopenharmony_ci    size=algNamesOffset+4+algRanges.length();
8532e5b6d6dSopenharmony_ci
8542e5b6d6dSopenharmony_ci    if(!beQuiet) {
8552e5b6d6dSopenharmony_ci        printf("size of the Unicode Names data:\n"
8562e5b6d6dSopenharmony_ci               "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
8572e5b6d6dSopenharmony_ci                (unsigned long)size, (unsigned long)(lineTop-groupTop),
8582e5b6d6dSopenharmony_ci                (unsigned long)(groupTop-groupBottom), (unsigned long)(4+algRanges.length()));
8592e5b6d6dSopenharmony_ci    }
8602e5b6d6dSopenharmony_ci
8612e5b6d6dSopenharmony_ci    /* write the data to the file */
8622e5b6d6dSopenharmony_ci    /* offsets */
8632e5b6d6dSopenharmony_ci    udata_write32(pData, tokenStringOffset);
8642e5b6d6dSopenharmony_ci    udata_write32(pData, groupsOffset);
8652e5b6d6dSopenharmony_ci    udata_write32(pData, groupStringOffset);
8662e5b6d6dSopenharmony_ci    udata_write32(pData, algNamesOffset);
8672e5b6d6dSopenharmony_ci
8682e5b6d6dSopenharmony_ci    /* token table */
8692e5b6d6dSopenharmony_ci    udata_write16(pData, (uint16_t)tokenCount);
8702e5b6d6dSopenharmony_ci    udata_writeBlock(pData, tokens, 2*tokenCount);
8712e5b6d6dSopenharmony_ci
8722e5b6d6dSopenharmony_ci    /* token strings */
8732e5b6d6dSopenharmony_ci    udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
8742e5b6d6dSopenharmony_ci    if((lineTop-groupTop)&1) {
8752e5b6d6dSopenharmony_ci        /* 2-padding */
8762e5b6d6dSopenharmony_ci        udata_writePadding(pData, 1);
8772e5b6d6dSopenharmony_ci    }
8782e5b6d6dSopenharmony_ci
8792e5b6d6dSopenharmony_ci    /* group table */
8802e5b6d6dSopenharmony_ci    udata_write16(pData, (uint16_t)lineCount);
8812e5b6d6dSopenharmony_ci    for(i=0; i<lineCount; ++i) {
8822e5b6d6dSopenharmony_ci        /* groupMSB */
8832e5b6d6dSopenharmony_ci        groupWords[0]=(uint16_t)lines[i].code;
8842e5b6d6dSopenharmony_ci
8852e5b6d6dSopenharmony_ci        /* offset */
8862e5b6d6dSopenharmony_ci        uint32_t offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
8872e5b6d6dSopenharmony_ci        groupWords[1]=(uint16_t)(offset>>16);
8882e5b6d6dSopenharmony_ci        groupWords[2]=(uint16_t)(offset);
8892e5b6d6dSopenharmony_ci        udata_writeBlock(pData, groupWords, 6);
8902e5b6d6dSopenharmony_ci    }
8912e5b6d6dSopenharmony_ci
8922e5b6d6dSopenharmony_ci    /* group strings */
8932e5b6d6dSopenharmony_ci    udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
8942e5b6d6dSopenharmony_ci
8952e5b6d6dSopenharmony_ci    /* 4-align the algorithmic names data */
8962e5b6d6dSopenharmony_ci    udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
8972e5b6d6dSopenharmony_ci
8982e5b6d6dSopenharmony_ci    udata_write32(pData, countAlgRanges);
8992e5b6d6dSopenharmony_ci    udata_writeBlock(pData, algRanges.data(), algRanges.length());
9002e5b6d6dSopenharmony_ci
9012e5b6d6dSopenharmony_ci    /* finish up */
9022e5b6d6dSopenharmony_ci    dataLength=udata_finish(pData, &errorCode);
9032e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) {
9042e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
9052e5b6d6dSopenharmony_ci        exit(errorCode);
9062e5b6d6dSopenharmony_ci    }
9072e5b6d6dSopenharmony_ci
9082e5b6d6dSopenharmony_ci    if(dataLength!=(long)size) {
9092e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
9102e5b6d6dSopenharmony_cidataLength, (unsigned long)size);
9112e5b6d6dSopenharmony_ci        exit(U_INTERNAL_PROGRAM_ERROR);
9122e5b6d6dSopenharmony_ci    }
9132e5b6d6dSopenharmony_ci}
9142e5b6d6dSopenharmony_ci
9152e5b6d6dSopenharmony_ci/* helpers ------------------------------------------------------------------ */
9162e5b6d6dSopenharmony_ci
9172e5b6d6dSopenharmony_cistatic int16_t
9182e5b6d6dSopenharmony_cifindToken(uint8_t *s, int16_t length) {
9192e5b6d6dSopenharmony_ci    int16_t i, token;
9202e5b6d6dSopenharmony_ci
9212e5b6d6dSopenharmony_ci    for(i=0; i<(int16_t)tokenCount; ++i) {
9222e5b6d6dSopenharmony_ci        token=tokens[i];
9232e5b6d6dSopenharmony_ci        if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
9242e5b6d6dSopenharmony_ci            return i;
9252e5b6d6dSopenharmony_ci        }
9262e5b6d6dSopenharmony_ci    }
9272e5b6d6dSopenharmony_ci
9282e5b6d6dSopenharmony_ci    return -1;
9292e5b6d6dSopenharmony_ci}
9302e5b6d6dSopenharmony_ci
9312e5b6d6dSopenharmony_cistatic Word *
9322e5b6d6dSopenharmony_cifindWord(const char *s, int16_t length) {
9332e5b6d6dSopenharmony_ci    uint32_t i;
9342e5b6d6dSopenharmony_ci
9352e5b6d6dSopenharmony_ci    for(i=0; i<wordCount; ++i) {
9362e5b6d6dSopenharmony_ci        if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
9372e5b6d6dSopenharmony_ci            return words+i;
9382e5b6d6dSopenharmony_ci        }
9392e5b6d6dSopenharmony_ci    }
9402e5b6d6dSopenharmony_ci
9412e5b6d6dSopenharmony_ci    return NULL;
9422e5b6d6dSopenharmony_ci}
9432e5b6d6dSopenharmony_ci
9442e5b6d6dSopenharmony_cistatic Word *
9452e5b6d6dSopenharmony_ciaddWord(const char *s, int16_t length) {
9462e5b6d6dSopenharmony_ci    uint8_t *stringStart;
9472e5b6d6dSopenharmony_ci    Word *word;
9482e5b6d6dSopenharmony_ci
9492e5b6d6dSopenharmony_ci    if(wordCount==MAX_WORD_COUNT) {
9502e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames: too many words\n");
9512e5b6d6dSopenharmony_ci        exit(U_BUFFER_OVERFLOW_ERROR);
9522e5b6d6dSopenharmony_ci    }
9532e5b6d6dSopenharmony_ci
9542e5b6d6dSopenharmony_ci    stringStart=allocWord(length);
9552e5b6d6dSopenharmony_ci    uprv_memcpy(stringStart, s, length);
9562e5b6d6dSopenharmony_ci
9572e5b6d6dSopenharmony_ci    word=words+wordCount;
9582e5b6d6dSopenharmony_ci
9592e5b6d6dSopenharmony_ci    /*
9602e5b6d6dSopenharmony_ci     * Initialize the weight with the costs for this token:
9612e5b6d6dSopenharmony_ci     * a zero-terminated string and a 16-bit offset.
9622e5b6d6dSopenharmony_ci     */
9632e5b6d6dSopenharmony_ci    word->weight=-(length+1+2);
9642e5b6d6dSopenharmony_ci    word->count=0;
9652e5b6d6dSopenharmony_ci    word->length=length;
9662e5b6d6dSopenharmony_ci    word->s=stringStart;
9672e5b6d6dSopenharmony_ci
9682e5b6d6dSopenharmony_ci    ++wordCount;
9692e5b6d6dSopenharmony_ci
9702e5b6d6dSopenharmony_ci    return word;
9712e5b6d6dSopenharmony_ci}
9722e5b6d6dSopenharmony_ci
9732e5b6d6dSopenharmony_cistatic void
9742e5b6d6dSopenharmony_cicountWord(Word *word) {
9752e5b6d6dSopenharmony_ci    /* add to the weight the savings: the length of the word minus 1 byte for the token */
9762e5b6d6dSopenharmony_ci    word->weight+=word->length-1;
9772e5b6d6dSopenharmony_ci    ++word->count;
9782e5b6d6dSopenharmony_ci}
9792e5b6d6dSopenharmony_ci
9802e5b6d6dSopenharmony_cistatic void
9812e5b6d6dSopenharmony_ciaddLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count) {
9822e5b6d6dSopenharmony_ci    uint8_t *stringStart;
9832e5b6d6dSopenharmony_ci    Line *line;
9842e5b6d6dSopenharmony_ci    int16_t i, length;
9852e5b6d6dSopenharmony_ci
9862e5b6d6dSopenharmony_ci    if(lineCount==MAX_LINE_COUNT) {
9872e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames: too many lines\n");
9882e5b6d6dSopenharmony_ci        exit(U_BUFFER_OVERFLOW_ERROR);
9892e5b6d6dSopenharmony_ci    }
9902e5b6d6dSopenharmony_ci
9912e5b6d6dSopenharmony_ci    /* find the last non-empty name */
9922e5b6d6dSopenharmony_ci    while(count>0 && lengths[count-1]==0) {
9932e5b6d6dSopenharmony_ci        --count;
9942e5b6d6dSopenharmony_ci    }
9952e5b6d6dSopenharmony_ci    if(count==0) {
9962e5b6d6dSopenharmony_ci        return; /* should not occur: caller should not have called */
9972e5b6d6dSopenharmony_ci    }
9982e5b6d6dSopenharmony_ci
9992e5b6d6dSopenharmony_ci    /* there will be (count-1) separator characters */
10002e5b6d6dSopenharmony_ci    i=count;
10012e5b6d6dSopenharmony_ci    length=count-1;
10022e5b6d6dSopenharmony_ci
10032e5b6d6dSopenharmony_ci    /* add lengths of strings */
10042e5b6d6dSopenharmony_ci    while(i>0) {
10052e5b6d6dSopenharmony_ci        length+=lengths[--i];
10062e5b6d6dSopenharmony_ci    }
10072e5b6d6dSopenharmony_ci
10082e5b6d6dSopenharmony_ci    /* allocate line memory */
10092e5b6d6dSopenharmony_ci    stringStart=allocLine(length);
10102e5b6d6dSopenharmony_ci
10112e5b6d6dSopenharmony_ci    /* copy all strings into the line memory */
10122e5b6d6dSopenharmony_ci    length=0; /* number of chars copied so far */
10132e5b6d6dSopenharmony_ci    for(i=0; i<count; ++i) {
10142e5b6d6dSopenharmony_ci        if(i>0) {
10152e5b6d6dSopenharmony_ci            stringStart[length++]=NAME_SEPARATOR_CHAR;
10162e5b6d6dSopenharmony_ci        }
10172e5b6d6dSopenharmony_ci        if(lengths[i]>0) {
10182e5b6d6dSopenharmony_ci            uprv_memcpy(stringStart+length, names[i], lengths[i]);
10192e5b6d6dSopenharmony_ci            length+=lengths[i];
10202e5b6d6dSopenharmony_ci        }
10212e5b6d6dSopenharmony_ci    }
10222e5b6d6dSopenharmony_ci
10232e5b6d6dSopenharmony_ci    line=lines+lineCount;
10242e5b6d6dSopenharmony_ci
10252e5b6d6dSopenharmony_ci    line->code=code;
10262e5b6d6dSopenharmony_ci    line->length=length;
10272e5b6d6dSopenharmony_ci    line->s=stringStart;
10282e5b6d6dSopenharmony_ci
10292e5b6d6dSopenharmony_ci    ++lineCount;
10302e5b6d6dSopenharmony_ci
10312e5b6d6dSopenharmony_ci    /* prevent a character value that is actually in a name from becoming a token */
10322e5b6d6dSopenharmony_ci    while(length>0) {
10332e5b6d6dSopenharmony_ci        tokens[stringStart[--length]]=-1;
10342e5b6d6dSopenharmony_ci    }
10352e5b6d6dSopenharmony_ci}
10362e5b6d6dSopenharmony_ci
10372e5b6d6dSopenharmony_cistatic void
10382e5b6d6dSopenharmony_ciaddGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
10392e5b6d6dSopenharmony_ci    uint8_t *stringStart;
10402e5b6d6dSopenharmony_ci    Line *line;
10412e5b6d6dSopenharmony_ci
10422e5b6d6dSopenharmony_ci    if(lineCount==MAX_LINE_COUNT) {
10432e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames: too many groups\n");
10442e5b6d6dSopenharmony_ci        exit(U_BUFFER_OVERFLOW_ERROR);
10452e5b6d6dSopenharmony_ci    }
10462e5b6d6dSopenharmony_ci
10472e5b6d6dSopenharmony_ci    /* store the line lengths first, then the strings */
10482e5b6d6dSopenharmony_ci    lineLengthsTop=(lineLengthsTop+1)/2;
10492e5b6d6dSopenharmony_ci    stringStart=allocLine(lineLengthsTop+length);
10502e5b6d6dSopenharmony_ci    uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
10512e5b6d6dSopenharmony_ci    uprv_memcpy(stringStart+lineLengthsTop, strings, length);
10522e5b6d6dSopenharmony_ci
10532e5b6d6dSopenharmony_ci    line=lines+lineCount;
10542e5b6d6dSopenharmony_ci
10552e5b6d6dSopenharmony_ci    line->code=groupMSB;
10562e5b6d6dSopenharmony_ci    line->length=length;
10572e5b6d6dSopenharmony_ci    line->s=stringStart;
10582e5b6d6dSopenharmony_ci
10592e5b6d6dSopenharmony_ci    ++lineCount;
10602e5b6d6dSopenharmony_ci}
10612e5b6d6dSopenharmony_ci
10622e5b6d6dSopenharmony_cistatic uint32_t
10632e5b6d6dSopenharmony_ciaddToken(uint8_t *s, int16_t length) {
10642e5b6d6dSopenharmony_ci    uint8_t *stringStart;
10652e5b6d6dSopenharmony_ci
10662e5b6d6dSopenharmony_ci    stringStart=allocLine(length+1);
10672e5b6d6dSopenharmony_ci    uprv_memcpy(stringStart, s, length);
10682e5b6d6dSopenharmony_ci    stringStart[length]=0;
10692e5b6d6dSopenharmony_ci
10702e5b6d6dSopenharmony_ci    return (uint32_t)(stringStart - stringStore);
10712e5b6d6dSopenharmony_ci}
10722e5b6d6dSopenharmony_ci
10732e5b6d6dSopenharmony_cistatic void
10742e5b6d6dSopenharmony_ciappendLineLength(int16_t length) {
10752e5b6d6dSopenharmony_ci    if(length>=76) {
10762e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames: compressed line too long\n");
10772e5b6d6dSopenharmony_ci        exit(U_BUFFER_OVERFLOW_ERROR);
10782e5b6d6dSopenharmony_ci    }
10792e5b6d6dSopenharmony_ci    if(length>=12) {
10802e5b6d6dSopenharmony_ci        length-=12;
10812e5b6d6dSopenharmony_ci        appendLineLengthNibble((uint8_t)((length>>4)|12));
10822e5b6d6dSopenharmony_ci    }
10832e5b6d6dSopenharmony_ci    appendLineLengthNibble((uint8_t)length);
10842e5b6d6dSopenharmony_ci}
10852e5b6d6dSopenharmony_ci
10862e5b6d6dSopenharmony_cistatic void
10872e5b6d6dSopenharmony_ciappendLineLengthNibble(uint8_t nibble) {
10882e5b6d6dSopenharmony_ci    if((lineLengthsTop&1)==0) {
10892e5b6d6dSopenharmony_ci        lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
10902e5b6d6dSopenharmony_ci    } else {
10912e5b6d6dSopenharmony_ci        lineLengths[lineLengthsTop/2]|=nibble&0xf;
10922e5b6d6dSopenharmony_ci    }
10932e5b6d6dSopenharmony_ci    ++lineLengthsTop;
10942e5b6d6dSopenharmony_ci}
10952e5b6d6dSopenharmony_ci
10962e5b6d6dSopenharmony_cistatic uint8_t *
10972e5b6d6dSopenharmony_ciallocLine(int32_t length) {
10982e5b6d6dSopenharmony_ci    uint32_t top=lineTop+length;
10992e5b6d6dSopenharmony_ci    uint8_t *p;
11002e5b6d6dSopenharmony_ci
11012e5b6d6dSopenharmony_ci    if(top>wordBottom) {
11022e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames allocLine(): out of memory\n");
11032e5b6d6dSopenharmony_ci        exit(U_MEMORY_ALLOCATION_ERROR);
11042e5b6d6dSopenharmony_ci    }
11052e5b6d6dSopenharmony_ci    p=stringStore+lineTop;
11062e5b6d6dSopenharmony_ci    lineTop=top;
11072e5b6d6dSopenharmony_ci    return p;
11082e5b6d6dSopenharmony_ci}
11092e5b6d6dSopenharmony_ci
11102e5b6d6dSopenharmony_cistatic uint8_t *
11112e5b6d6dSopenharmony_ciallocWord(uint32_t length) {
11122e5b6d6dSopenharmony_ci    uint32_t bottom=wordBottom-length;
11132e5b6d6dSopenharmony_ci
11142e5b6d6dSopenharmony_ci    if(lineTop>bottom) {
11152e5b6d6dSopenharmony_ci        fprintf(stderr, "gennames allocWord(): out of memory\n");
11162e5b6d6dSopenharmony_ci        exit(U_MEMORY_ALLOCATION_ERROR);
11172e5b6d6dSopenharmony_ci    }
11182e5b6d6dSopenharmony_ci    wordBottom=bottom;
11192e5b6d6dSopenharmony_ci    return stringStore+bottom;
11202e5b6d6dSopenharmony_ci}
11212e5b6d6dSopenharmony_ci
11222e5b6d6dSopenharmony_ciPropsBuilder *
11232e5b6d6dSopenharmony_cicreateNamesPropsBuilder(UErrorCode &errorCode) {
11242e5b6d6dSopenharmony_ci    if(U_FAILURE(errorCode)) { return NULL; }
11252e5b6d6dSopenharmony_ci    PropsBuilder *pb=new NamesPropsBuilder(errorCode);
11262e5b6d6dSopenharmony_ci    if(pb==NULL) {
11272e5b6d6dSopenharmony_ci        errorCode=U_MEMORY_ALLOCATION_ERROR;
11282e5b6d6dSopenharmony_ci    }
11292e5b6d6dSopenharmony_ci    return pb;
11302e5b6d6dSopenharmony_ci}
11312e5b6d6dSopenharmony_ci
11322e5b6d6dSopenharmony_ci/*
11332e5b6d6dSopenharmony_ci * Hey, Emacs, please set the following:
11342e5b6d6dSopenharmony_ci *
11352e5b6d6dSopenharmony_ci * Local Variables:
11362e5b6d6dSopenharmony_ci * indent-tabs-mode: nil
11372e5b6d6dSopenharmony_ci * End:
11382e5b6d6dSopenharmony_ci *
11392e5b6d6dSopenharmony_ci */
1140