1// © 2017 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  genuts46.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010mar02
14*   created by: Markus W. Scherer
15*
16* quick & dirty tool to recreate the UTS #46 data table according to the spec
17*/
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <string>
22#include <string.h>
23#include "unicode/utypes.h"
24#include "unicode/errorcode.h"
25#include "unicode/normalizer2.h"
26#include "unicode/uniset.h"
27#include "unicode/unistr.h"
28#include "unicode/usetiter.h"
29#include "unicode/usprep.h"
30#include "sprpimpl.h"  // HACK
31
32/**
33 * icu::ErrorCode subclass for easy UErrorCode handling.
34 * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
35 */
36class ExitingErrorCode : public icu::ErrorCode {
37public:
38    /**
39     * @param loc A short string describing where the ExitingErrorCode is used.
40     */
41    ExitingErrorCode(const char *loc) : location(loc) {}
42    virtual ~ExitingErrorCode();
43protected:
44    virtual void handleFailure() const;
45private:
46    const char *location;
47};
48
49ExitingErrorCode::~ExitingErrorCode() {
50    // Safe because our handleFailure() does not throw exceptions.
51    if(isFailure()) { handleFailure(); }
52}
53
54void ExitingErrorCode::handleFailure() const {
55    fprintf(stderr, "error at %s: %s\n", location, errorName());
56    exit(errorCode);
57}
58
59static int
60toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) {
61    UChar src[2];
62    int32_t srcLength=0;
63    U16_APPEND_UNSAFE(src, srcLength, c);
64    UChar *dest;
65    int32_t destLength;
66    dest=destString.getBuffer(32);
67    if(dest==NULL) {
68        return false;
69    }
70    UErrorCode errorCode=U_ZERO_ERROR;
71    destLength=usprep_prepare(prep, src, srcLength,
72                              dest, destString.getCapacity(),
73                              USPREP_DEFAULT, NULL, &errorCode);
74    destString.releaseBuffer(destLength);
75    if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) {
76        return -1;
77    } else {
78        // Returns false=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors,
79        // true=1 if c is valid or mapped.
80        return U_SUCCESS(errorCode);
81    }
82}
83
/**
 * Status values assigned to code points in the generated table.
 * The numeric values are used as indexes into statusNames[].
 */
enum Status {
    DISALLOWED=0,
    IGNORED=1,
    MAPPED=2,
    DEVIATION=3,
    VALID=4,
    DISALLOWED_STD3_VALID=5,
    DISALLOWED_STD3_MAPPED=6
};

/** Output spelling for each Status value, in enum order. */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid",
    "disallowed_STD3_valid",
    "disallowed_STD3_mapped"
};
92
93static void
94printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
95    if(start==end) {
96        printf("%04lX          ", (long)start);
97    } else {
98        printf("%04lX..%04lX    ", (long)start, (long)end);
99    }
100    printf("; %s", statusNames[status]);
101    if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) {
102        printf(" ;");
103        const UChar *buffer=mapping.getBuffer();
104        int32_t length=mapping.length();
105        int32_t i=0;
106        UChar32 c;
107        while(i<length) {
108            U16_NEXT(buffer, i, length, c);
109            printf(" %04lX", (long)c);
110        }
111    }
112    puts("");
113}
114
115static void
116getAgeIfAssigned(UChar32 c, UVersionInfo age) {
117    if(u_isdefined(c)) {
118        u_charAge(c, age);
119    } else if(U_IS_UNICODE_NONCHAR(c)) {
120        age[0]=0;
121        age[1]=0;
122        age[2]=0;
123        age[3]=1;
124    } else {
125        memset(age, 0, 4);
126    }
127}
128
129extern int
130main(int argc, const char *argv[]) {
131    ExitingErrorCode errorCode("genuts46");
132
133    // predefined base sets
134    icu::UnicodeSet unassignedSet(UNICODE_STRING_SIMPLE("[:Cn:]"), errorCode);
135
136    icu::UnicodeSet labelSeparators(
137        UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode);
138
139    icu::UnicodeSet mappedSet(
140        UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode);
141    mappedSet.removeAll(labelSeparators);  // simplifies checking of mapped characters
142
143    icu::UnicodeSet baseValidSet(icu::UnicodeString(
144        "[[[[:^Changes_When_NFKC_Casefolded:]"
145        "-[:C:]-[:Z:]"
146        "-[:Block=Ideographic_Description_Characters:]]"
147        "[:ascii:]]-[.]]", -1, US_INV), errorCode);
148
149    // Characters that are disallowed when STD3 rules are applied,
150    // but valid when STD3 rules are not applied.
151    icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString(
152        "[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode);
153
154    icu::UnicodeSet deviationSet(
155        UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
156    errorCode.assertSuccess();
157
158    // derived sets
159    icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode));
160    const icu::Normalizer2 *nfkc_cf=
161        icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
162    errorCode.assertSuccess();
163
164    // HACK: The StringPrep API performs a BiDi check according to the data.
165    // We need to override that for this data generation, by resetting an internal flag.
166    namePrep->checkBiDi=false;
167
168    icu::UnicodeSet baseExclusionSet;
169    icu::UnicodeString cString, mapping, namePrepResult;
170    for(UChar32 c=0; c<=0x10ffff; ++c) {
171        if(c==0xd800) {
172            c=0xe000;
173        }
174        int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult);
175        if(namePrepStatus!=0) {
176            // get the UTS #46 base mapping value
177            switch(c) {
178            case 0xff0e:
179            case 0x3002:
180            case 0xff61:
181                mapping.setTo(0x2e);
182                break;
183            default:
184                cString.setTo(c);
185                nfkc_cf->normalize(cString, mapping, errorCode);
186                break;
187            }
188            if(
189                namePrepStatus>0 ?
190                    // c is valid or mapped in IDNA2003
191                    !labelSeparators.contains(c) && namePrepResult!=mapping :
192                    // namePrepStatus<0: c is prohibited in IDNA2003
193                    baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping))
194            ) {
195                baseExclusionSet.add(c);
196            }
197        }
198    }
199
200    icu::UnicodeSet disallowedSet(0, 0x10ffff);
201    disallowedSet.
202        removeAll(labelSeparators).
203        removeAll(deviationSet).
204        removeAll(mappedSet).
205        removeAll(baseValidSet).
206        addAll(baseExclusionSet).
207        addAll(unassignedSet);
208
209    const icu::Normalizer2 *nfd=
210        icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
211    errorCode.assertSuccess();
212
213    icu::UnicodeSet ignoredSet;  // will be a subset of mappedSet
214    icu::UnicodeSet removeSet;
215    icu::UnicodeString nfdString;
216    {
217        icu::UnicodeSetIterator iter(mappedSet);
218        while(iter.next()) {
219            UChar32 c=iter.getCodepoint();
220            cString.setTo(c);
221            nfkc_cf->normalize(cString, mapping, errorCode);
222            if(!baseValidSet.containsAll(mapping)) {
223                fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c);
224                disallowedSet.add(c);
225                removeSet.add(c);
226            } else if(mapping.isEmpty()) {
227                ignoredSet.add(c);
228            }
229        }
230        mappedSet.removeAll(removeSet);
231    }
232    errorCode.assertSuccess();
233
234    icu::UnicodeSet validSet(baseValidSet);
235    validSet.
236        removeAll(labelSeparators).  // non-ASCII label separators will be mapped in the end
237        removeAll(deviationSet).
238        removeAll(disallowedSet).
239        removeAll(mappedSet).
240        add(0x2e);  // not mapped, simply valid
241    UBool madeChange;
242    do {
243        madeChange=false;
244        {
245            removeSet.clear();
246            icu::UnicodeSetIterator iter(validSet);
247            while(iter.next()) {
248                UChar32 c=iter.getCodepoint();
249                if(nfd->getDecomposition(c, nfdString) && !validSet.containsAll(nfdString)) {
250                    fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c);
251                    disallowedSet.add(c);
252                    removeSet.add(c);
253                    madeChange=true;
254                }
255            }
256            validSet.removeAll(removeSet);
257        }
258        {
259            removeSet.clear();
260            icu::UnicodeSetIterator iter(mappedSet);
261            while(iter.next()) {
262                UChar32 c=iter.getCodepoint();
263                cString.setTo(c);
264                nfkc_cf->normalize(cString, mapping, errorCode);
265                nfd->normalize(mapping, nfdString, errorCode);
266                if(!validSet.containsAll(nfdString)) {
267                    fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c);
268                    disallowedSet.add(c);
269                    removeSet.add(c);
270                    madeChange=true;
271                }
272            }
273            mappedSet.removeAll(removeSet);
274        }
275    } while(madeChange);
276    errorCode.assertSuccess();
277
278    // finish up
279    labelSeparators.remove(0x2e).freeze();  // U+002E is simply valid
280    deviationSet.freeze();
281    ignoredSet.freeze();
282    validSet.freeze();
283    mappedSet.freeze();
284    disallowedSTD3Set.freeze();
285
286    // output
287    UChar32 prevStart=0, c=0;
288    Status prevStatus=DISALLOWED_STD3_VALID, status;
289    icu::UnicodeString prevMapping;
290    UVersionInfo prevAge={ 1, 1, 0, 0 }, age;
291
292    icu::UnicodeSetIterator iter(disallowedSet);
293    while(iter.nextRange()) {
294        UChar32 start=iter.getCodepoint();
295        while(c<start) {
296            mapping.remove();
297            if(labelSeparators.contains(c)) {
298                status=MAPPED;
299                mapping.setTo(0x2e);
300            } else if(deviationSet.contains(c)) {
301                status=DEVIATION;
302                cString.setTo(c);
303                nfkc_cf->normalize(cString, mapping, errorCode);
304            } else if(ignoredSet.contains(c)) {
305                status=IGNORED;
306            } else if(validSet.contains(c)) {
307                if(disallowedSTD3Set.contains(c)) {
308                    fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: itself not STD3\n", (long)c);
309                    status=DISALLOWED_STD3_VALID;
310                } else if( nfd->getDecomposition(c, nfdString) &&
311                    disallowedSTD3Set.containsSome(nfdString)
312                ) {
313                    fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: NFD not wholly STD3\n", (long)c);
314                    status=DISALLOWED_STD3_VALID;
315                } else {
316                    status=VALID;
317                }
318            } else if(mappedSet.contains(c)) {
319                cString.setTo(c);
320                nfkc_cf->normalize(cString, mapping, errorCode);
321                if(disallowedSTD3Set.containsSome(mapping)) {
322                    fprintf(stderr, "U+%04lX mapped -> disallowed_STD3_mapped\n", (long)c);
323                    status=DISALLOWED_STD3_MAPPED;
324                } else {
325                    status=MAPPED;
326                }
327            } else {
328                fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
329            }
330            // Print a new line where the status, the mapping or
331            // the character age change.
332            getAgeIfAssigned(c, age);
333            if( prevStart<c &&
334                (status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4))
335            ) {
336                printLine(prevStart, c-1, prevStatus, prevMapping);
337                prevStart=c;
338                prevStatus=status;
339                prevMapping=mapping;
340                memcpy(prevAge, age, 4);
341            }
342            ++c;
343        }
344        // c==start is disallowed
345        if(prevStart<c) {
346            printLine(prevStart, c-1, prevStatus, prevMapping);
347        }
348        prevStart=c;
349        prevStatus=DISALLOWED;
350        prevMapping.remove();
351        getAgeIfAssigned(c, prevAge);
352        UChar32 end=iter.getCodepointEnd();
353        while(++c<=end) {
354            getAgeIfAssigned(c, age);
355            if(prevStart<c && 0!=memcmp(prevAge, age, 4)) {
356                printLine(prevStart, c-1, prevStatus, prevMapping);
357                prevStart=c;
358                memcpy(prevAge, age, 4);
359            }
360        }
361    }
362    if(prevStart<c) {
363        printLine(prevStart, c-1, prevStatus, prevMapping);
364    }
365    return 0;
366}
367