1// © 2017 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2010, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* file name: genuts46.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2010mar02 14* created by: Markus W. Scherer 15* 16* quick & dirty tool to recreate the UTS #46 data table according to the spec 17*/ 18 19#include <stdio.h> 20#include <stdlib.h> 21#include <string> 22#include <string.h> 23#include "unicode/utypes.h" 24#include "unicode/errorcode.h" 25#include "unicode/normalizer2.h" 26#include "unicode/uniset.h" 27#include "unicode/unistr.h" 28#include "unicode/usetiter.h" 29#include "unicode/usprep.h" 30#include "sprpimpl.h" // HACK 31 32/** 33 * icu::ErrorCode subclass for easy UErrorCode handling. 34 * The destructor calls handleFailure() which calls exit(errorCode) when isFailure(). 35 */ 36class ExitingErrorCode : public icu::ErrorCode { 37public: 38 /** 39 * @param loc A short string describing where the ExitingErrorCode is used. 40 */ 41 ExitingErrorCode(const char *loc) : location(loc) {} 42 virtual ~ExitingErrorCode(); 43protected: 44 virtual void handleFailure() const; 45private: 46 const char *location; 47}; 48 49ExitingErrorCode::~ExitingErrorCode() { 50 // Safe because our handleFailure() does not throw exceptions. 51 if(isFailure()) { handleFailure(); } 52} 53 54void ExitingErrorCode::handleFailure() const { 55 fprintf(stderr, "error at %s: %s\n", location, errorName()); 56 exit(errorCode); 57} 58 59static int 60toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) { 61 UChar src[2]; 62 int32_t srcLength=0; 63 U16_APPEND_UNSAFE(src, srcLength, c); 64 UChar *dest; 65 int32_t destLength; 66 dest=destString.getBuffer(32); 67 if(dest==NULL) { 68 return false; 69 } 70 UErrorCode errorCode=U_ZERO_ERROR; 71 destLength=usprep_prepare(prep, src, srcLength, 72 dest, destString.getCapacity(), 73 USPREP_DEFAULT, NULL, &errorCode); 74 destString.releaseBuffer(destLength); 75 if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) { 76 return -1; 77 } else { 78 // Returns false=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors, 79 // true=1 if c is valid or mapped. 80 return U_SUCCESS(errorCode); 81 } 82} 83 84enum Status { 85 DISALLOWED, IGNORED, MAPPED, DEVIATION, VALID, 86 DISALLOWED_STD3_VALID, DISALLOWED_STD3_MAPPED 87}; 88static const char *const statusNames[]={ 89 "disallowed", "ignored", "mapped", "deviation", "valid", 90 "disallowed_STD3_valid", "disallowed_STD3_mapped" 91}; 92 93static void 94printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) { 95 if(start==end) { 96 printf("%04lX ", (long)start); 97 } else { 98 printf("%04lX..%04lX ", (long)start, (long)end); 99 } 100 printf("; %s", statusNames[status]); 101 if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) { 102 printf(" ;"); 103 const UChar *buffer=mapping.getBuffer(); 104 int32_t length=mapping.length(); 105 int32_t i=0; 106 UChar32 c; 107 while(i<length) { 108 U16_NEXT(buffer, i, length, c); 109 printf(" %04lX", (long)c); 110 } 111 } 112 puts(""); 113} 114 115static void 116getAgeIfAssigned(UChar32 c, UVersionInfo age) { 117 if(u_isdefined(c)) { 118 u_charAge(c, age); 119 } else if(U_IS_UNICODE_NONCHAR(c)) { 120 age[0]=0; 121 age[1]=0; 122 age[2]=0; 123 age[3]=1; 124 } else { 125 memset(age, 0, 4); 126 } 127} 128 129extern int 130main(int argc, const char *argv[]) { 131 ExitingErrorCode errorCode("genuts46"); 132 133 // predefined base sets 134 icu::UnicodeSet unassignedSet(UNICODE_STRING_SIMPLE("[:Cn:]"), errorCode); 135 136 icu::UnicodeSet labelSeparators( 137 UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode); 138 139 icu::UnicodeSet mappedSet( 140 UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode); 141 mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters 142 143 icu::UnicodeSet baseValidSet(icu::UnicodeString( 144 "[[[[:^Changes_When_NFKC_Casefolded:]" 145 "-[:C:]-[:Z:]" 146 "-[:Block=Ideographic_Description_Characters:]]" 147 "[:ascii:]]-[.]]", -1, US_INV), errorCode); 148 149 // Characters that are disallowed when STD3 rules are applied, 150 // but valid when STD3 rules are not applied. 151 icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString( 152 "[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode); 153 154 icu::UnicodeSet deviationSet( 155 UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode); 156 errorCode.assertSuccess(); 157 158 // derived sets 159 icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode)); 160 const icu::Normalizer2 *nfkc_cf= 161 icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode); 162 errorCode.assertSuccess(); 163 164 // HACK: The StringPrep API performs a BiDi check according to the data. 165 // We need to override that for this data generation, by resetting an internal flag. 166 namePrep->checkBiDi=false; 167 168 icu::UnicodeSet baseExclusionSet; 169 icu::UnicodeString cString, mapping, namePrepResult; 170 for(UChar32 c=0; c<=0x10ffff; ++c) { 171 if(c==0xd800) { 172 c=0xe000; 173 } 174 int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult); 175 if(namePrepStatus!=0) { 176 // get the UTS #46 base mapping value 177 switch(c) { 178 case 0xff0e: 179 case 0x3002: 180 case 0xff61: 181 mapping.setTo(0x2e); 182 break; 183 default: 184 cString.setTo(c); 185 nfkc_cf->normalize(cString, mapping, errorCode); 186 break; 187 } 188 if( 189 namePrepStatus>0 ? 190 // c is valid or mapped in IDNA2003 191 !labelSeparators.contains(c) && namePrepResult!=mapping : 192 // namePrepStatus<0: c is prohibited in IDNA2003 193 baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping)) 194 ) { 195 baseExclusionSet.add(c); 196 } 197 } 198 } 199 200 icu::UnicodeSet disallowedSet(0, 0x10ffff); 201 disallowedSet. 202 removeAll(labelSeparators). 203 removeAll(deviationSet). 204 removeAll(mappedSet). 205 removeAll(baseValidSet). 206 addAll(baseExclusionSet). 207 addAll(unassignedSet); 208 209 const icu::Normalizer2 *nfd= 210 icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); 211 errorCode.assertSuccess(); 212 213 icu::UnicodeSet ignoredSet; // will be a subset of mappedSet 214 icu::UnicodeSet removeSet; 215 icu::UnicodeString nfdString; 216 { 217 icu::UnicodeSetIterator iter(mappedSet); 218 while(iter.next()) { 219 UChar32 c=iter.getCodepoint(); 220 cString.setTo(c); 221 nfkc_cf->normalize(cString, mapping, errorCode); 222 if(!baseValidSet.containsAll(mapping)) { 223 fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c); 224 disallowedSet.add(c); 225 removeSet.add(c); 226 } else if(mapping.isEmpty()) { 227 ignoredSet.add(c); 228 } 229 } 230 mappedSet.removeAll(removeSet); 231 } 232 errorCode.assertSuccess(); 233 234 icu::UnicodeSet validSet(baseValidSet); 235 validSet. 236 removeAll(labelSeparators). // non-ASCII label separators will be mapped in the end 237 removeAll(deviationSet). 238 removeAll(disallowedSet). 239 removeAll(mappedSet). 240 add(0x2e); // not mapped, simply valid 241 UBool madeChange; 242 do { 243 madeChange=false; 244 { 245 removeSet.clear(); 246 icu::UnicodeSetIterator iter(validSet); 247 while(iter.next()) { 248 UChar32 c=iter.getCodepoint(); 249 if(nfd->getDecomposition(c, nfdString) && !validSet.containsAll(nfdString)) { 250 fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c); 251 disallowedSet.add(c); 252 removeSet.add(c); 253 madeChange=true; 254 } 255 } 256 validSet.removeAll(removeSet); 257 } 258 { 259 removeSet.clear(); 260 icu::UnicodeSetIterator iter(mappedSet); 261 while(iter.next()) { 262 UChar32 c=iter.getCodepoint(); 263 cString.setTo(c); 264 nfkc_cf->normalize(cString, mapping, errorCode); 265 nfd->normalize(mapping, nfdString, errorCode); 266 if(!validSet.containsAll(nfdString)) { 267 fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c); 268 disallowedSet.add(c); 269 removeSet.add(c); 270 madeChange=true; 271 } 272 } 273 mappedSet.removeAll(removeSet); 274 } 275 } while(madeChange); 276 errorCode.assertSuccess(); 277 278 // finish up 279 labelSeparators.remove(0x2e).freeze(); // U+002E is simply valid 280 deviationSet.freeze(); 281 ignoredSet.freeze(); 282 validSet.freeze(); 283 mappedSet.freeze(); 284 disallowedSTD3Set.freeze(); 285 286 // output 287 UChar32 prevStart=0, c=0; 288 Status prevStatus=DISALLOWED_STD3_VALID, status; 289 icu::UnicodeString prevMapping; 290 UVersionInfo prevAge={ 1, 1, 0, 0 }, age; 291 292 icu::UnicodeSetIterator iter(disallowedSet); 293 while(iter.nextRange()) { 294 UChar32 start=iter.getCodepoint(); 295 while(c<start) { 296 mapping.remove(); 297 if(labelSeparators.contains(c)) { 298 status=MAPPED; 299 mapping.setTo(0x2e); 300 } else if(deviationSet.contains(c)) { 301 status=DEVIATION; 302 cString.setTo(c); 303 nfkc_cf->normalize(cString, mapping, errorCode); 304 } else if(ignoredSet.contains(c)) { 305 status=IGNORED; 306 } else if(validSet.contains(c)) { 307 if(disallowedSTD3Set.contains(c)) { 308 fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: itself not STD3\n", (long)c); 309 status=DISALLOWED_STD3_VALID; 310 } else if( nfd->getDecomposition(c, nfdString) && 311 disallowedSTD3Set.containsSome(nfdString) 312 ) { 313 fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: NFD not wholly STD3\n", (long)c); 314 status=DISALLOWED_STD3_VALID; 315 } else { 316 status=VALID; 317 } 318 } else if(mappedSet.contains(c)) { 319 cString.setTo(c); 320 nfkc_cf->normalize(cString, mapping, errorCode); 321 if(disallowedSTD3Set.containsSome(mapping)) { 322 fprintf(stderr, "U+%04lX mapped -> disallowed_STD3_mapped\n", (long)c); 323 status=DISALLOWED_STD3_MAPPED; 324 } else { 325 status=MAPPED; 326 } 327 } else { 328 fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c); 329 } 330 // Print a new line where the status, the mapping or 331 // the character age change. 332 getAgeIfAssigned(c, age); 333 if( prevStart<c && 334 (status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4)) 335 ) { 336 printLine(prevStart, c-1, prevStatus, prevMapping); 337 prevStart=c; 338 prevStatus=status; 339 prevMapping=mapping; 340 memcpy(prevAge, age, 4); 341 } 342 ++c; 343 } 344 // c==start is disallowed 345 if(prevStart<c) { 346 printLine(prevStart, c-1, prevStatus, prevMapping); 347 } 348 prevStart=c; 349 prevStatus=DISALLOWED; 350 prevMapping.remove(); 351 getAgeIfAssigned(c, prevAge); 352 UChar32 end=iter.getCodepointEnd(); 353 while(++c<=end) { 354 getAgeIfAssigned(c, age); 355 if(prevStart<c && 0!=memcmp(prevAge, age, 4)) { 356 printLine(prevStart, c-1, prevStatus, prevMapping); 357 prevStart=c; 358 memcpy(prevAge, age, 4); 359 } 360 } 361 } 362 if(prevStart<c) { 363 printLine(prevStart, c-1, prevStatus, prevMapping); 364 } 365 return 0; 366} 367