1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1997-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*******************************************************************************
9 *
10 * File CUCDTST.C
11 *
12 * Modification History:
13 * Name Description
14 * Madhu Katragadda Ported for C API, added tests for string functions
15 ********************************************************************************
16 */
17
18 #include <math.h>
19 #include <stdbool.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uloc.h"
28 #include "unicode/unorm2.h"
29 #include "unicode/utf16.h"
30 #include "unicode/utf_old.h"
31 #include "cintltst.h"
32 #include "putilimp.h"
33 #include "uparse.h"
34 #include "ucase.h"
35 #include "ubidi_props.h"
36 #include "uprops.h"
37 #include "uset_imp.h"
38 #include "usc_impl.h"
39 #include "udatamem.h"
40 #include "cucdapi.h"
41 #include "cmemory.h"
42
43 /* prototypes --------------------------------------------------------------- */
44
/* file-local tests, listed in the order in which addUnicodeTest() registers them */
static void TestCodeUnit(void);
static void TestCodePoint(void);
static void TestCharLength(void);
static void TestUnicodeData(void);
static void TestAdditionalProperties(void);
static void TestNumericProperties(void);
static void TestUpperLower(void);
static void TestLetterNumber(void);
static void TestMisc(void);
static void TestPOSIX(void);
static void TestControlPrint(void);
static void TestIdentifier(void);
static void TestCharNames(void);
static void TestUCharFromNameUnderflow(void);
static void TestMirroring(void);
static void TestUScriptRunAPI(void);
static void TestPropertyNames(void);
static void TestPropertyValues(void);
static void TestConsistency(void);
static void TestCaseFolding(void);
static void TestBinaryCharacterPropertiesAPI(void);
static void TestIntCharacterPropertiesAPI(void);

/* internal helpers; each maps a property string to an int32_t (definitions are further down) */
static int32_t MakeProp(char *str);
static int32_t MakeDir(char *str);
71
72 /* helpers ------------------------------------------------------------------ */
73
74 static void
parseUCDFile(const char *filename, char *fields[][2], int32_t fieldCount, UParseLineFn *lineFn, void *context, UErrorCode *pErrorCode)75 parseUCDFile(const char *filename,
76 char *fields[][2], int32_t fieldCount,
77 UParseLineFn *lineFn, void *context,
78 UErrorCode *pErrorCode) {
79 char path[512];
80 char backupPath[512];
81
82 if(U_FAILURE(*pErrorCode)) {
83 return;
84 }
85
86 /* Look inside ICU_DATA first */
87 strcpy(path, u_getDataDirectory());
88 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
89 strcat(path, filename);
90
91 /* As a fallback, try to guess where the source data was located
92 * at the time ICU was built, and look there.
93 */
94 strcpy(backupPath, ctest_dataSrcDir());
95 strcat(backupPath, U_FILE_SEP_STRING);
96 strcat(backupPath, "unidata" U_FILE_SEP_STRING);
97 strcat(backupPath, filename);
98
99 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
100 if(*pErrorCode==U_FILE_ACCESS_ERROR) {
101 *pErrorCode=U_ZERO_ERROR;
102 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
103 }
104 if(U_FAILURE(*pErrorCode)) {
105 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
106 }
107 }
108
109 /* test data ---------------------------------------------------------------- */
110
111 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
112 static const int32_t tagValues[] =
113 {
114 /* Mn */ U_NON_SPACING_MARK,
115 /* Mc */ U_COMBINING_SPACING_MARK,
116 /* Me */ U_ENCLOSING_MARK,
117 /* Nd */ U_DECIMAL_DIGIT_NUMBER,
118 /* Nl */ U_LETTER_NUMBER,
119 /* No */ U_OTHER_NUMBER,
120 /* Zs */ U_SPACE_SEPARATOR,
121 /* Zl */ U_LINE_SEPARATOR,
122 /* Zp */ U_PARAGRAPH_SEPARATOR,
123 /* Cc */ U_CONTROL_CHAR,
124 /* Cf */ U_FORMAT_CHAR,
125 /* Cs */ U_SURROGATE,
126 /* Co */ U_PRIVATE_USE_CHAR,
127 /* Cn */ U_UNASSIGNED,
128 /* Lu */ U_UPPERCASE_LETTER,
129 /* Ll */ U_LOWERCASE_LETTER,
130 /* Lt */ U_TITLECASE_LETTER,
131 /* Lm */ U_MODIFIER_LETTER,
132 /* Lo */ U_OTHER_LETTER,
133 /* Pc */ U_CONNECTOR_PUNCTUATION,
134 /* Pd */ U_DASH_PUNCTUATION,
135 /* Ps */ U_START_PUNCTUATION,
136 /* Pe */ U_END_PUNCTUATION,
137 /* Po */ U_OTHER_PUNCTUATION,
138 /* Sm */ U_MATH_SYMBOL,
139 /* Sc */ U_CURRENCY_SYMBOL,
140 /* Sk */ U_MODIFIER_SYMBOL,
141 /* So */ U_OTHER_SYMBOL,
142 /* Pi */ U_INITIAL_PUNCTUATION,
143 /* Pf */ U_FINAL_PUNCTUATION
144 };
145
/*
 * Bidi class abbreviations; each entry holds at most 4 characters plus the NUL.
 * NOTE(review): the order presumably mirrors the UCharDirection enum and is
 * consumed by MakeDir() - confirm before reordering.
 */
static const char dirStrings[][5] = {
    "L",   "R",   "EN",  "ES",  "ET",
    "AN",  "CS",  "B",   "S",   "WS",
    "ON",  "LRE", "LRO", "AL",  "RLE",
    "RLO", "PDF", "NSM", "BN",
    /* new in Unicode 6.3/ICU 52 */
    "FSI", "LRI", "RLI", "PDI"
};
172
173 void addUnicodeTest(TestNode** root);
174
addUnicodeTest(TestNode** root)175 void addUnicodeTest(TestNode** root)
176 {
177 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
178 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
179 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
180 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
181 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
182 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
183 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
184 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
185 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
186 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
187 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
188 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
189 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
190 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
191 addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
192 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
193 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
194 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
195 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
196 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
197 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
198 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
199 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
200 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
201 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
202 addTest(root, &TestBinaryCharacterPropertiesAPI,
203 "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
204 addTest(root, &TestIntCharacterPropertiesAPI,
205 "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
206 }
207
208 /*==================================================== */
209 /* test u_toupper() and u_tolower() */
210 /*==================================================== */
TestUpperLowernull211 static void TestUpperLower()
212 {
213 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
214 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
215 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
216 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
217 int32_t i;
218
219 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
220 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
221
222 /*
223 Checks LetterLike Symbols which were previously a source of confusion
224 [Bertrand A. D. 02/04/98]
225 */
226 for (i=0x2100;i<0x2138;i++)
227 {
228 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
229 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
230 {
231 if (i != (int)u_tolower(i)) /* itself */
232 log_err("Failed case conversion with itself: U+%04x\n", i);
233 if (i != (int)u_toupper(i))
234 log_err("Failed case conversion with itself: U+%04x\n", i);
235 }
236 }
237
238 for(i=0; i < u_strlen(upper); i++){
239 if(u_tolower(upper[i]) != lower[i]){
240 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
241 }
242 }
243
244 log_verbose("testing upper lower\n");
245 for (i = 0; i < 21; i++) {
246
247 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
248 {
249 log_err("Failed isLowerCase test at %c\n", upperTest[i]);
250 }
251 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
252 {
253 log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
254 }
255 else if (upperTest[i] != u_tolower(lowerTest[i]))
256 {
257 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]);
258 }
259 else if (lowerTest[i] != u_toupper(upperTest[i]))
260 {
261 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
262 }
263 else if (upperTest[i] != u_tolower(upperTest[i]))
264 {
265 log_err("Failed case conversion with itself: %c\n", upperTest[i]);
266 }
267 else if (lowerTest[i] != u_toupper(lowerTest[i]))
268 {
269 log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
270 }
271 }
272 log_verbose("done testing upper lower\n");
273
274 log_verbose("testing u_istitle\n");
275 {
276 static const UChar expected[] = {
277 0x1F88,
278 0x1F89,
279 0x1F8A,
280 0x1F8B,
281 0x1F8C,
282 0x1F8D,
283 0x1F8E,
284 0x1F8F,
285 0x1F88,
286 0x1F89,
287 0x1F8A,
288 0x1F8B,
289 0x1F8C,
290 0x1F8D,
291 0x1F8E,
292 0x1F8F,
293 0x1F98,
294 0x1F99,
295 0x1F9A,
296 0x1F9B,
297 0x1F9C,
298 0x1F9D,
299 0x1F9E,
300 0x1F9F,
301 0x1F98,
302 0x1F99,
303 0x1F9A,
304 0x1F9B,
305 0x1F9C,
306 0x1F9D,
307 0x1F9E,
308 0x1F9F,
309 0x1FA8,
310 0x1FA9,
311 0x1FAA,
312 0x1FAB,
313 0x1FAC,
314 0x1FAD,
315 0x1FAE,
316 0x1FAF,
317 0x1FA8,
318 0x1FA9,
319 0x1FAA,
320 0x1FAB,
321 0x1FAC,
322 0x1FAD,
323 0x1FAE,
324 0x1FAF,
325 0x1FBC,
326 0x1FBC,
327 0x1FCC,
328 0x1FCC,
329 0x1FFC,
330 0x1FFC,
331 };
332 int32_t num = UPRV_LENGTHOF(expected);
333 for(i=0; i<num; i++){
334 if(!u_istitle(expected[i])){
335 log_err("u_istitle failed for 0x%4X. Expected true, got false\n",expected[i]);
336 }
337 }
338
339 }
340 }
341
342 /* compare two sets and verify that their difference or intersection is empty */
343 static UBool
showADiffB(const USet *a, const USet *b, const char *a_name, const char *b_name, UBool expect, UBool diffIsError)344 showADiffB(const USet *a, const USet *b,
345 const char *a_name, const char *b_name,
346 UBool expect, UBool diffIsError) {
347 USet *aa;
348 int32_t i, start, end, length;
349 UErrorCode errorCode;
350
351 /*
352 * expect:
353 * true -> a-b should be empty, that is, b should contain all of a
354 * false -> a&b should be empty, that is, a should contain none of b (and vice versa)
355 */
356 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
357 return true;
358 }
359
360 /* clone a to aa because a is const */
361 aa=uset_open(1, 0);
362 if(aa==NULL) {
363 /* unusual problem - out of memory? */
364 return false;
365 }
366 uset_addAll(aa, a);
367
368 /* compute the set in question */
369 if(expect) {
370 /* a-b */
371 uset_removeAll(aa, b);
372 } else {
373 /* a&b */
374 uset_retainAll(aa, b);
375 }
376
377 /* aa is not empty because of the initial tests above; show its contents */
378 errorCode=U_ZERO_ERROR;
379 i=0;
380 for(;;) {
381 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
382 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
383 break; /* done */
384 }
385 if(U_FAILURE(errorCode)) {
386 log_err("error comparing %s with %s at difference item %d: %s\n",
387 a_name, b_name, i, u_errorName(errorCode));
388 break;
389 }
390 if(length!=0) {
391 break; /* done with code points, got a string or -1 */
392 }
393
394 if(diffIsError) {
395 if(expect) {
396 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
397 } else {
398 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
399 }
400 } else {
401 if(expect) {
402 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
403 } else {
404 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
405 }
406 }
407
408 ++i;
409 }
410
411 uset_close(aa);
412 return false;
413 }
414
415 static UBool
showAMinusB(const USet *a, const USet *b, const char *a_name, const char *b_name, UBool diffIsError)416 showAMinusB(const USet *a, const USet *b,
417 const char *a_name, const char *b_name,
418 UBool diffIsError) {
419 return showADiffB(a, b, a_name, b_name, true, diffIsError);
420 }
421
422 static UBool
showAIntersectB(const USet *a, const USet *b, const char *a_name, const char *b_name, UBool diffIsError)423 showAIntersectB(const USet *a, const USet *b,
424 const char *a_name, const char *b_name,
425 UBool diffIsError) {
426 return showADiffB(a, b, a_name, b_name, false, diffIsError);
427 }
428
429 static UBool
compareUSets(const USet *a, const USet *b, const char *a_name, const char *b_name, UBool diffIsError)430 compareUSets(const USet *a, const USet *b,
431 const char *a_name, const char *b_name,
432 UBool diffIsError) {
433 /*
434 * Use an arithmetic & not a logical && so that both branches
435 * are always taken and all differences are shown.
436 */
437 return
438 showAMinusB(a, b, a_name, b_name, diffIsError) &
439 showAMinusB(b, a, b_name, a_name, diffIsError);
440 }
441
442 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
TestLetterNumbernull443 static void TestLetterNumber()
444 {
445 UChar i = 0x0000;
446
447 log_verbose("Testing for isalpha\n");
448 for (i = 0x0041; i < 0x005B; i++) {
449 if (!u_isalpha(i))
450 {
451 log_err("Failed isLetter test at %.4X\n", i);
452 }
453 }
454 for (i = 0x0660; i < 0x066A; i++) {
455 if (u_isalpha(i))
456 {
457 log_err("Failed isLetter test with numbers at %.4X\n", i);
458 }
459 }
460
461 log_verbose("Testing for isdigit\n");
462 for (i = 0x0660; i < 0x066A; i++) {
463 if (!u_isdigit(i))
464 {
465 log_verbose("Failed isNumber test at %.4X\n", i);
466 }
467 }
468
469 log_verbose("Testing for isalnum\n");
470 for (i = 0x0041; i < 0x005B; i++) {
471 if (!u_isalnum(i))
472 {
473 log_err("Failed isAlNum test at %.4X\n", i);
474 }
475 }
476 for (i = 0x0660; i < 0x066A; i++) {
477 if (!u_isalnum(i))
478 {
479 log_err("Failed isAlNum test at %.4X\n", i);
480 }
481 }
482
483 {
484 /*
485 * The following checks work only starting from Unicode 4.0.
486 * Check the version number here.
487 */
488 static UVersionInfo u401={ 4, 0, 1, 0 };
489 UVersionInfo version;
490 u_getUnicodeVersion(version);
491 if(version[0]<4 || 0==memcmp(version, u401, 4)) {
492 return;
493 }
494 }
495
496 {
497 /*
498 * Sanity check:
499 * Verify that exactly the digit characters have decimal digit values.
500 * This assumption is used in the implementation of u_digit()
501 * (which checks nt=de)
502 * compared with the parallel java.lang.Character.digit()
503 * (which checks Nd).
504 *
505 * This was not true in Unicode 3.2 and earlier.
506 * Unicode 4.0 fixed discrepancies.
507 * Unicode 4.0.1 re-introduced problems in this area due to an
508 * unintentionally incomplete last-minute change.
509 */
510 U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
511 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
512
513 USet *digits, *decimalValues;
514 UErrorCode errorCode;
515
516 U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
517 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
518 errorCode=U_ZERO_ERROR;
519 digits=uset_openPattern(digitsPattern, 6, &errorCode);
520 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
521
522 if(U_SUCCESS(errorCode)) {
523 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", true);
524 }
525
526 uset_close(digits);
527 uset_close(decimalValues);
528 }
529 }
530
531 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
532 const UChar32 *sampleChars, int32_t sampleCharsLength,
533 UBool expected) {
534 int32_t i;
535 for (i = 0; i < sampleCharsLength; ++i) {
536 UBool result = propFn(sampleChars[i]);
537 if (result != expected) {
538 log_err("error: character property function %s(U+%04x)=%d is wrong\n",
539 propName, sampleChars[i], result);
540 }
541 }
542 }
543
544 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
TestMiscnull545 static void TestMisc()
546 {
547 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
548 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
549 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
550 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
551 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
552 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
553 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
554 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
555 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
556 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
557 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
558
559 static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
560
561 uint32_t mask;
562
563 int32_t i;
564 char icuVersion[U_MAX_VERSION_STRING_LENGTH];
565 UVersionInfo realVersion;
566
567 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
568
569 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), true);
570 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), false);
571
572 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
573 sampleSpaces, UPRV_LENGTHOF(sampleSpaces), true);
574 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
575 sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), false);
576
577 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
578 sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), true);
579 testSampleCharProps(u_isWhitespace, "u_isWhitespace",
580 sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), false);
581
582 testSampleCharProps(u_isdefined, "u_isdefined",
583 sampleDefined, UPRV_LENGTHOF(sampleDefined), true);
584 testSampleCharProps(u_isdefined, "u_isdefined",
585 sampleUndefined, UPRV_LENGTHOF(sampleUndefined), false);
586
587 testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), true);
588 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), false);
589
590 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), true);
591 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), false);
592
593 for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
594 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
595 log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
596 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
597 }
598 }
599
600 /* Tests the ICU version #*/
601 u_getVersion(realVersion);
602 u_versionToString(realVersion, icuVersion);
603 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
604 {
605 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
606 }
607 #if defined(ICU_VERSION)
608 /* test only happens where we have configure.in with VERSION - sanity check. */
609 if(strcmp(U_ICU_VERSION, ICU_VERSION))
610 {
611 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION);
612 }
613 #endif
614
615 /* test U_GC_... */
616 if(
617 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
618 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
619 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
620 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
621 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
622 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
623 ) {
624 log_err("error: U_GET_GC_MASK does not work properly\n");
625 }
626
627 mask=0;
628 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
629
630 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
631 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
632 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
633 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
634 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
635
636 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
637 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
638 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
639
640 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
641 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
642 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
643
644 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
645 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
646 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
647
648 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
649 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
650 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
651 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
652
653 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
654 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
655 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
656 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
657 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
658
659 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
660 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
661 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
662 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
663
664 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
665 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
666
667 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
668 log_err("error: problems with U_GC_XX_MASK constants\n");
669 }
670
671 mask=0;
672 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
673 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
674 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
675 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
676 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
677 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
678 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
679
680 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
681 log_err("error: problems with U_GC_Y_MASK constants\n");
682 }
683 {
684 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
685 for(i=0; i<10; i++){
686 if(digit[i]!=u_forDigit(i,10)){
687 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
688 }
689 }
690 }
691
692 /* test u_digit() */
693 {
694 static const struct {
695 UChar32 c;
696 int8_t radix, value;
697 } data[]={
698 /* base 16 */
699 { 0x0031, 16, 1 },
700 { 0x0038, 16, 8 },
701 { 0x0043, 16, 12 },
702 { 0x0066, 16, 15 },
703 { 0x00e4, 16, -1 },
704 { 0x0662, 16, 2 },
705 { 0x06f5, 16, 5 },
706 { 0xff13, 16, 3 },
707 { 0xff41, 16, 10 },
708
709 /* base 8 */
710 { 0x0031, 8, 1 },
711 { 0x0038, 8, -1 },
712 { 0x0043, 8, -1 },
713 { 0x0066, 8, -1 },
714 { 0x00e4, 8, -1 },
715 { 0x0662, 8, 2 },
716 { 0x06f5, 8, 5 },
717 { 0xff13, 8, 3 },
718 { 0xff41, 8, -1 },
719
720 /* base 36 */
721 { 0x5a, 36, 35 },
722 { 0x7a, 36, 35 },
723 { 0xff3a, 36, 35 },
724 { 0xff5a, 36, 35 },
725
726 /* wrong radix values */
727 { 0x0031, 1, -1 },
728 { 0xff3a, 37, -1 }
729 };
730
731 for(i=0; i<UPRV_LENGTHOF(data); ++i) {
732 if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
733 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
734 data[i].c,
735 data[i].radix,
736 u_digit(data[i].c, data[i].radix),
737 data[i].value);
738 }
739 }
740 }
741 }
742
743 /* test C/POSIX-style functions --------------------------------------------- */
744
/*
 * Bit flags, one per C/POSIX-style class function.
 * TestPOSIX() pairs bit n (value 1<<n) with posixClasses[n],
 * so the values here and the order there must stay in step.
 */
#define ISAL 0x001
#define ISLO 0x002
#define ISUP 0x004

#define ISDI 0x008
#define ISXD 0x010

#define ISAN 0x020

#define ISPU 0x040
#define ISGR 0x080
#define ISPR 0x100

#define ISSP 0x200
#define ISBL 0x400
#define ISCN 0x800
762
763 /* C/POSIX-style functions, in the same order as the bit flags */
764 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
765
766 static const struct {
767 IsPOSIXClass *fn;
768 const char *name;
769 } posixClasses[]={
770 { u_isalpha, "isalpha" },
771 { u_islower, "islower" },
772 { u_isupper, "isupper" },
773 { u_isdigit, "isdigit" },
774 { u_isxdigit, "isxdigit" },
775 { u_isalnum, "isalnum" },
776 { u_ispunct, "ispunct" },
777 { u_isgraph, "isgraph" },
778 { u_isprint, "isprint" },
779 { u_isspace, "isspace" },
780 { u_isblank, "isblank" },
781 { u_iscntrl, "iscntrl" }
782 };
783
784 static const struct {
785 UChar32 c;
786 uint32_t posixResults;
787 } posixData[]={
788 { 0x0008, ISCN }, /* backspace */
789 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */
790 { 0x000a, ISSP| ISCN }, /* LF */
791 { 0x000c, ISSP| ISCN }, /* FF */
792 { 0x000d, ISSP| ISCN }, /* CR */
793 { 0x0020, ISPR|ISSP|ISBL }, /* space */
794 { 0x0021, ISPU|ISGR|ISPR }, /* ! */
795 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */
796 { 0x0040, ISPU|ISGR|ISPR }, /* @ */
797 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */
798 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */
799 { 0x007b, ISPU|ISGR|ISPR }, /* { */
800 { 0x0085, ISSP| ISCN }, /* NEL */
801 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */
802 { 0x00a4, ISGR|ISPR }, /* currency sign */
803 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */
804 { 0x0300, ISGR|ISPR }, /* combining grave */
805 { 0x0600, ISCN }, /* arabic number sign */
806 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */
807 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */
808 { 0x2002, ISPR|ISSP|ISBL }, /* en space */
809 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */
810 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */
811 { 0x200b, ISCN }, /* ZWSP */
812 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
813 { 0x200e, ISCN }, /* LRM */
814 { 0x2028, ISPR|ISSP| ISCN }, /* LS */
815 { 0x2029, ISPR|ISSP| ISCN }, /* PS */
816 { 0x20ac, ISGR|ISPR }, /* Euro */
817 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */
818 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */
819 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */
820 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */
821 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */
822 };
823
824 static void
TestPOSIXnull825 TestPOSIX() {
826 uint32_t mask;
827 int32_t cl, i;
828 UBool expect;
829
830 mask=1;
831 for(cl=0; cl<12; ++cl) {
832 for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
833 expect=(UBool)((posixData[i].posixResults&mask)!=0);
834 if(posixClasses[cl].fn(posixData[i].c)!=expect) {
835 log_err("u_%s(U+%04x)=%s is wrong\n",
836 posixClasses[cl].name, posixData[i].c, expect ? "false" : "true");
837 }
838 }
839 mask<<=1;
840 }
841 }
842
843 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
TestControlPrintnull844 static void TestControlPrint()
845 {
846 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
847 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
848 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
849 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
850 UChar32 c;
851
852 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), true);
853 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), false);
854
855 testSampleCharProps(u_isprint, "u_isprint",
856 samplePrintable, UPRV_LENGTHOF(samplePrintable), true);
857 testSampleCharProps(u_isprint, "u_isprint",
858 sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), false);
859
860 /* test all ISO 8 controls */
861 for(c=0; c<=0x9f; ++c) {
862 if(c==0x20) {
863 /* skip ASCII graphic characters and continue with DEL */
864 c=0x7f;
865 }
866 if(!u_iscntrl(c)) {
867 log_err("error: u_iscntrl(ISO 8 control U+%04x)=false\n", c);
868 }
869 if(!u_isISOControl(c)) {
870 log_err("error: u_isISOControl(ISO 8 control U+%04x)=false\n", c);
871 }
872 if(u_isprint(c)) {
873 log_err("error: u_isprint(ISO 8 control U+%04x)=true\n", c);
874 }
875 }
876
877 /* test all Latin-1 graphic characters */
878 for(c=0x20; c<=0xff; ++c) {
879 if(c==0x7f) {
880 c=0xa0;
881 } else if(c==0xad) {
882 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
883 ++c;
884 }
885 if(!u_isprint(c)) {
886 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=false\n", c);
887 }
888 }
889 }
890
891 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
TestIdentifiernull892 static void TestIdentifier()
893 {
894 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
895 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
896 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
897 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
898 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
899 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
900 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
901 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
902 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
903 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
904
905 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
906 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), true);
907 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
908 sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), false);
909
910 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
911 sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), true);
912 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
913 sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), false);
914
915 /* IDPart should imply IDStart */
916 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
917 sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), true);
918
919 testSampleCharProps(u_isIDStart, "u_isIDStart",
920 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), true);
921 testSampleCharProps(u_isIDStart, "u_isIDStart",
922 sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), false);
923
924 testSampleCharProps(u_isIDPart, "u_isIDPart",
925 sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), true);
926 testSampleCharProps(u_isIDPart, "u_isIDPart",
927 sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), false);
928
929 /* IDPart should imply IDStart */
930 testSampleCharProps(u_isIDPart, "u_isIDPart",
931 sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), true);
932
933 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
934 sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), true);
935 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
936 sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), false);
937 }
938
939 /* for each line of UnicodeData.txt, check some of the properties */
940 typedef struct UnicodeDataContext {
941 #if UCONFIG_NO_NORMALIZATION
942 const void *dummy;
943 #else
944 const UNormalizer2 *nfc;
945 const UNormalizer2 *nfkc;
946 #endif
947 } UnicodeDataContext;
948
949 /*
950 * ### TODO
951 * This test fails incorrectly if the First or Last code point of a repetitive area
952 * is overridden, which is allowed and is encouraged for the PUAs.
953 * Currently, this means that both area First/Last and override lines are
954 * tested against the properties from the API,
955 * and the area boundary will not match and cause an error.
956 *
957 * This function should detect area boundaries and skip them for the test of individual
958 * code points' properties.
959 * Then it should check that the areas contain all the same properties except where overridden.
960 * For this, it would have had to set a flag for which code points were listed explicitly.
961 */
962 static void U_CALLCONV
unicodeDataLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode)963 unicodeDataLineFn(void *context,
964 char *fields[][2], int32_t fieldCount,
965 UErrorCode *pErrorCode)
966 {
967 (void)fieldCount; // suppress compiler warnings about unused variable
968 char buffer[100];
969 const char *d;
970 char *end;
971 uint32_t value;
972 UChar32 c;
973 int32_t i;
974 int8_t type;
975 int32_t dt;
976 UChar dm[32], s[32];
977 int32_t dmLength, length;
978
979 #if !UCONFIG_NO_NORMALIZATION
980 const UNormalizer2 *nfc, *nfkc;
981 #endif
982
983 /* get the character code, field 0 */
984 c=strtoul(fields[0][0], &end, 16);
985 if(end<=fields[0][0] || end!=fields[0][1]) {
986 log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
987 return;
988 }
989 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
990 log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
991 return;
992 }
993
994 /* get general category, field 2 */
995 *fields[2][1]=0;
996 type = (int8_t)tagValues[MakeProp(fields[2][0])];
997 if(u_charType(c)!=type) {
998 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
999 }
1000 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1001 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1002 }
1003
1004 /* get canonical combining class, field 3 */
1005 value=strtoul(fields[3][0], &end, 10);
1006 if(end<=fields[3][0] || end!=fields[3][1]) {
1007 log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1008 return;
1009 }
1010 if(value>255) {
1011 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1012 return;
1013 }
1014 #if !UCONFIG_NO_NORMALIZATION
1015 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1016 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1017 }
1018 nfkc=((UnicodeDataContext *)context)->nfkc;
1019 if(value!=unorm2_getCombiningClass(nfkc, c)) {
1020 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1021 }
1022 #endif
1023
1024 /* get BiDi category, field 4 */
1025 *fields[4][1]=0;
1026 i=MakeDir(fields[4][0]);
1027 if(i!=(int32_t)u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1028 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1029 }
1030
1031 /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1032 d=NULL;
1033 if(fields[5][0]==fields[5][1]) {
1034 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1035 if(c==0xac00 || c==0xd7a3) {
1036 dt=U_DT_CANONICAL;
1037 } else {
1038 dt=U_DT_NONE;
1039 }
1040 } else {
1041 d=fields[5][0];
1042 *fields[5][1]=0;
1043 dt=UCHAR_INVALID_CODE;
1044 if(*d=='<') {
1045 end=strchr(++d, '>');
1046 if(end!=NULL) {
1047 *end=0;
1048 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1049 d=u_skipWhitespace(end+1);
1050 }
1051 } else {
1052 dt=U_DT_CANONICAL;
1053 }
1054 }
1055 if(dt>U_DT_NONE) {
1056 if(c==0xac00) {
1057 dm[0]=0x1100;
1058 dm[1]=0x1161;
1059 dm[2]=0;
1060 dmLength=2;
1061 } else if(c==0xd7a3) {
1062 dm[0]=0xd788;
1063 dm[1]=0x11c2;
1064 dm[2]=0;
1065 dmLength=2;
1066 } else {
1067 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1068 }
1069 } else {
1070 dmLength=-1;
1071 }
1072 if(dt<0 || U_FAILURE(*pErrorCode)) {
1073 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1074 return;
1075 }
1076 #if !UCONFIG_NO_NORMALIZATION
1077 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1078 if(i!=dt) {
1079 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1080 }
1081 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1082 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1083 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1084 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1085 "or the Decomposition_Mapping is different (%s)\n",
1086 c, length, dmLength, u_errorName(*pErrorCode));
1087 return;
1088 }
1089 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1090 if(dt!=U_DT_CANONICAL) {
1091 dmLength=-1;
1092 }
1093 nfc=((UnicodeDataContext *)context)->nfc;
1094 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1095 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1096 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1097 "or the Decomposition_Mapping is different (%s)\n",
1098 c, length, dmLength, u_errorName(*pErrorCode));
1099 return;
1100 }
1101 /* recompose */
1102 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1103 UChar32 a, b, composite;
1104 i=0;
1105 U16_NEXT(dm, i, dmLength, a);
1106 U16_NEXT(dm, i, dmLength, b);
1107 /* i==dmLength */
1108 composite=unorm2_composePair(nfc, a, b);
1109 if(composite!=c) {
1110 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1111 (long)c, (long)a, (long)b, (long)composite);
1112 }
1113 /*
1114 * Note: NFKC has fewer round-trip mappings than NFC,
1115 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1116 */
1117 }
1118 #endif
1119
1120 /* get ISO Comment, field 11 */
1121 *fields[11][1]=0;
1122 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1123 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1124 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1125 c, u_errorName(*pErrorCode),
1126 U_FAILURE(*pErrorCode) ? buffer : "[error]",
1127 fields[11][0]);
1128 }
1129
1130 /* get uppercase mapping, field 12 */
1131 if(fields[12][0]!=fields[12][1]) {
1132 value=strtoul(fields[12][0], &end, 16);
1133 if(end!=fields[12][1]) {
1134 log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1135 return;
1136 }
1137 if((UChar32)value!=u_toupper(c)) {
1138 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1139 }
1140 } else {
1141 /* no case mapping: the API must map the code point to itself */
1142 if(c!=u_toupper(c)) {
1143 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1144 }
1145 }
1146
1147 /* get lowercase mapping, field 13 */
1148 if(fields[13][0]!=fields[13][1]) {
1149 value=strtoul(fields[13][0], &end, 16);
1150 if(end!=fields[13][1]) {
1151 log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1152 return;
1153 }
1154 if((UChar32)value!=u_tolower(c)) {
1155 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1156 }
1157 } else {
1158 /* no case mapping: the API must map the code point to itself */
1159 if(c!=u_tolower(c)) {
1160 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1161 }
1162 }
1163
1164 /* get titlecase mapping, field 14 */
1165 if(fields[14][0]!=fields[14][1]) {
1166 value=strtoul(fields[14][0], &end, 16);
1167 if(end!=fields[14][1]) {
1168 log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1169 return;
1170 }
1171 if((UChar32)value!=u_totitle(c)) {
1172 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1173 }
1174 } else {
1175 /* no case mapping: the API must map the code point to itself */
1176 if(c!=u_totitle(c)) {
1177 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1178 }
1179 }
1180 }
1181
1182 static UBool U_CALLCONV
enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type)1183 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1184 static const UChar32 test[][2]={
1185 {0x41, U_UPPERCASE_LETTER},
1186 {0x308, U_NON_SPACING_MARK},
1187 {0xfffe, U_GENERAL_OTHER_TYPES},
1188 {0xe0041, U_FORMAT_CHAR},
1189 {0xeffff, U_UNASSIGNED}
1190 };
1191
1192 int32_t i, count;
1193
1194 if(0!=strcmp((const char *)context, "a1")) {
1195 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1196 return false;
1197 }
1198
1199 count=UPRV_LENGTHOF(test);
1200 for(i=0; i<count; ++i) {
1201 if(start<=test[i][0] && test[i][0]<limit) {
1202 if(type!=(UCharCategory)test[i][1]) {
1203 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1204 start, limit, (long)type, test[i][0], test[i][1]);
1205 }
1206 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1207 return i==(count-1) ? false : true;
1208 }
1209 }
1210
1211 if(start>test[count-1][0]) {
1212 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1213 start, limit, (long)type);
1214 return false;
1215 }
1216
1217 return true;
1218 }
1219
1220 static UBool U_CALLCONV
enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type)1221 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1222 (void)context; // suppress compiler warnings about unused variable
1223
1224 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1225 static const int32_t defaultBidi[][2]={ /* { limit, class } */
1226 { 0x0590, U_LEFT_TO_RIGHT },
1227 { 0x0600, U_RIGHT_TO_LEFT },
1228 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1229 { 0x0860, U_RIGHT_TO_LEFT },
1230 { 0x0870, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 10 changes U+0860..U+086F from R to AL.
1231 { 0x08A0, U_RIGHT_TO_LEFT_ARABIC }, // Unicode 14 changes U+0870..U+089F from R to AL.
1232 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1233 { 0x20A0, U_LEFT_TO_RIGHT },
1234 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1235 { 0xFB1D, U_LEFT_TO_RIGHT },
1236 { 0xFB50, U_RIGHT_TO_LEFT },
1237 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1238 { 0xFE70, U_LEFT_TO_RIGHT },
1239 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1240
1241 { 0x10800, U_LEFT_TO_RIGHT },
1242 { 0x10D00, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1243 { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1244 { 0x10EC0, U_RIGHT_TO_LEFT }, // Unicode 15 changes U+10EC0..U+10EFF from R to AL.
1245 { 0x10F00, U_RIGHT_TO_LEFT_ARABIC },
1246 { 0x10F30, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1247 { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1248 { 0x11000, U_RIGHT_TO_LEFT },
1249
1250 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1251 { 0x1EC70, U_RIGHT_TO_LEFT }, // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1252 { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1253 { 0x1ED00, U_RIGHT_TO_LEFT }, // Unicode 12 changes U+1ED00..U+1ED4F from R to AL.
1254 { 0x1ED50, U_RIGHT_TO_LEFT_ARABIC },
1255 { 0x1EE00, U_RIGHT_TO_LEFT },
1256 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1257 { 0x1F000, U_RIGHT_TO_LEFT },
1258 { 0x110000, U_LEFT_TO_RIGHT }
1259 };
1260
1261 UChar32 c;
1262 int32_t i;
1263 UCharDirection shouldBeDir;
1264
1265 /*
1266 * LineBreak.txt specifies:
1267 * # - Assigned characters that are not listed explicitly are given the value
1268 * # "AL".
1269 * # - Unassigned characters are given the value "XX".
1270 *
1271 * PUA characters are listed explicitly with "XX".
1272 * Verify that no assigned character has "XX".
1273 */
1274 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1275 c=start;
1276 while(c<limit) {
1277 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1278 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1279 }
1280 ++c;
1281 }
1282 }
1283
1284 /*
1285 * Verify default Bidi classes.
1286 * See DerivedBidiClass.txt, especially for unassigned code points.
1287 */
1288 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1289 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1290 c=start;
1291 for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1292 if((int32_t)c<defaultBidi[i][0]) {
1293 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1294 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1295 shouldBeDir=U_BOUNDARY_NEUTRAL;
1296 } else {
1297 shouldBeDir=(UCharDirection)defaultBidi[i][1];
1298 }
1299
1300 if( u_charDirection(c)!=shouldBeDir ||
1301 (UCharDirection)u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1302 ) {
1303 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1304 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1305 }
1306 ++c;
1307 }
1308 }
1309 }
1310 }
1311
1312 return true;
1313 }
1314
1315 /* tests for several properties */
TestUnicodeDatanull1316 static void TestUnicodeData()
1317 {
1318 UVersionInfo expectVersionArray;
1319 UVersionInfo versionArray;
1320 char *fields[15][2];
1321 UErrorCode errorCode;
1322 UChar32 c;
1323 int8_t type;
1324
1325 UnicodeDataContext context;
1326
1327 u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1328 u_getUnicodeVersion(versionArray);
1329 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1330 {
1331 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1332 versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1333 }
1334
1335 #if defined(ICU_UNICODE_VERSION)
1336 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1337 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1338 {
1339 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1340 }
1341 #endif
1342
1343 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1344 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1345 }
1346
1347 errorCode=U_ZERO_ERROR;
1348 #if !UCONFIG_NO_NORMALIZATION
1349 context.nfc=unorm2_getNFCInstance(&errorCode);
1350 context.nfkc=unorm2_getNFKCInstance(&errorCode);
1351 if(U_FAILURE(errorCode)) {
1352 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1353 return;
1354 }
1355 #endif
1356 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1357 if(U_FAILURE(errorCode)) {
1358 return; /* if we couldn't parse UnicodeData.txt, we should return */
1359 }
1360
1361 /* sanity check on repeated properties */
1362 for(c=0xfffe; c<=0x10ffff;) {
1363 type=u_charType(c);
1364 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1365 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1366 }
1367 if(type!=U_UNASSIGNED) {
1368 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1369 }
1370 if((c&0xffff)==0xfffe) {
1371 ++c;
1372 } else {
1373 c+=0xffff;
1374 }
1375 }
1376
1377 /* test that PUA is not "unassigned" */
1378 for(c=0xe000; c<=0x10fffd;) {
1379 type=u_charType(c);
1380 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1381 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1382 }
1383 if(type==U_UNASSIGNED) {
1384 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1385 } else if(type!=U_PRIVATE_USE_CHAR) {
1386 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1387 }
1388 if(c==0xf8ff) {
1389 c=0xf0000;
1390 } else if(c==0xffffd) {
1391 c=0x100000;
1392 } else {
1393 ++c;
1394 }
1395 }
1396
1397 /* test u_enumCharTypes() */
1398 u_enumCharTypes(enumTypeRange, "a1");
1399
1400 /* check default properties */
1401 u_enumCharTypes(enumDefaultsRange, NULL);
1402 }
1403
TestCodeUnitnull1404 static void TestCodeUnit(){
1405 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1406
1407 int32_t i;
1408
1409 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1410 UChar c=codeunit[i];
1411 if(i<4){
1412 if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1413 U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1414 log_err("ERROR: U+%04x is a single", c);
1415 }
1416
1417 }
1418 if(i >= 4 && i< 8){
1419 if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1420 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1421 log_err("ERROR: U+%04x is a first surrogate", c);
1422 }
1423 }
1424 if(i >= 8 && i< 12){
1425 if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1426 !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1427 log_err("ERROR: U+%04x is a second surrogate", c);
1428 }
1429 }
1430 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1431 if(i<4){
1432 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1433 log_err("ERROR: U+%04x is a single", c);
1434 }
1435
1436 }
1437 if(i >= 4 && i< 8){
1438 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1439 log_err("ERROR: U+%04x is a first surrogate", c);
1440 }
1441 }
1442 if(i >= 8 && i< 12){
1443 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1444 log_err("ERROR: U+%04x is a second surrogate", c);
1445 }
1446 }
1447 #endif
1448 }
1449 }
1450
TestCodePointnull1451 static void TestCodePoint(){
1452 const UChar32 codePoint[]={
1453 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1454 0xd800,
1455 0xdbff,
1456 0xdc00,
1457 0xdfff,
1458 0xdc04,
1459 0xd821,
1460 /*not a surrogate, valid, isUnicodeChar , not Error*/
1461 0x20ac,
1462 0xd7ff,
1463 0xe000,
1464 0xe123,
1465 0x0061,
1466 0xe065,
1467 0x20402,
1468 0x24506,
1469 0x23456,
1470 0x20402,
1471 0x10402,
1472 0x23456,
1473 /*not a surrogate, not valid, isUnicodeChar, isError */
1474 0x0015,
1475 0x009f,
1476 /*not a surrogate, not valid, not isUnicodeChar, isError */
1477 0xffff,
1478 0xfffe,
1479 };
1480 int32_t i;
1481 for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1482 UChar32 c=codePoint[i];
1483 if(i<6) {
1484 if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1485 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1486 }
1487 if(U_IS_UNICODE_CHAR(c)) {
1488 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1489 }
1490 } else if(i >=6 && i<18) {
1491 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1492 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1493 }
1494 if(!U_IS_UNICODE_CHAR(c)) {
1495 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1496 }
1497 } else if(i >=18 && i<20) {
1498 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1499 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1500 }
1501 if(!U_IS_UNICODE_CHAR(c)) {
1502 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1503 }
1504 } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1505 if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1506 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1507 }
1508 if(U_IS_UNICODE_CHAR(c)) {
1509 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1510 }
1511 }
1512 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1513 if(i<6){
1514 if(!UTF_IS_SURROGATE(c)){
1515 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1516 }
1517 if(UTF_IS_VALID(c)){
1518 log_err("ERROR: isValid() failed for U+%04x\n", c);
1519 }
1520 if(UTF_IS_UNICODE_CHAR(c)){
1521 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1522 }
1523 if(UTF_IS_ERROR(c)){
1524 log_err("ERROR: isError() failed for U+%04x\n", c);
1525 }
1526 }else if(i >=6 && i<18){
1527 if(UTF_IS_SURROGATE(c)){
1528 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1529 }
1530 if(!UTF_IS_VALID(c)){
1531 log_err("ERROR: isValid() failed for U+%04x\n", c);
1532 }
1533 if(!UTF_IS_UNICODE_CHAR(c)){
1534 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1535 }
1536 if(UTF_IS_ERROR(c)){
1537 log_err("ERROR: isError() failed for U+%04x\n", c);
1538 }
1539 }else if(i >=18 && i<20){
1540 if(UTF_IS_SURROGATE(c)){
1541 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1542 }
1543 if(UTF_IS_VALID(c)){
1544 log_err("ERROR: isValid() failed for U+%04x\n", c);
1545 }
1546 if(!UTF_IS_UNICODE_CHAR(c)){
1547 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1548 }
1549 if(!UTF_IS_ERROR(c)){
1550 log_err("ERROR: isError() failed for U+%04x\n", c);
1551 }
1552 }
1553 else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1554 if(UTF_IS_SURROGATE(c)){
1555 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1556 }
1557 if(UTF_IS_VALID(c)){
1558 log_err("ERROR: isValid() failed for U+%04x\n", c);
1559 }
1560 if(UTF_IS_UNICODE_CHAR(c)){
1561 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1562 }
1563 if(!UTF_IS_ERROR(c)){
1564 log_err("ERROR: isError() failed for U+%04x\n", c);
1565 }
1566 }
1567 #endif
1568 }
1569
1570 if(
1571 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1572 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1573 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1574 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1575 ) {
1576 log_err("error with U_IS_BMP()\n");
1577 }
1578
1579 if(
1580 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1581 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1582 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1583 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1584 ) {
1585 log_err("error with U_IS_SUPPLEMENTARY()\n");
1586 }
1587 }
1588
TestCharLengthnull1589 static void TestCharLength()
1590 {
1591 const int32_t codepoint[]={
1592 1, 0x0061,
1593 1, 0xe065,
1594 1, 0x20ac,
1595 2, 0x20402,
1596 2, 0x23456,
1597 2, 0x24506,
1598 2, 0x20402,
1599 2, 0x10402,
1600 1, 0xd7ff,
1601 1, 0xe000
1602 };
1603
1604 int32_t i;
1605 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1606 UBool multiple;
1607 #endif
1608 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1609 UChar32 c=codepoint[i+1];
1610 if(
1611 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1612 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1613 #endif
1614 U16_LENGTH(c) != codepoint[i]) {
1615 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1616 }
1617 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1618 multiple=(UBool)(codepoint[i] == 1 ? false : true);
1619 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1620 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1621 }
1622 #endif
1623 }
1624 }
1625
1626 /*internal functions ----*/
MakeProp(char* str)1627 static int32_t MakeProp(char* str)
1628 {
1629 int32_t result = 0;
1630 char* matchPosition =0;
1631
1632 matchPosition = strstr(tagStrings, str);
1633 if (matchPosition == 0)
1634 {
1635 log_err("unrecognized type letter ");
1636 log_err(str);
1637 }
1638 else
1639 result = (int32_t)((matchPosition - tagStrings) / 2);
1640 return result;
1641 }
1642
MakeDir(char* str)1643 static int32_t MakeDir(char* str)
1644 {
1645 int32_t pos = 0;
1646 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1647 if (strcmp(str, dirStrings[pos]) == 0) {
1648 return pos;
1649 }
1650 }
1651 return -1;
1652 }
1653
1654 /* test u_charName() -------------------------------------------------------- */
1655
/*
 * Sample code points with the character names expected for each
 * UCharNameChoice:
 *   name     - U_UNICODE_CHAR_NAME
 *   oldName  - U_UNICODE_10_CHAR_NAME
 *   extName  - U_EXTENDED_CHAR_NAME
 *   alias    - U_CHAR_NAME_ALIAS (NULL when no alias is expected)
 */
static const struct {
    uint32_t code;
    const char *name;
    const char *oldName;
    const char *extName;
    const char *alias;
} names[]={
    {0x0061,
        "LATIN SMALL LETTER A", "",
        "LATIN SMALL LETTER A",
        NULL},
    {0x01a2,
        "LATIN CAPITAL LETTER OI", "",
        "LATIN CAPITAL LETTER OI",
        "LATIN CAPITAL LETTER GHA"},
    {0x0284,
        "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "",
        "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
        NULL},
    {0x0fd0,
        "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "",
        "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
        "TIBETAN MARK BKA- SHOG GI MGO RGYAN"},
    {0x3401,
        "CJK UNIFIED IDEOGRAPH-3401", "",
        "CJK UNIFIED IDEOGRAPH-3401",
        NULL},
    {0x7fed,
        "CJK UNIFIED IDEOGRAPH-7FED", "",
        "CJK UNIFIED IDEOGRAPH-7FED",
        NULL},
    {0xac00,
        "HANGUL SYLLABLE GA", "",
        "HANGUL SYLLABLE GA",
        NULL},
    {0xd7a3,
        "HANGUL SYLLABLE HIH", "",
        "HANGUL SYLLABLE HIH",
        NULL},
    {0xd800,
        "", "",
        "<lead surrogate-D800>",
        NULL},
    {0xdc00,
        "", "",
        "<trail surrogate-DC00>",
        NULL},
    {0xff08,
        "FULLWIDTH LEFT PARENTHESIS", "",
        "FULLWIDTH LEFT PARENTHESIS",
        NULL},
    {0xffe5,
        "FULLWIDTH YEN SIGN", "",
        "FULLWIDTH YEN SIGN",
        NULL},
    {0xffff,
        "", "",
        "<noncharacter-FFFF>",
        NULL},
    {0x1d0c5,
        "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "",
        "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
        "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"},
    {0x23456,
        "CJK UNIFIED IDEOGRAPH-23456", "",
        "CJK UNIFIED IDEOGRAPH-23456",
        NULL}
};
1683
1684 static UBool
enumCharNamesFn(void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length)1685 enumCharNamesFn(void *context,
1686 UChar32 code, UCharNameChoice nameChoice,
1687 const char *name, int32_t length) {
1688 int32_t *pCount=(int32_t *)context;
1689 const char *expected;
1690 int i;
1691
1692 if(length<=0 || length!=(int32_t)strlen(name)) {
1693 /* should not be called with an empty string or invalid length */
1694 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1695 return true;
1696 }
1697
1698 ++*pCount;
1699 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1700 if(code==(UChar32)names[i].code) {
1701 switch (nameChoice) {
1702 case U_EXTENDED_CHAR_NAME:
1703 if(0!=strcmp(name, names[i].extName)) {
1704 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1705 }
1706 break;
1707 case U_UNICODE_CHAR_NAME:
1708 if(0!=strcmp(name, names[i].name)) {
1709 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1710 }
1711 break;
1712 case U_UNICODE_10_CHAR_NAME:
1713 expected=names[i].oldName;
1714 if(expected[0]==0 || 0!=strcmp(name, expected)) {
1715 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1716 }
1717 break;
1718 case U_CHAR_NAME_ALIAS:
1719 expected=names[i].alias;
1720 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1721 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1722 }
1723 break;
1724 case U_CHAR_NAME_CHOICE_COUNT:
1725 break;
1726 }
1727 break;
1728 }
1729 }
1730 return true;
1731 }
1732
1733 struct enumExtCharNamesContext {
1734 uint32_t length;
1735 int32_t last;
1736 };
1737
1738 static UBool
enumExtCharNamesFn(void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length)1739 enumExtCharNamesFn(void *context,
1740 UChar32 code, UCharNameChoice nameChoice,
1741 const char *name, int32_t length) {
1742 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1743
1744 if (ecncp->last != (int32_t) code - 1) {
1745 if (ecncp->last < 0) {
1746 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1747 } else {
1748 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1749 }
1750 }
1751 ecncp->last = (int32_t) code;
1752
1753 if (!*name) {
1754 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1755 }
1756
1757 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1758 }
1759
1760 /**
1761 * This can be made more efficient by moving it into putil.c and having
1762 * it directly access the ebcdic translation tables.
1763 * TODO: If we get this method in putil.c, then delete it from here.
1764 */
1765 static UChar
u_charToUChar(char c)1766 u_charToUChar(char c) {
1767 UChar uc;
1768 u_charsToUChars(&c, &uc, 1);
1769 return uc;
1770 }
1771
1772 static void
TestCharNamesnull1773 TestCharNames() {
1774 static char name[80];
1775 UErrorCode errorCode=U_ZERO_ERROR;
1776 struct enumExtCharNamesContext extContext;
1777 const char *expected;
1778 int32_t length;
1779 UChar32 c;
1780 int32_t i;
1781
1782 log_verbose("Testing uprv_getMaxCharNameLength()\n");
1783 length=uprv_getMaxCharNameLength();
1784 if(length==0) {
1785 /* no names data available */
1786 return;
1787 }
1788 if(length<83) { /* Unicode 3.2 max char name length */
1789 log_err("uprv_getMaxCharNameLength()=%d is too short");
1790 }
1791 /* ### TODO same tests for max ISO comment length as for max name length */
1792
1793 log_verbose("Testing u_charName()\n");
1794 for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1795 /* modern Unicode character name */
1796 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1797 if(U_FAILURE(errorCode)) {
1798 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1799 return;
1800 }
1801 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1802 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1803 }
1804
1805 /* find the modern name */
1806 if (*names[i].name) {
1807 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1808 if(U_FAILURE(errorCode)) {
1809 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1810 return;
1811 }
1812 if(c!=(UChar32)names[i].code) {
1813 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1814 }
1815 }
1816
1817 /* Unicode 1.0 character name */
1818 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1819 if(U_FAILURE(errorCode)) {
1820 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1821 return;
1822 }
1823 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1824 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1825 }
1826
1827 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1828 if(names[i].oldName[0]!=0 /* && length>0 */) {
1829 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1830 if(U_FAILURE(errorCode)) {
1831 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1832 return;
1833 }
1834 if(c!=(UChar32)names[i].code) {
1835 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1836 }
1837 }
1838
1839 /* Unicode character name alias */
1840 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1841 if(U_FAILURE(errorCode)) {
1842 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1843 return;
1844 }
1845 expected=names[i].alias;
1846 if(expected==NULL) {
1847 expected="";
1848 }
1849 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1850 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1851 names[i].code, name, length, expected);
1852 }
1853
1854 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1855 if(expected[0]!=0 /* && length>0 */) {
1856 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1857 if(U_FAILURE(errorCode)) {
1858 log_err("u_charFromName(%s - alias) error %s\n",
1859 expected, u_errorName(errorCode));
1860 return;
1861 }
1862 if(c!=(UChar32)names[i].code) {
1863 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1864 expected, c, names[i].code);
1865 }
1866 }
1867 }
1868
1869 /* test u_enumCharNames() */
1870 length=0;
1871 errorCode=U_ZERO_ERROR;
1872 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1873 if(U_FAILURE(errorCode) || length<94140) {
1874 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1875 }
1876
1877 extContext.length = 0;
1878 extContext.last = -1;
1879 errorCode=U_ZERO_ERROR;
1880 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1881 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1882 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1883 }
1884
1885 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1886 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1887 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1888 }
1889
1890 /* Test getCharNameCharacters */
1891 if(!getTestOption(QUICK_OPTION)) {
1892 enum { BUFSIZE = 256 };
1893 UErrorCode ec = U_ZERO_ERROR;
1894 char buf[BUFSIZE];
1895 int32_t maxLength;
1896 UChar32 cp;
1897 UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1898 int32_t l1, l2;
1899 UBool map[256];
1900 UBool ok;
1901
1902 USet* set = uset_open(1, 0); /* empty set */
1903 USet* dumb = uset_open(1, 0); /* empty set */
1904
1905 /*
1906 * uprv_getCharNameCharacters() will likely return more lowercase
1907 * letters than actual character names contain because
1908 * it includes all the characters in lowercased names of
1909 * general categories, for the full possible set of extended names.
1910 */
1911 {
1912 USetAdder sa={
1913 NULL,
1914 uset_add,
1915 uset_addRange,
1916 uset_addString,
1917 NULL, /* don't need remove() */
1918 NULL /* don't need removeRange() */
1919 };
1920 sa.set=set;
1921 uprv_getCharNameCharacters(&sa);
1922 }
1923
1924 /* build set the dumb (but sure-fire) way */
1925 for (i=0; i<256; ++i) {
1926 map[i] = false;
1927 }
1928
1929 maxLength=0;
1930 for (cp=0; cp<0x110000; ++cp) {
1931 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1932 buf, BUFSIZE, &ec);
1933 if (U_FAILURE(ec)) {
1934 log_err("FAIL: u_charName failed when it shouldn't\n");
1935 uset_close(set);
1936 uset_close(dumb);
1937 return;
1938 }
1939 if(len>maxLength) {
1940 maxLength=len;
1941 }
1942
1943 for (i=0; i<len; ++i) {
1944 if (!map[(uint8_t) buf[i]]) {
1945 uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1946 map[(uint8_t) buf[i]] = true;
1947 }
1948 }
1949
1950 /* test for leading/trailing whitespace */
1951 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1952 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1953 }
1954 }
1955
1956 if(map[(uint8_t)'\t']) {
1957 log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1958 }
1959
1960 length=uprv_getMaxCharNameLength();
1961 if(length!=maxLength) {
1962 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1963 length, maxLength);
1964 }
1965
1966 /* compare the sets. Where is my uset_equals?!! */
1967 ok=true;
1968 for(i=0; i<256; ++i) {
1969 if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1970 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1971 /* ignore lowercase a-z that are in set but not in dumb */
1972 ok=true;
1973 } else {
1974 ok=false;
1975 break;
1976 }
1977 }
1978 }
1979
1980 l1 = uset_toPattern(set, pat, BUFSIZE, true, &ec);
1981 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, true, &ec);
1982 if (U_FAILURE(ec)) {
1983 log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1984 uset_close(set);
1985 uset_close(dumb);
1986 return;
1987 }
1988
1989 if (l1 >= BUFSIZE) {
1990 l1 = BUFSIZE-1;
1991 pat[l1] = 0;
1992 }
1993 if (l2 >= BUFSIZE) {
1994 l2 = BUFSIZE-1;
1995 dumbPat[l2] = 0;
1996 }
1997
1998 if (!ok) {
1999 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
2000 aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
2001 } else if(getTestOption(VERBOSITY_OPTION)) {
2002 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
2003 }
2004
2005 uset_close(set);
2006 uset_close(dumb);
2007 }
2008
2009 /* ### TODO: test error cases and other interesting things */
2010 }
2011
2012 static void
TestUCharFromNameUnderflownull2013 TestUCharFromNameUnderflow() {
2014 // Ticket #10889: Underflow crash when there is no dash.
2015 const char *name="<NO BREAK SPACE>";
2016 UErrorCode errorCode=U_ZERO_ERROR;
2017 UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2018 if(U_SUCCESS(errorCode)) {
2019 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2020 name, c, u_errorName(errorCode));
2021 }
2022
2023 // Test related edge cases.
2024 name="<-00a0>";
2025 errorCode=U_ZERO_ERROR;
2026 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2027 if(U_SUCCESS(errorCode)) {
2028 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2029 name, c, u_errorName(errorCode));
2030 }
2031
2032 errorCode=U_ZERO_ERROR;
2033 name="<control->";
2034 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2035 if(U_SUCCESS(errorCode)) {
2036 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2037 name, c, u_errorName(errorCode));
2038 }
2039
2040 errorCode=U_ZERO_ERROR;
2041 name="<control-111111>";
2042 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2043 if(U_SUCCESS(errorCode)) {
2044 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2045 name, c, u_errorName(errorCode));
2046 }
2047
2048 // ICU-20292: integer overflow
2049 errorCode=U_ZERO_ERROR;
2050 name="<noncharacter-10010FFFF>";
2051 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2052 if(U_SUCCESS(errorCode)) {
2053 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2054 name, c, u_errorName(errorCode));
2055 }
2056
2057 errorCode=U_ZERO_ERROR;
2058 name="<noncharacter-00010FFFF>"; // too many digits even if only leading 0s
2059 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2060 if(U_SUCCESS(errorCode)) {
2061 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2062 name, c, u_errorName(errorCode));
2063 }
2064
2065 errorCode=U_ZERO_ERROR;
2066 name="<noncharacter-fFFf>>";
2067 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2068 if(U_SUCCESS(errorCode)) {
2069 log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2070 name, c, u_errorName(errorCode));
2071 }
2072 }
2073
2074 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2075
2076 static void
TestMirroringnull2077 TestMirroring() {
2078 USet *set;
2079 UErrorCode errorCode;
2080
2081 UChar32 start, end, c2, c3;
2082 int32_t i;
2083
2084 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2085
2086 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2087
2088 log_verbose("Testing u_isMirrored()\n");
2089 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2090 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2091 )
2092 ) {
2093 log_err("u_isMirrored() does not work correctly\n");
2094 }
2095
2096 log_verbose("Testing u_charMirror()\n");
2097 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2098 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2099 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2100 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2101 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2102 )
2103 ) {
2104 log_err("u_charMirror() does not work correctly\n");
2105 }
2106
2107 /* verify that Bidi_Mirroring_Glyph roundtrips */
2108 errorCode=U_ZERO_ERROR;
2109 set=uset_openPattern(mirroredPattern, 17, &errorCode);
2110
2111 if (U_FAILURE(errorCode)) {
2112 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2113 } else {
2114 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2115 do {
2116 c2=u_charMirror(start);
2117 c3=u_charMirror(c2);
2118 if(c3!=start) {
2119 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2120 }
2121 c3=u_getBidiPairedBracket(start);
2122 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2123 if(c3!=start) {
2124 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2125 (long)start);
2126 }
2127 } else {
2128 if(c3!=c2) {
2129 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2130 (long)start, (long)c2);
2131 }
2132 }
2133 } while(++start<=end);
2134 }
2135 }
2136
2137 uset_close(set);
2138 }
2139
2140
2141 struct RunTestData
2142 {
2143 const char *runText;
2144 UScriptCode runCode;
2145 };
2146
2147 typedef struct RunTestData RunTestData;
2148
2149 static void
CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns, const char *prefix)2150 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2151 const char *prefix)
2152 {
2153 int32_t run, runStart, runLimit;
2154 UScriptCode runCode;
2155
2156 /* iterate over all the runs */
2157 run = 0;
2158 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2159 if (runStart != runStarts[run]) {
2160 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2161 prefix, run, runStarts[run], runStart);
2162 }
2163
2164 if (runLimit != runStarts[run + 1]) {
2165 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2166 prefix, run, runStarts[run + 1], runLimit);
2167 }
2168
2169 if (runCode != testData[run].runCode) {
2170 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2171 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2172 }
2173
2174 run += 1;
2175
2176 /* stop when we've seen all the runs we expect to see */
2177 if (run >= nRuns) {
2178 break;
2179 }
2180 }
2181
2182 /* Complain if we didn't see then number of runs we expected */
2183 if (run != nRuns) {
2184 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2185 }
2186 }
2187
2188 static void
TestUScriptRunAPInull2189 TestUScriptRunAPI()
2190 {
2191 static const RunTestData testData1[] = {
2192 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2193 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2194 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2195 {"English (", USCRIPT_LATIN},
2196 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2197 {") ", USCRIPT_LATIN},
2198 {"\\u6F22\\u5B75", USCRIPT_HAN},
2199 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2200 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2201 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2202 };
2203
2204 static const RunTestData testData2[] = {
2205 {"((((((((((abc))))))))))", USCRIPT_LATIN}
2206 };
2207
2208 static const struct {
2209 const RunTestData *testData;
2210 int32_t nRuns;
2211 } testDataEntries[] = {
2212 {testData1, UPRV_LENGTHOF(testData1)},
2213 {testData2, UPRV_LENGTHOF(testData2)}
2214 };
2215
2216 static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2217 int32_t testEntry;
2218
2219 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2220 UChar testString[1024];
2221 int32_t runStarts[256];
2222 int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2223 const RunTestData *testData = testDataEntries[testEntry].testData;
2224
2225 int32_t run, stringLimit;
2226 UScriptRun *scriptRun = NULL;
2227 UErrorCode err;
2228
2229 /*
2230 * Fill in the test string and the runStarts array.
2231 */
2232 stringLimit = 0;
2233 for (run = 0; run < nTestRuns; run += 1) {
2234 runStarts[run] = stringLimit;
2235 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2236 /*stringLimit -= 1;*/
2237 }
2238
2239 /* The limit of the last run */
2240 runStarts[nTestRuns] = stringLimit;
2241
2242 /*
2243 * Make sure that calling uscript_OpenRun with a NULL text pointer
2244 * and a non-zero text length returns the correct error.
2245 */
2246 err = U_ZERO_ERROR;
2247 scriptRun = uscript_openRun(NULL, stringLimit, &err);
2248
2249 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2250 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2251 }
2252
2253 if (scriptRun != NULL) {
2254 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2255 uscript_closeRun(scriptRun);
2256 }
2257
2258 /*
2259 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2260 * and a zero text length returns the correct error.
2261 */
2262 err = U_ZERO_ERROR;
2263 scriptRun = uscript_openRun(testString, 0, &err);
2264
2265 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2266 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2267 }
2268
2269 if (scriptRun != NULL) {
2270 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2271 uscript_closeRun(scriptRun);
2272 }
2273
2274 /*
2275 * Make sure that calling uscript_openRun with a NULL text pointer
2276 * and a zero text length doesn't return an error.
2277 */
2278 err = U_ZERO_ERROR;
2279 scriptRun = uscript_openRun(NULL, 0, &err);
2280
2281 if (U_FAILURE(err)) {
2282 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2283 }
2284
2285 /* Make sure that the empty iterator doesn't find any runs */
2286 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2287 log_err("uscript_nextRun(...) returned true for an empty iterator.\n");
2288 }
2289
2290 /*
2291 * Make sure that calling uscript_setRunText with a NULL text pointer
2292 * and a non-zero text length returns the correct error.
2293 */
2294 err = U_ZERO_ERROR;
2295 uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2296
2297 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2298 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2299 }
2300
2301 /*
2302 * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2303 * and a zero text length returns the correct error.
2304 */
2305 err = U_ZERO_ERROR;
2306 uscript_setRunText(scriptRun, testString, 0, &err);
2307
2308 if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2309 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2310 }
2311
2312 /*
2313 * Now call uscript_setRunText on the empty iterator
2314 * and make sure that it works.
2315 */
2316 err = U_ZERO_ERROR;
2317 uscript_setRunText(scriptRun, testString, stringLimit, &err);
2318
2319 if (U_FAILURE(err)) {
2320 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2321 } else {
2322 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2323 }
2324
2325 uscript_closeRun(scriptRun);
2326
2327 /*
2328 * Now open an iterator over the testString
2329 * using uscript_openRun and make sure that it works
2330 */
2331 scriptRun = uscript_openRun(testString, stringLimit, &err);
2332
2333 if (U_FAILURE(err)) {
2334 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2335 } else {
2336 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2337 }
2338
2339 /* Now reset the iterator, and make sure
2340 * that it still works.
2341 */
2342 uscript_resetRun(scriptRun);
2343
2344 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2345
2346 /* Close the iterator */
2347 uscript_closeRun(scriptRun);
2348 }
2349 }
2350
2351 /* test additional, non-core properties */
2352 static void
TestAdditionalPropertiesnull2353 TestAdditionalProperties() {
2354 /* test data for u_charAge() */
2355 static const struct {
2356 UChar32 c;
2357 UVersionInfo version;
2358 } charAges[]={
2359 {0x41, { 1, 1, 0, 0 }},
2360 {0xffff, { 1, 1, 0, 0 }},
2361 {0x20ab, { 2, 0, 0, 0 }},
2362 {0x2fffe, { 2, 0, 0, 0 }},
2363 {0x20ac, { 2, 1, 0, 0 }},
2364 {0xfb1d, { 3, 0, 0, 0 }},
2365 {0x3f4, { 3, 1, 0, 0 }},
2366 {0x10300, { 3, 1, 0, 0 }},
2367 {0x220, { 3, 2, 0, 0 }},
2368 {0xff60, { 3, 2, 0, 0 }}
2369 };
2370
2371 /* test data for u_hasBinaryProperty() */
2372 static const int32_t
2373 props[][3]={ /* code point, property, value */
2374 { 0x0627, UCHAR_ALPHABETIC, true },
2375 { 0x1034a, UCHAR_ALPHABETIC, true },
2376 { 0x2028, UCHAR_ALPHABETIC, false },
2377
2378 { 0x0066, UCHAR_ASCII_HEX_DIGIT, true },
2379 { 0x0067, UCHAR_ASCII_HEX_DIGIT, false },
2380
2381 { 0x202c, UCHAR_BIDI_CONTROL, true },
2382 { 0x202f, UCHAR_BIDI_CONTROL, false },
2383
2384 { 0x003c, UCHAR_BIDI_MIRRORED, true },
2385 { 0x003d, UCHAR_BIDI_MIRRORED, false },
2386
2387 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2388 { 0x2018, UCHAR_BIDI_MIRRORED, false },
2389 { 0x201d, UCHAR_BIDI_MIRRORED, false },
2390 { 0x201f, UCHAR_BIDI_MIRRORED, false },
2391 { 0x301e, UCHAR_BIDI_MIRRORED, false },
2392
2393 { 0x058a, UCHAR_DASH, true },
2394 { 0x007e, UCHAR_DASH, false },
2395
2396 { 0x0c4d, UCHAR_DIACRITIC, true },
2397 { 0x3000, UCHAR_DIACRITIC, false },
2398
2399 { 0x0e46, UCHAR_EXTENDER, true },
2400 { 0x0020, UCHAR_EXTENDER, false },
2401
2402 #if !UCONFIG_NO_NORMALIZATION
2403 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, true },
2404 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, true },
2405 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, false },
2406
2407 { 0x110a, UCHAR_NFD_INERT, true }, /* Jamo L */
2408 { 0x0308, UCHAR_NFD_INERT, false },
2409
2410 { 0x1164, UCHAR_NFKD_INERT, true }, /* Jamo V */
2411 { 0x1d79d, UCHAR_NFKD_INERT, false }, /* math compat version of xi */
2412
2413 { 0x0021, UCHAR_NFC_INERT, true }, /* ! */
2414 { 0x0061, UCHAR_NFC_INERT, false }, /* a */
2415 { 0x00e4, UCHAR_NFC_INERT, false }, /* a-umlaut */
2416 { 0x0102, UCHAR_NFC_INERT, false }, /* a-breve */
2417 { 0xac1c, UCHAR_NFC_INERT, false }, /* Hangul LV */
2418 { 0xac1d, UCHAR_NFC_INERT, true }, /* Hangul LVT */
2419
2420 { 0x1d79d, UCHAR_NFKC_INERT, false }, /* math compat version of xi */
2421 { 0x2a6d6, UCHAR_NFKC_INERT, true }, /* Han, last of CJK ext. B */
2422
2423 { 0x00e4, UCHAR_SEGMENT_STARTER, true },
2424 { 0x0308, UCHAR_SEGMENT_STARTER, false },
2425 { 0x110a, UCHAR_SEGMENT_STARTER, true }, /* Jamo L */
2426 { 0x1164, UCHAR_SEGMENT_STARTER, false },/* Jamo V */
2427 { 0xac1c, UCHAR_SEGMENT_STARTER, true }, /* Hangul LV */
2428 { 0xac1d, UCHAR_SEGMENT_STARTER, true }, /* Hangul LVT */
2429 #endif
2430
2431 { 0x0044, UCHAR_HEX_DIGIT, true },
2432 { 0xff46, UCHAR_HEX_DIGIT, true },
2433 { 0x0047, UCHAR_HEX_DIGIT, false },
2434
2435 { 0x30fb, UCHAR_HYPHEN, true },
2436 { 0xfe58, UCHAR_HYPHEN, false },
2437
2438 { 0x2172, UCHAR_ID_CONTINUE, true },
2439 { 0x0307, UCHAR_ID_CONTINUE, true },
2440 { 0x005c, UCHAR_ID_CONTINUE, false },
2441
2442 { 0x2172, UCHAR_ID_START, true },
2443 { 0x007a, UCHAR_ID_START, true },
2444 { 0x0039, UCHAR_ID_START, false },
2445
2446 { 0x4db5, UCHAR_IDEOGRAPHIC, true },
2447 { 0x2f999, UCHAR_IDEOGRAPHIC, true },
2448 { 0x2f99, UCHAR_IDEOGRAPHIC, false },
2449
2450 { 0x200c, UCHAR_JOIN_CONTROL, true },
2451 { 0x2029, UCHAR_JOIN_CONTROL, false },
2452
2453 { 0x1d7bc, UCHAR_LOWERCASE, true },
2454 { 0x0345, UCHAR_LOWERCASE, true },
2455 { 0x0030, UCHAR_LOWERCASE, false },
2456
2457 { 0x1d7a9, UCHAR_MATH, true },
2458 { 0x2135, UCHAR_MATH, true },
2459 { 0x0062, UCHAR_MATH, false },
2460
2461 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, true },
2462 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, true },
2463 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, false },
2464
2465 { 0x0022, UCHAR_QUOTATION_MARK, true },
2466 { 0xff62, UCHAR_QUOTATION_MARK, true },
2467 { 0xd840, UCHAR_QUOTATION_MARK, false },
2468
2469 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, true },
2470 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, false },
2471
2472 { 0x1d44a, UCHAR_UPPERCASE, true },
2473 { 0x2162, UCHAR_UPPERCASE, true },
2474 { 0x0345, UCHAR_UPPERCASE, false },
2475
2476 { 0x0020, UCHAR_WHITE_SPACE, true },
2477 { 0x202f, UCHAR_WHITE_SPACE, true },
2478 { 0x3001, UCHAR_WHITE_SPACE, false },
2479
2480 { 0x0711, UCHAR_XID_CONTINUE, true },
2481 { 0x1d1aa, UCHAR_XID_CONTINUE, true },
2482 { 0x007c, UCHAR_XID_CONTINUE, false },
2483
2484 { 0x16ee, UCHAR_XID_START, true },
2485 { 0x23456, UCHAR_XID_START, true },
2486 { 0x1d1aa, UCHAR_XID_START, false },
2487
2488 /*
2489 * Version break:
2490 * The following properties are only supported starting with the
2491 * Unicode version indicated in the second field.
2492 */
2493 { -1, 0x320, 0 },
2494
2495 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, true },
2496 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, true },
2497 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, false },
2498
2499 { 0x0149, UCHAR_DEPRECATED, true }, /* changed in Unicode 5.2 */
2500 { 0x0341, UCHAR_DEPRECATED, false }, /* changed in Unicode 5.2 */
2501 { 0xe0001, UCHAR_DEPRECATED, true }, /* changed from Unicode 5 to 5.1 */
2502 { 0xe0100, UCHAR_DEPRECATED, false },
2503
2504 { 0x00a0, UCHAR_GRAPHEME_BASE, true },
2505 { 0x0a4d, UCHAR_GRAPHEME_BASE, false },
2506 { 0xff9d, UCHAR_GRAPHEME_BASE, true },
2507 { 0xff9f, UCHAR_GRAPHEME_BASE, false }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2508
2509 { 0x0300, UCHAR_GRAPHEME_EXTEND, true },
2510 { 0xff9d, UCHAR_GRAPHEME_EXTEND, false },
2511 { 0xff9f, UCHAR_GRAPHEME_EXTEND, true }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2512 { 0x0603, UCHAR_GRAPHEME_EXTEND, false },
2513
2514 { 0x0a4d, UCHAR_GRAPHEME_LINK, true },
2515 { 0xff9f, UCHAR_GRAPHEME_LINK, false },
2516
2517 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, true },
2518 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, false },
2519
2520 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, true },
2521 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, false },
2522
2523 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, true },
2524 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, false },
2525
2526 { 0x2e9b, UCHAR_RADICAL, true },
2527 { 0x4e00, UCHAR_RADICAL, false },
2528
2529 { 0x012f, UCHAR_SOFT_DOTTED, true },
2530 { 0x0049, UCHAR_SOFT_DOTTED, false },
2531
2532 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, true },
2533 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, false },
2534
2535 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2536
2537 { 0x002e, UCHAR_S_TERM, true },
2538 { 0x0061, UCHAR_S_TERM, false },
2539
2540 { 0x180c, UCHAR_VARIATION_SELECTOR, true },
2541 { 0xfe03, UCHAR_VARIATION_SELECTOR, true },
2542 { 0xe01ef, UCHAR_VARIATION_SELECTOR, true },
2543 { 0xe0200, UCHAR_VARIATION_SELECTOR, false },
2544
2545 /* enum/integer type properties */
2546
2547 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2548 /* test default Bidi classes for unassigned code points */
2549 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2550 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2551 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2552 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2553 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2554 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2555 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2556 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2557 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2558 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2559
2560 { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2561 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2562 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2563 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2564 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2565 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2566 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2567
2568 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2569 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2570 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2571 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2572 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2573 { 0x2FE0, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2574 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2575 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2576 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2577 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2578 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2579
2580 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2581 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2582
2583 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2584 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2585 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2586 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2587 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2588 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2589 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2590 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2591 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2592
2593 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2594 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2595 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2596 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2597 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2598 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2599 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2600 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2601 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2602 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2603 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2604 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2605 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2606 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2607 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2608 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2609 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2610
2611 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2612 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2613 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */
2614
2615 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2616 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2617 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2618 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2619 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2620
2621 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2622 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2623 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2624 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2625 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2626 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2627 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2628 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2629
2630 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2631 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2632 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2633 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2634 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2635 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2636 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2637 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2638 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2639 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2640 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2641 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2642 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2643 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2644 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2645 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2646
2647 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2648
2649 /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2650
2651 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2652 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2653 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2654 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2655 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2656 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2657 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2658
2659 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2660 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2661 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */
2662 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2663
2664 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2665 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2666 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2667 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2668 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2669 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2670
2671 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2672 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2673 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */
2674 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2675
2676 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2677 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2678 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2679 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2680 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2681 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2682 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2683
2684 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2685 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2686 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */
2687 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2688
2689 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2690 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2691 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2692 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2693
2694 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2695 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2696 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2697 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2698 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2699
2700 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2701
2702 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2703
2704 { 0x00d7, UCHAR_PATTERN_SYNTAX, true },
2705 { 0xfe45, UCHAR_PATTERN_SYNTAX, true },
2706 { 0x0061, UCHAR_PATTERN_SYNTAX, false },
2707
2708 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, true },
2709 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, true },
2710 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, true },
2711 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, false },
2712 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, false },
2713
2714 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2715 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC },
2716 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2717
2718 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE },
2719 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC },
2720 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2721 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2722
2723 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2724 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2725 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2726 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2727 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2728 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2729
2730 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2731 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2732 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2733 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2734
2735 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2736 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2737 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2738 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2739
2740 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2741 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2742 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2743 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2744
2745 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2746
2747 /* unassigned code points in new default Bidi R blocks */
2748 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2749 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2750
2751 /* test some script codes >127 */
2752 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM },
2753 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
2754 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2755
2756 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2757
2758 /* value changed in Unicode 6.0 */
2759 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2760
2761 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2762
2763 /* unassigned code points in new/changed default Bidi AL blocks */
2764 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2765 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2766
2767 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2768
2769 /* unassigned code points in the currency symbols block now default to ET */
2770 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2771 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2772
2773 /* new property in Unicode 6.3 */
2774 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2775 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2776 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2777 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2778 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2779 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2780
2781 { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2782
2783 /* new character range with Joining_Group values */
2784 { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2785 { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2786 { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2787 { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2788 { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2789
2790 { -1, 0xa00, 0 }, // version break for Unicode 10
2791
2792 { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, false },
2793 { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, true },
2794 { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, true },
2795 { 0x1F200, UCHAR_REGIONAL_INDICATOR, false },
2796
2797 { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, true },
2798 { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, false },
2799 { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, true },
2800
2801 /* undefined UProperty values */
2802 { 0x61, 0x4a7, 0 },
2803 { 0x234bc, 0x15ed, 0 }
2804 };
2805
2806 UVersionInfo version;
2807 UChar32 c;
2808 int32_t i, result, uVersion;
2809 UProperty which;
2810
2811 /* what is our Unicode version? */
2812 u_getUnicodeVersion(version);
2813 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */
2814
2815 u_charAge(0x20, version);
2816 if(version[0]==0) {
2817 /* no additional properties available */
2818 log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2819 return;
2820 }
2821
2822 /* test u_charAge() */
2823 for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2824 u_charAge(charAges[i].c, version);
2825 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2826 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2827 charAges[i].c,
2828 version[0], version[1], version[2], version[3],
2829 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2830 }
2831 }
2832
2833 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2834 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2835 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */
2836 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2837 u_getIntPropertyMinValue(0x2345)!=0
2838 ) {
2839 log_err("error: u_getIntPropertyMinValue() wrong\n");
2840 }
2841 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2842 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2843 }
2844 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2845 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2846 }
2847 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2848 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2849 }
2850 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2851 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2852 }
2853 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2854 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2855 }
2856 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2857 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2858 }
2859 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2860 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2861 }
2862 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2863 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2864 }
2865 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2866 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2867 }
2868 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2869 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2870 }
2871 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2872 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2873 }
2874 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2875 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2876 }
2877 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2878 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2879 }
2880 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2881 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2882 }
2883 /*JB#2410*/
2884 if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2885 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2886 }
2887 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2888 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2889 }
2890 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) {
2891 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2892 }
2893 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2894 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2895 }
2896 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2897 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2898 }
2899
2900 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2901 for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2902 const char *whichName;
2903
2904 if(props[i][0]<0) {
2905 /* Unicode version break */
2906 if(uVersion<props[i][1]) {
2907 break; /* do not test properties that are not yet supported */
2908 } else {
2909 continue; /* skip this row */
2910 }
2911 }
2912
2913 c=(UChar32)props[i][0];
2914 which=(UProperty)props[i][1];
2915 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2916
2917 if(which<UCHAR_INT_START) {
2918 result=u_hasBinaryProperty(c, which);
2919 if(result!=props[i][2]) {
2920 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2921 c, whichName, result, i);
2922 }
2923 }
2924
2925 result=u_getIntPropertyValue(c, which);
2926 if(result!=props[i][2]) {
2927 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2928 c, whichName, result, props[i][2], i);
2929 }
2930
2931 /* test separate functions, too */
2932 switch((UProperty)props[i][1]) {
2933 case UCHAR_ALPHABETIC:
2934 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2935 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2936 props[i][0], result, i);
2937 }
2938 break;
2939 case UCHAR_LOWERCASE:
2940 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2941 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2942 props[i][0], result, i);
2943 }
2944 break;
2945 case UCHAR_UPPERCASE:
2946 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2947 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2948 props[i][0], result, i);
2949 }
2950 break;
2951 case UCHAR_WHITE_SPACE:
2952 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2953 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2954 props[i][0], result, i);
2955 }
2956 break;
2957 default:
2958 break;
2959 }
2960 }
2961
2962 // C API coverage
2963 if (u_stringHasBinaryProperty(u"⏱", 1, UCHAR_BASIC_EMOJI) ||
2964 u_stringHasBinaryProperty(u"⏱", -1, UCHAR_BASIC_EMOJI) ||
2965 !u_stringHasBinaryProperty(u"⏱\uFE0F", 2, UCHAR_BASIC_EMOJI) ||
2966 !u_stringHasBinaryProperty(u"⏱\uFE0F", -1, UCHAR_BASIC_EMOJI)) {
2967 log_data_err("error: u_stringHasBinaryProperty(stopwatch variants) is wrong\n");
2968 }
2969 }
2970
2971 static void
TestNumericProperties(void)2972 TestNumericProperties(void) {
2973 /* see UnicodeData.txt, DerivedNumericValues.txt */
2974 static const struct {
2975 UChar32 c;
2976 int32_t type;
2977 double numValue;
2978 } values[]={
2979 { 0x0F33, U_NT_NUMERIC, -1./2. },
2980 { 0x0C66, U_NT_DECIMAL, 0 },
2981 { 0x96f6, U_NT_NUMERIC, 0 },
2982 { 0xa833, U_NT_NUMERIC, 1./16. },
2983 { 0x2152, U_NT_NUMERIC, 1./10. },
2984 { 0x2151, U_NT_NUMERIC, 1./9. },
2985 { 0x1245f, U_NT_NUMERIC, 1./8. },
2986 { 0x2150, U_NT_NUMERIC, 1./7. },
2987 { 0x2159, U_NT_NUMERIC, 1./6. },
2988 { 0x09f6, U_NT_NUMERIC, 3./16. },
2989 { 0x2155, U_NT_NUMERIC, 1./5. },
2990 { 0x00BD, U_NT_NUMERIC, 1./2. },
2991 { 0x0031, U_NT_DECIMAL, 1. },
2992 { 0x4e00, U_NT_NUMERIC, 1. },
2993 { 0x58f1, U_NT_NUMERIC, 1. },
2994 { 0x10320, U_NT_NUMERIC, 1. },
2995 { 0x0F2B, U_NT_NUMERIC, 3./2. },
2996 { 0x00B2, U_NT_DIGIT, 2. },
2997 { 0x5f10, U_NT_NUMERIC, 2. },
2998 { 0x1813, U_NT_DECIMAL, 3. },
2999 { 0x5f0e, U_NT_NUMERIC, 3. },
3000 { 0x2173, U_NT_NUMERIC, 4. },
3001 { 0x8086, U_NT_NUMERIC, 4. },
3002 { 0x278E, U_NT_DIGIT, 5. },
3003 { 0x1D7F2, U_NT_DECIMAL, 6. },
3004 { 0x247A, U_NT_DIGIT, 7. },
3005 { 0x7396, U_NT_NUMERIC, 9. },
3006 { 0x1372, U_NT_NUMERIC, 10. },
3007 { 0x216B, U_NT_NUMERIC, 12. },
3008 { 0x16EE, U_NT_NUMERIC, 17. },
3009 { 0x249A, U_NT_NUMERIC, 19. },
3010 { 0x303A, U_NT_NUMERIC, 30. },
3011 { 0x5345, U_NT_NUMERIC, 30. },
3012 { 0x32B2, U_NT_NUMERIC, 37. },
3013 { 0x1375, U_NT_NUMERIC, 40. },
3014 { 0x10323, U_NT_NUMERIC, 50. },
3015 { 0x0BF1, U_NT_NUMERIC, 100. },
3016 { 0x964c, U_NT_NUMERIC, 100. },
3017 { 0x217E, U_NT_NUMERIC, 500. },
3018 { 0x2180, U_NT_NUMERIC, 1000. },
3019 { 0x4edf, U_NT_NUMERIC, 1000. },
3020 { 0x2181, U_NT_NUMERIC, 5000. },
3021 { 0x137C, U_NT_NUMERIC, 10000. },
3022 { 0x4e07, U_NT_NUMERIC, 10000. },
3023 { 0x12432, U_NT_NUMERIC, 216000. },
3024 { 0x12433, U_NT_NUMERIC, 432000. },
3025 { 0x4ebf, U_NT_NUMERIC, 100000000. },
3026 { 0x5146, U_NT_NUMERIC, 1000000000000. },
3027 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
3028 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
3029 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
3030 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
3031 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
3032 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
3033 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
3034 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
3035 };
3036
3037 double nv;
3038 UChar32 c;
3039 int32_t i, type;
3040
3041 for(i=0; i<UPRV_LENGTHOF(values); ++i) {
3042 c=values[i].c;
3043 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
3044 nv=u_getNumericValue(c);
3045
3046 if(type!=values[i].type) {
3047 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
3048 }
3049 if(0.000001 <= fabs(nv - values[i].numValue)) {
3050 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3051 }
3052 }
3053 }
3054
3055 /**
3056 * Test the property names and property value names API.
3057 */
3058 static void
TestPropertyNames(void)3059 TestPropertyNames(void) {
3060 int32_t p, v, choice=0, rev;
3061 UBool atLeastSomething = false;
3062
3063 for (p=0; ; ++p) {
3064 UProperty propEnum = (UProperty)p;
3065 UBool sawProp = false;
3066 if(p > 10 && !atLeastSomething) {
3067 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3068 return;
3069 }
3070
3071 for (choice=0; ; ++choice) {
3072 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3073 if (name) {
3074 if (!sawProp)
3075 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3076 log_verbose("%d=\"%s\"", choice, name);
3077 sawProp = true;
3078 atLeastSomething = true;
3079
3080 /* test reverse mapping */
3081 rev = u_getPropertyEnum(name);
3082 if (rev != p) {
3083 log_err("Property round-trip failure: %d -> %s -> %d\n",
3084 p, name, rev);
3085 }
3086 }
3087 if (!name && choice>0) break;
3088 }
3089 if (sawProp) {
3090 /* looks like a valid property; check the values */
3091 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3092 int32_t max = 0;
3093 if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3094 max = 255;
3095 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3096 /* it's far too slow to iterate all the way up to
3097 the real max, U_GC_P_MASK */
3098 max = U_GC_NL_MASK;
3099 } else if (p == UCHAR_BLOCK) {
3100 /* UBlockCodes, unlike other values, start at 1 */
3101 max = 1;
3102 }
3103 log_verbose("\n");
3104 for (v=-1; ; ++v) {
3105 UBool sawValue = false;
3106 for (choice=0; ; ++choice) {
3107 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3108 if (vname) {
3109 if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3110 log_verbose("%d=\"%s\"", choice, vname);
3111 sawValue = true;
3112
3113 /* test reverse mapping */
3114 rev = u_getPropertyValueEnum(propEnum, vname);
3115 if (rev != v) {
3116 log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3117 pname, v, vname, rev);
3118 }
3119 }
3120 if (!vname && choice>0) break;
3121 }
3122 if (sawValue) {
3123 log_verbose("\n");
3124 }
3125 if (!sawValue && v>=max) break;
3126 }
3127 }
3128 if (!sawProp) {
3129 if (p>=UCHAR_STRING_LIMIT) {
3130 break;
3131 } else if (p>=UCHAR_DOUBLE_LIMIT) {
3132 p = UCHAR_STRING_START - 1;
3133 } else if (p>=UCHAR_MASK_LIMIT) {
3134 p = UCHAR_DOUBLE_START - 1;
3135 } else if (p>=UCHAR_INT_LIMIT) {
3136 p = UCHAR_MASK_START - 1;
3137 } else if (p>=UCHAR_BINARY_LIMIT) {
3138 p = UCHAR_INT_START - 1;
3139 }
3140 }
3141 }
3142 }
3143
3144 /**
3145 * Test the property values API. See JB#2410.
3146 */
3147 static void
TestPropertyValues(void)3148 TestPropertyValues(void) {
3149 int32_t i, p, min, max;
3150 UErrorCode ec;
3151
3152 /* Min should be 0 for everything. */
3153 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3154 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3155 UProperty propEnum = (UProperty)p;
3156 min = u_getIntPropertyMinValue(propEnum);
3157 if (min != 0) {
3158 if (p == UCHAR_BLOCK) {
3159 /* This is okay...for now. See JB#2487.
3160 TODO Update this for JB#2487. */
3161 } else {
3162 const char* name;
3163 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3164 if (name == NULL)
3165 name = "<ERROR>";
3166 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3167 name, min);
3168 }
3169 }
3170 }
3171
3172 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3173 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3174 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3175 }
3176
3177 /* Max should be -1 for invalid properties. */
3178 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3179 if (max != -1) {
3180 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3181 max);
3182 }
3183
3184 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3185 for (i=0; i<2; ++i) {
3186 int32_t script;
3187 const char* desc;
3188 ec = U_ZERO_ERROR;
3189 switch (i) {
3190 case 0:
3191 script = uscript_getScript(-1, &ec);
3192 desc = "uscript_getScript(-1)";
3193 break;
3194 case 1:
3195 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3196 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3197 break;
3198 default:
3199 log_err("Internal test error. Too many scripts\n");
3200 return;
3201 }
3202 /* We don't explicitly test ec. It should be U_FAILURE but it
3203 isn't documented as such. */
3204 if (script != (int32_t)USCRIPT_INVALID_CODE) {
3205 log_err("FAIL: %s = %d, exp. 0\n",
3206 desc, script);
3207 }
3208 }
3209 }
3210
3211 /* various tests for consistency of UCD data and API behavior */
3212 static void
TestConsistencynull3213 TestConsistency() {
3214 char buffer[300];
3215 USet *set1, *set2, *set3, *set4;
3216 UErrorCode errorCode;
3217
3218 UChar32 start, end;
3219 int32_t i, length;
3220
3221 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3222 U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3223 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3224 U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3225 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3226
3227 U_STRING_DECL(mathBlocksPattern,
3228 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3229 214);
3230 U_STRING_DECL(mathPattern, "[:Math:]", 8);
3231 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3232 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3233 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3234
3235 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3236 U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3237 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3238 U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3239 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3240
3241 U_STRING_INIT(mathBlocksPattern,
3242 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3243 214);
3244 U_STRING_INIT(mathPattern, "[:Math:]", 8);
3245 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3246 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3247 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3248
3249 /*
3250 * It used to be that UCD.html and its precursors said
3251 * "Those dashes used to mark connections between pieces of words,
3252 * plus the Katakana middle dot."
3253 *
3254 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3255 * but not from Hyphen.
3256 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3257 * Therefore, do not show errors when testing the Hyphen property.
3258 */
3259 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3260 "known to the UTC and not considered errors.\n");
3261
3262 errorCode=U_ZERO_ERROR;
3263 set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3264 set2=uset_openPattern(dashPattern, 8, &errorCode);
3265 if(U_SUCCESS(errorCode)) {
3266 /* remove the Katakana middle dot(s) from set1 */
3267 uset_remove(set1, 0x30fb);
3268 uset_remove(set1, 0xff65); /* halfwidth variant */
3269 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", false);
3270 } else {
3271 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3272 }
3273
3274 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3275 set3=uset_openPattern(formatPattern, 6, &errorCode);
3276 set4=uset_openPattern(alphaPattern, 14, &errorCode);
3277 if(U_SUCCESS(errorCode)) {
3278 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", false);
3279 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", true);
3280 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", true);
3281 } else {
3282 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3283 }
3284
3285 uset_close(set1);
3286 uset_close(set2);
3287 uset_close(set3);
3288 uset_close(set4);
3289
3290 /*
3291 * Check that each lowercase character has "small" in its name
3292 * and not "capital".
3293 * There are some such characters, some of which seem odd.
3294 * Use the verbose flag to see these notices.
3295 */
3296 errorCode=U_ZERO_ERROR;
3297 set1=uset_openPattern(lowerPattern, 13, &errorCode);
3298 if(U_SUCCESS(errorCode)) {
3299 for(i=0;; ++i) {
3300 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3301 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3302 break; /* done */
3303 }
3304 if(U_FAILURE(errorCode)) {
3305 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3306 i, u_errorName(errorCode));
3307 break;
3308 }
3309 if(length!=0) {
3310 break; /* done with code points, got a string or -1 */
3311 }
3312
3313 while(start<=end) {
3314 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3315 if(U_FAILURE(errorCode)) {
3316 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3317 errorCode=U_ZERO_ERROR;
3318 }
3319 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3320 strstr(buffer, "SMALL CAPITAL")==NULL
3321 ) {
3322 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3323 }
3324 ++start;
3325 }
3326 }
3327 } else {
3328 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3329 }
3330 uset_close(set1);
3331
3332 /* verify that all assigned characters in Math blocks are exactly Math characters */
3333 errorCode=U_ZERO_ERROR;
3334 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3335 set2=uset_openPattern(mathPattern, 8, &errorCode);
3336 set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3337 if(U_SUCCESS(errorCode)) {
3338 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3339 uset_complement(set3); /* assigned characters */
3340 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3341 compareUSets(set1, set2,
3342 "[assigned Math block chars]", "[math blocks]&[:Math:]",
3343 true);
3344 } else {
3345 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3346 }
3347 uset_close(set1);
3348 uset_close(set2);
3349 uset_close(set3);
3350
3351 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3352 errorCode=U_ZERO_ERROR;
3353 set1=uset_openPattern(unknownPattern, 14, &errorCode);
3354 set2=uset_openPattern(reservedPattern, 20, &errorCode);
3355 if(U_SUCCESS(errorCode)) {
3356 compareUSets(set1, set2,
3357 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3358 true);
3359 } else {
3360 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3361 }
3362 uset_close(set1);
3363 uset_close(set2);
3364 }
3365
3366 /* test case folding, compare return values with CaseFolding.txt ------------ */
3367
/*
 * Bit flags naming the kinds of case folding that have already been tested
 * for a character. CF_ALL is the union of the three individual flags.
 */
enum {
    CF_SIMPLE = 1 << 0,
    CF_FULL   = 1 << 1,
    CF_TURKIC = 1 << 2,
    CF_ALL    = CF_SIMPLE | CF_FULL | CF_TURKIC
};
3375
3376 static void
testFold(UChar32 c, int which, UChar32 simple, UChar32 turkic, const UChar *full, int32_t fullLength, const UChar *turkicFull, int32_t turkicFullLength)3377 testFold(UChar32 c, int which,
3378 UChar32 simple, UChar32 turkic,
3379 const UChar *full, int32_t fullLength,
3380 const UChar *turkicFull, int32_t turkicFullLength) {
3381 UChar s[2], t[32];
3382 UChar32 c2;
3383 int32_t length, length2;
3384
3385 UErrorCode errorCode=U_ZERO_ERROR;
3386
3387 length=0;
3388 U16_APPEND_UNSAFE(s, length, c);
3389
3390 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3391 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3392 }
3393 if((which&CF_FULL)!=0) {
3394 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3395 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3396 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3397 }
3398 }
3399 if((which&CF_TURKIC)!=0) {
3400 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3401 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3402 }
3403
3404 length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3405 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3406 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3407 }
3408 }
3409 }
3410
3411 /* test that c case-folds to itself */
3412 static void
testFoldToSelf(UChar32 c, int which)3413 testFoldToSelf(UChar32 c, int which) {
3414 UChar s[2];
3415 int32_t length;
3416
3417 length=0;
3418 U16_APPEND_UNSAFE(s, length, c);
3419 testFold(c, which, c, c, s, length, s, length);
3420 }
3421
3422 struct CaseFoldingData {
3423 USet *notSeen;
3424 UChar32 prev, prevSimple;
3425 UChar prevFull[32];
3426 int32_t prevFullLength;
3427 int which;
3428 };
3429 typedef struct CaseFoldingData CaseFoldingData;
3430
3431 static void U_CALLCONV
caseFoldingLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode)3432 caseFoldingLineFn(void *context,
3433 char *fields[][2], int32_t fieldCount,
3434 UErrorCode *pErrorCode) {
3435 (void)fieldCount; // suppress compiler warnings about unused variable
3436
3437 CaseFoldingData *pData=(CaseFoldingData *)context;
3438 char *end;
3439 UChar full[32];
3440 UChar32 c, prev, simple;
3441 int32_t count;
3442 int which;
3443 char status;
3444
3445 /* get code point */
3446 const char *s=u_skipWhitespace(fields[0][0]);
3447 if(0==strncmp(s, "0000..10FFFF", 12)) {
3448 /*
3449 * Ignore the line
3450 * # @missing: 0000..10FFFF; C; <code point>
3451 * because maps-to-self is already our default, and this line breaks this parser.
3452 */
3453 return;
3454 }
3455 c=(UChar32)strtoul(s, &end, 16);
3456 end=(char *)u_skipWhitespace(end);
3457 if(end<=fields[0][0] || end!=fields[0][1]) {
3458 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3459 *pErrorCode=U_PARSE_ERROR;
3460 return;
3461 }
3462
3463 /* get the status of this mapping */
3464 status=*u_skipWhitespace(fields[1][0]);
3465 if(status!='C' && status!='S' && status!='F' && status!='T') {
3466 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3467 *pErrorCode=U_PARSE_ERROR;
3468 return;
3469 }
3470
3471 /* get the mapping */
3472 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3473 if(U_FAILURE(*pErrorCode)) {
3474 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3475 return;
3476 }
3477
3478 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3479 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3480 simple=c;
3481 }
3482
3483 if(c!=(prev=pData->prev)) {
3484 /*
3485 * Test remaining mappings for the previous code point.
3486 * If a turkic folding was not mentioned, then it should fold the same
3487 * as the regular simple case folding.
3488 */
3489 UChar prevString[2];
3490 int32_t length;
3491
3492 length=0;
3493 U16_APPEND_UNSAFE(prevString, length, prev);
3494 testFold(prev, (~pData->which)&CF_ALL,
3495 prev, pData->prevSimple,
3496 prevString, length,
3497 pData->prevFull, pData->prevFullLength);
3498 pData->prev=pData->prevSimple=c;
3499 length=0;
3500 U16_APPEND_UNSAFE(pData->prevFull, length, c);
3501 pData->prevFullLength=length;
3502 pData->which=0;
3503 }
3504
3505 /*
3506 * Turn the status into a bit set of case foldings to test.
3507 * Remember non-Turkic case foldings as defaults for Turkic mode.
3508 */
3509 switch(status) {
3510 case 'C':
3511 which=CF_SIMPLE|CF_FULL;
3512 pData->prevSimple=simple;
3513 u_memcpy(pData->prevFull, full, count);
3514 pData->prevFullLength=count;
3515 break;
3516 case 'S':
3517 which=CF_SIMPLE;
3518 pData->prevSimple=simple;
3519 break;
3520 case 'F':
3521 which=CF_FULL;
3522 u_memcpy(pData->prevFull, full, count);
3523 pData->prevFullLength=count;
3524 break;
3525 case 'T':
3526 which=CF_TURKIC;
3527 break;
3528 default:
3529 which=0;
3530 break; /* won't happen because of test above */
3531 }
3532
3533 testFold(c, which, simple, simple, full, count, full, count);
3534
3535 /* remember which case foldings of c have been tested */
3536 pData->which|=which;
3537
3538 /* remove c from the set of ones not mentioned in CaseFolding.txt */
3539 uset_remove(pData->notSeen, c);
3540 }
3541
3542 static void
TestCaseFoldingnull3543 TestCaseFolding() {
3544 CaseFoldingData data={ NULL, 0, 0, {0}, 0, 0 };
3545 char *fields[3][2];
3546 UErrorCode errorCode;
3547
3548 static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3549
3550 errorCode=U_ZERO_ERROR;
3551 /* test BMP & plane 1 - nothing interesting above */
3552 data.notSeen=uset_open(0, 0x1ffff);
3553 data.prevFullLength=1; /* length of full case folding of U+0000 */
3554
3555 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3556 if(U_SUCCESS(errorCode)) {
3557 int32_t i, start, end;
3558
3559 /* add a pseudo-last line to finish testing of the actual last one */
3560 fields[0][0]=lastLine;
3561 fields[0][1]=lastLine+6;
3562 fields[1][0]=lastLine+7;
3563 fields[1][1]=lastLine+9;
3564 fields[2][0]=lastLine+10;
3565 fields[2][1]=lastLine+17;
3566 caseFoldingLineFn(&data, fields, 3, &errorCode);
3567
3568 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3569 for(i=0;
3570 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3571 U_SUCCESS(errorCode);
3572 ++i
3573 ) {
3574 do {
3575 testFoldToSelf(start, CF_ALL);
3576 } while(++start<=end);
3577 }
3578 }
3579
3580 uset_close(data.notSeen);
3581 }
3582
TestBinaryCharacterPropertiesAPInull3583 static void TestBinaryCharacterPropertiesAPI() {
3584 // API test only. See intltest/ucdtest.cpp for functional test.
3585 UErrorCode errorCode = U_ZERO_ERROR;
3586 const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3587 if (U_SUCCESS(errorCode)) {
3588 log_err("u_getBinaryPropertySet(-1) did not fail\n");
3589 }
3590 errorCode = U_ZERO_ERROR;
3591 set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3592 if (U_SUCCESS(errorCode)) {
3593 log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3594 }
3595 errorCode = U_ZERO_ERROR;
3596 set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3597 if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3598 log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3599 }
3600 }
3601
TestIntCharacterPropertiesAPInull3602 static void TestIntCharacterPropertiesAPI() {
3603 // API test only. See intltest/ucdtest.cpp for functional test.
3604 UErrorCode errorCode = U_ZERO_ERROR;
3605 const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3606 if (U_SUCCESS(errorCode)) {
3607 log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3608 }
3609 errorCode = U_ZERO_ERROR;
3610 map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3611 if (U_SUCCESS(errorCode)) {
3612 log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3613 }
3614 errorCode = U_ZERO_ERROR;
3615 map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3616 if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3617 log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3618 }
3619 }
3620