1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4 ***************************************************************************** 5 * 6 * Copyright (C) 1998-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ***************************************************************************** 10 * 11 * ucnv_err.c 12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode 13 * 14 * 15* Change history: 16* 17* 06/29/2000 helena Major rewrite of the callback APIs. 18*/ 19 20#include "unicode/utypes.h" 21 22#if !UCONFIG_NO_CONVERSION 23 24#include "unicode/ucnv_err.h" 25#include "unicode/ucnv_cb.h" 26#include "ucnv_cnv.h" 27#include "cmemory.h" 28#include "unicode/ucnv.h" 29#include "ustrfmt.h" 30 31#define VALUE_STRING_LENGTH 48 32/*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */ 33#define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025 34#define UNICODE_U_CODEPOINT 0x0055 35#define UNICODE_X_CODEPOINT 0x0058 36#define UNICODE_RS_CODEPOINT 0x005C 37#define UNICODE_U_LOW_CODEPOINT 0x0075 38#define UNICODE_X_LOW_CODEPOINT 0x0078 39#define UNICODE_AMP_CODEPOINT 0x0026 40#define UNICODE_HASH_CODEPOINT 0x0023 41#define UNICODE_SEMICOLON_CODEPOINT 0x003B 42#define UNICODE_PLUS_CODEPOINT 0x002B 43#define UNICODE_LEFT_CURLY_CODEPOINT 0x007B 44#define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D 45#define UNICODE_SPACE_CODEPOINT 0x0020 46#define UCNV_PRV_ESCAPE_ICU 0 47#define UCNV_PRV_ESCAPE_C 'C' 48#define UCNV_PRV_ESCAPE_XML_DEC 'D' 49#define UCNV_PRV_ESCAPE_XML_HEX 'X' 50#define UCNV_PRV_ESCAPE_JAVA 'J' 51#define UCNV_PRV_ESCAPE_UNICODE 'U' 52#define UCNV_PRV_ESCAPE_CSS2 'S' 53#define UCNV_PRV_STOP_ON_ILLEGAL 'i' 54 55/* 56 * IS_DEFAULT_IGNORABLE_CODE_POINT 57 * This is to check if a code point has the default ignorable unicode property. 58 * As such, this list needs to be updated if the ignorable code point list ever 59 * changes. 60 * To avoid dependency on other code, this list is hard coded here. 61 * When an ignorable code point is found and is unmappable, the default callbacks 62 * will ignore them. 63 * For a list of the default ignorable code points, use this link: 64 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i= 65 * 66 * This list should be sync with the one in CharsetCallback.java 67 */ 68#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \ 69 (c == 0x00AD) || \ 70 (c == 0x034F) || \ 71 (c == 0x061C) || \ 72 (c == 0x115F) || \ 73 (c == 0x1160) || \ 74 (0x17B4 <= c && c <= 0x17B5) || \ 75 (0x180B <= c && c <= 0x180F) || \ 76 (0x200B <= c && c <= 0x200F) || \ 77 (0x202A <= c && c <= 0x202E) || \ 78 (0x2060 <= c && c <= 0x206F) || \ 79 (c == 0x3164) || \ 80 (0xFE00 <= c && c <= 0xFE0F) || \ 81 (c == 0xFEFF) || \ 82 (c == 0xFFA0) || \ 83 (0xFFF0 <= c && c <= 0xFFF8) || \ 84 (0x1BCA0 <= c && c <= 0x1BCA3) || \ 85 (0x1D173 <= c && c <= 0x1D17A) || \ 86 (0xE0000 <= c && c <= 0xE0FFF)) 87 88 89/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 90U_CAPI void U_EXPORT2 91UCNV_FROM_U_CALLBACK_STOP ( 92 const void *context, 93 UConverterFromUnicodeArgs *fromUArgs, 94 const char16_t* codeUnits, 95 int32_t length, 96 UChar32 codePoint, 97 UConverterCallbackReason reason, 98 UErrorCode * err) 99{ 100 (void)context; 101 (void)fromUArgs; 102 (void)codeUnits; 103 (void)length; 104 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 105 { 106 /* 107 * Skip if the codepoint has unicode property of default ignorable. 108 */ 109 *err = U_ZERO_ERROR; 110 } 111 /* the caller must have set the error code accordingly */ 112 return; 113} 114 115 116/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 117U_CAPI void U_EXPORT2 118UCNV_TO_U_CALLBACK_STOP ( 119 const void *context, 120 UConverterToUnicodeArgs *toUArgs, 121 const char* codePoints, 122 int32_t length, 123 UConverterCallbackReason reason, 124 UErrorCode * err) 125{ 126 /* the caller must have set the error code accordingly */ 127 (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err; 128 return; 129} 130 131U_CAPI void U_EXPORT2 132UCNV_FROM_U_CALLBACK_SKIP ( 133 const void *context, 134 UConverterFromUnicodeArgs *fromUArgs, 135 const char16_t* codeUnits, 136 int32_t length, 137 UChar32 codePoint, 138 UConverterCallbackReason reason, 139 UErrorCode * err) 140{ 141 (void)fromUArgs; 142 (void)codeUnits; 143 (void)length; 144 if (reason <= UCNV_IRREGULAR) 145 { 146 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 147 { 148 /* 149 * Skip if the codepoint has unicode property of default ignorable. 150 */ 151 *err = U_ZERO_ERROR; 152 } 153 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 154 { 155 *err = U_ZERO_ERROR; 156 } 157 /* else the caller must have set the error code accordingly. */ 158 } 159 /* else ignore the reset, close and clone calls. */ 160} 161 162U_CAPI void U_EXPORT2 163UCNV_FROM_U_CALLBACK_SUBSTITUTE ( 164 const void *context, 165 UConverterFromUnicodeArgs *fromArgs, 166 const char16_t* codeUnits, 167 int32_t length, 168 UChar32 codePoint, 169 UConverterCallbackReason reason, 170 UErrorCode * err) 171{ 172 (void)codeUnits; 173 (void)length; 174 if (reason <= UCNV_IRREGULAR) 175 { 176 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 177 { 178 /* 179 * Skip if the codepoint has unicode property of default ignorable. 180 */ 181 *err = U_ZERO_ERROR; 182 } 183 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 184 { 185 *err = U_ZERO_ERROR; 186 ucnv_cbFromUWriteSub(fromArgs, 0, err); 187 } 188 /* else the caller must have set the error code accordingly. */ 189 } 190 /* else ignore the reset, close and clone calls. */ 191} 192 193/*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 194 *uses a clean copy (resetted) of the converter, to convert that unicode 195 *escape sequence to the target codepage (if conversion failure happens then 196 *we revert to substituting with subchar) 197 */ 198U_CAPI void U_EXPORT2 199UCNV_FROM_U_CALLBACK_ESCAPE ( 200 const void *context, 201 UConverterFromUnicodeArgs *fromArgs, 202 const char16_t *codeUnits, 203 int32_t length, 204 UChar32 codePoint, 205 UConverterCallbackReason reason, 206 UErrorCode * err) 207{ 208 209 char16_t valueString[VALUE_STRING_LENGTH]; 210 int32_t valueStringLength = 0; 211 int32_t i = 0; 212 213 const char16_t *myValueSource = nullptr; 214 UErrorCode err2 = U_ZERO_ERROR; 215 UConverterFromUCallback original = nullptr; 216 const void *originalContext; 217 218 UConverterFromUCallback ignoredCallback = nullptr; 219 const void *ignoredContext; 220 221 if (reason > UCNV_IRREGULAR) 222 { 223 return; 224 } 225 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 226 { 227 /* 228 * Skip if the codepoint has unicode property of default ignorable. 229 */ 230 *err = U_ZERO_ERROR; 231 return; 232 } 233 234 ucnv_setFromUCallBack (fromArgs->converter, 235 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE, 236 nullptr, 237 &original, 238 &originalContext, 239 &err2); 240 241 if (U_FAILURE (err2)) 242 { 243 *err = err2; 244 return; 245 } 246 if(context==nullptr) 247 { 248 while (i < length) 249 { 250 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 251 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 252 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 253 } 254 } 255 else 256 { 257 switch(*((char*)context)) 258 { 259 case UCNV_PRV_ESCAPE_JAVA: 260 while (i < length) 261 { 262 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 263 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */ 264 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 265 } 266 break; 267 268 case UCNV_PRV_ESCAPE_C: 269 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 270 271 if(length==2){ 272 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 273 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8); 274 275 } 276 else{ 277 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */ 278 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 279 } 280 break; 281 282 case UCNV_PRV_ESCAPE_XML_DEC: 283 284 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 285 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 286 if(length==2){ 287 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0); 288 } 289 else{ 290 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0); 291 } 292 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 293 break; 294 295 case UCNV_PRV_ESCAPE_XML_HEX: 296 297 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 298 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 299 valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ 300 if(length==2){ 301 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 302 } 303 else{ 304 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0); 305 } 306 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 307 break; 308 309 case UCNV_PRV_ESCAPE_UNICODE: 310 valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ 311 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 312 valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */ 313 if (length == 2) { 314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4); 315 } else { 316 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 317 } 318 valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ 319 break; 320 321 case UCNV_PRV_ESCAPE_CSS2: 322 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 323 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 324 /* Always add space character, because the next character might be whitespace, 325 which would erroneously be considered the termination of the escape sequence. */ 326 valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT; 327 break; 328 329 default: 330 while (i < length) 331 { 332 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 333 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 334 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 335 } 336 } 337 } 338 myValueSource = valueString; 339 340 /* reset the error */ 341 *err = U_ZERO_ERROR; 342 343 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err); 344 345 ucnv_setFromUCallBack (fromArgs->converter, 346 original, 347 originalContext, 348 &ignoredCallback, 349 &ignoredContext, 350 &err2); 351 if (U_FAILURE (err2)) 352 { 353 *err = err2; 354 return; 355 } 356 357 return; 358} 359 360 361 362U_CAPI void U_EXPORT2 363UCNV_TO_U_CALLBACK_SKIP ( 364 const void *context, 365 UConverterToUnicodeArgs *toArgs, 366 const char* codeUnits, 367 int32_t length, 368 UConverterCallbackReason reason, 369 UErrorCode * err) 370{ 371 (void)toArgs; 372 (void)codeUnits; 373 (void)length; 374 if (reason <= UCNV_IRREGULAR) 375 { 376 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 377 { 378 *err = U_ZERO_ERROR; 379 } 380 /* else the caller must have set the error code accordingly. */ 381 } 382 /* else ignore the reset, close and clone calls. */ 383} 384 385U_CAPI void U_EXPORT2 386UCNV_TO_U_CALLBACK_SUBSTITUTE ( 387 const void *context, 388 UConverterToUnicodeArgs *toArgs, 389 const char* codeUnits, 390 int32_t length, 391 UConverterCallbackReason reason, 392 UErrorCode * err) 393{ 394 (void)codeUnits; 395 (void)length; 396 if (reason <= UCNV_IRREGULAR) 397 { 398 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 399 { 400 *err = U_ZERO_ERROR; 401 ucnv_cbToUWriteSub(toArgs,0,err); 402 } 403 /* else the caller must have set the error code accordingly. */ 404 } 405 /* else ignore the reset, close and clone calls. */ 406} 407 408/*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 409 *and uses that as the substitution sequence 410 */ 411U_CAPI void U_EXPORT2 412UCNV_TO_U_CALLBACK_ESCAPE ( 413 const void *context, 414 UConverterToUnicodeArgs *toArgs, 415 const char* codeUnits, 416 int32_t length, 417 UConverterCallbackReason reason, 418 UErrorCode * err) 419{ 420 char16_t uniValueString[VALUE_STRING_LENGTH]; 421 int32_t valueStringLength = 0; 422 int32_t i = 0; 423 424 if (reason > UCNV_IRREGULAR) 425 { 426 return; 427 } 428 429 if(context==nullptr) 430 { 431 while (i < length) 432 { 433 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 434 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */ 435 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 436 } 437 } 438 else 439 { 440 switch(*((char*)context)) 441 { 442 case UCNV_PRV_ESCAPE_XML_DEC: 443 while (i < length) 444 { 445 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 446 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 447 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0); 448 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 449 } 450 break; 451 452 case UCNV_PRV_ESCAPE_XML_HEX: 453 while (i < length) 454 { 455 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 456 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 457 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ 458 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0); 459 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 460 } 461 break; 462 case UCNV_PRV_ESCAPE_C: 463 while (i < length) 464 { 465 uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 466 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ 467 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2); 468 } 469 break; 470 default: 471 while (i < length) 472 { 473 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 474 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */ 475 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 476 valueStringLength += 2; 477 } 478 } 479 } 480 /* reset the error */ 481 *err = U_ZERO_ERROR; 482 483 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err); 484} 485 486#endif 487