1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 *****************************************************************************
5 *
6 *   Copyright (C) 1998-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *****************************************************************************
10 *
11 *  ucnv_err.c
12 *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13 *
14 *
15*   Change history:
16*
17*   06/29/2000  helena      Major rewrite of the callback APIs.
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_CONVERSION
23
24#include "unicode/ucnv_err.h"
25#include "unicode/ucnv_cb.h"
26#include "ucnv_cnv.h"
27#include "cmemory.h"
28#include "unicode/ucnv.h"
29#include "ustrfmt.h"
30
/*
 * Capacity, in char16_t units, of the scratch buffers in which the escape
 * callbacks format their output.
 * NOTE(review): an older comment derived a value of 32 as 4 chars * 8 bytes.
 * 48 presumably is 6 chars per escaped byte (the widest per-byte form built
 * in this file, e.g. "&#255;") * 8 (max number of bytes per char for any
 * converter) - confirm.
 */
#define VALUE_STRING_LENGTH 48

/* Unicode code points used to assemble escape sequences, in code point order. */
#define UNICODE_SPACE_CODEPOINT         0x0020  /* space       */
#define UNICODE_HASH_CODEPOINT          0x0023  /* #           */
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025  /* %           */
#define UNICODE_AMP_CODEPOINT           0x0026  /* &           */
#define UNICODE_PLUS_CODEPOINT          0x002B  /* +           */
#define UNICODE_SEMICOLON_CODEPOINT     0x003B  /* ;           */
#define UNICODE_U_CODEPOINT             0x0055  /* U           */
#define UNICODE_X_CODEPOINT             0x0058  /* X           */
#define UNICODE_RS_CODEPOINT            0x005C  /* backslash   */
#define UNICODE_U_LOW_CODEPOINT         0x0075  /* u           */
#define UNICODE_X_LOW_CODEPOINT         0x0078  /* x           */
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B  /* {           */
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D  /* }           */

/*
 * Selector values compared against the first char of a non-null callback
 * context.
 */
#define UCNV_PRV_ESCAPE_ICU         0
#define UCNV_PRV_ESCAPE_C           'C'
#define UCNV_PRV_ESCAPE_XML_DEC     'D'
#define UCNV_PRV_ESCAPE_JAVA        'J'
#define UCNV_PRV_ESCAPE_CSS2        'S'
#define UCNV_PRV_ESCAPE_UNICODE     'U'
#define UCNV_PRV_ESCAPE_XML_HEX     'X'
#define UCNV_PRV_STOP_ON_ILLEGAL    'i'
54
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT(c)
 * Evaluates to non-zero when the code point c is in the hard-coded list of
 * default ignorable code points below, and to zero otherwise.
 * The list is hard coded here to avoid a dependency on other code, so it
 * needs to be updated if the set of default ignorable code points ever
 * changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks ignore it.
 * For a list of the default ignorable code points, use this link:
 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
 *
 * This list should be kept in sync with the one in CharsetCallback.java.
 *
 * Every use of the argument is parenthesized so that an arbitrary expression
 * can be passed. The argument is still evaluated more than once, so it must
 * not have side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180F) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF))
87
88
89/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
90U_CAPI void    U_EXPORT2
91UCNV_FROM_U_CALLBACK_STOP (
92                  const void *context,
93                  UConverterFromUnicodeArgs *fromUArgs,
94                  const char16_t* codeUnits,
95                  int32_t length,
96                  UChar32 codePoint,
97                  UConverterCallbackReason reason,
98                  UErrorCode * err)
99{
100    (void)context;
101    (void)fromUArgs;
102    (void)codeUnits;
103    (void)length;
104    if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
105    {
106        /*
107         * Skip if the codepoint has unicode property of default ignorable.
108         */
109        *err = U_ZERO_ERROR;
110    }
111    /* the caller must have set the error code accordingly */
112    return;
113}
114
115
116/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
117U_CAPI void    U_EXPORT2
118UCNV_TO_U_CALLBACK_STOP (
119                   const void *context,
120                   UConverterToUnicodeArgs *toUArgs,
121                   const char* codePoints,
122                   int32_t length,
123                   UConverterCallbackReason reason,
124                   UErrorCode * err)
125{
126    /* the caller must have set the error code accordingly */
127    (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
128    return;
129}
130
131U_CAPI void    U_EXPORT2
132UCNV_FROM_U_CALLBACK_SKIP (
133                  const void *context,
134                  UConverterFromUnicodeArgs *fromUArgs,
135                  const char16_t* codeUnits,
136                  int32_t length,
137                  UChar32 codePoint,
138                  UConverterCallbackReason reason,
139                  UErrorCode * err)
140{
141    (void)fromUArgs;
142    (void)codeUnits;
143    (void)length;
144    if (reason <= UCNV_IRREGULAR)
145    {
146        if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
147        {
148            /*
149             * Skip if the codepoint has unicode property of default ignorable.
150             */
151            *err = U_ZERO_ERROR;
152        }
153        else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
154        {
155            *err = U_ZERO_ERROR;
156        }
157        /* else the caller must have set the error code accordingly. */
158    }
159    /* else ignore the reset, close and clone calls. */
160}
161
162U_CAPI void    U_EXPORT2
163UCNV_FROM_U_CALLBACK_SUBSTITUTE (
164                  const void *context,
165                  UConverterFromUnicodeArgs *fromArgs,
166                  const char16_t* codeUnits,
167                  int32_t length,
168                  UChar32 codePoint,
169                  UConverterCallbackReason reason,
170                  UErrorCode * err)
171{
172    (void)codeUnits;
173    (void)length;
174    if (reason <= UCNV_IRREGULAR)
175    {
176        if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
177        {
178            /*
179             * Skip if the codepoint has unicode property of default ignorable.
180             */
181            *err = U_ZERO_ERROR;
182        }
183        else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
184        {
185            *err = U_ZERO_ERROR;
186            ucnv_cbFromUWriteSub(fromArgs, 0, err);
187        }
188        /* else the caller must have set the error code accordingly. */
189    }
190    /* else ignore the reset, close and clone calls. */
191}
192
193/*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
194 *uses a clean copy (resetted) of the converter, to convert that unicode
195 *escape sequence to the target codepage (if conversion failure happens then
196 *we revert to substituting with subchar)
197 */
198U_CAPI void    U_EXPORT2
199UCNV_FROM_U_CALLBACK_ESCAPE (
200                         const void *context,
201                         UConverterFromUnicodeArgs *fromArgs,
202                         const char16_t *codeUnits,
203                         int32_t length,
204                         UChar32 codePoint,
205                         UConverterCallbackReason reason,
206                         UErrorCode * err)
207{
208
209  char16_t valueString[VALUE_STRING_LENGTH];
210  int32_t valueStringLength = 0;
211  int32_t i = 0;
212
213  const char16_t *myValueSource = nullptr;
214  UErrorCode err2 = U_ZERO_ERROR;
215  UConverterFromUCallback original = nullptr;
216  const void *originalContext;
217
218  UConverterFromUCallback ignoredCallback = nullptr;
219  const void *ignoredContext;
220
221  if (reason > UCNV_IRREGULAR)
222  {
223      return;
224  }
225  else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
226  {
227      /*
228       * Skip if the codepoint has unicode property of default ignorable.
229       */
230      *err = U_ZERO_ERROR;
231      return;
232  }
233
234  ucnv_setFromUCallBack (fromArgs->converter,
235                     (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
236                     nullptr,
237                     &original,
238                     &originalContext,
239                     &err2);
240
241  if (U_FAILURE (err2))
242  {
243    *err = err2;
244    return;
245  }
246  if(context==nullptr)
247  {
248      while (i < length)
249      {
250        valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
251        valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
252        valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
253      }
254  }
255  else
256  {
257      switch(*((char*)context))
258      {
259      case UCNV_PRV_ESCAPE_JAVA:
260          while (i < length)
261          {
262              valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
263              valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
264              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
265          }
266          break;
267
268      case UCNV_PRV_ESCAPE_C:
269          valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
270
271          if(length==2){
272              valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
273              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
274
275          }
276          else{
277              valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
278              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
279          }
280          break;
281
282      case UCNV_PRV_ESCAPE_XML_DEC:
283
284          valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
285          valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
286          if(length==2){
287              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
288          }
289          else{
290              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
291          }
292          valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
293          break;
294
295      case UCNV_PRV_ESCAPE_XML_HEX:
296
297          valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
298          valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
299          valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
300          if(length==2){
301              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
302          }
303          else{
304              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
305          }
306          valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
307          break;
308
309      case UCNV_PRV_ESCAPE_UNICODE:
310          valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
311          valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT;    /* adding U */
312          valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
313          if (length == 2) {
314              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
315          } else {
316              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
317          }
318          valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
319          break;
320
321      case UCNV_PRV_ESCAPE_CSS2:
322          valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
323          valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
324          /* Always add space character, because the next character might be whitespace,
325             which would erroneously be considered the termination of the escape sequence. */
326          valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
327          break;
328
329      default:
330          while (i < length)
331          {
332              valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
333              valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT;             /* adding U */
334              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
335          }
336      }
337  }
338  myValueSource = valueString;
339
340  /* reset the error */
341  *err = U_ZERO_ERROR;
342
343  ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
344
345  ucnv_setFromUCallBack (fromArgs->converter,
346                         original,
347                         originalContext,
348                         &ignoredCallback,
349                         &ignoredContext,
350                         &err2);
351  if (U_FAILURE (err2))
352  {
353      *err = err2;
354      return;
355  }
356
357  return;
358}
359
360
361
362U_CAPI void  U_EXPORT2
363UCNV_TO_U_CALLBACK_SKIP (
364                 const void *context,
365                 UConverterToUnicodeArgs *toArgs,
366                 const char* codeUnits,
367                 int32_t length,
368                 UConverterCallbackReason reason,
369                 UErrorCode * err)
370{
371    (void)toArgs;
372    (void)codeUnits;
373    (void)length;
374    if (reason <= UCNV_IRREGULAR)
375    {
376        if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
377        {
378            *err = U_ZERO_ERROR;
379        }
380        /* else the caller must have set the error code accordingly. */
381    }
382    /* else ignore the reset, close and clone calls. */
383}
384
385U_CAPI void    U_EXPORT2
386UCNV_TO_U_CALLBACK_SUBSTITUTE (
387                 const void *context,
388                 UConverterToUnicodeArgs *toArgs,
389                 const char* codeUnits,
390                 int32_t length,
391                 UConverterCallbackReason reason,
392                 UErrorCode * err)
393{
394    (void)codeUnits;
395    (void)length;
396    if (reason <= UCNV_IRREGULAR)
397    {
398        if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
399        {
400            *err = U_ZERO_ERROR;
401            ucnv_cbToUWriteSub(toArgs,0,err);
402        }
403        /* else the caller must have set the error code accordingly. */
404    }
405    /* else ignore the reset, close and clone calls. */
406}
407
408/*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
409 *and uses that as the substitution sequence
410 */
411U_CAPI void   U_EXPORT2
412UCNV_TO_U_CALLBACK_ESCAPE (
413                 const void *context,
414                 UConverterToUnicodeArgs *toArgs,
415                 const char* codeUnits,
416                 int32_t length,
417                 UConverterCallbackReason reason,
418                 UErrorCode * err)
419{
420    char16_t uniValueString[VALUE_STRING_LENGTH];
421    int32_t valueStringLength = 0;
422    int32_t i = 0;
423
424    if (reason > UCNV_IRREGULAR)
425    {
426        return;
427    }
428
429    if(context==nullptr)
430    {
431        while (i < length)
432        {
433            uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
434            uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT;    /* adding X */
435            valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
436        }
437    }
438    else
439    {
440        switch(*((char*)context))
441        {
442        case UCNV_PRV_ESCAPE_XML_DEC:
443            while (i < length)
444            {
445                uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
446                uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
447                valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
448                uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
449            }
450            break;
451
452        case UCNV_PRV_ESCAPE_XML_HEX:
453            while (i < length)
454            {
455                uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
456                uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
457                uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
458                valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
459                uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
460            }
461            break;
462        case UCNV_PRV_ESCAPE_C:
463            while (i < length)
464            {
465                uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
466                uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
467                valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
468            }
469            break;
470        default:
471            while (i < length)
472            {
473                uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
474                uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT;    /* adding X */
475                uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
476                valueStringLength += 2;
477            }
478        }
479    }
480    /* reset the error */
481    *err = U_ZERO_ERROR;
482
483    ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
484}
485
486#endif
487