xref: /third_party/lzma/CPP/Common/UTFConvert.cpp (revision 370b324c)
1// UTFConvert.cpp
2
3#include "StdAfx.h"
4
5// #include <stdio.h>
6
7#include "MyTypes.h"
8#include "UTFConvert.h"
9
10
// MY_UTF8_RAW_NON_UTF8_SUPPORTED is defined only when wchar_t is not 16-bit
// and the target is not Apple. Further down in this file it selects the
// BMP-escape flags in the default conversion flag sets.
#ifndef Z7_WCHART_IS_16BIT
#ifndef __APPLE__
  // we define it if the system supports files with non-utf8 symbols:
  #define MY_UTF8_RAW_NON_UTF8_SUPPORTED
#endif
#endif
17
18/*
19  MY_UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte
20
21  n : MY_UTF8_START(n) : Bits of code point
22
23  0 : 0x80 :    : unused
24  1 : 0xC0 : 11 :
25  2 : 0xE0 : 16 : Basic Multilingual Plane
26  3 : 0xF0 : 21 : Unicode space
27  4 : 0xF8 : 26 :
28  5 : 0xFC : 31 : UCS-4 : wcstombs() in ubuntu is limited to that value
29  6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
30  7 : 0xFF :
31*/
32
// Head-byte base value for a sequence with (n) tail bytes:
// n = 1..6 gives 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE.
#define MY_UTF8_START(n) (0x100 - (1 << (7 - (n))))

// One link of a head-byte classification chain.
// Expects the locals (c) = head byte, (val) = copy of head byte and (numBytes)
// to be in scope. If (c) is below the base of the next longer form, it records
// (n) tail bytes and strips the head marker bits from (val).
// It expands to a bare `if`, so links are chained with `else` in ascending (n).
#define MY_UTF8_HEAD_PARSE2(n) \
    if (c < MY_UTF8_START((n) + 1)) \
    { numBytes = (n); val -= MY_UTF8_START(n); }
38
#ifndef Z7_WCHART_IS_16BIT

/*
   if (wchar_t is 32-bit), we can support large points in long UTF-8 sequence,
   when we convert wchar_t strings to UTF-8:
     (MY_UTF8_NUM_TAIL_BYTES_MAX == 3) : (21-bits points) - Unicode
     (MY_UTF8_NUM_TAIL_BYTES_MAX == 5) : (31-bits points) - UCS-4
     (MY_UTF8_NUM_TAIL_BYTES_MAX == 6) : (36-bit hack)

   The code in this file only tests (MY_UTF8_NUM_TAIL_BYTES_MAX >= 6).
*/

#define MY_UTF8_NUM_TAIL_BYTES_MAX 5
#endif
51
52/*
53#define MY_UTF8_HEAD_PARSE \
54    UInt32 val = c; \
55         MY_UTF8_HEAD_PARSE2(1) \
56    else MY_UTF8_HEAD_PARSE2(2) \
57    else MY_UTF8_HEAD_PARSE2(3) \
58    else MY_UTF8_HEAD_PARSE2(4) \
59    else MY_UTF8_HEAD_PARSE2(5) \
60  #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
61    else MY_UTF8_HEAD_PARSE2(6)
62  #endif
63*/
64
// Head-byte classification limited to 1..3 tail bytes.
// Declares (val) and expects the locals (c) and (numBytes) to be in scope.
// Any head byte that is not a 1- or 2-tail-byte head is taken as a 3-tail-byte
// head, so the caller must already have rejected head bytes it does not accept.
#define MY_UTF8_HEAD_PARSE_MAX_3_BYTES \
    UInt32 val = c; \
         MY_UTF8_HEAD_PARSE2(1) \
    else MY_UTF8_HEAD_PARSE2(2) \
    else { numBytes = 3; val -= MY_UTF8_START(3); }


// Exclusive upper bound of the code points that fit into a sequence with
// (n) tail bytes: n = 1..5 gives 2^11, 2^16, 2^21, 2^26, 2^31.
#define MY_UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))


// Code points at or above this value are stored as a UTF-16 surrogate pair.
#define START_POINT_FOR_SURROGATE 0x10000
76
77
/* We use a 128-point block in the 16-bit BMP plane to encode non-UTF-8 bytes (Escapes).
   Alternatively we could use an additional HIGH plane (21-bit points above 0x1f0000)
   to simplify the internal intermediate conversion in Linux:
   RAW-UTF-8 <-> internal wchar_t utf-16 strings <-> RAW-UTF-8
*/
83
84
// UTF_ESCAPE_PLANE is the offset added to UTF_ESCAPE_BASE to locate the block
// of 128 escape points. Both configurations below currently select plane 0 (BMP).
#if defined(Z7_WCHART_IS_16BIT)

#define UTF_ESCAPE_PLANE 0

#else

/*
we can place 128 ESCAPE chars to
   ef 80 -    ee be 80 (3-bytes utf-8) : similar to WSL
   ef ff -    ee bf bf

1f ef 80 - f7 be be 80 (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
1f ef ff - f7 be bf bf (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
*/

// #define UTF_ESCAPE_PLANE_HIGH  (0x1f << 16)
// #define UTF_ESCAPE_PLANE        UTF_ESCAPE_PLANE_HIGH
#define UTF_ESCAPE_PLANE 0

/*
  if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set)
  {
    if (UTF_ESCAPE_PLANE is UTF_ESCAPE_PLANE_HIGH)
    {
      we can restore any 8-bit Escape from ESCAPE-PLANE-21 plane.
      But ESCAPE-PLANE-21 point cannot be stored to utf-16 (7z archive)
      So we still need a way to extract 8-bit Escapes and BMP-Escapes-8
      from same BMP-Escapes-16 stored in 7z.
      And if we want to restore any 8-bit from 7z archive,
      we still must use Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT for (utf-8 -> utf-16)
      Also we need additional Conversions to transform from utf-16 to utf-16-With-Escapes-21
    }
    else (UTF_ESCAPE_PLANE == 0)
    {
      we must convert original 3-bytes utf-8 BMP-Escape point to sequence
      of 3 BMP-Escape-16 points with Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
      so we can extract original RAW-UTF-8 from UTF-16 later.
    }
  }
*/

#endif
127
128
129
// Base of the escape block: a raw byte (b) in 0x80..0xff is represented by the
// point (plane + UTF_ESCAPE_BASE + b), i.e. 0xef80..0xefff for plane 0.
#define UTF_ESCAPE_BASE 0xef00


#ifdef UTF_ESCAPE_BASE
// true if (v) lies in the 128-point escape block of the given (plane)
#define IS_ESCAPE_POINT(v, plane) (((v) & (UInt32)0xffffff80) == (plane) + UTF_ESCAPE_BASE + 0x80)
#endif

// true for any UTF-16 surrogate value (0xd800..0xdfff)
#define IS_SURROGATE_POINT(v)     (((v) & (UInt32)0xfffff800) == 0xd800)
// true for a low (trailing) surrogate value (0xdc00..0xdfff)
#define IS_LOW_SURROGATE_POINT(v) (((v) & (UInt32)0xfffffC00) == 0xdc00)


// Error exit used inside the loop of CUtf8Check::Check_Buf():
// sets the (NonUtf) member and proceeds to the next input byte.
#define UTF_ERROR_UTF8_CHECK \
  { NonUtf = true; continue; }
143
144void CUtf8Check::Check_Buf(const char *src, size_t size) throw()
145{
146  Clear();
147  // Byte maxByte = 0;
148
149  for (;;)
150  {
151    if (size == 0)
152      break;
153
154    const Byte c = (Byte)(*src++);
155    size--;
156
157    if (c == 0)
158    {
159      ZeroChar = true;
160      continue;
161    }
162
163    /*
164    if (c > maxByte)
165      maxByte = c;
166    */
167
168    if (c < 0x80)
169      continue;
170
171    if (c < 0xc0 + 2) // it's limit for 0x140000 unicode codes : win32 compatibility
172      UTF_ERROR_UTF8_CHECK
173
174    unsigned numBytes;
175
176    UInt32 val = c;
177         MY_UTF8_HEAD_PARSE2(1)
178    else MY_UTF8_HEAD_PARSE2(2)
179    else MY_UTF8_HEAD_PARSE2(4)
180    else MY_UTF8_HEAD_PARSE2(5)
181    else
182    {
183      UTF_ERROR_UTF8_CHECK
184    }
185
186    unsigned pos = 0;
187    do
188    {
189      if (pos == size)
190        break;
191      unsigned c2 = (Byte)src[pos];
192      c2 -= 0x80;
193      if (c2 >= 0x40)
194        break;
195      val <<= 6;
196      val |= c2;
197      if (pos == 0)
198        if (val < (((unsigned)1 << 7) >> numBytes))
199          break;
200      pos++;
201    }
202    while (--numBytes);
203
204    if (numBytes != 0)
205    {
206      if (pos == size)
207        Truncated = true;
208      else
209        UTF_ERROR_UTF8_CHECK
210    }
211
212    #ifdef UTF_ESCAPE_BASE
213      if (IS_ESCAPE_POINT(val, 0))
214        Escape = true;
215    #endif
216
217    if (MaxHighPoint < val)
218      MaxHighPoint = val;
219
220    if (IS_SURROGATE_POINT(val))
221      SingleSurrogate = true;
222
223    src += pos;
224    size -= pos;
225  }
226
227  // MaxByte = maxByte;
228}
229
230bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw()
231{
232  CUtf8Check check;
233  check.Check_Buf(src, size);
234  return check.IsOK(allowReduced);
235}
236
237/*
238bool CheckUTF8_chars(const char *src, bool allowReduced) throw()
239{
240  CUtf8Check check;
241  check.CheckBuf(src, strlen(src));
242  return check.IsOK(allowReduced);
243}
244*/
245
246bool CheckUTF8_AString(const AString &s) throw()
247{
248  CUtf8Check check;
249  check.Check_AString(s);
250  return check.IsOK();
251}
252
253
254/*
255bool CheckUTF8(const char *src, bool allowReduced) throw()
256{
257  // return Check_UTF8_Buf(src, strlen(src), allowReduced);
258
259  for (;;)
260  {
261    const Byte c = (Byte)(*src++);
262    if (c == 0)
263      return true;
264
265    if (c < 0x80)
266      continue;
267    if (c < 0xC0 + 2 || c >= 0xf5)
268      return false;
269
270    unsigned numBytes;
271    MY_UTF8_HEAD_PARSE
272    else
273      return false;
274
275    unsigned pos = 0;
276
277    do
278    {
279      Byte c2 = (Byte)(*src++);
280      if (c2 < 0x80 || c2 >= 0xC0)
281        return allowReduced && c2 == 0;
282      val <<= 6;
283      val |= (c2 - 0x80);
284      pos++;
285    }
286    while (--numBytes);
287
288    if (val < MY_UTF8_RANGE(pos - 1))
289      return false;
290
291    if (val >= 0x110000)
292      return false;
293  }
294}
295*/
296
297// in case of UTF-8 error we have two ways:
298// 21.01- : old : 0xfffd: REPLACEMENT CHARACTER : old version
299// 21.02+ : new : 0xef00 + (c) : similar to WSL scheme for low symbols
300
// U+FFFD REPLACEMENT CHARACTER
#define UTF_REPLACEMENT_CHAR  0xfffd



// Point that stands for the bad byte (c). Expects the local (flags) in scope:
// with Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE it is the escape point for (c),
// otherwise it is the replacement character.
#define UTF_ESCAPE(c) \
   ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) ? \
    UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + (c) : UTF_REPLACEMENT_CHAR)

/*
#define UTF_HARD_ERROR_UTF8
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; ok = false; continue; }
*/

// we ignore utf errors, and don't change (ok) variable!

// Error exit used inside the loop of Utf8_To_Utf16(). Expects the locals
// (dest), (destPos), (c) and (flags) in scope: emits one point for the bad
// byte (c) (only counted if dest is NULL) and proceeds to the next input byte.
#define UTF_ERROR_UTF8 \
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; continue; }
320
321// we store UTF-16 in wchar_t strings. So we use surrogates for big unicode points:
322
// for debug purposes only we can store UTF-32 in wchar_t:
324// #define START_POINT_FOR_SURROGATE ((UInt32)0 - 1)
325
326
/*
  WIN32 MultiByteToWideChar(CP_UTF8) emits 0xfffd point, if utf-8 error was found.
  And it can emit single 0xfffd from 2 src bytes.
  It doesn't emit single 0xfffd from 3-4 src bytes.
  We can
    1) emit Escape point for each incorrect byte. So we can recover the data later
    2) emit 0xfffd for each incorrect byte.
       That scheme is similar to Escape scheme, but we emit 0xfffd
       instead of each Escape point.
    3) emit single 0xfffd from 1-2 incorrect bytes, as WIN32 MultiByteToWideChar scheme
*/
338
// Converts the UTF-8 bytes in [src, srcLim) to UTF-16 code units.
//   dest    : output buffer, or NULL to only count the units that would be written
//   destLen : receives the number of wchar_t units produced (or counted)
//   flags   : Z7_UTF_FLAG_FROM_UTF8_* bits tested below
// Returns (ok), which this implementation never sets to false: malformed input
// is replaced (escape point or 0xfffd) instead of being reported.
// No capacity check is made on (dest); callers run a counting pass first.
static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim, unsigned flags) throw()
{
  size_t destPos = 0;
  bool ok = true;

  for (;;)
  {
    if (src == srcLim)
    {
      *destLen = destPos;
      return ok;
    }

    const Byte c = (Byte)(*src++);

    // ASCII: copied as is
    if (c < 0x80)
    {
      if (dest)
        dest[destPos] = (wchar_t)c;
      destPos++;
      continue;
    }

    // c < 0xc2  : tail byte, or head of an overlong 2-byte form
    // c >= 0xf5 : head byte for values of 0x140000 and above
    if (c < 0xc0 + 2
      || c >= 0xf5) // it's limit for 0x140000 unicode codes : win32 compatibility
    {
      UTF_ERROR_UTF8
    }

    unsigned numBytes;

    // declares (val) and sets (numBytes) to 1..3
    MY_UTF8_HEAD_PARSE_MAX_3_BYTES

    // Collect tail bytes without advancing (src): (pos) is the number of
    // accepted tail bytes and (numBytes) reaches 0 only for a complete sequence.
    unsigned pos = 0;
    do
    {
      if (src + pos == srcLim)
        break;
      unsigned c2 = (Byte)src[pos];
      c2 -= 0x80;
      if (c2 >= 0x40) // not a tail byte
        break;
      val <<= 6;
      val |= c2;
      pos++;
      if (pos == 1)
      {
        // checks that are already decidable after the first tail byte:
        // overlong form
        if (val < (((unsigned)1 << 7) >> numBytes))
          break;
        if (numBytes == 2)
        {
          // 3-byte form that will decode to a surrogate (0xd800..0xdfff)
          if (flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR)
            if ((val & (0xF800 >> 6)) == (0xd800 >> 6))
              break;
        }
        // 4-byte form that will decode to 0x110000 or above
        else if (numBytes == 3 && val >= (0x110000 >> 12))
          break;
      }
    }
    while (--numBytes);

    if (numBytes != 0)
    {
      // Bad or incomplete sequence.
      // Escape mode: only the head byte is consumed here, so the accepted tail
      // bytes are processed again and each gets its own escape point.
      if ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) == 0)
      {
        // the following code to emit the 0xfffd chars as win32 Utf8 function.
        // disable the following line, if you need 0xfffd for each incorrect byte as in Escape mode
        src += pos;
      }
      UTF_ERROR_UTF8
    }

    /*
    if (val < MY_UTF8_RANGE(pos - 1))
      UTF_ERROR_UTF8
    */

    #ifdef UTF_ESCAPE_BASE

      // A well-formed sequence that decodes to a BMP escape point is treated
      // as an error too. (src) is not advanced past its tail bytes, so they
      // are processed again as separate bad bytes.
      if ((flags & Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT)
          && IS_ESCAPE_POINT(val, 0))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Escape-16 point (3 bytes)
        UTF_ERROR_UTF8
      }

    #endif

    /*
       We don't expect virtual Escape-21 points in UTF-8 stream.
       And we don't check for Escape-21.
       So utf8-Escape-21 will be converted to another 3 utf16-Escape-21 points.
       Maybe we could convert virtual utf8-Escape-21 to one utf16-Escape-21 point in some cases?
    */

    if (val < START_POINT_FOR_SURROGATE)
    {
      /*
      if ((flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR)
          && IS_SURROGATE_POINT(val))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Surrogate-16 point (3 bytes)
        UTF_ERROR_UTF8
      }
      */
      if (dest)
        dest[destPos] = (wchar_t)val;
      destPos++;
    }
    else
    {
      /*
      if (val >= 0x110000)
      {
        // We will emit utf16-Escape-16-21 point from each source byte
        UTF_ERROR_UTF8
      }
      */
      // split into a high and a low surrogate
      if (dest)
      {
        dest[destPos + 0] = (wchar_t)(0xd800 - (0x10000 >> 10) + (val >> 10));
        dest[destPos + 1] = (wchar_t)(0xdc00 + (val & 0x3ff));
      }
      destPos += 2;
    }
    // the sequence was accepted: skip its tail bytes
    src += pos;
  }
}
467
468
469
// Head byte of the encoding of (val) that uses (n) tail bytes.
// (val) is parenthesized so that an expression argument is shifted as a whole.
#define MY_UTF8_HEAD(n, val) ((char)(MY_UTF8_START(n) + ((val) >> (6 * (n)))))
// Tail byte number (n) of (val), counted from the last tail byte (n == 0).
#define MY_UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
472
473static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim, unsigned flags)
474{
475  size_t size = (size_t)(srcLim - src);
476  for (;;)
477  {
478    if (src == srcLim)
479      return size;
480
481    UInt32 val = (UInt32)(*src++);
482
483    if (val < 0x80)
484      continue;
485
486    if (val < MY_UTF8_RANGE(1))
487    {
488      size++;
489      continue;
490    }
491
492    #ifdef UTF_ESCAPE_BASE
493
494    #if UTF_ESCAPE_PLANE != 0
495    if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE)
496      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
497        continue;
498    #endif
499
500    if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE)
501      if (IS_ESCAPE_POINT(val, 0))
502        continue;
503
504    #endif
505
506    if (IS_SURROGATE_POINT(val))
507    {
508      // it's hack to UTF-8 encoding
509
510      if (val < 0xdc00 && src != srcLim)
511      {
512        const UInt32 c2 = (UInt32)*src;
513        if (c2 >= 0xdc00 && c2 < 0xe000)
514          src++;
515      }
516      size += 2;
517      continue;
518    }
519
520    #ifdef Z7_WCHART_IS_16BIT
521
522    size += 2;
523
524    #else
525
526         if (val < MY_UTF8_RANGE(2)) size += 2;
527    else if (val < MY_UTF8_RANGE(3)) size += 3;
528    else if (val < MY_UTF8_RANGE(4)) size += 4;
529    else if (val < MY_UTF8_RANGE(5)) size += 5;
530    else
531    #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
532      size += 6;
533    #else
534      size += 3;
535    #endif
536
537    #endif
538  }
539}
540
541
// Encodes the wchar_t units in [src, srcLim) as UTF-8 at (dest) and returns
// the pointer just past the last byte written.
// There is no capacity check on (dest): the caller must provide the number of
// bytes returned by Utf16_To_Utf8_Calc() for the same range and (flags).
static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim, unsigned flags)
{
  for (;;)
  {
    if (src == srcLim)
      return dest;

    UInt32 val = (UInt32)*src++;

    // 1 byte
    if (val < 0x80)
    {
      *dest++ = (char)val;
      continue;
    }

    // 2 bytes
    if (val < MY_UTF8_RANGE(1))
    {
      dest[0] = MY_UTF8_HEAD(1, val);
      dest[1] = MY_UTF8_CHAR(0, val);
      dest += 2;
      continue;
    }

    #ifdef UTF_ESCAPE_BASE

    #if UTF_ESCAPE_PLANE != 0
    /*
       if (wchar_t is 32-bit)
            && (Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE is set)
            && (point is virtual escape plane)
          we extract 8-bit byte from virtual HIGH-ESCAPE PLANE.
    */
    if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE)
      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
      {
        *dest++ = (char)(val);
        continue;
      }
    #endif // UTF_ESCAPE_PLANE != 0

    /* if (Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE is defined)
          we extract 8-bit byte from BMP-ESCAPE PLANE. */

    if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE)
      if (IS_ESCAPE_POINT(val, 0))
      {
        // the low 8 bits of the escape point are the original raw byte
        *dest++ = (char)(val);
        continue;
      }

    #endif // UTF_ESCAPE_BASE

    if (IS_SURROGATE_POINT(val))
    {
      // it's hack to UTF-8 encoding
      // high surrogate + low surrogate: combine them and write the 4-byte form
      if (val < 0xdc00 && src != srcLim)
      {
        const UInt32 c2 = (UInt32)*src;
        if (IS_LOW_SURROGATE_POINT(c2))
        {
          src++;
          val = (((val - 0xd800) << 10) | (c2 - 0xdc00)) + 0x10000;
          dest[0] = MY_UTF8_HEAD(3, val);
          dest[1] = MY_UTF8_CHAR(2, val);
          dest[2] = MY_UTF8_CHAR(1, val);
          dest[3] = MY_UTF8_CHAR(0, val);
          dest += 4;
          continue;
        }
      }
      // Unpaired surrogate: replaced with 0xfffd if the flag is set, otherwise
      // written below as a 3-byte sequence of the surrogate value itself.
      if (flags & Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR)
        val = UTF_REPLACEMENT_CHAR; // WIN32 function does it
    }

    // 3 bytes. With 16-bit wchar_t the block below is unconditional
    // and nothing after it is compiled.
    #ifndef Z7_WCHART_IS_16BIT
    if (val < MY_UTF8_RANGE(2))
    #endif
    {
      dest[0] = MY_UTF8_HEAD(2, val);
      dest[1] = MY_UTF8_CHAR(1, val);
      dest[2] = MY_UTF8_CHAR(0, val);
      dest += 3;
      continue;
    }

    #ifndef Z7_WCHART_IS_16BIT

    // we don't expect this case. so we can throw exception
    // throw 20210407;

    // Points of 0x10000 and above stored directly in a 32-bit wchar_t:
    // choose the head byte (b) and the number of payload bits in the tail bytes.
    char b;
    unsigned numBits;
         if (val < MY_UTF8_RANGE(3)) { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); }
    else if (val < MY_UTF8_RANGE(4)) { numBits = 6 * 4; b = MY_UTF8_HEAD(4, val); }
    else if (val < MY_UTF8_RANGE(5)) { numBits = 6 * 5; b = MY_UTF8_HEAD(5, val); }
    #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
    else                           { numBits = 6 * 6; b = (char)MY_UTF8_START(6); }
    #else
    else
    {
      // NOTE(review): the replacement character is written here in the 4-byte
      // form (head 0xf0 + 3 tail bytes), which is not its shortest encoding.
      // Utf16_To_Utf8_Calc() reserves 4 bytes for this case, so the two places
      // must be changed together.
      val = UTF_REPLACEMENT_CHAR;
                                   { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); }
    }
    #endif

    *dest++ = b;

    // tail bytes, most significant 6-bit group first
    do
    {
      numBits -= 6;
      *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
    }
    while (numBits != 0);

    #endif
  }
}
659
660bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags)
661{
662  dest.Empty();
663  size_t destLen = 0;
664  Utf8_To_Utf16(NULL, &destLen, src, src + srcSize, flags);
665  bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src + srcSize, flags);
666  dest.ReleaseBuf_SetEnd((unsigned)destLen);
667  return res;
668}
669
670bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags)
671{
672  return Convert_UTF8_Buf_To_Unicode(src, src.Len(), dest,  flags);
673}
674
675
// Flags that ConvertUTF8ToUnicode() passes to the converter:
//   - bad bytes are always turned into escape points (USE_ESCAPE);
//   - if wchar_t is not 16-bit, sequences that decode to a surrogate are
//     treated as errors (SURROGATE_ERROR);
//   - if, in addition, MY_UTF8_RAW_NON_UTF8_SUPPORTED is defined, sequences
//     that decode to a BMP escape point are re-escaped byte by byte
//     (BMP_ESCAPE_CONVERT).
static
unsigned g_UTF8_To_Unicode_Flags =
    Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
  #ifndef Z7_WCHART_IS_16BIT
    | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
  #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED
    | Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
  #endif
  #endif
    ;
686
687
688/*
689bool ConvertUTF8ToUnicode_boolRes(const AString &src, UString &dest)
690{
691  return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
692}
693*/
694
695bool ConvertUTF8ToUnicode(const AString &src, UString &dest)
696{
697  return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
698}
699
700void Print_UString(const UString &a);
701
702void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags)
703{
704  /*
705  if (src.Len()== 24)
706    throw "202104";
707  */
708  dest.Empty();
709  const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
710  char *destStart = dest.GetBuf((unsigned)destLen);
711  const char *destEnd = Utf16_To_Utf8(destStart, src, src.Ptr(src.Len()), flags);
712  dest.ReleaseBuf_SetEnd((unsigned)destLen);
713  // printf("\nlen = %d\n", src.Len());
714  if (destLen != (size_t)(destEnd - destStart))
715  {
716    /*
717    // dest.ReleaseBuf_SetEnd((unsigned)(destEnd - destStart));
718    printf("\nlen = %d\n", (unsigned)destLen);
719    printf("\n(destEnd - destStart) = %d\n", (unsigned)(destEnd - destStart));
720    printf("\n");
721    // Print_UString(src);
722    printf("\n");
723    // printf("\nlen = %d\n", destLen);
724    */
725    throw 20210406;
726  }
727}
728
729
730
// Flags that ConvertUnicodeToUTF8() and Convert_Unicode_To_UTF8_Buf() pass to
// the encoder:
//   - _WIN32 builds: no flags;
//   - other builds with MY_UTF8_RAW_NON_UTF8_SUPPORTED: BMP escape points are
//     written back as raw bytes (EXTRACT_BMP_ESCAPE);
//   - other builds without it: unpaired surrogates are replaced with 0xfffd
//     (SURROGATE_ERROR).
unsigned g_Unicode_To_UTF8_Flags =
      // Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
      0
  #ifndef _WIN32
    #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED
      | Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
    #else
      | Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
    #endif
  #endif
    ;
742
743void ConvertUnicodeToUTF8(const UString &src, AString &dest)
744{
745  ConvertUnicodeToUTF8_Flags(src, dest, g_Unicode_To_UTF8_Flags);
746}
747
748void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest)
749{
750  const unsigned flags = g_Unicode_To_UTF8_Flags;
751  dest.Free();
752  const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
753  dest.Alloc(destLen);
754  const char *destEnd = Utf16_To_Utf8((char *)(void *)(Byte *)dest, src, src.Ptr(src.Len()), flags);
755  if (destLen != (size_t)(destEnd - (char *)(void *)(Byte *)dest))
756    throw 202104;
757}
758
759/*
760
761#ifndef _WIN32
762void Convert_UTF16_To_UTF32(const UString &src, UString &dest)
763{
764  dest.Empty();
765  for (size_t i = 0; i < src.Len();)
766  {
767    wchar_t c = src[i++];
768    if (c >= 0xd800 && c < 0xdc00 && i < src.Len())
769    {
770      const wchar_t c2 = src[i];
771      if (c2 >= 0xdc00 && c2 < 0x10000)
772      {
773        // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
774        c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
775        // printf("%4x\n", (int)c);
776        i++;
777      }
778    }
779    dest += c;
780  }
781}
782
783void Convert_UTF32_To_UTF16(const UString &src, UString &dest)
784{
785  dest.Empty();
786  for (size_t i = 0; i < src.Len();)
787  {
788    wchar_t w = src[i++];
789    if (w >= 0x10000 && w < 0x110000)
790    {
791      w -= 0x10000;
792      dest += (wchar_t)((unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff));
793      w = 0xdc00 + (w & 0x3ff);
794    }
795    dest += w;
796  }
797}
798
799bool UTF32_IsThere_BigPoint(const UString &src)
800{
801  for (size_t i = 0; i < src.Len();)
802  {
803    const UInt32 c = (UInt32)src[i++];
804    if (c >= 0x110000)
805      return true;
806  }
807  return false;
808}
809
810bool Unicode_IsThere_BmpEscape(const UString &src)
811{
812  for (size_t i = 0; i < src.Len();)
813  {
814    const UInt32 c = (UInt32)src[i++];
815    if (IS_ESCAPE_POINT(c, 0))
816      return true;
817  }
818  return false;
819}
820
821
822#endif
823
824bool Unicode_IsThere_Utf16SurrogateError(const UString &src)
825{
826  for (size_t i = 0; i < src.Len();)
827  {
828    const UInt32 val = (UInt32)src[i++];
829    if (IS_SURROGATE_POINT(val))
830    {
831      // it's hack to UTF-8 encoding
832      if (val >= 0xdc00 || i == src.Len())
833        return true;
834      const UInt32 c2 = (UInt32)*src;
835      if (!IS_LOW_SURROGATE_POINT(c2))
836        return true;
837    }
838  }
839  return false;
840}
841*/
842
#ifndef Z7_WCHART_IS_16BIT

// Moves every BMP escape point of the string to the escape block of
// UTF_ESCAPE_PLANE by adding the plane offset to it.
// When UTF_ESCAPE_PLANE is 0 the two blocks coincide and the function is
// compiled with an empty body. The preprocessor selects the parameter list
// and body that complete the declaration below.
void Convert_UnicodeEsc16_To_UnicodeEscHigh
#if UTF_ESCAPE_PLANE == 0
    (UString &) {}
#else
    (UString &s)
{
  const unsigned len = s.Len();
  for (unsigned i = 0; i < len; i++)
  {
    wchar_t c = s[i];
    if (IS_ESCAPE_POINT(c, 0))
    {
      c += UTF_ESCAPE_PLANE;
      s.ReplaceOneCharAtPos(i, c);
    }
  }
}
#endif
#endif
864