1370b324cSopenharmony_ci// UTFConvert.cpp
2370b324cSopenharmony_ci
3370b324cSopenharmony_ci#include "StdAfx.h"
4370b324cSopenharmony_ci
5370b324cSopenharmony_ci// #include <stdio.h>
6370b324cSopenharmony_ci
7370b324cSopenharmony_ci#include "MyTypes.h"
8370b324cSopenharmony_ci#include "UTFConvert.h"
9370b324cSopenharmony_ci
10370b324cSopenharmony_ci
11370b324cSopenharmony_ci#ifndef Z7_WCHART_IS_16BIT
12370b324cSopenharmony_ci#ifndef __APPLE__
13370b324cSopenharmony_ci  // we define it if the system supports files with non-utf8 symbols:
14370b324cSopenharmony_ci  #define MY_UTF8_RAW_NON_UTF8_SUPPORTED
15370b324cSopenharmony_ci#endif
16370b324cSopenharmony_ci#endif
17370b324cSopenharmony_ci
18370b324cSopenharmony_ci/*
19370b324cSopenharmony_ci  MY_UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte
20370b324cSopenharmony_ci
21370b324cSopenharmony_ci  n : MY_UTF8_START(n) : Bits of code point
22370b324cSopenharmony_ci
23370b324cSopenharmony_ci  0 : 0x80 :    : unused
24370b324cSopenharmony_ci  1 : 0xC0 : 11 :
25370b324cSopenharmony_ci  2 : 0xE0 : 16 : Basic Multilingual Plane
26370b324cSopenharmony_ci  3 : 0xF0 : 21 : Unicode space
27370b324cSopenharmony_ci  4 : 0xF8 : 26 :
28370b324cSopenharmony_ci  5 : 0xFC : 31 : UCS-4 : wcstombs() in ubuntu is limited to that value
29370b324cSopenharmony_ci  6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
30370b324cSopenharmony_ci  7 : 0xFF :
31370b324cSopenharmony_ci*/
32370b324cSopenharmony_ci
33370b324cSopenharmony_ci#define MY_UTF8_START(n) (0x100 - (1 << (7 - (n))))
34370b324cSopenharmony_ci
35370b324cSopenharmony_ci#define MY_UTF8_HEAD_PARSE2(n) \
36370b324cSopenharmony_ci    if (c < MY_UTF8_START((n) + 1)) \
37370b324cSopenharmony_ci    { numBytes = (n); val -= MY_UTF8_START(n); }
38370b324cSopenharmony_ci
39370b324cSopenharmony_ci#ifndef Z7_WCHART_IS_16BIT
40370b324cSopenharmony_ci
41370b324cSopenharmony_ci/*
42370b324cSopenharmony_ci   if (wchar_t is 32-bit), we can support large points in long UTF-8 sequence,
43370b324cSopenharmony_ci   when we convert wchar_t strings to UTF-8:
44370b324cSopenharmony_ci     (_UTF8_NUM_TAIL_BYTES_MAX == 3) : (21-bits points) - Unicode
45370b324cSopenharmony_ci     (_UTF8_NUM_TAIL_BYTES_MAX == 5) : (31-bits points) - UCS-4
46370b324cSopenharmony_ci     (_UTF8_NUM_TAIL_BYTES_MAX == 6) : (36-bit hack)
47370b324cSopenharmony_ci*/
48370b324cSopenharmony_ci
49370b324cSopenharmony_ci#define MY_UTF8_NUM_TAIL_BYTES_MAX 5
50370b324cSopenharmony_ci#endif
51370b324cSopenharmony_ci
52370b324cSopenharmony_ci/*
53370b324cSopenharmony_ci#define MY_UTF8_HEAD_PARSE \
54370b324cSopenharmony_ci    UInt32 val = c; \
55370b324cSopenharmony_ci         MY_UTF8_HEAD_PARSE2(1) \
56370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(2) \
57370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(3) \
58370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(4) \
59370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(5) \
60370b324cSopenharmony_ci  #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
61370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(6)
62370b324cSopenharmony_ci  #endif
63370b324cSopenharmony_ci*/
64370b324cSopenharmony_ci
65370b324cSopenharmony_ci#define MY_UTF8_HEAD_PARSE_MAX_3_BYTES \
66370b324cSopenharmony_ci    UInt32 val = c; \
67370b324cSopenharmony_ci         MY_UTF8_HEAD_PARSE2(1) \
68370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(2) \
69370b324cSopenharmony_ci    else { numBytes = 3; val -= MY_UTF8_START(3); }
70370b324cSopenharmony_ci
71370b324cSopenharmony_ci
72370b324cSopenharmony_ci#define MY_UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))
73370b324cSopenharmony_ci
74370b324cSopenharmony_ci
75370b324cSopenharmony_ci#define START_POINT_FOR_SURROGATE 0x10000
76370b324cSopenharmony_ci
77370b324cSopenharmony_ci
/* We use a block of 128 points in the 16-bit BMP plane to encode non-UTF-8 bytes (Escapes).
   Also we can use an additional HIGH-PLANE (we use 21-bit points above 0x1f0000)
   to simplify internal intermediate conversion in Linux:
   RAW-UTF-8 <-> internal wchar_t utf-16 strings <-> RAW-UTF-8
*/
83370b324cSopenharmony_ci
84370b324cSopenharmony_ci
85370b324cSopenharmony_ci#if defined(Z7_WCHART_IS_16BIT)
86370b324cSopenharmony_ci
87370b324cSopenharmony_ci#define UTF_ESCAPE_PLANE 0
88370b324cSopenharmony_ci
89370b324cSopenharmony_ci#else
90370b324cSopenharmony_ci
91370b324cSopenharmony_ci/*
92370b324cSopenharmony_ciwe can place 128 ESCAPE chars to
93370b324cSopenharmony_ci   ef 80 -    ee be 80 (3-bytes utf-8) : similar to WSL
94370b324cSopenharmony_ci   ef ff -    ee bf bf
95370b324cSopenharmony_ci
96370b324cSopenharmony_ci1f ef 80 - f7 be be 80 (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
97370b324cSopenharmony_ci1f ef ff - f7 be bf bf (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
98370b324cSopenharmony_ci*/
99370b324cSopenharmony_ci
100370b324cSopenharmony_ci// #define UTF_ESCAPE_PLANE_HIGH  (0x1f << 16)
101370b324cSopenharmony_ci// #define UTF_ESCAPE_PLANE        UTF_ESCAPE_PLANE_HIGH
102370b324cSopenharmony_ci#define UTF_ESCAPE_PLANE 0
103370b324cSopenharmony_ci
/*
  if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set)
  {
    if (UTF_ESCAPE_PLANE is UTF_ESCAPE_PLANE_HIGH)
    {
      we can restore any 8-bit Escape from ESCAPE-PLANE-21 plane.
      But ESCAPE-PLANE-21 point cannot be stored to utf-16 (7z archive)
      So we still need a way to extract 8-bit Escapes and BMP-Escapes-8
      from same BMP-Escapes-16 stored in 7z.
      And if we want to restore any 8-bit from 7z archive,
      we still must use Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT for (utf-8 -> utf-16)
      Also we need additional Conversions to transform from utf-16 to utf-16-With-Escapes-21
    }
    else (UTF_ESCAPE_PLANE == 0)
    {
      we must convert original 3-bytes utf-8 BMP-Escape point to sequence
      of 3 BMP-Escape-16 points with Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
      so we can extract original RAW-UTF-8 from UTF-16 later.
    }
  }
*/
125370b324cSopenharmony_ci
126370b324cSopenharmony_ci#endif
127370b324cSopenharmony_ci
128370b324cSopenharmony_ci
129370b324cSopenharmony_ci
130370b324cSopenharmony_ci#define UTF_ESCAPE_BASE 0xef00
131370b324cSopenharmony_ci
132370b324cSopenharmony_ci
133370b324cSopenharmony_ci#ifdef UTF_ESCAPE_BASE
134370b324cSopenharmony_ci#define IS_ESCAPE_POINT(v, plane) (((v) & (UInt32)0xffffff80) == (plane) + UTF_ESCAPE_BASE + 0x80)
135370b324cSopenharmony_ci#endif
136370b324cSopenharmony_ci
137370b324cSopenharmony_ci#define IS_SURROGATE_POINT(v)     (((v) & (UInt32)0xfffff800) == 0xd800)
138370b324cSopenharmony_ci#define IS_LOW_SURROGATE_POINT(v) (((v) & (UInt32)0xfffffC00) == 0xdc00)
139370b324cSopenharmony_ci
140370b324cSopenharmony_ci
141370b324cSopenharmony_ci#define UTF_ERROR_UTF8_CHECK \
142370b324cSopenharmony_ci  { NonUtf = true; continue; }
143370b324cSopenharmony_ci
144370b324cSopenharmony_civoid CUtf8Check::Check_Buf(const char *src, size_t size) throw()
145370b324cSopenharmony_ci{
146370b324cSopenharmony_ci  Clear();
147370b324cSopenharmony_ci  // Byte maxByte = 0;
148370b324cSopenharmony_ci
149370b324cSopenharmony_ci  for (;;)
150370b324cSopenharmony_ci  {
151370b324cSopenharmony_ci    if (size == 0)
152370b324cSopenharmony_ci      break;
153370b324cSopenharmony_ci
154370b324cSopenharmony_ci    const Byte c = (Byte)(*src++);
155370b324cSopenharmony_ci    size--;
156370b324cSopenharmony_ci
157370b324cSopenharmony_ci    if (c == 0)
158370b324cSopenharmony_ci    {
159370b324cSopenharmony_ci      ZeroChar = true;
160370b324cSopenharmony_ci      continue;
161370b324cSopenharmony_ci    }
162370b324cSopenharmony_ci
163370b324cSopenharmony_ci    /*
164370b324cSopenharmony_ci    if (c > maxByte)
165370b324cSopenharmony_ci      maxByte = c;
166370b324cSopenharmony_ci    */
167370b324cSopenharmony_ci
168370b324cSopenharmony_ci    if (c < 0x80)
169370b324cSopenharmony_ci      continue;
170370b324cSopenharmony_ci
171370b324cSopenharmony_ci    if (c < 0xc0 + 2) // it's limit for 0x140000 unicode codes : win32 compatibility
172370b324cSopenharmony_ci      UTF_ERROR_UTF8_CHECK
173370b324cSopenharmony_ci
174370b324cSopenharmony_ci    unsigned numBytes;
175370b324cSopenharmony_ci
176370b324cSopenharmony_ci    UInt32 val = c;
177370b324cSopenharmony_ci         MY_UTF8_HEAD_PARSE2(1)
178370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(2)
179370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(4)
180370b324cSopenharmony_ci    else MY_UTF8_HEAD_PARSE2(5)
181370b324cSopenharmony_ci    else
182370b324cSopenharmony_ci    {
183370b324cSopenharmony_ci      UTF_ERROR_UTF8_CHECK
184370b324cSopenharmony_ci    }
185370b324cSopenharmony_ci
186370b324cSopenharmony_ci    unsigned pos = 0;
187370b324cSopenharmony_ci    do
188370b324cSopenharmony_ci    {
189370b324cSopenharmony_ci      if (pos == size)
190370b324cSopenharmony_ci        break;
191370b324cSopenharmony_ci      unsigned c2 = (Byte)src[pos];
192370b324cSopenharmony_ci      c2 -= 0x80;
193370b324cSopenharmony_ci      if (c2 >= 0x40)
194370b324cSopenharmony_ci        break;
195370b324cSopenharmony_ci      val <<= 6;
196370b324cSopenharmony_ci      val |= c2;
197370b324cSopenharmony_ci      if (pos == 0)
198370b324cSopenharmony_ci        if (val < (((unsigned)1 << 7) >> numBytes))
199370b324cSopenharmony_ci          break;
200370b324cSopenharmony_ci      pos++;
201370b324cSopenharmony_ci    }
202370b324cSopenharmony_ci    while (--numBytes);
203370b324cSopenharmony_ci
204370b324cSopenharmony_ci    if (numBytes != 0)
205370b324cSopenharmony_ci    {
206370b324cSopenharmony_ci      if (pos == size)
207370b324cSopenharmony_ci        Truncated = true;
208370b324cSopenharmony_ci      else
209370b324cSopenharmony_ci        UTF_ERROR_UTF8_CHECK
210370b324cSopenharmony_ci    }
211370b324cSopenharmony_ci
212370b324cSopenharmony_ci    #ifdef UTF_ESCAPE_BASE
213370b324cSopenharmony_ci      if (IS_ESCAPE_POINT(val, 0))
214370b324cSopenharmony_ci        Escape = true;
215370b324cSopenharmony_ci    #endif
216370b324cSopenharmony_ci
217370b324cSopenharmony_ci    if (MaxHighPoint < val)
218370b324cSopenharmony_ci      MaxHighPoint = val;
219370b324cSopenharmony_ci
220370b324cSopenharmony_ci    if (IS_SURROGATE_POINT(val))
221370b324cSopenharmony_ci      SingleSurrogate = true;
222370b324cSopenharmony_ci
223370b324cSopenharmony_ci    src += pos;
224370b324cSopenharmony_ci    size -= pos;
225370b324cSopenharmony_ci  }
226370b324cSopenharmony_ci
227370b324cSopenharmony_ci  // MaxByte = maxByte;
228370b324cSopenharmony_ci}
229370b324cSopenharmony_ci
230370b324cSopenharmony_cibool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw()
231370b324cSopenharmony_ci{
232370b324cSopenharmony_ci  CUtf8Check check;
233370b324cSopenharmony_ci  check.Check_Buf(src, size);
234370b324cSopenharmony_ci  return check.IsOK(allowReduced);
235370b324cSopenharmony_ci}
236370b324cSopenharmony_ci
237370b324cSopenharmony_ci/*
238370b324cSopenharmony_cibool CheckUTF8_chars(const char *src, bool allowReduced) throw()
239370b324cSopenharmony_ci{
240370b324cSopenharmony_ci  CUtf8Check check;
241370b324cSopenharmony_ci  check.CheckBuf(src, strlen(src));
242370b324cSopenharmony_ci  return check.IsOK(allowReduced);
243370b324cSopenharmony_ci}
244370b324cSopenharmony_ci*/
245370b324cSopenharmony_ci
246370b324cSopenharmony_cibool CheckUTF8_AString(const AString &s) throw()
247370b324cSopenharmony_ci{
248370b324cSopenharmony_ci  CUtf8Check check;
249370b324cSopenharmony_ci  check.Check_AString(s);
250370b324cSopenharmony_ci  return check.IsOK();
251370b324cSopenharmony_ci}
252370b324cSopenharmony_ci
253370b324cSopenharmony_ci
254370b324cSopenharmony_ci/*
255370b324cSopenharmony_cibool CheckUTF8(const char *src, bool allowReduced) throw()
256370b324cSopenharmony_ci{
257370b324cSopenharmony_ci  // return Check_UTF8_Buf(src, strlen(src), allowReduced);
258370b324cSopenharmony_ci
259370b324cSopenharmony_ci  for (;;)
260370b324cSopenharmony_ci  {
261370b324cSopenharmony_ci    const Byte c = (Byte)(*src++);
262370b324cSopenharmony_ci    if (c == 0)
263370b324cSopenharmony_ci      return true;
264370b324cSopenharmony_ci
265370b324cSopenharmony_ci    if (c < 0x80)
266370b324cSopenharmony_ci      continue;
267370b324cSopenharmony_ci    if (c < 0xC0 + 2 || c >= 0xf5)
268370b324cSopenharmony_ci      return false;
269370b324cSopenharmony_ci
270370b324cSopenharmony_ci    unsigned numBytes;
271370b324cSopenharmony_ci    MY_UTF8_HEAD_PARSE
272370b324cSopenharmony_ci    else
273370b324cSopenharmony_ci      return false;
274370b324cSopenharmony_ci
275370b324cSopenharmony_ci    unsigned pos = 0;
276370b324cSopenharmony_ci
277370b324cSopenharmony_ci    do
278370b324cSopenharmony_ci    {
279370b324cSopenharmony_ci      Byte c2 = (Byte)(*src++);
280370b324cSopenharmony_ci      if (c2 < 0x80 || c2 >= 0xC0)
281370b324cSopenharmony_ci        return allowReduced && c2 == 0;
282370b324cSopenharmony_ci      val <<= 6;
283370b324cSopenharmony_ci      val |= (c2 - 0x80);
284370b324cSopenharmony_ci      pos++;
285370b324cSopenharmony_ci    }
286370b324cSopenharmony_ci    while (--numBytes);
287370b324cSopenharmony_ci
288370b324cSopenharmony_ci    if (val < MY_UTF8_RANGE(pos - 1))
289370b324cSopenharmony_ci      return false;
290370b324cSopenharmony_ci
291370b324cSopenharmony_ci    if (val >= 0x110000)
292370b324cSopenharmony_ci      return false;
293370b324cSopenharmony_ci  }
294370b324cSopenharmony_ci}
295370b324cSopenharmony_ci*/
296370b324cSopenharmony_ci
297370b324cSopenharmony_ci// in case of UTF-8 error we have two ways:
298370b324cSopenharmony_ci// 21.01- : old : 0xfffd: REPLACEMENT CHARACTER : old version
299370b324cSopenharmony_ci// 21.02+ : new : 0xef00 + (c) : similar to WSL scheme for low symbols
300370b324cSopenharmony_ci
301370b324cSopenharmony_ci#define UTF_REPLACEMENT_CHAR  0xfffd
302370b324cSopenharmony_ci
303370b324cSopenharmony_ci
304370b324cSopenharmony_ci
305370b324cSopenharmony_ci#define UTF_ESCAPE(c) \
306370b324cSopenharmony_ci   ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) ? \
307370b324cSopenharmony_ci    UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + (c) : UTF_REPLACEMENT_CHAR)
308370b324cSopenharmony_ci
309370b324cSopenharmony_ci/*
310370b324cSopenharmony_ci#define UTF_HARD_ERROR_UTF8
311370b324cSopenharmony_ci  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
312370b324cSopenharmony_ci    destPos++; ok = false; continue; }
313370b324cSopenharmony_ci*/
314370b324cSopenharmony_ci
315370b324cSopenharmony_ci// we ignore utf errors, and don't change (ok) variable!
316370b324cSopenharmony_ci
317370b324cSopenharmony_ci#define UTF_ERROR_UTF8 \
318370b324cSopenharmony_ci  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
319370b324cSopenharmony_ci    destPos++; continue; }
320370b324cSopenharmony_ci
321370b324cSopenharmony_ci// we store UTF-16 in wchar_t strings. So we use surrogates for big unicode points:
322370b324cSopenharmony_ci
// for debug purposes only we can store UTF-32 in wchar_t:
324370b324cSopenharmony_ci// #define START_POINT_FOR_SURROGATE ((UInt32)0 - 1)
325370b324cSopenharmony_ci
326370b324cSopenharmony_ci
/*
  WIN32 MultiByteToWideChar(CP_UTF8) emits 0xfffd point, if utf-8 error was found.
  And it can emit single 0xfffd from 2 src bytes.
  It doesn't emit single 0xfffd from 3-4 src bytes.
  We can
    1) emit Escape point for each incorrect byte. So we can recover data later
    2) emit 0xfffd for each incorrect byte.
       That scheme is similar to Escape scheme, but we emit 0xfffd
       instead of each Escape point.
    3) emit single 0xfffd from 1-2 incorrect bytes, as WIN32 MultiByteToWideChar scheme
*/
338370b324cSopenharmony_ci
339370b324cSopenharmony_cistatic bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim, unsigned flags) throw()
340370b324cSopenharmony_ci{
341370b324cSopenharmony_ci  size_t destPos = 0;
342370b324cSopenharmony_ci  bool ok = true;
343370b324cSopenharmony_ci
344370b324cSopenharmony_ci  for (;;)
345370b324cSopenharmony_ci  {
346370b324cSopenharmony_ci    if (src == srcLim)
347370b324cSopenharmony_ci    {
348370b324cSopenharmony_ci      *destLen = destPos;
349370b324cSopenharmony_ci      return ok;
350370b324cSopenharmony_ci    }
351370b324cSopenharmony_ci
352370b324cSopenharmony_ci    const Byte c = (Byte)(*src++);
353370b324cSopenharmony_ci
354370b324cSopenharmony_ci    if (c < 0x80)
355370b324cSopenharmony_ci    {
356370b324cSopenharmony_ci      if (dest)
357370b324cSopenharmony_ci        dest[destPos] = (wchar_t)c;
358370b324cSopenharmony_ci      destPos++;
359370b324cSopenharmony_ci      continue;
360370b324cSopenharmony_ci    }
361370b324cSopenharmony_ci
362370b324cSopenharmony_ci    if (c < 0xc0 + 2
363370b324cSopenharmony_ci      || c >= 0xf5) // it's limit for 0x140000 unicode codes : win32 compatibility
364370b324cSopenharmony_ci    {
365370b324cSopenharmony_ci      UTF_ERROR_UTF8
366370b324cSopenharmony_ci    }
367370b324cSopenharmony_ci
368370b324cSopenharmony_ci    unsigned numBytes;
369370b324cSopenharmony_ci
370370b324cSopenharmony_ci    MY_UTF8_HEAD_PARSE_MAX_3_BYTES
371370b324cSopenharmony_ci
372370b324cSopenharmony_ci    unsigned pos = 0;
373370b324cSopenharmony_ci    do
374370b324cSopenharmony_ci    {
375370b324cSopenharmony_ci      if (src + pos == srcLim)
376370b324cSopenharmony_ci        break;
377370b324cSopenharmony_ci      unsigned c2 = (Byte)src[pos];
378370b324cSopenharmony_ci      c2 -= 0x80;
379370b324cSopenharmony_ci      if (c2 >= 0x40)
380370b324cSopenharmony_ci        break;
381370b324cSopenharmony_ci      val <<= 6;
382370b324cSopenharmony_ci      val |= c2;
383370b324cSopenharmony_ci      pos++;
384370b324cSopenharmony_ci      if (pos == 1)
385370b324cSopenharmony_ci      {
386370b324cSopenharmony_ci        if (val < (((unsigned)1 << 7) >> numBytes))
387370b324cSopenharmony_ci          break;
388370b324cSopenharmony_ci        if (numBytes == 2)
389370b324cSopenharmony_ci        {
390370b324cSopenharmony_ci          if (flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR)
391370b324cSopenharmony_ci            if ((val & (0xF800 >> 6)) == (0xd800 >> 6))
392370b324cSopenharmony_ci              break;
393370b324cSopenharmony_ci        }
394370b324cSopenharmony_ci        else if (numBytes == 3 && val >= (0x110000 >> 12))
395370b324cSopenharmony_ci          break;
396370b324cSopenharmony_ci      }
397370b324cSopenharmony_ci    }
398370b324cSopenharmony_ci    while (--numBytes);
399370b324cSopenharmony_ci
400370b324cSopenharmony_ci    if (numBytes != 0)
401370b324cSopenharmony_ci    {
402370b324cSopenharmony_ci      if ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) == 0)
403370b324cSopenharmony_ci      {
404370b324cSopenharmony_ci        // the following code to emit the 0xfffd chars as win32 Utf8 function.
405370b324cSopenharmony_ci        // disable the folling line, if you need 0xfffd for each incorrect byte as in Escape mode
406370b324cSopenharmony_ci        src += pos;
407370b324cSopenharmony_ci      }
408370b324cSopenharmony_ci      UTF_ERROR_UTF8
409370b324cSopenharmony_ci    }
410370b324cSopenharmony_ci
411370b324cSopenharmony_ci    /*
412370b324cSopenharmony_ci    if (val < MY_UTF8_RANGE(pos - 1))
413370b324cSopenharmony_ci      UTF_ERROR_UTF8
414370b324cSopenharmony_ci    */
415370b324cSopenharmony_ci
416370b324cSopenharmony_ci    #ifdef UTF_ESCAPE_BASE
417370b324cSopenharmony_ci
418370b324cSopenharmony_ci      if ((flags & Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT)
419370b324cSopenharmony_ci          && IS_ESCAPE_POINT(val, 0))
420370b324cSopenharmony_ci      {
421370b324cSopenharmony_ci        // We will emit 3 utf16-Escape-16-21 points from one Escape-16 point (3 bytes)
422370b324cSopenharmony_ci        UTF_ERROR_UTF8
423370b324cSopenharmony_ci      }
424370b324cSopenharmony_ci
425370b324cSopenharmony_ci    #endif
426370b324cSopenharmony_ci
427370b324cSopenharmony_ci    /*
428370b324cSopenharmony_ci       We don't expect virtual Escape-21 points in UTF-8 stream.
429370b324cSopenharmony_ci       And we don't check for Escape-21.
430370b324cSopenharmony_ci       So utf8-Escape-21 will be converted to another 3 utf16-Escape-21 points.
431370b324cSopenharmony_ci       Maybe we could convert virtual utf8-Escape-21 to one utf16-Escape-21 point in some cases?
432370b324cSopenharmony_ci    */
433370b324cSopenharmony_ci
434370b324cSopenharmony_ci    if (val < START_POINT_FOR_SURROGATE)
435370b324cSopenharmony_ci    {
436370b324cSopenharmony_ci      /*
437370b324cSopenharmony_ci      if ((flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR)
438370b324cSopenharmony_ci          && IS_SURROGATE_POINT(val))
439370b324cSopenharmony_ci      {
440370b324cSopenharmony_ci        // We will emit 3 utf16-Escape-16-21 points from one Surrogate-16 point (3 bytes)
441370b324cSopenharmony_ci        UTF_ERROR_UTF8
442370b324cSopenharmony_ci      }
443370b324cSopenharmony_ci      */
444370b324cSopenharmony_ci      if (dest)
445370b324cSopenharmony_ci        dest[destPos] = (wchar_t)val;
446370b324cSopenharmony_ci      destPos++;
447370b324cSopenharmony_ci    }
448370b324cSopenharmony_ci    else
449370b324cSopenharmony_ci    {
450370b324cSopenharmony_ci      /*
451370b324cSopenharmony_ci      if (val >= 0x110000)
452370b324cSopenharmony_ci      {
453370b324cSopenharmony_ci        // We will emit utf16-Escape-16-21 point from each source byte
454370b324cSopenharmony_ci        UTF_ERROR_UTF8
455370b324cSopenharmony_ci      }
456370b324cSopenharmony_ci      */
457370b324cSopenharmony_ci      if (dest)
458370b324cSopenharmony_ci      {
459370b324cSopenharmony_ci        dest[destPos + 0] = (wchar_t)(0xd800 - (0x10000 >> 10) + (val >> 10));
460370b324cSopenharmony_ci        dest[destPos + 1] = (wchar_t)(0xdc00 + (val & 0x3ff));
461370b324cSopenharmony_ci      }
462370b324cSopenharmony_ci      destPos += 2;
463370b324cSopenharmony_ci    }
464370b324cSopenharmony_ci    src += pos;
465370b324cSopenharmony_ci  }
466370b324cSopenharmony_ci}
467370b324cSopenharmony_ci
468370b324cSopenharmony_ci
469370b324cSopenharmony_ci
// The bytes of UTF-8 sequence for the point (val):
//   MY_UTF8_HEAD(n, val) : head byte of the sequence with (n) tail bytes
//   MY_UTF8_CHAR(n, val) : tail byte with 6 bits of (val) starting from bit (6 * n)
// (val) is used in parentheses in both macros, so any expression can be used as argument.
#define MY_UTF8_HEAD(n, val) ((char)(MY_UTF8_START(n) + ((val) >> (6 * (n)))))
#define MY_UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
472370b324cSopenharmony_ci
473370b324cSopenharmony_cistatic size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim, unsigned flags)
474370b324cSopenharmony_ci{
475370b324cSopenharmony_ci  size_t size = (size_t)(srcLim - src);
476370b324cSopenharmony_ci  for (;;)
477370b324cSopenharmony_ci  {
478370b324cSopenharmony_ci    if (src == srcLim)
479370b324cSopenharmony_ci      return size;
480370b324cSopenharmony_ci
481370b324cSopenharmony_ci    UInt32 val = (UInt32)(*src++);
482370b324cSopenharmony_ci
483370b324cSopenharmony_ci    if (val < 0x80)
484370b324cSopenharmony_ci      continue;
485370b324cSopenharmony_ci
486370b324cSopenharmony_ci    if (val < MY_UTF8_RANGE(1))
487370b324cSopenharmony_ci    {
488370b324cSopenharmony_ci      size++;
489370b324cSopenharmony_ci      continue;
490370b324cSopenharmony_ci    }
491370b324cSopenharmony_ci
492370b324cSopenharmony_ci    #ifdef UTF_ESCAPE_BASE
493370b324cSopenharmony_ci
494370b324cSopenharmony_ci    #if UTF_ESCAPE_PLANE != 0
495370b324cSopenharmony_ci    if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE)
496370b324cSopenharmony_ci      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
497370b324cSopenharmony_ci        continue;
498370b324cSopenharmony_ci    #endif
499370b324cSopenharmony_ci
500370b324cSopenharmony_ci    if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE)
501370b324cSopenharmony_ci      if (IS_ESCAPE_POINT(val, 0))
502370b324cSopenharmony_ci        continue;
503370b324cSopenharmony_ci
504370b324cSopenharmony_ci    #endif
505370b324cSopenharmony_ci
506370b324cSopenharmony_ci    if (IS_SURROGATE_POINT(val))
507370b324cSopenharmony_ci    {
508370b324cSopenharmony_ci      // it's hack to UTF-8 encoding
509370b324cSopenharmony_ci
510370b324cSopenharmony_ci      if (val < 0xdc00 && src != srcLim)
511370b324cSopenharmony_ci      {
512370b324cSopenharmony_ci        const UInt32 c2 = (UInt32)*src;
513370b324cSopenharmony_ci        if (c2 >= 0xdc00 && c2 < 0xe000)
514370b324cSopenharmony_ci          src++;
515370b324cSopenharmony_ci      }
516370b324cSopenharmony_ci      size += 2;
517370b324cSopenharmony_ci      continue;
518370b324cSopenharmony_ci    }
519370b324cSopenharmony_ci
520370b324cSopenharmony_ci    #ifdef Z7_WCHART_IS_16BIT
521370b324cSopenharmony_ci
522370b324cSopenharmony_ci    size += 2;
523370b324cSopenharmony_ci
524370b324cSopenharmony_ci    #else
525370b324cSopenharmony_ci
526370b324cSopenharmony_ci         if (val < MY_UTF8_RANGE(2)) size += 2;
527370b324cSopenharmony_ci    else if (val < MY_UTF8_RANGE(3)) size += 3;
528370b324cSopenharmony_ci    else if (val < MY_UTF8_RANGE(4)) size += 4;
529370b324cSopenharmony_ci    else if (val < MY_UTF8_RANGE(5)) size += 5;
530370b324cSopenharmony_ci    else
531370b324cSopenharmony_ci    #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
532370b324cSopenharmony_ci      size += 6;
533370b324cSopenharmony_ci    #else
534370b324cSopenharmony_ci      size += 3;
535370b324cSopenharmony_ci    #endif
536370b324cSopenharmony_ci
537370b324cSopenharmony_ci    #endif
538370b324cSopenharmony_ci  }
539370b324cSopenharmony_ci}
540370b324cSopenharmony_ci
541370b324cSopenharmony_ci
542370b324cSopenharmony_cistatic char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim, unsigned flags)
543370b324cSopenharmony_ci{
544370b324cSopenharmony_ci  for (;;)
545370b324cSopenharmony_ci  {
546370b324cSopenharmony_ci    if (src == srcLim)
547370b324cSopenharmony_ci      return dest;
548370b324cSopenharmony_ci
549370b324cSopenharmony_ci    UInt32 val = (UInt32)*src++;
550370b324cSopenharmony_ci
551370b324cSopenharmony_ci    if (val < 0x80)
552370b324cSopenharmony_ci    {
553370b324cSopenharmony_ci      *dest++ = (char)val;
554370b324cSopenharmony_ci      continue;
555370b324cSopenharmony_ci    }
556370b324cSopenharmony_ci
557370b324cSopenharmony_ci    if (val < MY_UTF8_RANGE(1))
558370b324cSopenharmony_ci    {
559370b324cSopenharmony_ci      dest[0] = MY_UTF8_HEAD(1, val);
560370b324cSopenharmony_ci      dest[1] = MY_UTF8_CHAR(0, val);
561370b324cSopenharmony_ci      dest += 2;
562370b324cSopenharmony_ci      continue;
563370b324cSopenharmony_ci    }
564370b324cSopenharmony_ci
565370b324cSopenharmony_ci    #ifdef UTF_ESCAPE_BASE
566370b324cSopenharmony_ci
567370b324cSopenharmony_ci    #if UTF_ESCAPE_PLANE != 0
568370b324cSopenharmony_ci    /*
569370b324cSopenharmony_ci       if (wchar_t is 32-bit)
570370b324cSopenharmony_ci            && (Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE is set)
571370b324cSopenharmony_ci            && (point is virtual escape plane)
572370b324cSopenharmony_ci          we extract 8-bit byte from virtual HIGH-ESCAPE PLANE.
573370b324cSopenharmony_ci    */
574370b324cSopenharmony_ci    if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE)
575370b324cSopenharmony_ci      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
576370b324cSopenharmony_ci      {
577370b324cSopenharmony_ci        *dest++ = (char)(val);
578370b324cSopenharmony_ci        continue;
579370b324cSopenharmony_ci      }
580370b324cSopenharmony_ci    #endif // UTF_ESCAPE_PLANE != 0
581370b324cSopenharmony_ci
582370b324cSopenharmony_ci    /* if (Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE is defined)
583370b324cSopenharmony_ci          we extract 8-bit byte from BMP-ESCAPE PLANE. */
584370b324cSopenharmony_ci
585370b324cSopenharmony_ci    if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE)
586370b324cSopenharmony_ci      if (IS_ESCAPE_POINT(val, 0))
587370b324cSopenharmony_ci      {
588370b324cSopenharmony_ci        *dest++ = (char)(val);
589370b324cSopenharmony_ci        continue;
590370b324cSopenharmony_ci      }
591370b324cSopenharmony_ci
592370b324cSopenharmony_ci    #endif // UTF_ESCAPE_BASE
593370b324cSopenharmony_ci
594370b324cSopenharmony_ci    if (IS_SURROGATE_POINT(val))
595370b324cSopenharmony_ci    {
596370b324cSopenharmony_ci      // it's hack to UTF-8 encoding
597370b324cSopenharmony_ci      if (val < 0xdc00 && src != srcLim)
598370b324cSopenharmony_ci      {
599370b324cSopenharmony_ci        const UInt32 c2 = (UInt32)*src;
600370b324cSopenharmony_ci        if (IS_LOW_SURROGATE_POINT(c2))
601370b324cSopenharmony_ci        {
602370b324cSopenharmony_ci          src++;
603370b324cSopenharmony_ci          val = (((val - 0xd800) << 10) | (c2 - 0xdc00)) + 0x10000;
604370b324cSopenharmony_ci          dest[0] = MY_UTF8_HEAD(3, val);
605370b324cSopenharmony_ci          dest[1] = MY_UTF8_CHAR(2, val);
606370b324cSopenharmony_ci          dest[2] = MY_UTF8_CHAR(1, val);
607370b324cSopenharmony_ci          dest[3] = MY_UTF8_CHAR(0, val);
608370b324cSopenharmony_ci          dest += 4;
609370b324cSopenharmony_ci          continue;
610370b324cSopenharmony_ci        }
611370b324cSopenharmony_ci      }
612370b324cSopenharmony_ci      if (flags & Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR)
613370b324cSopenharmony_ci        val = UTF_REPLACEMENT_CHAR; // WIN32 function does it
614370b324cSopenharmony_ci    }
615370b324cSopenharmony_ci
616370b324cSopenharmony_ci    #ifndef Z7_WCHART_IS_16BIT
617370b324cSopenharmony_ci    if (val < MY_UTF8_RANGE(2))
618370b324cSopenharmony_ci    #endif
619370b324cSopenharmony_ci    {
620370b324cSopenharmony_ci      dest[0] = MY_UTF8_HEAD(2, val);
621370b324cSopenharmony_ci      dest[1] = MY_UTF8_CHAR(1, val);
622370b324cSopenharmony_ci      dest[2] = MY_UTF8_CHAR(0, val);
623370b324cSopenharmony_ci      dest += 3;
624370b324cSopenharmony_ci      continue;
625370b324cSopenharmony_ci    }
626370b324cSopenharmony_ci
627370b324cSopenharmony_ci    #ifndef Z7_WCHART_IS_16BIT
628370b324cSopenharmony_ci
629370b324cSopenharmony_ci    // we don't expect this case. so we can throw exception
630370b324cSopenharmony_ci    // throw 20210407;
631370b324cSopenharmony_ci
632370b324cSopenharmony_ci    char b;
633370b324cSopenharmony_ci    unsigned numBits;
634370b324cSopenharmony_ci         if (val < MY_UTF8_RANGE(3)) { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); }
635370b324cSopenharmony_ci    else if (val < MY_UTF8_RANGE(4)) { numBits = 6 * 4; b = MY_UTF8_HEAD(4, val); }
636370b324cSopenharmony_ci    else if (val < MY_UTF8_RANGE(5)) { numBits = 6 * 5; b = MY_UTF8_HEAD(5, val); }
637370b324cSopenharmony_ci    #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
638370b324cSopenharmony_ci    else                           { numBits = 6 * 6; b = (char)MY_UTF8_START(6); }
639370b324cSopenharmony_ci    #else
640370b324cSopenharmony_ci    else
641370b324cSopenharmony_ci    {
642370b324cSopenharmony_ci      val = UTF_REPLACEMENT_CHAR;
643370b324cSopenharmony_ci                                   { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); }
644370b324cSopenharmony_ci    }
645370b324cSopenharmony_ci    #endif
646370b324cSopenharmony_ci
647370b324cSopenharmony_ci    *dest++ = b;
648370b324cSopenharmony_ci
649370b324cSopenharmony_ci    do
650370b324cSopenharmony_ci    {
651370b324cSopenharmony_ci      numBits -= 6;
652370b324cSopenharmony_ci      *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
653370b324cSopenharmony_ci    }
654370b324cSopenharmony_ci    while (numBits != 0);
655370b324cSopenharmony_ci
656370b324cSopenharmony_ci    #endif
657370b324cSopenharmony_ci  }
658370b324cSopenharmony_ci}
659370b324cSopenharmony_ci
660370b324cSopenharmony_cibool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags)
661370b324cSopenharmony_ci{
662370b324cSopenharmony_ci  dest.Empty();
663370b324cSopenharmony_ci  size_t destLen = 0;
664370b324cSopenharmony_ci  Utf8_To_Utf16(NULL, &destLen, src, src + srcSize, flags);
665370b324cSopenharmony_ci  bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src + srcSize, flags);
666370b324cSopenharmony_ci  dest.ReleaseBuf_SetEnd((unsigned)destLen);
667370b324cSopenharmony_ci  return res;
668370b324cSopenharmony_ci}
669370b324cSopenharmony_ci
670370b324cSopenharmony_cibool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags)
671370b324cSopenharmony_ci{
672370b324cSopenharmony_ci  return Convert_UTF8_Buf_To_Unicode(src, src.Len(), dest,  flags);
673370b324cSopenharmony_ci}
674370b324cSopenharmony_ci
675370b324cSopenharmony_ci
676370b324cSopenharmony_cistatic
677370b324cSopenharmony_ciunsigned g_UTF8_To_Unicode_Flags =
678370b324cSopenharmony_ci    Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
679370b324cSopenharmony_ci  #ifndef Z7_WCHART_IS_16BIT
680370b324cSopenharmony_ci    | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
681370b324cSopenharmony_ci  #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED
682370b324cSopenharmony_ci    | Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
683370b324cSopenharmony_ci  #endif
684370b324cSopenharmony_ci  #endif
685370b324cSopenharmony_ci    ;
686370b324cSopenharmony_ci
687370b324cSopenharmony_ci
688370b324cSopenharmony_ci/*
689370b324cSopenharmony_cibool ConvertUTF8ToUnicode_boolRes(const AString &src, UString &dest)
690370b324cSopenharmony_ci{
691370b324cSopenharmony_ci  return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
692370b324cSopenharmony_ci}
693370b324cSopenharmony_ci*/
694370b324cSopenharmony_ci
695370b324cSopenharmony_cibool ConvertUTF8ToUnicode(const AString &src, UString &dest)
696370b324cSopenharmony_ci{
697370b324cSopenharmony_ci  return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
698370b324cSopenharmony_ci}
699370b324cSopenharmony_ci
700370b324cSopenharmony_civoid Print_UString(const UString &a);
701370b324cSopenharmony_ci
702370b324cSopenharmony_civoid ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags)
703370b324cSopenharmony_ci{
704370b324cSopenharmony_ci  /*
705370b324cSopenharmony_ci  if (src.Len()== 24)
706370b324cSopenharmony_ci    throw "202104";
707370b324cSopenharmony_ci  */
708370b324cSopenharmony_ci  dest.Empty();
709370b324cSopenharmony_ci  const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
710370b324cSopenharmony_ci  char *destStart = dest.GetBuf((unsigned)destLen);
711370b324cSopenharmony_ci  const char *destEnd = Utf16_To_Utf8(destStart, src, src.Ptr(src.Len()), flags);
712370b324cSopenharmony_ci  dest.ReleaseBuf_SetEnd((unsigned)destLen);
713370b324cSopenharmony_ci  // printf("\nlen = %d\n", src.Len());
714370b324cSopenharmony_ci  if (destLen != (size_t)(destEnd - destStart))
715370b324cSopenharmony_ci  {
716370b324cSopenharmony_ci    /*
717370b324cSopenharmony_ci    // dest.ReleaseBuf_SetEnd((unsigned)(destEnd - destStart));
718370b324cSopenharmony_ci    printf("\nlen = %d\n", (unsigned)destLen);
719370b324cSopenharmony_ci    printf("\n(destEnd - destStart) = %d\n", (unsigned)(destEnd - destStart));
720370b324cSopenharmony_ci    printf("\n");
721370b324cSopenharmony_ci    // Print_UString(src);
722370b324cSopenharmony_ci    printf("\n");
723370b324cSopenharmony_ci    // printf("\nlen = %d\n", destLen);
724370b324cSopenharmony_ci    */
725370b324cSopenharmony_ci    throw 20210406;
726370b324cSopenharmony_ci  }
727370b324cSopenharmony_ci}
728370b324cSopenharmony_ci
729370b324cSopenharmony_ci
730370b324cSopenharmony_ci
731370b324cSopenharmony_ciunsigned g_Unicode_To_UTF8_Flags =
732370b324cSopenharmony_ci      // Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
733370b324cSopenharmony_ci      0
734370b324cSopenharmony_ci  #ifndef _WIN32
735370b324cSopenharmony_ci    #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED
736370b324cSopenharmony_ci      | Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
737370b324cSopenharmony_ci    #else
738370b324cSopenharmony_ci      | Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
739370b324cSopenharmony_ci    #endif
740370b324cSopenharmony_ci  #endif
741370b324cSopenharmony_ci    ;
742370b324cSopenharmony_ci
743370b324cSopenharmony_civoid ConvertUnicodeToUTF8(const UString &src, AString &dest)
744370b324cSopenharmony_ci{
745370b324cSopenharmony_ci  ConvertUnicodeToUTF8_Flags(src, dest, g_Unicode_To_UTF8_Flags);
746370b324cSopenharmony_ci}
747370b324cSopenharmony_ci
748370b324cSopenharmony_civoid Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest)
749370b324cSopenharmony_ci{
750370b324cSopenharmony_ci  const unsigned flags = g_Unicode_To_UTF8_Flags;
751370b324cSopenharmony_ci  dest.Free();
752370b324cSopenharmony_ci  const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
753370b324cSopenharmony_ci  dest.Alloc(destLen);
754370b324cSopenharmony_ci  const char *destEnd = Utf16_To_Utf8((char *)(void *)(Byte *)dest, src, src.Ptr(src.Len()), flags);
755370b324cSopenharmony_ci  if (destLen != (size_t)(destEnd - (char *)(void *)(Byte *)dest))
756370b324cSopenharmony_ci    throw 202104;
757370b324cSopenharmony_ci}
758370b324cSopenharmony_ci
759370b324cSopenharmony_ci/*
760370b324cSopenharmony_ci
761370b324cSopenharmony_ci#ifndef _WIN32
762370b324cSopenharmony_civoid Convert_UTF16_To_UTF32(const UString &src, UString &dest)
763370b324cSopenharmony_ci{
764370b324cSopenharmony_ci  dest.Empty();
765370b324cSopenharmony_ci  for (size_t i = 0; i < src.Len();)
766370b324cSopenharmony_ci  {
767370b324cSopenharmony_ci    wchar_t c = src[i++];
768370b324cSopenharmony_ci    if (c >= 0xd800 && c < 0xdc00 && i < src.Len())
769370b324cSopenharmony_ci    {
770370b324cSopenharmony_ci      const wchar_t c2 = src[i];
771370b324cSopenharmony_ci      if (c2 >= 0xdc00 && c2 < 0x10000)
772370b324cSopenharmony_ci      {
773370b324cSopenharmony_ci        // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
774370b324cSopenharmony_ci        c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
775370b324cSopenharmony_ci        // printf("%4x\n", (int)c);
776370b324cSopenharmony_ci        i++;
777370b324cSopenharmony_ci      }
778370b324cSopenharmony_ci    }
779370b324cSopenharmony_ci    dest += c;
780370b324cSopenharmony_ci  }
781370b324cSopenharmony_ci}
782370b324cSopenharmony_ci
783370b324cSopenharmony_civoid Convert_UTF32_To_UTF16(const UString &src, UString &dest)
784370b324cSopenharmony_ci{
785370b324cSopenharmony_ci  dest.Empty();
786370b324cSopenharmony_ci  for (size_t i = 0; i < src.Len();)
787370b324cSopenharmony_ci  {
788370b324cSopenharmony_ci    wchar_t w = src[i++];
789370b324cSopenharmony_ci    if (w >= 0x10000 && w < 0x110000)
790370b324cSopenharmony_ci    {
791370b324cSopenharmony_ci      w -= 0x10000;
792370b324cSopenharmony_ci      dest += (wchar_t)((unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff));
793370b324cSopenharmony_ci      w = 0xdc00 + (w & 0x3ff);
794370b324cSopenharmony_ci    }
795370b324cSopenharmony_ci    dest += w;
796370b324cSopenharmony_ci  }
797370b324cSopenharmony_ci}
798370b324cSopenharmony_ci
799370b324cSopenharmony_cibool UTF32_IsThere_BigPoint(const UString &src)
800370b324cSopenharmony_ci{
801370b324cSopenharmony_ci  for (size_t i = 0; i < src.Len();)
802370b324cSopenharmony_ci  {
803370b324cSopenharmony_ci    const UInt32 c = (UInt32)src[i++];
804370b324cSopenharmony_ci    if (c >= 0x110000)
805370b324cSopenharmony_ci      return true;
806370b324cSopenharmony_ci  }
807370b324cSopenharmony_ci  return false;
808370b324cSopenharmony_ci}
809370b324cSopenharmony_ci
810370b324cSopenharmony_cibool Unicode_IsThere_BmpEscape(const UString &src)
811370b324cSopenharmony_ci{
812370b324cSopenharmony_ci  for (size_t i = 0; i < src.Len();)
813370b324cSopenharmony_ci  {
814370b324cSopenharmony_ci    const UInt32 c = (UInt32)src[i++];
815370b324cSopenharmony_ci    if (IS_ESCAPE_POINT(c, 0))
816370b324cSopenharmony_ci      return true;
817370b324cSopenharmony_ci  }
818370b324cSopenharmony_ci  return false;
819370b324cSopenharmony_ci}
820370b324cSopenharmony_ci
821370b324cSopenharmony_ci
822370b324cSopenharmony_ci#endif
823370b324cSopenharmony_ci
824370b324cSopenharmony_cibool Unicode_IsThere_Utf16SurrogateError(const UString &src)
825370b324cSopenharmony_ci{
826370b324cSopenharmony_ci  for (size_t i = 0; i < src.Len();)
827370b324cSopenharmony_ci  {
828370b324cSopenharmony_ci    const UInt32 val = (UInt32)src[i++];
829370b324cSopenharmony_ci    if (IS_SURROGATE_POINT(val))
830370b324cSopenharmony_ci    {
831370b324cSopenharmony_ci      // it's hack to UTF-8 encoding
832370b324cSopenharmony_ci      if (val >= 0xdc00 || i == src.Len())
833370b324cSopenharmony_ci        return true;
834370b324cSopenharmony_ci      const UInt32 c2 = (UInt32)*src;
835370b324cSopenharmony_ci      if (!IS_LOW_SURROGATE_POINT(c2))
836370b324cSopenharmony_ci        return true;
837370b324cSopenharmony_ci    }
838370b324cSopenharmony_ci  }
839370b324cSopenharmony_ci  return false;
840370b324cSopenharmony_ci}
841370b324cSopenharmony_ci*/
842370b324cSopenharmony_ci
843370b324cSopenharmony_ci#ifndef Z7_WCHART_IS_16BIT
844370b324cSopenharmony_ci
845370b324cSopenharmony_civoid Convert_UnicodeEsc16_To_UnicodeEscHigh
846370b324cSopenharmony_ci#if UTF_ESCAPE_PLANE == 0
847370b324cSopenharmony_ci    (UString &) {}
848370b324cSopenharmony_ci#else
849370b324cSopenharmony_ci    (UString &s)
850370b324cSopenharmony_ci{
851370b324cSopenharmony_ci  const unsigned len = s.Len();
852370b324cSopenharmony_ci  for (unsigned i = 0; i < len; i++)
853370b324cSopenharmony_ci  {
854370b324cSopenharmony_ci    wchar_t c = s[i];
855370b324cSopenharmony_ci    if (IS_ESCAPE_POINT(c, 0))
856370b324cSopenharmony_ci    {
857370b324cSopenharmony_ci      c += UTF_ESCAPE_PLANE;
858370b324cSopenharmony_ci      s.ReplaceOneCharAtPos(i, c);
859370b324cSopenharmony_ci    }
860370b324cSopenharmony_ci  }
861370b324cSopenharmony_ci}
862370b324cSopenharmony_ci#endif
863370b324cSopenharmony_ci#endif
864