xref: /third_party/lzma/CPP/Common/UTFConvert.h (revision 370b324c)
1// Common/UTFConvert.h
2
3#ifndef ZIP7_INC_COMMON_UTF_CONVERT_H
4#define ZIP7_INC_COMMON_UTF_CONVERT_H
5
6#include "MyBuffer.h"
7#include "MyString.h"
8
9struct CUtf8Check
10{
11  // Byte MaxByte;     // in original src stream
12  bool NonUtf;
13  bool ZeroChar;
14  bool SingleSurrogate;
15  bool Escape;
16  bool Truncated;
17  UInt32 MaxHighPoint;  // only for points >= 0x80
18
19  CUtf8Check() { Clear(); }
20
21  void Clear()
22  {
23    // MaxByte = 0;
24    NonUtf = false;
25    ZeroChar = false;
26    SingleSurrogate = false;
27    Escape = false;
28    Truncated = false;
29    MaxHighPoint = 0;
30  }
31
32  void Update(const CUtf8Check &c)
33  {
34    if (c.NonUtf) NonUtf = true;
35    if (c.ZeroChar) ZeroChar = true;
36    if (c.SingleSurrogate) SingleSurrogate = true;
37    if (c.Escape) Escape = true;
38    if (c.Truncated) Truncated = true;
39    if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint;
40  }
41
42  void PrintStatus(AString &s) const
43  {
44    s.Empty();
45
46    // s.Add_OptSpaced("MaxByte=");
47    // s.Add_UInt32(MaxByte);
48
49    if (NonUtf)          s.Add_OptSpaced("non-UTF8");
50    if (ZeroChar)        s.Add_OptSpaced("ZeroChar");
51    if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate");
52    if (Escape)          s.Add_OptSpaced("Escape");
53    if (Truncated)       s.Add_OptSpaced("Truncated");
54
55    if (MaxHighPoint != 0)
56    {
57      s.Add_OptSpaced("MaxUnicode=");
58      s.Add_UInt32(MaxHighPoint);
59    }
60  }
61
62
63  bool IsOK(bool allowReduced = false) const
64  {
65    if (NonUtf || SingleSurrogate || ZeroChar)
66      return false;
67    if (MaxHighPoint >= 0x110000)
68      return false;
69    if (Truncated && !allowReduced)
70      return false;
71    return true;
72  }
73
74  // it checks full buffer as specified in (size) and it doesn't stop on zero char
75  void Check_Buf(const char *src, size_t size) throw();
76
77  void Check_AString(const AString &s) throw()
78  {
79    Check_Buf(s.Ptr(), s.Len());
80  }
81};
82
83/*
84if (allowReduced == false) - all UTF-8 character sequences must be finished.
85if (allowReduced == true)  - it allows truncated last character-Utf8-sequence
86*/
87
// One-call UTF-8 checks for a raw (src, size) buffer and for an AString.
// (allowReduced) has the meaning given in the comment just above: when true,
// a truncated last character sequence is tolerated.
// NOTE(review): presumably both return true when the data is acceptable and
// are built on CUtf8Check::Check_Buf() + IsOK() - confirm in UTFConvert.cpp.
// CheckUTF8_AString() has no allowReduced parameter; which mode it uses is
// not visible in this header.
88bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
89bool CheckUTF8_AString(const AString &s) throw();
90
// Bit flags for the UTF-8 -> Unicode direction. Each flag is a distinct
// single bit (bits 0, 1 and 2), so they can be OR-ed together.
// The effect of each flag is described in the comment that follows.
91#define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR    (1 << 0)
92#define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE         (1 << 1)
93#define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2)
94
95/*
96Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
97
98   if (flag is NOT set)
99   {
100     it processes SINGLE-SURROGATE-8 as valid Unicode point.
101     it converts  SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16
102     Note: some sequences of two SINGLE-SURROGATE-8 points
103           will generate correct SURROGATE-16-PAIR, and
104           that SURROGATE-16-PAIR later will be converted to correct
105           UTF8-SURROGATE-21 point. So we don't restore original
106           STR-8 sequence in that case.
107   }
108
109   if (flag is set)
110   {
111     if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set)
112        it generates ESCAPE for SINGLE-SURROGATE-8,
113     if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is NOT set)
114        it generates U+fffd for SINGLE-SURROGATE-8.
115   }
116
117
118Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
119
120   if (flag is NOT set)
121     it generates (U+fffd) code for non-UTF-8 (invalid) characters
122
123   if (flag is set)
124   {
125     It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters.
126     And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
127   }
128
129Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
130
131   if (flag is NOT set)
132   {
133     it processes ESCAPE-8 points like any other Unicode points.
134     In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
135       so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW
136   }
137
138   if (flag is set)
139   {
140     it generates ESCAPE-16-21 for ESCAPE-8 points
141     so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21.
142   }
143
144
145Main USE CASES with UTF-8 <-> UTF-16 conversions:
146
147 WIN32:   UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
148   {
149            set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
150     Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
151     Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
152
153     So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
154   }
155
156 Linux:   UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
157   {
158     we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16.
159     Set the flags:
160       Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
161       Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
162       Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
163   }
164
165 MacOS:   UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
166   {
167     we want to restore correct UTF-8 without any BMP processing:
168     Set the flags:
169       Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
170       Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
171   }
172
173*/
174
175// zero char is not allowed in (src) buf
// UTF-8 -> Unicode (UString) converters. The input is either a raw
// (src, srcSize) buffer or an AString; the output is written to (dest).
// (flags) defaults to 0, i.e. none of the flag bits set.
// NOTE(review): (flags) is presumably a combination of the
// Z7_UTF_FLAG_FROM_UTF8_* bits defined above, and the bool result presumably
// reports whether the input converted without errors - confirm both in
// UTFConvert.cpp. Which flags ConvertUTF8ToUnicode() (the overload without a
// flags parameter) applies is not visible in this header.
176bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0);
177
178bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
179bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
180
// Bit flags for the Unicode -> UTF-8 direction. They use bits 8 and 9, so
// they do not collide with the Z7_UTF_FLAG_FROM_UTF8_* flags (bits 0..2).
// Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (bit 10) is commented out here, so
// that name is currently undefined even though the comment below still
// describes it.
181#define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR    (1 << 8)
182#define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9)
183// #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE  (1 << 10)
184
185/*
186Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
187
188  if (flag is NOT set)
189  {
190     we extract SINGLE-SURROGATE as normal UTF-8
191
192     In Windows : used for the UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW round trip.
193
194     In Linux :
195       use-case-1: UTF-8 -> UTF-16 -> UTF-8  doesn't generate UTF-16 SINGLE-SURROGATE,
196                   if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used.
197       use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
198                   will generate SINGLE-SURROGATE-UTF-8 here.
199  }
200
201  if (flag is set)
202  {
203     we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE
204     it can be used for compatibility mode with WIN32 UTF function
205     or if we want UTF-8 stream without any errors
206  }
207
208
209Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
210
211  if (flag is NOT set) it doesn't extract  raw 8-bit symbol from Escape-Plane-16
212  if (flag is set)     it         extracts raw 8-bit symbol from Escape-Plane-16
213
214  in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
215  if (we       use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
216  if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
217
218
219Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
220  // that flag affects the code only if (wchar_t is 32-bit)
221  // that mode with high-escape can be disabled now in UTFConvert.cpp
222  if (flag is NOT set)
223     it doesn't extract raw 8-bit symbol from High-Escape-Plane
224  if (flag is set)
225     it        extracts raw 8-bit symbol from High-Escape-Plane
226
227Main use cases:
228
229WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
230   {
231     Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
232     Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR.
233     So we restore original UTF-16-RAW.
234   }
235
236Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
237     set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
238     set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16.
239     Note: high escape mode can be ignored now in UTFConvert.cpp
240
241macOS:
242     the system doesn't support incorrect UTF-8 in file names.
243     set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
244*/
245
// Global flag word for the Unicode -> UTF-8 direction; only declared here,
// the definition lives in another translation unit.
// NOTE(review): presumably holds Z7_UTF_FLAG_TO_UTF8_* bits and is what
// ConvertUnicodeToUTF8() (the overload without a flags parameter) uses -
// confirm in UTFConvert.cpp.
246extern unsigned g_Unicode_To_UTF8_Flags;
247
// Unicode (UString) -> UTF-8 converters writing into an AString.
// They return void, so no error status is reported to the caller.
// (flags) defaults to 0, i.e. none of the flag bits set.
248void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0);
249void ConvertUnicodeToUTF8(const UString &src, AString &dest);
250
// Same direction, but the result goes into a CByteBuffer instead of an
// AString. It takes no flags parameter; which flags it applies is not
// visible in this header.
251void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest);
252
253/*
254#ifndef _WIN32
255void Convert_UTF16_To_UTF32(const UString &src, UString &dest);
256void Convert_UTF32_To_UTF16(const UString &src, UString &dest);
257bool UTF32_IsThere_BigPoint(const UString &src);
258bool Unicode_IsThere_BmpEscape(const UString &src);
259#endif
260
261bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
262*/
263
// Convert_UnicodeEsc16_To_UnicodeEscHigh(s):
// - when Z7_WCHART_IS_16BIT is defined, this is a function-like macro with an
//   empty expansion, so a call compiles to nothing and (s) is not evaluated;
// - otherwise it is a real function that modifies the UString in place.
// NOTE(review): the name suggests it moves 16-bit escape codes to the high
// escape plane - confirm in UTFConvert.cpp.
264#ifdef Z7_WCHART_IS_16BIT
265#define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
266#else
267void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);
268#endif
269
270/*
271// #include "../../C/CpuArch.h"
272
273// ---------- Utf16 Little endian functions ----------
274
275// We store 16-bit surrogates even in 32-bit WCHARs in Linux.
276// So now we don't use the following code:
277
278#if WCHAR_MAX > 0xffff
279
280// void *p     : pointer to src bytes stream
281// size_t len  : num Utf16 characters : it can include or not include NULL character
282
283inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len)
284{
285  #if WCHAR_MAX > 0xffff
286  size_t num_wchars = 0;
287  for (size_t i = 0; i < len; i++)
288  {
289    wchar_t c = GetUi16(p);
290    p = (const void *)((const Byte *)p + 2);
291    if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
292    {
293      wchar_t c2 = GetUi16(p);
294      if (c2 >= 0xdc00 && c2 < 0xe000)
295      {
296        c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
297        p = (const void *)((const Byte *)p + 2);
298        i++;
299      }
300    }
301    num_wchars++;
302  }
303  return num_wchars;
304  #else
305  UNUSED_VAR(p)
306  return len;
307  #endif
308}
309
310// #include <stdio.h>
311
312inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest)
313{
314  for (size_t i = 0; i < len; i++)
315  {
316    wchar_t c = GetUi16(p);
317    p = (const void *)((const Byte *)p + 2);
318
319    #if WCHAR_PATH_SEPARATOR != L'/'
320    if (c == L'/')
321      c = WCHAR_PATH_SEPARATOR;
322    #endif
323
324    #if WCHAR_MAX > 0xffff
325
326    if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
327    {
328      wchar_t c2 = GetUi16(p);
329      if (c2 >= 0xdc00 && c2 < 0xe000)
330      {
331        // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2);
332        c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
333        p = (const void *)((const Byte *)p + 2);
334        i++;
335        // printf("%4x\n", (int)c);
336      }
337    }
338
339    #endif
340
341    *dest++ = c;
342  }
343  return dest;
344}
345
346
347inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p)
348{
349  size_t num = 0;
350  for (;;)
351  {
352    wchar_t c = *p++;
353    if (c == 0)
354      return num;
355    num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1);
356  }
357  return num;
358}
359
360inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest)
361{
362  for (;;)
363  {
364    wchar_t c = *p++;
365    if (c == 0)
366      return dest;
367    if (c >= 0x10000 && c < 0x110000)
368    {
369      SetUi16(dest    , (UInt16)(0xd800 + ((c >> 10) & 0x3FF)));
370      SetUi16(dest + 2, (UInt16)(0xdc00 + ( c        & 0x3FF)));
371      dest += 4;
372    }
373    else
374    {
375      SetUi16(dest, c);
376      dest += 2;
377    }
378  }
379}
380
381#endif
382*/
383
384#endif
385