1370b324cSopenharmony_ci// Common/StringConvert.cpp
2370b324cSopenharmony_ci
3370b324cSopenharmony_ci#include "StdAfx.h"
4370b324cSopenharmony_ci
5370b324cSopenharmony_ci#include "StringConvert.h"
6370b324cSopenharmony_ci
7370b324cSopenharmony_ci#ifndef _WIN32
8370b324cSopenharmony_ci// #include <stdio.h>
9370b324cSopenharmony_ci#include <stdlib.h>
10370b324cSopenharmony_ci#endif
11370b324cSopenharmony_ci
12370b324cSopenharmony_ci#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
13370b324cSopenharmony_ci#include "UTFConvert.h"
14370b324cSopenharmony_ci#endif
15370b324cSopenharmony_ci
16370b324cSopenharmony_ci#ifdef ENV_HAVE_LOCALE
17370b324cSopenharmony_ci#include <locale.h>
18370b324cSopenharmony_ci#endif
19370b324cSopenharmony_ci
// Substitute character that the wrappers of this file pass as (defaultChar)
// to UnicodeStringToMultiByte2(), when the caller does not supply its own one.
static const char k_DefultChar = '_';
21370b324cSopenharmony_ci
22370b324cSopenharmony_ci#ifdef _WIN32
23370b324cSopenharmony_ci
24370b324cSopenharmony_ci/*
25370b324cSopenharmony_ciMultiByteToWideChar(CodePage, DWORD dwFlags,
26370b324cSopenharmony_ci    LPCSTR lpMultiByteStr, int cbMultiByte,
27370b324cSopenharmony_ci    LPWSTR lpWideCharStr, int cchWideChar)
28370b324cSopenharmony_ci
29370b324cSopenharmony_ci  if (cbMultiByte == 0)
30370b324cSopenharmony_ci    return: 0. ERR: ERROR_INVALID_PARAMETER
31370b324cSopenharmony_ci
32370b324cSopenharmony_ci  if (cchWideChar == 0)
33370b324cSopenharmony_ci    return: the required buffer size in characters.
34370b324cSopenharmony_ci
35370b324cSopenharmony_ci  if (supplied buffer size was not large enough)
36370b324cSopenharmony_ci    return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
37370b324cSopenharmony_ci    The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
38370b324cSopenharmony_ci
39370b324cSopenharmony_ci  If there are illegal characters:
40370b324cSopenharmony_ci    if MB_ERR_INVALID_CHARS is set in dwFlags:
41370b324cSopenharmony_ci      - the function stops conversion on illegal character.
42370b324cSopenharmony_ci      - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
43370b324cSopenharmony_ci
44370b324cSopenharmony_ci    if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
45370b324cSopenharmony_ci      before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
46370b324cSopenharmony_ci      in Vista+:    illegal character is not dropped (MSDN). Undocumented: illegal
47370b324cSopenharmony_ci                    character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
48370b324cSopenharmony_ci*/
49370b324cSopenharmony_ci
50370b324cSopenharmony_ci
51370b324cSopenharmony_civoid MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
52370b324cSopenharmony_ci{
53370b324cSopenharmony_ci  dest.Empty();
54370b324cSopenharmony_ci  if (src.IsEmpty())
55370b324cSopenharmony_ci    return;
56370b324cSopenharmony_ci  {
57370b324cSopenharmony_ci    /*
58370b324cSopenharmony_ci    wchar_t *d = dest.GetBuf(src.Len());
59370b324cSopenharmony_ci    const char *s = (const char *)src;
60370b324cSopenharmony_ci    unsigned i;
61370b324cSopenharmony_ci
62370b324cSopenharmony_ci    for (i = 0;;)
63370b324cSopenharmony_ci    {
64370b324cSopenharmony_ci      Byte c = (Byte)s[i];
65370b324cSopenharmony_ci      if (c >= 0x80 || c == 0)
66370b324cSopenharmony_ci        break;
67370b324cSopenharmony_ci      d[i++] = (wchar_t)c;
68370b324cSopenharmony_ci    }
69370b324cSopenharmony_ci
70370b324cSopenharmony_ci    if (i != src.Len())
71370b324cSopenharmony_ci    {
72370b324cSopenharmony_ci      unsigned len = MultiByteToWideChar(codePage, 0, s + i,
73370b324cSopenharmony_ci          src.Len() - i, d + i,
74370b324cSopenharmony_ci          src.Len() + 1 - i);
75370b324cSopenharmony_ci      if (len == 0)
76370b324cSopenharmony_ci        throw 282228;
77370b324cSopenharmony_ci      i += len;
78370b324cSopenharmony_ci    }
79370b324cSopenharmony_ci
80370b324cSopenharmony_ci    d[i] = 0;
81370b324cSopenharmony_ci    dest.ReleaseBuf_SetLen(i);
82370b324cSopenharmony_ci    */
83370b324cSopenharmony_ci    unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0);
84370b324cSopenharmony_ci    if (len == 0)
85370b324cSopenharmony_ci    {
86370b324cSopenharmony_ci      if (GetLastError() != 0)
87370b324cSopenharmony_ci        throw 282228;
88370b324cSopenharmony_ci    }
89370b324cSopenharmony_ci    else
90370b324cSopenharmony_ci    {
91370b324cSopenharmony_ci      len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len);
92370b324cSopenharmony_ci      if (len == 0)
93370b324cSopenharmony_ci        throw 282228;
94370b324cSopenharmony_ci      dest.ReleaseBuf_SetEnd(len);
95370b324cSopenharmony_ci    }
96370b324cSopenharmony_ci  }
97370b324cSopenharmony_ci}
98370b324cSopenharmony_ci
99370b324cSopenharmony_ci/*
100370b324cSopenharmony_ci  int WideCharToMultiByte(
101370b324cSopenharmony_ci      UINT CodePage, DWORD dwFlags,
102370b324cSopenharmony_ci      LPCWSTR lpWideCharStr, int cchWideChar,
103370b324cSopenharmony_ci      LPSTR lpMultiByteStr, int cbMultiByte,
104370b324cSopenharmony_ci      LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
105370b324cSopenharmony_ci
106370b324cSopenharmony_ciif (lpDefaultChar == NULL),
107370b324cSopenharmony_ci  - it uses system default value.
108370b324cSopenharmony_ci
109370b324cSopenharmony_ciif (CodePage == CP_UTF7 || CodePage == CP_UTF8)
110370b324cSopenharmony_ci  if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL)
111370b324cSopenharmony_ci    return: 0. ERR: ERROR_INVALID_PARAMETER.
112370b324cSopenharmony_ci
113370b324cSopenharmony_ciThe function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
114370b324cSopenharmony_ci
115370b324cSopenharmony_ci*/
116370b324cSopenharmony_ci
117370b324cSopenharmony_cistatic void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
118370b324cSopenharmony_ci{
119370b324cSopenharmony_ci  dest.Empty();
120370b324cSopenharmony_ci  defaultCharWasUsed = false;
121370b324cSopenharmony_ci  if (src.IsEmpty())
122370b324cSopenharmony_ci    return;
123370b324cSopenharmony_ci  {
124370b324cSopenharmony_ci    /*
125370b324cSopenharmony_ci    unsigned numRequiredBytes = src.Len() * 2;
126370b324cSopenharmony_ci    char *d = dest.GetBuf(numRequiredBytes);
127370b324cSopenharmony_ci    const wchar_t *s = (const wchar_t *)src;
128370b324cSopenharmony_ci    unsigned i;
129370b324cSopenharmony_ci
130370b324cSopenharmony_ci    for (i = 0;;)
131370b324cSopenharmony_ci    {
132370b324cSopenharmony_ci      wchar_t c = s[i];
133370b324cSopenharmony_ci      if (c >= 0x80 || c == 0)
134370b324cSopenharmony_ci        break;
135370b324cSopenharmony_ci      d[i++] = (char)c;
136370b324cSopenharmony_ci    }
137370b324cSopenharmony_ci
138370b324cSopenharmony_ci    if (i != src.Len())
139370b324cSopenharmony_ci    {
140370b324cSopenharmony_ci      BOOL defUsed = FALSE;
141370b324cSopenharmony_ci      defaultChar = defaultChar;
142370b324cSopenharmony_ci
143370b324cSopenharmony_ci      bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
144370b324cSopenharmony_ci      unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i,
145370b324cSopenharmony_ci          d + i, numRequiredBytes + 1 - i,
146370b324cSopenharmony_ci          (isUtf ? NULL : &defaultChar),
147370b324cSopenharmony_ci          (isUtf ? NULL : &defUsed));
148370b324cSopenharmony_ci      defaultCharWasUsed = (defUsed != FALSE);
149370b324cSopenharmony_ci      if (len == 0)
150370b324cSopenharmony_ci        throw 282229;
151370b324cSopenharmony_ci      i += len;
152370b324cSopenharmony_ci    }
153370b324cSopenharmony_ci
154370b324cSopenharmony_ci    d[i] = 0;
155370b324cSopenharmony_ci    dest.ReleaseBuf_SetLen(i);
156370b324cSopenharmony_ci    */
157370b324cSopenharmony_ci
158370b324cSopenharmony_ci    /*
159370b324cSopenharmony_ci    if (codePage != CP_UTF7)
160370b324cSopenharmony_ci    {
161370b324cSopenharmony_ci      const wchar_t *s = (const wchar_t *)src;
162370b324cSopenharmony_ci      unsigned i;
163370b324cSopenharmony_ci      for (i = 0;; i++)
164370b324cSopenharmony_ci      {
165370b324cSopenharmony_ci        wchar_t c = s[i];
166370b324cSopenharmony_ci        if (c >= 0x80 || c == 0)
167370b324cSopenharmony_ci          break;
168370b324cSopenharmony_ci      }
169370b324cSopenharmony_ci
170370b324cSopenharmony_ci      if (s[i] == 0)
171370b324cSopenharmony_ci      {
172370b324cSopenharmony_ci        char *d = dest.GetBuf(src.Len());
173370b324cSopenharmony_ci        for (i = 0;;)
174370b324cSopenharmony_ci        {
175370b324cSopenharmony_ci          wchar_t c = s[i];
176370b324cSopenharmony_ci          if (c == 0)
177370b324cSopenharmony_ci            break;
178370b324cSopenharmony_ci          d[i++] = (char)c;
179370b324cSopenharmony_ci        }
180370b324cSopenharmony_ci        d[i] = 0;
181370b324cSopenharmony_ci        dest.ReleaseBuf_SetLen(i);
182370b324cSopenharmony_ci        return;
183370b324cSopenharmony_ci      }
184370b324cSopenharmony_ci    }
185370b324cSopenharmony_ci    */
186370b324cSopenharmony_ci
187370b324cSopenharmony_ci    unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL);
188370b324cSopenharmony_ci    if (len == 0)
189370b324cSopenharmony_ci    {
190370b324cSopenharmony_ci      if (GetLastError() != 0)
191370b324cSopenharmony_ci        throw 282228;
192370b324cSopenharmony_ci    }
193370b324cSopenharmony_ci    else
194370b324cSopenharmony_ci    {
195370b324cSopenharmony_ci      BOOL defUsed = FALSE;
196370b324cSopenharmony_ci      bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
197370b324cSopenharmony_ci      // defaultChar = defaultChar;
198370b324cSopenharmony_ci      len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(),
199370b324cSopenharmony_ci          dest.GetBuf(len), (int)len,
200370b324cSopenharmony_ci          (isUtf ? NULL : &defaultChar),
201370b324cSopenharmony_ci          (isUtf ? NULL : &defUsed)
202370b324cSopenharmony_ci          );
203370b324cSopenharmony_ci      if (!isUtf)
204370b324cSopenharmony_ci        defaultCharWasUsed = (defUsed != FALSE);
205370b324cSopenharmony_ci      if (len == 0)
206370b324cSopenharmony_ci        throw 282228;
207370b324cSopenharmony_ci      dest.ReleaseBuf_SetEnd(len);
208370b324cSopenharmony_ci    }
209370b324cSopenharmony_ci  }
210370b324cSopenharmony_ci}
211370b324cSopenharmony_ci
212370b324cSopenharmony_ci/*
213370b324cSopenharmony_ci#ifndef UNDER_CE
214370b324cSopenharmony_ciAString SystemStringToOemString(const CSysString &src)
215370b324cSopenharmony_ci{
216370b324cSopenharmony_ci  AString dest;
217370b324cSopenharmony_ci  const unsigned len = src.Len() * 2;
218370b324cSopenharmony_ci  CharToOem(src, dest.GetBuf(len));
219370b324cSopenharmony_ci  dest.ReleaseBuf_CalcLen(len);
220370b324cSopenharmony_ci  return dest;
221370b324cSopenharmony_ci}
222370b324cSopenharmony_ci#endif
223370b324cSopenharmony_ci*/
224370b324cSopenharmony_ci
225370b324cSopenharmony_ci#else // _WIN32
226370b324cSopenharmony_ci
227370b324cSopenharmony_ci// #include <stdio.h>
228370b324cSopenharmony_ci/*
  If wchar_t is 32-bit (#if WCHAR_MAX > 0xffff)
  and the utf-8 string contains a big unicode character (> 0xffff),
  then we still use a 16-bit surrogate pair in UString.
  It simplifies other code where utf-16 encoding is used.
  So we use surrogate-conversion code only in this file.
234370b324cSopenharmony_ci*/
235370b324cSopenharmony_ci
236370b324cSopenharmony_ci/*
237370b324cSopenharmony_ci   mbstowcs() returns error if there is error in utf-8 stream,
238370b324cSopenharmony_ci   mbstowcs() returns error if there is single surrogates point (d800-dfff) in utf-8 stream
239370b324cSopenharmony_ci*/
240370b324cSopenharmony_ci
241370b324cSopenharmony_ci/*
242370b324cSopenharmony_cistatic void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
243370b324cSopenharmony_ci{
244370b324cSopenharmony_ci  dest.Empty();
245370b324cSopenharmony_ci  if (src.IsEmpty())
246370b324cSopenharmony_ci    return;
247370b324cSopenharmony_ci
248370b324cSopenharmony_ci  const size_t limit = ((size_t)src.Len() + 1) * 2;
249370b324cSopenharmony_ci  wchar_t *d = dest.GetBuf((unsigned)limit);
250370b324cSopenharmony_ci  const size_t len = mbstowcs(d, src, limit);
251370b324cSopenharmony_ci  if (len != (size_t)-1)
252370b324cSopenharmony_ci  {
253370b324cSopenharmony_ci    dest.ReleaseBuf_SetEnd((unsigned)len);
254370b324cSopenharmony_ci    return;
255370b324cSopenharmony_ci  }
256370b324cSopenharmony_ci  dest.ReleaseBuf_SetEnd(0);
257370b324cSopenharmony_ci}
258370b324cSopenharmony_ci*/
259370b324cSopenharmony_ci
// If true, the non-Windows conversion functions of this file that check
// (g_ForceToUTF8) ignore (codePage) and convert strings as UTF-8.
bool g_ForceToUTF8 = true; // false;
261370b324cSopenharmony_ci
262370b324cSopenharmony_civoid MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
263370b324cSopenharmony_ci{
264370b324cSopenharmony_ci  dest.Empty();
265370b324cSopenharmony_ci  if (src.IsEmpty())
266370b324cSopenharmony_ci    return;
267370b324cSopenharmony_ci
268370b324cSopenharmony_ci  if (codePage == CP_UTF8 || g_ForceToUTF8)
269370b324cSopenharmony_ci  {
270370b324cSopenharmony_ci    ConvertUTF8ToUnicode(src, dest);
271370b324cSopenharmony_ci    return;
272370b324cSopenharmony_ci  }
273370b324cSopenharmony_ci
274370b324cSopenharmony_ci  const size_t limit = ((size_t)src.Len() + 1) * 2;
275370b324cSopenharmony_ci  wchar_t *d = dest.GetBuf((unsigned)limit);
276370b324cSopenharmony_ci  const size_t len = mbstowcs(d, src, limit);
277370b324cSopenharmony_ci  if (len != (size_t)-1)
278370b324cSopenharmony_ci  {
279370b324cSopenharmony_ci    dest.ReleaseBuf_SetEnd((unsigned)len);
280370b324cSopenharmony_ci
281370b324cSopenharmony_ci    #if WCHAR_MAX > 0xffff
282370b324cSopenharmony_ci    d = dest.GetBuf();
283370b324cSopenharmony_ci    for (size_t i = 0;; i++)
284370b324cSopenharmony_ci    {
285370b324cSopenharmony_ci      // wchar_t c = dest[i];
286370b324cSopenharmony_ci      wchar_t c = d[i];
287370b324cSopenharmony_ci      if (c == 0)
288370b324cSopenharmony_ci        break;
289370b324cSopenharmony_ci      if (c >= 0x10000 && c < 0x110000)
290370b324cSopenharmony_ci      {
291370b324cSopenharmony_ci        /*
292370b324cSopenharmony_ci        c -= 0x10000;
293370b324cSopenharmony_ci        unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF);
294370b324cSopenharmony_ci        dest.ReplaceOneCharAtPos(i, c0);
295370b324cSopenharmony_ci        i++;
296370b324cSopenharmony_ci        c = 0xdc00 + (c & 0x3FF);
297370b324cSopenharmony_ci        dest.Insert_wchar_t(i, c);
298370b324cSopenharmony_ci        */
299370b324cSopenharmony_ci        UString temp = d + i;
300370b324cSopenharmony_ci
301370b324cSopenharmony_ci        for (size_t t = 0;; t++)
302370b324cSopenharmony_ci        {
303370b324cSopenharmony_ci          wchar_t w = temp[t];
304370b324cSopenharmony_ci          if (w == 0)
305370b324cSopenharmony_ci            break;
306370b324cSopenharmony_ci          if (i == limit)
307370b324cSopenharmony_ci            break; // unexpected error
308370b324cSopenharmony_ci          if (w >= 0x10000 && w < 0x110000)
309370b324cSopenharmony_ci          {
310370b324cSopenharmony_ci            if (i + 1 == limit)
311370b324cSopenharmony_ci              break; // unexpected error
312370b324cSopenharmony_ci            w -= 0x10000;
313370b324cSopenharmony_ci            d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF);
314370b324cSopenharmony_ci            w = 0xdc00 + (w & 0x3FF);
315370b324cSopenharmony_ci          }
316370b324cSopenharmony_ci          d[i++] = w;
317370b324cSopenharmony_ci        }
318370b324cSopenharmony_ci        dest.ReleaseBuf_SetEnd((unsigned)i);
319370b324cSopenharmony_ci      }
320370b324cSopenharmony_ci    }
321370b324cSopenharmony_ci
322370b324cSopenharmony_ci    #endif
323370b324cSopenharmony_ci
324370b324cSopenharmony_ci    /*
325370b324cSopenharmony_ci    printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(),  src.Ptr());
326370b324cSopenharmony_ci    printf("char:    ");
327370b324cSopenharmony_ci    for (unsigned i = 0; i < src.Len(); i++)
328370b324cSopenharmony_ci      printf (" %02x", (int)(Byte)src[i]);
329370b324cSopenharmony_ci    printf("\n");
330370b324cSopenharmony_ci    printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr());
331370b324cSopenharmony_ci    printf("wchar_t: ");
332370b324cSopenharmony_ci    for (unsigned i = 0; i < dest.Len(); i++)
333370b324cSopenharmony_ci    {
334370b324cSopenharmony_ci      printf (" %02x", (int)dest[i]);
335370b324cSopenharmony_ci    }
336370b324cSopenharmony_ci    printf("\n");
337370b324cSopenharmony_ci    */
338370b324cSopenharmony_ci
339370b324cSopenharmony_ci    return;
340370b324cSopenharmony_ci  }
341370b324cSopenharmony_ci
342370b324cSopenharmony_ci  /* if there is mbstowcs() error, we have two ways:
343370b324cSopenharmony_ci
344370b324cSopenharmony_ci     1) change 0x80+ characters to some character: '_'
345370b324cSopenharmony_ci        in that case we lose data, but we have correct UString()
346370b324cSopenharmony_ci        and that scheme can show errors to user in early stages,
347370b324cSopenharmony_ci        when file converted back to mbs() cannot be found
348370b324cSopenharmony_ci
349370b324cSopenharmony_ci     2) transfer bad characters in some UTF-16 range.
350370b324cSopenharmony_ci        it can be non-original Unicode character.
351370b324cSopenharmony_ci        but later we still can restore original character.
352370b324cSopenharmony_ci  */
353370b324cSopenharmony_ci
354370b324cSopenharmony_ci
355370b324cSopenharmony_ci  // printf("\nmbstowcs  ERROR !!!!!! s=%s\n", src.Ptr());
356370b324cSopenharmony_ci  {
357370b324cSopenharmony_ci    unsigned i;
358370b324cSopenharmony_ci    const char *s = (const char *)src;
359370b324cSopenharmony_ci    for (i = 0;;)
360370b324cSopenharmony_ci    {
361370b324cSopenharmony_ci      Byte c = (Byte)s[i];
362370b324cSopenharmony_ci      if (c == 0)
363370b324cSopenharmony_ci        break;
364370b324cSopenharmony_ci      // we can use ascii compatibilty character '_'
365370b324cSopenharmony_ci      // if (c > 0x7F) c = '_'; // we replace "bad: character
366370b324cSopenharmony_ci      d[i++] = (wchar_t)c;
367370b324cSopenharmony_ci    }
368370b324cSopenharmony_ci    d[i] = 0;
369370b324cSopenharmony_ci    dest.ReleaseBuf_SetLen(i);
370370b324cSopenharmony_ci  }
371370b324cSopenharmony_ci}
372370b324cSopenharmony_ci
373370b324cSopenharmony_cistatic void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
374370b324cSopenharmony_ci{
375370b324cSopenharmony_ci  dest.Empty();
376370b324cSopenharmony_ci  if (src.IsEmpty())
377370b324cSopenharmony_ci    return;
378370b324cSopenharmony_ci
379370b324cSopenharmony_ci  const size_t limit = ((size_t)src.Len() + 1) * 6;
380370b324cSopenharmony_ci  char *d = dest.GetBuf((unsigned)limit);
381370b324cSopenharmony_ci
382370b324cSopenharmony_ci  const size_t len = wcstombs(d, src, limit);
383370b324cSopenharmony_ci
384370b324cSopenharmony_ci  if (len != (size_t)-1)
385370b324cSopenharmony_ci  {
386370b324cSopenharmony_ci    dest.ReleaseBuf_SetEnd((unsigned)len);
387370b324cSopenharmony_ci    return;
388370b324cSopenharmony_ci  }
389370b324cSopenharmony_ci  dest.ReleaseBuf_SetEnd(0);
390370b324cSopenharmony_ci}
391370b324cSopenharmony_ci
392370b324cSopenharmony_ci
393370b324cSopenharmony_cistatic void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
394370b324cSopenharmony_ci{
395370b324cSopenharmony_ci  // if (codePage == 1234567) // for debug purposes
396370b324cSopenharmony_ci  if (codePage == CP_UTF8 || g_ForceToUTF8)
397370b324cSopenharmony_ci  {
398370b324cSopenharmony_ci    defaultCharWasUsed = false;
399370b324cSopenharmony_ci    ConvertUnicodeToUTF8(src2, dest);
400370b324cSopenharmony_ci    return;
401370b324cSopenharmony_ci  }
402370b324cSopenharmony_ci
403370b324cSopenharmony_ci  UString src = src2;
404370b324cSopenharmony_ci  #if WCHAR_MAX > 0xffff
405370b324cSopenharmony_ci  {
406370b324cSopenharmony_ci    src.Empty();
407370b324cSopenharmony_ci    for (unsigned i = 0; i < src2.Len();)
408370b324cSopenharmony_ci    {
409370b324cSopenharmony_ci      wchar_t c = src2[i];
410370b324cSopenharmony_ci      if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len())
411370b324cSopenharmony_ci      {
412370b324cSopenharmony_ci        const wchar_t c2 = src2[i + 1];
413370b324cSopenharmony_ci        if (c2 >= 0xdc00 && c2 < 0x10000)
414370b324cSopenharmony_ci        {
415370b324cSopenharmony_ci          // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
416370b324cSopenharmony_ci          c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
417370b324cSopenharmony_ci          // printf("%4x\n", (int)c);
418370b324cSopenharmony_ci          i++;
419370b324cSopenharmony_ci        }
420370b324cSopenharmony_ci      }
421370b324cSopenharmony_ci      src += c;
422370b324cSopenharmony_ci      i++;
423370b324cSopenharmony_ci    }
424370b324cSopenharmony_ci  }
425370b324cSopenharmony_ci  #endif
426370b324cSopenharmony_ci
427370b324cSopenharmony_ci  dest.Empty();
428370b324cSopenharmony_ci  defaultCharWasUsed = false;
429370b324cSopenharmony_ci  if (src.IsEmpty())
430370b324cSopenharmony_ci    return;
431370b324cSopenharmony_ci
432370b324cSopenharmony_ci  const size_t len = wcstombs(NULL, src, 0);
433370b324cSopenharmony_ci
434370b324cSopenharmony_ci  if (len != (size_t)-1)
435370b324cSopenharmony_ci  {
436370b324cSopenharmony_ci    const unsigned limit = ((unsigned)len);
437370b324cSopenharmony_ci    if (limit == len)
438370b324cSopenharmony_ci    {
439370b324cSopenharmony_ci      char *d = dest.GetBuf(limit);
440370b324cSopenharmony_ci
441370b324cSopenharmony_ci      /*
442370b324cSopenharmony_ci      {
443370b324cSopenharmony_ci        printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr());
444370b324cSopenharmony_ci        for (unsigned i = 0; i < src.Len(); i++)
445370b324cSopenharmony_ci          printf (" %02x", (int)src[i]);
446370b324cSopenharmony_ci        printf("\n");
447370b324cSopenharmony_ci        printf("\ndest Limit = %d \n", limit);
448370b324cSopenharmony_ci      }
449370b324cSopenharmony_ci      */
450370b324cSopenharmony_ci
451370b324cSopenharmony_ci      const size_t len2 = wcstombs(d, src, len + 1);
452370b324cSopenharmony_ci
453370b324cSopenharmony_ci      if (len2 != (size_t)-1 && len2 <= limit)
454370b324cSopenharmony_ci      {
455370b324cSopenharmony_ci        /*
456370b324cSopenharmony_ci        printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr());
457370b324cSopenharmony_ci        for (unsigned i = 0; i < len2; i++)
458370b324cSopenharmony_ci          printf(" %02x", (int)(Byte)dest[i]);
459370b324cSopenharmony_ci        printf("\n");
460370b324cSopenharmony_ci        */
461370b324cSopenharmony_ci        dest.ReleaseBuf_SetEnd((unsigned)len2);
462370b324cSopenharmony_ci        return;
463370b324cSopenharmony_ci      }
464370b324cSopenharmony_ci    }
465370b324cSopenharmony_ci  }
466370b324cSopenharmony_ci
467370b324cSopenharmony_ci  {
468370b324cSopenharmony_ci    const wchar_t *s = (const wchar_t *)src;
469370b324cSopenharmony_ci    char *d = dest.GetBuf(src.Len());
470370b324cSopenharmony_ci
471370b324cSopenharmony_ci    unsigned i;
472370b324cSopenharmony_ci    for (i = 0;;)
473370b324cSopenharmony_ci    {
474370b324cSopenharmony_ci      wchar_t c = s[i];
475370b324cSopenharmony_ci      if (c == 0)
476370b324cSopenharmony_ci        break;
477370b324cSopenharmony_ci      if (c >=
478370b324cSopenharmony_ci            0x100
479370b324cSopenharmony_ci            // 0x80
480370b324cSopenharmony_ci          )
481370b324cSopenharmony_ci      {
482370b324cSopenharmony_ci        c = defaultChar;
483370b324cSopenharmony_ci        defaultCharWasUsed = true;
484370b324cSopenharmony_ci      }
485370b324cSopenharmony_ci
486370b324cSopenharmony_ci      d[i++] = (char)c;
487370b324cSopenharmony_ci    }
488370b324cSopenharmony_ci    d[i] = 0;
489370b324cSopenharmony_ci    dest.ReleaseBuf_SetLen(i);
490370b324cSopenharmony_ci    /*
491370b324cSopenharmony_ci    printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len());
492370b324cSopenharmony_ci    printf("ERROR: %s\n", dest.Ptr());
493370b324cSopenharmony_ci    */
494370b324cSopenharmony_ci  }
495370b324cSopenharmony_ci}
496370b324cSopenharmony_ci
497370b324cSopenharmony_ci#endif // _WIN32
498370b324cSopenharmony_ci
499370b324cSopenharmony_ci
500370b324cSopenharmony_ciUString MultiByteToUnicodeString(const AString &src, UINT codePage)
501370b324cSopenharmony_ci{
502370b324cSopenharmony_ci  UString dest;
503370b324cSopenharmony_ci  MultiByteToUnicodeString2(dest, src, codePage);
504370b324cSopenharmony_ci  return dest;
505370b324cSopenharmony_ci}
506370b324cSopenharmony_ci
507370b324cSopenharmony_ciUString MultiByteToUnicodeString(const char *src, UINT codePage)
508370b324cSopenharmony_ci{
509370b324cSopenharmony_ci  return MultiByteToUnicodeString(AString(src), codePage);
510370b324cSopenharmony_ci}
511370b324cSopenharmony_ci
512370b324cSopenharmony_ci
513370b324cSopenharmony_civoid UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
514370b324cSopenharmony_ci{
515370b324cSopenharmony_ci  bool defaultCharWasUsed;
516370b324cSopenharmony_ci  UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
517370b324cSopenharmony_ci}
518370b324cSopenharmony_ci
519370b324cSopenharmony_ciAString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
520370b324cSopenharmony_ci{
521370b324cSopenharmony_ci  AString dest;
522370b324cSopenharmony_ci  UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed);
523370b324cSopenharmony_ci  return dest;
524370b324cSopenharmony_ci}
525370b324cSopenharmony_ci
526370b324cSopenharmony_ciAString UnicodeStringToMultiByte(const UString &src, UINT codePage)
527370b324cSopenharmony_ci{
528370b324cSopenharmony_ci  AString dest;
529370b324cSopenharmony_ci  bool defaultCharWasUsed;
530370b324cSopenharmony_ci  UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
531370b324cSopenharmony_ci  return dest;
532370b324cSopenharmony_ci}
533370b324cSopenharmony_ci
534370b324cSopenharmony_ci
535370b324cSopenharmony_ci
536370b324cSopenharmony_ci
537370b324cSopenharmony_ci#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
538370b324cSopenharmony_ci
// U_to_A(dest, src, codePage):
//   converts UString (src) to AString (dest) with the conversion function of the platform.
//   (codePage) is used only in _WIN32 branch.
// The macro must pass its arguments to the function in both branches:
// the bare function name without arguments is not a call.
#ifdef _WIN32
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2(a, b, c)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2(a, b, c)
#else
// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2_Native(a, b)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2_Native(a, b)
#endif
547370b324cSopenharmony_ci
548370b324cSopenharmony_cibool IsNativeUTF8()
549370b324cSopenharmony_ci{
550370b324cSopenharmony_ci  UString u;
551370b324cSopenharmony_ci  AString a, a2;
552370b324cSopenharmony_ci  // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1)
553370b324cSopenharmony_ci  for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1)
554370b324cSopenharmony_ci  {
555370b324cSopenharmony_ci    u.Empty();
556370b324cSopenharmony_ci    u += (wchar_t)c;
557370b324cSopenharmony_ci    /*
558370b324cSopenharmony_ci    if (Unicode_Is_There_Utf16SurrogateError(u))
559370b324cSopenharmony_ci      continue;
560370b324cSopenharmony_ci    #ifndef _WIN32
561370b324cSopenharmony_ci    if (Unicode_Is_There_BmpEscape(u))
562370b324cSopenharmony_ci      continue;
563370b324cSopenharmony_ci    #endif
564370b324cSopenharmony_ci    */
565370b324cSopenharmony_ci    ConvertUnicodeToUTF8(u, a);
566370b324cSopenharmony_ci    U_to_A(a2, u, CP_OEMCP);
567370b324cSopenharmony_ci    if (a != a2)
568370b324cSopenharmony_ci      return false;
569370b324cSopenharmony_ci  }
570370b324cSopenharmony_ci  return true;
571370b324cSopenharmony_ci}
572370b324cSopenharmony_ci
573370b324cSopenharmony_ci#endif
574370b324cSopenharmony_ci
575370b324cSopenharmony_ci
576370b324cSopenharmony_ci#ifdef ENV_HAVE_LOCALE
577370b324cSopenharmony_ci
/*
  Returns the name of the current LC_CTYPE locale.
    ENV_HAVE_LOCALE : the result of setlocale(LC_CTYPE, NULL), or "C" if that result is NULL.
    LOCALE_IS_UTF8  : "utf8".
    otherwise       : "C".
*/
const char *GetLocale(void)
{
  #ifdef ENV_HAVE_LOCALE
    // NULL as locale argument: the locale is only queried, it's not changed.
    // note: ubuntu returns "C" after program start
    const char * const current = setlocale(LC_CTYPE, NULL);
    return current ? current : "C";
  #elif defined(LOCALE_IS_UTF8)
    return "utf8";
  #else
    return "C";
  #endif
}
600370b324cSopenharmony_ci
601370b324cSopenharmony_ci#ifdef _WIN32
602370b324cSopenharmony_ci  static void Set_ForceToUTF8(bool) {}
603370b324cSopenharmony_ci#else
604370b324cSopenharmony_ci  static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; }
605370b324cSopenharmony_ci#endif
606370b324cSopenharmony_ci
607370b324cSopenharmony_cistatic bool Is_Default_Basic_Locale(const char *locale)
608370b324cSopenharmony_ci{
609370b324cSopenharmony_ci  const AString a (locale);
610370b324cSopenharmony_ci  if (a.IsEqualTo_Ascii_NoCase("")
611370b324cSopenharmony_ci      || a.IsEqualTo_Ascii_NoCase("C")
612370b324cSopenharmony_ci      || a.IsEqualTo_Ascii_NoCase("POSIX"))
613370b324cSopenharmony_ci      return true;
614370b324cSopenharmony_ci  return false;
615370b324cSopenharmony_ci}
616370b324cSopenharmony_ci
617370b324cSopenharmony_cistatic bool Is_Default_Basic_Locale()
618370b324cSopenharmony_ci{
619370b324cSopenharmony_ci  return Is_Default_Basic_Locale(GetLocale());
620370b324cSopenharmony_ci}
621370b324cSopenharmony_ci
622370b324cSopenharmony_ci
/*
  MY_SetLocale() selects the C runtime locale of the program and then
  calls Set_ForceToUTF8() with a flag derived from the resulting locale.

  If ENV_HAVE_LOCALE is defined, up to 3 setlocale(LC_ALL, ...) attempts are made:
    attempt 0 : ""  (locale is taken from the environment)
    attempt 1 : UTF-8 locale: "en_US.UTF-8" for __APPLE__, "C.UTF-8" otherwise
    attempt 2 : ""  (locale is taken from the environment again)

  After each attempt (if GetLocale() returned a non-NULL name):
    - if IsNativeUTF8() is true: Set_ForceToUTF8(true) is called and the function returns.
    - else if Is_Default_Basic_Locale(name) is false: the attempts are stopped,
      so that non-default and non-UTF-8 locale is kept.

  After the attempts: Set_ForceToUTF8() is called with true,
  if IsNativeUTF8() is true or Is_Default_Basic_Locale() is true,
  and with false otherwise.

  If ENV_HAVE_LOCALE is not defined, the function does nothing.
*/
void MY_SetLocale()
{
  #ifdef ENV_HAVE_LOCALE

  const unsigned kNumAttempts = 3;

  for (unsigned i = 0; i < kNumAttempts; i++)
  {
    /*
    man7: "If locale is an empty string, "", each part of the locale that
    should be modified is set according to the environment variables."
    for glibc: the value is taken from the user's environment variables:
      1) the environment variable LC_ALL,
      2) the environment variable with the same name as the category,
      3) the environment variable LANG
    The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.

    for WIN32 : MSDN :
      Sets the locale to the default, which is the user-default
      ANSI code page obtained from the operating system.
      The locale name is set to the value returned by GetUserDefaultLocaleName.
      The code page is set to the value returned by GetACP
    */
    const char *newLocale = "";

    if (i == 1)
    {
      #ifdef __APPLE__
      /* look also CFLocale
         there is no C.UTF-8 in macos
         macos has UTF-8 locale only with some language like en_US.UTF-8
         what is best way to set UTF-8 locale in macos?
         file open with non-utf8 sequences returns
           #define EILSEQ    92    // "Illegal byte sequence"
      */
      newLocale = "en_US.UTF-8";
      #else
      /* "C.UTF-8" is the main UTF-8 locale in ubuntu.
         Other candidates:
           ".utf8"       : supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime
           "en_US.utf8"  : supported by ubuntu ?
           "en_US.UTF-8"
         setlocale() in ubuntu allows locales with minor character changes in strings:
           "en_US.UTF-8" / "en_US.utf8"
      */
      newLocale = "C.UTF-8";
      #endif
    }

    // The result of setlocale() is not checked here:
    // the current locale of the program is requested with GetLocale() below.
    setlocale(LC_ALL, newLocale);

    // request current locale of program
    const char *locale = GetLocale();
    if (!locale)
      continue;

    if (IsNativeUTF8())
    {
      Set_ForceToUTF8(true);
      return;
    }

    if (!Is_Default_Basic_Locale(locale))
    {
      // if there is some non-default and non-utf locale, we want to use it
      break; // comment it for debug
    }
  }

  // IsNativeUTF8() is called first, and Is_Default_Basic_Locale() is called
  // only if IsNativeUTF8() returned false.
  Set_ForceToUTF8(IsNativeUTF8() || Is_Default_Basic_Locale());

  #elif defined(LOCALE_IS_UTF8)
    // assume LC_CTYPE="utf8"
  #else
    // assume LC_CTYPE="C"
  #endif
}
756370b324cSopenharmony_ci#endif
757