1// Common/StringConvert.cpp
2
3#include "StdAfx.h"
4
5#include "StringConvert.h"
6
7#ifndef _WIN32
8// #include <stdio.h>
9#include <stdlib.h>
10#endif
11
12#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
13#include "UTFConvert.h"
14#endif
15
16#ifdef ENV_HAVE_LOCALE
17#include <locale.h>
18#endif
19
// Replacement character that is passed as (defaultChar) by the UnicodeStringToMultiByte*()
// overloads in this file that have no explicit (defaultChar) parameter.
static const char k_DefultChar = '_';
21
22#ifdef _WIN32
23
24/*
25MultiByteToWideChar(CodePage, DWORD dwFlags,
26    LPCSTR lpMultiByteStr, int cbMultiByte,
27    LPWSTR lpWideCharStr, int cchWideChar)
28
29  if (cbMultiByte == 0)
30    return: 0. ERR: ERROR_INVALID_PARAMETER
31
32  if (cchWideChar == 0)
33    return: the required buffer size in characters.
34
35  if (supplied buffer size was not large enough)
36    return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
37    The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
38
39  If there are illegal characters:
40    if MB_ERR_INVALID_CHARS is set in dwFlags:
41      - the function stops conversion on illegal character.
42      - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
43
44    if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
45      before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
46      in Vista+:    illegal character is not dropped (MSDN). Undocumented: illegal
47                    character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
48*/
49
50
51void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
52{
53  dest.Empty();
54  if (src.IsEmpty())
55    return;
56  {
57    /*
58    wchar_t *d = dest.GetBuf(src.Len());
59    const char *s = (const char *)src;
60    unsigned i;
61
62    for (i = 0;;)
63    {
64      Byte c = (Byte)s[i];
65      if (c >= 0x80 || c == 0)
66        break;
67      d[i++] = (wchar_t)c;
68    }
69
70    if (i != src.Len())
71    {
72      unsigned len = MultiByteToWideChar(codePage, 0, s + i,
73          src.Len() - i, d + i,
74          src.Len() + 1 - i);
75      if (len == 0)
76        throw 282228;
77      i += len;
78    }
79
80    d[i] = 0;
81    dest.ReleaseBuf_SetLen(i);
82    */
83    unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0);
84    if (len == 0)
85    {
86      if (GetLastError() != 0)
87        throw 282228;
88    }
89    else
90    {
91      len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len);
92      if (len == 0)
93        throw 282228;
94      dest.ReleaseBuf_SetEnd(len);
95    }
96  }
97}
98
99/*
100  int WideCharToMultiByte(
101      UINT CodePage, DWORD dwFlags,
102      LPCWSTR lpWideCharStr, int cchWideChar,
103      LPSTR lpMultiByteStr, int cbMultiByte,
104      LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
105
106if (lpDefaultChar == NULL),
107  - it uses system default value.
108
109if (CodePage == CP_UTF7 || CodePage == CP_UTF8)
110  if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL)
111    return: 0. ERR: ERROR_INVALID_PARAMETER.
112
113The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
114
115*/
116
117static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
118{
119  dest.Empty();
120  defaultCharWasUsed = false;
121  if (src.IsEmpty())
122    return;
123  {
124    /*
125    unsigned numRequiredBytes = src.Len() * 2;
126    char *d = dest.GetBuf(numRequiredBytes);
127    const wchar_t *s = (const wchar_t *)src;
128    unsigned i;
129
130    for (i = 0;;)
131    {
132      wchar_t c = s[i];
133      if (c >= 0x80 || c == 0)
134        break;
135      d[i++] = (char)c;
136    }
137
138    if (i != src.Len())
139    {
140      BOOL defUsed = FALSE;
141      defaultChar = defaultChar;
142
143      bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
144      unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i,
145          d + i, numRequiredBytes + 1 - i,
146          (isUtf ? NULL : &defaultChar),
147          (isUtf ? NULL : &defUsed));
148      defaultCharWasUsed = (defUsed != FALSE);
149      if (len == 0)
150        throw 282229;
151      i += len;
152    }
153
154    d[i] = 0;
155    dest.ReleaseBuf_SetLen(i);
156    */
157
158    /*
159    if (codePage != CP_UTF7)
160    {
161      const wchar_t *s = (const wchar_t *)src;
162      unsigned i;
163      for (i = 0;; i++)
164      {
165        wchar_t c = s[i];
166        if (c >= 0x80 || c == 0)
167          break;
168      }
169
170      if (s[i] == 0)
171      {
172        char *d = dest.GetBuf(src.Len());
173        for (i = 0;;)
174        {
175          wchar_t c = s[i];
176          if (c == 0)
177            break;
178          d[i++] = (char)c;
179        }
180        d[i] = 0;
181        dest.ReleaseBuf_SetLen(i);
182        return;
183      }
184    }
185    */
186
187    unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL);
188    if (len == 0)
189    {
190      if (GetLastError() != 0)
191        throw 282228;
192    }
193    else
194    {
195      BOOL defUsed = FALSE;
196      bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
197      // defaultChar = defaultChar;
198      len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(),
199          dest.GetBuf(len), (int)len,
200          (isUtf ? NULL : &defaultChar),
201          (isUtf ? NULL : &defUsed)
202          );
203      if (!isUtf)
204        defaultCharWasUsed = (defUsed != FALSE);
205      if (len == 0)
206        throw 282228;
207      dest.ReleaseBuf_SetEnd(len);
208    }
209  }
210}
211
212/*
213#ifndef UNDER_CE
214AString SystemStringToOemString(const CSysString &src)
215{
216  AString dest;
217  const unsigned len = src.Len() * 2;
218  CharToOem(src, dest.GetBuf(len));
219  dest.ReleaseBuf_CalcLen(len);
220  return dest;
221}
222#endif
223*/
224
225#else // _WIN32
226
227// #include <stdio.h>
/*
  If wchar_t is 32-bit (#if WCHAR_MAX > 0xffff) and the utf-8 string contains
  a big unicode character (> 0xffff), then we still use a 16-bit surrogate pair in UString.
  It simplifies other code where utf-16 encoding is used.
  So we use surrogate-conversion code only in this file.
*/
235
/*
   mbstowcs() returns an error if there is an error in the utf-8 stream,
   mbstowcs() returns an error if there is a single surrogate code point (d800-dfff) in the utf-8 stream
*/
240
241/*
242static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
243{
244  dest.Empty();
245  if (src.IsEmpty())
246    return;
247
248  const size_t limit = ((size_t)src.Len() + 1) * 2;
249  wchar_t *d = dest.GetBuf((unsigned)limit);
250  const size_t len = mbstowcs(d, src, limit);
251  if (len != (size_t)-1)
252  {
253    dest.ReleaseBuf_SetEnd((unsigned)len);
254    return;
255  }
256  dest.ReleaseBuf_SetEnd(0);
257}
258*/
259
// If true, the non-Windows conversion functions in this file use UTF-8 conversion
// regardless of the (codePage) argument.
bool g_ForceToUTF8 = true; // false;
261
262void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
263{
264  dest.Empty();
265  if (src.IsEmpty())
266    return;
267
268  if (codePage == CP_UTF8 || g_ForceToUTF8)
269  {
270    ConvertUTF8ToUnicode(src, dest);
271    return;
272  }
273
274  const size_t limit = ((size_t)src.Len() + 1) * 2;
275  wchar_t *d = dest.GetBuf((unsigned)limit);
276  const size_t len = mbstowcs(d, src, limit);
277  if (len != (size_t)-1)
278  {
279    dest.ReleaseBuf_SetEnd((unsigned)len);
280
281    #if WCHAR_MAX > 0xffff
282    d = dest.GetBuf();
283    for (size_t i = 0;; i++)
284    {
285      // wchar_t c = dest[i];
286      wchar_t c = d[i];
287      if (c == 0)
288        break;
289      if (c >= 0x10000 && c < 0x110000)
290      {
291        /*
292        c -= 0x10000;
293        unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF);
294        dest.ReplaceOneCharAtPos(i, c0);
295        i++;
296        c = 0xdc00 + (c & 0x3FF);
297        dest.Insert_wchar_t(i, c);
298        */
299        UString temp = d + i;
300
301        for (size_t t = 0;; t++)
302        {
303          wchar_t w = temp[t];
304          if (w == 0)
305            break;
306          if (i == limit)
307            break; // unexpected error
308          if (w >= 0x10000 && w < 0x110000)
309          {
310            if (i + 1 == limit)
311              break; // unexpected error
312            w -= 0x10000;
313            d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF);
314            w = 0xdc00 + (w & 0x3FF);
315          }
316          d[i++] = w;
317        }
318        dest.ReleaseBuf_SetEnd((unsigned)i);
319      }
320    }
321
322    #endif
323
324    /*
325    printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(),  src.Ptr());
326    printf("char:    ");
327    for (unsigned i = 0; i < src.Len(); i++)
328      printf (" %02x", (int)(Byte)src[i]);
329    printf("\n");
330    printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr());
331    printf("wchar_t: ");
332    for (unsigned i = 0; i < dest.Len(); i++)
333    {
334      printf (" %02x", (int)dest[i]);
335    }
336    printf("\n");
337    */
338
339    return;
340  }
341
342  /* if there is mbstowcs() error, we have two ways:
343
344     1) change 0x80+ characters to some character: '_'
345        in that case we lose data, but we have correct UString()
346        and that scheme can show errors to user in early stages,
347        when file converted back to mbs() cannot be found
348
349     2) transfer bad characters in some UTF-16 range.
350        it can be non-original Unicode character.
351        but later we still can restore original character.
352  */
353
354
355  // printf("\nmbstowcs  ERROR !!!!!! s=%s\n", src.Ptr());
356  {
357    unsigned i;
358    const char *s = (const char *)src;
359    for (i = 0;;)
360    {
361      Byte c = (Byte)s[i];
362      if (c == 0)
363        break;
364      // we can use ascii compatibilty character '_'
365      // if (c > 0x7F) c = '_'; // we replace "bad: character
366      d[i++] = (wchar_t)c;
367    }
368    d[i] = 0;
369    dest.ReleaseBuf_SetLen(i);
370  }
371}
372
373static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
374{
375  dest.Empty();
376  if (src.IsEmpty())
377    return;
378
379  const size_t limit = ((size_t)src.Len() + 1) * 6;
380  char *d = dest.GetBuf((unsigned)limit);
381
382  const size_t len = wcstombs(d, src, limit);
383
384  if (len != (size_t)-1)
385  {
386    dest.ReleaseBuf_SetEnd((unsigned)len);
387    return;
388  }
389  dest.ReleaseBuf_SetEnd(0);
390}
391
392
393static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
394{
395  // if (codePage == 1234567) // for debug purposes
396  if (codePage == CP_UTF8 || g_ForceToUTF8)
397  {
398    defaultCharWasUsed = false;
399    ConvertUnicodeToUTF8(src2, dest);
400    return;
401  }
402
403  UString src = src2;
404  #if WCHAR_MAX > 0xffff
405  {
406    src.Empty();
407    for (unsigned i = 0; i < src2.Len();)
408    {
409      wchar_t c = src2[i];
410      if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len())
411      {
412        const wchar_t c2 = src2[i + 1];
413        if (c2 >= 0xdc00 && c2 < 0x10000)
414        {
415          // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
416          c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
417          // printf("%4x\n", (int)c);
418          i++;
419        }
420      }
421      src += c;
422      i++;
423    }
424  }
425  #endif
426
427  dest.Empty();
428  defaultCharWasUsed = false;
429  if (src.IsEmpty())
430    return;
431
432  const size_t len = wcstombs(NULL, src, 0);
433
434  if (len != (size_t)-1)
435  {
436    const unsigned limit = ((unsigned)len);
437    if (limit == len)
438    {
439      char *d = dest.GetBuf(limit);
440
441      /*
442      {
443        printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr());
444        for (unsigned i = 0; i < src.Len(); i++)
445          printf (" %02x", (int)src[i]);
446        printf("\n");
447        printf("\ndest Limit = %d \n", limit);
448      }
449      */
450
451      const size_t len2 = wcstombs(d, src, len + 1);
452
453      if (len2 != (size_t)-1 && len2 <= limit)
454      {
455        /*
456        printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr());
457        for (unsigned i = 0; i < len2; i++)
458          printf(" %02x", (int)(Byte)dest[i]);
459        printf("\n");
460        */
461        dest.ReleaseBuf_SetEnd((unsigned)len2);
462        return;
463      }
464    }
465  }
466
467  {
468    const wchar_t *s = (const wchar_t *)src;
469    char *d = dest.GetBuf(src.Len());
470
471    unsigned i;
472    for (i = 0;;)
473    {
474      wchar_t c = s[i];
475      if (c == 0)
476        break;
477      if (c >=
478            0x100
479            // 0x80
480          )
481      {
482        c = defaultChar;
483        defaultCharWasUsed = true;
484      }
485
486      d[i++] = (char)c;
487    }
488    d[i] = 0;
489    dest.ReleaseBuf_SetLen(i);
490    /*
491    printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len());
492    printf("ERROR: %s\n", dest.Ptr());
493    */
494  }
495}
496
497#endif // _WIN32
498
499
500UString MultiByteToUnicodeString(const AString &src, UINT codePage)
501{
502  UString dest;
503  MultiByteToUnicodeString2(dest, src, codePage);
504  return dest;
505}
506
507UString MultiByteToUnicodeString(const char *src, UINT codePage)
508{
509  return MultiByteToUnicodeString(AString(src), codePage);
510}
511
512
513void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
514{
515  bool defaultCharWasUsed;
516  UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
517}
518
519AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
520{
521  AString dest;
522  UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed);
523  return dest;
524}
525
526AString UnicodeStringToMultiByte(const UString &src, UINT codePage)
527{
528  AString dest;
529  bool defaultCharWasUsed;
530  UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
531  return dest;
532}
533
534
535
536
537#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
538
// U_to_A(a, b, c) : converts UString (b) to AString (a) with system (native) conversion.
//   _WIN32 : UnicodeStringToMultiByte2(a, b, c) is called, where (c) is code page.
//   else   : UnicodeStringToMultiByte2_Native(a, b) is called, and (c) is ignored.
// The macro must expand to a call with arguments. A bare function name
// would be an expression statement that converts nothing.
#ifdef _WIN32
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2(a, b, c)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2
#else
// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2_Native(a, b)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2_Native(a, b)
#endif
547
548bool IsNativeUTF8()
549{
550  UString u;
551  AString a, a2;
552  // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1)
553  for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1)
554  {
555    u.Empty();
556    u += (wchar_t)c;
557    /*
558    if (Unicode_Is_There_Utf16SurrogateError(u))
559      continue;
560    #ifndef _WIN32
561    if (Unicode_Is_There_BmpEscape(u))
562      continue;
563    #endif
564    */
565    ConvertUnicodeToUTF8(u, a);
566    U_to_A(a2, u, CP_OEMCP);
567    if (a != a2)
568      return false;
569  }
570  return true;
571}
572
573#endif
574
575
576#ifdef ENV_HAVE_LOCALE
577
/*
  Returns the name of current LC_CTYPE locale.
    ENV_HAVE_LOCALE : the name is requested with setlocale(LC_CTYPE, NULL),
                      "C" is returned, if setlocale() returns NULL.
    LOCALE_IS_UTF8  : "utf8"
    otherwise       : "C"
*/
const char *GetLocale(void)
{
  #ifdef ENV_HAVE_LOCALE
    // NULL as locale name means the request without any change of locale.
    // ubuntu returns "C" after program start
    const char * const current = setlocale(LC_CTYPE, NULL);
    return current ? current : "C";
  #elif defined(LOCALE_IS_UTF8)
    return "utf8";
  #else
    return "C";
  #endif
}
600
601#ifdef _WIN32
602  static void Set_ForceToUTF8(bool) {}
603#else
604  static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; }
605#endif
606
607static bool Is_Default_Basic_Locale(const char *locale)
608{
609  const AString a (locale);
610  if (a.IsEqualTo_Ascii_NoCase("")
611      || a.IsEqualTo_Ascii_NoCase("C")
612      || a.IsEqualTo_Ascii_NoCase("POSIX"))
613      return true;
614  return false;
615}
616
617static bool Is_Default_Basic_Locale()
618{
619  return Is_Default_Basic_Locale(GetLocale());
620}
621
622
/*
  Sets the locale of program and selects the conversion mode (Set_ForceToUTF8).

  If ENV_HAVE_LOCALE is defined, up to 3 attempts of setlocale(LC_ALL) are made:
    attempt 0 : ""  (locale from environment)
    attempt 1 : "C.UTF-8" ("en_US.UTF-8" for __APPLE__)
    attempt 2 : ""  (locale from environment again)
  After each attempt:
    - if IsNativeUTF8() returns true, UTF-8 mode is forced, and the function returns.
    - if current locale is not basic one ("", "C", "POSIX"), the attempts are stopped.
  After attempts: UTF-8 mode is forced, if IsNativeUTF8() returns true
  or if current locale is basic one. Otherwise UTF-8 mode is switched off.

  If ENV_HAVE_LOCALE is not defined, the function does nothing.
*/
void MY_SetLocale()
{
  #ifdef ENV_HAVE_LOCALE

  /*
    man7: "If locale is an empty string, "", each part of the locale that
    should be modified is set according to the environment variables.
    for glibc: first from the user's environment variables:
      1) the environment variable LC_ALL,
      2) environment variable with the same name as the category
      3) the environment variable LANG
    The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.

    for WIN32 : MSDN :
      Sets the locale to the default, which is the user-default
      ANSI code page obtained from the operating system.
      The locale name is set to the value returned by GetUserDefaultLocaleName.
      The code page is set to the value returned by GetACP
  */

  #ifdef __APPLE__
  /* look also CFLocale
     there is no C.UTF-8 in macos
     macos has UTF-8 locale only with some language like en_US.UTF-8
     what is best way to set UTF-8 locale in macos?
     file open with non-utf8 sequences returns
       #define EILSEQ    92    // "Illegal byte sequence"
  */
  const char * const utf8Locale = "en_US.UTF-8";
  #else
  /* "C.UTF-8" is main UTF-8 locale in ubuntu.
     another variants:
       ".utf8"       : supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime
       "en_US.utf8"  : supported by ubuntu ?
       "en_US.UTF-8"
     setlocale() in ubuntu allows locales with minor character changes in strings
       "en_US.UTF-8" /  "en_US.utf8" */
  const char * const utf8Locale = "C.UTF-8";
  #endif

  const unsigned firstAttempt = 0;
  const unsigned numAttempts = 3;

  for (unsigned attempt = firstAttempt; attempt < numAttempts; attempt++)
  {
    // only attempt 1 uses explicit UTF-8 locale; another attempts use the environment
    const char * const newLocale = (attempt == 1) ? utf8Locale : "";

    // result of setlocale() is not checked here: we request current locale below
    setlocale(LC_ALL, newLocale);

    // request current locale of program
    const char * const locale = GetLocale();
    if (!locale)
      continue;

    AString a (locale);
    a.MakeLower_Ascii();
    // if (a.Find("utf") >= 0)
    {
      if (IsNativeUTF8())
      {
        Set_ForceToUTF8(true);
        return;
      }
    }

    // if there is some non-default and non-utf locale, we want to use it
    if (!Is_Default_Basic_Locale(locale))
      break; // comment it for debug
  }

  // native UTF-8 conversion or basic locale: we force UTF-8 mode
  const bool forceUtf8 = IsNativeUTF8() || Is_Default_Basic_Locale();
  Set_ForceToUTF8(forceUtf8);

  #elif defined(LOCALE_IS_UTF8)
    // assume LC_CTYPE="utf8"
  #else
    // assume LC_CTYPE="C"
  #endif
}
756#endif
757