1// Common/StringConvert.cpp 2 3#include "StdAfx.h" 4 5#include "StringConvert.h" 6 7#ifndef _WIN32 8// #include <stdio.h> 9#include <stdlib.h> 10#endif 11 12#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) 13#include "UTFConvert.h" 14#endif 15 16#ifdef ENV_HAVE_LOCALE 17#include <locale.h> 18#endif 19 20static const char k_DefultChar = '_'; 21 22#ifdef _WIN32 23 24/* 25MultiByteToWideChar(CodePage, DWORD dwFlags, 26 LPCSTR lpMultiByteStr, int cbMultiByte, 27 LPWSTR lpWideCharStr, int cchWideChar) 28 29 if (cbMultiByte == 0) 30 return: 0. ERR: ERROR_INVALID_PARAMETER 31 32 if (cchWideChar == 0) 33 return: the required buffer size in characters. 34 35 if (supplied buffer size was not large enough) 36 return: 0. ERR: ERROR_INSUFFICIENT_BUFFER 37 The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex) 38 39 If there are illegal characters: 40 if MB_ERR_INVALID_CHARS is set in dwFlags: 41 - the function stops conversion on illegal character. 42 - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION. 43 44 if MB_ERR_INVALID_CHARS is NOT set in dwFlags: 45 before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0. 46 in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal 47 character is converted to U+FFFD, which is REPLACEMENT CHARACTER. 48*/ 49 50 51void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) 52{ 53 dest.Empty(); 54 if (src.IsEmpty()) 55 return; 56 { 57 /* 58 wchar_t *d = dest.GetBuf(src.Len()); 59 const char *s = (const char *)src; 60 unsigned i; 61 62 for (i = 0;;) 63 { 64 Byte c = (Byte)s[i]; 65 if (c >= 0x80 || c == 0) 66 break; 67 d[i++] = (wchar_t)c; 68 } 69 70 if (i != src.Len()) 71 { 72 unsigned len = MultiByteToWideChar(codePage, 0, s + i, 73 src.Len() - i, d + i, 74 src.Len() + 1 - i); 75 if (len == 0) 76 throw 282228; 77 i += len; 78 } 79 80 d[i] = 0; 81 dest.ReleaseBuf_SetLen(i); 82 */ 83 unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0); 84 if (len == 0) 85 { 86 if (GetLastError() != 0) 87 throw 282228; 88 } 89 else 90 { 91 len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len); 92 if (len == 0) 93 throw 282228; 94 dest.ReleaseBuf_SetEnd(len); 95 } 96 } 97} 98 99/* 100 int WideCharToMultiByte( 101 UINT CodePage, DWORD dwFlags, 102 LPCWSTR lpWideCharStr, int cchWideChar, 103 LPSTR lpMultiByteStr, int cbMultiByte, 104 LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar); 105 106if (lpDefaultChar == NULL), 107 - it uses system default value. 108 109if (CodePage == CP_UTF7 || CodePage == CP_UTF8) 110 if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL) 111 return: 0. ERR: ERROR_INVALID_PARAMETER. 112 113The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL) 114 115*/ 116 117static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) 118{ 119 dest.Empty(); 120 defaultCharWasUsed = false; 121 if (src.IsEmpty()) 122 return; 123 { 124 /* 125 unsigned numRequiredBytes = src.Len() * 2; 126 char *d = dest.GetBuf(numRequiredBytes); 127 const wchar_t *s = (const wchar_t *)src; 128 unsigned i; 129 130 for (i = 0;;) 131 { 132 wchar_t c = s[i]; 133 if (c >= 0x80 || c == 0) 134 break; 135 d[i++] = (char)c; 136 } 137 138 if (i != src.Len()) 139 { 140 BOOL defUsed = FALSE; 141 defaultChar = defaultChar; 142 143 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); 144 unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i, 145 d + i, numRequiredBytes + 1 - i, 146 (isUtf ? NULL : &defaultChar), 147 (isUtf ? NULL : &defUsed)); 148 defaultCharWasUsed = (defUsed != FALSE); 149 if (len == 0) 150 throw 282229; 151 i += len; 152 } 153 154 d[i] = 0; 155 dest.ReleaseBuf_SetLen(i); 156 */ 157 158 /* 159 if (codePage != CP_UTF7) 160 { 161 const wchar_t *s = (const wchar_t *)src; 162 unsigned i; 163 for (i = 0;; i++) 164 { 165 wchar_t c = s[i]; 166 if (c >= 0x80 || c == 0) 167 break; 168 } 169 170 if (s[i] == 0) 171 { 172 char *d = dest.GetBuf(src.Len()); 173 for (i = 0;;) 174 { 175 wchar_t c = s[i]; 176 if (c == 0) 177 break; 178 d[i++] = (char)c; 179 } 180 d[i] = 0; 181 dest.ReleaseBuf_SetLen(i); 182 return; 183 } 184 } 185 */ 186 187 unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL); 188 if (len == 0) 189 { 190 if (GetLastError() != 0) 191 throw 282228; 192 } 193 else 194 { 195 BOOL defUsed = FALSE; 196 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); 197 // defaultChar = defaultChar; 198 len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), 199 dest.GetBuf(len), (int)len, 200 (isUtf ? NULL : &defaultChar), 201 (isUtf ? NULL : &defUsed) 202 ); 203 if (!isUtf) 204 defaultCharWasUsed = (defUsed != FALSE); 205 if (len == 0) 206 throw 282228; 207 dest.ReleaseBuf_SetEnd(len); 208 } 209 } 210} 211 212/* 213#ifndef UNDER_CE 214AString SystemStringToOemString(const CSysString &src) 215{ 216 AString dest; 217 const unsigned len = src.Len() * 2; 218 CharToOem(src, dest.GetBuf(len)); 219 dest.ReleaseBuf_CalcLen(len); 220 return dest; 221} 222#endif 223*/ 224 225#else // _WIN32 226 227// #include <stdio.h> 228/* 229 if (wchar_t is 32-bit (#if WCHAR_MAX > 0xffff), 230 and utf-8 string contains big unicode character > 0xffff), 231 then we still use 16-bit surrogate pair in UString. 232 It simplifies another code where utf-16 encoding is used. 233 So we use surrogate-conversion code only in is file. 234*/ 235 236/* 237 mbstowcs() returns error if there is error in utf-8 stream, 238 mbstowcs() returns error if there is single surrogates point (d800-dfff) in utf-8 stream 239*/ 240 241/* 242static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src) 243{ 244 dest.Empty(); 245 if (src.IsEmpty()) 246 return; 247 248 const size_t limit = ((size_t)src.Len() + 1) * 2; 249 wchar_t *d = dest.GetBuf((unsigned)limit); 250 const size_t len = mbstowcs(d, src, limit); 251 if (len != (size_t)-1) 252 { 253 dest.ReleaseBuf_SetEnd((unsigned)len); 254 return; 255 } 256 dest.ReleaseBuf_SetEnd(0); 257} 258*/ 259 260bool g_ForceToUTF8 = true; // false; 261 262void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) 263{ 264 dest.Empty(); 265 if (src.IsEmpty()) 266 return; 267 268 if (codePage == CP_UTF8 || g_ForceToUTF8) 269 { 270 ConvertUTF8ToUnicode(src, dest); 271 return; 272 } 273 274 const size_t limit = ((size_t)src.Len() + 1) * 2; 275 wchar_t *d = dest.GetBuf((unsigned)limit); 276 const size_t len = mbstowcs(d, src, limit); 277 if (len != (size_t)-1) 278 { 279 dest.ReleaseBuf_SetEnd((unsigned)len); 280 281 #if WCHAR_MAX > 0xffff 282 d = dest.GetBuf(); 283 for (size_t i = 0;; i++) 284 { 285 // wchar_t c = dest[i]; 286 wchar_t c = d[i]; 287 if (c == 0) 288 break; 289 if (c >= 0x10000 && c < 0x110000) 290 { 291 /* 292 c -= 0x10000; 293 unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF); 294 dest.ReplaceOneCharAtPos(i, c0); 295 i++; 296 c = 0xdc00 + (c & 0x3FF); 297 dest.Insert_wchar_t(i, c); 298 */ 299 UString temp = d + i; 300 301 for (size_t t = 0;; t++) 302 { 303 wchar_t w = temp[t]; 304 if (w == 0) 305 break; 306 if (i == limit) 307 break; // unexpected error 308 if (w >= 0x10000 && w < 0x110000) 309 { 310 if (i + 1 == limit) 311 break; // unexpected error 312 w -= 0x10000; 313 d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF); 314 w = 0xdc00 + (w & 0x3FF); 315 } 316 d[i++] = w; 317 } 318 dest.ReleaseBuf_SetEnd((unsigned)i); 319 } 320 } 321 322 #endif 323 324 /* 325 printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr()); 326 printf("char: "); 327 for (unsigned i = 0; i < src.Len(); i++) 328 printf (" %02x", (int)(Byte)src[i]); 329 printf("\n"); 330 printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr()); 331 printf("wchar_t: "); 332 for (unsigned i = 0; i < dest.Len(); i++) 333 { 334 printf (" %02x", (int)dest[i]); 335 } 336 printf("\n"); 337 */ 338 339 return; 340 } 341 342 /* if there is mbstowcs() error, we have two ways: 343 344 1) change 0x80+ characters to some character: '_' 345 in that case we lose data, but we have correct UString() 346 and that scheme can show errors to user in early stages, 347 when file converted back to mbs() cannot be found 348 349 2) transfer bad characters in some UTF-16 range. 350 it can be non-original Unicode character. 351 but later we still can restore original character. 352 */ 353 354 355 // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr()); 356 { 357 unsigned i; 358 const char *s = (const char *)src; 359 for (i = 0;;) 360 { 361 Byte c = (Byte)s[i]; 362 if (c == 0) 363 break; 364 // we can use ascii compatibilty character '_' 365 // if (c > 0x7F) c = '_'; // we replace "bad: character 366 d[i++] = (wchar_t)c; 367 } 368 d[i] = 0; 369 dest.ReleaseBuf_SetLen(i); 370 } 371} 372 373static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src) 374{ 375 dest.Empty(); 376 if (src.IsEmpty()) 377 return; 378 379 const size_t limit = ((size_t)src.Len() + 1) * 6; 380 char *d = dest.GetBuf((unsigned)limit); 381 382 const size_t len = wcstombs(d, src, limit); 383 384 if (len != (size_t)-1) 385 { 386 dest.ReleaseBuf_SetEnd((unsigned)len); 387 return; 388 } 389 dest.ReleaseBuf_SetEnd(0); 390} 391 392 393static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed) 394{ 395 // if (codePage == 1234567) // for debug purposes 396 if (codePage == CP_UTF8 || g_ForceToUTF8) 397 { 398 defaultCharWasUsed = false; 399 ConvertUnicodeToUTF8(src2, dest); 400 return; 401 } 402 403 UString src = src2; 404 #if WCHAR_MAX > 0xffff 405 { 406 src.Empty(); 407 for (unsigned i = 0; i < src2.Len();) 408 { 409 wchar_t c = src2[i]; 410 if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len()) 411 { 412 const wchar_t c2 = src2[i + 1]; 413 if (c2 >= 0xdc00 && c2 < 0x10000) 414 { 415 // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2); 416 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); 417 // printf("%4x\n", (int)c); 418 i++; 419 } 420 } 421 src += c; 422 i++; 423 } 424 } 425 #endif 426 427 dest.Empty(); 428 defaultCharWasUsed = false; 429 if (src.IsEmpty()) 430 return; 431 432 const size_t len = wcstombs(NULL, src, 0); 433 434 if (len != (size_t)-1) 435 { 436 const unsigned limit = ((unsigned)len); 437 if (limit == len) 438 { 439 char *d = dest.GetBuf(limit); 440 441 /* 442 { 443 printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr()); 444 for (unsigned i = 0; i < src.Len(); i++) 445 printf (" %02x", (int)src[i]); 446 printf("\n"); 447 printf("\ndest Limit = %d \n", limit); 448 } 449 */ 450 451 const size_t len2 = wcstombs(d, src, len + 1); 452 453 if (len2 != (size_t)-1 && len2 <= limit) 454 { 455 /* 456 printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr()); 457 for (unsigned i = 0; i < len2; i++) 458 printf(" %02x", (int)(Byte)dest[i]); 459 printf("\n"); 460 */ 461 dest.ReleaseBuf_SetEnd((unsigned)len2); 462 return; 463 } 464 } 465 } 466 467 { 468 const wchar_t *s = (const wchar_t *)src; 469 char *d = dest.GetBuf(src.Len()); 470 471 unsigned i; 472 for (i = 0;;) 473 { 474 wchar_t c = s[i]; 475 if (c == 0) 476 break; 477 if (c >= 478 0x100 479 // 0x80 480 ) 481 { 482 c = defaultChar; 483 defaultCharWasUsed = true; 484 } 485 486 d[i++] = (char)c; 487 } 488 d[i] = 0; 489 dest.ReleaseBuf_SetLen(i); 490 /* 491 printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len()); 492 printf("ERROR: %s\n", dest.Ptr()); 493 */ 494 } 495} 496 497#endif // _WIN32 498 499 500UString MultiByteToUnicodeString(const AString &src, UINT codePage) 501{ 502 UString dest; 503 MultiByteToUnicodeString2(dest, src, codePage); 504 return dest; 505} 506 507UString MultiByteToUnicodeString(const char *src, UINT codePage) 508{ 509 return MultiByteToUnicodeString(AString(src), codePage); 510} 511 512 513void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage) 514{ 515 bool defaultCharWasUsed; 516 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed); 517} 518 519AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) 520{ 521 AString dest; 522 UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed); 523 return dest; 524} 525 526AString UnicodeStringToMultiByte(const UString &src, UINT codePage) 527{ 528 AString dest; 529 bool defaultCharWasUsed; 530 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed); 531 return dest; 532} 533 534 535 536 537#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) 538 539#ifdef _WIN32 540#define U_to_A(a, b, c) UnicodeStringToMultiByte2 541// #define A_to_U(a, b, c) MultiByteToUnicodeString2 542#else 543// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src); 544#define U_to_A(a, b, c) UnicodeStringToMultiByte2_Native(a, b) 545// #define A_to_U(a, b, c) MultiByteToUnicodeString2_Native(a, b) 546#endif 547 548bool IsNativeUTF8() 549{ 550 UString u; 551 AString a, a2; 552 // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1) 553 for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1) 554 { 555 u.Empty(); 556 u += (wchar_t)c; 557 /* 558 if (Unicode_Is_There_Utf16SurrogateError(u)) 559 continue; 560 #ifndef _WIN32 561 if (Unicode_Is_There_BmpEscape(u)) 562 continue; 563 #endif 564 */ 565 ConvertUnicodeToUTF8(u, a); 566 U_to_A(a2, u, CP_OEMCP); 567 if (a != a2) 568 return false; 569 } 570 return true; 571} 572 573#endif 574 575 576#ifdef ENV_HAVE_LOCALE 577 578const char *GetLocale(void) 579{ 580 #ifdef ENV_HAVE_LOCALE 581 // printf("\n\nsetlocale(LC_CTYPE, NULL) : return : "); 582 const char *s = setlocale(LC_CTYPE, NULL); 583 if (!s) 584 { 585 // printf("[NULL]\n"); 586 s = "C"; 587 } 588 else 589 { 590 // ubuntu returns "C" after program start 591 // printf("\"%s\"\n", s); 592 } 593 return s; 594 #elif defined(LOCALE_IS_UTF8) 595 return "utf8"; 596 #else 597 return "C"; 598 #endif 599} 600 601#ifdef _WIN32 602 static void Set_ForceToUTF8(bool) {} 603#else 604 static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; } 605#endif 606 607static bool Is_Default_Basic_Locale(const char *locale) 608{ 609 const AString a (locale); 610 if (a.IsEqualTo_Ascii_NoCase("") 611 || a.IsEqualTo_Ascii_NoCase("C") 612 || a.IsEqualTo_Ascii_NoCase("POSIX")) 613 return true; 614 return false; 615} 616 617static bool Is_Default_Basic_Locale() 618{ 619 return Is_Default_Basic_Locale(GetLocale()); 620} 621 622 623void MY_SetLocale() 624{ 625 #ifdef ENV_HAVE_LOCALE 626 /* 627 { 628 const char *s = GetLocale(); 629 printf("\nGetLocale() : returned : \"%s\"\n", s); 630 } 631 */ 632 633 unsigned start = 0; 634 // unsigned lim = 0; 635 unsigned lim = 3; 636 637 /* 638 #define MY_SET_LOCALE_FLAGS__FROM_ENV 1 639 #define MY_SET_LOCALE_FLAGS__TRY_UTF8 2 640 641 unsigned flags = 642 MY_SET_LOCALE_FLAGS__FROM_ENV | 643 MY_SET_LOCALE_FLAGS__TRY_UTF8 644 645 if (flags != 0) 646 { 647 if (flags & MY_SET_LOCALE_FLAGS__FROM_ENV) 648 lim = (flags & MY_SET_LOCALE_FLAGS__TRY_UTF8) ? 3 : 1; 649 else 650 { 651 start = 1; 652 lim = 2; 653 } 654 } 655 */ 656 657 for (unsigned i = start; i < lim; i++) 658 { 659 /* 660 man7: "If locale is an empty string, "", each part of the locale that 661 should be modified is set according to the environment variables. 662 for glibc: glibc, first from the user's environment variables: 663 1) the environment variable LC_ALL, 664 2) environment variable with the same name as the category (see the 665 3) the environment variable LANG 666 The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems. 667 668 for WIN32 : MSDN : 669 Sets the locale to the default, which is the user-default 670 ANSI code page obtained from the operating system. 671 The locale name is set to the value returned by GetUserDefaultLocaleName. 672 The code page is set to the value returned by GetACP 673 */ 674 const char *newLocale = ""; 675 676 #ifdef __APPLE__ 677 678 /* look also CFLocale 679 there is no C.UTF-8 in macos 680 macos has UTF-8 locale only with some language like en_US.UTF-8 681 what is best way to set UTF-8 locale in macos? */ 682 if (i == 1) 683 newLocale = "en_US.UTF-8"; 684 685 /* file open with non-utf8 sequencies return 686 #define EILSEQ 92 // "Illegal byte sequence" 687 */ 688#else 689 // newLocale = "C"; 690 if (i == 1) 691 { 692 newLocale = "C.UTF-8"; // main UTF-8 locale in ubuntu 693 // newLocale = ".utf8"; // supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime 694 // newLocale = "en_US.utf8"; // supported by ubuntu ? 695 // newLocale = "en_US.UTF-8"; 696 /* setlocale() in ubuntu allows locales with minor chracter changes in strings 697 "en_US.UTF-8" / "en_US.utf8" */ 698 } 699 700#endif 701 702 // printf("\nsetlocale(LC_ALL, \"%s\") : returned: ", newLocale); 703 704 // const char *s = 705 setlocale(LC_ALL, newLocale); 706 707 /* 708 if (!s) 709 printf("NULL: can't set locale"); 710 else 711 printf("\"%s\"\n", s); 712 */ 713 714 // request curent locale of program 715 const char *locale = GetLocale(); 716 if (locale) 717 { 718 AString a (locale); 719 a.MakeLower_Ascii(); 720 // if (a.Find("utf") >= 0) 721 { 722 if (IsNativeUTF8()) 723 { 724 Set_ForceToUTF8(true); 725 return; 726 } 727 } 728 if (!Is_Default_Basic_Locale(locale)) 729 { 730 // if there is some non-default and non-utf locale, we want to use it 731 break; // comment it for debug 732 } 733 } 734 } 735 736 if (IsNativeUTF8()) 737 { 738 Set_ForceToUTF8(true); 739 return; 740 } 741 742 if (Is_Default_Basic_Locale()) 743 { 744 Set_ForceToUTF8(true); 745 return; 746 } 747 748 Set_ForceToUTF8(false); 749 750 #elif defined(LOCALE_IS_UTF8) 751 // assume LC_CTYPE="utf8" 752 #else 753 // assume LC_CTYPE="C" 754 #endif 755} 756#endif 757