1370b324cSopenharmony_ci// Common/StringConvert.cpp 2370b324cSopenharmony_ci 3370b324cSopenharmony_ci#include "StdAfx.h" 4370b324cSopenharmony_ci 5370b324cSopenharmony_ci#include "StringConvert.h" 6370b324cSopenharmony_ci 7370b324cSopenharmony_ci#ifndef _WIN32 8370b324cSopenharmony_ci// #include <stdio.h> 9370b324cSopenharmony_ci#include <stdlib.h> 10370b324cSopenharmony_ci#endif 11370b324cSopenharmony_ci 12370b324cSopenharmony_ci#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) 13370b324cSopenharmony_ci#include "UTFConvert.h" 14370b324cSopenharmony_ci#endif 15370b324cSopenharmony_ci 16370b324cSopenharmony_ci#ifdef ENV_HAVE_LOCALE 17370b324cSopenharmony_ci#include <locale.h> 18370b324cSopenharmony_ci#endif 19370b324cSopenharmony_ci 20370b324cSopenharmony_cistatic const char k_DefultChar = '_'; 21370b324cSopenharmony_ci 22370b324cSopenharmony_ci#ifdef _WIN32 23370b324cSopenharmony_ci 24370b324cSopenharmony_ci/* 25370b324cSopenharmony_ciMultiByteToWideChar(CodePage, DWORD dwFlags, 26370b324cSopenharmony_ci LPCSTR lpMultiByteStr, int cbMultiByte, 27370b324cSopenharmony_ci LPWSTR lpWideCharStr, int cchWideChar) 28370b324cSopenharmony_ci 29370b324cSopenharmony_ci if (cbMultiByte == 0) 30370b324cSopenharmony_ci return: 0. ERR: ERROR_INVALID_PARAMETER 31370b324cSopenharmony_ci 32370b324cSopenharmony_ci if (cchWideChar == 0) 33370b324cSopenharmony_ci return: the required buffer size in characters. 34370b324cSopenharmony_ci 35370b324cSopenharmony_ci if (supplied buffer size was not large enough) 36370b324cSopenharmony_ci return: 0. ERR: ERROR_INSUFFICIENT_BUFFER 37370b324cSopenharmony_ci The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex) 38370b324cSopenharmony_ci 39370b324cSopenharmony_ci If there are illegal characters: 40370b324cSopenharmony_ci if MB_ERR_INVALID_CHARS is set in dwFlags: 41370b324cSopenharmony_ci - the function stops conversion on illegal character. 42370b324cSopenharmony_ci - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION. 43370b324cSopenharmony_ci 44370b324cSopenharmony_ci if MB_ERR_INVALID_CHARS is NOT set in dwFlags: 45370b324cSopenharmony_ci before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0. 46370b324cSopenharmony_ci in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal 47370b324cSopenharmony_ci character is converted to U+FFFD, which is REPLACEMENT CHARACTER. 48370b324cSopenharmony_ci*/ 49370b324cSopenharmony_ci 50370b324cSopenharmony_ci 51370b324cSopenharmony_civoid MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) 52370b324cSopenharmony_ci{ 53370b324cSopenharmony_ci dest.Empty(); 54370b324cSopenharmony_ci if (src.IsEmpty()) 55370b324cSopenharmony_ci return; 56370b324cSopenharmony_ci { 57370b324cSopenharmony_ci /* 58370b324cSopenharmony_ci wchar_t *d = dest.GetBuf(src.Len()); 59370b324cSopenharmony_ci const char *s = (const char *)src; 60370b324cSopenharmony_ci unsigned i; 61370b324cSopenharmony_ci 62370b324cSopenharmony_ci for (i = 0;;) 63370b324cSopenharmony_ci { 64370b324cSopenharmony_ci Byte c = (Byte)s[i]; 65370b324cSopenharmony_ci if (c >= 0x80 || c == 0) 66370b324cSopenharmony_ci break; 67370b324cSopenharmony_ci d[i++] = (wchar_t)c; 68370b324cSopenharmony_ci } 69370b324cSopenharmony_ci 70370b324cSopenharmony_ci if (i != src.Len()) 71370b324cSopenharmony_ci { 72370b324cSopenharmony_ci unsigned len = MultiByteToWideChar(codePage, 0, s + i, 73370b324cSopenharmony_ci src.Len() - i, d + i, 74370b324cSopenharmony_ci src.Len() + 1 - i); 75370b324cSopenharmony_ci if (len == 0) 76370b324cSopenharmony_ci throw 282228; 77370b324cSopenharmony_ci i += len; 78370b324cSopenharmony_ci } 79370b324cSopenharmony_ci 80370b324cSopenharmony_ci d[i] = 0; 81370b324cSopenharmony_ci dest.ReleaseBuf_SetLen(i); 82370b324cSopenharmony_ci */ 83370b324cSopenharmony_ci unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0); 84370b324cSopenharmony_ci if (len == 0) 85370b324cSopenharmony_ci { 86370b324cSopenharmony_ci if (GetLastError() != 0) 87370b324cSopenharmony_ci throw 282228; 88370b324cSopenharmony_ci } 89370b324cSopenharmony_ci else 90370b324cSopenharmony_ci { 91370b324cSopenharmony_ci len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len); 92370b324cSopenharmony_ci if (len == 0) 93370b324cSopenharmony_ci throw 282228; 94370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd(len); 95370b324cSopenharmony_ci } 96370b324cSopenharmony_ci } 97370b324cSopenharmony_ci} 98370b324cSopenharmony_ci 99370b324cSopenharmony_ci/* 100370b324cSopenharmony_ci int WideCharToMultiByte( 101370b324cSopenharmony_ci UINT CodePage, DWORD dwFlags, 102370b324cSopenharmony_ci LPCWSTR lpWideCharStr, int cchWideChar, 103370b324cSopenharmony_ci LPSTR lpMultiByteStr, int cbMultiByte, 104370b324cSopenharmony_ci LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar); 105370b324cSopenharmony_ci 106370b324cSopenharmony_ciif (lpDefaultChar == NULL), 107370b324cSopenharmony_ci - it uses system default value. 108370b324cSopenharmony_ci 109370b324cSopenharmony_ciif (CodePage == CP_UTF7 || CodePage == CP_UTF8) 110370b324cSopenharmony_ci if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL) 111370b324cSopenharmony_ci return: 0. ERR: ERROR_INVALID_PARAMETER. 112370b324cSopenharmony_ci 113370b324cSopenharmony_ciThe function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL) 114370b324cSopenharmony_ci 115370b324cSopenharmony_ci*/ 116370b324cSopenharmony_ci 117370b324cSopenharmony_cistatic void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) 118370b324cSopenharmony_ci{ 119370b324cSopenharmony_ci dest.Empty(); 120370b324cSopenharmony_ci defaultCharWasUsed = false; 121370b324cSopenharmony_ci if (src.IsEmpty()) 122370b324cSopenharmony_ci return; 123370b324cSopenharmony_ci { 124370b324cSopenharmony_ci /* 125370b324cSopenharmony_ci unsigned numRequiredBytes = src.Len() * 2; 126370b324cSopenharmony_ci char *d = dest.GetBuf(numRequiredBytes); 127370b324cSopenharmony_ci const wchar_t *s = (const wchar_t *)src; 128370b324cSopenharmony_ci unsigned i; 129370b324cSopenharmony_ci 130370b324cSopenharmony_ci for (i = 0;;) 131370b324cSopenharmony_ci { 132370b324cSopenharmony_ci wchar_t c = s[i]; 133370b324cSopenharmony_ci if (c >= 0x80 || c == 0) 134370b324cSopenharmony_ci break; 135370b324cSopenharmony_ci d[i++] = (char)c; 136370b324cSopenharmony_ci } 137370b324cSopenharmony_ci 138370b324cSopenharmony_ci if (i != src.Len()) 139370b324cSopenharmony_ci { 140370b324cSopenharmony_ci BOOL defUsed = FALSE; 141370b324cSopenharmony_ci defaultChar = defaultChar; 142370b324cSopenharmony_ci 143370b324cSopenharmony_ci bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); 144370b324cSopenharmony_ci unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i, 145370b324cSopenharmony_ci d + i, numRequiredBytes + 1 - i, 146370b324cSopenharmony_ci (isUtf ? NULL : &defaultChar), 147370b324cSopenharmony_ci (isUtf ? NULL : &defUsed)); 148370b324cSopenharmony_ci defaultCharWasUsed = (defUsed != FALSE); 149370b324cSopenharmony_ci if (len == 0) 150370b324cSopenharmony_ci throw 282229; 151370b324cSopenharmony_ci i += len; 152370b324cSopenharmony_ci } 153370b324cSopenharmony_ci 154370b324cSopenharmony_ci d[i] = 0; 155370b324cSopenharmony_ci dest.ReleaseBuf_SetLen(i); 156370b324cSopenharmony_ci */ 157370b324cSopenharmony_ci 158370b324cSopenharmony_ci /* 159370b324cSopenharmony_ci if (codePage != CP_UTF7) 160370b324cSopenharmony_ci { 161370b324cSopenharmony_ci const wchar_t *s = (const wchar_t *)src; 162370b324cSopenharmony_ci unsigned i; 163370b324cSopenharmony_ci for (i = 0;; i++) 164370b324cSopenharmony_ci { 165370b324cSopenharmony_ci wchar_t c = s[i]; 166370b324cSopenharmony_ci if (c >= 0x80 || c == 0) 167370b324cSopenharmony_ci break; 168370b324cSopenharmony_ci } 169370b324cSopenharmony_ci 170370b324cSopenharmony_ci if (s[i] == 0) 171370b324cSopenharmony_ci { 172370b324cSopenharmony_ci char *d = dest.GetBuf(src.Len()); 173370b324cSopenharmony_ci for (i = 0;;) 174370b324cSopenharmony_ci { 175370b324cSopenharmony_ci wchar_t c = s[i]; 176370b324cSopenharmony_ci if (c == 0) 177370b324cSopenharmony_ci break; 178370b324cSopenharmony_ci d[i++] = (char)c; 179370b324cSopenharmony_ci } 180370b324cSopenharmony_ci d[i] = 0; 181370b324cSopenharmony_ci dest.ReleaseBuf_SetLen(i); 182370b324cSopenharmony_ci return; 183370b324cSopenharmony_ci } 184370b324cSopenharmony_ci } 185370b324cSopenharmony_ci */ 186370b324cSopenharmony_ci 187370b324cSopenharmony_ci unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL); 188370b324cSopenharmony_ci if (len == 0) 189370b324cSopenharmony_ci { 190370b324cSopenharmony_ci if (GetLastError() != 0) 191370b324cSopenharmony_ci throw 282228; 192370b324cSopenharmony_ci } 193370b324cSopenharmony_ci else 194370b324cSopenharmony_ci { 195370b324cSopenharmony_ci BOOL defUsed = FALSE; 196370b324cSopenharmony_ci bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7); 197370b324cSopenharmony_ci // defaultChar = defaultChar; 198370b324cSopenharmony_ci len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), 199370b324cSopenharmony_ci dest.GetBuf(len), (int)len, 200370b324cSopenharmony_ci (isUtf ? NULL : &defaultChar), 201370b324cSopenharmony_ci (isUtf ? NULL : &defUsed) 202370b324cSopenharmony_ci ); 203370b324cSopenharmony_ci if (!isUtf) 204370b324cSopenharmony_ci defaultCharWasUsed = (defUsed != FALSE); 205370b324cSopenharmony_ci if (len == 0) 206370b324cSopenharmony_ci throw 282228; 207370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd(len); 208370b324cSopenharmony_ci } 209370b324cSopenharmony_ci } 210370b324cSopenharmony_ci} 211370b324cSopenharmony_ci 212370b324cSopenharmony_ci/* 213370b324cSopenharmony_ci#ifndef UNDER_CE 214370b324cSopenharmony_ciAString SystemStringToOemString(const CSysString &src) 215370b324cSopenharmony_ci{ 216370b324cSopenharmony_ci AString dest; 217370b324cSopenharmony_ci const unsigned len = src.Len() * 2; 218370b324cSopenharmony_ci CharToOem(src, dest.GetBuf(len)); 219370b324cSopenharmony_ci dest.ReleaseBuf_CalcLen(len); 220370b324cSopenharmony_ci return dest; 221370b324cSopenharmony_ci} 222370b324cSopenharmony_ci#endif 223370b324cSopenharmony_ci*/ 224370b324cSopenharmony_ci 225370b324cSopenharmony_ci#else // _WIN32 226370b324cSopenharmony_ci 227370b324cSopenharmony_ci// #include <stdio.h> 228370b324cSopenharmony_ci/* 229370b324cSopenharmony_ci if (wchar_t is 32-bit (#if WCHAR_MAX > 0xffff), 230370b324cSopenharmony_ci and utf-8 string contains big unicode character > 0xffff), 231370b324cSopenharmony_ci then we still use 16-bit surrogate pair in UString. 232370b324cSopenharmony_ci It simplifies another code where utf-16 encoding is used. 233370b324cSopenharmony_ci So we use surrogate-conversion code only in is file. 234370b324cSopenharmony_ci*/ 235370b324cSopenharmony_ci 236370b324cSopenharmony_ci/* 237370b324cSopenharmony_ci mbstowcs() returns error if there is error in utf-8 stream, 238370b324cSopenharmony_ci mbstowcs() returns error if there is single surrogates point (d800-dfff) in utf-8 stream 239370b324cSopenharmony_ci*/ 240370b324cSopenharmony_ci 241370b324cSopenharmony_ci/* 242370b324cSopenharmony_cistatic void MultiByteToUnicodeString2_Native(UString &dest, const AString &src) 243370b324cSopenharmony_ci{ 244370b324cSopenharmony_ci dest.Empty(); 245370b324cSopenharmony_ci if (src.IsEmpty()) 246370b324cSopenharmony_ci return; 247370b324cSopenharmony_ci 248370b324cSopenharmony_ci const size_t limit = ((size_t)src.Len() + 1) * 2; 249370b324cSopenharmony_ci wchar_t *d = dest.GetBuf((unsigned)limit); 250370b324cSopenharmony_ci const size_t len = mbstowcs(d, src, limit); 251370b324cSopenharmony_ci if (len != (size_t)-1) 252370b324cSopenharmony_ci { 253370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd((unsigned)len); 254370b324cSopenharmony_ci return; 255370b324cSopenharmony_ci } 256370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd(0); 257370b324cSopenharmony_ci} 258370b324cSopenharmony_ci*/ 259370b324cSopenharmony_ci 260370b324cSopenharmony_cibool g_ForceToUTF8 = true; // false; 261370b324cSopenharmony_ci 262370b324cSopenharmony_civoid MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage) 263370b324cSopenharmony_ci{ 264370b324cSopenharmony_ci dest.Empty(); 265370b324cSopenharmony_ci if (src.IsEmpty()) 266370b324cSopenharmony_ci return; 267370b324cSopenharmony_ci 268370b324cSopenharmony_ci if (codePage == CP_UTF8 || g_ForceToUTF8) 269370b324cSopenharmony_ci { 270370b324cSopenharmony_ci ConvertUTF8ToUnicode(src, dest); 271370b324cSopenharmony_ci return; 272370b324cSopenharmony_ci } 273370b324cSopenharmony_ci 274370b324cSopenharmony_ci const size_t limit = ((size_t)src.Len() + 1) * 2; 275370b324cSopenharmony_ci wchar_t *d = dest.GetBuf((unsigned)limit); 276370b324cSopenharmony_ci const size_t len = mbstowcs(d, src, limit); 277370b324cSopenharmony_ci if (len != (size_t)-1) 278370b324cSopenharmony_ci { 279370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd((unsigned)len); 280370b324cSopenharmony_ci 281370b324cSopenharmony_ci #if WCHAR_MAX > 0xffff 282370b324cSopenharmony_ci d = dest.GetBuf(); 283370b324cSopenharmony_ci for (size_t i = 0;; i++) 284370b324cSopenharmony_ci { 285370b324cSopenharmony_ci // wchar_t c = dest[i]; 286370b324cSopenharmony_ci wchar_t c = d[i]; 287370b324cSopenharmony_ci if (c == 0) 288370b324cSopenharmony_ci break; 289370b324cSopenharmony_ci if (c >= 0x10000 && c < 0x110000) 290370b324cSopenharmony_ci { 291370b324cSopenharmony_ci /* 292370b324cSopenharmony_ci c -= 0x10000; 293370b324cSopenharmony_ci unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF); 294370b324cSopenharmony_ci dest.ReplaceOneCharAtPos(i, c0); 295370b324cSopenharmony_ci i++; 296370b324cSopenharmony_ci c = 0xdc00 + (c & 0x3FF); 297370b324cSopenharmony_ci dest.Insert_wchar_t(i, c); 298370b324cSopenharmony_ci */ 299370b324cSopenharmony_ci UString temp = d + i; 300370b324cSopenharmony_ci 301370b324cSopenharmony_ci for (size_t t = 0;; t++) 302370b324cSopenharmony_ci { 303370b324cSopenharmony_ci wchar_t w = temp[t]; 304370b324cSopenharmony_ci if (w == 0) 305370b324cSopenharmony_ci break; 306370b324cSopenharmony_ci if (i == limit) 307370b324cSopenharmony_ci break; // unexpected error 308370b324cSopenharmony_ci if (w >= 0x10000 && w < 0x110000) 309370b324cSopenharmony_ci { 310370b324cSopenharmony_ci if (i + 1 == limit) 311370b324cSopenharmony_ci break; // unexpected error 312370b324cSopenharmony_ci w -= 0x10000; 313370b324cSopenharmony_ci d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF); 314370b324cSopenharmony_ci w = 0xdc00 + (w & 0x3FF); 315370b324cSopenharmony_ci } 316370b324cSopenharmony_ci d[i++] = w; 317370b324cSopenharmony_ci } 318370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd((unsigned)i); 319370b324cSopenharmony_ci } 320370b324cSopenharmony_ci } 321370b324cSopenharmony_ci 322370b324cSopenharmony_ci #endif 323370b324cSopenharmony_ci 324370b324cSopenharmony_ci /* 325370b324cSopenharmony_ci printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr()); 326370b324cSopenharmony_ci printf("char: "); 327370b324cSopenharmony_ci for (unsigned i = 0; i < src.Len(); i++) 328370b324cSopenharmony_ci printf (" %02x", (int)(Byte)src[i]); 329370b324cSopenharmony_ci printf("\n"); 330370b324cSopenharmony_ci printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr()); 331370b324cSopenharmony_ci printf("wchar_t: "); 332370b324cSopenharmony_ci for (unsigned i = 0; i < dest.Len(); i++) 333370b324cSopenharmony_ci { 334370b324cSopenharmony_ci printf (" %02x", (int)dest[i]); 335370b324cSopenharmony_ci } 336370b324cSopenharmony_ci printf("\n"); 337370b324cSopenharmony_ci */ 338370b324cSopenharmony_ci 339370b324cSopenharmony_ci return; 340370b324cSopenharmony_ci } 341370b324cSopenharmony_ci 342370b324cSopenharmony_ci /* if there is mbstowcs() error, we have two ways: 343370b324cSopenharmony_ci 344370b324cSopenharmony_ci 1) change 0x80+ characters to some character: '_' 345370b324cSopenharmony_ci in that case we lose data, but we have correct UString() 346370b324cSopenharmony_ci and that scheme can show errors to user in early stages, 347370b324cSopenharmony_ci when file converted back to mbs() cannot be found 348370b324cSopenharmony_ci 349370b324cSopenharmony_ci 2) transfer bad characters in some UTF-16 range. 350370b324cSopenharmony_ci it can be non-original Unicode character. 351370b324cSopenharmony_ci but later we still can restore original character. 352370b324cSopenharmony_ci */ 353370b324cSopenharmony_ci 354370b324cSopenharmony_ci 355370b324cSopenharmony_ci // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr()); 356370b324cSopenharmony_ci { 357370b324cSopenharmony_ci unsigned i; 358370b324cSopenharmony_ci const char *s = (const char *)src; 359370b324cSopenharmony_ci for (i = 0;;) 360370b324cSopenharmony_ci { 361370b324cSopenharmony_ci Byte c = (Byte)s[i]; 362370b324cSopenharmony_ci if (c == 0) 363370b324cSopenharmony_ci break; 364370b324cSopenharmony_ci // we can use ascii compatibilty character '_' 365370b324cSopenharmony_ci // if (c > 0x7F) c = '_'; // we replace "bad: character 366370b324cSopenharmony_ci d[i++] = (wchar_t)c; 367370b324cSopenharmony_ci } 368370b324cSopenharmony_ci d[i] = 0; 369370b324cSopenharmony_ci dest.ReleaseBuf_SetLen(i); 370370b324cSopenharmony_ci } 371370b324cSopenharmony_ci} 372370b324cSopenharmony_ci 373370b324cSopenharmony_cistatic void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src) 374370b324cSopenharmony_ci{ 375370b324cSopenharmony_ci dest.Empty(); 376370b324cSopenharmony_ci if (src.IsEmpty()) 377370b324cSopenharmony_ci return; 378370b324cSopenharmony_ci 379370b324cSopenharmony_ci const size_t limit = ((size_t)src.Len() + 1) * 6; 380370b324cSopenharmony_ci char *d = dest.GetBuf((unsigned)limit); 381370b324cSopenharmony_ci 382370b324cSopenharmony_ci const size_t len = wcstombs(d, src, limit); 383370b324cSopenharmony_ci 384370b324cSopenharmony_ci if (len != (size_t)-1) 385370b324cSopenharmony_ci { 386370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd((unsigned)len); 387370b324cSopenharmony_ci return; 388370b324cSopenharmony_ci } 389370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd(0); 390370b324cSopenharmony_ci} 391370b324cSopenharmony_ci 392370b324cSopenharmony_ci 393370b324cSopenharmony_cistatic void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed) 394370b324cSopenharmony_ci{ 395370b324cSopenharmony_ci // if (codePage == 1234567) // for debug purposes 396370b324cSopenharmony_ci if (codePage == CP_UTF8 || g_ForceToUTF8) 397370b324cSopenharmony_ci { 398370b324cSopenharmony_ci defaultCharWasUsed = false; 399370b324cSopenharmony_ci ConvertUnicodeToUTF8(src2, dest); 400370b324cSopenharmony_ci return; 401370b324cSopenharmony_ci } 402370b324cSopenharmony_ci 403370b324cSopenharmony_ci UString src = src2; 404370b324cSopenharmony_ci #if WCHAR_MAX > 0xffff 405370b324cSopenharmony_ci { 406370b324cSopenharmony_ci src.Empty(); 407370b324cSopenharmony_ci for (unsigned i = 0; i < src2.Len();) 408370b324cSopenharmony_ci { 409370b324cSopenharmony_ci wchar_t c = src2[i]; 410370b324cSopenharmony_ci if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len()) 411370b324cSopenharmony_ci { 412370b324cSopenharmony_ci const wchar_t c2 = src2[i + 1]; 413370b324cSopenharmony_ci if (c2 >= 0xdc00 && c2 < 0x10000) 414370b324cSopenharmony_ci { 415370b324cSopenharmony_ci // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2); 416370b324cSopenharmony_ci c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); 417370b324cSopenharmony_ci // printf("%4x\n", (int)c); 418370b324cSopenharmony_ci i++; 419370b324cSopenharmony_ci } 420370b324cSopenharmony_ci } 421370b324cSopenharmony_ci src += c; 422370b324cSopenharmony_ci i++; 423370b324cSopenharmony_ci } 424370b324cSopenharmony_ci } 425370b324cSopenharmony_ci #endif 426370b324cSopenharmony_ci 427370b324cSopenharmony_ci dest.Empty(); 428370b324cSopenharmony_ci defaultCharWasUsed = false; 429370b324cSopenharmony_ci if (src.IsEmpty()) 430370b324cSopenharmony_ci return; 431370b324cSopenharmony_ci 432370b324cSopenharmony_ci const size_t len = wcstombs(NULL, src, 0); 433370b324cSopenharmony_ci 434370b324cSopenharmony_ci if (len != (size_t)-1) 435370b324cSopenharmony_ci { 436370b324cSopenharmony_ci const unsigned limit = ((unsigned)len); 437370b324cSopenharmony_ci if (limit == len) 438370b324cSopenharmony_ci { 439370b324cSopenharmony_ci char *d = dest.GetBuf(limit); 440370b324cSopenharmony_ci 441370b324cSopenharmony_ci /* 442370b324cSopenharmony_ci { 443370b324cSopenharmony_ci printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr()); 444370b324cSopenharmony_ci for (unsigned i = 0; i < src.Len(); i++) 445370b324cSopenharmony_ci printf (" %02x", (int)src[i]); 446370b324cSopenharmony_ci printf("\n"); 447370b324cSopenharmony_ci printf("\ndest Limit = %d \n", limit); 448370b324cSopenharmony_ci } 449370b324cSopenharmony_ci */ 450370b324cSopenharmony_ci 451370b324cSopenharmony_ci const size_t len2 = wcstombs(d, src, len + 1); 452370b324cSopenharmony_ci 453370b324cSopenharmony_ci if (len2 != (size_t)-1 && len2 <= limit) 454370b324cSopenharmony_ci { 455370b324cSopenharmony_ci /* 456370b324cSopenharmony_ci printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr()); 457370b324cSopenharmony_ci for (unsigned i = 0; i < len2; i++) 458370b324cSopenharmony_ci printf(" %02x", (int)(Byte)dest[i]); 459370b324cSopenharmony_ci printf("\n"); 460370b324cSopenharmony_ci */ 461370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd((unsigned)len2); 462370b324cSopenharmony_ci return; 463370b324cSopenharmony_ci } 464370b324cSopenharmony_ci } 465370b324cSopenharmony_ci } 466370b324cSopenharmony_ci 467370b324cSopenharmony_ci { 468370b324cSopenharmony_ci const wchar_t *s = (const wchar_t *)src; 469370b324cSopenharmony_ci char *d = dest.GetBuf(src.Len()); 470370b324cSopenharmony_ci 471370b324cSopenharmony_ci unsigned i; 472370b324cSopenharmony_ci for (i = 0;;) 473370b324cSopenharmony_ci { 474370b324cSopenharmony_ci wchar_t c = s[i]; 475370b324cSopenharmony_ci if (c == 0) 476370b324cSopenharmony_ci break; 477370b324cSopenharmony_ci if (c >= 478370b324cSopenharmony_ci 0x100 479370b324cSopenharmony_ci // 0x80 480370b324cSopenharmony_ci ) 481370b324cSopenharmony_ci { 482370b324cSopenharmony_ci c = defaultChar; 483370b324cSopenharmony_ci defaultCharWasUsed = true; 484370b324cSopenharmony_ci } 485370b324cSopenharmony_ci 486370b324cSopenharmony_ci d[i++] = (char)c; 487370b324cSopenharmony_ci } 488370b324cSopenharmony_ci d[i] = 0; 489370b324cSopenharmony_ci dest.ReleaseBuf_SetLen(i); 490370b324cSopenharmony_ci /* 491370b324cSopenharmony_ci printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len()); 492370b324cSopenharmony_ci printf("ERROR: %s\n", dest.Ptr()); 493370b324cSopenharmony_ci */ 494370b324cSopenharmony_ci } 495370b324cSopenharmony_ci} 496370b324cSopenharmony_ci 497370b324cSopenharmony_ci#endif // _WIN32 498370b324cSopenharmony_ci 499370b324cSopenharmony_ci 500370b324cSopenharmony_ciUString MultiByteToUnicodeString(const AString &src, UINT codePage) 501370b324cSopenharmony_ci{ 502370b324cSopenharmony_ci UString dest; 503370b324cSopenharmony_ci MultiByteToUnicodeString2(dest, src, codePage); 504370b324cSopenharmony_ci return dest; 505370b324cSopenharmony_ci} 506370b324cSopenharmony_ci 507370b324cSopenharmony_ciUString MultiByteToUnicodeString(const char *src, UINT codePage) 508370b324cSopenharmony_ci{ 509370b324cSopenharmony_ci return MultiByteToUnicodeString(AString(src), codePage); 510370b324cSopenharmony_ci} 511370b324cSopenharmony_ci 512370b324cSopenharmony_ci 513370b324cSopenharmony_civoid UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage) 514370b324cSopenharmony_ci{ 515370b324cSopenharmony_ci bool defaultCharWasUsed; 516370b324cSopenharmony_ci UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed); 517370b324cSopenharmony_ci} 518370b324cSopenharmony_ci 519370b324cSopenharmony_ciAString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed) 520370b324cSopenharmony_ci{ 521370b324cSopenharmony_ci AString dest; 522370b324cSopenharmony_ci UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed); 523370b324cSopenharmony_ci return dest; 524370b324cSopenharmony_ci} 525370b324cSopenharmony_ci 526370b324cSopenharmony_ciAString UnicodeStringToMultiByte(const UString &src, UINT codePage) 527370b324cSopenharmony_ci{ 528370b324cSopenharmony_ci AString dest; 529370b324cSopenharmony_ci bool defaultCharWasUsed; 530370b324cSopenharmony_ci UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed); 531370b324cSopenharmony_ci return dest; 532370b324cSopenharmony_ci} 533370b324cSopenharmony_ci 534370b324cSopenharmony_ci 535370b324cSopenharmony_ci 536370b324cSopenharmony_ci 537370b324cSopenharmony_ci#if !defined(_WIN32) || defined(ENV_HAVE_LOCALE) 538370b324cSopenharmony_ci 539370b324cSopenharmony_ci#ifdef _WIN32 540370b324cSopenharmony_ci#define U_to_A(a, b, c) UnicodeStringToMultiByte2 541370b324cSopenharmony_ci// #define A_to_U(a, b, c) MultiByteToUnicodeString2 542370b324cSopenharmony_ci#else 543370b324cSopenharmony_ci// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src); 544370b324cSopenharmony_ci#define U_to_A(a, b, c) UnicodeStringToMultiByte2_Native(a, b) 545370b324cSopenharmony_ci// #define A_to_U(a, b, c) MultiByteToUnicodeString2_Native(a, b) 546370b324cSopenharmony_ci#endif 547370b324cSopenharmony_ci 548370b324cSopenharmony_cibool IsNativeUTF8() 549370b324cSopenharmony_ci{ 550370b324cSopenharmony_ci UString u; 551370b324cSopenharmony_ci AString a, a2; 552370b324cSopenharmony_ci // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1) 553370b324cSopenharmony_ci for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1) 554370b324cSopenharmony_ci { 555370b324cSopenharmony_ci u.Empty(); 556370b324cSopenharmony_ci u += (wchar_t)c; 557370b324cSopenharmony_ci /* 558370b324cSopenharmony_ci if (Unicode_Is_There_Utf16SurrogateError(u)) 559370b324cSopenharmony_ci continue; 560370b324cSopenharmony_ci #ifndef _WIN32 561370b324cSopenharmony_ci if (Unicode_Is_There_BmpEscape(u)) 562370b324cSopenharmony_ci continue; 563370b324cSopenharmony_ci #endif 564370b324cSopenharmony_ci */ 565370b324cSopenharmony_ci ConvertUnicodeToUTF8(u, a); 566370b324cSopenharmony_ci U_to_A(a2, u, CP_OEMCP); 567370b324cSopenharmony_ci if (a != a2) 568370b324cSopenharmony_ci return false; 569370b324cSopenharmony_ci } 570370b324cSopenharmony_ci return true; 571370b324cSopenharmony_ci} 572370b324cSopenharmony_ci 573370b324cSopenharmony_ci#endif 574370b324cSopenharmony_ci 575370b324cSopenharmony_ci 576370b324cSopenharmony_ci#ifdef ENV_HAVE_LOCALE 577370b324cSopenharmony_ci 578370b324cSopenharmony_ciconst char *GetLocale(void) 579370b324cSopenharmony_ci{ 580370b324cSopenharmony_ci #ifdef ENV_HAVE_LOCALE 581370b324cSopenharmony_ci // printf("\n\nsetlocale(LC_CTYPE, NULL) : return : "); 582370b324cSopenharmony_ci const char *s = setlocale(LC_CTYPE, NULL); 583370b324cSopenharmony_ci if (!s) 584370b324cSopenharmony_ci { 585370b324cSopenharmony_ci // printf("[NULL]\n"); 586370b324cSopenharmony_ci s = "C"; 587370b324cSopenharmony_ci } 588370b324cSopenharmony_ci else 589370b324cSopenharmony_ci { 590370b324cSopenharmony_ci // ubuntu returns "C" after program start 591370b324cSopenharmony_ci // printf("\"%s\"\n", s); 592370b324cSopenharmony_ci } 593370b324cSopenharmony_ci return s; 594370b324cSopenharmony_ci #elif defined(LOCALE_IS_UTF8) 595370b324cSopenharmony_ci return "utf8"; 596370b324cSopenharmony_ci #else 597370b324cSopenharmony_ci return "C"; 598370b324cSopenharmony_ci #endif 599370b324cSopenharmony_ci} 600370b324cSopenharmony_ci 601370b324cSopenharmony_ci#ifdef _WIN32 602370b324cSopenharmony_ci static void Set_ForceToUTF8(bool) {} 603370b324cSopenharmony_ci#else 604370b324cSopenharmony_ci static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; } 605370b324cSopenharmony_ci#endif 606370b324cSopenharmony_ci 607370b324cSopenharmony_cistatic bool Is_Default_Basic_Locale(const char *locale) 608370b324cSopenharmony_ci{ 609370b324cSopenharmony_ci const AString a (locale); 610370b324cSopenharmony_ci if (a.IsEqualTo_Ascii_NoCase("") 611370b324cSopenharmony_ci || a.IsEqualTo_Ascii_NoCase("C") 612370b324cSopenharmony_ci || a.IsEqualTo_Ascii_NoCase("POSIX")) 613370b324cSopenharmony_ci return true; 614370b324cSopenharmony_ci return false; 615370b324cSopenharmony_ci} 616370b324cSopenharmony_ci 617370b324cSopenharmony_cistatic bool Is_Default_Basic_Locale() 618370b324cSopenharmony_ci{ 619370b324cSopenharmony_ci return Is_Default_Basic_Locale(GetLocale()); 620370b324cSopenharmony_ci} 621370b324cSopenharmony_ci 622370b324cSopenharmony_ci 623370b324cSopenharmony_civoid MY_SetLocale() 624370b324cSopenharmony_ci{ 625370b324cSopenharmony_ci #ifdef ENV_HAVE_LOCALE 626370b324cSopenharmony_ci /* 627370b324cSopenharmony_ci { 628370b324cSopenharmony_ci const char *s = GetLocale(); 629370b324cSopenharmony_ci printf("\nGetLocale() : returned : \"%s\"\n", s); 630370b324cSopenharmony_ci } 631370b324cSopenharmony_ci */ 632370b324cSopenharmony_ci 633370b324cSopenharmony_ci unsigned start = 0; 634370b324cSopenharmony_ci // unsigned lim = 0; 635370b324cSopenharmony_ci unsigned lim = 3; 636370b324cSopenharmony_ci 637370b324cSopenharmony_ci /* 638370b324cSopenharmony_ci #define MY_SET_LOCALE_FLAGS__FROM_ENV 1 639370b324cSopenharmony_ci #define MY_SET_LOCALE_FLAGS__TRY_UTF8 2 640370b324cSopenharmony_ci 641370b324cSopenharmony_ci unsigned flags = 642370b324cSopenharmony_ci MY_SET_LOCALE_FLAGS__FROM_ENV | 643370b324cSopenharmony_ci MY_SET_LOCALE_FLAGS__TRY_UTF8 644370b324cSopenharmony_ci 645370b324cSopenharmony_ci if (flags != 0) 646370b324cSopenharmony_ci { 647370b324cSopenharmony_ci if (flags & MY_SET_LOCALE_FLAGS__FROM_ENV) 648370b324cSopenharmony_ci lim = (flags & MY_SET_LOCALE_FLAGS__TRY_UTF8) ? 3 : 1; 649370b324cSopenharmony_ci else 650370b324cSopenharmony_ci { 651370b324cSopenharmony_ci start = 1; 652370b324cSopenharmony_ci lim = 2; 653370b324cSopenharmony_ci } 654370b324cSopenharmony_ci } 655370b324cSopenharmony_ci */ 656370b324cSopenharmony_ci 657370b324cSopenharmony_ci for (unsigned i = start; i < lim; i++) 658370b324cSopenharmony_ci { 659370b324cSopenharmony_ci /* 660370b324cSopenharmony_ci man7: "If locale is an empty string, "", each part of the locale that 661370b324cSopenharmony_ci should be modified is set according to the environment variables. 662370b324cSopenharmony_ci for glibc: glibc, first from the user's environment variables: 663370b324cSopenharmony_ci 1) the environment variable LC_ALL, 664370b324cSopenharmony_ci 2) environment variable with the same name as the category (see the 665370b324cSopenharmony_ci 3) the environment variable LANG 666370b324cSopenharmony_ci The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems. 667370b324cSopenharmony_ci 668370b324cSopenharmony_ci for WIN32 : MSDN : 669370b324cSopenharmony_ci Sets the locale to the default, which is the user-default 670370b324cSopenharmony_ci ANSI code page obtained from the operating system. 671370b324cSopenharmony_ci The locale name is set to the value returned by GetUserDefaultLocaleName. 672370b324cSopenharmony_ci The code page is set to the value returned by GetACP 673370b324cSopenharmony_ci */ 674370b324cSopenharmony_ci const char *newLocale = ""; 675370b324cSopenharmony_ci 676370b324cSopenharmony_ci #ifdef __APPLE__ 677370b324cSopenharmony_ci 678370b324cSopenharmony_ci /* look also CFLocale 679370b324cSopenharmony_ci there is no C.UTF-8 in macos 680370b324cSopenharmony_ci macos has UTF-8 locale only with some language like en_US.UTF-8 681370b324cSopenharmony_ci what is best way to set UTF-8 locale in macos? */ 682370b324cSopenharmony_ci if (i == 1) 683370b324cSopenharmony_ci newLocale = "en_US.UTF-8"; 684370b324cSopenharmony_ci 685370b324cSopenharmony_ci /* file open with non-utf8 sequencies return 686370b324cSopenharmony_ci #define EILSEQ 92 // "Illegal byte sequence" 687370b324cSopenharmony_ci */ 688370b324cSopenharmony_ci#else 689370b324cSopenharmony_ci // newLocale = "C"; 690370b324cSopenharmony_ci if (i == 1) 691370b324cSopenharmony_ci { 692370b324cSopenharmony_ci newLocale = "C.UTF-8"; // main UTF-8 locale in ubuntu 693370b324cSopenharmony_ci // newLocale = ".utf8"; // supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime 694370b324cSopenharmony_ci // newLocale = "en_US.utf8"; // supported by ubuntu ? 695370b324cSopenharmony_ci // newLocale = "en_US.UTF-8"; 696370b324cSopenharmony_ci /* setlocale() in ubuntu allows locales with minor chracter changes in strings 697370b324cSopenharmony_ci "en_US.UTF-8" / "en_US.utf8" */ 698370b324cSopenharmony_ci } 699370b324cSopenharmony_ci 700370b324cSopenharmony_ci#endif 701370b324cSopenharmony_ci 702370b324cSopenharmony_ci // printf("\nsetlocale(LC_ALL, \"%s\") : returned: ", newLocale); 703370b324cSopenharmony_ci 704370b324cSopenharmony_ci // const char *s = 705370b324cSopenharmony_ci setlocale(LC_ALL, newLocale); 706370b324cSopenharmony_ci 707370b324cSopenharmony_ci /* 708370b324cSopenharmony_ci if (!s) 709370b324cSopenharmony_ci printf("NULL: can't set locale"); 710370b324cSopenharmony_ci else 711370b324cSopenharmony_ci printf("\"%s\"\n", s); 712370b324cSopenharmony_ci */ 713370b324cSopenharmony_ci 714370b324cSopenharmony_ci // request curent locale of program 715370b324cSopenharmony_ci const char *locale = GetLocale(); 716370b324cSopenharmony_ci if (locale) 717370b324cSopenharmony_ci { 718370b324cSopenharmony_ci AString a (locale); 719370b324cSopenharmony_ci a.MakeLower_Ascii(); 720370b324cSopenharmony_ci // if (a.Find("utf") >= 0) 721370b324cSopenharmony_ci { 722370b324cSopenharmony_ci if (IsNativeUTF8()) 723370b324cSopenharmony_ci { 724370b324cSopenharmony_ci Set_ForceToUTF8(true); 725370b324cSopenharmony_ci return; 726370b324cSopenharmony_ci } 727370b324cSopenharmony_ci } 728370b324cSopenharmony_ci if (!Is_Default_Basic_Locale(locale)) 729370b324cSopenharmony_ci { 730370b324cSopenharmony_ci // if there is some non-default and non-utf locale, we want to use it 731370b324cSopenharmony_ci break; // comment it for debug 732370b324cSopenharmony_ci } 733370b324cSopenharmony_ci } 734370b324cSopenharmony_ci } 735370b324cSopenharmony_ci 736370b324cSopenharmony_ci if (IsNativeUTF8()) 737370b324cSopenharmony_ci { 738370b324cSopenharmony_ci Set_ForceToUTF8(true); 739370b324cSopenharmony_ci return; 740370b324cSopenharmony_ci } 741370b324cSopenharmony_ci 742370b324cSopenharmony_ci if (Is_Default_Basic_Locale()) 743370b324cSopenharmony_ci { 744370b324cSopenharmony_ci Set_ForceToUTF8(true); 745370b324cSopenharmony_ci return; 746370b324cSopenharmony_ci } 747370b324cSopenharmony_ci 748370b324cSopenharmony_ci Set_ForceToUTF8(false); 749370b324cSopenharmony_ci 750370b324cSopenharmony_ci #elif defined(LOCALE_IS_UTF8) 751370b324cSopenharmony_ci // assume LC_CTYPE="utf8" 752370b324cSopenharmony_ci #else 753370b324cSopenharmony_ci // assume LC_CTYPE="C" 754370b324cSopenharmony_ci #endif 755370b324cSopenharmony_ci} 756370b324cSopenharmony_ci#endif 757