// UTFConvert.cpp

#include "StdAfx.h"

// #include <stdio.h>

#include "MyTypes.h"
#include "UTFConvert.h"


#ifndef Z7_WCHART_IS_16BIT
#ifndef __APPLE__
  // we define it if the system supports files with non-utf8 symbols:
  #define MY_UTF8_RAW_NON_UTF8_SUPPORTED
#endif
#endif

/*
  MY_UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte

  n : MY_UTF8_START(n) : Bits of code point

  0 : 0x80 :    : unused
  1 : 0xC0 : 11 :
  2 : 0xE0 : 16 : Basic Multilingual Plane
  3 : 0xF0 : 21 : Unicode space
  4 : 0xF8 : 26 :
  5 : 0xFC : 31 : UCS-4 : wcstombs() in ubuntu is limited to that value
  6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
  7 : 0xFF :
*/

#define MY_UTF8_START(n) (0x100 - (1 << (7 - (n))))

/*
  MY_UTF8_HEAD_PARSE2(n) is one link of an if / else-if chain over the head byte.
  It expects these names in the scope where it is expanded:
    c        : the head byte
    numBytes : receives (n), the number of tail bytes that must follow
    val      : must already hold (c); the head marker bits MY_UTF8_START(n) are subtracted
  The test is only an upper bound (c < MY_UTF8_START(n + 1)), so the links must be
  chained in increasing (n) order without gaps.
*/
#define MY_UTF8_HEAD_PARSE2(n) \
    if (c < MY_UTF8_START((n) + 1)) \
    { numBytes = (n); val -= MY_UTF8_START(n); }

#ifndef Z7_WCHART_IS_16BIT

/*
   if (wchar_t is 32-bit), we can support large points in long UTF-8 sequence,
   when we convert wchar_t strings to UTF-8:
     (_UTF8_NUM_TAIL_BYTES_MAX == 3) : (21-bits points) - Unicode
     (_UTF8_NUM_TAIL_BYTES_MAX == 5) : (31-bits points) - UCS-4
     (_UTF8_NUM_TAIL_BYTES_MAX == 6) : (36-bit hack)
*/

#define MY_UTF8_NUM_TAIL_BYTES_MAX 5
#endif

/*
#define MY_UTF8_HEAD_PARSE \
    UInt32 val = c; \
      MY_UTF8_HEAD_PARSE2(1) \
    else MY_UTF8_HEAD_PARSE2(2) \
    else MY_UTF8_HEAD_PARSE2(3) \
    else MY_UTF8_HEAD_PARSE2(4) \
    else MY_UTF8_HEAD_PARSE2(5) \
  #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
    else MY_UTF8_HEAD_PARSE2(6)
  #endif
*/

/*
  Head parsing for callers that have already rejected head bytes above the
  3-tail-byte range: anything that is not a 1- or 2-tail-byte head is taken
  as a 3-tail-byte head. Declares (val) and expects (c) and (numBytes) in scope.
*/
#define MY_UTF8_HEAD_PARSE_MAX_3_BYTES \
    UInt32 val = c; \
      MY_UTF8_HEAD_PARSE2(1) \
    else MY_UTF8_HEAD_PARSE2(2) \
    else { numBytes = 3; val -= MY_UTF8_START(3); }


// MY_UTF8_RANGE(n) = 2 ^ (5 * n + 6): the first value that needs more than (n) tail bytes
// (0x800 for n = 1, 0x10000 for n = 2, 0x200000 for n = 3, ...)
#define MY_UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))


// points at or above this value are written to wchar_t strings as a surrogate pair
#define START_POINT_FOR_SURROGATE 0x10000


/* we use 128 bytes block in 16-bit BMP-PLANE to encode non-UTF-8 Escapes
   Also we can use additional HIGH-PLANE (we use 21-bit points above 0x1f0000)
   to simplify internal intermediate conversion in Linux:
   RAW-UTF-8 <-> internal wchar_t utf-16 strings <-> RAW-UTF-8
*/


#if defined(Z7_WCHART_IS_16BIT)

#define UTF_ESCAPE_PLANE 0

#else

/*
we can place 128 ESCAPE chars to
   ef 80 -    ee be 80 (3-bytes utf-8) : similar to WSL
   ef ff -    ee bf bf

1f ef 80 - f7 be be 80 (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
1f ef ff - f7 be bf bf (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
*/

// #define UTF_ESCAPE_PLANE_HIGH  (0x1f << 16)
// #define UTF_ESCAPE_PLANE        UTF_ESCAPE_PLANE_HIGH
#define UTF_ESCAPE_PLANE 0

/*
  if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set)
  {
    if (UTF_ESCAPE_PLANE is UTF_ESCAPE_PLANE_HIGH)
    {
      we can restore any 8-bit Escape from ESCAPE-PLANE-21 plane.
      But ESCAPE-PLANE-21 point cannot be stored to utf-16 (7z archive)
      So we still need a way to extract 8-bit Escapes and BMP-Escapes-8
      from same BMP-Escapes-16 stored in 7z.
113370b324cSopenharmony_ci And if we want to restore any 8-bit from 7z archive, 114370b324cSopenharmony_ci we still must use Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT for (utf-8 -> utf-16) 115370b324cSopenharmony_ci Also we need additional Conversions to tranform from utf-16 to utf-16-With-Escapes-21 116370b324cSopenharmony_ci } 117370b324cSopenharmony_ci else (UTF_ESCAPE_PLANE == 0) 118370b324cSopenharmony_ci { 119370b324cSopenharmony_ci we must convert original 3-bytes utf-8 BMP-Escape point to sequence 120370b324cSopenharmony_ci of 3 BMP-Escape-16 points with Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT 121370b324cSopenharmony_ci so we can extract original RAW-UTF-8 from UTFD-16 later. 122370b324cSopenharmony_ci } 123370b324cSopenharmony_ci } 124370b324cSopenharmony_ci*/ 125370b324cSopenharmony_ci 126370b324cSopenharmony_ci#endif 127370b324cSopenharmony_ci 128370b324cSopenharmony_ci 129370b324cSopenharmony_ci 130370b324cSopenharmony_ci#define UTF_ESCAPE_BASE 0xef00 131370b324cSopenharmony_ci 132370b324cSopenharmony_ci 133370b324cSopenharmony_ci#ifdef UTF_ESCAPE_BASE 134370b324cSopenharmony_ci#define IS_ESCAPE_POINT(v, plane) (((v) & (UInt32)0xffffff80) == (plane) + UTF_ESCAPE_BASE + 0x80) 135370b324cSopenharmony_ci#endif 136370b324cSopenharmony_ci 137370b324cSopenharmony_ci#define IS_SURROGATE_POINT(v) (((v) & (UInt32)0xfffff800) == 0xd800) 138370b324cSopenharmony_ci#define IS_LOW_SURROGATE_POINT(v) (((v) & (UInt32)0xfffffC00) == 0xdc00) 139370b324cSopenharmony_ci 140370b324cSopenharmony_ci 141370b324cSopenharmony_ci#define UTF_ERROR_UTF8_CHECK \ 142370b324cSopenharmony_ci { NonUtf = true; continue; } 143370b324cSopenharmony_ci 144370b324cSopenharmony_civoid CUtf8Check::Check_Buf(const char *src, size_t size) throw() 145370b324cSopenharmony_ci{ 146370b324cSopenharmony_ci Clear(); 147370b324cSopenharmony_ci // Byte maxByte = 0; 148370b324cSopenharmony_ci 149370b324cSopenharmony_ci for (;;) 150370b324cSopenharmony_ci { 151370b324cSopenharmony_ci if (size == 0) 
152370b324cSopenharmony_ci break; 153370b324cSopenharmony_ci 154370b324cSopenharmony_ci const Byte c = (Byte)(*src++); 155370b324cSopenharmony_ci size--; 156370b324cSopenharmony_ci 157370b324cSopenharmony_ci if (c == 0) 158370b324cSopenharmony_ci { 159370b324cSopenharmony_ci ZeroChar = true; 160370b324cSopenharmony_ci continue; 161370b324cSopenharmony_ci } 162370b324cSopenharmony_ci 163370b324cSopenharmony_ci /* 164370b324cSopenharmony_ci if (c > maxByte) 165370b324cSopenharmony_ci maxByte = c; 166370b324cSopenharmony_ci */ 167370b324cSopenharmony_ci 168370b324cSopenharmony_ci if (c < 0x80) 169370b324cSopenharmony_ci continue; 170370b324cSopenharmony_ci 171370b324cSopenharmony_ci if (c < 0xc0 + 2) // it's limit for 0x140000 unicode codes : win32 compatibility 172370b324cSopenharmony_ci UTF_ERROR_UTF8_CHECK 173370b324cSopenharmony_ci 174370b324cSopenharmony_ci unsigned numBytes; 175370b324cSopenharmony_ci 176370b324cSopenharmony_ci UInt32 val = c; 177370b324cSopenharmony_ci MY_UTF8_HEAD_PARSE2(1) 178370b324cSopenharmony_ci else MY_UTF8_HEAD_PARSE2(2) 179370b324cSopenharmony_ci else MY_UTF8_HEAD_PARSE2(4) 180370b324cSopenharmony_ci else MY_UTF8_HEAD_PARSE2(5) 181370b324cSopenharmony_ci else 182370b324cSopenharmony_ci { 183370b324cSopenharmony_ci UTF_ERROR_UTF8_CHECK 184370b324cSopenharmony_ci } 185370b324cSopenharmony_ci 186370b324cSopenharmony_ci unsigned pos = 0; 187370b324cSopenharmony_ci do 188370b324cSopenharmony_ci { 189370b324cSopenharmony_ci if (pos == size) 190370b324cSopenharmony_ci break; 191370b324cSopenharmony_ci unsigned c2 = (Byte)src[pos]; 192370b324cSopenharmony_ci c2 -= 0x80; 193370b324cSopenharmony_ci if (c2 >= 0x40) 194370b324cSopenharmony_ci break; 195370b324cSopenharmony_ci val <<= 6; 196370b324cSopenharmony_ci val |= c2; 197370b324cSopenharmony_ci if (pos == 0) 198370b324cSopenharmony_ci if (val < (((unsigned)1 << 7) >> numBytes)) 199370b324cSopenharmony_ci break; 200370b324cSopenharmony_ci pos++; 201370b324cSopenharmony_ci } 
202370b324cSopenharmony_ci while (--numBytes); 203370b324cSopenharmony_ci 204370b324cSopenharmony_ci if (numBytes != 0) 205370b324cSopenharmony_ci { 206370b324cSopenharmony_ci if (pos == size) 207370b324cSopenharmony_ci Truncated = true; 208370b324cSopenharmony_ci else 209370b324cSopenharmony_ci UTF_ERROR_UTF8_CHECK 210370b324cSopenharmony_ci } 211370b324cSopenharmony_ci 212370b324cSopenharmony_ci #ifdef UTF_ESCAPE_BASE 213370b324cSopenharmony_ci if (IS_ESCAPE_POINT(val, 0)) 214370b324cSopenharmony_ci Escape = true; 215370b324cSopenharmony_ci #endif 216370b324cSopenharmony_ci 217370b324cSopenharmony_ci if (MaxHighPoint < val) 218370b324cSopenharmony_ci MaxHighPoint = val; 219370b324cSopenharmony_ci 220370b324cSopenharmony_ci if (IS_SURROGATE_POINT(val)) 221370b324cSopenharmony_ci SingleSurrogate = true; 222370b324cSopenharmony_ci 223370b324cSopenharmony_ci src += pos; 224370b324cSopenharmony_ci size -= pos; 225370b324cSopenharmony_ci } 226370b324cSopenharmony_ci 227370b324cSopenharmony_ci // MaxByte = maxByte; 228370b324cSopenharmony_ci} 229370b324cSopenharmony_ci 230370b324cSopenharmony_cibool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw() 231370b324cSopenharmony_ci{ 232370b324cSopenharmony_ci CUtf8Check check; 233370b324cSopenharmony_ci check.Check_Buf(src, size); 234370b324cSopenharmony_ci return check.IsOK(allowReduced); 235370b324cSopenharmony_ci} 236370b324cSopenharmony_ci 237370b324cSopenharmony_ci/* 238370b324cSopenharmony_cibool CheckUTF8_chars(const char *src, bool allowReduced) throw() 239370b324cSopenharmony_ci{ 240370b324cSopenharmony_ci CUtf8Check check; 241370b324cSopenharmony_ci check.CheckBuf(src, strlen(src)); 242370b324cSopenharmony_ci return check.IsOK(allowReduced); 243370b324cSopenharmony_ci} 244370b324cSopenharmony_ci*/ 245370b324cSopenharmony_ci 246370b324cSopenharmony_cibool CheckUTF8_AString(const AString &s) throw() 247370b324cSopenharmony_ci{ 248370b324cSopenharmony_ci CUtf8Check check; 249370b324cSopenharmony_ci 
check.Check_AString(s); 250370b324cSopenharmony_ci return check.IsOK(); 251370b324cSopenharmony_ci} 252370b324cSopenharmony_ci 253370b324cSopenharmony_ci 254370b324cSopenharmony_ci/* 255370b324cSopenharmony_cibool CheckUTF8(const char *src, bool allowReduced) throw() 256370b324cSopenharmony_ci{ 257370b324cSopenharmony_ci // return Check_UTF8_Buf(src, strlen(src), allowReduced); 258370b324cSopenharmony_ci 259370b324cSopenharmony_ci for (;;) 260370b324cSopenharmony_ci { 261370b324cSopenharmony_ci const Byte c = (Byte)(*src++); 262370b324cSopenharmony_ci if (c == 0) 263370b324cSopenharmony_ci return true; 264370b324cSopenharmony_ci 265370b324cSopenharmony_ci if (c < 0x80) 266370b324cSopenharmony_ci continue; 267370b324cSopenharmony_ci if (c < 0xC0 + 2 || c >= 0xf5) 268370b324cSopenharmony_ci return false; 269370b324cSopenharmony_ci 270370b324cSopenharmony_ci unsigned numBytes; 271370b324cSopenharmony_ci MY_UTF8_HEAD_PARSE 272370b324cSopenharmony_ci else 273370b324cSopenharmony_ci return false; 274370b324cSopenharmony_ci 275370b324cSopenharmony_ci unsigned pos = 0; 276370b324cSopenharmony_ci 277370b324cSopenharmony_ci do 278370b324cSopenharmony_ci { 279370b324cSopenharmony_ci Byte c2 = (Byte)(*src++); 280370b324cSopenharmony_ci if (c2 < 0x80 || c2 >= 0xC0) 281370b324cSopenharmony_ci return allowReduced && c2 == 0; 282370b324cSopenharmony_ci val <<= 6; 283370b324cSopenharmony_ci val |= (c2 - 0x80); 284370b324cSopenharmony_ci pos++; 285370b324cSopenharmony_ci } 286370b324cSopenharmony_ci while (--numBytes); 287370b324cSopenharmony_ci 288370b324cSopenharmony_ci if (val < MY_UTF8_RANGE(pos - 1)) 289370b324cSopenharmony_ci return false; 290370b324cSopenharmony_ci 291370b324cSopenharmony_ci if (val >= 0x110000) 292370b324cSopenharmony_ci return false; 293370b324cSopenharmony_ci } 294370b324cSopenharmony_ci} 295370b324cSopenharmony_ci*/ 296370b324cSopenharmony_ci 297370b324cSopenharmony_ci// in case of UTF-8 error we have two ways: 298370b324cSopenharmony_ci// 21.01- : old : 
//                0xfffd: REPLACEMENT CHARACTER : old version
// 21.02+ : new : 0xef00 + (c) : similar to WSL scheme for low symbols

#define UTF_REPLACEMENT_CHAR 0xfffd



// the point that is emitted instead of incorrect source byte (c).
// it uses (flags) variable of the function, where it is expanded
#define UTF_ESCAPE(c) \
   ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) ? \
    UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + (c) : UTF_REPLACEMENT_CHAR)

/*
#define UTF_HARD_ERROR_UTF8
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; ok = false; continue; }
*/

// we ignore utf errors, and don't change (ok) variable!

// used only inside the main loop of Utf8_To_Utf16():
// emits one point for head byte (c) and goes on from current (src)
#define UTF_ERROR_UTF8 \
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; continue; }

// we store UTF-16 in wchar_t strings. So we use surrogates for big unicode points:

// for debug purposes only we can store UTF-32 in wchar_t:
// #define START_POINT_FOR_SURROGATE ((UInt32)0 - 1)


/*
  WIN32 MultiByteToWideChar(CP_UTF8) emits 0xfffd point, if utf-8 error was found.
  And it can emit single 0xfffd from 2 src bytes.
  It doesn't emit single 0xfffd from 3-4 src bytes.
  We can
    1) emit Escape point for each incorrect byte. So we can recover the data later
    2) emit 0xfffd for each incorrect byte.
       That scheme is similar to Escape scheme, but we emit 0xfffd
       instead of each Escape point.
    3) emit single 0xfffd from 1-2 incorrect bytes, as WIN32 MultiByteToWideChar scheme
*/

/*
  Converts UTF-8 bytes [src, srcLim) to UTF-16 units.
    dest    : output buffer, or NULL to count the units only.
              There is no bounds check here: the caller must provide enough space
              (the callers in this file run a NULL pass first to get the size).
    destLen : receives the number of units
    flags   : Z7_UTF_FLAG_FROM_UTF8_* bits
  Returns (ok). (ok) is never changed in current code, so the result is always true:
  incorrect bytes are replaced with UTF_ESCAPE(c) points instead.
*/
static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim, unsigned flags) throw()
{
  size_t destPos = 0;
  bool ok = true;

  for (;;)
  {
    if (src == srcLim)
    {
      *destLen = destPos;
      return ok;
    }

    const Byte c = (Byte)(*src++);

    if (c < 0x80)
    {
      if (dest)
        dest[destPos] = (wchar_t)c;
      destPos++;
      continue;
    }

    if (c < 0xc0 + 2
        || c >= 0xf5) // it's limit for 0x140000 unicode codes : win32 compatibility
    {
      UTF_ERROR_UTF8
    }

    unsigned numBytes;

    MY_UTF8_HEAD_PARSE_MAX_3_BYTES

    // (pos) counts accepted tail bytes; (src) is not moved until the sequence is accepted
    unsigned pos = 0;
    do
    {
      if (src + pos == srcLim)
        break;
      unsigned c2 = (Byte)src[pos];
      c2 -= 0x80;
      if (c2 >= 0x40)
        break;
      val <<= 6;
      val |= c2;
      pos++;
      // all range checks are done after the first tail byte
      if (pos == 1)
      {
        // overlong encoding
        if (val < (((unsigned)1 << 7) >> numBytes))
          break;
        if (numBytes == 2)
        {
          // the value will be in surrogate range 0xd800 .. 0xdfff
          if (flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR)
            if ((val & (0xF800 >> 6)) == (0xd800 >> 6))
              break;
        }
        // the value will be 0x110000 or larger
        else if (numBytes == 3 && val >= (0x110000 >> 12))
          break;
      }
    }
    while (--numBytes);

    if (numBytes != 0)
    {
      if ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) == 0)
      {
        // the following code to emit the 0xfffd chars as win32 Utf8 function.
        // disable the following line, if you need 0xfffd for each incorrect byte as in Escape mode
        src += pos;
      }
      UTF_ERROR_UTF8
    }

    /*
    if (val < MY_UTF8_RANGE(pos - 1))
      UTF_ERROR_UTF8
    */

    #ifdef UTF_ESCAPE_BASE

      if ((flags & Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT)
          && IS_ESCAPE_POINT(val, 0))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Escape-16 point (3 bytes)
        // (src) was not moved over the tail bytes, so each of them
        // is handled as incorrect head byte in next iterations
        UTF_ERROR_UTF8
      }

    #endif

    /*
       We don't expect virtual Escape-21 points in UTF-8 stream.
       And we don't check for Escape-21.
       So utf8-Escape-21 will be converted to another 3 utf16-Escape-21 points.
       Maybe we could convert virtual utf8-Escape-21 to one utf16-Escape-21 point in some cases?
    */

    if (val < START_POINT_FOR_SURROGATE)
    {
      /*
      if ((flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR)
          && IS_SURROGATE_POINT(val))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Surrogate-16 point (3 bytes)
        UTF_ERROR_UTF8
      }
      */
      if (dest)
        dest[destPos] = (wchar_t)val;
      destPos++;
    }
    else
    {
      /*
      if (val >= 0x110000)
      {
        // We will emit utf16-Escape-16-21 point from each source byte
        UTF_ERROR_UTF8
      }
      */
      if (dest)
      {
        dest[destPos + 0] = (wchar_t)(0xd800 - (0x10000 >> 10) + (val >> 10));
        dest[destPos + 1] = (wchar_t)(0xdc00 + (val & 0x3ff));
      }
      destPos += 2;
    }
    src += pos;
  }
}



// MY_UTF8_HEAD(n, val) : head byte of sequence with (n) tail bytes
// MY_UTF8_CHAR(n, val) : tail byte that holds bits [6 * n, 6 * n + 5] of (val)
#define MY_UTF8_HEAD(n, val) ((char)(MY_UTF8_START(n) + (val >> (6 * (n)))))
#define MY_UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))

/*
  Returns the number of bytes for UTF-8 form of [src, srcLim).
  It must return exactly the number of bytes that Utf16_To_Utf8() writes
  for same (src) and (flags): the callers in this file throw, if the numbers differ.
  (size) starts from one byte per source unit, and only the extra bytes are added later.
*/
static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim, unsigned flags)
{
  size_t size = (size_t)(srcLim - src);
  for (;;)
  {
    if (src == srcLim)
      return size;

    UInt32 val = (UInt32)(*src++);

    if (val < 0x80)
      continue;

    if (val < MY_UTF8_RANGE(1))
    {
      size++;
      continue;
    }

    #ifdef UTF_ESCAPE_BASE

    #if UTF_ESCAPE_PLANE != 0
    if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE)
      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
        continue;
    #endif

    // Escape point is written as one raw byte
    if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE)
      if (IS_ESCAPE_POINT(val, 0))
        continue;

    #endif

    if (IS_SURROGATE_POINT(val))
    {
      // it's hack to UTF-8 encoding

      // pair   : 2 units (already counted) + 2 = 4 bytes
      // single : 1 unit  (already counted) + 2 = 3 bytes
      if (val < 0xdc00 && src != srcLim)
      {
        const UInt32 c2 = (UInt32)*src;
        if (c2 >= 0xdc00 && c2 < 0xe000)
          src++;
      }
      size += 2;
      continue;
    }

    #ifdef Z7_WCHART_IS_16BIT

    size += 2;

    #else

         if (val < MY_UTF8_RANGE(2)) size += 2;
    else if (val < MY_UTF8_RANGE(3)) size += 3;
    else if (val < MY_UTF8_RANGE(4)) size += 4;
    else if (val < MY_UTF8_RANGE(5)) size += 5;
    else
    #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
      size += 6;
    #else
      // Utf16_To_Utf8() writes 4 bytes for such value
      size += 3;
    #endif

    #endif
  }
}


/*
  Writes UTF-8 form of [src, srcLim) to (dest) and returns the pointer
  to the end of written data.
  There is no bounds check here: the caller must provide the number
  of bytes returned by Utf16_To_Utf8_Calc() for same (src) and (flags).
*/
static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim, unsigned flags)
{
  for (;;)
  {
    if (src == srcLim)
      return dest;

    UInt32 val = (UInt32)*src++;

    if (val < 0x80)
    {
      *dest++ = (char)val;
      continue;
    }

    if (val < MY_UTF8_RANGE(1))
    {
      dest[0] = MY_UTF8_HEAD(1, val);
      dest[1] = MY_UTF8_CHAR(0, val);
      dest += 2;
      continue;
    }

    #ifdef UTF_ESCAPE_BASE

    #if UTF_ESCAPE_PLANE != 0
      /*
        if (wchar_t is 32-bit)
            && (Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE is set)
            && (point is virtual escape plane)
          we extract 8-bit byte from virtual HIGH-ESCAPE PLANE.
      */
    if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE)
      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
      {
        *dest++ = (char)(val);
        continue;
      }
    #endif // UTF_ESCAPE_PLANE != 0

    /* if (Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE is defined)
          we extract 8-bit byte from BMP-ESCAPE PLANE. */

    if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE)
      if (IS_ESCAPE_POINT(val, 0))
      {
        *dest++ = (char)(val);
        continue;
      }

    #endif // UTF_ESCAPE_BASE

    if (IS_SURROGATE_POINT(val))
    {
      // it's hack to UTF-8 encoding
      if (val < 0xdc00 && src != srcLim)
      {
        const UInt32 c2 = (UInt32)*src;
        if (IS_LOW_SURROGATE_POINT(c2))
        {
          src++;
          val = (((val - 0xd800) << 10) | (c2 - 0xdc00)) + 0x10000;
          dest[0] = MY_UTF8_HEAD(3, val);
          dest[1] = MY_UTF8_CHAR(2, val);
          dest[2] = MY_UTF8_CHAR(1, val);
          dest[3] = MY_UTF8_CHAR(0, val);
          dest += 4;
          continue;
        }
      }
      // single surrogate: it is written as 3 bytes in the code below
      if (flags & Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR)
        val = UTF_REPLACEMENT_CHAR; // WIN32 function does it
    }

    #ifndef Z7_WCHART_IS_16BIT
    if (val < MY_UTF8_RANGE(2))
    #endif
    {
      dest[0] = MY_UTF8_HEAD(2, val);
      dest[1] = MY_UTF8_CHAR(1, val);
      dest[2] = MY_UTF8_CHAR(0, val);
      dest += 3;
      continue;
    }

    #ifndef Z7_WCHART_IS_16BIT

    // we don't expect this case. so we can throw exception
    // throw 20210407;

    char b;
    unsigned numBits;
         if (val < MY_UTF8_RANGE(3)) { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); }
    else if (val < MY_UTF8_RANGE(4)) { numBits = 6 * 4; b = MY_UTF8_HEAD(4, val); }
    else if (val < MY_UTF8_RANGE(5)) { numBits = 6 * 5; b = MY_UTF8_HEAD(5, val); }
    #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6
    else                             { numBits = 6 * 6; b = (char)MY_UTF8_START(6); }
    #else
    else
    {
      // the replacement char is written in 4-byte form here,
      // and Utf16_To_Utf8_Calc() counts 4 bytes for this case
      val = UTF_REPLACEMENT_CHAR;
                                     { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); }
    }
    #endif

    *dest++ = b;

    do
    {
      numBits -= 6;
      *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
    }
    while (numBits != 0);

    #endif
  }
}

/*
  Converts (srcSize) bytes of UTF-8 at (src) to (dest).
  Utf8_To_Utf16() is called twice: the first pass (NULL dest) gets the size,
  the second pass writes to the buffer of that size.
  Returns the result of the second pass.
*/
bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags)
{
  dest.Empty();
  size_t destLen = 0;
  Utf8_To_Utf16(NULL, &destLen, src, src + srcSize, flags);
  bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src + srcSize, flags);
  dest.ReleaseBuf_SetEnd((unsigned)destLen);
  return res;
}

// Convert_UTF8_Buf_To_Unicode() for src.Len() bytes of (src)
bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags)
{
  return Convert_UTF8_Buf_To_Unicode(src, src.Len(), dest, flags);
}


// the flags that ConvertUTF8ToUnicode() uses
static
unsigned g_UTF8_To_Unicode_Flags =
    Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
  #ifndef Z7_WCHART_IS_16BIT
    | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
  #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED
    | Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
  #endif
  #endif
    ;


/*
bool ConvertUTF8ToUnicode_boolRes(const AString &src, UString &dest)
{
  return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
692370b324cSopenharmony_ci} 693370b324cSopenharmony_ci*/ 694370b324cSopenharmony_ci 695370b324cSopenharmony_cibool ConvertUTF8ToUnicode(const AString &src, UString &dest) 696370b324cSopenharmony_ci{ 697370b324cSopenharmony_ci return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags); 698370b324cSopenharmony_ci} 699370b324cSopenharmony_ci 700370b324cSopenharmony_civoid Print_UString(const UString &a); 701370b324cSopenharmony_ci 702370b324cSopenharmony_civoid ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags) 703370b324cSopenharmony_ci{ 704370b324cSopenharmony_ci /* 705370b324cSopenharmony_ci if (src.Len()== 24) 706370b324cSopenharmony_ci throw "202104"; 707370b324cSopenharmony_ci */ 708370b324cSopenharmony_ci dest.Empty(); 709370b324cSopenharmony_ci const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags); 710370b324cSopenharmony_ci char *destStart = dest.GetBuf((unsigned)destLen); 711370b324cSopenharmony_ci const char *destEnd = Utf16_To_Utf8(destStart, src, src.Ptr(src.Len()), flags); 712370b324cSopenharmony_ci dest.ReleaseBuf_SetEnd((unsigned)destLen); 713370b324cSopenharmony_ci // printf("\nlen = %d\n", src.Len()); 714370b324cSopenharmony_ci if (destLen != (size_t)(destEnd - destStart)) 715370b324cSopenharmony_ci { 716370b324cSopenharmony_ci /* 717370b324cSopenharmony_ci // dest.ReleaseBuf_SetEnd((unsigned)(destEnd - destStart)); 718370b324cSopenharmony_ci printf("\nlen = %d\n", (unsigned)destLen); 719370b324cSopenharmony_ci printf("\n(destEnd - destStart) = %d\n", (unsigned)(destEnd - destStart)); 720370b324cSopenharmony_ci printf("\n"); 721370b324cSopenharmony_ci // Print_UString(src); 722370b324cSopenharmony_ci printf("\n"); 723370b324cSopenharmony_ci // printf("\nlen = %d\n", destLen); 724370b324cSopenharmony_ci */ 725370b324cSopenharmony_ci throw 20210406; 726370b324cSopenharmony_ci } 727370b324cSopenharmony_ci} 728370b324cSopenharmony_ci 729370b324cSopenharmony_ci 730370b324cSopenharmony_ci 
731370b324cSopenharmony_ciunsigned g_Unicode_To_UTF8_Flags = 732370b324cSopenharmony_ci // Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE 733370b324cSopenharmony_ci 0 734370b324cSopenharmony_ci #ifndef _WIN32 735370b324cSopenharmony_ci #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED 736370b324cSopenharmony_ci | Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE 737370b324cSopenharmony_ci #else 738370b324cSopenharmony_ci | Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR 739370b324cSopenharmony_ci #endif 740370b324cSopenharmony_ci #endif 741370b324cSopenharmony_ci ; 742370b324cSopenharmony_ci 743370b324cSopenharmony_civoid ConvertUnicodeToUTF8(const UString &src, AString &dest) 744370b324cSopenharmony_ci{ 745370b324cSopenharmony_ci ConvertUnicodeToUTF8_Flags(src, dest, g_Unicode_To_UTF8_Flags); 746370b324cSopenharmony_ci} 747370b324cSopenharmony_ci 748370b324cSopenharmony_civoid Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest) 749370b324cSopenharmony_ci{ 750370b324cSopenharmony_ci const unsigned flags = g_Unicode_To_UTF8_Flags; 751370b324cSopenharmony_ci dest.Free(); 752370b324cSopenharmony_ci const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags); 753370b324cSopenharmony_ci dest.Alloc(destLen); 754370b324cSopenharmony_ci const char *destEnd = Utf16_To_Utf8((char *)(void *)(Byte *)dest, src, src.Ptr(src.Len()), flags); 755370b324cSopenharmony_ci if (destLen != (size_t)(destEnd - (char *)(void *)(Byte *)dest)) 756370b324cSopenharmony_ci throw 202104; 757370b324cSopenharmony_ci} 758370b324cSopenharmony_ci 759370b324cSopenharmony_ci/* 760370b324cSopenharmony_ci 761370b324cSopenharmony_ci#ifndef _WIN32 762370b324cSopenharmony_civoid Convert_UTF16_To_UTF32(const UString &src, UString &dest) 763370b324cSopenharmony_ci{ 764370b324cSopenharmony_ci dest.Empty(); 765370b324cSopenharmony_ci for (size_t i = 0; i < src.Len();) 766370b324cSopenharmony_ci { 767370b324cSopenharmony_ci wchar_t c = src[i++]; 768370b324cSopenharmony_ci if (c >= 0xd800 && c < 0xdc00 && i < src.Len()) 
769370b324cSopenharmony_ci { 770370b324cSopenharmony_ci const wchar_t c2 = src[i]; 771370b324cSopenharmony_ci if (c2 >= 0xdc00 && c2 < 0x10000) 772370b324cSopenharmony_ci { 773370b324cSopenharmony_ci // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2); 774370b324cSopenharmony_ci c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); 775370b324cSopenharmony_ci // printf("%4x\n", (int)c); 776370b324cSopenharmony_ci i++; 777370b324cSopenharmony_ci } 778370b324cSopenharmony_ci } 779370b324cSopenharmony_ci dest += c; 780370b324cSopenharmony_ci } 781370b324cSopenharmony_ci} 782370b324cSopenharmony_ci 783370b324cSopenharmony_civoid Convert_UTF32_To_UTF16(const UString &src, UString &dest) 784370b324cSopenharmony_ci{ 785370b324cSopenharmony_ci dest.Empty(); 786370b324cSopenharmony_ci for (size_t i = 0; i < src.Len();) 787370b324cSopenharmony_ci { 788370b324cSopenharmony_ci wchar_t w = src[i++]; 789370b324cSopenharmony_ci if (w >= 0x10000 && w < 0x110000) 790370b324cSopenharmony_ci { 791370b324cSopenharmony_ci w -= 0x10000; 792370b324cSopenharmony_ci dest += (wchar_t)((unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff)); 793370b324cSopenharmony_ci w = 0xdc00 + (w & 0x3ff); 794370b324cSopenharmony_ci } 795370b324cSopenharmony_ci dest += w; 796370b324cSopenharmony_ci } 797370b324cSopenharmony_ci} 798370b324cSopenharmony_ci 799370b324cSopenharmony_cibool UTF32_IsThere_BigPoint(const UString &src) 800370b324cSopenharmony_ci{ 801370b324cSopenharmony_ci for (size_t i = 0; i < src.Len();) 802370b324cSopenharmony_ci { 803370b324cSopenharmony_ci const UInt32 c = (UInt32)src[i++]; 804370b324cSopenharmony_ci if (c >= 0x110000) 805370b324cSopenharmony_ci return true; 806370b324cSopenharmony_ci } 807370b324cSopenharmony_ci return false; 808370b324cSopenharmony_ci} 809370b324cSopenharmony_ci 810370b324cSopenharmony_cibool Unicode_IsThere_BmpEscape(const UString &src) 811370b324cSopenharmony_ci{ 812370b324cSopenharmony_ci for (size_t i = 0; i < src.Len();) 813370b324cSopenharmony_ci { 
814370b324cSopenharmony_ci const UInt32 c = (UInt32)src[i++]; 815370b324cSopenharmony_ci if (IS_ESCAPE_POINT(c, 0)) 816370b324cSopenharmony_ci return true; 817370b324cSopenharmony_ci } 818370b324cSopenharmony_ci return false; 819370b324cSopenharmony_ci} 820370b324cSopenharmony_ci 821370b324cSopenharmony_ci 822370b324cSopenharmony_ci#endif 823370b324cSopenharmony_ci 824370b324cSopenharmony_cibool Unicode_IsThere_Utf16SurrogateError(const UString &src) 825370b324cSopenharmony_ci{ 826370b324cSopenharmony_ci for (size_t i = 0; i < src.Len();) 827370b324cSopenharmony_ci { 828370b324cSopenharmony_ci const UInt32 val = (UInt32)src[i++]; 829370b324cSopenharmony_ci if (IS_SURROGATE_POINT(val)) 830370b324cSopenharmony_ci { 831370b324cSopenharmony_ci // it's hack to UTF-8 encoding 832370b324cSopenharmony_ci if (val >= 0xdc00 || i == src.Len()) 833370b324cSopenharmony_ci return true; 834370b324cSopenharmony_ci const UInt32 c2 = (UInt32)*src; 835370b324cSopenharmony_ci if (!IS_LOW_SURROGATE_POINT(c2)) 836370b324cSopenharmony_ci return true; 837370b324cSopenharmony_ci } 838370b324cSopenharmony_ci } 839370b324cSopenharmony_ci return false; 840370b324cSopenharmony_ci} 841370b324cSopenharmony_ci*/ 842370b324cSopenharmony_ci 843370b324cSopenharmony_ci#ifndef Z7_WCHART_IS_16BIT 844370b324cSopenharmony_ci 845370b324cSopenharmony_civoid Convert_UnicodeEsc16_To_UnicodeEscHigh 846370b324cSopenharmony_ci#if UTF_ESCAPE_PLANE == 0 847370b324cSopenharmony_ci (UString &) {} 848370b324cSopenharmony_ci#else 849370b324cSopenharmony_ci (UString &s) 850370b324cSopenharmony_ci{ 851370b324cSopenharmony_ci const unsigned len = s.Len(); 852370b324cSopenharmony_ci for (unsigned i = 0; i < len; i++) 853370b324cSopenharmony_ci { 854370b324cSopenharmony_ci wchar_t c = s[i]; 855370b324cSopenharmony_ci if (IS_ESCAPE_POINT(c, 0)) 856370b324cSopenharmony_ci { 857370b324cSopenharmony_ci c += UTF_ESCAPE_PLANE; 858370b324cSopenharmony_ci s.ReplaceOneCharAtPos(i, c); 859370b324cSopenharmony_ci } 
860370b324cSopenharmony_ci } 861370b324cSopenharmony_ci} 862370b324cSopenharmony_ci#endif 863370b324cSopenharmony_ci#endif 864