1// UTFConvert.cpp 2 3#include "StdAfx.h" 4 5// #include <stdio.h> 6 7#include "MyTypes.h" 8#include "UTFConvert.h" 9 10 11#ifndef Z7_WCHART_IS_16BIT 12#ifndef __APPLE__ 13 // we define it if the system supports files with non-utf8 symbols: 14 #define MY_UTF8_RAW_NON_UTF8_SUPPORTED 15#endif 16#endif 17 18/* 19 MY_UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte 20 21 n : MY_UTF8_START(n) : Bits of code point 22 23 0 : 0x80 : : unused 24 1 : 0xC0 : 11 : 25 2 : 0xE0 : 16 : Basic Multilingual Plane 26 3 : 0xF0 : 21 : Unicode space 27 4 : 0xF8 : 26 : 28 5 : 0xFC : 31 : UCS-4 : wcstombs() in ubuntu is limited to that value 29 6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value 30 7 : 0xFF : 31*/ 32 33#define MY_UTF8_START(n) (0x100 - (1 << (7 - (n)))) 34 35#define MY_UTF8_HEAD_PARSE2(n) \ 36 if (c < MY_UTF8_START((n) + 1)) \ 37 { numBytes = (n); val -= MY_UTF8_START(n); } 38 39#ifndef Z7_WCHART_IS_16BIT 40 41/* 42 if (wchar_t is 32-bit), we can support large points in long UTF-8 sequence, 43 when we convert wchar_t strings to UTF-8: 44 (_UTF8_NUM_TAIL_BYTES_MAX == 3) : (21-bits points) - Unicode 45 (_UTF8_NUM_TAIL_BYTES_MAX == 5) : (31-bits points) - UCS-4 46 (_UTF8_NUM_TAIL_BYTES_MAX == 6) : (36-bit hack) 47*/ 48 49#define MY_UTF8_NUM_TAIL_BYTES_MAX 5 50#endif 51 52/* 53#define MY_UTF8_HEAD_PARSE \ 54 UInt32 val = c; \ 55 MY_UTF8_HEAD_PARSE2(1) \ 56 else MY_UTF8_HEAD_PARSE2(2) \ 57 else MY_UTF8_HEAD_PARSE2(3) \ 58 else MY_UTF8_HEAD_PARSE2(4) \ 59 else MY_UTF8_HEAD_PARSE2(5) \ 60 #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6 61 else MY_UTF8_HEAD_PARSE2(6) 62 #endif 63*/ 64 65#define MY_UTF8_HEAD_PARSE_MAX_3_BYTES \ 66 UInt32 val = c; \ 67 MY_UTF8_HEAD_PARSE2(1) \ 68 else MY_UTF8_HEAD_PARSE2(2) \ 69 else { numBytes = 3; val -= MY_UTF8_START(3); } 70 71 72#define MY_UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6)) 73 74 75#define START_POINT_FOR_SURROGATE 0x10000 76 77 78/* we use 128 bytes block in 16-bit BMP-PLANE to encode non-UTF-8 Escapes 79 Also we can use additional HIGH-PLANE (we use 21-bit points above 0x1f0000) 80 to simplify internal intermediate conversion in Linux: 81 RAW-UTF-8 <-> internal wchar_t utf-16 strings <-> RAW-UTF-UTF-8 82*/ 83 84 85#if defined(Z7_WCHART_IS_16BIT) 86 87#define UTF_ESCAPE_PLANE 0 88 89#else 90 91/* 92we can place 128 ESCAPE chars to 93 ef 80 - ee be 80 (3-bytes utf-8) : similar to WSL 94 ef ff - ee bf bf 95 961f ef 80 - f7 be be 80 (4-bytes utf-8) : last 4-bytes utf-8 plane (out of Unicode) 971f ef ff - f7 be bf bf (4-bytes utf-8) : last 4-bytes utf-8 plane (out of Unicode) 98*/ 99 100// #define UTF_ESCAPE_PLANE_HIGH (0x1f << 16) 101// #define UTF_ESCAPE_PLANE UTF_ESCAPE_PLANE_HIGH 102#define UTF_ESCAPE_PLANE 0 103 104/* 105 if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set) 106 { 107 if (UTF_ESCAPE_PLANE is UTF_ESCAPE_PLANE_HIGH) 108 { 109 we can restore any 8-bit Escape from ESCAPE-PLANE-21 plane. 110 But ESCAPE-PLANE-21 point cannot be stored to utf-16 (7z archive) 111 So we still need a way to extract 8-bit Escapes and BMP-Escapes-8 112 from same BMP-Escapes-16 stored in 7z. 113 And if we want to restore any 8-bit from 7z archive, 114 we still must use Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT for (utf-8 -> utf-16) 115 Also we need additional Conversions to tranform from utf-16 to utf-16-With-Escapes-21 116 } 117 else (UTF_ESCAPE_PLANE == 0) 118 { 119 we must convert original 3-bytes utf-8 BMP-Escape point to sequence 120 of 3 BMP-Escape-16 points with Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT 121 so we can extract original RAW-UTF-8 from UTFD-16 later. 122 } 123 } 124*/ 125 126#endif 127 128 129 130#define UTF_ESCAPE_BASE 0xef00 131 132 133#ifdef UTF_ESCAPE_BASE 134#define IS_ESCAPE_POINT(v, plane) (((v) & (UInt32)0xffffff80) == (plane) + UTF_ESCAPE_BASE + 0x80) 135#endif 136 137#define IS_SURROGATE_POINT(v) (((v) & (UInt32)0xfffff800) == 0xd800) 138#define IS_LOW_SURROGATE_POINT(v) (((v) & (UInt32)0xfffffC00) == 0xdc00) 139 140 141#define UTF_ERROR_UTF8_CHECK \ 142 { NonUtf = true; continue; } 143 144void CUtf8Check::Check_Buf(const char *src, size_t size) throw() 145{ 146 Clear(); 147 // Byte maxByte = 0; 148 149 for (;;) 150 { 151 if (size == 0) 152 break; 153 154 const Byte c = (Byte)(*src++); 155 size--; 156 157 if (c == 0) 158 { 159 ZeroChar = true; 160 continue; 161 } 162 163 /* 164 if (c > maxByte) 165 maxByte = c; 166 */ 167 168 if (c < 0x80) 169 continue; 170 171 if (c < 0xc0 + 2) // it's limit for 0x140000 unicode codes : win32 compatibility 172 UTF_ERROR_UTF8_CHECK 173 174 unsigned numBytes; 175 176 UInt32 val = c; 177 MY_UTF8_HEAD_PARSE2(1) 178 else MY_UTF8_HEAD_PARSE2(2) 179 else MY_UTF8_HEAD_PARSE2(4) 180 else MY_UTF8_HEAD_PARSE2(5) 181 else 182 { 183 UTF_ERROR_UTF8_CHECK 184 } 185 186 unsigned pos = 0; 187 do 188 { 189 if (pos == size) 190 break; 191 unsigned c2 = (Byte)src[pos]; 192 c2 -= 0x80; 193 if (c2 >= 0x40) 194 break; 195 val <<= 6; 196 val |= c2; 197 if (pos == 0) 198 if (val < (((unsigned)1 << 7) >> numBytes)) 199 break; 200 pos++; 201 } 202 while (--numBytes); 203 204 if (numBytes != 0) 205 { 206 if (pos == size) 207 Truncated = true; 208 else 209 UTF_ERROR_UTF8_CHECK 210 } 211 212 #ifdef UTF_ESCAPE_BASE 213 if (IS_ESCAPE_POINT(val, 0)) 214 Escape = true; 215 #endif 216 217 if (MaxHighPoint < val) 218 MaxHighPoint = val; 219 220 if (IS_SURROGATE_POINT(val)) 221 SingleSurrogate = true; 222 223 src += pos; 224 size -= pos; 225 } 226 227 // MaxByte = maxByte; 228} 229 230bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw() 231{ 232 CUtf8Check check; 233 check.Check_Buf(src, size); 234 return check.IsOK(allowReduced); 235} 236 237/* 238bool CheckUTF8_chars(const char *src, bool allowReduced) throw() 239{ 240 CUtf8Check check; 241 check.CheckBuf(src, strlen(src)); 242 return check.IsOK(allowReduced); 243} 244*/ 245 246bool CheckUTF8_AString(const AString &s) throw() 247{ 248 CUtf8Check check; 249 check.Check_AString(s); 250 return check.IsOK(); 251} 252 253 254/* 255bool CheckUTF8(const char *src, bool allowReduced) throw() 256{ 257 // return Check_UTF8_Buf(src, strlen(src), allowReduced); 258 259 for (;;) 260 { 261 const Byte c = (Byte)(*src++); 262 if (c == 0) 263 return true; 264 265 if (c < 0x80) 266 continue; 267 if (c < 0xC0 + 2 || c >= 0xf5) 268 return false; 269 270 unsigned numBytes; 271 MY_UTF8_HEAD_PARSE 272 else 273 return false; 274 275 unsigned pos = 0; 276 277 do 278 { 279 Byte c2 = (Byte)(*src++); 280 if (c2 < 0x80 || c2 >= 0xC0) 281 return allowReduced && c2 == 0; 282 val <<= 6; 283 val |= (c2 - 0x80); 284 pos++; 285 } 286 while (--numBytes); 287 288 if (val < MY_UTF8_RANGE(pos - 1)) 289 return false; 290 291 if (val >= 0x110000) 292 return false; 293 } 294} 295*/ 296 297// in case of UTF-8 error we have two ways: 298// 21.01- : old : 0xfffd: REPLACEMENT CHARACTER : old version 299// 21.02+ : new : 0xef00 + (c) : similar to WSL scheme for low symbols 300 301#define UTF_REPLACEMENT_CHAR 0xfffd 302 303 304 305#define UTF_ESCAPE(c) \ 306 ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) ? \ 307 UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + (c) : UTF_REPLACEMENT_CHAR) 308 309/* 310#define UTF_HARD_ERROR_UTF8 311 { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \ 312 destPos++; ok = false; continue; } 313*/ 314 315// we ignore utf errors, and don't change (ok) variable! 316 317#define UTF_ERROR_UTF8 \ 318 { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \ 319 destPos++; continue; } 320 321// we store UTF-16 in wchar_t strings. So we use surrogates for big unicode points: 322 323// for debug puposes only we can store UTF-32 in wchar_t: 324// #define START_POINT_FOR_SURROGATE ((UInt32)0 - 1) 325 326 327/* 328 WIN32 MultiByteToWideChar(CP_UTF8) emits 0xfffd point, if utf-8 error was found. 329 Ant it can emit single 0xfffd from 2 src bytes. 330 It doesn't emit single 0xfffd from 3-4 src bytes. 331 We can 332 1) emit Escape point for each incorrect byte. So we can data recover later 333 2) emit 0xfffd for each incorrect byte. 334 That scheme is similar to Escape scheme, but we emit 0xfffd 335 instead of each Escape point. 336 3) emit single 0xfffd from 1-2 incorrect bytes, as WIN32 MultiByteToWideChar scheme 337*/ 338 339static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim, unsigned flags) throw() 340{ 341 size_t destPos = 0; 342 bool ok = true; 343 344 for (;;) 345 { 346 if (src == srcLim) 347 { 348 *destLen = destPos; 349 return ok; 350 } 351 352 const Byte c = (Byte)(*src++); 353 354 if (c < 0x80) 355 { 356 if (dest) 357 dest[destPos] = (wchar_t)c; 358 destPos++; 359 continue; 360 } 361 362 if (c < 0xc0 + 2 363 || c >= 0xf5) // it's limit for 0x140000 unicode codes : win32 compatibility 364 { 365 UTF_ERROR_UTF8 366 } 367 368 unsigned numBytes; 369 370 MY_UTF8_HEAD_PARSE_MAX_3_BYTES 371 372 unsigned pos = 0; 373 do 374 { 375 if (src + pos == srcLim) 376 break; 377 unsigned c2 = (Byte)src[pos]; 378 c2 -= 0x80; 379 if (c2 >= 0x40) 380 break; 381 val <<= 6; 382 val |= c2; 383 pos++; 384 if (pos == 1) 385 { 386 if (val < (((unsigned)1 << 7) >> numBytes)) 387 break; 388 if (numBytes == 2) 389 { 390 if (flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) 391 if ((val & (0xF800 >> 6)) == (0xd800 >> 6)) 392 break; 393 } 394 else if (numBytes == 3 && val >= (0x110000 >> 12)) 395 break; 396 } 397 } 398 while (--numBytes); 399 400 if (numBytes != 0) 401 { 402 if ((flags & Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE) == 0) 403 { 404 // the following code to emit the 0xfffd chars as win32 Utf8 function. 405 // disable the folling line, if you need 0xfffd for each incorrect byte as in Escape mode 406 src += pos; 407 } 408 UTF_ERROR_UTF8 409 } 410 411 /* 412 if (val < MY_UTF8_RANGE(pos - 1)) 413 UTF_ERROR_UTF8 414 */ 415 416 #ifdef UTF_ESCAPE_BASE 417 418 if ((flags & Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT) 419 && IS_ESCAPE_POINT(val, 0)) 420 { 421 // We will emit 3 utf16-Escape-16-21 points from one Escape-16 point (3 bytes) 422 UTF_ERROR_UTF8 423 } 424 425 #endif 426 427 /* 428 We don't expect virtual Escape-21 points in UTF-8 stream. 429 And we don't check for Escape-21. 430 So utf8-Escape-21 will be converted to another 3 utf16-Escape-21 points. 431 Maybe we could convert virtual utf8-Escape-21 to one utf16-Escape-21 point in some cases? 432 */ 433 434 if (val < START_POINT_FOR_SURROGATE) 435 { 436 /* 437 if ((flags & Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) 438 && IS_SURROGATE_POINT(val)) 439 { 440 // We will emit 3 utf16-Escape-16-21 points from one Surrogate-16 point (3 bytes) 441 UTF_ERROR_UTF8 442 } 443 */ 444 if (dest) 445 dest[destPos] = (wchar_t)val; 446 destPos++; 447 } 448 else 449 { 450 /* 451 if (val >= 0x110000) 452 { 453 // We will emit utf16-Escape-16-21 point from each source byte 454 UTF_ERROR_UTF8 455 } 456 */ 457 if (dest) 458 { 459 dest[destPos + 0] = (wchar_t)(0xd800 - (0x10000 >> 10) + (val >> 10)); 460 dest[destPos + 1] = (wchar_t)(0xdc00 + (val & 0x3ff)); 461 } 462 destPos += 2; 463 } 464 src += pos; 465 } 466} 467 468 469 470#define MY_UTF8_HEAD(n, val) ((char)(MY_UTF8_START(n) + (val >> (6 * (n))))) 471#define MY_UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F))) 472 473static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim, unsigned flags) 474{ 475 size_t size = (size_t)(srcLim - src); 476 for (;;) 477 { 478 if (src == srcLim) 479 return size; 480 481 UInt32 val = (UInt32)(*src++); 482 483 if (val < 0x80) 484 continue; 485 486 if (val < MY_UTF8_RANGE(1)) 487 { 488 size++; 489 continue; 490 } 491 492 #ifdef UTF_ESCAPE_BASE 493 494 #if UTF_ESCAPE_PLANE != 0 495 if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE) 496 if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE)) 497 continue; 498 #endif 499 500 if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE) 501 if (IS_ESCAPE_POINT(val, 0)) 502 continue; 503 504 #endif 505 506 if (IS_SURROGATE_POINT(val)) 507 { 508 // it's hack to UTF-8 encoding 509 510 if (val < 0xdc00 && src != srcLim) 511 { 512 const UInt32 c2 = (UInt32)*src; 513 if (c2 >= 0xdc00 && c2 < 0xe000) 514 src++; 515 } 516 size += 2; 517 continue; 518 } 519 520 #ifdef Z7_WCHART_IS_16BIT 521 522 size += 2; 523 524 #else 525 526 if (val < MY_UTF8_RANGE(2)) size += 2; 527 else if (val < MY_UTF8_RANGE(3)) size += 3; 528 else if (val < MY_UTF8_RANGE(4)) size += 4; 529 else if (val < MY_UTF8_RANGE(5)) size += 5; 530 else 531 #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6 532 size += 6; 533 #else 534 size += 3; 535 #endif 536 537 #endif 538 } 539} 540 541 542static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim, unsigned flags) 543{ 544 for (;;) 545 { 546 if (src == srcLim) 547 return dest; 548 549 UInt32 val = (UInt32)*src++; 550 551 if (val < 0x80) 552 { 553 *dest++ = (char)val; 554 continue; 555 } 556 557 if (val < MY_UTF8_RANGE(1)) 558 { 559 dest[0] = MY_UTF8_HEAD(1, val); 560 dest[1] = MY_UTF8_CHAR(0, val); 561 dest += 2; 562 continue; 563 } 564 565 #ifdef UTF_ESCAPE_BASE 566 567 #if UTF_ESCAPE_PLANE != 0 568 /* 569 if (wchar_t is 32-bit) 570 && (Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE is set) 571 && (point is virtual escape plane) 572 we extract 8-bit byte from virtual HIGH-ESCAPE PLANE. 573 */ 574 if (flags & Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE) 575 if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE)) 576 { 577 *dest++ = (char)(val); 578 continue; 579 } 580 #endif // UTF_ESCAPE_PLANE != 0 581 582 /* if (Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE is defined) 583 we extract 8-bit byte from BMP-ESCAPE PLANE. */ 584 585 if (flags & Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE) 586 if (IS_ESCAPE_POINT(val, 0)) 587 { 588 *dest++ = (char)(val); 589 continue; 590 } 591 592 #endif // UTF_ESCAPE_BASE 593 594 if (IS_SURROGATE_POINT(val)) 595 { 596 // it's hack to UTF-8 encoding 597 if (val < 0xdc00 && src != srcLim) 598 { 599 const UInt32 c2 = (UInt32)*src; 600 if (IS_LOW_SURROGATE_POINT(c2)) 601 { 602 src++; 603 val = (((val - 0xd800) << 10) | (c2 - 0xdc00)) + 0x10000; 604 dest[0] = MY_UTF8_HEAD(3, val); 605 dest[1] = MY_UTF8_CHAR(2, val); 606 dest[2] = MY_UTF8_CHAR(1, val); 607 dest[3] = MY_UTF8_CHAR(0, val); 608 dest += 4; 609 continue; 610 } 611 } 612 if (flags & Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR) 613 val = UTF_REPLACEMENT_CHAR; // WIN32 function does it 614 } 615 616 #ifndef Z7_WCHART_IS_16BIT 617 if (val < MY_UTF8_RANGE(2)) 618 #endif 619 { 620 dest[0] = MY_UTF8_HEAD(2, val); 621 dest[1] = MY_UTF8_CHAR(1, val); 622 dest[2] = MY_UTF8_CHAR(0, val); 623 dest += 3; 624 continue; 625 } 626 627 #ifndef Z7_WCHART_IS_16BIT 628 629 // we don't expect this case. so we can throw exception 630 // throw 20210407; 631 632 char b; 633 unsigned numBits; 634 if (val < MY_UTF8_RANGE(3)) { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); } 635 else if (val < MY_UTF8_RANGE(4)) { numBits = 6 * 4; b = MY_UTF8_HEAD(4, val); } 636 else if (val < MY_UTF8_RANGE(5)) { numBits = 6 * 5; b = MY_UTF8_HEAD(5, val); } 637 #if MY_UTF8_NUM_TAIL_BYTES_MAX >= 6 638 else { numBits = 6 * 6; b = (char)MY_UTF8_START(6); } 639 #else 640 else 641 { 642 val = UTF_REPLACEMENT_CHAR; 643 { numBits = 6 * 3; b = MY_UTF8_HEAD(3, val); } 644 } 645 #endif 646 647 *dest++ = b; 648 649 do 650 { 651 numBits -= 6; 652 *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F)); 653 } 654 while (numBits != 0); 655 656 #endif 657 } 658} 659 660bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags) 661{ 662 dest.Empty(); 663 size_t destLen = 0; 664 Utf8_To_Utf16(NULL, &destLen, src, src + srcSize, flags); 665 bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src + srcSize, flags); 666 dest.ReleaseBuf_SetEnd((unsigned)destLen); 667 return res; 668} 669 670bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags) 671{ 672 return Convert_UTF8_Buf_To_Unicode(src, src.Len(), dest, flags); 673} 674 675 676static 677unsigned g_UTF8_To_Unicode_Flags = 678 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE 679 #ifndef Z7_WCHART_IS_16BIT 680 | Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR 681 #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED 682 | Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT 683 #endif 684 #endif 685 ; 686 687 688/* 689bool ConvertUTF8ToUnicode_boolRes(const AString &src, UString &dest) 690{ 691 return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags); 692} 693*/ 694 695bool ConvertUTF8ToUnicode(const AString &src, UString &dest) 696{ 697 return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags); 698} 699 700void Print_UString(const UString &a); 701 702void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags) 703{ 704 /* 705 if (src.Len()== 24) 706 throw "202104"; 707 */ 708 dest.Empty(); 709 const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags); 710 char *destStart = dest.GetBuf((unsigned)destLen); 711 const char *destEnd = Utf16_To_Utf8(destStart, src, src.Ptr(src.Len()), flags); 712 dest.ReleaseBuf_SetEnd((unsigned)destLen); 713 // printf("\nlen = %d\n", src.Len()); 714 if (destLen != (size_t)(destEnd - destStart)) 715 { 716 /* 717 // dest.ReleaseBuf_SetEnd((unsigned)(destEnd - destStart)); 718 printf("\nlen = %d\n", (unsigned)destLen); 719 printf("\n(destEnd - destStart) = %d\n", (unsigned)(destEnd - destStart)); 720 printf("\n"); 721 // Print_UString(src); 722 printf("\n"); 723 // printf("\nlen = %d\n", destLen); 724 */ 725 throw 20210406; 726 } 727} 728 729 730 731unsigned g_Unicode_To_UTF8_Flags = 732 // Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE 733 0 734 #ifndef _WIN32 735 #ifdef MY_UTF8_RAW_NON_UTF8_SUPPORTED 736 | Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE 737 #else 738 | Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR 739 #endif 740 #endif 741 ; 742 743void ConvertUnicodeToUTF8(const UString &src, AString &dest) 744{ 745 ConvertUnicodeToUTF8_Flags(src, dest, g_Unicode_To_UTF8_Flags); 746} 747 748void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest) 749{ 750 const unsigned flags = g_Unicode_To_UTF8_Flags; 751 dest.Free(); 752 const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags); 753 dest.Alloc(destLen); 754 const char *destEnd = Utf16_To_Utf8((char *)(void *)(Byte *)dest, src, src.Ptr(src.Len()), flags); 755 if (destLen != (size_t)(destEnd - (char *)(void *)(Byte *)dest)) 756 throw 202104; 757} 758 759/* 760 761#ifndef _WIN32 762void Convert_UTF16_To_UTF32(const UString &src, UString &dest) 763{ 764 dest.Empty(); 765 for (size_t i = 0; i < src.Len();) 766 { 767 wchar_t c = src[i++]; 768 if (c >= 0xd800 && c < 0xdc00 && i < src.Len()) 769 { 770 const wchar_t c2 = src[i]; 771 if (c2 >= 0xdc00 && c2 < 0x10000) 772 { 773 // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2); 774 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); 775 // printf("%4x\n", (int)c); 776 i++; 777 } 778 } 779 dest += c; 780 } 781} 782 783void Convert_UTF32_To_UTF16(const UString &src, UString &dest) 784{ 785 dest.Empty(); 786 for (size_t i = 0; i < src.Len();) 787 { 788 wchar_t w = src[i++]; 789 if (w >= 0x10000 && w < 0x110000) 790 { 791 w -= 0x10000; 792 dest += (wchar_t)((unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff)); 793 w = 0xdc00 + (w & 0x3ff); 794 } 795 dest += w; 796 } 797} 798 799bool UTF32_IsThere_BigPoint(const UString &src) 800{ 801 for (size_t i = 0; i < src.Len();) 802 { 803 const UInt32 c = (UInt32)src[i++]; 804 if (c >= 0x110000) 805 return true; 806 } 807 return false; 808} 809 810bool Unicode_IsThere_BmpEscape(const UString &src) 811{ 812 for (size_t i = 0; i < src.Len();) 813 { 814 const UInt32 c = (UInt32)src[i++]; 815 if (IS_ESCAPE_POINT(c, 0)) 816 return true; 817 } 818 return false; 819} 820 821 822#endif 823 824bool Unicode_IsThere_Utf16SurrogateError(const UString &src) 825{ 826 for (size_t i = 0; i < src.Len();) 827 { 828 const UInt32 val = (UInt32)src[i++]; 829 if (IS_SURROGATE_POINT(val)) 830 { 831 // it's hack to UTF-8 encoding 832 if (val >= 0xdc00 || i == src.Len()) 833 return true; 834 const UInt32 c2 = (UInt32)*src; 835 if (!IS_LOW_SURROGATE_POINT(c2)) 836 return true; 837 } 838 } 839 return false; 840} 841*/ 842 843#ifndef Z7_WCHART_IS_16BIT 844 845void Convert_UnicodeEsc16_To_UnicodeEscHigh 846#if UTF_ESCAPE_PLANE == 0 847 (UString &) {} 848#else 849 (UString &s) 850{ 851 const unsigned len = s.Len(); 852 for (unsigned i = 0; i < len; i++) 853 { 854 wchar_t c = s[i]; 855 if (IS_ESCAPE_POINT(c, 0)) 856 { 857 c += UTF_ESCAPE_PLANE; 858 s.ReplaceOneCharAtPos(i, c); 859 } 860 } 861} 862#endif 863#endif 864