11cb0ef41Sopenharmony_ci// Copyright 2019 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#include "src/inspector/v8-string-conversions.h" 61cb0ef41Sopenharmony_ci 71cb0ef41Sopenharmony_ci#include <limits> 81cb0ef41Sopenharmony_ci#include <vector> 91cb0ef41Sopenharmony_ci 101cb0ef41Sopenharmony_ci#include "src/base/logging.h" 111cb0ef41Sopenharmony_ci#include "src/base/v8-fallthrough.h" 121cb0ef41Sopenharmony_ci 131cb0ef41Sopenharmony_cinamespace v8_inspector { 141cb0ef41Sopenharmony_cinamespace { 151cb0ef41Sopenharmony_ciusing UChar = uint16_t; 161cb0ef41Sopenharmony_ciusing UChar32 = uint32_t; 171cb0ef41Sopenharmony_ci 181cb0ef41Sopenharmony_cibool isASCII(UChar c) { return !(c & ~0x7F); } 191cb0ef41Sopenharmony_ci 201cb0ef41Sopenharmony_ciconst UChar replacementCharacter = 0xFFFD; 211cb0ef41Sopenharmony_ci 221cb0ef41Sopenharmony_ciinline int inlineUTF8SequenceLengthNonASCII(char b0) { 231cb0ef41Sopenharmony_ci if ((b0 & 0xC0) != 0xC0) return 0; 241cb0ef41Sopenharmony_ci if ((b0 & 0xE0) == 0xC0) return 2; 251cb0ef41Sopenharmony_ci if ((b0 & 0xF0) == 0xE0) return 3; 261cb0ef41Sopenharmony_ci if ((b0 & 0xF8) == 0xF0) return 4; 271cb0ef41Sopenharmony_ci return 0; 281cb0ef41Sopenharmony_ci} 291cb0ef41Sopenharmony_ci 301cb0ef41Sopenharmony_ciinline int inlineUTF8SequenceLength(char b0) { 311cb0ef41Sopenharmony_ci return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 321cb0ef41Sopenharmony_ci} 331cb0ef41Sopenharmony_ci 341cb0ef41Sopenharmony_ci// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 351cb0ef41Sopenharmony_ci// into the first byte, depending on how many bytes follow. There are 361cb0ef41Sopenharmony_ci// as many entries in this table as there are UTF-8 sequence types. 371cb0ef41Sopenharmony_ci// (I.e., one byte sequence, two byte... etc.). Remember that sequences 381cb0ef41Sopenharmony_ci// for *legal* UTF-8 will be 4 or fewer bytes total. 391cb0ef41Sopenharmony_cistatic const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 401cb0ef41Sopenharmony_ci 0xF0, 0xF8, 0xFC}; 411cb0ef41Sopenharmony_ci 421cb0ef41Sopenharmony_cienum ConversionResult { 431cb0ef41Sopenharmony_ci conversionOK, // conversion successful 441cb0ef41Sopenharmony_ci sourceExhausted, // partial character in source, but hit end 451cb0ef41Sopenharmony_ci targetExhausted, // insuff. room in target for conversion 461cb0ef41Sopenharmony_ci sourceIllegal // source sequence is illegal/malformed 471cb0ef41Sopenharmony_ci}; 481cb0ef41Sopenharmony_ci 491cb0ef41Sopenharmony_ciConversionResult convertUTF16ToUTF8(const UChar** sourceStart, 501cb0ef41Sopenharmony_ci const UChar* sourceEnd, char** targetStart, 511cb0ef41Sopenharmony_ci char* targetEnd, bool strict) { 521cb0ef41Sopenharmony_ci ConversionResult result = conversionOK; 531cb0ef41Sopenharmony_ci const UChar* source = *sourceStart; 541cb0ef41Sopenharmony_ci char* target = *targetStart; 551cb0ef41Sopenharmony_ci while (source < sourceEnd) { 561cb0ef41Sopenharmony_ci UChar32 ch; 571cb0ef41Sopenharmony_ci uint32_t bytesToWrite = 0; 581cb0ef41Sopenharmony_ci const UChar32 byteMask = 0xBF; 591cb0ef41Sopenharmony_ci const UChar32 byteMark = 0x80; 601cb0ef41Sopenharmony_ci const UChar* oldSource = 611cb0ef41Sopenharmony_ci source; // In case we have to back up because of target overflow. 621cb0ef41Sopenharmony_ci ch = static_cast<uint16_t>(*source++); 631cb0ef41Sopenharmony_ci // If we have a surrogate pair, convert to UChar32 first. 641cb0ef41Sopenharmony_ci if (ch >= 0xD800 && ch <= 0xDBFF) { 651cb0ef41Sopenharmony_ci // If the 16 bits following the high surrogate are in the source buffer... 661cb0ef41Sopenharmony_ci if (source < sourceEnd) { 671cb0ef41Sopenharmony_ci UChar32 ch2 = static_cast<uint16_t>(*source); 681cb0ef41Sopenharmony_ci // If it's a low surrogate, convert to UChar32. 691cb0ef41Sopenharmony_ci if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 701cb0ef41Sopenharmony_ci ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; 711cb0ef41Sopenharmony_ci ++source; 721cb0ef41Sopenharmony_ci } else if (strict) { // it's an unpaired high surrogate 731cb0ef41Sopenharmony_ci --source; // return to the illegal value itself 741cb0ef41Sopenharmony_ci result = sourceIllegal; 751cb0ef41Sopenharmony_ci break; 761cb0ef41Sopenharmony_ci } 771cb0ef41Sopenharmony_ci } else { // We don't have the 16 bits following the high surrogate. 781cb0ef41Sopenharmony_ci --source; // return to the high surrogate 791cb0ef41Sopenharmony_ci result = sourceExhausted; 801cb0ef41Sopenharmony_ci break; 811cb0ef41Sopenharmony_ci } 821cb0ef41Sopenharmony_ci } else if (strict) { 831cb0ef41Sopenharmony_ci // UTF-16 surrogate values are illegal in UTF-32 841cb0ef41Sopenharmony_ci if (ch >= 0xDC00 && ch <= 0xDFFF) { 851cb0ef41Sopenharmony_ci --source; // return to the illegal value itself 861cb0ef41Sopenharmony_ci result = sourceIllegal; 871cb0ef41Sopenharmony_ci break; 881cb0ef41Sopenharmony_ci } 891cb0ef41Sopenharmony_ci } 901cb0ef41Sopenharmony_ci // Figure out how many bytes the result will require 911cb0ef41Sopenharmony_ci if (ch < static_cast<UChar32>(0x80)) { 921cb0ef41Sopenharmony_ci bytesToWrite = 1; 931cb0ef41Sopenharmony_ci } else if (ch < static_cast<UChar32>(0x800)) { 941cb0ef41Sopenharmony_ci bytesToWrite = 2; 951cb0ef41Sopenharmony_ci } else if (ch < static_cast<UChar32>(0x10000)) { 961cb0ef41Sopenharmony_ci bytesToWrite = 3; 971cb0ef41Sopenharmony_ci } else if (ch < static_cast<UChar32>(0x110000)) { 981cb0ef41Sopenharmony_ci bytesToWrite = 4; 991cb0ef41Sopenharmony_ci } else { 1001cb0ef41Sopenharmony_ci bytesToWrite = 3; 1011cb0ef41Sopenharmony_ci ch = replacementCharacter; 1021cb0ef41Sopenharmony_ci } 1031cb0ef41Sopenharmony_ci 1041cb0ef41Sopenharmony_ci target += bytesToWrite; 1051cb0ef41Sopenharmony_ci if (target > targetEnd) { 1061cb0ef41Sopenharmony_ci source = oldSource; // Back up source pointer! 1071cb0ef41Sopenharmony_ci target -= bytesToWrite; 1081cb0ef41Sopenharmony_ci result = targetExhausted; 1091cb0ef41Sopenharmony_ci break; 1101cb0ef41Sopenharmony_ci } 1111cb0ef41Sopenharmony_ci switch (bytesToWrite) { 1121cb0ef41Sopenharmony_ci case 4: 1131cb0ef41Sopenharmony_ci *--target = static_cast<char>((ch | byteMark) & byteMask); 1141cb0ef41Sopenharmony_ci ch >>= 6; 1151cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 1161cb0ef41Sopenharmony_ci case 3: 1171cb0ef41Sopenharmony_ci *--target = static_cast<char>((ch | byteMark) & byteMask); 1181cb0ef41Sopenharmony_ci ch >>= 6; 1191cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 1201cb0ef41Sopenharmony_ci case 2: 1211cb0ef41Sopenharmony_ci *--target = static_cast<char>((ch | byteMark) & byteMask); 1221cb0ef41Sopenharmony_ci ch >>= 6; 1231cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 1241cb0ef41Sopenharmony_ci case 1: 1251cb0ef41Sopenharmony_ci *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]); 1261cb0ef41Sopenharmony_ci } 1271cb0ef41Sopenharmony_ci target += bytesToWrite; 1281cb0ef41Sopenharmony_ci } 1291cb0ef41Sopenharmony_ci *sourceStart = source; 1301cb0ef41Sopenharmony_ci *targetStart = target; 1311cb0ef41Sopenharmony_ci return result; 1321cb0ef41Sopenharmony_ci} 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_ci/** 1351cb0ef41Sopenharmony_ci * Is this code point a BMP code point (U+0000..U+ffff)? 1361cb0ef41Sopenharmony_ci * @param c 32-bit code point 1371cb0ef41Sopenharmony_ci * @return TRUE or FALSE 1381cb0ef41Sopenharmony_ci * @stable ICU 2.8 1391cb0ef41Sopenharmony_ci */ 1401cb0ef41Sopenharmony_ci#define U_IS_BMP(c) ((uint32_t)(c) <= 0xFFFF) 1411cb0ef41Sopenharmony_ci 1421cb0ef41Sopenharmony_ci/** 1431cb0ef41Sopenharmony_ci * Is this code point a supplementary code point (U+010000..U+10FFFF)? 1441cb0ef41Sopenharmony_ci * @param c 32-bit code point 1451cb0ef41Sopenharmony_ci * @return TRUE or FALSE 1461cb0ef41Sopenharmony_ci * @stable ICU 2.8 1471cb0ef41Sopenharmony_ci */ 1481cb0ef41Sopenharmony_ci#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x010000) <= 0xFFFFF) 1491cb0ef41Sopenharmony_ci 1501cb0ef41Sopenharmony_ci/** 1511cb0ef41Sopenharmony_ci * Is this code point a surrogate (U+d800..U+dfff)? 1521cb0ef41Sopenharmony_ci * @param c 32-bit code point 1531cb0ef41Sopenharmony_ci * @return TRUE or FALSE 1541cb0ef41Sopenharmony_ci * @stable ICU 2.4 1551cb0ef41Sopenharmony_ci */ 1561cb0ef41Sopenharmony_ci#define U_IS_SURROGATE(c) (((c)&0xFFFFF800) == 0xD800) 1571cb0ef41Sopenharmony_ci 1581cb0ef41Sopenharmony_ci/** 1591cb0ef41Sopenharmony_ci * Get the lead surrogate (0xD800..0xDBFF) for a 1601cb0ef41Sopenharmony_ci * supplementary code point (0x010000..0x10FFFF). 1611cb0ef41Sopenharmony_ci * @param supplementary 32-bit code point (U+010000..U+10FFFF) 1621cb0ef41Sopenharmony_ci * @return lead surrogate (U+D800..U+DBFF) for supplementary 1631cb0ef41Sopenharmony_ci * @stable ICU 2.4 1641cb0ef41Sopenharmony_ci */ 1651cb0ef41Sopenharmony_ci#define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xD7C0) 1661cb0ef41Sopenharmony_ci 1671cb0ef41Sopenharmony_ci/** 1681cb0ef41Sopenharmony_ci * Get the trail surrogate (0xDC00..0xDFFF) for a 1691cb0ef41Sopenharmony_ci * supplementary code point (0x010000..0x10FFFF). 1701cb0ef41Sopenharmony_ci * @param supplementary 32-bit code point (U+010000..U+10FFFF) 1711cb0ef41Sopenharmony_ci * @return trail surrogate (U+DC00..U+DFFF) for supplementary 1721cb0ef41Sopenharmony_ci * @stable ICU 2.4 1731cb0ef41Sopenharmony_ci */ 1741cb0ef41Sopenharmony_ci#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3FF) | 0xDC00) 1751cb0ef41Sopenharmony_ci 1761cb0ef41Sopenharmony_ci// This must be called with the length pre-determined by the first byte. 1771cb0ef41Sopenharmony_ci// If presented with a length > 4, this returns false. The Unicode 1781cb0ef41Sopenharmony_ci// definition of UTF-8 goes up to 4-byte sequences. 1791cb0ef41Sopenharmony_cistatic bool isLegalUTF8(const unsigned char* source, int length) { 1801cb0ef41Sopenharmony_ci unsigned char a; 1811cb0ef41Sopenharmony_ci const unsigned char* srcptr = source + length; 1821cb0ef41Sopenharmony_ci switch (length) { 1831cb0ef41Sopenharmony_ci default: 1841cb0ef41Sopenharmony_ci return false; 1851cb0ef41Sopenharmony_ci // Everything else falls through when "true"... 1861cb0ef41Sopenharmony_ci case 4: 1871cb0ef41Sopenharmony_ci if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 1881cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 1891cb0ef41Sopenharmony_ci case 3: 1901cb0ef41Sopenharmony_ci if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 1911cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 1921cb0ef41Sopenharmony_ci case 2: 1931cb0ef41Sopenharmony_ci if ((a = (*--srcptr)) > 0xBF) return false; 1941cb0ef41Sopenharmony_ci 1951cb0ef41Sopenharmony_ci // no fall-through in this inner switch 1961cb0ef41Sopenharmony_ci switch (*source) { 1971cb0ef41Sopenharmony_ci case 0xE0: 1981cb0ef41Sopenharmony_ci if (a < 0xA0) return false; 1991cb0ef41Sopenharmony_ci break; 2001cb0ef41Sopenharmony_ci case 0xED: 2011cb0ef41Sopenharmony_ci if (a > 0x9F) return false; 2021cb0ef41Sopenharmony_ci break; 2031cb0ef41Sopenharmony_ci case 0xF0: 2041cb0ef41Sopenharmony_ci if (a < 0x90) return false; 2051cb0ef41Sopenharmony_ci break; 2061cb0ef41Sopenharmony_ci case 0xF4: 2071cb0ef41Sopenharmony_ci if (a > 0x8F) return false; 2081cb0ef41Sopenharmony_ci break; 2091cb0ef41Sopenharmony_ci default: 2101cb0ef41Sopenharmony_ci if (a < 0x80) return false; 2111cb0ef41Sopenharmony_ci } 2121cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 2131cb0ef41Sopenharmony_ci 2141cb0ef41Sopenharmony_ci case 1: 2151cb0ef41Sopenharmony_ci if (*source >= 0x80 && *source < 0xC2) return false; 2161cb0ef41Sopenharmony_ci } 2171cb0ef41Sopenharmony_ci if (*source > 0xF4) return false; 2181cb0ef41Sopenharmony_ci return true; 2191cb0ef41Sopenharmony_ci} 2201cb0ef41Sopenharmony_ci 2211cb0ef41Sopenharmony_ci// Magic values subtracted from a buffer value during UTF8 conversion. 2221cb0ef41Sopenharmony_ci// This table contains as many values as there might be trailing bytes 2231cb0ef41Sopenharmony_ci// in a UTF-8 sequence. 2241cb0ef41Sopenharmony_cistatic const UChar32 offsetsFromUTF8[6] = {0x00000000UL, 2251cb0ef41Sopenharmony_ci 0x00003080UL, 2261cb0ef41Sopenharmony_ci 0x000E2080UL, 2271cb0ef41Sopenharmony_ci 0x03C82080UL, 2281cb0ef41Sopenharmony_ci static_cast<UChar32>(0xFA082080UL), 2291cb0ef41Sopenharmony_ci static_cast<UChar32>(0x82082080UL)}; 2301cb0ef41Sopenharmony_ci 2311cb0ef41Sopenharmony_cistatic inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) { 2321cb0ef41Sopenharmony_ci UChar32 character = 0; 2331cb0ef41Sopenharmony_ci 2341cb0ef41Sopenharmony_ci // The cases all fall through. 2351cb0ef41Sopenharmony_ci switch (length) { 2361cb0ef41Sopenharmony_ci case 6: 2371cb0ef41Sopenharmony_ci character += static_cast<unsigned char>(*sequence++); 2381cb0ef41Sopenharmony_ci character <<= 6; 2391cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 2401cb0ef41Sopenharmony_ci case 5: 2411cb0ef41Sopenharmony_ci character += static_cast<unsigned char>(*sequence++); 2421cb0ef41Sopenharmony_ci character <<= 6; 2431cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 2441cb0ef41Sopenharmony_ci case 4: 2451cb0ef41Sopenharmony_ci character += static_cast<unsigned char>(*sequence++); 2461cb0ef41Sopenharmony_ci character <<= 6; 2471cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 2481cb0ef41Sopenharmony_ci case 3: 2491cb0ef41Sopenharmony_ci character += static_cast<unsigned char>(*sequence++); 2501cb0ef41Sopenharmony_ci character <<= 6; 2511cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 2521cb0ef41Sopenharmony_ci case 2: 2531cb0ef41Sopenharmony_ci character += static_cast<unsigned char>(*sequence++); 2541cb0ef41Sopenharmony_ci character <<= 6; 2551cb0ef41Sopenharmony_ci V8_FALLTHROUGH; 2561cb0ef41Sopenharmony_ci case 1: 2571cb0ef41Sopenharmony_ci character += static_cast<unsigned char>(*sequence++); 2581cb0ef41Sopenharmony_ci } 2591cb0ef41Sopenharmony_ci 2601cb0ef41Sopenharmony_ci return character - offsetsFromUTF8[length - 1]; 2611cb0ef41Sopenharmony_ci} 2621cb0ef41Sopenharmony_ci 2631cb0ef41Sopenharmony_ciConversionResult convertUTF8ToUTF16(const char** sourceStart, 2641cb0ef41Sopenharmony_ci const char* sourceEnd, UChar** targetStart, 2651cb0ef41Sopenharmony_ci UChar* targetEnd, bool* sourceAllASCII, 2661cb0ef41Sopenharmony_ci bool strict) { 2671cb0ef41Sopenharmony_ci ConversionResult result = conversionOK; 2681cb0ef41Sopenharmony_ci const char* source = *sourceStart; 2691cb0ef41Sopenharmony_ci UChar* target = *targetStart; 2701cb0ef41Sopenharmony_ci UChar orAllData = 0; 2711cb0ef41Sopenharmony_ci while (source < sourceEnd) { 2721cb0ef41Sopenharmony_ci int utf8SequenceLength = inlineUTF8SequenceLength(*source); 2731cb0ef41Sopenharmony_ci if (sourceEnd - source < utf8SequenceLength) { 2741cb0ef41Sopenharmony_ci result = sourceExhausted; 2751cb0ef41Sopenharmony_ci break; 2761cb0ef41Sopenharmony_ci } 2771cb0ef41Sopenharmony_ci // Do this check whether lenient or strict 2781cb0ef41Sopenharmony_ci if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), 2791cb0ef41Sopenharmony_ci utf8SequenceLength)) { 2801cb0ef41Sopenharmony_ci result = sourceIllegal; 2811cb0ef41Sopenharmony_ci break; 2821cb0ef41Sopenharmony_ci } 2831cb0ef41Sopenharmony_ci 2841cb0ef41Sopenharmony_ci UChar32 character = readUTF8Sequence(source, utf8SequenceLength); 2851cb0ef41Sopenharmony_ci 2861cb0ef41Sopenharmony_ci if (target >= targetEnd) { 2871cb0ef41Sopenharmony_ci source -= utf8SequenceLength; // Back up source pointer! 2881cb0ef41Sopenharmony_ci result = targetExhausted; 2891cb0ef41Sopenharmony_ci break; 2901cb0ef41Sopenharmony_ci } 2911cb0ef41Sopenharmony_ci 2921cb0ef41Sopenharmony_ci if (U_IS_BMP(character)) { 2931cb0ef41Sopenharmony_ci // UTF-16 surrogate values are illegal in UTF-32 2941cb0ef41Sopenharmony_ci if (U_IS_SURROGATE(character)) { 2951cb0ef41Sopenharmony_ci if (strict) { 2961cb0ef41Sopenharmony_ci source -= utf8SequenceLength; // return to the illegal value itself 2971cb0ef41Sopenharmony_ci result = sourceIllegal; 2981cb0ef41Sopenharmony_ci break; 2991cb0ef41Sopenharmony_ci } 3001cb0ef41Sopenharmony_ci *target++ = replacementCharacter; 3011cb0ef41Sopenharmony_ci orAllData |= replacementCharacter; 3021cb0ef41Sopenharmony_ci } else { 3031cb0ef41Sopenharmony_ci *target++ = static_cast<UChar>(character); // normal case 3041cb0ef41Sopenharmony_ci orAllData |= character; 3051cb0ef41Sopenharmony_ci } 3061cb0ef41Sopenharmony_ci } else if (U_IS_SUPPLEMENTARY(character)) { 3071cb0ef41Sopenharmony_ci // target is a character in range 0xFFFF - 0x10FFFF 3081cb0ef41Sopenharmony_ci if (target + 1 >= targetEnd) { 3091cb0ef41Sopenharmony_ci source -= utf8SequenceLength; // Back up source pointer! 3101cb0ef41Sopenharmony_ci result = targetExhausted; 3111cb0ef41Sopenharmony_ci break; 3121cb0ef41Sopenharmony_ci } 3131cb0ef41Sopenharmony_ci *target++ = U16_LEAD(character); 3141cb0ef41Sopenharmony_ci *target++ = U16_TRAIL(character); 3151cb0ef41Sopenharmony_ci orAllData = 0xFFFF; 3161cb0ef41Sopenharmony_ci } else { 3171cb0ef41Sopenharmony_ci if (strict) { 3181cb0ef41Sopenharmony_ci source -= utf8SequenceLength; // return to the start 3191cb0ef41Sopenharmony_ci result = sourceIllegal; 3201cb0ef41Sopenharmony_ci break; // Bail out; shouldn't continue 3211cb0ef41Sopenharmony_ci } else { 3221cb0ef41Sopenharmony_ci *target++ = replacementCharacter; 3231cb0ef41Sopenharmony_ci orAllData |= replacementCharacter; 3241cb0ef41Sopenharmony_ci } 3251cb0ef41Sopenharmony_ci } 3261cb0ef41Sopenharmony_ci } 3271cb0ef41Sopenharmony_ci *sourceStart = source; 3281cb0ef41Sopenharmony_ci *targetStart = target; 3291cb0ef41Sopenharmony_ci 3301cb0ef41Sopenharmony_ci if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7F); 3311cb0ef41Sopenharmony_ci 3321cb0ef41Sopenharmony_ci return result; 3331cb0ef41Sopenharmony_ci} 3341cb0ef41Sopenharmony_ci 3351cb0ef41Sopenharmony_ci// Helper to write a three-byte UTF-8 code point to the buffer, caller must 3361cb0ef41Sopenharmony_ci// check room is available. 3371cb0ef41Sopenharmony_cistatic inline void putUTF8Triple(char*& buffer, UChar ch) { 3381cb0ef41Sopenharmony_ci *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 3391cb0ef41Sopenharmony_ci *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 3401cb0ef41Sopenharmony_ci *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 3411cb0ef41Sopenharmony_ci} 3421cb0ef41Sopenharmony_ci} // namespace 3431cb0ef41Sopenharmony_ci 3441cb0ef41Sopenharmony_cistd::string UTF16ToUTF8(const UChar* stringStart, size_t length) { 3451cb0ef41Sopenharmony_ci if (!stringStart || !length) return std::string(); 3461cb0ef41Sopenharmony_ci 3471cb0ef41Sopenharmony_ci // Allocate a buffer big enough to hold all the characters 3481cb0ef41Sopenharmony_ci // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 3491cb0ef41Sopenharmony_ci // Optimization ideas, if we find this function is hot: 3501cb0ef41Sopenharmony_ci // * We could speculatively create a CStringBuffer to contain 'length' 3511cb0ef41Sopenharmony_ci // characters, and resize if necessary (i.e. if the buffer contains 3521cb0ef41Sopenharmony_ci // non-ascii characters). (Alternatively, scan the buffer first for 3531cb0ef41Sopenharmony_ci // ascii characters, so we know this will be sufficient). 3541cb0ef41Sopenharmony_ci // * We could allocate a CStringBuffer with an appropriate size to 3551cb0ef41Sopenharmony_ci // have a good chance of being able to write the string into the 3561cb0ef41Sopenharmony_ci // buffer without reallocing (say, 1.5 x length). 3571cb0ef41Sopenharmony_ci if (length > std::numeric_limits<unsigned>::max() / 3) return std::string(); 3581cb0ef41Sopenharmony_ci 3591cb0ef41Sopenharmony_ci std::string output(length * 3, '\0'); 3601cb0ef41Sopenharmony_ci const UChar* characters = stringStart; 3611cb0ef41Sopenharmony_ci const UChar* characters_end = characters + length; 3621cb0ef41Sopenharmony_ci char* buffer = &*output.begin(); 3631cb0ef41Sopenharmony_ci char* buffer_end = &*output.end(); 3641cb0ef41Sopenharmony_ci while (characters < characters_end) { 3651cb0ef41Sopenharmony_ci // Use strict conversion to detect unpaired surrogates. 3661cb0ef41Sopenharmony_ci ConversionResult result = convertUTF16ToUTF8( 3671cb0ef41Sopenharmony_ci &characters, characters_end, &buffer, buffer_end, /* strict= */ true); 3681cb0ef41Sopenharmony_ci DCHECK_NE(result, targetExhausted); 3691cb0ef41Sopenharmony_ci // Conversion fails when there is an unpaired surrogate. Put 3701cb0ef41Sopenharmony_ci // replacement character (U+FFFD) instead of the unpaired 3711cb0ef41Sopenharmony_ci // surrogate. 3721cb0ef41Sopenharmony_ci if (result != conversionOK) { 3731cb0ef41Sopenharmony_ci DCHECK_LE(0xD800, *characters); 3741cb0ef41Sopenharmony_ci DCHECK_LE(*characters, 0xDFFF); 3751cb0ef41Sopenharmony_ci // There should be room left, since one UChar hasn't been 3761cb0ef41Sopenharmony_ci // converted. 3771cb0ef41Sopenharmony_ci DCHECK_LE(buffer + 3, buffer_end); 3781cb0ef41Sopenharmony_ci putUTF8Triple(buffer, replacementCharacter); 3791cb0ef41Sopenharmony_ci ++characters; 3801cb0ef41Sopenharmony_ci } 3811cb0ef41Sopenharmony_ci } 3821cb0ef41Sopenharmony_ci 3831cb0ef41Sopenharmony_ci output.resize(buffer - output.data()); 3841cb0ef41Sopenharmony_ci return output; 3851cb0ef41Sopenharmony_ci} 3861cb0ef41Sopenharmony_ci 3871cb0ef41Sopenharmony_cistd::basic_string<UChar> UTF8ToUTF16(const char* stringStart, size_t length) { 3881cb0ef41Sopenharmony_ci if (!stringStart || !length) return std::basic_string<UChar>(); 3891cb0ef41Sopenharmony_ci std::vector<uint16_t> buffer(length); 3901cb0ef41Sopenharmony_ci UChar* bufferStart = buffer.data(); 3911cb0ef41Sopenharmony_ci 3921cb0ef41Sopenharmony_ci UChar* bufferCurrent = bufferStart; 3931cb0ef41Sopenharmony_ci const char* stringCurrent = reinterpret_cast<const char*>(stringStart); 3941cb0ef41Sopenharmony_ci if (convertUTF8ToUTF16(&stringCurrent, 3951cb0ef41Sopenharmony_ci reinterpret_cast<const char*>(stringStart + length), 3961cb0ef41Sopenharmony_ci &bufferCurrent, bufferCurrent + buffer.size(), nullptr, 3971cb0ef41Sopenharmony_ci true) != conversionOK) 3981cb0ef41Sopenharmony_ci return std::basic_string<uint16_t>(); 3991cb0ef41Sopenharmony_ci size_t utf16Length = bufferCurrent - bufferStart; 4001cb0ef41Sopenharmony_ci return std::basic_string<UChar>(bufferStart, bufferStart + utf16Length); 4011cb0ef41Sopenharmony_ci} 4021cb0ef41Sopenharmony_ci 4031cb0ef41Sopenharmony_ci} // namespace v8_inspector 404