1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/inspector/v8-string-conversions.h"
6
7#include <limits>
8#include <vector>
9
10#include "src/base/logging.h"
11#include "src/base/v8-fallthrough.h"
12
13namespace v8_inspector {
14namespace {
// A UTF-16 code unit and a full Unicode code point, respectively.
using UChar = uint16_t;
using UChar32 = uint32_t;

// Returns true when |c| is in the 7-bit ASCII range (0x00..0x7F).
bool isASCII(UChar c) { return c <= 0x7F; }

// U+FFFD REPLACEMENT CHARACTER, emitted in place of input that cannot be
// converted.
const UChar replacementCharacter = 0xFFFD;
21
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
// the non-ASCII lead byte |b0|, judged purely from its high-bit prefix.
// Returns 0 when |b0| does not carry a 2-, 3- or 4-byte lead prefix (for
// example a continuation byte 10xxxxxx or a 11111xxx byte).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  // Work on the unsigned value so the comparisons below do not depend on
  // whether plain char is signed.
  const unsigned lead = static_cast<unsigned char>(b0);
  if ((lead >> 5) == 0x06) return 2;  // 110xxxxx
  if ((lead >> 4) == 0x0E) return 3;  // 1110xxxx
  if ((lead >> 3) == 0x1E) return 4;  // 11110xxx
  return 0;
}
29
30inline int inlineUTF8SequenceLength(char b0) {
31  return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
32}
33
// Prefix bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed
// by the total number of bytes in that sequence. There is one entry per
// sequence length (plus an unused slot at index 0); *legal* UTF-8 never uses
// more than 4 bytes, so only indices 1..4 matter for well-formed output.
static const unsigned char firstByteMark[7] = {
    0x00,  // index 0: unused
    0x00,  // 1 byte:  0xxxxxxx
    0xC0,  // 2 bytes: 110xxxxx
    0xE0,  // 3 bytes: 1110xxxx
    0xF0,  // 4 bytes: 11110xxx
    0xF8,  // 5 bytes: 111110xx
    0xFC   // 6 bytes: 1111110x
};
41
// Outcome reported by the low-level UTF-8 <-> UTF-16 converters.
enum ConversionResult {
  // The whole source range was converted.
  conversionOK = 0,
  // The source ended in the middle of a character.
  sourceExhausted = 1,
  // The target buffer has insufficient room for the next character.
  targetExhausted = 2,
  // The source contains an illegal or malformed sequence.
  sourceIllegal = 3
};
48
49ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
50                                    const UChar* sourceEnd, char** targetStart,
51                                    char* targetEnd, bool strict) {
52  ConversionResult result = conversionOK;
53  const UChar* source = *sourceStart;
54  char* target = *targetStart;
55  while (source < sourceEnd) {
56    UChar32 ch;
57    uint32_t bytesToWrite = 0;
58    const UChar32 byteMask = 0xBF;
59    const UChar32 byteMark = 0x80;
60    const UChar* oldSource =
61        source;  // In case we have to back up because of target overflow.
62    ch = static_cast<uint16_t>(*source++);
63    // If we have a surrogate pair, convert to UChar32 first.
64    if (ch >= 0xD800 && ch <= 0xDBFF) {
65      // If the 16 bits following the high surrogate are in the source buffer...
66      if (source < sourceEnd) {
67        UChar32 ch2 = static_cast<uint16_t>(*source);
68        // If it's a low surrogate, convert to UChar32.
69        if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
70          ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
71          ++source;
72        } else if (strict) {  // it's an unpaired high surrogate
73          --source;           // return to the illegal value itself
74          result = sourceIllegal;
75          break;
76        }
77      } else {     // We don't have the 16 bits following the high surrogate.
78        --source;  // return to the high surrogate
79        result = sourceExhausted;
80        break;
81      }
82    } else if (strict) {
83      // UTF-16 surrogate values are illegal in UTF-32
84      if (ch >= 0xDC00 && ch <= 0xDFFF) {
85        --source;  // return to the illegal value itself
86        result = sourceIllegal;
87        break;
88      }
89    }
90    // Figure out how many bytes the result will require
91    if (ch < static_cast<UChar32>(0x80)) {
92      bytesToWrite = 1;
93    } else if (ch < static_cast<UChar32>(0x800)) {
94      bytesToWrite = 2;
95    } else if (ch < static_cast<UChar32>(0x10000)) {
96      bytesToWrite = 3;
97    } else if (ch < static_cast<UChar32>(0x110000)) {
98      bytesToWrite = 4;
99    } else {
100      bytesToWrite = 3;
101      ch = replacementCharacter;
102    }
103
104    target += bytesToWrite;
105    if (target > targetEnd) {
106      source = oldSource;  // Back up source pointer!
107      target -= bytesToWrite;
108      result = targetExhausted;
109      break;
110    }
111    switch (bytesToWrite) {
112      case 4:
113        *--target = static_cast<char>((ch | byteMark) & byteMask);
114        ch >>= 6;
115        V8_FALLTHROUGH;
116      case 3:
117        *--target = static_cast<char>((ch | byteMark) & byteMask);
118        ch >>= 6;
119        V8_FALLTHROUGH;
120      case 2:
121        *--target = static_cast<char>((ch | byteMark) & byteMask);
122        ch >>= 6;
123        V8_FALLTHROUGH;
124      case 1:
125        *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
126    }
127    target += bytesToWrite;
128  }
129  *sourceStart = source;
130  *targetStart = target;
131  return result;
132}
133
// Evaluates to true when the code point |c| lies in the Basic Multilingual
// Plane (U+0000..U+FFFF). |c| is converted to uint32_t before the comparison.
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xFFFF)
141
// Evaluates to true when the code point |c| is a supplementary code point
// (U+010000..U+10FFFF). The subtraction wraps for smaller values, so a single
// unsigned comparison covers both ends of the range.
#define U_IS_SUPPLEMENTARY(c) \
  (static_cast<uint32_t>((c) - 0x010000) <= 0xFFFFF)
149
// Evaluates to true when the code point |c| is a UTF-16 surrogate
// (U+D800..U+DFFF), i.e. when everything above its low 11 bits equals 0xD800.
#define U_IS_SURROGATE(c) \
  ((static_cast<uint32_t>(c) & 0xFFFFF800U) == 0xD800U)
157
// Yields the lead (high) surrogate, in 0xD800..0xDBFF, of the UTF-16 pair
// that encodes |supplementary|, which must be a supplementary code point
// (U+010000..U+10FFFF).
#define U16_LEAD(supplementary) \
  (static_cast<UChar>(((supplementary) >> 10) + 0xD7C0))
166
// Yields the trail (low) surrogate, in 0xDC00..0xDFFF, of the UTF-16 pair
// that encodes |supplementary|, which must be a supplementary code point
// (U+010000..U+10FFFF).
#define U16_TRAIL(supplementary) \
  (static_cast<UChar>(((supplementary) & 0x3FF) | 0xDC00))
175
// Reports whether the |length| bytes starting at |source| form one
// well-formed UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; all
// |length| bytes must be readable. Any length outside 1..4 yields false,
// since the Unicode definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected are: trailing bytes outside 0x80..0xBF, overlong encodings
// (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90),
// encoded surrogates (0xED followed by > 0x9F), values above U+10FFFF
// (0xF4 followed by > 0x8F, or a lead byte > 0xF4) and stray continuation
// bytes in lead position.
static bool isLegalUTF8(const unsigned char* source, int length) {
  if (length < 1 || length > 4) return false;

  const unsigned char lead = source[0];

  // The third and fourth bytes, when present, are plain continuation bytes.
  for (int i = length - 1; i >= 2; --i) {
    if (source[i] < 0x80 || source[i] > 0xBF) return false;
  }

  if (length >= 2) {
    // The second byte is a continuation byte as well, but a few lead bytes
    // narrow its valid range further. The lower bound applies to *every* lead
    // byte: without it, 0xED or 0xF4 followed by an ASCII byte would be
    // accepted and decoded into an unrelated code point.
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;
    switch (lead) {
      case 0xE0:
        lowest = 0xA0;  // below this the 3-byte form is overlong
        break;
      case 0xED:
        highest = 0x9F;  // above this lies the surrogate range
        break;
      case 0xF0:
        lowest = 0x90;  // below this the 4-byte form is overlong
        break;
      case 0xF4:
        highest = 0x8F;  // above this the value exceeds U+10FFFF
        break;
      default:
        break;
    }
    if (source[1] < lowest || source[1] > highest) return false;
  }

  // 0x80..0xBF are continuation bytes and 0xC0/0xC1 can only start overlong
  // 2-byte forms; neither may appear as the first byte.
  if (lead >= 0x80 && lead < 0xC2) return false;
  // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
  if (lead > 0xF4) return false;
  return true;
}
220
221// Magic values subtracted from a buffer value during UTF8 conversion.
222// This table contains as many values as there might be trailing bytes
223// in a UTF-8 sequence.
224static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
225                                           0x00003080UL,
226                                           0x000E2080UL,
227                                           0x03C82080UL,
228                                           static_cast<UChar32>(0xFA082080UL),
229                                           static_cast<UChar32>(0x82082080UL)};
230
231static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
232  UChar32 character = 0;
233
234  // The cases all fall through.
235  switch (length) {
236    case 6:
237      character += static_cast<unsigned char>(*sequence++);
238      character <<= 6;
239      V8_FALLTHROUGH;
240    case 5:
241      character += static_cast<unsigned char>(*sequence++);
242      character <<= 6;
243      V8_FALLTHROUGH;
244    case 4:
245      character += static_cast<unsigned char>(*sequence++);
246      character <<= 6;
247      V8_FALLTHROUGH;
248    case 3:
249      character += static_cast<unsigned char>(*sequence++);
250      character <<= 6;
251      V8_FALLTHROUGH;
252    case 2:
253      character += static_cast<unsigned char>(*sequence++);
254      character <<= 6;
255      V8_FALLTHROUGH;
256    case 1:
257      character += static_cast<unsigned char>(*sequence++);
258  }
259
260  return character - offsetsFromUTF8[length - 1];
261}
262
263ConversionResult convertUTF8ToUTF16(const char** sourceStart,
264                                    const char* sourceEnd, UChar** targetStart,
265                                    UChar* targetEnd, bool* sourceAllASCII,
266                                    bool strict) {
267  ConversionResult result = conversionOK;
268  const char* source = *sourceStart;
269  UChar* target = *targetStart;
270  UChar orAllData = 0;
271  while (source < sourceEnd) {
272    int utf8SequenceLength = inlineUTF8SequenceLength(*source);
273    if (sourceEnd - source < utf8SequenceLength) {
274      result = sourceExhausted;
275      break;
276    }
277    // Do this check whether lenient or strict
278    if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
279                     utf8SequenceLength)) {
280      result = sourceIllegal;
281      break;
282    }
283
284    UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
285
286    if (target >= targetEnd) {
287      source -= utf8SequenceLength;  // Back up source pointer!
288      result = targetExhausted;
289      break;
290    }
291
292    if (U_IS_BMP(character)) {
293      // UTF-16 surrogate values are illegal in UTF-32
294      if (U_IS_SURROGATE(character)) {
295        if (strict) {
296          source -= utf8SequenceLength;  // return to the illegal value itself
297          result = sourceIllegal;
298          break;
299        }
300        *target++ = replacementCharacter;
301        orAllData |= replacementCharacter;
302      } else {
303        *target++ = static_cast<UChar>(character);  // normal case
304        orAllData |= character;
305      }
306    } else if (U_IS_SUPPLEMENTARY(character)) {
307      // target is a character in range 0xFFFF - 0x10FFFF
308      if (target + 1 >= targetEnd) {
309        source -= utf8SequenceLength;  // Back up source pointer!
310        result = targetExhausted;
311        break;
312      }
313      *target++ = U16_LEAD(character);
314      *target++ = U16_TRAIL(character);
315      orAllData = 0xFFFF;
316    } else {
317      if (strict) {
318        source -= utf8SequenceLength;  // return to the start
319        result = sourceIllegal;
320        break;  // Bail out; shouldn't continue
321      } else {
322        *target++ = replacementCharacter;
323        orAllData |= replacementCharacter;
324      }
325    }
326  }
327  *sourceStart = source;
328  *targetStart = target;
329
330  if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7F);
331
332  return result;
333}
334
335// Helper to write a three-byte UTF-8 code point to the buffer, caller must
336// check room is available.
337static inline void putUTF8Triple(char*& buffer, UChar ch) {
338  *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
339  *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
340  *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
341}
342}  // namespace
343
344std::string UTF16ToUTF8(const UChar* stringStart, size_t length) {
345  if (!stringStart || !length) return std::string();
346
347  // Allocate a buffer big enough to hold all the characters
348  // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
349  // Optimization ideas, if we find this function is hot:
350  //  * We could speculatively create a CStringBuffer to contain 'length'
351  //    characters, and resize if necessary (i.e. if the buffer contains
352  //    non-ascii characters). (Alternatively, scan the buffer first for
353  //    ascii characters, so we know this will be sufficient).
354  //  * We could allocate a CStringBuffer with an appropriate size to
355  //    have a good chance of being able to write the string into the
356  //    buffer without reallocing (say, 1.5 x length).
357  if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
358
359  std::string output(length * 3, '\0');
360  const UChar* characters = stringStart;
361  const UChar* characters_end = characters + length;
362  char* buffer = &*output.begin();
363  char* buffer_end = &*output.end();
364  while (characters < characters_end) {
365    // Use strict conversion to detect unpaired surrogates.
366    ConversionResult result = convertUTF16ToUTF8(
367        &characters, characters_end, &buffer, buffer_end, /* strict= */ true);
368    DCHECK_NE(result, targetExhausted);
369    // Conversion fails when there is an unpaired surrogate.  Put
370    // replacement character (U+FFFD) instead of the unpaired
371    // surrogate.
372    if (result != conversionOK) {
373      DCHECK_LE(0xD800, *characters);
374      DCHECK_LE(*characters, 0xDFFF);
375      // There should be room left, since one UChar hasn't been
376      // converted.
377      DCHECK_LE(buffer + 3, buffer_end);
378      putUTF8Triple(buffer, replacementCharacter);
379      ++characters;
380    }
381  }
382
383  output.resize(buffer - output.data());
384  return output;
385}
386
387std::basic_string<UChar> UTF8ToUTF16(const char* stringStart, size_t length) {
388  if (!stringStart || !length) return std::basic_string<UChar>();
389  std::vector<uint16_t> buffer(length);
390  UChar* bufferStart = buffer.data();
391
392  UChar* bufferCurrent = bufferStart;
393  const char* stringCurrent = reinterpret_cast<const char*>(stringStart);
394  if (convertUTF8ToUTF16(&stringCurrent,
395                         reinterpret_cast<const char*>(stringStart + length),
396                         &bufferCurrent, bufferCurrent + buffer.size(), nullptr,
397                         true) != conversionOK)
398    return std::basic_string<uint16_t>();
399  size_t utf16Length = bufferCurrent - bufferStart;
400  return std::basic_string<UChar>(bufferStart, bufferStart + utf16Length);
401}
402
403}  // namespace v8_inspector
404