1// Copyright 2014 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/strings/unicode-decoder.h" 6 7#include "src/strings/unicode-inl.h" 8#include "src/utils/memcopy.h" 9 10namespace v8 { 11namespace internal { 12 13Utf8Decoder::Utf8Decoder(const base::Vector<const uint8_t>& chars) 14 : encoding_(Encoding::kAscii), 15 non_ascii_start_(NonAsciiStart(chars.begin(), chars.length())), 16 utf16_length_(non_ascii_start_) { 17 if (non_ascii_start_ == chars.length()) return; 18 19 const uint8_t* cursor = chars.begin() + non_ascii_start_; 20 const uint8_t* end = chars.begin() + chars.length(); 21 22 bool is_one_byte = true; 23 uint32_t incomplete_char = 0; 24 unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; 25 26 while (cursor < end) { 27 unibrow::uchar t = 28 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); 29 if (t != unibrow::Utf8::kIncomplete) { 30 is_one_byte = is_one_byte && t <= unibrow::Latin1::kMaxChar; 31 utf16_length_++; 32 if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) utf16_length_++; 33 } 34 } 35 36 unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); 37 if (t != unibrow::Utf8::kBufferEmpty) { 38 is_one_byte = false; 39 utf16_length_++; 40 } 41 42 encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16; 43} 44 45template <typename Char> 46void Utf8Decoder::Decode(Char* out, const base::Vector<const uint8_t>& data) { 47 CopyChars(out, data.begin(), non_ascii_start_); 48 49 out += non_ascii_start_; 50 51 uint32_t incomplete_char = 0; 52 unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; 53 54 const uint8_t* cursor = data.begin() + non_ascii_start_; 55 const uint8_t* end = data.begin() + data.length(); 56 57 while (cursor < end) { 58 unibrow::uchar t = 59 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char); 60 if (t != unibrow::Utf8::kIncomplete) { 61 if (sizeof(Char) == 1 || t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 62 *(out++) = static_cast<Char>(t); 63 } else { 64 *(out++) = unibrow::Utf16::LeadSurrogate(t); 65 *(out++) = unibrow::Utf16::TrailSurrogate(t); 66 } 67 } 68 } 69 70 unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); 71 if (t != unibrow::Utf8::kBufferEmpty) *out = static_cast<Char>(t); 72} 73 74template V8_EXPORT_PRIVATE void Utf8Decoder::Decode( 75 uint8_t* out, const base::Vector<const uint8_t>& data); 76 77template V8_EXPORT_PRIVATE void Utf8Decoder::Decode( 78 uint16_t* out, const base::Vector<const uint8_t>& data); 79 80} // namespace internal 81} // namespace v8 82