11cb0ef41Sopenharmony_ci// Copyright 2007-2010 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#ifndef V8_STRINGS_UNICODE_INL_H_ 61cb0ef41Sopenharmony_ci#define V8_STRINGS_UNICODE_INL_H_ 71cb0ef41Sopenharmony_ci 81cb0ef41Sopenharmony_ci#include "src/base/logging.h" 91cb0ef41Sopenharmony_ci#include "src/strings/unicode.h" 101cb0ef41Sopenharmony_ci#include "src/utils/utils.h" 111cb0ef41Sopenharmony_ci 121cb0ef41Sopenharmony_cinamespace unibrow { 131cb0ef41Sopenharmony_ci 141cb0ef41Sopenharmony_ci#ifndef V8_INTL_SUPPORT 151cb0ef41Sopenharmony_citemplate <class T, int s> 161cb0ef41Sopenharmony_cibool Predicate<T, s>::get(uchar code_point) { 171cb0ef41Sopenharmony_ci CacheEntry entry = entries_[code_point & kMask]; 181cb0ef41Sopenharmony_ci if (entry.code_point() == code_point) return entry.value(); 191cb0ef41Sopenharmony_ci return CalculateValue(code_point); 201cb0ef41Sopenharmony_ci} 211cb0ef41Sopenharmony_ci 221cb0ef41Sopenharmony_citemplate <class T, int s> 231cb0ef41Sopenharmony_cibool Predicate<T, s>::CalculateValue(uchar code_point) { 241cb0ef41Sopenharmony_ci bool result = T::Is(code_point); 251cb0ef41Sopenharmony_ci entries_[code_point & kMask] = CacheEntry(code_point, result); 261cb0ef41Sopenharmony_ci return result; 271cb0ef41Sopenharmony_ci} 281cb0ef41Sopenharmony_ci 291cb0ef41Sopenharmony_citemplate <class T, int s> 301cb0ef41Sopenharmony_ciint Mapping<T, s>::get(uchar c, uchar n, uchar* result) { 311cb0ef41Sopenharmony_ci CacheEntry entry = entries_[c & kMask]; 321cb0ef41Sopenharmony_ci if (entry.code_point_ == c) { 331cb0ef41Sopenharmony_ci if (entry.offset_ == 0) { 341cb0ef41Sopenharmony_ci return 0; 351cb0ef41Sopenharmony_ci } else { 361cb0ef41Sopenharmony_ci result[0] = c + entry.offset_; 371cb0ef41Sopenharmony_ci return 1; 381cb0ef41Sopenharmony_ci } 391cb0ef41Sopenharmony_ci } else { 401cb0ef41Sopenharmony_ci return CalculateValue(c, n, result); 411cb0ef41Sopenharmony_ci } 421cb0ef41Sopenharmony_ci} 431cb0ef41Sopenharmony_ci 441cb0ef41Sopenharmony_citemplate <class T, int s> 451cb0ef41Sopenharmony_ciint Mapping<T, s>::CalculateValue(uchar c, uchar n, uchar* result) { 461cb0ef41Sopenharmony_ci bool allow_caching = true; 471cb0ef41Sopenharmony_ci int length = T::Convert(c, n, result, &allow_caching); 481cb0ef41Sopenharmony_ci if (allow_caching) { 491cb0ef41Sopenharmony_ci if (length == 1) { 501cb0ef41Sopenharmony_ci entries_[c & kMask] = CacheEntry(c, result[0] - c); 511cb0ef41Sopenharmony_ci return 1; 521cb0ef41Sopenharmony_ci } else { 531cb0ef41Sopenharmony_ci entries_[c & kMask] = CacheEntry(c, 0); 541cb0ef41Sopenharmony_ci return 0; 551cb0ef41Sopenharmony_ci } 561cb0ef41Sopenharmony_ci } else { 571cb0ef41Sopenharmony_ci return length; 581cb0ef41Sopenharmony_ci } 591cb0ef41Sopenharmony_ci} 601cb0ef41Sopenharmony_ci#endif // !V8_INTL_SUPPORT 611cb0ef41Sopenharmony_ci 621cb0ef41Sopenharmony_cibool Utf16::HasUnpairedSurrogate(const uint16_t* code_units, size_t length) { 631cb0ef41Sopenharmony_ci for (size_t i = 0; i < length; ++i) { 641cb0ef41Sopenharmony_ci const int code_unit = code_units[i]; 651cb0ef41Sopenharmony_ci if (IsLeadSurrogate(code_unit)) { 661cb0ef41Sopenharmony_ci // The current code unit is a leading surrogate. Check if it is followed 671cb0ef41Sopenharmony_ci // by a trailing surrogate. 681cb0ef41Sopenharmony_ci if (i == length - 1) return true; 691cb0ef41Sopenharmony_ci if (!IsTrailSurrogate(code_units[i + 1])) return true; 701cb0ef41Sopenharmony_ci // Skip the paired trailing surrogate. 711cb0ef41Sopenharmony_ci ++i; 721cb0ef41Sopenharmony_ci } else if (IsTrailSurrogate(code_unit)) { 731cb0ef41Sopenharmony_ci // All paired trailing surrogates are skipped above, so this branch is 741cb0ef41Sopenharmony_ci // only for those that are unpaired. 751cb0ef41Sopenharmony_ci return true; 761cb0ef41Sopenharmony_ci } 771cb0ef41Sopenharmony_ci } 781cb0ef41Sopenharmony_ci return false; 791cb0ef41Sopenharmony_ci} 801cb0ef41Sopenharmony_ci 811cb0ef41Sopenharmony_ci// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they 821cb0ef41Sopenharmony_ci// stream in. This **must** be followed by a call to ValueOfIncrementalFinish 831cb0ef41Sopenharmony_ci// when the stream is complete, to ensure incomplete sequences are handled. 841cb0ef41Sopenharmony_ciuchar Utf8::ValueOfIncremental(const byte** cursor, State* state, 851cb0ef41Sopenharmony_ci Utf8IncrementalBuffer* buffer) { 861cb0ef41Sopenharmony_ci DCHECK_NOT_NULL(buffer); 871cb0ef41Sopenharmony_ci State old_state = *state; 881cb0ef41Sopenharmony_ci byte next = **cursor; 891cb0ef41Sopenharmony_ci *cursor += 1; 901cb0ef41Sopenharmony_ci 911cb0ef41Sopenharmony_ci if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) { 921cb0ef41Sopenharmony_ci DCHECK_EQ(0u, *buffer); 931cb0ef41Sopenharmony_ci return static_cast<uchar>(next); 941cb0ef41Sopenharmony_ci } 951cb0ef41Sopenharmony_ci 961cb0ef41Sopenharmony_ci // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation 971cb0ef41Sopenharmony_ci // char in that sequence. 981cb0ef41Sopenharmony_ci Utf8DfaDecoder::Decode(next, state, buffer); 991cb0ef41Sopenharmony_ci 1001cb0ef41Sopenharmony_ci switch (*state) { 1011cb0ef41Sopenharmony_ci case State::kAccept: { 1021cb0ef41Sopenharmony_ci uchar t = *buffer; 1031cb0ef41Sopenharmony_ci *buffer = 0; 1041cb0ef41Sopenharmony_ci return t; 1051cb0ef41Sopenharmony_ci } 1061cb0ef41Sopenharmony_ci 1071cb0ef41Sopenharmony_ci case State::kReject: 1081cb0ef41Sopenharmony_ci *state = State::kAccept; 1091cb0ef41Sopenharmony_ci *buffer = 0; 1101cb0ef41Sopenharmony_ci 1111cb0ef41Sopenharmony_ci // If we hit a bad byte, we need to determine if we were trying to start 1121cb0ef41Sopenharmony_ci // a sequence or continue one. If we were trying to start a sequence, 1131cb0ef41Sopenharmony_ci // that means it's just an invalid lead byte and we need to continue to 1141cb0ef41Sopenharmony_ci // the next (which we already did above). If we were already in a 1151cb0ef41Sopenharmony_ci // sequence, we need to reprocess this same byte after resetting to the 1161cb0ef41Sopenharmony_ci // initial state. 1171cb0ef41Sopenharmony_ci if (old_state != State::kAccept) { 1181cb0ef41Sopenharmony_ci // We were trying to continue a sequence, so let's reprocess this byte 1191cb0ef41Sopenharmony_ci // next time. 1201cb0ef41Sopenharmony_ci *cursor -= 1; 1211cb0ef41Sopenharmony_ci } 1221cb0ef41Sopenharmony_ci return kBadChar; 1231cb0ef41Sopenharmony_ci 1241cb0ef41Sopenharmony_ci default: 1251cb0ef41Sopenharmony_ci return kIncomplete; 1261cb0ef41Sopenharmony_ci } 1271cb0ef41Sopenharmony_ci} 1281cb0ef41Sopenharmony_ci 1291cb0ef41Sopenharmony_ciunsigned Utf8::EncodeOneByte(char* str, uint8_t c) { 1301cb0ef41Sopenharmony_ci static const int kMask = ~(1 << 6); 1311cb0ef41Sopenharmony_ci if (c <= kMaxOneByteChar) { 1321cb0ef41Sopenharmony_ci str[0] = c; 1331cb0ef41Sopenharmony_ci return 1; 1341cb0ef41Sopenharmony_ci } 1351cb0ef41Sopenharmony_ci str[0] = 0xC0 | (c >> 6); 1361cb0ef41Sopenharmony_ci str[1] = 0x80 | (c & kMask); 1371cb0ef41Sopenharmony_ci return 2; 1381cb0ef41Sopenharmony_ci} 1391cb0ef41Sopenharmony_ci 1401cb0ef41Sopenharmony_ci// Encode encodes the UTF-16 code units c and previous into the given str 1411cb0ef41Sopenharmony_ci// buffer, and combines surrogate code units into single code points. If 1421cb0ef41Sopenharmony_ci// replace_invalid is set to true, orphan surrogate code units will be replaced 1431cb0ef41Sopenharmony_ci// with kBadChar. 1441cb0ef41Sopenharmony_ciunsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) { 1451cb0ef41Sopenharmony_ci static const int kMask = ~(1 << 6); 1461cb0ef41Sopenharmony_ci if (c <= kMaxOneByteChar) { 1471cb0ef41Sopenharmony_ci str[0] = c; 1481cb0ef41Sopenharmony_ci return 1; 1491cb0ef41Sopenharmony_ci } else if (c <= kMaxTwoByteChar) { 1501cb0ef41Sopenharmony_ci str[0] = 0xC0 | (c >> 6); 1511cb0ef41Sopenharmony_ci str[1] = 0x80 | (c & kMask); 1521cb0ef41Sopenharmony_ci return 2; 1531cb0ef41Sopenharmony_ci } else if (c <= kMaxThreeByteChar) { 1541cb0ef41Sopenharmony_ci DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter)); 1551cb0ef41Sopenharmony_ci if (Utf16::IsSurrogatePair(previous, c)) { 1561cb0ef41Sopenharmony_ci const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; 1571cb0ef41Sopenharmony_ci return Encode(str - kUnmatchedSize, 1581cb0ef41Sopenharmony_ci Utf16::CombineSurrogatePair(previous, c), 1591cb0ef41Sopenharmony_ci Utf16::kNoPreviousCharacter, replace_invalid) - 1601cb0ef41Sopenharmony_ci kUnmatchedSize; 1611cb0ef41Sopenharmony_ci } else if (replace_invalid && 1621cb0ef41Sopenharmony_ci (Utf16::IsLeadSurrogate(c) || Utf16::IsTrailSurrogate(c))) { 1631cb0ef41Sopenharmony_ci c = kBadChar; 1641cb0ef41Sopenharmony_ci } 1651cb0ef41Sopenharmony_ci str[0] = 0xE0 | (c >> 12); 1661cb0ef41Sopenharmony_ci str[1] = 0x80 | ((c >> 6) & kMask); 1671cb0ef41Sopenharmony_ci str[2] = 0x80 | (c & kMask); 1681cb0ef41Sopenharmony_ci return 3; 1691cb0ef41Sopenharmony_ci } else { 1701cb0ef41Sopenharmony_ci str[0] = 0xF0 | (c >> 18); 1711cb0ef41Sopenharmony_ci str[1] = 0x80 | ((c >> 12) & kMask); 1721cb0ef41Sopenharmony_ci str[2] = 0x80 | ((c >> 6) & kMask); 1731cb0ef41Sopenharmony_ci str[3] = 0x80 | (c & kMask); 1741cb0ef41Sopenharmony_ci return 4; 1751cb0ef41Sopenharmony_ci } 1761cb0ef41Sopenharmony_ci} 1771cb0ef41Sopenharmony_ci 1781cb0ef41Sopenharmony_ciuchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { 1791cb0ef41Sopenharmony_ci if (length <= 0) return kBadChar; 1801cb0ef41Sopenharmony_ci byte first = bytes[0]; 1811cb0ef41Sopenharmony_ci // Characters between 0000 and 007F are encoded as a single character 1821cb0ef41Sopenharmony_ci if (V8_LIKELY(first <= kMaxOneByteChar)) { 1831cb0ef41Sopenharmony_ci *cursor += 1; 1841cb0ef41Sopenharmony_ci return first; 1851cb0ef41Sopenharmony_ci } 1861cb0ef41Sopenharmony_ci return CalculateValue(bytes, length, cursor); 1871cb0ef41Sopenharmony_ci} 1881cb0ef41Sopenharmony_ci 1891cb0ef41Sopenharmony_ciunsigned Utf8::Length(uchar c, int previous) { 1901cb0ef41Sopenharmony_ci if (c <= kMaxOneByteChar) { 1911cb0ef41Sopenharmony_ci return 1; 1921cb0ef41Sopenharmony_ci } else if (c <= kMaxTwoByteChar) { 1931cb0ef41Sopenharmony_ci return 2; 1941cb0ef41Sopenharmony_ci } else if (c <= kMaxThreeByteChar) { 1951cb0ef41Sopenharmony_ci DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter)); 1961cb0ef41Sopenharmony_ci if (Utf16::IsSurrogatePair(previous, c)) { 1971cb0ef41Sopenharmony_ci return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; 1981cb0ef41Sopenharmony_ci } 1991cb0ef41Sopenharmony_ci return 3; 2001cb0ef41Sopenharmony_ci } else { 2011cb0ef41Sopenharmony_ci return 4; 2021cb0ef41Sopenharmony_ci } 2031cb0ef41Sopenharmony_ci} 2041cb0ef41Sopenharmony_ci 2051cb0ef41Sopenharmony_cibool Utf8::IsValidCharacter(uchar c) { 2061cb0ef41Sopenharmony_ci return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) || 2071cb0ef41Sopenharmony_ci (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu && 2081cb0ef41Sopenharmony_ci c != kBadChar); 2091cb0ef41Sopenharmony_ci} 2101cb0ef41Sopenharmony_ci 2111cb0ef41Sopenharmony_ci} // namespace unibrow 2121cb0ef41Sopenharmony_ci 2131cb0ef41Sopenharmony_ci#endif // V8_STRINGS_UNICODE_INL_H_ 214