11cb0ef41Sopenharmony_ci// Copyright 2007-2010 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci#ifndef V8_STRINGS_UNICODE_INL_H_
61cb0ef41Sopenharmony_ci#define V8_STRINGS_UNICODE_INL_H_
71cb0ef41Sopenharmony_ci
81cb0ef41Sopenharmony_ci#include "src/base/logging.h"
91cb0ef41Sopenharmony_ci#include "src/strings/unicode.h"
101cb0ef41Sopenharmony_ci#include "src/utils/utils.h"
111cb0ef41Sopenharmony_ci
121cb0ef41Sopenharmony_cinamespace unibrow {
131cb0ef41Sopenharmony_ci
141cb0ef41Sopenharmony_ci#ifndef V8_INTL_SUPPORT
151cb0ef41Sopenharmony_citemplate <class T, int s>
161cb0ef41Sopenharmony_cibool Predicate<T, s>::get(uchar code_point) {
171cb0ef41Sopenharmony_ci  CacheEntry entry = entries_[code_point & kMask];
181cb0ef41Sopenharmony_ci  if (entry.code_point() == code_point) return entry.value();
191cb0ef41Sopenharmony_ci  return CalculateValue(code_point);
201cb0ef41Sopenharmony_ci}
211cb0ef41Sopenharmony_ci
221cb0ef41Sopenharmony_citemplate <class T, int s>
231cb0ef41Sopenharmony_cibool Predicate<T, s>::CalculateValue(uchar code_point) {
241cb0ef41Sopenharmony_ci  bool result = T::Is(code_point);
251cb0ef41Sopenharmony_ci  entries_[code_point & kMask] = CacheEntry(code_point, result);
261cb0ef41Sopenharmony_ci  return result;
271cb0ef41Sopenharmony_ci}
281cb0ef41Sopenharmony_ci
291cb0ef41Sopenharmony_citemplate <class T, int s>
301cb0ef41Sopenharmony_ciint Mapping<T, s>::get(uchar c, uchar n, uchar* result) {
311cb0ef41Sopenharmony_ci  CacheEntry entry = entries_[c & kMask];
321cb0ef41Sopenharmony_ci  if (entry.code_point_ == c) {
331cb0ef41Sopenharmony_ci    if (entry.offset_ == 0) {
341cb0ef41Sopenharmony_ci      return 0;
351cb0ef41Sopenharmony_ci    } else {
361cb0ef41Sopenharmony_ci      result[0] = c + entry.offset_;
371cb0ef41Sopenharmony_ci      return 1;
381cb0ef41Sopenharmony_ci    }
391cb0ef41Sopenharmony_ci  } else {
401cb0ef41Sopenharmony_ci    return CalculateValue(c, n, result);
411cb0ef41Sopenharmony_ci  }
421cb0ef41Sopenharmony_ci}
431cb0ef41Sopenharmony_ci
441cb0ef41Sopenharmony_citemplate <class T, int s>
451cb0ef41Sopenharmony_ciint Mapping<T, s>::CalculateValue(uchar c, uchar n, uchar* result) {
461cb0ef41Sopenharmony_ci  bool allow_caching = true;
471cb0ef41Sopenharmony_ci  int length = T::Convert(c, n, result, &allow_caching);
481cb0ef41Sopenharmony_ci  if (allow_caching) {
491cb0ef41Sopenharmony_ci    if (length == 1) {
501cb0ef41Sopenharmony_ci      entries_[c & kMask] = CacheEntry(c, result[0] - c);
511cb0ef41Sopenharmony_ci      return 1;
521cb0ef41Sopenharmony_ci    } else {
531cb0ef41Sopenharmony_ci      entries_[c & kMask] = CacheEntry(c, 0);
541cb0ef41Sopenharmony_ci      return 0;
551cb0ef41Sopenharmony_ci    }
561cb0ef41Sopenharmony_ci  } else {
571cb0ef41Sopenharmony_ci    return length;
581cb0ef41Sopenharmony_ci  }
591cb0ef41Sopenharmony_ci}
601cb0ef41Sopenharmony_ci#endif  // !V8_INTL_SUPPORT
611cb0ef41Sopenharmony_ci
621cb0ef41Sopenharmony_cibool Utf16::HasUnpairedSurrogate(const uint16_t* code_units, size_t length) {
631cb0ef41Sopenharmony_ci  for (size_t i = 0; i < length; ++i) {
641cb0ef41Sopenharmony_ci    const int code_unit = code_units[i];
651cb0ef41Sopenharmony_ci    if (IsLeadSurrogate(code_unit)) {
661cb0ef41Sopenharmony_ci      // The current code unit is a leading surrogate. Check if it is followed
671cb0ef41Sopenharmony_ci      // by a trailing surrogate.
681cb0ef41Sopenharmony_ci      if (i == length - 1) return true;
691cb0ef41Sopenharmony_ci      if (!IsTrailSurrogate(code_units[i + 1])) return true;
701cb0ef41Sopenharmony_ci      // Skip the paired trailing surrogate.
711cb0ef41Sopenharmony_ci      ++i;
721cb0ef41Sopenharmony_ci    } else if (IsTrailSurrogate(code_unit)) {
731cb0ef41Sopenharmony_ci      // All paired trailing surrogates are skipped above, so this branch is
741cb0ef41Sopenharmony_ci      // only for those that are unpaired.
751cb0ef41Sopenharmony_ci      return true;
761cb0ef41Sopenharmony_ci    }
771cb0ef41Sopenharmony_ci  }
781cb0ef41Sopenharmony_ci  return false;
791cb0ef41Sopenharmony_ci}
801cb0ef41Sopenharmony_ci
811cb0ef41Sopenharmony_ci// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
821cb0ef41Sopenharmony_ci// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
831cb0ef41Sopenharmony_ci// when the stream is complete, to ensure incomplete sequences are handled.
841cb0ef41Sopenharmony_ciuchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
851cb0ef41Sopenharmony_ci                               Utf8IncrementalBuffer* buffer) {
861cb0ef41Sopenharmony_ci  DCHECK_NOT_NULL(buffer);
871cb0ef41Sopenharmony_ci  State old_state = *state;
881cb0ef41Sopenharmony_ci  byte next = **cursor;
891cb0ef41Sopenharmony_ci  *cursor += 1;
901cb0ef41Sopenharmony_ci
911cb0ef41Sopenharmony_ci  if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
921cb0ef41Sopenharmony_ci    DCHECK_EQ(0u, *buffer);
931cb0ef41Sopenharmony_ci    return static_cast<uchar>(next);
941cb0ef41Sopenharmony_ci  }
951cb0ef41Sopenharmony_ci
961cb0ef41Sopenharmony_ci  // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
971cb0ef41Sopenharmony_ci  // char in that sequence.
981cb0ef41Sopenharmony_ci  Utf8DfaDecoder::Decode(next, state, buffer);
991cb0ef41Sopenharmony_ci
1001cb0ef41Sopenharmony_ci  switch (*state) {
1011cb0ef41Sopenharmony_ci    case State::kAccept: {
1021cb0ef41Sopenharmony_ci      uchar t = *buffer;
1031cb0ef41Sopenharmony_ci      *buffer = 0;
1041cb0ef41Sopenharmony_ci      return t;
1051cb0ef41Sopenharmony_ci    }
1061cb0ef41Sopenharmony_ci
1071cb0ef41Sopenharmony_ci    case State::kReject:
1081cb0ef41Sopenharmony_ci      *state = State::kAccept;
1091cb0ef41Sopenharmony_ci      *buffer = 0;
1101cb0ef41Sopenharmony_ci
1111cb0ef41Sopenharmony_ci      // If we hit a bad byte, we need to determine if we were trying to start
1121cb0ef41Sopenharmony_ci      // a sequence or continue one. If we were trying to start a sequence,
1131cb0ef41Sopenharmony_ci      // that means it's just an invalid lead byte and we need to continue to
1141cb0ef41Sopenharmony_ci      // the next (which we already did above). If we were already in a
1151cb0ef41Sopenharmony_ci      // sequence, we need to reprocess this same byte after resetting to the
1161cb0ef41Sopenharmony_ci      // initial state.
1171cb0ef41Sopenharmony_ci      if (old_state != State::kAccept) {
1181cb0ef41Sopenharmony_ci        // We were trying to continue a sequence, so let's reprocess this byte
1191cb0ef41Sopenharmony_ci        // next time.
1201cb0ef41Sopenharmony_ci        *cursor -= 1;
1211cb0ef41Sopenharmony_ci      }
1221cb0ef41Sopenharmony_ci      return kBadChar;
1231cb0ef41Sopenharmony_ci
1241cb0ef41Sopenharmony_ci    default:
1251cb0ef41Sopenharmony_ci      return kIncomplete;
1261cb0ef41Sopenharmony_ci  }
1271cb0ef41Sopenharmony_ci}
1281cb0ef41Sopenharmony_ci
1291cb0ef41Sopenharmony_ciunsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
1301cb0ef41Sopenharmony_ci  static const int kMask = ~(1 << 6);
1311cb0ef41Sopenharmony_ci  if (c <= kMaxOneByteChar) {
1321cb0ef41Sopenharmony_ci    str[0] = c;
1331cb0ef41Sopenharmony_ci    return 1;
1341cb0ef41Sopenharmony_ci  }
1351cb0ef41Sopenharmony_ci  str[0] = 0xC0 | (c >> 6);
1361cb0ef41Sopenharmony_ci  str[1] = 0x80 | (c & kMask);
1371cb0ef41Sopenharmony_ci  return 2;
1381cb0ef41Sopenharmony_ci}
1391cb0ef41Sopenharmony_ci
1401cb0ef41Sopenharmony_ci// Encode encodes the UTF-16 code units c and previous into the given str
1411cb0ef41Sopenharmony_ci// buffer, and combines surrogate code units into single code points. If
1421cb0ef41Sopenharmony_ci// replace_invalid is set to true, orphan surrogate code units will be replaced
1431cb0ef41Sopenharmony_ci// with kBadChar.
1441cb0ef41Sopenharmony_ciunsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) {
1451cb0ef41Sopenharmony_ci  static const int kMask = ~(1 << 6);
1461cb0ef41Sopenharmony_ci  if (c <= kMaxOneByteChar) {
1471cb0ef41Sopenharmony_ci    str[0] = c;
1481cb0ef41Sopenharmony_ci    return 1;
1491cb0ef41Sopenharmony_ci  } else if (c <= kMaxTwoByteChar) {
1501cb0ef41Sopenharmony_ci    str[0] = 0xC0 | (c >> 6);
1511cb0ef41Sopenharmony_ci    str[1] = 0x80 | (c & kMask);
1521cb0ef41Sopenharmony_ci    return 2;
1531cb0ef41Sopenharmony_ci  } else if (c <= kMaxThreeByteChar) {
1541cb0ef41Sopenharmony_ci    DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
1551cb0ef41Sopenharmony_ci    if (Utf16::IsSurrogatePair(previous, c)) {
1561cb0ef41Sopenharmony_ci      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
1571cb0ef41Sopenharmony_ci      return Encode(str - kUnmatchedSize,
1581cb0ef41Sopenharmony_ci                    Utf16::CombineSurrogatePair(previous, c),
1591cb0ef41Sopenharmony_ci                    Utf16::kNoPreviousCharacter, replace_invalid) -
1601cb0ef41Sopenharmony_ci             kUnmatchedSize;
1611cb0ef41Sopenharmony_ci    } else if (replace_invalid &&
1621cb0ef41Sopenharmony_ci               (Utf16::IsLeadSurrogate(c) || Utf16::IsTrailSurrogate(c))) {
1631cb0ef41Sopenharmony_ci      c = kBadChar;
1641cb0ef41Sopenharmony_ci    }
1651cb0ef41Sopenharmony_ci    str[0] = 0xE0 | (c >> 12);
1661cb0ef41Sopenharmony_ci    str[1] = 0x80 | ((c >> 6) & kMask);
1671cb0ef41Sopenharmony_ci    str[2] = 0x80 | (c & kMask);
1681cb0ef41Sopenharmony_ci    return 3;
1691cb0ef41Sopenharmony_ci  } else {
1701cb0ef41Sopenharmony_ci    str[0] = 0xF0 | (c >> 18);
1711cb0ef41Sopenharmony_ci    str[1] = 0x80 | ((c >> 12) & kMask);
1721cb0ef41Sopenharmony_ci    str[2] = 0x80 | ((c >> 6) & kMask);
1731cb0ef41Sopenharmony_ci    str[3] = 0x80 | (c & kMask);
1741cb0ef41Sopenharmony_ci    return 4;
1751cb0ef41Sopenharmony_ci  }
1761cb0ef41Sopenharmony_ci}
1771cb0ef41Sopenharmony_ci
1781cb0ef41Sopenharmony_ciuchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
1791cb0ef41Sopenharmony_ci  if (length <= 0) return kBadChar;
1801cb0ef41Sopenharmony_ci  byte first = bytes[0];
1811cb0ef41Sopenharmony_ci  // Characters between 0000 and 007F are encoded as a single character
1821cb0ef41Sopenharmony_ci  if (V8_LIKELY(first <= kMaxOneByteChar)) {
1831cb0ef41Sopenharmony_ci    *cursor += 1;
1841cb0ef41Sopenharmony_ci    return first;
1851cb0ef41Sopenharmony_ci  }
1861cb0ef41Sopenharmony_ci  return CalculateValue(bytes, length, cursor);
1871cb0ef41Sopenharmony_ci}
1881cb0ef41Sopenharmony_ci
1891cb0ef41Sopenharmony_ciunsigned Utf8::Length(uchar c, int previous) {
1901cb0ef41Sopenharmony_ci  if (c <= kMaxOneByteChar) {
1911cb0ef41Sopenharmony_ci    return 1;
1921cb0ef41Sopenharmony_ci  } else if (c <= kMaxTwoByteChar) {
1931cb0ef41Sopenharmony_ci    return 2;
1941cb0ef41Sopenharmony_ci  } else if (c <= kMaxThreeByteChar) {
1951cb0ef41Sopenharmony_ci    DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
1961cb0ef41Sopenharmony_ci    if (Utf16::IsSurrogatePair(previous, c)) {
1971cb0ef41Sopenharmony_ci      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
1981cb0ef41Sopenharmony_ci    }
1991cb0ef41Sopenharmony_ci    return 3;
2001cb0ef41Sopenharmony_ci  } else {
2011cb0ef41Sopenharmony_ci    return 4;
2021cb0ef41Sopenharmony_ci  }
2031cb0ef41Sopenharmony_ci}
2041cb0ef41Sopenharmony_ci
2051cb0ef41Sopenharmony_cibool Utf8::IsValidCharacter(uchar c) {
2061cb0ef41Sopenharmony_ci  return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
2071cb0ef41Sopenharmony_ci         (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
2081cb0ef41Sopenharmony_ci          c != kBadChar);
2091cb0ef41Sopenharmony_ci}
2101cb0ef41Sopenharmony_ci
2111cb0ef41Sopenharmony_ci}  // namespace unibrow
2121cb0ef41Sopenharmony_ci
2131cb0ef41Sopenharmony_ci#endif  // V8_STRINGS_UNICODE_INL_H_
214