11cb0ef41Sopenharmony_ci// Copyright 2011 the V8 project authors. All rights reserved.
21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be
31cb0ef41Sopenharmony_ci// found in the LICENSE file.
41cb0ef41Sopenharmony_ci
51cb0ef41Sopenharmony_ci#ifndef SRC_STRING_SEARCH_H_
61cb0ef41Sopenharmony_ci#define SRC_STRING_SEARCH_H_
71cb0ef41Sopenharmony_ci
81cb0ef41Sopenharmony_ci#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
91cb0ef41Sopenharmony_ci
101cb0ef41Sopenharmony_ci#include "util.h"
111cb0ef41Sopenharmony_ci
121cb0ef41Sopenharmony_ci#include <cstring>
131cb0ef41Sopenharmony_ci#include <algorithm>
141cb0ef41Sopenharmony_ci
151cb0ef41Sopenharmony_cinamespace node {
161cb0ef41Sopenharmony_cinamespace stringsearch {
171cb0ef41Sopenharmony_ci
181cb0ef41Sopenharmony_citemplate <typename T>
191cb0ef41Sopenharmony_ciclass Vector {
201cb0ef41Sopenharmony_ci public:
211cb0ef41Sopenharmony_ci  Vector(T* data, size_t length, bool isForward)
221cb0ef41Sopenharmony_ci      : start_(data), length_(length), is_forward_(isForward) {
231cb0ef41Sopenharmony_ci    CHECK(length > 0 && data != nullptr);
241cb0ef41Sopenharmony_ci  }
251cb0ef41Sopenharmony_ci
261cb0ef41Sopenharmony_ci  // Returns the start of the memory range.
271cb0ef41Sopenharmony_ci  // For vector v this is NOT necessarily &v[0], see forward().
281cb0ef41Sopenharmony_ci  const T* start() const { return start_; }
291cb0ef41Sopenharmony_ci
301cb0ef41Sopenharmony_ci  // Returns the length of the vector, in characters.
311cb0ef41Sopenharmony_ci  size_t length() const { return length_; }
321cb0ef41Sopenharmony_ci
331cb0ef41Sopenharmony_ci  // Returns true if the Vector is front-to-back, false if back-to-front.
341cb0ef41Sopenharmony_ci  // In the latter case, v[0] corresponds to the *end* of the memory range.
351cb0ef41Sopenharmony_ci  bool forward() const { return is_forward_; }
361cb0ef41Sopenharmony_ci
371cb0ef41Sopenharmony_ci  // Access individual vector elements - checks bounds in debug mode.
381cb0ef41Sopenharmony_ci  T& operator[](size_t index) const {
391cb0ef41Sopenharmony_ci    DCHECK_LT(index, length_);
401cb0ef41Sopenharmony_ci    return start_[is_forward_ ? index : (length_ - index - 1)];
411cb0ef41Sopenharmony_ci  }
421cb0ef41Sopenharmony_ci
431cb0ef41Sopenharmony_ci private:
441cb0ef41Sopenharmony_ci  T* start_;
451cb0ef41Sopenharmony_ci  size_t length_;
461cb0ef41Sopenharmony_ci  bool is_forward_;
471cb0ef41Sopenharmony_ci};
481cb0ef41Sopenharmony_ci
491cb0ef41Sopenharmony_ci
501cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
511cb0ef41Sopenharmony_ci// String Search object.
521cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
531cb0ef41Sopenharmony_ci
541cb0ef41Sopenharmony_ci// Class holding constants and methods that apply to all string search variants,
551cb0ef41Sopenharmony_ci// independently of subject and pattern char size.
561cb0ef41Sopenharmony_ciclass StringSearchBase {
571cb0ef41Sopenharmony_ci protected:
581cb0ef41Sopenharmony_ci  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
591cb0ef41Sopenharmony_ci  // limit, we can fix the size of tables. For a needle longer than this limit,
601cb0ef41Sopenharmony_ci  // search will not be optimal, since we only build tables for a suffix
611cb0ef41Sopenharmony_ci  // of the string, but it is a safe approximation.
621cb0ef41Sopenharmony_ci  static const int kBMMaxShift = 250;
631cb0ef41Sopenharmony_ci
641cb0ef41Sopenharmony_ci  // Reduce alphabet to this size.
651cb0ef41Sopenharmony_ci  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
661cb0ef41Sopenharmony_ci  // proportional to the input alphabet. We reduce the alphabet size by
671cb0ef41Sopenharmony_ci  // equating input characters modulo a smaller alphabet size. This gives
681cb0ef41Sopenharmony_ci  // a potentially less efficient searching, but is a safe approximation.
691cb0ef41Sopenharmony_ci  // For needles using only characters in the same Unicode 256-code point page,
701cb0ef41Sopenharmony_ci  // there is no search speed degradation.
711cb0ef41Sopenharmony_ci  static const int kLatin1AlphabetSize = 256;
721cb0ef41Sopenharmony_ci  static const int kUC16AlphabetSize = 256;
731cb0ef41Sopenharmony_ci
741cb0ef41Sopenharmony_ci  // Bad-char shift table stored in the state. It's length is the alphabet size.
751cb0ef41Sopenharmony_ci  // For patterns below this length, the skip length of Boyer-Moore is too short
761cb0ef41Sopenharmony_ci  // to compensate for the algorithmic overhead compared to simple brute force.
771cb0ef41Sopenharmony_ci  static const int kBMMinPatternLength = 8;
781cb0ef41Sopenharmony_ci
791cb0ef41Sopenharmony_ci  // Store for the BoyerMoore(Horspool) bad char shift table.
801cb0ef41Sopenharmony_ci  int bad_char_shift_table_[kUC16AlphabetSize];
811cb0ef41Sopenharmony_ci  // Store for the BoyerMoore good suffix shift table.
821cb0ef41Sopenharmony_ci  int good_suffix_shift_table_[kBMMaxShift + 1];
831cb0ef41Sopenharmony_ci  // Table used temporarily while building the BoyerMoore good suffix
841cb0ef41Sopenharmony_ci  // shift table.
851cb0ef41Sopenharmony_ci  int suffix_table_[kBMMaxShift + 1];
861cb0ef41Sopenharmony_ci};
871cb0ef41Sopenharmony_ci
881cb0ef41Sopenharmony_citemplate <typename Char>
891cb0ef41Sopenharmony_ciclass StringSearch : private StringSearchBase {
901cb0ef41Sopenharmony_ci public:
911cb0ef41Sopenharmony_ci  typedef stringsearch::Vector<const Char> Vector;
921cb0ef41Sopenharmony_ci
931cb0ef41Sopenharmony_ci  explicit StringSearch(Vector pattern)
941cb0ef41Sopenharmony_ci      : pattern_(pattern), start_(0) {
951cb0ef41Sopenharmony_ci    if (pattern.length() >= kBMMaxShift) {
961cb0ef41Sopenharmony_ci      start_ = pattern.length() - kBMMaxShift;
971cb0ef41Sopenharmony_ci    }
981cb0ef41Sopenharmony_ci
991cb0ef41Sopenharmony_ci    size_t pattern_length = pattern_.length();
1001cb0ef41Sopenharmony_ci    CHECK_GT(pattern_length, 0);
1011cb0ef41Sopenharmony_ci    if (pattern_length < kBMMinPatternLength) {
1021cb0ef41Sopenharmony_ci      if (pattern_length == 1) {
1031cb0ef41Sopenharmony_ci        strategy_ = SearchStrategy::kSingleChar;
1041cb0ef41Sopenharmony_ci        return;
1051cb0ef41Sopenharmony_ci      }
1061cb0ef41Sopenharmony_ci      strategy_ = SearchStrategy::kLinear;
1071cb0ef41Sopenharmony_ci      return;
1081cb0ef41Sopenharmony_ci    }
1091cb0ef41Sopenharmony_ci    strategy_ = SearchStrategy::kInitial;
1101cb0ef41Sopenharmony_ci  }
1111cb0ef41Sopenharmony_ci
1121cb0ef41Sopenharmony_ci  size_t Search(Vector subject, size_t index) {
1131cb0ef41Sopenharmony_ci    switch (strategy_) {
1141cb0ef41Sopenharmony_ci      case kBoyerMooreHorspool:
1151cb0ef41Sopenharmony_ci        return BoyerMooreHorspoolSearch(subject, index);
1161cb0ef41Sopenharmony_ci      case kBoyerMoore:
1171cb0ef41Sopenharmony_ci        return BoyerMooreSearch(subject, index);
1181cb0ef41Sopenharmony_ci      case kInitial:
1191cb0ef41Sopenharmony_ci        return InitialSearch(subject, index);
1201cb0ef41Sopenharmony_ci      case kLinear:
1211cb0ef41Sopenharmony_ci        return LinearSearch(subject, index);
1221cb0ef41Sopenharmony_ci      case kSingleChar:
1231cb0ef41Sopenharmony_ci        return SingleCharSearch(subject, index);
1241cb0ef41Sopenharmony_ci    }
1251cb0ef41Sopenharmony_ci    UNREACHABLE();
1261cb0ef41Sopenharmony_ci  }
1271cb0ef41Sopenharmony_ci
1281cb0ef41Sopenharmony_ci  static inline int AlphabetSize() {
1291cb0ef41Sopenharmony_ci    if (sizeof(Char) == 1) {
1301cb0ef41Sopenharmony_ci      // Latin1 needle.
1311cb0ef41Sopenharmony_ci      return kLatin1AlphabetSize;
1321cb0ef41Sopenharmony_ci    } else {
1331cb0ef41Sopenharmony_ci      // UC16 needle.
1341cb0ef41Sopenharmony_ci      return kUC16AlphabetSize;
1351cb0ef41Sopenharmony_ci    }
1361cb0ef41Sopenharmony_ci
1371cb0ef41Sopenharmony_ci    static_assert(sizeof(Char) == sizeof(uint8_t) ||
1381cb0ef41Sopenharmony_ci                  sizeof(Char) == sizeof(uint16_t),
1391cb0ef41Sopenharmony_ci                  "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");
1401cb0ef41Sopenharmony_ci  }
1411cb0ef41Sopenharmony_ci
1421cb0ef41Sopenharmony_ci private:
1431cb0ef41Sopenharmony_ci  typedef size_t (StringSearch::*SearchFunction)(Vector, size_t);
1441cb0ef41Sopenharmony_ci  size_t SingleCharSearch(Vector subject, size_t start_index);
1451cb0ef41Sopenharmony_ci  size_t LinearSearch(Vector subject, size_t start_index);
1461cb0ef41Sopenharmony_ci  size_t InitialSearch(Vector subject, size_t start_index);
1471cb0ef41Sopenharmony_ci  size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index);
1481cb0ef41Sopenharmony_ci  size_t BoyerMooreSearch(Vector subject, size_t start_index);
1491cb0ef41Sopenharmony_ci
1501cb0ef41Sopenharmony_ci  void PopulateBoyerMooreHorspoolTable();
1511cb0ef41Sopenharmony_ci
1521cb0ef41Sopenharmony_ci  void PopulateBoyerMooreTable();
1531cb0ef41Sopenharmony_ci
1541cb0ef41Sopenharmony_ci  static inline int CharOccurrence(int* bad_char_occurrence,
1551cb0ef41Sopenharmony_ci                                   Char char_code) {
1561cb0ef41Sopenharmony_ci    if (sizeof(Char) == 1) {
1571cb0ef41Sopenharmony_ci      return bad_char_occurrence[static_cast<int>(char_code)];
1581cb0ef41Sopenharmony_ci    }
1591cb0ef41Sopenharmony_ci    // Both pattern and subject are UC16. Reduce character to equivalence class.
1601cb0ef41Sopenharmony_ci    int equiv_class = char_code % kUC16AlphabetSize;
1611cb0ef41Sopenharmony_ci    return bad_char_occurrence[equiv_class];
1621cb0ef41Sopenharmony_ci  }
1631cb0ef41Sopenharmony_ci
1641cb0ef41Sopenharmony_ci  enum SearchStrategy {
1651cb0ef41Sopenharmony_ci    kBoyerMooreHorspool,
1661cb0ef41Sopenharmony_ci    kBoyerMoore,
1671cb0ef41Sopenharmony_ci    kInitial,
1681cb0ef41Sopenharmony_ci    kLinear,
1691cb0ef41Sopenharmony_ci    kSingleChar,
1701cb0ef41Sopenharmony_ci  };
1711cb0ef41Sopenharmony_ci
1721cb0ef41Sopenharmony_ci  // The pattern to search for.
1731cb0ef41Sopenharmony_ci  Vector pattern_;
1741cb0ef41Sopenharmony_ci  SearchStrategy strategy_;
1751cb0ef41Sopenharmony_ci  // Cache value of Max(0, pattern_length() - kBMMaxShift)
1761cb0ef41Sopenharmony_ci  size_t start_;
1771cb0ef41Sopenharmony_ci};
1781cb0ef41Sopenharmony_ci
1791cb0ef41Sopenharmony_ci
1801cb0ef41Sopenharmony_citemplate <typename T, typename U>
1811cb0ef41Sopenharmony_ciinline T AlignDown(T value, U alignment) {
1821cb0ef41Sopenharmony_ci  return reinterpret_cast<T>(
1831cb0ef41Sopenharmony_ci      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
1841cb0ef41Sopenharmony_ci}
1851cb0ef41Sopenharmony_ci
1861cb0ef41Sopenharmony_ci
1871cb0ef41Sopenharmony_ciinline uint8_t GetHighestValueByte(uint16_t character) {
1881cb0ef41Sopenharmony_ci  return std::max(static_cast<uint8_t>(character & 0xFF),
1891cb0ef41Sopenharmony_ci                  static_cast<uint8_t>(character >> 8));
1901cb0ef41Sopenharmony_ci}
1911cb0ef41Sopenharmony_ci
1921cb0ef41Sopenharmony_ci
1931cb0ef41Sopenharmony_ciinline uint8_t GetHighestValueByte(uint8_t character) { return character; }
1941cb0ef41Sopenharmony_ci
1951cb0ef41Sopenharmony_ci
1961cb0ef41Sopenharmony_ci// Searches for a byte value in a memory buffer, back to front.
1971cb0ef41Sopenharmony_ci// Uses memrchr(3) on systems which support it, for speed.
1981cb0ef41Sopenharmony_ci// Falls back to a vanilla for loop on non-GNU systems such as Windows.
1991cb0ef41Sopenharmony_ciinline const void* MemrchrFill(const void* haystack, uint8_t needle,
2001cb0ef41Sopenharmony_ci                               size_t haystack_len) {
2011cb0ef41Sopenharmony_ci#ifdef _GNU_SOURCE
2021cb0ef41Sopenharmony_ci  return memrchr(haystack, needle, haystack_len);
2031cb0ef41Sopenharmony_ci#else
2041cb0ef41Sopenharmony_ci  const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack);
2051cb0ef41Sopenharmony_ci  for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) {
2061cb0ef41Sopenharmony_ci    if (haystack8[i] == needle) {
2071cb0ef41Sopenharmony_ci      return haystack8 + i;
2081cb0ef41Sopenharmony_ci    }
2091cb0ef41Sopenharmony_ci  }
2101cb0ef41Sopenharmony_ci  return nullptr;
2111cb0ef41Sopenharmony_ci#endif
2121cb0ef41Sopenharmony_ci}
2131cb0ef41Sopenharmony_ci
2141cb0ef41Sopenharmony_ci
2151cb0ef41Sopenharmony_ci// Finds the first occurrence of *two-byte* character pattern[0] in the string
2161cb0ef41Sopenharmony_ci// `subject`. Does not check that the whole pattern matches.
2171cb0ef41Sopenharmony_citemplate <typename Char>
2181cb0ef41Sopenharmony_ciinline size_t FindFirstCharacter(Vector<const Char> pattern,
2191cb0ef41Sopenharmony_ci                                 Vector<const Char> subject, size_t index) {
2201cb0ef41Sopenharmony_ci  const Char pattern_first_char = pattern[0];
2211cb0ef41Sopenharmony_ci  const size_t max_n = (subject.length() - pattern.length() + 1);
2221cb0ef41Sopenharmony_ci
2231cb0ef41Sopenharmony_ci  // For speed, search for the more `rare` of the two bytes in pattern[0]
2241cb0ef41Sopenharmony_ci  // using memchr / memrchr (which are much faster than a simple for loop).
2251cb0ef41Sopenharmony_ci  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
2261cb0ef41Sopenharmony_ci  size_t pos = index;
2271cb0ef41Sopenharmony_ci  do {
2281cb0ef41Sopenharmony_ci    const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
2291cb0ef41Sopenharmony_ci    const void* void_pos;
2301cb0ef41Sopenharmony_ci    if (subject.forward()) {
2311cb0ef41Sopenharmony_ci      // Assert that bytes_to_search won't overflow
2321cb0ef41Sopenharmony_ci      CHECK_LE(pos, max_n);
2331cb0ef41Sopenharmony_ci      CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
2341cb0ef41Sopenharmony_ci      void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search);
2351cb0ef41Sopenharmony_ci    } else {
2361cb0ef41Sopenharmony_ci      CHECK_LE(pos, subject.length());
2371cb0ef41Sopenharmony_ci      CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
2381cb0ef41Sopenharmony_ci      void_pos = MemrchrFill(subject.start() + pattern.length() - 1,
2391cb0ef41Sopenharmony_ci                             search_byte,
2401cb0ef41Sopenharmony_ci                             bytes_to_search);
2411cb0ef41Sopenharmony_ci    }
2421cb0ef41Sopenharmony_ci    const Char* char_pos = static_cast<const Char*>(void_pos);
2431cb0ef41Sopenharmony_ci    if (char_pos == nullptr)
2441cb0ef41Sopenharmony_ci      return subject.length();
2451cb0ef41Sopenharmony_ci
2461cb0ef41Sopenharmony_ci    // Then, for each match, verify that the full two bytes match pattern[0].
2471cb0ef41Sopenharmony_ci    char_pos = AlignDown(char_pos, sizeof(Char));
2481cb0ef41Sopenharmony_ci    size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
2491cb0ef41Sopenharmony_ci    pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1);
2501cb0ef41Sopenharmony_ci    if (subject[pos] == pattern_first_char) {
2511cb0ef41Sopenharmony_ci      // Match found, hooray.
2521cb0ef41Sopenharmony_ci      return pos;
2531cb0ef41Sopenharmony_ci    }
2541cb0ef41Sopenharmony_ci    // Search byte matched, but the other byte of pattern[0] didn't. Keep going.
2551cb0ef41Sopenharmony_ci  } while (++pos < max_n);
2561cb0ef41Sopenharmony_ci
2571cb0ef41Sopenharmony_ci  return subject.length();
2581cb0ef41Sopenharmony_ci}
2591cb0ef41Sopenharmony_ci
2601cb0ef41Sopenharmony_ci
2611cb0ef41Sopenharmony_ci// Finds the first occurrence of the byte pattern[0] in string `subject`.
2621cb0ef41Sopenharmony_ci// Does not verify that the whole pattern matches.
2631cb0ef41Sopenharmony_citemplate <>
2641cb0ef41Sopenharmony_ciinline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
2651cb0ef41Sopenharmony_ci                                 Vector<const uint8_t> subject,
2661cb0ef41Sopenharmony_ci                                 size_t index) {
2671cb0ef41Sopenharmony_ci  const uint8_t pattern_first_char = pattern[0];
2681cb0ef41Sopenharmony_ci  const size_t subj_len = subject.length();
2691cb0ef41Sopenharmony_ci  const size_t max_n = (subject.length() - pattern.length() + 1);
2701cb0ef41Sopenharmony_ci
2711cb0ef41Sopenharmony_ci  const void* pos;
2721cb0ef41Sopenharmony_ci  if (subject.forward()) {
2731cb0ef41Sopenharmony_ci    pos = memchr(subject.start() + index, pattern_first_char, max_n - index);
2741cb0ef41Sopenharmony_ci  } else {
2751cb0ef41Sopenharmony_ci    pos = MemrchrFill(subject.start() + pattern.length() - 1,
2761cb0ef41Sopenharmony_ci                      pattern_first_char,
2771cb0ef41Sopenharmony_ci                      max_n - index);
2781cb0ef41Sopenharmony_ci  }
2791cb0ef41Sopenharmony_ci  const uint8_t* char_pos = static_cast<const uint8_t*>(pos);
2801cb0ef41Sopenharmony_ci  if (char_pos == nullptr) {
2811cb0ef41Sopenharmony_ci    return subj_len;
2821cb0ef41Sopenharmony_ci  }
2831cb0ef41Sopenharmony_ci
2841cb0ef41Sopenharmony_ci  size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
2851cb0ef41Sopenharmony_ci  return subject.forward() ? raw_pos : (subj_len - raw_pos - 1);
2861cb0ef41Sopenharmony_ci}
2871cb0ef41Sopenharmony_ci
2881cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
2891cb0ef41Sopenharmony_ci// Single Character Pattern Search Strategy
2901cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
2911cb0ef41Sopenharmony_ci
2921cb0ef41Sopenharmony_citemplate <typename Char>
2931cb0ef41Sopenharmony_cisize_t StringSearch<Char>::SingleCharSearch(
2941cb0ef41Sopenharmony_ci    Vector subject,
2951cb0ef41Sopenharmony_ci    size_t index) {
2961cb0ef41Sopenharmony_ci  CHECK_EQ(1, pattern_.length());
2971cb0ef41Sopenharmony_ci  return FindFirstCharacter(pattern_, subject, index);
2981cb0ef41Sopenharmony_ci}
2991cb0ef41Sopenharmony_ci
3001cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
3011cb0ef41Sopenharmony_ci// Linear Search Strategy
3021cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
3031cb0ef41Sopenharmony_ci
3041cb0ef41Sopenharmony_ci// Simple linear search for short patterns. Never bails out.
3051cb0ef41Sopenharmony_citemplate <typename Char>
3061cb0ef41Sopenharmony_cisize_t StringSearch<Char>::LinearSearch(
3071cb0ef41Sopenharmony_ci    Vector subject,
3081cb0ef41Sopenharmony_ci    size_t index) {
3091cb0ef41Sopenharmony_ci  CHECK_GT(pattern_.length(), 1);
3101cb0ef41Sopenharmony_ci  const size_t n = subject.length() - pattern_.length();
3111cb0ef41Sopenharmony_ci  for (size_t i = index; i <= n; i++) {
3121cb0ef41Sopenharmony_ci    i = FindFirstCharacter(pattern_, subject, i);
3131cb0ef41Sopenharmony_ci    if (i == subject.length())
3141cb0ef41Sopenharmony_ci      return subject.length();
3151cb0ef41Sopenharmony_ci    CHECK_LE(i, n);
3161cb0ef41Sopenharmony_ci
3171cb0ef41Sopenharmony_ci    bool matches = true;
3181cb0ef41Sopenharmony_ci    for (size_t j = 1; j < pattern_.length(); j++) {
3191cb0ef41Sopenharmony_ci      if (pattern_[j] != subject[i + j]) {
3201cb0ef41Sopenharmony_ci        matches = false;
3211cb0ef41Sopenharmony_ci        break;
3221cb0ef41Sopenharmony_ci      }
3231cb0ef41Sopenharmony_ci    }
3241cb0ef41Sopenharmony_ci    if (matches) {
3251cb0ef41Sopenharmony_ci      return i;
3261cb0ef41Sopenharmony_ci    }
3271cb0ef41Sopenharmony_ci  }
3281cb0ef41Sopenharmony_ci  return subject.length();
3291cb0ef41Sopenharmony_ci}
3301cb0ef41Sopenharmony_ci
3311cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
3321cb0ef41Sopenharmony_ci// Boyer-Moore string search
3331cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
3341cb0ef41Sopenharmony_ci
3351cb0ef41Sopenharmony_citemplate <typename Char>
3361cb0ef41Sopenharmony_cisize_t StringSearch<Char>::BoyerMooreSearch(
3371cb0ef41Sopenharmony_ci    Vector subject,
3381cb0ef41Sopenharmony_ci    size_t start_index) {
3391cb0ef41Sopenharmony_ci  const size_t subject_length = subject.length();
3401cb0ef41Sopenharmony_ci  const size_t pattern_length = pattern_.length();
3411cb0ef41Sopenharmony_ci  // Only preprocess at most kBMMaxShift last characters of pattern.
3421cb0ef41Sopenharmony_ci  size_t start = start_;
3431cb0ef41Sopenharmony_ci
3441cb0ef41Sopenharmony_ci  int* bad_char_occurrence = bad_char_shift_table_;
3451cb0ef41Sopenharmony_ci  int* good_suffix_shift = good_suffix_shift_table_ - start_;
3461cb0ef41Sopenharmony_ci
3471cb0ef41Sopenharmony_ci  Char last_char = pattern_[pattern_length - 1];
3481cb0ef41Sopenharmony_ci  size_t index = start_index;
3491cb0ef41Sopenharmony_ci  // Continue search from i.
3501cb0ef41Sopenharmony_ci  while (index <= subject_length - pattern_length) {
3511cb0ef41Sopenharmony_ci    size_t j = pattern_length - 1;
3521cb0ef41Sopenharmony_ci    int c;
3531cb0ef41Sopenharmony_ci    while (last_char != (c = subject[index + j])) {
3541cb0ef41Sopenharmony_ci      int shift = j - CharOccurrence(bad_char_occurrence, c);
3551cb0ef41Sopenharmony_ci      index += shift;
3561cb0ef41Sopenharmony_ci      if (index > subject_length - pattern_length) {
3571cb0ef41Sopenharmony_ci        return subject.length();
3581cb0ef41Sopenharmony_ci      }
3591cb0ef41Sopenharmony_ci    }
3601cb0ef41Sopenharmony_ci    while (pattern_[j] == (c = subject[index + j])) {
3611cb0ef41Sopenharmony_ci      if (j == 0) {
3621cb0ef41Sopenharmony_ci        return index;
3631cb0ef41Sopenharmony_ci      }
3641cb0ef41Sopenharmony_ci      j--;
3651cb0ef41Sopenharmony_ci    }
3661cb0ef41Sopenharmony_ci    if (j < start) {
3671cb0ef41Sopenharmony_ci      // we have matched more than our tables allow us to be smart about.
3681cb0ef41Sopenharmony_ci      // Fall back on BMH shift.
3691cb0ef41Sopenharmony_ci      index += pattern_length - 1 -
3701cb0ef41Sopenharmony_ci               CharOccurrence(bad_char_occurrence, last_char);
3711cb0ef41Sopenharmony_ci    } else {
3721cb0ef41Sopenharmony_ci      int gs_shift = good_suffix_shift[j + 1];
3731cb0ef41Sopenharmony_ci      int bc_occ = CharOccurrence(bad_char_occurrence, c);
3741cb0ef41Sopenharmony_ci      int shift = j - bc_occ;
3751cb0ef41Sopenharmony_ci      if (gs_shift > shift) {
3761cb0ef41Sopenharmony_ci        shift = gs_shift;
3771cb0ef41Sopenharmony_ci      }
3781cb0ef41Sopenharmony_ci      index += shift;
3791cb0ef41Sopenharmony_ci    }
3801cb0ef41Sopenharmony_ci  }
3811cb0ef41Sopenharmony_ci
3821cb0ef41Sopenharmony_ci  return subject.length();
3831cb0ef41Sopenharmony_ci}
3841cb0ef41Sopenharmony_ci
3851cb0ef41Sopenharmony_citemplate <typename Char>
3861cb0ef41Sopenharmony_civoid StringSearch<Char>::PopulateBoyerMooreTable() {
3871cb0ef41Sopenharmony_ci  const size_t pattern_length = pattern_.length();
3881cb0ef41Sopenharmony_ci  // Only look at the last kBMMaxShift characters of pattern (from start_
3891cb0ef41Sopenharmony_ci  // to pattern_length).
3901cb0ef41Sopenharmony_ci  const size_t start = start_;
3911cb0ef41Sopenharmony_ci  const size_t length = pattern_length - start;
3921cb0ef41Sopenharmony_ci
3931cb0ef41Sopenharmony_ci  // Biased tables so that we can use pattern indices as table indices,
3941cb0ef41Sopenharmony_ci  // even if we only cover the part of the pattern from offset start.
3951cb0ef41Sopenharmony_ci  int* shift_table = good_suffix_shift_table_ - start_;
3961cb0ef41Sopenharmony_ci  int* suffix_table = suffix_table_ - start_;
3971cb0ef41Sopenharmony_ci
3981cb0ef41Sopenharmony_ci  // Initialize table.
3991cb0ef41Sopenharmony_ci  for (size_t i = start; i < pattern_length; i++) {
4001cb0ef41Sopenharmony_ci    shift_table[i] = length;
4011cb0ef41Sopenharmony_ci  }
4021cb0ef41Sopenharmony_ci  shift_table[pattern_length] = 1;
4031cb0ef41Sopenharmony_ci  suffix_table[pattern_length] = pattern_length + 1;
4041cb0ef41Sopenharmony_ci
4051cb0ef41Sopenharmony_ci  if (pattern_length <= start) {
4061cb0ef41Sopenharmony_ci    return;
4071cb0ef41Sopenharmony_ci  }
4081cb0ef41Sopenharmony_ci
4091cb0ef41Sopenharmony_ci  // Find suffixes.
4101cb0ef41Sopenharmony_ci  Char last_char = pattern_[pattern_length - 1];
4111cb0ef41Sopenharmony_ci  size_t suffix = pattern_length + 1;
4121cb0ef41Sopenharmony_ci  {
4131cb0ef41Sopenharmony_ci    size_t i = pattern_length;
4141cb0ef41Sopenharmony_ci    while (i > start) {
4151cb0ef41Sopenharmony_ci      Char c = pattern_[i - 1];
4161cb0ef41Sopenharmony_ci      while (suffix <= pattern_length && c != pattern_[suffix - 1]) {
4171cb0ef41Sopenharmony_ci        if (static_cast<size_t>(shift_table[suffix]) == length) {
4181cb0ef41Sopenharmony_ci          shift_table[suffix] = suffix - i;
4191cb0ef41Sopenharmony_ci        }
4201cb0ef41Sopenharmony_ci        suffix = suffix_table[suffix];
4211cb0ef41Sopenharmony_ci      }
4221cb0ef41Sopenharmony_ci      suffix_table[--i] = --suffix;
4231cb0ef41Sopenharmony_ci      if (suffix == pattern_length) {
4241cb0ef41Sopenharmony_ci        // No suffix to extend, so we check against last_char only.
4251cb0ef41Sopenharmony_ci        while ((i > start) && (pattern_[i - 1] != last_char)) {
4261cb0ef41Sopenharmony_ci          if (static_cast<size_t>(shift_table[pattern_length]) == length) {
4271cb0ef41Sopenharmony_ci            shift_table[pattern_length] = pattern_length - i;
4281cb0ef41Sopenharmony_ci          }
4291cb0ef41Sopenharmony_ci          suffix_table[--i] = pattern_length;
4301cb0ef41Sopenharmony_ci        }
4311cb0ef41Sopenharmony_ci        if (i > start) {
4321cb0ef41Sopenharmony_ci          suffix_table[--i] = --suffix;
4331cb0ef41Sopenharmony_ci        }
4341cb0ef41Sopenharmony_ci      }
4351cb0ef41Sopenharmony_ci    }
4361cb0ef41Sopenharmony_ci  }
4371cb0ef41Sopenharmony_ci  // Build shift table using suffixes.
4381cb0ef41Sopenharmony_ci  if (suffix < pattern_length) {
4391cb0ef41Sopenharmony_ci    for (size_t i = start; i <= pattern_length; i++) {
4401cb0ef41Sopenharmony_ci      if (static_cast<size_t>(shift_table[i]) == length) {
4411cb0ef41Sopenharmony_ci        shift_table[i] = suffix - start;
4421cb0ef41Sopenharmony_ci      }
4431cb0ef41Sopenharmony_ci      if (i == suffix) {
4441cb0ef41Sopenharmony_ci        suffix = suffix_table[suffix];
4451cb0ef41Sopenharmony_ci      }
4461cb0ef41Sopenharmony_ci    }
4471cb0ef41Sopenharmony_ci  }
4481cb0ef41Sopenharmony_ci}
4491cb0ef41Sopenharmony_ci
4501cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
4511cb0ef41Sopenharmony_ci// Boyer-Moore-Horspool string search.
4521cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
4531cb0ef41Sopenharmony_ci
4541cb0ef41Sopenharmony_citemplate <typename Char>
4551cb0ef41Sopenharmony_cisize_t StringSearch<Char>::BoyerMooreHorspoolSearch(
4561cb0ef41Sopenharmony_ci    Vector subject,
4571cb0ef41Sopenharmony_ci    size_t start_index) {
4581cb0ef41Sopenharmony_ci  const size_t subject_length = subject.length();
4591cb0ef41Sopenharmony_ci  const size_t pattern_length = pattern_.length();
4601cb0ef41Sopenharmony_ci  int* char_occurrences = bad_char_shift_table_;
4611cb0ef41Sopenharmony_ci  int64_t badness = -static_cast<int64_t>(pattern_length);
4621cb0ef41Sopenharmony_ci
4631cb0ef41Sopenharmony_ci  // How bad we are doing without a good-suffix table.
4641cb0ef41Sopenharmony_ci  Char last_char = pattern_[pattern_length - 1];
4651cb0ef41Sopenharmony_ci  int last_char_shift =
4661cb0ef41Sopenharmony_ci      pattern_length - 1 -
4671cb0ef41Sopenharmony_ci      CharOccurrence(char_occurrences, last_char);
4681cb0ef41Sopenharmony_ci
4691cb0ef41Sopenharmony_ci  // Perform search
4701cb0ef41Sopenharmony_ci  size_t index = start_index;  // No matches found prior to this index.
4711cb0ef41Sopenharmony_ci  while (index <= subject_length - pattern_length) {
4721cb0ef41Sopenharmony_ci    size_t j = pattern_length - 1;
4731cb0ef41Sopenharmony_ci    int subject_char;
4741cb0ef41Sopenharmony_ci    while (last_char != (subject_char = subject[index + j])) {
4751cb0ef41Sopenharmony_ci      int bc_occ = CharOccurrence(char_occurrences, subject_char);
4761cb0ef41Sopenharmony_ci      int shift = j - bc_occ;
4771cb0ef41Sopenharmony_ci      index += shift;
4781cb0ef41Sopenharmony_ci      badness += 1 - shift;  // at most zero, so badness cannot increase.
4791cb0ef41Sopenharmony_ci      if (index > subject_length - pattern_length) {
4801cb0ef41Sopenharmony_ci        return subject_length;
4811cb0ef41Sopenharmony_ci      }
4821cb0ef41Sopenharmony_ci    }
4831cb0ef41Sopenharmony_ci    j--;
4841cb0ef41Sopenharmony_ci    while (pattern_[j] == (subject[index + j])) {
4851cb0ef41Sopenharmony_ci      if (j == 0) {
4861cb0ef41Sopenharmony_ci        return index;
4871cb0ef41Sopenharmony_ci      }
4881cb0ef41Sopenharmony_ci      j--;
4891cb0ef41Sopenharmony_ci    }
4901cb0ef41Sopenharmony_ci    index += last_char_shift;
4911cb0ef41Sopenharmony_ci    // Badness increases by the number of characters we have
4921cb0ef41Sopenharmony_ci    // checked, and decreases by the number of characters we
4931cb0ef41Sopenharmony_ci    // can skip by shifting. It's a measure of how we are doing
4941cb0ef41Sopenharmony_ci    // compared to reading each character exactly once.
4951cb0ef41Sopenharmony_ci    badness += (pattern_length - j) - last_char_shift;
4961cb0ef41Sopenharmony_ci    if (badness > 0) {
4971cb0ef41Sopenharmony_ci      PopulateBoyerMooreTable();
4981cb0ef41Sopenharmony_ci      strategy_ = SearchStrategy::kBoyerMoore;
4991cb0ef41Sopenharmony_ci      return BoyerMooreSearch(subject, index);
5001cb0ef41Sopenharmony_ci    }
5011cb0ef41Sopenharmony_ci  }
5021cb0ef41Sopenharmony_ci  return subject.length();
5031cb0ef41Sopenharmony_ci}
5041cb0ef41Sopenharmony_ci
5051cb0ef41Sopenharmony_citemplate <typename Char>
5061cb0ef41Sopenharmony_civoid StringSearch<Char>::PopulateBoyerMooreHorspoolTable() {
5071cb0ef41Sopenharmony_ci  const size_t pattern_length = pattern_.length();
5081cb0ef41Sopenharmony_ci
5091cb0ef41Sopenharmony_ci  int* bad_char_occurrence = bad_char_shift_table_;
5101cb0ef41Sopenharmony_ci
5111cb0ef41Sopenharmony_ci  // Only preprocess at most kBMMaxShift last characters of pattern.
5121cb0ef41Sopenharmony_ci  const size_t start = start_;
5131cb0ef41Sopenharmony_ci  // Run forwards to populate bad_char_table, so that *last* instance
5141cb0ef41Sopenharmony_ci  // of character equivalence class is the one registered.
5151cb0ef41Sopenharmony_ci  // Notice: Doesn't include the last character.
5161cb0ef41Sopenharmony_ci  const size_t table_size = AlphabetSize();
5171cb0ef41Sopenharmony_ci  if (start == 0) {
5181cb0ef41Sopenharmony_ci    // All patterns less than kBMMaxShift in length.
5191cb0ef41Sopenharmony_ci    memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
5201cb0ef41Sopenharmony_ci  } else {
5211cb0ef41Sopenharmony_ci    for (size_t i = 0; i < table_size; i++) {
5221cb0ef41Sopenharmony_ci      bad_char_occurrence[i] = start - 1;
5231cb0ef41Sopenharmony_ci    }
5241cb0ef41Sopenharmony_ci  }
5251cb0ef41Sopenharmony_ci  for (size_t i = start; i < pattern_length - 1; i++) {
5261cb0ef41Sopenharmony_ci    Char c = pattern_[i];
5271cb0ef41Sopenharmony_ci    int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize();
5281cb0ef41Sopenharmony_ci    bad_char_occurrence[bucket] = i;
5291cb0ef41Sopenharmony_ci  }
5301cb0ef41Sopenharmony_ci}
5311cb0ef41Sopenharmony_ci
5321cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
5331cb0ef41Sopenharmony_ci// Linear string search with bailout to BMH.
5341cb0ef41Sopenharmony_ci//---------------------------------------------------------------------
5351cb0ef41Sopenharmony_ci
5361cb0ef41Sopenharmony_ci// Simple linear search for short patterns, which bails out if the string
5371cb0ef41Sopenharmony_ci// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
5381cb0ef41Sopenharmony_citemplate <typename Char>
5391cb0ef41Sopenharmony_cisize_t StringSearch<Char>::InitialSearch(
5401cb0ef41Sopenharmony_ci    Vector subject,
5411cb0ef41Sopenharmony_ci    size_t index) {
5421cb0ef41Sopenharmony_ci  const size_t pattern_length = pattern_.length();
5431cb0ef41Sopenharmony_ci  // Badness is a count of how much work we have done.  When we have
5441cb0ef41Sopenharmony_ci  // done enough work we decide it's probably worth switching to a better
5451cb0ef41Sopenharmony_ci  // algorithm.
5461cb0ef41Sopenharmony_ci  int64_t badness = -10 - (pattern_length << 2);
5471cb0ef41Sopenharmony_ci
5481cb0ef41Sopenharmony_ci  // We know our pattern is at least 2 characters, we cache the first so
5491cb0ef41Sopenharmony_ci  // the common case of the first character not matching is faster.
5501cb0ef41Sopenharmony_ci  for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
5511cb0ef41Sopenharmony_ci    badness++;
5521cb0ef41Sopenharmony_ci    if (badness <= 0) {
5531cb0ef41Sopenharmony_ci      i = FindFirstCharacter(pattern_, subject, i);
5541cb0ef41Sopenharmony_ci      if (i == subject.length())
5551cb0ef41Sopenharmony_ci        return subject.length();
5561cb0ef41Sopenharmony_ci      CHECK_LE(i, n);
5571cb0ef41Sopenharmony_ci      size_t j = 1;
5581cb0ef41Sopenharmony_ci      do {
5591cb0ef41Sopenharmony_ci        if (pattern_[j] != subject[i + j]) {
5601cb0ef41Sopenharmony_ci          break;
5611cb0ef41Sopenharmony_ci        }
5621cb0ef41Sopenharmony_ci        j++;
5631cb0ef41Sopenharmony_ci      } while (j < pattern_length);
5641cb0ef41Sopenharmony_ci      if (j == pattern_length) {
5651cb0ef41Sopenharmony_ci        return i;
5661cb0ef41Sopenharmony_ci      }
5671cb0ef41Sopenharmony_ci      badness += j;
5681cb0ef41Sopenharmony_ci    } else {
5691cb0ef41Sopenharmony_ci      PopulateBoyerMooreHorspoolTable();
5701cb0ef41Sopenharmony_ci      strategy_ = SearchStrategy::kBoyerMooreHorspool;
5711cb0ef41Sopenharmony_ci      return BoyerMooreHorspoolSearch(subject, i);
5721cb0ef41Sopenharmony_ci    }
5731cb0ef41Sopenharmony_ci  }
5741cb0ef41Sopenharmony_ci  return subject.length();
5751cb0ef41Sopenharmony_ci}
5761cb0ef41Sopenharmony_ci
5771cb0ef41Sopenharmony_ci// Perform a single stand-alone search.
5781cb0ef41Sopenharmony_ci// If searching multiple times for the same pattern, a search
5791cb0ef41Sopenharmony_ci// object should be constructed once and the Search function then called
5801cb0ef41Sopenharmony_ci// for each search.
5811cb0ef41Sopenharmony_citemplate <typename Char>
5821cb0ef41Sopenharmony_cisize_t SearchString(Vector<const Char> subject,
5831cb0ef41Sopenharmony_ci                    Vector<const Char> pattern,
5841cb0ef41Sopenharmony_ci                    size_t start_index) {
5851cb0ef41Sopenharmony_ci  StringSearch<Char> search(pattern);
5861cb0ef41Sopenharmony_ci  return search.Search(subject, start_index);
5871cb0ef41Sopenharmony_ci}
5881cb0ef41Sopenharmony_ci}  // namespace stringsearch
5891cb0ef41Sopenharmony_ci}  // namespace node
5901cb0ef41Sopenharmony_ci
5911cb0ef41Sopenharmony_cinamespace node {
5921cb0ef41Sopenharmony_ci
5931cb0ef41Sopenharmony_citemplate <typename Char>
5941cb0ef41Sopenharmony_cisize_t SearchString(const Char* haystack,
5951cb0ef41Sopenharmony_ci                    size_t haystack_length,
5961cb0ef41Sopenharmony_ci                    const Char* needle,
5971cb0ef41Sopenharmony_ci                    size_t needle_length,
5981cb0ef41Sopenharmony_ci                    size_t start_index,
5991cb0ef41Sopenharmony_ci                    bool is_forward) {
6001cb0ef41Sopenharmony_ci  if (haystack_length < needle_length) return haystack_length;
6011cb0ef41Sopenharmony_ci  // To do a reverse search (lastIndexOf instead of indexOf) without redundant
6021cb0ef41Sopenharmony_ci  // code, create two vectors that are reversed views into the input strings.
6031cb0ef41Sopenharmony_ci  // For example, v_needle[0] would return the *last* character of the needle.
6041cb0ef41Sopenharmony_ci  // So we're searching for the first instance of rev(needle) in rev(haystack)
6051cb0ef41Sopenharmony_ci  stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward);
6061cb0ef41Sopenharmony_ci  stringsearch::Vector<const Char> v_haystack(
6071cb0ef41Sopenharmony_ci      haystack, haystack_length, is_forward);
6081cb0ef41Sopenharmony_ci  size_t diff = haystack_length - needle_length;
6091cb0ef41Sopenharmony_ci  size_t relative_start_index;
6101cb0ef41Sopenharmony_ci  if (is_forward) {
6111cb0ef41Sopenharmony_ci    relative_start_index = start_index;
6121cb0ef41Sopenharmony_ci  } else if (diff < start_index) {
6131cb0ef41Sopenharmony_ci    relative_start_index = 0;
6141cb0ef41Sopenharmony_ci  } else {
6151cb0ef41Sopenharmony_ci    relative_start_index = diff - start_index;
6161cb0ef41Sopenharmony_ci  }
6171cb0ef41Sopenharmony_ci  size_t pos = node::stringsearch::SearchString(
6181cb0ef41Sopenharmony_ci      v_haystack, v_needle, relative_start_index);
6191cb0ef41Sopenharmony_ci  if (pos == haystack_length) {
6201cb0ef41Sopenharmony_ci    // not found
6211cb0ef41Sopenharmony_ci    return pos;
6221cb0ef41Sopenharmony_ci  }
6231cb0ef41Sopenharmony_ci  return is_forward ? pos : (haystack_length - needle_length - pos);
6241cb0ef41Sopenharmony_ci}
6251cb0ef41Sopenharmony_ci
6261cb0ef41Sopenharmony_citemplate <size_t N>
6271cb0ef41Sopenharmony_cisize_t SearchString(const char* haystack, size_t haystack_length,
6281cb0ef41Sopenharmony_ci                    const char (&needle)[N]) {
6291cb0ef41Sopenharmony_ci  return SearchString(
6301cb0ef41Sopenharmony_ci      reinterpret_cast<const uint8_t*>(haystack), haystack_length,
6311cb0ef41Sopenharmony_ci      reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true);
6321cb0ef41Sopenharmony_ci}
6331cb0ef41Sopenharmony_ci
6341cb0ef41Sopenharmony_ci}  // namespace node
6351cb0ef41Sopenharmony_ci
6361cb0ef41Sopenharmony_ci#endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
6371cb0ef41Sopenharmony_ci
6381cb0ef41Sopenharmony_ci#endif  // SRC_STRING_SEARCH_H_
639