11cb0ef41Sopenharmony_ci// Copyright 2011 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#ifndef V8_STRINGS_STRING_SEARCH_H_ 61cb0ef41Sopenharmony_ci#define V8_STRINGS_STRING_SEARCH_H_ 71cb0ef41Sopenharmony_ci 81cb0ef41Sopenharmony_ci#include "src/base/strings.h" 91cb0ef41Sopenharmony_ci#include "src/base/vector.h" 101cb0ef41Sopenharmony_ci#include "src/execution/isolate.h" 111cb0ef41Sopenharmony_ci 121cb0ef41Sopenharmony_cinamespace v8 { 131cb0ef41Sopenharmony_cinamespace internal { 141cb0ef41Sopenharmony_ci 151cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 161cb0ef41Sopenharmony_ci// String Search object. 171cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 181cb0ef41Sopenharmony_ci 191cb0ef41Sopenharmony_ci// Class holding constants and methods that apply to all string search variants, 201cb0ef41Sopenharmony_ci// independently of subject and pattern char size. 211cb0ef41Sopenharmony_ciclass StringSearchBase { 221cb0ef41Sopenharmony_ci protected: 231cb0ef41Sopenharmony_ci // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 241cb0ef41Sopenharmony_ci // limit, we can fix the size of tables. For a needle longer than this limit, 251cb0ef41Sopenharmony_ci // search will not be optimal, since we only build tables for a suffix 261cb0ef41Sopenharmony_ci // of the string, but it is a safe approximation. 271cb0ef41Sopenharmony_ci static const int kBMMaxShift = Isolate::kBMMaxShift; 281cb0ef41Sopenharmony_ci 291cb0ef41Sopenharmony_ci // Reduce alphabet to this size. 301cb0ef41Sopenharmony_ci // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 311cb0ef41Sopenharmony_ci // proportional to the input alphabet. We reduce the alphabet size by 321cb0ef41Sopenharmony_ci // equating input characters modulo a smaller alphabet size. This gives 331cb0ef41Sopenharmony_ci // a potentially less efficient searching, but is a safe approximation. 341cb0ef41Sopenharmony_ci // For needles using only characters in the same Unicode 256-code point page, 351cb0ef41Sopenharmony_ci // there is no search speed degradation. 361cb0ef41Sopenharmony_ci static const int kLatin1AlphabetSize = 256; 371cb0ef41Sopenharmony_ci static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 381cb0ef41Sopenharmony_ci 391cb0ef41Sopenharmony_ci // Bad-char shift table stored in the state. It's length is the alphabet size. 401cb0ef41Sopenharmony_ci // For patterns below this length, the skip length of Boyer-Moore is too short 411cb0ef41Sopenharmony_ci // to compensate for the algorithmic overhead compared to simple brute force. 421cb0ef41Sopenharmony_ci static const int kBMMinPatternLength = 7; 431cb0ef41Sopenharmony_ci 441cb0ef41Sopenharmony_ci static inline bool IsOneByteString(base::Vector<const uint8_t> string) { 451cb0ef41Sopenharmony_ci return true; 461cb0ef41Sopenharmony_ci } 471cb0ef41Sopenharmony_ci 481cb0ef41Sopenharmony_ci static inline bool IsOneByteString(base::Vector<const base::uc16> string) { 491cb0ef41Sopenharmony_ci return String::IsOneByte(string.begin(), string.length()); 501cb0ef41Sopenharmony_ci } 511cb0ef41Sopenharmony_ci 521cb0ef41Sopenharmony_ci friend class Isolate; 531cb0ef41Sopenharmony_ci}; 541cb0ef41Sopenharmony_ci 551cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 561cb0ef41Sopenharmony_ciclass StringSearch : private StringSearchBase { 571cb0ef41Sopenharmony_ci public: 581cb0ef41Sopenharmony_ci StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern) 591cb0ef41Sopenharmony_ci : isolate_(isolate), 601cb0ef41Sopenharmony_ci pattern_(pattern), 611cb0ef41Sopenharmony_ci start_(std::max(0, pattern.length() - kBMMaxShift)) { 621cb0ef41Sopenharmony_ci if (sizeof(PatternChar) > sizeof(SubjectChar)) { 631cb0ef41Sopenharmony_ci if (!IsOneByteString(pattern_)) { 641cb0ef41Sopenharmony_ci strategy_ = &FailSearch; 651cb0ef41Sopenharmony_ci return; 661cb0ef41Sopenharmony_ci } 671cb0ef41Sopenharmony_ci } 681cb0ef41Sopenharmony_ci int pattern_length = pattern_.length(); 691cb0ef41Sopenharmony_ci if (pattern_length < kBMMinPatternLength) { 701cb0ef41Sopenharmony_ci if (pattern_length == 1) { 711cb0ef41Sopenharmony_ci strategy_ = &SingleCharSearch; 721cb0ef41Sopenharmony_ci return; 731cb0ef41Sopenharmony_ci } 741cb0ef41Sopenharmony_ci strategy_ = &LinearSearch; 751cb0ef41Sopenharmony_ci return; 761cb0ef41Sopenharmony_ci } 771cb0ef41Sopenharmony_ci strategy_ = &InitialSearch; 781cb0ef41Sopenharmony_ci } 791cb0ef41Sopenharmony_ci 801cb0ef41Sopenharmony_ci int Search(base::Vector<const SubjectChar> subject, int index) { 811cb0ef41Sopenharmony_ci return strategy_(this, subject, index); 821cb0ef41Sopenharmony_ci } 831cb0ef41Sopenharmony_ci 841cb0ef41Sopenharmony_ci static inline int AlphabetSize() { 851cb0ef41Sopenharmony_ci if (sizeof(PatternChar) == 1) { 861cb0ef41Sopenharmony_ci // Latin1 needle. 871cb0ef41Sopenharmony_ci return kLatin1AlphabetSize; 881cb0ef41Sopenharmony_ci } else { 891cb0ef41Sopenharmony_ci DCHECK_EQ(sizeof(PatternChar), 2); 901cb0ef41Sopenharmony_ci // UC16 needle. 911cb0ef41Sopenharmony_ci return kUC16AlphabetSize; 921cb0ef41Sopenharmony_ci } 931cb0ef41Sopenharmony_ci } 941cb0ef41Sopenharmony_ci 951cb0ef41Sopenharmony_ci private: 961cb0ef41Sopenharmony_ci using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*, 971cb0ef41Sopenharmony_ci base::Vector<const SubjectChar>, int); 981cb0ef41Sopenharmony_ci 991cb0ef41Sopenharmony_ci static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 1001cb0ef41Sopenharmony_ci base::Vector<const SubjectChar>, int) { 1011cb0ef41Sopenharmony_ci return -1; 1021cb0ef41Sopenharmony_ci } 1031cb0ef41Sopenharmony_ci 1041cb0ef41Sopenharmony_ci static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 1051cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, 1061cb0ef41Sopenharmony_ci int start_index); 1071cb0ef41Sopenharmony_ci 1081cb0ef41Sopenharmony_ci static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 1091cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, 1101cb0ef41Sopenharmony_ci int start_index); 1111cb0ef41Sopenharmony_ci 1121cb0ef41Sopenharmony_ci static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 1131cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, 1141cb0ef41Sopenharmony_ci int start_index); 1151cb0ef41Sopenharmony_ci 1161cb0ef41Sopenharmony_ci static int BoyerMooreHorspoolSearch( 1171cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar>* search, 1181cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, int start_index); 1191cb0ef41Sopenharmony_ci 1201cb0ef41Sopenharmony_ci static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 1211cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, 1221cb0ef41Sopenharmony_ci int start_index); 1231cb0ef41Sopenharmony_ci 1241cb0ef41Sopenharmony_ci void PopulateBoyerMooreHorspoolTable(); 1251cb0ef41Sopenharmony_ci 1261cb0ef41Sopenharmony_ci void PopulateBoyerMooreTable(); 1271cb0ef41Sopenharmony_ci 1281cb0ef41Sopenharmony_ci static inline bool exceedsOneByte(uint8_t c) { return false; } 1291cb0ef41Sopenharmony_ci 1301cb0ef41Sopenharmony_ci static inline bool exceedsOneByte(uint16_t c) { 1311cb0ef41Sopenharmony_ci return c > String::kMaxOneByteCharCodeU; 1321cb0ef41Sopenharmony_ci } 1331cb0ef41Sopenharmony_ci 1341cb0ef41Sopenharmony_ci static inline int CharOccurrence(int* bad_char_occurrence, 1351cb0ef41Sopenharmony_ci SubjectChar char_code) { 1361cb0ef41Sopenharmony_ci if (sizeof(SubjectChar) == 1) { 1371cb0ef41Sopenharmony_ci return bad_char_occurrence[static_cast<int>(char_code)]; 1381cb0ef41Sopenharmony_ci } 1391cb0ef41Sopenharmony_ci if (sizeof(PatternChar) == 1) { 1401cb0ef41Sopenharmony_ci if (exceedsOneByte(char_code)) { 1411cb0ef41Sopenharmony_ci return -1; 1421cb0ef41Sopenharmony_ci } 1431cb0ef41Sopenharmony_ci return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 1441cb0ef41Sopenharmony_ci } 1451cb0ef41Sopenharmony_ci // Both pattern and subject are UC16. Reduce character to equivalence 1461cb0ef41Sopenharmony_ci // class. 1471cb0ef41Sopenharmony_ci int equiv_class = char_code % kUC16AlphabetSize; 1481cb0ef41Sopenharmony_ci return bad_char_occurrence[equiv_class]; 1491cb0ef41Sopenharmony_ci } 1501cb0ef41Sopenharmony_ci 1511cb0ef41Sopenharmony_ci // The following tables are shared by all searches. 1521cb0ef41Sopenharmony_ci // TODO(lrn): Introduce a way for a pattern to keep its tables 1531cb0ef41Sopenharmony_ci // between searches (e.g., for an Atom RegExp). 1541cb0ef41Sopenharmony_ci 1551cb0ef41Sopenharmony_ci // Store for the BoyerMoore(Horspool) bad char shift table. 1561cb0ef41Sopenharmony_ci // Return a table covering the last kBMMaxShift+1 positions of 1571cb0ef41Sopenharmony_ci // pattern. 1581cb0ef41Sopenharmony_ci int* bad_char_table() { return isolate_->bad_char_shift_table(); } 1591cb0ef41Sopenharmony_ci 1601cb0ef41Sopenharmony_ci // Store for the BoyerMoore good suffix shift table. 1611cb0ef41Sopenharmony_ci int* good_suffix_shift_table() { 1621cb0ef41Sopenharmony_ci // Return biased pointer that maps the range [start_..pattern_.length() 1631cb0ef41Sopenharmony_ci // to the kGoodSuffixShiftTable array. 1641cb0ef41Sopenharmony_ci return isolate_->good_suffix_shift_table() - start_; 1651cb0ef41Sopenharmony_ci } 1661cb0ef41Sopenharmony_ci 1671cb0ef41Sopenharmony_ci // Table used temporarily while building the BoyerMoore good suffix 1681cb0ef41Sopenharmony_ci // shift table. 1691cb0ef41Sopenharmony_ci int* suffix_table() { 1701cb0ef41Sopenharmony_ci // Return biased pointer that maps the range [start_..pattern_.length() 1711cb0ef41Sopenharmony_ci // to the kSuffixTable array. 1721cb0ef41Sopenharmony_ci return isolate_->suffix_table() - start_; 1731cb0ef41Sopenharmony_ci } 1741cb0ef41Sopenharmony_ci 1751cb0ef41Sopenharmony_ci Isolate* isolate_; 1761cb0ef41Sopenharmony_ci // The pattern to search for. 1771cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern_; 1781cb0ef41Sopenharmony_ci // Pointer to implementation of the search. 1791cb0ef41Sopenharmony_ci SearchFunction strategy_; 1801cb0ef41Sopenharmony_ci // Cache value of max(0, pattern_length() - kBMMaxShift) 1811cb0ef41Sopenharmony_ci int start_; 1821cb0ef41Sopenharmony_ci}; 1831cb0ef41Sopenharmony_ci 1841cb0ef41Sopenharmony_citemplate <typename T, typename U> 1851cb0ef41Sopenharmony_ciinline T AlignDown(T value, U alignment) { 1861cb0ef41Sopenharmony_ci return reinterpret_cast<T>( 1871cb0ef41Sopenharmony_ci (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); 1881cb0ef41Sopenharmony_ci} 1891cb0ef41Sopenharmony_ci 1901cb0ef41Sopenharmony_ciinline uint8_t GetHighestValueByte(base::uc16 character) { 1911cb0ef41Sopenharmony_ci return std::max(static_cast<uint8_t>(character & 0xFF), 1921cb0ef41Sopenharmony_ci static_cast<uint8_t>(character >> 8)); 1931cb0ef41Sopenharmony_ci} 1941cb0ef41Sopenharmony_ci 1951cb0ef41Sopenharmony_ciinline uint8_t GetHighestValueByte(uint8_t character) { return character; } 1961cb0ef41Sopenharmony_ci 1971cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 1981cb0ef41Sopenharmony_ciinline int FindFirstCharacter(base::Vector<const PatternChar> pattern, 1991cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, 2001cb0ef41Sopenharmony_ci int index) { 2011cb0ef41Sopenharmony_ci const PatternChar pattern_first_char = pattern[0]; 2021cb0ef41Sopenharmony_ci const int max_n = (subject.length() - pattern.length() + 1); 2031cb0ef41Sopenharmony_ci 2041cb0ef41Sopenharmony_ci if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) { 2051cb0ef41Sopenharmony_ci // Special-case looking for the 0 char in other than one-byte strings. 2061cb0ef41Sopenharmony_ci // memchr mostly fails in this case due to every other byte being 0 in text 2071cb0ef41Sopenharmony_ci // that is mostly ascii characters. 2081cb0ef41Sopenharmony_ci for (int i = index; i < max_n; ++i) { 2091cb0ef41Sopenharmony_ci if (subject[i] == 0) return i; 2101cb0ef41Sopenharmony_ci } 2111cb0ef41Sopenharmony_ci return -1; 2121cb0ef41Sopenharmony_ci } 2131cb0ef41Sopenharmony_ci const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 2141cb0ef41Sopenharmony_ci const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 2151cb0ef41Sopenharmony_ci int pos = index; 2161cb0ef41Sopenharmony_ci do { 2171cb0ef41Sopenharmony_ci DCHECK_GE(max_n - pos, 0); 2181cb0ef41Sopenharmony_ci const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( 2191cb0ef41Sopenharmony_ci memchr(subject.begin() + pos, search_byte, 2201cb0ef41Sopenharmony_ci (max_n - pos) * sizeof(SubjectChar))); 2211cb0ef41Sopenharmony_ci if (char_pos == nullptr) return -1; 2221cb0ef41Sopenharmony_ci char_pos = AlignDown(char_pos, sizeof(SubjectChar)); 2231cb0ef41Sopenharmony_ci pos = static_cast<int>(char_pos - subject.begin()); 2241cb0ef41Sopenharmony_ci if (subject[pos] == search_char) return pos; 2251cb0ef41Sopenharmony_ci } while (++pos < max_n); 2261cb0ef41Sopenharmony_ci 2271cb0ef41Sopenharmony_ci return -1; 2281cb0ef41Sopenharmony_ci} 2291cb0ef41Sopenharmony_ci 2301cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2311cb0ef41Sopenharmony_ci// Single Character Pattern Search Strategy 2321cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2331cb0ef41Sopenharmony_ci 2341cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 2351cb0ef41Sopenharmony_ciint StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 2361cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar>* search, 2371cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, int index) { 2381cb0ef41Sopenharmony_ci DCHECK_EQ(1, search->pattern_.length()); 2391cb0ef41Sopenharmony_ci PatternChar pattern_first_char = search->pattern_[0]; 2401cb0ef41Sopenharmony_ci if (sizeof(PatternChar) > sizeof(SubjectChar)) { 2411cb0ef41Sopenharmony_ci if (exceedsOneByte(pattern_first_char)) { 2421cb0ef41Sopenharmony_ci return -1; 2431cb0ef41Sopenharmony_ci } 2441cb0ef41Sopenharmony_ci } 2451cb0ef41Sopenharmony_ci return FindFirstCharacter(search->pattern_, subject, index); 2461cb0ef41Sopenharmony_ci} 2471cb0ef41Sopenharmony_ci 2481cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2491cb0ef41Sopenharmony_ci// Linear Search Strategy 2501cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2511cb0ef41Sopenharmony_ci 2521cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 2531cb0ef41Sopenharmony_ciinline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject, 2541cb0ef41Sopenharmony_ci int length) { 2551cb0ef41Sopenharmony_ci DCHECK_GT(length, 0); 2561cb0ef41Sopenharmony_ci int pos = 0; 2571cb0ef41Sopenharmony_ci do { 2581cb0ef41Sopenharmony_ci if (pattern[pos] != subject[pos]) { 2591cb0ef41Sopenharmony_ci return false; 2601cb0ef41Sopenharmony_ci } 2611cb0ef41Sopenharmony_ci pos++; 2621cb0ef41Sopenharmony_ci } while (pos < length); 2631cb0ef41Sopenharmony_ci return true; 2641cb0ef41Sopenharmony_ci} 2651cb0ef41Sopenharmony_ci 2661cb0ef41Sopenharmony_ci// Simple linear search for short patterns. Never bails out. 2671cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 2681cb0ef41Sopenharmony_ciint StringSearch<PatternChar, SubjectChar>::LinearSearch( 2691cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar>* search, 2701cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, int index) { 2711cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern = search->pattern_; 2721cb0ef41Sopenharmony_ci DCHECK_GT(pattern.length(), 1); 2731cb0ef41Sopenharmony_ci int pattern_length = pattern.length(); 2741cb0ef41Sopenharmony_ci int i = index; 2751cb0ef41Sopenharmony_ci int n = subject.length() - pattern_length; 2761cb0ef41Sopenharmony_ci while (i <= n) { 2771cb0ef41Sopenharmony_ci i = FindFirstCharacter(pattern, subject, i); 2781cb0ef41Sopenharmony_ci if (i == -1) return -1; 2791cb0ef41Sopenharmony_ci DCHECK_LE(i, n); 2801cb0ef41Sopenharmony_ci i++; 2811cb0ef41Sopenharmony_ci // Loop extracted to separate function to allow using return to do 2821cb0ef41Sopenharmony_ci // a deeper break. 2831cb0ef41Sopenharmony_ci if (CharCompare(pattern.begin() + 1, subject.begin() + i, 2841cb0ef41Sopenharmony_ci pattern_length - 1)) { 2851cb0ef41Sopenharmony_ci return i - 1; 2861cb0ef41Sopenharmony_ci } 2871cb0ef41Sopenharmony_ci } 2881cb0ef41Sopenharmony_ci return -1; 2891cb0ef41Sopenharmony_ci} 2901cb0ef41Sopenharmony_ci 2911cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2921cb0ef41Sopenharmony_ci// Boyer-Moore string search 2931cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2941cb0ef41Sopenharmony_ci 2951cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 2961cb0ef41Sopenharmony_ciint StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 2971cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar>* search, 2981cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, int start_index) { 2991cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern = search->pattern_; 3001cb0ef41Sopenharmony_ci int subject_length = subject.length(); 3011cb0ef41Sopenharmony_ci int pattern_length = pattern.length(); 3021cb0ef41Sopenharmony_ci // Only preprocess at most kBMMaxShift last characters of pattern. 3031cb0ef41Sopenharmony_ci int start = search->start_; 3041cb0ef41Sopenharmony_ci 3051cb0ef41Sopenharmony_ci int* bad_char_occurence = search->bad_char_table(); 3061cb0ef41Sopenharmony_ci int* good_suffix_shift = search->good_suffix_shift_table(); 3071cb0ef41Sopenharmony_ci 3081cb0ef41Sopenharmony_ci PatternChar last_char = pattern[pattern_length - 1]; 3091cb0ef41Sopenharmony_ci int index = start_index; 3101cb0ef41Sopenharmony_ci // Continue search from i. 3111cb0ef41Sopenharmony_ci while (index <= subject_length - pattern_length) { 3121cb0ef41Sopenharmony_ci int j = pattern_length - 1; 3131cb0ef41Sopenharmony_ci int c; 3141cb0ef41Sopenharmony_ci while (last_char != (c = subject[index + j])) { 3151cb0ef41Sopenharmony_ci int shift = j - CharOccurrence(bad_char_occurence, c); 3161cb0ef41Sopenharmony_ci index += shift; 3171cb0ef41Sopenharmony_ci if (index > subject_length - pattern_length) { 3181cb0ef41Sopenharmony_ci return -1; 3191cb0ef41Sopenharmony_ci } 3201cb0ef41Sopenharmony_ci } 3211cb0ef41Sopenharmony_ci while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 3221cb0ef41Sopenharmony_ci if (j < 0) { 3231cb0ef41Sopenharmony_ci return index; 3241cb0ef41Sopenharmony_ci } else if (j < start) { 3251cb0ef41Sopenharmony_ci // we have matched more than our tables allow us to be smart about. 3261cb0ef41Sopenharmony_ci // Fall back on BMH shift. 3271cb0ef41Sopenharmony_ci index += pattern_length - 1 - 3281cb0ef41Sopenharmony_ci CharOccurrence(bad_char_occurence, 3291cb0ef41Sopenharmony_ci static_cast<SubjectChar>(last_char)); 3301cb0ef41Sopenharmony_ci } else { 3311cb0ef41Sopenharmony_ci int gs_shift = good_suffix_shift[j + 1]; 3321cb0ef41Sopenharmony_ci int bc_occ = CharOccurrence(bad_char_occurence, c); 3331cb0ef41Sopenharmony_ci int shift = j - bc_occ; 3341cb0ef41Sopenharmony_ci if (gs_shift > shift) { 3351cb0ef41Sopenharmony_ci shift = gs_shift; 3361cb0ef41Sopenharmony_ci } 3371cb0ef41Sopenharmony_ci index += shift; 3381cb0ef41Sopenharmony_ci } 3391cb0ef41Sopenharmony_ci } 3401cb0ef41Sopenharmony_ci 3411cb0ef41Sopenharmony_ci return -1; 3421cb0ef41Sopenharmony_ci} 3431cb0ef41Sopenharmony_ci 3441cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 3451cb0ef41Sopenharmony_civoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 3461cb0ef41Sopenharmony_ci int pattern_length = pattern_.length(); 3471cb0ef41Sopenharmony_ci const PatternChar* pattern = pattern_.begin(); 3481cb0ef41Sopenharmony_ci // Only look at the last kBMMaxShift characters of pattern (from start_ 3491cb0ef41Sopenharmony_ci // to pattern_length). 3501cb0ef41Sopenharmony_ci int start = start_; 3511cb0ef41Sopenharmony_ci int length = pattern_length - start; 3521cb0ef41Sopenharmony_ci 3531cb0ef41Sopenharmony_ci // Biased tables so that we can use pattern indices as table indices, 3541cb0ef41Sopenharmony_ci // even if we only cover the part of the pattern from offset start. 3551cb0ef41Sopenharmony_ci int* shift_table = good_suffix_shift_table(); 3561cb0ef41Sopenharmony_ci int* suffix_table = this->suffix_table(); 3571cb0ef41Sopenharmony_ci 3581cb0ef41Sopenharmony_ci // Initialize table. 3591cb0ef41Sopenharmony_ci for (int i = start; i < pattern_length; i++) { 3601cb0ef41Sopenharmony_ci shift_table[i] = length; 3611cb0ef41Sopenharmony_ci } 3621cb0ef41Sopenharmony_ci shift_table[pattern_length] = 1; 3631cb0ef41Sopenharmony_ci suffix_table[pattern_length] = pattern_length + 1; 3641cb0ef41Sopenharmony_ci 3651cb0ef41Sopenharmony_ci if (pattern_length <= start) { 3661cb0ef41Sopenharmony_ci return; 3671cb0ef41Sopenharmony_ci } 3681cb0ef41Sopenharmony_ci 3691cb0ef41Sopenharmony_ci // Find suffixes. 3701cb0ef41Sopenharmony_ci PatternChar last_char = pattern[pattern_length - 1]; 3711cb0ef41Sopenharmony_ci int suffix = pattern_length + 1; 3721cb0ef41Sopenharmony_ci { 3731cb0ef41Sopenharmony_ci int i = pattern_length; 3741cb0ef41Sopenharmony_ci while (i > start) { 3751cb0ef41Sopenharmony_ci PatternChar c = pattern[i - 1]; 3761cb0ef41Sopenharmony_ci while (suffix <= pattern_length && c != pattern[suffix - 1]) { 3771cb0ef41Sopenharmony_ci if (shift_table[suffix] == length) { 3781cb0ef41Sopenharmony_ci shift_table[suffix] = suffix - i; 3791cb0ef41Sopenharmony_ci } 3801cb0ef41Sopenharmony_ci suffix = suffix_table[suffix]; 3811cb0ef41Sopenharmony_ci } 3821cb0ef41Sopenharmony_ci suffix_table[--i] = --suffix; 3831cb0ef41Sopenharmony_ci if (suffix == pattern_length) { 3841cb0ef41Sopenharmony_ci // No suffix to extend, so we check against last_char only. 3851cb0ef41Sopenharmony_ci while ((i > start) && (pattern[i - 1] != last_char)) { 3861cb0ef41Sopenharmony_ci if (shift_table[pattern_length] == length) { 3871cb0ef41Sopenharmony_ci shift_table[pattern_length] = pattern_length - i; 3881cb0ef41Sopenharmony_ci } 3891cb0ef41Sopenharmony_ci suffix_table[--i] = pattern_length; 3901cb0ef41Sopenharmony_ci } 3911cb0ef41Sopenharmony_ci if (i > start) { 3921cb0ef41Sopenharmony_ci suffix_table[--i] = --suffix; 3931cb0ef41Sopenharmony_ci } 3941cb0ef41Sopenharmony_ci } 3951cb0ef41Sopenharmony_ci } 3961cb0ef41Sopenharmony_ci } 3971cb0ef41Sopenharmony_ci // Build shift table using suffixes. 3981cb0ef41Sopenharmony_ci if (suffix < pattern_length) { 3991cb0ef41Sopenharmony_ci for (int i = start; i <= pattern_length; i++) { 4001cb0ef41Sopenharmony_ci if (shift_table[i] == length) { 4011cb0ef41Sopenharmony_ci shift_table[i] = suffix - start; 4021cb0ef41Sopenharmony_ci } 4031cb0ef41Sopenharmony_ci if (i == suffix) { 4041cb0ef41Sopenharmony_ci suffix = suffix_table[suffix]; 4051cb0ef41Sopenharmony_ci } 4061cb0ef41Sopenharmony_ci } 4071cb0ef41Sopenharmony_ci } 4081cb0ef41Sopenharmony_ci} 4091cb0ef41Sopenharmony_ci 4101cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 4111cb0ef41Sopenharmony_ci// Boyer-Moore-Horspool string search. 4121cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 4131cb0ef41Sopenharmony_ci 4141cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 4151cb0ef41Sopenharmony_ciint StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 4161cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar>* search, 4171cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, int start_index) { 4181cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern = search->pattern_; 4191cb0ef41Sopenharmony_ci int subject_length = subject.length(); 4201cb0ef41Sopenharmony_ci int pattern_length = pattern.length(); 4211cb0ef41Sopenharmony_ci int* char_occurrences = search->bad_char_table(); 4221cb0ef41Sopenharmony_ci int badness = -pattern_length; 4231cb0ef41Sopenharmony_ci 4241cb0ef41Sopenharmony_ci // How bad we are doing without a good-suffix table. 4251cb0ef41Sopenharmony_ci PatternChar last_char = pattern[pattern_length - 1]; 4261cb0ef41Sopenharmony_ci int last_char_shift = 4271cb0ef41Sopenharmony_ci pattern_length - 1 - 4281cb0ef41Sopenharmony_ci CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 4291cb0ef41Sopenharmony_ci // Perform search 4301cb0ef41Sopenharmony_ci int index = start_index; // No matches found prior to this index. 4311cb0ef41Sopenharmony_ci while (index <= subject_length - pattern_length) { 4321cb0ef41Sopenharmony_ci int j = pattern_length - 1; 4331cb0ef41Sopenharmony_ci int subject_char; 4341cb0ef41Sopenharmony_ci while (last_char != (subject_char = subject[index + j])) { 4351cb0ef41Sopenharmony_ci int bc_occ = CharOccurrence(char_occurrences, subject_char); 4361cb0ef41Sopenharmony_ci int shift = j - bc_occ; 4371cb0ef41Sopenharmony_ci index += shift; 4381cb0ef41Sopenharmony_ci badness += 1 - shift; // at most zero, so badness cannot increase. 4391cb0ef41Sopenharmony_ci if (index > subject_length - pattern_length) { 4401cb0ef41Sopenharmony_ci return -1; 4411cb0ef41Sopenharmony_ci } 4421cb0ef41Sopenharmony_ci } 4431cb0ef41Sopenharmony_ci j--; 4441cb0ef41Sopenharmony_ci while (j >= 0 && pattern[j] == (subject[index + j])) j--; 4451cb0ef41Sopenharmony_ci if (j < 0) { 4461cb0ef41Sopenharmony_ci return index; 4471cb0ef41Sopenharmony_ci } else { 4481cb0ef41Sopenharmony_ci index += last_char_shift; 4491cb0ef41Sopenharmony_ci // Badness increases by the number of characters we have 4501cb0ef41Sopenharmony_ci // checked, and decreases by the number of characters we 4511cb0ef41Sopenharmony_ci // can skip by shifting. It's a measure of how we are doing 4521cb0ef41Sopenharmony_ci // compared to reading each character exactly once. 4531cb0ef41Sopenharmony_ci badness += (pattern_length - j) - last_char_shift; 4541cb0ef41Sopenharmony_ci if (badness > 0) { 4551cb0ef41Sopenharmony_ci search->PopulateBoyerMooreTable(); 4561cb0ef41Sopenharmony_ci search->strategy_ = &BoyerMooreSearch; 4571cb0ef41Sopenharmony_ci return BoyerMooreSearch(search, subject, index); 4581cb0ef41Sopenharmony_ci } 4591cb0ef41Sopenharmony_ci } 4601cb0ef41Sopenharmony_ci } 4611cb0ef41Sopenharmony_ci return -1; 4621cb0ef41Sopenharmony_ci} 4631cb0ef41Sopenharmony_ci 4641cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 4651cb0ef41Sopenharmony_civoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 4661cb0ef41Sopenharmony_ci int pattern_length = pattern_.length(); 4671cb0ef41Sopenharmony_ci 4681cb0ef41Sopenharmony_ci int* bad_char_occurrence = bad_char_table(); 4691cb0ef41Sopenharmony_ci 4701cb0ef41Sopenharmony_ci // Only preprocess at most kBMMaxShift last characters of pattern. 4711cb0ef41Sopenharmony_ci int start = start_; 4721cb0ef41Sopenharmony_ci // Run forwards to populate bad_char_table, so that *last* instance 4731cb0ef41Sopenharmony_ci // of character equivalence class is the one registered. 4741cb0ef41Sopenharmony_ci // Notice: Doesn't include the last character. 4751cb0ef41Sopenharmony_ci int table_size = AlphabetSize(); 4761cb0ef41Sopenharmony_ci if (start == 0) { // All patterns less than kBMMaxShift in length. 4771cb0ef41Sopenharmony_ci memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); 4781cb0ef41Sopenharmony_ci } else { 4791cb0ef41Sopenharmony_ci for (int i = 0; i < table_size; i++) { 4801cb0ef41Sopenharmony_ci bad_char_occurrence[i] = start - 1; 4811cb0ef41Sopenharmony_ci } 4821cb0ef41Sopenharmony_ci } 4831cb0ef41Sopenharmony_ci for (int i = start; i < pattern_length - 1; i++) { 4841cb0ef41Sopenharmony_ci PatternChar c = pattern_[i]; 4851cb0ef41Sopenharmony_ci int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 4861cb0ef41Sopenharmony_ci bad_char_occurrence[bucket] = i; 4871cb0ef41Sopenharmony_ci } 4881cb0ef41Sopenharmony_ci} 4891cb0ef41Sopenharmony_ci 4901cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 4911cb0ef41Sopenharmony_ci// Linear string search with bailout to BMH. 4921cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 4931cb0ef41Sopenharmony_ci 4941cb0ef41Sopenharmony_ci// Simple linear search for short patterns, which bails out if the string 4951cb0ef41Sopenharmony_ci// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 4961cb0ef41Sopenharmony_citemplate <typename PatternChar, typename SubjectChar> 4971cb0ef41Sopenharmony_ciint StringSearch<PatternChar, SubjectChar>::InitialSearch( 4981cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar>* search, 4991cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject, int index) { 5001cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern = search->pattern_; 5011cb0ef41Sopenharmony_ci int pattern_length = pattern.length(); 5021cb0ef41Sopenharmony_ci // Badness is a count of how much work we have done. When we have 5031cb0ef41Sopenharmony_ci // done enough work we decide it's probably worth switching to a better 5041cb0ef41Sopenharmony_ci // algorithm. 5051cb0ef41Sopenharmony_ci int badness = -10 - (pattern_length << 2); 5061cb0ef41Sopenharmony_ci 5071cb0ef41Sopenharmony_ci // We know our pattern is at least 2 characters, we cache the first so 5081cb0ef41Sopenharmony_ci // the common case of the first character not matching is faster. 5091cb0ef41Sopenharmony_ci for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 5101cb0ef41Sopenharmony_ci badness++; 5111cb0ef41Sopenharmony_ci if (badness <= 0) { 5121cb0ef41Sopenharmony_ci i = FindFirstCharacter(pattern, subject, i); 5131cb0ef41Sopenharmony_ci if (i == -1) return -1; 5141cb0ef41Sopenharmony_ci DCHECK_LE(i, n); 5151cb0ef41Sopenharmony_ci int j = 1; 5161cb0ef41Sopenharmony_ci do { 5171cb0ef41Sopenharmony_ci if (pattern[j] != subject[i + j]) { 5181cb0ef41Sopenharmony_ci break; 5191cb0ef41Sopenharmony_ci } 5201cb0ef41Sopenharmony_ci j++; 5211cb0ef41Sopenharmony_ci } while (j < pattern_length); 5221cb0ef41Sopenharmony_ci if (j == pattern_length) { 5231cb0ef41Sopenharmony_ci return i; 5241cb0ef41Sopenharmony_ci } 5251cb0ef41Sopenharmony_ci badness += j; 5261cb0ef41Sopenharmony_ci } else { 5271cb0ef41Sopenharmony_ci search->PopulateBoyerMooreHorspoolTable(); 5281cb0ef41Sopenharmony_ci search->strategy_ = &BoyerMooreHorspoolSearch; 5291cb0ef41Sopenharmony_ci return BoyerMooreHorspoolSearch(search, subject, i); 5301cb0ef41Sopenharmony_ci } 5311cb0ef41Sopenharmony_ci } 5321cb0ef41Sopenharmony_ci return -1; 5331cb0ef41Sopenharmony_ci} 5341cb0ef41Sopenharmony_ci 5351cb0ef41Sopenharmony_ci// Perform a a single stand-alone search. 5361cb0ef41Sopenharmony_ci// If searching multiple times for the same pattern, a search 5371cb0ef41Sopenharmony_ci// object should be constructed once and the Search function then called 5381cb0ef41Sopenharmony_ci// for each search. 5391cb0ef41Sopenharmony_citemplate <typename SubjectChar, typename PatternChar> 5401cb0ef41Sopenharmony_ciint SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject, 5411cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern, int start_index) { 5421cb0ef41Sopenharmony_ci StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 5431cb0ef41Sopenharmony_ci return search.Search(subject, start_index); 5441cb0ef41Sopenharmony_ci} 5451cb0ef41Sopenharmony_ci 5461cb0ef41Sopenharmony_ci// A wrapper function around SearchString that wraps raw pointers to the subject 5471cb0ef41Sopenharmony_ci// and pattern as vectors before calling SearchString. Used from the 5481cb0ef41Sopenharmony_ci// StringIndexOf builtin. 5491cb0ef41Sopenharmony_citemplate <typename SubjectChar, typename PatternChar> 5501cb0ef41Sopenharmony_ciintptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr, 5511cb0ef41Sopenharmony_ci int subject_length, const PatternChar* pattern_ptr, 5521cb0ef41Sopenharmony_ci int pattern_length, int start_index) { 5531cb0ef41Sopenharmony_ci DisallowGarbageCollection no_gc; 5541cb0ef41Sopenharmony_ci base::Vector<const SubjectChar> subject(subject_ptr, subject_length); 5551cb0ef41Sopenharmony_ci base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length); 5561cb0ef41Sopenharmony_ci return SearchString(isolate, subject, pattern, start_index); 5571cb0ef41Sopenharmony_ci} 5581cb0ef41Sopenharmony_ci 5591cb0ef41Sopenharmony_ci} // namespace internal 5601cb0ef41Sopenharmony_ci} // namespace v8 5611cb0ef41Sopenharmony_ci 5621cb0ef41Sopenharmony_ci#endif // V8_STRINGS_STRING_SEARCH_H_ 563