11cb0ef41Sopenharmony_ci// Copyright 2011 the V8 project authors. All rights reserved. 21cb0ef41Sopenharmony_ci// Use of this source code is governed by a BSD-style license that can be 31cb0ef41Sopenharmony_ci// found in the LICENSE file. 41cb0ef41Sopenharmony_ci 51cb0ef41Sopenharmony_ci#ifndef SRC_STRING_SEARCH_H_ 61cb0ef41Sopenharmony_ci#define SRC_STRING_SEARCH_H_ 71cb0ef41Sopenharmony_ci 81cb0ef41Sopenharmony_ci#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS 91cb0ef41Sopenharmony_ci 101cb0ef41Sopenharmony_ci#include "util.h" 111cb0ef41Sopenharmony_ci 121cb0ef41Sopenharmony_ci#include <cstring> 131cb0ef41Sopenharmony_ci#include <algorithm> 141cb0ef41Sopenharmony_ci 151cb0ef41Sopenharmony_cinamespace node { 161cb0ef41Sopenharmony_cinamespace stringsearch { 171cb0ef41Sopenharmony_ci 181cb0ef41Sopenharmony_citemplate <typename T> 191cb0ef41Sopenharmony_ciclass Vector { 201cb0ef41Sopenharmony_ci public: 211cb0ef41Sopenharmony_ci Vector(T* data, size_t length, bool isForward) 221cb0ef41Sopenharmony_ci : start_(data), length_(length), is_forward_(isForward) { 231cb0ef41Sopenharmony_ci CHECK(length > 0 && data != nullptr); 241cb0ef41Sopenharmony_ci } 251cb0ef41Sopenharmony_ci 261cb0ef41Sopenharmony_ci // Returns the start of the memory range. 271cb0ef41Sopenharmony_ci // For vector v this is NOT necessarily &v[0], see forward(). 281cb0ef41Sopenharmony_ci const T* start() const { return start_; } 291cb0ef41Sopenharmony_ci 301cb0ef41Sopenharmony_ci // Returns the length of the vector, in characters. 311cb0ef41Sopenharmony_ci size_t length() const { return length_; } 321cb0ef41Sopenharmony_ci 331cb0ef41Sopenharmony_ci // Returns true if the Vector is front-to-back, false if back-to-front. 341cb0ef41Sopenharmony_ci // In the latter case, v[0] corresponds to the *end* of the memory range. 351cb0ef41Sopenharmony_ci bool forward() const { return is_forward_; } 361cb0ef41Sopenharmony_ci 371cb0ef41Sopenharmony_ci // Access individual vector elements - checks bounds in debug mode. 381cb0ef41Sopenharmony_ci T& operator[](size_t index) const { 391cb0ef41Sopenharmony_ci DCHECK_LT(index, length_); 401cb0ef41Sopenharmony_ci return start_[is_forward_ ? index : (length_ - index - 1)]; 411cb0ef41Sopenharmony_ci } 421cb0ef41Sopenharmony_ci 431cb0ef41Sopenharmony_ci private: 441cb0ef41Sopenharmony_ci T* start_; 451cb0ef41Sopenharmony_ci size_t length_; 461cb0ef41Sopenharmony_ci bool is_forward_; 471cb0ef41Sopenharmony_ci}; 481cb0ef41Sopenharmony_ci 491cb0ef41Sopenharmony_ci 501cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 511cb0ef41Sopenharmony_ci// String Search object. 521cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 531cb0ef41Sopenharmony_ci 541cb0ef41Sopenharmony_ci// Class holding constants and methods that apply to all string search variants, 551cb0ef41Sopenharmony_ci// independently of subject and pattern char size. 561cb0ef41Sopenharmony_ciclass StringSearchBase { 571cb0ef41Sopenharmony_ci protected: 581cb0ef41Sopenharmony_ci // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 591cb0ef41Sopenharmony_ci // limit, we can fix the size of tables. For a needle longer than this limit, 601cb0ef41Sopenharmony_ci // search will not be optimal, since we only build tables for a suffix 611cb0ef41Sopenharmony_ci // of the string, but it is a safe approximation. 621cb0ef41Sopenharmony_ci static const int kBMMaxShift = 250; 631cb0ef41Sopenharmony_ci 641cb0ef41Sopenharmony_ci // Reduce alphabet to this size. 651cb0ef41Sopenharmony_ci // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 661cb0ef41Sopenharmony_ci // proportional to the input alphabet. We reduce the alphabet size by 671cb0ef41Sopenharmony_ci // equating input characters modulo a smaller alphabet size. This gives 681cb0ef41Sopenharmony_ci // a potentially less efficient searching, but is a safe approximation. 691cb0ef41Sopenharmony_ci // For needles using only characters in the same Unicode 256-code point page, 701cb0ef41Sopenharmony_ci // there is no search speed degradation. 711cb0ef41Sopenharmony_ci static const int kLatin1AlphabetSize = 256; 721cb0ef41Sopenharmony_ci static const int kUC16AlphabetSize = 256; 731cb0ef41Sopenharmony_ci 741cb0ef41Sopenharmony_ci // Bad-char shift table stored in the state. It's length is the alphabet size. 751cb0ef41Sopenharmony_ci // For patterns below this length, the skip length of Boyer-Moore is too short 761cb0ef41Sopenharmony_ci // to compensate for the algorithmic overhead compared to simple brute force. 771cb0ef41Sopenharmony_ci static const int kBMMinPatternLength = 8; 781cb0ef41Sopenharmony_ci 791cb0ef41Sopenharmony_ci // Store for the BoyerMoore(Horspool) bad char shift table. 801cb0ef41Sopenharmony_ci int bad_char_shift_table_[kUC16AlphabetSize]; 811cb0ef41Sopenharmony_ci // Store for the BoyerMoore good suffix shift table. 821cb0ef41Sopenharmony_ci int good_suffix_shift_table_[kBMMaxShift + 1]; 831cb0ef41Sopenharmony_ci // Table used temporarily while building the BoyerMoore good suffix 841cb0ef41Sopenharmony_ci // shift table. 851cb0ef41Sopenharmony_ci int suffix_table_[kBMMaxShift + 1]; 861cb0ef41Sopenharmony_ci}; 871cb0ef41Sopenharmony_ci 881cb0ef41Sopenharmony_citemplate <typename Char> 891cb0ef41Sopenharmony_ciclass StringSearch : private StringSearchBase { 901cb0ef41Sopenharmony_ci public: 911cb0ef41Sopenharmony_ci typedef stringsearch::Vector<const Char> Vector; 921cb0ef41Sopenharmony_ci 931cb0ef41Sopenharmony_ci explicit StringSearch(Vector pattern) 941cb0ef41Sopenharmony_ci : pattern_(pattern), start_(0) { 951cb0ef41Sopenharmony_ci if (pattern.length() >= kBMMaxShift) { 961cb0ef41Sopenharmony_ci start_ = pattern.length() - kBMMaxShift; 971cb0ef41Sopenharmony_ci } 981cb0ef41Sopenharmony_ci 991cb0ef41Sopenharmony_ci size_t pattern_length = pattern_.length(); 1001cb0ef41Sopenharmony_ci CHECK_GT(pattern_length, 0); 1011cb0ef41Sopenharmony_ci if (pattern_length < kBMMinPatternLength) { 1021cb0ef41Sopenharmony_ci if (pattern_length == 1) { 1031cb0ef41Sopenharmony_ci strategy_ = SearchStrategy::kSingleChar; 1041cb0ef41Sopenharmony_ci return; 1051cb0ef41Sopenharmony_ci } 1061cb0ef41Sopenharmony_ci strategy_ = SearchStrategy::kLinear; 1071cb0ef41Sopenharmony_ci return; 1081cb0ef41Sopenharmony_ci } 1091cb0ef41Sopenharmony_ci strategy_ = SearchStrategy::kInitial; 1101cb0ef41Sopenharmony_ci } 1111cb0ef41Sopenharmony_ci 1121cb0ef41Sopenharmony_ci size_t Search(Vector subject, size_t index) { 1131cb0ef41Sopenharmony_ci switch (strategy_) { 1141cb0ef41Sopenharmony_ci case kBoyerMooreHorspool: 1151cb0ef41Sopenharmony_ci return BoyerMooreHorspoolSearch(subject, index); 1161cb0ef41Sopenharmony_ci case kBoyerMoore: 1171cb0ef41Sopenharmony_ci return BoyerMooreSearch(subject, index); 1181cb0ef41Sopenharmony_ci case kInitial: 1191cb0ef41Sopenharmony_ci return InitialSearch(subject, index); 1201cb0ef41Sopenharmony_ci case kLinear: 1211cb0ef41Sopenharmony_ci return LinearSearch(subject, index); 1221cb0ef41Sopenharmony_ci case kSingleChar: 1231cb0ef41Sopenharmony_ci return SingleCharSearch(subject, index); 1241cb0ef41Sopenharmony_ci } 1251cb0ef41Sopenharmony_ci UNREACHABLE(); 1261cb0ef41Sopenharmony_ci } 1271cb0ef41Sopenharmony_ci 1281cb0ef41Sopenharmony_ci static inline int AlphabetSize() { 1291cb0ef41Sopenharmony_ci if (sizeof(Char) == 1) { 1301cb0ef41Sopenharmony_ci // Latin1 needle. 1311cb0ef41Sopenharmony_ci return kLatin1AlphabetSize; 1321cb0ef41Sopenharmony_ci } else { 1331cb0ef41Sopenharmony_ci // UC16 needle. 1341cb0ef41Sopenharmony_ci return kUC16AlphabetSize; 1351cb0ef41Sopenharmony_ci } 1361cb0ef41Sopenharmony_ci 1371cb0ef41Sopenharmony_ci static_assert(sizeof(Char) == sizeof(uint8_t) || 1381cb0ef41Sopenharmony_ci sizeof(Char) == sizeof(uint16_t), 1391cb0ef41Sopenharmony_ci "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); 1401cb0ef41Sopenharmony_ci } 1411cb0ef41Sopenharmony_ci 1421cb0ef41Sopenharmony_ci private: 1431cb0ef41Sopenharmony_ci typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); 1441cb0ef41Sopenharmony_ci size_t SingleCharSearch(Vector subject, size_t start_index); 1451cb0ef41Sopenharmony_ci size_t LinearSearch(Vector subject, size_t start_index); 1461cb0ef41Sopenharmony_ci size_t InitialSearch(Vector subject, size_t start_index); 1471cb0ef41Sopenharmony_ci size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); 1481cb0ef41Sopenharmony_ci size_t BoyerMooreSearch(Vector subject, size_t start_index); 1491cb0ef41Sopenharmony_ci 1501cb0ef41Sopenharmony_ci void PopulateBoyerMooreHorspoolTable(); 1511cb0ef41Sopenharmony_ci 1521cb0ef41Sopenharmony_ci void PopulateBoyerMooreTable(); 1531cb0ef41Sopenharmony_ci 1541cb0ef41Sopenharmony_ci static inline int CharOccurrence(int* bad_char_occurrence, 1551cb0ef41Sopenharmony_ci Char char_code) { 1561cb0ef41Sopenharmony_ci if (sizeof(Char) == 1) { 1571cb0ef41Sopenharmony_ci return bad_char_occurrence[static_cast<int>(char_code)]; 1581cb0ef41Sopenharmony_ci } 1591cb0ef41Sopenharmony_ci // Both pattern and subject are UC16. Reduce character to equivalence class. 1601cb0ef41Sopenharmony_ci int equiv_class = char_code % kUC16AlphabetSize; 1611cb0ef41Sopenharmony_ci return bad_char_occurrence[equiv_class]; 1621cb0ef41Sopenharmony_ci } 1631cb0ef41Sopenharmony_ci 1641cb0ef41Sopenharmony_ci enum SearchStrategy { 1651cb0ef41Sopenharmony_ci kBoyerMooreHorspool, 1661cb0ef41Sopenharmony_ci kBoyerMoore, 1671cb0ef41Sopenharmony_ci kInitial, 1681cb0ef41Sopenharmony_ci kLinear, 1691cb0ef41Sopenharmony_ci kSingleChar, 1701cb0ef41Sopenharmony_ci }; 1711cb0ef41Sopenharmony_ci 1721cb0ef41Sopenharmony_ci // The pattern to search for. 1731cb0ef41Sopenharmony_ci Vector pattern_; 1741cb0ef41Sopenharmony_ci SearchStrategy strategy_; 1751cb0ef41Sopenharmony_ci // Cache value of Max(0, pattern_length() - kBMMaxShift) 1761cb0ef41Sopenharmony_ci size_t start_; 1771cb0ef41Sopenharmony_ci}; 1781cb0ef41Sopenharmony_ci 1791cb0ef41Sopenharmony_ci 1801cb0ef41Sopenharmony_citemplate <typename T, typename U> 1811cb0ef41Sopenharmony_ciinline T AlignDown(T value, U alignment) { 1821cb0ef41Sopenharmony_ci return reinterpret_cast<T>( 1831cb0ef41Sopenharmony_ci (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); 1841cb0ef41Sopenharmony_ci} 1851cb0ef41Sopenharmony_ci 1861cb0ef41Sopenharmony_ci 1871cb0ef41Sopenharmony_ciinline uint8_t GetHighestValueByte(uint16_t character) { 1881cb0ef41Sopenharmony_ci return std::max(static_cast<uint8_t>(character & 0xFF), 1891cb0ef41Sopenharmony_ci static_cast<uint8_t>(character >> 8)); 1901cb0ef41Sopenharmony_ci} 1911cb0ef41Sopenharmony_ci 1921cb0ef41Sopenharmony_ci 1931cb0ef41Sopenharmony_ciinline uint8_t GetHighestValueByte(uint8_t character) { return character; } 1941cb0ef41Sopenharmony_ci 1951cb0ef41Sopenharmony_ci 1961cb0ef41Sopenharmony_ci// Searches for a byte value in a memory buffer, back to front. 1971cb0ef41Sopenharmony_ci// Uses memrchr(3) on systems which support it, for speed. 1981cb0ef41Sopenharmony_ci// Falls back to a vanilla for loop on non-GNU systems such as Windows. 1991cb0ef41Sopenharmony_ciinline const void* MemrchrFill(const void* haystack, uint8_t needle, 2001cb0ef41Sopenharmony_ci size_t haystack_len) { 2011cb0ef41Sopenharmony_ci#ifdef _GNU_SOURCE 2021cb0ef41Sopenharmony_ci return memrchr(haystack, needle, haystack_len); 2031cb0ef41Sopenharmony_ci#else 2041cb0ef41Sopenharmony_ci const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack); 2051cb0ef41Sopenharmony_ci for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { 2061cb0ef41Sopenharmony_ci if (haystack8[i] == needle) { 2071cb0ef41Sopenharmony_ci return haystack8 + i; 2081cb0ef41Sopenharmony_ci } 2091cb0ef41Sopenharmony_ci } 2101cb0ef41Sopenharmony_ci return nullptr; 2111cb0ef41Sopenharmony_ci#endif 2121cb0ef41Sopenharmony_ci} 2131cb0ef41Sopenharmony_ci 2141cb0ef41Sopenharmony_ci 2151cb0ef41Sopenharmony_ci// Finds the first occurrence of *two-byte* character pattern[0] in the string 2161cb0ef41Sopenharmony_ci// `subject`. Does not check that the whole pattern matches. 2171cb0ef41Sopenharmony_citemplate <typename Char> 2181cb0ef41Sopenharmony_ciinline size_t FindFirstCharacter(Vector<const Char> pattern, 2191cb0ef41Sopenharmony_ci Vector<const Char> subject, size_t index) { 2201cb0ef41Sopenharmony_ci const Char pattern_first_char = pattern[0]; 2211cb0ef41Sopenharmony_ci const size_t max_n = (subject.length() - pattern.length() + 1); 2221cb0ef41Sopenharmony_ci 2231cb0ef41Sopenharmony_ci // For speed, search for the more `rare` of the two bytes in pattern[0] 2241cb0ef41Sopenharmony_ci // using memchr / memrchr (which are much faster than a simple for loop). 2251cb0ef41Sopenharmony_ci const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 2261cb0ef41Sopenharmony_ci size_t pos = index; 2271cb0ef41Sopenharmony_ci do { 2281cb0ef41Sopenharmony_ci const size_t bytes_to_search = (max_n - pos) * sizeof(Char); 2291cb0ef41Sopenharmony_ci const void* void_pos; 2301cb0ef41Sopenharmony_ci if (subject.forward()) { 2311cb0ef41Sopenharmony_ci // Assert that bytes_to_search won't overflow 2321cb0ef41Sopenharmony_ci CHECK_LE(pos, max_n); 2331cb0ef41Sopenharmony_ci CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); 2341cb0ef41Sopenharmony_ci void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); 2351cb0ef41Sopenharmony_ci } else { 2361cb0ef41Sopenharmony_ci CHECK_LE(pos, subject.length()); 2371cb0ef41Sopenharmony_ci CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); 2381cb0ef41Sopenharmony_ci void_pos = MemrchrFill(subject.start() + pattern.length() - 1, 2391cb0ef41Sopenharmony_ci search_byte, 2401cb0ef41Sopenharmony_ci bytes_to_search); 2411cb0ef41Sopenharmony_ci } 2421cb0ef41Sopenharmony_ci const Char* char_pos = static_cast<const Char*>(void_pos); 2431cb0ef41Sopenharmony_ci if (char_pos == nullptr) 2441cb0ef41Sopenharmony_ci return subject.length(); 2451cb0ef41Sopenharmony_ci 2461cb0ef41Sopenharmony_ci // Then, for each match, verify that the full two bytes match pattern[0]. 2471cb0ef41Sopenharmony_ci char_pos = AlignDown(char_pos, sizeof(Char)); 2481cb0ef41Sopenharmony_ci size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); 2491cb0ef41Sopenharmony_ci pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); 2501cb0ef41Sopenharmony_ci if (subject[pos] == pattern_first_char) { 2511cb0ef41Sopenharmony_ci // Match found, hooray. 2521cb0ef41Sopenharmony_ci return pos; 2531cb0ef41Sopenharmony_ci } 2541cb0ef41Sopenharmony_ci // Search byte matched, but the other byte of pattern[0] didn't. Keep going. 2551cb0ef41Sopenharmony_ci } while (++pos < max_n); 2561cb0ef41Sopenharmony_ci 2571cb0ef41Sopenharmony_ci return subject.length(); 2581cb0ef41Sopenharmony_ci} 2591cb0ef41Sopenharmony_ci 2601cb0ef41Sopenharmony_ci 2611cb0ef41Sopenharmony_ci// Finds the first occurrence of the byte pattern[0] in string `subject`. 2621cb0ef41Sopenharmony_ci// Does not verify that the whole pattern matches. 2631cb0ef41Sopenharmony_citemplate <> 2641cb0ef41Sopenharmony_ciinline size_t FindFirstCharacter(Vector<const uint8_t> pattern, 2651cb0ef41Sopenharmony_ci Vector<const uint8_t> subject, 2661cb0ef41Sopenharmony_ci size_t index) { 2671cb0ef41Sopenharmony_ci const uint8_t pattern_first_char = pattern[0]; 2681cb0ef41Sopenharmony_ci const size_t subj_len = subject.length(); 2691cb0ef41Sopenharmony_ci const size_t max_n = (subject.length() - pattern.length() + 1); 2701cb0ef41Sopenharmony_ci 2711cb0ef41Sopenharmony_ci const void* pos; 2721cb0ef41Sopenharmony_ci if (subject.forward()) { 2731cb0ef41Sopenharmony_ci pos = memchr(subject.start() + index, pattern_first_char, max_n - index); 2741cb0ef41Sopenharmony_ci } else { 2751cb0ef41Sopenharmony_ci pos = MemrchrFill(subject.start() + pattern.length() - 1, 2761cb0ef41Sopenharmony_ci pattern_first_char, 2771cb0ef41Sopenharmony_ci max_n - index); 2781cb0ef41Sopenharmony_ci } 2791cb0ef41Sopenharmony_ci const uint8_t* char_pos = static_cast<const uint8_t*>(pos); 2801cb0ef41Sopenharmony_ci if (char_pos == nullptr) { 2811cb0ef41Sopenharmony_ci return subj_len; 2821cb0ef41Sopenharmony_ci } 2831cb0ef41Sopenharmony_ci 2841cb0ef41Sopenharmony_ci size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); 2851cb0ef41Sopenharmony_ci return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); 2861cb0ef41Sopenharmony_ci} 2871cb0ef41Sopenharmony_ci 2881cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2891cb0ef41Sopenharmony_ci// Single Character Pattern Search Strategy 2901cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 2911cb0ef41Sopenharmony_ci 2921cb0ef41Sopenharmony_citemplate <typename Char> 2931cb0ef41Sopenharmony_cisize_t StringSearch<Char>::SingleCharSearch( 2941cb0ef41Sopenharmony_ci Vector subject, 2951cb0ef41Sopenharmony_ci size_t index) { 2961cb0ef41Sopenharmony_ci CHECK_EQ(1, pattern_.length()); 2971cb0ef41Sopenharmony_ci return FindFirstCharacter(pattern_, subject, index); 2981cb0ef41Sopenharmony_ci} 2991cb0ef41Sopenharmony_ci 3001cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 3011cb0ef41Sopenharmony_ci// Linear Search Strategy 3021cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 3031cb0ef41Sopenharmony_ci 3041cb0ef41Sopenharmony_ci// Simple linear search for short patterns. Never bails out. 3051cb0ef41Sopenharmony_citemplate <typename Char> 3061cb0ef41Sopenharmony_cisize_t StringSearch<Char>::LinearSearch( 3071cb0ef41Sopenharmony_ci Vector subject, 3081cb0ef41Sopenharmony_ci size_t index) { 3091cb0ef41Sopenharmony_ci CHECK_GT(pattern_.length(), 1); 3101cb0ef41Sopenharmony_ci const size_t n = subject.length() - pattern_.length(); 3111cb0ef41Sopenharmony_ci for (size_t i = index; i <= n; i++) { 3121cb0ef41Sopenharmony_ci i = FindFirstCharacter(pattern_, subject, i); 3131cb0ef41Sopenharmony_ci if (i == subject.length()) 3141cb0ef41Sopenharmony_ci return subject.length(); 3151cb0ef41Sopenharmony_ci CHECK_LE(i, n); 3161cb0ef41Sopenharmony_ci 3171cb0ef41Sopenharmony_ci bool matches = true; 3181cb0ef41Sopenharmony_ci for (size_t j = 1; j < pattern_.length(); j++) { 3191cb0ef41Sopenharmony_ci if (pattern_[j] != subject[i + j]) { 3201cb0ef41Sopenharmony_ci matches = false; 3211cb0ef41Sopenharmony_ci break; 3221cb0ef41Sopenharmony_ci } 3231cb0ef41Sopenharmony_ci } 3241cb0ef41Sopenharmony_ci if (matches) { 3251cb0ef41Sopenharmony_ci return i; 3261cb0ef41Sopenharmony_ci } 3271cb0ef41Sopenharmony_ci } 3281cb0ef41Sopenharmony_ci return subject.length(); 3291cb0ef41Sopenharmony_ci} 3301cb0ef41Sopenharmony_ci 3311cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 3321cb0ef41Sopenharmony_ci// Boyer-Moore string search 3331cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 3341cb0ef41Sopenharmony_ci 3351cb0ef41Sopenharmony_citemplate <typename Char> 3361cb0ef41Sopenharmony_cisize_t StringSearch<Char>::BoyerMooreSearch( 3371cb0ef41Sopenharmony_ci Vector subject, 3381cb0ef41Sopenharmony_ci size_t start_index) { 3391cb0ef41Sopenharmony_ci const size_t subject_length = subject.length(); 3401cb0ef41Sopenharmony_ci const size_t pattern_length = pattern_.length(); 3411cb0ef41Sopenharmony_ci // Only preprocess at most kBMMaxShift last characters of pattern. 3421cb0ef41Sopenharmony_ci size_t start = start_; 3431cb0ef41Sopenharmony_ci 3441cb0ef41Sopenharmony_ci int* bad_char_occurrence = bad_char_shift_table_; 3451cb0ef41Sopenharmony_ci int* good_suffix_shift = good_suffix_shift_table_ - start_; 3461cb0ef41Sopenharmony_ci 3471cb0ef41Sopenharmony_ci Char last_char = pattern_[pattern_length - 1]; 3481cb0ef41Sopenharmony_ci size_t index = start_index; 3491cb0ef41Sopenharmony_ci // Continue search from i. 3501cb0ef41Sopenharmony_ci while (index <= subject_length - pattern_length) { 3511cb0ef41Sopenharmony_ci size_t j = pattern_length - 1; 3521cb0ef41Sopenharmony_ci int c; 3531cb0ef41Sopenharmony_ci while (last_char != (c = subject[index + j])) { 3541cb0ef41Sopenharmony_ci int shift = j - CharOccurrence(bad_char_occurrence, c); 3551cb0ef41Sopenharmony_ci index += shift; 3561cb0ef41Sopenharmony_ci if (index > subject_length - pattern_length) { 3571cb0ef41Sopenharmony_ci return subject.length(); 3581cb0ef41Sopenharmony_ci } 3591cb0ef41Sopenharmony_ci } 3601cb0ef41Sopenharmony_ci while (pattern_[j] == (c = subject[index + j])) { 3611cb0ef41Sopenharmony_ci if (j == 0) { 3621cb0ef41Sopenharmony_ci return index; 3631cb0ef41Sopenharmony_ci } 3641cb0ef41Sopenharmony_ci j--; 3651cb0ef41Sopenharmony_ci } 3661cb0ef41Sopenharmony_ci if (j < start) { 3671cb0ef41Sopenharmony_ci // we have matched more than our tables allow us to be smart about. 3681cb0ef41Sopenharmony_ci // Fall back on BMH shift. 3691cb0ef41Sopenharmony_ci index += pattern_length - 1 - 3701cb0ef41Sopenharmony_ci CharOccurrence(bad_char_occurrence, last_char); 3711cb0ef41Sopenharmony_ci } else { 3721cb0ef41Sopenharmony_ci int gs_shift = good_suffix_shift[j + 1]; 3731cb0ef41Sopenharmony_ci int bc_occ = CharOccurrence(bad_char_occurrence, c); 3741cb0ef41Sopenharmony_ci int shift = j - bc_occ; 3751cb0ef41Sopenharmony_ci if (gs_shift > shift) { 3761cb0ef41Sopenharmony_ci shift = gs_shift; 3771cb0ef41Sopenharmony_ci } 3781cb0ef41Sopenharmony_ci index += shift; 3791cb0ef41Sopenharmony_ci } 3801cb0ef41Sopenharmony_ci } 3811cb0ef41Sopenharmony_ci 3821cb0ef41Sopenharmony_ci return subject.length(); 3831cb0ef41Sopenharmony_ci} 3841cb0ef41Sopenharmony_ci 3851cb0ef41Sopenharmony_citemplate <typename Char> 3861cb0ef41Sopenharmony_civoid StringSearch<Char>::PopulateBoyerMooreTable() { 3871cb0ef41Sopenharmony_ci const size_t pattern_length = pattern_.length(); 3881cb0ef41Sopenharmony_ci // Only look at the last kBMMaxShift characters of pattern (from start_ 3891cb0ef41Sopenharmony_ci // to pattern_length). 3901cb0ef41Sopenharmony_ci const size_t start = start_; 3911cb0ef41Sopenharmony_ci const size_t length = pattern_length - start; 3921cb0ef41Sopenharmony_ci 3931cb0ef41Sopenharmony_ci // Biased tables so that we can use pattern indices as table indices, 3941cb0ef41Sopenharmony_ci // even if we only cover the part of the pattern from offset start. 3951cb0ef41Sopenharmony_ci int* shift_table = good_suffix_shift_table_ - start_; 3961cb0ef41Sopenharmony_ci int* suffix_table = suffix_table_ - start_; 3971cb0ef41Sopenharmony_ci 3981cb0ef41Sopenharmony_ci // Initialize table. 3991cb0ef41Sopenharmony_ci for (size_t i = start; i < pattern_length; i++) { 4001cb0ef41Sopenharmony_ci shift_table[i] = length; 4011cb0ef41Sopenharmony_ci } 4021cb0ef41Sopenharmony_ci shift_table[pattern_length] = 1; 4031cb0ef41Sopenharmony_ci suffix_table[pattern_length] = pattern_length + 1; 4041cb0ef41Sopenharmony_ci 4051cb0ef41Sopenharmony_ci if (pattern_length <= start) { 4061cb0ef41Sopenharmony_ci return; 4071cb0ef41Sopenharmony_ci } 4081cb0ef41Sopenharmony_ci 4091cb0ef41Sopenharmony_ci // Find suffixes. 4101cb0ef41Sopenharmony_ci Char last_char = pattern_[pattern_length - 1]; 4111cb0ef41Sopenharmony_ci size_t suffix = pattern_length + 1; 4121cb0ef41Sopenharmony_ci { 4131cb0ef41Sopenharmony_ci size_t i = pattern_length; 4141cb0ef41Sopenharmony_ci while (i > start) { 4151cb0ef41Sopenharmony_ci Char c = pattern_[i - 1]; 4161cb0ef41Sopenharmony_ci while (suffix <= pattern_length && c != pattern_[suffix - 1]) { 4171cb0ef41Sopenharmony_ci if (static_cast<size_t>(shift_table[suffix]) == length) { 4181cb0ef41Sopenharmony_ci shift_table[suffix] = suffix - i; 4191cb0ef41Sopenharmony_ci } 4201cb0ef41Sopenharmony_ci suffix = suffix_table[suffix]; 4211cb0ef41Sopenharmony_ci } 4221cb0ef41Sopenharmony_ci suffix_table[--i] = --suffix; 4231cb0ef41Sopenharmony_ci if (suffix == pattern_length) { 4241cb0ef41Sopenharmony_ci // No suffix to extend, so we check against last_char only. 4251cb0ef41Sopenharmony_ci while ((i > start) && (pattern_[i - 1] != last_char)) { 4261cb0ef41Sopenharmony_ci if (static_cast<size_t>(shift_table[pattern_length]) == length) { 4271cb0ef41Sopenharmony_ci shift_table[pattern_length] = pattern_length - i; 4281cb0ef41Sopenharmony_ci } 4291cb0ef41Sopenharmony_ci suffix_table[--i] = pattern_length; 4301cb0ef41Sopenharmony_ci } 4311cb0ef41Sopenharmony_ci if (i > start) { 4321cb0ef41Sopenharmony_ci suffix_table[--i] = --suffix; 4331cb0ef41Sopenharmony_ci } 4341cb0ef41Sopenharmony_ci } 4351cb0ef41Sopenharmony_ci } 4361cb0ef41Sopenharmony_ci } 4371cb0ef41Sopenharmony_ci // Build shift table using suffixes. 4381cb0ef41Sopenharmony_ci if (suffix < pattern_length) { 4391cb0ef41Sopenharmony_ci for (size_t i = start; i <= pattern_length; i++) { 4401cb0ef41Sopenharmony_ci if (static_cast<size_t>(shift_table[i]) == length) { 4411cb0ef41Sopenharmony_ci shift_table[i] = suffix - start; 4421cb0ef41Sopenharmony_ci } 4431cb0ef41Sopenharmony_ci if (i == suffix) { 4441cb0ef41Sopenharmony_ci suffix = suffix_table[suffix]; 4451cb0ef41Sopenharmony_ci } 4461cb0ef41Sopenharmony_ci } 4471cb0ef41Sopenharmony_ci } 4481cb0ef41Sopenharmony_ci} 4491cb0ef41Sopenharmony_ci 4501cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 4511cb0ef41Sopenharmony_ci// Boyer-Moore-Horspool string search. 4521cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 4531cb0ef41Sopenharmony_ci 4541cb0ef41Sopenharmony_citemplate <typename Char> 4551cb0ef41Sopenharmony_cisize_t StringSearch<Char>::BoyerMooreHorspoolSearch( 4561cb0ef41Sopenharmony_ci Vector subject, 4571cb0ef41Sopenharmony_ci size_t start_index) { 4581cb0ef41Sopenharmony_ci const size_t subject_length = subject.length(); 4591cb0ef41Sopenharmony_ci const size_t pattern_length = pattern_.length(); 4601cb0ef41Sopenharmony_ci int* char_occurrences = bad_char_shift_table_; 4611cb0ef41Sopenharmony_ci int64_t badness = -static_cast<int64_t>(pattern_length); 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_ci // How bad we are doing without a good-suffix table. 4641cb0ef41Sopenharmony_ci Char last_char = pattern_[pattern_length - 1]; 4651cb0ef41Sopenharmony_ci int last_char_shift = 4661cb0ef41Sopenharmony_ci pattern_length - 1 - 4671cb0ef41Sopenharmony_ci CharOccurrence(char_occurrences, last_char); 4681cb0ef41Sopenharmony_ci 4691cb0ef41Sopenharmony_ci // Perform search 4701cb0ef41Sopenharmony_ci size_t index = start_index; // No matches found prior to this index. 4711cb0ef41Sopenharmony_ci while (index <= subject_length - pattern_length) { 4721cb0ef41Sopenharmony_ci size_t j = pattern_length - 1; 4731cb0ef41Sopenharmony_ci int subject_char; 4741cb0ef41Sopenharmony_ci while (last_char != (subject_char = subject[index + j])) { 4751cb0ef41Sopenharmony_ci int bc_occ = CharOccurrence(char_occurrences, subject_char); 4761cb0ef41Sopenharmony_ci int shift = j - bc_occ; 4771cb0ef41Sopenharmony_ci index += shift; 4781cb0ef41Sopenharmony_ci badness += 1 - shift; // at most zero, so badness cannot increase. 4791cb0ef41Sopenharmony_ci if (index > subject_length - pattern_length) { 4801cb0ef41Sopenharmony_ci return subject_length; 4811cb0ef41Sopenharmony_ci } 4821cb0ef41Sopenharmony_ci } 4831cb0ef41Sopenharmony_ci j--; 4841cb0ef41Sopenharmony_ci while (pattern_[j] == (subject[index + j])) { 4851cb0ef41Sopenharmony_ci if (j == 0) { 4861cb0ef41Sopenharmony_ci return index; 4871cb0ef41Sopenharmony_ci } 4881cb0ef41Sopenharmony_ci j--; 4891cb0ef41Sopenharmony_ci } 4901cb0ef41Sopenharmony_ci index += last_char_shift; 4911cb0ef41Sopenharmony_ci // Badness increases by the number of characters we have 4921cb0ef41Sopenharmony_ci // checked, and decreases by the number of characters we 4931cb0ef41Sopenharmony_ci // can skip by shifting. It's a measure of how we are doing 4941cb0ef41Sopenharmony_ci // compared to reading each character exactly once. 4951cb0ef41Sopenharmony_ci badness += (pattern_length - j) - last_char_shift; 4961cb0ef41Sopenharmony_ci if (badness > 0) { 4971cb0ef41Sopenharmony_ci PopulateBoyerMooreTable(); 4981cb0ef41Sopenharmony_ci strategy_ = SearchStrategy::kBoyerMoore; 4991cb0ef41Sopenharmony_ci return BoyerMooreSearch(subject, index); 5001cb0ef41Sopenharmony_ci } 5011cb0ef41Sopenharmony_ci } 5021cb0ef41Sopenharmony_ci return subject.length(); 5031cb0ef41Sopenharmony_ci} 5041cb0ef41Sopenharmony_ci 5051cb0ef41Sopenharmony_citemplate <typename Char> 5061cb0ef41Sopenharmony_civoid StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { 5071cb0ef41Sopenharmony_ci const size_t pattern_length = pattern_.length(); 5081cb0ef41Sopenharmony_ci 5091cb0ef41Sopenharmony_ci int* bad_char_occurrence = bad_char_shift_table_; 5101cb0ef41Sopenharmony_ci 5111cb0ef41Sopenharmony_ci // Only preprocess at most kBMMaxShift last characters of pattern. 5121cb0ef41Sopenharmony_ci const size_t start = start_; 5131cb0ef41Sopenharmony_ci // Run forwards to populate bad_char_table, so that *last* instance 5141cb0ef41Sopenharmony_ci // of character equivalence class is the one registered. 5151cb0ef41Sopenharmony_ci // Notice: Doesn't include the last character. 5161cb0ef41Sopenharmony_ci const size_t table_size = AlphabetSize(); 5171cb0ef41Sopenharmony_ci if (start == 0) { 5181cb0ef41Sopenharmony_ci // All patterns less than kBMMaxShift in length. 5191cb0ef41Sopenharmony_ci memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); 5201cb0ef41Sopenharmony_ci } else { 5211cb0ef41Sopenharmony_ci for (size_t i = 0; i < table_size; i++) { 5221cb0ef41Sopenharmony_ci bad_char_occurrence[i] = start - 1; 5231cb0ef41Sopenharmony_ci } 5241cb0ef41Sopenharmony_ci } 5251cb0ef41Sopenharmony_ci for (size_t i = start; i < pattern_length - 1; i++) { 5261cb0ef41Sopenharmony_ci Char c = pattern_[i]; 5271cb0ef41Sopenharmony_ci int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); 5281cb0ef41Sopenharmony_ci bad_char_occurrence[bucket] = i; 5291cb0ef41Sopenharmony_ci } 5301cb0ef41Sopenharmony_ci} 5311cb0ef41Sopenharmony_ci 5321cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 5331cb0ef41Sopenharmony_ci// Linear string search with bailout to BMH. 5341cb0ef41Sopenharmony_ci//--------------------------------------------------------------------- 5351cb0ef41Sopenharmony_ci 5361cb0ef41Sopenharmony_ci// Simple linear search for short patterns, which bails out if the string 5371cb0ef41Sopenharmony_ci// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 5381cb0ef41Sopenharmony_citemplate <typename Char> 5391cb0ef41Sopenharmony_cisize_t StringSearch<Char>::InitialSearch( 5401cb0ef41Sopenharmony_ci Vector subject, 5411cb0ef41Sopenharmony_ci size_t index) { 5421cb0ef41Sopenharmony_ci const size_t pattern_length = pattern_.length(); 5431cb0ef41Sopenharmony_ci // Badness is a count of how much work we have done. When we have 5441cb0ef41Sopenharmony_ci // done enough work we decide it's probably worth switching to a better 5451cb0ef41Sopenharmony_ci // algorithm. 5461cb0ef41Sopenharmony_ci int64_t badness = -10 - (pattern_length << 2); 5471cb0ef41Sopenharmony_ci 5481cb0ef41Sopenharmony_ci // We know our pattern is at least 2 characters, we cache the first so 5491cb0ef41Sopenharmony_ci // the common case of the first character not matching is faster. 5501cb0ef41Sopenharmony_ci for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { 5511cb0ef41Sopenharmony_ci badness++; 5521cb0ef41Sopenharmony_ci if (badness <= 0) { 5531cb0ef41Sopenharmony_ci i = FindFirstCharacter(pattern_, subject, i); 5541cb0ef41Sopenharmony_ci if (i == subject.length()) 5551cb0ef41Sopenharmony_ci return subject.length(); 5561cb0ef41Sopenharmony_ci CHECK_LE(i, n); 5571cb0ef41Sopenharmony_ci size_t j = 1; 5581cb0ef41Sopenharmony_ci do { 5591cb0ef41Sopenharmony_ci if (pattern_[j] != subject[i + j]) { 5601cb0ef41Sopenharmony_ci break; 5611cb0ef41Sopenharmony_ci } 5621cb0ef41Sopenharmony_ci j++; 5631cb0ef41Sopenharmony_ci } while (j < pattern_length); 5641cb0ef41Sopenharmony_ci if (j == pattern_length) { 5651cb0ef41Sopenharmony_ci return i; 5661cb0ef41Sopenharmony_ci } 5671cb0ef41Sopenharmony_ci badness += j; 5681cb0ef41Sopenharmony_ci } else { 5691cb0ef41Sopenharmony_ci PopulateBoyerMooreHorspoolTable(); 5701cb0ef41Sopenharmony_ci strategy_ = SearchStrategy::kBoyerMooreHorspool; 5711cb0ef41Sopenharmony_ci return BoyerMooreHorspoolSearch(subject, i); 5721cb0ef41Sopenharmony_ci } 5731cb0ef41Sopenharmony_ci } 5741cb0ef41Sopenharmony_ci return subject.length(); 5751cb0ef41Sopenharmony_ci} 5761cb0ef41Sopenharmony_ci 5771cb0ef41Sopenharmony_ci// Perform a single stand-alone search. 5781cb0ef41Sopenharmony_ci// If searching multiple times for the same pattern, a search 5791cb0ef41Sopenharmony_ci// object should be constructed once and the Search function then called 5801cb0ef41Sopenharmony_ci// for each search. 5811cb0ef41Sopenharmony_citemplate <typename Char> 5821cb0ef41Sopenharmony_cisize_t SearchString(Vector<const Char> subject, 5831cb0ef41Sopenharmony_ci Vector<const Char> pattern, 5841cb0ef41Sopenharmony_ci size_t start_index) { 5851cb0ef41Sopenharmony_ci StringSearch<Char> search(pattern); 5861cb0ef41Sopenharmony_ci return search.Search(subject, start_index); 5871cb0ef41Sopenharmony_ci} 5881cb0ef41Sopenharmony_ci} // namespace stringsearch 5891cb0ef41Sopenharmony_ci} // namespace node 5901cb0ef41Sopenharmony_ci 5911cb0ef41Sopenharmony_cinamespace node { 5921cb0ef41Sopenharmony_ci 5931cb0ef41Sopenharmony_citemplate <typename Char> 5941cb0ef41Sopenharmony_cisize_t SearchString(const Char* haystack, 5951cb0ef41Sopenharmony_ci size_t haystack_length, 5961cb0ef41Sopenharmony_ci const Char* needle, 5971cb0ef41Sopenharmony_ci size_t needle_length, 5981cb0ef41Sopenharmony_ci size_t start_index, 5991cb0ef41Sopenharmony_ci bool is_forward) { 6001cb0ef41Sopenharmony_ci if (haystack_length < needle_length) return haystack_length; 6011cb0ef41Sopenharmony_ci // To do a reverse search (lastIndexOf instead of indexOf) without redundant 6021cb0ef41Sopenharmony_ci // code, create two vectors that are reversed views into the input strings. 6031cb0ef41Sopenharmony_ci // For example, v_needle[0] would return the *last* character of the needle. 6041cb0ef41Sopenharmony_ci // So we're searching for the first instance of rev(needle) in rev(haystack) 6051cb0ef41Sopenharmony_ci stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); 6061cb0ef41Sopenharmony_ci stringsearch::Vector<const Char> v_haystack( 6071cb0ef41Sopenharmony_ci haystack, haystack_length, is_forward); 6081cb0ef41Sopenharmony_ci size_t diff = haystack_length - needle_length; 6091cb0ef41Sopenharmony_ci size_t relative_start_index; 6101cb0ef41Sopenharmony_ci if (is_forward) { 6111cb0ef41Sopenharmony_ci relative_start_index = start_index; 6121cb0ef41Sopenharmony_ci } else if (diff < start_index) { 6131cb0ef41Sopenharmony_ci relative_start_index = 0; 6141cb0ef41Sopenharmony_ci } else { 6151cb0ef41Sopenharmony_ci relative_start_index = diff - start_index; 6161cb0ef41Sopenharmony_ci } 6171cb0ef41Sopenharmony_ci size_t pos = node::stringsearch::SearchString( 6181cb0ef41Sopenharmony_ci v_haystack, v_needle, relative_start_index); 6191cb0ef41Sopenharmony_ci if (pos == haystack_length) { 6201cb0ef41Sopenharmony_ci // not found 6211cb0ef41Sopenharmony_ci return pos; 6221cb0ef41Sopenharmony_ci } 6231cb0ef41Sopenharmony_ci return is_forward ? pos : (haystack_length - needle_length - pos); 6241cb0ef41Sopenharmony_ci} 6251cb0ef41Sopenharmony_ci 6261cb0ef41Sopenharmony_citemplate <size_t N> 6271cb0ef41Sopenharmony_cisize_t SearchString(const char* haystack, size_t haystack_length, 6281cb0ef41Sopenharmony_ci const char (&needle)[N]) { 6291cb0ef41Sopenharmony_ci return SearchString( 6301cb0ef41Sopenharmony_ci reinterpret_cast<const uint8_t*>(haystack), haystack_length, 6311cb0ef41Sopenharmony_ci reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true); 6321cb0ef41Sopenharmony_ci} 6331cb0ef41Sopenharmony_ci 6341cb0ef41Sopenharmony_ci} // namespace node 6351cb0ef41Sopenharmony_ci 6361cb0ef41Sopenharmony_ci#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS 6371cb0ef41Sopenharmony_ci 6381cb0ef41Sopenharmony_ci#endif // SRC_STRING_SEARCH_H_ 639