1// Copyright 2011 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef SRC_STRING_SEARCH_H_ 6#define SRC_STRING_SEARCH_H_ 7 8#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS 9 10#include "util.h" 11 12#include <cstring> 13#include <algorithm> 14 15namespace node { 16namespace stringsearch { 17 18template <typename T> 19class Vector { 20 public: 21 Vector(T* data, size_t length, bool isForward) 22 : start_(data), length_(length), is_forward_(isForward) { 23 CHECK(length > 0 && data != nullptr); 24 } 25 26 // Returns the start of the memory range. 27 // For vector v this is NOT necessarily &v[0], see forward(). 28 const T* start() const { return start_; } 29 30 // Returns the length of the vector, in characters. 31 size_t length() const { return length_; } 32 33 // Returns true if the Vector is front-to-back, false if back-to-front. 34 // In the latter case, v[0] corresponds to the *end* of the memory range. 35 bool forward() const { return is_forward_; } 36 37 // Access individual vector elements - checks bounds in debug mode. 38 T& operator[](size_t index) const { 39 DCHECK_LT(index, length_); 40 return start_[is_forward_ ? index : (length_ - index - 1)]; 41 } 42 43 private: 44 T* start_; 45 size_t length_; 46 bool is_forward_; 47}; 48 49 50//--------------------------------------------------------------------- 51// String Search object. 52//--------------------------------------------------------------------- 53 54// Class holding constants and methods that apply to all string search variants, 55// independently of subject and pattern char size. 56class StringSearchBase { 57 protected: 58 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 59 // limit, we can fix the size of tables. For a needle longer than this limit, 60 // search will not be optimal, since we only build tables for a suffix 61 // of the string, but it is a safe approximation. 62 static const int kBMMaxShift = 250; 63 64 // Reduce alphabet to this size. 65 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 66 // proportional to the input alphabet. We reduce the alphabet size by 67 // equating input characters modulo a smaller alphabet size. This gives 68 // a potentially less efficient searching, but is a safe approximation. 69 // For needles using only characters in the same Unicode 256-code point page, 70 // there is no search speed degradation. 71 static const int kLatin1AlphabetSize = 256; 72 static const int kUC16AlphabetSize = 256; 73 74 // Bad-char shift table stored in the state. It's length is the alphabet size. 75 // For patterns below this length, the skip length of Boyer-Moore is too short 76 // to compensate for the algorithmic overhead compared to simple brute force. 77 static const int kBMMinPatternLength = 8; 78 79 // Store for the BoyerMoore(Horspool) bad char shift table. 80 int bad_char_shift_table_[kUC16AlphabetSize]; 81 // Store for the BoyerMoore good suffix shift table. 82 int good_suffix_shift_table_[kBMMaxShift + 1]; 83 // Table used temporarily while building the BoyerMoore good suffix 84 // shift table. 85 int suffix_table_[kBMMaxShift + 1]; 86}; 87 88template <typename Char> 89class StringSearch : private StringSearchBase { 90 public: 91 typedef stringsearch::Vector<const Char> Vector; 92 93 explicit StringSearch(Vector pattern) 94 : pattern_(pattern), start_(0) { 95 if (pattern.length() >= kBMMaxShift) { 96 start_ = pattern.length() - kBMMaxShift; 97 } 98 99 size_t pattern_length = pattern_.length(); 100 CHECK_GT(pattern_length, 0); 101 if (pattern_length < kBMMinPatternLength) { 102 if (pattern_length == 1) { 103 strategy_ = SearchStrategy::kSingleChar; 104 return; 105 } 106 strategy_ = SearchStrategy::kLinear; 107 return; 108 } 109 strategy_ = SearchStrategy::kInitial; 110 } 111 112 size_t Search(Vector subject, size_t index) { 113 switch (strategy_) { 114 case kBoyerMooreHorspool: 115 return BoyerMooreHorspoolSearch(subject, index); 116 case kBoyerMoore: 117 return BoyerMooreSearch(subject, index); 118 case kInitial: 119 return InitialSearch(subject, index); 120 case kLinear: 121 return LinearSearch(subject, index); 122 case kSingleChar: 123 return SingleCharSearch(subject, index); 124 } 125 UNREACHABLE(); 126 } 127 128 static inline int AlphabetSize() { 129 if (sizeof(Char) == 1) { 130 // Latin1 needle. 131 return kLatin1AlphabetSize; 132 } else { 133 // UC16 needle. 134 return kUC16AlphabetSize; 135 } 136 137 static_assert(sizeof(Char) == sizeof(uint8_t) || 138 sizeof(Char) == sizeof(uint16_t), 139 "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); 140 } 141 142 private: 143 typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); 144 size_t SingleCharSearch(Vector subject, size_t start_index); 145 size_t LinearSearch(Vector subject, size_t start_index); 146 size_t InitialSearch(Vector subject, size_t start_index); 147 size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); 148 size_t BoyerMooreSearch(Vector subject, size_t start_index); 149 150 void PopulateBoyerMooreHorspoolTable(); 151 152 void PopulateBoyerMooreTable(); 153 154 static inline int CharOccurrence(int* bad_char_occurrence, 155 Char char_code) { 156 if (sizeof(Char) == 1) { 157 return bad_char_occurrence[static_cast<int>(char_code)]; 158 } 159 // Both pattern and subject are UC16. Reduce character to equivalence class. 160 int equiv_class = char_code % kUC16AlphabetSize; 161 return bad_char_occurrence[equiv_class]; 162 } 163 164 enum SearchStrategy { 165 kBoyerMooreHorspool, 166 kBoyerMoore, 167 kInitial, 168 kLinear, 169 kSingleChar, 170 }; 171 172 // The pattern to search for. 173 Vector pattern_; 174 SearchStrategy strategy_; 175 // Cache value of Max(0, pattern_length() - kBMMaxShift) 176 size_t start_; 177}; 178 179 180template <typename T, typename U> 181inline T AlignDown(T value, U alignment) { 182 return reinterpret_cast<T>( 183 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); 184} 185 186 187inline uint8_t GetHighestValueByte(uint16_t character) { 188 return std::max(static_cast<uint8_t>(character & 0xFF), 189 static_cast<uint8_t>(character >> 8)); 190} 191 192 193inline uint8_t GetHighestValueByte(uint8_t character) { return character; } 194 195 196// Searches for a byte value in a memory buffer, back to front. 197// Uses memrchr(3) on systems which support it, for speed. 198// Falls back to a vanilla for loop on non-GNU systems such as Windows. 199inline const void* MemrchrFill(const void* haystack, uint8_t needle, 200 size_t haystack_len) { 201#ifdef _GNU_SOURCE 202 return memrchr(haystack, needle, haystack_len); 203#else 204 const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack); 205 for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { 206 if (haystack8[i] == needle) { 207 return haystack8 + i; 208 } 209 } 210 return nullptr; 211#endif 212} 213 214 215// Finds the first occurrence of *two-byte* character pattern[0] in the string 216// `subject`. Does not check that the whole pattern matches. 217template <typename Char> 218inline size_t FindFirstCharacter(Vector<const Char> pattern, 219 Vector<const Char> subject, size_t index) { 220 const Char pattern_first_char = pattern[0]; 221 const size_t max_n = (subject.length() - pattern.length() + 1); 222 223 // For speed, search for the more `rare` of the two bytes in pattern[0] 224 // using memchr / memrchr (which are much faster than a simple for loop). 225 const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 226 size_t pos = index; 227 do { 228 const size_t bytes_to_search = (max_n - pos) * sizeof(Char); 229 const void* void_pos; 230 if (subject.forward()) { 231 // Assert that bytes_to_search won't overflow 232 CHECK_LE(pos, max_n); 233 CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); 234 void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); 235 } else { 236 CHECK_LE(pos, subject.length()); 237 CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); 238 void_pos = MemrchrFill(subject.start() + pattern.length() - 1, 239 search_byte, 240 bytes_to_search); 241 } 242 const Char* char_pos = static_cast<const Char*>(void_pos); 243 if (char_pos == nullptr) 244 return subject.length(); 245 246 // Then, for each match, verify that the full two bytes match pattern[0]. 247 char_pos = AlignDown(char_pos, sizeof(Char)); 248 size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); 249 pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); 250 if (subject[pos] == pattern_first_char) { 251 // Match found, hooray. 252 return pos; 253 } 254 // Search byte matched, but the other byte of pattern[0] didn't. Keep going. 255 } while (++pos < max_n); 256 257 return subject.length(); 258} 259 260 261// Finds the first occurrence of the byte pattern[0] in string `subject`. 262// Does not verify that the whole pattern matches. 263template <> 264inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, 265 Vector<const uint8_t> subject, 266 size_t index) { 267 const uint8_t pattern_first_char = pattern[0]; 268 const size_t subj_len = subject.length(); 269 const size_t max_n = (subject.length() - pattern.length() + 1); 270 271 const void* pos; 272 if (subject.forward()) { 273 pos = memchr(subject.start() + index, pattern_first_char, max_n - index); 274 } else { 275 pos = MemrchrFill(subject.start() + pattern.length() - 1, 276 pattern_first_char, 277 max_n - index); 278 } 279 const uint8_t* char_pos = static_cast<const uint8_t*>(pos); 280 if (char_pos == nullptr) { 281 return subj_len; 282 } 283 284 size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); 285 return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); 286} 287 288//--------------------------------------------------------------------- 289// Single Character Pattern Search Strategy 290//--------------------------------------------------------------------- 291 292template <typename Char> 293size_t StringSearch<Char>::SingleCharSearch( 294 Vector subject, 295 size_t index) { 296 CHECK_EQ(1, pattern_.length()); 297 return FindFirstCharacter(pattern_, subject, index); 298} 299 300//--------------------------------------------------------------------- 301// Linear Search Strategy 302//--------------------------------------------------------------------- 303 304// Simple linear search for short patterns. Never bails out. 305template <typename Char> 306size_t StringSearch<Char>::LinearSearch( 307 Vector subject, 308 size_t index) { 309 CHECK_GT(pattern_.length(), 1); 310 const size_t n = subject.length() - pattern_.length(); 311 for (size_t i = index; i <= n; i++) { 312 i = FindFirstCharacter(pattern_, subject, i); 313 if (i == subject.length()) 314 return subject.length(); 315 CHECK_LE(i, n); 316 317 bool matches = true; 318 for (size_t j = 1; j < pattern_.length(); j++) { 319 if (pattern_[j] != subject[i + j]) { 320 matches = false; 321 break; 322 } 323 } 324 if (matches) { 325 return i; 326 } 327 } 328 return subject.length(); 329} 330 331//--------------------------------------------------------------------- 332// Boyer-Moore string search 333//--------------------------------------------------------------------- 334 335template <typename Char> 336size_t StringSearch<Char>::BoyerMooreSearch( 337 Vector subject, 338 size_t start_index) { 339 const size_t subject_length = subject.length(); 340 const size_t pattern_length = pattern_.length(); 341 // Only preprocess at most kBMMaxShift last characters of pattern. 342 size_t start = start_; 343 344 int* bad_char_occurrence = bad_char_shift_table_; 345 int* good_suffix_shift = good_suffix_shift_table_ - start_; 346 347 Char last_char = pattern_[pattern_length - 1]; 348 size_t index = start_index; 349 // Continue search from i. 350 while (index <= subject_length - pattern_length) { 351 size_t j = pattern_length - 1; 352 int c; 353 while (last_char != (c = subject[index + j])) { 354 int shift = j - CharOccurrence(bad_char_occurrence, c); 355 index += shift; 356 if (index > subject_length - pattern_length) { 357 return subject.length(); 358 } 359 } 360 while (pattern_[j] == (c = subject[index + j])) { 361 if (j == 0) { 362 return index; 363 } 364 j--; 365 } 366 if (j < start) { 367 // we have matched more than our tables allow us to be smart about. 368 // Fall back on BMH shift. 369 index += pattern_length - 1 - 370 CharOccurrence(bad_char_occurrence, last_char); 371 } else { 372 int gs_shift = good_suffix_shift[j + 1]; 373 int bc_occ = CharOccurrence(bad_char_occurrence, c); 374 int shift = j - bc_occ; 375 if (gs_shift > shift) { 376 shift = gs_shift; 377 } 378 index += shift; 379 } 380 } 381 382 return subject.length(); 383} 384 385template <typename Char> 386void StringSearch<Char>::PopulateBoyerMooreTable() { 387 const size_t pattern_length = pattern_.length(); 388 // Only look at the last kBMMaxShift characters of pattern (from start_ 389 // to pattern_length). 390 const size_t start = start_; 391 const size_t length = pattern_length - start; 392 393 // Biased tables so that we can use pattern indices as table indices, 394 // even if we only cover the part of the pattern from offset start. 395 int* shift_table = good_suffix_shift_table_ - start_; 396 int* suffix_table = suffix_table_ - start_; 397 398 // Initialize table. 399 for (size_t i = start; i < pattern_length; i++) { 400 shift_table[i] = length; 401 } 402 shift_table[pattern_length] = 1; 403 suffix_table[pattern_length] = pattern_length + 1; 404 405 if (pattern_length <= start) { 406 return; 407 } 408 409 // Find suffixes. 410 Char last_char = pattern_[pattern_length - 1]; 411 size_t suffix = pattern_length + 1; 412 { 413 size_t i = pattern_length; 414 while (i > start) { 415 Char c = pattern_[i - 1]; 416 while (suffix <= pattern_length && c != pattern_[suffix - 1]) { 417 if (static_cast<size_t>(shift_table[suffix]) == length) { 418 shift_table[suffix] = suffix - i; 419 } 420 suffix = suffix_table[suffix]; 421 } 422 suffix_table[--i] = --suffix; 423 if (suffix == pattern_length) { 424 // No suffix to extend, so we check against last_char only. 425 while ((i > start) && (pattern_[i - 1] != last_char)) { 426 if (static_cast<size_t>(shift_table[pattern_length]) == length) { 427 shift_table[pattern_length] = pattern_length - i; 428 } 429 suffix_table[--i] = pattern_length; 430 } 431 if (i > start) { 432 suffix_table[--i] = --suffix; 433 } 434 } 435 } 436 } 437 // Build shift table using suffixes. 438 if (suffix < pattern_length) { 439 for (size_t i = start; i <= pattern_length; i++) { 440 if (static_cast<size_t>(shift_table[i]) == length) { 441 shift_table[i] = suffix - start; 442 } 443 if (i == suffix) { 444 suffix = suffix_table[suffix]; 445 } 446 } 447 } 448} 449 450//--------------------------------------------------------------------- 451// Boyer-Moore-Horspool string search. 452//--------------------------------------------------------------------- 453 454template <typename Char> 455size_t StringSearch<Char>::BoyerMooreHorspoolSearch( 456 Vector subject, 457 size_t start_index) { 458 const size_t subject_length = subject.length(); 459 const size_t pattern_length = pattern_.length(); 460 int* char_occurrences = bad_char_shift_table_; 461 int64_t badness = -static_cast<int64_t>(pattern_length); 462 463 // How bad we are doing without a good-suffix table. 464 Char last_char = pattern_[pattern_length - 1]; 465 int last_char_shift = 466 pattern_length - 1 - 467 CharOccurrence(char_occurrences, last_char); 468 469 // Perform search 470 size_t index = start_index; // No matches found prior to this index. 471 while (index <= subject_length - pattern_length) { 472 size_t j = pattern_length - 1; 473 int subject_char; 474 while (last_char != (subject_char = subject[index + j])) { 475 int bc_occ = CharOccurrence(char_occurrences, subject_char); 476 int shift = j - bc_occ; 477 index += shift; 478 badness += 1 - shift; // at most zero, so badness cannot increase. 479 if (index > subject_length - pattern_length) { 480 return subject_length; 481 } 482 } 483 j--; 484 while (pattern_[j] == (subject[index + j])) { 485 if (j == 0) { 486 return index; 487 } 488 j--; 489 } 490 index += last_char_shift; 491 // Badness increases by the number of characters we have 492 // checked, and decreases by the number of characters we 493 // can skip by shifting. It's a measure of how we are doing 494 // compared to reading each character exactly once. 495 badness += (pattern_length - j) - last_char_shift; 496 if (badness > 0) { 497 PopulateBoyerMooreTable(); 498 strategy_ = SearchStrategy::kBoyerMoore; 499 return BoyerMooreSearch(subject, index); 500 } 501 } 502 return subject.length(); 503} 504 505template <typename Char> 506void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { 507 const size_t pattern_length = pattern_.length(); 508 509 int* bad_char_occurrence = bad_char_shift_table_; 510 511 // Only preprocess at most kBMMaxShift last characters of pattern. 512 const size_t start = start_; 513 // Run forwards to populate bad_char_table, so that *last* instance 514 // of character equivalence class is the one registered. 515 // Notice: Doesn't include the last character. 516 const size_t table_size = AlphabetSize(); 517 if (start == 0) { 518 // All patterns less than kBMMaxShift in length. 519 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); 520 } else { 521 for (size_t i = 0; i < table_size; i++) { 522 bad_char_occurrence[i] = start - 1; 523 } 524 } 525 for (size_t i = start; i < pattern_length - 1; i++) { 526 Char c = pattern_[i]; 527 int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); 528 bad_char_occurrence[bucket] = i; 529 } 530} 531 532//--------------------------------------------------------------------- 533// Linear string search with bailout to BMH. 534//--------------------------------------------------------------------- 535 536// Simple linear search for short patterns, which bails out if the string 537// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 538template <typename Char> 539size_t StringSearch<Char>::InitialSearch( 540 Vector subject, 541 size_t index) { 542 const size_t pattern_length = pattern_.length(); 543 // Badness is a count of how much work we have done. When we have 544 // done enough work we decide it's probably worth switching to a better 545 // algorithm. 546 int64_t badness = -10 - (pattern_length << 2); 547 548 // We know our pattern is at least 2 characters, we cache the first so 549 // the common case of the first character not matching is faster. 550 for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { 551 badness++; 552 if (badness <= 0) { 553 i = FindFirstCharacter(pattern_, subject, i); 554 if (i == subject.length()) 555 return subject.length(); 556 CHECK_LE(i, n); 557 size_t j = 1; 558 do { 559 if (pattern_[j] != subject[i + j]) { 560 break; 561 } 562 j++; 563 } while (j < pattern_length); 564 if (j == pattern_length) { 565 return i; 566 } 567 badness += j; 568 } else { 569 PopulateBoyerMooreHorspoolTable(); 570 strategy_ = SearchStrategy::kBoyerMooreHorspool; 571 return BoyerMooreHorspoolSearch(subject, i); 572 } 573 } 574 return subject.length(); 575} 576 577// Perform a single stand-alone search. 578// If searching multiple times for the same pattern, a search 579// object should be constructed once and the Search function then called 580// for each search. 581template <typename Char> 582size_t SearchString(Vector<const Char> subject, 583 Vector<const Char> pattern, 584 size_t start_index) { 585 StringSearch<Char> search(pattern); 586 return search.Search(subject, start_index); 587} 588} // namespace stringsearch 589} // namespace node 590 591namespace node { 592 593template <typename Char> 594size_t SearchString(const Char* haystack, 595 size_t haystack_length, 596 const Char* needle, 597 size_t needle_length, 598 size_t start_index, 599 bool is_forward) { 600 if (haystack_length < needle_length) return haystack_length; 601 // To do a reverse search (lastIndexOf instead of indexOf) without redundant 602 // code, create two vectors that are reversed views into the input strings. 603 // For example, v_needle[0] would return the *last* character of the needle. 604 // So we're searching for the first instance of rev(needle) in rev(haystack) 605 stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); 606 stringsearch::Vector<const Char> v_haystack( 607 haystack, haystack_length, is_forward); 608 size_t diff = haystack_length - needle_length; 609 size_t relative_start_index; 610 if (is_forward) { 611 relative_start_index = start_index; 612 } else if (diff < start_index) { 613 relative_start_index = 0; 614 } else { 615 relative_start_index = diff - start_index; 616 } 617 size_t pos = node::stringsearch::SearchString( 618 v_haystack, v_needle, relative_start_index); 619 if (pos == haystack_length) { 620 // not found 621 return pos; 622 } 623 return is_forward ? pos : (haystack_length - needle_length - pos); 624} 625 626template <size_t N> 627size_t SearchString(const char* haystack, size_t haystack_length, 628 const char (&needle)[N]) { 629 return SearchString( 630 reinterpret_cast<const uint8_t*>(haystack), haystack_length, 631 reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true); 632} 633 634} // namespace node 635 636#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS 637 638#endif // SRC_STRING_SEARCH_H_ 639