1// Copyright 2011 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef V8_STRINGS_STRING_SEARCH_H_ 6#define V8_STRINGS_STRING_SEARCH_H_ 7 8#include "src/base/strings.h" 9#include "src/base/vector.h" 10#include "src/execution/isolate.h" 11 12namespace v8 { 13namespace internal { 14 15//--------------------------------------------------------------------- 16// String Search object. 17//--------------------------------------------------------------------- 18 19// Class holding constants and methods that apply to all string search variants, 20// independently of subject and pattern char size. 21class StringSearchBase { 22 protected: 23 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 24 // limit, we can fix the size of tables. For a needle longer than this limit, 25 // search will not be optimal, since we only build tables for a suffix 26 // of the string, but it is a safe approximation. 27 static const int kBMMaxShift = Isolate::kBMMaxShift; 28 29 // Reduce alphabet to this size. 30 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 31 // proportional to the input alphabet. We reduce the alphabet size by 32 // equating input characters modulo a smaller alphabet size. This gives 33 // a potentially less efficient searching, but is a safe approximation. 34 // For needles using only characters in the same Unicode 256-code point page, 35 // there is no search speed degradation. 36 static const int kLatin1AlphabetSize = 256; 37 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 38 39 // Bad-char shift table stored in the state. It's length is the alphabet size. 40 // For patterns below this length, the skip length of Boyer-Moore is too short 41 // to compensate for the algorithmic overhead compared to simple brute force. 42 static const int kBMMinPatternLength = 7; 43 44 static inline bool IsOneByteString(base::Vector<const uint8_t> string) { 45 return true; 46 } 47 48 static inline bool IsOneByteString(base::Vector<const base::uc16> string) { 49 return String::IsOneByte(string.begin(), string.length()); 50 } 51 52 friend class Isolate; 53}; 54 55template <typename PatternChar, typename SubjectChar> 56class StringSearch : private StringSearchBase { 57 public: 58 StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern) 59 : isolate_(isolate), 60 pattern_(pattern), 61 start_(std::max(0, pattern.length() - kBMMaxShift)) { 62 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 63 if (!IsOneByteString(pattern_)) { 64 strategy_ = &FailSearch; 65 return; 66 } 67 } 68 int pattern_length = pattern_.length(); 69 if (pattern_length < kBMMinPatternLength) { 70 if (pattern_length == 1) { 71 strategy_ = &SingleCharSearch; 72 return; 73 } 74 strategy_ = &LinearSearch; 75 return; 76 } 77 strategy_ = &InitialSearch; 78 } 79 80 int Search(base::Vector<const SubjectChar> subject, int index) { 81 return strategy_(this, subject, index); 82 } 83 84 static inline int AlphabetSize() { 85 if (sizeof(PatternChar) == 1) { 86 // Latin1 needle. 87 return kLatin1AlphabetSize; 88 } else { 89 DCHECK_EQ(sizeof(PatternChar), 2); 90 // UC16 needle. 91 return kUC16AlphabetSize; 92 } 93 } 94 95 private: 96 using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*, 97 base::Vector<const SubjectChar>, int); 98 99 static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 100 base::Vector<const SubjectChar>, int) { 101 return -1; 102 } 103 104 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 105 base::Vector<const SubjectChar> subject, 106 int start_index); 107 108 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 109 base::Vector<const SubjectChar> subject, 110 int start_index); 111 112 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 113 base::Vector<const SubjectChar> subject, 114 int start_index); 115 116 static int BoyerMooreHorspoolSearch( 117 StringSearch<PatternChar, SubjectChar>* search, 118 base::Vector<const SubjectChar> subject, int start_index); 119 120 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 121 base::Vector<const SubjectChar> subject, 122 int start_index); 123 124 void PopulateBoyerMooreHorspoolTable(); 125 126 void PopulateBoyerMooreTable(); 127 128 static inline bool exceedsOneByte(uint8_t c) { return false; } 129 130 static inline bool exceedsOneByte(uint16_t c) { 131 return c > String::kMaxOneByteCharCodeU; 132 } 133 134 static inline int CharOccurrence(int* bad_char_occurrence, 135 SubjectChar char_code) { 136 if (sizeof(SubjectChar) == 1) { 137 return bad_char_occurrence[static_cast<int>(char_code)]; 138 } 139 if (sizeof(PatternChar) == 1) { 140 if (exceedsOneByte(char_code)) { 141 return -1; 142 } 143 return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 144 } 145 // Both pattern and subject are UC16. Reduce character to equivalence 146 // class. 147 int equiv_class = char_code % kUC16AlphabetSize; 148 return bad_char_occurrence[equiv_class]; 149 } 150 151 // The following tables are shared by all searches. 152 // TODO(lrn): Introduce a way for a pattern to keep its tables 153 // between searches (e.g., for an Atom RegExp). 154 155 // Store for the BoyerMoore(Horspool) bad char shift table. 156 // Return a table covering the last kBMMaxShift+1 positions of 157 // pattern. 158 int* bad_char_table() { return isolate_->bad_char_shift_table(); } 159 160 // Store for the BoyerMoore good suffix shift table. 161 int* good_suffix_shift_table() { 162 // Return biased pointer that maps the range [start_..pattern_.length() 163 // to the kGoodSuffixShiftTable array. 164 return isolate_->good_suffix_shift_table() - start_; 165 } 166 167 // Table used temporarily while building the BoyerMoore good suffix 168 // shift table. 169 int* suffix_table() { 170 // Return biased pointer that maps the range [start_..pattern_.length() 171 // to the kSuffixTable array. 172 return isolate_->suffix_table() - start_; 173 } 174 175 Isolate* isolate_; 176 // The pattern to search for. 177 base::Vector<const PatternChar> pattern_; 178 // Pointer to implementation of the search. 179 SearchFunction strategy_; 180 // Cache value of max(0, pattern_length() - kBMMaxShift) 181 int start_; 182}; 183 184template <typename T, typename U> 185inline T AlignDown(T value, U alignment) { 186 return reinterpret_cast<T>( 187 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); 188} 189 190inline uint8_t GetHighestValueByte(base::uc16 character) { 191 return std::max(static_cast<uint8_t>(character & 0xFF), 192 static_cast<uint8_t>(character >> 8)); 193} 194 195inline uint8_t GetHighestValueByte(uint8_t character) { return character; } 196 197template <typename PatternChar, typename SubjectChar> 198inline int FindFirstCharacter(base::Vector<const PatternChar> pattern, 199 base::Vector<const SubjectChar> subject, 200 int index) { 201 const PatternChar pattern_first_char = pattern[0]; 202 const int max_n = (subject.length() - pattern.length() + 1); 203 204 if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) { 205 // Special-case looking for the 0 char in other than one-byte strings. 206 // memchr mostly fails in this case due to every other byte being 0 in text 207 // that is mostly ascii characters. 208 for (int i = index; i < max_n; ++i) { 209 if (subject[i] == 0) return i; 210 } 211 return -1; 212 } 213 const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 214 const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 215 int pos = index; 216 do { 217 DCHECK_GE(max_n - pos, 0); 218 const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( 219 memchr(subject.begin() + pos, search_byte, 220 (max_n - pos) * sizeof(SubjectChar))); 221 if (char_pos == nullptr) return -1; 222 char_pos = AlignDown(char_pos, sizeof(SubjectChar)); 223 pos = static_cast<int>(char_pos - subject.begin()); 224 if (subject[pos] == search_char) return pos; 225 } while (++pos < max_n); 226 227 return -1; 228} 229 230//--------------------------------------------------------------------- 231// Single Character Pattern Search Strategy 232//--------------------------------------------------------------------- 233 234template <typename PatternChar, typename SubjectChar> 235int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 236 StringSearch<PatternChar, SubjectChar>* search, 237 base::Vector<const SubjectChar> subject, int index) { 238 DCHECK_EQ(1, search->pattern_.length()); 239 PatternChar pattern_first_char = search->pattern_[0]; 240 if (sizeof(PatternChar) > sizeof(SubjectChar)) { 241 if (exceedsOneByte(pattern_first_char)) { 242 return -1; 243 } 244 } 245 return FindFirstCharacter(search->pattern_, subject, index); 246} 247 248//--------------------------------------------------------------------- 249// Linear Search Strategy 250//--------------------------------------------------------------------- 251 252template <typename PatternChar, typename SubjectChar> 253inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject, 254 int length) { 255 DCHECK_GT(length, 0); 256 int pos = 0; 257 do { 258 if (pattern[pos] != subject[pos]) { 259 return false; 260 } 261 pos++; 262 } while (pos < length); 263 return true; 264} 265 266// Simple linear search for short patterns. Never bails out. 267template <typename PatternChar, typename SubjectChar> 268int StringSearch<PatternChar, SubjectChar>::LinearSearch( 269 StringSearch<PatternChar, SubjectChar>* search, 270 base::Vector<const SubjectChar> subject, int index) { 271 base::Vector<const PatternChar> pattern = search->pattern_; 272 DCHECK_GT(pattern.length(), 1); 273 int pattern_length = pattern.length(); 274 int i = index; 275 int n = subject.length() - pattern_length; 276 while (i <= n) { 277 i = FindFirstCharacter(pattern, subject, i); 278 if (i == -1) return -1; 279 DCHECK_LE(i, n); 280 i++; 281 // Loop extracted to separate function to allow using return to do 282 // a deeper break. 283 if (CharCompare(pattern.begin() + 1, subject.begin() + i, 284 pattern_length - 1)) { 285 return i - 1; 286 } 287 } 288 return -1; 289} 290 291//--------------------------------------------------------------------- 292// Boyer-Moore string search 293//--------------------------------------------------------------------- 294 295template <typename PatternChar, typename SubjectChar> 296int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 297 StringSearch<PatternChar, SubjectChar>* search, 298 base::Vector<const SubjectChar> subject, int start_index) { 299 base::Vector<const PatternChar> pattern = search->pattern_; 300 int subject_length = subject.length(); 301 int pattern_length = pattern.length(); 302 // Only preprocess at most kBMMaxShift last characters of pattern. 303 int start = search->start_; 304 305 int* bad_char_occurence = search->bad_char_table(); 306 int* good_suffix_shift = search->good_suffix_shift_table(); 307 308 PatternChar last_char = pattern[pattern_length - 1]; 309 int index = start_index; 310 // Continue search from i. 311 while (index <= subject_length - pattern_length) { 312 int j = pattern_length - 1; 313 int c; 314 while (last_char != (c = subject[index + j])) { 315 int shift = j - CharOccurrence(bad_char_occurence, c); 316 index += shift; 317 if (index > subject_length - pattern_length) { 318 return -1; 319 } 320 } 321 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 322 if (j < 0) { 323 return index; 324 } else if (j < start) { 325 // we have matched more than our tables allow us to be smart about. 326 // Fall back on BMH shift. 327 index += pattern_length - 1 - 328 CharOccurrence(bad_char_occurence, 329 static_cast<SubjectChar>(last_char)); 330 } else { 331 int gs_shift = good_suffix_shift[j + 1]; 332 int bc_occ = CharOccurrence(bad_char_occurence, c); 333 int shift = j - bc_occ; 334 if (gs_shift > shift) { 335 shift = gs_shift; 336 } 337 index += shift; 338 } 339 } 340 341 return -1; 342} 343 344template <typename PatternChar, typename SubjectChar> 345void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 346 int pattern_length = pattern_.length(); 347 const PatternChar* pattern = pattern_.begin(); 348 // Only look at the last kBMMaxShift characters of pattern (from start_ 349 // to pattern_length). 350 int start = start_; 351 int length = pattern_length - start; 352 353 // Biased tables so that we can use pattern indices as table indices, 354 // even if we only cover the part of the pattern from offset start. 355 int* shift_table = good_suffix_shift_table(); 356 int* suffix_table = this->suffix_table(); 357 358 // Initialize table. 359 for (int i = start; i < pattern_length; i++) { 360 shift_table[i] = length; 361 } 362 shift_table[pattern_length] = 1; 363 suffix_table[pattern_length] = pattern_length + 1; 364 365 if (pattern_length <= start) { 366 return; 367 } 368 369 // Find suffixes. 370 PatternChar last_char = pattern[pattern_length - 1]; 371 int suffix = pattern_length + 1; 372 { 373 int i = pattern_length; 374 while (i > start) { 375 PatternChar c = pattern[i - 1]; 376 while (suffix <= pattern_length && c != pattern[suffix - 1]) { 377 if (shift_table[suffix] == length) { 378 shift_table[suffix] = suffix - i; 379 } 380 suffix = suffix_table[suffix]; 381 } 382 suffix_table[--i] = --suffix; 383 if (suffix == pattern_length) { 384 // No suffix to extend, so we check against last_char only. 385 while ((i > start) && (pattern[i - 1] != last_char)) { 386 if (shift_table[pattern_length] == length) { 387 shift_table[pattern_length] = pattern_length - i; 388 } 389 suffix_table[--i] = pattern_length; 390 } 391 if (i > start) { 392 suffix_table[--i] = --suffix; 393 } 394 } 395 } 396 } 397 // Build shift table using suffixes. 398 if (suffix < pattern_length) { 399 for (int i = start; i <= pattern_length; i++) { 400 if (shift_table[i] == length) { 401 shift_table[i] = suffix - start; 402 } 403 if (i == suffix) { 404 suffix = suffix_table[suffix]; 405 } 406 } 407 } 408} 409 410//--------------------------------------------------------------------- 411// Boyer-Moore-Horspool string search. 412//--------------------------------------------------------------------- 413 414template <typename PatternChar, typename SubjectChar> 415int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 416 StringSearch<PatternChar, SubjectChar>* search, 417 base::Vector<const SubjectChar> subject, int start_index) { 418 base::Vector<const PatternChar> pattern = search->pattern_; 419 int subject_length = subject.length(); 420 int pattern_length = pattern.length(); 421 int* char_occurrences = search->bad_char_table(); 422 int badness = -pattern_length; 423 424 // How bad we are doing without a good-suffix table. 425 PatternChar last_char = pattern[pattern_length - 1]; 426 int last_char_shift = 427 pattern_length - 1 - 428 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 429 // Perform search 430 int index = start_index; // No matches found prior to this index. 431 while (index <= subject_length - pattern_length) { 432 int j = pattern_length - 1; 433 int subject_char; 434 while (last_char != (subject_char = subject[index + j])) { 435 int bc_occ = CharOccurrence(char_occurrences, subject_char); 436 int shift = j - bc_occ; 437 index += shift; 438 badness += 1 - shift; // at most zero, so badness cannot increase. 439 if (index > subject_length - pattern_length) { 440 return -1; 441 } 442 } 443 j--; 444 while (j >= 0 && pattern[j] == (subject[index + j])) j--; 445 if (j < 0) { 446 return index; 447 } else { 448 index += last_char_shift; 449 // Badness increases by the number of characters we have 450 // checked, and decreases by the number of characters we 451 // can skip by shifting. It's a measure of how we are doing 452 // compared to reading each character exactly once. 453 badness += (pattern_length - j) - last_char_shift; 454 if (badness > 0) { 455 search->PopulateBoyerMooreTable(); 456 search->strategy_ = &BoyerMooreSearch; 457 return BoyerMooreSearch(search, subject, index); 458 } 459 } 460 } 461 return -1; 462} 463 464template <typename PatternChar, typename SubjectChar> 465void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 466 int pattern_length = pattern_.length(); 467 468 int* bad_char_occurrence = bad_char_table(); 469 470 // Only preprocess at most kBMMaxShift last characters of pattern. 471 int start = start_; 472 // Run forwards to populate bad_char_table, so that *last* instance 473 // of character equivalence class is the one registered. 474 // Notice: Doesn't include the last character. 475 int table_size = AlphabetSize(); 476 if (start == 0) { // All patterns less than kBMMaxShift in length. 477 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); 478 } else { 479 for (int i = 0; i < table_size; i++) { 480 bad_char_occurrence[i] = start - 1; 481 } 482 } 483 for (int i = start; i < pattern_length - 1; i++) { 484 PatternChar c = pattern_[i]; 485 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 486 bad_char_occurrence[bucket] = i; 487 } 488} 489 490//--------------------------------------------------------------------- 491// Linear string search with bailout to BMH. 492//--------------------------------------------------------------------- 493 494// Simple linear search for short patterns, which bails out if the string 495// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 496template <typename PatternChar, typename SubjectChar> 497int StringSearch<PatternChar, SubjectChar>::InitialSearch( 498 StringSearch<PatternChar, SubjectChar>* search, 499 base::Vector<const SubjectChar> subject, int index) { 500 base::Vector<const PatternChar> pattern = search->pattern_; 501 int pattern_length = pattern.length(); 502 // Badness is a count of how much work we have done. When we have 503 // done enough work we decide it's probably worth switching to a better 504 // algorithm. 505 int badness = -10 - (pattern_length << 2); 506 507 // We know our pattern is at least 2 characters, we cache the first so 508 // the common case of the first character not matching is faster. 509 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 510 badness++; 511 if (badness <= 0) { 512 i = FindFirstCharacter(pattern, subject, i); 513 if (i == -1) return -1; 514 DCHECK_LE(i, n); 515 int j = 1; 516 do { 517 if (pattern[j] != subject[i + j]) { 518 break; 519 } 520 j++; 521 } while (j < pattern_length); 522 if (j == pattern_length) { 523 return i; 524 } 525 badness += j; 526 } else { 527 search->PopulateBoyerMooreHorspoolTable(); 528 search->strategy_ = &BoyerMooreHorspoolSearch; 529 return BoyerMooreHorspoolSearch(search, subject, i); 530 } 531 } 532 return -1; 533} 534 535// Perform a a single stand-alone search. 536// If searching multiple times for the same pattern, a search 537// object should be constructed once and the Search function then called 538// for each search. 539template <typename SubjectChar, typename PatternChar> 540int SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject, 541 base::Vector<const PatternChar> pattern, int start_index) { 542 StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 543 return search.Search(subject, start_index); 544} 545 546// A wrapper function around SearchString that wraps raw pointers to the subject 547// and pattern as vectors before calling SearchString. Used from the 548// StringIndexOf builtin. 549template <typename SubjectChar, typename PatternChar> 550intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr, 551 int subject_length, const PatternChar* pattern_ptr, 552 int pattern_length, int start_index) { 553 DisallowGarbageCollection no_gc; 554 base::Vector<const SubjectChar> subject(subject_ptr, subject_length); 555 base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length); 556 return SearchString(isolate, subject, pattern, start_index); 557} 558 559} // namespace internal 560} // namespace v8 561 562#endif // V8_STRINGS_STRING_SEARCH_H_ 563