1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_STRINGS_STRING_SEARCH_H_
6 #define V8_STRINGS_STRING_SEARCH_H_
7
8 #include "src/base/strings.h"
9 #include "src/base/vector.h"
10 #include "src/execution/isolate.h"
11
12 namespace v8 {
13 namespace internal {
14
15 //---------------------------------------------------------------------
16 // String Search object.
17 //---------------------------------------------------------------------
18
19 // Class holding constants and methods that apply to all string search variants,
20 // independently of subject and pattern char size.
21 class StringSearchBase {
22 protected:
23 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24 // limit, we can fix the size of tables. For a needle longer than this limit,
25 // search will not be optimal, since we only build tables for a suffix
26 // of the string, but it is a safe approximation.
27 static const int kBMMaxShift = Isolate::kBMMaxShift;
28
29 // Reduce alphabet to this size.
30 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31 // proportional to the input alphabet. We reduce the alphabet size by
32 // equating input characters modulo a smaller alphabet size. This gives
33 // a potentially less efficient searching, but is a safe approximation.
34 // For needles using only characters in the same Unicode 256-code point page,
35 // there is no search speed degradation.
36 static const int kLatin1AlphabetSize = 256;
37 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38
39 // Bad-char shift table stored in the state. It's length is the alphabet size.
40 // For patterns below this length, the skip length of Boyer-Moore is too short
41 // to compensate for the algorithmic overhead compared to simple brute force.
42 static const int kBMMinPatternLength = 7;
43
IsOneByteString(base::Vector<const uint8_t> string)44 static inline bool IsOneByteString(base::Vector<const uint8_t> string) {
45 return true;
46 }
47
IsOneByteString(base::Vector<const base::uc16> string)48 static inline bool IsOneByteString(base::Vector<const base::uc16> string) {
49 return String::IsOneByte(string.begin(), string.length());
50 }
51
52 friend class Isolate;
53 };
54
55 template <typename PatternChar, typename SubjectChar>
56 class StringSearch : private StringSearchBase {
57 public:
StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern)58 StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern)
59 : isolate_(isolate),
60 pattern_(pattern),
61 start_(std::max(0, pattern.length() - kBMMaxShift)) {
62 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
63 if (!IsOneByteString(pattern_)) {
64 strategy_ = &FailSearch;
65 return;
66 }
67 }
68 int pattern_length = pattern_.length();
69 if (pattern_length < kBMMinPatternLength) {
70 if (pattern_length == 1) {
71 strategy_ = &SingleCharSearch;
72 return;
73 }
74 strategy_ = &LinearSearch;
75 return;
76 }
77 strategy_ = &InitialSearch;
78 }
79
Search(base::Vector<const SubjectChar> subject, int index)80 int Search(base::Vector<const SubjectChar> subject, int index) {
81 return strategy_(this, subject, index);
82 }
83
AlphabetSize()84 static inline int AlphabetSize() {
85 if (sizeof(PatternChar) == 1) {
86 // Latin1 needle.
87 return kLatin1AlphabetSize;
88 } else {
89 DCHECK_EQ(sizeof(PatternChar), 2);
90 // UC16 needle.
91 return kUC16AlphabetSize;
92 }
93 }
94
95 private:
96 using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*,
97 base::Vector<const SubjectChar>, int);
98
FailSearch(StringSearch<PatternChar, SubjectChar>*, base::Vector<const SubjectChar>, int)99 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100 base::Vector<const SubjectChar>, int) {
101 return -1;
102 }
103
104 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
105 base::Vector<const SubjectChar> subject,
106 int start_index);
107
108 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
109 base::Vector<const SubjectChar> subject,
110 int start_index);
111
112 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
113 base::Vector<const SubjectChar> subject,
114 int start_index);
115
116 static int BoyerMooreHorspoolSearch(
117 StringSearch<PatternChar, SubjectChar>* search,
118 base::Vector<const SubjectChar> subject, int start_index);
119
120 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
121 base::Vector<const SubjectChar> subject,
122 int start_index);
123
124 void PopulateBoyerMooreHorspoolTable();
125
126 void PopulateBoyerMooreTable();
127
exceedsOneByte(uint8_t c)128 static inline bool exceedsOneByte(uint8_t c) { return false; }
129
exceedsOneByte(uint16_t c)130 static inline bool exceedsOneByte(uint16_t c) {
131 return c > String::kMaxOneByteCharCodeU;
132 }
133
CharOccurrence(int* bad_char_occurrence, SubjectChar char_code)134 static inline int CharOccurrence(int* bad_char_occurrence,
135 SubjectChar char_code) {
136 if (sizeof(SubjectChar) == 1) {
137 return bad_char_occurrence[static_cast<int>(char_code)];
138 }
139 if (sizeof(PatternChar) == 1) {
140 if (exceedsOneByte(char_code)) {
141 return -1;
142 }
143 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
144 }
145 // Both pattern and subject are UC16. Reduce character to equivalence
146 // class.
147 int equiv_class = char_code % kUC16AlphabetSize;
148 return bad_char_occurrence[equiv_class];
149 }
150
151 // The following tables are shared by all searches.
152 // TODO(lrn): Introduce a way for a pattern to keep its tables
153 // between searches (e.g., for an Atom RegExp).
154
155 // Store for the BoyerMoore(Horspool) bad char shift table.
156 // Return a table covering the last kBMMaxShift+1 positions of
157 // pattern.
bad_char_table()158 int* bad_char_table() { return isolate_->bad_char_shift_table(); }
159
160 // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()161 int* good_suffix_shift_table() {
162 // Return biased pointer that maps the range [start_..pattern_.length()
163 // to the kGoodSuffixShiftTable array.
164 return isolate_->good_suffix_shift_table() - start_;
165 }
166
167 // Table used temporarily while building the BoyerMoore good suffix
168 // shift table.
suffix_table()169 int* suffix_table() {
170 // Return biased pointer that maps the range [start_..pattern_.length()
171 // to the kSuffixTable array.
172 return isolate_->suffix_table() - start_;
173 }
174
175 Isolate* isolate_;
176 // The pattern to search for.
177 base::Vector<const PatternChar> pattern_;
178 // Pointer to implementation of the search.
179 SearchFunction strategy_;
180 // Cache value of max(0, pattern_length() - kBMMaxShift)
181 int start_;
182 };
183
184 template <typename T, typename U>
AlignDown(T value, U alignment)185 inline T AlignDown(T value, U alignment) {
186 return reinterpret_cast<T>(
187 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
188 }
189
GetHighestValueByte(base::uc16 character)190 inline uint8_t GetHighestValueByte(base::uc16 character) {
191 return std::max(static_cast<uint8_t>(character & 0xFF),
192 static_cast<uint8_t>(character >> 8));
193 }
194
GetHighestValueByte(uint8_t character)195 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
196
197 template <typename PatternChar, typename SubjectChar>
FindFirstCharacter(base::Vector<const PatternChar> pattern, base::Vector<const SubjectChar> subject, int index)198 inline int FindFirstCharacter(base::Vector<const PatternChar> pattern,
199 base::Vector<const SubjectChar> subject,
200 int index) {
201 const PatternChar pattern_first_char = pattern[0];
202 const int max_n = (subject.length() - pattern.length() + 1);
203
204 if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) {
205 // Special-case looking for the 0 char in other than one-byte strings.
206 // memchr mostly fails in this case due to every other byte being 0 in text
207 // that is mostly ascii characters.
208 for (int i = index; i < max_n; ++i) {
209 if (subject[i] == 0) return i;
210 }
211 return -1;
212 }
213 const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
214 const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
215 int pos = index;
216 do {
217 DCHECK_GE(max_n - pos, 0);
218 const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
219 memchr(subject.begin() + pos, search_byte,
220 (max_n - pos) * sizeof(SubjectChar)));
221 if (char_pos == nullptr) return -1;
222 char_pos = AlignDown(char_pos, sizeof(SubjectChar));
223 pos = static_cast<int>(char_pos - subject.begin());
224 if (subject[pos] == search_char) return pos;
225 } while (++pos < max_n);
226
227 return -1;
228 }
229
230 //---------------------------------------------------------------------
231 // Single Character Pattern Search Strategy
232 //---------------------------------------------------------------------
233
234 template <typename PatternChar, typename SubjectChar>
SingleCharSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int index)235 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
236 StringSearch<PatternChar, SubjectChar>* search,
237 base::Vector<const SubjectChar> subject, int index) {
238 DCHECK_EQ(1, search->pattern_.length());
239 PatternChar pattern_first_char = search->pattern_[0];
240 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
241 if (exceedsOneByte(pattern_first_char)) {
242 return -1;
243 }
244 }
245 return FindFirstCharacter(search->pattern_, subject, index);
246 }
247
248 //---------------------------------------------------------------------
249 // Linear Search Strategy
250 //---------------------------------------------------------------------
251
252 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar* pattern, const SubjectChar* subject, int length)253 inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
254 int length) {
255 DCHECK_GT(length, 0);
256 int pos = 0;
257 do {
258 if (pattern[pos] != subject[pos]) {
259 return false;
260 }
261 pos++;
262 } while (pos < length);
263 return true;
264 }
265
266 // Simple linear search for short patterns. Never bails out.
267 template <typename PatternChar, typename SubjectChar>
LinearSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int index)268 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
269 StringSearch<PatternChar, SubjectChar>* search,
270 base::Vector<const SubjectChar> subject, int index) {
271 base::Vector<const PatternChar> pattern = search->pattern_;
272 DCHECK_GT(pattern.length(), 1);
273 int pattern_length = pattern.length();
274 int i = index;
275 int n = subject.length() - pattern_length;
276 while (i <= n) {
277 i = FindFirstCharacter(pattern, subject, i);
278 if (i == -1) return -1;
279 DCHECK_LE(i, n);
280 i++;
281 // Loop extracted to separate function to allow using return to do
282 // a deeper break.
283 if (CharCompare(pattern.begin() + 1, subject.begin() + i,
284 pattern_length - 1)) {
285 return i - 1;
286 }
287 }
288 return -1;
289 }
290
291 //---------------------------------------------------------------------
292 // Boyer-Moore string search
293 //---------------------------------------------------------------------
294
295 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int start_index)296 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
297 StringSearch<PatternChar, SubjectChar>* search,
298 base::Vector<const SubjectChar> subject, int start_index) {
299 base::Vector<const PatternChar> pattern = search->pattern_;
300 int subject_length = subject.length();
301 int pattern_length = pattern.length();
302 // Only preprocess at most kBMMaxShift last characters of pattern.
303 int start = search->start_;
304
305 int* bad_char_occurence = search->bad_char_table();
306 int* good_suffix_shift = search->good_suffix_shift_table();
307
308 PatternChar last_char = pattern[pattern_length - 1];
309 int index = start_index;
310 // Continue search from i.
311 while (index <= subject_length - pattern_length) {
312 int j = pattern_length - 1;
313 int c;
314 while (last_char != (c = subject[index + j])) {
315 int shift = j - CharOccurrence(bad_char_occurence, c);
316 index += shift;
317 if (index > subject_length - pattern_length) {
318 return -1;
319 }
320 }
321 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
322 if (j < 0) {
323 return index;
324 } else if (j < start) {
325 // we have matched more than our tables allow us to be smart about.
326 // Fall back on BMH shift.
327 index += pattern_length - 1 -
328 CharOccurrence(bad_char_occurence,
329 static_cast<SubjectChar>(last_char));
330 } else {
331 int gs_shift = good_suffix_shift[j + 1];
332 int bc_occ = CharOccurrence(bad_char_occurence, c);
333 int shift = j - bc_occ;
334 if (gs_shift > shift) {
335 shift = gs_shift;
336 }
337 index += shift;
338 }
339 }
340
341 return -1;
342 }
343
344 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()345 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
346 int pattern_length = pattern_.length();
347 const PatternChar* pattern = pattern_.begin();
348 // Only look at the last kBMMaxShift characters of pattern (from start_
349 // to pattern_length).
350 int start = start_;
351 int length = pattern_length - start;
352
353 // Biased tables so that we can use pattern indices as table indices,
354 // even if we only cover the part of the pattern from offset start.
355 int* shift_table = good_suffix_shift_table();
356 int* suffix_table = this->suffix_table();
357
358 // Initialize table.
359 for (int i = start; i < pattern_length; i++) {
360 shift_table[i] = length;
361 }
362 shift_table[pattern_length] = 1;
363 suffix_table[pattern_length] = pattern_length + 1;
364
365 if (pattern_length <= start) {
366 return;
367 }
368
369 // Find suffixes.
370 PatternChar last_char = pattern[pattern_length - 1];
371 int suffix = pattern_length + 1;
372 {
373 int i = pattern_length;
374 while (i > start) {
375 PatternChar c = pattern[i - 1];
376 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
377 if (shift_table[suffix] == length) {
378 shift_table[suffix] = suffix - i;
379 }
380 suffix = suffix_table[suffix];
381 }
382 suffix_table[--i] = --suffix;
383 if (suffix == pattern_length) {
384 // No suffix to extend, so we check against last_char only.
385 while ((i > start) && (pattern[i - 1] != last_char)) {
386 if (shift_table[pattern_length] == length) {
387 shift_table[pattern_length] = pattern_length - i;
388 }
389 suffix_table[--i] = pattern_length;
390 }
391 if (i > start) {
392 suffix_table[--i] = --suffix;
393 }
394 }
395 }
396 }
397 // Build shift table using suffixes.
398 if (suffix < pattern_length) {
399 for (int i = start; i <= pattern_length; i++) {
400 if (shift_table[i] == length) {
401 shift_table[i] = suffix - start;
402 }
403 if (i == suffix) {
404 suffix = suffix_table[suffix];
405 }
406 }
407 }
408 }
409
410 //---------------------------------------------------------------------
411 // Boyer-Moore-Horspool string search.
412 //---------------------------------------------------------------------
413
414 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int start_index)415 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
416 StringSearch<PatternChar, SubjectChar>* search,
417 base::Vector<const SubjectChar> subject, int start_index) {
418 base::Vector<const PatternChar> pattern = search->pattern_;
419 int subject_length = subject.length();
420 int pattern_length = pattern.length();
421 int* char_occurrences = search->bad_char_table();
422 int badness = -pattern_length;
423
424 // How bad we are doing without a good-suffix table.
425 PatternChar last_char = pattern[pattern_length - 1];
426 int last_char_shift =
427 pattern_length - 1 -
428 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
429 // Perform search
430 int index = start_index; // No matches found prior to this index.
431 while (index <= subject_length - pattern_length) {
432 int j = pattern_length - 1;
433 int subject_char;
434 while (last_char != (subject_char = subject[index + j])) {
435 int bc_occ = CharOccurrence(char_occurrences, subject_char);
436 int shift = j - bc_occ;
437 index += shift;
438 badness += 1 - shift; // at most zero, so badness cannot increase.
439 if (index > subject_length - pattern_length) {
440 return -1;
441 }
442 }
443 j--;
444 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
445 if (j < 0) {
446 return index;
447 } else {
448 index += last_char_shift;
449 // Badness increases by the number of characters we have
450 // checked, and decreases by the number of characters we
451 // can skip by shifting. It's a measure of how we are doing
452 // compared to reading each character exactly once.
453 badness += (pattern_length - j) - last_char_shift;
454 if (badness > 0) {
455 search->PopulateBoyerMooreTable();
456 search->strategy_ = &BoyerMooreSearch;
457 return BoyerMooreSearch(search, subject, index);
458 }
459 }
460 }
461 return -1;
462 }
463
464 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()465 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
466 int pattern_length = pattern_.length();
467
468 int* bad_char_occurrence = bad_char_table();
469
470 // Only preprocess at most kBMMaxShift last characters of pattern.
471 int start = start_;
472 // Run forwards to populate bad_char_table, so that *last* instance
473 // of character equivalence class is the one registered.
474 // Notice: Doesn't include the last character.
475 int table_size = AlphabetSize();
476 if (start == 0) { // All patterns less than kBMMaxShift in length.
477 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
478 } else {
479 for (int i = 0; i < table_size; i++) {
480 bad_char_occurrence[i] = start - 1;
481 }
482 }
483 for (int i = start; i < pattern_length - 1; i++) {
484 PatternChar c = pattern_[i];
485 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
486 bad_char_occurrence[bucket] = i;
487 }
488 }
489
490 //---------------------------------------------------------------------
491 // Linear string search with bailout to BMH.
492 //---------------------------------------------------------------------
493
494 // Simple linear search for short patterns, which bails out if the string
495 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
496 template <typename PatternChar, typename SubjectChar>
InitialSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int index)497 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
498 StringSearch<PatternChar, SubjectChar>* search,
499 base::Vector<const SubjectChar> subject, int index) {
500 base::Vector<const PatternChar> pattern = search->pattern_;
501 int pattern_length = pattern.length();
502 // Badness is a count of how much work we have done. When we have
503 // done enough work we decide it's probably worth switching to a better
504 // algorithm.
505 int badness = -10 - (pattern_length << 2);
506
507 // We know our pattern is at least 2 characters, we cache the first so
508 // the common case of the first character not matching is faster.
509 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
510 badness++;
511 if (badness <= 0) {
512 i = FindFirstCharacter(pattern, subject, i);
513 if (i == -1) return -1;
514 DCHECK_LE(i, n);
515 int j = 1;
516 do {
517 if (pattern[j] != subject[i + j]) {
518 break;
519 }
520 j++;
521 } while (j < pattern_length);
522 if (j == pattern_length) {
523 return i;
524 }
525 badness += j;
526 } else {
527 search->PopulateBoyerMooreHorspoolTable();
528 search->strategy_ = &BoyerMooreHorspoolSearch;
529 return BoyerMooreHorspoolSearch(search, subject, i);
530 }
531 }
532 return -1;
533 }
534
535 // Perform a a single stand-alone search.
536 // If searching multiple times for the same pattern, a search
537 // object should be constructed once and the Search function then called
538 // for each search.
539 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject, base::Vector<const PatternChar> pattern, int start_index)540 int SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject,
541 base::Vector<const PatternChar> pattern, int start_index) {
542 StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
543 return search.Search(subject, start_index);
544 }
545
546 // A wrapper function around SearchString that wraps raw pointers to the subject
547 // and pattern as vectors before calling SearchString. Used from the
548 // StringIndexOf builtin.
549 template <typename SubjectChar, typename PatternChar>
SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr, int subject_length, const PatternChar* pattern_ptr, int pattern_length, int start_index)550 intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
551 int subject_length, const PatternChar* pattern_ptr,
552 int pattern_length, int start_index) {
553 DisallowGarbageCollection no_gc;
554 base::Vector<const SubjectChar> subject(subject_ptr, subject_length);
555 base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
556 return SearchString(isolate, subject, pattern, start_index);
557 }
558
559 } // namespace internal
560 } // namespace v8
561
562 #endif // V8_STRINGS_STRING_SEARCH_H_
563