1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_STRINGS_STRING_SEARCH_H_
6#define V8_STRINGS_STRING_SEARCH_H_
7
8#include "src/base/strings.h"
9#include "src/base/vector.h"
10#include "src/execution/isolate.h"
11
12namespace v8 {
13namespace internal {
14
15//---------------------------------------------------------------------
16// String Search object.
17//---------------------------------------------------------------------
18
19// Class holding constants and methods that apply to all string search variants,
20// independently of subject and pattern char size.
21class StringSearchBase {
22 protected:
23  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24  // limit, we can fix the size of tables. For a needle longer than this limit,
25  // search will not be optimal, since we only build tables for a suffix
26  // of the string, but it is a safe approximation.
27  static const int kBMMaxShift = Isolate::kBMMaxShift;
28
29  // Reduce alphabet to this size.
30  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31  // proportional to the input alphabet. We reduce the alphabet size by
32  // equating input characters modulo a smaller alphabet size. This gives
33  // a potentially less efficient searching, but is a safe approximation.
34  // For needles using only characters in the same Unicode 256-code point page,
35  // there is no search speed degradation.
36  static const int kLatin1AlphabetSize = 256;
37  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38
39  // Bad-char shift table stored in the state. It's length is the alphabet size.
40  // For patterns below this length, the skip length of Boyer-Moore is too short
41  // to compensate for the algorithmic overhead compared to simple brute force.
42  static const int kBMMinPatternLength = 7;
43
44  static inline bool IsOneByteString(base::Vector<const uint8_t> string) {
45    return true;
46  }
47
48  static inline bool IsOneByteString(base::Vector<const base::uc16> string) {
49    return String::IsOneByte(string.begin(), string.length());
50  }
51
52  friend class Isolate;
53};
54
55template <typename PatternChar, typename SubjectChar>
56class StringSearch : private StringSearchBase {
57 public:
58  StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern)
59      : isolate_(isolate),
60        pattern_(pattern),
61        start_(std::max(0, pattern.length() - kBMMaxShift)) {
62    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
63      if (!IsOneByteString(pattern_)) {
64        strategy_ = &FailSearch;
65        return;
66      }
67    }
68    int pattern_length = pattern_.length();
69    if (pattern_length < kBMMinPatternLength) {
70      if (pattern_length == 1) {
71        strategy_ = &SingleCharSearch;
72        return;
73      }
74      strategy_ = &LinearSearch;
75      return;
76    }
77    strategy_ = &InitialSearch;
78  }
79
80  int Search(base::Vector<const SubjectChar> subject, int index) {
81    return strategy_(this, subject, index);
82  }
83
84  static inline int AlphabetSize() {
85    if (sizeof(PatternChar) == 1) {
86      // Latin1 needle.
87      return kLatin1AlphabetSize;
88    } else {
89      DCHECK_EQ(sizeof(PatternChar), 2);
90      // UC16 needle.
91      return kUC16AlphabetSize;
92    }
93  }
94
95 private:
96  using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*,
97                                 base::Vector<const SubjectChar>, int);
98
99  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100                        base::Vector<const SubjectChar>, int) {
101    return -1;
102  }
103
104  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
105                              base::Vector<const SubjectChar> subject,
106                              int start_index);
107
108  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
109                          base::Vector<const SubjectChar> subject,
110                          int start_index);
111
112  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
113                           base::Vector<const SubjectChar> subject,
114                           int start_index);
115
116  static int BoyerMooreHorspoolSearch(
117      StringSearch<PatternChar, SubjectChar>* search,
118      base::Vector<const SubjectChar> subject, int start_index);
119
120  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
121                              base::Vector<const SubjectChar> subject,
122                              int start_index);
123
124  void PopulateBoyerMooreHorspoolTable();
125
126  void PopulateBoyerMooreTable();
127
128  static inline bool exceedsOneByte(uint8_t c) { return false; }
129
130  static inline bool exceedsOneByte(uint16_t c) {
131    return c > String::kMaxOneByteCharCodeU;
132  }
133
134  static inline int CharOccurrence(int* bad_char_occurrence,
135                                   SubjectChar char_code) {
136    if (sizeof(SubjectChar) == 1) {
137      return bad_char_occurrence[static_cast<int>(char_code)];
138    }
139    if (sizeof(PatternChar) == 1) {
140      if (exceedsOneByte(char_code)) {
141        return -1;
142      }
143      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
144    }
145    // Both pattern and subject are UC16. Reduce character to equivalence
146    // class.
147    int equiv_class = char_code % kUC16AlphabetSize;
148    return bad_char_occurrence[equiv_class];
149  }
150
151  // The following tables are shared by all searches.
152  // TODO(lrn): Introduce a way for a pattern to keep its tables
153  // between searches (e.g., for an Atom RegExp).
154
155  // Store for the BoyerMoore(Horspool) bad char shift table.
156  // Return a table covering the last kBMMaxShift+1 positions of
157  // pattern.
158  int* bad_char_table() { return isolate_->bad_char_shift_table(); }
159
160  // Store for the BoyerMoore good suffix shift table.
161  int* good_suffix_shift_table() {
162    // Return biased pointer that maps the range  [start_..pattern_.length()
163    // to the kGoodSuffixShiftTable array.
164    return isolate_->good_suffix_shift_table() - start_;
165  }
166
167  // Table used temporarily while building the BoyerMoore good suffix
168  // shift table.
169  int* suffix_table() {
170    // Return biased pointer that maps the range  [start_..pattern_.length()
171    // to the kSuffixTable array.
172    return isolate_->suffix_table() - start_;
173  }
174
175  Isolate* isolate_;
176  // The pattern to search for.
177  base::Vector<const PatternChar> pattern_;
178  // Pointer to implementation of the search.
179  SearchFunction strategy_;
180  // Cache value of max(0, pattern_length() - kBMMaxShift)
181  int start_;
182};
183
184template <typename T, typename U>
185inline T AlignDown(T value, U alignment) {
186  return reinterpret_cast<T>(
187      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
188}
189
190inline uint8_t GetHighestValueByte(base::uc16 character) {
191  return std::max(static_cast<uint8_t>(character & 0xFF),
192                  static_cast<uint8_t>(character >> 8));
193}
194
195inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
196
197template <typename PatternChar, typename SubjectChar>
198inline int FindFirstCharacter(base::Vector<const PatternChar> pattern,
199                              base::Vector<const SubjectChar> subject,
200                              int index) {
201  const PatternChar pattern_first_char = pattern[0];
202  const int max_n = (subject.length() - pattern.length() + 1);
203
204  if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) {
205    // Special-case looking for the 0 char in other than one-byte strings.
206    // memchr mostly fails in this case due to every other byte being 0 in text
207    // that is mostly ascii characters.
208    for (int i = index; i < max_n; ++i) {
209      if (subject[i] == 0) return i;
210    }
211    return -1;
212  }
213  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
214  const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
215  int pos = index;
216  do {
217    DCHECK_GE(max_n - pos, 0);
218    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
219        memchr(subject.begin() + pos, search_byte,
220               (max_n - pos) * sizeof(SubjectChar)));
221    if (char_pos == nullptr) return -1;
222    char_pos = AlignDown(char_pos, sizeof(SubjectChar));
223    pos = static_cast<int>(char_pos - subject.begin());
224    if (subject[pos] == search_char) return pos;
225  } while (++pos < max_n);
226
227  return -1;
228}
229
230//---------------------------------------------------------------------
231// Single Character Pattern Search Strategy
232//---------------------------------------------------------------------
233
234template <typename PatternChar, typename SubjectChar>
235int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
236    StringSearch<PatternChar, SubjectChar>* search,
237    base::Vector<const SubjectChar> subject, int index) {
238  DCHECK_EQ(1, search->pattern_.length());
239  PatternChar pattern_first_char = search->pattern_[0];
240  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
241    if (exceedsOneByte(pattern_first_char)) {
242      return -1;
243    }
244  }
245  return FindFirstCharacter(search->pattern_, subject, index);
246}
247
248//---------------------------------------------------------------------
249// Linear Search Strategy
250//---------------------------------------------------------------------
251
252template <typename PatternChar, typename SubjectChar>
253inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
254                        int length) {
255  DCHECK_GT(length, 0);
256  int pos = 0;
257  do {
258    if (pattern[pos] != subject[pos]) {
259      return false;
260    }
261    pos++;
262  } while (pos < length);
263  return true;
264}
265
266// Simple linear search for short patterns. Never bails out.
267template <typename PatternChar, typename SubjectChar>
268int StringSearch<PatternChar, SubjectChar>::LinearSearch(
269    StringSearch<PatternChar, SubjectChar>* search,
270    base::Vector<const SubjectChar> subject, int index) {
271  base::Vector<const PatternChar> pattern = search->pattern_;
272  DCHECK_GT(pattern.length(), 1);
273  int pattern_length = pattern.length();
274  int i = index;
275  int n = subject.length() - pattern_length;
276  while (i <= n) {
277    i = FindFirstCharacter(pattern, subject, i);
278    if (i == -1) return -1;
279    DCHECK_LE(i, n);
280    i++;
281    // Loop extracted to separate function to allow using return to do
282    // a deeper break.
283    if (CharCompare(pattern.begin() + 1, subject.begin() + i,
284                    pattern_length - 1)) {
285      return i - 1;
286    }
287  }
288  return -1;
289}
290
291//---------------------------------------------------------------------
292// Boyer-Moore string search
293//---------------------------------------------------------------------
294
295template <typename PatternChar, typename SubjectChar>
296int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
297    StringSearch<PatternChar, SubjectChar>* search,
298    base::Vector<const SubjectChar> subject, int start_index) {
299  base::Vector<const PatternChar> pattern = search->pattern_;
300  int subject_length = subject.length();
301  int pattern_length = pattern.length();
302  // Only preprocess at most kBMMaxShift last characters of pattern.
303  int start = search->start_;
304
305  int* bad_char_occurence = search->bad_char_table();
306  int* good_suffix_shift = search->good_suffix_shift_table();
307
308  PatternChar last_char = pattern[pattern_length - 1];
309  int index = start_index;
310  // Continue search from i.
311  while (index <= subject_length - pattern_length) {
312    int j = pattern_length - 1;
313    int c;
314    while (last_char != (c = subject[index + j])) {
315      int shift = j - CharOccurrence(bad_char_occurence, c);
316      index += shift;
317      if (index > subject_length - pattern_length) {
318        return -1;
319      }
320    }
321    while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
322    if (j < 0) {
323      return index;
324    } else if (j < start) {
325      // we have matched more than our tables allow us to be smart about.
326      // Fall back on BMH shift.
327      index += pattern_length - 1 -
328               CharOccurrence(bad_char_occurence,
329                              static_cast<SubjectChar>(last_char));
330    } else {
331      int gs_shift = good_suffix_shift[j + 1];
332      int bc_occ = CharOccurrence(bad_char_occurence, c);
333      int shift = j - bc_occ;
334      if (gs_shift > shift) {
335        shift = gs_shift;
336      }
337      index += shift;
338    }
339  }
340
341  return -1;
342}
343
344template <typename PatternChar, typename SubjectChar>
345void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
346  int pattern_length = pattern_.length();
347  const PatternChar* pattern = pattern_.begin();
348  // Only look at the last kBMMaxShift characters of pattern (from start_
349  // to pattern_length).
350  int start = start_;
351  int length = pattern_length - start;
352
353  // Biased tables so that we can use pattern indices as table indices,
354  // even if we only cover the part of the pattern from offset start.
355  int* shift_table = good_suffix_shift_table();
356  int* suffix_table = this->suffix_table();
357
358  // Initialize table.
359  for (int i = start; i < pattern_length; i++) {
360    shift_table[i] = length;
361  }
362  shift_table[pattern_length] = 1;
363  suffix_table[pattern_length] = pattern_length + 1;
364
365  if (pattern_length <= start) {
366    return;
367  }
368
369  // Find suffixes.
370  PatternChar last_char = pattern[pattern_length - 1];
371  int suffix = pattern_length + 1;
372  {
373    int i = pattern_length;
374    while (i > start) {
375      PatternChar c = pattern[i - 1];
376      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
377        if (shift_table[suffix] == length) {
378          shift_table[suffix] = suffix - i;
379        }
380        suffix = suffix_table[suffix];
381      }
382      suffix_table[--i] = --suffix;
383      if (suffix == pattern_length) {
384        // No suffix to extend, so we check against last_char only.
385        while ((i > start) && (pattern[i - 1] != last_char)) {
386          if (shift_table[pattern_length] == length) {
387            shift_table[pattern_length] = pattern_length - i;
388          }
389          suffix_table[--i] = pattern_length;
390        }
391        if (i > start) {
392          suffix_table[--i] = --suffix;
393        }
394      }
395    }
396  }
397  // Build shift table using suffixes.
398  if (suffix < pattern_length) {
399    for (int i = start; i <= pattern_length; i++) {
400      if (shift_table[i] == length) {
401        shift_table[i] = suffix - start;
402      }
403      if (i == suffix) {
404        suffix = suffix_table[suffix];
405      }
406    }
407  }
408}
409
410//---------------------------------------------------------------------
411// Boyer-Moore-Horspool string search.
412//---------------------------------------------------------------------
413
414template <typename PatternChar, typename SubjectChar>
415int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
416    StringSearch<PatternChar, SubjectChar>* search,
417    base::Vector<const SubjectChar> subject, int start_index) {
418  base::Vector<const PatternChar> pattern = search->pattern_;
419  int subject_length = subject.length();
420  int pattern_length = pattern.length();
421  int* char_occurrences = search->bad_char_table();
422  int badness = -pattern_length;
423
424  // How bad we are doing without a good-suffix table.
425  PatternChar last_char = pattern[pattern_length - 1];
426  int last_char_shift =
427      pattern_length - 1 -
428      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
429  // Perform search
430  int index = start_index;  // No matches found prior to this index.
431  while (index <= subject_length - pattern_length) {
432    int j = pattern_length - 1;
433    int subject_char;
434    while (last_char != (subject_char = subject[index + j])) {
435      int bc_occ = CharOccurrence(char_occurrences, subject_char);
436      int shift = j - bc_occ;
437      index += shift;
438      badness += 1 - shift;  // at most zero, so badness cannot increase.
439      if (index > subject_length - pattern_length) {
440        return -1;
441      }
442    }
443    j--;
444    while (j >= 0 && pattern[j] == (subject[index + j])) j--;
445    if (j < 0) {
446      return index;
447    } else {
448      index += last_char_shift;
449      // Badness increases by the number of characters we have
450      // checked, and decreases by the number of characters we
451      // can skip by shifting. It's a measure of how we are doing
452      // compared to reading each character exactly once.
453      badness += (pattern_length - j) - last_char_shift;
454      if (badness > 0) {
455        search->PopulateBoyerMooreTable();
456        search->strategy_ = &BoyerMooreSearch;
457        return BoyerMooreSearch(search, subject, index);
458      }
459    }
460  }
461  return -1;
462}
463
464template <typename PatternChar, typename SubjectChar>
465void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
466  int pattern_length = pattern_.length();
467
468  int* bad_char_occurrence = bad_char_table();
469
470  // Only preprocess at most kBMMaxShift last characters of pattern.
471  int start = start_;
472  // Run forwards to populate bad_char_table, so that *last* instance
473  // of character equivalence class is the one registered.
474  // Notice: Doesn't include the last character.
475  int table_size = AlphabetSize();
476  if (start == 0) {  // All patterns less than kBMMaxShift in length.
477    memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
478  } else {
479    for (int i = 0; i < table_size; i++) {
480      bad_char_occurrence[i] = start - 1;
481    }
482  }
483  for (int i = start; i < pattern_length - 1; i++) {
484    PatternChar c = pattern_[i];
485    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
486    bad_char_occurrence[bucket] = i;
487  }
488}
489
490//---------------------------------------------------------------------
491// Linear string search with bailout to BMH.
492//---------------------------------------------------------------------
493
494// Simple linear search for short patterns, which bails out if the string
495// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
496template <typename PatternChar, typename SubjectChar>
497int StringSearch<PatternChar, SubjectChar>::InitialSearch(
498    StringSearch<PatternChar, SubjectChar>* search,
499    base::Vector<const SubjectChar> subject, int index) {
500  base::Vector<const PatternChar> pattern = search->pattern_;
501  int pattern_length = pattern.length();
502  // Badness is a count of how much work we have done.  When we have
503  // done enough work we decide it's probably worth switching to a better
504  // algorithm.
505  int badness = -10 - (pattern_length << 2);
506
507  // We know our pattern is at least 2 characters, we cache the first so
508  // the common case of the first character not matching is faster.
509  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
510    badness++;
511    if (badness <= 0) {
512      i = FindFirstCharacter(pattern, subject, i);
513      if (i == -1) return -1;
514      DCHECK_LE(i, n);
515      int j = 1;
516      do {
517        if (pattern[j] != subject[i + j]) {
518          break;
519        }
520        j++;
521      } while (j < pattern_length);
522      if (j == pattern_length) {
523        return i;
524      }
525      badness += j;
526    } else {
527      search->PopulateBoyerMooreHorspoolTable();
528      search->strategy_ = &BoyerMooreHorspoolSearch;
529      return BoyerMooreHorspoolSearch(search, subject, i);
530    }
531  }
532  return -1;
533}
534
535// Perform a a single stand-alone search.
536// If searching multiple times for the same pattern, a search
537// object should be constructed once and the Search function then called
538// for each search.
539template <typename SubjectChar, typename PatternChar>
540int SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject,
541                 base::Vector<const PatternChar> pattern, int start_index) {
542  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
543  return search.Search(subject, start_index);
544}
545
546// A wrapper function around SearchString that wraps raw pointers to the subject
547// and pattern as vectors before calling SearchString. Used from the
548// StringIndexOf builtin.
549template <typename SubjectChar, typename PatternChar>
550intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
551                         int subject_length, const PatternChar* pattern_ptr,
552                         int pattern_length, int start_index) {
553  DisallowGarbageCollection no_gc;
554  base::Vector<const SubjectChar> subject(subject_ptr, subject_length);
555  base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
556  return SearchString(isolate, subject, pattern, start_index);
557}
558
559}  // namespace internal
560}  // namespace v8
561
562#endif  // V8_STRINGS_STRING_SEARCH_H_
563