1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRINGS_STRING_SEARCH_H_
6 #define V8_STRINGS_STRING_SEARCH_H_
7 
8 #include "src/base/strings.h"
9 #include "src/base/vector.h"
10 #include "src/execution/isolate.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 //---------------------------------------------------------------------
16 // String Search object.
17 //---------------------------------------------------------------------
18 
19 // Class holding constants and methods that apply to all string search variants,
20 // independently of subject and pattern char size.
21 class StringSearchBase {
22  protected:
23   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24   // limit, we can fix the size of tables. For a needle longer than this limit,
25   // search will not be optimal, since we only build tables for a suffix
26   // of the string, but it is a safe approximation.
27   static const int kBMMaxShift = Isolate::kBMMaxShift;
28 
29   // Reduce alphabet to this size.
30   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31   // proportional to the input alphabet. We reduce the alphabet size by
32   // equating input characters modulo a smaller alphabet size. This gives
33   // a potentially less efficient searching, but is a safe approximation.
34   // For needles using only characters in the same Unicode 256-code point page,
35   // there is no search speed degradation.
36   static const int kLatin1AlphabetSize = 256;
37   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38 
39   // Bad-char shift table stored in the state. It's length is the alphabet size.
40   // For patterns below this length, the skip length of Boyer-Moore is too short
41   // to compensate for the algorithmic overhead compared to simple brute force.
42   static const int kBMMinPatternLength = 7;
43 
IsOneByteString(base::Vector<const uint8_t> string)44   static inline bool IsOneByteString(base::Vector<const uint8_t> string) {
45     return true;
46   }
47 
IsOneByteString(base::Vector<const base::uc16> string)48   static inline bool IsOneByteString(base::Vector<const base::uc16> string) {
49     return String::IsOneByte(string.begin(), string.length());
50   }
51 
52   friend class Isolate;
53 };
54 
55 template <typename PatternChar, typename SubjectChar>
56 class StringSearch : private StringSearchBase {
57  public:
StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern)58   StringSearch(Isolate* isolate, base::Vector<const PatternChar> pattern)
59       : isolate_(isolate),
60         pattern_(pattern),
61         start_(std::max(0, pattern.length() - kBMMaxShift)) {
62     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
63       if (!IsOneByteString(pattern_)) {
64         strategy_ = &FailSearch;
65         return;
66       }
67     }
68     int pattern_length = pattern_.length();
69     if (pattern_length < kBMMinPatternLength) {
70       if (pattern_length == 1) {
71         strategy_ = &SingleCharSearch;
72         return;
73       }
74       strategy_ = &LinearSearch;
75       return;
76     }
77     strategy_ = &InitialSearch;
78   }
79 
Search(base::Vector<const SubjectChar> subject, int index)80   int Search(base::Vector<const SubjectChar> subject, int index) {
81     return strategy_(this, subject, index);
82   }
83 
AlphabetSize()84   static inline int AlphabetSize() {
85     if (sizeof(PatternChar) == 1) {
86       // Latin1 needle.
87       return kLatin1AlphabetSize;
88     } else {
89       DCHECK_EQ(sizeof(PatternChar), 2);
90       // UC16 needle.
91       return kUC16AlphabetSize;
92     }
93   }
94 
95  private:
96   using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*,
97                                  base::Vector<const SubjectChar>, int);
98 
FailSearch(StringSearch<PatternChar, SubjectChar>*, base::Vector<const SubjectChar>, int)99   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100                         base::Vector<const SubjectChar>, int) {
101     return -1;
102   }
103 
104   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
105                               base::Vector<const SubjectChar> subject,
106                               int start_index);
107 
108   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
109                           base::Vector<const SubjectChar> subject,
110                           int start_index);
111 
112   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
113                            base::Vector<const SubjectChar> subject,
114                            int start_index);
115 
116   static int BoyerMooreHorspoolSearch(
117       StringSearch<PatternChar, SubjectChar>* search,
118       base::Vector<const SubjectChar> subject, int start_index);
119 
120   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
121                               base::Vector<const SubjectChar> subject,
122                               int start_index);
123 
124   void PopulateBoyerMooreHorspoolTable();
125 
126   void PopulateBoyerMooreTable();
127 
exceedsOneByte(uint8_t c)128   static inline bool exceedsOneByte(uint8_t c) { return false; }
129 
exceedsOneByte(uint16_t c)130   static inline bool exceedsOneByte(uint16_t c) {
131     return c > String::kMaxOneByteCharCodeU;
132   }
133 
CharOccurrence(int* bad_char_occurrence, SubjectChar char_code)134   static inline int CharOccurrence(int* bad_char_occurrence,
135                                    SubjectChar char_code) {
136     if (sizeof(SubjectChar) == 1) {
137       return bad_char_occurrence[static_cast<int>(char_code)];
138     }
139     if (sizeof(PatternChar) == 1) {
140       if (exceedsOneByte(char_code)) {
141         return -1;
142       }
143       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
144     }
145     // Both pattern and subject are UC16. Reduce character to equivalence
146     // class.
147     int equiv_class = char_code % kUC16AlphabetSize;
148     return bad_char_occurrence[equiv_class];
149   }
150 
151   // The following tables are shared by all searches.
152   // TODO(lrn): Introduce a way for a pattern to keep its tables
153   // between searches (e.g., for an Atom RegExp).
154 
155   // Store for the BoyerMoore(Horspool) bad char shift table.
156   // Return a table covering the last kBMMaxShift+1 positions of
157   // pattern.
bad_char_table()158   int* bad_char_table() { return isolate_->bad_char_shift_table(); }
159 
160   // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()161   int* good_suffix_shift_table() {
162     // Return biased pointer that maps the range  [start_..pattern_.length()
163     // to the kGoodSuffixShiftTable array.
164     return isolate_->good_suffix_shift_table() - start_;
165   }
166 
167   // Table used temporarily while building the BoyerMoore good suffix
168   // shift table.
suffix_table()169   int* suffix_table() {
170     // Return biased pointer that maps the range  [start_..pattern_.length()
171     // to the kSuffixTable array.
172     return isolate_->suffix_table() - start_;
173   }
174 
175   Isolate* isolate_;
176   // The pattern to search for.
177   base::Vector<const PatternChar> pattern_;
178   // Pointer to implementation of the search.
179   SearchFunction strategy_;
180   // Cache value of max(0, pattern_length() - kBMMaxShift)
181   int start_;
182 };
183 
184 template <typename T, typename U>
AlignDown(T value, U alignment)185 inline T AlignDown(T value, U alignment) {
186   return reinterpret_cast<T>(
187       (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
188 }
189 
GetHighestValueByte(base::uc16 character)190 inline uint8_t GetHighestValueByte(base::uc16 character) {
191   return std::max(static_cast<uint8_t>(character & 0xFF),
192                   static_cast<uint8_t>(character >> 8));
193 }
194 
GetHighestValueByte(uint8_t character)195 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
196 
197 template <typename PatternChar, typename SubjectChar>
FindFirstCharacter(base::Vector<const PatternChar> pattern, base::Vector<const SubjectChar> subject, int index)198 inline int FindFirstCharacter(base::Vector<const PatternChar> pattern,
199                               base::Vector<const SubjectChar> subject,
200                               int index) {
201   const PatternChar pattern_first_char = pattern[0];
202   const int max_n = (subject.length() - pattern.length() + 1);
203 
204   if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) {
205     // Special-case looking for the 0 char in other than one-byte strings.
206     // memchr mostly fails in this case due to every other byte being 0 in text
207     // that is mostly ascii characters.
208     for (int i = index; i < max_n; ++i) {
209       if (subject[i] == 0) return i;
210     }
211     return -1;
212   }
213   const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
214   const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
215   int pos = index;
216   do {
217     DCHECK_GE(max_n - pos, 0);
218     const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
219         memchr(subject.begin() + pos, search_byte,
220                (max_n - pos) * sizeof(SubjectChar)));
221     if (char_pos == nullptr) return -1;
222     char_pos = AlignDown(char_pos, sizeof(SubjectChar));
223     pos = static_cast<int>(char_pos - subject.begin());
224     if (subject[pos] == search_char) return pos;
225   } while (++pos < max_n);
226 
227   return -1;
228 }
229 
230 //---------------------------------------------------------------------
231 // Single Character Pattern Search Strategy
232 //---------------------------------------------------------------------
233 
234 template <typename PatternChar, typename SubjectChar>
SingleCharSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int index)235 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
236     StringSearch<PatternChar, SubjectChar>* search,
237     base::Vector<const SubjectChar> subject, int index) {
238   DCHECK_EQ(1, search->pattern_.length());
239   PatternChar pattern_first_char = search->pattern_[0];
240   if (sizeof(PatternChar) > sizeof(SubjectChar)) {
241     if (exceedsOneByte(pattern_first_char)) {
242       return -1;
243     }
244   }
245   return FindFirstCharacter(search->pattern_, subject, index);
246 }
247 
248 //---------------------------------------------------------------------
249 // Linear Search Strategy
250 //---------------------------------------------------------------------
251 
252 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar* pattern, const SubjectChar* subject, int length)253 inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
254                         int length) {
255   DCHECK_GT(length, 0);
256   int pos = 0;
257   do {
258     if (pattern[pos] != subject[pos]) {
259       return false;
260     }
261     pos++;
262   } while (pos < length);
263   return true;
264 }
265 
266 // Simple linear search for short patterns. Never bails out.
267 template <typename PatternChar, typename SubjectChar>
LinearSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int index)268 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
269     StringSearch<PatternChar, SubjectChar>* search,
270     base::Vector<const SubjectChar> subject, int index) {
271   base::Vector<const PatternChar> pattern = search->pattern_;
272   DCHECK_GT(pattern.length(), 1);
273   int pattern_length = pattern.length();
274   int i = index;
275   int n = subject.length() - pattern_length;
276   while (i <= n) {
277     i = FindFirstCharacter(pattern, subject, i);
278     if (i == -1) return -1;
279     DCHECK_LE(i, n);
280     i++;
281     // Loop extracted to separate function to allow using return to do
282     // a deeper break.
283     if (CharCompare(pattern.begin() + 1, subject.begin() + i,
284                     pattern_length - 1)) {
285       return i - 1;
286     }
287   }
288   return -1;
289 }
290 
291 //---------------------------------------------------------------------
292 // Boyer-Moore string search
293 //---------------------------------------------------------------------
294 
295 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int start_index)296 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
297     StringSearch<PatternChar, SubjectChar>* search,
298     base::Vector<const SubjectChar> subject, int start_index) {
299   base::Vector<const PatternChar> pattern = search->pattern_;
300   int subject_length = subject.length();
301   int pattern_length = pattern.length();
302   // Only preprocess at most kBMMaxShift last characters of pattern.
303   int start = search->start_;
304 
305   int* bad_char_occurence = search->bad_char_table();
306   int* good_suffix_shift = search->good_suffix_shift_table();
307 
308   PatternChar last_char = pattern[pattern_length - 1];
309   int index = start_index;
310   // Continue search from i.
311   while (index <= subject_length - pattern_length) {
312     int j = pattern_length - 1;
313     int c;
314     while (last_char != (c = subject[index + j])) {
315       int shift = j - CharOccurrence(bad_char_occurence, c);
316       index += shift;
317       if (index > subject_length - pattern_length) {
318         return -1;
319       }
320     }
321     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
322     if (j < 0) {
323       return index;
324     } else if (j < start) {
325       // we have matched more than our tables allow us to be smart about.
326       // Fall back on BMH shift.
327       index += pattern_length - 1 -
328                CharOccurrence(bad_char_occurence,
329                               static_cast<SubjectChar>(last_char));
330     } else {
331       int gs_shift = good_suffix_shift[j + 1];
332       int bc_occ = CharOccurrence(bad_char_occurence, c);
333       int shift = j - bc_occ;
334       if (gs_shift > shift) {
335         shift = gs_shift;
336       }
337       index += shift;
338     }
339   }
340 
341   return -1;
342 }
343 
344 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()345 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
346   int pattern_length = pattern_.length();
347   const PatternChar* pattern = pattern_.begin();
348   // Only look at the last kBMMaxShift characters of pattern (from start_
349   // to pattern_length).
350   int start = start_;
351   int length = pattern_length - start;
352 
353   // Biased tables so that we can use pattern indices as table indices,
354   // even if we only cover the part of the pattern from offset start.
355   int* shift_table = good_suffix_shift_table();
356   int* suffix_table = this->suffix_table();
357 
358   // Initialize table.
359   for (int i = start; i < pattern_length; i++) {
360     shift_table[i] = length;
361   }
362   shift_table[pattern_length] = 1;
363   suffix_table[pattern_length] = pattern_length + 1;
364 
365   if (pattern_length <= start) {
366     return;
367   }
368 
369   // Find suffixes.
370   PatternChar last_char = pattern[pattern_length - 1];
371   int suffix = pattern_length + 1;
372   {
373     int i = pattern_length;
374     while (i > start) {
375       PatternChar c = pattern[i - 1];
376       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
377         if (shift_table[suffix] == length) {
378           shift_table[suffix] = suffix - i;
379         }
380         suffix = suffix_table[suffix];
381       }
382       suffix_table[--i] = --suffix;
383       if (suffix == pattern_length) {
384         // No suffix to extend, so we check against last_char only.
385         while ((i > start) && (pattern[i - 1] != last_char)) {
386           if (shift_table[pattern_length] == length) {
387             shift_table[pattern_length] = pattern_length - i;
388           }
389           suffix_table[--i] = pattern_length;
390         }
391         if (i > start) {
392           suffix_table[--i] = --suffix;
393         }
394       }
395     }
396   }
397   // Build shift table using suffixes.
398   if (suffix < pattern_length) {
399     for (int i = start; i <= pattern_length; i++) {
400       if (shift_table[i] == length) {
401         shift_table[i] = suffix - start;
402       }
403       if (i == suffix) {
404         suffix = suffix_table[suffix];
405       }
406     }
407   }
408 }
409 
410 //---------------------------------------------------------------------
411 // Boyer-Moore-Horspool string search.
412 //---------------------------------------------------------------------
413 
414 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int start_index)415 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
416     StringSearch<PatternChar, SubjectChar>* search,
417     base::Vector<const SubjectChar> subject, int start_index) {
418   base::Vector<const PatternChar> pattern = search->pattern_;
419   int subject_length = subject.length();
420   int pattern_length = pattern.length();
421   int* char_occurrences = search->bad_char_table();
422   int badness = -pattern_length;
423 
424   // How bad we are doing without a good-suffix table.
425   PatternChar last_char = pattern[pattern_length - 1];
426   int last_char_shift =
427       pattern_length - 1 -
428       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
429   // Perform search
430   int index = start_index;  // No matches found prior to this index.
431   while (index <= subject_length - pattern_length) {
432     int j = pattern_length - 1;
433     int subject_char;
434     while (last_char != (subject_char = subject[index + j])) {
435       int bc_occ = CharOccurrence(char_occurrences, subject_char);
436       int shift = j - bc_occ;
437       index += shift;
438       badness += 1 - shift;  // at most zero, so badness cannot increase.
439       if (index > subject_length - pattern_length) {
440         return -1;
441       }
442     }
443     j--;
444     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
445     if (j < 0) {
446       return index;
447     } else {
448       index += last_char_shift;
449       // Badness increases by the number of characters we have
450       // checked, and decreases by the number of characters we
451       // can skip by shifting. It's a measure of how we are doing
452       // compared to reading each character exactly once.
453       badness += (pattern_length - j) - last_char_shift;
454       if (badness > 0) {
455         search->PopulateBoyerMooreTable();
456         search->strategy_ = &BoyerMooreSearch;
457         return BoyerMooreSearch(search, subject, index);
458       }
459     }
460   }
461   return -1;
462 }
463 
464 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()465 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
466   int pattern_length = pattern_.length();
467 
468   int* bad_char_occurrence = bad_char_table();
469 
470   // Only preprocess at most kBMMaxShift last characters of pattern.
471   int start = start_;
472   // Run forwards to populate bad_char_table, so that *last* instance
473   // of character equivalence class is the one registered.
474   // Notice: Doesn't include the last character.
475   int table_size = AlphabetSize();
476   if (start == 0) {  // All patterns less than kBMMaxShift in length.
477     memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
478   } else {
479     for (int i = 0; i < table_size; i++) {
480       bad_char_occurrence[i] = start - 1;
481     }
482   }
483   for (int i = start; i < pattern_length - 1; i++) {
484     PatternChar c = pattern_[i];
485     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
486     bad_char_occurrence[bucket] = i;
487   }
488 }
489 
490 //---------------------------------------------------------------------
491 // Linear string search with bailout to BMH.
492 //---------------------------------------------------------------------
493 
494 // Simple linear search for short patterns, which bails out if the string
495 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
496 template <typename PatternChar, typename SubjectChar>
InitialSearch( StringSearch<PatternChar, SubjectChar>* search, base::Vector<const SubjectChar> subject, int index)497 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
498     StringSearch<PatternChar, SubjectChar>* search,
499     base::Vector<const SubjectChar> subject, int index) {
500   base::Vector<const PatternChar> pattern = search->pattern_;
501   int pattern_length = pattern.length();
502   // Badness is a count of how much work we have done.  When we have
503   // done enough work we decide it's probably worth switching to a better
504   // algorithm.
505   int badness = -10 - (pattern_length << 2);
506 
507   // We know our pattern is at least 2 characters, we cache the first so
508   // the common case of the first character not matching is faster.
509   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
510     badness++;
511     if (badness <= 0) {
512       i = FindFirstCharacter(pattern, subject, i);
513       if (i == -1) return -1;
514       DCHECK_LE(i, n);
515       int j = 1;
516       do {
517         if (pattern[j] != subject[i + j]) {
518           break;
519         }
520         j++;
521       } while (j < pattern_length);
522       if (j == pattern_length) {
523         return i;
524       }
525       badness += j;
526     } else {
527       search->PopulateBoyerMooreHorspoolTable();
528       search->strategy_ = &BoyerMooreHorspoolSearch;
529       return BoyerMooreHorspoolSearch(search, subject, i);
530     }
531   }
532   return -1;
533 }
534 
535 // Perform a a single stand-alone search.
536 // If searching multiple times for the same pattern, a search
537 // object should be constructed once and the Search function then called
538 // for each search.
539 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject, base::Vector<const PatternChar> pattern, int start_index)540 int SearchString(Isolate* isolate, base::Vector<const SubjectChar> subject,
541                  base::Vector<const PatternChar> pattern, int start_index) {
542   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
543   return search.Search(subject, start_index);
544 }
545 
546 // A wrapper function around SearchString that wraps raw pointers to the subject
547 // and pattern as vectors before calling SearchString. Used from the
548 // StringIndexOf builtin.
549 template <typename SubjectChar, typename PatternChar>
SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr, int subject_length, const PatternChar* pattern_ptr, int pattern_length, int start_index)550 intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
551                          int subject_length, const PatternChar* pattern_ptr,
552                          int pattern_length, int start_index) {
553   DisallowGarbageCollection no_gc;
554   base::Vector<const SubjectChar> subject(subject_ptr, subject_length);
555   base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
556   return SearchString(isolate, subject, pattern, start_index);
557 }
558 
559 }  // namespace internal
560 }  // namespace v8
561 
562 #endif  // V8_STRINGS_STRING_SEARCH_H_
563