xref: /third_party/node/src/string_search.h (revision 1cb0ef41)
1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef SRC_STRING_SEARCH_H_
6#define SRC_STRING_SEARCH_H_
7
8#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
9
10#include "util.h"
11
12#include <cstring>
13#include <algorithm>
14
15namespace node {
16namespace stringsearch {
17
18template <typename T>
19class Vector {
20 public:
21  Vector(T* data, size_t length, bool isForward)
22      : start_(data), length_(length), is_forward_(isForward) {
23    CHECK(length > 0 && data != nullptr);
24  }
25
26  // Returns the start of the memory range.
27  // For vector v this is NOT necessarily &v[0], see forward().
28  const T* start() const { return start_; }
29
30  // Returns the length of the vector, in characters.
31  size_t length() const { return length_; }
32
33  // Returns true if the Vector is front-to-back, false if back-to-front.
34  // In the latter case, v[0] corresponds to the *end* of the memory range.
35  bool forward() const { return is_forward_; }
36
37  // Access individual vector elements - checks bounds in debug mode.
38  T& operator[](size_t index) const {
39    DCHECK_LT(index, length_);
40    return start_[is_forward_ ? index : (length_ - index - 1)];
41  }
42
43 private:
44  T* start_;
45  size_t length_;
46  bool is_forward_;
47};
48
49
50//---------------------------------------------------------------------
51// String Search object.
52//---------------------------------------------------------------------
53
54// Class holding constants and methods that apply to all string search variants,
55// independently of subject and pattern char size.
56class StringSearchBase {
57 protected:
58  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
59  // limit, we can fix the size of tables. For a needle longer than this limit,
60  // search will not be optimal, since we only build tables for a suffix
61  // of the string, but it is a safe approximation.
62  static const int kBMMaxShift = 250;
63
64  // Reduce alphabet to this size.
65  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
66  // proportional to the input alphabet. We reduce the alphabet size by
67  // equating input characters modulo a smaller alphabet size. This gives
68  // a potentially less efficient searching, but is a safe approximation.
69  // For needles using only characters in the same Unicode 256-code point page,
70  // there is no search speed degradation.
71  static const int kLatin1AlphabetSize = 256;
72  static const int kUC16AlphabetSize = 256;
73
74  // Bad-char shift table stored in the state. It's length is the alphabet size.
75  // For patterns below this length, the skip length of Boyer-Moore is too short
76  // to compensate for the algorithmic overhead compared to simple brute force.
77  static const int kBMMinPatternLength = 8;
78
79  // Store for the BoyerMoore(Horspool) bad char shift table.
80  int bad_char_shift_table_[kUC16AlphabetSize];
81  // Store for the BoyerMoore good suffix shift table.
82  int good_suffix_shift_table_[kBMMaxShift + 1];
83  // Table used temporarily while building the BoyerMoore good suffix
84  // shift table.
85  int suffix_table_[kBMMaxShift + 1];
86};
87
88template <typename Char>
89class StringSearch : private StringSearchBase {
90 public:
91  typedef stringsearch::Vector<const Char> Vector;
92
93  explicit StringSearch(Vector pattern)
94      : pattern_(pattern), start_(0) {
95    if (pattern.length() >= kBMMaxShift) {
96      start_ = pattern.length() - kBMMaxShift;
97    }
98
99    size_t pattern_length = pattern_.length();
100    CHECK_GT(pattern_length, 0);
101    if (pattern_length < kBMMinPatternLength) {
102      if (pattern_length == 1) {
103        strategy_ = SearchStrategy::kSingleChar;
104        return;
105      }
106      strategy_ = SearchStrategy::kLinear;
107      return;
108    }
109    strategy_ = SearchStrategy::kInitial;
110  }
111
112  size_t Search(Vector subject, size_t index) {
113    switch (strategy_) {
114      case kBoyerMooreHorspool:
115        return BoyerMooreHorspoolSearch(subject, index);
116      case kBoyerMoore:
117        return BoyerMooreSearch(subject, index);
118      case kInitial:
119        return InitialSearch(subject, index);
120      case kLinear:
121        return LinearSearch(subject, index);
122      case kSingleChar:
123        return SingleCharSearch(subject, index);
124    }
125    UNREACHABLE();
126  }
127
128  static inline int AlphabetSize() {
129    if (sizeof(Char) == 1) {
130      // Latin1 needle.
131      return kLatin1AlphabetSize;
132    } else {
133      // UC16 needle.
134      return kUC16AlphabetSize;
135    }
136
137    static_assert(sizeof(Char) == sizeof(uint8_t) ||
138                  sizeof(Char) == sizeof(uint16_t),
139                  "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");
140  }
141
142 private:
143  typedef size_t (StringSearch::*SearchFunction)(Vector, size_t);
144  size_t SingleCharSearch(Vector subject, size_t start_index);
145  size_t LinearSearch(Vector subject, size_t start_index);
146  size_t InitialSearch(Vector subject, size_t start_index);
147  size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index);
148  size_t BoyerMooreSearch(Vector subject, size_t start_index);
149
150  void PopulateBoyerMooreHorspoolTable();
151
152  void PopulateBoyerMooreTable();
153
154  static inline int CharOccurrence(int* bad_char_occurrence,
155                                   Char char_code) {
156    if (sizeof(Char) == 1) {
157      return bad_char_occurrence[static_cast<int>(char_code)];
158    }
159    // Both pattern and subject are UC16. Reduce character to equivalence class.
160    int equiv_class = char_code % kUC16AlphabetSize;
161    return bad_char_occurrence[equiv_class];
162  }
163
164  enum SearchStrategy {
165    kBoyerMooreHorspool,
166    kBoyerMoore,
167    kInitial,
168    kLinear,
169    kSingleChar,
170  };
171
172  // The pattern to search for.
173  Vector pattern_;
174  SearchStrategy strategy_;
175  // Cache value of Max(0, pattern_length() - kBMMaxShift)
176  size_t start_;
177};
178
179
180template <typename T, typename U>
181inline T AlignDown(T value, U alignment) {
182  return reinterpret_cast<T>(
183      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
184}
185
186
187inline uint8_t GetHighestValueByte(uint16_t character) {
188  return std::max(static_cast<uint8_t>(character & 0xFF),
189                  static_cast<uint8_t>(character >> 8));
190}
191
192
193inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
194
195
196// Searches for a byte value in a memory buffer, back to front.
197// Uses memrchr(3) on systems which support it, for speed.
198// Falls back to a vanilla for loop on non-GNU systems such as Windows.
199inline const void* MemrchrFill(const void* haystack, uint8_t needle,
200                               size_t haystack_len) {
201#ifdef _GNU_SOURCE
202  return memrchr(haystack, needle, haystack_len);
203#else
204  const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack);
205  for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) {
206    if (haystack8[i] == needle) {
207      return haystack8 + i;
208    }
209  }
210  return nullptr;
211#endif
212}
213
214
215// Finds the first occurrence of *two-byte* character pattern[0] in the string
216// `subject`. Does not check that the whole pattern matches.
217template <typename Char>
218inline size_t FindFirstCharacter(Vector<const Char> pattern,
219                                 Vector<const Char> subject, size_t index) {
220  const Char pattern_first_char = pattern[0];
221  const size_t max_n = (subject.length() - pattern.length() + 1);
222
223  // For speed, search for the more `rare` of the two bytes in pattern[0]
224  // using memchr / memrchr (which are much faster than a simple for loop).
225  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
226  size_t pos = index;
227  do {
228    const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
229    const void* void_pos;
230    if (subject.forward()) {
231      // Assert that bytes_to_search won't overflow
232      CHECK_LE(pos, max_n);
233      CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
234      void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search);
235    } else {
236      CHECK_LE(pos, subject.length());
237      CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
238      void_pos = MemrchrFill(subject.start() + pattern.length() - 1,
239                             search_byte,
240                             bytes_to_search);
241    }
242    const Char* char_pos = static_cast<const Char*>(void_pos);
243    if (char_pos == nullptr)
244      return subject.length();
245
246    // Then, for each match, verify that the full two bytes match pattern[0].
247    char_pos = AlignDown(char_pos, sizeof(Char));
248    size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
249    pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1);
250    if (subject[pos] == pattern_first_char) {
251      // Match found, hooray.
252      return pos;
253    }
254    // Search byte matched, but the other byte of pattern[0] didn't. Keep going.
255  } while (++pos < max_n);
256
257  return subject.length();
258}
259
260
261// Finds the first occurrence of the byte pattern[0] in string `subject`.
262// Does not verify that the whole pattern matches.
263template <>
264inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
265                                 Vector<const uint8_t> subject,
266                                 size_t index) {
267  const uint8_t pattern_first_char = pattern[0];
268  const size_t subj_len = subject.length();
269  const size_t max_n = (subject.length() - pattern.length() + 1);
270
271  const void* pos;
272  if (subject.forward()) {
273    pos = memchr(subject.start() + index, pattern_first_char, max_n - index);
274  } else {
275    pos = MemrchrFill(subject.start() + pattern.length() - 1,
276                      pattern_first_char,
277                      max_n - index);
278  }
279  const uint8_t* char_pos = static_cast<const uint8_t*>(pos);
280  if (char_pos == nullptr) {
281    return subj_len;
282  }
283
284  size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
285  return subject.forward() ? raw_pos : (subj_len - raw_pos - 1);
286}
287
288//---------------------------------------------------------------------
289// Single Character Pattern Search Strategy
290//---------------------------------------------------------------------
291
292template <typename Char>
293size_t StringSearch<Char>::SingleCharSearch(
294    Vector subject,
295    size_t index) {
296  CHECK_EQ(1, pattern_.length());
297  return FindFirstCharacter(pattern_, subject, index);
298}
299
300//---------------------------------------------------------------------
301// Linear Search Strategy
302//---------------------------------------------------------------------
303
304// Simple linear search for short patterns. Never bails out.
305template <typename Char>
306size_t StringSearch<Char>::LinearSearch(
307    Vector subject,
308    size_t index) {
309  CHECK_GT(pattern_.length(), 1);
310  const size_t n = subject.length() - pattern_.length();
311  for (size_t i = index; i <= n; i++) {
312    i = FindFirstCharacter(pattern_, subject, i);
313    if (i == subject.length())
314      return subject.length();
315    CHECK_LE(i, n);
316
317    bool matches = true;
318    for (size_t j = 1; j < pattern_.length(); j++) {
319      if (pattern_[j] != subject[i + j]) {
320        matches = false;
321        break;
322      }
323    }
324    if (matches) {
325      return i;
326    }
327  }
328  return subject.length();
329}
330
331//---------------------------------------------------------------------
332// Boyer-Moore string search
333//---------------------------------------------------------------------
334
335template <typename Char>
336size_t StringSearch<Char>::BoyerMooreSearch(
337    Vector subject,
338    size_t start_index) {
339  const size_t subject_length = subject.length();
340  const size_t pattern_length = pattern_.length();
341  // Only preprocess at most kBMMaxShift last characters of pattern.
342  size_t start = start_;
343
344  int* bad_char_occurrence = bad_char_shift_table_;
345  int* good_suffix_shift = good_suffix_shift_table_ - start_;
346
347  Char last_char = pattern_[pattern_length - 1];
348  size_t index = start_index;
349  // Continue search from i.
350  while (index <= subject_length - pattern_length) {
351    size_t j = pattern_length - 1;
352    int c;
353    while (last_char != (c = subject[index + j])) {
354      int shift = j - CharOccurrence(bad_char_occurrence, c);
355      index += shift;
356      if (index > subject_length - pattern_length) {
357        return subject.length();
358      }
359    }
360    while (pattern_[j] == (c = subject[index + j])) {
361      if (j == 0) {
362        return index;
363      }
364      j--;
365    }
366    if (j < start) {
367      // we have matched more than our tables allow us to be smart about.
368      // Fall back on BMH shift.
369      index += pattern_length - 1 -
370               CharOccurrence(bad_char_occurrence, last_char);
371    } else {
372      int gs_shift = good_suffix_shift[j + 1];
373      int bc_occ = CharOccurrence(bad_char_occurrence, c);
374      int shift = j - bc_occ;
375      if (gs_shift > shift) {
376        shift = gs_shift;
377      }
378      index += shift;
379    }
380  }
381
382  return subject.length();
383}
384
385template <typename Char>
386void StringSearch<Char>::PopulateBoyerMooreTable() {
387  const size_t pattern_length = pattern_.length();
388  // Only look at the last kBMMaxShift characters of pattern (from start_
389  // to pattern_length).
390  const size_t start = start_;
391  const size_t length = pattern_length - start;
392
393  // Biased tables so that we can use pattern indices as table indices,
394  // even if we only cover the part of the pattern from offset start.
395  int* shift_table = good_suffix_shift_table_ - start_;
396  int* suffix_table = suffix_table_ - start_;
397
398  // Initialize table.
399  for (size_t i = start; i < pattern_length; i++) {
400    shift_table[i] = length;
401  }
402  shift_table[pattern_length] = 1;
403  suffix_table[pattern_length] = pattern_length + 1;
404
405  if (pattern_length <= start) {
406    return;
407  }
408
409  // Find suffixes.
410  Char last_char = pattern_[pattern_length - 1];
411  size_t suffix = pattern_length + 1;
412  {
413    size_t i = pattern_length;
414    while (i > start) {
415      Char c = pattern_[i - 1];
416      while (suffix <= pattern_length && c != pattern_[suffix - 1]) {
417        if (static_cast<size_t>(shift_table[suffix]) == length) {
418          shift_table[suffix] = suffix - i;
419        }
420        suffix = suffix_table[suffix];
421      }
422      suffix_table[--i] = --suffix;
423      if (suffix == pattern_length) {
424        // No suffix to extend, so we check against last_char only.
425        while ((i > start) && (pattern_[i - 1] != last_char)) {
426          if (static_cast<size_t>(shift_table[pattern_length]) == length) {
427            shift_table[pattern_length] = pattern_length - i;
428          }
429          suffix_table[--i] = pattern_length;
430        }
431        if (i > start) {
432          suffix_table[--i] = --suffix;
433        }
434      }
435    }
436  }
437  // Build shift table using suffixes.
438  if (suffix < pattern_length) {
439    for (size_t i = start; i <= pattern_length; i++) {
440      if (static_cast<size_t>(shift_table[i]) == length) {
441        shift_table[i] = suffix - start;
442      }
443      if (i == suffix) {
444        suffix = suffix_table[suffix];
445      }
446    }
447  }
448}
449
450//---------------------------------------------------------------------
451// Boyer-Moore-Horspool string search.
452//---------------------------------------------------------------------
453
454template <typename Char>
455size_t StringSearch<Char>::BoyerMooreHorspoolSearch(
456    Vector subject,
457    size_t start_index) {
458  const size_t subject_length = subject.length();
459  const size_t pattern_length = pattern_.length();
460  int* char_occurrences = bad_char_shift_table_;
461  int64_t badness = -static_cast<int64_t>(pattern_length);
462
463  // How bad we are doing without a good-suffix table.
464  Char last_char = pattern_[pattern_length - 1];
465  int last_char_shift =
466      pattern_length - 1 -
467      CharOccurrence(char_occurrences, last_char);
468
469  // Perform search
470  size_t index = start_index;  // No matches found prior to this index.
471  while (index <= subject_length - pattern_length) {
472    size_t j = pattern_length - 1;
473    int subject_char;
474    while (last_char != (subject_char = subject[index + j])) {
475      int bc_occ = CharOccurrence(char_occurrences, subject_char);
476      int shift = j - bc_occ;
477      index += shift;
478      badness += 1 - shift;  // at most zero, so badness cannot increase.
479      if (index > subject_length - pattern_length) {
480        return subject_length;
481      }
482    }
483    j--;
484    while (pattern_[j] == (subject[index + j])) {
485      if (j == 0) {
486        return index;
487      }
488      j--;
489    }
490    index += last_char_shift;
491    // Badness increases by the number of characters we have
492    // checked, and decreases by the number of characters we
493    // can skip by shifting. It's a measure of how we are doing
494    // compared to reading each character exactly once.
495    badness += (pattern_length - j) - last_char_shift;
496    if (badness > 0) {
497      PopulateBoyerMooreTable();
498      strategy_ = SearchStrategy::kBoyerMoore;
499      return BoyerMooreSearch(subject, index);
500    }
501  }
502  return subject.length();
503}
504
505template <typename Char>
506void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() {
507  const size_t pattern_length = pattern_.length();
508
509  int* bad_char_occurrence = bad_char_shift_table_;
510
511  // Only preprocess at most kBMMaxShift last characters of pattern.
512  const size_t start = start_;
513  // Run forwards to populate bad_char_table, so that *last* instance
514  // of character equivalence class is the one registered.
515  // Notice: Doesn't include the last character.
516  const size_t table_size = AlphabetSize();
517  if (start == 0) {
518    // All patterns less than kBMMaxShift in length.
519    memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
520  } else {
521    for (size_t i = 0; i < table_size; i++) {
522      bad_char_occurrence[i] = start - 1;
523    }
524  }
525  for (size_t i = start; i < pattern_length - 1; i++) {
526    Char c = pattern_[i];
527    int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize();
528    bad_char_occurrence[bucket] = i;
529  }
530}
531
532//---------------------------------------------------------------------
533// Linear string search with bailout to BMH.
534//---------------------------------------------------------------------
535
536// Simple linear search for short patterns, which bails out if the string
537// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
538template <typename Char>
539size_t StringSearch<Char>::InitialSearch(
540    Vector subject,
541    size_t index) {
542  const size_t pattern_length = pattern_.length();
543  // Badness is a count of how much work we have done.  When we have
544  // done enough work we decide it's probably worth switching to a better
545  // algorithm.
546  int64_t badness = -10 - (pattern_length << 2);
547
548  // We know our pattern is at least 2 characters, we cache the first so
549  // the common case of the first character not matching is faster.
550  for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
551    badness++;
552    if (badness <= 0) {
553      i = FindFirstCharacter(pattern_, subject, i);
554      if (i == subject.length())
555        return subject.length();
556      CHECK_LE(i, n);
557      size_t j = 1;
558      do {
559        if (pattern_[j] != subject[i + j]) {
560          break;
561        }
562        j++;
563      } while (j < pattern_length);
564      if (j == pattern_length) {
565        return i;
566      }
567      badness += j;
568    } else {
569      PopulateBoyerMooreHorspoolTable();
570      strategy_ = SearchStrategy::kBoyerMooreHorspool;
571      return BoyerMooreHorspoolSearch(subject, i);
572    }
573  }
574  return subject.length();
575}
576
577// Perform a single stand-alone search.
578// If searching multiple times for the same pattern, a search
579// object should be constructed once and the Search function then called
580// for each search.
581template <typename Char>
582size_t SearchString(Vector<const Char> subject,
583                    Vector<const Char> pattern,
584                    size_t start_index) {
585  StringSearch<Char> search(pattern);
586  return search.Search(subject, start_index);
587}
588}  // namespace stringsearch
589}  // namespace node
590
591namespace node {
592
593template <typename Char>
594size_t SearchString(const Char* haystack,
595                    size_t haystack_length,
596                    const Char* needle,
597                    size_t needle_length,
598                    size_t start_index,
599                    bool is_forward) {
600  if (haystack_length < needle_length) return haystack_length;
601  // To do a reverse search (lastIndexOf instead of indexOf) without redundant
602  // code, create two vectors that are reversed views into the input strings.
603  // For example, v_needle[0] would return the *last* character of the needle.
604  // So we're searching for the first instance of rev(needle) in rev(haystack)
605  stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward);
606  stringsearch::Vector<const Char> v_haystack(
607      haystack, haystack_length, is_forward);
608  size_t diff = haystack_length - needle_length;
609  size_t relative_start_index;
610  if (is_forward) {
611    relative_start_index = start_index;
612  } else if (diff < start_index) {
613    relative_start_index = 0;
614  } else {
615    relative_start_index = diff - start_index;
616  }
617  size_t pos = node::stringsearch::SearchString(
618      v_haystack, v_needle, relative_start_index);
619  if (pos == haystack_length) {
620    // not found
621    return pos;
622  }
623  return is_forward ? pos : (haystack_length - needle_length - pos);
624}
625
626template <size_t N>
627size_t SearchString(const char* haystack, size_t haystack_length,
628                    const char (&needle)[N]) {
629  return SearchString(
630      reinterpret_cast<const uint8_t*>(haystack), haystack_length,
631      reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true);
632}
633
634}  // namespace node
635
636#endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
637
638#endif  // SRC_STRING_SEARCH_H_
639