1c67d6573Sopenharmony_ciuse test::Bencher;
2c67d6573Sopenharmony_ci
3c67d6573Sopenharmony_ciuse crate::{Regex, Text};
4c67d6573Sopenharmony_ci
5c67d6573Sopenharmony_ci// USAGE: sherlock!(name, pattern, count)
6c67d6573Sopenharmony_ci//
7c67d6573Sopenharmony_ci// This is same as bench_find, except it always uses the Sherlock haystack.
8c67d6573Sopenharmony_cimacro_rules! sherlock {
9c67d6573Sopenharmony_ci    ($name:ident, $pattern:expr, $count:expr) => {
10c67d6573Sopenharmony_ci        bench_find!(
11c67d6573Sopenharmony_ci            $name,
12c67d6573Sopenharmony_ci            $pattern,
13c67d6573Sopenharmony_ci            $count,
14c67d6573Sopenharmony_ci            include_str!("data/sherlock.txt").to_owned()
15c67d6573Sopenharmony_ci        );
16c67d6573Sopenharmony_ci    };
17c67d6573Sopenharmony_ci}
18c67d6573Sopenharmony_ci
19c67d6573Sopenharmony_ci// These patterns are all single string literals that compile down to a variant
20c67d6573Sopenharmony_ci// of Boyer-Moore w/ memchr. This also demonstrates the impact that the
21c67d6573Sopenharmony_ci// frequency of a match has on performance.
22c67d6573Sopenharmony_cisherlock!(name_sherlock, r"Sherlock", 97);
23c67d6573Sopenharmony_cisherlock!(name_holmes, r"Holmes", 461);
24c67d6573Sopenharmony_cisherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91);
25c67d6573Sopenharmony_ci
26c67d6573Sopenharmony_ci// Like the above, except case insensitively. The prefix detector will extract
27c67d6573Sopenharmony_ci// multiple *cut* prefix literals for each of the following before hitting its
28c67d6573Sopenharmony_ci// limit. All of these should be able to use either memchr2 or memchr3.
29c67d6573Sopenharmony_ci// std C++ does not support inline modifier syntax
30c67d6573Sopenharmony_cisherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102);
31c67d6573Sopenharmony_cisherlock!(name_holmes_nocase, r"(?i)Holmes", 467);
32c67d6573Sopenharmony_cisherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96);
33c67d6573Sopenharmony_ci
34c67d6573Sopenharmony_ci// Will quickly find instances of 'Sherlock', but then needs to fall back to
35c67d6573Sopenharmony_ci// the lazy DFA to process the Unicode aware `\s`.
36c67d6573Sopenharmony_cisherlock!(name_whitespace, r"Sherlock\s+Holmes", 97);
37c67d6573Sopenharmony_ci
38c67d6573Sopenharmony_ci// Now try more variations on name matching.
39c67d6573Sopenharmony_ci// This one has two alternates that both start with 'S'. This should compile
40c67d6573Sopenharmony_ci// to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA.
41c67d6573Sopenharmony_cisherlock!(name_alt1, r"Sherlock|Street", 158);
42c67d6573Sopenharmony_ci// This one doesn't have a common byte, but should still use Aho-Corasick and
43c67d6573Sopenharmony_ci// memchr2.
44c67d6573Sopenharmony_ci// Never enters lazy DFA.
45c67d6573Sopenharmony_cisherlock!(name_alt2, r"Sherlock|Holmes", 558);
46c67d6573Sopenharmony_ci// Still using Aho-Corasick, but more patterns. Never enters lazy DFA but
47c67d6573Sopenharmony_ci// also can't use any memchr variant.
48c67d6573Sopenharmony_cisherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740);
49c67d6573Sopenharmony_ci// Still using Aho-Corasick, but needs the lazy DFA.
50c67d6573Sopenharmony_cisherlock!(
51c67d6573Sopenharmony_ci    name_alt3_nocase,
52c67d6573Sopenharmony_ci    r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker",
53c67d6573Sopenharmony_ci    753
54c67d6573Sopenharmony_ci);
55c67d6573Sopenharmony_ci// Should still use Aho-Corasick for the prefixes in each alternate, but
56c67d6573Sopenharmony_ci// we need to use the lazy DFA to complete it.
57c67d6573Sopenharmony_cisherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582);
58c67d6573Sopenharmony_cisherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697);
59c67d6573Sopenharmony_ci// Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
60c67d6573Sopenharmony_cisherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639);
61c67d6573Sopenharmony_cisherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650);
62c67d6573Sopenharmony_ci
63c67d6573Sopenharmony_ci// How long does it take to discover that there's no match? In the first two
64c67d6573Sopenharmony_ci// cases, we detect the rarest byte in the literal to run memchr on. In the
65c67d6573Sopenharmony_ci// first, it's 'z' and in the second it's 'j'. The third case only has common
66c67d6573Sopenharmony_ci// letters, and is therefore slower.
67c67d6573Sopenharmony_cisherlock!(no_match_uncommon, r"zqj", 0);
68c67d6573Sopenharmony_cisherlock!(no_match_common, r"aqj", 0);
69c67d6573Sopenharmony_cisherlock!(no_match_really_common, r"aei", 0);
70c67d6573Sopenharmony_ci
71c67d6573Sopenharmony_ci// Various twiddling on very common words. This tends to stress the constant
72c67d6573Sopenharmony_ci// overhead of actually reporting a match. (None of these actually enter any
73c67d6573Sopenharmony_ci// matching engines.)
74c67d6573Sopenharmony_cisherlock!(the_lower, r"the", 7218);
75c67d6573Sopenharmony_cisherlock!(the_upper, r"The", 741);
76c67d6573Sopenharmony_cisherlock!(the_nocase, r"(?i)the", 7987);
77c67d6573Sopenharmony_ci
78c67d6573Sopenharmony_ci// Process whitespace after a very common word.
79c67d6573Sopenharmony_ci// Uses Boyer-Moore to find `the` and the lazy DFA for the rest.
80c67d6573Sopenharmony_cisherlock!(the_whitespace, r"the\s+\w+", 5410);
81c67d6573Sopenharmony_ci
82c67d6573Sopenharmony_ci// How fast can we match everything? This essentially defeats any clever prefix
83c67d6573Sopenharmony_ci// tricks and just executes the DFA across the entire input.
84c67d6573Sopenharmony_ci#[cfg(not(feature = "re-pcre1"))]
85c67d6573Sopenharmony_ci#[cfg(not(feature = "re-pcre2"))]
86c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
87c67d6573Sopenharmony_cisherlock!(everything_greedy, r".*", 13053);
88c67d6573Sopenharmony_ci#[cfg(not(feature = "re-onig"))]
89c67d6573Sopenharmony_ci#[cfg(not(feature = "re-pcre1"))]
90c67d6573Sopenharmony_ci#[cfg(not(feature = "re-pcre2"))]
91c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
92c67d6573Sopenharmony_cisherlock!(everything_greedy_nl, r"(?s).*", 1);
93c67d6573Sopenharmony_ci
94c67d6573Sopenharmony_ci// How fast can we match every letter? This also defeats any clever prefix
95c67d6573Sopenharmony_ci// tricks.
96c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
97c67d6573Sopenharmony_cisherlock!(letters, r"\p{L}", 447160);
98c67d6573Sopenharmony_ci
99c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
100c67d6573Sopenharmony_cisherlock!(letters_upper, r"\p{Lu}", 14180);
101c67d6573Sopenharmony_ci
102c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
103c67d6573Sopenharmony_cisherlock!(letters_lower, r"\p{Ll}", 432980);
104c67d6573Sopenharmony_ci
105c67d6573Sopenharmony_ci// Similarly, for words.
106c67d6573Sopenharmony_ci#[cfg(not(feature = "re-re2"))]
107c67d6573Sopenharmony_cisherlock!(words, r"\w+", 109214);
108c67d6573Sopenharmony_ci#[cfg(feature = "re-re2")]
109c67d6573Sopenharmony_cisherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
110c67d6573Sopenharmony_ci
111c67d6573Sopenharmony_ci// Find complete words before Holmes. The `\w` defeats any prefix
112c67d6573Sopenharmony_ci// optimizations.
113c67d6573Sopenharmony_cisherlock!(before_holmes, r"\w+\s+Holmes", 319);
114c67d6573Sopenharmony_ci
115c67d6573Sopenharmony_ci// Find complete words before Holmes. Both of the `\w`s defeat any prefix
116c67d6573Sopenharmony_ci// and suffix optimizations.
117c67d6573Sopenharmony_cisherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137);
118c67d6573Sopenharmony_ci
119c67d6573Sopenharmony_ci// Find Holmes co-occurring with Watson in a particular window of characters.
120c67d6573Sopenharmony_ci// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
121c67d6573Sopenharmony_ci// the rest.
122c67d6573Sopenharmony_cisherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
123c67d6573Sopenharmony_ci
124c67d6573Sopenharmony_ci// Find Holmes co-occurring with Watson in a particular window of words.
125c67d6573Sopenharmony_ci// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
126c67d6573Sopenharmony_ci// the rest.
127c67d6573Sopenharmony_ci#[cfg(not(feature = "re-onig"))]
128c67d6573Sopenharmony_ci#[cfg(not(feature = "re-pcre1"))]
129c67d6573Sopenharmony_ci#[cfg(not(feature = "re-pcre2"))]
130c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
131c67d6573Sopenharmony_cisherlock!(
132c67d6573Sopenharmony_ci    holmes_coword_watson,
133c67d6573Sopenharmony_ci    r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes",
134c67d6573Sopenharmony_ci    51
135c67d6573Sopenharmony_ci);
136c67d6573Sopenharmony_ci
137c67d6573Sopenharmony_ci// Find some subset of quotes in the text.
138c67d6573Sopenharmony_ci// This does detect the `"` or `'` prefix literal and does a quick scan for
139c67d6573Sopenharmony_ci// either byte before starting the lazy DFA.
140c67d6573Sopenharmony_cisherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
141c67d6573Sopenharmony_ci
142c67d6573Sopenharmony_ci// Finds all occurrences of Sherlock Holmes at the beginning or end of a line.
143c67d6573Sopenharmony_ci// The empty assertions defeat any detection of prefix literals, so it's the
144c67d6573Sopenharmony_ci// lazy DFA the entire way.
145c67d6573Sopenharmony_cisherlock!(
146c67d6573Sopenharmony_ci    line_boundary_sherlock_holmes,
147c67d6573Sopenharmony_ci    r"(?m)^Sherlock Holmes|Sherlock Holmes$",
148c67d6573Sopenharmony_ci    34
149c67d6573Sopenharmony_ci);
150c67d6573Sopenharmony_ci
151c67d6573Sopenharmony_ci// All words ending in `n`. This uses Unicode word boundaries, which the DFA
152c67d6573Sopenharmony_ci// can speculatively handle. Since this benchmark is on mostly ASCII text, it
153c67d6573Sopenharmony_ci// performs well here. A different benchmark with non-Western text would be
154c67d6573Sopenharmony_ci// more revealing since the search would be forced to fall back to an NFA
155c67d6573Sopenharmony_ci// simulation.
156c67d6573Sopenharmony_ci#[cfg(not(feature = "re-tcl"))]
157c67d6573Sopenharmony_cisherlock!(word_ending_n, r"\b\w+n\b", 8366);
158c67d6573Sopenharmony_ci
159c67d6573Sopenharmony_ci// This is a real bad one for Rust's engine. This particular expression
160c67d6573Sopenharmony_ci// fills the state cache quite frequently, which results in a lot of churn.
161c67d6573Sopenharmony_ci// This can be made to go roughly the speed of PCRE by increasing the DFA cache
162c67d6573Sopenharmony_ci// size.
163c67d6573Sopenharmony_ci//
164c67d6573Sopenharmony_ci// Its only salvation is that the DFA realizes it's executing slowly, gives up
165c67d6573Sopenharmony_ci// quickly and falls back to the NFA algorithm.
166c67d6573Sopenharmony_ci//
167c67d6573Sopenharmony_ci// RE2 seems to do a worse job at this than Rust. So much so that it's slow
168c67d6573Sopenharmony_ci// enough to be annoying, so we disable it.
169c67d6573Sopenharmony_ci#[cfg(not(feature = "re-re2"))]
170c67d6573Sopenharmony_cisherlock!(repeated_class_negation, r"[a-q][^u-z]{13}x", 142);
171c67d6573Sopenharmony_ci
172c67d6573Sopenharmony_ci// This defeats any prefix optimizations but triggers the reverse suffix
173c67d6573Sopenharmony_ci// optimization.
174c67d6573Sopenharmony_cisherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824);
175c67d6573Sopenharmony_ci
176c67d6573Sopenharmony_ci// Similar to ing_suffix, but a little more complex by limiting the length
177c67d6573Sopenharmony_ci// of the word and making sure it's surrounded by whitespace. The trailing
178c67d6573Sopenharmony_ci// `\s` defeats the reverse suffix optimization.
179c67d6573Sopenharmony_ci//
180c67d6573Sopenharmony_ci// Onig does surprisingly well on this benchmark and yet does quite poorly on
181c67d6573Sopenharmony_ci// the ing_suffix benchmark. That one has me stumped.
182c67d6573Sopenharmony_cisherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);
183