xref: /third_party/rust/crates/regex/bench/src/misc.rs (revision c67d6573)
1#![allow(non_snake_case)]
2
3use std::iter::repeat;
4
5use test::Bencher;
6
7#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
8use crate::RegexSet;
9use crate::{Regex, Text};
10
11#[cfg(not(feature = "re-onig"))]
12#[cfg(not(feature = "re-pcre1"))]
13#[cfg(not(feature = "re-pcre2"))]
14bench_match!(
15    no_exponential,
16    {
17        format!(
18            "{}{}",
19            repeat("a?").take(100).collect::<String>(),
20            repeat("a").take(100).collect::<String>()
21        )
22    },
23    repeat("a").take(100).collect()
24);
25
26bench_match!(literal, r"y", {
27    format!("{}y", repeat("x").take(50).collect::<String>())
28});
29
30bench_match!(not_literal, r".y", {
31    format!("{}y", repeat("x").take(50).collect::<String>())
32});
33
34bench_match!(match_class, "[abcdw]", {
35    format!("{}w", repeat("xxxx").take(20).collect::<String>())
36});
37
38bench_match!(match_class_in_range, "[ac]", {
39    format!("{}c", repeat("bbbb").take(20).collect::<String>())
40});
41
42#[cfg(not(feature = "re-rust-bytes"))]
43#[cfg(not(feature = "re-tcl"))]
44bench_match!(match_class_unicode, r"\p{L}", {
45    format!("{}a", repeat("☃5☃5").take(20).collect::<String>())
46});
47
48bench_not_match!(anchored_literal_short_non_match, r"^zbc(d|e)", {
49    "abcdefghijklmnopqrstuvwxyz".to_owned()
50});
51
52bench_not_match!(anchored_literal_long_non_match, r"^zbc(d|e)", {
53    repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::<String>()
54});
55
56bench_match!(anchored_literal_short_match, r"^.bc(d|e)", {
57    "abcdefghijklmnopqrstuvwxyz".to_owned()
58});
59
60bench_match!(anchored_literal_long_match, r"^.bc(d|e)", {
61    repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect::<String>()
62});
63
64bench_match!(one_pass_short, r"^.bc(d|e)*$", {
65    "abcddddddeeeededd".to_owned()
66});
67
68bench_match!(one_pass_short_not, r".bc(d|e)*$", {
69    "abcddddddeeeededd".to_owned()
70});
71
72bench_match!(one_pass_long_prefix, r"^abcdefghijklmnopqrstuvwxyz.*$", {
73    "abcdefghijklmnopqrstuvwxyz".to_owned()
74});
75
76bench_match!(one_pass_long_prefix_not, r"^.bcdefghijklmnopqrstuvwxyz.*$", {
77    "abcdefghijklmnopqrstuvwxyz".to_owned()
78});
79
80bench_match!(long_needle1, r"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", {
81    repeat("a").take(100_000).collect::<String>() + "b"
82});
83
84bench_match!(long_needle2, r"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbba", {
85    repeat("b").take(100_000).collect::<String>() + "a"
86});
87
88// This benchmark specifically targets the "reverse suffix literal"
89// optimization. In particular, it is easy for a naive implementation to
90// take quadratic worst case time. This benchmark provides a case for such
91// a scenario.
92bench_not_match!(reverse_suffix_no_quadratic, r"[r-z].*bcdefghijklmnopq", {
93    repeat("bcdefghijklmnopq").take(500).collect::<String>()
94});
95
96#[cfg(feature = "re-rust")]
97#[bench]
98fn replace_all(b: &mut Bencher) {
99    let re = regex!("[cjrw]");
100    let text = "abcdefghijklmnopqrstuvwxyz";
101    b.iter(|| re.replace_all(text, ""));
102}
103
104const TXT_32: &'static str = include_str!("data/32.txt");
105const TXT_1K: &'static str = include_str!("data/1K.txt");
106const TXT_32K: &'static str = include_str!("data/32K.txt");
107const TXT_1MB: &'static str = include_str!("data/1MB.txt");
108
109fn get_text(corpus: &str, suffix: String) -> String {
110    let mut corpus = corpus.to_string();
111    corpus.push_str(&suffix);
112    corpus
113}
114
115fn easy0_suffix() -> String {
116    "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string()
117}
118
119macro_rules! easy0 {
120    () => {
121        "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
122    };
123}
124
125bench_match!(easy0_32, easy0!(), get_text(TXT_32, easy0_suffix()));
126bench_match!(easy0_1K, easy0!(), get_text(TXT_1K, easy0_suffix()));
127bench_match!(easy0_32K, easy0!(), get_text(TXT_32K, easy0_suffix()));
128bench_match!(easy0_1MB, easy0!(), get_text(TXT_1MB, easy0_suffix()));
129
130fn easy1_suffix() -> String {
131    "AABCCCDEEEFGGHHHIJJ".to_string()
132}
133
134macro_rules! easy1 {
135    () => {
136        r"A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
137    };
138}
139
140bench_match!(easy1_32, easy1!(), get_text(TXT_32, easy1_suffix()));
141bench_match!(easy1_1K, easy1!(), get_text(TXT_1K, easy1_suffix()));
142bench_match!(easy1_32K, easy1!(), get_text(TXT_32K, easy1_suffix()));
143bench_match!(easy1_1MB, easy1!(), get_text(TXT_1MB, easy1_suffix()));
144
145fn medium_suffix() -> String {
146    "XABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string()
147}
148
149macro_rules! medium {
150    () => {
151        r"[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
152    };
153}
154
155bench_match!(medium_32, medium!(), get_text(TXT_32, medium_suffix()));
156bench_match!(medium_1K, medium!(), get_text(TXT_1K, medium_suffix()));
157bench_match!(medium_32K, medium!(), get_text(TXT_32K, medium_suffix()));
158bench_match!(medium_1MB, medium!(), get_text(TXT_1MB, medium_suffix()));
159
160fn hard_suffix() -> String {
161    "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string()
162}
163
164macro_rules! hard {
165    () => {
166        r"[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
167    };
168}
169
170bench_match!(hard_32, hard!(), get_text(TXT_32, hard_suffix()));
171bench_match!(hard_1K, hard!(), get_text(TXT_1K, hard_suffix()));
172bench_match!(hard_32K, hard!(), get_text(TXT_32K, hard_suffix()));
173bench_match!(hard_1MB, hard!(), get_text(TXT_1MB, hard_suffix()));
174
175fn reallyhard_suffix() -> String {
176    "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string()
177}
178
179macro_rules! reallyhard {
180    () => {
181        // The point of this being "really" hard is that it should completely
182        // thwart any prefix or suffix literal optimizations.
183        r"[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ.*"
184    };
185}
186
187bench_match!(
188    reallyhard_32,
189    reallyhard!(),
190    get_text(TXT_32, reallyhard_suffix())
191);
192bench_match!(
193    reallyhard_1K,
194    reallyhard!(),
195    get_text(TXT_1K, reallyhard_suffix())
196);
197bench_match!(
198    reallyhard_32K,
199    reallyhard!(),
200    get_text(TXT_32K, reallyhard_suffix())
201);
202bench_match!(
203    reallyhard_1MB,
204    reallyhard!(),
205    get_text(TXT_1MB, reallyhard_suffix())
206);
207
208fn reallyhard2_suffix() -> String {
209    "Sherlock Holmes".to_string()
210}
211
212macro_rules! reallyhard2 {
213    () => {
214        r"\w+\s+Holmes"
215    };
216}
217
218bench_match!(
219    reallyhard2_1K,
220    reallyhard2!(),
221    get_text(TXT_1K, reallyhard2_suffix())
222);
223
224//
225// Benchmarks to justify the short-haystack NFA fallthrough optimization
226// implemented by `read_captures_at` in regex/src/exec.rs. See github issue
227// #348.
228//
229// The procedure used to try to determine the right hardcoded cutoff
230// for the short-haystack optimization in issue #348 is as follows.
231//
232// ```
233// > cd bench
234// > cargo bench --features re-rust short_hay | tee dfa-nfa.res
235// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at
236// > # to just execute the nfa
237// > cargo bench --features re-rust short_hay | tee nfa-only.res
238// > cargo benchcmp dfa-nfa.res nfa-only.res
239// ```
240//
241// The expected result is that short inputs will go faster under
242// the nfa-only mode, but at some turnover point the dfa-nfa mode
243// will start to win again. Unfortunately, that is not what happened.
244// Instead there was no noticeable change in the bench results, so
245// I've opted to just do the more conservative anchor optimization.
246//
247bench_captures!(
248    short_haystack_1x,
249    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
250    2,
251    String::from("aaaabbbbccccbbbdddd")
252);
253bench_captures!(
254    short_haystack_2x,
255    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
256    2,
257    format!(
258        "{}bbbbccccbbb{}",
259        repeat("aaaa").take(2).collect::<String>(),
260        repeat("dddd").take(2).collect::<String>(),
261    )
262);
263bench_captures!(
264    short_haystack_3x,
265    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
266    2,
267    format!(
268        "{}bbbbccccbbb{}",
269        repeat("aaaa").take(3).collect::<String>(),
270        repeat("dddd").take(3).collect::<String>(),
271    )
272);
273bench_captures!(
274    short_haystack_4x,
275    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
276    2,
277    format!(
278        "{}bbbbccccbbb{}",
279        repeat("aaaa").take(4).collect::<String>(),
280        repeat("dddd").take(4).collect::<String>(),
281    )
282);
283bench_captures!(
284    short_haystack_10x,
285    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
286    2,
287    format!(
288        "{}bbbbccccbbb{}",
289        repeat("aaaa").take(10).collect::<String>(),
290        repeat("dddd").take(10).collect::<String>(),
291    )
292);
293bench_captures!(
294    short_haystack_100x,
295    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
296    2,
297    format!(
298        "{}bbbbccccbbb{}",
299        repeat("aaaa").take(100).collect::<String>(),
300        repeat("dddd").take(100).collect::<String>(),
301    )
302);
303bench_captures!(
304    short_haystack_1000x,
305    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
306    2,
307    format!(
308        "{}bbbbccccbbb{}",
309        repeat("aaaa").take(1000).collect::<String>(),
310        repeat("dddd").take(1000).collect::<String>(),
311    )
312);
313bench_captures!(
314    short_haystack_10000x,
315    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
316    2,
317    format!(
318        "{}bbbbccccbbb{}",
319        repeat("aaaa").take(10000).collect::<String>(),
320        repeat("dddd").take(10000).collect::<String>(),
321    )
322);
323bench_captures!(
324    short_haystack_100000x,
325    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
326    2,
327    format!(
328        "{}bbbbccccbbb{}",
329        repeat("aaaa").take(100000).collect::<String>(),
330        repeat("dddd").take(100000).collect::<String>(),
331    )
332);
333bench_captures!(
334    short_haystack_1000000x,
335    Regex::new(r"(bbbb)cccc(bbb)").unwrap(),
336    2,
337    format!(
338        "{}bbbbccccbbb{}",
339        repeat("aaaa").take(1000000).collect::<String>(),
340        repeat("dddd").take(1000000).collect::<String>(),
341    )
342);
343
344#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
345bench_is_match_set!(
346    is_match_set,
347    true,
348    RegexSet::new(vec![
349        "aaaaaaaaaaaaaaaaaaa",
350        "abc579",
351        "def.+",
352        "e24fg",
353        "a.*2c",
354        "23.",
355    ])
356    .unwrap(),
357    format!(
358        "{}a482c{}",
359        repeat('a').take(10).collect::<String>(),
360        repeat('b').take(10).collect::<String>()
361    )
362);
363
364#[cfg(any(feature = "re-rust", feature = "re-rust-bytes"))]
365bench_matches_set!(
366    matches_set,
367    true,
368    RegexSet::new(vec![
369        "aaaaaaaaaaaaaaaaaaa",
370        "abc579",
371        "def.+",
372        "e24fg",
373        "a.*2c",
374        "23.",
375    ])
376    .unwrap(),
377    format!(
378        "{}a482c{}",
379        repeat('a').take(10).collect::<String>(),
380        repeat('b').take(10).collect::<String>()
381    )
382);
383