1use std::iter::repeat;
2
3/// Create a sequence of tests that should be run by memchr implementations.
4pub fn memchr_tests() -> Vec<MemchrTest> {
5    let mut tests = Vec::new();
6    for statict in MEMCHR_TESTS {
7        assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
8        assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
9        assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
10        assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
11
12        let t = MemchrTest {
13            corpus: statict.corpus.to_string(),
14            needles: statict.needles.to_vec(),
15            positions: statict.positions.to_vec(),
16        };
17        tests.push(t.clone());
18        tests.extend(t.expand());
19    }
20    tests
21}
22
23/// A set of tests for memchr-like functions.
24///
25/// These tests mostly try to cover the short string cases. We cover the longer
26/// string cases via the benchmarks (which are tests themselves), via
27/// quickcheck tests and via automatic expansion of each test case (by
28/// increasing the corpus size). Finally, we cover different alignment cases
29/// in the tests by varying the starting point of the slice.
30const MEMCHR_TESTS: &[MemchrTestStatic] = &[
31    // one needle (applied to memchr + memchr2 + memchr3)
32    MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] },
33    MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] },
34    MemchrTestStatic {
35        corpus: "aaa",
36        needles: &[b'a'],
37        positions: &[0, 1, 2],
38    },
39    MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] },
40    MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] },
41    MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] },
42    MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] },
43    MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] },
44    MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] },
45    MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] },
46    MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] },
47    MemchrTestStatic {
48        corpus: "\x00\x00",
49        needles: &[b'\x00'],
50        positions: &[0, 1],
51    },
52    MemchrTestStatic {
53        corpus: "\x00a\x00",
54        needles: &[b'\x00'],
55        positions: &[0, 2],
56    },
57    MemchrTestStatic {
58        corpus: "zzzzzzzzzzzzzzzza",
59        needles: &[b'a'],
60        positions: &[16],
61    },
62    MemchrTestStatic {
63        corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
64        needles: &[b'a'],
65        positions: &[32],
66    },
67    // two needles (applied to memchr2 + memchr3)
68    MemchrTestStatic {
69        corpus: "az",
70        needles: &[b'a', b'z'],
71        positions: &[0, 1],
72    },
73    MemchrTestStatic {
74        corpus: "az",
75        needles: &[b'a', b'z'],
76        positions: &[0, 1],
77    },
78    MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] },
79    MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] },
80    MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] },
81    MemchrTestStatic {
82        corpus: "yyyyaz",
83        needles: &[b'a', b'z'],
84        positions: &[4, 5],
85    },
86    MemchrTestStatic {
87        corpus: "yyyyaz",
88        needles: &[b'z', b'a'],
89        positions: &[4, 5],
90    },
91    // three needles (applied to memchr3)
92    MemchrTestStatic {
93        corpus: "xyz",
94        needles: &[b'x', b'y', b'z'],
95        positions: &[0, 1, 2],
96    },
97    MemchrTestStatic {
98        corpus: "zxy",
99        needles: &[b'x', b'y', b'z'],
100        positions: &[0, 1, 2],
101    },
102    MemchrTestStatic {
103        corpus: "zxy",
104        needles: &[b'x', b'a', b'z'],
105        positions: &[0, 1],
106    },
107    MemchrTestStatic {
108        corpus: "zxy",
109        needles: &[b't', b'a', b'z'],
110        positions: &[0],
111    },
112    MemchrTestStatic {
113        corpus: "yxz",
114        needles: &[b't', b'a', b'z'],
115        positions: &[2],
116    },
117];
118
119/// A description of a test on a memchr like function.
120#[derive(Clone, Debug)]
121pub struct MemchrTest {
122    /// The thing to search. We use `&str` instead of `&[u8]` because they
123    /// are nicer to write in tests, and we don't miss much since memchr
124    /// doesn't care about UTF-8.
125    ///
126    /// Corpora cannot contain either '%' or '#'. We use these bytes when
127    /// expanding test cases into many test cases, and we assume they are not
128    /// used. If they are used, `memchr_tests` will panic.
129    corpus: String,
130    /// The needles to search for. This is intended to be an "alternation" of
131    /// needles. The number of needles may cause this test to be skipped for
132    /// some memchr variants. For example, a test with 2 needles cannot be used
133    /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
134    /// However, a test with only 1 needle can be used to test all of `memchr`,
135    /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
136    /// bytes that we never used in the corpus (such as '#').
137    needles: Vec<u8>,
138    /// The positions expected to match for all of the needles.
139    positions: Vec<usize>,
140}
141
142/// Like MemchrTest, but easier to define as a constant.
143#[derive(Clone, Debug)]
144pub struct MemchrTestStatic {
145    corpus: &'static str,
146    needles: &'static [u8],
147    positions: &'static [usize],
148}
149
150impl MemchrTest {
151    pub fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
152        let needles = match self.needles(1) {
153            None => return,
154            Some(needles) => needles,
155        };
156        // We test different alignments here. Since some implementations use
157        // AVX2, which can read 32 bytes at a time, we test at least that.
158        // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
159        // (avx) bytes at a time, so we include that in our offsets as well.
160        //
161        // You might think this would cause most needles to not be found, but
162        // we actually expand our tests to include corpus sizes all the way up
163        // to >500 bytes, so we should exercise most branches.
164        for align in 0..130 {
165            let corpus = self.corpus(align);
166            assert_eq!(
167                self.positions(align, reverse).get(0).cloned(),
168                f(needles[0], corpus.as_bytes()),
169                "search for {:?} failed in: {:?} (len: {}, alignment: {})",
170                needles[0] as char,
171                corpus,
172                corpus.len(),
173                align
174            );
175        }
176    }
177
178    pub fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(
179        &self,
180        reverse: bool,
181        f: F,
182    ) {
183        let needles = match self.needles(2) {
184            None => return,
185            Some(needles) => needles,
186        };
187        for align in 0..130 {
188            let corpus = self.corpus(align);
189            assert_eq!(
190                self.positions(align, reverse).get(0).cloned(),
191                f(needles[0], needles[1], corpus.as_bytes()),
192                "search for {:?}|{:?} failed in: {:?} \
193                 (len: {}, alignment: {})",
194                needles[0] as char,
195                needles[1] as char,
196                corpus,
197                corpus.len(),
198                align
199            );
200        }
201    }
202
203    pub fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
204        &self,
205        reverse: bool,
206        f: F,
207    ) {
208        let needles = match self.needles(3) {
209            None => return,
210            Some(needles) => needles,
211        };
212        for align in 0..130 {
213            let corpus = self.corpus(align);
214            assert_eq!(
215                self.positions(align, reverse).get(0).cloned(),
216                f(needles[0], needles[1], needles[2], corpus.as_bytes()),
217                "search for {:?}|{:?}|{:?} failed in: {:?} \
218                 (len: {}, alignment: {})",
219                needles[0] as char,
220                needles[1] as char,
221                needles[2] as char,
222                corpus,
223                corpus.len(),
224                align
225            );
226        }
227    }
228
229    pub fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
230    where
231        F: FnOnce(u8, &'a [u8]) -> I,
232        I: Iterator<Item = usize>,
233    {
234        if let Some(ns) = self.needles(1) {
235            self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
236        }
237    }
238
239    pub fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
240    where
241        F: FnOnce(u8, u8, &'a [u8]) -> I,
242        I: Iterator<Item = usize>,
243    {
244        if let Some(ns) = self.needles(2) {
245            self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
246        }
247    }
248
249    pub fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
250    where
251        F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
252        I: Iterator<Item = usize>,
253    {
254        if let Some(ns) = self.needles(3) {
255            self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
256        }
257    }
258
259    /// Test that the positions yielded by the given iterator match the
260    /// positions in this test. If reverse is true, then reverse the positions
261    /// before comparing them.
262    fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) {
263        assert_eq!(
264            self.positions(0, reverse),
265            it.collect::<Vec<usize>>(),
266            r"search for {:?} failed in: {:?}",
267            self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
268            self.corpus
269        );
270    }
271
272    /// Expand this test into many variations of the same test.
273    ///
274    /// In particular, this will generate more tests with larger corpus sizes.
275    /// The expected positions are updated to maintain the integrity of the
276    /// test.
277    ///
278    /// This is important in testing a memchr implementation, because there are
279    /// often different cases depending on the length of the corpus.
280    ///
281    /// Note that we extend the corpus by adding `%` bytes, which we
282    /// don't otherwise use as a needle.
283    fn expand(&self) -> Vec<MemchrTest> {
284        let mut more = Vec::new();
285
286        // Add bytes to the start of the corpus.
287        for i in 1..515 {
288            let mut t = self.clone();
289            let mut new_corpus: String = repeat('%').take(i).collect();
290            new_corpus.push_str(&t.corpus);
291            t.corpus = new_corpus;
292            t.positions = t.positions.into_iter().map(|p| p + i).collect();
293            more.push(t);
294        }
295        // Add bytes to the end of the corpus.
296        for i in 1..515 {
297            let mut t = self.clone();
298            let padding: String = repeat('%').take(i).collect();
299            t.corpus.push_str(&padding);
300            more.push(t);
301        }
302
303        more
304    }
305
306    /// Return the corpus at the given alignment.
307    ///
308    /// If the alignment exceeds the length of the corpus, then this returns
309    /// an empty slice.
310    fn corpus(&self, align: usize) -> &str {
311        self.corpus.get(align..).unwrap_or("")
312    }
313
314    /// Return exactly `count` needles from this test. If this test has less
315    /// than `count` needles, then add `#` until the number of needles
316    /// matches `count`. If this test has more than `count` needles, then
317    /// return `None` (because there is no way to use this test data for a
318    /// search using fewer needles).
319    fn needles(&self, count: usize) -> Option<Vec<u8>> {
320        if self.needles.len() > count {
321            return None;
322        }
323
324        let mut needles = self.needles.to_vec();
325        for _ in needles.len()..count {
326            // we assume # is never used in tests.
327            needles.push(b'#');
328        }
329        Some(needles)
330    }
331
332    /// Return the positions in this test, reversed if `reverse` is true.
333    ///
334    /// If alignment is given, then all positions greater than or equal to that
335    /// alignment are offset by the alignment. Positions less than the
336    /// alignment are dropped.
337    fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
338        let positions = if reverse {
339            let mut positions = self.positions.to_vec();
340            positions.reverse();
341            positions
342        } else {
343            self.positions.to_vec()
344        };
345        positions
346            .into_iter()
347            .filter(|&p| p >= align)
348            .map(|p| p - align)
349            .collect()
350    }
351}
352