1c67d6573Sopenharmony_ci// This is the backtracking matching engine. It has the same exact capability
2c67d6573Sopenharmony_ci// as the full NFA simulation, except it is artificially restricted to small
3c67d6573Sopenharmony_ci// regexes on small inputs because of its memory requirements.
4c67d6573Sopenharmony_ci//
5c67d6573Sopenharmony_ci// In particular, this is a *bounded* backtracking engine. It retains worst
6c67d6573Sopenharmony_ci// case linear time by keeping track of the states that it has visited (using a
7c67d6573Sopenharmony_ci// bitmap). Namely, once a state is visited, it is never visited again. Since a
8c67d6573Sopenharmony_ci// state is keyed by `(instruction index, input index)`, we have that its time
9c67d6573Sopenharmony_ci// complexity is `O(mn)` (i.e., linear in the size of the search text).
10c67d6573Sopenharmony_ci//
11c67d6573Sopenharmony_ci// The backtracking engine can beat out the NFA simulation on small
12c67d6573Sopenharmony_ci// regexes/inputs because it doesn't have to keep track of multiple copies of
13c67d6573Sopenharmony_ci// the capture groups. In benchmarks, the backtracking engine is roughly twice
14c67d6573Sopenharmony_ci// as fast as the full NFA simulation. Note though that its performance doesn't
15c67d6573Sopenharmony_ci// scale, even if you're willing to live with the memory requirements. Namely,
16c67d6573Sopenharmony_ci// the bitset has to be zeroed on each execution, which becomes quite expensive
17c67d6573Sopenharmony_ci// on large bitsets.
18c67d6573Sopenharmony_ci
19c67d6573Sopenharmony_ciuse crate::exec::ProgramCache;
20c67d6573Sopenharmony_ciuse crate::input::{Input, InputAt};
21c67d6573Sopenharmony_ciuse crate::prog::{InstPtr, Program};
22c67d6573Sopenharmony_ciuse crate::re_trait::Slot;
23c67d6573Sopenharmony_ci
24c67d6573Sopenharmony_citype Bits = u32;
25c67d6573Sopenharmony_ci
26c67d6573Sopenharmony_ciconst BIT_SIZE: usize = 32;
27c67d6573Sopenharmony_ciconst MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
28c67d6573Sopenharmony_ci
29c67d6573Sopenharmony_ci/// Returns true iff the given regex and input should be executed by this
30c67d6573Sopenharmony_ci/// engine with reasonable memory usage.
31c67d6573Sopenharmony_cipub fn should_exec(num_insts: usize, text_len: usize) -> bool {
32c67d6573Sopenharmony_ci    // Total memory usage in bytes is determined by:
33c67d6573Sopenharmony_ci    //
34c67d6573Sopenharmony_ci    //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
35c67d6573Sopenharmony_ci    //
36c67d6573Sopenharmony_ci    // The actual limit picked is pretty much a heuristic.
37c67d6573Sopenharmony_ci    // See: https://github.com/rust-lang/regex/issues/215
38c67d6573Sopenharmony_ci    let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
39c67d6573Sopenharmony_ci    size <= MAX_SIZE_BYTES
40c67d6573Sopenharmony_ci}
41c67d6573Sopenharmony_ci
42c67d6573Sopenharmony_ci/// A backtracking matching engine.
43c67d6573Sopenharmony_ci#[derive(Debug)]
44c67d6573Sopenharmony_cipub struct Bounded<'a, 'm, 'r, 's, I> {
45c67d6573Sopenharmony_ci    prog: &'r Program,
46c67d6573Sopenharmony_ci    input: I,
47c67d6573Sopenharmony_ci    matches: &'m mut [bool],
48c67d6573Sopenharmony_ci    slots: &'s mut [Slot],
49c67d6573Sopenharmony_ci    m: &'a mut Cache,
50c67d6573Sopenharmony_ci}
51c67d6573Sopenharmony_ci
52c67d6573Sopenharmony_ci/// Shared cached state between multiple invocations of a backtracking engine
53c67d6573Sopenharmony_ci/// in the same thread.
54c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
55c67d6573Sopenharmony_cipub struct Cache {
56c67d6573Sopenharmony_ci    jobs: Vec<Job>,
57c67d6573Sopenharmony_ci    visited: Vec<Bits>,
58c67d6573Sopenharmony_ci}
59c67d6573Sopenharmony_ci
60c67d6573Sopenharmony_ciimpl Cache {
61c67d6573Sopenharmony_ci    /// Create new empty cache for the backtracking engine.
62c67d6573Sopenharmony_ci    pub fn new(_prog: &Program) -> Self {
63c67d6573Sopenharmony_ci        Cache { jobs: vec![], visited: vec![] }
64c67d6573Sopenharmony_ci    }
65c67d6573Sopenharmony_ci}
66c67d6573Sopenharmony_ci
67c67d6573Sopenharmony_ci/// A job is an explicit unit of stack space in the backtracking engine.
68c67d6573Sopenharmony_ci///
69c67d6573Sopenharmony_ci/// The "normal" representation is a single state transition, which corresponds
70c67d6573Sopenharmony_ci/// to an NFA state and a character in the input. However, the backtracking
71c67d6573Sopenharmony_ci/// engine must keep track of old capture group values. We use the explicit
72c67d6573Sopenharmony_ci/// stack to do it.
73c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug)]
74c67d6573Sopenharmony_cienum Job {
75c67d6573Sopenharmony_ci    Inst { ip: InstPtr, at: InputAt },
76c67d6573Sopenharmony_ci    SaveRestore { slot: usize, old_pos: Option<usize> },
77c67d6573Sopenharmony_ci}
78c67d6573Sopenharmony_ci
79c67d6573Sopenharmony_ciimpl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
80c67d6573Sopenharmony_ci    /// Execute the backtracking matching engine.
81c67d6573Sopenharmony_ci    ///
82c67d6573Sopenharmony_ci    /// If there's a match, `exec` returns `true` and populates the given
83c67d6573Sopenharmony_ci    /// captures accordingly.
84c67d6573Sopenharmony_ci    pub fn exec(
85c67d6573Sopenharmony_ci        prog: &'r Program,
86c67d6573Sopenharmony_ci        cache: &ProgramCache,
87c67d6573Sopenharmony_ci        matches: &'m mut [bool],
88c67d6573Sopenharmony_ci        slots: &'s mut [Slot],
89c67d6573Sopenharmony_ci        input: I,
90c67d6573Sopenharmony_ci        start: usize,
91c67d6573Sopenharmony_ci        end: usize,
92c67d6573Sopenharmony_ci    ) -> bool {
93c67d6573Sopenharmony_ci        let mut cache = cache.borrow_mut();
94c67d6573Sopenharmony_ci        let cache = &mut cache.backtrack;
95c67d6573Sopenharmony_ci        let start = input.at(start);
96c67d6573Sopenharmony_ci        let mut b = Bounded { prog, input, matches, slots, m: cache };
97c67d6573Sopenharmony_ci        b.exec_(start, end)
98c67d6573Sopenharmony_ci    }
99c67d6573Sopenharmony_ci
100c67d6573Sopenharmony_ci    /// Clears the cache such that the backtracking engine can be executed
101c67d6573Sopenharmony_ci    /// on some input of fixed length.
102c67d6573Sopenharmony_ci    fn clear(&mut self) {
103c67d6573Sopenharmony_ci        // Reset the job memory so that we start fresh.
104c67d6573Sopenharmony_ci        self.m.jobs.clear();
105c67d6573Sopenharmony_ci
106c67d6573Sopenharmony_ci        // Now we need to clear the bit state set.
107c67d6573Sopenharmony_ci        // We do this by figuring out how much space we need to keep track
108c67d6573Sopenharmony_ci        // of the states we've visited.
109c67d6573Sopenharmony_ci        // Then we reset all existing allocated space to 0.
110c67d6573Sopenharmony_ci        // Finally, we request more space if we need it.
111c67d6573Sopenharmony_ci        //
112c67d6573Sopenharmony_ci        // This is all a little circuitous, but doing this using unchecked
113c67d6573Sopenharmony_ci        // operations doesn't seem to have a measurable impact on performance.
114c67d6573Sopenharmony_ci        // (Probably because backtracking is limited to such small
115c67d6573Sopenharmony_ci        // inputs/regexes in the first place.)
116c67d6573Sopenharmony_ci        let visited_len =
117c67d6573Sopenharmony_ci            (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
118c67d6573Sopenharmony_ci                / BIT_SIZE;
119c67d6573Sopenharmony_ci        self.m.visited.truncate(visited_len);
120c67d6573Sopenharmony_ci        for v in &mut self.m.visited {
121c67d6573Sopenharmony_ci            *v = 0;
122c67d6573Sopenharmony_ci        }
123c67d6573Sopenharmony_ci        if visited_len > self.m.visited.len() {
124c67d6573Sopenharmony_ci            let len = self.m.visited.len();
125c67d6573Sopenharmony_ci            self.m.visited.reserve_exact(visited_len - len);
126c67d6573Sopenharmony_ci            for _ in 0..(visited_len - len) {
127c67d6573Sopenharmony_ci                self.m.visited.push(0);
128c67d6573Sopenharmony_ci            }
129c67d6573Sopenharmony_ci        }
130c67d6573Sopenharmony_ci    }
131c67d6573Sopenharmony_ci
132c67d6573Sopenharmony_ci    /// Start backtracking at the given position in the input, but also look
133c67d6573Sopenharmony_ci    /// for literal prefixes.
134c67d6573Sopenharmony_ci    fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
135c67d6573Sopenharmony_ci        self.clear();
136c67d6573Sopenharmony_ci        // If this is an anchored regex at the beginning of the input, then
137c67d6573Sopenharmony_ci        // we're either already done or we only need to try backtracking once.
138c67d6573Sopenharmony_ci        if self.prog.is_anchored_start {
139c67d6573Sopenharmony_ci            return if !at.is_start() { false } else { self.backtrack(at) };
140c67d6573Sopenharmony_ci        }
141c67d6573Sopenharmony_ci        let mut matched = false;
142c67d6573Sopenharmony_ci        loop {
143c67d6573Sopenharmony_ci            if !self.prog.prefixes.is_empty() {
144c67d6573Sopenharmony_ci                at = match self.input.prefix_at(&self.prog.prefixes, at) {
145c67d6573Sopenharmony_ci                    None => break,
146c67d6573Sopenharmony_ci                    Some(at) => at,
147c67d6573Sopenharmony_ci                };
148c67d6573Sopenharmony_ci            }
149c67d6573Sopenharmony_ci            matched = self.backtrack(at) || matched;
150c67d6573Sopenharmony_ci            if matched && self.prog.matches.len() == 1 {
151c67d6573Sopenharmony_ci                return true;
152c67d6573Sopenharmony_ci            }
153c67d6573Sopenharmony_ci            if at.pos() >= end {
154c67d6573Sopenharmony_ci                break;
155c67d6573Sopenharmony_ci            }
156c67d6573Sopenharmony_ci            at = self.input.at(at.next_pos());
157c67d6573Sopenharmony_ci        }
158c67d6573Sopenharmony_ci        matched
159c67d6573Sopenharmony_ci    }
160c67d6573Sopenharmony_ci
161c67d6573Sopenharmony_ci    /// The main backtracking loop starting at the given input position.
162c67d6573Sopenharmony_ci    fn backtrack(&mut self, start: InputAt) -> bool {
163c67d6573Sopenharmony_ci        // N.B. We use an explicit stack to avoid recursion.
164c67d6573Sopenharmony_ci        // To avoid excessive pushing and popping, most transitions are handled
165c67d6573Sopenharmony_ci        // in the `step` helper function, which only pushes to the stack when
166c67d6573Sopenharmony_ci        // there's a capture or a branch.
167c67d6573Sopenharmony_ci        let mut matched = false;
168c67d6573Sopenharmony_ci        self.m.jobs.push(Job::Inst { ip: 0, at: start });
169c67d6573Sopenharmony_ci        while let Some(job) = self.m.jobs.pop() {
170c67d6573Sopenharmony_ci            match job {
171c67d6573Sopenharmony_ci                Job::Inst { ip, at } => {
172c67d6573Sopenharmony_ci                    if self.step(ip, at) {
173c67d6573Sopenharmony_ci                        // Only quit if we're matching one regex.
174c67d6573Sopenharmony_ci                        // If we're matching a regex set, then mush on and
175c67d6573Sopenharmony_ci                        // try to find other matches (if we want them).
176c67d6573Sopenharmony_ci                        if self.prog.matches.len() == 1 {
177c67d6573Sopenharmony_ci                            return true;
178c67d6573Sopenharmony_ci                        }
179c67d6573Sopenharmony_ci                        matched = true;
180c67d6573Sopenharmony_ci                    }
181c67d6573Sopenharmony_ci                }
182c67d6573Sopenharmony_ci                Job::SaveRestore { slot, old_pos } => {
183c67d6573Sopenharmony_ci                    if slot < self.slots.len() {
184c67d6573Sopenharmony_ci                        self.slots[slot] = old_pos;
185c67d6573Sopenharmony_ci                    }
186c67d6573Sopenharmony_ci                }
187c67d6573Sopenharmony_ci            }
188c67d6573Sopenharmony_ci        }
189c67d6573Sopenharmony_ci        matched
190c67d6573Sopenharmony_ci    }
191c67d6573Sopenharmony_ci
192c67d6573Sopenharmony_ci    fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
193c67d6573Sopenharmony_ci        use crate::prog::Inst::*;
194c67d6573Sopenharmony_ci        loop {
195c67d6573Sopenharmony_ci            // This loop is an optimization to avoid constantly pushing/popping
196c67d6573Sopenharmony_ci            // from the stack. Namely, if we're pushing a job only to run it
197c67d6573Sopenharmony_ci            // next, avoid the push and just mutate `ip` (and possibly `at`)
198c67d6573Sopenharmony_ci            // in place.
199c67d6573Sopenharmony_ci            if self.has_visited(ip, at) {
200c67d6573Sopenharmony_ci                return false;
201c67d6573Sopenharmony_ci            }
202c67d6573Sopenharmony_ci            match self.prog[ip] {
203c67d6573Sopenharmony_ci                Match(slot) => {
204c67d6573Sopenharmony_ci                    if slot < self.matches.len() {
205c67d6573Sopenharmony_ci                        self.matches[slot] = true;
206c67d6573Sopenharmony_ci                    }
207c67d6573Sopenharmony_ci                    return true;
208c67d6573Sopenharmony_ci                }
209c67d6573Sopenharmony_ci                Save(ref inst) => {
210c67d6573Sopenharmony_ci                    if let Some(&old_pos) = self.slots.get(inst.slot) {
211c67d6573Sopenharmony_ci                        // If this path doesn't work out, then we save the old
212c67d6573Sopenharmony_ci                        // capture index (if one exists) in an alternate
213c67d6573Sopenharmony_ci                        // job. If the next path fails, then the alternate
214c67d6573Sopenharmony_ci                        // job is popped and the old capture index is restored.
215c67d6573Sopenharmony_ci                        self.m.jobs.push(Job::SaveRestore {
216c67d6573Sopenharmony_ci                            slot: inst.slot,
217c67d6573Sopenharmony_ci                            old_pos,
218c67d6573Sopenharmony_ci                        });
219c67d6573Sopenharmony_ci                        self.slots[inst.slot] = Some(at.pos());
220c67d6573Sopenharmony_ci                    }
221c67d6573Sopenharmony_ci                    ip = inst.goto;
222c67d6573Sopenharmony_ci                }
223c67d6573Sopenharmony_ci                Split(ref inst) => {
224c67d6573Sopenharmony_ci                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
225c67d6573Sopenharmony_ci                    ip = inst.goto1;
226c67d6573Sopenharmony_ci                }
227c67d6573Sopenharmony_ci                EmptyLook(ref inst) => {
228c67d6573Sopenharmony_ci                    if self.input.is_empty_match(at, inst) {
229c67d6573Sopenharmony_ci                        ip = inst.goto;
230c67d6573Sopenharmony_ci                    } else {
231c67d6573Sopenharmony_ci                        return false;
232c67d6573Sopenharmony_ci                    }
233c67d6573Sopenharmony_ci                }
234c67d6573Sopenharmony_ci                Char(ref inst) => {
235c67d6573Sopenharmony_ci                    if inst.c == at.char() {
236c67d6573Sopenharmony_ci                        ip = inst.goto;
237c67d6573Sopenharmony_ci                        at = self.input.at(at.next_pos());
238c67d6573Sopenharmony_ci                    } else {
239c67d6573Sopenharmony_ci                        return false;
240c67d6573Sopenharmony_ci                    }
241c67d6573Sopenharmony_ci                }
242c67d6573Sopenharmony_ci                Ranges(ref inst) => {
243c67d6573Sopenharmony_ci                    if inst.matches(at.char()) {
244c67d6573Sopenharmony_ci                        ip = inst.goto;
245c67d6573Sopenharmony_ci                        at = self.input.at(at.next_pos());
246c67d6573Sopenharmony_ci                    } else {
247c67d6573Sopenharmony_ci                        return false;
248c67d6573Sopenharmony_ci                    }
249c67d6573Sopenharmony_ci                }
250c67d6573Sopenharmony_ci                Bytes(ref inst) => {
251c67d6573Sopenharmony_ci                    if let Some(b) = at.byte() {
252c67d6573Sopenharmony_ci                        if inst.matches(b) {
253c67d6573Sopenharmony_ci                            ip = inst.goto;
254c67d6573Sopenharmony_ci                            at = self.input.at(at.next_pos());
255c67d6573Sopenharmony_ci                            continue;
256c67d6573Sopenharmony_ci                        }
257c67d6573Sopenharmony_ci                    }
258c67d6573Sopenharmony_ci                    return false;
259c67d6573Sopenharmony_ci                }
260c67d6573Sopenharmony_ci            }
261c67d6573Sopenharmony_ci        }
262c67d6573Sopenharmony_ci    }
263c67d6573Sopenharmony_ci
264c67d6573Sopenharmony_ci    fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
265c67d6573Sopenharmony_ci        let k = ip * (self.input.len() + 1) + at.pos();
266c67d6573Sopenharmony_ci        let k1 = k / BIT_SIZE;
267c67d6573Sopenharmony_ci        let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
268c67d6573Sopenharmony_ci        if self.m.visited[k1] & k2 == 0 {
269c67d6573Sopenharmony_ci            self.m.visited[k1] |= k2;
270c67d6573Sopenharmony_ci            false
271c67d6573Sopenharmony_ci        } else {
272c67d6573Sopenharmony_ci            true
273c67d6573Sopenharmony_ci        }
274c67d6573Sopenharmony_ci    }
275c67d6573Sopenharmony_ci}
276c67d6573Sopenharmony_ci
277c67d6573Sopenharmony_cifn usize_to_u32(n: usize) -> u32 {
278c67d6573Sopenharmony_ci    if (n as u64) > (::std::u32::MAX as u64) {
279c67d6573Sopenharmony_ci        panic!("BUG: {} is too big to fit into u32", n)
280c67d6573Sopenharmony_ci    }
281c67d6573Sopenharmony_ci    n as u32
282c67d6573Sopenharmony_ci}
283