1c67d6573Sopenharmony_ci// This module implements the Pike VM. That is, it guarantees linear time
2c67d6573Sopenharmony_ci// search of a regex on any text with memory use proportional to the size of
3c67d6573Sopenharmony_ci// the regex.
4c67d6573Sopenharmony_ci//
5c67d6573Sopenharmony_ci// It is equal in power to the backtracking engine in this crate, except the
6c67d6573Sopenharmony_ci// backtracking engine is typically faster on small regexes/texts at the
7c67d6573Sopenharmony_ci// expense of a bigger memory footprint.
8c67d6573Sopenharmony_ci//
9c67d6573Sopenharmony_ci// It can do more than the DFA can (specifically, record capture locations
10c67d6573Sopenharmony_ci// and execute Unicode word boundary assertions), but at a slower speed.
11c67d6573Sopenharmony_ci// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12c67d6573Sopenharmony_ci// epsilon transitions. That is, the Pike VM engine can be in multiple states
13c67d6573Sopenharmony_ci// at once where as the DFA is only ever in one state at a time.
14c67d6573Sopenharmony_ci//
15c67d6573Sopenharmony_ci// Therefore, the Pike VM is generally treated as the fallback when the other
16c67d6573Sopenharmony_ci// matching engines either aren't feasible to run or are insufficient.
17c67d6573Sopenharmony_ci
18c67d6573Sopenharmony_ciuse std::mem;
19c67d6573Sopenharmony_ci
20c67d6573Sopenharmony_ciuse crate::exec::ProgramCache;
21c67d6573Sopenharmony_ciuse crate::input::{Input, InputAt};
22c67d6573Sopenharmony_ciuse crate::prog::{InstPtr, Program};
23c67d6573Sopenharmony_ciuse crate::re_trait::Slot;
24c67d6573Sopenharmony_ciuse crate::sparse::SparseSet;
25c67d6573Sopenharmony_ci
26c67d6573Sopenharmony_ci/// An NFA simulation matching engine.
27c67d6573Sopenharmony_ci#[derive(Debug)]
28c67d6573Sopenharmony_cipub struct Fsm<'r, I> {
29c67d6573Sopenharmony_ci    /// The sequence of opcodes (among other things) that is actually executed.
30c67d6573Sopenharmony_ci    ///
31c67d6573Sopenharmony_ci    /// The program may be byte oriented or Unicode codepoint oriented.
32c67d6573Sopenharmony_ci    prog: &'r Program,
33c67d6573Sopenharmony_ci    /// An explicit stack used for following epsilon transitions. (This is
34c67d6573Sopenharmony_ci    /// borrowed from the cache.)
35c67d6573Sopenharmony_ci    stack: &'r mut Vec<FollowEpsilon>,
36c67d6573Sopenharmony_ci    /// The input to search.
37c67d6573Sopenharmony_ci    input: I,
38c67d6573Sopenharmony_ci}
39c67d6573Sopenharmony_ci
40c67d6573Sopenharmony_ci/// A cached allocation that can be reused on each execution.
41c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
42c67d6573Sopenharmony_cipub struct Cache {
43c67d6573Sopenharmony_ci    /// A pair of ordered sets for tracking NFA states.
44c67d6573Sopenharmony_ci    clist: Threads,
45c67d6573Sopenharmony_ci    nlist: Threads,
46c67d6573Sopenharmony_ci    /// An explicit stack used for following epsilon transitions.
47c67d6573Sopenharmony_ci    stack: Vec<FollowEpsilon>,
48c67d6573Sopenharmony_ci}
49c67d6573Sopenharmony_ci
50c67d6573Sopenharmony_ci/// An ordered set of NFA states and their captures.
51c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
52c67d6573Sopenharmony_cistruct Threads {
53c67d6573Sopenharmony_ci    /// An ordered set of opcodes (each opcode is an NFA state).
54c67d6573Sopenharmony_ci    set: SparseSet,
55c67d6573Sopenharmony_ci    /// Captures for every NFA state.
56c67d6573Sopenharmony_ci    ///
57c67d6573Sopenharmony_ci    /// It is stored in row-major order, where the columns are the capture
58c67d6573Sopenharmony_ci    /// slots and the rows are the states.
59c67d6573Sopenharmony_ci    caps: Vec<Slot>,
60c67d6573Sopenharmony_ci    /// The number of capture slots stored per thread. (Every capture has
61c67d6573Sopenharmony_ci    /// two slots.)
62c67d6573Sopenharmony_ci    slots_per_thread: usize,
63c67d6573Sopenharmony_ci}
64c67d6573Sopenharmony_ci
65c67d6573Sopenharmony_ci/// A representation of an explicit stack frame when following epsilon
66c67d6573Sopenharmony_ci/// transitions. This is used to avoid recursion.
67c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
68c67d6573Sopenharmony_cienum FollowEpsilon {
69c67d6573Sopenharmony_ci    /// Follow transitions at the given instruction pointer.
70c67d6573Sopenharmony_ci    IP(InstPtr),
71c67d6573Sopenharmony_ci    /// Restore the capture slot with the given position in the input.
72c67d6573Sopenharmony_ci    Capture { slot: usize, pos: Slot },
73c67d6573Sopenharmony_ci}
74c67d6573Sopenharmony_ci
75c67d6573Sopenharmony_ciimpl Cache {
76c67d6573Sopenharmony_ci    /// Create a new allocation used by the NFA machine to record execution
77c67d6573Sopenharmony_ci    /// and captures.
78c67d6573Sopenharmony_ci    pub fn new(_prog: &Program) -> Self {
79c67d6573Sopenharmony_ci        Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80c67d6573Sopenharmony_ci    }
81c67d6573Sopenharmony_ci}
82c67d6573Sopenharmony_ci
83c67d6573Sopenharmony_ciimpl<'r, I: Input> Fsm<'r, I> {
84c67d6573Sopenharmony_ci    /// Execute the NFA matching engine.
85c67d6573Sopenharmony_ci    ///
86c67d6573Sopenharmony_ci    /// If there's a match, `exec` returns `true` and populates the given
87c67d6573Sopenharmony_ci    /// captures accordingly.
88c67d6573Sopenharmony_ci    pub fn exec(
89c67d6573Sopenharmony_ci        prog: &'r Program,
90c67d6573Sopenharmony_ci        cache: &ProgramCache,
91c67d6573Sopenharmony_ci        matches: &mut [bool],
92c67d6573Sopenharmony_ci        slots: &mut [Slot],
93c67d6573Sopenharmony_ci        quit_after_match: bool,
94c67d6573Sopenharmony_ci        input: I,
95c67d6573Sopenharmony_ci        start: usize,
96c67d6573Sopenharmony_ci        end: usize,
97c67d6573Sopenharmony_ci    ) -> bool {
98c67d6573Sopenharmony_ci        let mut cache = cache.borrow_mut();
99c67d6573Sopenharmony_ci        let cache = &mut cache.pikevm;
100c67d6573Sopenharmony_ci        cache.clist.resize(prog.len(), prog.captures.len());
101c67d6573Sopenharmony_ci        cache.nlist.resize(prog.len(), prog.captures.len());
102c67d6573Sopenharmony_ci        let at = input.at(start);
103c67d6573Sopenharmony_ci        Fsm { prog, stack: &mut cache.stack, input }.exec_(
104c67d6573Sopenharmony_ci            &mut cache.clist,
105c67d6573Sopenharmony_ci            &mut cache.nlist,
106c67d6573Sopenharmony_ci            matches,
107c67d6573Sopenharmony_ci            slots,
108c67d6573Sopenharmony_ci            quit_after_match,
109c67d6573Sopenharmony_ci            at,
110c67d6573Sopenharmony_ci            end,
111c67d6573Sopenharmony_ci        )
112c67d6573Sopenharmony_ci    }
113c67d6573Sopenharmony_ci
114c67d6573Sopenharmony_ci    fn exec_(
115c67d6573Sopenharmony_ci        &mut self,
116c67d6573Sopenharmony_ci        mut clist: &mut Threads,
117c67d6573Sopenharmony_ci        mut nlist: &mut Threads,
118c67d6573Sopenharmony_ci        matches: &mut [bool],
119c67d6573Sopenharmony_ci        slots: &mut [Slot],
120c67d6573Sopenharmony_ci        quit_after_match: bool,
121c67d6573Sopenharmony_ci        mut at: InputAt,
122c67d6573Sopenharmony_ci        end: usize,
123c67d6573Sopenharmony_ci    ) -> bool {
124c67d6573Sopenharmony_ci        let mut matched = false;
125c67d6573Sopenharmony_ci        let mut all_matched = false;
126c67d6573Sopenharmony_ci        clist.set.clear();
127c67d6573Sopenharmony_ci        nlist.set.clear();
128c67d6573Sopenharmony_ci        'LOOP: loop {
129c67d6573Sopenharmony_ci            if clist.set.is_empty() {
130c67d6573Sopenharmony_ci                // Three ways to bail out when our current set of threads is
131c67d6573Sopenharmony_ci                // empty.
132c67d6573Sopenharmony_ci                //
133c67d6573Sopenharmony_ci                // 1. We have a match---so we're done exploring any possible
134c67d6573Sopenharmony_ci                //    alternatives. Time to quit. (We can't do this if we're
135c67d6573Sopenharmony_ci                //    looking for matches for multiple regexes, unless we know
136c67d6573Sopenharmony_ci                //    they all matched.)
137c67d6573Sopenharmony_ci                //
138c67d6573Sopenharmony_ci                // 2. If the expression starts with a '^' we can terminate as
139c67d6573Sopenharmony_ci                //    soon as the last thread dies.
140c67d6573Sopenharmony_ci                if (matched && matches.len() <= 1)
141c67d6573Sopenharmony_ci                    || all_matched
142c67d6573Sopenharmony_ci                    || (!at.is_start() && self.prog.is_anchored_start)
143c67d6573Sopenharmony_ci                {
144c67d6573Sopenharmony_ci                    break;
145c67d6573Sopenharmony_ci                }
146c67d6573Sopenharmony_ci
147c67d6573Sopenharmony_ci                // 3. If there's a literal prefix for the program, try to
148c67d6573Sopenharmony_ci                //    jump ahead quickly. If it can't be found, then we can
149c67d6573Sopenharmony_ci                //    bail out early.
150c67d6573Sopenharmony_ci                if !self.prog.prefixes.is_empty() {
151c67d6573Sopenharmony_ci                    at = match self.input.prefix_at(&self.prog.prefixes, at) {
152c67d6573Sopenharmony_ci                        None => break,
153c67d6573Sopenharmony_ci                        Some(at) => at,
154c67d6573Sopenharmony_ci                    };
155c67d6573Sopenharmony_ci                }
156c67d6573Sopenharmony_ci            }
157c67d6573Sopenharmony_ci
158c67d6573Sopenharmony_ci            // This simulates a preceding '.*?' for every regex by adding
159c67d6573Sopenharmony_ci            // a state starting at the current position in the input for the
160c67d6573Sopenharmony_ci            // beginning of the program only if we don't already have a match.
161c67d6573Sopenharmony_ci            if clist.set.is_empty()
162c67d6573Sopenharmony_ci                || (!self.prog.is_anchored_start && !all_matched)
163c67d6573Sopenharmony_ci            {
164c67d6573Sopenharmony_ci                self.add(&mut clist, slots, 0, at);
165c67d6573Sopenharmony_ci            }
166c67d6573Sopenharmony_ci            // The previous call to "add" actually inspects the position just
167c67d6573Sopenharmony_ci            // before the current character. For stepping through the machine,
168c67d6573Sopenharmony_ci            // we can to look at the current character, so we advance the
169c67d6573Sopenharmony_ci            // input.
170c67d6573Sopenharmony_ci            let at_next = self.input.at(at.next_pos());
171c67d6573Sopenharmony_ci            for i in 0..clist.set.len() {
172c67d6573Sopenharmony_ci                let ip = clist.set[i];
173c67d6573Sopenharmony_ci                if self.step(
174c67d6573Sopenharmony_ci                    &mut nlist,
175c67d6573Sopenharmony_ci                    matches,
176c67d6573Sopenharmony_ci                    slots,
177c67d6573Sopenharmony_ci                    clist.caps(ip),
178c67d6573Sopenharmony_ci                    ip,
179c67d6573Sopenharmony_ci                    at,
180c67d6573Sopenharmony_ci                    at_next,
181c67d6573Sopenharmony_ci                ) {
182c67d6573Sopenharmony_ci                    matched = true;
183c67d6573Sopenharmony_ci                    all_matched = all_matched || matches.iter().all(|&b| b);
184c67d6573Sopenharmony_ci                    if quit_after_match {
185c67d6573Sopenharmony_ci                        // If we only care if a match occurs (not its
186c67d6573Sopenharmony_ci                        // position), then we can quit right now.
187c67d6573Sopenharmony_ci                        break 'LOOP;
188c67d6573Sopenharmony_ci                    }
189c67d6573Sopenharmony_ci                    if self.prog.matches.len() == 1 {
190c67d6573Sopenharmony_ci                        // We don't need to check the rest of the threads
191c67d6573Sopenharmony_ci                        // in this set because we've matched something
192c67d6573Sopenharmony_ci                        // ("leftmost-first"). However, we still need to check
193c67d6573Sopenharmony_ci                        // threads in the next set to support things like
194c67d6573Sopenharmony_ci                        // greedy matching.
195c67d6573Sopenharmony_ci                        //
196c67d6573Sopenharmony_ci                        // This is only true on normal regexes. For regex sets,
197c67d6573Sopenharmony_ci                        // we need to mush on to observe other matches.
198c67d6573Sopenharmony_ci                        break;
199c67d6573Sopenharmony_ci                    }
200c67d6573Sopenharmony_ci                }
201c67d6573Sopenharmony_ci            }
202c67d6573Sopenharmony_ci            if at.pos() >= end {
203c67d6573Sopenharmony_ci                break;
204c67d6573Sopenharmony_ci            }
205c67d6573Sopenharmony_ci            at = at_next;
206c67d6573Sopenharmony_ci            mem::swap(clist, nlist);
207c67d6573Sopenharmony_ci            nlist.set.clear();
208c67d6573Sopenharmony_ci        }
209c67d6573Sopenharmony_ci        matched
210c67d6573Sopenharmony_ci    }
211c67d6573Sopenharmony_ci
212c67d6573Sopenharmony_ci    /// Step through the input, one token (byte or codepoint) at a time.
213c67d6573Sopenharmony_ci    ///
214c67d6573Sopenharmony_ci    /// nlist is the set of states that will be processed on the next token
215c67d6573Sopenharmony_ci    /// in the input.
216c67d6573Sopenharmony_ci    ///
217c67d6573Sopenharmony_ci    /// caps is the set of captures passed by the caller of the NFA. They are
218c67d6573Sopenharmony_ci    /// written to only when a match state is visited.
219c67d6573Sopenharmony_ci    ///
220c67d6573Sopenharmony_ci    /// thread_caps is the set of captures set for the current NFA state, ip.
221c67d6573Sopenharmony_ci    ///
222c67d6573Sopenharmony_ci    /// at and at_next are the current and next positions in the input. at or
223c67d6573Sopenharmony_ci    /// at_next may be EOF.
224c67d6573Sopenharmony_ci    fn step(
225c67d6573Sopenharmony_ci        &mut self,
226c67d6573Sopenharmony_ci        nlist: &mut Threads,
227c67d6573Sopenharmony_ci        matches: &mut [bool],
228c67d6573Sopenharmony_ci        slots: &mut [Slot],
229c67d6573Sopenharmony_ci        thread_caps: &mut [Option<usize>],
230c67d6573Sopenharmony_ci        ip: usize,
231c67d6573Sopenharmony_ci        at: InputAt,
232c67d6573Sopenharmony_ci        at_next: InputAt,
233c67d6573Sopenharmony_ci    ) -> bool {
234c67d6573Sopenharmony_ci        use crate::prog::Inst::*;
235c67d6573Sopenharmony_ci        match self.prog[ip] {
236c67d6573Sopenharmony_ci            Match(match_slot) => {
237c67d6573Sopenharmony_ci                if match_slot < matches.len() {
238c67d6573Sopenharmony_ci                    matches[match_slot] = true;
239c67d6573Sopenharmony_ci                }
240c67d6573Sopenharmony_ci                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241c67d6573Sopenharmony_ci                    *slot = *val;
242c67d6573Sopenharmony_ci                }
243c67d6573Sopenharmony_ci                true
244c67d6573Sopenharmony_ci            }
245c67d6573Sopenharmony_ci            Char(ref inst) => {
246c67d6573Sopenharmony_ci                if inst.c == at.char() {
247c67d6573Sopenharmony_ci                    self.add(nlist, thread_caps, inst.goto, at_next);
248c67d6573Sopenharmony_ci                }
249c67d6573Sopenharmony_ci                false
250c67d6573Sopenharmony_ci            }
251c67d6573Sopenharmony_ci            Ranges(ref inst) => {
252c67d6573Sopenharmony_ci                if inst.matches(at.char()) {
253c67d6573Sopenharmony_ci                    self.add(nlist, thread_caps, inst.goto, at_next);
254c67d6573Sopenharmony_ci                }
255c67d6573Sopenharmony_ci                false
256c67d6573Sopenharmony_ci            }
257c67d6573Sopenharmony_ci            Bytes(ref inst) => {
258c67d6573Sopenharmony_ci                if let Some(b) = at.byte() {
259c67d6573Sopenharmony_ci                    if inst.matches(b) {
260c67d6573Sopenharmony_ci                        self.add(nlist, thread_caps, inst.goto, at_next);
261c67d6573Sopenharmony_ci                    }
262c67d6573Sopenharmony_ci                }
263c67d6573Sopenharmony_ci                false
264c67d6573Sopenharmony_ci            }
265c67d6573Sopenharmony_ci            EmptyLook(_) | Save(_) | Split(_) => false,
266c67d6573Sopenharmony_ci        }
267c67d6573Sopenharmony_ci    }
268c67d6573Sopenharmony_ci
269c67d6573Sopenharmony_ci    /// Follows epsilon transitions and adds them for processing to nlist,
270c67d6573Sopenharmony_ci    /// starting at and including ip.
271c67d6573Sopenharmony_ci    fn add(
272c67d6573Sopenharmony_ci        &mut self,
273c67d6573Sopenharmony_ci        nlist: &mut Threads,
274c67d6573Sopenharmony_ci        thread_caps: &mut [Option<usize>],
275c67d6573Sopenharmony_ci        ip: usize,
276c67d6573Sopenharmony_ci        at: InputAt,
277c67d6573Sopenharmony_ci    ) {
278c67d6573Sopenharmony_ci        self.stack.push(FollowEpsilon::IP(ip));
279c67d6573Sopenharmony_ci        while let Some(frame) = self.stack.pop() {
280c67d6573Sopenharmony_ci            match frame {
281c67d6573Sopenharmony_ci                FollowEpsilon::IP(ip) => {
282c67d6573Sopenharmony_ci                    self.add_step(nlist, thread_caps, ip, at);
283c67d6573Sopenharmony_ci                }
284c67d6573Sopenharmony_ci                FollowEpsilon::Capture { slot, pos } => {
285c67d6573Sopenharmony_ci                    thread_caps[slot] = pos;
286c67d6573Sopenharmony_ci                }
287c67d6573Sopenharmony_ci            }
288c67d6573Sopenharmony_ci        }
289c67d6573Sopenharmony_ci    }
290c67d6573Sopenharmony_ci
291c67d6573Sopenharmony_ci    /// A helper function for add that avoids excessive pushing to the stack.
292c67d6573Sopenharmony_ci    fn add_step(
293c67d6573Sopenharmony_ci        &mut self,
294c67d6573Sopenharmony_ci        nlist: &mut Threads,
295c67d6573Sopenharmony_ci        thread_caps: &mut [Option<usize>],
296c67d6573Sopenharmony_ci        mut ip: usize,
297c67d6573Sopenharmony_ci        at: InputAt,
298c67d6573Sopenharmony_ci    ) {
299c67d6573Sopenharmony_ci        // Instead of pushing and popping to the stack, we mutate ip as we
300c67d6573Sopenharmony_ci        // traverse the set of states. We only push to the stack when we
301c67d6573Sopenharmony_ci        // absolutely need recursion (restoring captures or following a
302c67d6573Sopenharmony_ci        // branch).
303c67d6573Sopenharmony_ci        use crate::prog::Inst::*;
304c67d6573Sopenharmony_ci        loop {
305c67d6573Sopenharmony_ci            // Don't visit states we've already added.
306c67d6573Sopenharmony_ci            if nlist.set.contains(ip) {
307c67d6573Sopenharmony_ci                return;
308c67d6573Sopenharmony_ci            }
309c67d6573Sopenharmony_ci            nlist.set.insert(ip);
310c67d6573Sopenharmony_ci            match self.prog[ip] {
311c67d6573Sopenharmony_ci                EmptyLook(ref inst) => {
312c67d6573Sopenharmony_ci                    if self.input.is_empty_match(at, inst) {
313c67d6573Sopenharmony_ci                        ip = inst.goto;
314c67d6573Sopenharmony_ci                    }
315c67d6573Sopenharmony_ci                }
316c67d6573Sopenharmony_ci                Save(ref inst) => {
317c67d6573Sopenharmony_ci                    if inst.slot < thread_caps.len() {
318c67d6573Sopenharmony_ci                        self.stack.push(FollowEpsilon::Capture {
319c67d6573Sopenharmony_ci                            slot: inst.slot,
320c67d6573Sopenharmony_ci                            pos: thread_caps[inst.slot],
321c67d6573Sopenharmony_ci                        });
322c67d6573Sopenharmony_ci                        thread_caps[inst.slot] = Some(at.pos());
323c67d6573Sopenharmony_ci                    }
324c67d6573Sopenharmony_ci                    ip = inst.goto;
325c67d6573Sopenharmony_ci                }
326c67d6573Sopenharmony_ci                Split(ref inst) => {
327c67d6573Sopenharmony_ci                    self.stack.push(FollowEpsilon::IP(inst.goto2));
328c67d6573Sopenharmony_ci                    ip = inst.goto1;
329c67d6573Sopenharmony_ci                }
330c67d6573Sopenharmony_ci                Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
331c67d6573Sopenharmony_ci                    let t = &mut nlist.caps(ip);
332c67d6573Sopenharmony_ci                    for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333c67d6573Sopenharmony_ci                        *slot = *val;
334c67d6573Sopenharmony_ci                    }
335c67d6573Sopenharmony_ci                    return;
336c67d6573Sopenharmony_ci                }
337c67d6573Sopenharmony_ci            }
338c67d6573Sopenharmony_ci        }
339c67d6573Sopenharmony_ci    }
340c67d6573Sopenharmony_ci}
341c67d6573Sopenharmony_ci
342c67d6573Sopenharmony_ciimpl Threads {
343c67d6573Sopenharmony_ci    fn new() -> Self {
344c67d6573Sopenharmony_ci        Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
345c67d6573Sopenharmony_ci    }
346c67d6573Sopenharmony_ci
347c67d6573Sopenharmony_ci    fn resize(&mut self, num_insts: usize, ncaps: usize) {
348c67d6573Sopenharmony_ci        if num_insts == self.set.capacity() {
349c67d6573Sopenharmony_ci            return;
350c67d6573Sopenharmony_ci        }
351c67d6573Sopenharmony_ci        self.slots_per_thread = ncaps * 2;
352c67d6573Sopenharmony_ci        self.set = SparseSet::new(num_insts);
353c67d6573Sopenharmony_ci        self.caps = vec![None; self.slots_per_thread * num_insts];
354c67d6573Sopenharmony_ci    }
355c67d6573Sopenharmony_ci
356c67d6573Sopenharmony_ci    fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357c67d6573Sopenharmony_ci        let i = pc * self.slots_per_thread;
358c67d6573Sopenharmony_ci        &mut self.caps[i..i + self.slots_per_thread]
359c67d6573Sopenharmony_ci    }
360c67d6573Sopenharmony_ci}
361