1c67d6573Sopenharmony_ci/*!
2c67d6573Sopenharmony_ciThe DFA matching engine.
3c67d6573Sopenharmony_ci
4c67d6573Sopenharmony_ciA DFA provides faster matching because the engine is in exactly one state at
5c67d6573Sopenharmony_ciany point in time. In the NFA, there may be multiple active states, and
6c67d6573Sopenharmony_ciconsiderable CPU cycles are spent shuffling them around. In finite automata
7c67d6573Sopenharmony_cispeak, the DFA follows epsilon transitions in the regex far less than the NFA.
8c67d6573Sopenharmony_ci
9c67d6573Sopenharmony_ciA DFA is a classic trade off between time and space. The NFA is slower, but
10c67d6573Sopenharmony_ciits memory requirements are typically small and predictable. The DFA is faster,
11c67d6573Sopenharmony_cibut given the right regex and the right input, the number of states in the
12c67d6573Sopenharmony_ciDFA can grow exponentially. To mitigate this space problem, we do two things:
13c67d6573Sopenharmony_ci
14c67d6573Sopenharmony_ci1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
15c67d6573Sopenharmony_ci   during a search. When a new state is computed, it is stored in a cache so
16c67d6573Sopenharmony_ci   that it may be reused. An important consequence of this implementation
17c67d6573Sopenharmony_ci   is that states that are never reached for a particular input are never
18c67d6573Sopenharmony_ci   computed. (This is impossible in an "offline" DFA which needs to compute
19c67d6573Sopenharmony_ci   all possible states up front.)
20c67d6573Sopenharmony_ci2. If the cache gets too big, we wipe it and continue matching.
21c67d6573Sopenharmony_ci
22c67d6573Sopenharmony_ciIn pathological cases, a new state can be created for every byte of input.
23c67d6573Sopenharmony_ci(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
24c67d6573Sopenharmony_ciIn this case, performance regresses to slightly slower than the full NFA
25c67d6573Sopenharmony_cisimulation, in large part because the cache becomes useless. If the cache
26c67d6573Sopenharmony_ciis wiped too frequently, the DFA quits and control falls back to one of the
27c67d6573Sopenharmony_ciNFA simulations.
28c67d6573Sopenharmony_ci
29c67d6573Sopenharmony_ciBecause of the "lazy" nature of this DFA, the inner matching loop is
30c67d6573Sopenharmony_ciconsiderably more complex than one might expect out of a DFA. A number of
31c67d6573Sopenharmony_citricks are employed to make it fast. Tread carefully.
32c67d6573Sopenharmony_ci
33c67d6573Sopenharmony_ciN.B. While this implementation is heavily commented, Russ Cox's series of
34c67d6573Sopenharmony_ciarticles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
35c67d6573Sopenharmony_ci(As is the DFA implementation in RE2, which heavily influenced this
36c67d6573Sopenharmony_ciimplementation.)
37c67d6573Sopenharmony_ci*/
38c67d6573Sopenharmony_ci
39c67d6573Sopenharmony_ciuse std::collections::HashMap;
40c67d6573Sopenharmony_ciuse std::fmt;
41c67d6573Sopenharmony_ciuse std::iter::repeat;
42c67d6573Sopenharmony_ciuse std::mem;
43c67d6573Sopenharmony_ciuse std::sync::Arc;
44c67d6573Sopenharmony_ci
45c67d6573Sopenharmony_ciuse crate::exec::ProgramCache;
46c67d6573Sopenharmony_ciuse crate::prog::{Inst, Program};
47c67d6573Sopenharmony_ciuse crate::sparse::SparseSet;
48c67d6573Sopenharmony_ci
49c67d6573Sopenharmony_ci/// Return true if and only if the given program can be executed by a DFA.
50c67d6573Sopenharmony_ci///
51c67d6573Sopenharmony_ci/// Generally, a DFA is always possible. A pathological case where it is not
52c67d6573Sopenharmony_ci/// possible is if the number of NFA states exceeds `u32::MAX`, in which case,
53c67d6573Sopenharmony_ci/// this function will return false.
54c67d6573Sopenharmony_ci///
55c67d6573Sopenharmony_ci/// This function will also return false if the given program has any Unicode
56c67d6573Sopenharmony_ci/// instructions (Char or Ranges) since the DFA operates on bytes only.
57c67d6573Sopenharmony_cipub fn can_exec(insts: &Program) -> bool {
58c67d6573Sopenharmony_ci    use crate::prog::Inst::*;
59c67d6573Sopenharmony_ci    // If for some reason we manage to allocate a regex program with more
60c67d6573Sopenharmony_ci    // than i32::MAX instructions, then we can't execute the DFA because we
61c67d6573Sopenharmony_ci    // use 32 bit instruction pointer deltas for memory savings.
62c67d6573Sopenharmony_ci    // If i32::MAX is the largest positive delta,
63c67d6573Sopenharmony_ci    // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
64c67d6573Sopenharmony_ci    // and we are OK to use 32 bits.
65c67d6573Sopenharmony_ci    if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
66c67d6573Sopenharmony_ci        return false;
67c67d6573Sopenharmony_ci    }
68c67d6573Sopenharmony_ci    for inst in insts {
69c67d6573Sopenharmony_ci        match *inst {
70c67d6573Sopenharmony_ci            Char(_) | Ranges(_) => return false,
71c67d6573Sopenharmony_ci            EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
72c67d6573Sopenharmony_ci        }
73c67d6573Sopenharmony_ci    }
74c67d6573Sopenharmony_ci    true
75c67d6573Sopenharmony_ci}
76c67d6573Sopenharmony_ci
77c67d6573Sopenharmony_ci/// A reusable cache of DFA states.
78c67d6573Sopenharmony_ci///
79c67d6573Sopenharmony_ci/// This cache is reused between multiple invocations of the same regex
80c67d6573Sopenharmony_ci/// program. (It is not shared simultaneously between threads. If there is
81c67d6573Sopenharmony_ci/// contention, then new caches are created.)
82c67d6573Sopenharmony_ci#[derive(Debug)]
83c67d6573Sopenharmony_cipub struct Cache {
84c67d6573Sopenharmony_ci    /// Group persistent DFA related cache state together. The sparse sets
85c67d6573Sopenharmony_ci    /// listed below are used as scratch space while computing uncached states.
86c67d6573Sopenharmony_ci    inner: CacheInner,
87c67d6573Sopenharmony_ci    /// qcur and qnext are ordered sets with constant time
88c67d6573Sopenharmony_ci    /// addition/membership/clearing-whole-set and linear time iteration. They
89c67d6573Sopenharmony_ci    /// are used to manage the sets of NFA states in DFA states when computing
90c67d6573Sopenharmony_ci    /// cached DFA states. In particular, the order of the NFA states matters
91c67d6573Sopenharmony_ci    /// for leftmost-first style matching. Namely, when computing a cached
92c67d6573Sopenharmony_ci    /// state, the set of NFA states stops growing as soon as the first Match
93c67d6573Sopenharmony_ci    /// instruction is observed.
94c67d6573Sopenharmony_ci    qcur: SparseSet,
95c67d6573Sopenharmony_ci    qnext: SparseSet,
96c67d6573Sopenharmony_ci}
97c67d6573Sopenharmony_ci
98c67d6573Sopenharmony_ci/// `CacheInner` is logically just a part of Cache, but groups together fields
99c67d6573Sopenharmony_ci/// that aren't passed as function parameters throughout search. (This split
100c67d6573Sopenharmony_ci/// is mostly an artifact of the borrow checker. It is happily paid.)
101c67d6573Sopenharmony_ci#[derive(Debug)]
102c67d6573Sopenharmony_cistruct CacheInner {
103c67d6573Sopenharmony_ci    /// A cache of pre-compiled DFA states, keyed by the set of NFA states
104c67d6573Sopenharmony_ci    /// and the set of empty-width flags set at the byte in the input when the
105c67d6573Sopenharmony_ci    /// state was observed.
106c67d6573Sopenharmony_ci    ///
107c67d6573Sopenharmony_ci    /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
108c67d6573Sopenharmony_ci    /// things, we just pass indexes around manually. The performance impact of
109c67d6573Sopenharmony_ci    /// this is probably an instruction or two in the inner loop. However, on
110c67d6573Sopenharmony_ci    /// 64 bit, each StatePtr is half the size of a *State.
111c67d6573Sopenharmony_ci    compiled: StateMap,
112c67d6573Sopenharmony_ci    /// The transition table.
113c67d6573Sopenharmony_ci    ///
114c67d6573Sopenharmony_ci    /// The transition table is laid out in row-major order, where states are
115c67d6573Sopenharmony_ci    /// rows and the transitions for each state are columns. At a high level,
116c67d6573Sopenharmony_ci    /// given state `s` and byte `b`, the next state can be found at index
117c67d6573Sopenharmony_ci    /// `s * 256 + b`.
118c67d6573Sopenharmony_ci    ///
119c67d6573Sopenharmony_ci    /// This is, of course, a lie. A StatePtr is actually a pointer to the
120c67d6573Sopenharmony_ci    /// *start* of a row in this table. When indexing in the DFA's inner loop,
121c67d6573Sopenharmony_ci    /// this removes the need to multiply the StatePtr by the stride. Yes, it
122c67d6573Sopenharmony_ci    /// matters. This reduces the number of states we can store, but: the
123c67d6573Sopenharmony_ci    /// stride is rarely 256 since we define transitions in terms of
124c67d6573Sopenharmony_ci    /// *equivalence classes* of bytes. Each class corresponds to a set of
125c67d6573Sopenharmony_ci    /// bytes that never discriminate a distinct path through the DFA from each
126c67d6573Sopenharmony_ci    /// other.
127c67d6573Sopenharmony_ci    trans: Transitions,
128c67d6573Sopenharmony_ci    /// A set of cached start states, which are limited to the number of
129c67d6573Sopenharmony_ci    /// permutations of flags set just before the initial byte of input. (The
130c67d6573Sopenharmony_ci    /// index into this vec is a `EmptyFlags`.)
131c67d6573Sopenharmony_ci    ///
132c67d6573Sopenharmony_ci    /// N.B. A start state can be "dead" (i.e., no possible match), so we
133c67d6573Sopenharmony_ci    /// represent it with a StatePtr.
134c67d6573Sopenharmony_ci    start_states: Vec<StatePtr>,
135c67d6573Sopenharmony_ci    /// Stack scratch space used to follow epsilon transitions in the NFA.
136c67d6573Sopenharmony_ci    /// (This permits us to avoid recursion.)
137c67d6573Sopenharmony_ci    ///
138c67d6573Sopenharmony_ci    /// The maximum stack size is the number of NFA states.
139c67d6573Sopenharmony_ci    stack: Vec<InstPtr>,
140c67d6573Sopenharmony_ci    /// The total number of times this cache has been flushed by the DFA
141c67d6573Sopenharmony_ci    /// because of space constraints.
142c67d6573Sopenharmony_ci    flush_count: u64,
143c67d6573Sopenharmony_ci    /// The total heap size of the DFA's cache. We use this to determine when
144c67d6573Sopenharmony_ci    /// we should flush the cache.
145c67d6573Sopenharmony_ci    size: usize,
146c67d6573Sopenharmony_ci    /// Scratch space used when building instruction pointer lists for new
147c67d6573Sopenharmony_ci    /// states. This helps amortize allocation.
148c67d6573Sopenharmony_ci    insts_scratch_space: Vec<u8>,
149c67d6573Sopenharmony_ci}
150c67d6573Sopenharmony_ci
151c67d6573Sopenharmony_ci/// The transition table.
152c67d6573Sopenharmony_ci///
153c67d6573Sopenharmony_ci/// It is laid out in row-major order, with states as rows and byte class
154c67d6573Sopenharmony_ci/// transitions as columns.
155c67d6573Sopenharmony_ci///
156c67d6573Sopenharmony_ci/// The transition table is responsible for producing valid `StatePtrs`. A
157c67d6573Sopenharmony_ci/// `StatePtr` points to the start of a particular row in this table. When
158c67d6573Sopenharmony_ci/// indexing to find the next state this allows us to avoid a multiplication
159c67d6573Sopenharmony_ci/// when computing an index into the table.
160c67d6573Sopenharmony_ci#[derive(Clone)]
161c67d6573Sopenharmony_cistruct Transitions {
162c67d6573Sopenharmony_ci    /// The table.
163c67d6573Sopenharmony_ci    table: Vec<StatePtr>,
164c67d6573Sopenharmony_ci    /// The stride.
165c67d6573Sopenharmony_ci    num_byte_classes: usize,
166c67d6573Sopenharmony_ci}
167c67d6573Sopenharmony_ci
168c67d6573Sopenharmony_ci/// Fsm encapsulates the actual execution of the DFA.
169c67d6573Sopenharmony_ci#[derive(Debug)]
170c67d6573Sopenharmony_cipub struct Fsm<'a> {
171c67d6573Sopenharmony_ci    /// prog contains the NFA instruction opcodes. DFA execution uses either
172c67d6573Sopenharmony_ci    /// the `dfa` instructions or the `dfa_reverse` instructions from
173c67d6573Sopenharmony_ci    /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
174c67d6573Sopenharmony_ci    /// Unicode opcodes that cannot be executed by the DFA.)
175c67d6573Sopenharmony_ci    prog: &'a Program,
176c67d6573Sopenharmony_ci    /// The start state. We record it here because the pointer may change
177c67d6573Sopenharmony_ci    /// when the cache is wiped.
178c67d6573Sopenharmony_ci    start: StatePtr,
179c67d6573Sopenharmony_ci    /// The current position in the input.
180c67d6573Sopenharmony_ci    at: usize,
181c67d6573Sopenharmony_ci    /// Should we quit after seeing the first match? e.g., When the caller
182c67d6573Sopenharmony_ci    /// uses `is_match` or `shortest_match`.
183c67d6573Sopenharmony_ci    quit_after_match: bool,
184c67d6573Sopenharmony_ci    /// The last state that matched.
185c67d6573Sopenharmony_ci    ///
186c67d6573Sopenharmony_ci    /// When no match has occurred, this is set to STATE_UNKNOWN.
187c67d6573Sopenharmony_ci    ///
188c67d6573Sopenharmony_ci    /// This is only useful when matching regex sets. The last match state
189c67d6573Sopenharmony_ci    /// is useful because it contains all of the match instructions seen,
190c67d6573Sopenharmony_ci    /// thereby allowing us to enumerate which regexes in the set matched.
191c67d6573Sopenharmony_ci    last_match_si: StatePtr,
192c67d6573Sopenharmony_ci    /// The input position of the last cache flush. We use this to determine
193c67d6573Sopenharmony_ci    /// if we're thrashing in the cache too often. If so, the DFA quits so
194c67d6573Sopenharmony_ci    /// that we can fall back to the NFA algorithm.
195c67d6573Sopenharmony_ci    last_cache_flush: usize,
196c67d6573Sopenharmony_ci    /// All cached DFA information that is persisted between searches.
197c67d6573Sopenharmony_ci    cache: &'a mut CacheInner,
198c67d6573Sopenharmony_ci}
199c67d6573Sopenharmony_ci
200c67d6573Sopenharmony_ci/// The result of running the DFA.
201c67d6573Sopenharmony_ci///
202c67d6573Sopenharmony_ci/// Generally, the result is either a match or not a match, but sometimes the
203c67d6573Sopenharmony_ci/// DFA runs too slowly because the cache size is too small. In that case, it
204c67d6573Sopenharmony_ci/// gives up with the intent of falling back to the NFA algorithm.
205c67d6573Sopenharmony_ci///
206c67d6573Sopenharmony_ci/// The DFA can also give up if it runs out of room to create new states, or if
207c67d6573Sopenharmony_ci/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
208c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
209c67d6573Sopenharmony_cipub enum Result<T> {
210c67d6573Sopenharmony_ci    Match(T),
211c67d6573Sopenharmony_ci    NoMatch(usize),
212c67d6573Sopenharmony_ci    Quit,
213c67d6573Sopenharmony_ci}
214c67d6573Sopenharmony_ci
215c67d6573Sopenharmony_ciimpl<T> Result<T> {
216c67d6573Sopenharmony_ci    /// Returns true if this result corresponds to a match.
217c67d6573Sopenharmony_ci    pub fn is_match(&self) -> bool {
218c67d6573Sopenharmony_ci        match *self {
219c67d6573Sopenharmony_ci            Result::Match(_) => true,
220c67d6573Sopenharmony_ci            Result::NoMatch(_) | Result::Quit => false,
221c67d6573Sopenharmony_ci        }
222c67d6573Sopenharmony_ci    }
223c67d6573Sopenharmony_ci
224c67d6573Sopenharmony_ci    /// Maps the given function onto T and returns the result.
225c67d6573Sopenharmony_ci    ///
226c67d6573Sopenharmony_ci    /// If this isn't a match, then this is a no-op.
227c67d6573Sopenharmony_ci    #[cfg(feature = "perf-literal")]
228c67d6573Sopenharmony_ci    pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
229c67d6573Sopenharmony_ci        match self {
230c67d6573Sopenharmony_ci            Result::Match(t) => Result::Match(f(t)),
231c67d6573Sopenharmony_ci            Result::NoMatch(x) => Result::NoMatch(x),
232c67d6573Sopenharmony_ci            Result::Quit => Result::Quit,
233c67d6573Sopenharmony_ci        }
234c67d6573Sopenharmony_ci    }
235c67d6573Sopenharmony_ci
236c67d6573Sopenharmony_ci    /// Sets the non-match position.
237c67d6573Sopenharmony_ci    ///
238c67d6573Sopenharmony_ci    /// If this isn't a non-match, then this is a no-op.
239c67d6573Sopenharmony_ci    fn set_non_match(self, at: usize) -> Result<T> {
240c67d6573Sopenharmony_ci        match self {
241c67d6573Sopenharmony_ci            Result::NoMatch(_) => Result::NoMatch(at),
242c67d6573Sopenharmony_ci            r => r,
243c67d6573Sopenharmony_ci        }
244c67d6573Sopenharmony_ci    }
245c67d6573Sopenharmony_ci}
246c67d6573Sopenharmony_ci
247c67d6573Sopenharmony_ci/// `State` is a DFA state. It contains an ordered set of NFA states (not
248c67d6573Sopenharmony_ci/// necessarily complete) and a smattering of flags.
249c67d6573Sopenharmony_ci///
250c67d6573Sopenharmony_ci/// The flags are packed into the first byte of data.
251c67d6573Sopenharmony_ci///
252c67d6573Sopenharmony_ci/// States don't carry their transitions. Instead, transitions are stored in
253c67d6573Sopenharmony_ci/// a single row-major table.
254c67d6573Sopenharmony_ci///
255c67d6573Sopenharmony_ci/// Delta encoding is used to store the instruction pointers.
256c67d6573Sopenharmony_ci/// The first instruction pointer is stored directly starting
257c67d6573Sopenharmony_ci/// at data[1], and each following pointer is stored as an offset
258c67d6573Sopenharmony_ci/// to the previous one. If a delta is in the range -127..127,
259c67d6573Sopenharmony_ci/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
260c67d6573Sopenharmony_ci/// is coded as a flag, followed by 4 bytes encoding the delta.
261c67d6573Sopenharmony_ci#[derive(Clone, Eq, Hash, PartialEq)]
262c67d6573Sopenharmony_cistruct State {
263c67d6573Sopenharmony_ci    data: Arc<[u8]>,
264c67d6573Sopenharmony_ci}
265c67d6573Sopenharmony_ci
266c67d6573Sopenharmony_ci/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
267c67d6573Sopenharmony_ci/// an NFA state).
268c67d6573Sopenharmony_ci///
269c67d6573Sopenharmony_ci/// Throughout this library, this is usually set to `usize`, but we force a
270c67d6573Sopenharmony_ci/// `u32` here for the DFA to save on space.
271c67d6573Sopenharmony_citype InstPtr = u32;
272c67d6573Sopenharmony_ci
273c67d6573Sopenharmony_ci/// Adds ip to data using delta encoding with respect to prev.
274c67d6573Sopenharmony_ci///
275c67d6573Sopenharmony_ci/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
276c67d6573Sopenharmony_cifn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
277c67d6573Sopenharmony_ci    let delta = (ip as i32) - (*prev as i32);
278c67d6573Sopenharmony_ci    write_vari32(data, delta);
279c67d6573Sopenharmony_ci    *prev = ip;
280c67d6573Sopenharmony_ci}
281c67d6573Sopenharmony_ci
282c67d6573Sopenharmony_cistruct InstPtrs<'a> {
283c67d6573Sopenharmony_ci    base: usize,
284c67d6573Sopenharmony_ci    data: &'a [u8],
285c67d6573Sopenharmony_ci}
286c67d6573Sopenharmony_ci
287c67d6573Sopenharmony_ciimpl<'a> Iterator for InstPtrs<'a> {
288c67d6573Sopenharmony_ci    type Item = usize;
289c67d6573Sopenharmony_ci
290c67d6573Sopenharmony_ci    fn next(&mut self) -> Option<usize> {
291c67d6573Sopenharmony_ci        if self.data.is_empty() {
292c67d6573Sopenharmony_ci            return None;
293c67d6573Sopenharmony_ci        }
294c67d6573Sopenharmony_ci        let (delta, nread) = read_vari32(self.data);
295c67d6573Sopenharmony_ci        let base = self.base as i32 + delta;
296c67d6573Sopenharmony_ci        debug_assert!(base >= 0);
297c67d6573Sopenharmony_ci        debug_assert!(nread > 0);
298c67d6573Sopenharmony_ci        self.data = &self.data[nread..];
299c67d6573Sopenharmony_ci        self.base = base as usize;
300c67d6573Sopenharmony_ci        Some(self.base)
301c67d6573Sopenharmony_ci    }
302c67d6573Sopenharmony_ci}
303c67d6573Sopenharmony_ci
304c67d6573Sopenharmony_ciimpl State {
305c67d6573Sopenharmony_ci    fn flags(&self) -> StateFlags {
306c67d6573Sopenharmony_ci        StateFlags(self.data[0])
307c67d6573Sopenharmony_ci    }
308c67d6573Sopenharmony_ci
309c67d6573Sopenharmony_ci    fn inst_ptrs(&self) -> InstPtrs<'_> {
310c67d6573Sopenharmony_ci        InstPtrs { base: 0, data: &self.data[1..] }
311c67d6573Sopenharmony_ci    }
312c67d6573Sopenharmony_ci}
313c67d6573Sopenharmony_ci
314c67d6573Sopenharmony_ci/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
315c67d6573Sopenharmony_ci/// table.
316c67d6573Sopenharmony_ci///
317c67d6573Sopenharmony_ci/// It has many special values. There are two types of special values:
318c67d6573Sopenharmony_ci/// sentinels and flags.
319c67d6573Sopenharmony_ci///
320c67d6573Sopenharmony_ci/// Sentinels corresponds to special states that carry some kind of
321c67d6573Sopenharmony_ci/// significance. There are three such states: unknown, dead and quit states.
322c67d6573Sopenharmony_ci///
323c67d6573Sopenharmony_ci/// Unknown states are states that haven't been computed yet. They indicate
324c67d6573Sopenharmony_ci/// that a transition should be filled in that points to either an existing
325c67d6573Sopenharmony_ci/// cached state or a new state altogether. In general, an unknown state means
326c67d6573Sopenharmony_ci/// "follow the NFA's epsilon transitions."
327c67d6573Sopenharmony_ci///
328c67d6573Sopenharmony_ci/// Dead states are states that can never lead to a match, no matter what
329c67d6573Sopenharmony_ci/// subsequent input is observed. This means that the DFA should quit
330c67d6573Sopenharmony_ci/// immediately and return the longest match it has found thus far.
331c67d6573Sopenharmony_ci///
332c67d6573Sopenharmony_ci/// Quit states are states that imply the DFA is not capable of matching the
333c67d6573Sopenharmony_ci/// regex correctly. Currently, this is only used when a Unicode word boundary
334c67d6573Sopenharmony_ci/// exists in the regex *and* a non-ASCII byte is observed.
335c67d6573Sopenharmony_ci///
336c67d6573Sopenharmony_ci/// The other type of state pointer is a state pointer with special flag bits.
337c67d6573Sopenharmony_ci/// There are two flags: a start flag and a match flag. The lower bits of both
338c67d6573Sopenharmony_ci/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
339c67d6573Sopenharmony_ci/// mask).
340c67d6573Sopenharmony_ci///
341c67d6573Sopenharmony_ci/// The start flag means that the state is a start state, and therefore may be
342c67d6573Sopenharmony_ci/// subject to special prefix scanning optimizations.
343c67d6573Sopenharmony_ci///
344c67d6573Sopenharmony_ci/// The match flag means that the state is a match state, and therefore the
345c67d6573Sopenharmony_ci/// current position in the input (while searching) should be recorded.
346c67d6573Sopenharmony_ci///
347c67d6573Sopenharmony_ci/// The above exists mostly in the service of making the inner loop fast.
348c67d6573Sopenharmony_ci/// In particular, the inner *inner* loop looks something like this:
349c67d6573Sopenharmony_ci///
350c67d6573Sopenharmony_ci/// ```ignore
351c67d6573Sopenharmony_ci/// while state <= STATE_MAX and i < len(text):
352c67d6573Sopenharmony_ci///     state = state.next[i]
353c67d6573Sopenharmony_ci/// ```
354c67d6573Sopenharmony_ci///
355c67d6573Sopenharmony_ci/// This is nice because it lets us execute a lazy DFA as if it were an
356c67d6573Sopenharmony_ci/// entirely offline DFA (i.e., with very few instructions). The loop will
357c67d6573Sopenharmony_ci/// quit only when we need to examine a case that needs special attention.
358c67d6573Sopenharmony_citype StatePtr = u32;
359c67d6573Sopenharmony_ci
360c67d6573Sopenharmony_ci/// An unknown state means that the state has not been computed yet, and that
361c67d6573Sopenharmony_ci/// the only way to progress is to compute it.
362c67d6573Sopenharmony_ciconst STATE_UNKNOWN: StatePtr = 1 << 31;
363c67d6573Sopenharmony_ci
364c67d6573Sopenharmony_ci/// A dead state means that the state has been computed and it is known that
365c67d6573Sopenharmony_ci/// once it is entered, no future match can ever occur.
366c67d6573Sopenharmony_ciconst STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;
367c67d6573Sopenharmony_ci
368c67d6573Sopenharmony_ci/// A quit state means that the DFA came across some input that it doesn't
369c67d6573Sopenharmony_ci/// know how to process correctly. The DFA should quit and another matching
370c67d6573Sopenharmony_ci/// engine should be run in its place.
371c67d6573Sopenharmony_ciconst STATE_QUIT: StatePtr = STATE_DEAD + 1;
372c67d6573Sopenharmony_ci
373c67d6573Sopenharmony_ci/// A start state is a state that the DFA can start in.
374c67d6573Sopenharmony_ci///
375c67d6573Sopenharmony_ci/// Note that start states have their lower bits set to a state pointer.
376c67d6573Sopenharmony_ciconst STATE_START: StatePtr = 1 << 30;
377c67d6573Sopenharmony_ci
378c67d6573Sopenharmony_ci/// A match state means that the regex has successfully matched.
379c67d6573Sopenharmony_ci///
380c67d6573Sopenharmony_ci/// Note that match states have their lower bits set to a state pointer.
381c67d6573Sopenharmony_ciconst STATE_MATCH: StatePtr = 1 << 29;
382c67d6573Sopenharmony_ci
383c67d6573Sopenharmony_ci/// The maximum state pointer. This is useful to mask out the "valid" state
384c67d6573Sopenharmony_ci/// pointer from a state with the "start" or "match" bits set.
385c67d6573Sopenharmony_ci///
386c67d6573Sopenharmony_ci/// It doesn't make sense to use this with unknown, dead or quit state
387c67d6573Sopenharmony_ci/// pointers, since those pointers are sentinels and never have their lower
388c67d6573Sopenharmony_ci/// bits set to anything meaningful.
389c67d6573Sopenharmony_ciconst STATE_MAX: StatePtr = STATE_MATCH - 1;
390c67d6573Sopenharmony_ci
391c67d6573Sopenharmony_ci/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
392c67d6573Sopenharmony_ci/// special EOF sentinel value.
393c67d6573Sopenharmony_ci#[derive(Copy, Clone, Debug)]
394c67d6573Sopenharmony_cistruct Byte(u16);
395c67d6573Sopenharmony_ci
396c67d6573Sopenharmony_ci/// A set of flags for zero-width assertions.
397c67d6573Sopenharmony_ci#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
398c67d6573Sopenharmony_cistruct EmptyFlags {
399c67d6573Sopenharmony_ci    start: bool,
400c67d6573Sopenharmony_ci    end: bool,
401c67d6573Sopenharmony_ci    start_line: bool,
402c67d6573Sopenharmony_ci    end_line: bool,
403c67d6573Sopenharmony_ci    word_boundary: bool,
404c67d6573Sopenharmony_ci    not_word_boundary: bool,
405c67d6573Sopenharmony_ci}
406c67d6573Sopenharmony_ci
407c67d6573Sopenharmony_ci/// A set of flags describing various configurations of a DFA state. This is
408c67d6573Sopenharmony_ci/// represented by a `u8` so that it is compact.
409c67d6573Sopenharmony_ci#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
410c67d6573Sopenharmony_cistruct StateFlags(u8);
411c67d6573Sopenharmony_ci
412c67d6573Sopenharmony_ciimpl Cache {
413c67d6573Sopenharmony_ci    /// Create new empty cache for the DFA engine.
414c67d6573Sopenharmony_ci    pub fn new(prog: &Program) -> Self {
415c67d6573Sopenharmony_ci        // We add 1 to account for the special EOF byte.
416c67d6573Sopenharmony_ci        let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
417c67d6573Sopenharmony_ci        let starts = vec![STATE_UNKNOWN; 256];
418c67d6573Sopenharmony_ci        let mut cache = Cache {
419c67d6573Sopenharmony_ci            inner: CacheInner {
420c67d6573Sopenharmony_ci                compiled: StateMap::new(num_byte_classes),
421c67d6573Sopenharmony_ci                trans: Transitions::new(num_byte_classes),
422c67d6573Sopenharmony_ci                start_states: starts,
423c67d6573Sopenharmony_ci                stack: vec![],
424c67d6573Sopenharmony_ci                flush_count: 0,
425c67d6573Sopenharmony_ci                size: 0,
426c67d6573Sopenharmony_ci                insts_scratch_space: vec![],
427c67d6573Sopenharmony_ci            },
428c67d6573Sopenharmony_ci            qcur: SparseSet::new(prog.insts.len()),
429c67d6573Sopenharmony_ci            qnext: SparseSet::new(prog.insts.len()),
430c67d6573Sopenharmony_ci        };
431c67d6573Sopenharmony_ci        cache.inner.reset_size();
432c67d6573Sopenharmony_ci        cache
433c67d6573Sopenharmony_ci    }
434c67d6573Sopenharmony_ci}
435c67d6573Sopenharmony_ci
436c67d6573Sopenharmony_ciimpl CacheInner {
437c67d6573Sopenharmony_ci    /// Resets the cache size to account for fixed costs, such as the program
438c67d6573Sopenharmony_ci    /// and stack sizes.
439c67d6573Sopenharmony_ci    fn reset_size(&mut self) {
440c67d6573Sopenharmony_ci        self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
441c67d6573Sopenharmony_ci            + (self.stack.len() * mem::size_of::<InstPtr>());
442c67d6573Sopenharmony_ci    }
443c67d6573Sopenharmony_ci}
444c67d6573Sopenharmony_ci
445c67d6573Sopenharmony_ciimpl<'a> Fsm<'a> {
446c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
447c67d6573Sopenharmony_ci    pub fn forward(
448c67d6573Sopenharmony_ci        prog: &'a Program,
449c67d6573Sopenharmony_ci        cache: &ProgramCache,
450c67d6573Sopenharmony_ci        quit_after_match: bool,
451c67d6573Sopenharmony_ci        text: &[u8],
452c67d6573Sopenharmony_ci        at: usize,
453c67d6573Sopenharmony_ci    ) -> Result<usize> {
454c67d6573Sopenharmony_ci        let mut cache = cache.borrow_mut();
455c67d6573Sopenharmony_ci        let cache = &mut cache.dfa;
456c67d6573Sopenharmony_ci        let mut dfa = Fsm {
457c67d6573Sopenharmony_ci            prog,
458c67d6573Sopenharmony_ci            start: 0, // filled in below
459c67d6573Sopenharmony_ci            at,
460c67d6573Sopenharmony_ci            quit_after_match,
461c67d6573Sopenharmony_ci            last_match_si: STATE_UNKNOWN,
462c67d6573Sopenharmony_ci            last_cache_flush: at,
463c67d6573Sopenharmony_ci            cache: &mut cache.inner,
464c67d6573Sopenharmony_ci        };
465c67d6573Sopenharmony_ci        let (empty_flags, state_flags) = dfa.start_flags(text, at);
466c67d6573Sopenharmony_ci        dfa.start =
467c67d6573Sopenharmony_ci            match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
468c67d6573Sopenharmony_ci                None => return Result::Quit,
469c67d6573Sopenharmony_ci                Some(STATE_DEAD) => return Result::NoMatch(at),
470c67d6573Sopenharmony_ci                Some(si) => si,
471c67d6573Sopenharmony_ci            };
472c67d6573Sopenharmony_ci        debug_assert!(dfa.start != STATE_UNKNOWN);
473c67d6573Sopenharmony_ci        dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
474c67d6573Sopenharmony_ci    }
475c67d6573Sopenharmony_ci
476c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
477c67d6573Sopenharmony_ci    pub fn reverse(
478c67d6573Sopenharmony_ci        prog: &'a Program,
479c67d6573Sopenharmony_ci        cache: &ProgramCache,
480c67d6573Sopenharmony_ci        quit_after_match: bool,
481c67d6573Sopenharmony_ci        text: &[u8],
482c67d6573Sopenharmony_ci        at: usize,
483c67d6573Sopenharmony_ci    ) -> Result<usize> {
484c67d6573Sopenharmony_ci        let mut cache = cache.borrow_mut();
485c67d6573Sopenharmony_ci        let cache = &mut cache.dfa_reverse;
486c67d6573Sopenharmony_ci        let mut dfa = Fsm {
487c67d6573Sopenharmony_ci            prog,
488c67d6573Sopenharmony_ci            start: 0, // filled in below
489c67d6573Sopenharmony_ci            at,
490c67d6573Sopenharmony_ci            quit_after_match,
491c67d6573Sopenharmony_ci            last_match_si: STATE_UNKNOWN,
492c67d6573Sopenharmony_ci            last_cache_flush: at,
493c67d6573Sopenharmony_ci            cache: &mut cache.inner,
494c67d6573Sopenharmony_ci        };
495c67d6573Sopenharmony_ci        let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
496c67d6573Sopenharmony_ci        dfa.start =
497c67d6573Sopenharmony_ci            match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
498c67d6573Sopenharmony_ci                None => return Result::Quit,
499c67d6573Sopenharmony_ci                Some(STATE_DEAD) => return Result::NoMatch(at),
500c67d6573Sopenharmony_ci                Some(si) => si,
501c67d6573Sopenharmony_ci            };
502c67d6573Sopenharmony_ci        debug_assert!(dfa.start != STATE_UNKNOWN);
503c67d6573Sopenharmony_ci        dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
504c67d6573Sopenharmony_ci    }
505c67d6573Sopenharmony_ci
506c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
507c67d6573Sopenharmony_ci    pub fn forward_many(
508c67d6573Sopenharmony_ci        prog: &'a Program,
509c67d6573Sopenharmony_ci        cache: &ProgramCache,
510c67d6573Sopenharmony_ci        matches: &mut [bool],
511c67d6573Sopenharmony_ci        text: &[u8],
512c67d6573Sopenharmony_ci        at: usize,
513c67d6573Sopenharmony_ci    ) -> Result<usize> {
514c67d6573Sopenharmony_ci        debug_assert!(matches.len() == prog.matches.len());
515c67d6573Sopenharmony_ci        let mut cache = cache.borrow_mut();
516c67d6573Sopenharmony_ci        let cache = &mut cache.dfa;
517c67d6573Sopenharmony_ci        let mut dfa = Fsm {
518c67d6573Sopenharmony_ci            prog,
519c67d6573Sopenharmony_ci            start: 0, // filled in below
520c67d6573Sopenharmony_ci            at,
521c67d6573Sopenharmony_ci            quit_after_match: false,
522c67d6573Sopenharmony_ci            last_match_si: STATE_UNKNOWN,
523c67d6573Sopenharmony_ci            last_cache_flush: at,
524c67d6573Sopenharmony_ci            cache: &mut cache.inner,
525c67d6573Sopenharmony_ci        };
526c67d6573Sopenharmony_ci        let (empty_flags, state_flags) = dfa.start_flags(text, at);
527c67d6573Sopenharmony_ci        dfa.start =
528c67d6573Sopenharmony_ci            match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
529c67d6573Sopenharmony_ci                None => return Result::Quit,
530c67d6573Sopenharmony_ci                Some(STATE_DEAD) => return Result::NoMatch(at),
531c67d6573Sopenharmony_ci                Some(si) => si,
532c67d6573Sopenharmony_ci            };
533c67d6573Sopenharmony_ci        debug_assert!(dfa.start != STATE_UNKNOWN);
534c67d6573Sopenharmony_ci        let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
535c67d6573Sopenharmony_ci        if result.is_match() {
536c67d6573Sopenharmony_ci            if matches.len() == 1 {
537c67d6573Sopenharmony_ci                matches[0] = true;
538c67d6573Sopenharmony_ci            } else {
539c67d6573Sopenharmony_ci                debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
540c67d6573Sopenharmony_ci                debug_assert!(dfa.last_match_si != STATE_DEAD);
541c67d6573Sopenharmony_ci                for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
542c67d6573Sopenharmony_ci                    if let Inst::Match(slot) = dfa.prog[ip] {
543c67d6573Sopenharmony_ci                        matches[slot] = true;
544c67d6573Sopenharmony_ci                    }
545c67d6573Sopenharmony_ci                }
546c67d6573Sopenharmony_ci            }
547c67d6573Sopenharmony_ci        }
548c67d6573Sopenharmony_ci        result
549c67d6573Sopenharmony_ci    }
550c67d6573Sopenharmony_ci
551c67d6573Sopenharmony_ci    /// Executes the DFA on a forward NFA.
552c67d6573Sopenharmony_ci    ///
553c67d6573Sopenharmony_ci    /// {qcur,qnext} are scratch ordered sets which may be non-empty.
554c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
555c67d6573Sopenharmony_ci    fn exec_at(
556c67d6573Sopenharmony_ci        &mut self,
557c67d6573Sopenharmony_ci        qcur: &mut SparseSet,
558c67d6573Sopenharmony_ci        qnext: &mut SparseSet,
559c67d6573Sopenharmony_ci        text: &[u8],
560c67d6573Sopenharmony_ci    ) -> Result<usize> {
561c67d6573Sopenharmony_ci        // For the most part, the DFA is basically:
562c67d6573Sopenharmony_ci        //
563c67d6573Sopenharmony_ci        //   last_match = null
564c67d6573Sopenharmony_ci        //   while current_byte != EOF:
565c67d6573Sopenharmony_ci        //     si = current_state.next[current_byte]
566c67d6573Sopenharmony_ci        //     if si is match
567c67d6573Sopenharmony_ci        //       last_match = si
568c67d6573Sopenharmony_ci        //   return last_match
569c67d6573Sopenharmony_ci        //
570c67d6573Sopenharmony_ci        // However, we need to deal with a few things:
571c67d6573Sopenharmony_ci        //
572c67d6573Sopenharmony_ci        //   1. This is an *online* DFA, so the current state's next list
573c67d6573Sopenharmony_ci        //      may not point to anywhere yet, so we must go out and compute
574c67d6573Sopenharmony_ci        //      them. (They are then cached into the current state's next list
575c67d6573Sopenharmony_ci        //      to avoid re-computation.)
576c67d6573Sopenharmony_ci        //   2. If we come across a state that is known to be dead (i.e., never
577c67d6573Sopenharmony_ci        //      leads to a match), then we can quit early.
578c67d6573Sopenharmony_ci        //   3. If the caller just wants to know if a match occurs, then we
579c67d6573Sopenharmony_ci        //      can quit as soon as we know we have a match. (Full leftmost
580c67d6573Sopenharmony_ci        //      first semantics require continuing on.)
581c67d6573Sopenharmony_ci        //   4. If we're in the start state, then we can use a pre-computed set
582c67d6573Sopenharmony_ci        //      of prefix literals to skip quickly along the input.
583c67d6573Sopenharmony_ci        //   5. After the input is exhausted, we run the DFA on one symbol
584c67d6573Sopenharmony_ci        //      that stands for EOF. This is useful for handling empty width
585c67d6573Sopenharmony_ci        //      assertions.
586c67d6573Sopenharmony_ci        //   6. We can't actually do state.next[byte]. Instead, we have to do
587c67d6573Sopenharmony_ci        //      state.next[byte_classes[byte]], which permits us to keep the
588c67d6573Sopenharmony_ci        //      'next' list very small.
589c67d6573Sopenharmony_ci        //
590c67d6573Sopenharmony_ci        // Since there's a bunch of extra stuff we need to consider, we do some
591c67d6573Sopenharmony_ci        // pretty hairy tricks to get the inner loop to run as fast as
592c67d6573Sopenharmony_ci        // possible.
593c67d6573Sopenharmony_ci        debug_assert!(!self.prog.is_reverse);
594c67d6573Sopenharmony_ci
595c67d6573Sopenharmony_ci        // The last match is the currently known ending match position. It is
596c67d6573Sopenharmony_ci        // reported as an index to the most recent byte that resulted in a
597c67d6573Sopenharmony_ci        // transition to a match state and is always stored in capture slot `1`
598c67d6573Sopenharmony_ci        // when searching forwards. Its maximum value is `text.len()`.
599c67d6573Sopenharmony_ci        let mut result = Result::NoMatch(self.at);
600c67d6573Sopenharmony_ci        let (mut prev_si, mut next_si) = (self.start, self.start);
601c67d6573Sopenharmony_ci        let mut at = self.at;
602c67d6573Sopenharmony_ci        while at < text.len() {
603c67d6573Sopenharmony_ci            // This is the real inner loop. We take advantage of special bits
604c67d6573Sopenharmony_ci            // set in the state pointer to determine whether a state is in the
605c67d6573Sopenharmony_ci            // "common" case or not. Specifically, the common case is a
606c67d6573Sopenharmony_ci            // non-match non-start non-dead state that has already been
607c67d6573Sopenharmony_ci            // computed. So long as we remain in the common case, this inner
608c67d6573Sopenharmony_ci            // loop will chew through the input.
609c67d6573Sopenharmony_ci            //
610c67d6573Sopenharmony_ci            // We also unroll the loop 4 times to amortize the cost of checking
611c67d6573Sopenharmony_ci            // whether we've consumed the entire input. We are also careful
612c67d6573Sopenharmony_ci            // to make sure that `prev_si` always represents the previous state
613c67d6573Sopenharmony_ci            // and `next_si` always represents the next state after the loop
614c67d6573Sopenharmony_ci            // exits, even if it isn't always true inside the loop.
615c67d6573Sopenharmony_ci            while next_si <= STATE_MAX && at < text.len() {
616c67d6573Sopenharmony_ci                // Argument for safety is in the definition of next_si.
617c67d6573Sopenharmony_ci                prev_si = unsafe { self.next_si(next_si, text, at) };
618c67d6573Sopenharmony_ci                at += 1;
619c67d6573Sopenharmony_ci                if prev_si > STATE_MAX || at + 2 >= text.len() {
620c67d6573Sopenharmony_ci                    mem::swap(&mut prev_si, &mut next_si);
621c67d6573Sopenharmony_ci                    break;
622c67d6573Sopenharmony_ci                }
623c67d6573Sopenharmony_ci                next_si = unsafe { self.next_si(prev_si, text, at) };
624c67d6573Sopenharmony_ci                at += 1;
625c67d6573Sopenharmony_ci                if next_si > STATE_MAX {
626c67d6573Sopenharmony_ci                    break;
627c67d6573Sopenharmony_ci                }
628c67d6573Sopenharmony_ci                prev_si = unsafe { self.next_si(next_si, text, at) };
629c67d6573Sopenharmony_ci                at += 1;
630c67d6573Sopenharmony_ci                if prev_si > STATE_MAX {
631c67d6573Sopenharmony_ci                    mem::swap(&mut prev_si, &mut next_si);
632c67d6573Sopenharmony_ci                    break;
633c67d6573Sopenharmony_ci                }
634c67d6573Sopenharmony_ci                next_si = unsafe { self.next_si(prev_si, text, at) };
635c67d6573Sopenharmony_ci                at += 1;
636c67d6573Sopenharmony_ci            }
637c67d6573Sopenharmony_ci            if next_si & STATE_MATCH > 0 {
638c67d6573Sopenharmony_ci                // A match state is outside of the common case because it needs
639c67d6573Sopenharmony_ci                // special case analysis. In particular, we need to record the
640c67d6573Sopenharmony_ci                // last position as having matched and possibly quit the DFA if
641c67d6573Sopenharmony_ci                // we don't need to keep matching.
642c67d6573Sopenharmony_ci                next_si &= !STATE_MATCH;
643c67d6573Sopenharmony_ci                result = Result::Match(at - 1);
644c67d6573Sopenharmony_ci                if self.quit_after_match {
645c67d6573Sopenharmony_ci                    return result;
646c67d6573Sopenharmony_ci                }
647c67d6573Sopenharmony_ci                self.last_match_si = next_si;
648c67d6573Sopenharmony_ci                prev_si = next_si;
649c67d6573Sopenharmony_ci
650c67d6573Sopenharmony_ci                // This permits short-circuiting when matching a regex set.
651c67d6573Sopenharmony_ci                // In particular, if this DFA state contains only match states,
652c67d6573Sopenharmony_ci                // then it's impossible to extend the set of matches since
653c67d6573Sopenharmony_ci                // match states are final. Therefore, we can quit.
654c67d6573Sopenharmony_ci                if self.prog.matches.len() > 1 {
655c67d6573Sopenharmony_ci                    let state = self.state(next_si);
656c67d6573Sopenharmony_ci                    let just_matches =
657c67d6573Sopenharmony_ci                        state.inst_ptrs().all(|ip| self.prog[ip].is_match());
658c67d6573Sopenharmony_ci                    if just_matches {
659c67d6573Sopenharmony_ci                        return result;
660c67d6573Sopenharmony_ci                    }
661c67d6573Sopenharmony_ci                }
662c67d6573Sopenharmony_ci
663c67d6573Sopenharmony_ci                // Another inner loop! If the DFA stays in this particular
664c67d6573Sopenharmony_ci                // match state, then we can rip through all of the input
665c67d6573Sopenharmony_ci                // very quickly, and only recording the match location once
666c67d6573Sopenharmony_ci                // we've left this particular state.
667c67d6573Sopenharmony_ci                let cur = at;
668c67d6573Sopenharmony_ci                while (next_si & !STATE_MATCH) == prev_si
669c67d6573Sopenharmony_ci                    && at + 2 < text.len()
670c67d6573Sopenharmony_ci                {
671c67d6573Sopenharmony_ci                    // Argument for safety is in the definition of next_si.
672c67d6573Sopenharmony_ci                    next_si = unsafe {
673c67d6573Sopenharmony_ci                        self.next_si(next_si & !STATE_MATCH, text, at)
674c67d6573Sopenharmony_ci                    };
675c67d6573Sopenharmony_ci                    at += 1;
676c67d6573Sopenharmony_ci                }
677c67d6573Sopenharmony_ci                if at > cur {
678c67d6573Sopenharmony_ci                    result = Result::Match(at - 2);
679c67d6573Sopenharmony_ci                }
680c67d6573Sopenharmony_ci            } else if next_si & STATE_START > 0 {
681c67d6573Sopenharmony_ci                // A start state isn't in the common case because we may
682c67d6573Sopenharmony_ci                // want to do quick prefix scanning. If the program doesn't
683c67d6573Sopenharmony_ci                // have a detected prefix, then start states are actually
684c67d6573Sopenharmony_ci                // considered common and this case is never reached.
685c67d6573Sopenharmony_ci                debug_assert!(self.has_prefix());
686c67d6573Sopenharmony_ci                next_si &= !STATE_START;
687c67d6573Sopenharmony_ci                prev_si = next_si;
688c67d6573Sopenharmony_ci                at = match self.prefix_at(text, at) {
689c67d6573Sopenharmony_ci                    None => return Result::NoMatch(text.len()),
690c67d6573Sopenharmony_ci                    Some(i) => i,
691c67d6573Sopenharmony_ci                };
692c67d6573Sopenharmony_ci            } else if next_si >= STATE_UNKNOWN {
693c67d6573Sopenharmony_ci                if next_si == STATE_QUIT {
694c67d6573Sopenharmony_ci                    return Result::Quit;
695c67d6573Sopenharmony_ci                }
696c67d6573Sopenharmony_ci                // Finally, this corresponds to the case where the transition
697c67d6573Sopenharmony_ci                // entered a state that can never lead to a match or a state
698c67d6573Sopenharmony_ci                // that hasn't been computed yet. The latter being the "slow"
699c67d6573Sopenharmony_ci                // path.
700c67d6573Sopenharmony_ci                let byte = Byte::byte(text[at - 1]);
701c67d6573Sopenharmony_ci                // We no longer care about the special bits in the state
702c67d6573Sopenharmony_ci                // pointer.
703c67d6573Sopenharmony_ci                prev_si &= STATE_MAX;
704c67d6573Sopenharmony_ci                // Record where we are. This is used to track progress for
705c67d6573Sopenharmony_ci                // determining whether we should quit if we've flushed the
706c67d6573Sopenharmony_ci                // cache too much.
707c67d6573Sopenharmony_ci                self.at = at;
708c67d6573Sopenharmony_ci                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
709c67d6573Sopenharmony_ci                    None => return Result::Quit,
710c67d6573Sopenharmony_ci                    Some(STATE_DEAD) => return result.set_non_match(at),
711c67d6573Sopenharmony_ci                    Some(si) => si,
712c67d6573Sopenharmony_ci                };
713c67d6573Sopenharmony_ci                debug_assert!(next_si != STATE_UNKNOWN);
714c67d6573Sopenharmony_ci                if next_si & STATE_MATCH > 0 {
715c67d6573Sopenharmony_ci                    next_si &= !STATE_MATCH;
716c67d6573Sopenharmony_ci                    result = Result::Match(at - 1);
717c67d6573Sopenharmony_ci                    if self.quit_after_match {
718c67d6573Sopenharmony_ci                        return result;
719c67d6573Sopenharmony_ci                    }
720c67d6573Sopenharmony_ci                    self.last_match_si = next_si;
721c67d6573Sopenharmony_ci                }
722c67d6573Sopenharmony_ci                prev_si = next_si;
723c67d6573Sopenharmony_ci            } else {
724c67d6573Sopenharmony_ci                prev_si = next_si;
725c67d6573Sopenharmony_ci            }
726c67d6573Sopenharmony_ci        }
727c67d6573Sopenharmony_ci
728c67d6573Sopenharmony_ci        // Run the DFA once more on the special EOF sentinel value.
729c67d6573Sopenharmony_ci        // We don't care about the special bits in the state pointer any more,
730c67d6573Sopenharmony_ci        // so get rid of them.
731c67d6573Sopenharmony_ci        prev_si &= STATE_MAX;
732c67d6573Sopenharmony_ci        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
733c67d6573Sopenharmony_ci            None => return Result::Quit,
734c67d6573Sopenharmony_ci            Some(STATE_DEAD) => return result.set_non_match(text.len()),
735c67d6573Sopenharmony_ci            Some(si) => si & !STATE_START,
736c67d6573Sopenharmony_ci        };
737c67d6573Sopenharmony_ci        debug_assert!(prev_si != STATE_UNKNOWN);
738c67d6573Sopenharmony_ci        if prev_si & STATE_MATCH > 0 {
739c67d6573Sopenharmony_ci            prev_si &= !STATE_MATCH;
740c67d6573Sopenharmony_ci            self.last_match_si = prev_si;
741c67d6573Sopenharmony_ci            result = Result::Match(text.len());
742c67d6573Sopenharmony_ci        }
743c67d6573Sopenharmony_ci        result
744c67d6573Sopenharmony_ci    }
745c67d6573Sopenharmony_ci
746c67d6573Sopenharmony_ci    /// Executes the DFA on a reverse NFA.
747c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
748c67d6573Sopenharmony_ci    fn exec_at_reverse(
749c67d6573Sopenharmony_ci        &mut self,
750c67d6573Sopenharmony_ci        qcur: &mut SparseSet,
751c67d6573Sopenharmony_ci        qnext: &mut SparseSet,
752c67d6573Sopenharmony_ci        text: &[u8],
753c67d6573Sopenharmony_ci    ) -> Result<usize> {
754c67d6573Sopenharmony_ci        // The comments in `exec_at` above mostly apply here too. The main
755c67d6573Sopenharmony_ci        // difference is that we move backwards over the input and we look for
756c67d6573Sopenharmony_ci        // the longest possible match instead of the leftmost-first match.
757c67d6573Sopenharmony_ci        //
758c67d6573Sopenharmony_ci        // N.B. The code duplication here is regrettable. Efforts to improve
759c67d6573Sopenharmony_ci        // it without sacrificing performance are welcome. ---AG
760c67d6573Sopenharmony_ci        debug_assert!(self.prog.is_reverse);
761c67d6573Sopenharmony_ci        let mut result = Result::NoMatch(self.at);
762c67d6573Sopenharmony_ci        let (mut prev_si, mut next_si) = (self.start, self.start);
763c67d6573Sopenharmony_ci        let mut at = self.at;
764c67d6573Sopenharmony_ci        while at > 0 {
765c67d6573Sopenharmony_ci            while next_si <= STATE_MAX && at > 0 {
766c67d6573Sopenharmony_ci                // Argument for safety is in the definition of next_si.
767c67d6573Sopenharmony_ci                at -= 1;
768c67d6573Sopenharmony_ci                prev_si = unsafe { self.next_si(next_si, text, at) };
769c67d6573Sopenharmony_ci                if prev_si > STATE_MAX || at <= 4 {
770c67d6573Sopenharmony_ci                    mem::swap(&mut prev_si, &mut next_si);
771c67d6573Sopenharmony_ci                    break;
772c67d6573Sopenharmony_ci                }
773c67d6573Sopenharmony_ci                at -= 1;
774c67d6573Sopenharmony_ci                next_si = unsafe { self.next_si(prev_si, text, at) };
775c67d6573Sopenharmony_ci                if next_si > STATE_MAX {
776c67d6573Sopenharmony_ci                    break;
777c67d6573Sopenharmony_ci                }
778c67d6573Sopenharmony_ci                at -= 1;
779c67d6573Sopenharmony_ci                prev_si = unsafe { self.next_si(next_si, text, at) };
780c67d6573Sopenharmony_ci                if prev_si > STATE_MAX {
781c67d6573Sopenharmony_ci                    mem::swap(&mut prev_si, &mut next_si);
782c67d6573Sopenharmony_ci                    break;
783c67d6573Sopenharmony_ci                }
784c67d6573Sopenharmony_ci                at -= 1;
785c67d6573Sopenharmony_ci                next_si = unsafe { self.next_si(prev_si, text, at) };
786c67d6573Sopenharmony_ci            }
787c67d6573Sopenharmony_ci            if next_si & STATE_MATCH > 0 {
788c67d6573Sopenharmony_ci                next_si &= !STATE_MATCH;
789c67d6573Sopenharmony_ci                result = Result::Match(at + 1);
790c67d6573Sopenharmony_ci                if self.quit_after_match {
791c67d6573Sopenharmony_ci                    return result;
792c67d6573Sopenharmony_ci                }
793c67d6573Sopenharmony_ci                self.last_match_si = next_si;
794c67d6573Sopenharmony_ci                prev_si = next_si;
795c67d6573Sopenharmony_ci                let cur = at;
796c67d6573Sopenharmony_ci                while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
797c67d6573Sopenharmony_ci                    // Argument for safety is in the definition of next_si.
798c67d6573Sopenharmony_ci                    at -= 1;
799c67d6573Sopenharmony_ci                    next_si = unsafe {
800c67d6573Sopenharmony_ci                        self.next_si(next_si & !STATE_MATCH, text, at)
801c67d6573Sopenharmony_ci                    };
802c67d6573Sopenharmony_ci                }
803c67d6573Sopenharmony_ci                if at < cur {
804c67d6573Sopenharmony_ci                    result = Result::Match(at + 2);
805c67d6573Sopenharmony_ci                }
806c67d6573Sopenharmony_ci            } else if next_si >= STATE_UNKNOWN {
807c67d6573Sopenharmony_ci                if next_si == STATE_QUIT {
808c67d6573Sopenharmony_ci                    return Result::Quit;
809c67d6573Sopenharmony_ci                }
810c67d6573Sopenharmony_ci                let byte = Byte::byte(text[at]);
811c67d6573Sopenharmony_ci                prev_si &= STATE_MAX;
812c67d6573Sopenharmony_ci                self.at = at;
813c67d6573Sopenharmony_ci                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
814c67d6573Sopenharmony_ci                    None => return Result::Quit,
815c67d6573Sopenharmony_ci                    Some(STATE_DEAD) => return result.set_non_match(at),
816c67d6573Sopenharmony_ci                    Some(si) => si,
817c67d6573Sopenharmony_ci                };
818c67d6573Sopenharmony_ci                debug_assert!(next_si != STATE_UNKNOWN);
819c67d6573Sopenharmony_ci                if next_si & STATE_MATCH > 0 {
820c67d6573Sopenharmony_ci                    next_si &= !STATE_MATCH;
821c67d6573Sopenharmony_ci                    result = Result::Match(at + 1);
822c67d6573Sopenharmony_ci                    if self.quit_after_match {
823c67d6573Sopenharmony_ci                        return result;
824c67d6573Sopenharmony_ci                    }
825c67d6573Sopenharmony_ci                    self.last_match_si = next_si;
826c67d6573Sopenharmony_ci                }
827c67d6573Sopenharmony_ci                prev_si = next_si;
828c67d6573Sopenharmony_ci            } else {
829c67d6573Sopenharmony_ci                prev_si = next_si;
830c67d6573Sopenharmony_ci            }
831c67d6573Sopenharmony_ci        }
832c67d6573Sopenharmony_ci
833c67d6573Sopenharmony_ci        // Run the DFA once more on the special EOF sentinel value.
834c67d6573Sopenharmony_ci        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
835c67d6573Sopenharmony_ci            None => return Result::Quit,
836c67d6573Sopenharmony_ci            Some(STATE_DEAD) => return result.set_non_match(0),
837c67d6573Sopenharmony_ci            Some(si) => si,
838c67d6573Sopenharmony_ci        };
839c67d6573Sopenharmony_ci        debug_assert!(prev_si != STATE_UNKNOWN);
840c67d6573Sopenharmony_ci        if prev_si & STATE_MATCH > 0 {
841c67d6573Sopenharmony_ci            prev_si &= !STATE_MATCH;
842c67d6573Sopenharmony_ci            self.last_match_si = prev_si;
843c67d6573Sopenharmony_ci            result = Result::Match(0);
844c67d6573Sopenharmony_ci        }
845c67d6573Sopenharmony_ci        result
846c67d6573Sopenharmony_ci    }
847c67d6573Sopenharmony_ci
848c67d6573Sopenharmony_ci    /// next_si transitions to the next state, where the transition input
849c67d6573Sopenharmony_ci    /// corresponds to text[i].
850c67d6573Sopenharmony_ci    ///
851c67d6573Sopenharmony_ci    /// This elides bounds checks, and is therefore not safe.
852c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
853c67d6573Sopenharmony_ci    unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
854c67d6573Sopenharmony_ci        // What is the argument for safety here?
855c67d6573Sopenharmony_ci        // We have three unchecked accesses that could possibly violate safety:
856c67d6573Sopenharmony_ci        //
857c67d6573Sopenharmony_ci        //   1. The given byte of input (`text[i]`).
858c67d6573Sopenharmony_ci        //   2. The class of the byte of input (`classes[text[i]]`).
859c67d6573Sopenharmony_ci        //   3. The transition for the class (`trans[si + cls]`).
860c67d6573Sopenharmony_ci        //
861c67d6573Sopenharmony_ci        // (1) is only safe when calling next_si is guarded by
862c67d6573Sopenharmony_ci        // `i < text.len()`.
863c67d6573Sopenharmony_ci        //
864c67d6573Sopenharmony_ci        // (2) is the easiest case to guarantee since `text[i]` is always a
865c67d6573Sopenharmony_ci        // `u8` and `self.prog.byte_classes` always has length `u8::MAX`.
866c67d6573Sopenharmony_ci        // (See `ByteClassSet.byte_classes` in `compile.rs`.)
867c67d6573Sopenharmony_ci        //
868c67d6573Sopenharmony_ci        // (3) is only safe if (1)+(2) are safe. Namely, the transitions
869c67d6573Sopenharmony_ci        // of every state are defined to have length equal to the number of
870c67d6573Sopenharmony_ci        // byte classes in the program. Therefore, a valid class leads to a
871c67d6573Sopenharmony_ci        // valid transition. (All possible transitions are valid lookups, even
872c67d6573Sopenharmony_ci        // if it points to a state that hasn't been computed yet.) (3) also
873c67d6573Sopenharmony_ci        // relies on `si` being correct, but StatePtrs should only ever be
874c67d6573Sopenharmony_ci        // retrieved from the transition table, which ensures they are correct.
875c67d6573Sopenharmony_ci        debug_assert!(i < text.len());
876c67d6573Sopenharmony_ci        let b = *text.get_unchecked(i);
877c67d6573Sopenharmony_ci        debug_assert!((b as usize) < self.prog.byte_classes.len());
878c67d6573Sopenharmony_ci        let cls = *self.prog.byte_classes.get_unchecked(b as usize);
879c67d6573Sopenharmony_ci        self.cache.trans.next_unchecked(si, cls as usize)
880c67d6573Sopenharmony_ci    }
881c67d6573Sopenharmony_ci
882c67d6573Sopenharmony_ci    /// Computes the next state given the current state and the current input
883c67d6573Sopenharmony_ci    /// byte (which may be EOF).
884c67d6573Sopenharmony_ci    ///
885c67d6573Sopenharmony_ci    /// If STATE_DEAD is returned, then there is no valid state transition.
886c67d6573Sopenharmony_ci    /// This implies that no permutation of future input can lead to a match
887c67d6573Sopenharmony_ci    /// state.
888c67d6573Sopenharmony_ci    ///
889c67d6573Sopenharmony_ci    /// STATE_UNKNOWN can never be returned.
890c67d6573Sopenharmony_ci    fn exec_byte(
891c67d6573Sopenharmony_ci        &mut self,
892c67d6573Sopenharmony_ci        qcur: &mut SparseSet,
893c67d6573Sopenharmony_ci        qnext: &mut SparseSet,
894c67d6573Sopenharmony_ci        mut si: StatePtr,
895c67d6573Sopenharmony_ci        b: Byte,
896c67d6573Sopenharmony_ci    ) -> Option<StatePtr> {
897c67d6573Sopenharmony_ci        use crate::prog::Inst::*;
898c67d6573Sopenharmony_ci
899c67d6573Sopenharmony_ci        // Initialize a queue with the current DFA state's NFA states.
900c67d6573Sopenharmony_ci        qcur.clear();
901c67d6573Sopenharmony_ci        for ip in self.state(si).inst_ptrs() {
902c67d6573Sopenharmony_ci            qcur.insert(ip);
903c67d6573Sopenharmony_ci        }
904c67d6573Sopenharmony_ci
905c67d6573Sopenharmony_ci        // Before inspecting the current byte, we may need to also inspect
906c67d6573Sopenharmony_ci        // whether the position immediately preceding the current byte
907c67d6573Sopenharmony_ci        // satisfies the empty assertions found in the current state.
908c67d6573Sopenharmony_ci        //
909c67d6573Sopenharmony_ci        // We only need to do this step if there are any empty assertions in
910c67d6573Sopenharmony_ci        // the current state.
911c67d6573Sopenharmony_ci        let is_word_last = self.state(si).flags().is_word();
912c67d6573Sopenharmony_ci        let is_word = b.is_ascii_word();
913c67d6573Sopenharmony_ci        if self.state(si).flags().has_empty() {
914c67d6573Sopenharmony_ci            // Compute the flags immediately preceding the current byte.
915c67d6573Sopenharmony_ci            // This means we only care about the "end" or "end line" flags.
916c67d6573Sopenharmony_ci            // (The "start" flags are computed immediately following the
917c67d6573Sopenharmony_ci            // current byte and are handled below.)
918c67d6573Sopenharmony_ci            let mut flags = EmptyFlags::default();
919c67d6573Sopenharmony_ci            if b.is_eof() {
920c67d6573Sopenharmony_ci                flags.end = true;
921c67d6573Sopenharmony_ci                flags.end_line = true;
922c67d6573Sopenharmony_ci            } else if b.as_byte().map_or(false, |b| b == b'\n') {
923c67d6573Sopenharmony_ci                flags.end_line = true;
924c67d6573Sopenharmony_ci            }
925c67d6573Sopenharmony_ci            if is_word_last == is_word {
926c67d6573Sopenharmony_ci                flags.not_word_boundary = true;
927c67d6573Sopenharmony_ci            } else {
928c67d6573Sopenharmony_ci                flags.word_boundary = true;
929c67d6573Sopenharmony_ci            }
930c67d6573Sopenharmony_ci            // Now follow epsilon transitions from every NFA state, but make
931c67d6573Sopenharmony_ci            // sure we only follow transitions that satisfy our flags.
932c67d6573Sopenharmony_ci            qnext.clear();
933c67d6573Sopenharmony_ci            for &ip in &*qcur {
934c67d6573Sopenharmony_ci                self.follow_epsilons(usize_to_u32(ip), qnext, flags);
935c67d6573Sopenharmony_ci            }
936c67d6573Sopenharmony_ci            mem::swap(qcur, qnext);
937c67d6573Sopenharmony_ci        }
938c67d6573Sopenharmony_ci
939c67d6573Sopenharmony_ci        // Now we set flags for immediately after the current byte. Since start
940c67d6573Sopenharmony_ci        // states are processed separately, and are the only states that can
941c67d6573Sopenharmony_ci        // have the StartText flag set, we therefore only need to worry about
942c67d6573Sopenharmony_ci        // the StartLine flag here.
943c67d6573Sopenharmony_ci        //
944c67d6573Sopenharmony_ci        // We do also keep track of whether this DFA state contains a NFA state
945c67d6573Sopenharmony_ci        // that is a matching state. This is precisely how we delay the DFA
946c67d6573Sopenharmony_ci        // matching by one byte in order to process the special EOF sentinel
947c67d6573Sopenharmony_ci        // byte. Namely, if this DFA state containing a matching NFA state,
948c67d6573Sopenharmony_ci        // then it is the *next* DFA state that is marked as a match.
949c67d6573Sopenharmony_ci        let mut empty_flags = EmptyFlags::default();
950c67d6573Sopenharmony_ci        let mut state_flags = StateFlags::default();
951c67d6573Sopenharmony_ci        empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
952c67d6573Sopenharmony_ci        if b.is_ascii_word() {
953c67d6573Sopenharmony_ci            state_flags.set_word();
954c67d6573Sopenharmony_ci        }
955c67d6573Sopenharmony_ci        // Now follow all epsilon transitions again, but only after consuming
956c67d6573Sopenharmony_ci        // the current byte.
957c67d6573Sopenharmony_ci        qnext.clear();
958c67d6573Sopenharmony_ci        for &ip in &*qcur {
959c67d6573Sopenharmony_ci            match self.prog[ip as usize] {
960c67d6573Sopenharmony_ci                // These states never happen in a byte-based program.
961c67d6573Sopenharmony_ci                Char(_) | Ranges(_) => unreachable!(),
962c67d6573Sopenharmony_ci                // These states are handled when following epsilon transitions.
963c67d6573Sopenharmony_ci                Save(_) | Split(_) | EmptyLook(_) => {}
964c67d6573Sopenharmony_ci                Match(_) => {
965c67d6573Sopenharmony_ci                    state_flags.set_match();
966c67d6573Sopenharmony_ci                    if !self.continue_past_first_match() {
967c67d6573Sopenharmony_ci                        break;
968c67d6573Sopenharmony_ci                    } else if self.prog.matches.len() > 1
969c67d6573Sopenharmony_ci                        && !qnext.contains(ip as usize)
970c67d6573Sopenharmony_ci                    {
971c67d6573Sopenharmony_ci                        // If we are continuing on to find other matches,
972c67d6573Sopenharmony_ci                        // then keep a record of the match states we've seen.
973c67d6573Sopenharmony_ci                        qnext.insert(ip);
974c67d6573Sopenharmony_ci                    }
975c67d6573Sopenharmony_ci                }
976c67d6573Sopenharmony_ci                Bytes(ref inst) => {
977c67d6573Sopenharmony_ci                    if b.as_byte().map_or(false, |b| inst.matches(b)) {
978c67d6573Sopenharmony_ci                        self.follow_epsilons(
979c67d6573Sopenharmony_ci                            inst.goto as InstPtr,
980c67d6573Sopenharmony_ci                            qnext,
981c67d6573Sopenharmony_ci                            empty_flags,
982c67d6573Sopenharmony_ci                        );
983c67d6573Sopenharmony_ci                    }
984c67d6573Sopenharmony_ci                }
985c67d6573Sopenharmony_ci            }
986c67d6573Sopenharmony_ci        }
987c67d6573Sopenharmony_ci
988c67d6573Sopenharmony_ci        let cache = if b.is_eof() && self.prog.matches.len() > 1 {
989c67d6573Sopenharmony_ci            // If we're processing the last byte of the input and we're
990c67d6573Sopenharmony_ci            // matching a regex set, then make the next state contain the
991c67d6573Sopenharmony_ci            // previous states transitions. We do this so that the main
992c67d6573Sopenharmony_ci            // matching loop can extract all of the match instructions.
993c67d6573Sopenharmony_ci            mem::swap(qcur, qnext);
994c67d6573Sopenharmony_ci            // And don't cache this state because it's totally bunk.
995c67d6573Sopenharmony_ci            false
996c67d6573Sopenharmony_ci        } else {
997c67d6573Sopenharmony_ci            true
998c67d6573Sopenharmony_ci        };
999c67d6573Sopenharmony_ci
1000c67d6573Sopenharmony_ci        // We've now built up the set of NFA states that ought to comprise the
1001c67d6573Sopenharmony_ci        // next DFA state, so try to find it in the cache, and if it doesn't
1002c67d6573Sopenharmony_ci        // exist, cache it.
1003c67d6573Sopenharmony_ci        //
1004c67d6573Sopenharmony_ci        // N.B. We pass `&mut si` here because the cache may clear itself if
1005c67d6573Sopenharmony_ci        // it has gotten too full. When that happens, the location of the
1006c67d6573Sopenharmony_ci        // current state may change.
1007c67d6573Sopenharmony_ci        let mut next =
1008c67d6573Sopenharmony_ci            match self.cached_state(qnext, state_flags, Some(&mut si)) {
1009c67d6573Sopenharmony_ci                None => return None,
1010c67d6573Sopenharmony_ci                Some(next) => next,
1011c67d6573Sopenharmony_ci            };
1012c67d6573Sopenharmony_ci        if (self.start & !STATE_START) == next {
1013c67d6573Sopenharmony_ci            // Start states can never be match states since all matches are
1014c67d6573Sopenharmony_ci            // delayed by one byte.
1015c67d6573Sopenharmony_ci            debug_assert!(!self.state(next).flags().is_match());
1016c67d6573Sopenharmony_ci            next = self.start_ptr(next);
1017c67d6573Sopenharmony_ci        }
1018c67d6573Sopenharmony_ci        if next <= STATE_MAX && self.state(next).flags().is_match() {
1019c67d6573Sopenharmony_ci            next |= STATE_MATCH;
1020c67d6573Sopenharmony_ci        }
1021c67d6573Sopenharmony_ci        debug_assert!(next != STATE_UNKNOWN);
1022c67d6573Sopenharmony_ci        // And now store our state in the current state's next list.
1023c67d6573Sopenharmony_ci        if cache {
1024c67d6573Sopenharmony_ci            let cls = self.byte_class(b);
1025c67d6573Sopenharmony_ci            self.cache.trans.set_next(si, cls, next);
1026c67d6573Sopenharmony_ci        }
1027c67d6573Sopenharmony_ci        Some(next)
1028c67d6573Sopenharmony_ci    }
1029c67d6573Sopenharmony_ci
1030c67d6573Sopenharmony_ci    /// Follows the epsilon transitions starting at (and including) `ip`. The
1031c67d6573Sopenharmony_ci    /// resulting states are inserted into the ordered set `q`.
1032c67d6573Sopenharmony_ci    ///
1033c67d6573Sopenharmony_ci    /// Conditional epsilon transitions (i.e., empty width assertions) are only
1034c67d6573Sopenharmony_ci    /// followed if they are satisfied by the given flags, which should
1035c67d6573Sopenharmony_ci    /// represent the flags set at the current location in the input.
1036c67d6573Sopenharmony_ci    ///
1037c67d6573Sopenharmony_ci    /// If the current location corresponds to the empty string, then only the
1038c67d6573Sopenharmony_ci    /// end line and/or end text flags may be set. If the current location
1039c67d6573Sopenharmony_ci    /// corresponds to a real byte in the input, then only the start line
1040c67d6573Sopenharmony_ci    /// and/or start text flags may be set.
1041c67d6573Sopenharmony_ci    ///
1042c67d6573Sopenharmony_ci    /// As an exception to the above, when finding the initial state, any of
1043c67d6573Sopenharmony_ci    /// the above flags may be set:
1044c67d6573Sopenharmony_ci    ///
1045c67d6573Sopenharmony_ci    /// If matching starts at the beginning of the input, then start text and
1046c67d6573Sopenharmony_ci    /// start line should be set. If the input is empty, then end text and end
1047c67d6573Sopenharmony_ci    /// line should also be set.
1048c67d6573Sopenharmony_ci    ///
1049c67d6573Sopenharmony_ci    /// If matching starts after the beginning of the input, then only start
1050c67d6573Sopenharmony_ci    /// line should be set if the preceding byte is `\n`. End line should never
1051c67d6573Sopenharmony_ci    /// be set in this case. (Even if the following byte is a `\n`, it will
1052c67d6573Sopenharmony_ci    /// be handled in a subsequent DFA state.)
1053c67d6573Sopenharmony_ci    fn follow_epsilons(
1054c67d6573Sopenharmony_ci        &mut self,
1055c67d6573Sopenharmony_ci        ip: InstPtr,
1056c67d6573Sopenharmony_ci        q: &mut SparseSet,
1057c67d6573Sopenharmony_ci        flags: EmptyFlags,
1058c67d6573Sopenharmony_ci    ) {
1059c67d6573Sopenharmony_ci        use crate::prog::EmptyLook::*;
1060c67d6573Sopenharmony_ci        use crate::prog::Inst::*;
1061c67d6573Sopenharmony_ci
1062c67d6573Sopenharmony_ci        // We need to traverse the NFA to follow epsilon transitions, so avoid
1063c67d6573Sopenharmony_ci        // recursion with an explicit stack.
1064c67d6573Sopenharmony_ci        self.cache.stack.push(ip);
1065c67d6573Sopenharmony_ci        while let Some(mut ip) = self.cache.stack.pop() {
1066c67d6573Sopenharmony_ci            // Try to munch through as many states as possible without
1067c67d6573Sopenharmony_ci            // pushes/pops to the stack.
1068c67d6573Sopenharmony_ci            loop {
1069c67d6573Sopenharmony_ci                // Don't visit states we've already added.
1070c67d6573Sopenharmony_ci                if q.contains(ip as usize) {
1071c67d6573Sopenharmony_ci                    break;
1072c67d6573Sopenharmony_ci                }
1073c67d6573Sopenharmony_ci                q.insert(ip as usize);
1074c67d6573Sopenharmony_ci                match self.prog[ip as usize] {
1075c67d6573Sopenharmony_ci                    Char(_) | Ranges(_) => unreachable!(),
1076c67d6573Sopenharmony_ci                    Match(_) | Bytes(_) => {
1077c67d6573Sopenharmony_ci                        break;
1078c67d6573Sopenharmony_ci                    }
1079c67d6573Sopenharmony_ci                    EmptyLook(ref inst) => {
1080c67d6573Sopenharmony_ci                        // Only follow empty assertion states if our flags
1081c67d6573Sopenharmony_ci                        // satisfy the assertion.
1082c67d6573Sopenharmony_ci                        match inst.look {
1083c67d6573Sopenharmony_ci                            StartLine if flags.start_line => {
1084c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1085c67d6573Sopenharmony_ci                            }
1086c67d6573Sopenharmony_ci                            EndLine if flags.end_line => {
1087c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1088c67d6573Sopenharmony_ci                            }
1089c67d6573Sopenharmony_ci                            StartText if flags.start => {
1090c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1091c67d6573Sopenharmony_ci                            }
1092c67d6573Sopenharmony_ci                            EndText if flags.end => {
1093c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1094c67d6573Sopenharmony_ci                            }
1095c67d6573Sopenharmony_ci                            WordBoundaryAscii if flags.word_boundary => {
1096c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1097c67d6573Sopenharmony_ci                            }
1098c67d6573Sopenharmony_ci                            NotWordBoundaryAscii
1099c67d6573Sopenharmony_ci                                if flags.not_word_boundary =>
1100c67d6573Sopenharmony_ci                            {
1101c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1102c67d6573Sopenharmony_ci                            }
1103c67d6573Sopenharmony_ci                            WordBoundary if flags.word_boundary => {
1104c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1105c67d6573Sopenharmony_ci                            }
1106c67d6573Sopenharmony_ci                            NotWordBoundary if flags.not_word_boundary => {
1107c67d6573Sopenharmony_ci                                ip = inst.goto as InstPtr;
1108c67d6573Sopenharmony_ci                            }
1109c67d6573Sopenharmony_ci                            StartLine | EndLine | StartText | EndText
1110c67d6573Sopenharmony_ci                            | WordBoundaryAscii | NotWordBoundaryAscii
1111c67d6573Sopenharmony_ci                            | WordBoundary | NotWordBoundary => {
1112c67d6573Sopenharmony_ci                                break;
1113c67d6573Sopenharmony_ci                            }
1114c67d6573Sopenharmony_ci                        }
1115c67d6573Sopenharmony_ci                    }
1116c67d6573Sopenharmony_ci                    Save(ref inst) => {
1117c67d6573Sopenharmony_ci                        ip = inst.goto as InstPtr;
1118c67d6573Sopenharmony_ci                    }
1119c67d6573Sopenharmony_ci                    Split(ref inst) => {
1120c67d6573Sopenharmony_ci                        self.cache.stack.push(inst.goto2 as InstPtr);
1121c67d6573Sopenharmony_ci                        ip = inst.goto1 as InstPtr;
1122c67d6573Sopenharmony_ci                    }
1123c67d6573Sopenharmony_ci                }
1124c67d6573Sopenharmony_ci            }
1125c67d6573Sopenharmony_ci        }
1126c67d6573Sopenharmony_ci    }
1127c67d6573Sopenharmony_ci
1128c67d6573Sopenharmony_ci    /// Find a previously computed state matching the given set of instructions
1129c67d6573Sopenharmony_ci    /// and is_match bool.
1130c67d6573Sopenharmony_ci    ///
1131c67d6573Sopenharmony_ci    /// The given set of instructions should represent a single state in the
1132c67d6573Sopenharmony_ci    /// NFA along with all states reachable without consuming any input.
1133c67d6573Sopenharmony_ci    ///
1134c67d6573Sopenharmony_ci    /// The is_match bool should be true if and only if the preceding DFA state
1135c67d6573Sopenharmony_ci    /// contains an NFA matching state. The cached state produced here will
1136c67d6573Sopenharmony_ci    /// then signify a match. (This enables us to delay a match by one byte,
1137c67d6573Sopenharmony_ci    /// in order to account for the EOF sentinel byte.)
1138c67d6573Sopenharmony_ci    ///
1139c67d6573Sopenharmony_ci    /// If the cache is full, then it is wiped before caching a new state.
1140c67d6573Sopenharmony_ci    ///
1141c67d6573Sopenharmony_ci    /// The current state should be specified if it exists, since it will need
1142c67d6573Sopenharmony_ci    /// to be preserved if the cache clears itself. (Start states are
1143c67d6573Sopenharmony_ci    /// always saved, so they should not be passed here.) It takes a mutable
1144c67d6573Sopenharmony_ci    /// pointer to the index because if the cache is cleared, the state's
1145c67d6573Sopenharmony_ci    /// location may change.
1146c67d6573Sopenharmony_ci    fn cached_state(
1147c67d6573Sopenharmony_ci        &mut self,
1148c67d6573Sopenharmony_ci        q: &SparseSet,
1149c67d6573Sopenharmony_ci        mut state_flags: StateFlags,
1150c67d6573Sopenharmony_ci        current_state: Option<&mut StatePtr>,
1151c67d6573Sopenharmony_ci    ) -> Option<StatePtr> {
1152c67d6573Sopenharmony_ci        // If we couldn't come up with a non-empty key to represent this state,
1153c67d6573Sopenharmony_ci        // then it is dead and can never lead to a match.
1154c67d6573Sopenharmony_ci        //
1155c67d6573Sopenharmony_ci        // Note that inst_flags represent the set of empty width assertions
1156c67d6573Sopenharmony_ci        // in q. We use this as an optimization in exec_byte to determine when
1157c67d6573Sopenharmony_ci        // we should follow epsilon transitions at the empty string preceding
1158c67d6573Sopenharmony_ci        // the current byte.
1159c67d6573Sopenharmony_ci        let key = match self.cached_state_key(q, &mut state_flags) {
1160c67d6573Sopenharmony_ci            None => return Some(STATE_DEAD),
1161c67d6573Sopenharmony_ci            Some(v) => v,
1162c67d6573Sopenharmony_ci        };
1163c67d6573Sopenharmony_ci        // In the cache? Cool. Done.
1164c67d6573Sopenharmony_ci        if let Some(si) = self.cache.compiled.get_ptr(&key) {
1165c67d6573Sopenharmony_ci            return Some(si);
1166c67d6573Sopenharmony_ci        }
1167c67d6573Sopenharmony_ci        // If the cache has gotten too big, wipe it.
1168c67d6573Sopenharmony_ci        if self.approximate_size() > self.prog.dfa_size_limit
1169c67d6573Sopenharmony_ci            && !self.clear_cache_and_save(current_state)
1170c67d6573Sopenharmony_ci        {
1171c67d6573Sopenharmony_ci            // Ooops. DFA is giving up.
1172c67d6573Sopenharmony_ci            return None;
1173c67d6573Sopenharmony_ci        }
1174c67d6573Sopenharmony_ci        // Allocate room for our state and add it.
1175c67d6573Sopenharmony_ci        self.add_state(key)
1176c67d6573Sopenharmony_ci    }
1177c67d6573Sopenharmony_ci
1178c67d6573Sopenharmony_ci    /// Produces a key suitable for describing a state in the DFA cache.
1179c67d6573Sopenharmony_ci    ///
1180c67d6573Sopenharmony_ci    /// The key invariant here is that equivalent keys are produced for any two
1181c67d6573Sopenharmony_ci    /// sets of ordered NFA states (and toggling of whether the previous NFA
1182c67d6573Sopenharmony_ci    /// states contain a match state) that do not discriminate a match for any
1183c67d6573Sopenharmony_ci    /// input.
1184c67d6573Sopenharmony_ci    ///
1185c67d6573Sopenharmony_ci    /// Specifically, q should be an ordered set of NFA states and is_match
1186c67d6573Sopenharmony_ci    /// should be true if and only if the previous NFA states contained a match
1187c67d6573Sopenharmony_ci    /// state.
1188c67d6573Sopenharmony_ci    fn cached_state_key(
1189c67d6573Sopenharmony_ci        &mut self,
1190c67d6573Sopenharmony_ci        q: &SparseSet,
1191c67d6573Sopenharmony_ci        state_flags: &mut StateFlags,
1192c67d6573Sopenharmony_ci    ) -> Option<State> {
1193c67d6573Sopenharmony_ci        use crate::prog::Inst::*;
1194c67d6573Sopenharmony_ci
1195c67d6573Sopenharmony_ci        // We need to build up enough information to recognize pre-built states
1196c67d6573Sopenharmony_ci        // in the DFA. Generally speaking, this includes every instruction
1197c67d6573Sopenharmony_ci        // except for those which are purely epsilon transitions, e.g., the
1198c67d6573Sopenharmony_ci        // Save and Split instructions.
1199c67d6573Sopenharmony_ci        //
1200c67d6573Sopenharmony_ci        // Empty width assertions are also epsilon transitions, but since they
1201c67d6573Sopenharmony_ci        // are conditional, we need to make them part of a state's key in the
1202c67d6573Sopenharmony_ci        // cache.
1203c67d6573Sopenharmony_ci
1204c67d6573Sopenharmony_ci        let mut insts =
1205c67d6573Sopenharmony_ci            mem::replace(&mut self.cache.insts_scratch_space, vec![]);
1206c67d6573Sopenharmony_ci        insts.clear();
1207c67d6573Sopenharmony_ci        // Reserve 1 byte for flags.
1208c67d6573Sopenharmony_ci        insts.push(0);
1209c67d6573Sopenharmony_ci
1210c67d6573Sopenharmony_ci        let mut prev = 0;
1211c67d6573Sopenharmony_ci        for &ip in q {
1212c67d6573Sopenharmony_ci            let ip = usize_to_u32(ip);
1213c67d6573Sopenharmony_ci            match self.prog[ip as usize] {
1214c67d6573Sopenharmony_ci                Char(_) | Ranges(_) => unreachable!(),
1215c67d6573Sopenharmony_ci                Save(_) | Split(_) => {}
1216c67d6573Sopenharmony_ci                Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
1217c67d6573Sopenharmony_ci                EmptyLook(_) => {
1218c67d6573Sopenharmony_ci                    state_flags.set_empty();
1219c67d6573Sopenharmony_ci                    push_inst_ptr(&mut insts, &mut prev, ip)
1220c67d6573Sopenharmony_ci                }
1221c67d6573Sopenharmony_ci                Match(_) => {
1222c67d6573Sopenharmony_ci                    push_inst_ptr(&mut insts, &mut prev, ip);
1223c67d6573Sopenharmony_ci                    if !self.continue_past_first_match() {
1224c67d6573Sopenharmony_ci                        break;
1225c67d6573Sopenharmony_ci                    }
1226c67d6573Sopenharmony_ci                }
1227c67d6573Sopenharmony_ci            }
1228c67d6573Sopenharmony_ci        }
1229c67d6573Sopenharmony_ci        // If we couldn't transition to any other instructions and we didn't
1230c67d6573Sopenharmony_ci        // see a match when expanding NFA states previously, then this is a
1231c67d6573Sopenharmony_ci        // dead state and no amount of additional input can transition out
1232c67d6573Sopenharmony_ci        // of this state.
1233c67d6573Sopenharmony_ci        let opt_state = if insts.len() == 1 && !state_flags.is_match() {
1234c67d6573Sopenharmony_ci            None
1235c67d6573Sopenharmony_ci        } else {
1236c67d6573Sopenharmony_ci            let StateFlags(f) = *state_flags;
1237c67d6573Sopenharmony_ci            insts[0] = f;
1238c67d6573Sopenharmony_ci            Some(State { data: Arc::from(&*insts) })
1239c67d6573Sopenharmony_ci        };
1240c67d6573Sopenharmony_ci        self.cache.insts_scratch_space = insts;
1241c67d6573Sopenharmony_ci        opt_state
1242c67d6573Sopenharmony_ci    }
1243c67d6573Sopenharmony_ci
1244c67d6573Sopenharmony_ci    /// Clears the cache, but saves and restores current_state if it is not
1245c67d6573Sopenharmony_ci    /// none.
1246c67d6573Sopenharmony_ci    ///
1247c67d6573Sopenharmony_ci    /// The current state must be provided here in case its location in the
1248c67d6573Sopenharmony_ci    /// cache changes.
1249c67d6573Sopenharmony_ci    ///
1250c67d6573Sopenharmony_ci    /// This returns false if the cache is not cleared and the DFA should
1251c67d6573Sopenharmony_ci    /// give up.
1252c67d6573Sopenharmony_ci    fn clear_cache_and_save(
1253c67d6573Sopenharmony_ci        &mut self,
1254c67d6573Sopenharmony_ci        current_state: Option<&mut StatePtr>,
1255c67d6573Sopenharmony_ci    ) -> bool {
1256c67d6573Sopenharmony_ci        if self.cache.compiled.is_empty() {
1257c67d6573Sopenharmony_ci            // Nothing to clear...
1258c67d6573Sopenharmony_ci            return true;
1259c67d6573Sopenharmony_ci        }
1260c67d6573Sopenharmony_ci        match current_state {
1261c67d6573Sopenharmony_ci            None => self.clear_cache(),
1262c67d6573Sopenharmony_ci            Some(si) => {
1263c67d6573Sopenharmony_ci                let cur = self.state(*si).clone();
1264c67d6573Sopenharmony_ci                if !self.clear_cache() {
1265c67d6573Sopenharmony_ci                    return false;
1266c67d6573Sopenharmony_ci                }
1267c67d6573Sopenharmony_ci                // The unwrap is OK because we just cleared the cache and
1268c67d6573Sopenharmony_ci                // therefore know that the next state pointer won't exceed
1269c67d6573Sopenharmony_ci                // STATE_MAX.
1270c67d6573Sopenharmony_ci                *si = self.restore_state(cur).unwrap();
1271c67d6573Sopenharmony_ci                true
1272c67d6573Sopenharmony_ci            }
1273c67d6573Sopenharmony_ci        }
1274c67d6573Sopenharmony_ci    }
1275c67d6573Sopenharmony_ci
1276c67d6573Sopenharmony_ci    /// Wipes the state cache, but saves and restores the current start state.
1277c67d6573Sopenharmony_ci    ///
1278c67d6573Sopenharmony_ci    /// This returns false if the cache is not cleared and the DFA should
1279c67d6573Sopenharmony_ci    /// give up.
1280c67d6573Sopenharmony_ci    fn clear_cache(&mut self) -> bool {
1281c67d6573Sopenharmony_ci        // Bail out of the DFA if we're moving too "slowly."
1282c67d6573Sopenharmony_ci        // A heuristic from RE2: assume the DFA is too slow if it is processing
1283c67d6573Sopenharmony_ci        // 10 or fewer bytes per state.
1284c67d6573Sopenharmony_ci        // Additionally, we permit the cache to be flushed a few times before
1285c67d6573Sopenharmony_ci        // caling it quits.
1286c67d6573Sopenharmony_ci        let nstates = self.cache.compiled.len();
1287c67d6573Sopenharmony_ci        if self.cache.flush_count >= 3
1288c67d6573Sopenharmony_ci            && self.at >= self.last_cache_flush
1289c67d6573Sopenharmony_ci            && (self.at - self.last_cache_flush) <= 10 * nstates
1290c67d6573Sopenharmony_ci        {
1291c67d6573Sopenharmony_ci            return false;
1292c67d6573Sopenharmony_ci        }
1293c67d6573Sopenharmony_ci        // Update statistics tracking cache flushes.
1294c67d6573Sopenharmony_ci        self.last_cache_flush = self.at;
1295c67d6573Sopenharmony_ci        self.cache.flush_count += 1;
1296c67d6573Sopenharmony_ci
1297c67d6573Sopenharmony_ci        // OK, actually flush the cache.
1298c67d6573Sopenharmony_ci        let start = self.state(self.start & !STATE_START).clone();
1299c67d6573Sopenharmony_ci        let last_match = if self.last_match_si <= STATE_MAX {
1300c67d6573Sopenharmony_ci            Some(self.state(self.last_match_si).clone())
1301c67d6573Sopenharmony_ci        } else {
1302c67d6573Sopenharmony_ci            None
1303c67d6573Sopenharmony_ci        };
1304c67d6573Sopenharmony_ci        self.cache.reset_size();
1305c67d6573Sopenharmony_ci        self.cache.trans.clear();
1306c67d6573Sopenharmony_ci        self.cache.compiled.clear();
1307c67d6573Sopenharmony_ci        for s in &mut self.cache.start_states {
1308c67d6573Sopenharmony_ci            *s = STATE_UNKNOWN;
1309c67d6573Sopenharmony_ci        }
1310c67d6573Sopenharmony_ci        // The unwraps are OK because we just cleared the cache and therefore
1311c67d6573Sopenharmony_ci        // know that the next state pointer won't exceed STATE_MAX.
1312c67d6573Sopenharmony_ci        let start_ptr = self.restore_state(start).unwrap();
1313c67d6573Sopenharmony_ci        self.start = self.start_ptr(start_ptr);
1314c67d6573Sopenharmony_ci        if let Some(last_match) = last_match {
1315c67d6573Sopenharmony_ci            self.last_match_si = self.restore_state(last_match).unwrap();
1316c67d6573Sopenharmony_ci        }
1317c67d6573Sopenharmony_ci        true
1318c67d6573Sopenharmony_ci    }
1319c67d6573Sopenharmony_ci
1320c67d6573Sopenharmony_ci    /// Restores the given state back into the cache, and returns a pointer
1321c67d6573Sopenharmony_ci    /// to it.
1322c67d6573Sopenharmony_ci    fn restore_state(&mut self, state: State) -> Option<StatePtr> {
1323c67d6573Sopenharmony_ci        // If we've already stored this state, just return a pointer to it.
1324c67d6573Sopenharmony_ci        // None will be the wiser.
1325c67d6573Sopenharmony_ci        if let Some(si) = self.cache.compiled.get_ptr(&state) {
1326c67d6573Sopenharmony_ci            return Some(si);
1327c67d6573Sopenharmony_ci        }
1328c67d6573Sopenharmony_ci        self.add_state(state)
1329c67d6573Sopenharmony_ci    }
1330c67d6573Sopenharmony_ci
1331c67d6573Sopenharmony_ci    /// Returns the next state given the current state si and current byte
1332c67d6573Sopenharmony_ci    /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
1333c67d6573Sopenharmony_ci    /// states.
1334c67d6573Sopenharmony_ci    ///
1335c67d6573Sopenharmony_ci    /// This tries to fetch the next state from the cache, but if that fails,
1336c67d6573Sopenharmony_ci    /// it computes the next state, caches it and returns a pointer to it.
1337c67d6573Sopenharmony_ci    ///
1338c67d6573Sopenharmony_ci    /// The pointer can be to a real state, or it can be STATE_DEAD.
1339c67d6573Sopenharmony_ci    /// STATE_UNKNOWN cannot be returned.
1340c67d6573Sopenharmony_ci    ///
1341c67d6573Sopenharmony_ci    /// None is returned if a new state could not be allocated (i.e., the DFA
1342c67d6573Sopenharmony_ci    /// ran out of space and thinks it's running too slowly).
1343c67d6573Sopenharmony_ci    fn next_state(
1344c67d6573Sopenharmony_ci        &mut self,
1345c67d6573Sopenharmony_ci        qcur: &mut SparseSet,
1346c67d6573Sopenharmony_ci        qnext: &mut SparseSet,
1347c67d6573Sopenharmony_ci        si: StatePtr,
1348c67d6573Sopenharmony_ci        b: Byte,
1349c67d6573Sopenharmony_ci    ) -> Option<StatePtr> {
1350c67d6573Sopenharmony_ci        if si == STATE_DEAD {
1351c67d6573Sopenharmony_ci            return Some(STATE_DEAD);
1352c67d6573Sopenharmony_ci        }
1353c67d6573Sopenharmony_ci        match self.cache.trans.next(si, self.byte_class(b)) {
1354c67d6573Sopenharmony_ci            STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
1355c67d6573Sopenharmony_ci            STATE_QUIT => None,
1356c67d6573Sopenharmony_ci            nsi => Some(nsi),
1357c67d6573Sopenharmony_ci        }
1358c67d6573Sopenharmony_ci    }
1359c67d6573Sopenharmony_ci
1360c67d6573Sopenharmony_ci    /// Computes and returns the start state, where searching begins at
1361c67d6573Sopenharmony_ci    /// position `at` in `text`. If the state has already been computed,
1362c67d6573Sopenharmony_ci    /// then it is pulled from the cache. If the state hasn't been cached,
1363c67d6573Sopenharmony_ci    /// then it is computed, cached and a pointer to it is returned.
1364c67d6573Sopenharmony_ci    ///
1365c67d6573Sopenharmony_ci    /// This may return STATE_DEAD but never STATE_UNKNOWN.
1366c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
1367c67d6573Sopenharmony_ci    fn start_state(
1368c67d6573Sopenharmony_ci        &mut self,
1369c67d6573Sopenharmony_ci        q: &mut SparseSet,
1370c67d6573Sopenharmony_ci        empty_flags: EmptyFlags,
1371c67d6573Sopenharmony_ci        state_flags: StateFlags,
1372c67d6573Sopenharmony_ci    ) -> Option<StatePtr> {
1373c67d6573Sopenharmony_ci        // Compute an index into our cache of start states based on the set
1374c67d6573Sopenharmony_ci        // of empty/state flags set at the current position in the input. We
1375c67d6573Sopenharmony_ci        // don't use every flag since not all flags matter. For example, since
1376c67d6573Sopenharmony_ci        // matches are delayed by one byte, start states can never be match
1377c67d6573Sopenharmony_ci        // states.
1378c67d6573Sopenharmony_ci        let flagi = {
1379c67d6573Sopenharmony_ci            (((empty_flags.start as u8) << 0)
1380c67d6573Sopenharmony_ci                | ((empty_flags.end as u8) << 1)
1381c67d6573Sopenharmony_ci                | ((empty_flags.start_line as u8) << 2)
1382c67d6573Sopenharmony_ci                | ((empty_flags.end_line as u8) << 3)
1383c67d6573Sopenharmony_ci                | ((empty_flags.word_boundary as u8) << 4)
1384c67d6573Sopenharmony_ci                | ((empty_flags.not_word_boundary as u8) << 5)
1385c67d6573Sopenharmony_ci                | ((state_flags.is_word() as u8) << 6)) as usize
1386c67d6573Sopenharmony_ci        };
1387c67d6573Sopenharmony_ci        match self.cache.start_states[flagi] {
1388c67d6573Sopenharmony_ci            STATE_UNKNOWN => {}
1389c67d6573Sopenharmony_ci            si => return Some(si),
1390c67d6573Sopenharmony_ci        }
1391c67d6573Sopenharmony_ci        q.clear();
1392c67d6573Sopenharmony_ci        let start = usize_to_u32(self.prog.start);
1393c67d6573Sopenharmony_ci        self.follow_epsilons(start, q, empty_flags);
1394c67d6573Sopenharmony_ci        // Start states can never be match states because we delay every match
1395c67d6573Sopenharmony_ci        // by one byte. Given an empty string and an empty match, the match
1396c67d6573Sopenharmony_ci        // won't actually occur until the DFA processes the special EOF
1397c67d6573Sopenharmony_ci        // sentinel byte.
1398c67d6573Sopenharmony_ci        let sp = match self.cached_state(q, state_flags, None) {
1399c67d6573Sopenharmony_ci            None => return None,
1400c67d6573Sopenharmony_ci            Some(sp) => self.start_ptr(sp),
1401c67d6573Sopenharmony_ci        };
1402c67d6573Sopenharmony_ci        self.cache.start_states[flagi] = sp;
1403c67d6573Sopenharmony_ci        Some(sp)
1404c67d6573Sopenharmony_ci    }
1405c67d6573Sopenharmony_ci
1406c67d6573Sopenharmony_ci    /// Computes the set of starting flags for the given position in text.
1407c67d6573Sopenharmony_ci    ///
1408c67d6573Sopenharmony_ci    /// This should only be used when executing the DFA forwards over the
1409c67d6573Sopenharmony_ci    /// input.
1410c67d6573Sopenharmony_ci    fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
1411c67d6573Sopenharmony_ci        let mut empty_flags = EmptyFlags::default();
1412c67d6573Sopenharmony_ci        let mut state_flags = StateFlags::default();
1413c67d6573Sopenharmony_ci        empty_flags.start = at == 0;
1414c67d6573Sopenharmony_ci        empty_flags.end = text.is_empty();
1415c67d6573Sopenharmony_ci        empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
1416c67d6573Sopenharmony_ci        empty_flags.end_line = text.is_empty();
1417c67d6573Sopenharmony_ci
1418c67d6573Sopenharmony_ci        let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1419c67d6573Sopenharmony_ci        let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
1420c67d6573Sopenharmony_ci        if is_word_last {
1421c67d6573Sopenharmony_ci            state_flags.set_word();
1422c67d6573Sopenharmony_ci        }
1423c67d6573Sopenharmony_ci        if is_word == is_word_last {
1424c67d6573Sopenharmony_ci            empty_flags.not_word_boundary = true;
1425c67d6573Sopenharmony_ci        } else {
1426c67d6573Sopenharmony_ci            empty_flags.word_boundary = true;
1427c67d6573Sopenharmony_ci        }
1428c67d6573Sopenharmony_ci        (empty_flags, state_flags)
1429c67d6573Sopenharmony_ci    }
1430c67d6573Sopenharmony_ci
1431c67d6573Sopenharmony_ci    /// Computes the set of starting flags for the given position in text.
1432c67d6573Sopenharmony_ci    ///
1433c67d6573Sopenharmony_ci    /// This should only be used when executing the DFA in reverse over the
1434c67d6573Sopenharmony_ci    /// input.
1435c67d6573Sopenharmony_ci    fn start_flags_reverse(
1436c67d6573Sopenharmony_ci        &self,
1437c67d6573Sopenharmony_ci        text: &[u8],
1438c67d6573Sopenharmony_ci        at: usize,
1439c67d6573Sopenharmony_ci    ) -> (EmptyFlags, StateFlags) {
1440c67d6573Sopenharmony_ci        let mut empty_flags = EmptyFlags::default();
1441c67d6573Sopenharmony_ci        let mut state_flags = StateFlags::default();
1442c67d6573Sopenharmony_ci        empty_flags.start = at == text.len();
1443c67d6573Sopenharmony_ci        empty_flags.end = text.is_empty();
1444c67d6573Sopenharmony_ci        empty_flags.start_line = at == text.len() || text[at] == b'\n';
1445c67d6573Sopenharmony_ci        empty_flags.end_line = text.is_empty();
1446c67d6573Sopenharmony_ci
1447c67d6573Sopenharmony_ci        let is_word_last =
1448c67d6573Sopenharmony_ci            at < text.len() && Byte::byte(text[at]).is_ascii_word();
1449c67d6573Sopenharmony_ci        let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1450c67d6573Sopenharmony_ci        if is_word_last {
1451c67d6573Sopenharmony_ci            state_flags.set_word();
1452c67d6573Sopenharmony_ci        }
1453c67d6573Sopenharmony_ci        if is_word == is_word_last {
1454c67d6573Sopenharmony_ci            empty_flags.not_word_boundary = true;
1455c67d6573Sopenharmony_ci        } else {
1456c67d6573Sopenharmony_ci            empty_flags.word_boundary = true;
1457c67d6573Sopenharmony_ci        }
1458c67d6573Sopenharmony_ci        (empty_flags, state_flags)
1459c67d6573Sopenharmony_ci    }
1460c67d6573Sopenharmony_ci
1461c67d6573Sopenharmony_ci    /// Returns a reference to a State given a pointer to it.
1462c67d6573Sopenharmony_ci    fn state(&self, si: StatePtr) -> &State {
1463c67d6573Sopenharmony_ci        self.cache.compiled.get_state(si).unwrap()
1464c67d6573Sopenharmony_ci    }
1465c67d6573Sopenharmony_ci
1466c67d6573Sopenharmony_ci    /// Adds the given state to the DFA.
1467c67d6573Sopenharmony_ci    ///
1468c67d6573Sopenharmony_ci    /// This allocates room for transitions out of this state in
1469c67d6573Sopenharmony_ci    /// self.cache.trans. The transitions can be set with the returned
1470c67d6573Sopenharmony_ci    /// StatePtr.
1471c67d6573Sopenharmony_ci    ///
1472c67d6573Sopenharmony_ci    /// If None is returned, then the state limit was reached and the DFA
1473c67d6573Sopenharmony_ci    /// should quit.
1474c67d6573Sopenharmony_ci    fn add_state(&mut self, state: State) -> Option<StatePtr> {
1475c67d6573Sopenharmony_ci        // This will fail if the next state pointer exceeds STATE_PTR. In
1476c67d6573Sopenharmony_ci        // practice, the cache limit will prevent us from ever getting here,
1477c67d6573Sopenharmony_ci        // but maybe callers will set the cache size to something ridiculous...
1478c67d6573Sopenharmony_ci        let si = match self.cache.trans.add() {
1479c67d6573Sopenharmony_ci            None => return None,
1480c67d6573Sopenharmony_ci            Some(si) => si,
1481c67d6573Sopenharmony_ci        };
1482c67d6573Sopenharmony_ci        // If the program has a Unicode word boundary, then set any transitions
1483c67d6573Sopenharmony_ci        // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a
1484c67d6573Sopenharmony_ci        // transition, then it will quit and an alternative matching engine
1485c67d6573Sopenharmony_ci        // will take over.
1486c67d6573Sopenharmony_ci        if self.prog.has_unicode_word_boundary {
1487c67d6573Sopenharmony_ci            for b in 128..256 {
1488c67d6573Sopenharmony_ci                let cls = self.byte_class(Byte::byte(b as u8));
1489c67d6573Sopenharmony_ci                self.cache.trans.set_next(si, cls, STATE_QUIT);
1490c67d6573Sopenharmony_ci            }
1491c67d6573Sopenharmony_ci        }
1492c67d6573Sopenharmony_ci        // Finally, put our actual state on to our heap of states and index it
1493c67d6573Sopenharmony_ci        // so we can find it later.
1494c67d6573Sopenharmony_ci        self.cache.size += self.cache.trans.state_heap_size()
1495c67d6573Sopenharmony_ci            + state.data.len()
1496c67d6573Sopenharmony_ci            + (2 * mem::size_of::<State>())
1497c67d6573Sopenharmony_ci            + mem::size_of::<StatePtr>();
1498c67d6573Sopenharmony_ci        self.cache.compiled.insert(state, si);
1499c67d6573Sopenharmony_ci        // Transition table and set of states and map should all be in sync.
1500c67d6573Sopenharmony_ci        debug_assert!(
1501c67d6573Sopenharmony_ci            self.cache.compiled.len() == self.cache.trans.num_states()
1502c67d6573Sopenharmony_ci        );
1503c67d6573Sopenharmony_ci        Some(si)
1504c67d6573Sopenharmony_ci    }
1505c67d6573Sopenharmony_ci
1506c67d6573Sopenharmony_ci    /// Quickly finds the next occurrence of any literal prefixes in the regex.
1507c67d6573Sopenharmony_ci    /// If there are no literal prefixes, then the current position is
1508c67d6573Sopenharmony_ci    /// returned. If there are literal prefixes and one could not be found,
1509c67d6573Sopenharmony_ci    /// then None is returned.
1510c67d6573Sopenharmony_ci    ///
1511c67d6573Sopenharmony_ci    /// This should only be called when the DFA is in a start state.
1512c67d6573Sopenharmony_ci    fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
1513c67d6573Sopenharmony_ci        self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
1514c67d6573Sopenharmony_ci    }
1515c67d6573Sopenharmony_ci
1516c67d6573Sopenharmony_ci    /// Returns the number of byte classes required to discriminate transitions
1517c67d6573Sopenharmony_ci    /// in each state.
1518c67d6573Sopenharmony_ci    ///
1519c67d6573Sopenharmony_ci    /// invariant: num_byte_classes() == len(State.next)
1520c67d6573Sopenharmony_ci    fn num_byte_classes(&self) -> usize {
1521c67d6573Sopenharmony_ci        // We add 1 to account for the special EOF byte.
1522c67d6573Sopenharmony_ci        (self.prog.byte_classes[255] as usize + 1) + 1
1523c67d6573Sopenharmony_ci    }
1524c67d6573Sopenharmony_ci
1525c67d6573Sopenharmony_ci    /// Given an input byte or the special EOF sentinel, return its
1526c67d6573Sopenharmony_ci    /// corresponding byte class.
1527c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
1528c67d6573Sopenharmony_ci    fn byte_class(&self, b: Byte) -> usize {
1529c67d6573Sopenharmony_ci        match b.as_byte() {
1530c67d6573Sopenharmony_ci            None => self.num_byte_classes() - 1,
1531c67d6573Sopenharmony_ci            Some(b) => self.u8_class(b),
1532c67d6573Sopenharmony_ci        }
1533c67d6573Sopenharmony_ci    }
1534c67d6573Sopenharmony_ci
1535c67d6573Sopenharmony_ci    /// Like byte_class, but explicitly for u8s.
1536c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
1537c67d6573Sopenharmony_ci    fn u8_class(&self, b: u8) -> usize {
1538c67d6573Sopenharmony_ci        self.prog.byte_classes[b as usize] as usize
1539c67d6573Sopenharmony_ci    }
1540c67d6573Sopenharmony_ci
1541c67d6573Sopenharmony_ci    /// Returns true if the DFA should continue searching past the first match.
1542c67d6573Sopenharmony_ci    ///
1543c67d6573Sopenharmony_ci    /// Leftmost first semantics in the DFA are preserved by not following NFA
1544c67d6573Sopenharmony_ci    /// transitions after the first match is seen.
1545c67d6573Sopenharmony_ci    ///
1546c67d6573Sopenharmony_ci    /// On occasion, we want to avoid leftmost first semantics to find either
1547c67d6573Sopenharmony_ci    /// the longest match (for reverse search) or all possible matches (for
1548c67d6573Sopenharmony_ci    /// regex sets).
1549c67d6573Sopenharmony_ci    fn continue_past_first_match(&self) -> bool {
1550c67d6573Sopenharmony_ci        self.prog.is_reverse || self.prog.matches.len() > 1
1551c67d6573Sopenharmony_ci    }
1552c67d6573Sopenharmony_ci
1553c67d6573Sopenharmony_ci    /// Returns true if there is a prefix we can quickly search for.
1554c67d6573Sopenharmony_ci    fn has_prefix(&self) -> bool {
1555c67d6573Sopenharmony_ci        !self.prog.is_reverse
1556c67d6573Sopenharmony_ci            && !self.prog.prefixes.is_empty()
1557c67d6573Sopenharmony_ci            && !self.prog.is_anchored_start
1558c67d6573Sopenharmony_ci    }
1559c67d6573Sopenharmony_ci
1560c67d6573Sopenharmony_ci    /// Sets the STATE_START bit in the given state pointer if and only if
1561c67d6573Sopenharmony_ci    /// we have a prefix to scan for.
1562c67d6573Sopenharmony_ci    ///
1563c67d6573Sopenharmony_ci    /// If there's no prefix, then it's a waste to treat the start state
1564c67d6573Sopenharmony_ci    /// specially.
1565c67d6573Sopenharmony_ci    fn start_ptr(&self, si: StatePtr) -> StatePtr {
1566c67d6573Sopenharmony_ci        if self.has_prefix() {
1567c67d6573Sopenharmony_ci            si | STATE_START
1568c67d6573Sopenharmony_ci        } else {
1569c67d6573Sopenharmony_ci            si
1570c67d6573Sopenharmony_ci        }
1571c67d6573Sopenharmony_ci    }
1572c67d6573Sopenharmony_ci
1573c67d6573Sopenharmony_ci    /// Approximate size returns the approximate heap space currently used by
1574c67d6573Sopenharmony_ci    /// the DFA. It is used to determine whether the DFA's state cache needs to
1575c67d6573Sopenharmony_ci    /// be wiped. Namely, it is possible that for certain regexes on certain
1576c67d6573Sopenharmony_ci    /// inputs, a new state could be created for every byte of input. (This is
1577c67d6573Sopenharmony_ci    /// bad for memory use, so we bound it with a cache.)
1578c67d6573Sopenharmony_ci    fn approximate_size(&self) -> usize {
1579c67d6573Sopenharmony_ci        self.cache.size + self.prog.approximate_size()
1580c67d6573Sopenharmony_ci    }
1581c67d6573Sopenharmony_ci}
1582c67d6573Sopenharmony_ci
1583c67d6573Sopenharmony_ci/// An abstraction for representing a map of states. The map supports two
1584c67d6573Sopenharmony_ci/// different ways of state lookup. One is fast constant time access via a
1585c67d6573Sopenharmony_ci/// state pointer. The other is a hashmap lookup based on the DFA's
1586c67d6573Sopenharmony_ci/// constituent NFA states.
1587c67d6573Sopenharmony_ci///
1588c67d6573Sopenharmony_ci/// A DFA state internally uses an Arc such that we only need to store the
1589c67d6573Sopenharmony_ci/// set of NFA states on the heap once, even though we support looking up
1590c67d6573Sopenharmony_ci/// states by two different means. A more natural way to express this might
1591c67d6573Sopenharmony_ci/// use raw pointers, but an Arc is safe and effectively achieves the same
1592c67d6573Sopenharmony_ci/// thing.
1593c67d6573Sopenharmony_ci#[derive(Debug)]
1594c67d6573Sopenharmony_cistruct StateMap {
1595c67d6573Sopenharmony_ci    /// The keys are not actually static but rely on always pointing to a
1596c67d6573Sopenharmony_ci    /// buffer in `states` which will never be moved except when clearing
1597c67d6573Sopenharmony_ci    /// the map or on drop, in which case the keys of this map will be
1598c67d6573Sopenharmony_ci    /// removed before
1599c67d6573Sopenharmony_ci    map: HashMap<State, StatePtr>,
1600c67d6573Sopenharmony_ci    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
1601c67d6573Sopenharmony_ci    /// this Vec rather than just a `StatePtr`.
1602c67d6573Sopenharmony_ci    states: Vec<State>,
1603c67d6573Sopenharmony_ci    /// The number of byte classes in the DFA. Used to index `states`.
1604c67d6573Sopenharmony_ci    num_byte_classes: usize,
1605c67d6573Sopenharmony_ci}
1606c67d6573Sopenharmony_ci
1607c67d6573Sopenharmony_ciimpl StateMap {
1608c67d6573Sopenharmony_ci    fn new(num_byte_classes: usize) -> StateMap {
1609c67d6573Sopenharmony_ci        StateMap { map: HashMap::new(), states: vec![], num_byte_classes }
1610c67d6573Sopenharmony_ci    }
1611c67d6573Sopenharmony_ci
1612c67d6573Sopenharmony_ci    fn len(&self) -> usize {
1613c67d6573Sopenharmony_ci        self.states.len()
1614c67d6573Sopenharmony_ci    }
1615c67d6573Sopenharmony_ci
1616c67d6573Sopenharmony_ci    fn is_empty(&self) -> bool {
1617c67d6573Sopenharmony_ci        self.states.is_empty()
1618c67d6573Sopenharmony_ci    }
1619c67d6573Sopenharmony_ci
1620c67d6573Sopenharmony_ci    fn get_ptr(&self, state: &State) -> Option<StatePtr> {
1621c67d6573Sopenharmony_ci        self.map.get(state).cloned()
1622c67d6573Sopenharmony_ci    }
1623c67d6573Sopenharmony_ci
1624c67d6573Sopenharmony_ci    fn get_state(&self, si: StatePtr) -> Option<&State> {
1625c67d6573Sopenharmony_ci        self.states.get(si as usize / self.num_byte_classes)
1626c67d6573Sopenharmony_ci    }
1627c67d6573Sopenharmony_ci
1628c67d6573Sopenharmony_ci    fn insert(&mut self, state: State, si: StatePtr) {
1629c67d6573Sopenharmony_ci        self.map.insert(state.clone(), si);
1630c67d6573Sopenharmony_ci        self.states.push(state);
1631c67d6573Sopenharmony_ci    }
1632c67d6573Sopenharmony_ci
1633c67d6573Sopenharmony_ci    fn clear(&mut self) {
1634c67d6573Sopenharmony_ci        self.map.clear();
1635c67d6573Sopenharmony_ci        self.states.clear();
1636c67d6573Sopenharmony_ci    }
1637c67d6573Sopenharmony_ci}
1638c67d6573Sopenharmony_ci
1639c67d6573Sopenharmony_ciimpl Transitions {
1640c67d6573Sopenharmony_ci    /// Create a new transition table.
1641c67d6573Sopenharmony_ci    ///
1642c67d6573Sopenharmony_ci    /// The number of byte classes corresponds to the stride. Every state will
1643c67d6573Sopenharmony_ci    /// have `num_byte_classes` slots for transitions.
1644c67d6573Sopenharmony_ci    fn new(num_byte_classes: usize) -> Transitions {
1645c67d6573Sopenharmony_ci        Transitions { table: vec![], num_byte_classes }
1646c67d6573Sopenharmony_ci    }
1647c67d6573Sopenharmony_ci
1648c67d6573Sopenharmony_ci    /// Returns the total number of states currently in this table.
1649c67d6573Sopenharmony_ci    fn num_states(&self) -> usize {
1650c67d6573Sopenharmony_ci        self.table.len() / self.num_byte_classes
1651c67d6573Sopenharmony_ci    }
1652c67d6573Sopenharmony_ci
1653c67d6573Sopenharmony_ci    /// Allocates room for one additional state and returns a pointer to it.
1654c67d6573Sopenharmony_ci    ///
1655c67d6573Sopenharmony_ci    /// If there's no more room, None is returned.
1656c67d6573Sopenharmony_ci    fn add(&mut self) -> Option<StatePtr> {
1657c67d6573Sopenharmony_ci        let si = self.table.len();
1658c67d6573Sopenharmony_ci        if si > STATE_MAX as usize {
1659c67d6573Sopenharmony_ci            return None;
1660c67d6573Sopenharmony_ci        }
1661c67d6573Sopenharmony_ci        self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
1662c67d6573Sopenharmony_ci        Some(usize_to_u32(si))
1663c67d6573Sopenharmony_ci    }
1664c67d6573Sopenharmony_ci
1665c67d6573Sopenharmony_ci    /// Clears the table of all states.
1666c67d6573Sopenharmony_ci    fn clear(&mut self) {
1667c67d6573Sopenharmony_ci        self.table.clear();
1668c67d6573Sopenharmony_ci    }
1669c67d6573Sopenharmony_ci
1670c67d6573Sopenharmony_ci    /// Sets the transition from (si, cls) to next.
1671c67d6573Sopenharmony_ci    fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
1672c67d6573Sopenharmony_ci        self.table[si as usize + cls] = next;
1673c67d6573Sopenharmony_ci    }
1674c67d6573Sopenharmony_ci
1675c67d6573Sopenharmony_ci    /// Returns the transition corresponding to (si, cls).
1676c67d6573Sopenharmony_ci    fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
1677c67d6573Sopenharmony_ci        self.table[si as usize + cls]
1678c67d6573Sopenharmony_ci    }
1679c67d6573Sopenharmony_ci
1680c67d6573Sopenharmony_ci    /// The heap size, in bytes, of a single state in the transition table.
1681c67d6573Sopenharmony_ci    fn state_heap_size(&self) -> usize {
1682c67d6573Sopenharmony_ci        self.num_byte_classes * mem::size_of::<StatePtr>()
1683c67d6573Sopenharmony_ci    }
1684c67d6573Sopenharmony_ci
1685c67d6573Sopenharmony_ci    /// Like `next`, but uses unchecked access and is therefore not safe.
1686c67d6573Sopenharmony_ci    unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
1687c67d6573Sopenharmony_ci        debug_assert!((si as usize) < self.table.len());
1688c67d6573Sopenharmony_ci        debug_assert!(cls < self.num_byte_classes);
1689c67d6573Sopenharmony_ci        *self.table.get_unchecked(si as usize + cls)
1690c67d6573Sopenharmony_ci    }
1691c67d6573Sopenharmony_ci}
1692c67d6573Sopenharmony_ci
1693c67d6573Sopenharmony_ciimpl StateFlags {
1694c67d6573Sopenharmony_ci    fn is_match(&self) -> bool {
1695c67d6573Sopenharmony_ci        self.0 & 0b0000_0001 > 0
1696c67d6573Sopenharmony_ci    }
1697c67d6573Sopenharmony_ci
1698c67d6573Sopenharmony_ci    fn set_match(&mut self) {
1699c67d6573Sopenharmony_ci        self.0 |= 0b0000_0001;
1700c67d6573Sopenharmony_ci    }
1701c67d6573Sopenharmony_ci
1702c67d6573Sopenharmony_ci    fn is_word(&self) -> bool {
1703c67d6573Sopenharmony_ci        self.0 & 0b0000_0010 > 0
1704c67d6573Sopenharmony_ci    }
1705c67d6573Sopenharmony_ci
1706c67d6573Sopenharmony_ci    fn set_word(&mut self) {
1707c67d6573Sopenharmony_ci        self.0 |= 0b0000_0010;
1708c67d6573Sopenharmony_ci    }
1709c67d6573Sopenharmony_ci
1710c67d6573Sopenharmony_ci    fn has_empty(&self) -> bool {
1711c67d6573Sopenharmony_ci        self.0 & 0b0000_0100 > 0
1712c67d6573Sopenharmony_ci    }
1713c67d6573Sopenharmony_ci
1714c67d6573Sopenharmony_ci    fn set_empty(&mut self) {
1715c67d6573Sopenharmony_ci        self.0 |= 0b0000_0100;
1716c67d6573Sopenharmony_ci    }
1717c67d6573Sopenharmony_ci}
1718c67d6573Sopenharmony_ci
1719c67d6573Sopenharmony_ciimpl Byte {
1720c67d6573Sopenharmony_ci    fn byte(b: u8) -> Self {
1721c67d6573Sopenharmony_ci        Byte(b as u16)
1722c67d6573Sopenharmony_ci    }
1723c67d6573Sopenharmony_ci    fn eof() -> Self {
1724c67d6573Sopenharmony_ci        Byte(256)
1725c67d6573Sopenharmony_ci    }
1726c67d6573Sopenharmony_ci    fn is_eof(&self) -> bool {
1727c67d6573Sopenharmony_ci        self.0 == 256
1728c67d6573Sopenharmony_ci    }
1729c67d6573Sopenharmony_ci
1730c67d6573Sopenharmony_ci    fn is_ascii_word(&self) -> bool {
1731c67d6573Sopenharmony_ci        let b = match self.as_byte() {
1732c67d6573Sopenharmony_ci            None => return false,
1733c67d6573Sopenharmony_ci            Some(b) => b,
1734c67d6573Sopenharmony_ci        };
1735c67d6573Sopenharmony_ci        match b {
1736c67d6573Sopenharmony_ci            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
1737c67d6573Sopenharmony_ci            _ => false,
1738c67d6573Sopenharmony_ci        }
1739c67d6573Sopenharmony_ci    }
1740c67d6573Sopenharmony_ci
1741c67d6573Sopenharmony_ci    fn as_byte(&self) -> Option<u8> {
1742c67d6573Sopenharmony_ci        if self.is_eof() {
1743c67d6573Sopenharmony_ci            None
1744c67d6573Sopenharmony_ci        } else {
1745c67d6573Sopenharmony_ci            Some(self.0 as u8)
1746c67d6573Sopenharmony_ci        }
1747c67d6573Sopenharmony_ci    }
1748c67d6573Sopenharmony_ci}
1749c67d6573Sopenharmony_ci
1750c67d6573Sopenharmony_ciimpl fmt::Debug for State {
1751c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1752c67d6573Sopenharmony_ci        let ips: Vec<usize> = self.inst_ptrs().collect();
1753c67d6573Sopenharmony_ci        f.debug_struct("State")
1754c67d6573Sopenharmony_ci            .field("flags", &self.flags())
1755c67d6573Sopenharmony_ci            .field("insts", &ips)
1756c67d6573Sopenharmony_ci            .finish()
1757c67d6573Sopenharmony_ci    }
1758c67d6573Sopenharmony_ci}
1759c67d6573Sopenharmony_ci
1760c67d6573Sopenharmony_ciimpl fmt::Debug for Transitions {
1761c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1762c67d6573Sopenharmony_ci        let mut fmtd = f.debug_map();
1763c67d6573Sopenharmony_ci        for si in 0..self.num_states() {
1764c67d6573Sopenharmony_ci            let s = si * self.num_byte_classes;
1765c67d6573Sopenharmony_ci            let e = s + self.num_byte_classes;
1766c67d6573Sopenharmony_ci            fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e]));
1767c67d6573Sopenharmony_ci        }
1768c67d6573Sopenharmony_ci        fmtd.finish()
1769c67d6573Sopenharmony_ci    }
1770c67d6573Sopenharmony_ci}
1771c67d6573Sopenharmony_ci
1772c67d6573Sopenharmony_cistruct TransitionsRow<'a>(&'a [StatePtr]);
1773c67d6573Sopenharmony_ci
1774c67d6573Sopenharmony_ciimpl<'a> fmt::Debug for TransitionsRow<'a> {
1775c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1776c67d6573Sopenharmony_ci        let mut fmtd = f.debug_map();
1777c67d6573Sopenharmony_ci        for (b, si) in self.0.iter().enumerate() {
1778c67d6573Sopenharmony_ci            match *si {
1779c67d6573Sopenharmony_ci                STATE_UNKNOWN => {}
1780c67d6573Sopenharmony_ci                STATE_DEAD => {
1781c67d6573Sopenharmony_ci                    fmtd.entry(&vb(b as usize), &"DEAD");
1782c67d6573Sopenharmony_ci                }
1783c67d6573Sopenharmony_ci                si => {
1784c67d6573Sopenharmony_ci                    fmtd.entry(&vb(b as usize), &si.to_string());
1785c67d6573Sopenharmony_ci                }
1786c67d6573Sopenharmony_ci            }
1787c67d6573Sopenharmony_ci        }
1788c67d6573Sopenharmony_ci        fmtd.finish()
1789c67d6573Sopenharmony_ci    }
1790c67d6573Sopenharmony_ci}
1791c67d6573Sopenharmony_ci
1792c67d6573Sopenharmony_ciimpl fmt::Debug for StateFlags {
1793c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1794c67d6573Sopenharmony_ci        f.debug_struct("StateFlags")
1795c67d6573Sopenharmony_ci            .field("is_match", &self.is_match())
1796c67d6573Sopenharmony_ci            .field("is_word", &self.is_word())
1797c67d6573Sopenharmony_ci            .field("has_empty", &self.has_empty())
1798c67d6573Sopenharmony_ci            .finish()
1799c67d6573Sopenharmony_ci    }
1800c67d6573Sopenharmony_ci}
1801c67d6573Sopenharmony_ci
1802c67d6573Sopenharmony_ci/// Helper function for formatting a byte as a nice-to-read escaped string.
1803c67d6573Sopenharmony_cifn vb(b: usize) -> String {
1804c67d6573Sopenharmony_ci    use std::ascii::escape_default;
1805c67d6573Sopenharmony_ci
1806c67d6573Sopenharmony_ci    if b > ::std::u8::MAX as usize {
1807c67d6573Sopenharmony_ci        "EOF".to_owned()
1808c67d6573Sopenharmony_ci    } else {
1809c67d6573Sopenharmony_ci        let escaped = escape_default(b as u8).collect::<Vec<u8>>();
1810c67d6573Sopenharmony_ci        String::from_utf8_lossy(&escaped).into_owned()
1811c67d6573Sopenharmony_ci    }
1812c67d6573Sopenharmony_ci}
1813c67d6573Sopenharmony_ci
1814c67d6573Sopenharmony_cifn usize_to_u32(n: usize) -> u32 {
1815c67d6573Sopenharmony_ci    if (n as u64) > (::std::u32::MAX as u64) {
1816c67d6573Sopenharmony_ci        panic!("BUG: {} is too big to fit into u32", n)
1817c67d6573Sopenharmony_ci    }
1818c67d6573Sopenharmony_ci    n as u32
1819c67d6573Sopenharmony_ci}
1820c67d6573Sopenharmony_ci
1821c67d6573Sopenharmony_ci#[allow(dead_code)] // useful for debugging
1822c67d6573Sopenharmony_cifn show_state_ptr(si: StatePtr) -> String {
1823c67d6573Sopenharmony_ci    let mut s = format!("{:?}", si & STATE_MAX);
1824c67d6573Sopenharmony_ci    if si == STATE_UNKNOWN {
1825c67d6573Sopenharmony_ci        s = format!("{} (unknown)", s);
1826c67d6573Sopenharmony_ci    }
1827c67d6573Sopenharmony_ci    if si == STATE_DEAD {
1828c67d6573Sopenharmony_ci        s = format!("{} (dead)", s);
1829c67d6573Sopenharmony_ci    }
1830c67d6573Sopenharmony_ci    if si == STATE_QUIT {
1831c67d6573Sopenharmony_ci        s = format!("{} (quit)", s);
1832c67d6573Sopenharmony_ci    }
1833c67d6573Sopenharmony_ci    if si & STATE_START > 0 {
1834c67d6573Sopenharmony_ci        s = format!("{} (start)", s);
1835c67d6573Sopenharmony_ci    }
1836c67d6573Sopenharmony_ci    if si & STATE_MATCH > 0 {
1837c67d6573Sopenharmony_ci        s = format!("{} (match)", s);
1838c67d6573Sopenharmony_ci    }
1839c67d6573Sopenharmony_ci    s
1840c67d6573Sopenharmony_ci}
1841c67d6573Sopenharmony_ci
1842c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1843c67d6573Sopenharmony_cifn write_vari32(data: &mut Vec<u8>, n: i32) {
1844c67d6573Sopenharmony_ci    let mut un = (n as u32) << 1;
1845c67d6573Sopenharmony_ci    if n < 0 {
1846c67d6573Sopenharmony_ci        un = !un;
1847c67d6573Sopenharmony_ci    }
1848c67d6573Sopenharmony_ci    write_varu32(data, un)
1849c67d6573Sopenharmony_ci}
1850c67d6573Sopenharmony_ci
1851c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1852c67d6573Sopenharmony_cifn read_vari32(data: &[u8]) -> (i32, usize) {
1853c67d6573Sopenharmony_ci    let (un, i) = read_varu32(data);
1854c67d6573Sopenharmony_ci    let mut n = (un >> 1) as i32;
1855c67d6573Sopenharmony_ci    if un & 1 != 0 {
1856c67d6573Sopenharmony_ci        n = !n;
1857c67d6573Sopenharmony_ci    }
1858c67d6573Sopenharmony_ci    (n, i)
1859c67d6573Sopenharmony_ci}
1860c67d6573Sopenharmony_ci
1861c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1862c67d6573Sopenharmony_cifn write_varu32(data: &mut Vec<u8>, mut n: u32) {
1863c67d6573Sopenharmony_ci    while n >= 0b1000_0000 {
1864c67d6573Sopenharmony_ci        data.push((n as u8) | 0b1000_0000);
1865c67d6573Sopenharmony_ci        n >>= 7;
1866c67d6573Sopenharmony_ci    }
1867c67d6573Sopenharmony_ci    data.push(n as u8);
1868c67d6573Sopenharmony_ci}
1869c67d6573Sopenharmony_ci
1870c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1871c67d6573Sopenharmony_cifn read_varu32(data: &[u8]) -> (u32, usize) {
1872c67d6573Sopenharmony_ci    let mut n: u32 = 0;
1873c67d6573Sopenharmony_ci    let mut shift: u32 = 0;
1874c67d6573Sopenharmony_ci    for (i, &b) in data.iter().enumerate() {
1875c67d6573Sopenharmony_ci        if b < 0b1000_0000 {
1876c67d6573Sopenharmony_ci            return (n | ((b as u32) << shift), i + 1);
1877c67d6573Sopenharmony_ci        }
1878c67d6573Sopenharmony_ci        n |= ((b as u32) & 0b0111_1111) << shift;
1879c67d6573Sopenharmony_ci        shift += 7;
1880c67d6573Sopenharmony_ci    }
1881c67d6573Sopenharmony_ci    (0, 0)
1882c67d6573Sopenharmony_ci}
1883c67d6573Sopenharmony_ci
1884c67d6573Sopenharmony_ci#[cfg(test)]
1885c67d6573Sopenharmony_cimod tests {
1886c67d6573Sopenharmony_ci
1887c67d6573Sopenharmony_ci    use super::{
1888c67d6573Sopenharmony_ci        push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
1889c67d6573Sopenharmony_ci        State, StateFlags,
1890c67d6573Sopenharmony_ci    };
1891c67d6573Sopenharmony_ci    use quickcheck::{quickcheck, Gen, QuickCheck};
1892c67d6573Sopenharmony_ci    use std::sync::Arc;
1893c67d6573Sopenharmony_ci
1894c67d6573Sopenharmony_ci    #[test]
1895c67d6573Sopenharmony_ci    fn prop_state_encode_decode() {
1896c67d6573Sopenharmony_ci        fn p(mut ips: Vec<u32>, flags: u8) -> bool {
1897c67d6573Sopenharmony_ci            // It looks like our encoding scheme can't handle instruction
1898c67d6573Sopenharmony_ci            // pointers at or above 2**31. We should fix that, but it seems
1899c67d6573Sopenharmony_ci            // unlikely to occur in real code due to the amount of memory
1900c67d6573Sopenharmony_ci            // required for such a state machine. So for now, we just clamp
1901c67d6573Sopenharmony_ci            // our test data.
1902c67d6573Sopenharmony_ci            for ip in &mut ips {
1903c67d6573Sopenharmony_ci                if *ip >= 1 << 31 {
1904c67d6573Sopenharmony_ci                    *ip = (1 << 31) - 1;
1905c67d6573Sopenharmony_ci                }
1906c67d6573Sopenharmony_ci            }
1907c67d6573Sopenharmony_ci            let mut data = vec![flags];
1908c67d6573Sopenharmony_ci            let mut prev = 0;
1909c67d6573Sopenharmony_ci            for &ip in ips.iter() {
1910c67d6573Sopenharmony_ci                push_inst_ptr(&mut data, &mut prev, ip);
1911c67d6573Sopenharmony_ci            }
1912c67d6573Sopenharmony_ci            let state = State { data: Arc::from(&data[..]) };
1913c67d6573Sopenharmony_ci
1914c67d6573Sopenharmony_ci            let expected: Vec<usize> =
1915c67d6573Sopenharmony_ci                ips.into_iter().map(|ip| ip as usize).collect();
1916c67d6573Sopenharmony_ci            let got: Vec<usize> = state.inst_ptrs().collect();
1917c67d6573Sopenharmony_ci            expected == got && state.flags() == StateFlags(flags)
1918c67d6573Sopenharmony_ci        }
1919c67d6573Sopenharmony_ci        QuickCheck::new()
1920c67d6573Sopenharmony_ci            .gen(Gen::new(10_000))
1921c67d6573Sopenharmony_ci            .quickcheck(p as fn(Vec<u32>, u8) -> bool);
1922c67d6573Sopenharmony_ci    }
1923c67d6573Sopenharmony_ci
1924c67d6573Sopenharmony_ci    #[test]
1925c67d6573Sopenharmony_ci    fn prop_read_write_u32() {
1926c67d6573Sopenharmony_ci        fn p(n: u32) -> bool {
1927c67d6573Sopenharmony_ci            let mut buf = vec![];
1928c67d6573Sopenharmony_ci            write_varu32(&mut buf, n);
1929c67d6573Sopenharmony_ci            let (got, nread) = read_varu32(&buf);
1930c67d6573Sopenharmony_ci            nread == buf.len() && got == n
1931c67d6573Sopenharmony_ci        }
1932c67d6573Sopenharmony_ci        quickcheck(p as fn(u32) -> bool);
1933c67d6573Sopenharmony_ci    }
1934c67d6573Sopenharmony_ci
1935c67d6573Sopenharmony_ci    #[test]
1936c67d6573Sopenharmony_ci    fn prop_read_write_i32() {
1937c67d6573Sopenharmony_ci        fn p(n: i32) -> bool {
1938c67d6573Sopenharmony_ci            let mut buf = vec![];
1939c67d6573Sopenharmony_ci            write_vari32(&mut buf, n);
1940c67d6573Sopenharmony_ci            let (got, nread) = read_vari32(&buf);
1941c67d6573Sopenharmony_ci            nread == buf.len() && got == n
1942c67d6573Sopenharmony_ci        }
1943c67d6573Sopenharmony_ci        quickcheck(p as fn(i32) -> bool);
1944c67d6573Sopenharmony_ci    }
1945c67d6573Sopenharmony_ci}
1946