1c67d6573Sopenharmony_ci/*! 2c67d6573Sopenharmony_ciThe DFA matching engine. 3c67d6573Sopenharmony_ci 4c67d6573Sopenharmony_ciA DFA provides faster matching because the engine is in exactly one state at 5c67d6573Sopenharmony_ciany point in time. In the NFA, there may be multiple active states, and 6c67d6573Sopenharmony_ciconsiderable CPU cycles are spent shuffling them around. In finite automata 7c67d6573Sopenharmony_cispeak, the DFA follows epsilon transitions in the regex far less than the NFA. 8c67d6573Sopenharmony_ci 9c67d6573Sopenharmony_ciA DFA is a classic trade off between time and space. The NFA is slower, but 10c67d6573Sopenharmony_ciits memory requirements are typically small and predictable. The DFA is faster, 11c67d6573Sopenharmony_cibut given the right regex and the right input, the number of states in the 12c67d6573Sopenharmony_ciDFA can grow exponentially. To mitigate this space problem, we do two things: 13c67d6573Sopenharmony_ci 14c67d6573Sopenharmony_ci1. We implement an *online* DFA. That is, the DFA is constructed from the NFA 15c67d6573Sopenharmony_ci during a search. When a new state is computed, it is stored in a cache so 16c67d6573Sopenharmony_ci that it may be reused. An important consequence of this implementation 17c67d6573Sopenharmony_ci is that states that are never reached for a particular input are never 18c67d6573Sopenharmony_ci computed. (This is impossible in an "offline" DFA which needs to compute 19c67d6573Sopenharmony_ci all possible states up front.) 20c67d6573Sopenharmony_ci2. If the cache gets too big, we wipe it and continue matching. 21c67d6573Sopenharmony_ci 22c67d6573Sopenharmony_ciIn pathological cases, a new state can be created for every byte of input. 23c67d6573Sopenharmony_ci(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.) 24c67d6573Sopenharmony_ciIn this case, performance regresses to slightly slower than the full NFA 25c67d6573Sopenharmony_cisimulation, in large part because the cache becomes useless. If the cache 26c67d6573Sopenharmony_ciis wiped too frequently, the DFA quits and control falls back to one of the 27c67d6573Sopenharmony_ciNFA simulations. 28c67d6573Sopenharmony_ci 29c67d6573Sopenharmony_ciBecause of the "lazy" nature of this DFA, the inner matching loop is 30c67d6573Sopenharmony_ciconsiderably more complex than one might expect out of a DFA. A number of 31c67d6573Sopenharmony_citricks are employed to make it fast. Tread carefully. 32c67d6573Sopenharmony_ci 33c67d6573Sopenharmony_ciN.B. While this implementation is heavily commented, Russ Cox's series of 34c67d6573Sopenharmony_ciarticles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/> 35c67d6573Sopenharmony_ci(As is the DFA implementation in RE2, which heavily influenced this 36c67d6573Sopenharmony_ciimplementation.) 37c67d6573Sopenharmony_ci*/ 38c67d6573Sopenharmony_ci 39c67d6573Sopenharmony_ciuse std::collections::HashMap; 40c67d6573Sopenharmony_ciuse std::fmt; 41c67d6573Sopenharmony_ciuse std::iter::repeat; 42c67d6573Sopenharmony_ciuse std::mem; 43c67d6573Sopenharmony_ciuse std::sync::Arc; 44c67d6573Sopenharmony_ci 45c67d6573Sopenharmony_ciuse crate::exec::ProgramCache; 46c67d6573Sopenharmony_ciuse crate::prog::{Inst, Program}; 47c67d6573Sopenharmony_ciuse crate::sparse::SparseSet; 48c67d6573Sopenharmony_ci 49c67d6573Sopenharmony_ci/// Return true if and only if the given program can be executed by a DFA. 50c67d6573Sopenharmony_ci/// 51c67d6573Sopenharmony_ci/// Generally, a DFA is always possible. A pathological case where it is not 52c67d6573Sopenharmony_ci/// possible is if the number of NFA states exceeds `u32::MAX`, in which case, 53c67d6573Sopenharmony_ci/// this function will return false. 54c67d6573Sopenharmony_ci/// 55c67d6573Sopenharmony_ci/// This function will also return false if the given program has any Unicode 56c67d6573Sopenharmony_ci/// instructions (Char or Ranges) since the DFA operates on bytes only. 57c67d6573Sopenharmony_cipub fn can_exec(insts: &Program) -> bool { 58c67d6573Sopenharmony_ci use crate::prog::Inst::*; 59c67d6573Sopenharmony_ci // If for some reason we manage to allocate a regex program with more 60c67d6573Sopenharmony_ci // than i32::MAX instructions, then we can't execute the DFA because we 61c67d6573Sopenharmony_ci // use 32 bit instruction pointer deltas for memory savings. 62c67d6573Sopenharmony_ci // If i32::MAX is the largest positive delta, 63c67d6573Sopenharmony_ci // then -i32::MAX == i32::MIN + 1 is the largest negative delta, 64c67d6573Sopenharmony_ci // and we are OK to use 32 bits. 65c67d6573Sopenharmony_ci if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize { 66c67d6573Sopenharmony_ci return false; 67c67d6573Sopenharmony_ci } 68c67d6573Sopenharmony_ci for inst in insts { 69c67d6573Sopenharmony_ci match *inst { 70c67d6573Sopenharmony_ci Char(_) | Ranges(_) => return false, 71c67d6573Sopenharmony_ci EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {} 72c67d6573Sopenharmony_ci } 73c67d6573Sopenharmony_ci } 74c67d6573Sopenharmony_ci true 75c67d6573Sopenharmony_ci} 76c67d6573Sopenharmony_ci 77c67d6573Sopenharmony_ci/// A reusable cache of DFA states. 78c67d6573Sopenharmony_ci/// 79c67d6573Sopenharmony_ci/// This cache is reused between multiple invocations of the same regex 80c67d6573Sopenharmony_ci/// program. (It is not shared simultaneously between threads. If there is 81c67d6573Sopenharmony_ci/// contention, then new caches are created.) 82c67d6573Sopenharmony_ci#[derive(Debug)] 83c67d6573Sopenharmony_cipub struct Cache { 84c67d6573Sopenharmony_ci /// Group persistent DFA related cache state together. The sparse sets 85c67d6573Sopenharmony_ci /// listed below are used as scratch space while computing uncached states. 86c67d6573Sopenharmony_ci inner: CacheInner, 87c67d6573Sopenharmony_ci /// qcur and qnext are ordered sets with constant time 88c67d6573Sopenharmony_ci /// addition/membership/clearing-whole-set and linear time iteration. They 89c67d6573Sopenharmony_ci /// are used to manage the sets of NFA states in DFA states when computing 90c67d6573Sopenharmony_ci /// cached DFA states. In particular, the order of the NFA states matters 91c67d6573Sopenharmony_ci /// for leftmost-first style matching. Namely, when computing a cached 92c67d6573Sopenharmony_ci /// state, the set of NFA states stops growing as soon as the first Match 93c67d6573Sopenharmony_ci /// instruction is observed. 94c67d6573Sopenharmony_ci qcur: SparseSet, 95c67d6573Sopenharmony_ci qnext: SparseSet, 96c67d6573Sopenharmony_ci} 97c67d6573Sopenharmony_ci 98c67d6573Sopenharmony_ci/// `CacheInner` is logically just a part of Cache, but groups together fields 99c67d6573Sopenharmony_ci/// that aren't passed as function parameters throughout search. (This split 100c67d6573Sopenharmony_ci/// is mostly an artifact of the borrow checker. It is happily paid.) 101c67d6573Sopenharmony_ci#[derive(Debug)] 102c67d6573Sopenharmony_cistruct CacheInner { 103c67d6573Sopenharmony_ci /// A cache of pre-compiled DFA states, keyed by the set of NFA states 104c67d6573Sopenharmony_ci /// and the set of empty-width flags set at the byte in the input when the 105c67d6573Sopenharmony_ci /// state was observed. 106c67d6573Sopenharmony_ci /// 107c67d6573Sopenharmony_ci /// A StatePtr is effectively a `*State`, but to avoid various inconvenient 108c67d6573Sopenharmony_ci /// things, we just pass indexes around manually. The performance impact of 109c67d6573Sopenharmony_ci /// this is probably an instruction or two in the inner loop. However, on 110c67d6573Sopenharmony_ci /// 64 bit, each StatePtr is half the size of a *State. 111c67d6573Sopenharmony_ci compiled: StateMap, 112c67d6573Sopenharmony_ci /// The transition table. 113c67d6573Sopenharmony_ci /// 114c67d6573Sopenharmony_ci /// The transition table is laid out in row-major order, where states are 115c67d6573Sopenharmony_ci /// rows and the transitions for each state are columns. At a high level, 116c67d6573Sopenharmony_ci /// given state `s` and byte `b`, the next state can be found at index 117c67d6573Sopenharmony_ci /// `s * 256 + b`. 118c67d6573Sopenharmony_ci /// 119c67d6573Sopenharmony_ci /// This is, of course, a lie. A StatePtr is actually a pointer to the 120c67d6573Sopenharmony_ci /// *start* of a row in this table. When indexing in the DFA's inner loop, 121c67d6573Sopenharmony_ci /// this removes the need to multiply the StatePtr by the stride. Yes, it 122c67d6573Sopenharmony_ci /// matters. This reduces the number of states we can store, but: the 123c67d6573Sopenharmony_ci /// stride is rarely 256 since we define transitions in terms of 124c67d6573Sopenharmony_ci /// *equivalence classes* of bytes. Each class corresponds to a set of 125c67d6573Sopenharmony_ci /// bytes that never discriminate a distinct path through the DFA from each 126c67d6573Sopenharmony_ci /// other. 127c67d6573Sopenharmony_ci trans: Transitions, 128c67d6573Sopenharmony_ci /// A set of cached start states, which are limited to the number of 129c67d6573Sopenharmony_ci /// permutations of flags set just before the initial byte of input. (The 130c67d6573Sopenharmony_ci /// index into this vec is a `EmptyFlags`.) 131c67d6573Sopenharmony_ci /// 132c67d6573Sopenharmony_ci /// N.B. A start state can be "dead" (i.e., no possible match), so we 133c67d6573Sopenharmony_ci /// represent it with a StatePtr. 134c67d6573Sopenharmony_ci start_states: Vec<StatePtr>, 135c67d6573Sopenharmony_ci /// Stack scratch space used to follow epsilon transitions in the NFA. 136c67d6573Sopenharmony_ci /// (This permits us to avoid recursion.) 137c67d6573Sopenharmony_ci /// 138c67d6573Sopenharmony_ci /// The maximum stack size is the number of NFA states. 139c67d6573Sopenharmony_ci stack: Vec<InstPtr>, 140c67d6573Sopenharmony_ci /// The total number of times this cache has been flushed by the DFA 141c67d6573Sopenharmony_ci /// because of space constraints. 142c67d6573Sopenharmony_ci flush_count: u64, 143c67d6573Sopenharmony_ci /// The total heap size of the DFA's cache. We use this to determine when 144c67d6573Sopenharmony_ci /// we should flush the cache. 145c67d6573Sopenharmony_ci size: usize, 146c67d6573Sopenharmony_ci /// Scratch space used when building instruction pointer lists for new 147c67d6573Sopenharmony_ci /// states. This helps amortize allocation. 148c67d6573Sopenharmony_ci insts_scratch_space: Vec<u8>, 149c67d6573Sopenharmony_ci} 150c67d6573Sopenharmony_ci 151c67d6573Sopenharmony_ci/// The transition table. 152c67d6573Sopenharmony_ci/// 153c67d6573Sopenharmony_ci/// It is laid out in row-major order, with states as rows and byte class 154c67d6573Sopenharmony_ci/// transitions as columns. 155c67d6573Sopenharmony_ci/// 156c67d6573Sopenharmony_ci/// The transition table is responsible for producing valid `StatePtrs`. A 157c67d6573Sopenharmony_ci/// `StatePtr` points to the start of a particular row in this table. When 158c67d6573Sopenharmony_ci/// indexing to find the next state this allows us to avoid a multiplication 159c67d6573Sopenharmony_ci/// when computing an index into the table. 160c67d6573Sopenharmony_ci#[derive(Clone)] 161c67d6573Sopenharmony_cistruct Transitions { 162c67d6573Sopenharmony_ci /// The table. 163c67d6573Sopenharmony_ci table: Vec<StatePtr>, 164c67d6573Sopenharmony_ci /// The stride. 165c67d6573Sopenharmony_ci num_byte_classes: usize, 166c67d6573Sopenharmony_ci} 167c67d6573Sopenharmony_ci 168c67d6573Sopenharmony_ci/// Fsm encapsulates the actual execution of the DFA. 169c67d6573Sopenharmony_ci#[derive(Debug)] 170c67d6573Sopenharmony_cipub struct Fsm<'a> { 171c67d6573Sopenharmony_ci /// prog contains the NFA instruction opcodes. DFA execution uses either 172c67d6573Sopenharmony_ci /// the `dfa` instructions or the `dfa_reverse` instructions from 173c67d6573Sopenharmony_ci /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have 174c67d6573Sopenharmony_ci /// Unicode opcodes that cannot be executed by the DFA.) 175c67d6573Sopenharmony_ci prog: &'a Program, 176c67d6573Sopenharmony_ci /// The start state. We record it here because the pointer may change 177c67d6573Sopenharmony_ci /// when the cache is wiped. 178c67d6573Sopenharmony_ci start: StatePtr, 179c67d6573Sopenharmony_ci /// The current position in the input. 180c67d6573Sopenharmony_ci at: usize, 181c67d6573Sopenharmony_ci /// Should we quit after seeing the first match? e.g., When the caller 182c67d6573Sopenharmony_ci /// uses `is_match` or `shortest_match`. 183c67d6573Sopenharmony_ci quit_after_match: bool, 184c67d6573Sopenharmony_ci /// The last state that matched. 185c67d6573Sopenharmony_ci /// 186c67d6573Sopenharmony_ci /// When no match has occurred, this is set to STATE_UNKNOWN. 187c67d6573Sopenharmony_ci /// 188c67d6573Sopenharmony_ci /// This is only useful when matching regex sets. The last match state 189c67d6573Sopenharmony_ci /// is useful because it contains all of the match instructions seen, 190c67d6573Sopenharmony_ci /// thereby allowing us to enumerate which regexes in the set matched. 191c67d6573Sopenharmony_ci last_match_si: StatePtr, 192c67d6573Sopenharmony_ci /// The input position of the last cache flush. We use this to determine 193c67d6573Sopenharmony_ci /// if we're thrashing in the cache too often. If so, the DFA quits so 194c67d6573Sopenharmony_ci /// that we can fall back to the NFA algorithm. 195c67d6573Sopenharmony_ci last_cache_flush: usize, 196c67d6573Sopenharmony_ci /// All cached DFA information that is persisted between searches. 197c67d6573Sopenharmony_ci cache: &'a mut CacheInner, 198c67d6573Sopenharmony_ci} 199c67d6573Sopenharmony_ci 200c67d6573Sopenharmony_ci/// The result of running the DFA. 201c67d6573Sopenharmony_ci/// 202c67d6573Sopenharmony_ci/// Generally, the result is either a match or not a match, but sometimes the 203c67d6573Sopenharmony_ci/// DFA runs too slowly because the cache size is too small. In that case, it 204c67d6573Sopenharmony_ci/// gives up with the intent of falling back to the NFA algorithm. 205c67d6573Sopenharmony_ci/// 206c67d6573Sopenharmony_ci/// The DFA can also give up if it runs out of room to create new states, or if 207c67d6573Sopenharmony_ci/// it sees non-ASCII bytes in the presence of a Unicode word boundary. 208c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 209c67d6573Sopenharmony_cipub enum Result<T> { 210c67d6573Sopenharmony_ci Match(T), 211c67d6573Sopenharmony_ci NoMatch(usize), 212c67d6573Sopenharmony_ci Quit, 213c67d6573Sopenharmony_ci} 214c67d6573Sopenharmony_ci 215c67d6573Sopenharmony_ciimpl<T> Result<T> { 216c67d6573Sopenharmony_ci /// Returns true if this result corresponds to a match. 217c67d6573Sopenharmony_ci pub fn is_match(&self) -> bool { 218c67d6573Sopenharmony_ci match *self { 219c67d6573Sopenharmony_ci Result::Match(_) => true, 220c67d6573Sopenharmony_ci Result::NoMatch(_) | Result::Quit => false, 221c67d6573Sopenharmony_ci } 222c67d6573Sopenharmony_ci } 223c67d6573Sopenharmony_ci 224c67d6573Sopenharmony_ci /// Maps the given function onto T and returns the result. 225c67d6573Sopenharmony_ci /// 226c67d6573Sopenharmony_ci /// If this isn't a match, then this is a no-op. 227c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 228c67d6573Sopenharmony_ci pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> { 229c67d6573Sopenharmony_ci match self { 230c67d6573Sopenharmony_ci Result::Match(t) => Result::Match(f(t)), 231c67d6573Sopenharmony_ci Result::NoMatch(x) => Result::NoMatch(x), 232c67d6573Sopenharmony_ci Result::Quit => Result::Quit, 233c67d6573Sopenharmony_ci } 234c67d6573Sopenharmony_ci } 235c67d6573Sopenharmony_ci 236c67d6573Sopenharmony_ci /// Sets the non-match position. 237c67d6573Sopenharmony_ci /// 238c67d6573Sopenharmony_ci /// If this isn't a non-match, then this is a no-op. 239c67d6573Sopenharmony_ci fn set_non_match(self, at: usize) -> Result<T> { 240c67d6573Sopenharmony_ci match self { 241c67d6573Sopenharmony_ci Result::NoMatch(_) => Result::NoMatch(at), 242c67d6573Sopenharmony_ci r => r, 243c67d6573Sopenharmony_ci } 244c67d6573Sopenharmony_ci } 245c67d6573Sopenharmony_ci} 246c67d6573Sopenharmony_ci 247c67d6573Sopenharmony_ci/// `State` is a DFA state. It contains an ordered set of NFA states (not 248c67d6573Sopenharmony_ci/// necessarily complete) and a smattering of flags. 249c67d6573Sopenharmony_ci/// 250c67d6573Sopenharmony_ci/// The flags are packed into the first byte of data. 251c67d6573Sopenharmony_ci/// 252c67d6573Sopenharmony_ci/// States don't carry their transitions. Instead, transitions are stored in 253c67d6573Sopenharmony_ci/// a single row-major table. 254c67d6573Sopenharmony_ci/// 255c67d6573Sopenharmony_ci/// Delta encoding is used to store the instruction pointers. 256c67d6573Sopenharmony_ci/// The first instruction pointer is stored directly starting 257c67d6573Sopenharmony_ci/// at data[1], and each following pointer is stored as an offset 258c67d6573Sopenharmony_ci/// to the previous one. If a delta is in the range -127..127, 259c67d6573Sopenharmony_ci/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8) 260c67d6573Sopenharmony_ci/// is coded as a flag, followed by 4 bytes encoding the delta. 261c67d6573Sopenharmony_ci#[derive(Clone, Eq, Hash, PartialEq)] 262c67d6573Sopenharmony_cistruct State { 263c67d6573Sopenharmony_ci data: Arc<[u8]>, 264c67d6573Sopenharmony_ci} 265c67d6573Sopenharmony_ci 266c67d6573Sopenharmony_ci/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes 267c67d6573Sopenharmony_ci/// an NFA state). 268c67d6573Sopenharmony_ci/// 269c67d6573Sopenharmony_ci/// Throughout this library, this is usually set to `usize`, but we force a 270c67d6573Sopenharmony_ci/// `u32` here for the DFA to save on space. 271c67d6573Sopenharmony_citype InstPtr = u32; 272c67d6573Sopenharmony_ci 273c67d6573Sopenharmony_ci/// Adds ip to data using delta encoding with respect to prev. 274c67d6573Sopenharmony_ci/// 275c67d6573Sopenharmony_ci/// After completion, `data` will contain `ip` and `prev` will be set to `ip`. 276c67d6573Sopenharmony_cifn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) { 277c67d6573Sopenharmony_ci let delta = (ip as i32) - (*prev as i32); 278c67d6573Sopenharmony_ci write_vari32(data, delta); 279c67d6573Sopenharmony_ci *prev = ip; 280c67d6573Sopenharmony_ci} 281c67d6573Sopenharmony_ci 282c67d6573Sopenharmony_cistruct InstPtrs<'a> { 283c67d6573Sopenharmony_ci base: usize, 284c67d6573Sopenharmony_ci data: &'a [u8], 285c67d6573Sopenharmony_ci} 286c67d6573Sopenharmony_ci 287c67d6573Sopenharmony_ciimpl<'a> Iterator for InstPtrs<'a> { 288c67d6573Sopenharmony_ci type Item = usize; 289c67d6573Sopenharmony_ci 290c67d6573Sopenharmony_ci fn next(&mut self) -> Option<usize> { 291c67d6573Sopenharmony_ci if self.data.is_empty() { 292c67d6573Sopenharmony_ci return None; 293c67d6573Sopenharmony_ci } 294c67d6573Sopenharmony_ci let (delta, nread) = read_vari32(self.data); 295c67d6573Sopenharmony_ci let base = self.base as i32 + delta; 296c67d6573Sopenharmony_ci debug_assert!(base >= 0); 297c67d6573Sopenharmony_ci debug_assert!(nread > 0); 298c67d6573Sopenharmony_ci self.data = &self.data[nread..]; 299c67d6573Sopenharmony_ci self.base = base as usize; 300c67d6573Sopenharmony_ci Some(self.base) 301c67d6573Sopenharmony_ci } 302c67d6573Sopenharmony_ci} 303c67d6573Sopenharmony_ci 304c67d6573Sopenharmony_ciimpl State { 305c67d6573Sopenharmony_ci fn flags(&self) -> StateFlags { 306c67d6573Sopenharmony_ci StateFlags(self.data[0]) 307c67d6573Sopenharmony_ci } 308c67d6573Sopenharmony_ci 309c67d6573Sopenharmony_ci fn inst_ptrs(&self) -> InstPtrs<'_> { 310c67d6573Sopenharmony_ci InstPtrs { base: 0, data: &self.data[1..] } 311c67d6573Sopenharmony_ci } 312c67d6573Sopenharmony_ci} 313c67d6573Sopenharmony_ci 314c67d6573Sopenharmony_ci/// `StatePtr` is a 32 bit pointer to the start of a row in the transition 315c67d6573Sopenharmony_ci/// table. 316c67d6573Sopenharmony_ci/// 317c67d6573Sopenharmony_ci/// It has many special values. There are two types of special values: 318c67d6573Sopenharmony_ci/// sentinels and flags. 319c67d6573Sopenharmony_ci/// 320c67d6573Sopenharmony_ci/// Sentinels corresponds to special states that carry some kind of 321c67d6573Sopenharmony_ci/// significance. There are three such states: unknown, dead and quit states. 322c67d6573Sopenharmony_ci/// 323c67d6573Sopenharmony_ci/// Unknown states are states that haven't been computed yet. They indicate 324c67d6573Sopenharmony_ci/// that a transition should be filled in that points to either an existing 325c67d6573Sopenharmony_ci/// cached state or a new state altogether. In general, an unknown state means 326c67d6573Sopenharmony_ci/// "follow the NFA's epsilon transitions." 327c67d6573Sopenharmony_ci/// 328c67d6573Sopenharmony_ci/// Dead states are states that can never lead to a match, no matter what 329c67d6573Sopenharmony_ci/// subsequent input is observed. This means that the DFA should quit 330c67d6573Sopenharmony_ci/// immediately and return the longest match it has found thus far. 331c67d6573Sopenharmony_ci/// 332c67d6573Sopenharmony_ci/// Quit states are states that imply the DFA is not capable of matching the 333c67d6573Sopenharmony_ci/// regex correctly. Currently, this is only used when a Unicode word boundary 334c67d6573Sopenharmony_ci/// exists in the regex *and* a non-ASCII byte is observed. 335c67d6573Sopenharmony_ci/// 336c67d6573Sopenharmony_ci/// The other type of state pointer is a state pointer with special flag bits. 337c67d6573Sopenharmony_ci/// There are two flags: a start flag and a match flag. The lower bits of both 338c67d6573Sopenharmony_ci/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX` 339c67d6573Sopenharmony_ci/// mask). 340c67d6573Sopenharmony_ci/// 341c67d6573Sopenharmony_ci/// The start flag means that the state is a start state, and therefore may be 342c67d6573Sopenharmony_ci/// subject to special prefix scanning optimizations. 343c67d6573Sopenharmony_ci/// 344c67d6573Sopenharmony_ci/// The match flag means that the state is a match state, and therefore the 345c67d6573Sopenharmony_ci/// current position in the input (while searching) should be recorded. 346c67d6573Sopenharmony_ci/// 347c67d6573Sopenharmony_ci/// The above exists mostly in the service of making the inner loop fast. 348c67d6573Sopenharmony_ci/// In particular, the inner *inner* loop looks something like this: 349c67d6573Sopenharmony_ci/// 350c67d6573Sopenharmony_ci/// ```ignore 351c67d6573Sopenharmony_ci/// while state <= STATE_MAX and i < len(text): 352c67d6573Sopenharmony_ci/// state = state.next[i] 353c67d6573Sopenharmony_ci/// ``` 354c67d6573Sopenharmony_ci/// 355c67d6573Sopenharmony_ci/// This is nice because it lets us execute a lazy DFA as if it were an 356c67d6573Sopenharmony_ci/// entirely offline DFA (i.e., with very few instructions). The loop will 357c67d6573Sopenharmony_ci/// quit only when we need to examine a case that needs special attention. 358c67d6573Sopenharmony_citype StatePtr = u32; 359c67d6573Sopenharmony_ci 360c67d6573Sopenharmony_ci/// An unknown state means that the state has not been computed yet, and that 361c67d6573Sopenharmony_ci/// the only way to progress is to compute it. 362c67d6573Sopenharmony_ciconst STATE_UNKNOWN: StatePtr = 1 << 31; 363c67d6573Sopenharmony_ci 364c67d6573Sopenharmony_ci/// A dead state means that the state has been computed and it is known that 365c67d6573Sopenharmony_ci/// once it is entered, no future match can ever occur. 366c67d6573Sopenharmony_ciconst STATE_DEAD: StatePtr = STATE_UNKNOWN + 1; 367c67d6573Sopenharmony_ci 368c67d6573Sopenharmony_ci/// A quit state means that the DFA came across some input that it doesn't 369c67d6573Sopenharmony_ci/// know how to process correctly. The DFA should quit and another matching 370c67d6573Sopenharmony_ci/// engine should be run in its place. 371c67d6573Sopenharmony_ciconst STATE_QUIT: StatePtr = STATE_DEAD + 1; 372c67d6573Sopenharmony_ci 373c67d6573Sopenharmony_ci/// A start state is a state that the DFA can start in. 374c67d6573Sopenharmony_ci/// 375c67d6573Sopenharmony_ci/// Note that start states have their lower bits set to a state pointer. 376c67d6573Sopenharmony_ciconst STATE_START: StatePtr = 1 << 30; 377c67d6573Sopenharmony_ci 378c67d6573Sopenharmony_ci/// A match state means that the regex has successfully matched. 379c67d6573Sopenharmony_ci/// 380c67d6573Sopenharmony_ci/// Note that match states have their lower bits set to a state pointer. 381c67d6573Sopenharmony_ciconst STATE_MATCH: StatePtr = 1 << 29; 382c67d6573Sopenharmony_ci 383c67d6573Sopenharmony_ci/// The maximum state pointer. This is useful to mask out the "valid" state 384c67d6573Sopenharmony_ci/// pointer from a state with the "start" or "match" bits set. 385c67d6573Sopenharmony_ci/// 386c67d6573Sopenharmony_ci/// It doesn't make sense to use this with unknown, dead or quit state 387c67d6573Sopenharmony_ci/// pointers, since those pointers are sentinels and never have their lower 388c67d6573Sopenharmony_ci/// bits set to anything meaningful. 389c67d6573Sopenharmony_ciconst STATE_MAX: StatePtr = STATE_MATCH - 1; 390c67d6573Sopenharmony_ci 391c67d6573Sopenharmony_ci/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the 392c67d6573Sopenharmony_ci/// special EOF sentinel value. 393c67d6573Sopenharmony_ci#[derive(Copy, Clone, Debug)] 394c67d6573Sopenharmony_cistruct Byte(u16); 395c67d6573Sopenharmony_ci 396c67d6573Sopenharmony_ci/// A set of flags for zero-width assertions. 397c67d6573Sopenharmony_ci#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)] 398c67d6573Sopenharmony_cistruct EmptyFlags { 399c67d6573Sopenharmony_ci start: bool, 400c67d6573Sopenharmony_ci end: bool, 401c67d6573Sopenharmony_ci start_line: bool, 402c67d6573Sopenharmony_ci end_line: bool, 403c67d6573Sopenharmony_ci word_boundary: bool, 404c67d6573Sopenharmony_ci not_word_boundary: bool, 405c67d6573Sopenharmony_ci} 406c67d6573Sopenharmony_ci 407c67d6573Sopenharmony_ci/// A set of flags describing various configurations of a DFA state. This is 408c67d6573Sopenharmony_ci/// represented by a `u8` so that it is compact. 409c67d6573Sopenharmony_ci#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)] 410c67d6573Sopenharmony_cistruct StateFlags(u8); 411c67d6573Sopenharmony_ci 412c67d6573Sopenharmony_ciimpl Cache { 413c67d6573Sopenharmony_ci /// Create new empty cache for the DFA engine. 414c67d6573Sopenharmony_ci pub fn new(prog: &Program) -> Self { 415c67d6573Sopenharmony_ci // We add 1 to account for the special EOF byte. 416c67d6573Sopenharmony_ci let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; 417c67d6573Sopenharmony_ci let starts = vec![STATE_UNKNOWN; 256]; 418c67d6573Sopenharmony_ci let mut cache = Cache { 419c67d6573Sopenharmony_ci inner: CacheInner { 420c67d6573Sopenharmony_ci compiled: StateMap::new(num_byte_classes), 421c67d6573Sopenharmony_ci trans: Transitions::new(num_byte_classes), 422c67d6573Sopenharmony_ci start_states: starts, 423c67d6573Sopenharmony_ci stack: vec![], 424c67d6573Sopenharmony_ci flush_count: 0, 425c67d6573Sopenharmony_ci size: 0, 426c67d6573Sopenharmony_ci insts_scratch_space: vec![], 427c67d6573Sopenharmony_ci }, 428c67d6573Sopenharmony_ci qcur: SparseSet::new(prog.insts.len()), 429c67d6573Sopenharmony_ci qnext: SparseSet::new(prog.insts.len()), 430c67d6573Sopenharmony_ci }; 431c67d6573Sopenharmony_ci cache.inner.reset_size(); 432c67d6573Sopenharmony_ci cache 433c67d6573Sopenharmony_ci } 434c67d6573Sopenharmony_ci} 435c67d6573Sopenharmony_ci 436c67d6573Sopenharmony_ciimpl CacheInner { 437c67d6573Sopenharmony_ci /// Resets the cache size to account for fixed costs, such as the program 438c67d6573Sopenharmony_ci /// and stack sizes. 439c67d6573Sopenharmony_ci fn reset_size(&mut self) { 440c67d6573Sopenharmony_ci self.size = (self.start_states.len() * mem::size_of::<StatePtr>()) 441c67d6573Sopenharmony_ci + (self.stack.len() * mem::size_of::<InstPtr>()); 442c67d6573Sopenharmony_ci } 443c67d6573Sopenharmony_ci} 444c67d6573Sopenharmony_ci 445c67d6573Sopenharmony_ciimpl<'a> Fsm<'a> { 446c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 447c67d6573Sopenharmony_ci pub fn forward( 448c67d6573Sopenharmony_ci prog: &'a Program, 449c67d6573Sopenharmony_ci cache: &ProgramCache, 450c67d6573Sopenharmony_ci quit_after_match: bool, 451c67d6573Sopenharmony_ci text: &[u8], 452c67d6573Sopenharmony_ci at: usize, 453c67d6573Sopenharmony_ci ) -> Result<usize> { 454c67d6573Sopenharmony_ci let mut cache = cache.borrow_mut(); 455c67d6573Sopenharmony_ci let cache = &mut cache.dfa; 456c67d6573Sopenharmony_ci let mut dfa = Fsm { 457c67d6573Sopenharmony_ci prog, 458c67d6573Sopenharmony_ci start: 0, // filled in below 459c67d6573Sopenharmony_ci at, 460c67d6573Sopenharmony_ci quit_after_match, 461c67d6573Sopenharmony_ci last_match_si: STATE_UNKNOWN, 462c67d6573Sopenharmony_ci last_cache_flush: at, 463c67d6573Sopenharmony_ci cache: &mut cache.inner, 464c67d6573Sopenharmony_ci }; 465c67d6573Sopenharmony_ci let (empty_flags, state_flags) = dfa.start_flags(text, at); 466c67d6573Sopenharmony_ci dfa.start = 467c67d6573Sopenharmony_ci match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { 468c67d6573Sopenharmony_ci None => return Result::Quit, 469c67d6573Sopenharmony_ci Some(STATE_DEAD) => return Result::NoMatch(at), 470c67d6573Sopenharmony_ci Some(si) => si, 471c67d6573Sopenharmony_ci }; 472c67d6573Sopenharmony_ci debug_assert!(dfa.start != STATE_UNKNOWN); 473c67d6573Sopenharmony_ci dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) 474c67d6573Sopenharmony_ci } 475c67d6573Sopenharmony_ci 476c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 477c67d6573Sopenharmony_ci pub fn reverse( 478c67d6573Sopenharmony_ci prog: &'a Program, 479c67d6573Sopenharmony_ci cache: &ProgramCache, 480c67d6573Sopenharmony_ci quit_after_match: bool, 481c67d6573Sopenharmony_ci text: &[u8], 482c67d6573Sopenharmony_ci at: usize, 483c67d6573Sopenharmony_ci ) -> Result<usize> { 484c67d6573Sopenharmony_ci let mut cache = cache.borrow_mut(); 485c67d6573Sopenharmony_ci let cache = &mut cache.dfa_reverse; 486c67d6573Sopenharmony_ci let mut dfa = Fsm { 487c67d6573Sopenharmony_ci prog, 488c67d6573Sopenharmony_ci start: 0, // filled in below 489c67d6573Sopenharmony_ci at, 490c67d6573Sopenharmony_ci quit_after_match, 491c67d6573Sopenharmony_ci last_match_si: STATE_UNKNOWN, 492c67d6573Sopenharmony_ci last_cache_flush: at, 493c67d6573Sopenharmony_ci cache: &mut cache.inner, 494c67d6573Sopenharmony_ci }; 495c67d6573Sopenharmony_ci let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at); 496c67d6573Sopenharmony_ci dfa.start = 497c67d6573Sopenharmony_ci match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { 498c67d6573Sopenharmony_ci None => return Result::Quit, 499c67d6573Sopenharmony_ci Some(STATE_DEAD) => return Result::NoMatch(at), 500c67d6573Sopenharmony_ci Some(si) => si, 501c67d6573Sopenharmony_ci }; 502c67d6573Sopenharmony_ci debug_assert!(dfa.start != STATE_UNKNOWN); 503c67d6573Sopenharmony_ci dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text) 504c67d6573Sopenharmony_ci } 505c67d6573Sopenharmony_ci 506c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 507c67d6573Sopenharmony_ci pub fn forward_many( 508c67d6573Sopenharmony_ci prog: &'a Program, 509c67d6573Sopenharmony_ci cache: &ProgramCache, 510c67d6573Sopenharmony_ci matches: &mut [bool], 511c67d6573Sopenharmony_ci text: &[u8], 512c67d6573Sopenharmony_ci at: usize, 513c67d6573Sopenharmony_ci ) -> Result<usize> { 514c67d6573Sopenharmony_ci debug_assert!(matches.len() == prog.matches.len()); 515c67d6573Sopenharmony_ci let mut cache = cache.borrow_mut(); 516c67d6573Sopenharmony_ci let cache = &mut cache.dfa; 517c67d6573Sopenharmony_ci let mut dfa = Fsm { 518c67d6573Sopenharmony_ci prog, 519c67d6573Sopenharmony_ci start: 0, // filled in below 520c67d6573Sopenharmony_ci at, 521c67d6573Sopenharmony_ci quit_after_match: false, 522c67d6573Sopenharmony_ci last_match_si: STATE_UNKNOWN, 523c67d6573Sopenharmony_ci last_cache_flush: at, 524c67d6573Sopenharmony_ci cache: &mut cache.inner, 525c67d6573Sopenharmony_ci }; 526c67d6573Sopenharmony_ci let (empty_flags, state_flags) = dfa.start_flags(text, at); 527c67d6573Sopenharmony_ci dfa.start = 528c67d6573Sopenharmony_ci match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { 529c67d6573Sopenharmony_ci None => return Result::Quit, 530c67d6573Sopenharmony_ci Some(STATE_DEAD) => return Result::NoMatch(at), 531c67d6573Sopenharmony_ci Some(si) => si, 532c67d6573Sopenharmony_ci }; 533c67d6573Sopenharmony_ci debug_assert!(dfa.start != STATE_UNKNOWN); 534c67d6573Sopenharmony_ci let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text); 535c67d6573Sopenharmony_ci if result.is_match() { 536c67d6573Sopenharmony_ci if matches.len() == 1 { 537c67d6573Sopenharmony_ci matches[0] = true; 538c67d6573Sopenharmony_ci } else { 539c67d6573Sopenharmony_ci debug_assert!(dfa.last_match_si != STATE_UNKNOWN); 540c67d6573Sopenharmony_ci debug_assert!(dfa.last_match_si != STATE_DEAD); 541c67d6573Sopenharmony_ci for ip in dfa.state(dfa.last_match_si).inst_ptrs() { 542c67d6573Sopenharmony_ci if let Inst::Match(slot) = dfa.prog[ip] { 543c67d6573Sopenharmony_ci matches[slot] = true; 544c67d6573Sopenharmony_ci } 545c67d6573Sopenharmony_ci } 546c67d6573Sopenharmony_ci } 547c67d6573Sopenharmony_ci } 548c67d6573Sopenharmony_ci result 549c67d6573Sopenharmony_ci } 550c67d6573Sopenharmony_ci 551c67d6573Sopenharmony_ci /// Executes the DFA on a forward NFA. 552c67d6573Sopenharmony_ci /// 553c67d6573Sopenharmony_ci /// {qcur,qnext} are scratch ordered sets which may be non-empty. 554c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 555c67d6573Sopenharmony_ci fn exec_at( 556c67d6573Sopenharmony_ci &mut self, 557c67d6573Sopenharmony_ci qcur: &mut SparseSet, 558c67d6573Sopenharmony_ci qnext: &mut SparseSet, 559c67d6573Sopenharmony_ci text: &[u8], 560c67d6573Sopenharmony_ci ) -> Result<usize> { 561c67d6573Sopenharmony_ci // For the most part, the DFA is basically: 562c67d6573Sopenharmony_ci // 563c67d6573Sopenharmony_ci // last_match = null 564c67d6573Sopenharmony_ci // while current_byte != EOF: 565c67d6573Sopenharmony_ci // si = current_state.next[current_byte] 566c67d6573Sopenharmony_ci // if si is match 567c67d6573Sopenharmony_ci // last_match = si 568c67d6573Sopenharmony_ci // return last_match 569c67d6573Sopenharmony_ci // 570c67d6573Sopenharmony_ci // However, we need to deal with a few things: 571c67d6573Sopenharmony_ci // 572c67d6573Sopenharmony_ci // 1. This is an *online* DFA, so the current state's next list 573c67d6573Sopenharmony_ci // may not point to anywhere yet, so we must go out and compute 574c67d6573Sopenharmony_ci // them. (They are then cached into the current state's next list 575c67d6573Sopenharmony_ci // to avoid re-computation.) 576c67d6573Sopenharmony_ci // 2. If we come across a state that is known to be dead (i.e., never 577c67d6573Sopenharmony_ci // leads to a match), then we can quit early. 578c67d6573Sopenharmony_ci // 3. If the caller just wants to know if a match occurs, then we 579c67d6573Sopenharmony_ci // can quit as soon as we know we have a match. (Full leftmost 580c67d6573Sopenharmony_ci // first semantics require continuing on.) 581c67d6573Sopenharmony_ci // 4. If we're in the start state, then we can use a pre-computed set 582c67d6573Sopenharmony_ci // of prefix literals to skip quickly along the input. 583c67d6573Sopenharmony_ci // 5. After the input is exhausted, we run the DFA on one symbol 584c67d6573Sopenharmony_ci // that stands for EOF. This is useful for handling empty width 585c67d6573Sopenharmony_ci // assertions. 586c67d6573Sopenharmony_ci // 6. We can't actually do state.next[byte]. Instead, we have to do 587c67d6573Sopenharmony_ci // state.next[byte_classes[byte]], which permits us to keep the 588c67d6573Sopenharmony_ci // 'next' list very small. 589c67d6573Sopenharmony_ci // 590c67d6573Sopenharmony_ci // Since there's a bunch of extra stuff we need to consider, we do some 591c67d6573Sopenharmony_ci // pretty hairy tricks to get the inner loop to run as fast as 592c67d6573Sopenharmony_ci // possible. 593c67d6573Sopenharmony_ci debug_assert!(!self.prog.is_reverse); 594c67d6573Sopenharmony_ci 595c67d6573Sopenharmony_ci // The last match is the currently known ending match position. It is 596c67d6573Sopenharmony_ci // reported as an index to the most recent byte that resulted in a 597c67d6573Sopenharmony_ci // transition to a match state and is always stored in capture slot `1` 598c67d6573Sopenharmony_ci // when searching forwards. Its maximum value is `text.len()`. 599c67d6573Sopenharmony_ci let mut result = Result::NoMatch(self.at); 600c67d6573Sopenharmony_ci let (mut prev_si, mut next_si) = (self.start, self.start); 601c67d6573Sopenharmony_ci let mut at = self.at; 602c67d6573Sopenharmony_ci while at < text.len() { 603c67d6573Sopenharmony_ci // This is the real inner loop. We take advantage of special bits 604c67d6573Sopenharmony_ci // set in the state pointer to determine whether a state is in the 605c67d6573Sopenharmony_ci // "common" case or not. Specifically, the common case is a 606c67d6573Sopenharmony_ci // non-match non-start non-dead state that has already been 607c67d6573Sopenharmony_ci // computed. So long as we remain in the common case, this inner 608c67d6573Sopenharmony_ci // loop will chew through the input. 609c67d6573Sopenharmony_ci // 610c67d6573Sopenharmony_ci // We also unroll the loop 4 times to amortize the cost of checking 611c67d6573Sopenharmony_ci // whether we've consumed the entire input. We are also careful 612c67d6573Sopenharmony_ci // to make sure that `prev_si` always represents the previous state 613c67d6573Sopenharmony_ci // and `next_si` always represents the next state after the loop 614c67d6573Sopenharmony_ci // exits, even if it isn't always true inside the loop. 615c67d6573Sopenharmony_ci while next_si <= STATE_MAX && at < text.len() { 616c67d6573Sopenharmony_ci // Argument for safety is in the definition of next_si. 617c67d6573Sopenharmony_ci prev_si = unsafe { self.next_si(next_si, text, at) }; 618c67d6573Sopenharmony_ci at += 1; 619c67d6573Sopenharmony_ci if prev_si > STATE_MAX || at + 2 >= text.len() { 620c67d6573Sopenharmony_ci mem::swap(&mut prev_si, &mut next_si); 621c67d6573Sopenharmony_ci break; 622c67d6573Sopenharmony_ci } 623c67d6573Sopenharmony_ci next_si = unsafe { self.next_si(prev_si, text, at) }; 624c67d6573Sopenharmony_ci at += 1; 625c67d6573Sopenharmony_ci if next_si > STATE_MAX { 626c67d6573Sopenharmony_ci break; 627c67d6573Sopenharmony_ci } 628c67d6573Sopenharmony_ci prev_si = unsafe { self.next_si(next_si, text, at) }; 629c67d6573Sopenharmony_ci at += 1; 630c67d6573Sopenharmony_ci if prev_si > STATE_MAX { 631c67d6573Sopenharmony_ci mem::swap(&mut prev_si, &mut next_si); 632c67d6573Sopenharmony_ci break; 633c67d6573Sopenharmony_ci } 634c67d6573Sopenharmony_ci next_si = unsafe { self.next_si(prev_si, text, at) }; 635c67d6573Sopenharmony_ci at += 1; 636c67d6573Sopenharmony_ci } 637c67d6573Sopenharmony_ci if next_si & STATE_MATCH > 0 { 638c67d6573Sopenharmony_ci // A match state is outside of the common case because it needs 639c67d6573Sopenharmony_ci // special case analysis. In particular, we need to record the 640c67d6573Sopenharmony_ci // last position as having matched and possibly quit the DFA if 641c67d6573Sopenharmony_ci // we don't need to keep matching. 642c67d6573Sopenharmony_ci next_si &= !STATE_MATCH; 643c67d6573Sopenharmony_ci result = Result::Match(at - 1); 644c67d6573Sopenharmony_ci if self.quit_after_match { 645c67d6573Sopenharmony_ci return result; 646c67d6573Sopenharmony_ci } 647c67d6573Sopenharmony_ci self.last_match_si = next_si; 648c67d6573Sopenharmony_ci prev_si = next_si; 649c67d6573Sopenharmony_ci 650c67d6573Sopenharmony_ci // This permits short-circuiting when matching a regex set. 651c67d6573Sopenharmony_ci // In particular, if this DFA state contains only match states, 652c67d6573Sopenharmony_ci // then it's impossible to extend the set of matches since 653c67d6573Sopenharmony_ci // match states are final. Therefore, we can quit. 654c67d6573Sopenharmony_ci if self.prog.matches.len() > 1 { 655c67d6573Sopenharmony_ci let state = self.state(next_si); 656c67d6573Sopenharmony_ci let just_matches = 657c67d6573Sopenharmony_ci state.inst_ptrs().all(|ip| self.prog[ip].is_match()); 658c67d6573Sopenharmony_ci if just_matches { 659c67d6573Sopenharmony_ci return result; 660c67d6573Sopenharmony_ci } 661c67d6573Sopenharmony_ci } 662c67d6573Sopenharmony_ci 663c67d6573Sopenharmony_ci // Another inner loop! If the DFA stays in this particular 664c67d6573Sopenharmony_ci // match state, then we can rip through all of the input 665c67d6573Sopenharmony_ci // very quickly, and only recording the match location once 666c67d6573Sopenharmony_ci // we've left this particular state. 667c67d6573Sopenharmony_ci let cur = at; 668c67d6573Sopenharmony_ci while (next_si & !STATE_MATCH) == prev_si 669c67d6573Sopenharmony_ci && at + 2 < text.len() 670c67d6573Sopenharmony_ci { 671c67d6573Sopenharmony_ci // Argument for safety is in the definition of next_si. 672c67d6573Sopenharmony_ci next_si = unsafe { 673c67d6573Sopenharmony_ci self.next_si(next_si & !STATE_MATCH, text, at) 674c67d6573Sopenharmony_ci }; 675c67d6573Sopenharmony_ci at += 1; 676c67d6573Sopenharmony_ci } 677c67d6573Sopenharmony_ci if at > cur { 678c67d6573Sopenharmony_ci result = Result::Match(at - 2); 679c67d6573Sopenharmony_ci } 680c67d6573Sopenharmony_ci } else if next_si & STATE_START > 0 { 681c67d6573Sopenharmony_ci // A start state isn't in the common case because we may 682c67d6573Sopenharmony_ci // want to do quick prefix scanning. If the program doesn't 683c67d6573Sopenharmony_ci // have a detected prefix, then start states are actually 684c67d6573Sopenharmony_ci // considered common and this case is never reached. 685c67d6573Sopenharmony_ci debug_assert!(self.has_prefix()); 686c67d6573Sopenharmony_ci next_si &= !STATE_START; 687c67d6573Sopenharmony_ci prev_si = next_si; 688c67d6573Sopenharmony_ci at = match self.prefix_at(text, at) { 689c67d6573Sopenharmony_ci None => return Result::NoMatch(text.len()), 690c67d6573Sopenharmony_ci Some(i) => i, 691c67d6573Sopenharmony_ci }; 692c67d6573Sopenharmony_ci } else if next_si >= STATE_UNKNOWN { 693c67d6573Sopenharmony_ci if next_si == STATE_QUIT { 694c67d6573Sopenharmony_ci return Result::Quit; 695c67d6573Sopenharmony_ci } 696c67d6573Sopenharmony_ci // Finally, this corresponds to the case where the transition 697c67d6573Sopenharmony_ci // entered a state that can never lead to a match or a state 698c67d6573Sopenharmony_ci // that hasn't been computed yet. The latter being the "slow" 699c67d6573Sopenharmony_ci // path. 700c67d6573Sopenharmony_ci let byte = Byte::byte(text[at - 1]); 701c67d6573Sopenharmony_ci // We no longer care about the special bits in the state 702c67d6573Sopenharmony_ci // pointer. 703c67d6573Sopenharmony_ci prev_si &= STATE_MAX; 704c67d6573Sopenharmony_ci // Record where we are. This is used to track progress for 705c67d6573Sopenharmony_ci // determining whether we should quit if we've flushed the 706c67d6573Sopenharmony_ci // cache too much. 707c67d6573Sopenharmony_ci self.at = at; 708c67d6573Sopenharmony_ci next_si = match self.next_state(qcur, qnext, prev_si, byte) { 709c67d6573Sopenharmony_ci None => return Result::Quit, 710c67d6573Sopenharmony_ci Some(STATE_DEAD) => return result.set_non_match(at), 711c67d6573Sopenharmony_ci Some(si) => si, 712c67d6573Sopenharmony_ci }; 713c67d6573Sopenharmony_ci debug_assert!(next_si != STATE_UNKNOWN); 714c67d6573Sopenharmony_ci if next_si & STATE_MATCH > 0 { 715c67d6573Sopenharmony_ci next_si &= !STATE_MATCH; 716c67d6573Sopenharmony_ci result = Result::Match(at - 1); 717c67d6573Sopenharmony_ci if self.quit_after_match { 718c67d6573Sopenharmony_ci return result; 719c67d6573Sopenharmony_ci } 720c67d6573Sopenharmony_ci self.last_match_si = next_si; 721c67d6573Sopenharmony_ci } 722c67d6573Sopenharmony_ci prev_si = next_si; 723c67d6573Sopenharmony_ci } else { 724c67d6573Sopenharmony_ci prev_si = next_si; 725c67d6573Sopenharmony_ci } 726c67d6573Sopenharmony_ci } 727c67d6573Sopenharmony_ci 728c67d6573Sopenharmony_ci // Run the DFA once more on the special EOF sentinel value. 729c67d6573Sopenharmony_ci // We don't care about the special bits in the state pointer any more, 730c67d6573Sopenharmony_ci // so get rid of them. 731c67d6573Sopenharmony_ci prev_si &= STATE_MAX; 732c67d6573Sopenharmony_ci prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { 733c67d6573Sopenharmony_ci None => return Result::Quit, 734c67d6573Sopenharmony_ci Some(STATE_DEAD) => return result.set_non_match(text.len()), 735c67d6573Sopenharmony_ci Some(si) => si & !STATE_START, 736c67d6573Sopenharmony_ci }; 737c67d6573Sopenharmony_ci debug_assert!(prev_si != STATE_UNKNOWN); 738c67d6573Sopenharmony_ci if prev_si & STATE_MATCH > 0 { 739c67d6573Sopenharmony_ci prev_si &= !STATE_MATCH; 740c67d6573Sopenharmony_ci self.last_match_si = prev_si; 741c67d6573Sopenharmony_ci result = Result::Match(text.len()); 742c67d6573Sopenharmony_ci } 743c67d6573Sopenharmony_ci result 744c67d6573Sopenharmony_ci } 745c67d6573Sopenharmony_ci 746c67d6573Sopenharmony_ci /// Executes the DFA on a reverse NFA. 747c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 748c67d6573Sopenharmony_ci fn exec_at_reverse( 749c67d6573Sopenharmony_ci &mut self, 750c67d6573Sopenharmony_ci qcur: &mut SparseSet, 751c67d6573Sopenharmony_ci qnext: &mut SparseSet, 752c67d6573Sopenharmony_ci text: &[u8], 753c67d6573Sopenharmony_ci ) -> Result<usize> { 754c67d6573Sopenharmony_ci // The comments in `exec_at` above mostly apply here too. The main 755c67d6573Sopenharmony_ci // difference is that we move backwards over the input and we look for 756c67d6573Sopenharmony_ci // the longest possible match instead of the leftmost-first match. 757c67d6573Sopenharmony_ci // 758c67d6573Sopenharmony_ci // N.B. The code duplication here is regrettable. Efforts to improve 759c67d6573Sopenharmony_ci // it without sacrificing performance are welcome. ---AG 760c67d6573Sopenharmony_ci debug_assert!(self.prog.is_reverse); 761c67d6573Sopenharmony_ci let mut result = Result::NoMatch(self.at); 762c67d6573Sopenharmony_ci let (mut prev_si, mut next_si) = (self.start, self.start); 763c67d6573Sopenharmony_ci let mut at = self.at; 764c67d6573Sopenharmony_ci while at > 0 { 765c67d6573Sopenharmony_ci while next_si <= STATE_MAX && at > 0 { 766c67d6573Sopenharmony_ci // Argument for safety is in the definition of next_si. 767c67d6573Sopenharmony_ci at -= 1; 768c67d6573Sopenharmony_ci prev_si = unsafe { self.next_si(next_si, text, at) }; 769c67d6573Sopenharmony_ci if prev_si > STATE_MAX || at <= 4 { 770c67d6573Sopenharmony_ci mem::swap(&mut prev_si, &mut next_si); 771c67d6573Sopenharmony_ci break; 772c67d6573Sopenharmony_ci } 773c67d6573Sopenharmony_ci at -= 1; 774c67d6573Sopenharmony_ci next_si = unsafe { self.next_si(prev_si, text, at) }; 775c67d6573Sopenharmony_ci if next_si > STATE_MAX { 776c67d6573Sopenharmony_ci break; 777c67d6573Sopenharmony_ci } 778c67d6573Sopenharmony_ci at -= 1; 779c67d6573Sopenharmony_ci prev_si = unsafe { self.next_si(next_si, text, at) }; 780c67d6573Sopenharmony_ci if prev_si > STATE_MAX { 781c67d6573Sopenharmony_ci mem::swap(&mut prev_si, &mut next_si); 782c67d6573Sopenharmony_ci break; 783c67d6573Sopenharmony_ci } 784c67d6573Sopenharmony_ci at -= 1; 785c67d6573Sopenharmony_ci next_si = unsafe { self.next_si(prev_si, text, at) }; 786c67d6573Sopenharmony_ci } 787c67d6573Sopenharmony_ci if next_si & STATE_MATCH > 0 { 788c67d6573Sopenharmony_ci next_si &= !STATE_MATCH; 789c67d6573Sopenharmony_ci result = Result::Match(at + 1); 790c67d6573Sopenharmony_ci if self.quit_after_match { 791c67d6573Sopenharmony_ci return result; 792c67d6573Sopenharmony_ci } 793c67d6573Sopenharmony_ci self.last_match_si = next_si; 794c67d6573Sopenharmony_ci prev_si = next_si; 795c67d6573Sopenharmony_ci let cur = at; 796c67d6573Sopenharmony_ci while (next_si & !STATE_MATCH) == prev_si && at >= 2 { 797c67d6573Sopenharmony_ci // Argument for safety is in the definition of next_si. 798c67d6573Sopenharmony_ci at -= 1; 799c67d6573Sopenharmony_ci next_si = unsafe { 800c67d6573Sopenharmony_ci self.next_si(next_si & !STATE_MATCH, text, at) 801c67d6573Sopenharmony_ci }; 802c67d6573Sopenharmony_ci } 803c67d6573Sopenharmony_ci if at < cur { 804c67d6573Sopenharmony_ci result = Result::Match(at + 2); 805c67d6573Sopenharmony_ci } 806c67d6573Sopenharmony_ci } else if next_si >= STATE_UNKNOWN { 807c67d6573Sopenharmony_ci if next_si == STATE_QUIT { 808c67d6573Sopenharmony_ci return Result::Quit; 809c67d6573Sopenharmony_ci } 810c67d6573Sopenharmony_ci let byte = Byte::byte(text[at]); 811c67d6573Sopenharmony_ci prev_si &= STATE_MAX; 812c67d6573Sopenharmony_ci self.at = at; 813c67d6573Sopenharmony_ci next_si = match self.next_state(qcur, qnext, prev_si, byte) { 814c67d6573Sopenharmony_ci None => return Result::Quit, 815c67d6573Sopenharmony_ci Some(STATE_DEAD) => return result.set_non_match(at), 816c67d6573Sopenharmony_ci Some(si) => si, 817c67d6573Sopenharmony_ci }; 818c67d6573Sopenharmony_ci debug_assert!(next_si != STATE_UNKNOWN); 819c67d6573Sopenharmony_ci if next_si & STATE_MATCH > 0 { 820c67d6573Sopenharmony_ci next_si &= !STATE_MATCH; 821c67d6573Sopenharmony_ci result = Result::Match(at + 1); 822c67d6573Sopenharmony_ci if self.quit_after_match { 823c67d6573Sopenharmony_ci return result; 824c67d6573Sopenharmony_ci } 825c67d6573Sopenharmony_ci self.last_match_si = next_si; 826c67d6573Sopenharmony_ci } 827c67d6573Sopenharmony_ci prev_si = next_si; 828c67d6573Sopenharmony_ci } else { 829c67d6573Sopenharmony_ci prev_si = next_si; 830c67d6573Sopenharmony_ci } 831c67d6573Sopenharmony_ci } 832c67d6573Sopenharmony_ci 833c67d6573Sopenharmony_ci // Run the DFA once more on the special EOF sentinel value. 834c67d6573Sopenharmony_ci prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { 835c67d6573Sopenharmony_ci None => return Result::Quit, 836c67d6573Sopenharmony_ci Some(STATE_DEAD) => return result.set_non_match(0), 837c67d6573Sopenharmony_ci Some(si) => si, 838c67d6573Sopenharmony_ci }; 839c67d6573Sopenharmony_ci debug_assert!(prev_si != STATE_UNKNOWN); 840c67d6573Sopenharmony_ci if prev_si & STATE_MATCH > 0 { 841c67d6573Sopenharmony_ci prev_si &= !STATE_MATCH; 842c67d6573Sopenharmony_ci self.last_match_si = prev_si; 843c67d6573Sopenharmony_ci result = Result::Match(0); 844c67d6573Sopenharmony_ci } 845c67d6573Sopenharmony_ci result 846c67d6573Sopenharmony_ci } 847c67d6573Sopenharmony_ci 848c67d6573Sopenharmony_ci /// next_si transitions to the next state, where the transition input 849c67d6573Sopenharmony_ci /// corresponds to text[i]. 850c67d6573Sopenharmony_ci /// 851c67d6573Sopenharmony_ci /// This elides bounds checks, and is therefore not safe. 852c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 853c67d6573Sopenharmony_ci unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr { 854c67d6573Sopenharmony_ci // What is the argument for safety here? 855c67d6573Sopenharmony_ci // We have three unchecked accesses that could possibly violate safety: 856c67d6573Sopenharmony_ci // 857c67d6573Sopenharmony_ci // 1. The given byte of input (`text[i]`). 858c67d6573Sopenharmony_ci // 2. The class of the byte of input (`classes[text[i]]`). 859c67d6573Sopenharmony_ci // 3. The transition for the class (`trans[si + cls]`). 860c67d6573Sopenharmony_ci // 861c67d6573Sopenharmony_ci // (1) is only safe when calling next_si is guarded by 862c67d6573Sopenharmony_ci // `i < text.len()`. 863c67d6573Sopenharmony_ci // 864c67d6573Sopenharmony_ci // (2) is the easiest case to guarantee since `text[i]` is always a 865c67d6573Sopenharmony_ci // `u8` and `self.prog.byte_classes` always has length `u8::MAX`. 866c67d6573Sopenharmony_ci // (See `ByteClassSet.byte_classes` in `compile.rs`.) 867c67d6573Sopenharmony_ci // 868c67d6573Sopenharmony_ci // (3) is only safe if (1)+(2) are safe. Namely, the transitions 869c67d6573Sopenharmony_ci // of every state are defined to have length equal to the number of 870c67d6573Sopenharmony_ci // byte classes in the program. Therefore, a valid class leads to a 871c67d6573Sopenharmony_ci // valid transition. (All possible transitions are valid lookups, even 872c67d6573Sopenharmony_ci // if it points to a state that hasn't been computed yet.) (3) also 873c67d6573Sopenharmony_ci // relies on `si` being correct, but StatePtrs should only ever be 874c67d6573Sopenharmony_ci // retrieved from the transition table, which ensures they are correct. 875c67d6573Sopenharmony_ci debug_assert!(i < text.len()); 876c67d6573Sopenharmony_ci let b = *text.get_unchecked(i); 877c67d6573Sopenharmony_ci debug_assert!((b as usize) < self.prog.byte_classes.len()); 878c67d6573Sopenharmony_ci let cls = *self.prog.byte_classes.get_unchecked(b as usize); 879c67d6573Sopenharmony_ci self.cache.trans.next_unchecked(si, cls as usize) 880c67d6573Sopenharmony_ci } 881c67d6573Sopenharmony_ci 882c67d6573Sopenharmony_ci /// Computes the next state given the current state and the current input 883c67d6573Sopenharmony_ci /// byte (which may be EOF). 884c67d6573Sopenharmony_ci /// 885c67d6573Sopenharmony_ci /// If STATE_DEAD is returned, then there is no valid state transition. 886c67d6573Sopenharmony_ci /// This implies that no permutation of future input can lead to a match 887c67d6573Sopenharmony_ci /// state. 888c67d6573Sopenharmony_ci /// 889c67d6573Sopenharmony_ci /// STATE_UNKNOWN can never be returned. 890c67d6573Sopenharmony_ci fn exec_byte( 891c67d6573Sopenharmony_ci &mut self, 892c67d6573Sopenharmony_ci qcur: &mut SparseSet, 893c67d6573Sopenharmony_ci qnext: &mut SparseSet, 894c67d6573Sopenharmony_ci mut si: StatePtr, 895c67d6573Sopenharmony_ci b: Byte, 896c67d6573Sopenharmony_ci ) -> Option<StatePtr> { 897c67d6573Sopenharmony_ci use crate::prog::Inst::*; 898c67d6573Sopenharmony_ci 899c67d6573Sopenharmony_ci // Initialize a queue with the current DFA state's NFA states. 900c67d6573Sopenharmony_ci qcur.clear(); 901c67d6573Sopenharmony_ci for ip in self.state(si).inst_ptrs() { 902c67d6573Sopenharmony_ci qcur.insert(ip); 903c67d6573Sopenharmony_ci } 904c67d6573Sopenharmony_ci 905c67d6573Sopenharmony_ci // Before inspecting the current byte, we may need to also inspect 906c67d6573Sopenharmony_ci // whether the position immediately preceding the current byte 907c67d6573Sopenharmony_ci // satisfies the empty assertions found in the current state. 908c67d6573Sopenharmony_ci // 909c67d6573Sopenharmony_ci // We only need to do this step if there are any empty assertions in 910c67d6573Sopenharmony_ci // the current state. 911c67d6573Sopenharmony_ci let is_word_last = self.state(si).flags().is_word(); 912c67d6573Sopenharmony_ci let is_word = b.is_ascii_word(); 913c67d6573Sopenharmony_ci if self.state(si).flags().has_empty() { 914c67d6573Sopenharmony_ci // Compute the flags immediately preceding the current byte. 915c67d6573Sopenharmony_ci // This means we only care about the "end" or "end line" flags. 916c67d6573Sopenharmony_ci // (The "start" flags are computed immediately following the 917c67d6573Sopenharmony_ci // current byte and are handled below.) 918c67d6573Sopenharmony_ci let mut flags = EmptyFlags::default(); 919c67d6573Sopenharmony_ci if b.is_eof() { 920c67d6573Sopenharmony_ci flags.end = true; 921c67d6573Sopenharmony_ci flags.end_line = true; 922c67d6573Sopenharmony_ci } else if b.as_byte().map_or(false, |b| b == b'\n') { 923c67d6573Sopenharmony_ci flags.end_line = true; 924c67d6573Sopenharmony_ci } 925c67d6573Sopenharmony_ci if is_word_last == is_word { 926c67d6573Sopenharmony_ci flags.not_word_boundary = true; 927c67d6573Sopenharmony_ci } else { 928c67d6573Sopenharmony_ci flags.word_boundary = true; 929c67d6573Sopenharmony_ci } 930c67d6573Sopenharmony_ci // Now follow epsilon transitions from every NFA state, but make 931c67d6573Sopenharmony_ci // sure we only follow transitions that satisfy our flags. 932c67d6573Sopenharmony_ci qnext.clear(); 933c67d6573Sopenharmony_ci for &ip in &*qcur { 934c67d6573Sopenharmony_ci self.follow_epsilons(usize_to_u32(ip), qnext, flags); 935c67d6573Sopenharmony_ci } 936c67d6573Sopenharmony_ci mem::swap(qcur, qnext); 937c67d6573Sopenharmony_ci } 938c67d6573Sopenharmony_ci 939c67d6573Sopenharmony_ci // Now we set flags for immediately after the current byte. Since start 940c67d6573Sopenharmony_ci // states are processed separately, and are the only states that can 941c67d6573Sopenharmony_ci // have the StartText flag set, we therefore only need to worry about 942c67d6573Sopenharmony_ci // the StartLine flag here. 943c67d6573Sopenharmony_ci // 944c67d6573Sopenharmony_ci // We do also keep track of whether this DFA state contains a NFA state 945c67d6573Sopenharmony_ci // that is a matching state. This is precisely how we delay the DFA 946c67d6573Sopenharmony_ci // matching by one byte in order to process the special EOF sentinel 947c67d6573Sopenharmony_ci // byte. Namely, if this DFA state containing a matching NFA state, 948c67d6573Sopenharmony_ci // then it is the *next* DFA state that is marked as a match. 949c67d6573Sopenharmony_ci let mut empty_flags = EmptyFlags::default(); 950c67d6573Sopenharmony_ci let mut state_flags = StateFlags::default(); 951c67d6573Sopenharmony_ci empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n'); 952c67d6573Sopenharmony_ci if b.is_ascii_word() { 953c67d6573Sopenharmony_ci state_flags.set_word(); 954c67d6573Sopenharmony_ci } 955c67d6573Sopenharmony_ci // Now follow all epsilon transitions again, but only after consuming 956c67d6573Sopenharmony_ci // the current byte. 957c67d6573Sopenharmony_ci qnext.clear(); 958c67d6573Sopenharmony_ci for &ip in &*qcur { 959c67d6573Sopenharmony_ci match self.prog[ip as usize] { 960c67d6573Sopenharmony_ci // These states never happen in a byte-based program. 961c67d6573Sopenharmony_ci Char(_) | Ranges(_) => unreachable!(), 962c67d6573Sopenharmony_ci // These states are handled when following epsilon transitions. 963c67d6573Sopenharmony_ci Save(_) | Split(_) | EmptyLook(_) => {} 964c67d6573Sopenharmony_ci Match(_) => { 965c67d6573Sopenharmony_ci state_flags.set_match(); 966c67d6573Sopenharmony_ci if !self.continue_past_first_match() { 967c67d6573Sopenharmony_ci break; 968c67d6573Sopenharmony_ci } else if self.prog.matches.len() > 1 969c67d6573Sopenharmony_ci && !qnext.contains(ip as usize) 970c67d6573Sopenharmony_ci { 971c67d6573Sopenharmony_ci // If we are continuing on to find other matches, 972c67d6573Sopenharmony_ci // then keep a record of the match states we've seen. 973c67d6573Sopenharmony_ci qnext.insert(ip); 974c67d6573Sopenharmony_ci } 975c67d6573Sopenharmony_ci } 976c67d6573Sopenharmony_ci Bytes(ref inst) => { 977c67d6573Sopenharmony_ci if b.as_byte().map_or(false, |b| inst.matches(b)) { 978c67d6573Sopenharmony_ci self.follow_epsilons( 979c67d6573Sopenharmony_ci inst.goto as InstPtr, 980c67d6573Sopenharmony_ci qnext, 981c67d6573Sopenharmony_ci empty_flags, 982c67d6573Sopenharmony_ci ); 983c67d6573Sopenharmony_ci } 984c67d6573Sopenharmony_ci } 985c67d6573Sopenharmony_ci } 986c67d6573Sopenharmony_ci } 987c67d6573Sopenharmony_ci 988c67d6573Sopenharmony_ci let cache = if b.is_eof() && self.prog.matches.len() > 1 { 989c67d6573Sopenharmony_ci // If we're processing the last byte of the input and we're 990c67d6573Sopenharmony_ci // matching a regex set, then make the next state contain the 991c67d6573Sopenharmony_ci // previous states transitions. We do this so that the main 992c67d6573Sopenharmony_ci // matching loop can extract all of the match instructions. 993c67d6573Sopenharmony_ci mem::swap(qcur, qnext); 994c67d6573Sopenharmony_ci // And don't cache this state because it's totally bunk. 995c67d6573Sopenharmony_ci false 996c67d6573Sopenharmony_ci } else { 997c67d6573Sopenharmony_ci true 998c67d6573Sopenharmony_ci }; 999c67d6573Sopenharmony_ci 1000c67d6573Sopenharmony_ci // We've now built up the set of NFA states that ought to comprise the 1001c67d6573Sopenharmony_ci // next DFA state, so try to find it in the cache, and if it doesn't 1002c67d6573Sopenharmony_ci // exist, cache it. 1003c67d6573Sopenharmony_ci // 1004c67d6573Sopenharmony_ci // N.B. We pass `&mut si` here because the cache may clear itself if 1005c67d6573Sopenharmony_ci // it has gotten too full. When that happens, the location of the 1006c67d6573Sopenharmony_ci // current state may change. 1007c67d6573Sopenharmony_ci let mut next = 1008c67d6573Sopenharmony_ci match self.cached_state(qnext, state_flags, Some(&mut si)) { 1009c67d6573Sopenharmony_ci None => return None, 1010c67d6573Sopenharmony_ci Some(next) => next, 1011c67d6573Sopenharmony_ci }; 1012c67d6573Sopenharmony_ci if (self.start & !STATE_START) == next { 1013c67d6573Sopenharmony_ci // Start states can never be match states since all matches are 1014c67d6573Sopenharmony_ci // delayed by one byte. 1015c67d6573Sopenharmony_ci debug_assert!(!self.state(next).flags().is_match()); 1016c67d6573Sopenharmony_ci next = self.start_ptr(next); 1017c67d6573Sopenharmony_ci } 1018c67d6573Sopenharmony_ci if next <= STATE_MAX && self.state(next).flags().is_match() { 1019c67d6573Sopenharmony_ci next |= STATE_MATCH; 1020c67d6573Sopenharmony_ci } 1021c67d6573Sopenharmony_ci debug_assert!(next != STATE_UNKNOWN); 1022c67d6573Sopenharmony_ci // And now store our state in the current state's next list. 1023c67d6573Sopenharmony_ci if cache { 1024c67d6573Sopenharmony_ci let cls = self.byte_class(b); 1025c67d6573Sopenharmony_ci self.cache.trans.set_next(si, cls, next); 1026c67d6573Sopenharmony_ci } 1027c67d6573Sopenharmony_ci Some(next) 1028c67d6573Sopenharmony_ci } 1029c67d6573Sopenharmony_ci 1030c67d6573Sopenharmony_ci /// Follows the epsilon transitions starting at (and including) `ip`. The 1031c67d6573Sopenharmony_ci /// resulting states are inserted into the ordered set `q`. 1032c67d6573Sopenharmony_ci /// 1033c67d6573Sopenharmony_ci /// Conditional epsilon transitions (i.e., empty width assertions) are only 1034c67d6573Sopenharmony_ci /// followed if they are satisfied by the given flags, which should 1035c67d6573Sopenharmony_ci /// represent the flags set at the current location in the input. 1036c67d6573Sopenharmony_ci /// 1037c67d6573Sopenharmony_ci /// If the current location corresponds to the empty string, then only the 1038c67d6573Sopenharmony_ci /// end line and/or end text flags may be set. If the current location 1039c67d6573Sopenharmony_ci /// corresponds to a real byte in the input, then only the start line 1040c67d6573Sopenharmony_ci /// and/or start text flags may be set. 1041c67d6573Sopenharmony_ci /// 1042c67d6573Sopenharmony_ci /// As an exception to the above, when finding the initial state, any of 1043c67d6573Sopenharmony_ci /// the above flags may be set: 1044c67d6573Sopenharmony_ci /// 1045c67d6573Sopenharmony_ci /// If matching starts at the beginning of the input, then start text and 1046c67d6573Sopenharmony_ci /// start line should be set. If the input is empty, then end text and end 1047c67d6573Sopenharmony_ci /// line should also be set. 1048c67d6573Sopenharmony_ci /// 1049c67d6573Sopenharmony_ci /// If matching starts after the beginning of the input, then only start 1050c67d6573Sopenharmony_ci /// line should be set if the preceding byte is `\n`. End line should never 1051c67d6573Sopenharmony_ci /// be set in this case. (Even if the following byte is a `\n`, it will 1052c67d6573Sopenharmony_ci /// be handled in a subsequent DFA state.) 1053c67d6573Sopenharmony_ci fn follow_epsilons( 1054c67d6573Sopenharmony_ci &mut self, 1055c67d6573Sopenharmony_ci ip: InstPtr, 1056c67d6573Sopenharmony_ci q: &mut SparseSet, 1057c67d6573Sopenharmony_ci flags: EmptyFlags, 1058c67d6573Sopenharmony_ci ) { 1059c67d6573Sopenharmony_ci use crate::prog::EmptyLook::*; 1060c67d6573Sopenharmony_ci use crate::prog::Inst::*; 1061c67d6573Sopenharmony_ci 1062c67d6573Sopenharmony_ci // We need to traverse the NFA to follow epsilon transitions, so avoid 1063c67d6573Sopenharmony_ci // recursion with an explicit stack. 1064c67d6573Sopenharmony_ci self.cache.stack.push(ip); 1065c67d6573Sopenharmony_ci while let Some(mut ip) = self.cache.stack.pop() { 1066c67d6573Sopenharmony_ci // Try to munch through as many states as possible without 1067c67d6573Sopenharmony_ci // pushes/pops to the stack. 1068c67d6573Sopenharmony_ci loop { 1069c67d6573Sopenharmony_ci // Don't visit states we've already added. 1070c67d6573Sopenharmony_ci if q.contains(ip as usize) { 1071c67d6573Sopenharmony_ci break; 1072c67d6573Sopenharmony_ci } 1073c67d6573Sopenharmony_ci q.insert(ip as usize); 1074c67d6573Sopenharmony_ci match self.prog[ip as usize] { 1075c67d6573Sopenharmony_ci Char(_) | Ranges(_) => unreachable!(), 1076c67d6573Sopenharmony_ci Match(_) | Bytes(_) => { 1077c67d6573Sopenharmony_ci break; 1078c67d6573Sopenharmony_ci } 1079c67d6573Sopenharmony_ci EmptyLook(ref inst) => { 1080c67d6573Sopenharmony_ci // Only follow empty assertion states if our flags 1081c67d6573Sopenharmony_ci // satisfy the assertion. 1082c67d6573Sopenharmony_ci match inst.look { 1083c67d6573Sopenharmony_ci StartLine if flags.start_line => { 1084c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1085c67d6573Sopenharmony_ci } 1086c67d6573Sopenharmony_ci EndLine if flags.end_line => { 1087c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1088c67d6573Sopenharmony_ci } 1089c67d6573Sopenharmony_ci StartText if flags.start => { 1090c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1091c67d6573Sopenharmony_ci } 1092c67d6573Sopenharmony_ci EndText if flags.end => { 1093c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1094c67d6573Sopenharmony_ci } 1095c67d6573Sopenharmony_ci WordBoundaryAscii if flags.word_boundary => { 1096c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1097c67d6573Sopenharmony_ci } 1098c67d6573Sopenharmony_ci NotWordBoundaryAscii 1099c67d6573Sopenharmony_ci if flags.not_word_boundary => 1100c67d6573Sopenharmony_ci { 1101c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1102c67d6573Sopenharmony_ci } 1103c67d6573Sopenharmony_ci WordBoundary if flags.word_boundary => { 1104c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1105c67d6573Sopenharmony_ci } 1106c67d6573Sopenharmony_ci NotWordBoundary if flags.not_word_boundary => { 1107c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1108c67d6573Sopenharmony_ci } 1109c67d6573Sopenharmony_ci StartLine | EndLine | StartText | EndText 1110c67d6573Sopenharmony_ci | WordBoundaryAscii | NotWordBoundaryAscii 1111c67d6573Sopenharmony_ci | WordBoundary | NotWordBoundary => { 1112c67d6573Sopenharmony_ci break; 1113c67d6573Sopenharmony_ci } 1114c67d6573Sopenharmony_ci } 1115c67d6573Sopenharmony_ci } 1116c67d6573Sopenharmony_ci Save(ref inst) => { 1117c67d6573Sopenharmony_ci ip = inst.goto as InstPtr; 1118c67d6573Sopenharmony_ci } 1119c67d6573Sopenharmony_ci Split(ref inst) => { 1120c67d6573Sopenharmony_ci self.cache.stack.push(inst.goto2 as InstPtr); 1121c67d6573Sopenharmony_ci ip = inst.goto1 as InstPtr; 1122c67d6573Sopenharmony_ci } 1123c67d6573Sopenharmony_ci } 1124c67d6573Sopenharmony_ci } 1125c67d6573Sopenharmony_ci } 1126c67d6573Sopenharmony_ci } 1127c67d6573Sopenharmony_ci 1128c67d6573Sopenharmony_ci /// Find a previously computed state matching the given set of instructions 1129c67d6573Sopenharmony_ci /// and is_match bool. 1130c67d6573Sopenharmony_ci /// 1131c67d6573Sopenharmony_ci /// The given set of instructions should represent a single state in the 1132c67d6573Sopenharmony_ci /// NFA along with all states reachable without consuming any input. 1133c67d6573Sopenharmony_ci /// 1134c67d6573Sopenharmony_ci /// The is_match bool should be true if and only if the preceding DFA state 1135c67d6573Sopenharmony_ci /// contains an NFA matching state. The cached state produced here will 1136c67d6573Sopenharmony_ci /// then signify a match. (This enables us to delay a match by one byte, 1137c67d6573Sopenharmony_ci /// in order to account for the EOF sentinel byte.) 1138c67d6573Sopenharmony_ci /// 1139c67d6573Sopenharmony_ci /// If the cache is full, then it is wiped before caching a new state. 1140c67d6573Sopenharmony_ci /// 1141c67d6573Sopenharmony_ci /// The current state should be specified if it exists, since it will need 1142c67d6573Sopenharmony_ci /// to be preserved if the cache clears itself. (Start states are 1143c67d6573Sopenharmony_ci /// always saved, so they should not be passed here.) It takes a mutable 1144c67d6573Sopenharmony_ci /// pointer to the index because if the cache is cleared, the state's 1145c67d6573Sopenharmony_ci /// location may change. 1146c67d6573Sopenharmony_ci fn cached_state( 1147c67d6573Sopenharmony_ci &mut self, 1148c67d6573Sopenharmony_ci q: &SparseSet, 1149c67d6573Sopenharmony_ci mut state_flags: StateFlags, 1150c67d6573Sopenharmony_ci current_state: Option<&mut StatePtr>, 1151c67d6573Sopenharmony_ci ) -> Option<StatePtr> { 1152c67d6573Sopenharmony_ci // If we couldn't come up with a non-empty key to represent this state, 1153c67d6573Sopenharmony_ci // then it is dead and can never lead to a match. 1154c67d6573Sopenharmony_ci // 1155c67d6573Sopenharmony_ci // Note that inst_flags represent the set of empty width assertions 1156c67d6573Sopenharmony_ci // in q. We use this as an optimization in exec_byte to determine when 1157c67d6573Sopenharmony_ci // we should follow epsilon transitions at the empty string preceding 1158c67d6573Sopenharmony_ci // the current byte. 1159c67d6573Sopenharmony_ci let key = match self.cached_state_key(q, &mut state_flags) { 1160c67d6573Sopenharmony_ci None => return Some(STATE_DEAD), 1161c67d6573Sopenharmony_ci Some(v) => v, 1162c67d6573Sopenharmony_ci }; 1163c67d6573Sopenharmony_ci // In the cache? Cool. Done. 1164c67d6573Sopenharmony_ci if let Some(si) = self.cache.compiled.get_ptr(&key) { 1165c67d6573Sopenharmony_ci return Some(si); 1166c67d6573Sopenharmony_ci } 1167c67d6573Sopenharmony_ci // If the cache has gotten too big, wipe it. 1168c67d6573Sopenharmony_ci if self.approximate_size() > self.prog.dfa_size_limit 1169c67d6573Sopenharmony_ci && !self.clear_cache_and_save(current_state) 1170c67d6573Sopenharmony_ci { 1171c67d6573Sopenharmony_ci // Ooops. DFA is giving up. 1172c67d6573Sopenharmony_ci return None; 1173c67d6573Sopenharmony_ci } 1174c67d6573Sopenharmony_ci // Allocate room for our state and add it. 1175c67d6573Sopenharmony_ci self.add_state(key) 1176c67d6573Sopenharmony_ci } 1177c67d6573Sopenharmony_ci 1178c67d6573Sopenharmony_ci /// Produces a key suitable for describing a state in the DFA cache. 1179c67d6573Sopenharmony_ci /// 1180c67d6573Sopenharmony_ci /// The key invariant here is that equivalent keys are produced for any two 1181c67d6573Sopenharmony_ci /// sets of ordered NFA states (and toggling of whether the previous NFA 1182c67d6573Sopenharmony_ci /// states contain a match state) that do not discriminate a match for any 1183c67d6573Sopenharmony_ci /// input. 1184c67d6573Sopenharmony_ci /// 1185c67d6573Sopenharmony_ci /// Specifically, q should be an ordered set of NFA states and is_match 1186c67d6573Sopenharmony_ci /// should be true if and only if the previous NFA states contained a match 1187c67d6573Sopenharmony_ci /// state. 1188c67d6573Sopenharmony_ci fn cached_state_key( 1189c67d6573Sopenharmony_ci &mut self, 1190c67d6573Sopenharmony_ci q: &SparseSet, 1191c67d6573Sopenharmony_ci state_flags: &mut StateFlags, 1192c67d6573Sopenharmony_ci ) -> Option<State> { 1193c67d6573Sopenharmony_ci use crate::prog::Inst::*; 1194c67d6573Sopenharmony_ci 1195c67d6573Sopenharmony_ci // We need to build up enough information to recognize pre-built states 1196c67d6573Sopenharmony_ci // in the DFA. Generally speaking, this includes every instruction 1197c67d6573Sopenharmony_ci // except for those which are purely epsilon transitions, e.g., the 1198c67d6573Sopenharmony_ci // Save and Split instructions. 1199c67d6573Sopenharmony_ci // 1200c67d6573Sopenharmony_ci // Empty width assertions are also epsilon transitions, but since they 1201c67d6573Sopenharmony_ci // are conditional, we need to make them part of a state's key in the 1202c67d6573Sopenharmony_ci // cache. 1203c67d6573Sopenharmony_ci 1204c67d6573Sopenharmony_ci let mut insts = 1205c67d6573Sopenharmony_ci mem::replace(&mut self.cache.insts_scratch_space, vec![]); 1206c67d6573Sopenharmony_ci insts.clear(); 1207c67d6573Sopenharmony_ci // Reserve 1 byte for flags. 1208c67d6573Sopenharmony_ci insts.push(0); 1209c67d6573Sopenharmony_ci 1210c67d6573Sopenharmony_ci let mut prev = 0; 1211c67d6573Sopenharmony_ci for &ip in q { 1212c67d6573Sopenharmony_ci let ip = usize_to_u32(ip); 1213c67d6573Sopenharmony_ci match self.prog[ip as usize] { 1214c67d6573Sopenharmony_ci Char(_) | Ranges(_) => unreachable!(), 1215c67d6573Sopenharmony_ci Save(_) | Split(_) => {} 1216c67d6573Sopenharmony_ci Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip), 1217c67d6573Sopenharmony_ci EmptyLook(_) => { 1218c67d6573Sopenharmony_ci state_flags.set_empty(); 1219c67d6573Sopenharmony_ci push_inst_ptr(&mut insts, &mut prev, ip) 1220c67d6573Sopenharmony_ci } 1221c67d6573Sopenharmony_ci Match(_) => { 1222c67d6573Sopenharmony_ci push_inst_ptr(&mut insts, &mut prev, ip); 1223c67d6573Sopenharmony_ci if !self.continue_past_first_match() { 1224c67d6573Sopenharmony_ci break; 1225c67d6573Sopenharmony_ci } 1226c67d6573Sopenharmony_ci } 1227c67d6573Sopenharmony_ci } 1228c67d6573Sopenharmony_ci } 1229c67d6573Sopenharmony_ci // If we couldn't transition to any other instructions and we didn't 1230c67d6573Sopenharmony_ci // see a match when expanding NFA states previously, then this is a 1231c67d6573Sopenharmony_ci // dead state and no amount of additional input can transition out 1232c67d6573Sopenharmony_ci // of this state. 1233c67d6573Sopenharmony_ci let opt_state = if insts.len() == 1 && !state_flags.is_match() { 1234c67d6573Sopenharmony_ci None 1235c67d6573Sopenharmony_ci } else { 1236c67d6573Sopenharmony_ci let StateFlags(f) = *state_flags; 1237c67d6573Sopenharmony_ci insts[0] = f; 1238c67d6573Sopenharmony_ci Some(State { data: Arc::from(&*insts) }) 1239c67d6573Sopenharmony_ci }; 1240c67d6573Sopenharmony_ci self.cache.insts_scratch_space = insts; 1241c67d6573Sopenharmony_ci opt_state 1242c67d6573Sopenharmony_ci } 1243c67d6573Sopenharmony_ci 1244c67d6573Sopenharmony_ci /// Clears the cache, but saves and restores current_state if it is not 1245c67d6573Sopenharmony_ci /// none. 1246c67d6573Sopenharmony_ci /// 1247c67d6573Sopenharmony_ci /// The current state must be provided here in case its location in the 1248c67d6573Sopenharmony_ci /// cache changes. 1249c67d6573Sopenharmony_ci /// 1250c67d6573Sopenharmony_ci /// This returns false if the cache is not cleared and the DFA should 1251c67d6573Sopenharmony_ci /// give up. 1252c67d6573Sopenharmony_ci fn clear_cache_and_save( 1253c67d6573Sopenharmony_ci &mut self, 1254c67d6573Sopenharmony_ci current_state: Option<&mut StatePtr>, 1255c67d6573Sopenharmony_ci ) -> bool { 1256c67d6573Sopenharmony_ci if self.cache.compiled.is_empty() { 1257c67d6573Sopenharmony_ci // Nothing to clear... 1258c67d6573Sopenharmony_ci return true; 1259c67d6573Sopenharmony_ci } 1260c67d6573Sopenharmony_ci match current_state { 1261c67d6573Sopenharmony_ci None => self.clear_cache(), 1262c67d6573Sopenharmony_ci Some(si) => { 1263c67d6573Sopenharmony_ci let cur = self.state(*si).clone(); 1264c67d6573Sopenharmony_ci if !self.clear_cache() { 1265c67d6573Sopenharmony_ci return false; 1266c67d6573Sopenharmony_ci } 1267c67d6573Sopenharmony_ci // The unwrap is OK because we just cleared the cache and 1268c67d6573Sopenharmony_ci // therefore know that the next state pointer won't exceed 1269c67d6573Sopenharmony_ci // STATE_MAX. 1270c67d6573Sopenharmony_ci *si = self.restore_state(cur).unwrap(); 1271c67d6573Sopenharmony_ci true 1272c67d6573Sopenharmony_ci } 1273c67d6573Sopenharmony_ci } 1274c67d6573Sopenharmony_ci } 1275c67d6573Sopenharmony_ci 1276c67d6573Sopenharmony_ci /// Wipes the state cache, but saves and restores the current start state. 1277c67d6573Sopenharmony_ci /// 1278c67d6573Sopenharmony_ci /// This returns false if the cache is not cleared and the DFA should 1279c67d6573Sopenharmony_ci /// give up. 1280c67d6573Sopenharmony_ci fn clear_cache(&mut self) -> bool { 1281c67d6573Sopenharmony_ci // Bail out of the DFA if we're moving too "slowly." 1282c67d6573Sopenharmony_ci // A heuristic from RE2: assume the DFA is too slow if it is processing 1283c67d6573Sopenharmony_ci // 10 or fewer bytes per state. 1284c67d6573Sopenharmony_ci // Additionally, we permit the cache to be flushed a few times before 1285c67d6573Sopenharmony_ci // caling it quits. 1286c67d6573Sopenharmony_ci let nstates = self.cache.compiled.len(); 1287c67d6573Sopenharmony_ci if self.cache.flush_count >= 3 1288c67d6573Sopenharmony_ci && self.at >= self.last_cache_flush 1289c67d6573Sopenharmony_ci && (self.at - self.last_cache_flush) <= 10 * nstates 1290c67d6573Sopenharmony_ci { 1291c67d6573Sopenharmony_ci return false; 1292c67d6573Sopenharmony_ci } 1293c67d6573Sopenharmony_ci // Update statistics tracking cache flushes. 1294c67d6573Sopenharmony_ci self.last_cache_flush = self.at; 1295c67d6573Sopenharmony_ci self.cache.flush_count += 1; 1296c67d6573Sopenharmony_ci 1297c67d6573Sopenharmony_ci // OK, actually flush the cache. 1298c67d6573Sopenharmony_ci let start = self.state(self.start & !STATE_START).clone(); 1299c67d6573Sopenharmony_ci let last_match = if self.last_match_si <= STATE_MAX { 1300c67d6573Sopenharmony_ci Some(self.state(self.last_match_si).clone()) 1301c67d6573Sopenharmony_ci } else { 1302c67d6573Sopenharmony_ci None 1303c67d6573Sopenharmony_ci }; 1304c67d6573Sopenharmony_ci self.cache.reset_size(); 1305c67d6573Sopenharmony_ci self.cache.trans.clear(); 1306c67d6573Sopenharmony_ci self.cache.compiled.clear(); 1307c67d6573Sopenharmony_ci for s in &mut self.cache.start_states { 1308c67d6573Sopenharmony_ci *s = STATE_UNKNOWN; 1309c67d6573Sopenharmony_ci } 1310c67d6573Sopenharmony_ci // The unwraps are OK because we just cleared the cache and therefore 1311c67d6573Sopenharmony_ci // know that the next state pointer won't exceed STATE_MAX. 1312c67d6573Sopenharmony_ci let start_ptr = self.restore_state(start).unwrap(); 1313c67d6573Sopenharmony_ci self.start = self.start_ptr(start_ptr); 1314c67d6573Sopenharmony_ci if let Some(last_match) = last_match { 1315c67d6573Sopenharmony_ci self.last_match_si = self.restore_state(last_match).unwrap(); 1316c67d6573Sopenharmony_ci } 1317c67d6573Sopenharmony_ci true 1318c67d6573Sopenharmony_ci } 1319c67d6573Sopenharmony_ci 1320c67d6573Sopenharmony_ci /// Restores the given state back into the cache, and returns a pointer 1321c67d6573Sopenharmony_ci /// to it. 1322c67d6573Sopenharmony_ci fn restore_state(&mut self, state: State) -> Option<StatePtr> { 1323c67d6573Sopenharmony_ci // If we've already stored this state, just return a pointer to it. 1324c67d6573Sopenharmony_ci // None will be the wiser. 1325c67d6573Sopenharmony_ci if let Some(si) = self.cache.compiled.get_ptr(&state) { 1326c67d6573Sopenharmony_ci return Some(si); 1327c67d6573Sopenharmony_ci } 1328c67d6573Sopenharmony_ci self.add_state(state) 1329c67d6573Sopenharmony_ci } 1330c67d6573Sopenharmony_ci 1331c67d6573Sopenharmony_ci /// Returns the next state given the current state si and current byte 1332c67d6573Sopenharmony_ci /// b. {qcur,qnext} are used as scratch space for storing ordered NFA 1333c67d6573Sopenharmony_ci /// states. 1334c67d6573Sopenharmony_ci /// 1335c67d6573Sopenharmony_ci /// This tries to fetch the next state from the cache, but if that fails, 1336c67d6573Sopenharmony_ci /// it computes the next state, caches it and returns a pointer to it. 1337c67d6573Sopenharmony_ci /// 1338c67d6573Sopenharmony_ci /// The pointer can be to a real state, or it can be STATE_DEAD. 1339c67d6573Sopenharmony_ci /// STATE_UNKNOWN cannot be returned. 1340c67d6573Sopenharmony_ci /// 1341c67d6573Sopenharmony_ci /// None is returned if a new state could not be allocated (i.e., the DFA 1342c67d6573Sopenharmony_ci /// ran out of space and thinks it's running too slowly). 1343c67d6573Sopenharmony_ci fn next_state( 1344c67d6573Sopenharmony_ci &mut self, 1345c67d6573Sopenharmony_ci qcur: &mut SparseSet, 1346c67d6573Sopenharmony_ci qnext: &mut SparseSet, 1347c67d6573Sopenharmony_ci si: StatePtr, 1348c67d6573Sopenharmony_ci b: Byte, 1349c67d6573Sopenharmony_ci ) -> Option<StatePtr> { 1350c67d6573Sopenharmony_ci if si == STATE_DEAD { 1351c67d6573Sopenharmony_ci return Some(STATE_DEAD); 1352c67d6573Sopenharmony_ci } 1353c67d6573Sopenharmony_ci match self.cache.trans.next(si, self.byte_class(b)) { 1354c67d6573Sopenharmony_ci STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), 1355c67d6573Sopenharmony_ci STATE_QUIT => None, 1356c67d6573Sopenharmony_ci nsi => Some(nsi), 1357c67d6573Sopenharmony_ci } 1358c67d6573Sopenharmony_ci } 1359c67d6573Sopenharmony_ci 1360c67d6573Sopenharmony_ci /// Computes and returns the start state, where searching begins at 1361c67d6573Sopenharmony_ci /// position `at` in `text`. If the state has already been computed, 1362c67d6573Sopenharmony_ci /// then it is pulled from the cache. If the state hasn't been cached, 1363c67d6573Sopenharmony_ci /// then it is computed, cached and a pointer to it is returned. 1364c67d6573Sopenharmony_ci /// 1365c67d6573Sopenharmony_ci /// This may return STATE_DEAD but never STATE_UNKNOWN. 1366c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 1367c67d6573Sopenharmony_ci fn start_state( 1368c67d6573Sopenharmony_ci &mut self, 1369c67d6573Sopenharmony_ci q: &mut SparseSet, 1370c67d6573Sopenharmony_ci empty_flags: EmptyFlags, 1371c67d6573Sopenharmony_ci state_flags: StateFlags, 1372c67d6573Sopenharmony_ci ) -> Option<StatePtr> { 1373c67d6573Sopenharmony_ci // Compute an index into our cache of start states based on the set 1374c67d6573Sopenharmony_ci // of empty/state flags set at the current position in the input. We 1375c67d6573Sopenharmony_ci // don't use every flag since not all flags matter. For example, since 1376c67d6573Sopenharmony_ci // matches are delayed by one byte, start states can never be match 1377c67d6573Sopenharmony_ci // states. 1378c67d6573Sopenharmony_ci let flagi = { 1379c67d6573Sopenharmony_ci (((empty_flags.start as u8) << 0) 1380c67d6573Sopenharmony_ci | ((empty_flags.end as u8) << 1) 1381c67d6573Sopenharmony_ci | ((empty_flags.start_line as u8) << 2) 1382c67d6573Sopenharmony_ci | ((empty_flags.end_line as u8) << 3) 1383c67d6573Sopenharmony_ci | ((empty_flags.word_boundary as u8) << 4) 1384c67d6573Sopenharmony_ci | ((empty_flags.not_word_boundary as u8) << 5) 1385c67d6573Sopenharmony_ci | ((state_flags.is_word() as u8) << 6)) as usize 1386c67d6573Sopenharmony_ci }; 1387c67d6573Sopenharmony_ci match self.cache.start_states[flagi] { 1388c67d6573Sopenharmony_ci STATE_UNKNOWN => {} 1389c67d6573Sopenharmony_ci si => return Some(si), 1390c67d6573Sopenharmony_ci } 1391c67d6573Sopenharmony_ci q.clear(); 1392c67d6573Sopenharmony_ci let start = usize_to_u32(self.prog.start); 1393c67d6573Sopenharmony_ci self.follow_epsilons(start, q, empty_flags); 1394c67d6573Sopenharmony_ci // Start states can never be match states because we delay every match 1395c67d6573Sopenharmony_ci // by one byte. Given an empty string and an empty match, the match 1396c67d6573Sopenharmony_ci // won't actually occur until the DFA processes the special EOF 1397c67d6573Sopenharmony_ci // sentinel byte. 1398c67d6573Sopenharmony_ci let sp = match self.cached_state(q, state_flags, None) { 1399c67d6573Sopenharmony_ci None => return None, 1400c67d6573Sopenharmony_ci Some(sp) => self.start_ptr(sp), 1401c67d6573Sopenharmony_ci }; 1402c67d6573Sopenharmony_ci self.cache.start_states[flagi] = sp; 1403c67d6573Sopenharmony_ci Some(sp) 1404c67d6573Sopenharmony_ci } 1405c67d6573Sopenharmony_ci 1406c67d6573Sopenharmony_ci /// Computes the set of starting flags for the given position in text. 1407c67d6573Sopenharmony_ci /// 1408c67d6573Sopenharmony_ci /// This should only be used when executing the DFA forwards over the 1409c67d6573Sopenharmony_ci /// input. 1410c67d6573Sopenharmony_ci fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) { 1411c67d6573Sopenharmony_ci let mut empty_flags = EmptyFlags::default(); 1412c67d6573Sopenharmony_ci let mut state_flags = StateFlags::default(); 1413c67d6573Sopenharmony_ci empty_flags.start = at == 0; 1414c67d6573Sopenharmony_ci empty_flags.end = text.is_empty(); 1415c67d6573Sopenharmony_ci empty_flags.start_line = at == 0 || text[at - 1] == b'\n'; 1416c67d6573Sopenharmony_ci empty_flags.end_line = text.is_empty(); 1417c67d6573Sopenharmony_ci 1418c67d6573Sopenharmony_ci let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); 1419c67d6573Sopenharmony_ci let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word(); 1420c67d6573Sopenharmony_ci if is_word_last { 1421c67d6573Sopenharmony_ci state_flags.set_word(); 1422c67d6573Sopenharmony_ci } 1423c67d6573Sopenharmony_ci if is_word == is_word_last { 1424c67d6573Sopenharmony_ci empty_flags.not_word_boundary = true; 1425c67d6573Sopenharmony_ci } else { 1426c67d6573Sopenharmony_ci empty_flags.word_boundary = true; 1427c67d6573Sopenharmony_ci } 1428c67d6573Sopenharmony_ci (empty_flags, state_flags) 1429c67d6573Sopenharmony_ci } 1430c67d6573Sopenharmony_ci 1431c67d6573Sopenharmony_ci /// Computes the set of starting flags for the given position in text. 1432c67d6573Sopenharmony_ci /// 1433c67d6573Sopenharmony_ci /// This should only be used when executing the DFA in reverse over the 1434c67d6573Sopenharmony_ci /// input. 1435c67d6573Sopenharmony_ci fn start_flags_reverse( 1436c67d6573Sopenharmony_ci &self, 1437c67d6573Sopenharmony_ci text: &[u8], 1438c67d6573Sopenharmony_ci at: usize, 1439c67d6573Sopenharmony_ci ) -> (EmptyFlags, StateFlags) { 1440c67d6573Sopenharmony_ci let mut empty_flags = EmptyFlags::default(); 1441c67d6573Sopenharmony_ci let mut state_flags = StateFlags::default(); 1442c67d6573Sopenharmony_ci empty_flags.start = at == text.len(); 1443c67d6573Sopenharmony_ci empty_flags.end = text.is_empty(); 1444c67d6573Sopenharmony_ci empty_flags.start_line = at == text.len() || text[at] == b'\n'; 1445c67d6573Sopenharmony_ci empty_flags.end_line = text.is_empty(); 1446c67d6573Sopenharmony_ci 1447c67d6573Sopenharmony_ci let is_word_last = 1448c67d6573Sopenharmony_ci at < text.len() && Byte::byte(text[at]).is_ascii_word(); 1449c67d6573Sopenharmony_ci let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); 1450c67d6573Sopenharmony_ci if is_word_last { 1451c67d6573Sopenharmony_ci state_flags.set_word(); 1452c67d6573Sopenharmony_ci } 1453c67d6573Sopenharmony_ci if is_word == is_word_last { 1454c67d6573Sopenharmony_ci empty_flags.not_word_boundary = true; 1455c67d6573Sopenharmony_ci } else { 1456c67d6573Sopenharmony_ci empty_flags.word_boundary = true; 1457c67d6573Sopenharmony_ci } 1458c67d6573Sopenharmony_ci (empty_flags, state_flags) 1459c67d6573Sopenharmony_ci } 1460c67d6573Sopenharmony_ci 1461c67d6573Sopenharmony_ci /// Returns a reference to a State given a pointer to it. 1462c67d6573Sopenharmony_ci fn state(&self, si: StatePtr) -> &State { 1463c67d6573Sopenharmony_ci self.cache.compiled.get_state(si).unwrap() 1464c67d6573Sopenharmony_ci } 1465c67d6573Sopenharmony_ci 1466c67d6573Sopenharmony_ci /// Adds the given state to the DFA. 1467c67d6573Sopenharmony_ci /// 1468c67d6573Sopenharmony_ci /// This allocates room for transitions out of this state in 1469c67d6573Sopenharmony_ci /// self.cache.trans. The transitions can be set with the returned 1470c67d6573Sopenharmony_ci /// StatePtr. 1471c67d6573Sopenharmony_ci /// 1472c67d6573Sopenharmony_ci /// If None is returned, then the state limit was reached and the DFA 1473c67d6573Sopenharmony_ci /// should quit. 1474c67d6573Sopenharmony_ci fn add_state(&mut self, state: State) -> Option<StatePtr> { 1475c67d6573Sopenharmony_ci // This will fail if the next state pointer exceeds STATE_PTR. In 1476c67d6573Sopenharmony_ci // practice, the cache limit will prevent us from ever getting here, 1477c67d6573Sopenharmony_ci // but maybe callers will set the cache size to something ridiculous... 1478c67d6573Sopenharmony_ci let si = match self.cache.trans.add() { 1479c67d6573Sopenharmony_ci None => return None, 1480c67d6573Sopenharmony_ci Some(si) => si, 1481c67d6573Sopenharmony_ci }; 1482c67d6573Sopenharmony_ci // If the program has a Unicode word boundary, then set any transitions 1483c67d6573Sopenharmony_ci // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a 1484c67d6573Sopenharmony_ci // transition, then it will quit and an alternative matching engine 1485c67d6573Sopenharmony_ci // will take over. 1486c67d6573Sopenharmony_ci if self.prog.has_unicode_word_boundary { 1487c67d6573Sopenharmony_ci for b in 128..256 { 1488c67d6573Sopenharmony_ci let cls = self.byte_class(Byte::byte(b as u8)); 1489c67d6573Sopenharmony_ci self.cache.trans.set_next(si, cls, STATE_QUIT); 1490c67d6573Sopenharmony_ci } 1491c67d6573Sopenharmony_ci } 1492c67d6573Sopenharmony_ci // Finally, put our actual state on to our heap of states and index it 1493c67d6573Sopenharmony_ci // so we can find it later. 1494c67d6573Sopenharmony_ci self.cache.size += self.cache.trans.state_heap_size() 1495c67d6573Sopenharmony_ci + state.data.len() 1496c67d6573Sopenharmony_ci + (2 * mem::size_of::<State>()) 1497c67d6573Sopenharmony_ci + mem::size_of::<StatePtr>(); 1498c67d6573Sopenharmony_ci self.cache.compiled.insert(state, si); 1499c67d6573Sopenharmony_ci // Transition table and set of states and map should all be in sync. 1500c67d6573Sopenharmony_ci debug_assert!( 1501c67d6573Sopenharmony_ci self.cache.compiled.len() == self.cache.trans.num_states() 1502c67d6573Sopenharmony_ci ); 1503c67d6573Sopenharmony_ci Some(si) 1504c67d6573Sopenharmony_ci } 1505c67d6573Sopenharmony_ci 1506c67d6573Sopenharmony_ci /// Quickly finds the next occurrence of any literal prefixes in the regex. 1507c67d6573Sopenharmony_ci /// If there are no literal prefixes, then the current position is 1508c67d6573Sopenharmony_ci /// returned. If there are literal prefixes and one could not be found, 1509c67d6573Sopenharmony_ci /// then None is returned. 1510c67d6573Sopenharmony_ci /// 1511c67d6573Sopenharmony_ci /// This should only be called when the DFA is in a start state. 1512c67d6573Sopenharmony_ci fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> { 1513c67d6573Sopenharmony_ci self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s) 1514c67d6573Sopenharmony_ci } 1515c67d6573Sopenharmony_ci 1516c67d6573Sopenharmony_ci /// Returns the number of byte classes required to discriminate transitions 1517c67d6573Sopenharmony_ci /// in each state. 1518c67d6573Sopenharmony_ci /// 1519c67d6573Sopenharmony_ci /// invariant: num_byte_classes() == len(State.next) 1520c67d6573Sopenharmony_ci fn num_byte_classes(&self) -> usize { 1521c67d6573Sopenharmony_ci // We add 1 to account for the special EOF byte. 1522c67d6573Sopenharmony_ci (self.prog.byte_classes[255] as usize + 1) + 1 1523c67d6573Sopenharmony_ci } 1524c67d6573Sopenharmony_ci 1525c67d6573Sopenharmony_ci /// Given an input byte or the special EOF sentinel, return its 1526c67d6573Sopenharmony_ci /// corresponding byte class. 1527c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 1528c67d6573Sopenharmony_ci fn byte_class(&self, b: Byte) -> usize { 1529c67d6573Sopenharmony_ci match b.as_byte() { 1530c67d6573Sopenharmony_ci None => self.num_byte_classes() - 1, 1531c67d6573Sopenharmony_ci Some(b) => self.u8_class(b), 1532c67d6573Sopenharmony_ci } 1533c67d6573Sopenharmony_ci } 1534c67d6573Sopenharmony_ci 1535c67d6573Sopenharmony_ci /// Like byte_class, but explicitly for u8s. 1536c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 1537c67d6573Sopenharmony_ci fn u8_class(&self, b: u8) -> usize { 1538c67d6573Sopenharmony_ci self.prog.byte_classes[b as usize] as usize 1539c67d6573Sopenharmony_ci } 1540c67d6573Sopenharmony_ci 1541c67d6573Sopenharmony_ci /// Returns true if the DFA should continue searching past the first match. 1542c67d6573Sopenharmony_ci /// 1543c67d6573Sopenharmony_ci /// Leftmost first semantics in the DFA are preserved by not following NFA 1544c67d6573Sopenharmony_ci /// transitions after the first match is seen. 1545c67d6573Sopenharmony_ci /// 1546c67d6573Sopenharmony_ci /// On occasion, we want to avoid leftmost first semantics to find either 1547c67d6573Sopenharmony_ci /// the longest match (for reverse search) or all possible matches (for 1548c67d6573Sopenharmony_ci /// regex sets). 1549c67d6573Sopenharmony_ci fn continue_past_first_match(&self) -> bool { 1550c67d6573Sopenharmony_ci self.prog.is_reverse || self.prog.matches.len() > 1 1551c67d6573Sopenharmony_ci } 1552c67d6573Sopenharmony_ci 1553c67d6573Sopenharmony_ci /// Returns true if there is a prefix we can quickly search for. 1554c67d6573Sopenharmony_ci fn has_prefix(&self) -> bool { 1555c67d6573Sopenharmony_ci !self.prog.is_reverse 1556c67d6573Sopenharmony_ci && !self.prog.prefixes.is_empty() 1557c67d6573Sopenharmony_ci && !self.prog.is_anchored_start 1558c67d6573Sopenharmony_ci } 1559c67d6573Sopenharmony_ci 1560c67d6573Sopenharmony_ci /// Sets the STATE_START bit in the given state pointer if and only if 1561c67d6573Sopenharmony_ci /// we have a prefix to scan for. 1562c67d6573Sopenharmony_ci /// 1563c67d6573Sopenharmony_ci /// If there's no prefix, then it's a waste to treat the start state 1564c67d6573Sopenharmony_ci /// specially. 1565c67d6573Sopenharmony_ci fn start_ptr(&self, si: StatePtr) -> StatePtr { 1566c67d6573Sopenharmony_ci if self.has_prefix() { 1567c67d6573Sopenharmony_ci si | STATE_START 1568c67d6573Sopenharmony_ci } else { 1569c67d6573Sopenharmony_ci si 1570c67d6573Sopenharmony_ci } 1571c67d6573Sopenharmony_ci } 1572c67d6573Sopenharmony_ci 1573c67d6573Sopenharmony_ci /// Approximate size returns the approximate heap space currently used by 1574c67d6573Sopenharmony_ci /// the DFA. It is used to determine whether the DFA's state cache needs to 1575c67d6573Sopenharmony_ci /// be wiped. Namely, it is possible that for certain regexes on certain 1576c67d6573Sopenharmony_ci /// inputs, a new state could be created for every byte of input. (This is 1577c67d6573Sopenharmony_ci /// bad for memory use, so we bound it with a cache.) 1578c67d6573Sopenharmony_ci fn approximate_size(&self) -> usize { 1579c67d6573Sopenharmony_ci self.cache.size + self.prog.approximate_size() 1580c67d6573Sopenharmony_ci } 1581c67d6573Sopenharmony_ci} 1582c67d6573Sopenharmony_ci 1583c67d6573Sopenharmony_ci/// An abstraction for representing a map of states. The map supports two 1584c67d6573Sopenharmony_ci/// different ways of state lookup. One is fast constant time access via a 1585c67d6573Sopenharmony_ci/// state pointer. The other is a hashmap lookup based on the DFA's 1586c67d6573Sopenharmony_ci/// constituent NFA states. 1587c67d6573Sopenharmony_ci/// 1588c67d6573Sopenharmony_ci/// A DFA state internally uses an Arc such that we only need to store the 1589c67d6573Sopenharmony_ci/// set of NFA states on the heap once, even though we support looking up 1590c67d6573Sopenharmony_ci/// states by two different means. A more natural way to express this might 1591c67d6573Sopenharmony_ci/// use raw pointers, but an Arc is safe and effectively achieves the same 1592c67d6573Sopenharmony_ci/// thing. 1593c67d6573Sopenharmony_ci#[derive(Debug)] 1594c67d6573Sopenharmony_cistruct StateMap { 1595c67d6573Sopenharmony_ci /// The keys are not actually static but rely on always pointing to a 1596c67d6573Sopenharmony_ci /// buffer in `states` which will never be moved except when clearing 1597c67d6573Sopenharmony_ci /// the map or on drop, in which case the keys of this map will be 1598c67d6573Sopenharmony_ci /// removed before 1599c67d6573Sopenharmony_ci map: HashMap<State, StatePtr>, 1600c67d6573Sopenharmony_ci /// Our set of states. Note that `StatePtr / num_byte_classes` indexes 1601c67d6573Sopenharmony_ci /// this Vec rather than just a `StatePtr`. 1602c67d6573Sopenharmony_ci states: Vec<State>, 1603c67d6573Sopenharmony_ci /// The number of byte classes in the DFA. Used to index `states`. 1604c67d6573Sopenharmony_ci num_byte_classes: usize, 1605c67d6573Sopenharmony_ci} 1606c67d6573Sopenharmony_ci 1607c67d6573Sopenharmony_ciimpl StateMap { 1608c67d6573Sopenharmony_ci fn new(num_byte_classes: usize) -> StateMap { 1609c67d6573Sopenharmony_ci StateMap { map: HashMap::new(), states: vec![], num_byte_classes } 1610c67d6573Sopenharmony_ci } 1611c67d6573Sopenharmony_ci 1612c67d6573Sopenharmony_ci fn len(&self) -> usize { 1613c67d6573Sopenharmony_ci self.states.len() 1614c67d6573Sopenharmony_ci } 1615c67d6573Sopenharmony_ci 1616c67d6573Sopenharmony_ci fn is_empty(&self) -> bool { 1617c67d6573Sopenharmony_ci self.states.is_empty() 1618c67d6573Sopenharmony_ci } 1619c67d6573Sopenharmony_ci 1620c67d6573Sopenharmony_ci fn get_ptr(&self, state: &State) -> Option<StatePtr> { 1621c67d6573Sopenharmony_ci self.map.get(state).cloned() 1622c67d6573Sopenharmony_ci } 1623c67d6573Sopenharmony_ci 1624c67d6573Sopenharmony_ci fn get_state(&self, si: StatePtr) -> Option<&State> { 1625c67d6573Sopenharmony_ci self.states.get(si as usize / self.num_byte_classes) 1626c67d6573Sopenharmony_ci } 1627c67d6573Sopenharmony_ci 1628c67d6573Sopenharmony_ci fn insert(&mut self, state: State, si: StatePtr) { 1629c67d6573Sopenharmony_ci self.map.insert(state.clone(), si); 1630c67d6573Sopenharmony_ci self.states.push(state); 1631c67d6573Sopenharmony_ci } 1632c67d6573Sopenharmony_ci 1633c67d6573Sopenharmony_ci fn clear(&mut self) { 1634c67d6573Sopenharmony_ci self.map.clear(); 1635c67d6573Sopenharmony_ci self.states.clear(); 1636c67d6573Sopenharmony_ci } 1637c67d6573Sopenharmony_ci} 1638c67d6573Sopenharmony_ci 1639c67d6573Sopenharmony_ciimpl Transitions { 1640c67d6573Sopenharmony_ci /// Create a new transition table. 1641c67d6573Sopenharmony_ci /// 1642c67d6573Sopenharmony_ci /// The number of byte classes corresponds to the stride. Every state will 1643c67d6573Sopenharmony_ci /// have `num_byte_classes` slots for transitions. 1644c67d6573Sopenharmony_ci fn new(num_byte_classes: usize) -> Transitions { 1645c67d6573Sopenharmony_ci Transitions { table: vec![], num_byte_classes } 1646c67d6573Sopenharmony_ci } 1647c67d6573Sopenharmony_ci 1648c67d6573Sopenharmony_ci /// Returns the total number of states currently in this table. 1649c67d6573Sopenharmony_ci fn num_states(&self) -> usize { 1650c67d6573Sopenharmony_ci self.table.len() / self.num_byte_classes 1651c67d6573Sopenharmony_ci } 1652c67d6573Sopenharmony_ci 1653c67d6573Sopenharmony_ci /// Allocates room for one additional state and returns a pointer to it. 1654c67d6573Sopenharmony_ci /// 1655c67d6573Sopenharmony_ci /// If there's no more room, None is returned. 1656c67d6573Sopenharmony_ci fn add(&mut self) -> Option<StatePtr> { 1657c67d6573Sopenharmony_ci let si = self.table.len(); 1658c67d6573Sopenharmony_ci if si > STATE_MAX as usize { 1659c67d6573Sopenharmony_ci return None; 1660c67d6573Sopenharmony_ci } 1661c67d6573Sopenharmony_ci self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes)); 1662c67d6573Sopenharmony_ci Some(usize_to_u32(si)) 1663c67d6573Sopenharmony_ci } 1664c67d6573Sopenharmony_ci 1665c67d6573Sopenharmony_ci /// Clears the table of all states. 1666c67d6573Sopenharmony_ci fn clear(&mut self) { 1667c67d6573Sopenharmony_ci self.table.clear(); 1668c67d6573Sopenharmony_ci } 1669c67d6573Sopenharmony_ci 1670c67d6573Sopenharmony_ci /// Sets the transition from (si, cls) to next. 1671c67d6573Sopenharmony_ci fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) { 1672c67d6573Sopenharmony_ci self.table[si as usize + cls] = next; 1673c67d6573Sopenharmony_ci } 1674c67d6573Sopenharmony_ci 1675c67d6573Sopenharmony_ci /// Returns the transition corresponding to (si, cls). 1676c67d6573Sopenharmony_ci fn next(&self, si: StatePtr, cls: usize) -> StatePtr { 1677c67d6573Sopenharmony_ci self.table[si as usize + cls] 1678c67d6573Sopenharmony_ci } 1679c67d6573Sopenharmony_ci 1680c67d6573Sopenharmony_ci /// The heap size, in bytes, of a single state in the transition table. 1681c67d6573Sopenharmony_ci fn state_heap_size(&self) -> usize { 1682c67d6573Sopenharmony_ci self.num_byte_classes * mem::size_of::<StatePtr>() 1683c67d6573Sopenharmony_ci } 1684c67d6573Sopenharmony_ci 1685c67d6573Sopenharmony_ci /// Like `next`, but uses unchecked access and is therefore not safe. 1686c67d6573Sopenharmony_ci unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr { 1687c67d6573Sopenharmony_ci debug_assert!((si as usize) < self.table.len()); 1688c67d6573Sopenharmony_ci debug_assert!(cls < self.num_byte_classes); 1689c67d6573Sopenharmony_ci *self.table.get_unchecked(si as usize + cls) 1690c67d6573Sopenharmony_ci } 1691c67d6573Sopenharmony_ci} 1692c67d6573Sopenharmony_ci 1693c67d6573Sopenharmony_ciimpl StateFlags { 1694c67d6573Sopenharmony_ci fn is_match(&self) -> bool { 1695c67d6573Sopenharmony_ci self.0 & 0b0000_0001 > 0 1696c67d6573Sopenharmony_ci } 1697c67d6573Sopenharmony_ci 1698c67d6573Sopenharmony_ci fn set_match(&mut self) { 1699c67d6573Sopenharmony_ci self.0 |= 0b0000_0001; 1700c67d6573Sopenharmony_ci } 1701c67d6573Sopenharmony_ci 1702c67d6573Sopenharmony_ci fn is_word(&self) -> bool { 1703c67d6573Sopenharmony_ci self.0 & 0b0000_0010 > 0 1704c67d6573Sopenharmony_ci } 1705c67d6573Sopenharmony_ci 1706c67d6573Sopenharmony_ci fn set_word(&mut self) { 1707c67d6573Sopenharmony_ci self.0 |= 0b0000_0010; 1708c67d6573Sopenharmony_ci } 1709c67d6573Sopenharmony_ci 1710c67d6573Sopenharmony_ci fn has_empty(&self) -> bool { 1711c67d6573Sopenharmony_ci self.0 & 0b0000_0100 > 0 1712c67d6573Sopenharmony_ci } 1713c67d6573Sopenharmony_ci 1714c67d6573Sopenharmony_ci fn set_empty(&mut self) { 1715c67d6573Sopenharmony_ci self.0 |= 0b0000_0100; 1716c67d6573Sopenharmony_ci } 1717c67d6573Sopenharmony_ci} 1718c67d6573Sopenharmony_ci 1719c67d6573Sopenharmony_ciimpl Byte { 1720c67d6573Sopenharmony_ci fn byte(b: u8) -> Self { 1721c67d6573Sopenharmony_ci Byte(b as u16) 1722c67d6573Sopenharmony_ci } 1723c67d6573Sopenharmony_ci fn eof() -> Self { 1724c67d6573Sopenharmony_ci Byte(256) 1725c67d6573Sopenharmony_ci } 1726c67d6573Sopenharmony_ci fn is_eof(&self) -> bool { 1727c67d6573Sopenharmony_ci self.0 == 256 1728c67d6573Sopenharmony_ci } 1729c67d6573Sopenharmony_ci 1730c67d6573Sopenharmony_ci fn is_ascii_word(&self) -> bool { 1731c67d6573Sopenharmony_ci let b = match self.as_byte() { 1732c67d6573Sopenharmony_ci None => return false, 1733c67d6573Sopenharmony_ci Some(b) => b, 1734c67d6573Sopenharmony_ci }; 1735c67d6573Sopenharmony_ci match b { 1736c67d6573Sopenharmony_ci b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true, 1737c67d6573Sopenharmony_ci _ => false, 1738c67d6573Sopenharmony_ci } 1739c67d6573Sopenharmony_ci } 1740c67d6573Sopenharmony_ci 1741c67d6573Sopenharmony_ci fn as_byte(&self) -> Option<u8> { 1742c67d6573Sopenharmony_ci if self.is_eof() { 1743c67d6573Sopenharmony_ci None 1744c67d6573Sopenharmony_ci } else { 1745c67d6573Sopenharmony_ci Some(self.0 as u8) 1746c67d6573Sopenharmony_ci } 1747c67d6573Sopenharmony_ci } 1748c67d6573Sopenharmony_ci} 1749c67d6573Sopenharmony_ci 1750c67d6573Sopenharmony_ciimpl fmt::Debug for State { 1751c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1752c67d6573Sopenharmony_ci let ips: Vec<usize> = self.inst_ptrs().collect(); 1753c67d6573Sopenharmony_ci f.debug_struct("State") 1754c67d6573Sopenharmony_ci .field("flags", &self.flags()) 1755c67d6573Sopenharmony_ci .field("insts", &ips) 1756c67d6573Sopenharmony_ci .finish() 1757c67d6573Sopenharmony_ci } 1758c67d6573Sopenharmony_ci} 1759c67d6573Sopenharmony_ci 1760c67d6573Sopenharmony_ciimpl fmt::Debug for Transitions { 1761c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1762c67d6573Sopenharmony_ci let mut fmtd = f.debug_map(); 1763c67d6573Sopenharmony_ci for si in 0..self.num_states() { 1764c67d6573Sopenharmony_ci let s = si * self.num_byte_classes; 1765c67d6573Sopenharmony_ci let e = s + self.num_byte_classes; 1766c67d6573Sopenharmony_ci fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e])); 1767c67d6573Sopenharmony_ci } 1768c67d6573Sopenharmony_ci fmtd.finish() 1769c67d6573Sopenharmony_ci } 1770c67d6573Sopenharmony_ci} 1771c67d6573Sopenharmony_ci 1772c67d6573Sopenharmony_cistruct TransitionsRow<'a>(&'a [StatePtr]); 1773c67d6573Sopenharmony_ci 1774c67d6573Sopenharmony_ciimpl<'a> fmt::Debug for TransitionsRow<'a> { 1775c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1776c67d6573Sopenharmony_ci let mut fmtd = f.debug_map(); 1777c67d6573Sopenharmony_ci for (b, si) in self.0.iter().enumerate() { 1778c67d6573Sopenharmony_ci match *si { 1779c67d6573Sopenharmony_ci STATE_UNKNOWN => {} 1780c67d6573Sopenharmony_ci STATE_DEAD => { 1781c67d6573Sopenharmony_ci fmtd.entry(&vb(b as usize), &"DEAD"); 1782c67d6573Sopenharmony_ci } 1783c67d6573Sopenharmony_ci si => { 1784c67d6573Sopenharmony_ci fmtd.entry(&vb(b as usize), &si.to_string()); 1785c67d6573Sopenharmony_ci } 1786c67d6573Sopenharmony_ci } 1787c67d6573Sopenharmony_ci } 1788c67d6573Sopenharmony_ci fmtd.finish() 1789c67d6573Sopenharmony_ci } 1790c67d6573Sopenharmony_ci} 1791c67d6573Sopenharmony_ci 1792c67d6573Sopenharmony_ciimpl fmt::Debug for StateFlags { 1793c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1794c67d6573Sopenharmony_ci f.debug_struct("StateFlags") 1795c67d6573Sopenharmony_ci .field("is_match", &self.is_match()) 1796c67d6573Sopenharmony_ci .field("is_word", &self.is_word()) 1797c67d6573Sopenharmony_ci .field("has_empty", &self.has_empty()) 1798c67d6573Sopenharmony_ci .finish() 1799c67d6573Sopenharmony_ci } 1800c67d6573Sopenharmony_ci} 1801c67d6573Sopenharmony_ci 1802c67d6573Sopenharmony_ci/// Helper function for formatting a byte as a nice-to-read escaped string. 1803c67d6573Sopenharmony_cifn vb(b: usize) -> String { 1804c67d6573Sopenharmony_ci use std::ascii::escape_default; 1805c67d6573Sopenharmony_ci 1806c67d6573Sopenharmony_ci if b > ::std::u8::MAX as usize { 1807c67d6573Sopenharmony_ci "EOF".to_owned() 1808c67d6573Sopenharmony_ci } else { 1809c67d6573Sopenharmony_ci let escaped = escape_default(b as u8).collect::<Vec<u8>>(); 1810c67d6573Sopenharmony_ci String::from_utf8_lossy(&escaped).into_owned() 1811c67d6573Sopenharmony_ci } 1812c67d6573Sopenharmony_ci} 1813c67d6573Sopenharmony_ci 1814c67d6573Sopenharmony_cifn usize_to_u32(n: usize) -> u32 { 1815c67d6573Sopenharmony_ci if (n as u64) > (::std::u32::MAX as u64) { 1816c67d6573Sopenharmony_ci panic!("BUG: {} is too big to fit into u32", n) 1817c67d6573Sopenharmony_ci } 1818c67d6573Sopenharmony_ci n as u32 1819c67d6573Sopenharmony_ci} 1820c67d6573Sopenharmony_ci 1821c67d6573Sopenharmony_ci#[allow(dead_code)] // useful for debugging 1822c67d6573Sopenharmony_cifn show_state_ptr(si: StatePtr) -> String { 1823c67d6573Sopenharmony_ci let mut s = format!("{:?}", si & STATE_MAX); 1824c67d6573Sopenharmony_ci if si == STATE_UNKNOWN { 1825c67d6573Sopenharmony_ci s = format!("{} (unknown)", s); 1826c67d6573Sopenharmony_ci } 1827c67d6573Sopenharmony_ci if si == STATE_DEAD { 1828c67d6573Sopenharmony_ci s = format!("{} (dead)", s); 1829c67d6573Sopenharmony_ci } 1830c67d6573Sopenharmony_ci if si == STATE_QUIT { 1831c67d6573Sopenharmony_ci s = format!("{} (quit)", s); 1832c67d6573Sopenharmony_ci } 1833c67d6573Sopenharmony_ci if si & STATE_START > 0 { 1834c67d6573Sopenharmony_ci s = format!("{} (start)", s); 1835c67d6573Sopenharmony_ci } 1836c67d6573Sopenharmony_ci if si & STATE_MATCH > 0 { 1837c67d6573Sopenharmony_ci s = format!("{} (match)", s); 1838c67d6573Sopenharmony_ci } 1839c67d6573Sopenharmony_ci s 1840c67d6573Sopenharmony_ci} 1841c67d6573Sopenharmony_ci 1842c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints 1843c67d6573Sopenharmony_cifn write_vari32(data: &mut Vec<u8>, n: i32) { 1844c67d6573Sopenharmony_ci let mut un = (n as u32) << 1; 1845c67d6573Sopenharmony_ci if n < 0 { 1846c67d6573Sopenharmony_ci un = !un; 1847c67d6573Sopenharmony_ci } 1848c67d6573Sopenharmony_ci write_varu32(data, un) 1849c67d6573Sopenharmony_ci} 1850c67d6573Sopenharmony_ci 1851c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints 1852c67d6573Sopenharmony_cifn read_vari32(data: &[u8]) -> (i32, usize) { 1853c67d6573Sopenharmony_ci let (un, i) = read_varu32(data); 1854c67d6573Sopenharmony_ci let mut n = (un >> 1) as i32; 1855c67d6573Sopenharmony_ci if un & 1 != 0 { 1856c67d6573Sopenharmony_ci n = !n; 1857c67d6573Sopenharmony_ci } 1858c67d6573Sopenharmony_ci (n, i) 1859c67d6573Sopenharmony_ci} 1860c67d6573Sopenharmony_ci 1861c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints 1862c67d6573Sopenharmony_cifn write_varu32(data: &mut Vec<u8>, mut n: u32) { 1863c67d6573Sopenharmony_ci while n >= 0b1000_0000 { 1864c67d6573Sopenharmony_ci data.push((n as u8) | 0b1000_0000); 1865c67d6573Sopenharmony_ci n >>= 7; 1866c67d6573Sopenharmony_ci } 1867c67d6573Sopenharmony_ci data.push(n as u8); 1868c67d6573Sopenharmony_ci} 1869c67d6573Sopenharmony_ci 1870c67d6573Sopenharmony_ci/// https://developers.google.com/protocol-buffers/docs/encoding#varints 1871c67d6573Sopenharmony_cifn read_varu32(data: &[u8]) -> (u32, usize) { 1872c67d6573Sopenharmony_ci let mut n: u32 = 0; 1873c67d6573Sopenharmony_ci let mut shift: u32 = 0; 1874c67d6573Sopenharmony_ci for (i, &b) in data.iter().enumerate() { 1875c67d6573Sopenharmony_ci if b < 0b1000_0000 { 1876c67d6573Sopenharmony_ci return (n | ((b as u32) << shift), i + 1); 1877c67d6573Sopenharmony_ci } 1878c67d6573Sopenharmony_ci n |= ((b as u32) & 0b0111_1111) << shift; 1879c67d6573Sopenharmony_ci shift += 7; 1880c67d6573Sopenharmony_ci } 1881c67d6573Sopenharmony_ci (0, 0) 1882c67d6573Sopenharmony_ci} 1883c67d6573Sopenharmony_ci 1884c67d6573Sopenharmony_ci#[cfg(test)] 1885c67d6573Sopenharmony_cimod tests { 1886c67d6573Sopenharmony_ci 1887c67d6573Sopenharmony_ci use super::{ 1888c67d6573Sopenharmony_ci push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, 1889c67d6573Sopenharmony_ci State, StateFlags, 1890c67d6573Sopenharmony_ci }; 1891c67d6573Sopenharmony_ci use quickcheck::{quickcheck, Gen, QuickCheck}; 1892c67d6573Sopenharmony_ci use std::sync::Arc; 1893c67d6573Sopenharmony_ci 1894c67d6573Sopenharmony_ci #[test] 1895c67d6573Sopenharmony_ci fn prop_state_encode_decode() { 1896c67d6573Sopenharmony_ci fn p(mut ips: Vec<u32>, flags: u8) -> bool { 1897c67d6573Sopenharmony_ci // It looks like our encoding scheme can't handle instruction 1898c67d6573Sopenharmony_ci // pointers at or above 2**31. We should fix that, but it seems 1899c67d6573Sopenharmony_ci // unlikely to occur in real code due to the amount of memory 1900c67d6573Sopenharmony_ci // required for such a state machine. So for now, we just clamp 1901c67d6573Sopenharmony_ci // our test data. 1902c67d6573Sopenharmony_ci for ip in &mut ips { 1903c67d6573Sopenharmony_ci if *ip >= 1 << 31 { 1904c67d6573Sopenharmony_ci *ip = (1 << 31) - 1; 1905c67d6573Sopenharmony_ci } 1906c67d6573Sopenharmony_ci } 1907c67d6573Sopenharmony_ci let mut data = vec![flags]; 1908c67d6573Sopenharmony_ci let mut prev = 0; 1909c67d6573Sopenharmony_ci for &ip in ips.iter() { 1910c67d6573Sopenharmony_ci push_inst_ptr(&mut data, &mut prev, ip); 1911c67d6573Sopenharmony_ci } 1912c67d6573Sopenharmony_ci let state = State { data: Arc::from(&data[..]) }; 1913c67d6573Sopenharmony_ci 1914c67d6573Sopenharmony_ci let expected: Vec<usize> = 1915c67d6573Sopenharmony_ci ips.into_iter().map(|ip| ip as usize).collect(); 1916c67d6573Sopenharmony_ci let got: Vec<usize> = state.inst_ptrs().collect(); 1917c67d6573Sopenharmony_ci expected == got && state.flags() == StateFlags(flags) 1918c67d6573Sopenharmony_ci } 1919c67d6573Sopenharmony_ci QuickCheck::new() 1920c67d6573Sopenharmony_ci .gen(Gen::new(10_000)) 1921c67d6573Sopenharmony_ci .quickcheck(p as fn(Vec<u32>, u8) -> bool); 1922c67d6573Sopenharmony_ci } 1923c67d6573Sopenharmony_ci 1924c67d6573Sopenharmony_ci #[test] 1925c67d6573Sopenharmony_ci fn prop_read_write_u32() { 1926c67d6573Sopenharmony_ci fn p(n: u32) -> bool { 1927c67d6573Sopenharmony_ci let mut buf = vec![]; 1928c67d6573Sopenharmony_ci write_varu32(&mut buf, n); 1929c67d6573Sopenharmony_ci let (got, nread) = read_varu32(&buf); 1930c67d6573Sopenharmony_ci nread == buf.len() && got == n 1931c67d6573Sopenharmony_ci } 1932c67d6573Sopenharmony_ci quickcheck(p as fn(u32) -> bool); 1933c67d6573Sopenharmony_ci } 1934c67d6573Sopenharmony_ci 1935c67d6573Sopenharmony_ci #[test] 1936c67d6573Sopenharmony_ci fn prop_read_write_i32() { 1937c67d6573Sopenharmony_ci fn p(n: i32) -> bool { 1938c67d6573Sopenharmony_ci let mut buf = vec![]; 1939c67d6573Sopenharmony_ci write_vari32(&mut buf, n); 1940c67d6573Sopenharmony_ci let (got, nread) = read_vari32(&buf); 1941c67d6573Sopenharmony_ci nread == buf.len() && got == n 1942c67d6573Sopenharmony_ci } 1943c67d6573Sopenharmony_ci quickcheck(p as fn(i32) -> bool); 1944c67d6573Sopenharmony_ci } 1945c67d6573Sopenharmony_ci} 1946