1c67d6573Sopenharmony_ci// This module implements the Pike VM. That is, it guarantees linear time 2c67d6573Sopenharmony_ci// search of a regex on any text with memory use proportional to the size of 3c67d6573Sopenharmony_ci// the regex. 4c67d6573Sopenharmony_ci// 5c67d6573Sopenharmony_ci// It is equal in power to the backtracking engine in this crate, except the 6c67d6573Sopenharmony_ci// backtracking engine is typically faster on small regexes/texts at the 7c67d6573Sopenharmony_ci// expense of a bigger memory footprint. 8c67d6573Sopenharmony_ci// 9c67d6573Sopenharmony_ci// It can do more than the DFA can (specifically, record capture locations 10c67d6573Sopenharmony_ci// and execute Unicode word boundary assertions), but at a slower speed. 11c67d6573Sopenharmony_ci// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding 12c67d6573Sopenharmony_ci// epsilon transitions. That is, the Pike VM engine can be in multiple states 13c67d6573Sopenharmony_ci// at once where as the DFA is only ever in one state at a time. 14c67d6573Sopenharmony_ci// 15c67d6573Sopenharmony_ci// Therefore, the Pike VM is generally treated as the fallback when the other 16c67d6573Sopenharmony_ci// matching engines either aren't feasible to run or are insufficient. 17c67d6573Sopenharmony_ci 18c67d6573Sopenharmony_ciuse std::mem; 19c67d6573Sopenharmony_ci 20c67d6573Sopenharmony_ciuse crate::exec::ProgramCache; 21c67d6573Sopenharmony_ciuse crate::input::{Input, InputAt}; 22c67d6573Sopenharmony_ciuse crate::prog::{InstPtr, Program}; 23c67d6573Sopenharmony_ciuse crate::re_trait::Slot; 24c67d6573Sopenharmony_ciuse crate::sparse::SparseSet; 25c67d6573Sopenharmony_ci 26c67d6573Sopenharmony_ci/// An NFA simulation matching engine. 27c67d6573Sopenharmony_ci#[derive(Debug)] 28c67d6573Sopenharmony_cipub struct Fsm<'r, I> { 29c67d6573Sopenharmony_ci /// The sequence of opcodes (among other things) that is actually executed. 30c67d6573Sopenharmony_ci /// 31c67d6573Sopenharmony_ci /// The program may be byte oriented or Unicode codepoint oriented. 32c67d6573Sopenharmony_ci prog: &'r Program, 33c67d6573Sopenharmony_ci /// An explicit stack used for following epsilon transitions. (This is 34c67d6573Sopenharmony_ci /// borrowed from the cache.) 35c67d6573Sopenharmony_ci stack: &'r mut Vec<FollowEpsilon>, 36c67d6573Sopenharmony_ci /// The input to search. 37c67d6573Sopenharmony_ci input: I, 38c67d6573Sopenharmony_ci} 39c67d6573Sopenharmony_ci 40c67d6573Sopenharmony_ci/// A cached allocation that can be reused on each execution. 41c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 42c67d6573Sopenharmony_cipub struct Cache { 43c67d6573Sopenharmony_ci /// A pair of ordered sets for tracking NFA states. 44c67d6573Sopenharmony_ci clist: Threads, 45c67d6573Sopenharmony_ci nlist: Threads, 46c67d6573Sopenharmony_ci /// An explicit stack used for following epsilon transitions. 47c67d6573Sopenharmony_ci stack: Vec<FollowEpsilon>, 48c67d6573Sopenharmony_ci} 49c67d6573Sopenharmony_ci 50c67d6573Sopenharmony_ci/// An ordered set of NFA states and their captures. 51c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 52c67d6573Sopenharmony_cistruct Threads { 53c67d6573Sopenharmony_ci /// An ordered set of opcodes (each opcode is an NFA state). 54c67d6573Sopenharmony_ci set: SparseSet, 55c67d6573Sopenharmony_ci /// Captures for every NFA state. 56c67d6573Sopenharmony_ci /// 57c67d6573Sopenharmony_ci /// It is stored in row-major order, where the columns are the capture 58c67d6573Sopenharmony_ci /// slots and the rows are the states. 59c67d6573Sopenharmony_ci caps: Vec<Slot>, 60c67d6573Sopenharmony_ci /// The number of capture slots stored per thread. (Every capture has 61c67d6573Sopenharmony_ci /// two slots.) 62c67d6573Sopenharmony_ci slots_per_thread: usize, 63c67d6573Sopenharmony_ci} 64c67d6573Sopenharmony_ci 65c67d6573Sopenharmony_ci/// A representation of an explicit stack frame when following epsilon 66c67d6573Sopenharmony_ci/// transitions. This is used to avoid recursion. 67c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 68c67d6573Sopenharmony_cienum FollowEpsilon { 69c67d6573Sopenharmony_ci /// Follow transitions at the given instruction pointer. 70c67d6573Sopenharmony_ci IP(InstPtr), 71c67d6573Sopenharmony_ci /// Restore the capture slot with the given position in the input. 72c67d6573Sopenharmony_ci Capture { slot: usize, pos: Slot }, 73c67d6573Sopenharmony_ci} 74c67d6573Sopenharmony_ci 75c67d6573Sopenharmony_ciimpl Cache { 76c67d6573Sopenharmony_ci /// Create a new allocation used by the NFA machine to record execution 77c67d6573Sopenharmony_ci /// and captures. 78c67d6573Sopenharmony_ci pub fn new(_prog: &Program) -> Self { 79c67d6573Sopenharmony_ci Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } 80c67d6573Sopenharmony_ci } 81c67d6573Sopenharmony_ci} 82c67d6573Sopenharmony_ci 83c67d6573Sopenharmony_ciimpl<'r, I: Input> Fsm<'r, I> { 84c67d6573Sopenharmony_ci /// Execute the NFA matching engine. 85c67d6573Sopenharmony_ci /// 86c67d6573Sopenharmony_ci /// If there's a match, `exec` returns `true` and populates the given 87c67d6573Sopenharmony_ci /// captures accordingly. 88c67d6573Sopenharmony_ci pub fn exec( 89c67d6573Sopenharmony_ci prog: &'r Program, 90c67d6573Sopenharmony_ci cache: &ProgramCache, 91c67d6573Sopenharmony_ci matches: &mut [bool], 92c67d6573Sopenharmony_ci slots: &mut [Slot], 93c67d6573Sopenharmony_ci quit_after_match: bool, 94c67d6573Sopenharmony_ci input: I, 95c67d6573Sopenharmony_ci start: usize, 96c67d6573Sopenharmony_ci end: usize, 97c67d6573Sopenharmony_ci ) -> bool { 98c67d6573Sopenharmony_ci let mut cache = cache.borrow_mut(); 99c67d6573Sopenharmony_ci let cache = &mut cache.pikevm; 100c67d6573Sopenharmony_ci cache.clist.resize(prog.len(), prog.captures.len()); 101c67d6573Sopenharmony_ci cache.nlist.resize(prog.len(), prog.captures.len()); 102c67d6573Sopenharmony_ci let at = input.at(start); 103c67d6573Sopenharmony_ci Fsm { prog, stack: &mut cache.stack, input }.exec_( 104c67d6573Sopenharmony_ci &mut cache.clist, 105c67d6573Sopenharmony_ci &mut cache.nlist, 106c67d6573Sopenharmony_ci matches, 107c67d6573Sopenharmony_ci slots, 108c67d6573Sopenharmony_ci quit_after_match, 109c67d6573Sopenharmony_ci at, 110c67d6573Sopenharmony_ci end, 111c67d6573Sopenharmony_ci ) 112c67d6573Sopenharmony_ci } 113c67d6573Sopenharmony_ci 114c67d6573Sopenharmony_ci fn exec_( 115c67d6573Sopenharmony_ci &mut self, 116c67d6573Sopenharmony_ci mut clist: &mut Threads, 117c67d6573Sopenharmony_ci mut nlist: &mut Threads, 118c67d6573Sopenharmony_ci matches: &mut [bool], 119c67d6573Sopenharmony_ci slots: &mut [Slot], 120c67d6573Sopenharmony_ci quit_after_match: bool, 121c67d6573Sopenharmony_ci mut at: InputAt, 122c67d6573Sopenharmony_ci end: usize, 123c67d6573Sopenharmony_ci ) -> bool { 124c67d6573Sopenharmony_ci let mut matched = false; 125c67d6573Sopenharmony_ci let mut all_matched = false; 126c67d6573Sopenharmony_ci clist.set.clear(); 127c67d6573Sopenharmony_ci nlist.set.clear(); 128c67d6573Sopenharmony_ci 'LOOP: loop { 129c67d6573Sopenharmony_ci if clist.set.is_empty() { 130c67d6573Sopenharmony_ci // Three ways to bail out when our current set of threads is 131c67d6573Sopenharmony_ci // empty. 132c67d6573Sopenharmony_ci // 133c67d6573Sopenharmony_ci // 1. We have a match---so we're done exploring any possible 134c67d6573Sopenharmony_ci // alternatives. Time to quit. (We can't do this if we're 135c67d6573Sopenharmony_ci // looking for matches for multiple regexes, unless we know 136c67d6573Sopenharmony_ci // they all matched.) 137c67d6573Sopenharmony_ci // 138c67d6573Sopenharmony_ci // 2. If the expression starts with a '^' we can terminate as 139c67d6573Sopenharmony_ci // soon as the last thread dies. 140c67d6573Sopenharmony_ci if (matched && matches.len() <= 1) 141c67d6573Sopenharmony_ci || all_matched 142c67d6573Sopenharmony_ci || (!at.is_start() && self.prog.is_anchored_start) 143c67d6573Sopenharmony_ci { 144c67d6573Sopenharmony_ci break; 145c67d6573Sopenharmony_ci } 146c67d6573Sopenharmony_ci 147c67d6573Sopenharmony_ci // 3. If there's a literal prefix for the program, try to 148c67d6573Sopenharmony_ci // jump ahead quickly. If it can't be found, then we can 149c67d6573Sopenharmony_ci // bail out early. 150c67d6573Sopenharmony_ci if !self.prog.prefixes.is_empty() { 151c67d6573Sopenharmony_ci at = match self.input.prefix_at(&self.prog.prefixes, at) { 152c67d6573Sopenharmony_ci None => break, 153c67d6573Sopenharmony_ci Some(at) => at, 154c67d6573Sopenharmony_ci }; 155c67d6573Sopenharmony_ci } 156c67d6573Sopenharmony_ci } 157c67d6573Sopenharmony_ci 158c67d6573Sopenharmony_ci // This simulates a preceding '.*?' for every regex by adding 159c67d6573Sopenharmony_ci // a state starting at the current position in the input for the 160c67d6573Sopenharmony_ci // beginning of the program only if we don't already have a match. 161c67d6573Sopenharmony_ci if clist.set.is_empty() 162c67d6573Sopenharmony_ci || (!self.prog.is_anchored_start && !all_matched) 163c67d6573Sopenharmony_ci { 164c67d6573Sopenharmony_ci self.add(&mut clist, slots, 0, at); 165c67d6573Sopenharmony_ci } 166c67d6573Sopenharmony_ci // The previous call to "add" actually inspects the position just 167c67d6573Sopenharmony_ci // before the current character. For stepping through the machine, 168c67d6573Sopenharmony_ci // we can to look at the current character, so we advance the 169c67d6573Sopenharmony_ci // input. 170c67d6573Sopenharmony_ci let at_next = self.input.at(at.next_pos()); 171c67d6573Sopenharmony_ci for i in 0..clist.set.len() { 172c67d6573Sopenharmony_ci let ip = clist.set[i]; 173c67d6573Sopenharmony_ci if self.step( 174c67d6573Sopenharmony_ci &mut nlist, 175c67d6573Sopenharmony_ci matches, 176c67d6573Sopenharmony_ci slots, 177c67d6573Sopenharmony_ci clist.caps(ip), 178c67d6573Sopenharmony_ci ip, 179c67d6573Sopenharmony_ci at, 180c67d6573Sopenharmony_ci at_next, 181c67d6573Sopenharmony_ci ) { 182c67d6573Sopenharmony_ci matched = true; 183c67d6573Sopenharmony_ci all_matched = all_matched || matches.iter().all(|&b| b); 184c67d6573Sopenharmony_ci if quit_after_match { 185c67d6573Sopenharmony_ci // If we only care if a match occurs (not its 186c67d6573Sopenharmony_ci // position), then we can quit right now. 187c67d6573Sopenharmony_ci break 'LOOP; 188c67d6573Sopenharmony_ci } 189c67d6573Sopenharmony_ci if self.prog.matches.len() == 1 { 190c67d6573Sopenharmony_ci // We don't need to check the rest of the threads 191c67d6573Sopenharmony_ci // in this set because we've matched something 192c67d6573Sopenharmony_ci // ("leftmost-first"). However, we still need to check 193c67d6573Sopenharmony_ci // threads in the next set to support things like 194c67d6573Sopenharmony_ci // greedy matching. 195c67d6573Sopenharmony_ci // 196c67d6573Sopenharmony_ci // This is only true on normal regexes. For regex sets, 197c67d6573Sopenharmony_ci // we need to mush on to observe other matches. 198c67d6573Sopenharmony_ci break; 199c67d6573Sopenharmony_ci } 200c67d6573Sopenharmony_ci } 201c67d6573Sopenharmony_ci } 202c67d6573Sopenharmony_ci if at.pos() >= end { 203c67d6573Sopenharmony_ci break; 204c67d6573Sopenharmony_ci } 205c67d6573Sopenharmony_ci at = at_next; 206c67d6573Sopenharmony_ci mem::swap(clist, nlist); 207c67d6573Sopenharmony_ci nlist.set.clear(); 208c67d6573Sopenharmony_ci } 209c67d6573Sopenharmony_ci matched 210c67d6573Sopenharmony_ci } 211c67d6573Sopenharmony_ci 212c67d6573Sopenharmony_ci /// Step through the input, one token (byte or codepoint) at a time. 213c67d6573Sopenharmony_ci /// 214c67d6573Sopenharmony_ci /// nlist is the set of states that will be processed on the next token 215c67d6573Sopenharmony_ci /// in the input. 216c67d6573Sopenharmony_ci /// 217c67d6573Sopenharmony_ci /// caps is the set of captures passed by the caller of the NFA. They are 218c67d6573Sopenharmony_ci /// written to only when a match state is visited. 219c67d6573Sopenharmony_ci /// 220c67d6573Sopenharmony_ci /// thread_caps is the set of captures set for the current NFA state, ip. 221c67d6573Sopenharmony_ci /// 222c67d6573Sopenharmony_ci /// at and at_next are the current and next positions in the input. at or 223c67d6573Sopenharmony_ci /// at_next may be EOF. 224c67d6573Sopenharmony_ci fn step( 225c67d6573Sopenharmony_ci &mut self, 226c67d6573Sopenharmony_ci nlist: &mut Threads, 227c67d6573Sopenharmony_ci matches: &mut [bool], 228c67d6573Sopenharmony_ci slots: &mut [Slot], 229c67d6573Sopenharmony_ci thread_caps: &mut [Option<usize>], 230c67d6573Sopenharmony_ci ip: usize, 231c67d6573Sopenharmony_ci at: InputAt, 232c67d6573Sopenharmony_ci at_next: InputAt, 233c67d6573Sopenharmony_ci ) -> bool { 234c67d6573Sopenharmony_ci use crate::prog::Inst::*; 235c67d6573Sopenharmony_ci match self.prog[ip] { 236c67d6573Sopenharmony_ci Match(match_slot) => { 237c67d6573Sopenharmony_ci if match_slot < matches.len() { 238c67d6573Sopenharmony_ci matches[match_slot] = true; 239c67d6573Sopenharmony_ci } 240c67d6573Sopenharmony_ci for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { 241c67d6573Sopenharmony_ci *slot = *val; 242c67d6573Sopenharmony_ci } 243c67d6573Sopenharmony_ci true 244c67d6573Sopenharmony_ci } 245c67d6573Sopenharmony_ci Char(ref inst) => { 246c67d6573Sopenharmony_ci if inst.c == at.char() { 247c67d6573Sopenharmony_ci self.add(nlist, thread_caps, inst.goto, at_next); 248c67d6573Sopenharmony_ci } 249c67d6573Sopenharmony_ci false 250c67d6573Sopenharmony_ci } 251c67d6573Sopenharmony_ci Ranges(ref inst) => { 252c67d6573Sopenharmony_ci if inst.matches(at.char()) { 253c67d6573Sopenharmony_ci self.add(nlist, thread_caps, inst.goto, at_next); 254c67d6573Sopenharmony_ci } 255c67d6573Sopenharmony_ci false 256c67d6573Sopenharmony_ci } 257c67d6573Sopenharmony_ci Bytes(ref inst) => { 258c67d6573Sopenharmony_ci if let Some(b) = at.byte() { 259c67d6573Sopenharmony_ci if inst.matches(b) { 260c67d6573Sopenharmony_ci self.add(nlist, thread_caps, inst.goto, at_next); 261c67d6573Sopenharmony_ci } 262c67d6573Sopenharmony_ci } 263c67d6573Sopenharmony_ci false 264c67d6573Sopenharmony_ci } 265c67d6573Sopenharmony_ci EmptyLook(_) | Save(_) | Split(_) => false, 266c67d6573Sopenharmony_ci } 267c67d6573Sopenharmony_ci } 268c67d6573Sopenharmony_ci 269c67d6573Sopenharmony_ci /// Follows epsilon transitions and adds them for processing to nlist, 270c67d6573Sopenharmony_ci /// starting at and including ip. 271c67d6573Sopenharmony_ci fn add( 272c67d6573Sopenharmony_ci &mut self, 273c67d6573Sopenharmony_ci nlist: &mut Threads, 274c67d6573Sopenharmony_ci thread_caps: &mut [Option<usize>], 275c67d6573Sopenharmony_ci ip: usize, 276c67d6573Sopenharmony_ci at: InputAt, 277c67d6573Sopenharmony_ci ) { 278c67d6573Sopenharmony_ci self.stack.push(FollowEpsilon::IP(ip)); 279c67d6573Sopenharmony_ci while let Some(frame) = self.stack.pop() { 280c67d6573Sopenharmony_ci match frame { 281c67d6573Sopenharmony_ci FollowEpsilon::IP(ip) => { 282c67d6573Sopenharmony_ci self.add_step(nlist, thread_caps, ip, at); 283c67d6573Sopenharmony_ci } 284c67d6573Sopenharmony_ci FollowEpsilon::Capture { slot, pos } => { 285c67d6573Sopenharmony_ci thread_caps[slot] = pos; 286c67d6573Sopenharmony_ci } 287c67d6573Sopenharmony_ci } 288c67d6573Sopenharmony_ci } 289c67d6573Sopenharmony_ci } 290c67d6573Sopenharmony_ci 291c67d6573Sopenharmony_ci /// A helper function for add that avoids excessive pushing to the stack. 292c67d6573Sopenharmony_ci fn add_step( 293c67d6573Sopenharmony_ci &mut self, 294c67d6573Sopenharmony_ci nlist: &mut Threads, 295c67d6573Sopenharmony_ci thread_caps: &mut [Option<usize>], 296c67d6573Sopenharmony_ci mut ip: usize, 297c67d6573Sopenharmony_ci at: InputAt, 298c67d6573Sopenharmony_ci ) { 299c67d6573Sopenharmony_ci // Instead of pushing and popping to the stack, we mutate ip as we 300c67d6573Sopenharmony_ci // traverse the set of states. We only push to the stack when we 301c67d6573Sopenharmony_ci // absolutely need recursion (restoring captures or following a 302c67d6573Sopenharmony_ci // branch). 303c67d6573Sopenharmony_ci use crate::prog::Inst::*; 304c67d6573Sopenharmony_ci loop { 305c67d6573Sopenharmony_ci // Don't visit states we've already added. 306c67d6573Sopenharmony_ci if nlist.set.contains(ip) { 307c67d6573Sopenharmony_ci return; 308c67d6573Sopenharmony_ci } 309c67d6573Sopenharmony_ci nlist.set.insert(ip); 310c67d6573Sopenharmony_ci match self.prog[ip] { 311c67d6573Sopenharmony_ci EmptyLook(ref inst) => { 312c67d6573Sopenharmony_ci if self.input.is_empty_match(at, inst) { 313c67d6573Sopenharmony_ci ip = inst.goto; 314c67d6573Sopenharmony_ci } 315c67d6573Sopenharmony_ci } 316c67d6573Sopenharmony_ci Save(ref inst) => { 317c67d6573Sopenharmony_ci if inst.slot < thread_caps.len() { 318c67d6573Sopenharmony_ci self.stack.push(FollowEpsilon::Capture { 319c67d6573Sopenharmony_ci slot: inst.slot, 320c67d6573Sopenharmony_ci pos: thread_caps[inst.slot], 321c67d6573Sopenharmony_ci }); 322c67d6573Sopenharmony_ci thread_caps[inst.slot] = Some(at.pos()); 323c67d6573Sopenharmony_ci } 324c67d6573Sopenharmony_ci ip = inst.goto; 325c67d6573Sopenharmony_ci } 326c67d6573Sopenharmony_ci Split(ref inst) => { 327c67d6573Sopenharmony_ci self.stack.push(FollowEpsilon::IP(inst.goto2)); 328c67d6573Sopenharmony_ci ip = inst.goto1; 329c67d6573Sopenharmony_ci } 330c67d6573Sopenharmony_ci Match(_) | Char(_) | Ranges(_) | Bytes(_) => { 331c67d6573Sopenharmony_ci let t = &mut nlist.caps(ip); 332c67d6573Sopenharmony_ci for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { 333c67d6573Sopenharmony_ci *slot = *val; 334c67d6573Sopenharmony_ci } 335c67d6573Sopenharmony_ci return; 336c67d6573Sopenharmony_ci } 337c67d6573Sopenharmony_ci } 338c67d6573Sopenharmony_ci } 339c67d6573Sopenharmony_ci } 340c67d6573Sopenharmony_ci} 341c67d6573Sopenharmony_ci 342c67d6573Sopenharmony_ciimpl Threads { 343c67d6573Sopenharmony_ci fn new() -> Self { 344c67d6573Sopenharmony_ci Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } 345c67d6573Sopenharmony_ci } 346c67d6573Sopenharmony_ci 347c67d6573Sopenharmony_ci fn resize(&mut self, num_insts: usize, ncaps: usize) { 348c67d6573Sopenharmony_ci if num_insts == self.set.capacity() { 349c67d6573Sopenharmony_ci return; 350c67d6573Sopenharmony_ci } 351c67d6573Sopenharmony_ci self.slots_per_thread = ncaps * 2; 352c67d6573Sopenharmony_ci self.set = SparseSet::new(num_insts); 353c67d6573Sopenharmony_ci self.caps = vec![None; self.slots_per_thread * num_insts]; 354c67d6573Sopenharmony_ci } 355c67d6573Sopenharmony_ci 356c67d6573Sopenharmony_ci fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { 357c67d6573Sopenharmony_ci let i = pc * self.slots_per_thread; 358c67d6573Sopenharmony_ci &mut self.caps[i..i + self.slots_per_thread] 359c67d6573Sopenharmony_ci } 360c67d6573Sopenharmony_ci} 361