1c67d6573Sopenharmony_ci// This is the backtracking matching engine. It has the same exact capability 2c67d6573Sopenharmony_ci// as the full NFA simulation, except it is artificially restricted to small 3c67d6573Sopenharmony_ci// regexes on small inputs because of its memory requirements. 4c67d6573Sopenharmony_ci// 5c67d6573Sopenharmony_ci// In particular, this is a *bounded* backtracking engine. It retains worst 6c67d6573Sopenharmony_ci// case linear time by keeping track of the states that it has visited (using a 7c67d6573Sopenharmony_ci// bitmap). Namely, once a state is visited, it is never visited again. Since a 8c67d6573Sopenharmony_ci// state is keyed by `(instruction index, input index)`, we have that its time 9c67d6573Sopenharmony_ci// complexity is `O(mn)` (i.e., linear in the size of the search text). 10c67d6573Sopenharmony_ci// 11c67d6573Sopenharmony_ci// The backtracking engine can beat out the NFA simulation on small 12c67d6573Sopenharmony_ci// regexes/inputs because it doesn't have to keep track of multiple copies of 13c67d6573Sopenharmony_ci// the capture groups. In benchmarks, the backtracking engine is roughly twice 14c67d6573Sopenharmony_ci// as fast as the full NFA simulation. Note though that its performance doesn't 15c67d6573Sopenharmony_ci// scale, even if you're willing to live with the memory requirements. Namely, 16c67d6573Sopenharmony_ci// the bitset has to be zeroed on each execution, which becomes quite expensive 17c67d6573Sopenharmony_ci// on large bitsets. 18c67d6573Sopenharmony_ci 19c67d6573Sopenharmony_ciuse crate::exec::ProgramCache; 20c67d6573Sopenharmony_ciuse crate::input::{Input, InputAt}; 21c67d6573Sopenharmony_ciuse crate::prog::{InstPtr, Program}; 22c67d6573Sopenharmony_ciuse crate::re_trait::Slot; 23c67d6573Sopenharmony_ci 24c67d6573Sopenharmony_citype Bits = u32; 25c67d6573Sopenharmony_ci 26c67d6573Sopenharmony_ciconst BIT_SIZE: usize = 32; 27c67d6573Sopenharmony_ciconst MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB 28c67d6573Sopenharmony_ci 29c67d6573Sopenharmony_ci/// Returns true iff the given regex and input should be executed by this 30c67d6573Sopenharmony_ci/// engine with reasonable memory usage. 31c67d6573Sopenharmony_cipub fn should_exec(num_insts: usize, text_len: usize) -> bool { 32c67d6573Sopenharmony_ci // Total memory usage in bytes is determined by: 33c67d6573Sopenharmony_ci // 34c67d6573Sopenharmony_ci // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) 35c67d6573Sopenharmony_ci // 36c67d6573Sopenharmony_ci // The actual limit picked is pretty much a heuristic. 37c67d6573Sopenharmony_ci // See: https://github.com/rust-lang/regex/issues/215 38c67d6573Sopenharmony_ci let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; 39c67d6573Sopenharmony_ci size <= MAX_SIZE_BYTES 40c67d6573Sopenharmony_ci} 41c67d6573Sopenharmony_ci 42c67d6573Sopenharmony_ci/// A backtracking matching engine. 43c67d6573Sopenharmony_ci#[derive(Debug)] 44c67d6573Sopenharmony_cipub struct Bounded<'a, 'm, 'r, 's, I> { 45c67d6573Sopenharmony_ci prog: &'r Program, 46c67d6573Sopenharmony_ci input: I, 47c67d6573Sopenharmony_ci matches: &'m mut [bool], 48c67d6573Sopenharmony_ci slots: &'s mut [Slot], 49c67d6573Sopenharmony_ci m: &'a mut Cache, 50c67d6573Sopenharmony_ci} 51c67d6573Sopenharmony_ci 52c67d6573Sopenharmony_ci/// Shared cached state between multiple invocations of a backtracking engine 53c67d6573Sopenharmony_ci/// in the same thread. 54c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 55c67d6573Sopenharmony_cipub struct Cache { 56c67d6573Sopenharmony_ci jobs: Vec<Job>, 57c67d6573Sopenharmony_ci visited: Vec<Bits>, 58c67d6573Sopenharmony_ci} 59c67d6573Sopenharmony_ci 60c67d6573Sopenharmony_ciimpl Cache { 61c67d6573Sopenharmony_ci /// Create new empty cache for the backtracking engine. 62c67d6573Sopenharmony_ci pub fn new(_prog: &Program) -> Self { 63c67d6573Sopenharmony_ci Cache { jobs: vec![], visited: vec![] } 64c67d6573Sopenharmony_ci } 65c67d6573Sopenharmony_ci} 66c67d6573Sopenharmony_ci 67c67d6573Sopenharmony_ci/// A job is an explicit unit of stack space in the backtracking engine. 68c67d6573Sopenharmony_ci/// 69c67d6573Sopenharmony_ci/// The "normal" representation is a single state transition, which corresponds 70c67d6573Sopenharmony_ci/// to an NFA state and a character in the input. However, the backtracking 71c67d6573Sopenharmony_ci/// engine must keep track of old capture group values. We use the explicit 72c67d6573Sopenharmony_ci/// stack to do it. 73c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug)] 74c67d6573Sopenharmony_cienum Job { 75c67d6573Sopenharmony_ci Inst { ip: InstPtr, at: InputAt }, 76c67d6573Sopenharmony_ci SaveRestore { slot: usize, old_pos: Option<usize> }, 77c67d6573Sopenharmony_ci} 78c67d6573Sopenharmony_ci 79c67d6573Sopenharmony_ciimpl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { 80c67d6573Sopenharmony_ci /// Execute the backtracking matching engine. 81c67d6573Sopenharmony_ci /// 82c67d6573Sopenharmony_ci /// If there's a match, `exec` returns `true` and populates the given 83c67d6573Sopenharmony_ci /// captures accordingly. 84c67d6573Sopenharmony_ci pub fn exec( 85c67d6573Sopenharmony_ci prog: &'r Program, 86c67d6573Sopenharmony_ci cache: &ProgramCache, 87c67d6573Sopenharmony_ci matches: &'m mut [bool], 88c67d6573Sopenharmony_ci slots: &'s mut [Slot], 89c67d6573Sopenharmony_ci input: I, 90c67d6573Sopenharmony_ci start: usize, 91c67d6573Sopenharmony_ci end: usize, 92c67d6573Sopenharmony_ci ) -> bool { 93c67d6573Sopenharmony_ci let mut cache = cache.borrow_mut(); 94c67d6573Sopenharmony_ci let cache = &mut cache.backtrack; 95c67d6573Sopenharmony_ci let start = input.at(start); 96c67d6573Sopenharmony_ci let mut b = Bounded { prog, input, matches, slots, m: cache }; 97c67d6573Sopenharmony_ci b.exec_(start, end) 98c67d6573Sopenharmony_ci } 99c67d6573Sopenharmony_ci 100c67d6573Sopenharmony_ci /// Clears the cache such that the backtracking engine can be executed 101c67d6573Sopenharmony_ci /// on some input of fixed length. 102c67d6573Sopenharmony_ci fn clear(&mut self) { 103c67d6573Sopenharmony_ci // Reset the job memory so that we start fresh. 104c67d6573Sopenharmony_ci self.m.jobs.clear(); 105c67d6573Sopenharmony_ci 106c67d6573Sopenharmony_ci // Now we need to clear the bit state set. 107c67d6573Sopenharmony_ci // We do this by figuring out how much space we need to keep track 108c67d6573Sopenharmony_ci // of the states we've visited. 109c67d6573Sopenharmony_ci // Then we reset all existing allocated space to 0. 110c67d6573Sopenharmony_ci // Finally, we request more space if we need it. 111c67d6573Sopenharmony_ci // 112c67d6573Sopenharmony_ci // This is all a little circuitous, but doing this using unchecked 113c67d6573Sopenharmony_ci // operations doesn't seem to have a measurable impact on performance. 114c67d6573Sopenharmony_ci // (Probably because backtracking is limited to such small 115c67d6573Sopenharmony_ci // inputs/regexes in the first place.) 116c67d6573Sopenharmony_ci let visited_len = 117c67d6573Sopenharmony_ci (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) 118c67d6573Sopenharmony_ci / BIT_SIZE; 119c67d6573Sopenharmony_ci self.m.visited.truncate(visited_len); 120c67d6573Sopenharmony_ci for v in &mut self.m.visited { 121c67d6573Sopenharmony_ci *v = 0; 122c67d6573Sopenharmony_ci } 123c67d6573Sopenharmony_ci if visited_len > self.m.visited.len() { 124c67d6573Sopenharmony_ci let len = self.m.visited.len(); 125c67d6573Sopenharmony_ci self.m.visited.reserve_exact(visited_len - len); 126c67d6573Sopenharmony_ci for _ in 0..(visited_len - len) { 127c67d6573Sopenharmony_ci self.m.visited.push(0); 128c67d6573Sopenharmony_ci } 129c67d6573Sopenharmony_ci } 130c67d6573Sopenharmony_ci } 131c67d6573Sopenharmony_ci 132c67d6573Sopenharmony_ci /// Start backtracking at the given position in the input, but also look 133c67d6573Sopenharmony_ci /// for literal prefixes. 134c67d6573Sopenharmony_ci fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { 135c67d6573Sopenharmony_ci self.clear(); 136c67d6573Sopenharmony_ci // If this is an anchored regex at the beginning of the input, then 137c67d6573Sopenharmony_ci // we're either already done or we only need to try backtracking once. 138c67d6573Sopenharmony_ci if self.prog.is_anchored_start { 139c67d6573Sopenharmony_ci return if !at.is_start() { false } else { self.backtrack(at) }; 140c67d6573Sopenharmony_ci } 141c67d6573Sopenharmony_ci let mut matched = false; 142c67d6573Sopenharmony_ci loop { 143c67d6573Sopenharmony_ci if !self.prog.prefixes.is_empty() { 144c67d6573Sopenharmony_ci at = match self.input.prefix_at(&self.prog.prefixes, at) { 145c67d6573Sopenharmony_ci None => break, 146c67d6573Sopenharmony_ci Some(at) => at, 147c67d6573Sopenharmony_ci }; 148c67d6573Sopenharmony_ci } 149c67d6573Sopenharmony_ci matched = self.backtrack(at) || matched; 150c67d6573Sopenharmony_ci if matched && self.prog.matches.len() == 1 { 151c67d6573Sopenharmony_ci return true; 152c67d6573Sopenharmony_ci } 153c67d6573Sopenharmony_ci if at.pos() >= end { 154c67d6573Sopenharmony_ci break; 155c67d6573Sopenharmony_ci } 156c67d6573Sopenharmony_ci at = self.input.at(at.next_pos()); 157c67d6573Sopenharmony_ci } 158c67d6573Sopenharmony_ci matched 159c67d6573Sopenharmony_ci } 160c67d6573Sopenharmony_ci 161c67d6573Sopenharmony_ci /// The main backtracking loop starting at the given input position. 162c67d6573Sopenharmony_ci fn backtrack(&mut self, start: InputAt) -> bool { 163c67d6573Sopenharmony_ci // N.B. We use an explicit stack to avoid recursion. 164c67d6573Sopenharmony_ci // To avoid excessive pushing and popping, most transitions are handled 165c67d6573Sopenharmony_ci // in the `step` helper function, which only pushes to the stack when 166c67d6573Sopenharmony_ci // there's a capture or a branch. 167c67d6573Sopenharmony_ci let mut matched = false; 168c67d6573Sopenharmony_ci self.m.jobs.push(Job::Inst { ip: 0, at: start }); 169c67d6573Sopenharmony_ci while let Some(job) = self.m.jobs.pop() { 170c67d6573Sopenharmony_ci match job { 171c67d6573Sopenharmony_ci Job::Inst { ip, at } => { 172c67d6573Sopenharmony_ci if self.step(ip, at) { 173c67d6573Sopenharmony_ci // Only quit if we're matching one regex. 174c67d6573Sopenharmony_ci // If we're matching a regex set, then mush on and 175c67d6573Sopenharmony_ci // try to find other matches (if we want them). 176c67d6573Sopenharmony_ci if self.prog.matches.len() == 1 { 177c67d6573Sopenharmony_ci return true; 178c67d6573Sopenharmony_ci } 179c67d6573Sopenharmony_ci matched = true; 180c67d6573Sopenharmony_ci } 181c67d6573Sopenharmony_ci } 182c67d6573Sopenharmony_ci Job::SaveRestore { slot, old_pos } => { 183c67d6573Sopenharmony_ci if slot < self.slots.len() { 184c67d6573Sopenharmony_ci self.slots[slot] = old_pos; 185c67d6573Sopenharmony_ci } 186c67d6573Sopenharmony_ci } 187c67d6573Sopenharmony_ci } 188c67d6573Sopenharmony_ci } 189c67d6573Sopenharmony_ci matched 190c67d6573Sopenharmony_ci } 191c67d6573Sopenharmony_ci 192c67d6573Sopenharmony_ci fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { 193c67d6573Sopenharmony_ci use crate::prog::Inst::*; 194c67d6573Sopenharmony_ci loop { 195c67d6573Sopenharmony_ci // This loop is an optimization to avoid constantly pushing/popping 196c67d6573Sopenharmony_ci // from the stack. Namely, if we're pushing a job only to run it 197c67d6573Sopenharmony_ci // next, avoid the push and just mutate `ip` (and possibly `at`) 198c67d6573Sopenharmony_ci // in place. 199c67d6573Sopenharmony_ci if self.has_visited(ip, at) { 200c67d6573Sopenharmony_ci return false; 201c67d6573Sopenharmony_ci } 202c67d6573Sopenharmony_ci match self.prog[ip] { 203c67d6573Sopenharmony_ci Match(slot) => { 204c67d6573Sopenharmony_ci if slot < self.matches.len() { 205c67d6573Sopenharmony_ci self.matches[slot] = true; 206c67d6573Sopenharmony_ci } 207c67d6573Sopenharmony_ci return true; 208c67d6573Sopenharmony_ci } 209c67d6573Sopenharmony_ci Save(ref inst) => { 210c67d6573Sopenharmony_ci if let Some(&old_pos) = self.slots.get(inst.slot) { 211c67d6573Sopenharmony_ci // If this path doesn't work out, then we save the old 212c67d6573Sopenharmony_ci // capture index (if one exists) in an alternate 213c67d6573Sopenharmony_ci // job. If the next path fails, then the alternate 214c67d6573Sopenharmony_ci // job is popped and the old capture index is restored. 215c67d6573Sopenharmony_ci self.m.jobs.push(Job::SaveRestore { 216c67d6573Sopenharmony_ci slot: inst.slot, 217c67d6573Sopenharmony_ci old_pos, 218c67d6573Sopenharmony_ci }); 219c67d6573Sopenharmony_ci self.slots[inst.slot] = Some(at.pos()); 220c67d6573Sopenharmony_ci } 221c67d6573Sopenharmony_ci ip = inst.goto; 222c67d6573Sopenharmony_ci } 223c67d6573Sopenharmony_ci Split(ref inst) => { 224c67d6573Sopenharmony_ci self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); 225c67d6573Sopenharmony_ci ip = inst.goto1; 226c67d6573Sopenharmony_ci } 227c67d6573Sopenharmony_ci EmptyLook(ref inst) => { 228c67d6573Sopenharmony_ci if self.input.is_empty_match(at, inst) { 229c67d6573Sopenharmony_ci ip = inst.goto; 230c67d6573Sopenharmony_ci } else { 231c67d6573Sopenharmony_ci return false; 232c67d6573Sopenharmony_ci } 233c67d6573Sopenharmony_ci } 234c67d6573Sopenharmony_ci Char(ref inst) => { 235c67d6573Sopenharmony_ci if inst.c == at.char() { 236c67d6573Sopenharmony_ci ip = inst.goto; 237c67d6573Sopenharmony_ci at = self.input.at(at.next_pos()); 238c67d6573Sopenharmony_ci } else { 239c67d6573Sopenharmony_ci return false; 240c67d6573Sopenharmony_ci } 241c67d6573Sopenharmony_ci } 242c67d6573Sopenharmony_ci Ranges(ref inst) => { 243c67d6573Sopenharmony_ci if inst.matches(at.char()) { 244c67d6573Sopenharmony_ci ip = inst.goto; 245c67d6573Sopenharmony_ci at = self.input.at(at.next_pos()); 246c67d6573Sopenharmony_ci } else { 247c67d6573Sopenharmony_ci return false; 248c67d6573Sopenharmony_ci } 249c67d6573Sopenharmony_ci } 250c67d6573Sopenharmony_ci Bytes(ref inst) => { 251c67d6573Sopenharmony_ci if let Some(b) = at.byte() { 252c67d6573Sopenharmony_ci if inst.matches(b) { 253c67d6573Sopenharmony_ci ip = inst.goto; 254c67d6573Sopenharmony_ci at = self.input.at(at.next_pos()); 255c67d6573Sopenharmony_ci continue; 256c67d6573Sopenharmony_ci } 257c67d6573Sopenharmony_ci } 258c67d6573Sopenharmony_ci return false; 259c67d6573Sopenharmony_ci } 260c67d6573Sopenharmony_ci } 261c67d6573Sopenharmony_ci } 262c67d6573Sopenharmony_ci } 263c67d6573Sopenharmony_ci 264c67d6573Sopenharmony_ci fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { 265c67d6573Sopenharmony_ci let k = ip * (self.input.len() + 1) + at.pos(); 266c67d6573Sopenharmony_ci let k1 = k / BIT_SIZE; 267c67d6573Sopenharmony_ci let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); 268c67d6573Sopenharmony_ci if self.m.visited[k1] & k2 == 0 { 269c67d6573Sopenharmony_ci self.m.visited[k1] |= k2; 270c67d6573Sopenharmony_ci false 271c67d6573Sopenharmony_ci } else { 272c67d6573Sopenharmony_ci true 273c67d6573Sopenharmony_ci } 274c67d6573Sopenharmony_ci } 275c67d6573Sopenharmony_ci} 276c67d6573Sopenharmony_ci 277c67d6573Sopenharmony_cifn usize_to_u32(n: usize) -> u32 { 278c67d6573Sopenharmony_ci if (n as u64) > (::std::u32::MAX as u64) { 279c67d6573Sopenharmony_ci panic!("BUG: {} is too big to fit into u32", n) 280c67d6573Sopenharmony_ci } 281c67d6573Sopenharmony_ci n as u32 282c67d6573Sopenharmony_ci} 283