106f54294Sopenharmony_ciuse crate::ahocorasick::MatchKind; 206f54294Sopenharmony_ciuse crate::prefilter::{self, Candidate, Prefilter, PrefilterState}; 306f54294Sopenharmony_ciuse crate::state_id::{dead_id, fail_id, StateID}; 406f54294Sopenharmony_ciuse crate::Match; 506f54294Sopenharmony_ci 606f54294Sopenharmony_ci// NOTE: This trait essentially started as a copy of the same trait from from 706f54294Sopenharmony_ci// regex-automata, with some wording changed since we use this trait for 806f54294Sopenharmony_ci// NFAs in addition to DFAs in this crate. Additionally, we do not export 906f54294Sopenharmony_ci// this trait. It's only used internally to reduce code duplication. The 1006f54294Sopenharmony_ci// regex-automata crate needs to expose it because its Regex type is generic 1106f54294Sopenharmony_ci// over implementations of this trait. In this crate, we encapsulate everything 1206f54294Sopenharmony_ci// behind the AhoCorasick type. 1306f54294Sopenharmony_ci// 1406f54294Sopenharmony_ci// This trait is a bit of a mess, but it's not quite clear how to fix it. 1506f54294Sopenharmony_ci// Basically, there are several competing concerns: 1606f54294Sopenharmony_ci// 1706f54294Sopenharmony_ci// * We need performance, so everything effectively needs to get monomorphized. 1806f54294Sopenharmony_ci// * There are several variations on searching Aho-Corasick automatons: 1906f54294Sopenharmony_ci// overlapping, standard and leftmost. Overlapping and standard are somewhat 2006f54294Sopenharmony_ci// combined together below, but there is no real way to combine standard with 2106f54294Sopenharmony_ci// leftmost. Namely, leftmost requires continuing a search even after a match 2206f54294Sopenharmony_ci// is found, in order to correctly disambiguate a match. 2306f54294Sopenharmony_ci// * On top of that, *sometimes* callers want to know which state the automaton 2406f54294Sopenharmony_ci// is in after searching. This is principally useful for overlapping and 2506f54294Sopenharmony_ci// stream searches. However, when callers don't care about this, we really 2606f54294Sopenharmony_ci// do not want to be forced to compute it, since it sometimes requires extra 2706f54294Sopenharmony_ci// work. Thus, there are effectively two copies of leftmost searching: one 2806f54294Sopenharmony_ci// for tracking the state ID and one that doesn't. We should ideally do the 2906f54294Sopenharmony_ci// same for standard searching, but my sanity stopped me. 3006f54294Sopenharmony_ci 3106f54294Sopenharmony_ci// SAFETY RATIONALE: Previously, the code below went to some length to remove 3206f54294Sopenharmony_ci// all bounds checks. This generally produced tighter assembly and lead to 3306f54294Sopenharmony_ci// 20-50% improvements in micro-benchmarks on corpora made up of random 3406f54294Sopenharmony_ci// characters. This somewhat makes sense, since the branch predictor is going 3506f54294Sopenharmony_ci// to be at its worse on random text. 3606f54294Sopenharmony_ci// 3706f54294Sopenharmony_ci// However, using the aho-corasick-debug tool and manually benchmarking 3806f54294Sopenharmony_ci// different inputs, the code *with* bounds checks actually wound up being 3906f54294Sopenharmony_ci// slightly faster: 4006f54294Sopenharmony_ci// 4106f54294Sopenharmony_ci// $ cat input 4206f54294Sopenharmony_ci// Sherlock Holmes 4306f54294Sopenharmony_ci// John Watson 4406f54294Sopenharmony_ci// Professor Moriarty 4506f54294Sopenharmony_ci// Irene Adler 4606f54294Sopenharmony_ci// Mary Watson 4706f54294Sopenharmony_ci// 4806f54294Sopenharmony_ci// $ aho-corasick-debug-safe \ 4906f54294Sopenharmony_ci// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa 5006f54294Sopenharmony_ci// pattern read time: 32.824µs 5106f54294Sopenharmony_ci// automaton build time: 444.687µs 5206f54294Sopenharmony_ci// automaton heap usage: 72392 bytes 5306f54294Sopenharmony_ci// match count: 639 5406f54294Sopenharmony_ci// count time: 1.809961702s 5506f54294Sopenharmony_ci// 5606f54294Sopenharmony_ci// $ aho-corasick-debug-master \ 5706f54294Sopenharmony_ci// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa 5806f54294Sopenharmony_ci// pattern read time: 31.425µs 5906f54294Sopenharmony_ci// automaton build time: 317.434µs 6006f54294Sopenharmony_ci// automaton heap usage: 72392 bytes 6106f54294Sopenharmony_ci// match count: 639 6206f54294Sopenharmony_ci// count time: 2.059157705s 6306f54294Sopenharmony_ci// 6406f54294Sopenharmony_ci// I was able to reproduce this result on two different machines (an i5 and 6506f54294Sopenharmony_ci// an i7). Therefore, we go the route of safe code for now. 6606f54294Sopenharmony_ci 6706f54294Sopenharmony_ci/// A trait describing the interface of an Aho-Corasick finite state machine. 6806f54294Sopenharmony_ci/// 6906f54294Sopenharmony_ci/// Every automaton has exactly one fail state, one dead state and exactly one 7006f54294Sopenharmony_ci/// start state. Generally, these correspond to the first, second and third 7106f54294Sopenharmony_ci/// states, respectively. The dead state is always treated as a sentinel. That 7206f54294Sopenharmony_ci/// is, no correct Aho-Corasick automaton will ever transition into the fail 7306f54294Sopenharmony_ci/// state. The dead state, however, can be transitioned into, but only when 7406f54294Sopenharmony_ci/// leftmost-first or leftmost-longest match semantics are enabled and only 7506f54294Sopenharmony_ci/// when at least one match has been observed. 7606f54294Sopenharmony_ci/// 7706f54294Sopenharmony_ci/// Every automaton also has one or more match states, such that 7806f54294Sopenharmony_ci/// `Automaton::is_match_state(id)` returns `true` if and only if `id` 7906f54294Sopenharmony_ci/// corresponds to a match state. 8006f54294Sopenharmony_cipub trait Automaton { 8106f54294Sopenharmony_ci /// The representation used for state identifiers in this automaton. 8206f54294Sopenharmony_ci /// 8306f54294Sopenharmony_ci /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. 8406f54294Sopenharmony_ci type ID: StateID; 8506f54294Sopenharmony_ci 8606f54294Sopenharmony_ci /// The type of matching that should be done. 8706f54294Sopenharmony_ci fn match_kind(&self) -> &MatchKind; 8806f54294Sopenharmony_ci 8906f54294Sopenharmony_ci /// Returns true if and only if this automaton uses anchored searches. 9006f54294Sopenharmony_ci fn anchored(&self) -> bool; 9106f54294Sopenharmony_ci 9206f54294Sopenharmony_ci /// An optional prefilter for quickly skipping to the next candidate match. 9306f54294Sopenharmony_ci /// A prefilter must report at least every match, although it may report 9406f54294Sopenharmony_ci /// positions that do not correspond to a match. That is, it must not allow 9506f54294Sopenharmony_ci /// false negatives, but can allow false positives. 9606f54294Sopenharmony_ci /// 9706f54294Sopenharmony_ci /// Currently, a prefilter only runs when the automaton is in the start 9806f54294Sopenharmony_ci /// state. That is, the position reported by a prefilter should always 9906f54294Sopenharmony_ci /// correspond to the start of a potential match. 10006f54294Sopenharmony_ci fn prefilter(&self) -> Option<&dyn Prefilter>; 10106f54294Sopenharmony_ci 10206f54294Sopenharmony_ci /// Return the identifier of this automaton's start state. 10306f54294Sopenharmony_ci fn start_state(&self) -> Self::ID; 10406f54294Sopenharmony_ci 10506f54294Sopenharmony_ci /// Returns true if and only if the given state identifier refers to a 10606f54294Sopenharmony_ci /// valid state. 10706f54294Sopenharmony_ci fn is_valid(&self, id: Self::ID) -> bool; 10806f54294Sopenharmony_ci 10906f54294Sopenharmony_ci /// Returns true if and only if the given identifier corresponds to a match 11006f54294Sopenharmony_ci /// state. 11106f54294Sopenharmony_ci /// 11206f54294Sopenharmony_ci /// The state ID given must be valid, or else implementors may panic. 11306f54294Sopenharmony_ci fn is_match_state(&self, id: Self::ID) -> bool; 11406f54294Sopenharmony_ci 11506f54294Sopenharmony_ci /// Returns true if and only if the given identifier corresponds to a state 11606f54294Sopenharmony_ci /// that is either the dead state or a match state. 11706f54294Sopenharmony_ci /// 11806f54294Sopenharmony_ci /// Depending on the implementation of the automaton, this routine can 11906f54294Sopenharmony_ci /// be used to save a branch in the core matching loop. Nevertheless, 12006f54294Sopenharmony_ci /// `is_match_state(id) || id == dead_id()` is always a valid 12106f54294Sopenharmony_ci /// implementation. Indeed, this is the default implementation. 12206f54294Sopenharmony_ci /// 12306f54294Sopenharmony_ci /// The state ID given must be valid, or else implementors may panic. 12406f54294Sopenharmony_ci fn is_match_or_dead_state(&self, id: Self::ID) -> bool { 12506f54294Sopenharmony_ci id == dead_id() || self.is_match_state(id) 12606f54294Sopenharmony_ci } 12706f54294Sopenharmony_ci 12806f54294Sopenharmony_ci /// If the given state is a match state, return the match corresponding 12906f54294Sopenharmony_ci /// to the given match index. `end` must be the ending position of the 13006f54294Sopenharmony_ci /// detected match. If no match exists or if `match_index` exceeds the 13106f54294Sopenharmony_ci /// number of matches in this state, then `None` is returned. 13206f54294Sopenharmony_ci /// 13306f54294Sopenharmony_ci /// The state ID given must be valid, or else implementors may panic. 13406f54294Sopenharmony_ci /// 13506f54294Sopenharmony_ci /// If the given state ID is correct and if the `match_index` is less than 13606f54294Sopenharmony_ci /// the number of matches for that state, then this is guaranteed to return 13706f54294Sopenharmony_ci /// a match. 13806f54294Sopenharmony_ci fn get_match( 13906f54294Sopenharmony_ci &self, 14006f54294Sopenharmony_ci id: Self::ID, 14106f54294Sopenharmony_ci match_index: usize, 14206f54294Sopenharmony_ci end: usize, 14306f54294Sopenharmony_ci ) -> Option<Match>; 14406f54294Sopenharmony_ci 14506f54294Sopenharmony_ci /// Returns the number of matches for the given state. If the given state 14606f54294Sopenharmony_ci /// is not a match state, then this returns 0. 14706f54294Sopenharmony_ci /// 14806f54294Sopenharmony_ci /// The state ID given must be valid, or else implementors must panic. 14906f54294Sopenharmony_ci fn match_count(&self, id: Self::ID) -> usize; 15006f54294Sopenharmony_ci 15106f54294Sopenharmony_ci /// Given the current state that this automaton is in and the next input 15206f54294Sopenharmony_ci /// byte, this method returns the identifier of the next state. The 15306f54294Sopenharmony_ci /// identifier returned must always be valid and may never correspond to 15406f54294Sopenharmony_ci /// the fail state. The returned identifier may, however, point to the 15506f54294Sopenharmony_ci /// dead state. 15606f54294Sopenharmony_ci /// 15706f54294Sopenharmony_ci /// This is not safe so that implementors may look up the next state 15806f54294Sopenharmony_ci /// without memory safety checks such as bounds checks. As such, callers 15906f54294Sopenharmony_ci /// must ensure that the given identifier corresponds to a valid automaton 16006f54294Sopenharmony_ci /// state. Implementors must, in turn, ensure that this routine is safe for 16106f54294Sopenharmony_ci /// all valid state identifiers and for all possible `u8` values. 16206f54294Sopenharmony_ci fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; 16306f54294Sopenharmony_ci 16406f54294Sopenharmony_ci /// Like next_state, but debug_asserts that the underlying 16506f54294Sopenharmony_ci /// implementation never returns a `fail_id()` for the next state. 16606f54294Sopenharmony_ci fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { 16706f54294Sopenharmony_ci let next = self.next_state(current, input); 16806f54294Sopenharmony_ci // We should never see a transition to the failure state. 16906f54294Sopenharmony_ci debug_assert!( 17006f54294Sopenharmony_ci next != fail_id(), 17106f54294Sopenharmony_ci "automaton should never return fail_id for next state" 17206f54294Sopenharmony_ci ); 17306f54294Sopenharmony_ci next 17406f54294Sopenharmony_ci } 17506f54294Sopenharmony_ci 17606f54294Sopenharmony_ci /// Execute a search using standard match semantics. 17706f54294Sopenharmony_ci /// 17806f54294Sopenharmony_ci /// This can be used even when the automaton was constructed with leftmost 17906f54294Sopenharmony_ci /// match semantics when you want to find the earliest possible match. This 18006f54294Sopenharmony_ci /// can also be used as part of an overlapping search implementation. 18106f54294Sopenharmony_ci /// 18206f54294Sopenharmony_ci /// N.B. This does not report a match if `state_id` is given as a matching 18306f54294Sopenharmony_ci /// state. As such, this should not be used directly. 18406f54294Sopenharmony_ci #[inline(always)] 18506f54294Sopenharmony_ci fn standard_find_at( 18606f54294Sopenharmony_ci &self, 18706f54294Sopenharmony_ci prestate: &mut PrefilterState, 18806f54294Sopenharmony_ci haystack: &[u8], 18906f54294Sopenharmony_ci at: usize, 19006f54294Sopenharmony_ci state_id: &mut Self::ID, 19106f54294Sopenharmony_ci ) -> Option<Match> { 19206f54294Sopenharmony_ci if let Some(pre) = self.prefilter() { 19306f54294Sopenharmony_ci self.standard_find_at_imp( 19406f54294Sopenharmony_ci prestate, 19506f54294Sopenharmony_ci Some(pre), 19606f54294Sopenharmony_ci haystack, 19706f54294Sopenharmony_ci at, 19806f54294Sopenharmony_ci state_id, 19906f54294Sopenharmony_ci ) 20006f54294Sopenharmony_ci } else { 20106f54294Sopenharmony_ci self.standard_find_at_imp(prestate, None, haystack, at, state_id) 20206f54294Sopenharmony_ci } 20306f54294Sopenharmony_ci } 20406f54294Sopenharmony_ci 20506f54294Sopenharmony_ci // It's important for this to always be inlined. Namely, its only caller 20606f54294Sopenharmony_ci // is standard_find_at, and the inlining should remove the case analysis 20706f54294Sopenharmony_ci // for prefilter scanning when there is no prefilter available. 20806f54294Sopenharmony_ci #[inline(always)] 20906f54294Sopenharmony_ci fn standard_find_at_imp( 21006f54294Sopenharmony_ci &self, 21106f54294Sopenharmony_ci prestate: &mut PrefilterState, 21206f54294Sopenharmony_ci prefilter: Option<&dyn Prefilter>, 21306f54294Sopenharmony_ci haystack: &[u8], 21406f54294Sopenharmony_ci mut at: usize, 21506f54294Sopenharmony_ci state_id: &mut Self::ID, 21606f54294Sopenharmony_ci ) -> Option<Match> { 21706f54294Sopenharmony_ci while at < haystack.len() { 21806f54294Sopenharmony_ci if let Some(pre) = prefilter { 21906f54294Sopenharmony_ci if prestate.is_effective(at) && *state_id == self.start_state() 22006f54294Sopenharmony_ci { 22106f54294Sopenharmony_ci let c = prefilter::next(prestate, pre, haystack, at) 22206f54294Sopenharmony_ci .into_option(); 22306f54294Sopenharmony_ci match c { 22406f54294Sopenharmony_ci None => return None, 22506f54294Sopenharmony_ci Some(i) => { 22606f54294Sopenharmony_ci at = i; 22706f54294Sopenharmony_ci } 22806f54294Sopenharmony_ci } 22906f54294Sopenharmony_ci } 23006f54294Sopenharmony_ci } 23106f54294Sopenharmony_ci // CORRECTNESS: next_state is correct for all possible u8 values, 23206f54294Sopenharmony_ci // so the only thing we're concerned about is the validity of 23306f54294Sopenharmony_ci // `state_id`. `state_id` either comes from the caller (in which 23406f54294Sopenharmony_ci // case, we assume it is correct), or it comes from the return 23506f54294Sopenharmony_ci // value of next_state, which is guaranteed to be correct. 23606f54294Sopenharmony_ci *state_id = self.next_state_no_fail(*state_id, haystack[at]); 23706f54294Sopenharmony_ci at += 1; 23806f54294Sopenharmony_ci // This routine always quits immediately after seeing a 23906f54294Sopenharmony_ci // match, and since dead states can only come after seeing 24006f54294Sopenharmony_ci // a match, seeing a dead state here is impossible. (Unless 24106f54294Sopenharmony_ci // we have an anchored automaton, in which case, dead states 24206f54294Sopenharmony_ci // are used to stop a search.) 24306f54294Sopenharmony_ci debug_assert!( 24406f54294Sopenharmony_ci *state_id != dead_id() || self.anchored(), 24506f54294Sopenharmony_ci "standard find should never see a dead state" 24606f54294Sopenharmony_ci ); 24706f54294Sopenharmony_ci 24806f54294Sopenharmony_ci if self.is_match_or_dead_state(*state_id) { 24906f54294Sopenharmony_ci return if *state_id == dead_id() { 25006f54294Sopenharmony_ci None 25106f54294Sopenharmony_ci } else { 25206f54294Sopenharmony_ci self.get_match(*state_id, 0, at) 25306f54294Sopenharmony_ci }; 25406f54294Sopenharmony_ci } 25506f54294Sopenharmony_ci } 25606f54294Sopenharmony_ci None 25706f54294Sopenharmony_ci } 25806f54294Sopenharmony_ci 25906f54294Sopenharmony_ci /// Execute a search using leftmost (either first or longest) match 26006f54294Sopenharmony_ci /// semantics. 26106f54294Sopenharmony_ci /// 26206f54294Sopenharmony_ci /// The principle difference between searching with standard semantics and 26306f54294Sopenharmony_ci /// searching with leftmost semantics is that leftmost searching will 26406f54294Sopenharmony_ci /// continue searching even after a match has been found. Once a match 26506f54294Sopenharmony_ci /// is found, the search does not stop until either the haystack has been 26606f54294Sopenharmony_ci /// exhausted or a dead state is observed in the automaton. (Dead states 26706f54294Sopenharmony_ci /// only exist in automatons constructed with leftmost semantics.) That is, 26806f54294Sopenharmony_ci /// we rely on the construction of the automaton to tell us when to quit. 26906f54294Sopenharmony_ci #[inline(never)] 27006f54294Sopenharmony_ci fn leftmost_find_at( 27106f54294Sopenharmony_ci &self, 27206f54294Sopenharmony_ci prestate: &mut PrefilterState, 27306f54294Sopenharmony_ci haystack: &[u8], 27406f54294Sopenharmony_ci at: usize, 27506f54294Sopenharmony_ci state_id: &mut Self::ID, 27606f54294Sopenharmony_ci ) -> Option<Match> { 27706f54294Sopenharmony_ci if let Some(pre) = self.prefilter() { 27806f54294Sopenharmony_ci self.leftmost_find_at_imp( 27906f54294Sopenharmony_ci prestate, 28006f54294Sopenharmony_ci Some(pre), 28106f54294Sopenharmony_ci haystack, 28206f54294Sopenharmony_ci at, 28306f54294Sopenharmony_ci state_id, 28406f54294Sopenharmony_ci ) 28506f54294Sopenharmony_ci } else { 28606f54294Sopenharmony_ci self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) 28706f54294Sopenharmony_ci } 28806f54294Sopenharmony_ci } 28906f54294Sopenharmony_ci 29006f54294Sopenharmony_ci // It's important for this to always be inlined. Namely, its only caller 29106f54294Sopenharmony_ci // is leftmost_find_at, and the inlining should remove the case analysis 29206f54294Sopenharmony_ci // for prefilter scanning when there is no prefilter available. 29306f54294Sopenharmony_ci #[inline(always)] 29406f54294Sopenharmony_ci fn leftmost_find_at_imp( 29506f54294Sopenharmony_ci &self, 29606f54294Sopenharmony_ci prestate: &mut PrefilterState, 29706f54294Sopenharmony_ci prefilter: Option<&dyn Prefilter>, 29806f54294Sopenharmony_ci haystack: &[u8], 29906f54294Sopenharmony_ci mut at: usize, 30006f54294Sopenharmony_ci state_id: &mut Self::ID, 30106f54294Sopenharmony_ci ) -> Option<Match> { 30206f54294Sopenharmony_ci debug_assert!(self.match_kind().is_leftmost()); 30306f54294Sopenharmony_ci if self.anchored() && at > 0 && *state_id == self.start_state() { 30406f54294Sopenharmony_ci return None; 30506f54294Sopenharmony_ci } 30606f54294Sopenharmony_ci let mut last_match = self.get_match(*state_id, 0, at); 30706f54294Sopenharmony_ci while at < haystack.len() { 30806f54294Sopenharmony_ci if let Some(pre) = prefilter { 30906f54294Sopenharmony_ci if prestate.is_effective(at) && *state_id == self.start_state() 31006f54294Sopenharmony_ci { 31106f54294Sopenharmony_ci let c = prefilter::next(prestate, pre, haystack, at) 31206f54294Sopenharmony_ci .into_option(); 31306f54294Sopenharmony_ci match c { 31406f54294Sopenharmony_ci None => return None, 31506f54294Sopenharmony_ci Some(i) => { 31606f54294Sopenharmony_ci at = i; 31706f54294Sopenharmony_ci } 31806f54294Sopenharmony_ci } 31906f54294Sopenharmony_ci } 32006f54294Sopenharmony_ci } 32106f54294Sopenharmony_ci // CORRECTNESS: next_state is correct for all possible u8 values, 32206f54294Sopenharmony_ci // so the only thing we're concerned about is the validity of 32306f54294Sopenharmony_ci // `state_id`. `state_id` either comes from the caller (in which 32406f54294Sopenharmony_ci // case, we assume it is correct), or it comes from the return 32506f54294Sopenharmony_ci // value of next_state, which is guaranteed to be correct. 32606f54294Sopenharmony_ci *state_id = self.next_state_no_fail(*state_id, haystack[at]); 32706f54294Sopenharmony_ci at += 1; 32806f54294Sopenharmony_ci if self.is_match_or_dead_state(*state_id) { 32906f54294Sopenharmony_ci if *state_id == dead_id() { 33006f54294Sopenharmony_ci // The only way to enter into a dead state is if a match 33106f54294Sopenharmony_ci // has been found, so we assert as much. This is different 33206f54294Sopenharmony_ci // from normal automata, where you might enter a dead state 33306f54294Sopenharmony_ci // if you know a subsequent match will never be found 33406f54294Sopenharmony_ci // (regardless of whether a match has already been found). 33506f54294Sopenharmony_ci // For Aho-Corasick, it is built so that we can match at 33606f54294Sopenharmony_ci // any position, so the possibility of a match always 33706f54294Sopenharmony_ci // exists. 33806f54294Sopenharmony_ci // 33906f54294Sopenharmony_ci // (Unless we have an anchored automaton, in which case, 34006f54294Sopenharmony_ci // dead states are used to stop a search.) 34106f54294Sopenharmony_ci debug_assert!( 34206f54294Sopenharmony_ci last_match.is_some() || self.anchored(), 34306f54294Sopenharmony_ci "dead state should only be seen after match" 34406f54294Sopenharmony_ci ); 34506f54294Sopenharmony_ci return last_match; 34606f54294Sopenharmony_ci } 34706f54294Sopenharmony_ci last_match = self.get_match(*state_id, 0, at); 34806f54294Sopenharmony_ci } 34906f54294Sopenharmony_ci } 35006f54294Sopenharmony_ci last_match 35106f54294Sopenharmony_ci } 35206f54294Sopenharmony_ci 35306f54294Sopenharmony_ci /// This is like leftmost_find_at, but does not need to track a caller 35406f54294Sopenharmony_ci /// provided state id. In other words, the only output of this routine is a 35506f54294Sopenharmony_ci /// match, if one exists. 35606f54294Sopenharmony_ci /// 35706f54294Sopenharmony_ci /// It is regrettable that we need to effectively copy a chunk of 35806f54294Sopenharmony_ci /// implementation twice, but when we don't need to track the state ID, we 35906f54294Sopenharmony_ci /// can allow the prefilter to report matches immediately without having 36006f54294Sopenharmony_ci /// to re-confirm them with the automaton. The re-confirmation step is 36106f54294Sopenharmony_ci /// necessary in leftmost_find_at because tracing through the automaton is 36206f54294Sopenharmony_ci /// the only way to correctly set the state ID. (Perhaps an alternative 36306f54294Sopenharmony_ci /// would be to keep a map from pattern ID to matching state ID, but that 36406f54294Sopenharmony_ci /// complicates the code and still doesn't permit us to defer to the 36506f54294Sopenharmony_ci /// prefilter entirely when possible.) 36606f54294Sopenharmony_ci /// 36706f54294Sopenharmony_ci /// I did try a few things to avoid the code duplication here, but nothing 36806f54294Sopenharmony_ci /// optimized as well as this approach. (In microbenchmarks, there was 36906f54294Sopenharmony_ci /// about a 25% difference.) 37006f54294Sopenharmony_ci #[inline(never)] 37106f54294Sopenharmony_ci fn leftmost_find_at_no_state( 37206f54294Sopenharmony_ci &self, 37306f54294Sopenharmony_ci prestate: &mut PrefilterState, 37406f54294Sopenharmony_ci haystack: &[u8], 37506f54294Sopenharmony_ci at: usize, 37606f54294Sopenharmony_ci ) -> Option<Match> { 37706f54294Sopenharmony_ci if let Some(pre) = self.prefilter() { 37806f54294Sopenharmony_ci self.leftmost_find_at_no_state_imp( 37906f54294Sopenharmony_ci prestate, 38006f54294Sopenharmony_ci Some(pre), 38106f54294Sopenharmony_ci haystack, 38206f54294Sopenharmony_ci at, 38306f54294Sopenharmony_ci ) 38406f54294Sopenharmony_ci } else { 38506f54294Sopenharmony_ci self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) 38606f54294Sopenharmony_ci } 38706f54294Sopenharmony_ci } 38806f54294Sopenharmony_ci 38906f54294Sopenharmony_ci // It's important for this to always be inlined. Namely, its only caller 39006f54294Sopenharmony_ci // is leftmost_find_at_no_state, and the inlining should remove the case 39106f54294Sopenharmony_ci // analysis for prefilter scanning when there is no prefilter available. 39206f54294Sopenharmony_ci #[inline(always)] 39306f54294Sopenharmony_ci fn leftmost_find_at_no_state_imp( 39406f54294Sopenharmony_ci &self, 39506f54294Sopenharmony_ci prestate: &mut PrefilterState, 39606f54294Sopenharmony_ci prefilter: Option<&dyn Prefilter>, 39706f54294Sopenharmony_ci haystack: &[u8], 39806f54294Sopenharmony_ci mut at: usize, 39906f54294Sopenharmony_ci ) -> Option<Match> { 40006f54294Sopenharmony_ci debug_assert!(self.match_kind().is_leftmost()); 40106f54294Sopenharmony_ci if self.anchored() && at > 0 { 40206f54294Sopenharmony_ci return None; 40306f54294Sopenharmony_ci } 40406f54294Sopenharmony_ci // If our prefilter handles confirmation of matches 100% of the 40506f54294Sopenharmony_ci // time, and since we don't need to track state IDs, we can avoid 40606f54294Sopenharmony_ci // Aho-Corasick completely. 40706f54294Sopenharmony_ci if let Some(pre) = prefilter { 40806f54294Sopenharmony_ci // We should never have a prefilter during an anchored search. 40906f54294Sopenharmony_ci debug_assert!(!self.anchored()); 41006f54294Sopenharmony_ci if !pre.reports_false_positives() { 41106f54294Sopenharmony_ci return match pre.next_candidate(prestate, haystack, at) { 41206f54294Sopenharmony_ci Candidate::None => None, 41306f54294Sopenharmony_ci Candidate::Match(m) => Some(m), 41406f54294Sopenharmony_ci Candidate::PossibleStartOfMatch(_) => unreachable!(), 41506f54294Sopenharmony_ci }; 41606f54294Sopenharmony_ci } 41706f54294Sopenharmony_ci } 41806f54294Sopenharmony_ci 41906f54294Sopenharmony_ci let mut state_id = self.start_state(); 42006f54294Sopenharmony_ci let mut last_match = self.get_match(state_id, 0, at); 42106f54294Sopenharmony_ci while at < haystack.len() { 42206f54294Sopenharmony_ci if let Some(pre) = prefilter { 42306f54294Sopenharmony_ci if prestate.is_effective(at) && state_id == self.start_state() 42406f54294Sopenharmony_ci { 42506f54294Sopenharmony_ci match prefilter::next(prestate, pre, haystack, at) { 42606f54294Sopenharmony_ci Candidate::None => return None, 42706f54294Sopenharmony_ci // Since we aren't tracking a state ID, we can 42806f54294Sopenharmony_ci // quit early once we know we have a match. 42906f54294Sopenharmony_ci Candidate::Match(m) => return Some(m), 43006f54294Sopenharmony_ci Candidate::PossibleStartOfMatch(i) => { 43106f54294Sopenharmony_ci at = i; 43206f54294Sopenharmony_ci } 43306f54294Sopenharmony_ci } 43406f54294Sopenharmony_ci } 43506f54294Sopenharmony_ci } 43606f54294Sopenharmony_ci // CORRECTNESS: next_state is correct for all possible u8 values, 43706f54294Sopenharmony_ci // so the only thing we're concerned about is the validity of 43806f54294Sopenharmony_ci // `state_id`. `state_id` either comes from the caller (in which 43906f54294Sopenharmony_ci // case, we assume it is correct), or it comes from the return 44006f54294Sopenharmony_ci // value of next_state, which is guaranteed to be correct. 44106f54294Sopenharmony_ci state_id = self.next_state_no_fail(state_id, haystack[at]); 44206f54294Sopenharmony_ci at += 1; 44306f54294Sopenharmony_ci if self.is_match_or_dead_state(state_id) { 44406f54294Sopenharmony_ci if state_id == dead_id() { 44506f54294Sopenharmony_ci // The only way to enter into a dead state is if a 44606f54294Sopenharmony_ci // match has been found, so we assert as much. This 44706f54294Sopenharmony_ci // is different from normal automata, where you might 44806f54294Sopenharmony_ci // enter a dead state if you know a subsequent match 44906f54294Sopenharmony_ci // will never be found (regardless of whether a match 45006f54294Sopenharmony_ci // has already been found). For Aho-Corasick, it is 45106f54294Sopenharmony_ci // built so that we can match at any position, so the 45206f54294Sopenharmony_ci // possibility of a match always exists. 45306f54294Sopenharmony_ci // 45406f54294Sopenharmony_ci // (Unless we have an anchored automaton, in which 45506f54294Sopenharmony_ci // case, dead states are used to stop a search.) 45606f54294Sopenharmony_ci debug_assert!( 45706f54294Sopenharmony_ci last_match.is_some() || self.anchored(), 45806f54294Sopenharmony_ci "dead state should only be seen after match" 45906f54294Sopenharmony_ci ); 46006f54294Sopenharmony_ci return last_match; 46106f54294Sopenharmony_ci } 46206f54294Sopenharmony_ci last_match = self.get_match(state_id, 0, at); 46306f54294Sopenharmony_ci } 46406f54294Sopenharmony_ci } 46506f54294Sopenharmony_ci last_match 46606f54294Sopenharmony_ci } 46706f54294Sopenharmony_ci 46806f54294Sopenharmony_ci /// Execute an overlapping search. 46906f54294Sopenharmony_ci /// 47006f54294Sopenharmony_ci /// When executing an overlapping match, the previous state ID in addition 47106f54294Sopenharmony_ci /// to the previous match index should be given. If there are more matches 47206f54294Sopenharmony_ci /// at the given state, then the match is reported and the given index is 47306f54294Sopenharmony_ci /// incremented. 47406f54294Sopenharmony_ci #[inline(always)] 47506f54294Sopenharmony_ci fn overlapping_find_at( 47606f54294Sopenharmony_ci &self, 47706f54294Sopenharmony_ci prestate: &mut PrefilterState, 47806f54294Sopenharmony_ci haystack: &[u8], 47906f54294Sopenharmony_ci at: usize, 48006f54294Sopenharmony_ci state_id: &mut Self::ID, 48106f54294Sopenharmony_ci match_index: &mut usize, 48206f54294Sopenharmony_ci ) -> Option<Match> { 48306f54294Sopenharmony_ci if self.anchored() && at > 0 && *state_id == self.start_state() { 48406f54294Sopenharmony_ci return None; 48506f54294Sopenharmony_ci } 48606f54294Sopenharmony_ci 48706f54294Sopenharmony_ci let match_count = self.match_count(*state_id); 48806f54294Sopenharmony_ci if *match_index < match_count { 48906f54294Sopenharmony_ci // This is guaranteed to return a match since 49006f54294Sopenharmony_ci // match_index < match_count. 49106f54294Sopenharmony_ci let result = self.get_match(*state_id, *match_index, at); 49206f54294Sopenharmony_ci debug_assert!(result.is_some(), "must be a match"); 49306f54294Sopenharmony_ci *match_index += 1; 49406f54294Sopenharmony_ci return result; 49506f54294Sopenharmony_ci } 49606f54294Sopenharmony_ci 49706f54294Sopenharmony_ci *match_index = 0; 49806f54294Sopenharmony_ci match self.standard_find_at(prestate, haystack, at, state_id) { 49906f54294Sopenharmony_ci None => None, 50006f54294Sopenharmony_ci Some(m) => { 50106f54294Sopenharmony_ci *match_index = 1; 50206f54294Sopenharmony_ci Some(m) 50306f54294Sopenharmony_ci } 50406f54294Sopenharmony_ci } 50506f54294Sopenharmony_ci } 50606f54294Sopenharmony_ci 50706f54294Sopenharmony_ci /// Return the earliest match found. This returns as soon as we know that 50806f54294Sopenharmony_ci /// we have a match. As such, this does not necessarily correspond to the 50906f54294Sopenharmony_ci /// leftmost starting match, but rather, the leftmost position at which a 51006f54294Sopenharmony_ci /// match ends. 51106f54294Sopenharmony_ci #[inline(always)] 51206f54294Sopenharmony_ci fn earliest_find_at( 51306f54294Sopenharmony_ci &self, 51406f54294Sopenharmony_ci prestate: &mut PrefilterState, 51506f54294Sopenharmony_ci haystack: &[u8], 51606f54294Sopenharmony_ci at: usize, 51706f54294Sopenharmony_ci state_id: &mut Self::ID, 51806f54294Sopenharmony_ci ) -> Option<Match> { 51906f54294Sopenharmony_ci if *state_id == self.start_state() { 52006f54294Sopenharmony_ci if self.anchored() && at > 0 { 52106f54294Sopenharmony_ci return None; 52206f54294Sopenharmony_ci } 52306f54294Sopenharmony_ci if let Some(m) = self.get_match(*state_id, 0, at) { 52406f54294Sopenharmony_ci return Some(m); 52506f54294Sopenharmony_ci } 52606f54294Sopenharmony_ci } 52706f54294Sopenharmony_ci self.standard_find_at(prestate, haystack, at, state_id) 52806f54294Sopenharmony_ci } 52906f54294Sopenharmony_ci 53006f54294Sopenharmony_ci /// A convenience function for finding the next match according to the 53106f54294Sopenharmony_ci /// match semantics of this automaton. For standard match semantics, this 53206f54294Sopenharmony_ci /// finds the earliest match. Otherwise, the leftmost match is found. 53306f54294Sopenharmony_ci #[inline(always)] 53406f54294Sopenharmony_ci fn find_at( 53506f54294Sopenharmony_ci &self, 53606f54294Sopenharmony_ci prestate: &mut PrefilterState, 53706f54294Sopenharmony_ci haystack: &[u8], 53806f54294Sopenharmony_ci at: usize, 53906f54294Sopenharmony_ci state_id: &mut Self::ID, 54006f54294Sopenharmony_ci ) -> Option<Match> { 54106f54294Sopenharmony_ci match *self.match_kind() { 54206f54294Sopenharmony_ci MatchKind::Standard => { 54306f54294Sopenharmony_ci self.earliest_find_at(prestate, haystack, at, state_id) 54406f54294Sopenharmony_ci } 54506f54294Sopenharmony_ci MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { 54606f54294Sopenharmony_ci self.leftmost_find_at(prestate, haystack, at, state_id) 54706f54294Sopenharmony_ci } 54806f54294Sopenharmony_ci MatchKind::__Nonexhaustive => unreachable!(), 54906f54294Sopenharmony_ci } 55006f54294Sopenharmony_ci } 55106f54294Sopenharmony_ci 55206f54294Sopenharmony_ci /// Like find_at, but does not track state identifiers. This permits some 55306f54294Sopenharmony_ci /// optimizations when a prefilter that confirms its own matches is 55406f54294Sopenharmony_ci /// present. 55506f54294Sopenharmony_ci #[inline(always)] 55606f54294Sopenharmony_ci fn find_at_no_state( 55706f54294Sopenharmony_ci &self, 55806f54294Sopenharmony_ci prestate: &mut PrefilterState, 55906f54294Sopenharmony_ci haystack: &[u8], 56006f54294Sopenharmony_ci at: usize, 56106f54294Sopenharmony_ci ) -> Option<Match> { 56206f54294Sopenharmony_ci match *self.match_kind() { 56306f54294Sopenharmony_ci MatchKind::Standard => { 56406f54294Sopenharmony_ci let mut state = self.start_state(); 56506f54294Sopenharmony_ci self.earliest_find_at(prestate, haystack, at, &mut state) 56606f54294Sopenharmony_ci } 56706f54294Sopenharmony_ci MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { 56806f54294Sopenharmony_ci self.leftmost_find_at_no_state(prestate, haystack, at) 56906f54294Sopenharmony_ci } 57006f54294Sopenharmony_ci MatchKind::__Nonexhaustive => unreachable!(), 57106f54294Sopenharmony_ci } 57206f54294Sopenharmony_ci } 57306f54294Sopenharmony_ci} 574