106f54294Sopenharmony_ciuse crate::ahocorasick::MatchKind;
206f54294Sopenharmony_ciuse crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
306f54294Sopenharmony_ciuse crate::state_id::{dead_id, fail_id, StateID};
406f54294Sopenharmony_ciuse crate::Match;
506f54294Sopenharmony_ci
606f54294Sopenharmony_ci// NOTE: This trait essentially started as a copy of the same trait from from
706f54294Sopenharmony_ci// regex-automata, with some wording changed since we use this trait for
806f54294Sopenharmony_ci// NFAs in addition to DFAs in this crate. Additionally, we do not export
906f54294Sopenharmony_ci// this trait. It's only used internally to reduce code duplication. The
1006f54294Sopenharmony_ci// regex-automata crate needs to expose it because its Regex type is generic
1106f54294Sopenharmony_ci// over implementations of this trait. In this crate, we encapsulate everything
1206f54294Sopenharmony_ci// behind the AhoCorasick type.
1306f54294Sopenharmony_ci//
1406f54294Sopenharmony_ci// This trait is a bit of a mess, but it's not quite clear how to fix it.
1506f54294Sopenharmony_ci// Basically, there are several competing concerns:
1606f54294Sopenharmony_ci//
1706f54294Sopenharmony_ci// * We need performance, so everything effectively needs to get monomorphized.
1806f54294Sopenharmony_ci// * There are several variations on searching Aho-Corasick automatons:
1906f54294Sopenharmony_ci//   overlapping, standard and leftmost. Overlapping and standard are somewhat
2006f54294Sopenharmony_ci//   combined together below, but there is no real way to combine standard with
2106f54294Sopenharmony_ci//   leftmost. Namely, leftmost requires continuing a search even after a match
2206f54294Sopenharmony_ci//   is found, in order to correctly disambiguate a match.
2306f54294Sopenharmony_ci// * On top of that, *sometimes* callers want to know which state the automaton
2406f54294Sopenharmony_ci//   is in after searching. This is principally useful for overlapping and
2506f54294Sopenharmony_ci//   stream searches. However, when callers don't care about this, we really
2606f54294Sopenharmony_ci//   do not want to be forced to compute it, since it sometimes requires extra
2706f54294Sopenharmony_ci//   work. Thus, there are effectively two copies of leftmost searching: one
2806f54294Sopenharmony_ci//   for tracking the state ID and one that doesn't. We should ideally do the
2906f54294Sopenharmony_ci//   same for standard searching, but my sanity stopped me.
3006f54294Sopenharmony_ci
3106f54294Sopenharmony_ci// SAFETY RATIONALE: Previously, the code below went to some length to remove
3206f54294Sopenharmony_ci// all bounds checks. This generally produced tighter assembly and lead to
3306f54294Sopenharmony_ci// 20-50% improvements in micro-benchmarks on corpora made up of random
3406f54294Sopenharmony_ci// characters. This somewhat makes sense, since the branch predictor is going
3506f54294Sopenharmony_ci// to be at its worse on random text.
3606f54294Sopenharmony_ci//
3706f54294Sopenharmony_ci// However, using the aho-corasick-debug tool and manually benchmarking
3806f54294Sopenharmony_ci// different inputs, the code *with* bounds checks actually wound up being
3906f54294Sopenharmony_ci// slightly faster:
4006f54294Sopenharmony_ci//
4106f54294Sopenharmony_ci//     $ cat input
4206f54294Sopenharmony_ci//     Sherlock Holmes
4306f54294Sopenharmony_ci//     John Watson
4406f54294Sopenharmony_ci//     Professor Moriarty
4506f54294Sopenharmony_ci//     Irene Adler
4606f54294Sopenharmony_ci//     Mary Watson
4706f54294Sopenharmony_ci//
4806f54294Sopenharmony_ci//     $ aho-corasick-debug-safe \
4906f54294Sopenharmony_ci//         input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
5006f54294Sopenharmony_ci//     pattern read time: 32.824µs
5106f54294Sopenharmony_ci//     automaton build time: 444.687µs
5206f54294Sopenharmony_ci//     automaton heap usage: 72392 bytes
5306f54294Sopenharmony_ci//     match count: 639
5406f54294Sopenharmony_ci//     count time: 1.809961702s
5506f54294Sopenharmony_ci//
5606f54294Sopenharmony_ci//     $ aho-corasick-debug-master \
5706f54294Sopenharmony_ci//         input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
5806f54294Sopenharmony_ci//     pattern read time: 31.425µs
5906f54294Sopenharmony_ci//     automaton build time: 317.434µs
6006f54294Sopenharmony_ci//     automaton heap usage: 72392 bytes
6106f54294Sopenharmony_ci//     match count: 639
6206f54294Sopenharmony_ci//     count time: 2.059157705s
6306f54294Sopenharmony_ci//
6406f54294Sopenharmony_ci// I was able to reproduce this result on two different machines (an i5 and
6506f54294Sopenharmony_ci// an i7). Therefore, we go the route of safe code for now.
6606f54294Sopenharmony_ci
6706f54294Sopenharmony_ci/// A trait describing the interface of an Aho-Corasick finite state machine.
6806f54294Sopenharmony_ci///
6906f54294Sopenharmony_ci/// Every automaton has exactly one fail state, one dead state and exactly one
7006f54294Sopenharmony_ci/// start state. Generally, these correspond to the first, second and third
7106f54294Sopenharmony_ci/// states, respectively. The dead state is always treated as a sentinel. That
7206f54294Sopenharmony_ci/// is, no correct Aho-Corasick automaton will ever transition into the fail
7306f54294Sopenharmony_ci/// state. The dead state, however, can be transitioned into, but only when
7406f54294Sopenharmony_ci/// leftmost-first or leftmost-longest match semantics are enabled and only
7506f54294Sopenharmony_ci/// when at least one match has been observed.
7606f54294Sopenharmony_ci///
7706f54294Sopenharmony_ci/// Every automaton also has one or more match states, such that
7806f54294Sopenharmony_ci/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
7906f54294Sopenharmony_ci/// corresponds to a match state.
8006f54294Sopenharmony_cipub trait Automaton {
8106f54294Sopenharmony_ci    /// The representation used for state identifiers in this automaton.
8206f54294Sopenharmony_ci    ///
8306f54294Sopenharmony_ci    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
8406f54294Sopenharmony_ci    type ID: StateID;
8506f54294Sopenharmony_ci
8606f54294Sopenharmony_ci    /// The type of matching that should be done.
8706f54294Sopenharmony_ci    fn match_kind(&self) -> &MatchKind;
8806f54294Sopenharmony_ci
8906f54294Sopenharmony_ci    /// Returns true if and only if this automaton uses anchored searches.
9006f54294Sopenharmony_ci    fn anchored(&self) -> bool;
9106f54294Sopenharmony_ci
9206f54294Sopenharmony_ci    /// An optional prefilter for quickly skipping to the next candidate match.
9306f54294Sopenharmony_ci    /// A prefilter must report at least every match, although it may report
9406f54294Sopenharmony_ci    /// positions that do not correspond to a match. That is, it must not allow
9506f54294Sopenharmony_ci    /// false negatives, but can allow false positives.
9606f54294Sopenharmony_ci    ///
9706f54294Sopenharmony_ci    /// Currently, a prefilter only runs when the automaton is in the start
9806f54294Sopenharmony_ci    /// state. That is, the position reported by a prefilter should always
9906f54294Sopenharmony_ci    /// correspond to the start of a potential match.
10006f54294Sopenharmony_ci    fn prefilter(&self) -> Option<&dyn Prefilter>;
10106f54294Sopenharmony_ci
10206f54294Sopenharmony_ci    /// Return the identifier of this automaton's start state.
10306f54294Sopenharmony_ci    fn start_state(&self) -> Self::ID;
10406f54294Sopenharmony_ci
10506f54294Sopenharmony_ci    /// Returns true if and only if the given state identifier refers to a
10606f54294Sopenharmony_ci    /// valid state.
10706f54294Sopenharmony_ci    fn is_valid(&self, id: Self::ID) -> bool;
10806f54294Sopenharmony_ci
10906f54294Sopenharmony_ci    /// Returns true if and only if the given identifier corresponds to a match
11006f54294Sopenharmony_ci    /// state.
11106f54294Sopenharmony_ci    ///
11206f54294Sopenharmony_ci    /// The state ID given must be valid, or else implementors may panic.
11306f54294Sopenharmony_ci    fn is_match_state(&self, id: Self::ID) -> bool;
11406f54294Sopenharmony_ci
11506f54294Sopenharmony_ci    /// Returns true if and only if the given identifier corresponds to a state
11606f54294Sopenharmony_ci    /// that is either the dead state or a match state.
11706f54294Sopenharmony_ci    ///
11806f54294Sopenharmony_ci    /// Depending on the implementation of the automaton, this routine can
11906f54294Sopenharmony_ci    /// be used to save a branch in the core matching loop. Nevertheless,
12006f54294Sopenharmony_ci    /// `is_match_state(id) || id == dead_id()` is always a valid
12106f54294Sopenharmony_ci    /// implementation. Indeed, this is the default implementation.
12206f54294Sopenharmony_ci    ///
12306f54294Sopenharmony_ci    /// The state ID given must be valid, or else implementors may panic.
12406f54294Sopenharmony_ci    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
12506f54294Sopenharmony_ci        id == dead_id() || self.is_match_state(id)
12606f54294Sopenharmony_ci    }
12706f54294Sopenharmony_ci
12806f54294Sopenharmony_ci    /// If the given state is a match state, return the match corresponding
12906f54294Sopenharmony_ci    /// to the given match index. `end` must be the ending position of the
13006f54294Sopenharmony_ci    /// detected match. If no match exists or if `match_index` exceeds the
13106f54294Sopenharmony_ci    /// number of matches in this state, then `None` is returned.
13206f54294Sopenharmony_ci    ///
13306f54294Sopenharmony_ci    /// The state ID given must be valid, or else implementors may panic.
13406f54294Sopenharmony_ci    ///
13506f54294Sopenharmony_ci    /// If the given state ID is correct and if the `match_index` is less than
13606f54294Sopenharmony_ci    /// the number of matches for that state, then this is guaranteed to return
13706f54294Sopenharmony_ci    /// a match.
13806f54294Sopenharmony_ci    fn get_match(
13906f54294Sopenharmony_ci        &self,
14006f54294Sopenharmony_ci        id: Self::ID,
14106f54294Sopenharmony_ci        match_index: usize,
14206f54294Sopenharmony_ci        end: usize,
14306f54294Sopenharmony_ci    ) -> Option<Match>;
14406f54294Sopenharmony_ci
14506f54294Sopenharmony_ci    /// Returns the number of matches for the given state. If the given state
14606f54294Sopenharmony_ci    /// is not a match state, then this returns 0.
14706f54294Sopenharmony_ci    ///
14806f54294Sopenharmony_ci    /// The state ID given must be valid, or else implementors must panic.
14906f54294Sopenharmony_ci    fn match_count(&self, id: Self::ID) -> usize;
15006f54294Sopenharmony_ci
15106f54294Sopenharmony_ci    /// Given the current state that this automaton is in and the next input
15206f54294Sopenharmony_ci    /// byte, this method returns the identifier of the next state. The
15306f54294Sopenharmony_ci    /// identifier returned must always be valid and may never correspond to
15406f54294Sopenharmony_ci    /// the fail state. The returned identifier may, however, point to the
15506f54294Sopenharmony_ci    /// dead state.
15606f54294Sopenharmony_ci    ///
15706f54294Sopenharmony_ci    /// This is not safe so that implementors may look up the next state
15806f54294Sopenharmony_ci    /// without memory safety checks such as bounds checks. As such, callers
15906f54294Sopenharmony_ci    /// must ensure that the given identifier corresponds to a valid automaton
16006f54294Sopenharmony_ci    /// state. Implementors must, in turn, ensure that this routine is safe for
16106f54294Sopenharmony_ci    /// all valid state identifiers and for all possible `u8` values.
16206f54294Sopenharmony_ci    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
16306f54294Sopenharmony_ci
16406f54294Sopenharmony_ci    /// Like next_state, but debug_asserts that the underlying
16506f54294Sopenharmony_ci    /// implementation never returns a `fail_id()` for the next state.
16606f54294Sopenharmony_ci    fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
16706f54294Sopenharmony_ci        let next = self.next_state(current, input);
16806f54294Sopenharmony_ci        // We should never see a transition to the failure state.
16906f54294Sopenharmony_ci        debug_assert!(
17006f54294Sopenharmony_ci            next != fail_id(),
17106f54294Sopenharmony_ci            "automaton should never return fail_id for next state"
17206f54294Sopenharmony_ci        );
17306f54294Sopenharmony_ci        next
17406f54294Sopenharmony_ci    }
17506f54294Sopenharmony_ci
17606f54294Sopenharmony_ci    /// Execute a search using standard match semantics.
17706f54294Sopenharmony_ci    ///
17806f54294Sopenharmony_ci    /// This can be used even when the automaton was constructed with leftmost
17906f54294Sopenharmony_ci    /// match semantics when you want to find the earliest possible match. This
18006f54294Sopenharmony_ci    /// can also be used as part of an overlapping search implementation.
18106f54294Sopenharmony_ci    ///
18206f54294Sopenharmony_ci    /// N.B. This does not report a match if `state_id` is given as a matching
18306f54294Sopenharmony_ci    /// state. As such, this should not be used directly.
18406f54294Sopenharmony_ci    #[inline(always)]
18506f54294Sopenharmony_ci    fn standard_find_at(
18606f54294Sopenharmony_ci        &self,
18706f54294Sopenharmony_ci        prestate: &mut PrefilterState,
18806f54294Sopenharmony_ci        haystack: &[u8],
18906f54294Sopenharmony_ci        at: usize,
19006f54294Sopenharmony_ci        state_id: &mut Self::ID,
19106f54294Sopenharmony_ci    ) -> Option<Match> {
19206f54294Sopenharmony_ci        if let Some(pre) = self.prefilter() {
19306f54294Sopenharmony_ci            self.standard_find_at_imp(
19406f54294Sopenharmony_ci                prestate,
19506f54294Sopenharmony_ci                Some(pre),
19606f54294Sopenharmony_ci                haystack,
19706f54294Sopenharmony_ci                at,
19806f54294Sopenharmony_ci                state_id,
19906f54294Sopenharmony_ci            )
20006f54294Sopenharmony_ci        } else {
20106f54294Sopenharmony_ci            self.standard_find_at_imp(prestate, None, haystack, at, state_id)
20206f54294Sopenharmony_ci        }
20306f54294Sopenharmony_ci    }
20406f54294Sopenharmony_ci
20506f54294Sopenharmony_ci    // It's important for this to always be inlined. Namely, its only caller
20606f54294Sopenharmony_ci    // is standard_find_at, and the inlining should remove the case analysis
20706f54294Sopenharmony_ci    // for prefilter scanning when there is no prefilter available.
20806f54294Sopenharmony_ci    #[inline(always)]
20906f54294Sopenharmony_ci    fn standard_find_at_imp(
21006f54294Sopenharmony_ci        &self,
21106f54294Sopenharmony_ci        prestate: &mut PrefilterState,
21206f54294Sopenharmony_ci        prefilter: Option<&dyn Prefilter>,
21306f54294Sopenharmony_ci        haystack: &[u8],
21406f54294Sopenharmony_ci        mut at: usize,
21506f54294Sopenharmony_ci        state_id: &mut Self::ID,
21606f54294Sopenharmony_ci    ) -> Option<Match> {
21706f54294Sopenharmony_ci        while at < haystack.len() {
21806f54294Sopenharmony_ci            if let Some(pre) = prefilter {
21906f54294Sopenharmony_ci                if prestate.is_effective(at) && *state_id == self.start_state()
22006f54294Sopenharmony_ci                {
22106f54294Sopenharmony_ci                    let c = prefilter::next(prestate, pre, haystack, at)
22206f54294Sopenharmony_ci                        .into_option();
22306f54294Sopenharmony_ci                    match c {
22406f54294Sopenharmony_ci                        None => return None,
22506f54294Sopenharmony_ci                        Some(i) => {
22606f54294Sopenharmony_ci                            at = i;
22706f54294Sopenharmony_ci                        }
22806f54294Sopenharmony_ci                    }
22906f54294Sopenharmony_ci                }
23006f54294Sopenharmony_ci            }
23106f54294Sopenharmony_ci            // CORRECTNESS: next_state is correct for all possible u8 values,
23206f54294Sopenharmony_ci            // so the only thing we're concerned about is the validity of
23306f54294Sopenharmony_ci            // `state_id`. `state_id` either comes from the caller (in which
23406f54294Sopenharmony_ci            // case, we assume it is correct), or it comes from the return
23506f54294Sopenharmony_ci            // value of next_state, which is guaranteed to be correct.
23606f54294Sopenharmony_ci            *state_id = self.next_state_no_fail(*state_id, haystack[at]);
23706f54294Sopenharmony_ci            at += 1;
23806f54294Sopenharmony_ci            // This routine always quits immediately after seeing a
23906f54294Sopenharmony_ci            // match, and since dead states can only come after seeing
24006f54294Sopenharmony_ci            // a match, seeing a dead state here is impossible. (Unless
24106f54294Sopenharmony_ci            // we have an anchored automaton, in which case, dead states
24206f54294Sopenharmony_ci            // are used to stop a search.)
24306f54294Sopenharmony_ci            debug_assert!(
24406f54294Sopenharmony_ci                *state_id != dead_id() || self.anchored(),
24506f54294Sopenharmony_ci                "standard find should never see a dead state"
24606f54294Sopenharmony_ci            );
24706f54294Sopenharmony_ci
24806f54294Sopenharmony_ci            if self.is_match_or_dead_state(*state_id) {
24906f54294Sopenharmony_ci                return if *state_id == dead_id() {
25006f54294Sopenharmony_ci                    None
25106f54294Sopenharmony_ci                } else {
25206f54294Sopenharmony_ci                    self.get_match(*state_id, 0, at)
25306f54294Sopenharmony_ci                };
25406f54294Sopenharmony_ci            }
25506f54294Sopenharmony_ci        }
25606f54294Sopenharmony_ci        None
25706f54294Sopenharmony_ci    }
25806f54294Sopenharmony_ci
25906f54294Sopenharmony_ci    /// Execute a search using leftmost (either first or longest) match
26006f54294Sopenharmony_ci    /// semantics.
26106f54294Sopenharmony_ci    ///
26206f54294Sopenharmony_ci    /// The principle difference between searching with standard semantics and
26306f54294Sopenharmony_ci    /// searching with leftmost semantics is that leftmost searching will
26406f54294Sopenharmony_ci    /// continue searching even after a match has been found. Once a match
26506f54294Sopenharmony_ci    /// is found, the search does not stop until either the haystack has been
26606f54294Sopenharmony_ci    /// exhausted or a dead state is observed in the automaton. (Dead states
26706f54294Sopenharmony_ci    /// only exist in automatons constructed with leftmost semantics.) That is,
26806f54294Sopenharmony_ci    /// we rely on the construction of the automaton to tell us when to quit.
26906f54294Sopenharmony_ci    #[inline(never)]
27006f54294Sopenharmony_ci    fn leftmost_find_at(
27106f54294Sopenharmony_ci        &self,
27206f54294Sopenharmony_ci        prestate: &mut PrefilterState,
27306f54294Sopenharmony_ci        haystack: &[u8],
27406f54294Sopenharmony_ci        at: usize,
27506f54294Sopenharmony_ci        state_id: &mut Self::ID,
27606f54294Sopenharmony_ci    ) -> Option<Match> {
27706f54294Sopenharmony_ci        if let Some(pre) = self.prefilter() {
27806f54294Sopenharmony_ci            self.leftmost_find_at_imp(
27906f54294Sopenharmony_ci                prestate,
28006f54294Sopenharmony_ci                Some(pre),
28106f54294Sopenharmony_ci                haystack,
28206f54294Sopenharmony_ci                at,
28306f54294Sopenharmony_ci                state_id,
28406f54294Sopenharmony_ci            )
28506f54294Sopenharmony_ci        } else {
28606f54294Sopenharmony_ci            self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
28706f54294Sopenharmony_ci        }
28806f54294Sopenharmony_ci    }
28906f54294Sopenharmony_ci
29006f54294Sopenharmony_ci    // It's important for this to always be inlined. Namely, its only caller
29106f54294Sopenharmony_ci    // is leftmost_find_at, and the inlining should remove the case analysis
29206f54294Sopenharmony_ci    // for prefilter scanning when there is no prefilter available.
29306f54294Sopenharmony_ci    #[inline(always)]
29406f54294Sopenharmony_ci    fn leftmost_find_at_imp(
29506f54294Sopenharmony_ci        &self,
29606f54294Sopenharmony_ci        prestate: &mut PrefilterState,
29706f54294Sopenharmony_ci        prefilter: Option<&dyn Prefilter>,
29806f54294Sopenharmony_ci        haystack: &[u8],
29906f54294Sopenharmony_ci        mut at: usize,
30006f54294Sopenharmony_ci        state_id: &mut Self::ID,
30106f54294Sopenharmony_ci    ) -> Option<Match> {
30206f54294Sopenharmony_ci        debug_assert!(self.match_kind().is_leftmost());
30306f54294Sopenharmony_ci        if self.anchored() && at > 0 && *state_id == self.start_state() {
30406f54294Sopenharmony_ci            return None;
30506f54294Sopenharmony_ci        }
30606f54294Sopenharmony_ci        let mut last_match = self.get_match(*state_id, 0, at);
30706f54294Sopenharmony_ci        while at < haystack.len() {
30806f54294Sopenharmony_ci            if let Some(pre) = prefilter {
30906f54294Sopenharmony_ci                if prestate.is_effective(at) && *state_id == self.start_state()
31006f54294Sopenharmony_ci                {
31106f54294Sopenharmony_ci                    let c = prefilter::next(prestate, pre, haystack, at)
31206f54294Sopenharmony_ci                        .into_option();
31306f54294Sopenharmony_ci                    match c {
31406f54294Sopenharmony_ci                        None => return None,
31506f54294Sopenharmony_ci                        Some(i) => {
31606f54294Sopenharmony_ci                            at = i;
31706f54294Sopenharmony_ci                        }
31806f54294Sopenharmony_ci                    }
31906f54294Sopenharmony_ci                }
32006f54294Sopenharmony_ci            }
32106f54294Sopenharmony_ci            // CORRECTNESS: next_state is correct for all possible u8 values,
32206f54294Sopenharmony_ci            // so the only thing we're concerned about is the validity of
32306f54294Sopenharmony_ci            // `state_id`. `state_id` either comes from the caller (in which
32406f54294Sopenharmony_ci            // case, we assume it is correct), or it comes from the return
32506f54294Sopenharmony_ci            // value of next_state, which is guaranteed to be correct.
32606f54294Sopenharmony_ci            *state_id = self.next_state_no_fail(*state_id, haystack[at]);
32706f54294Sopenharmony_ci            at += 1;
32806f54294Sopenharmony_ci            if self.is_match_or_dead_state(*state_id) {
32906f54294Sopenharmony_ci                if *state_id == dead_id() {
33006f54294Sopenharmony_ci                    // The only way to enter into a dead state is if a match
33106f54294Sopenharmony_ci                    // has been found, so we assert as much. This is different
33206f54294Sopenharmony_ci                    // from normal automata, where you might enter a dead state
33306f54294Sopenharmony_ci                    // if you know a subsequent match will never be found
33406f54294Sopenharmony_ci                    // (regardless of whether a match has already been found).
33506f54294Sopenharmony_ci                    // For Aho-Corasick, it is built so that we can match at
33606f54294Sopenharmony_ci                    // any position, so the possibility of a match always
33706f54294Sopenharmony_ci                    // exists.
33806f54294Sopenharmony_ci                    //
33906f54294Sopenharmony_ci                    // (Unless we have an anchored automaton, in which case,
34006f54294Sopenharmony_ci                    // dead states are used to stop a search.)
34106f54294Sopenharmony_ci                    debug_assert!(
34206f54294Sopenharmony_ci                        last_match.is_some() || self.anchored(),
34306f54294Sopenharmony_ci                        "dead state should only be seen after match"
34406f54294Sopenharmony_ci                    );
34506f54294Sopenharmony_ci                    return last_match;
34606f54294Sopenharmony_ci                }
34706f54294Sopenharmony_ci                last_match = self.get_match(*state_id, 0, at);
34806f54294Sopenharmony_ci            }
34906f54294Sopenharmony_ci        }
35006f54294Sopenharmony_ci        last_match
35106f54294Sopenharmony_ci    }
35206f54294Sopenharmony_ci
35306f54294Sopenharmony_ci    /// This is like leftmost_find_at, but does not need to track a caller
35406f54294Sopenharmony_ci    /// provided state id. In other words, the only output of this routine is a
35506f54294Sopenharmony_ci    /// match, if one exists.
35606f54294Sopenharmony_ci    ///
35706f54294Sopenharmony_ci    /// It is regrettable that we need to effectively copy a chunk of
35806f54294Sopenharmony_ci    /// implementation twice, but when we don't need to track the state ID, we
35906f54294Sopenharmony_ci    /// can allow the prefilter to report matches immediately without having
36006f54294Sopenharmony_ci    /// to re-confirm them with the automaton. The re-confirmation step is
36106f54294Sopenharmony_ci    /// necessary in leftmost_find_at because tracing through the automaton is
36206f54294Sopenharmony_ci    /// the only way to correctly set the state ID. (Perhaps an alternative
36306f54294Sopenharmony_ci    /// would be to keep a map from pattern ID to matching state ID, but that
36406f54294Sopenharmony_ci    /// complicates the code and still doesn't permit us to defer to the
36506f54294Sopenharmony_ci    /// prefilter entirely when possible.)
36606f54294Sopenharmony_ci    ///
36706f54294Sopenharmony_ci    /// I did try a few things to avoid the code duplication here, but nothing
36806f54294Sopenharmony_ci    /// optimized as well as this approach. (In microbenchmarks, there was
36906f54294Sopenharmony_ci    /// about a 25% difference.)
37006f54294Sopenharmony_ci    #[inline(never)]
37106f54294Sopenharmony_ci    fn leftmost_find_at_no_state(
37206f54294Sopenharmony_ci        &self,
37306f54294Sopenharmony_ci        prestate: &mut PrefilterState,
37406f54294Sopenharmony_ci        haystack: &[u8],
37506f54294Sopenharmony_ci        at: usize,
37606f54294Sopenharmony_ci    ) -> Option<Match> {
37706f54294Sopenharmony_ci        if let Some(pre) = self.prefilter() {
37806f54294Sopenharmony_ci            self.leftmost_find_at_no_state_imp(
37906f54294Sopenharmony_ci                prestate,
38006f54294Sopenharmony_ci                Some(pre),
38106f54294Sopenharmony_ci                haystack,
38206f54294Sopenharmony_ci                at,
38306f54294Sopenharmony_ci            )
38406f54294Sopenharmony_ci        } else {
38506f54294Sopenharmony_ci            self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
38606f54294Sopenharmony_ci        }
38706f54294Sopenharmony_ci    }
38806f54294Sopenharmony_ci
38906f54294Sopenharmony_ci    // It's important for this to always be inlined. Namely, its only caller
39006f54294Sopenharmony_ci    // is leftmost_find_at_no_state, and the inlining should remove the case
39106f54294Sopenharmony_ci    // analysis for prefilter scanning when there is no prefilter available.
39206f54294Sopenharmony_ci    #[inline(always)]
39306f54294Sopenharmony_ci    fn leftmost_find_at_no_state_imp(
39406f54294Sopenharmony_ci        &self,
39506f54294Sopenharmony_ci        prestate: &mut PrefilterState,
39606f54294Sopenharmony_ci        prefilter: Option<&dyn Prefilter>,
39706f54294Sopenharmony_ci        haystack: &[u8],
39806f54294Sopenharmony_ci        mut at: usize,
39906f54294Sopenharmony_ci    ) -> Option<Match> {
40006f54294Sopenharmony_ci        debug_assert!(self.match_kind().is_leftmost());
40106f54294Sopenharmony_ci        if self.anchored() && at > 0 {
40206f54294Sopenharmony_ci            return None;
40306f54294Sopenharmony_ci        }
40406f54294Sopenharmony_ci        // If our prefilter handles confirmation of matches 100% of the
40506f54294Sopenharmony_ci        // time, and since we don't need to track state IDs, we can avoid
40606f54294Sopenharmony_ci        // Aho-Corasick completely.
40706f54294Sopenharmony_ci        if let Some(pre) = prefilter {
40806f54294Sopenharmony_ci            // We should never have a prefilter during an anchored search.
40906f54294Sopenharmony_ci            debug_assert!(!self.anchored());
41006f54294Sopenharmony_ci            if !pre.reports_false_positives() {
41106f54294Sopenharmony_ci                return match pre.next_candidate(prestate, haystack, at) {
41206f54294Sopenharmony_ci                    Candidate::None => None,
41306f54294Sopenharmony_ci                    Candidate::Match(m) => Some(m),
41406f54294Sopenharmony_ci                    Candidate::PossibleStartOfMatch(_) => unreachable!(),
41506f54294Sopenharmony_ci                };
41606f54294Sopenharmony_ci            }
41706f54294Sopenharmony_ci        }
41806f54294Sopenharmony_ci
41906f54294Sopenharmony_ci        let mut state_id = self.start_state();
42006f54294Sopenharmony_ci        let mut last_match = self.get_match(state_id, 0, at);
42106f54294Sopenharmony_ci        while at < haystack.len() {
42206f54294Sopenharmony_ci            if let Some(pre) = prefilter {
42306f54294Sopenharmony_ci                if prestate.is_effective(at) && state_id == self.start_state()
42406f54294Sopenharmony_ci                {
42506f54294Sopenharmony_ci                    match prefilter::next(prestate, pre, haystack, at) {
42606f54294Sopenharmony_ci                        Candidate::None => return None,
42706f54294Sopenharmony_ci                        // Since we aren't tracking a state ID, we can
42806f54294Sopenharmony_ci                        // quit early once we know we have a match.
42906f54294Sopenharmony_ci                        Candidate::Match(m) => return Some(m),
43006f54294Sopenharmony_ci                        Candidate::PossibleStartOfMatch(i) => {
43106f54294Sopenharmony_ci                            at = i;
43206f54294Sopenharmony_ci                        }
43306f54294Sopenharmony_ci                    }
43406f54294Sopenharmony_ci                }
43506f54294Sopenharmony_ci            }
43606f54294Sopenharmony_ci            // CORRECTNESS: next_state is correct for all possible u8 values,
43706f54294Sopenharmony_ci            // so the only thing we're concerned about is the validity of
43806f54294Sopenharmony_ci            // `state_id`. `state_id` either comes from the caller (in which
43906f54294Sopenharmony_ci            // case, we assume it is correct), or it comes from the return
44006f54294Sopenharmony_ci            // value of next_state, which is guaranteed to be correct.
44106f54294Sopenharmony_ci            state_id = self.next_state_no_fail(state_id, haystack[at]);
44206f54294Sopenharmony_ci            at += 1;
44306f54294Sopenharmony_ci            if self.is_match_or_dead_state(state_id) {
44406f54294Sopenharmony_ci                if state_id == dead_id() {
44506f54294Sopenharmony_ci                    // The only way to enter into a dead state is if a
44606f54294Sopenharmony_ci                    // match has been found, so we assert as much. This
44706f54294Sopenharmony_ci                    // is different from normal automata, where you might
44806f54294Sopenharmony_ci                    // enter a dead state if you know a subsequent match
44906f54294Sopenharmony_ci                    // will never be found (regardless of whether a match
45006f54294Sopenharmony_ci                    // has already been found). For Aho-Corasick, it is
45106f54294Sopenharmony_ci                    // built so that we can match at any position, so the
45206f54294Sopenharmony_ci                    // possibility of a match always exists.
45306f54294Sopenharmony_ci                    //
45406f54294Sopenharmony_ci                    // (Unless we have an anchored automaton, in which
45506f54294Sopenharmony_ci                    // case, dead states are used to stop a search.)
45606f54294Sopenharmony_ci                    debug_assert!(
45706f54294Sopenharmony_ci                        last_match.is_some() || self.anchored(),
45806f54294Sopenharmony_ci                        "dead state should only be seen after match"
45906f54294Sopenharmony_ci                    );
46006f54294Sopenharmony_ci                    return last_match;
46106f54294Sopenharmony_ci                }
46206f54294Sopenharmony_ci                last_match = self.get_match(state_id, 0, at);
46306f54294Sopenharmony_ci            }
46406f54294Sopenharmony_ci        }
46506f54294Sopenharmony_ci        last_match
46606f54294Sopenharmony_ci    }
46706f54294Sopenharmony_ci
46806f54294Sopenharmony_ci    /// Execute an overlapping search.
46906f54294Sopenharmony_ci    ///
47006f54294Sopenharmony_ci    /// When executing an overlapping match, the previous state ID in addition
47106f54294Sopenharmony_ci    /// to the previous match index should be given. If there are more matches
47206f54294Sopenharmony_ci    /// at the given state, then the match is reported and the given index is
47306f54294Sopenharmony_ci    /// incremented.
47406f54294Sopenharmony_ci    #[inline(always)]
47506f54294Sopenharmony_ci    fn overlapping_find_at(
47606f54294Sopenharmony_ci        &self,
47706f54294Sopenharmony_ci        prestate: &mut PrefilterState,
47806f54294Sopenharmony_ci        haystack: &[u8],
47906f54294Sopenharmony_ci        at: usize,
48006f54294Sopenharmony_ci        state_id: &mut Self::ID,
48106f54294Sopenharmony_ci        match_index: &mut usize,
48206f54294Sopenharmony_ci    ) -> Option<Match> {
48306f54294Sopenharmony_ci        if self.anchored() && at > 0 && *state_id == self.start_state() {
48406f54294Sopenharmony_ci            return None;
48506f54294Sopenharmony_ci        }
48606f54294Sopenharmony_ci
48706f54294Sopenharmony_ci        let match_count = self.match_count(*state_id);
48806f54294Sopenharmony_ci        if *match_index < match_count {
48906f54294Sopenharmony_ci            // This is guaranteed to return a match since
49006f54294Sopenharmony_ci            // match_index < match_count.
49106f54294Sopenharmony_ci            let result = self.get_match(*state_id, *match_index, at);
49206f54294Sopenharmony_ci            debug_assert!(result.is_some(), "must be a match");
49306f54294Sopenharmony_ci            *match_index += 1;
49406f54294Sopenharmony_ci            return result;
49506f54294Sopenharmony_ci        }
49606f54294Sopenharmony_ci
49706f54294Sopenharmony_ci        *match_index = 0;
49806f54294Sopenharmony_ci        match self.standard_find_at(prestate, haystack, at, state_id) {
49906f54294Sopenharmony_ci            None => None,
50006f54294Sopenharmony_ci            Some(m) => {
50106f54294Sopenharmony_ci                *match_index = 1;
50206f54294Sopenharmony_ci                Some(m)
50306f54294Sopenharmony_ci            }
50406f54294Sopenharmony_ci        }
50506f54294Sopenharmony_ci    }
50606f54294Sopenharmony_ci
50706f54294Sopenharmony_ci    /// Return the earliest match found. This returns as soon as we know that
50806f54294Sopenharmony_ci    /// we have a match. As such, this does not necessarily correspond to the
50906f54294Sopenharmony_ci    /// leftmost starting match, but rather, the leftmost position at which a
51006f54294Sopenharmony_ci    /// match ends.
51106f54294Sopenharmony_ci    #[inline(always)]
51206f54294Sopenharmony_ci    fn earliest_find_at(
51306f54294Sopenharmony_ci        &self,
51406f54294Sopenharmony_ci        prestate: &mut PrefilterState,
51506f54294Sopenharmony_ci        haystack: &[u8],
51606f54294Sopenharmony_ci        at: usize,
51706f54294Sopenharmony_ci        state_id: &mut Self::ID,
51806f54294Sopenharmony_ci    ) -> Option<Match> {
51906f54294Sopenharmony_ci        if *state_id == self.start_state() {
52006f54294Sopenharmony_ci            if self.anchored() && at > 0 {
52106f54294Sopenharmony_ci                return None;
52206f54294Sopenharmony_ci            }
52306f54294Sopenharmony_ci            if let Some(m) = self.get_match(*state_id, 0, at) {
52406f54294Sopenharmony_ci                return Some(m);
52506f54294Sopenharmony_ci            }
52606f54294Sopenharmony_ci        }
52706f54294Sopenharmony_ci        self.standard_find_at(prestate, haystack, at, state_id)
52806f54294Sopenharmony_ci    }
52906f54294Sopenharmony_ci
53006f54294Sopenharmony_ci    /// A convenience function for finding the next match according to the
53106f54294Sopenharmony_ci    /// match semantics of this automaton. For standard match semantics, this
53206f54294Sopenharmony_ci    /// finds the earliest match. Otherwise, the leftmost match is found.
53306f54294Sopenharmony_ci    #[inline(always)]
53406f54294Sopenharmony_ci    fn find_at(
53506f54294Sopenharmony_ci        &self,
53606f54294Sopenharmony_ci        prestate: &mut PrefilterState,
53706f54294Sopenharmony_ci        haystack: &[u8],
53806f54294Sopenharmony_ci        at: usize,
53906f54294Sopenharmony_ci        state_id: &mut Self::ID,
54006f54294Sopenharmony_ci    ) -> Option<Match> {
54106f54294Sopenharmony_ci        match *self.match_kind() {
54206f54294Sopenharmony_ci            MatchKind::Standard => {
54306f54294Sopenharmony_ci                self.earliest_find_at(prestate, haystack, at, state_id)
54406f54294Sopenharmony_ci            }
54506f54294Sopenharmony_ci            MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
54606f54294Sopenharmony_ci                self.leftmost_find_at(prestate, haystack, at, state_id)
54706f54294Sopenharmony_ci            }
54806f54294Sopenharmony_ci            MatchKind::__Nonexhaustive => unreachable!(),
54906f54294Sopenharmony_ci        }
55006f54294Sopenharmony_ci    }
55106f54294Sopenharmony_ci
55206f54294Sopenharmony_ci    /// Like find_at, but does not track state identifiers. This permits some
55306f54294Sopenharmony_ci    /// optimizations when a prefilter that confirms its own matches is
55406f54294Sopenharmony_ci    /// present.
55506f54294Sopenharmony_ci    #[inline(always)]
55606f54294Sopenharmony_ci    fn find_at_no_state(
55706f54294Sopenharmony_ci        &self,
55806f54294Sopenharmony_ci        prestate: &mut PrefilterState,
55906f54294Sopenharmony_ci        haystack: &[u8],
56006f54294Sopenharmony_ci        at: usize,
56106f54294Sopenharmony_ci    ) -> Option<Match> {
56206f54294Sopenharmony_ci        match *self.match_kind() {
56306f54294Sopenharmony_ci            MatchKind::Standard => {
56406f54294Sopenharmony_ci                let mut state = self.start_state();
56506f54294Sopenharmony_ci                self.earliest_find_at(prestate, haystack, at, &mut state)
56606f54294Sopenharmony_ci            }
56706f54294Sopenharmony_ci            MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
56806f54294Sopenharmony_ci                self.leftmost_find_at_no_state(prestate, haystack, at)
56906f54294Sopenharmony_ci            }
57006f54294Sopenharmony_ci            MatchKind::__Nonexhaustive => unreachable!(),
57106f54294Sopenharmony_ci        }
57206f54294Sopenharmony_ci    }
57306f54294Sopenharmony_ci}
574