1c67d6573Sopenharmony_ciuse std::cell::RefCell; 2c67d6573Sopenharmony_ciuse std::collections::HashMap; 3c67d6573Sopenharmony_ciuse std::panic::AssertUnwindSafe; 4c67d6573Sopenharmony_ciuse std::sync::Arc; 5c67d6573Sopenharmony_ci 6c67d6573Sopenharmony_ci#[cfg(feature = "perf-literal")] 7c67d6573Sopenharmony_ciuse aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; 8c67d6573Sopenharmony_ciuse regex_syntax::hir::literal::Literals; 9c67d6573Sopenharmony_ciuse regex_syntax::hir::Hir; 10c67d6573Sopenharmony_ciuse regex_syntax::ParserBuilder; 11c67d6573Sopenharmony_ci 12c67d6573Sopenharmony_ciuse crate::backtrack; 13c67d6573Sopenharmony_ciuse crate::compile::Compiler; 14c67d6573Sopenharmony_ci#[cfg(feature = "perf-dfa")] 15c67d6573Sopenharmony_ciuse crate::dfa; 16c67d6573Sopenharmony_ciuse crate::error::Error; 17c67d6573Sopenharmony_ciuse crate::input::{ByteInput, CharInput}; 18c67d6573Sopenharmony_ciuse crate::literal::LiteralSearcher; 19c67d6573Sopenharmony_ciuse crate::pikevm; 20c67d6573Sopenharmony_ciuse crate::pool::{Pool, PoolGuard}; 21c67d6573Sopenharmony_ciuse crate::prog::Program; 22c67d6573Sopenharmony_ciuse crate::re_builder::RegexOptions; 23c67d6573Sopenharmony_ciuse crate::re_bytes; 24c67d6573Sopenharmony_ciuse crate::re_set; 25c67d6573Sopenharmony_ciuse crate::re_trait::{Locations, RegularExpression, Slot}; 26c67d6573Sopenharmony_ciuse crate::re_unicode; 27c67d6573Sopenharmony_ciuse crate::utf8::next_utf8; 28c67d6573Sopenharmony_ci 29c67d6573Sopenharmony_ci/// `Exec` manages the execution of a regular expression. 30c67d6573Sopenharmony_ci/// 31c67d6573Sopenharmony_ci/// In particular, this manages the various compiled forms of a single regular 32c67d6573Sopenharmony_ci/// expression and the choice of which matching engine to use to execute a 33c67d6573Sopenharmony_ci/// regular expression. 34c67d6573Sopenharmony_ci#[derive(Debug)] 35c67d6573Sopenharmony_cipub struct Exec { 36c67d6573Sopenharmony_ci /// All read only state. 37c67d6573Sopenharmony_ci ro: Arc<ExecReadOnly>, 38c67d6573Sopenharmony_ci /// A pool of reusable values for the various matching engines. 39c67d6573Sopenharmony_ci /// 40c67d6573Sopenharmony_ci /// Note that boxing this value is not strictly necessary, but it is an 41c67d6573Sopenharmony_ci /// easy way to ensure that T does not bloat the stack sized used by a pool 42c67d6573Sopenharmony_ci /// in the case where T is big. And this turns out to be the case at the 43c67d6573Sopenharmony_ci /// time of writing for regex's use of this pool. At the time of writing, 44c67d6573Sopenharmony_ci /// the size of a Regex on the stack is 856 bytes. Boxing this value 45c67d6573Sopenharmony_ci /// reduces that size to 16 bytes. 46c67d6573Sopenharmony_ci pool: Box<Pool<ProgramCache>>, 47c67d6573Sopenharmony_ci} 48c67d6573Sopenharmony_ci 49c67d6573Sopenharmony_ci/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This 50c67d6573Sopenharmony_ci/// means it is no longer Sync, but we can now avoid the overhead of 51c67d6573Sopenharmony_ci/// synchronization to fetch the cache. 52c67d6573Sopenharmony_ci#[derive(Debug)] 53c67d6573Sopenharmony_cipub struct ExecNoSync<'c> { 54c67d6573Sopenharmony_ci /// All read only state. 55c67d6573Sopenharmony_ci ro: &'c Arc<ExecReadOnly>, 56c67d6573Sopenharmony_ci /// Caches for the various matching engines. 57c67d6573Sopenharmony_ci cache: PoolGuard<'c, ProgramCache>, 58c67d6573Sopenharmony_ci} 59c67d6573Sopenharmony_ci 60c67d6573Sopenharmony_ci/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. 61c67d6573Sopenharmony_ci#[derive(Debug)] 62c67d6573Sopenharmony_cipub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); 63c67d6573Sopenharmony_ci 64c67d6573Sopenharmony_ci/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such 65c67d6573Sopenharmony_ci/// state is determined at compile time and never changes during search. 66c67d6573Sopenharmony_ci#[derive(Debug)] 67c67d6573Sopenharmony_cistruct ExecReadOnly { 68c67d6573Sopenharmony_ci /// The original regular expressions given by the caller to compile. 69c67d6573Sopenharmony_ci res: Vec<String>, 70c67d6573Sopenharmony_ci /// A compiled program that is used in the NFA simulation and backtracking. 71c67d6573Sopenharmony_ci /// It can be byte-based or Unicode codepoint based. 72c67d6573Sopenharmony_ci /// 73c67d6573Sopenharmony_ci /// N.B. It is not possibly to make this byte-based from the public API. 74c67d6573Sopenharmony_ci /// It is only used for testing byte based programs in the NFA simulations. 75c67d6573Sopenharmony_ci nfa: Program, 76c67d6573Sopenharmony_ci /// A compiled byte based program for DFA execution. This is only used 77c67d6573Sopenharmony_ci /// if a DFA can be executed. (Currently, only word boundary assertions are 78c67d6573Sopenharmony_ci /// not supported.) Note that this program contains an embedded `.*?` 79c67d6573Sopenharmony_ci /// preceding the first capture group, unless the regex is anchored at the 80c67d6573Sopenharmony_ci /// beginning. 81c67d6573Sopenharmony_ci dfa: Program, 82c67d6573Sopenharmony_ci /// The same as above, except the program is reversed (and there is no 83c67d6573Sopenharmony_ci /// preceding `.*?`). This is used by the DFA to find the starting location 84c67d6573Sopenharmony_ci /// of matches. 85c67d6573Sopenharmony_ci dfa_reverse: Program, 86c67d6573Sopenharmony_ci /// A set of suffix literals extracted from the regex. 87c67d6573Sopenharmony_ci /// 88c67d6573Sopenharmony_ci /// Prefix literals are stored on the `Program`, since they are used inside 89c67d6573Sopenharmony_ci /// the matching engines. 90c67d6573Sopenharmony_ci suffixes: LiteralSearcher, 91c67d6573Sopenharmony_ci /// An Aho-Corasick automaton with leftmost-first match semantics. 92c67d6573Sopenharmony_ci /// 93c67d6573Sopenharmony_ci /// This is only set when the entire regex is a simple unanchored 94c67d6573Sopenharmony_ci /// alternation of literals. We could probably use it more circumstances, 95c67d6573Sopenharmony_ci /// but this is already hacky enough in this architecture. 96c67d6573Sopenharmony_ci /// 97c67d6573Sopenharmony_ci /// N.B. We use u32 as a state ID representation under the assumption that 98c67d6573Sopenharmony_ci /// if we were to exhaust the ID space, we probably would have long 99c67d6573Sopenharmony_ci /// surpassed the compilation size limit. 100c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 101c67d6573Sopenharmony_ci ac: Option<AhoCorasick<u32>>, 102c67d6573Sopenharmony_ci /// match_type encodes as much upfront knowledge about how we're going to 103c67d6573Sopenharmony_ci /// execute a search as possible. 104c67d6573Sopenharmony_ci match_type: MatchType, 105c67d6573Sopenharmony_ci} 106c67d6573Sopenharmony_ci 107c67d6573Sopenharmony_ci/// Facilitates the construction of an executor by exposing various knobs 108c67d6573Sopenharmony_ci/// to control how a regex is executed and what kinds of resources it's 109c67d6573Sopenharmony_ci/// permitted to use. 110c67d6573Sopenharmony_ci// `ExecBuilder` is only public via the `internal` module, so avoid deriving 111c67d6573Sopenharmony_ci// `Debug`. 112c67d6573Sopenharmony_ci#[allow(missing_debug_implementations)] 113c67d6573Sopenharmony_cipub struct ExecBuilder { 114c67d6573Sopenharmony_ci options: RegexOptions, 115c67d6573Sopenharmony_ci match_type: Option<MatchType>, 116c67d6573Sopenharmony_ci bytes: bool, 117c67d6573Sopenharmony_ci only_utf8: bool, 118c67d6573Sopenharmony_ci} 119c67d6573Sopenharmony_ci 120c67d6573Sopenharmony_ci/// Parsed represents a set of parsed regular expressions and their detected 121c67d6573Sopenharmony_ci/// literals. 122c67d6573Sopenharmony_cistruct Parsed { 123c67d6573Sopenharmony_ci exprs: Vec<Hir>, 124c67d6573Sopenharmony_ci prefixes: Literals, 125c67d6573Sopenharmony_ci suffixes: Literals, 126c67d6573Sopenharmony_ci bytes: bool, 127c67d6573Sopenharmony_ci} 128c67d6573Sopenharmony_ci 129c67d6573Sopenharmony_ciimpl ExecBuilder { 130c67d6573Sopenharmony_ci /// Create a regex execution builder. 131c67d6573Sopenharmony_ci /// 132c67d6573Sopenharmony_ci /// This uses default settings for everything except the regex itself, 133c67d6573Sopenharmony_ci /// which must be provided. Further knobs can be set by calling methods, 134c67d6573Sopenharmony_ci /// and then finally, `build` to actually create the executor. 135c67d6573Sopenharmony_ci pub fn new(re: &str) -> Self { 136c67d6573Sopenharmony_ci Self::new_many(&[re]) 137c67d6573Sopenharmony_ci } 138c67d6573Sopenharmony_ci 139c67d6573Sopenharmony_ci /// Like new, but compiles the union of the given regular expressions. 140c67d6573Sopenharmony_ci /// 141c67d6573Sopenharmony_ci /// Note that when compiling 2 or more regular expressions, capture groups 142c67d6573Sopenharmony_ci /// are completely unsupported. (This means both `find` and `captures` 143c67d6573Sopenharmony_ci /// won't work.) 144c67d6573Sopenharmony_ci pub fn new_many<I, S>(res: I) -> Self 145c67d6573Sopenharmony_ci where 146c67d6573Sopenharmony_ci S: AsRef<str>, 147c67d6573Sopenharmony_ci I: IntoIterator<Item = S>, 148c67d6573Sopenharmony_ci { 149c67d6573Sopenharmony_ci let mut opts = RegexOptions::default(); 150c67d6573Sopenharmony_ci opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); 151c67d6573Sopenharmony_ci Self::new_options(opts) 152c67d6573Sopenharmony_ci } 153c67d6573Sopenharmony_ci 154c67d6573Sopenharmony_ci /// Create a regex execution builder. 155c67d6573Sopenharmony_ci pub fn new_options(opts: RegexOptions) -> Self { 156c67d6573Sopenharmony_ci ExecBuilder { 157c67d6573Sopenharmony_ci options: opts, 158c67d6573Sopenharmony_ci match_type: None, 159c67d6573Sopenharmony_ci bytes: false, 160c67d6573Sopenharmony_ci only_utf8: true, 161c67d6573Sopenharmony_ci } 162c67d6573Sopenharmony_ci } 163c67d6573Sopenharmony_ci 164c67d6573Sopenharmony_ci /// Set the matching engine to be automatically determined. 165c67d6573Sopenharmony_ci /// 166c67d6573Sopenharmony_ci /// This is the default state and will apply whatever optimizations are 167c67d6573Sopenharmony_ci /// possible, such as running a DFA. 168c67d6573Sopenharmony_ci /// 169c67d6573Sopenharmony_ci /// This overrides whatever was previously set via the `nfa` or 170c67d6573Sopenharmony_ci /// `bounded_backtracking` methods. 171c67d6573Sopenharmony_ci pub fn automatic(mut self) -> Self { 172c67d6573Sopenharmony_ci self.match_type = None; 173c67d6573Sopenharmony_ci self 174c67d6573Sopenharmony_ci } 175c67d6573Sopenharmony_ci 176c67d6573Sopenharmony_ci /// Sets the matching engine to use the NFA algorithm no matter what 177c67d6573Sopenharmony_ci /// optimizations are possible. 178c67d6573Sopenharmony_ci /// 179c67d6573Sopenharmony_ci /// This overrides whatever was previously set via the `automatic` or 180c67d6573Sopenharmony_ci /// `bounded_backtracking` methods. 181c67d6573Sopenharmony_ci pub fn nfa(mut self) -> Self { 182c67d6573Sopenharmony_ci self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); 183c67d6573Sopenharmony_ci self 184c67d6573Sopenharmony_ci } 185c67d6573Sopenharmony_ci 186c67d6573Sopenharmony_ci /// Sets the matching engine to use a bounded backtracking engine no 187c67d6573Sopenharmony_ci /// matter what optimizations are possible. 188c67d6573Sopenharmony_ci /// 189c67d6573Sopenharmony_ci /// One must use this with care, since the bounded backtracking engine 190c67d6573Sopenharmony_ci /// uses memory proportion to `len(regex) * len(text)`. 191c67d6573Sopenharmony_ci /// 192c67d6573Sopenharmony_ci /// This overrides whatever was previously set via the `automatic` or 193c67d6573Sopenharmony_ci /// `nfa` methods. 194c67d6573Sopenharmony_ci pub fn bounded_backtracking(mut self) -> Self { 195c67d6573Sopenharmony_ci self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); 196c67d6573Sopenharmony_ci self 197c67d6573Sopenharmony_ci } 198c67d6573Sopenharmony_ci 199c67d6573Sopenharmony_ci /// Compiles byte based programs for use with the NFA matching engines. 200c67d6573Sopenharmony_ci /// 201c67d6573Sopenharmony_ci /// By default, the NFA engines match on Unicode scalar values. They can 202c67d6573Sopenharmony_ci /// be made to use byte based programs instead. In general, the byte based 203c67d6573Sopenharmony_ci /// programs are slower because of a less efficient encoding of character 204c67d6573Sopenharmony_ci /// classes. 205c67d6573Sopenharmony_ci /// 206c67d6573Sopenharmony_ci /// Note that this does not impact DFA matching engines, which always 207c67d6573Sopenharmony_ci /// execute on bytes. 208c67d6573Sopenharmony_ci pub fn bytes(mut self, yes: bool) -> Self { 209c67d6573Sopenharmony_ci self.bytes = yes; 210c67d6573Sopenharmony_ci self 211c67d6573Sopenharmony_ci } 212c67d6573Sopenharmony_ci 213c67d6573Sopenharmony_ci /// When disabled, the program compiled may match arbitrary bytes. 214c67d6573Sopenharmony_ci /// 215c67d6573Sopenharmony_ci /// When enabled (the default), all compiled programs exclusively match 216c67d6573Sopenharmony_ci /// valid UTF-8 bytes. 217c67d6573Sopenharmony_ci pub fn only_utf8(mut self, yes: bool) -> Self { 218c67d6573Sopenharmony_ci self.only_utf8 = yes; 219c67d6573Sopenharmony_ci self 220c67d6573Sopenharmony_ci } 221c67d6573Sopenharmony_ci 222c67d6573Sopenharmony_ci /// Set the Unicode flag. 223c67d6573Sopenharmony_ci pub fn unicode(mut self, yes: bool) -> Self { 224c67d6573Sopenharmony_ci self.options.unicode = yes; 225c67d6573Sopenharmony_ci self 226c67d6573Sopenharmony_ci } 227c67d6573Sopenharmony_ci 228c67d6573Sopenharmony_ci /// Parse the current set of patterns into their AST and extract literals. 229c67d6573Sopenharmony_ci fn parse(&self) -> Result<Parsed, Error> { 230c67d6573Sopenharmony_ci let mut exprs = Vec::with_capacity(self.options.pats.len()); 231c67d6573Sopenharmony_ci let mut prefixes = Some(Literals::empty()); 232c67d6573Sopenharmony_ci let mut suffixes = Some(Literals::empty()); 233c67d6573Sopenharmony_ci let mut bytes = false; 234c67d6573Sopenharmony_ci let is_set = self.options.pats.len() > 1; 235c67d6573Sopenharmony_ci // If we're compiling a regex set and that set has any anchored 236c67d6573Sopenharmony_ci // expressions, then disable all literal optimizations. 237c67d6573Sopenharmony_ci for pat in &self.options.pats { 238c67d6573Sopenharmony_ci let mut parser = ParserBuilder::new() 239c67d6573Sopenharmony_ci .octal(self.options.octal) 240c67d6573Sopenharmony_ci .case_insensitive(self.options.case_insensitive) 241c67d6573Sopenharmony_ci .multi_line(self.options.multi_line) 242c67d6573Sopenharmony_ci .dot_matches_new_line(self.options.dot_matches_new_line) 243c67d6573Sopenharmony_ci .swap_greed(self.options.swap_greed) 244c67d6573Sopenharmony_ci .ignore_whitespace(self.options.ignore_whitespace) 245c67d6573Sopenharmony_ci .unicode(self.options.unicode) 246c67d6573Sopenharmony_ci .allow_invalid_utf8(!self.only_utf8) 247c67d6573Sopenharmony_ci .nest_limit(self.options.nest_limit) 248c67d6573Sopenharmony_ci .build(); 249c67d6573Sopenharmony_ci let expr = 250c67d6573Sopenharmony_ci parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; 251c67d6573Sopenharmony_ci bytes = bytes || !expr.is_always_utf8(); 252c67d6573Sopenharmony_ci 253c67d6573Sopenharmony_ci if cfg!(feature = "perf-literal") { 254c67d6573Sopenharmony_ci if !expr.is_anchored_start() && expr.is_any_anchored_start() { 255c67d6573Sopenharmony_ci // Partial anchors unfortunately make it hard to use 256c67d6573Sopenharmony_ci // prefixes, so disable them. 257c67d6573Sopenharmony_ci prefixes = None; 258c67d6573Sopenharmony_ci } else if is_set && expr.is_anchored_start() { 259c67d6573Sopenharmony_ci // Regex sets with anchors do not go well with literal 260c67d6573Sopenharmony_ci // optimizations. 261c67d6573Sopenharmony_ci prefixes = None; 262c67d6573Sopenharmony_ci } 263c67d6573Sopenharmony_ci prefixes = prefixes.and_then(|mut prefixes| { 264c67d6573Sopenharmony_ci if !prefixes.union_prefixes(&expr) { 265c67d6573Sopenharmony_ci None 266c67d6573Sopenharmony_ci } else { 267c67d6573Sopenharmony_ci Some(prefixes) 268c67d6573Sopenharmony_ci } 269c67d6573Sopenharmony_ci }); 270c67d6573Sopenharmony_ci 271c67d6573Sopenharmony_ci if !expr.is_anchored_end() && expr.is_any_anchored_end() { 272c67d6573Sopenharmony_ci // Partial anchors unfortunately make it hard to use 273c67d6573Sopenharmony_ci // suffixes, so disable them. 274c67d6573Sopenharmony_ci suffixes = None; 275c67d6573Sopenharmony_ci } else if is_set && expr.is_anchored_end() { 276c67d6573Sopenharmony_ci // Regex sets with anchors do not go well with literal 277c67d6573Sopenharmony_ci // optimizations. 278c67d6573Sopenharmony_ci suffixes = None; 279c67d6573Sopenharmony_ci } 280c67d6573Sopenharmony_ci suffixes = suffixes.and_then(|mut suffixes| { 281c67d6573Sopenharmony_ci if !suffixes.union_suffixes(&expr) { 282c67d6573Sopenharmony_ci None 283c67d6573Sopenharmony_ci } else { 284c67d6573Sopenharmony_ci Some(suffixes) 285c67d6573Sopenharmony_ci } 286c67d6573Sopenharmony_ci }); 287c67d6573Sopenharmony_ci } 288c67d6573Sopenharmony_ci exprs.push(expr); 289c67d6573Sopenharmony_ci } 290c67d6573Sopenharmony_ci Ok(Parsed { 291c67d6573Sopenharmony_ci exprs, 292c67d6573Sopenharmony_ci prefixes: prefixes.unwrap_or_else(Literals::empty), 293c67d6573Sopenharmony_ci suffixes: suffixes.unwrap_or_else(Literals::empty), 294c67d6573Sopenharmony_ci bytes, 295c67d6573Sopenharmony_ci }) 296c67d6573Sopenharmony_ci } 297c67d6573Sopenharmony_ci 298c67d6573Sopenharmony_ci /// Build an executor that can run a regular expression. 299c67d6573Sopenharmony_ci pub fn build(self) -> Result<Exec, Error> { 300c67d6573Sopenharmony_ci // Special case when we have no patterns to compile. 301c67d6573Sopenharmony_ci // This can happen when compiling a regex set. 302c67d6573Sopenharmony_ci if self.options.pats.is_empty() { 303c67d6573Sopenharmony_ci let ro = Arc::new(ExecReadOnly { 304c67d6573Sopenharmony_ci res: vec![], 305c67d6573Sopenharmony_ci nfa: Program::new(), 306c67d6573Sopenharmony_ci dfa: Program::new(), 307c67d6573Sopenharmony_ci dfa_reverse: Program::new(), 308c67d6573Sopenharmony_ci suffixes: LiteralSearcher::empty(), 309c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 310c67d6573Sopenharmony_ci ac: None, 311c67d6573Sopenharmony_ci match_type: MatchType::Nothing, 312c67d6573Sopenharmony_ci }); 313c67d6573Sopenharmony_ci let pool = ExecReadOnly::new_pool(&ro); 314c67d6573Sopenharmony_ci return Ok(Exec { ro, pool }); 315c67d6573Sopenharmony_ci } 316c67d6573Sopenharmony_ci let parsed = self.parse()?; 317c67d6573Sopenharmony_ci let mut nfa = Compiler::new() 318c67d6573Sopenharmony_ci .size_limit(self.options.size_limit) 319c67d6573Sopenharmony_ci .bytes(self.bytes || parsed.bytes) 320c67d6573Sopenharmony_ci .only_utf8(self.only_utf8) 321c67d6573Sopenharmony_ci .compile(&parsed.exprs)?; 322c67d6573Sopenharmony_ci let mut dfa = Compiler::new() 323c67d6573Sopenharmony_ci .size_limit(self.options.size_limit) 324c67d6573Sopenharmony_ci .dfa(true) 325c67d6573Sopenharmony_ci .only_utf8(self.only_utf8) 326c67d6573Sopenharmony_ci .compile(&parsed.exprs)?; 327c67d6573Sopenharmony_ci let mut dfa_reverse = Compiler::new() 328c67d6573Sopenharmony_ci .size_limit(self.options.size_limit) 329c67d6573Sopenharmony_ci .dfa(true) 330c67d6573Sopenharmony_ci .only_utf8(self.only_utf8) 331c67d6573Sopenharmony_ci .reverse(true) 332c67d6573Sopenharmony_ci .compile(&parsed.exprs)?; 333c67d6573Sopenharmony_ci 334c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 335c67d6573Sopenharmony_ci let ac = self.build_aho_corasick(&parsed); 336c67d6573Sopenharmony_ci nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); 337c67d6573Sopenharmony_ci dfa.prefixes = nfa.prefixes.clone(); 338c67d6573Sopenharmony_ci dfa.dfa_size_limit = self.options.dfa_size_limit; 339c67d6573Sopenharmony_ci dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; 340c67d6573Sopenharmony_ci 341c67d6573Sopenharmony_ci let mut ro = ExecReadOnly { 342c67d6573Sopenharmony_ci res: self.options.pats, 343c67d6573Sopenharmony_ci nfa, 344c67d6573Sopenharmony_ci dfa, 345c67d6573Sopenharmony_ci dfa_reverse, 346c67d6573Sopenharmony_ci suffixes: LiteralSearcher::suffixes(parsed.suffixes), 347c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 348c67d6573Sopenharmony_ci ac, 349c67d6573Sopenharmony_ci match_type: MatchType::Nothing, 350c67d6573Sopenharmony_ci }; 351c67d6573Sopenharmony_ci ro.match_type = ro.choose_match_type(self.match_type); 352c67d6573Sopenharmony_ci 353c67d6573Sopenharmony_ci let ro = Arc::new(ro); 354c67d6573Sopenharmony_ci let pool = ExecReadOnly::new_pool(&ro); 355c67d6573Sopenharmony_ci Ok(Exec { ro, pool }) 356c67d6573Sopenharmony_ci } 357c67d6573Sopenharmony_ci 358c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 359c67d6573Sopenharmony_ci fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { 360c67d6573Sopenharmony_ci if parsed.exprs.len() != 1 { 361c67d6573Sopenharmony_ci return None; 362c67d6573Sopenharmony_ci } 363c67d6573Sopenharmony_ci let lits = match alternation_literals(&parsed.exprs[0]) { 364c67d6573Sopenharmony_ci None => return None, 365c67d6573Sopenharmony_ci Some(lits) => lits, 366c67d6573Sopenharmony_ci }; 367c67d6573Sopenharmony_ci // If we have a small number of literals, then let Teddy handle 368c67d6573Sopenharmony_ci // things (see literal/mod.rs). 369c67d6573Sopenharmony_ci if lits.len() <= 32 { 370c67d6573Sopenharmony_ci return None; 371c67d6573Sopenharmony_ci } 372c67d6573Sopenharmony_ci Some( 373c67d6573Sopenharmony_ci AhoCorasickBuilder::new() 374c67d6573Sopenharmony_ci .match_kind(MatchKind::LeftmostFirst) 375c67d6573Sopenharmony_ci .auto_configure(&lits) 376c67d6573Sopenharmony_ci .build_with_size::<u32, _, _>(&lits) 377c67d6573Sopenharmony_ci // This should never happen because we'd long exceed the 378c67d6573Sopenharmony_ci // compilation limit for regexes first. 379c67d6573Sopenharmony_ci .expect("AC automaton too big"), 380c67d6573Sopenharmony_ci ) 381c67d6573Sopenharmony_ci } 382c67d6573Sopenharmony_ci} 383c67d6573Sopenharmony_ci 384c67d6573Sopenharmony_ciimpl<'c> RegularExpression for ExecNoSyncStr<'c> { 385c67d6573Sopenharmony_ci type Text = str; 386c67d6573Sopenharmony_ci 387c67d6573Sopenharmony_ci fn slots_len(&self) -> usize { 388c67d6573Sopenharmony_ci self.0.slots_len() 389c67d6573Sopenharmony_ci } 390c67d6573Sopenharmony_ci 391c67d6573Sopenharmony_ci fn next_after_empty(&self, text: &str, i: usize) -> usize { 392c67d6573Sopenharmony_ci next_utf8(text.as_bytes(), i) 393c67d6573Sopenharmony_ci } 394c67d6573Sopenharmony_ci 395c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 396c67d6573Sopenharmony_ci fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { 397c67d6573Sopenharmony_ci self.0.shortest_match_at(text.as_bytes(), start) 398c67d6573Sopenharmony_ci } 399c67d6573Sopenharmony_ci 400c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 401c67d6573Sopenharmony_ci fn is_match_at(&self, text: &str, start: usize) -> bool { 402c67d6573Sopenharmony_ci self.0.is_match_at(text.as_bytes(), start) 403c67d6573Sopenharmony_ci } 404c67d6573Sopenharmony_ci 405c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 406c67d6573Sopenharmony_ci fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 407c67d6573Sopenharmony_ci self.0.find_at(text.as_bytes(), start) 408c67d6573Sopenharmony_ci } 409c67d6573Sopenharmony_ci 410c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 411c67d6573Sopenharmony_ci fn captures_read_at( 412c67d6573Sopenharmony_ci &self, 413c67d6573Sopenharmony_ci locs: &mut Locations, 414c67d6573Sopenharmony_ci text: &str, 415c67d6573Sopenharmony_ci start: usize, 416c67d6573Sopenharmony_ci ) -> Option<(usize, usize)> { 417c67d6573Sopenharmony_ci self.0.captures_read_at(locs, text.as_bytes(), start) 418c67d6573Sopenharmony_ci } 419c67d6573Sopenharmony_ci} 420c67d6573Sopenharmony_ci 421c67d6573Sopenharmony_ciimpl<'c> RegularExpression for ExecNoSync<'c> { 422c67d6573Sopenharmony_ci type Text = [u8]; 423c67d6573Sopenharmony_ci 424c67d6573Sopenharmony_ci /// Returns the number of capture slots in the regular expression. (There 425c67d6573Sopenharmony_ci /// are two slots for every capture group, corresponding to possibly empty 426c67d6573Sopenharmony_ci /// start and end locations of the capture.) 427c67d6573Sopenharmony_ci fn slots_len(&self) -> usize { 428c67d6573Sopenharmony_ci self.ro.nfa.captures.len() * 2 429c67d6573Sopenharmony_ci } 430c67d6573Sopenharmony_ci 431c67d6573Sopenharmony_ci fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { 432c67d6573Sopenharmony_ci i + 1 433c67d6573Sopenharmony_ci } 434c67d6573Sopenharmony_ci 435c67d6573Sopenharmony_ci /// Returns the end of a match location, possibly occurring before the 436c67d6573Sopenharmony_ci /// end location of the correct leftmost-first match. 437c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 438c67d6573Sopenharmony_ci fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { 439c67d6573Sopenharmony_ci if !self.is_anchor_end_match(text) { 440c67d6573Sopenharmony_ci return None; 441c67d6573Sopenharmony_ci } 442c67d6573Sopenharmony_ci match self.ro.match_type { 443c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 444c67d6573Sopenharmony_ci MatchType::Literal(ty) => { 445c67d6573Sopenharmony_ci self.find_literals(ty, text, start).map(|(_, e)| e) 446c67d6573Sopenharmony_ci } 447c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 448c67d6573Sopenharmony_ci MatchType::Dfa | MatchType::DfaMany => { 449c67d6573Sopenharmony_ci match self.shortest_dfa(text, start) { 450c67d6573Sopenharmony_ci dfa::Result::Match(end) => Some(end), 451c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 452c67d6573Sopenharmony_ci dfa::Result::Quit => self.shortest_nfa(text, start), 453c67d6573Sopenharmony_ci } 454c67d6573Sopenharmony_ci } 455c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 456c67d6573Sopenharmony_ci MatchType::DfaAnchoredReverse => { 457c67d6573Sopenharmony_ci match dfa::Fsm::reverse( 458c67d6573Sopenharmony_ci &self.ro.dfa_reverse, 459c67d6573Sopenharmony_ci self.cache.value(), 460c67d6573Sopenharmony_ci true, 461c67d6573Sopenharmony_ci &text[start..], 462c67d6573Sopenharmony_ci text.len(), 463c67d6573Sopenharmony_ci ) { 464c67d6573Sopenharmony_ci dfa::Result::Match(_) => Some(text.len()), 465c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 466c67d6573Sopenharmony_ci dfa::Result::Quit => self.shortest_nfa(text, start), 467c67d6573Sopenharmony_ci } 468c67d6573Sopenharmony_ci } 469c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 470c67d6573Sopenharmony_ci MatchType::DfaSuffix => { 471c67d6573Sopenharmony_ci match self.shortest_dfa_reverse_suffix(text, start) { 472c67d6573Sopenharmony_ci dfa::Result::Match(e) => Some(e), 473c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 474c67d6573Sopenharmony_ci dfa::Result::Quit => self.shortest_nfa(text, start), 475c67d6573Sopenharmony_ci } 476c67d6573Sopenharmony_ci } 477c67d6573Sopenharmony_ci MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), 478c67d6573Sopenharmony_ci MatchType::Nothing => None, 479c67d6573Sopenharmony_ci } 480c67d6573Sopenharmony_ci } 481c67d6573Sopenharmony_ci 482c67d6573Sopenharmony_ci /// Returns true if and only if the regex matches text. 483c67d6573Sopenharmony_ci /// 484c67d6573Sopenharmony_ci /// For single regular expressions, this is equivalent to calling 485c67d6573Sopenharmony_ci /// shortest_match(...).is_some(). 486c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 487c67d6573Sopenharmony_ci fn is_match_at(&self, text: &[u8], start: usize) -> bool { 488c67d6573Sopenharmony_ci if !self.is_anchor_end_match(text) { 489c67d6573Sopenharmony_ci return false; 490c67d6573Sopenharmony_ci } 491c67d6573Sopenharmony_ci // We need to do this dance because shortest_match relies on the NFA 492c67d6573Sopenharmony_ci // filling in captures[1], but a RegexSet has no captures. In other 493c67d6573Sopenharmony_ci // words, a RegexSet can't (currently) use shortest_match. ---AG 494c67d6573Sopenharmony_ci match self.ro.match_type { 495c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 496c67d6573Sopenharmony_ci MatchType::Literal(ty) => { 497c67d6573Sopenharmony_ci self.find_literals(ty, text, start).is_some() 498c67d6573Sopenharmony_ci } 499c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 500c67d6573Sopenharmony_ci MatchType::Dfa | MatchType::DfaMany => { 501c67d6573Sopenharmony_ci match self.shortest_dfa(text, start) { 502c67d6573Sopenharmony_ci dfa::Result::Match(_) => true, 503c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => false, 504c67d6573Sopenharmony_ci dfa::Result::Quit => self.match_nfa(text, start), 505c67d6573Sopenharmony_ci } 506c67d6573Sopenharmony_ci } 507c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 508c67d6573Sopenharmony_ci MatchType::DfaAnchoredReverse => { 509c67d6573Sopenharmony_ci match dfa::Fsm::reverse( 510c67d6573Sopenharmony_ci &self.ro.dfa_reverse, 511c67d6573Sopenharmony_ci self.cache.value(), 512c67d6573Sopenharmony_ci true, 513c67d6573Sopenharmony_ci &text[start..], 514c67d6573Sopenharmony_ci text.len(), 515c67d6573Sopenharmony_ci ) { 516c67d6573Sopenharmony_ci dfa::Result::Match(_) => true, 517c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => false, 518c67d6573Sopenharmony_ci dfa::Result::Quit => self.match_nfa(text, start), 519c67d6573Sopenharmony_ci } 520c67d6573Sopenharmony_ci } 521c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 522c67d6573Sopenharmony_ci MatchType::DfaSuffix => { 523c67d6573Sopenharmony_ci match self.shortest_dfa_reverse_suffix(text, start) { 524c67d6573Sopenharmony_ci dfa::Result::Match(_) => true, 525c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => false, 526c67d6573Sopenharmony_ci dfa::Result::Quit => self.match_nfa(text, start), 527c67d6573Sopenharmony_ci } 528c67d6573Sopenharmony_ci } 529c67d6573Sopenharmony_ci MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), 530c67d6573Sopenharmony_ci MatchType::Nothing => false, 531c67d6573Sopenharmony_ci } 532c67d6573Sopenharmony_ci } 533c67d6573Sopenharmony_ci 534c67d6573Sopenharmony_ci /// Finds the start and end location of the leftmost-first match, starting 535c67d6573Sopenharmony_ci /// at the given location. 536c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 537c67d6573Sopenharmony_ci fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { 538c67d6573Sopenharmony_ci if !self.is_anchor_end_match(text) { 539c67d6573Sopenharmony_ci return None; 540c67d6573Sopenharmony_ci } 541c67d6573Sopenharmony_ci match self.ro.match_type { 542c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 543c67d6573Sopenharmony_ci MatchType::Literal(ty) => self.find_literals(ty, text, start), 544c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 545c67d6573Sopenharmony_ci MatchType::Dfa => match self.find_dfa_forward(text, start) { 546c67d6573Sopenharmony_ci dfa::Result::Match((s, e)) => Some((s, e)), 547c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 548c67d6573Sopenharmony_ci dfa::Result::Quit => { 549c67d6573Sopenharmony_ci self.find_nfa(MatchNfaType::Auto, text, start) 550c67d6573Sopenharmony_ci } 551c67d6573Sopenharmony_ci }, 552c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 553c67d6573Sopenharmony_ci MatchType::DfaAnchoredReverse => { 554c67d6573Sopenharmony_ci match self.find_dfa_anchored_reverse(text, start) { 555c67d6573Sopenharmony_ci dfa::Result::Match((s, e)) => Some((s, e)), 556c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 557c67d6573Sopenharmony_ci dfa::Result::Quit => { 558c67d6573Sopenharmony_ci self.find_nfa(MatchNfaType::Auto, text, start) 559c67d6573Sopenharmony_ci } 560c67d6573Sopenharmony_ci } 561c67d6573Sopenharmony_ci } 562c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 563c67d6573Sopenharmony_ci MatchType::DfaSuffix => { 564c67d6573Sopenharmony_ci match self.find_dfa_reverse_suffix(text, start) { 565c67d6573Sopenharmony_ci dfa::Result::Match((s, e)) => Some((s, e)), 566c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 567c67d6573Sopenharmony_ci dfa::Result::Quit => { 568c67d6573Sopenharmony_ci self.find_nfa(MatchNfaType::Auto, text, start) 569c67d6573Sopenharmony_ci } 570c67d6573Sopenharmony_ci } 571c67d6573Sopenharmony_ci } 572c67d6573Sopenharmony_ci MatchType::Nfa(ty) => self.find_nfa(ty, text, start), 573c67d6573Sopenharmony_ci MatchType::Nothing => None, 574c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 575c67d6573Sopenharmony_ci MatchType::DfaMany => { 576c67d6573Sopenharmony_ci unreachable!("BUG: RegexSet cannot be used with find") 577c67d6573Sopenharmony_ci } 578c67d6573Sopenharmony_ci } 579c67d6573Sopenharmony_ci } 580c67d6573Sopenharmony_ci 581c67d6573Sopenharmony_ci /// Finds the start and end location of the leftmost-first match and also 582c67d6573Sopenharmony_ci /// fills in all matching capture groups. 583c67d6573Sopenharmony_ci /// 584c67d6573Sopenharmony_ci /// The number of capture slots given should be equal to the total number 585c67d6573Sopenharmony_ci /// of capture slots in the compiled program. 586c67d6573Sopenharmony_ci /// 587c67d6573Sopenharmony_ci /// Note that the first two slots always correspond to the start and end 588c67d6573Sopenharmony_ci /// locations of the overall match. 589c67d6573Sopenharmony_ci fn captures_read_at( 590c67d6573Sopenharmony_ci &self, 591c67d6573Sopenharmony_ci locs: &mut Locations, 592c67d6573Sopenharmony_ci text: &[u8], 593c67d6573Sopenharmony_ci start: usize, 594c67d6573Sopenharmony_ci ) -> Option<(usize, usize)> { 595c67d6573Sopenharmony_ci let slots = locs.as_slots(); 596c67d6573Sopenharmony_ci for slot in slots.iter_mut() { 597c67d6573Sopenharmony_ci *slot = None; 598c67d6573Sopenharmony_ci } 599c67d6573Sopenharmony_ci // If the caller unnecessarily uses this, then we try to save them 600c67d6573Sopenharmony_ci // from themselves. 601c67d6573Sopenharmony_ci match slots.len() { 602c67d6573Sopenharmony_ci 0 => return self.find_at(text, start), 603c67d6573Sopenharmony_ci 2 => { 604c67d6573Sopenharmony_ci return self.find_at(text, start).map(|(s, e)| { 605c67d6573Sopenharmony_ci slots[0] = Some(s); 606c67d6573Sopenharmony_ci slots[1] = Some(e); 607c67d6573Sopenharmony_ci (s, e) 608c67d6573Sopenharmony_ci }); 609c67d6573Sopenharmony_ci } 610c67d6573Sopenharmony_ci _ => {} // fallthrough 611c67d6573Sopenharmony_ci } 612c67d6573Sopenharmony_ci if !self.is_anchor_end_match(text) { 613c67d6573Sopenharmony_ci return None; 614c67d6573Sopenharmony_ci } 615c67d6573Sopenharmony_ci match self.ro.match_type { 616c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 617c67d6573Sopenharmony_ci MatchType::Literal(ty) => { 618c67d6573Sopenharmony_ci self.find_literals(ty, text, start).and_then(|(s, e)| { 619c67d6573Sopenharmony_ci self.captures_nfa_type( 620c67d6573Sopenharmony_ci MatchNfaType::Auto, 621c67d6573Sopenharmony_ci slots, 622c67d6573Sopenharmony_ci text, 623c67d6573Sopenharmony_ci s, 624c67d6573Sopenharmony_ci e, 625c67d6573Sopenharmony_ci ) 626c67d6573Sopenharmony_ci }) 627c67d6573Sopenharmony_ci } 628c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 629c67d6573Sopenharmony_ci MatchType::Dfa => { 630c67d6573Sopenharmony_ci if self.ro.nfa.is_anchored_start { 631c67d6573Sopenharmony_ci self.captures_nfa(slots, text, start) 632c67d6573Sopenharmony_ci } else { 633c67d6573Sopenharmony_ci match self.find_dfa_forward(text, start) { 634c67d6573Sopenharmony_ci dfa::Result::Match((s, e)) => self.captures_nfa_type( 635c67d6573Sopenharmony_ci MatchNfaType::Auto, 636c67d6573Sopenharmony_ci slots, 637c67d6573Sopenharmony_ci text, 638c67d6573Sopenharmony_ci s, 639c67d6573Sopenharmony_ci e, 640c67d6573Sopenharmony_ci ), 641c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 642c67d6573Sopenharmony_ci dfa::Result::Quit => { 643c67d6573Sopenharmony_ci self.captures_nfa(slots, text, start) 644c67d6573Sopenharmony_ci } 645c67d6573Sopenharmony_ci } 646c67d6573Sopenharmony_ci } 647c67d6573Sopenharmony_ci } 648c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 649c67d6573Sopenharmony_ci MatchType::DfaAnchoredReverse => { 650c67d6573Sopenharmony_ci match self.find_dfa_anchored_reverse(text, start) { 651c67d6573Sopenharmony_ci dfa::Result::Match((s, e)) => self.captures_nfa_type( 652c67d6573Sopenharmony_ci MatchNfaType::Auto, 653c67d6573Sopenharmony_ci slots, 654c67d6573Sopenharmony_ci text, 655c67d6573Sopenharmony_ci s, 656c67d6573Sopenharmony_ci e, 657c67d6573Sopenharmony_ci ), 658c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 659c67d6573Sopenharmony_ci dfa::Result::Quit => self.captures_nfa(slots, text, start), 660c67d6573Sopenharmony_ci } 661c67d6573Sopenharmony_ci } 662c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 663c67d6573Sopenharmony_ci MatchType::DfaSuffix => { 664c67d6573Sopenharmony_ci match self.find_dfa_reverse_suffix(text, start) { 665c67d6573Sopenharmony_ci dfa::Result::Match((s, e)) => self.captures_nfa_type( 666c67d6573Sopenharmony_ci MatchNfaType::Auto, 667c67d6573Sopenharmony_ci slots, 668c67d6573Sopenharmony_ci text, 669c67d6573Sopenharmony_ci s, 670c67d6573Sopenharmony_ci e, 671c67d6573Sopenharmony_ci ), 672c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => None, 673c67d6573Sopenharmony_ci dfa::Result::Quit => self.captures_nfa(slots, text, start), 674c67d6573Sopenharmony_ci } 675c67d6573Sopenharmony_ci } 676c67d6573Sopenharmony_ci MatchType::Nfa(ty) => { 677c67d6573Sopenharmony_ci self.captures_nfa_type(ty, slots, text, start, text.len()) 678c67d6573Sopenharmony_ci } 679c67d6573Sopenharmony_ci MatchType::Nothing => None, 680c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 681c67d6573Sopenharmony_ci MatchType::DfaMany => { 682c67d6573Sopenharmony_ci unreachable!("BUG: RegexSet cannot be used with captures") 683c67d6573Sopenharmony_ci } 684c67d6573Sopenharmony_ci } 685c67d6573Sopenharmony_ci } 686c67d6573Sopenharmony_ci} 687c67d6573Sopenharmony_ci 688c67d6573Sopenharmony_ciimpl<'c> ExecNoSync<'c> { 689c67d6573Sopenharmony_ci /// Finds the leftmost-first match using only literal search. 690c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 691c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 692c67d6573Sopenharmony_ci fn find_literals( 693c67d6573Sopenharmony_ci &self, 694c67d6573Sopenharmony_ci ty: MatchLiteralType, 695c67d6573Sopenharmony_ci text: &[u8], 696c67d6573Sopenharmony_ci start: usize, 697c67d6573Sopenharmony_ci ) -> Option<(usize, usize)> { 698c67d6573Sopenharmony_ci use self::MatchLiteralType::*; 699c67d6573Sopenharmony_ci match ty { 700c67d6573Sopenharmony_ci Unanchored => { 701c67d6573Sopenharmony_ci let lits = &self.ro.nfa.prefixes; 702c67d6573Sopenharmony_ci lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) 703c67d6573Sopenharmony_ci } 704c67d6573Sopenharmony_ci AnchoredStart => { 705c67d6573Sopenharmony_ci let lits = &self.ro.nfa.prefixes; 706c67d6573Sopenharmony_ci if start == 0 || !self.ro.nfa.is_anchored_start { 707c67d6573Sopenharmony_ci lits.find_start(&text[start..]) 708c67d6573Sopenharmony_ci .map(|(s, e)| (start + s, start + e)) 709c67d6573Sopenharmony_ci } else { 710c67d6573Sopenharmony_ci None 711c67d6573Sopenharmony_ci } 712c67d6573Sopenharmony_ci } 713c67d6573Sopenharmony_ci AnchoredEnd => { 714c67d6573Sopenharmony_ci let lits = &self.ro.suffixes; 715c67d6573Sopenharmony_ci lits.find_end(&text[start..]) 716c67d6573Sopenharmony_ci .map(|(s, e)| (start + s, start + e)) 717c67d6573Sopenharmony_ci } 718c67d6573Sopenharmony_ci AhoCorasick => self 719c67d6573Sopenharmony_ci .ro 720c67d6573Sopenharmony_ci .ac 721c67d6573Sopenharmony_ci .as_ref() 722c67d6573Sopenharmony_ci .unwrap() 723c67d6573Sopenharmony_ci .find(&text[start..]) 724c67d6573Sopenharmony_ci .map(|m| (start + m.start(), start + m.end())), 725c67d6573Sopenharmony_ci } 726c67d6573Sopenharmony_ci } 727c67d6573Sopenharmony_ci 728c67d6573Sopenharmony_ci /// Finds the leftmost-first match (start and end) using only the DFA. 729c67d6573Sopenharmony_ci /// 730c67d6573Sopenharmony_ci /// If the result returned indicates that the DFA quit, then another 731c67d6573Sopenharmony_ci /// matching engine should be used. 732c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 733c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 734c67d6573Sopenharmony_ci fn find_dfa_forward( 735c67d6573Sopenharmony_ci &self, 736c67d6573Sopenharmony_ci text: &[u8], 737c67d6573Sopenharmony_ci start: usize, 738c67d6573Sopenharmony_ci ) -> dfa::Result<(usize, usize)> { 739c67d6573Sopenharmony_ci use crate::dfa::Result::*; 740c67d6573Sopenharmony_ci let end = match dfa::Fsm::forward( 741c67d6573Sopenharmony_ci &self.ro.dfa, 742c67d6573Sopenharmony_ci self.cache.value(), 743c67d6573Sopenharmony_ci false, 744c67d6573Sopenharmony_ci text, 745c67d6573Sopenharmony_ci start, 746c67d6573Sopenharmony_ci ) { 747c67d6573Sopenharmony_ci NoMatch(i) => return NoMatch(i), 748c67d6573Sopenharmony_ci Quit => return Quit, 749c67d6573Sopenharmony_ci Match(end) if start == end => return Match((start, start)), 750c67d6573Sopenharmony_ci Match(end) => end, 751c67d6573Sopenharmony_ci }; 752c67d6573Sopenharmony_ci // Now run the DFA in reverse to find the start of the match. 753c67d6573Sopenharmony_ci match dfa::Fsm::reverse( 754c67d6573Sopenharmony_ci &self.ro.dfa_reverse, 755c67d6573Sopenharmony_ci self.cache.value(), 756c67d6573Sopenharmony_ci false, 757c67d6573Sopenharmony_ci &text[start..], 758c67d6573Sopenharmony_ci end - start, 759c67d6573Sopenharmony_ci ) { 760c67d6573Sopenharmony_ci Match(s) => Match((start + s, end)), 761c67d6573Sopenharmony_ci NoMatch(i) => NoMatch(i), 762c67d6573Sopenharmony_ci Quit => Quit, 763c67d6573Sopenharmony_ci } 764c67d6573Sopenharmony_ci } 765c67d6573Sopenharmony_ci 766c67d6573Sopenharmony_ci /// Finds the leftmost-first match (start and end) using only the DFA, 767c67d6573Sopenharmony_ci /// but assumes the regex is anchored at the end and therefore starts at 768c67d6573Sopenharmony_ci /// the end of the regex and matches in reverse. 769c67d6573Sopenharmony_ci /// 770c67d6573Sopenharmony_ci /// If the result returned indicates that the DFA quit, then another 771c67d6573Sopenharmony_ci /// matching engine should be used. 772c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 773c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 774c67d6573Sopenharmony_ci fn find_dfa_anchored_reverse( 775c67d6573Sopenharmony_ci &self, 776c67d6573Sopenharmony_ci text: &[u8], 777c67d6573Sopenharmony_ci start: usize, 778c67d6573Sopenharmony_ci ) -> dfa::Result<(usize, usize)> { 779c67d6573Sopenharmony_ci use crate::dfa::Result::*; 780c67d6573Sopenharmony_ci match dfa::Fsm::reverse( 781c67d6573Sopenharmony_ci &self.ro.dfa_reverse, 782c67d6573Sopenharmony_ci self.cache.value(), 783c67d6573Sopenharmony_ci false, 784c67d6573Sopenharmony_ci &text[start..], 785c67d6573Sopenharmony_ci text.len() - start, 786c67d6573Sopenharmony_ci ) { 787c67d6573Sopenharmony_ci Match(s) => Match((start + s, text.len())), 788c67d6573Sopenharmony_ci NoMatch(i) => NoMatch(i), 789c67d6573Sopenharmony_ci Quit => Quit, 790c67d6573Sopenharmony_ci } 791c67d6573Sopenharmony_ci } 792c67d6573Sopenharmony_ci 793c67d6573Sopenharmony_ci /// Finds the end of the shortest match using only the DFA. 794c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 795c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 796c67d6573Sopenharmony_ci fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { 797c67d6573Sopenharmony_ci dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) 798c67d6573Sopenharmony_ci } 799c67d6573Sopenharmony_ci 800c67d6573Sopenharmony_ci /// Finds the end of the shortest match using only the DFA by scanning for 801c67d6573Sopenharmony_ci /// suffix literals. 802c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 803c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 804c67d6573Sopenharmony_ci fn shortest_dfa_reverse_suffix( 805c67d6573Sopenharmony_ci &self, 806c67d6573Sopenharmony_ci text: &[u8], 807c67d6573Sopenharmony_ci start: usize, 808c67d6573Sopenharmony_ci ) -> dfa::Result<usize> { 809c67d6573Sopenharmony_ci match self.exec_dfa_reverse_suffix(text, start) { 810c67d6573Sopenharmony_ci None => self.shortest_dfa(text, start), 811c67d6573Sopenharmony_ci Some(r) => r.map(|(_, end)| end), 812c67d6573Sopenharmony_ci } 813c67d6573Sopenharmony_ci } 814c67d6573Sopenharmony_ci 815c67d6573Sopenharmony_ci /// Finds the end of the shortest match using only the DFA by scanning for 816c67d6573Sopenharmony_ci /// suffix literals. It also reports the start of the match. 817c67d6573Sopenharmony_ci /// 818c67d6573Sopenharmony_ci /// Note that if None is returned, then the optimization gave up to avoid 819c67d6573Sopenharmony_ci /// worst case quadratic behavior. A forward scanning DFA should be tried 820c67d6573Sopenharmony_ci /// next. 821c67d6573Sopenharmony_ci /// 822c67d6573Sopenharmony_ci /// If a match is returned and the full leftmost-first match is desired, 823c67d6573Sopenharmony_ci /// then a forward scan starting from the beginning of the match must be 824c67d6573Sopenharmony_ci /// done. 825c67d6573Sopenharmony_ci /// 826c67d6573Sopenharmony_ci /// If the result returned indicates that the DFA quit, then another 827c67d6573Sopenharmony_ci /// matching engine should be used. 828c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 829c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 830c67d6573Sopenharmony_ci fn exec_dfa_reverse_suffix( 831c67d6573Sopenharmony_ci &self, 832c67d6573Sopenharmony_ci text: &[u8], 833c67d6573Sopenharmony_ci original_start: usize, 834c67d6573Sopenharmony_ci ) -> Option<dfa::Result<(usize, usize)>> { 835c67d6573Sopenharmony_ci use crate::dfa::Result::*; 836c67d6573Sopenharmony_ci 837c67d6573Sopenharmony_ci let lcs = self.ro.suffixes.lcs(); 838c67d6573Sopenharmony_ci debug_assert!(lcs.len() >= 1); 839c67d6573Sopenharmony_ci let mut start = original_start; 840c67d6573Sopenharmony_ci let mut end = start; 841c67d6573Sopenharmony_ci let mut last_literal = start; 842c67d6573Sopenharmony_ci while end <= text.len() { 843c67d6573Sopenharmony_ci last_literal += match lcs.find(&text[last_literal..]) { 844c67d6573Sopenharmony_ci None => return Some(NoMatch(text.len())), 845c67d6573Sopenharmony_ci Some(i) => i, 846c67d6573Sopenharmony_ci }; 847c67d6573Sopenharmony_ci end = last_literal + lcs.len(); 848c67d6573Sopenharmony_ci match dfa::Fsm::reverse( 849c67d6573Sopenharmony_ci &self.ro.dfa_reverse, 850c67d6573Sopenharmony_ci self.cache.value(), 851c67d6573Sopenharmony_ci false, 852c67d6573Sopenharmony_ci &text[start..end], 853c67d6573Sopenharmony_ci end - start, 854c67d6573Sopenharmony_ci ) { 855c67d6573Sopenharmony_ci Match(0) | NoMatch(0) => return None, 856c67d6573Sopenharmony_ci Match(i) => return Some(Match((start + i, end))), 857c67d6573Sopenharmony_ci NoMatch(i) => { 858c67d6573Sopenharmony_ci start += i; 859c67d6573Sopenharmony_ci last_literal += 1; 860c67d6573Sopenharmony_ci continue; 861c67d6573Sopenharmony_ci } 862c67d6573Sopenharmony_ci Quit => return Some(Quit), 863c67d6573Sopenharmony_ci }; 864c67d6573Sopenharmony_ci } 865c67d6573Sopenharmony_ci Some(NoMatch(text.len())) 866c67d6573Sopenharmony_ci } 867c67d6573Sopenharmony_ci 868c67d6573Sopenharmony_ci /// Finds the leftmost-first match (start and end) using only the DFA 869c67d6573Sopenharmony_ci /// by scanning for suffix literals. 870c67d6573Sopenharmony_ci /// 871c67d6573Sopenharmony_ci /// If the result returned indicates that the DFA quit, then another 872c67d6573Sopenharmony_ci /// matching engine should be used. 873c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 874c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 875c67d6573Sopenharmony_ci fn find_dfa_reverse_suffix( 876c67d6573Sopenharmony_ci &self, 877c67d6573Sopenharmony_ci text: &[u8], 878c67d6573Sopenharmony_ci start: usize, 879c67d6573Sopenharmony_ci ) -> dfa::Result<(usize, usize)> { 880c67d6573Sopenharmony_ci use crate::dfa::Result::*; 881c67d6573Sopenharmony_ci 882c67d6573Sopenharmony_ci let match_start = match self.exec_dfa_reverse_suffix(text, start) { 883c67d6573Sopenharmony_ci None => return self.find_dfa_forward(text, start), 884c67d6573Sopenharmony_ci Some(Match((start, _))) => start, 885c67d6573Sopenharmony_ci Some(r) => return r, 886c67d6573Sopenharmony_ci }; 887c67d6573Sopenharmony_ci // At this point, we've found a match. The only way to quit now 888c67d6573Sopenharmony_ci // without a match is if the DFA gives up (seems unlikely). 889c67d6573Sopenharmony_ci // 890c67d6573Sopenharmony_ci // Now run the DFA forwards to find the proper end of the match. 891c67d6573Sopenharmony_ci // (The suffix literal match can only indicate the earliest 892c67d6573Sopenharmony_ci // possible end location, which may appear before the end of the 893c67d6573Sopenharmony_ci // leftmost-first match.) 894c67d6573Sopenharmony_ci match dfa::Fsm::forward( 895c67d6573Sopenharmony_ci &self.ro.dfa, 896c67d6573Sopenharmony_ci self.cache.value(), 897c67d6573Sopenharmony_ci false, 898c67d6573Sopenharmony_ci text, 899c67d6573Sopenharmony_ci match_start, 900c67d6573Sopenharmony_ci ) { 901c67d6573Sopenharmony_ci NoMatch(_) => panic!("BUG: reverse match implies forward match"), 902c67d6573Sopenharmony_ci Quit => Quit, 903c67d6573Sopenharmony_ci Match(e) => Match((match_start, e)), 904c67d6573Sopenharmony_ci } 905c67d6573Sopenharmony_ci } 906c67d6573Sopenharmony_ci 907c67d6573Sopenharmony_ci /// Executes the NFA engine to return whether there is a match or not. 908c67d6573Sopenharmony_ci /// 909c67d6573Sopenharmony_ci /// Ideally, we could use shortest_nfa(...).is_some() and get the same 910c67d6573Sopenharmony_ci /// performance characteristics, but regex sets don't have captures, which 911c67d6573Sopenharmony_ci /// shortest_nfa depends on. 912c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 913c67d6573Sopenharmony_ci fn match_nfa(&self, text: &[u8], start: usize) -> bool { 914c67d6573Sopenharmony_ci self.match_nfa_type(MatchNfaType::Auto, text, start) 915c67d6573Sopenharmony_ci } 916c67d6573Sopenharmony_ci 917c67d6573Sopenharmony_ci /// Like match_nfa, but allows specification of the type of NFA engine. 918c67d6573Sopenharmony_ci fn match_nfa_type( 919c67d6573Sopenharmony_ci &self, 920c67d6573Sopenharmony_ci ty: MatchNfaType, 921c67d6573Sopenharmony_ci text: &[u8], 922c67d6573Sopenharmony_ci start: usize, 923c67d6573Sopenharmony_ci ) -> bool { 924c67d6573Sopenharmony_ci self.exec_nfa( 925c67d6573Sopenharmony_ci ty, 926c67d6573Sopenharmony_ci &mut [false], 927c67d6573Sopenharmony_ci &mut [], 928c67d6573Sopenharmony_ci true, 929c67d6573Sopenharmony_ci false, 930c67d6573Sopenharmony_ci text, 931c67d6573Sopenharmony_ci start, 932c67d6573Sopenharmony_ci text.len(), 933c67d6573Sopenharmony_ci ) 934c67d6573Sopenharmony_ci } 935c67d6573Sopenharmony_ci 936c67d6573Sopenharmony_ci /// Finds the shortest match using an NFA. 937c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 938c67d6573Sopenharmony_ci fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { 939c67d6573Sopenharmony_ci self.shortest_nfa_type(MatchNfaType::Auto, text, start) 940c67d6573Sopenharmony_ci } 941c67d6573Sopenharmony_ci 942c67d6573Sopenharmony_ci /// Like shortest_nfa, but allows specification of the type of NFA engine. 943c67d6573Sopenharmony_ci fn shortest_nfa_type( 944c67d6573Sopenharmony_ci &self, 945c67d6573Sopenharmony_ci ty: MatchNfaType, 946c67d6573Sopenharmony_ci text: &[u8], 947c67d6573Sopenharmony_ci start: usize, 948c67d6573Sopenharmony_ci ) -> Option<usize> { 949c67d6573Sopenharmony_ci let mut slots = [None, None]; 950c67d6573Sopenharmony_ci if self.exec_nfa( 951c67d6573Sopenharmony_ci ty, 952c67d6573Sopenharmony_ci &mut [false], 953c67d6573Sopenharmony_ci &mut slots, 954c67d6573Sopenharmony_ci true, 955c67d6573Sopenharmony_ci true, 956c67d6573Sopenharmony_ci text, 957c67d6573Sopenharmony_ci start, 958c67d6573Sopenharmony_ci text.len(), 959c67d6573Sopenharmony_ci ) { 960c67d6573Sopenharmony_ci slots[1] 961c67d6573Sopenharmony_ci } else { 962c67d6573Sopenharmony_ci None 963c67d6573Sopenharmony_ci } 964c67d6573Sopenharmony_ci } 965c67d6573Sopenharmony_ci 966c67d6573Sopenharmony_ci /// Like find, but executes an NFA engine. 967c67d6573Sopenharmony_ci fn find_nfa( 968c67d6573Sopenharmony_ci &self, 969c67d6573Sopenharmony_ci ty: MatchNfaType, 970c67d6573Sopenharmony_ci text: &[u8], 971c67d6573Sopenharmony_ci start: usize, 972c67d6573Sopenharmony_ci ) -> Option<(usize, usize)> { 973c67d6573Sopenharmony_ci let mut slots = [None, None]; 974c67d6573Sopenharmony_ci if self.exec_nfa( 975c67d6573Sopenharmony_ci ty, 976c67d6573Sopenharmony_ci &mut [false], 977c67d6573Sopenharmony_ci &mut slots, 978c67d6573Sopenharmony_ci false, 979c67d6573Sopenharmony_ci false, 980c67d6573Sopenharmony_ci text, 981c67d6573Sopenharmony_ci start, 982c67d6573Sopenharmony_ci text.len(), 983c67d6573Sopenharmony_ci ) { 984c67d6573Sopenharmony_ci match (slots[0], slots[1]) { 985c67d6573Sopenharmony_ci (Some(s), Some(e)) => Some((s, e)), 986c67d6573Sopenharmony_ci _ => None, 987c67d6573Sopenharmony_ci } 988c67d6573Sopenharmony_ci } else { 989c67d6573Sopenharmony_ci None 990c67d6573Sopenharmony_ci } 991c67d6573Sopenharmony_ci } 992c67d6573Sopenharmony_ci 993c67d6573Sopenharmony_ci /// Like find_nfa, but fills in captures. 994c67d6573Sopenharmony_ci /// 995c67d6573Sopenharmony_ci /// `slots` should have length equal to `2 * nfa.captures.len()`. 996c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 997c67d6573Sopenharmony_ci fn captures_nfa( 998c67d6573Sopenharmony_ci &self, 999c67d6573Sopenharmony_ci slots: &mut [Slot], 1000c67d6573Sopenharmony_ci text: &[u8], 1001c67d6573Sopenharmony_ci start: usize, 1002c67d6573Sopenharmony_ci ) -> Option<(usize, usize)> { 1003c67d6573Sopenharmony_ci self.captures_nfa_type( 1004c67d6573Sopenharmony_ci MatchNfaType::Auto, 1005c67d6573Sopenharmony_ci slots, 1006c67d6573Sopenharmony_ci text, 1007c67d6573Sopenharmony_ci start, 1008c67d6573Sopenharmony_ci text.len(), 1009c67d6573Sopenharmony_ci ) 1010c67d6573Sopenharmony_ci } 1011c67d6573Sopenharmony_ci 1012c67d6573Sopenharmony_ci /// Like captures_nfa, but allows specification of type of NFA engine. 1013c67d6573Sopenharmony_ci fn captures_nfa_type( 1014c67d6573Sopenharmony_ci &self, 1015c67d6573Sopenharmony_ci ty: MatchNfaType, 1016c67d6573Sopenharmony_ci slots: &mut [Slot], 1017c67d6573Sopenharmony_ci text: &[u8], 1018c67d6573Sopenharmony_ci start: usize, 1019c67d6573Sopenharmony_ci end: usize, 1020c67d6573Sopenharmony_ci ) -> Option<(usize, usize)> { 1021c67d6573Sopenharmony_ci if self.exec_nfa( 1022c67d6573Sopenharmony_ci ty, 1023c67d6573Sopenharmony_ci &mut [false], 1024c67d6573Sopenharmony_ci slots, 1025c67d6573Sopenharmony_ci false, 1026c67d6573Sopenharmony_ci false, 1027c67d6573Sopenharmony_ci text, 1028c67d6573Sopenharmony_ci start, 1029c67d6573Sopenharmony_ci end, 1030c67d6573Sopenharmony_ci ) { 1031c67d6573Sopenharmony_ci match (slots[0], slots[1]) { 1032c67d6573Sopenharmony_ci (Some(s), Some(e)) => Some((s, e)), 1033c67d6573Sopenharmony_ci _ => None, 1034c67d6573Sopenharmony_ci } 1035c67d6573Sopenharmony_ci } else { 1036c67d6573Sopenharmony_ci None 1037c67d6573Sopenharmony_ci } 1038c67d6573Sopenharmony_ci } 1039c67d6573Sopenharmony_ci 1040c67d6573Sopenharmony_ci fn exec_nfa( 1041c67d6573Sopenharmony_ci &self, 1042c67d6573Sopenharmony_ci mut ty: MatchNfaType, 1043c67d6573Sopenharmony_ci matches: &mut [bool], 1044c67d6573Sopenharmony_ci slots: &mut [Slot], 1045c67d6573Sopenharmony_ci quit_after_match: bool, 1046c67d6573Sopenharmony_ci quit_after_match_with_pos: bool, 1047c67d6573Sopenharmony_ci text: &[u8], 1048c67d6573Sopenharmony_ci start: usize, 1049c67d6573Sopenharmony_ci end: usize, 1050c67d6573Sopenharmony_ci ) -> bool { 1051c67d6573Sopenharmony_ci use self::MatchNfaType::*; 1052c67d6573Sopenharmony_ci if let Auto = ty { 1053c67d6573Sopenharmony_ci if backtrack::should_exec(self.ro.nfa.len(), text.len()) { 1054c67d6573Sopenharmony_ci ty = Backtrack; 1055c67d6573Sopenharmony_ci } else { 1056c67d6573Sopenharmony_ci ty = PikeVM; 1057c67d6573Sopenharmony_ci } 1058c67d6573Sopenharmony_ci } 1059c67d6573Sopenharmony_ci // The backtracker can't return the shortest match position as it is 1060c67d6573Sopenharmony_ci // implemented today. So if someone calls `shortest_match` and we need 1061c67d6573Sopenharmony_ci // to run an NFA, then use the PikeVM. 1062c67d6573Sopenharmony_ci if quit_after_match_with_pos || ty == PikeVM { 1063c67d6573Sopenharmony_ci self.exec_pikevm( 1064c67d6573Sopenharmony_ci matches, 1065c67d6573Sopenharmony_ci slots, 1066c67d6573Sopenharmony_ci quit_after_match, 1067c67d6573Sopenharmony_ci text, 1068c67d6573Sopenharmony_ci start, 1069c67d6573Sopenharmony_ci end, 1070c67d6573Sopenharmony_ci ) 1071c67d6573Sopenharmony_ci } else { 1072c67d6573Sopenharmony_ci self.exec_backtrack(matches, slots, text, start, end) 1073c67d6573Sopenharmony_ci } 1074c67d6573Sopenharmony_ci } 1075c67d6573Sopenharmony_ci 1076c67d6573Sopenharmony_ci /// Always run the NFA algorithm. 1077c67d6573Sopenharmony_ci fn exec_pikevm( 1078c67d6573Sopenharmony_ci &self, 1079c67d6573Sopenharmony_ci matches: &mut [bool], 1080c67d6573Sopenharmony_ci slots: &mut [Slot], 1081c67d6573Sopenharmony_ci quit_after_match: bool, 1082c67d6573Sopenharmony_ci text: &[u8], 1083c67d6573Sopenharmony_ci start: usize, 1084c67d6573Sopenharmony_ci end: usize, 1085c67d6573Sopenharmony_ci ) -> bool { 1086c67d6573Sopenharmony_ci if self.ro.nfa.uses_bytes() { 1087c67d6573Sopenharmony_ci pikevm::Fsm::exec( 1088c67d6573Sopenharmony_ci &self.ro.nfa, 1089c67d6573Sopenharmony_ci self.cache.value(), 1090c67d6573Sopenharmony_ci matches, 1091c67d6573Sopenharmony_ci slots, 1092c67d6573Sopenharmony_ci quit_after_match, 1093c67d6573Sopenharmony_ci ByteInput::new(text, self.ro.nfa.only_utf8), 1094c67d6573Sopenharmony_ci start, 1095c67d6573Sopenharmony_ci end, 1096c67d6573Sopenharmony_ci ) 1097c67d6573Sopenharmony_ci } else { 1098c67d6573Sopenharmony_ci pikevm::Fsm::exec( 1099c67d6573Sopenharmony_ci &self.ro.nfa, 1100c67d6573Sopenharmony_ci self.cache.value(), 1101c67d6573Sopenharmony_ci matches, 1102c67d6573Sopenharmony_ci slots, 1103c67d6573Sopenharmony_ci quit_after_match, 1104c67d6573Sopenharmony_ci CharInput::new(text), 1105c67d6573Sopenharmony_ci start, 1106c67d6573Sopenharmony_ci end, 1107c67d6573Sopenharmony_ci ) 1108c67d6573Sopenharmony_ci } 1109c67d6573Sopenharmony_ci } 1110c67d6573Sopenharmony_ci 1111c67d6573Sopenharmony_ci /// Always runs the NFA using bounded backtracking. 1112c67d6573Sopenharmony_ci fn exec_backtrack( 1113c67d6573Sopenharmony_ci &self, 1114c67d6573Sopenharmony_ci matches: &mut [bool], 1115c67d6573Sopenharmony_ci slots: &mut [Slot], 1116c67d6573Sopenharmony_ci text: &[u8], 1117c67d6573Sopenharmony_ci start: usize, 1118c67d6573Sopenharmony_ci end: usize, 1119c67d6573Sopenharmony_ci ) -> bool { 1120c67d6573Sopenharmony_ci if self.ro.nfa.uses_bytes() { 1121c67d6573Sopenharmony_ci backtrack::Bounded::exec( 1122c67d6573Sopenharmony_ci &self.ro.nfa, 1123c67d6573Sopenharmony_ci self.cache.value(), 1124c67d6573Sopenharmony_ci matches, 1125c67d6573Sopenharmony_ci slots, 1126c67d6573Sopenharmony_ci ByteInput::new(text, self.ro.nfa.only_utf8), 1127c67d6573Sopenharmony_ci start, 1128c67d6573Sopenharmony_ci end, 1129c67d6573Sopenharmony_ci ) 1130c67d6573Sopenharmony_ci } else { 1131c67d6573Sopenharmony_ci backtrack::Bounded::exec( 1132c67d6573Sopenharmony_ci &self.ro.nfa, 1133c67d6573Sopenharmony_ci self.cache.value(), 1134c67d6573Sopenharmony_ci matches, 1135c67d6573Sopenharmony_ci slots, 1136c67d6573Sopenharmony_ci CharInput::new(text), 1137c67d6573Sopenharmony_ci start, 1138c67d6573Sopenharmony_ci end, 1139c67d6573Sopenharmony_ci ) 1140c67d6573Sopenharmony_ci } 1141c67d6573Sopenharmony_ci } 1142c67d6573Sopenharmony_ci 1143c67d6573Sopenharmony_ci /// Finds which regular expressions match the given text. 1144c67d6573Sopenharmony_ci /// 1145c67d6573Sopenharmony_ci /// `matches` should have length equal to the number of regexes being 1146c67d6573Sopenharmony_ci /// searched. 1147c67d6573Sopenharmony_ci /// 1148c67d6573Sopenharmony_ci /// This is only useful when one wants to know which regexes in a set 1149c67d6573Sopenharmony_ci /// match some text. 1150c67d6573Sopenharmony_ci pub fn many_matches_at( 1151c67d6573Sopenharmony_ci &self, 1152c67d6573Sopenharmony_ci matches: &mut [bool], 1153c67d6573Sopenharmony_ci text: &[u8], 1154c67d6573Sopenharmony_ci start: usize, 1155c67d6573Sopenharmony_ci ) -> bool { 1156c67d6573Sopenharmony_ci use self::MatchType::*; 1157c67d6573Sopenharmony_ci if !self.is_anchor_end_match(text) { 1158c67d6573Sopenharmony_ci return false; 1159c67d6573Sopenharmony_ci } 1160c67d6573Sopenharmony_ci match self.ro.match_type { 1161c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 1162c67d6573Sopenharmony_ci Literal(ty) => { 1163c67d6573Sopenharmony_ci debug_assert_eq!(matches.len(), 1); 1164c67d6573Sopenharmony_ci matches[0] = self.find_literals(ty, text, start).is_some(); 1165c67d6573Sopenharmony_ci matches[0] 1166c67d6573Sopenharmony_ci } 1167c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1168c67d6573Sopenharmony_ci Dfa | DfaAnchoredReverse | DfaMany => { 1169c67d6573Sopenharmony_ci match dfa::Fsm::forward_many( 1170c67d6573Sopenharmony_ci &self.ro.dfa, 1171c67d6573Sopenharmony_ci self.cache.value(), 1172c67d6573Sopenharmony_ci matches, 1173c67d6573Sopenharmony_ci text, 1174c67d6573Sopenharmony_ci start, 1175c67d6573Sopenharmony_ci ) { 1176c67d6573Sopenharmony_ci dfa::Result::Match(_) => true, 1177c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => false, 1178c67d6573Sopenharmony_ci dfa::Result::Quit => self.exec_nfa( 1179c67d6573Sopenharmony_ci MatchNfaType::Auto, 1180c67d6573Sopenharmony_ci matches, 1181c67d6573Sopenharmony_ci &mut [], 1182c67d6573Sopenharmony_ci false, 1183c67d6573Sopenharmony_ci false, 1184c67d6573Sopenharmony_ci text, 1185c67d6573Sopenharmony_ci start, 1186c67d6573Sopenharmony_ci text.len(), 1187c67d6573Sopenharmony_ci ), 1188c67d6573Sopenharmony_ci } 1189c67d6573Sopenharmony_ci } 1190c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1191c67d6573Sopenharmony_ci DfaSuffix => { 1192c67d6573Sopenharmony_ci match dfa::Fsm::forward_many( 1193c67d6573Sopenharmony_ci &self.ro.dfa, 1194c67d6573Sopenharmony_ci self.cache.value(), 1195c67d6573Sopenharmony_ci matches, 1196c67d6573Sopenharmony_ci text, 1197c67d6573Sopenharmony_ci start, 1198c67d6573Sopenharmony_ci ) { 1199c67d6573Sopenharmony_ci dfa::Result::Match(_) => true, 1200c67d6573Sopenharmony_ci dfa::Result::NoMatch(_) => false, 1201c67d6573Sopenharmony_ci dfa::Result::Quit => self.exec_nfa( 1202c67d6573Sopenharmony_ci MatchNfaType::Auto, 1203c67d6573Sopenharmony_ci matches, 1204c67d6573Sopenharmony_ci &mut [], 1205c67d6573Sopenharmony_ci false, 1206c67d6573Sopenharmony_ci false, 1207c67d6573Sopenharmony_ci text, 1208c67d6573Sopenharmony_ci start, 1209c67d6573Sopenharmony_ci text.len(), 1210c67d6573Sopenharmony_ci ), 1211c67d6573Sopenharmony_ci } 1212c67d6573Sopenharmony_ci } 1213c67d6573Sopenharmony_ci Nfa(ty) => self.exec_nfa( 1214c67d6573Sopenharmony_ci ty, 1215c67d6573Sopenharmony_ci matches, 1216c67d6573Sopenharmony_ci &mut [], 1217c67d6573Sopenharmony_ci false, 1218c67d6573Sopenharmony_ci false, 1219c67d6573Sopenharmony_ci text, 1220c67d6573Sopenharmony_ci start, 1221c67d6573Sopenharmony_ci text.len(), 1222c67d6573Sopenharmony_ci ), 1223c67d6573Sopenharmony_ci Nothing => false, 1224c67d6573Sopenharmony_ci } 1225c67d6573Sopenharmony_ci } 1226c67d6573Sopenharmony_ci 1227c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 1228c67d6573Sopenharmony_ci fn is_anchor_end_match(&self, text: &[u8]) -> bool { 1229c67d6573Sopenharmony_ci #[cfg(not(feature = "perf-literal"))] 1230c67d6573Sopenharmony_ci fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { 1231c67d6573Sopenharmony_ci true 1232c67d6573Sopenharmony_ci } 1233c67d6573Sopenharmony_ci 1234c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 1235c67d6573Sopenharmony_ci fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { 1236c67d6573Sopenharmony_ci // Only do this check if the haystack is big (>1MB). 1237c67d6573Sopenharmony_ci if text.len() > (1 << 20) && ro.nfa.is_anchored_end { 1238c67d6573Sopenharmony_ci let lcs = ro.suffixes.lcs(); 1239c67d6573Sopenharmony_ci if lcs.len() >= 1 && !lcs.is_suffix(text) { 1240c67d6573Sopenharmony_ci return false; 1241c67d6573Sopenharmony_ci } 1242c67d6573Sopenharmony_ci } 1243c67d6573Sopenharmony_ci true 1244c67d6573Sopenharmony_ci } 1245c67d6573Sopenharmony_ci 1246c67d6573Sopenharmony_ci imp(&self.ro, text) 1247c67d6573Sopenharmony_ci } 1248c67d6573Sopenharmony_ci 1249c67d6573Sopenharmony_ci pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1250c67d6573Sopenharmony_ci &self.ro.nfa.capture_name_idx 1251c67d6573Sopenharmony_ci } 1252c67d6573Sopenharmony_ci} 1253c67d6573Sopenharmony_ci 1254c67d6573Sopenharmony_ciimpl<'c> ExecNoSyncStr<'c> { 1255c67d6573Sopenharmony_ci pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1256c67d6573Sopenharmony_ci self.0.capture_name_idx() 1257c67d6573Sopenharmony_ci } 1258c67d6573Sopenharmony_ci} 1259c67d6573Sopenharmony_ci 1260c67d6573Sopenharmony_ciimpl Exec { 1261c67d6573Sopenharmony_ci /// Get a searcher that isn't Sync. 1262c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 1263c67d6573Sopenharmony_ci pub fn searcher(&self) -> ExecNoSync<'_> { 1264c67d6573Sopenharmony_ci ExecNoSync { 1265c67d6573Sopenharmony_ci ro: &self.ro, // a clone is too expensive here! (and not needed) 1266c67d6573Sopenharmony_ci cache: self.pool.get(), 1267c67d6573Sopenharmony_ci } 1268c67d6573Sopenharmony_ci } 1269c67d6573Sopenharmony_ci 1270c67d6573Sopenharmony_ci /// Get a searcher that isn't Sync and can match on &str. 1271c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 1272c67d6573Sopenharmony_ci pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { 1273c67d6573Sopenharmony_ci ExecNoSyncStr(self.searcher()) 1274c67d6573Sopenharmony_ci } 1275c67d6573Sopenharmony_ci 1276c67d6573Sopenharmony_ci /// Build a Regex from this executor. 1277c67d6573Sopenharmony_ci pub fn into_regex(self) -> re_unicode::Regex { 1278c67d6573Sopenharmony_ci re_unicode::Regex::from(self) 1279c67d6573Sopenharmony_ci } 1280c67d6573Sopenharmony_ci 1281c67d6573Sopenharmony_ci /// Build a RegexSet from this executor. 1282c67d6573Sopenharmony_ci pub fn into_regex_set(self) -> re_set::unicode::RegexSet { 1283c67d6573Sopenharmony_ci re_set::unicode::RegexSet::from(self) 1284c67d6573Sopenharmony_ci } 1285c67d6573Sopenharmony_ci 1286c67d6573Sopenharmony_ci /// Build a Regex from this executor that can match arbitrary bytes. 1287c67d6573Sopenharmony_ci pub fn into_byte_regex(self) -> re_bytes::Regex { 1288c67d6573Sopenharmony_ci re_bytes::Regex::from(self) 1289c67d6573Sopenharmony_ci } 1290c67d6573Sopenharmony_ci 1291c67d6573Sopenharmony_ci /// Build a RegexSet from this executor that can match arbitrary bytes. 1292c67d6573Sopenharmony_ci pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { 1293c67d6573Sopenharmony_ci re_set::bytes::RegexSet::from(self) 1294c67d6573Sopenharmony_ci } 1295c67d6573Sopenharmony_ci 1296c67d6573Sopenharmony_ci /// The original regular expressions given by the caller that were 1297c67d6573Sopenharmony_ci /// compiled. 1298c67d6573Sopenharmony_ci pub fn regex_strings(&self) -> &[String] { 1299c67d6573Sopenharmony_ci &self.ro.res 1300c67d6573Sopenharmony_ci } 1301c67d6573Sopenharmony_ci 1302c67d6573Sopenharmony_ci /// Return a slice of capture names. 1303c67d6573Sopenharmony_ci /// 1304c67d6573Sopenharmony_ci /// Any capture that isn't named is None. 1305c67d6573Sopenharmony_ci pub fn capture_names(&self) -> &[Option<String>] { 1306c67d6573Sopenharmony_ci &self.ro.nfa.captures 1307c67d6573Sopenharmony_ci } 1308c67d6573Sopenharmony_ci 1309c67d6573Sopenharmony_ci /// Return a reference to named groups mapping (from group name to 1310c67d6573Sopenharmony_ci /// group position). 1311c67d6573Sopenharmony_ci pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1312c67d6573Sopenharmony_ci &self.ro.nfa.capture_name_idx 1313c67d6573Sopenharmony_ci } 1314c67d6573Sopenharmony_ci} 1315c67d6573Sopenharmony_ci 1316c67d6573Sopenharmony_ciimpl Clone for Exec { 1317c67d6573Sopenharmony_ci fn clone(&self) -> Exec { 1318c67d6573Sopenharmony_ci let pool = ExecReadOnly::new_pool(&self.ro); 1319c67d6573Sopenharmony_ci Exec { ro: self.ro.clone(), pool } 1320c67d6573Sopenharmony_ci } 1321c67d6573Sopenharmony_ci} 1322c67d6573Sopenharmony_ci 1323c67d6573Sopenharmony_ciimpl ExecReadOnly { 1324c67d6573Sopenharmony_ci fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { 1325c67d6573Sopenharmony_ci if let Some(MatchType::Nfa(_)) = hint { 1326c67d6573Sopenharmony_ci return hint.unwrap(); 1327c67d6573Sopenharmony_ci } 1328c67d6573Sopenharmony_ci // If the NFA is empty, then we'll never match anything. 1329c67d6573Sopenharmony_ci if self.nfa.insts.is_empty() { 1330c67d6573Sopenharmony_ci return MatchType::Nothing; 1331c67d6573Sopenharmony_ci } 1332c67d6573Sopenharmony_ci if let Some(literalty) = self.choose_literal_match_type() { 1333c67d6573Sopenharmony_ci return literalty; 1334c67d6573Sopenharmony_ci } 1335c67d6573Sopenharmony_ci if let Some(dfaty) = self.choose_dfa_match_type() { 1336c67d6573Sopenharmony_ci return dfaty; 1337c67d6573Sopenharmony_ci } 1338c67d6573Sopenharmony_ci // We're so totally hosed. 1339c67d6573Sopenharmony_ci MatchType::Nfa(MatchNfaType::Auto) 1340c67d6573Sopenharmony_ci } 1341c67d6573Sopenharmony_ci 1342c67d6573Sopenharmony_ci /// If a plain literal scan can be used, then a corresponding literal 1343c67d6573Sopenharmony_ci /// search type is returned. 1344c67d6573Sopenharmony_ci fn choose_literal_match_type(&self) -> Option<MatchType> { 1345c67d6573Sopenharmony_ci #[cfg(not(feature = "perf-literal"))] 1346c67d6573Sopenharmony_ci fn imp(_: &ExecReadOnly) -> Option<MatchType> { 1347c67d6573Sopenharmony_ci None 1348c67d6573Sopenharmony_ci } 1349c67d6573Sopenharmony_ci 1350c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 1351c67d6573Sopenharmony_ci fn imp(ro: &ExecReadOnly) -> Option<MatchType> { 1352c67d6573Sopenharmony_ci // If our set of prefixes is complete, then we can use it to find 1353c67d6573Sopenharmony_ci // a match in lieu of a regex engine. This doesn't quite work well 1354c67d6573Sopenharmony_ci // in the presence of multiple regexes, so only do it when there's 1355c67d6573Sopenharmony_ci // one. 1356c67d6573Sopenharmony_ci // 1357c67d6573Sopenharmony_ci // TODO(burntsushi): Also, don't try to match literals if the regex 1358c67d6573Sopenharmony_ci // is partially anchored. We could technically do it, but we'd need 1359c67d6573Sopenharmony_ci // to create two sets of literals: all of them and then the subset 1360c67d6573Sopenharmony_ci // that aren't anchored. We would then only search for all of them 1361c67d6573Sopenharmony_ci // when at the beginning of the input and use the subset in all 1362c67d6573Sopenharmony_ci // other cases. 1363c67d6573Sopenharmony_ci if ro.res.len() != 1 { 1364c67d6573Sopenharmony_ci return None; 1365c67d6573Sopenharmony_ci } 1366c67d6573Sopenharmony_ci if ro.ac.is_some() { 1367c67d6573Sopenharmony_ci return Some(MatchType::Literal( 1368c67d6573Sopenharmony_ci MatchLiteralType::AhoCorasick, 1369c67d6573Sopenharmony_ci )); 1370c67d6573Sopenharmony_ci } 1371c67d6573Sopenharmony_ci if ro.nfa.prefixes.complete() { 1372c67d6573Sopenharmony_ci return if ro.nfa.is_anchored_start { 1373c67d6573Sopenharmony_ci Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) 1374c67d6573Sopenharmony_ci } else { 1375c67d6573Sopenharmony_ci Some(MatchType::Literal(MatchLiteralType::Unanchored)) 1376c67d6573Sopenharmony_ci }; 1377c67d6573Sopenharmony_ci } 1378c67d6573Sopenharmony_ci if ro.suffixes.complete() { 1379c67d6573Sopenharmony_ci return if ro.nfa.is_anchored_end { 1380c67d6573Sopenharmony_ci Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) 1381c67d6573Sopenharmony_ci } else { 1382c67d6573Sopenharmony_ci // This case shouldn't happen. When the regex isn't 1383c67d6573Sopenharmony_ci // anchored, then complete prefixes should imply complete 1384c67d6573Sopenharmony_ci // suffixes. 1385c67d6573Sopenharmony_ci Some(MatchType::Literal(MatchLiteralType::Unanchored)) 1386c67d6573Sopenharmony_ci }; 1387c67d6573Sopenharmony_ci } 1388c67d6573Sopenharmony_ci None 1389c67d6573Sopenharmony_ci } 1390c67d6573Sopenharmony_ci 1391c67d6573Sopenharmony_ci imp(self) 1392c67d6573Sopenharmony_ci } 1393c67d6573Sopenharmony_ci 1394c67d6573Sopenharmony_ci /// If a DFA scan can be used, then choose the appropriate DFA strategy. 1395c67d6573Sopenharmony_ci fn choose_dfa_match_type(&self) -> Option<MatchType> { 1396c67d6573Sopenharmony_ci #[cfg(not(feature = "perf-dfa"))] 1397c67d6573Sopenharmony_ci fn imp(_: &ExecReadOnly) -> Option<MatchType> { 1398c67d6573Sopenharmony_ci None 1399c67d6573Sopenharmony_ci } 1400c67d6573Sopenharmony_ci 1401c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1402c67d6573Sopenharmony_ci fn imp(ro: &ExecReadOnly) -> Option<MatchType> { 1403c67d6573Sopenharmony_ci if !dfa::can_exec(&ro.dfa) { 1404c67d6573Sopenharmony_ci return None; 1405c67d6573Sopenharmony_ci } 1406c67d6573Sopenharmony_ci // Regex sets require a slightly specialized path. 1407c67d6573Sopenharmony_ci if ro.res.len() >= 2 { 1408c67d6573Sopenharmony_ci return Some(MatchType::DfaMany); 1409c67d6573Sopenharmony_ci } 1410c67d6573Sopenharmony_ci // If the regex is anchored at the end but not the start, then 1411c67d6573Sopenharmony_ci // just match in reverse from the end of the haystack. 1412c67d6573Sopenharmony_ci if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { 1413c67d6573Sopenharmony_ci return Some(MatchType::DfaAnchoredReverse); 1414c67d6573Sopenharmony_ci } 1415c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 1416c67d6573Sopenharmony_ci { 1417c67d6573Sopenharmony_ci // If there's a longish suffix literal, then it might be faster 1418c67d6573Sopenharmony_ci // to look for that first. 1419c67d6573Sopenharmony_ci if ro.should_suffix_scan() { 1420c67d6573Sopenharmony_ci return Some(MatchType::DfaSuffix); 1421c67d6573Sopenharmony_ci } 1422c67d6573Sopenharmony_ci } 1423c67d6573Sopenharmony_ci // Fall back to your garden variety forward searching lazy DFA. 1424c67d6573Sopenharmony_ci Some(MatchType::Dfa) 1425c67d6573Sopenharmony_ci } 1426c67d6573Sopenharmony_ci 1427c67d6573Sopenharmony_ci imp(self) 1428c67d6573Sopenharmony_ci } 1429c67d6573Sopenharmony_ci 1430c67d6573Sopenharmony_ci /// Returns true if the program is amenable to suffix scanning. 1431c67d6573Sopenharmony_ci /// 1432c67d6573Sopenharmony_ci /// When this is true, as a heuristic, we assume it is OK to quickly scan 1433c67d6573Sopenharmony_ci /// for suffix literals and then do a *reverse* DFA match from any matches 1434c67d6573Sopenharmony_ci /// produced by the literal scan. (And then followed by a forward DFA 1435c67d6573Sopenharmony_ci /// search, since the previously found suffix literal maybe not actually be 1436c67d6573Sopenharmony_ci /// the end of a match.) 1437c67d6573Sopenharmony_ci /// 1438c67d6573Sopenharmony_ci /// This is a bit of a specialized optimization, but can result in pretty 1439c67d6573Sopenharmony_ci /// big performance wins if 1) there are no prefix literals and 2) the 1440c67d6573Sopenharmony_ci /// suffix literals are pretty rare in the text. (1) is obviously easy to 1441c67d6573Sopenharmony_ci /// account for but (2) is harder. As a proxy, we assume that longer 1442c67d6573Sopenharmony_ci /// strings are generally rarer, so we only enable this optimization when 1443c67d6573Sopenharmony_ci /// we have a meaty suffix. 1444c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1445c67d6573Sopenharmony_ci fn should_suffix_scan(&self) -> bool { 1446c67d6573Sopenharmony_ci if self.suffixes.is_empty() { 1447c67d6573Sopenharmony_ci return false; 1448c67d6573Sopenharmony_ci } 1449c67d6573Sopenharmony_ci let lcs_len = self.suffixes.lcs().char_len(); 1450c67d6573Sopenharmony_ci lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() 1451c67d6573Sopenharmony_ci } 1452c67d6573Sopenharmony_ci 1453c67d6573Sopenharmony_ci fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { 1454c67d6573Sopenharmony_ci let ro = ro.clone(); 1455c67d6573Sopenharmony_ci Box::new(Pool::new(Box::new(move || { 1456c67d6573Sopenharmony_ci AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) 1457c67d6573Sopenharmony_ci }))) 1458c67d6573Sopenharmony_ci } 1459c67d6573Sopenharmony_ci} 1460c67d6573Sopenharmony_ci 1461c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug)] 1462c67d6573Sopenharmony_cienum MatchType { 1463c67d6573Sopenharmony_ci /// A single or multiple literal search. This is only used when the regex 1464c67d6573Sopenharmony_ci /// can be decomposed into a literal search. 1465c67d6573Sopenharmony_ci #[cfg(feature = "perf-literal")] 1466c67d6573Sopenharmony_ci Literal(MatchLiteralType), 1467c67d6573Sopenharmony_ci /// A normal DFA search. 1468c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1469c67d6573Sopenharmony_ci Dfa, 1470c67d6573Sopenharmony_ci /// A reverse DFA search starting from the end of a haystack. 1471c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1472c67d6573Sopenharmony_ci DfaAnchoredReverse, 1473c67d6573Sopenharmony_ci /// A reverse DFA search with suffix literal scanning. 1474c67d6573Sopenharmony_ci #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1475c67d6573Sopenharmony_ci DfaSuffix, 1476c67d6573Sopenharmony_ci /// Use the DFA on two or more regular expressions. 1477c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1478c67d6573Sopenharmony_ci DfaMany, 1479c67d6573Sopenharmony_ci /// An NFA variant. 1480c67d6573Sopenharmony_ci Nfa(MatchNfaType), 1481c67d6573Sopenharmony_ci /// No match is ever possible, so don't ever try to search. 1482c67d6573Sopenharmony_ci Nothing, 1483c67d6573Sopenharmony_ci} 1484c67d6573Sopenharmony_ci 1485c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug)] 1486c67d6573Sopenharmony_ci#[cfg(feature = "perf-literal")] 1487c67d6573Sopenharmony_cienum MatchLiteralType { 1488c67d6573Sopenharmony_ci /// Match literals anywhere in text. 1489c67d6573Sopenharmony_ci Unanchored, 1490c67d6573Sopenharmony_ci /// Match literals only at the start of text. 1491c67d6573Sopenharmony_ci AnchoredStart, 1492c67d6573Sopenharmony_ci /// Match literals only at the end of text. 1493c67d6573Sopenharmony_ci AnchoredEnd, 1494c67d6573Sopenharmony_ci /// Use an Aho-Corasick automaton. This requires `ac` to be Some on 1495c67d6573Sopenharmony_ci /// ExecReadOnly. 1496c67d6573Sopenharmony_ci AhoCorasick, 1497c67d6573Sopenharmony_ci} 1498c67d6573Sopenharmony_ci 1499c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, Eq, PartialEq)] 1500c67d6573Sopenharmony_cienum MatchNfaType { 1501c67d6573Sopenharmony_ci /// Choose between Backtrack and PikeVM. 1502c67d6573Sopenharmony_ci Auto, 1503c67d6573Sopenharmony_ci /// NFA bounded backtracking. 1504c67d6573Sopenharmony_ci /// 1505c67d6573Sopenharmony_ci /// (This is only set by tests, since it never makes sense to always want 1506c67d6573Sopenharmony_ci /// backtracking.) 1507c67d6573Sopenharmony_ci Backtrack, 1508c67d6573Sopenharmony_ci /// The Pike VM. 1509c67d6573Sopenharmony_ci /// 1510c67d6573Sopenharmony_ci /// (This is only set by tests, since it never makes sense to always want 1511c67d6573Sopenharmony_ci /// the Pike VM.) 1512c67d6573Sopenharmony_ci PikeVM, 1513c67d6573Sopenharmony_ci} 1514c67d6573Sopenharmony_ci 1515c67d6573Sopenharmony_ci/// `ProgramCache` maintains reusable allocations for each matching engine 1516c67d6573Sopenharmony_ci/// available to a particular program. 1517c67d6573Sopenharmony_ci/// 1518c67d6573Sopenharmony_ci/// We declare this as unwind safe since it's a cache that's only used for 1519c67d6573Sopenharmony_ci/// performance purposes. If a panic occurs, it is (or should be) always safe 1520c67d6573Sopenharmony_ci/// to continue using the same regex object. 1521c67d6573Sopenharmony_cipub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>; 1522c67d6573Sopenharmony_ci 1523c67d6573Sopenharmony_ci#[derive(Debug)] 1524c67d6573Sopenharmony_cipub struct ProgramCacheInner { 1525c67d6573Sopenharmony_ci pub pikevm: pikevm::Cache, 1526c67d6573Sopenharmony_ci pub backtrack: backtrack::Cache, 1527c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1528c67d6573Sopenharmony_ci pub dfa: dfa::Cache, 1529c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1530c67d6573Sopenharmony_ci pub dfa_reverse: dfa::Cache, 1531c67d6573Sopenharmony_ci} 1532c67d6573Sopenharmony_ci 1533c67d6573Sopenharmony_ciimpl ProgramCacheInner { 1534c67d6573Sopenharmony_ci fn new(ro: &ExecReadOnly) -> Self { 1535c67d6573Sopenharmony_ci ProgramCacheInner { 1536c67d6573Sopenharmony_ci pikevm: pikevm::Cache::new(&ro.nfa), 1537c67d6573Sopenharmony_ci backtrack: backtrack::Cache::new(&ro.nfa), 1538c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1539c67d6573Sopenharmony_ci dfa: dfa::Cache::new(&ro.dfa), 1540c67d6573Sopenharmony_ci #[cfg(feature = "perf-dfa")] 1541c67d6573Sopenharmony_ci dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), 1542c67d6573Sopenharmony_ci } 1543c67d6573Sopenharmony_ci } 1544c67d6573Sopenharmony_ci} 1545c67d6573Sopenharmony_ci 1546c67d6573Sopenharmony_ci/// Alternation literals checks if the given HIR is a simple alternation of 1547c67d6573Sopenharmony_ci/// literals, and if so, returns them. Otherwise, this returns None. 1548c67d6573Sopenharmony_ci#[cfg(feature = "perf-literal")] 1549c67d6573Sopenharmony_cifn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { 1550c67d6573Sopenharmony_ci use regex_syntax::hir::{HirKind, Literal}; 1551c67d6573Sopenharmony_ci 1552c67d6573Sopenharmony_ci // This is pretty hacky, but basically, if `is_alternation_literal` is 1553c67d6573Sopenharmony_ci // true, then we can make several assumptions about the structure of our 1554c67d6573Sopenharmony_ci // HIR. This is what justifies the `unreachable!` statements below. 1555c67d6573Sopenharmony_ci // 1556c67d6573Sopenharmony_ci // This code should be refactored once we overhaul this crate's 1557c67d6573Sopenharmony_ci // optimization pipeline, because this is a terribly inflexible way to go 1558c67d6573Sopenharmony_ci // about things. 1559c67d6573Sopenharmony_ci 1560c67d6573Sopenharmony_ci if !expr.is_alternation_literal() { 1561c67d6573Sopenharmony_ci return None; 1562c67d6573Sopenharmony_ci } 1563c67d6573Sopenharmony_ci let alts = match *expr.kind() { 1564c67d6573Sopenharmony_ci HirKind::Alternation(ref alts) => alts, 1565c67d6573Sopenharmony_ci _ => return None, // one literal isn't worth it 1566c67d6573Sopenharmony_ci }; 1567c67d6573Sopenharmony_ci 1568c67d6573Sopenharmony_ci let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { 1569c67d6573Sopenharmony_ci Literal::Unicode(c) => { 1570c67d6573Sopenharmony_ci let mut buf = [0; 4]; 1571c67d6573Sopenharmony_ci dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); 1572c67d6573Sopenharmony_ci } 1573c67d6573Sopenharmony_ci Literal::Byte(b) => { 1574c67d6573Sopenharmony_ci dst.push(b); 1575c67d6573Sopenharmony_ci } 1576c67d6573Sopenharmony_ci }; 1577c67d6573Sopenharmony_ci 1578c67d6573Sopenharmony_ci let mut lits = vec![]; 1579c67d6573Sopenharmony_ci for alt in alts { 1580c67d6573Sopenharmony_ci let mut lit = vec![]; 1581c67d6573Sopenharmony_ci match *alt.kind() { 1582c67d6573Sopenharmony_ci HirKind::Literal(ref x) => extendlit(x, &mut lit), 1583c67d6573Sopenharmony_ci HirKind::Concat(ref exprs) => { 1584c67d6573Sopenharmony_ci for e in exprs { 1585c67d6573Sopenharmony_ci match *e.kind() { 1586c67d6573Sopenharmony_ci HirKind::Literal(ref x) => extendlit(x, &mut lit), 1587c67d6573Sopenharmony_ci _ => unreachable!("expected literal, got {:?}", e), 1588c67d6573Sopenharmony_ci } 1589c67d6573Sopenharmony_ci } 1590c67d6573Sopenharmony_ci } 1591c67d6573Sopenharmony_ci _ => unreachable!("expected literal or concat, got {:?}", alt), 1592c67d6573Sopenharmony_ci } 1593c67d6573Sopenharmony_ci lits.push(lit); 1594c67d6573Sopenharmony_ci } 1595c67d6573Sopenharmony_ci Some(lits) 1596c67d6573Sopenharmony_ci} 1597c67d6573Sopenharmony_ci 1598c67d6573Sopenharmony_ci#[cfg(test)] 1599c67d6573Sopenharmony_cimod test { 1600c67d6573Sopenharmony_ci #[test] 1601c67d6573Sopenharmony_ci fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { 1602c67d6573Sopenharmony_ci use crate::internal::ExecBuilder; 1603c67d6573Sopenharmony_ci 1604c67d6573Sopenharmony_ci let backtrack_bytes_re = ExecBuilder::new("^S") 1605c67d6573Sopenharmony_ci .bounded_backtracking() 1606c67d6573Sopenharmony_ci .only_utf8(false) 1607c67d6573Sopenharmony_ci .build() 1608c67d6573Sopenharmony_ci .map(|exec| exec.into_byte_regex()) 1609c67d6573Sopenharmony_ci .map_err(|err| format!("{}", err)) 1610c67d6573Sopenharmony_ci .unwrap(); 1611c67d6573Sopenharmony_ci 1612c67d6573Sopenharmony_ci let default_bytes_re = ExecBuilder::new("^S") 1613c67d6573Sopenharmony_ci .only_utf8(false) 1614c67d6573Sopenharmony_ci .build() 1615c67d6573Sopenharmony_ci .map(|exec| exec.into_byte_regex()) 1616c67d6573Sopenharmony_ci .map_err(|err| format!("{}", err)) 1617c67d6573Sopenharmony_ci .unwrap(); 1618c67d6573Sopenharmony_ci 1619c67d6573Sopenharmony_ci let input = vec![83, 83]; 1620c67d6573Sopenharmony_ci 1621c67d6573Sopenharmony_ci let s1 = backtrack_bytes_re.split(&input); 1622c67d6573Sopenharmony_ci let s2 = default_bytes_re.split(&input); 1623c67d6573Sopenharmony_ci for (chunk1, chunk2) in s1.zip(s2) { 1624c67d6573Sopenharmony_ci assert_eq!(chunk1, chunk2); 1625c67d6573Sopenharmony_ci } 1626c67d6573Sopenharmony_ci } 1627c67d6573Sopenharmony_ci 1628c67d6573Sopenharmony_ci #[test] 1629c67d6573Sopenharmony_ci fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { 1630c67d6573Sopenharmony_ci use crate::internal::ExecBuilder; 1631c67d6573Sopenharmony_ci 1632c67d6573Sopenharmony_ci let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1633c67d6573Sopenharmony_ci .bounded_backtracking() 1634c67d6573Sopenharmony_ci .bytes(true) 1635c67d6573Sopenharmony_ci .build() 1636c67d6573Sopenharmony_ci .map(|exec| exec.into_regex()) 1637c67d6573Sopenharmony_ci .map_err(|err| format!("{}", err)) 1638c67d6573Sopenharmony_ci .unwrap(); 1639c67d6573Sopenharmony_ci 1640c67d6573Sopenharmony_ci let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1641c67d6573Sopenharmony_ci .bytes(true) 1642c67d6573Sopenharmony_ci .build() 1643c67d6573Sopenharmony_ci .map(|exec| exec.into_regex()) 1644c67d6573Sopenharmony_ci .map_err(|err| format!("{}", err)) 1645c67d6573Sopenharmony_ci .unwrap(); 1646c67d6573Sopenharmony_ci 1647c67d6573Sopenharmony_ci let input = "**"; 1648c67d6573Sopenharmony_ci 1649c67d6573Sopenharmony_ci let s1 = backtrack_bytes_re.split(input); 1650c67d6573Sopenharmony_ci let s2 = default_bytes_re.split(input); 1651c67d6573Sopenharmony_ci for (chunk1, chunk2) in s1.zip(s2) { 1652c67d6573Sopenharmony_ci assert_eq!(chunk1, chunk2); 1653c67d6573Sopenharmony_ci } 1654c67d6573Sopenharmony_ci } 1655c67d6573Sopenharmony_ci} 1656