1use std::cell::RefCell; 2use std::collections::HashMap; 3use std::panic::AssertUnwindSafe; 4use std::sync::Arc; 5 6#[cfg(feature = "perf-literal")] 7use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; 8use regex_syntax::hir::literal::Literals; 9use regex_syntax::hir::Hir; 10use regex_syntax::ParserBuilder; 11 12use crate::backtrack; 13use crate::compile::Compiler; 14#[cfg(feature = "perf-dfa")] 15use crate::dfa; 16use crate::error::Error; 17use crate::input::{ByteInput, CharInput}; 18use crate::literal::LiteralSearcher; 19use crate::pikevm; 20use crate::pool::{Pool, PoolGuard}; 21use crate::prog::Program; 22use crate::re_builder::RegexOptions; 23use crate::re_bytes; 24use crate::re_set; 25use crate::re_trait::{Locations, RegularExpression, Slot}; 26use crate::re_unicode; 27use crate::utf8::next_utf8; 28 29/// `Exec` manages the execution of a regular expression. 30/// 31/// In particular, this manages the various compiled forms of a single regular 32/// expression and the choice of which matching engine to use to execute a 33/// regular expression. 34#[derive(Debug)] 35pub struct Exec { 36 /// All read only state. 37 ro: Arc<ExecReadOnly>, 38 /// A pool of reusable values for the various matching engines. 39 /// 40 /// Note that boxing this value is not strictly necessary, but it is an 41 /// easy way to ensure that T does not bloat the stack sized used by a pool 42 /// in the case where T is big. And this turns out to be the case at the 43 /// time of writing for regex's use of this pool. At the time of writing, 44 /// the size of a Regex on the stack is 856 bytes. Boxing this value 45 /// reduces that size to 16 bytes. 46 pool: Box<Pool<ProgramCache>>, 47} 48 49/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This 50/// means it is no longer Sync, but we can now avoid the overhead of 51/// synchronization to fetch the cache. 52#[derive(Debug)] 53pub struct ExecNoSync<'c> { 54 /// All read only state. 55 ro: &'c Arc<ExecReadOnly>, 56 /// Caches for the various matching engines. 57 cache: PoolGuard<'c, ProgramCache>, 58} 59 60/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. 61#[derive(Debug)] 62pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); 63 64/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such 65/// state is determined at compile time and never changes during search. 66#[derive(Debug)] 67struct ExecReadOnly { 68 /// The original regular expressions given by the caller to compile. 69 res: Vec<String>, 70 /// A compiled program that is used in the NFA simulation and backtracking. 71 /// It can be byte-based or Unicode codepoint based. 72 /// 73 /// N.B. It is not possibly to make this byte-based from the public API. 74 /// It is only used for testing byte based programs in the NFA simulations. 75 nfa: Program, 76 /// A compiled byte based program for DFA execution. This is only used 77 /// if a DFA can be executed. (Currently, only word boundary assertions are 78 /// not supported.) Note that this program contains an embedded `.*?` 79 /// preceding the first capture group, unless the regex is anchored at the 80 /// beginning. 81 dfa: Program, 82 /// The same as above, except the program is reversed (and there is no 83 /// preceding `.*?`). This is used by the DFA to find the starting location 84 /// of matches. 85 dfa_reverse: Program, 86 /// A set of suffix literals extracted from the regex. 87 /// 88 /// Prefix literals are stored on the `Program`, since they are used inside 89 /// the matching engines. 90 suffixes: LiteralSearcher, 91 /// An Aho-Corasick automaton with leftmost-first match semantics. 92 /// 93 /// This is only set when the entire regex is a simple unanchored 94 /// alternation of literals. We could probably use it more circumstances, 95 /// but this is already hacky enough in this architecture. 96 /// 97 /// N.B. We use u32 as a state ID representation under the assumption that 98 /// if we were to exhaust the ID space, we probably would have long 99 /// surpassed the compilation size limit. 100 #[cfg(feature = "perf-literal")] 101 ac: Option<AhoCorasick<u32>>, 102 /// match_type encodes as much upfront knowledge about how we're going to 103 /// execute a search as possible. 104 match_type: MatchType, 105} 106 107/// Facilitates the construction of an executor by exposing various knobs 108/// to control how a regex is executed and what kinds of resources it's 109/// permitted to use. 110// `ExecBuilder` is only public via the `internal` module, so avoid deriving 111// `Debug`. 112#[allow(missing_debug_implementations)] 113pub struct ExecBuilder { 114 options: RegexOptions, 115 match_type: Option<MatchType>, 116 bytes: bool, 117 only_utf8: bool, 118} 119 120/// Parsed represents a set of parsed regular expressions and their detected 121/// literals. 122struct Parsed { 123 exprs: Vec<Hir>, 124 prefixes: Literals, 125 suffixes: Literals, 126 bytes: bool, 127} 128 129impl ExecBuilder { 130 /// Create a regex execution builder. 131 /// 132 /// This uses default settings for everything except the regex itself, 133 /// which must be provided. Further knobs can be set by calling methods, 134 /// and then finally, `build` to actually create the executor. 135 pub fn new(re: &str) -> Self { 136 Self::new_many(&[re]) 137 } 138 139 /// Like new, but compiles the union of the given regular expressions. 140 /// 141 /// Note that when compiling 2 or more regular expressions, capture groups 142 /// are completely unsupported. (This means both `find` and `captures` 143 /// won't work.) 144 pub fn new_many<I, S>(res: I) -> Self 145 where 146 S: AsRef<str>, 147 I: IntoIterator<Item = S>, 148 { 149 let mut opts = RegexOptions::default(); 150 opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); 151 Self::new_options(opts) 152 } 153 154 /// Create a regex execution builder. 155 pub fn new_options(opts: RegexOptions) -> Self { 156 ExecBuilder { 157 options: opts, 158 match_type: None, 159 bytes: false, 160 only_utf8: true, 161 } 162 } 163 164 /// Set the matching engine to be automatically determined. 165 /// 166 /// This is the default state and will apply whatever optimizations are 167 /// possible, such as running a DFA. 168 /// 169 /// This overrides whatever was previously set via the `nfa` or 170 /// `bounded_backtracking` methods. 171 pub fn automatic(mut self) -> Self { 172 self.match_type = None; 173 self 174 } 175 176 /// Sets the matching engine to use the NFA algorithm no matter what 177 /// optimizations are possible. 178 /// 179 /// This overrides whatever was previously set via the `automatic` or 180 /// `bounded_backtracking` methods. 181 pub fn nfa(mut self) -> Self { 182 self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); 183 self 184 } 185 186 /// Sets the matching engine to use a bounded backtracking engine no 187 /// matter what optimizations are possible. 188 /// 189 /// One must use this with care, since the bounded backtracking engine 190 /// uses memory proportion to `len(regex) * len(text)`. 191 /// 192 /// This overrides whatever was previously set via the `automatic` or 193 /// `nfa` methods. 194 pub fn bounded_backtracking(mut self) -> Self { 195 self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); 196 self 197 } 198 199 /// Compiles byte based programs for use with the NFA matching engines. 200 /// 201 /// By default, the NFA engines match on Unicode scalar values. They can 202 /// be made to use byte based programs instead. In general, the byte based 203 /// programs are slower because of a less efficient encoding of character 204 /// classes. 205 /// 206 /// Note that this does not impact DFA matching engines, which always 207 /// execute on bytes. 208 pub fn bytes(mut self, yes: bool) -> Self { 209 self.bytes = yes; 210 self 211 } 212 213 /// When disabled, the program compiled may match arbitrary bytes. 214 /// 215 /// When enabled (the default), all compiled programs exclusively match 216 /// valid UTF-8 bytes. 217 pub fn only_utf8(mut self, yes: bool) -> Self { 218 self.only_utf8 = yes; 219 self 220 } 221 222 /// Set the Unicode flag. 223 pub fn unicode(mut self, yes: bool) -> Self { 224 self.options.unicode = yes; 225 self 226 } 227 228 /// Parse the current set of patterns into their AST and extract literals. 229 fn parse(&self) -> Result<Parsed, Error> { 230 let mut exprs = Vec::with_capacity(self.options.pats.len()); 231 let mut prefixes = Some(Literals::empty()); 232 let mut suffixes = Some(Literals::empty()); 233 let mut bytes = false; 234 let is_set = self.options.pats.len() > 1; 235 // If we're compiling a regex set and that set has any anchored 236 // expressions, then disable all literal optimizations. 237 for pat in &self.options.pats { 238 let mut parser = ParserBuilder::new() 239 .octal(self.options.octal) 240 .case_insensitive(self.options.case_insensitive) 241 .multi_line(self.options.multi_line) 242 .dot_matches_new_line(self.options.dot_matches_new_line) 243 .swap_greed(self.options.swap_greed) 244 .ignore_whitespace(self.options.ignore_whitespace) 245 .unicode(self.options.unicode) 246 .allow_invalid_utf8(!self.only_utf8) 247 .nest_limit(self.options.nest_limit) 248 .build(); 249 let expr = 250 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; 251 bytes = bytes || !expr.is_always_utf8(); 252 253 if cfg!(feature = "perf-literal") { 254 if !expr.is_anchored_start() && expr.is_any_anchored_start() { 255 // Partial anchors unfortunately make it hard to use 256 // prefixes, so disable them. 257 prefixes = None; 258 } else if is_set && expr.is_anchored_start() { 259 // Regex sets with anchors do not go well with literal 260 // optimizations. 261 prefixes = None; 262 } 263 prefixes = prefixes.and_then(|mut prefixes| { 264 if !prefixes.union_prefixes(&expr) { 265 None 266 } else { 267 Some(prefixes) 268 } 269 }); 270 271 if !expr.is_anchored_end() && expr.is_any_anchored_end() { 272 // Partial anchors unfortunately make it hard to use 273 // suffixes, so disable them. 274 suffixes = None; 275 } else if is_set && expr.is_anchored_end() { 276 // Regex sets with anchors do not go well with literal 277 // optimizations. 278 suffixes = None; 279 } 280 suffixes = suffixes.and_then(|mut suffixes| { 281 if !suffixes.union_suffixes(&expr) { 282 None 283 } else { 284 Some(suffixes) 285 } 286 }); 287 } 288 exprs.push(expr); 289 } 290 Ok(Parsed { 291 exprs, 292 prefixes: prefixes.unwrap_or_else(Literals::empty), 293 suffixes: suffixes.unwrap_or_else(Literals::empty), 294 bytes, 295 }) 296 } 297 298 /// Build an executor that can run a regular expression. 299 pub fn build(self) -> Result<Exec, Error> { 300 // Special case when we have no patterns to compile. 301 // This can happen when compiling a regex set. 302 if self.options.pats.is_empty() { 303 let ro = Arc::new(ExecReadOnly { 304 res: vec![], 305 nfa: Program::new(), 306 dfa: Program::new(), 307 dfa_reverse: Program::new(), 308 suffixes: LiteralSearcher::empty(), 309 #[cfg(feature = "perf-literal")] 310 ac: None, 311 match_type: MatchType::Nothing, 312 }); 313 let pool = ExecReadOnly::new_pool(&ro); 314 return Ok(Exec { ro, pool }); 315 } 316 let parsed = self.parse()?; 317 let mut nfa = Compiler::new() 318 .size_limit(self.options.size_limit) 319 .bytes(self.bytes || parsed.bytes) 320 .only_utf8(self.only_utf8) 321 .compile(&parsed.exprs)?; 322 let mut dfa = Compiler::new() 323 .size_limit(self.options.size_limit) 324 .dfa(true) 325 .only_utf8(self.only_utf8) 326 .compile(&parsed.exprs)?; 327 let mut dfa_reverse = Compiler::new() 328 .size_limit(self.options.size_limit) 329 .dfa(true) 330 .only_utf8(self.only_utf8) 331 .reverse(true) 332 .compile(&parsed.exprs)?; 333 334 #[cfg(feature = "perf-literal")] 335 let ac = self.build_aho_corasick(&parsed); 336 nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); 337 dfa.prefixes = nfa.prefixes.clone(); 338 dfa.dfa_size_limit = self.options.dfa_size_limit; 339 dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; 340 341 let mut ro = ExecReadOnly { 342 res: self.options.pats, 343 nfa, 344 dfa, 345 dfa_reverse, 346 suffixes: LiteralSearcher::suffixes(parsed.suffixes), 347 #[cfg(feature = "perf-literal")] 348 ac, 349 match_type: MatchType::Nothing, 350 }; 351 ro.match_type = ro.choose_match_type(self.match_type); 352 353 let ro = Arc::new(ro); 354 let pool = ExecReadOnly::new_pool(&ro); 355 Ok(Exec { ro, pool }) 356 } 357 358 #[cfg(feature = "perf-literal")] 359 fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { 360 if parsed.exprs.len() != 1 { 361 return None; 362 } 363 let lits = match alternation_literals(&parsed.exprs[0]) { 364 None => return None, 365 Some(lits) => lits, 366 }; 367 // If we have a small number of literals, then let Teddy handle 368 // things (see literal/mod.rs). 369 if lits.len() <= 32 { 370 return None; 371 } 372 Some( 373 AhoCorasickBuilder::new() 374 .match_kind(MatchKind::LeftmostFirst) 375 .auto_configure(&lits) 376 .build_with_size::<u32, _, _>(&lits) 377 // This should never happen because we'd long exceed the 378 // compilation limit for regexes first. 379 .expect("AC automaton too big"), 380 ) 381 } 382} 383 384impl<'c> RegularExpression for ExecNoSyncStr<'c> { 385 type Text = str; 386 387 fn slots_len(&self) -> usize { 388 self.0.slots_len() 389 } 390 391 fn next_after_empty(&self, text: &str, i: usize) -> usize { 392 next_utf8(text.as_bytes(), i) 393 } 394 395 #[cfg_attr(feature = "perf-inline", inline(always))] 396 fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { 397 self.0.shortest_match_at(text.as_bytes(), start) 398 } 399 400 #[cfg_attr(feature = "perf-inline", inline(always))] 401 fn is_match_at(&self, text: &str, start: usize) -> bool { 402 self.0.is_match_at(text.as_bytes(), start) 403 } 404 405 #[cfg_attr(feature = "perf-inline", inline(always))] 406 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 407 self.0.find_at(text.as_bytes(), start) 408 } 409 410 #[cfg_attr(feature = "perf-inline", inline(always))] 411 fn captures_read_at( 412 &self, 413 locs: &mut Locations, 414 text: &str, 415 start: usize, 416 ) -> Option<(usize, usize)> { 417 self.0.captures_read_at(locs, text.as_bytes(), start) 418 } 419} 420 421impl<'c> RegularExpression for ExecNoSync<'c> { 422 type Text = [u8]; 423 424 /// Returns the number of capture slots in the regular expression. (There 425 /// are two slots for every capture group, corresponding to possibly empty 426 /// start and end locations of the capture.) 427 fn slots_len(&self) -> usize { 428 self.ro.nfa.captures.len() * 2 429 } 430 431 fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { 432 i + 1 433 } 434 435 /// Returns the end of a match location, possibly occurring before the 436 /// end location of the correct leftmost-first match. 437 #[cfg_attr(feature = "perf-inline", inline(always))] 438 fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { 439 if !self.is_anchor_end_match(text) { 440 return None; 441 } 442 match self.ro.match_type { 443 #[cfg(feature = "perf-literal")] 444 MatchType::Literal(ty) => { 445 self.find_literals(ty, text, start).map(|(_, e)| e) 446 } 447 #[cfg(feature = "perf-dfa")] 448 MatchType::Dfa | MatchType::DfaMany => { 449 match self.shortest_dfa(text, start) { 450 dfa::Result::Match(end) => Some(end), 451 dfa::Result::NoMatch(_) => None, 452 dfa::Result::Quit => self.shortest_nfa(text, start), 453 } 454 } 455 #[cfg(feature = "perf-dfa")] 456 MatchType::DfaAnchoredReverse => { 457 match dfa::Fsm::reverse( 458 &self.ro.dfa_reverse, 459 self.cache.value(), 460 true, 461 &text[start..], 462 text.len(), 463 ) { 464 dfa::Result::Match(_) => Some(text.len()), 465 dfa::Result::NoMatch(_) => None, 466 dfa::Result::Quit => self.shortest_nfa(text, start), 467 } 468 } 469 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 470 MatchType::DfaSuffix => { 471 match self.shortest_dfa_reverse_suffix(text, start) { 472 dfa::Result::Match(e) => Some(e), 473 dfa::Result::NoMatch(_) => None, 474 dfa::Result::Quit => self.shortest_nfa(text, start), 475 } 476 } 477 MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), 478 MatchType::Nothing => None, 479 } 480 } 481 482 /// Returns true if and only if the regex matches text. 483 /// 484 /// For single regular expressions, this is equivalent to calling 485 /// shortest_match(...).is_some(). 486 #[cfg_attr(feature = "perf-inline", inline(always))] 487 fn is_match_at(&self, text: &[u8], start: usize) -> bool { 488 if !self.is_anchor_end_match(text) { 489 return false; 490 } 491 // We need to do this dance because shortest_match relies on the NFA 492 // filling in captures[1], but a RegexSet has no captures. In other 493 // words, a RegexSet can't (currently) use shortest_match. ---AG 494 match self.ro.match_type { 495 #[cfg(feature = "perf-literal")] 496 MatchType::Literal(ty) => { 497 self.find_literals(ty, text, start).is_some() 498 } 499 #[cfg(feature = "perf-dfa")] 500 MatchType::Dfa | MatchType::DfaMany => { 501 match self.shortest_dfa(text, start) { 502 dfa::Result::Match(_) => true, 503 dfa::Result::NoMatch(_) => false, 504 dfa::Result::Quit => self.match_nfa(text, start), 505 } 506 } 507 #[cfg(feature = "perf-dfa")] 508 MatchType::DfaAnchoredReverse => { 509 match dfa::Fsm::reverse( 510 &self.ro.dfa_reverse, 511 self.cache.value(), 512 true, 513 &text[start..], 514 text.len(), 515 ) { 516 dfa::Result::Match(_) => true, 517 dfa::Result::NoMatch(_) => false, 518 dfa::Result::Quit => self.match_nfa(text, start), 519 } 520 } 521 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 522 MatchType::DfaSuffix => { 523 match self.shortest_dfa_reverse_suffix(text, start) { 524 dfa::Result::Match(_) => true, 525 dfa::Result::NoMatch(_) => false, 526 dfa::Result::Quit => self.match_nfa(text, start), 527 } 528 } 529 MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), 530 MatchType::Nothing => false, 531 } 532 } 533 534 /// Finds the start and end location of the leftmost-first match, starting 535 /// at the given location. 536 #[cfg_attr(feature = "perf-inline", inline(always))] 537 fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { 538 if !self.is_anchor_end_match(text) { 539 return None; 540 } 541 match self.ro.match_type { 542 #[cfg(feature = "perf-literal")] 543 MatchType::Literal(ty) => self.find_literals(ty, text, start), 544 #[cfg(feature = "perf-dfa")] 545 MatchType::Dfa => match self.find_dfa_forward(text, start) { 546 dfa::Result::Match((s, e)) => Some((s, e)), 547 dfa::Result::NoMatch(_) => None, 548 dfa::Result::Quit => { 549 self.find_nfa(MatchNfaType::Auto, text, start) 550 } 551 }, 552 #[cfg(feature = "perf-dfa")] 553 MatchType::DfaAnchoredReverse => { 554 match self.find_dfa_anchored_reverse(text, start) { 555 dfa::Result::Match((s, e)) => Some((s, e)), 556 dfa::Result::NoMatch(_) => None, 557 dfa::Result::Quit => { 558 self.find_nfa(MatchNfaType::Auto, text, start) 559 } 560 } 561 } 562 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 563 MatchType::DfaSuffix => { 564 match self.find_dfa_reverse_suffix(text, start) { 565 dfa::Result::Match((s, e)) => Some((s, e)), 566 dfa::Result::NoMatch(_) => None, 567 dfa::Result::Quit => { 568 self.find_nfa(MatchNfaType::Auto, text, start) 569 } 570 } 571 } 572 MatchType::Nfa(ty) => self.find_nfa(ty, text, start), 573 MatchType::Nothing => None, 574 #[cfg(feature = "perf-dfa")] 575 MatchType::DfaMany => { 576 unreachable!("BUG: RegexSet cannot be used with find") 577 } 578 } 579 } 580 581 /// Finds the start and end location of the leftmost-first match and also 582 /// fills in all matching capture groups. 583 /// 584 /// The number of capture slots given should be equal to the total number 585 /// of capture slots in the compiled program. 586 /// 587 /// Note that the first two slots always correspond to the start and end 588 /// locations of the overall match. 589 fn captures_read_at( 590 &self, 591 locs: &mut Locations, 592 text: &[u8], 593 start: usize, 594 ) -> Option<(usize, usize)> { 595 let slots = locs.as_slots(); 596 for slot in slots.iter_mut() { 597 *slot = None; 598 } 599 // If the caller unnecessarily uses this, then we try to save them 600 // from themselves. 601 match slots.len() { 602 0 => return self.find_at(text, start), 603 2 => { 604 return self.find_at(text, start).map(|(s, e)| { 605 slots[0] = Some(s); 606 slots[1] = Some(e); 607 (s, e) 608 }); 609 } 610 _ => {} // fallthrough 611 } 612 if !self.is_anchor_end_match(text) { 613 return None; 614 } 615 match self.ro.match_type { 616 #[cfg(feature = "perf-literal")] 617 MatchType::Literal(ty) => { 618 self.find_literals(ty, text, start).and_then(|(s, e)| { 619 self.captures_nfa_type( 620 MatchNfaType::Auto, 621 slots, 622 text, 623 s, 624 e, 625 ) 626 }) 627 } 628 #[cfg(feature = "perf-dfa")] 629 MatchType::Dfa => { 630 if self.ro.nfa.is_anchored_start { 631 self.captures_nfa(slots, text, start) 632 } else { 633 match self.find_dfa_forward(text, start) { 634 dfa::Result::Match((s, e)) => self.captures_nfa_type( 635 MatchNfaType::Auto, 636 slots, 637 text, 638 s, 639 e, 640 ), 641 dfa::Result::NoMatch(_) => None, 642 dfa::Result::Quit => { 643 self.captures_nfa(slots, text, start) 644 } 645 } 646 } 647 } 648 #[cfg(feature = "perf-dfa")] 649 MatchType::DfaAnchoredReverse => { 650 match self.find_dfa_anchored_reverse(text, start) { 651 dfa::Result::Match((s, e)) => self.captures_nfa_type( 652 MatchNfaType::Auto, 653 slots, 654 text, 655 s, 656 e, 657 ), 658 dfa::Result::NoMatch(_) => None, 659 dfa::Result::Quit => self.captures_nfa(slots, text, start), 660 } 661 } 662 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 663 MatchType::DfaSuffix => { 664 match self.find_dfa_reverse_suffix(text, start) { 665 dfa::Result::Match((s, e)) => self.captures_nfa_type( 666 MatchNfaType::Auto, 667 slots, 668 text, 669 s, 670 e, 671 ), 672 dfa::Result::NoMatch(_) => None, 673 dfa::Result::Quit => self.captures_nfa(slots, text, start), 674 } 675 } 676 MatchType::Nfa(ty) => { 677 self.captures_nfa_type(ty, slots, text, start, text.len()) 678 } 679 MatchType::Nothing => None, 680 #[cfg(feature = "perf-dfa")] 681 MatchType::DfaMany => { 682 unreachable!("BUG: RegexSet cannot be used with captures") 683 } 684 } 685 } 686} 687 688impl<'c> ExecNoSync<'c> { 689 /// Finds the leftmost-first match using only literal search. 690 #[cfg(feature = "perf-literal")] 691 #[cfg_attr(feature = "perf-inline", inline(always))] 692 fn find_literals( 693 &self, 694 ty: MatchLiteralType, 695 text: &[u8], 696 start: usize, 697 ) -> Option<(usize, usize)> { 698 use self::MatchLiteralType::*; 699 match ty { 700 Unanchored => { 701 let lits = &self.ro.nfa.prefixes; 702 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) 703 } 704 AnchoredStart => { 705 let lits = &self.ro.nfa.prefixes; 706 if start == 0 || !self.ro.nfa.is_anchored_start { 707 lits.find_start(&text[start..]) 708 .map(|(s, e)| (start + s, start + e)) 709 } else { 710 None 711 } 712 } 713 AnchoredEnd => { 714 let lits = &self.ro.suffixes; 715 lits.find_end(&text[start..]) 716 .map(|(s, e)| (start + s, start + e)) 717 } 718 AhoCorasick => self 719 .ro 720 .ac 721 .as_ref() 722 .unwrap() 723 .find(&text[start..]) 724 .map(|m| (start + m.start(), start + m.end())), 725 } 726 } 727 728 /// Finds the leftmost-first match (start and end) using only the DFA. 729 /// 730 /// If the result returned indicates that the DFA quit, then another 731 /// matching engine should be used. 732 #[cfg(feature = "perf-dfa")] 733 #[cfg_attr(feature = "perf-inline", inline(always))] 734 fn find_dfa_forward( 735 &self, 736 text: &[u8], 737 start: usize, 738 ) -> dfa::Result<(usize, usize)> { 739 use crate::dfa::Result::*; 740 let end = match dfa::Fsm::forward( 741 &self.ro.dfa, 742 self.cache.value(), 743 false, 744 text, 745 start, 746 ) { 747 NoMatch(i) => return NoMatch(i), 748 Quit => return Quit, 749 Match(end) if start == end => return Match((start, start)), 750 Match(end) => end, 751 }; 752 // Now run the DFA in reverse to find the start of the match. 753 match dfa::Fsm::reverse( 754 &self.ro.dfa_reverse, 755 self.cache.value(), 756 false, 757 &text[start..], 758 end - start, 759 ) { 760 Match(s) => Match((start + s, end)), 761 NoMatch(i) => NoMatch(i), 762 Quit => Quit, 763 } 764 } 765 766 /// Finds the leftmost-first match (start and end) using only the DFA, 767 /// but assumes the regex is anchored at the end and therefore starts at 768 /// the end of the regex and matches in reverse. 769 /// 770 /// If the result returned indicates that the DFA quit, then another 771 /// matching engine should be used. 772 #[cfg(feature = "perf-dfa")] 773 #[cfg_attr(feature = "perf-inline", inline(always))] 774 fn find_dfa_anchored_reverse( 775 &self, 776 text: &[u8], 777 start: usize, 778 ) -> dfa::Result<(usize, usize)> { 779 use crate::dfa::Result::*; 780 match dfa::Fsm::reverse( 781 &self.ro.dfa_reverse, 782 self.cache.value(), 783 false, 784 &text[start..], 785 text.len() - start, 786 ) { 787 Match(s) => Match((start + s, text.len())), 788 NoMatch(i) => NoMatch(i), 789 Quit => Quit, 790 } 791 } 792 793 /// Finds the end of the shortest match using only the DFA. 794 #[cfg(feature = "perf-dfa")] 795 #[cfg_attr(feature = "perf-inline", inline(always))] 796 fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { 797 dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) 798 } 799 800 /// Finds the end of the shortest match using only the DFA by scanning for 801 /// suffix literals. 802 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 803 #[cfg_attr(feature = "perf-inline", inline(always))] 804 fn shortest_dfa_reverse_suffix( 805 &self, 806 text: &[u8], 807 start: usize, 808 ) -> dfa::Result<usize> { 809 match self.exec_dfa_reverse_suffix(text, start) { 810 None => self.shortest_dfa(text, start), 811 Some(r) => r.map(|(_, end)| end), 812 } 813 } 814 815 /// Finds the end of the shortest match using only the DFA by scanning for 816 /// suffix literals. It also reports the start of the match. 817 /// 818 /// Note that if None is returned, then the optimization gave up to avoid 819 /// worst case quadratic behavior. A forward scanning DFA should be tried 820 /// next. 821 /// 822 /// If a match is returned and the full leftmost-first match is desired, 823 /// then a forward scan starting from the beginning of the match must be 824 /// done. 825 /// 826 /// If the result returned indicates that the DFA quit, then another 827 /// matching engine should be used. 828 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 829 #[cfg_attr(feature = "perf-inline", inline(always))] 830 fn exec_dfa_reverse_suffix( 831 &self, 832 text: &[u8], 833 original_start: usize, 834 ) -> Option<dfa::Result<(usize, usize)>> { 835 use crate::dfa::Result::*; 836 837 let lcs = self.ro.suffixes.lcs(); 838 debug_assert!(lcs.len() >= 1); 839 let mut start = original_start; 840 let mut end = start; 841 let mut last_literal = start; 842 while end <= text.len() { 843 last_literal += match lcs.find(&text[last_literal..]) { 844 None => return Some(NoMatch(text.len())), 845 Some(i) => i, 846 }; 847 end = last_literal + lcs.len(); 848 match dfa::Fsm::reverse( 849 &self.ro.dfa_reverse, 850 self.cache.value(), 851 false, 852 &text[start..end], 853 end - start, 854 ) { 855 Match(0) | NoMatch(0) => return None, 856 Match(i) => return Some(Match((start + i, end))), 857 NoMatch(i) => { 858 start += i; 859 last_literal += 1; 860 continue; 861 } 862 Quit => return Some(Quit), 863 }; 864 } 865 Some(NoMatch(text.len())) 866 } 867 868 /// Finds the leftmost-first match (start and end) using only the DFA 869 /// by scanning for suffix literals. 870 /// 871 /// If the result returned indicates that the DFA quit, then another 872 /// matching engine should be used. 873 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 874 #[cfg_attr(feature = "perf-inline", inline(always))] 875 fn find_dfa_reverse_suffix( 876 &self, 877 text: &[u8], 878 start: usize, 879 ) -> dfa::Result<(usize, usize)> { 880 use crate::dfa::Result::*; 881 882 let match_start = match self.exec_dfa_reverse_suffix(text, start) { 883 None => return self.find_dfa_forward(text, start), 884 Some(Match((start, _))) => start, 885 Some(r) => return r, 886 }; 887 // At this point, we've found a match. The only way to quit now 888 // without a match is if the DFA gives up (seems unlikely). 889 // 890 // Now run the DFA forwards to find the proper end of the match. 891 // (The suffix literal match can only indicate the earliest 892 // possible end location, which may appear before the end of the 893 // leftmost-first match.) 894 match dfa::Fsm::forward( 895 &self.ro.dfa, 896 self.cache.value(), 897 false, 898 text, 899 match_start, 900 ) { 901 NoMatch(_) => panic!("BUG: reverse match implies forward match"), 902 Quit => Quit, 903 Match(e) => Match((match_start, e)), 904 } 905 } 906 907 /// Executes the NFA engine to return whether there is a match or not. 908 /// 909 /// Ideally, we could use shortest_nfa(...).is_some() and get the same 910 /// performance characteristics, but regex sets don't have captures, which 911 /// shortest_nfa depends on. 912 #[cfg(feature = "perf-dfa")] 913 fn match_nfa(&self, text: &[u8], start: usize) -> bool { 914 self.match_nfa_type(MatchNfaType::Auto, text, start) 915 } 916 917 /// Like match_nfa, but allows specification of the type of NFA engine. 918 fn match_nfa_type( 919 &self, 920 ty: MatchNfaType, 921 text: &[u8], 922 start: usize, 923 ) -> bool { 924 self.exec_nfa( 925 ty, 926 &mut [false], 927 &mut [], 928 true, 929 false, 930 text, 931 start, 932 text.len(), 933 ) 934 } 935 936 /// Finds the shortest match using an NFA. 937 #[cfg(feature = "perf-dfa")] 938 fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { 939 self.shortest_nfa_type(MatchNfaType::Auto, text, start) 940 } 941 942 /// Like shortest_nfa, but allows specification of the type of NFA engine. 943 fn shortest_nfa_type( 944 &self, 945 ty: MatchNfaType, 946 text: &[u8], 947 start: usize, 948 ) -> Option<usize> { 949 let mut slots = [None, None]; 950 if self.exec_nfa( 951 ty, 952 &mut [false], 953 &mut slots, 954 true, 955 true, 956 text, 957 start, 958 text.len(), 959 ) { 960 slots[1] 961 } else { 962 None 963 } 964 } 965 966 /// Like find, but executes an NFA engine. 967 fn find_nfa( 968 &self, 969 ty: MatchNfaType, 970 text: &[u8], 971 start: usize, 972 ) -> Option<(usize, usize)> { 973 let mut slots = [None, None]; 974 if self.exec_nfa( 975 ty, 976 &mut [false], 977 &mut slots, 978 false, 979 false, 980 text, 981 start, 982 text.len(), 983 ) { 984 match (slots[0], slots[1]) { 985 (Some(s), Some(e)) => Some((s, e)), 986 _ => None, 987 } 988 } else { 989 None 990 } 991 } 992 993 /// Like find_nfa, but fills in captures. 994 /// 995 /// `slots` should have length equal to `2 * nfa.captures.len()`. 996 #[cfg(feature = "perf-dfa")] 997 fn captures_nfa( 998 &self, 999 slots: &mut [Slot], 1000 text: &[u8], 1001 start: usize, 1002 ) -> Option<(usize, usize)> { 1003 self.captures_nfa_type( 1004 MatchNfaType::Auto, 1005 slots, 1006 text, 1007 start, 1008 text.len(), 1009 ) 1010 } 1011 1012 /// Like captures_nfa, but allows specification of type of NFA engine. 1013 fn captures_nfa_type( 1014 &self, 1015 ty: MatchNfaType, 1016 slots: &mut [Slot], 1017 text: &[u8], 1018 start: usize, 1019 end: usize, 1020 ) -> Option<(usize, usize)> { 1021 if self.exec_nfa( 1022 ty, 1023 &mut [false], 1024 slots, 1025 false, 1026 false, 1027 text, 1028 start, 1029 end, 1030 ) { 1031 match (slots[0], slots[1]) { 1032 (Some(s), Some(e)) => Some((s, e)), 1033 _ => None, 1034 } 1035 } else { 1036 None 1037 } 1038 } 1039 1040 fn exec_nfa( 1041 &self, 1042 mut ty: MatchNfaType, 1043 matches: &mut [bool], 1044 slots: &mut [Slot], 1045 quit_after_match: bool, 1046 quit_after_match_with_pos: bool, 1047 text: &[u8], 1048 start: usize, 1049 end: usize, 1050 ) -> bool { 1051 use self::MatchNfaType::*; 1052 if let Auto = ty { 1053 if backtrack::should_exec(self.ro.nfa.len(), text.len()) { 1054 ty = Backtrack; 1055 } else { 1056 ty = PikeVM; 1057 } 1058 } 1059 // The backtracker can't return the shortest match position as it is 1060 // implemented today. So if someone calls `shortest_match` and we need 1061 // to run an NFA, then use the PikeVM. 1062 if quit_after_match_with_pos || ty == PikeVM { 1063 self.exec_pikevm( 1064 matches, 1065 slots, 1066 quit_after_match, 1067 text, 1068 start, 1069 end, 1070 ) 1071 } else { 1072 self.exec_backtrack(matches, slots, text, start, end) 1073 } 1074 } 1075 1076 /// Always run the NFA algorithm. 1077 fn exec_pikevm( 1078 &self, 1079 matches: &mut [bool], 1080 slots: &mut [Slot], 1081 quit_after_match: bool, 1082 text: &[u8], 1083 start: usize, 1084 end: usize, 1085 ) -> bool { 1086 if self.ro.nfa.uses_bytes() { 1087 pikevm::Fsm::exec( 1088 &self.ro.nfa, 1089 self.cache.value(), 1090 matches, 1091 slots, 1092 quit_after_match, 1093 ByteInput::new(text, self.ro.nfa.only_utf8), 1094 start, 1095 end, 1096 ) 1097 } else { 1098 pikevm::Fsm::exec( 1099 &self.ro.nfa, 1100 self.cache.value(), 1101 matches, 1102 slots, 1103 quit_after_match, 1104 CharInput::new(text), 1105 start, 1106 end, 1107 ) 1108 } 1109 } 1110 1111 /// Always runs the NFA using bounded backtracking. 1112 fn exec_backtrack( 1113 &self, 1114 matches: &mut [bool], 1115 slots: &mut [Slot], 1116 text: &[u8], 1117 start: usize, 1118 end: usize, 1119 ) -> bool { 1120 if self.ro.nfa.uses_bytes() { 1121 backtrack::Bounded::exec( 1122 &self.ro.nfa, 1123 self.cache.value(), 1124 matches, 1125 slots, 1126 ByteInput::new(text, self.ro.nfa.only_utf8), 1127 start, 1128 end, 1129 ) 1130 } else { 1131 backtrack::Bounded::exec( 1132 &self.ro.nfa, 1133 self.cache.value(), 1134 matches, 1135 slots, 1136 CharInput::new(text), 1137 start, 1138 end, 1139 ) 1140 } 1141 } 1142 1143 /// Finds which regular expressions match the given text. 1144 /// 1145 /// `matches` should have length equal to the number of regexes being 1146 /// searched. 1147 /// 1148 /// This is only useful when one wants to know which regexes in a set 1149 /// match some text. 1150 pub fn many_matches_at( 1151 &self, 1152 matches: &mut [bool], 1153 text: &[u8], 1154 start: usize, 1155 ) -> bool { 1156 use self::MatchType::*; 1157 if !self.is_anchor_end_match(text) { 1158 return false; 1159 } 1160 match self.ro.match_type { 1161 #[cfg(feature = "perf-literal")] 1162 Literal(ty) => { 1163 debug_assert_eq!(matches.len(), 1); 1164 matches[0] = self.find_literals(ty, text, start).is_some(); 1165 matches[0] 1166 } 1167 #[cfg(feature = "perf-dfa")] 1168 Dfa | DfaAnchoredReverse | DfaMany => { 1169 match dfa::Fsm::forward_many( 1170 &self.ro.dfa, 1171 self.cache.value(), 1172 matches, 1173 text, 1174 start, 1175 ) { 1176 dfa::Result::Match(_) => true, 1177 dfa::Result::NoMatch(_) => false, 1178 dfa::Result::Quit => self.exec_nfa( 1179 MatchNfaType::Auto, 1180 matches, 1181 &mut [], 1182 false, 1183 false, 1184 text, 1185 start, 1186 text.len(), 1187 ), 1188 } 1189 } 1190 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1191 DfaSuffix => { 1192 match dfa::Fsm::forward_many( 1193 &self.ro.dfa, 1194 self.cache.value(), 1195 matches, 1196 text, 1197 start, 1198 ) { 1199 dfa::Result::Match(_) => true, 1200 dfa::Result::NoMatch(_) => false, 1201 dfa::Result::Quit => self.exec_nfa( 1202 MatchNfaType::Auto, 1203 matches, 1204 &mut [], 1205 false, 1206 false, 1207 text, 1208 start, 1209 text.len(), 1210 ), 1211 } 1212 } 1213 Nfa(ty) => self.exec_nfa( 1214 ty, 1215 matches, 1216 &mut [], 1217 false, 1218 false, 1219 text, 1220 start, 1221 text.len(), 1222 ), 1223 Nothing => false, 1224 } 1225 } 1226 1227 #[cfg_attr(feature = "perf-inline", inline(always))] 1228 fn is_anchor_end_match(&self, text: &[u8]) -> bool { 1229 #[cfg(not(feature = "perf-literal"))] 1230 fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { 1231 true 1232 } 1233 1234 #[cfg(feature = "perf-literal")] 1235 fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { 1236 // Only do this check if the haystack is big (>1MB). 1237 if text.len() > (1 << 20) && ro.nfa.is_anchored_end { 1238 let lcs = ro.suffixes.lcs(); 1239 if lcs.len() >= 1 && !lcs.is_suffix(text) { 1240 return false; 1241 } 1242 } 1243 true 1244 } 1245 1246 imp(&self.ro, text) 1247 } 1248 1249 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1250 &self.ro.nfa.capture_name_idx 1251 } 1252} 1253 1254impl<'c> ExecNoSyncStr<'c> { 1255 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1256 self.0.capture_name_idx() 1257 } 1258} 1259 1260impl Exec { 1261 /// Get a searcher that isn't Sync. 1262 #[cfg_attr(feature = "perf-inline", inline(always))] 1263 pub fn searcher(&self) -> ExecNoSync<'_> { 1264 ExecNoSync { 1265 ro: &self.ro, // a clone is too expensive here! (and not needed) 1266 cache: self.pool.get(), 1267 } 1268 } 1269 1270 /// Get a searcher that isn't Sync and can match on &str. 1271 #[cfg_attr(feature = "perf-inline", inline(always))] 1272 pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { 1273 ExecNoSyncStr(self.searcher()) 1274 } 1275 1276 /// Build a Regex from this executor. 1277 pub fn into_regex(self) -> re_unicode::Regex { 1278 re_unicode::Regex::from(self) 1279 } 1280 1281 /// Build a RegexSet from this executor. 1282 pub fn into_regex_set(self) -> re_set::unicode::RegexSet { 1283 re_set::unicode::RegexSet::from(self) 1284 } 1285 1286 /// Build a Regex from this executor that can match arbitrary bytes. 1287 pub fn into_byte_regex(self) -> re_bytes::Regex { 1288 re_bytes::Regex::from(self) 1289 } 1290 1291 /// Build a RegexSet from this executor that can match arbitrary bytes. 1292 pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { 1293 re_set::bytes::RegexSet::from(self) 1294 } 1295 1296 /// The original regular expressions given by the caller that were 1297 /// compiled. 1298 pub fn regex_strings(&self) -> &[String] { 1299 &self.ro.res 1300 } 1301 1302 /// Return a slice of capture names. 1303 /// 1304 /// Any capture that isn't named is None. 1305 pub fn capture_names(&self) -> &[Option<String>] { 1306 &self.ro.nfa.captures 1307 } 1308 1309 /// Return a reference to named groups mapping (from group name to 1310 /// group position). 1311 pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { 1312 &self.ro.nfa.capture_name_idx 1313 } 1314} 1315 1316impl Clone for Exec { 1317 fn clone(&self) -> Exec { 1318 let pool = ExecReadOnly::new_pool(&self.ro); 1319 Exec { ro: self.ro.clone(), pool } 1320 } 1321} 1322 1323impl ExecReadOnly { 1324 fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { 1325 if let Some(MatchType::Nfa(_)) = hint { 1326 return hint.unwrap(); 1327 } 1328 // If the NFA is empty, then we'll never match anything. 1329 if self.nfa.insts.is_empty() { 1330 return MatchType::Nothing; 1331 } 1332 if let Some(literalty) = self.choose_literal_match_type() { 1333 return literalty; 1334 } 1335 if let Some(dfaty) = self.choose_dfa_match_type() { 1336 return dfaty; 1337 } 1338 // We're so totally hosed. 1339 MatchType::Nfa(MatchNfaType::Auto) 1340 } 1341 1342 /// If a plain literal scan can be used, then a corresponding literal 1343 /// search type is returned. 1344 fn choose_literal_match_type(&self) -> Option<MatchType> { 1345 #[cfg(not(feature = "perf-literal"))] 1346 fn imp(_: &ExecReadOnly) -> Option<MatchType> { 1347 None 1348 } 1349 1350 #[cfg(feature = "perf-literal")] 1351 fn imp(ro: &ExecReadOnly) -> Option<MatchType> { 1352 // If our set of prefixes is complete, then we can use it to find 1353 // a match in lieu of a regex engine. This doesn't quite work well 1354 // in the presence of multiple regexes, so only do it when there's 1355 // one. 1356 // 1357 // TODO(burntsushi): Also, don't try to match literals if the regex 1358 // is partially anchored. We could technically do it, but we'd need 1359 // to create two sets of literals: all of them and then the subset 1360 // that aren't anchored. We would then only search for all of them 1361 // when at the beginning of the input and use the subset in all 1362 // other cases. 1363 if ro.res.len() != 1 { 1364 return None; 1365 } 1366 if ro.ac.is_some() { 1367 return Some(MatchType::Literal( 1368 MatchLiteralType::AhoCorasick, 1369 )); 1370 } 1371 if ro.nfa.prefixes.complete() { 1372 return if ro.nfa.is_anchored_start { 1373 Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) 1374 } else { 1375 Some(MatchType::Literal(MatchLiteralType::Unanchored)) 1376 }; 1377 } 1378 if ro.suffixes.complete() { 1379 return if ro.nfa.is_anchored_end { 1380 Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) 1381 } else { 1382 // This case shouldn't happen. When the regex isn't 1383 // anchored, then complete prefixes should imply complete 1384 // suffixes. 1385 Some(MatchType::Literal(MatchLiteralType::Unanchored)) 1386 }; 1387 } 1388 None 1389 } 1390 1391 imp(self) 1392 } 1393 1394 /// If a DFA scan can be used, then choose the appropriate DFA strategy. 1395 fn choose_dfa_match_type(&self) -> Option<MatchType> { 1396 #[cfg(not(feature = "perf-dfa"))] 1397 fn imp(_: &ExecReadOnly) -> Option<MatchType> { 1398 None 1399 } 1400 1401 #[cfg(feature = "perf-dfa")] 1402 fn imp(ro: &ExecReadOnly) -> Option<MatchType> { 1403 if !dfa::can_exec(&ro.dfa) { 1404 return None; 1405 } 1406 // Regex sets require a slightly specialized path. 1407 if ro.res.len() >= 2 { 1408 return Some(MatchType::DfaMany); 1409 } 1410 // If the regex is anchored at the end but not the start, then 1411 // just match in reverse from the end of the haystack. 1412 if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { 1413 return Some(MatchType::DfaAnchoredReverse); 1414 } 1415 #[cfg(feature = "perf-literal")] 1416 { 1417 // If there's a longish suffix literal, then it might be faster 1418 // to look for that first. 1419 if ro.should_suffix_scan() { 1420 return Some(MatchType::DfaSuffix); 1421 } 1422 } 1423 // Fall back to your garden variety forward searching lazy DFA. 1424 Some(MatchType::Dfa) 1425 } 1426 1427 imp(self) 1428 } 1429 1430 /// Returns true if the program is amenable to suffix scanning. 1431 /// 1432 /// When this is true, as a heuristic, we assume it is OK to quickly scan 1433 /// for suffix literals and then do a *reverse* DFA match from any matches 1434 /// produced by the literal scan. (And then followed by a forward DFA 1435 /// search, since the previously found suffix literal maybe not actually be 1436 /// the end of a match.) 1437 /// 1438 /// This is a bit of a specialized optimization, but can result in pretty 1439 /// big performance wins if 1) there are no prefix literals and 2) the 1440 /// suffix literals are pretty rare in the text. (1) is obviously easy to 1441 /// account for but (2) is harder. As a proxy, we assume that longer 1442 /// strings are generally rarer, so we only enable this optimization when 1443 /// we have a meaty suffix. 1444 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1445 fn should_suffix_scan(&self) -> bool { 1446 if self.suffixes.is_empty() { 1447 return false; 1448 } 1449 let lcs_len = self.suffixes.lcs().char_len(); 1450 lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() 1451 } 1452 1453 fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { 1454 let ro = ro.clone(); 1455 Box::new(Pool::new(Box::new(move || { 1456 AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) 1457 }))) 1458 } 1459} 1460 1461#[derive(Clone, Copy, Debug)] 1462enum MatchType { 1463 /// A single or multiple literal search. This is only used when the regex 1464 /// can be decomposed into a literal search. 1465 #[cfg(feature = "perf-literal")] 1466 Literal(MatchLiteralType), 1467 /// A normal DFA search. 1468 #[cfg(feature = "perf-dfa")] 1469 Dfa, 1470 /// A reverse DFA search starting from the end of a haystack. 1471 #[cfg(feature = "perf-dfa")] 1472 DfaAnchoredReverse, 1473 /// A reverse DFA search with suffix literal scanning. 1474 #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] 1475 DfaSuffix, 1476 /// Use the DFA on two or more regular expressions. 1477 #[cfg(feature = "perf-dfa")] 1478 DfaMany, 1479 /// An NFA variant. 1480 Nfa(MatchNfaType), 1481 /// No match is ever possible, so don't ever try to search. 1482 Nothing, 1483} 1484 1485#[derive(Clone, Copy, Debug)] 1486#[cfg(feature = "perf-literal")] 1487enum MatchLiteralType { 1488 /// Match literals anywhere in text. 1489 Unanchored, 1490 /// Match literals only at the start of text. 1491 AnchoredStart, 1492 /// Match literals only at the end of text. 1493 AnchoredEnd, 1494 /// Use an Aho-Corasick automaton. This requires `ac` to be Some on 1495 /// ExecReadOnly. 1496 AhoCorasick, 1497} 1498 1499#[derive(Clone, Copy, Debug, Eq, PartialEq)] 1500enum MatchNfaType { 1501 /// Choose between Backtrack and PikeVM. 1502 Auto, 1503 /// NFA bounded backtracking. 1504 /// 1505 /// (This is only set by tests, since it never makes sense to always want 1506 /// backtracking.) 1507 Backtrack, 1508 /// The Pike VM. 1509 /// 1510 /// (This is only set by tests, since it never makes sense to always want 1511 /// the Pike VM.) 1512 PikeVM, 1513} 1514 1515/// `ProgramCache` maintains reusable allocations for each matching engine 1516/// available to a particular program. 1517/// 1518/// We declare this as unwind safe since it's a cache that's only used for 1519/// performance purposes. If a panic occurs, it is (or should be) always safe 1520/// to continue using the same regex object. 1521pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>; 1522 1523#[derive(Debug)] 1524pub struct ProgramCacheInner { 1525 pub pikevm: pikevm::Cache, 1526 pub backtrack: backtrack::Cache, 1527 #[cfg(feature = "perf-dfa")] 1528 pub dfa: dfa::Cache, 1529 #[cfg(feature = "perf-dfa")] 1530 pub dfa_reverse: dfa::Cache, 1531} 1532 1533impl ProgramCacheInner { 1534 fn new(ro: &ExecReadOnly) -> Self { 1535 ProgramCacheInner { 1536 pikevm: pikevm::Cache::new(&ro.nfa), 1537 backtrack: backtrack::Cache::new(&ro.nfa), 1538 #[cfg(feature = "perf-dfa")] 1539 dfa: dfa::Cache::new(&ro.dfa), 1540 #[cfg(feature = "perf-dfa")] 1541 dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), 1542 } 1543 } 1544} 1545 1546/// Alternation literals checks if the given HIR is a simple alternation of 1547/// literals, and if so, returns them. Otherwise, this returns None. 1548#[cfg(feature = "perf-literal")] 1549fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { 1550 use regex_syntax::hir::{HirKind, Literal}; 1551 1552 // This is pretty hacky, but basically, if `is_alternation_literal` is 1553 // true, then we can make several assumptions about the structure of our 1554 // HIR. This is what justifies the `unreachable!` statements below. 1555 // 1556 // This code should be refactored once we overhaul this crate's 1557 // optimization pipeline, because this is a terribly inflexible way to go 1558 // about things. 1559 1560 if !expr.is_alternation_literal() { 1561 return None; 1562 } 1563 let alts = match *expr.kind() { 1564 HirKind::Alternation(ref alts) => alts, 1565 _ => return None, // one literal isn't worth it 1566 }; 1567 1568 let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { 1569 Literal::Unicode(c) => { 1570 let mut buf = [0; 4]; 1571 dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); 1572 } 1573 Literal::Byte(b) => { 1574 dst.push(b); 1575 } 1576 }; 1577 1578 let mut lits = vec![]; 1579 for alt in alts { 1580 let mut lit = vec![]; 1581 match *alt.kind() { 1582 HirKind::Literal(ref x) => extendlit(x, &mut lit), 1583 HirKind::Concat(ref exprs) => { 1584 for e in exprs { 1585 match *e.kind() { 1586 HirKind::Literal(ref x) => extendlit(x, &mut lit), 1587 _ => unreachable!("expected literal, got {:?}", e), 1588 } 1589 } 1590 } 1591 _ => unreachable!("expected literal or concat, got {:?}", alt), 1592 } 1593 lits.push(lit); 1594 } 1595 Some(lits) 1596} 1597 1598#[cfg(test)] 1599mod test { 1600 #[test] 1601 fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { 1602 use crate::internal::ExecBuilder; 1603 1604 let backtrack_bytes_re = ExecBuilder::new("^S") 1605 .bounded_backtracking() 1606 .only_utf8(false) 1607 .build() 1608 .map(|exec| exec.into_byte_regex()) 1609 .map_err(|err| format!("{}", err)) 1610 .unwrap(); 1611 1612 let default_bytes_re = ExecBuilder::new("^S") 1613 .only_utf8(false) 1614 .build() 1615 .map(|exec| exec.into_byte_regex()) 1616 .map_err(|err| format!("{}", err)) 1617 .unwrap(); 1618 1619 let input = vec![83, 83]; 1620 1621 let s1 = backtrack_bytes_re.split(&input); 1622 let s2 = default_bytes_re.split(&input); 1623 for (chunk1, chunk2) in s1.zip(s2) { 1624 assert_eq!(chunk1, chunk2); 1625 } 1626 } 1627 1628 #[test] 1629 fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { 1630 use crate::internal::ExecBuilder; 1631 1632 let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1633 .bounded_backtracking() 1634 .bytes(true) 1635 .build() 1636 .map(|exec| exec.into_regex()) 1637 .map_err(|err| format!("{}", err)) 1638 .unwrap(); 1639 1640 let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") 1641 .bytes(true) 1642 .build() 1643 .map(|exec| exec.into_regex()) 1644 .map_err(|err| format!("{}", err)) 1645 .unwrap(); 1646 1647 let input = "**"; 1648 1649 let s1 = backtrack_bytes_re.split(input); 1650 let s2 = default_bytes_re.split(input); 1651 for (chunk1, chunk2) in s1.zip(s2) { 1652 assert_eq!(chunk1, chunk2); 1653 } 1654 } 1655} 1656