1use std::mem; 2 3use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; 4use memchr::{memchr, memchr2, memchr3, memmem}; 5use regex_syntax::hir::literal::{Literal, Literals}; 6 7/// A prefix extracted from a compiled regular expression. 8/// 9/// A regex prefix is a set of literal strings that *must* be matched at the 10/// beginning of a regex in order for the entire regex to match. Similarly 11/// for a regex suffix. 12#[derive(Clone, Debug)] 13pub struct LiteralSearcher { 14 complete: bool, 15 lcp: Memmem, 16 lcs: Memmem, 17 matcher: Matcher, 18} 19 20#[derive(Clone, Debug)] 21enum Matcher { 22 /// No literals. (Never advances through the input.) 23 Empty, 24 /// A set of four or more single byte literals. 25 Bytes(SingleByteSet), 26 /// A single substring, using vector accelerated routines when available. 27 Memmem(Memmem), 28 /// An Aho-Corasick automaton. 29 AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, 30 /// A packed multiple substring searcher, using SIMD. 31 /// 32 /// Note that Aho-Corasick will actually use this packed searcher 33 /// internally automatically, however, there is some overhead associated 34 /// with going through the Aho-Corasick machinery. So using the packed 35 /// searcher directly results in some gains. 36 Packed { s: packed::Searcher, lits: Vec<Literal> }, 37} 38 39impl LiteralSearcher { 40 /// Returns a matcher that never matches and never advances the input. 41 pub fn empty() -> Self { 42 Self::new(Literals::empty(), Matcher::Empty) 43 } 44 45 /// Returns a matcher for literal prefixes from the given set. 46 pub fn prefixes(lits: Literals) -> Self { 47 let matcher = Matcher::prefixes(&lits); 48 Self::new(lits, matcher) 49 } 50 51 /// Returns a matcher for literal suffixes from the given set. 52 pub fn suffixes(lits: Literals) -> Self { 53 let matcher = Matcher::suffixes(&lits); 54 Self::new(lits, matcher) 55 } 56 57 fn new(lits: Literals, matcher: Matcher) -> Self { 58 let complete = lits.all_complete(); 59 LiteralSearcher { 60 complete, 61 lcp: Memmem::new(lits.longest_common_prefix()), 62 lcs: Memmem::new(lits.longest_common_suffix()), 63 matcher, 64 } 65 } 66 67 /// Returns true if all matches comprise the entire regular expression. 68 /// 69 /// This does not necessarily mean that a literal match implies a match 70 /// of the regular expression. For example, the regular expression `^a` 71 /// is comprised of a single complete literal `a`, but the regular 72 /// expression demands that it only match at the beginning of a string. 73 pub fn complete(&self) -> bool { 74 self.complete && !self.is_empty() 75 } 76 77 /// Find the position of a literal in `haystack` if it exists. 78 #[cfg_attr(feature = "perf-inline", inline(always))] 79 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { 80 use self::Matcher::*; 81 match self.matcher { 82 Empty => Some((0, 0)), 83 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), 84 Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), 85 AC { ref ac, .. } => { 86 ac.find(haystack).map(|m| (m.start(), m.end())) 87 } 88 Packed { ref s, .. } => { 89 s.find(haystack).map(|m| (m.start(), m.end())) 90 } 91 } 92 } 93 94 /// Like find, except matches must start at index `0`. 95 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { 96 for lit in self.iter() { 97 if lit.len() > haystack.len() { 98 continue; 99 } 100 if lit == &haystack[0..lit.len()] { 101 return Some((0, lit.len())); 102 } 103 } 104 None 105 } 106 107 /// Like find, except matches must end at index `haystack.len()`. 108 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { 109 for lit in self.iter() { 110 if lit.len() > haystack.len() { 111 continue; 112 } 113 if lit == &haystack[haystack.len() - lit.len()..] { 114 return Some((haystack.len() - lit.len(), haystack.len())); 115 } 116 } 117 None 118 } 119 120 /// Returns an iterator over all literals to be matched. 121 pub fn iter(&self) -> LiteralIter<'_> { 122 match self.matcher { 123 Matcher::Empty => LiteralIter::Empty, 124 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), 125 Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), 126 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), 127 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), 128 } 129 } 130 131 /// Returns a matcher for the longest common prefix of this matcher. 132 pub fn lcp(&self) -> &Memmem { 133 &self.lcp 134 } 135 136 /// Returns a matcher for the longest common suffix of this matcher. 137 pub fn lcs(&self) -> &Memmem { 138 &self.lcs 139 } 140 141 /// Returns true iff this prefix is empty. 142 pub fn is_empty(&self) -> bool { 143 self.len() == 0 144 } 145 146 /// Returns the number of prefixes in this machine. 147 pub fn len(&self) -> usize { 148 use self::Matcher::*; 149 match self.matcher { 150 Empty => 0, 151 Bytes(ref sset) => sset.dense.len(), 152 Memmem(_) => 1, 153 AC { ref ac, .. } => ac.pattern_count(), 154 Packed { ref lits, .. } => lits.len(), 155 } 156 } 157 158 /// Return the approximate heap usage of literals in bytes. 159 pub fn approximate_size(&self) -> usize { 160 use self::Matcher::*; 161 match self.matcher { 162 Empty => 0, 163 Bytes(ref sset) => sset.approximate_size(), 164 Memmem(ref single) => single.approximate_size(), 165 AC { ref ac, .. } => ac.heap_bytes(), 166 Packed { ref s, .. } => s.heap_bytes(), 167 } 168 } 169} 170 171impl Matcher { 172 fn prefixes(lits: &Literals) -> Self { 173 let sset = SingleByteSet::prefixes(lits); 174 Matcher::new(lits, sset) 175 } 176 177 fn suffixes(lits: &Literals) -> Self { 178 let sset = SingleByteSet::suffixes(lits); 179 Matcher::new(lits, sset) 180 } 181 182 fn new(lits: &Literals, sset: SingleByteSet) -> Self { 183 if lits.literals().is_empty() { 184 return Matcher::Empty; 185 } 186 if sset.dense.len() >= 26 { 187 // Avoid trying to match a large number of single bytes. 188 // This is *very* sensitive to a frequency analysis comparison 189 // between the bytes in sset and the composition of the haystack. 190 // No matter the size of sset, if its members all are rare in the 191 // haystack, then it'd be worth using it. How to tune this... IDK. 192 // ---AG 193 return Matcher::Empty; 194 } 195 if sset.complete { 196 return Matcher::Bytes(sset); 197 } 198 if lits.literals().len() == 1 { 199 return Matcher::Memmem(Memmem::new(&lits.literals()[0])); 200 } 201 202 let pats = lits.literals().to_owned(); 203 let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; 204 if lits.literals().len() <= 100 && !is_aho_corasick_fast { 205 let mut builder = packed::Config::new() 206 .match_kind(packed::MatchKind::LeftmostFirst) 207 .builder(); 208 if let Some(s) = builder.extend(&pats).build() { 209 return Matcher::Packed { s, lits: pats }; 210 } 211 } 212 let ac = AhoCorasickBuilder::new() 213 .match_kind(aho_corasick::MatchKind::LeftmostFirst) 214 .dfa(true) 215 .build_with_size::<u32, _, _>(&pats) 216 .unwrap(); 217 Matcher::AC { ac, lits: pats } 218 } 219} 220 221#[derive(Debug)] 222pub enum LiteralIter<'a> { 223 Empty, 224 Bytes(&'a [u8]), 225 Single(&'a [u8]), 226 AC(&'a [Literal]), 227 Packed(&'a [Literal]), 228} 229 230impl<'a> Iterator for LiteralIter<'a> { 231 type Item = &'a [u8]; 232 233 fn next(&mut self) -> Option<Self::Item> { 234 match *self { 235 LiteralIter::Empty => None, 236 LiteralIter::Bytes(ref mut many) => { 237 if many.is_empty() { 238 None 239 } else { 240 let next = &many[0..1]; 241 *many = &many[1..]; 242 Some(next) 243 } 244 } 245 LiteralIter::Single(ref mut one) => { 246 if one.is_empty() { 247 None 248 } else { 249 let next = &one[..]; 250 *one = &[]; 251 Some(next) 252 } 253 } 254 LiteralIter::AC(ref mut lits) => { 255 if lits.is_empty() { 256 None 257 } else { 258 let next = &lits[0]; 259 *lits = &lits[1..]; 260 Some(&**next) 261 } 262 } 263 LiteralIter::Packed(ref mut lits) => { 264 if lits.is_empty() { 265 None 266 } else { 267 let next = &lits[0]; 268 *lits = &lits[1..]; 269 Some(&**next) 270 } 271 } 272 } 273 } 274} 275 276#[derive(Clone, Debug)] 277struct SingleByteSet { 278 sparse: Vec<bool>, 279 dense: Vec<u8>, 280 complete: bool, 281 all_ascii: bool, 282} 283 284impl SingleByteSet { 285 fn new() -> SingleByteSet { 286 SingleByteSet { 287 sparse: vec![false; 256], 288 dense: vec![], 289 complete: true, 290 all_ascii: true, 291 } 292 } 293 294 fn prefixes(lits: &Literals) -> SingleByteSet { 295 let mut sset = SingleByteSet::new(); 296 for lit in lits.literals() { 297 sset.complete = sset.complete && lit.len() == 1; 298 if let Some(&b) = lit.get(0) { 299 if !sset.sparse[b as usize] { 300 if b > 0x7F { 301 sset.all_ascii = false; 302 } 303 sset.dense.push(b); 304 sset.sparse[b as usize] = true; 305 } 306 } 307 } 308 sset 309 } 310 311 fn suffixes(lits: &Literals) -> SingleByteSet { 312 let mut sset = SingleByteSet::new(); 313 for lit in lits.literals() { 314 sset.complete = sset.complete && lit.len() == 1; 315 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { 316 if !sset.sparse[b as usize] { 317 if b > 0x7F { 318 sset.all_ascii = false; 319 } 320 sset.dense.push(b); 321 sset.sparse[b as usize] = true; 322 } 323 } 324 } 325 sset 326 } 327 328 /// Faster find that special cases certain sizes to use memchr. 329 #[cfg_attr(feature = "perf-inline", inline(always))] 330 fn find(&self, text: &[u8]) -> Option<usize> { 331 match self.dense.len() { 332 0 => None, 333 1 => memchr(self.dense[0], text), 334 2 => memchr2(self.dense[0], self.dense[1], text), 335 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), 336 _ => self._find(text), 337 } 338 } 339 340 /// Generic find that works on any sized set. 341 fn _find(&self, haystack: &[u8]) -> Option<usize> { 342 for (i, &b) in haystack.iter().enumerate() { 343 if self.sparse[b as usize] { 344 return Some(i); 345 } 346 } 347 None 348 } 349 350 fn approximate_size(&self) -> usize { 351 (self.dense.len() * mem::size_of::<u8>()) 352 + (self.sparse.len() * mem::size_of::<bool>()) 353 } 354} 355 356/// A simple wrapper around the memchr crate's memmem implementation. 357/// 358/// The API this exposes mirrors the API of previous substring searchers that 359/// this supplanted. 360#[derive(Clone, Debug)] 361pub struct Memmem { 362 finder: memmem::Finder<'static>, 363 char_len: usize, 364} 365 366impl Memmem { 367 fn new(pat: &[u8]) -> Memmem { 368 Memmem { 369 finder: memmem::Finder::new(pat).into_owned(), 370 char_len: char_len_lossy(pat), 371 } 372 } 373 374 #[cfg_attr(feature = "perf-inline", inline(always))] 375 pub fn find(&self, haystack: &[u8]) -> Option<usize> { 376 self.finder.find(haystack) 377 } 378 379 #[cfg_attr(feature = "perf-inline", inline(always))] 380 pub fn is_suffix(&self, text: &[u8]) -> bool { 381 if text.len() < self.len() { 382 return false; 383 } 384 &text[text.len() - self.len()..] == self.finder.needle() 385 } 386 387 pub fn len(&self) -> usize { 388 self.finder.needle().len() 389 } 390 391 pub fn char_len(&self) -> usize { 392 self.char_len 393 } 394 395 fn approximate_size(&self) -> usize { 396 self.finder.needle().len() * mem::size_of::<u8>() 397 } 398} 399 400fn char_len_lossy(bytes: &[u8]) -> usize { 401 String::from_utf8_lossy(bytes).chars().count() 402} 403