1macro_rules! define_set { 2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, 3 $(#[$doc_regexset_example:meta])* ) => { 4 pub mod $name { 5 use std::fmt; 6 use std::iter; 7 use std::slice; 8 use std::vec; 9 10 use crate::error::Error; 11 use crate::exec::Exec; 12 use crate::re_builder::$builder_mod::RegexSetBuilder; 13 use crate::re_trait::RegularExpression; 14 15/// Match multiple (possibly overlapping) regular expressions in a single scan. 16/// 17/// A regex set corresponds to the union of two or more regular expressions. 18/// That is, a regex set will match text where at least one of its 19/// constituent regular expressions matches. A regex set as its formulated here 20/// provides a touch more power: it will also report *which* regular 21/// expressions in the set match. Indeed, this is the key difference between 22/// regex sets and a single `Regex` with many alternates, since only one 23/// alternate can match at a time. 24/// 25/// For example, consider regular expressions to match email addresses and 26/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a 27/// regex set is constructed from those regexes, then searching the text 28/// `foo@example.com` will report both regexes as matching. Of course, one 29/// could accomplish this by compiling each regex on its own and doing two 30/// searches over the text. The key advantage of using a regex set is that it 31/// will report the matching regexes using a *single pass through the text*. 32/// If one has hundreds or thousands of regexes to match repeatedly (like a URL 33/// router for a complex web application or a user agent matcher), then a regex 34/// set can realize huge performance gains. 35/// 36/// # Example 37/// 38/// This shows how the above two regexes (for matching email addresses and 39/// domains) might work: 40/// 41$(#[$doc_regexset_example])* 42/// 43/// Note that it would be possible to adapt the above example to using `Regex` 44/// with an expression like: 45/// 46/// ```text 47/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) 48/// ``` 49/// 50/// After a match, one could then inspect the capture groups to figure out 51/// which alternates matched. The problem is that it is hard to make this 52/// approach scale when there are many regexes since the overlap between each 53/// alternate isn't always obvious to reason about. 54/// 55/// # Limitations 56/// 57/// Regex sets are limited to answering the following two questions: 58/// 59/// 1. Does any regex in the set match? 60/// 2. If so, which regexes in the set match? 61/// 62/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) 63/// instead of (2) since the matching engines can stop after the first match 64/// is found. 65/// 66/// You cannot directly extract [`Match`][crate::Match] or 67/// [`Captures`][crate::Captures] objects from a regex set. If you need these 68/// operations, the recommended approach is to compile each pattern in the set 69/// independently and scan the exact same input a second time with those 70/// independently compiled patterns: 71/// 72/// ```rust 73/// use regex::{Regex, RegexSet}; 74/// 75/// let patterns = ["foo", "bar"]; 76/// // Both patterns will match different ranges of this string. 77/// let text = "barfoo"; 78/// 79/// // Compile a set matching any of our patterns. 80/// let set = RegexSet::new(&patterns).unwrap(); 81/// // Compile each pattern independently. 82/// let regexes: Vec<_> = set.patterns().iter() 83/// .map(|pat| Regex::new(pat).unwrap()) 84/// .collect(); 85/// 86/// // Match against the whole set first and identify the individual 87/// // matching patterns. 88/// let matches: Vec<&str> = set.matches(text).into_iter() 89/// // Dereference the match index to get the corresponding 90/// // compiled pattern. 91/// .map(|match_idx| ®exes[match_idx]) 92/// // To get match locations or any other info, we then have to search 93/// // the exact same text again, using our separately-compiled pattern. 94/// .map(|pat| pat.find(text).unwrap().as_str()) 95/// .collect(); 96/// 97/// // Matches arrive in the order the constituent patterns were declared, 98/// // not the order they appear in the input. 99/// assert_eq!(vec!["foo", "bar"], matches); 100/// ``` 101/// 102/// # Performance 103/// 104/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, 105/// search takes `O(mn)` time, where `m` is proportional to the size of the 106/// regex set and `n` is proportional to the length of the search text. 107#[derive(Clone)] 108pub struct RegexSet(Exec); 109 110impl RegexSet { 111 /// Create a new regex set with the given regular expressions. 112 /// 113 /// This takes an iterator of `S`, where `S` is something that can produce 114 /// a `&str`. If any of the strings in the iterator are not valid regular 115 /// expressions, then an error is returned. 116 /// 117 /// # Example 118 /// 119 /// Create a new regex set from an iterator of strings: 120 /// 121 /// ```rust 122 /// # use regex::RegexSet; 123 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 124 /// assert!(set.is_match("foo")); 125 /// ``` 126 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> 127 where S: AsRef<str>, I: IntoIterator<Item=S> { 128 RegexSetBuilder::new(exprs).build() 129 } 130 131 /// Create a new empty regex set. 132 /// 133 /// # Example 134 /// 135 /// ```rust 136 /// # use regex::RegexSet; 137 /// let set = RegexSet::empty(); 138 /// assert!(set.is_empty()); 139 /// ``` 140 pub fn empty() -> RegexSet { 141 RegexSetBuilder::new(&[""; 0]).build().unwrap() 142 } 143 144 /// Returns true if and only if one of the regexes in this set matches 145 /// the text given. 146 /// 147 /// This method should be preferred if you only need to test whether any 148 /// of the regexes in the set should match, but don't care about *which* 149 /// regexes matched. This is because the underlying matching engine will 150 /// quit immediately after seeing the first match instead of continuing to 151 /// find all matches. 152 /// 153 /// Note that as with searches using `Regex`, the expression is unanchored 154 /// by default. That is, if the regex does not start with `^` or `\A`, or 155 /// end with `$` or `\z`, then it is permitted to match anywhere in the 156 /// text. 157 /// 158 /// # Example 159 /// 160 /// Tests whether a set matches some text: 161 /// 162 /// ```rust 163 /// # use regex::RegexSet; 164 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 165 /// assert!(set.is_match("foo")); 166 /// assert!(!set.is_match("☃")); 167 /// ``` 168 pub fn is_match(&self, text: $text_ty) -> bool { 169 self.is_match_at(text, 0) 170 } 171 172 /// Returns the same as is_match, but starts the search at the given 173 /// offset. 174 /// 175 /// The significance of the starting point is that it takes the surrounding 176 /// context into consideration. For example, the `\A` anchor can only 177 /// match when `start == 0`. 178 #[doc(hidden)] 179 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { 180 self.0.searcher().is_match_at($as_bytes(text), start) 181 } 182 183 /// Returns the set of regular expressions that match in the given text. 184 /// 185 /// The set returned contains the index of each regular expression that 186 /// matches in the given text. The index is in correspondence with the 187 /// order of regular expressions given to `RegexSet`'s constructor. 188 /// 189 /// The set can also be used to iterate over the matched indices. 190 /// 191 /// Note that as with searches using `Regex`, the expression is unanchored 192 /// by default. That is, if the regex does not start with `^` or `\A`, or 193 /// end with `$` or `\z`, then it is permitted to match anywhere in the 194 /// text. 195 /// 196 /// # Example 197 /// 198 /// Tests which regular expressions match the given text: 199 /// 200 /// ```rust 201 /// # use regex::RegexSet; 202 /// let set = RegexSet::new(&[ 203 /// r"\w+", 204 /// r"\d+", 205 /// r"\pL+", 206 /// r"foo", 207 /// r"bar", 208 /// r"barfoo", 209 /// r"foobar", 210 /// ]).unwrap(); 211 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); 212 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); 213 /// 214 /// // You can also test whether a particular regex matched: 215 /// let matches = set.matches("foobar"); 216 /// assert!(!matches.matched(5)); 217 /// assert!(matches.matched(6)); 218 /// ``` 219 pub fn matches(&self, text: $text_ty) -> SetMatches { 220 let mut matches = vec![false; self.0.regex_strings().len()]; 221 let any = self.read_matches_at(&mut matches, text, 0); 222 SetMatches { 223 matched_any: any, 224 matches: matches, 225 } 226 } 227 228 /// Returns the same as matches, but starts the search at the given 229 /// offset and stores the matches into the slice given. 230 /// 231 /// The significance of the starting point is that it takes the surrounding 232 /// context into consideration. For example, the `\A` anchor can only 233 /// match when `start == 0`. 234 /// 235 /// `matches` must have a length that is at least the number of regexes 236 /// in this set. 237 /// 238 /// This method returns true if and only if at least one member of 239 /// `matches` is true after executing the set against `text`. 240 #[doc(hidden)] 241 pub fn read_matches_at( 242 &self, 243 matches: &mut [bool], 244 text: $text_ty, 245 start: usize, 246 ) -> bool { 247 self.0.searcher().many_matches_at(matches, $as_bytes(text), start) 248 } 249 250 /// Returns the total number of regular expressions in this set. 251 pub fn len(&self) -> usize { 252 self.0.regex_strings().len() 253 } 254 255 /// Returns `true` if this set contains no regular expressions. 256 pub fn is_empty(&self) -> bool { 257 self.0.regex_strings().is_empty() 258 } 259 260 /// Returns the patterns that this set will match on. 261 /// 262 /// This function can be used to determine the pattern for a match. The 263 /// slice returned has exactly as many patterns givens to this regex set, 264 /// and the order of the slice is the same as the order of the patterns 265 /// provided to the set. 266 /// 267 /// # Example 268 /// 269 /// ```rust 270 /// # use regex::RegexSet; 271 /// let set = RegexSet::new(&[ 272 /// r"\w+", 273 /// r"\d+", 274 /// r"\pL+", 275 /// r"foo", 276 /// r"bar", 277 /// r"barfoo", 278 /// r"foobar", 279 /// ]).unwrap(); 280 /// let matches: Vec<_> = set 281 /// .matches("foobar") 282 /// .into_iter() 283 /// .map(|match_idx| &set.patterns()[match_idx]) 284 /// .collect(); 285 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); 286 /// ``` 287 pub fn patterns(&self) -> &[String] { 288 self.0.regex_strings() 289 } 290} 291 292/// A set of matches returned by a regex set. 293#[derive(Clone, Debug)] 294pub struct SetMatches { 295 matched_any: bool, 296 matches: Vec<bool>, 297} 298 299impl SetMatches { 300 /// Whether this set contains any matches. 301 pub fn matched_any(&self) -> bool { 302 self.matched_any 303 } 304 305 /// Whether the regex at the given index matched. 306 /// 307 /// The index for a regex is determined by its insertion order upon the 308 /// initial construction of a `RegexSet`, starting at `0`. 309 /// 310 /// # Panics 311 /// 312 /// If `regex_index` is greater than or equal to `self.len()`. 313 pub fn matched(&self, regex_index: usize) -> bool { 314 self.matches[regex_index] 315 } 316 317 /// The total number of regexes in the set that created these matches. 318 pub fn len(&self) -> usize { 319 self.matches.len() 320 } 321 322 /// Returns an iterator over indexes in the regex that matched. 323 /// 324 /// This will always produces matches in ascending order of index, where 325 /// the index corresponds to the index of the regex that matched with 326 /// respect to its position when initially building the set. 327 pub fn iter(&self) -> SetMatchesIter<'_> { 328 SetMatchesIter((&*self.matches).into_iter().enumerate()) 329 } 330} 331 332impl IntoIterator for SetMatches { 333 type IntoIter = SetMatchesIntoIter; 334 type Item = usize; 335 336 fn into_iter(self) -> Self::IntoIter { 337 SetMatchesIntoIter(self.matches.into_iter().enumerate()) 338 } 339} 340 341impl<'a> IntoIterator for &'a SetMatches { 342 type IntoIter = SetMatchesIter<'a>; 343 type Item = usize; 344 345 fn into_iter(self) -> Self::IntoIter { 346 self.iter() 347 } 348} 349 350/// An owned iterator over the set of matches from a regex set. 351/// 352/// This will always produces matches in ascending order of index, where the 353/// index corresponds to the index of the regex that matched with respect to 354/// its position when initially building the set. 355#[derive(Debug)] 356pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); 357 358impl Iterator for SetMatchesIntoIter { 359 type Item = usize; 360 361 fn next(&mut self) -> Option<usize> { 362 loop { 363 match self.0.next() { 364 None => return None, 365 Some((_, false)) => {} 366 Some((i, true)) => return Some(i), 367 } 368 } 369 } 370 371 fn size_hint(&self) -> (usize, Option<usize>) { 372 self.0.size_hint() 373 } 374} 375 376impl DoubleEndedIterator for SetMatchesIntoIter { 377 fn next_back(&mut self) -> Option<usize> { 378 loop { 379 match self.0.next_back() { 380 None => return None, 381 Some((_, false)) => {} 382 Some((i, true)) => return Some(i), 383 } 384 } 385 } 386} 387 388impl iter::FusedIterator for SetMatchesIntoIter {} 389 390/// A borrowed iterator over the set of matches from a regex set. 391/// 392/// The lifetime `'a` refers to the lifetime of a `SetMatches` value. 393/// 394/// This will always produces matches in ascending order of index, where the 395/// index corresponds to the index of the regex that matched with respect to 396/// its position when initially building the set. 397#[derive(Clone, Debug)] 398pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); 399 400impl<'a> Iterator for SetMatchesIter<'a> { 401 type Item = usize; 402 403 fn next(&mut self) -> Option<usize> { 404 loop { 405 match self.0.next() { 406 None => return None, 407 Some((_, &false)) => {} 408 Some((i, &true)) => return Some(i), 409 } 410 } 411 } 412 413 fn size_hint(&self) -> (usize, Option<usize>) { 414 self.0.size_hint() 415 } 416} 417 418impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { 419 fn next_back(&mut self) -> Option<usize> { 420 loop { 421 match self.0.next_back() { 422 None => return None, 423 Some((_, &false)) => {} 424 Some((i, &true)) => return Some(i), 425 } 426 } 427 } 428} 429 430impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} 431 432#[doc(hidden)] 433impl From<Exec> for RegexSet { 434 fn from(exec: Exec) -> Self { 435 RegexSet(exec) 436 } 437} 438 439impl fmt::Debug for RegexSet { 440 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 441 write!(f, "RegexSet({:?})", self.0.regex_strings()) 442 } 443} 444 445#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } 446#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } 447 } 448 } 449} 450 451define_set! { 452 unicode, 453 set_unicode, 454 &str, 455 as_bytes_str, 456/// ```rust 457/// # use regex::RegexSet; 458/// let set = RegexSet::new(&[ 459/// r"[a-z]+@[a-z]+\.(com|org|net)", 460/// r"[a-z]+\.(com|org|net)", 461/// ]).unwrap(); 462/// 463/// // Ask whether any regexes in the set match. 464/// assert!(set.is_match("foo@example.com")); 465/// 466/// // Identify which regexes in the set match. 467/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); 468/// assert_eq!(vec![0, 1], matches); 469/// 470/// // Try again, but with text that only matches one of the regexes. 471/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); 472/// assert_eq!(vec![1], matches); 473/// 474/// // Try again, but with text that doesn't match any regex in the set. 475/// let matches: Vec<_> = set.matches("example").into_iter().collect(); 476/// assert!(matches.is_empty()); 477/// ``` 478} 479 480define_set! { 481 bytes, 482 set_bytes, 483 &[u8], 484 as_bytes_bytes, 485/// ```rust 486/// # use regex::bytes::RegexSet; 487/// let set = RegexSet::new(&[ 488/// r"[a-z]+@[a-z]+\.(com|org|net)", 489/// r"[a-z]+\.(com|org|net)", 490/// ]).unwrap(); 491/// 492/// // Ask whether any regexes in the set match. 493/// assert!(set.is_match(b"foo@example.com")); 494/// 495/// // Identify which regexes in the set match. 496/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); 497/// assert_eq!(vec![0, 1], matches); 498/// 499/// // Try again, but with text that only matches one of the regexes. 500/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); 501/// assert_eq!(vec![1], matches); 502/// 503/// // Try again, but with text that doesn't match any regex in the set. 504/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); 505/// assert!(matches.is_empty()); 506/// ``` 507} 508