1c67d6573Sopenharmony_ciuse std::fmt; 2c67d6573Sopenharmony_ciuse std::iter::FusedIterator; 3c67d6573Sopenharmony_ci 4c67d6573Sopenharmony_ci/// Slot is a single saved capture location. Note that there are two slots for 5c67d6573Sopenharmony_ci/// every capture in a regular expression (one slot each for the start and end 6c67d6573Sopenharmony_ci/// of the capture). 7c67d6573Sopenharmony_cipub type Slot = Option<usize>; 8c67d6573Sopenharmony_ci 9c67d6573Sopenharmony_ci/// Locations represents the offsets of each capturing group in a regex for 10c67d6573Sopenharmony_ci/// a single match. 11c67d6573Sopenharmony_ci/// 12c67d6573Sopenharmony_ci/// Unlike `Captures`, a `Locations` value only stores offsets. 13c67d6573Sopenharmony_ci#[doc(hidden)] 14c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 15c67d6573Sopenharmony_cipub struct Locations(Vec<Slot>); 16c67d6573Sopenharmony_ci 17c67d6573Sopenharmony_ciimpl Locations { 18c67d6573Sopenharmony_ci /// Returns the start and end positions of the Nth capture group. Returns 19c67d6573Sopenharmony_ci /// `None` if `i` is not a valid capture group or if the capture group did 20c67d6573Sopenharmony_ci /// not match anything. The positions returned are *always* byte indices 21c67d6573Sopenharmony_ci /// with respect to the original string matched. 22c67d6573Sopenharmony_ci pub fn pos(&self, i: usize) -> Option<(usize, usize)> { 23c67d6573Sopenharmony_ci let (s, e) = (i * 2, i * 2 + 1); 24c67d6573Sopenharmony_ci match (self.0.get(s), self.0.get(e)) { 25c67d6573Sopenharmony_ci (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), 26c67d6573Sopenharmony_ci _ => None, 27c67d6573Sopenharmony_ci } 28c67d6573Sopenharmony_ci } 29c67d6573Sopenharmony_ci 30c67d6573Sopenharmony_ci /// Creates an iterator of all the capture group positions in order of 31c67d6573Sopenharmony_ci /// appearance in the regular expression. Positions are byte indices 32c67d6573Sopenharmony_ci /// in terms of the original string matched. 33c67d6573Sopenharmony_ci pub fn iter(&self) -> SubCapturesPosIter<'_> { 34c67d6573Sopenharmony_ci SubCapturesPosIter { idx: 0, locs: self } 35c67d6573Sopenharmony_ci } 36c67d6573Sopenharmony_ci 37c67d6573Sopenharmony_ci /// Returns the total number of capturing groups. 38c67d6573Sopenharmony_ci /// 39c67d6573Sopenharmony_ci /// This is always at least `1` since every regex has at least `1` 40c67d6573Sopenharmony_ci /// capturing group that corresponds to the entire match. 41c67d6573Sopenharmony_ci pub fn len(&self) -> usize { 42c67d6573Sopenharmony_ci self.0.len() / 2 43c67d6573Sopenharmony_ci } 44c67d6573Sopenharmony_ci 45c67d6573Sopenharmony_ci /// Return the individual slots as a slice. 46c67d6573Sopenharmony_ci pub(crate) fn as_slots(&mut self) -> &mut [Slot] { 47c67d6573Sopenharmony_ci &mut self.0 48c67d6573Sopenharmony_ci } 49c67d6573Sopenharmony_ci} 50c67d6573Sopenharmony_ci 51c67d6573Sopenharmony_ci/// An iterator over capture group positions for a particular match of a 52c67d6573Sopenharmony_ci/// regular expression. 53c67d6573Sopenharmony_ci/// 54c67d6573Sopenharmony_ci/// Positions are byte indices in terms of the original string matched. 55c67d6573Sopenharmony_ci/// 56c67d6573Sopenharmony_ci/// `'c` is the lifetime of the captures. 57c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 58c67d6573Sopenharmony_cipub struct SubCapturesPosIter<'c> { 59c67d6573Sopenharmony_ci idx: usize, 60c67d6573Sopenharmony_ci locs: &'c Locations, 61c67d6573Sopenharmony_ci} 62c67d6573Sopenharmony_ci 63c67d6573Sopenharmony_ciimpl<'c> Iterator for SubCapturesPosIter<'c> { 64c67d6573Sopenharmony_ci type Item = Option<(usize, usize)>; 65c67d6573Sopenharmony_ci 66c67d6573Sopenharmony_ci fn next(&mut self) -> Option<Option<(usize, usize)>> { 67c67d6573Sopenharmony_ci if self.idx >= self.locs.len() { 68c67d6573Sopenharmony_ci return None; 69c67d6573Sopenharmony_ci } 70c67d6573Sopenharmony_ci let x = match self.locs.pos(self.idx) { 71c67d6573Sopenharmony_ci None => Some(None), 72c67d6573Sopenharmony_ci Some((s, e)) => Some(Some((s, e))), 73c67d6573Sopenharmony_ci }; 74c67d6573Sopenharmony_ci self.idx += 1; 75c67d6573Sopenharmony_ci x 76c67d6573Sopenharmony_ci } 77c67d6573Sopenharmony_ci 78c67d6573Sopenharmony_ci fn size_hint(&self) -> (usize, Option<usize>) { 79c67d6573Sopenharmony_ci let len = self.locs.len() - self.idx; 80c67d6573Sopenharmony_ci (len, Some(len)) 81c67d6573Sopenharmony_ci } 82c67d6573Sopenharmony_ci 83c67d6573Sopenharmony_ci fn count(self) -> usize { 84c67d6573Sopenharmony_ci self.len() 85c67d6573Sopenharmony_ci } 86c67d6573Sopenharmony_ci} 87c67d6573Sopenharmony_ci 88c67d6573Sopenharmony_ciimpl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} 89c67d6573Sopenharmony_ci 90c67d6573Sopenharmony_ciimpl<'c> FusedIterator for SubCapturesPosIter<'c> {} 91c67d6573Sopenharmony_ci 92c67d6573Sopenharmony_ci/// `RegularExpression` describes types that can implement regex searching. 93c67d6573Sopenharmony_ci/// 94c67d6573Sopenharmony_ci/// This trait is my attempt at reducing code duplication and to standardize 95c67d6573Sopenharmony_ci/// the internal API. Specific duplication that is avoided are the `find` 96c67d6573Sopenharmony_ci/// and `capture` iterators, which are slightly tricky. 97c67d6573Sopenharmony_ci/// 98c67d6573Sopenharmony_ci/// It's not clear whether this trait is worth it, and it also isn't 99c67d6573Sopenharmony_ci/// clear whether it's useful as a public trait or not. Methods like 100c67d6573Sopenharmony_ci/// `next_after_empty` reak of bad design, but the rest of the methods seem 101c67d6573Sopenharmony_ci/// somewhat reasonable. One particular thing this trait would expose would be 102c67d6573Sopenharmony_ci/// the ability to start the search of a regex anywhere in a haystack, which 103c67d6573Sopenharmony_ci/// isn't possible in the current public API. 104c67d6573Sopenharmony_cipub trait RegularExpression: Sized + fmt::Debug { 105c67d6573Sopenharmony_ci /// The type of the haystack. 106c67d6573Sopenharmony_ci type Text: ?Sized + fmt::Debug; 107c67d6573Sopenharmony_ci 108c67d6573Sopenharmony_ci /// The number of capture slots in the compiled regular expression. This is 109c67d6573Sopenharmony_ci /// always two times the number of capture groups (two slots per group). 110c67d6573Sopenharmony_ci fn slots_len(&self) -> usize; 111c67d6573Sopenharmony_ci 112c67d6573Sopenharmony_ci /// Allocates fresh space for all capturing groups in this regex. 113c67d6573Sopenharmony_ci fn locations(&self) -> Locations { 114c67d6573Sopenharmony_ci Locations(vec![None; self.slots_len()]) 115c67d6573Sopenharmony_ci } 116c67d6573Sopenharmony_ci 117c67d6573Sopenharmony_ci /// Returns the position of the next character after `i`. 118c67d6573Sopenharmony_ci /// 119c67d6573Sopenharmony_ci /// For example, a haystack with type `&[u8]` probably returns `i+1`, 120c67d6573Sopenharmony_ci /// whereas a haystack with type `&str` probably returns `i` plus the 121c67d6573Sopenharmony_ci /// length of the next UTF-8 sequence. 122c67d6573Sopenharmony_ci fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; 123c67d6573Sopenharmony_ci 124c67d6573Sopenharmony_ci /// Returns the location of the shortest match. 125c67d6573Sopenharmony_ci fn shortest_match_at( 126c67d6573Sopenharmony_ci &self, 127c67d6573Sopenharmony_ci text: &Self::Text, 128c67d6573Sopenharmony_ci start: usize, 129c67d6573Sopenharmony_ci ) -> Option<usize>; 130c67d6573Sopenharmony_ci 131c67d6573Sopenharmony_ci /// Returns whether the regex matches the text given. 132c67d6573Sopenharmony_ci fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; 133c67d6573Sopenharmony_ci 134c67d6573Sopenharmony_ci /// Returns the leftmost-first match location if one exists. 135c67d6573Sopenharmony_ci fn find_at( 136c67d6573Sopenharmony_ci &self, 137c67d6573Sopenharmony_ci text: &Self::Text, 138c67d6573Sopenharmony_ci start: usize, 139c67d6573Sopenharmony_ci ) -> Option<(usize, usize)>; 140c67d6573Sopenharmony_ci 141c67d6573Sopenharmony_ci /// Returns the leftmost-first match location if one exists, and also 142c67d6573Sopenharmony_ci /// fills in any matching capture slot locations. 143c67d6573Sopenharmony_ci fn captures_read_at( 144c67d6573Sopenharmony_ci &self, 145c67d6573Sopenharmony_ci locs: &mut Locations, 146c67d6573Sopenharmony_ci text: &Self::Text, 147c67d6573Sopenharmony_ci start: usize, 148c67d6573Sopenharmony_ci ) -> Option<(usize, usize)>; 149c67d6573Sopenharmony_ci 150c67d6573Sopenharmony_ci /// Returns an iterator over all non-overlapping successive leftmost-first 151c67d6573Sopenharmony_ci /// matches. 152c67d6573Sopenharmony_ci fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { 153c67d6573Sopenharmony_ci Matches { re: self, text, last_end: 0, last_match: None } 154c67d6573Sopenharmony_ci } 155c67d6573Sopenharmony_ci 156c67d6573Sopenharmony_ci /// Returns an iterator over all non-overlapping successive leftmost-first 157c67d6573Sopenharmony_ci /// matches with captures. 158c67d6573Sopenharmony_ci fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { 159c67d6573Sopenharmony_ci CaptureMatches(self.find_iter(text)) 160c67d6573Sopenharmony_ci } 161c67d6573Sopenharmony_ci} 162c67d6573Sopenharmony_ci 163c67d6573Sopenharmony_ci/// An iterator over all non-overlapping successive leftmost-first matches. 164c67d6573Sopenharmony_ci#[derive(Debug)] 165c67d6573Sopenharmony_cipub struct Matches<'t, R> 166c67d6573Sopenharmony_ciwhere 167c67d6573Sopenharmony_ci R: RegularExpression, 168c67d6573Sopenharmony_ci R::Text: 't, 169c67d6573Sopenharmony_ci{ 170c67d6573Sopenharmony_ci re: R, 171c67d6573Sopenharmony_ci text: &'t R::Text, 172c67d6573Sopenharmony_ci last_end: usize, 173c67d6573Sopenharmony_ci last_match: Option<usize>, 174c67d6573Sopenharmony_ci} 175c67d6573Sopenharmony_ci 176c67d6573Sopenharmony_ciimpl<'t, R> Matches<'t, R> 177c67d6573Sopenharmony_ciwhere 178c67d6573Sopenharmony_ci R: RegularExpression, 179c67d6573Sopenharmony_ci R::Text: 't, 180c67d6573Sopenharmony_ci{ 181c67d6573Sopenharmony_ci /// Return the text being searched. 182c67d6573Sopenharmony_ci pub fn text(&self) -> &'t R::Text { 183c67d6573Sopenharmony_ci self.text 184c67d6573Sopenharmony_ci } 185c67d6573Sopenharmony_ci 186c67d6573Sopenharmony_ci /// Return the underlying regex. 187c67d6573Sopenharmony_ci pub fn regex(&self) -> &R { 188c67d6573Sopenharmony_ci &self.re 189c67d6573Sopenharmony_ci } 190c67d6573Sopenharmony_ci} 191c67d6573Sopenharmony_ci 192c67d6573Sopenharmony_ciimpl<'t, R> Iterator for Matches<'t, R> 193c67d6573Sopenharmony_ciwhere 194c67d6573Sopenharmony_ci R: RegularExpression, 195c67d6573Sopenharmony_ci R::Text: 't + AsRef<[u8]>, 196c67d6573Sopenharmony_ci{ 197c67d6573Sopenharmony_ci type Item = (usize, usize); 198c67d6573Sopenharmony_ci 199c67d6573Sopenharmony_ci fn next(&mut self) -> Option<(usize, usize)> { 200c67d6573Sopenharmony_ci if self.last_end > self.text.as_ref().len() { 201c67d6573Sopenharmony_ci return None; 202c67d6573Sopenharmony_ci } 203c67d6573Sopenharmony_ci let (s, e) = match self.re.find_at(self.text, self.last_end) { 204c67d6573Sopenharmony_ci None => return None, 205c67d6573Sopenharmony_ci Some((s, e)) => (s, e), 206c67d6573Sopenharmony_ci }; 207c67d6573Sopenharmony_ci if s == e { 208c67d6573Sopenharmony_ci // This is an empty match. To ensure we make progress, start 209c67d6573Sopenharmony_ci // the next search at the smallest possible starting position 210c67d6573Sopenharmony_ci // of the next match following this one. 211c67d6573Sopenharmony_ci self.last_end = self.re.next_after_empty(self.text, e); 212c67d6573Sopenharmony_ci // Don't accept empty matches immediately following a match. 213c67d6573Sopenharmony_ci // Just move on to the next match. 214c67d6573Sopenharmony_ci if Some(e) == self.last_match { 215c67d6573Sopenharmony_ci return self.next(); 216c67d6573Sopenharmony_ci } 217c67d6573Sopenharmony_ci } else { 218c67d6573Sopenharmony_ci self.last_end = e; 219c67d6573Sopenharmony_ci } 220c67d6573Sopenharmony_ci self.last_match = Some(e); 221c67d6573Sopenharmony_ci Some((s, e)) 222c67d6573Sopenharmony_ci } 223c67d6573Sopenharmony_ci} 224c67d6573Sopenharmony_ci 225c67d6573Sopenharmony_ciimpl<'t, R> FusedIterator for Matches<'t, R> 226c67d6573Sopenharmony_ciwhere 227c67d6573Sopenharmony_ci R: RegularExpression, 228c67d6573Sopenharmony_ci R::Text: 't + AsRef<[u8]>, 229c67d6573Sopenharmony_ci{ 230c67d6573Sopenharmony_ci} 231c67d6573Sopenharmony_ci 232c67d6573Sopenharmony_ci/// An iterator over all non-overlapping successive leftmost-first matches with 233c67d6573Sopenharmony_ci/// captures. 234c67d6573Sopenharmony_ci#[derive(Debug)] 235c67d6573Sopenharmony_cipub struct CaptureMatches<'t, R>(Matches<'t, R>) 236c67d6573Sopenharmony_ciwhere 237c67d6573Sopenharmony_ci R: RegularExpression, 238c67d6573Sopenharmony_ci R::Text: 't; 239c67d6573Sopenharmony_ci 240c67d6573Sopenharmony_ciimpl<'t, R> CaptureMatches<'t, R> 241c67d6573Sopenharmony_ciwhere 242c67d6573Sopenharmony_ci R: RegularExpression, 243c67d6573Sopenharmony_ci R::Text: 't, 244c67d6573Sopenharmony_ci{ 245c67d6573Sopenharmony_ci /// Return the text being searched. 246c67d6573Sopenharmony_ci pub fn text(&self) -> &'t R::Text { 247c67d6573Sopenharmony_ci self.0.text() 248c67d6573Sopenharmony_ci } 249c67d6573Sopenharmony_ci 250c67d6573Sopenharmony_ci /// Return the underlying regex. 251c67d6573Sopenharmony_ci pub fn regex(&self) -> &R { 252c67d6573Sopenharmony_ci self.0.regex() 253c67d6573Sopenharmony_ci } 254c67d6573Sopenharmony_ci} 255c67d6573Sopenharmony_ci 256c67d6573Sopenharmony_ciimpl<'t, R> Iterator for CaptureMatches<'t, R> 257c67d6573Sopenharmony_ciwhere 258c67d6573Sopenharmony_ci R: RegularExpression, 259c67d6573Sopenharmony_ci R::Text: 't + AsRef<[u8]>, 260c67d6573Sopenharmony_ci{ 261c67d6573Sopenharmony_ci type Item = Locations; 262c67d6573Sopenharmony_ci 263c67d6573Sopenharmony_ci fn next(&mut self) -> Option<Locations> { 264c67d6573Sopenharmony_ci if self.0.last_end > self.0.text.as_ref().len() { 265c67d6573Sopenharmony_ci return None; 266c67d6573Sopenharmony_ci } 267c67d6573Sopenharmony_ci let mut locs = self.0.re.locations(); 268c67d6573Sopenharmony_ci let (s, e) = match self.0.re.captures_read_at( 269c67d6573Sopenharmony_ci &mut locs, 270c67d6573Sopenharmony_ci self.0.text, 271c67d6573Sopenharmony_ci self.0.last_end, 272c67d6573Sopenharmony_ci ) { 273c67d6573Sopenharmony_ci None => return None, 274c67d6573Sopenharmony_ci Some((s, e)) => (s, e), 275c67d6573Sopenharmony_ci }; 276c67d6573Sopenharmony_ci if s == e { 277c67d6573Sopenharmony_ci self.0.last_end = self.0.re.next_after_empty(self.0.text, e); 278c67d6573Sopenharmony_ci if Some(e) == self.0.last_match { 279c67d6573Sopenharmony_ci return self.next(); 280c67d6573Sopenharmony_ci } 281c67d6573Sopenharmony_ci } else { 282c67d6573Sopenharmony_ci self.0.last_end = e; 283c67d6573Sopenharmony_ci } 284c67d6573Sopenharmony_ci self.0.last_match = Some(e); 285c67d6573Sopenharmony_ci Some(locs) 286c67d6573Sopenharmony_ci } 287c67d6573Sopenharmony_ci} 288c67d6573Sopenharmony_ci 289c67d6573Sopenharmony_ciimpl<'t, R> FusedIterator for CaptureMatches<'t, R> 290c67d6573Sopenharmony_ciwhere 291c67d6573Sopenharmony_ci R: RegularExpression, 292c67d6573Sopenharmony_ci R::Text: 't + AsRef<[u8]>, 293c67d6573Sopenharmony_ci{ 294c67d6573Sopenharmony_ci} 295