1use std::fmt; 2use std::iter::FusedIterator; 3 4/// Slot is a single saved capture location. Note that there are two slots for 5/// every capture in a regular expression (one slot each for the start and end 6/// of the capture). 7pub type Slot = Option<usize>; 8 9/// Locations represents the offsets of each capturing group in a regex for 10/// a single match. 11/// 12/// Unlike `Captures`, a `Locations` value only stores offsets. 13#[doc(hidden)] 14#[derive(Clone, Debug)] 15pub struct Locations(Vec<Slot>); 16 17impl Locations { 18 /// Returns the start and end positions of the Nth capture group. Returns 19 /// `None` if `i` is not a valid capture group or if the capture group did 20 /// not match anything. The positions returned are *always* byte indices 21 /// with respect to the original string matched. 22 pub fn pos(&self, i: usize) -> Option<(usize, usize)> { 23 let (s, e) = (i * 2, i * 2 + 1); 24 match (self.0.get(s), self.0.get(e)) { 25 (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), 26 _ => None, 27 } 28 } 29 30 /// Creates an iterator of all the capture group positions in order of 31 /// appearance in the regular expression. Positions are byte indices 32 /// in terms of the original string matched. 33 pub fn iter(&self) -> SubCapturesPosIter<'_> { 34 SubCapturesPosIter { idx: 0, locs: self } 35 } 36 37 /// Returns the total number of capturing groups. 38 /// 39 /// This is always at least `1` since every regex has at least `1` 40 /// capturing group that corresponds to the entire match. 41 pub fn len(&self) -> usize { 42 self.0.len() / 2 43 } 44 45 /// Return the individual slots as a slice. 46 pub(crate) fn as_slots(&mut self) -> &mut [Slot] { 47 &mut self.0 48 } 49} 50 51/// An iterator over capture group positions for a particular match of a 52/// regular expression. 53/// 54/// Positions are byte indices in terms of the original string matched. 55/// 56/// `'c` is the lifetime of the captures. 57#[derive(Clone, Debug)] 58pub struct SubCapturesPosIter<'c> { 59 idx: usize, 60 locs: &'c Locations, 61} 62 63impl<'c> Iterator for SubCapturesPosIter<'c> { 64 type Item = Option<(usize, usize)>; 65 66 fn next(&mut self) -> Option<Option<(usize, usize)>> { 67 if self.idx >= self.locs.len() { 68 return None; 69 } 70 let x = match self.locs.pos(self.idx) { 71 None => Some(None), 72 Some((s, e)) => Some(Some((s, e))), 73 }; 74 self.idx += 1; 75 x 76 } 77 78 fn size_hint(&self) -> (usize, Option<usize>) { 79 let len = self.locs.len() - self.idx; 80 (len, Some(len)) 81 } 82 83 fn count(self) -> usize { 84 self.len() 85 } 86} 87 88impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} 89 90impl<'c> FusedIterator for SubCapturesPosIter<'c> {} 91 92/// `RegularExpression` describes types that can implement regex searching. 93/// 94/// This trait is my attempt at reducing code duplication and to standardize 95/// the internal API. Specific duplication that is avoided are the `find` 96/// and `capture` iterators, which are slightly tricky. 97/// 98/// It's not clear whether this trait is worth it, and it also isn't 99/// clear whether it's useful as a public trait or not. Methods like 100/// `next_after_empty` reak of bad design, but the rest of the methods seem 101/// somewhat reasonable. One particular thing this trait would expose would be 102/// the ability to start the search of a regex anywhere in a haystack, which 103/// isn't possible in the current public API. 104pub trait RegularExpression: Sized + fmt::Debug { 105 /// The type of the haystack. 106 type Text: ?Sized + fmt::Debug; 107 108 /// The number of capture slots in the compiled regular expression. This is 109 /// always two times the number of capture groups (two slots per group). 110 fn slots_len(&self) -> usize; 111 112 /// Allocates fresh space for all capturing groups in this regex. 113 fn locations(&self) -> Locations { 114 Locations(vec![None; self.slots_len()]) 115 } 116 117 /// Returns the position of the next character after `i`. 118 /// 119 /// For example, a haystack with type `&[u8]` probably returns `i+1`, 120 /// whereas a haystack with type `&str` probably returns `i` plus the 121 /// length of the next UTF-8 sequence. 122 fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; 123 124 /// Returns the location of the shortest match. 125 fn shortest_match_at( 126 &self, 127 text: &Self::Text, 128 start: usize, 129 ) -> Option<usize>; 130 131 /// Returns whether the regex matches the text given. 132 fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; 133 134 /// Returns the leftmost-first match location if one exists. 135 fn find_at( 136 &self, 137 text: &Self::Text, 138 start: usize, 139 ) -> Option<(usize, usize)>; 140 141 /// Returns the leftmost-first match location if one exists, and also 142 /// fills in any matching capture slot locations. 143 fn captures_read_at( 144 &self, 145 locs: &mut Locations, 146 text: &Self::Text, 147 start: usize, 148 ) -> Option<(usize, usize)>; 149 150 /// Returns an iterator over all non-overlapping successive leftmost-first 151 /// matches. 152 fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { 153 Matches { re: self, text, last_end: 0, last_match: None } 154 } 155 156 /// Returns an iterator over all non-overlapping successive leftmost-first 157 /// matches with captures. 158 fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { 159 CaptureMatches(self.find_iter(text)) 160 } 161} 162 163/// An iterator over all non-overlapping successive leftmost-first matches. 164#[derive(Debug)] 165pub struct Matches<'t, R> 166where 167 R: RegularExpression, 168 R::Text: 't, 169{ 170 re: R, 171 text: &'t R::Text, 172 last_end: usize, 173 last_match: Option<usize>, 174} 175 176impl<'t, R> Matches<'t, R> 177where 178 R: RegularExpression, 179 R::Text: 't, 180{ 181 /// Return the text being searched. 182 pub fn text(&self) -> &'t R::Text { 183 self.text 184 } 185 186 /// Return the underlying regex. 187 pub fn regex(&self) -> &R { 188 &self.re 189 } 190} 191 192impl<'t, R> Iterator for Matches<'t, R> 193where 194 R: RegularExpression, 195 R::Text: 't + AsRef<[u8]>, 196{ 197 type Item = (usize, usize); 198 199 fn next(&mut self) -> Option<(usize, usize)> { 200 if self.last_end > self.text.as_ref().len() { 201 return None; 202 } 203 let (s, e) = match self.re.find_at(self.text, self.last_end) { 204 None => return None, 205 Some((s, e)) => (s, e), 206 }; 207 if s == e { 208 // This is an empty match. To ensure we make progress, start 209 // the next search at the smallest possible starting position 210 // of the next match following this one. 211 self.last_end = self.re.next_after_empty(self.text, e); 212 // Don't accept empty matches immediately following a match. 213 // Just move on to the next match. 214 if Some(e) == self.last_match { 215 return self.next(); 216 } 217 } else { 218 self.last_end = e; 219 } 220 self.last_match = Some(e); 221 Some((s, e)) 222 } 223} 224 225impl<'t, R> FusedIterator for Matches<'t, R> 226where 227 R: RegularExpression, 228 R::Text: 't + AsRef<[u8]>, 229{ 230} 231 232/// An iterator over all non-overlapping successive leftmost-first matches with 233/// captures. 234#[derive(Debug)] 235pub struct CaptureMatches<'t, R>(Matches<'t, R>) 236where 237 R: RegularExpression, 238 R::Text: 't; 239 240impl<'t, R> CaptureMatches<'t, R> 241where 242 R: RegularExpression, 243 R::Text: 't, 244{ 245 /// Return the text being searched. 246 pub fn text(&self) -> &'t R::Text { 247 self.0.text() 248 } 249 250 /// Return the underlying regex. 251 pub fn regex(&self) -> &R { 252 self.0.regex() 253 } 254} 255 256impl<'t, R> Iterator for CaptureMatches<'t, R> 257where 258 R: RegularExpression, 259 R::Text: 't + AsRef<[u8]>, 260{ 261 type Item = Locations; 262 263 fn next(&mut self) -> Option<Locations> { 264 if self.0.last_end > self.0.text.as_ref().len() { 265 return None; 266 } 267 let mut locs = self.0.re.locations(); 268 let (s, e) = match self.0.re.captures_read_at( 269 &mut locs, 270 self.0.text, 271 self.0.last_end, 272 ) { 273 None => return None, 274 Some((s, e)) => (s, e), 275 }; 276 if s == e { 277 self.0.last_end = self.0.re.next_after_empty(self.0.text, e); 278 if Some(e) == self.0.last_match { 279 return self.next(); 280 } 281 } else { 282 self.0.last_end = e; 283 } 284 self.0.last_match = Some(e); 285 Some(locs) 286 } 287} 288 289impl<'t, R> FusedIterator for CaptureMatches<'t, R> 290where 291 R: RegularExpression, 292 R::Text: 't + AsRef<[u8]>, 293{ 294} 295