xref: /third_party/rust/crates/regex/src/re_trait.rs (revision c67d6573)
1c67d6573Sopenharmony_ciuse std::fmt;
2c67d6573Sopenharmony_ciuse std::iter::FusedIterator;
3c67d6573Sopenharmony_ci
4c67d6573Sopenharmony_ci/// Slot is a single saved capture location. Note that there are two slots for
5c67d6573Sopenharmony_ci/// every capture in a regular expression (one slot each for the start and end
6c67d6573Sopenharmony_ci/// of the capture).
7c67d6573Sopenharmony_cipub type Slot = Option<usize>;
8c67d6573Sopenharmony_ci
9c67d6573Sopenharmony_ci/// Locations represents the offsets of each capturing group in a regex for
10c67d6573Sopenharmony_ci/// a single match.
11c67d6573Sopenharmony_ci///
12c67d6573Sopenharmony_ci/// Unlike `Captures`, a `Locations` value only stores offsets.
13c67d6573Sopenharmony_ci#[doc(hidden)]
14c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
15c67d6573Sopenharmony_cipub struct Locations(Vec<Slot>);
16c67d6573Sopenharmony_ci
17c67d6573Sopenharmony_ciimpl Locations {
18c67d6573Sopenharmony_ci    /// Returns the start and end positions of the Nth capture group. Returns
19c67d6573Sopenharmony_ci    /// `None` if `i` is not a valid capture group or if the capture group did
20c67d6573Sopenharmony_ci    /// not match anything. The positions returned are *always* byte indices
21c67d6573Sopenharmony_ci    /// with respect to the original string matched.
22c67d6573Sopenharmony_ci    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
23c67d6573Sopenharmony_ci        let (s, e) = (i * 2, i * 2 + 1);
24c67d6573Sopenharmony_ci        match (self.0.get(s), self.0.get(e)) {
25c67d6573Sopenharmony_ci            (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
26c67d6573Sopenharmony_ci            _ => None,
27c67d6573Sopenharmony_ci        }
28c67d6573Sopenharmony_ci    }
29c67d6573Sopenharmony_ci
30c67d6573Sopenharmony_ci    /// Creates an iterator of all the capture group positions in order of
31c67d6573Sopenharmony_ci    /// appearance in the regular expression. Positions are byte indices
32c67d6573Sopenharmony_ci    /// in terms of the original string matched.
33c67d6573Sopenharmony_ci    pub fn iter(&self) -> SubCapturesPosIter<'_> {
34c67d6573Sopenharmony_ci        SubCapturesPosIter { idx: 0, locs: self }
35c67d6573Sopenharmony_ci    }
36c67d6573Sopenharmony_ci
37c67d6573Sopenharmony_ci    /// Returns the total number of capturing groups.
38c67d6573Sopenharmony_ci    ///
39c67d6573Sopenharmony_ci    /// This is always at least `1` since every regex has at least `1`
40c67d6573Sopenharmony_ci    /// capturing group that corresponds to the entire match.
41c67d6573Sopenharmony_ci    pub fn len(&self) -> usize {
42c67d6573Sopenharmony_ci        self.0.len() / 2
43c67d6573Sopenharmony_ci    }
44c67d6573Sopenharmony_ci
45c67d6573Sopenharmony_ci    /// Return the individual slots as a slice.
46c67d6573Sopenharmony_ci    pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
47c67d6573Sopenharmony_ci        &mut self.0
48c67d6573Sopenharmony_ci    }
49c67d6573Sopenharmony_ci}
50c67d6573Sopenharmony_ci
51c67d6573Sopenharmony_ci/// An iterator over capture group positions for a particular match of a
52c67d6573Sopenharmony_ci/// regular expression.
53c67d6573Sopenharmony_ci///
54c67d6573Sopenharmony_ci/// Positions are byte indices in terms of the original string matched.
55c67d6573Sopenharmony_ci///
56c67d6573Sopenharmony_ci/// `'c` is the lifetime of the captures.
57c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
58c67d6573Sopenharmony_cipub struct SubCapturesPosIter<'c> {
59c67d6573Sopenharmony_ci    idx: usize,
60c67d6573Sopenharmony_ci    locs: &'c Locations,
61c67d6573Sopenharmony_ci}
62c67d6573Sopenharmony_ci
63c67d6573Sopenharmony_ciimpl<'c> Iterator for SubCapturesPosIter<'c> {
64c67d6573Sopenharmony_ci    type Item = Option<(usize, usize)>;
65c67d6573Sopenharmony_ci
66c67d6573Sopenharmony_ci    fn next(&mut self) -> Option<Option<(usize, usize)>> {
67c67d6573Sopenharmony_ci        if self.idx >= self.locs.len() {
68c67d6573Sopenharmony_ci            return None;
69c67d6573Sopenharmony_ci        }
70c67d6573Sopenharmony_ci        let x = match self.locs.pos(self.idx) {
71c67d6573Sopenharmony_ci            None => Some(None),
72c67d6573Sopenharmony_ci            Some((s, e)) => Some(Some((s, e))),
73c67d6573Sopenharmony_ci        };
74c67d6573Sopenharmony_ci        self.idx += 1;
75c67d6573Sopenharmony_ci        x
76c67d6573Sopenharmony_ci    }
77c67d6573Sopenharmony_ci
78c67d6573Sopenharmony_ci    fn size_hint(&self) -> (usize, Option<usize>) {
79c67d6573Sopenharmony_ci        let len = self.locs.len() - self.idx;
80c67d6573Sopenharmony_ci        (len, Some(len))
81c67d6573Sopenharmony_ci    }
82c67d6573Sopenharmony_ci
83c67d6573Sopenharmony_ci    fn count(self) -> usize {
84c67d6573Sopenharmony_ci        self.len()
85c67d6573Sopenharmony_ci    }
86c67d6573Sopenharmony_ci}
87c67d6573Sopenharmony_ci
88c67d6573Sopenharmony_ciimpl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {}
89c67d6573Sopenharmony_ci
90c67d6573Sopenharmony_ciimpl<'c> FusedIterator for SubCapturesPosIter<'c> {}
91c67d6573Sopenharmony_ci
92c67d6573Sopenharmony_ci/// `RegularExpression` describes types that can implement regex searching.
93c67d6573Sopenharmony_ci///
94c67d6573Sopenharmony_ci/// This trait is my attempt at reducing code duplication and to standardize
95c67d6573Sopenharmony_ci/// the internal API. Specific duplication that is avoided are the `find`
96c67d6573Sopenharmony_ci/// and `capture` iterators, which are slightly tricky.
97c67d6573Sopenharmony_ci///
98c67d6573Sopenharmony_ci/// It's not clear whether this trait is worth it, and it also isn't
99c67d6573Sopenharmony_ci/// clear whether it's useful as a public trait or not. Methods like
100c67d6573Sopenharmony_ci/// `next_after_empty` reak of bad design, but the rest of the methods seem
101c67d6573Sopenharmony_ci/// somewhat reasonable. One particular thing this trait would expose would be
102c67d6573Sopenharmony_ci/// the ability to start the search of a regex anywhere in a haystack, which
103c67d6573Sopenharmony_ci/// isn't possible in the current public API.
104c67d6573Sopenharmony_cipub trait RegularExpression: Sized + fmt::Debug {
105c67d6573Sopenharmony_ci    /// The type of the haystack.
106c67d6573Sopenharmony_ci    type Text: ?Sized + fmt::Debug;
107c67d6573Sopenharmony_ci
108c67d6573Sopenharmony_ci    /// The number of capture slots in the compiled regular expression. This is
109c67d6573Sopenharmony_ci    /// always two times the number of capture groups (two slots per group).
110c67d6573Sopenharmony_ci    fn slots_len(&self) -> usize;
111c67d6573Sopenharmony_ci
112c67d6573Sopenharmony_ci    /// Allocates fresh space for all capturing groups in this regex.
113c67d6573Sopenharmony_ci    fn locations(&self) -> Locations {
114c67d6573Sopenharmony_ci        Locations(vec![None; self.slots_len()])
115c67d6573Sopenharmony_ci    }
116c67d6573Sopenharmony_ci
117c67d6573Sopenharmony_ci    /// Returns the position of the next character after `i`.
118c67d6573Sopenharmony_ci    ///
119c67d6573Sopenharmony_ci    /// For example, a haystack with type `&[u8]` probably returns `i+1`,
120c67d6573Sopenharmony_ci    /// whereas a haystack with type `&str` probably returns `i` plus the
121c67d6573Sopenharmony_ci    /// length of the next UTF-8 sequence.
122c67d6573Sopenharmony_ci    fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
123c67d6573Sopenharmony_ci
124c67d6573Sopenharmony_ci    /// Returns the location of the shortest match.
125c67d6573Sopenharmony_ci    fn shortest_match_at(
126c67d6573Sopenharmony_ci        &self,
127c67d6573Sopenharmony_ci        text: &Self::Text,
128c67d6573Sopenharmony_ci        start: usize,
129c67d6573Sopenharmony_ci    ) -> Option<usize>;
130c67d6573Sopenharmony_ci
131c67d6573Sopenharmony_ci    /// Returns whether the regex matches the text given.
132c67d6573Sopenharmony_ci    fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
133c67d6573Sopenharmony_ci
134c67d6573Sopenharmony_ci    /// Returns the leftmost-first match location if one exists.
135c67d6573Sopenharmony_ci    fn find_at(
136c67d6573Sopenharmony_ci        &self,
137c67d6573Sopenharmony_ci        text: &Self::Text,
138c67d6573Sopenharmony_ci        start: usize,
139c67d6573Sopenharmony_ci    ) -> Option<(usize, usize)>;
140c67d6573Sopenharmony_ci
141c67d6573Sopenharmony_ci    /// Returns the leftmost-first match location if one exists, and also
142c67d6573Sopenharmony_ci    /// fills in any matching capture slot locations.
143c67d6573Sopenharmony_ci    fn captures_read_at(
144c67d6573Sopenharmony_ci        &self,
145c67d6573Sopenharmony_ci        locs: &mut Locations,
146c67d6573Sopenharmony_ci        text: &Self::Text,
147c67d6573Sopenharmony_ci        start: usize,
148c67d6573Sopenharmony_ci    ) -> Option<(usize, usize)>;
149c67d6573Sopenharmony_ci
150c67d6573Sopenharmony_ci    /// Returns an iterator over all non-overlapping successive leftmost-first
151c67d6573Sopenharmony_ci    /// matches.
152c67d6573Sopenharmony_ci    fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
153c67d6573Sopenharmony_ci        Matches { re: self, text, last_end: 0, last_match: None }
154c67d6573Sopenharmony_ci    }
155c67d6573Sopenharmony_ci
156c67d6573Sopenharmony_ci    /// Returns an iterator over all non-overlapping successive leftmost-first
157c67d6573Sopenharmony_ci    /// matches with captures.
158c67d6573Sopenharmony_ci    fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
159c67d6573Sopenharmony_ci        CaptureMatches(self.find_iter(text))
160c67d6573Sopenharmony_ci    }
161c67d6573Sopenharmony_ci}
162c67d6573Sopenharmony_ci
163c67d6573Sopenharmony_ci/// An iterator over all non-overlapping successive leftmost-first matches.
164c67d6573Sopenharmony_ci#[derive(Debug)]
165c67d6573Sopenharmony_cipub struct Matches<'t, R>
166c67d6573Sopenharmony_ciwhere
167c67d6573Sopenharmony_ci    R: RegularExpression,
168c67d6573Sopenharmony_ci    R::Text: 't,
169c67d6573Sopenharmony_ci{
170c67d6573Sopenharmony_ci    re: R,
171c67d6573Sopenharmony_ci    text: &'t R::Text,
172c67d6573Sopenharmony_ci    last_end: usize,
173c67d6573Sopenharmony_ci    last_match: Option<usize>,
174c67d6573Sopenharmony_ci}
175c67d6573Sopenharmony_ci
176c67d6573Sopenharmony_ciimpl<'t, R> Matches<'t, R>
177c67d6573Sopenharmony_ciwhere
178c67d6573Sopenharmony_ci    R: RegularExpression,
179c67d6573Sopenharmony_ci    R::Text: 't,
180c67d6573Sopenharmony_ci{
181c67d6573Sopenharmony_ci    /// Return the text being searched.
182c67d6573Sopenharmony_ci    pub fn text(&self) -> &'t R::Text {
183c67d6573Sopenharmony_ci        self.text
184c67d6573Sopenharmony_ci    }
185c67d6573Sopenharmony_ci
186c67d6573Sopenharmony_ci    /// Return the underlying regex.
187c67d6573Sopenharmony_ci    pub fn regex(&self) -> &R {
188c67d6573Sopenharmony_ci        &self.re
189c67d6573Sopenharmony_ci    }
190c67d6573Sopenharmony_ci}
191c67d6573Sopenharmony_ci
192c67d6573Sopenharmony_ciimpl<'t, R> Iterator for Matches<'t, R>
193c67d6573Sopenharmony_ciwhere
194c67d6573Sopenharmony_ci    R: RegularExpression,
195c67d6573Sopenharmony_ci    R::Text: 't + AsRef<[u8]>,
196c67d6573Sopenharmony_ci{
197c67d6573Sopenharmony_ci    type Item = (usize, usize);
198c67d6573Sopenharmony_ci
199c67d6573Sopenharmony_ci    fn next(&mut self) -> Option<(usize, usize)> {
200c67d6573Sopenharmony_ci        if self.last_end > self.text.as_ref().len() {
201c67d6573Sopenharmony_ci            return None;
202c67d6573Sopenharmony_ci        }
203c67d6573Sopenharmony_ci        let (s, e) = match self.re.find_at(self.text, self.last_end) {
204c67d6573Sopenharmony_ci            None => return None,
205c67d6573Sopenharmony_ci            Some((s, e)) => (s, e),
206c67d6573Sopenharmony_ci        };
207c67d6573Sopenharmony_ci        if s == e {
208c67d6573Sopenharmony_ci            // This is an empty match. To ensure we make progress, start
209c67d6573Sopenharmony_ci            // the next search at the smallest possible starting position
210c67d6573Sopenharmony_ci            // of the next match following this one.
211c67d6573Sopenharmony_ci            self.last_end = self.re.next_after_empty(self.text, e);
212c67d6573Sopenharmony_ci            // Don't accept empty matches immediately following a match.
213c67d6573Sopenharmony_ci            // Just move on to the next match.
214c67d6573Sopenharmony_ci            if Some(e) == self.last_match {
215c67d6573Sopenharmony_ci                return self.next();
216c67d6573Sopenharmony_ci            }
217c67d6573Sopenharmony_ci        } else {
218c67d6573Sopenharmony_ci            self.last_end = e;
219c67d6573Sopenharmony_ci        }
220c67d6573Sopenharmony_ci        self.last_match = Some(e);
221c67d6573Sopenharmony_ci        Some((s, e))
222c67d6573Sopenharmony_ci    }
223c67d6573Sopenharmony_ci}
224c67d6573Sopenharmony_ci
225c67d6573Sopenharmony_ciimpl<'t, R> FusedIterator for Matches<'t, R>
226c67d6573Sopenharmony_ciwhere
227c67d6573Sopenharmony_ci    R: RegularExpression,
228c67d6573Sopenharmony_ci    R::Text: 't + AsRef<[u8]>,
229c67d6573Sopenharmony_ci{
230c67d6573Sopenharmony_ci}
231c67d6573Sopenharmony_ci
232c67d6573Sopenharmony_ci/// An iterator over all non-overlapping successive leftmost-first matches with
233c67d6573Sopenharmony_ci/// captures.
234c67d6573Sopenharmony_ci#[derive(Debug)]
235c67d6573Sopenharmony_cipub struct CaptureMatches<'t, R>(Matches<'t, R>)
236c67d6573Sopenharmony_ciwhere
237c67d6573Sopenharmony_ci    R: RegularExpression,
238c67d6573Sopenharmony_ci    R::Text: 't;
239c67d6573Sopenharmony_ci
240c67d6573Sopenharmony_ciimpl<'t, R> CaptureMatches<'t, R>
241c67d6573Sopenharmony_ciwhere
242c67d6573Sopenharmony_ci    R: RegularExpression,
243c67d6573Sopenharmony_ci    R::Text: 't,
244c67d6573Sopenharmony_ci{
245c67d6573Sopenharmony_ci    /// Return the text being searched.
246c67d6573Sopenharmony_ci    pub fn text(&self) -> &'t R::Text {
247c67d6573Sopenharmony_ci        self.0.text()
248c67d6573Sopenharmony_ci    }
249c67d6573Sopenharmony_ci
250c67d6573Sopenharmony_ci    /// Return the underlying regex.
251c67d6573Sopenharmony_ci    pub fn regex(&self) -> &R {
252c67d6573Sopenharmony_ci        self.0.regex()
253c67d6573Sopenharmony_ci    }
254c67d6573Sopenharmony_ci}
255c67d6573Sopenharmony_ci
256c67d6573Sopenharmony_ciimpl<'t, R> Iterator for CaptureMatches<'t, R>
257c67d6573Sopenharmony_ciwhere
258c67d6573Sopenharmony_ci    R: RegularExpression,
259c67d6573Sopenharmony_ci    R::Text: 't + AsRef<[u8]>,
260c67d6573Sopenharmony_ci{
261c67d6573Sopenharmony_ci    type Item = Locations;
262c67d6573Sopenharmony_ci
263c67d6573Sopenharmony_ci    fn next(&mut self) -> Option<Locations> {
264c67d6573Sopenharmony_ci        if self.0.last_end > self.0.text.as_ref().len() {
265c67d6573Sopenharmony_ci            return None;
266c67d6573Sopenharmony_ci        }
267c67d6573Sopenharmony_ci        let mut locs = self.0.re.locations();
268c67d6573Sopenharmony_ci        let (s, e) = match self.0.re.captures_read_at(
269c67d6573Sopenharmony_ci            &mut locs,
270c67d6573Sopenharmony_ci            self.0.text,
271c67d6573Sopenharmony_ci            self.0.last_end,
272c67d6573Sopenharmony_ci        ) {
273c67d6573Sopenharmony_ci            None => return None,
274c67d6573Sopenharmony_ci            Some((s, e)) => (s, e),
275c67d6573Sopenharmony_ci        };
276c67d6573Sopenharmony_ci        if s == e {
277c67d6573Sopenharmony_ci            self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
278c67d6573Sopenharmony_ci            if Some(e) == self.0.last_match {
279c67d6573Sopenharmony_ci                return self.next();
280c67d6573Sopenharmony_ci            }
281c67d6573Sopenharmony_ci        } else {
282c67d6573Sopenharmony_ci            self.0.last_end = e;
283c67d6573Sopenharmony_ci        }
284c67d6573Sopenharmony_ci        self.0.last_match = Some(e);
285c67d6573Sopenharmony_ci        Some(locs)
286c67d6573Sopenharmony_ci    }
287c67d6573Sopenharmony_ci}
288c67d6573Sopenharmony_ci
289c67d6573Sopenharmony_ciimpl<'t, R> FusedIterator for CaptureMatches<'t, R>
290c67d6573Sopenharmony_ciwhere
291c67d6573Sopenharmony_ci    R: RegularExpression,
292c67d6573Sopenharmony_ci    R::Text: 't + AsRef<[u8]>,
293c67d6573Sopenharmony_ci{
294c67d6573Sopenharmony_ci}
295