1c67d6573Sopenharmony_ci#![allow(non_camel_case_types)]
2c67d6573Sopenharmony_ci
3c67d6573Sopenharmony_ciuse libc::{c_int, c_uchar, c_void};
4c67d6573Sopenharmony_ci
5c67d6573Sopenharmony_ci/// Regex wraps an RE2 regular expression.
6c67d6573Sopenharmony_ci///
7c67d6573Sopenharmony_ci/// It cannot be used safely from multiple threads simultaneously.
8c67d6573Sopenharmony_cipub struct Regex {
9c67d6573Sopenharmony_ci    re: *mut re2_regexp,
10c67d6573Sopenharmony_ci}
11c67d6573Sopenharmony_ci
12c67d6573Sopenharmony_ciunsafe impl Send for Regex {}
13c67d6573Sopenharmony_ci
14c67d6573Sopenharmony_ciimpl Drop for Regex {
15c67d6573Sopenharmony_ci    fn drop(&mut self) {
16c67d6573Sopenharmony_ci        unsafe {
17c67d6573Sopenharmony_ci            re2_regexp_free(self.re);
18c67d6573Sopenharmony_ci        }
19c67d6573Sopenharmony_ci    }
20c67d6573Sopenharmony_ci}
21c67d6573Sopenharmony_ci
/// Error type returned by `Regex::new`.
///
/// It wraps `()` and therefore carries no detail about the failure.
#[derive(Debug)]
pub struct Error(());
24c67d6573Sopenharmony_ci
25c67d6573Sopenharmony_ciimpl Regex {
26c67d6573Sopenharmony_ci    pub fn new(pattern: &str) -> Result<Regex, Error> {
27c67d6573Sopenharmony_ci        unsafe { Ok(Regex { re: re2_regexp_new(pattern.into()) }) }
28c67d6573Sopenharmony_ci    }
29c67d6573Sopenharmony_ci
30c67d6573Sopenharmony_ci    pub fn is_match(&self, text: &str) -> bool {
31c67d6573Sopenharmony_ci        unsafe {
32c67d6573Sopenharmony_ci            re2_regexp_match(self.re, text.into(), 0, text.len() as c_int)
33c67d6573Sopenharmony_ci        }
34c67d6573Sopenharmony_ci    }
35c67d6573Sopenharmony_ci
36c67d6573Sopenharmony_ci    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
37c67d6573Sopenharmony_ci        FindMatches { re: self, text: text, last_end: 0, last_match: None }
38c67d6573Sopenharmony_ci    }
39c67d6573Sopenharmony_ci
40c67d6573Sopenharmony_ci    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
41c67d6573Sopenharmony_ci        let (mut s, mut e): (c_int, c_int) = (0, 0);
42c67d6573Sopenharmony_ci        let matched = unsafe {
43c67d6573Sopenharmony_ci            re2_regexp_find(
44c67d6573Sopenharmony_ci                self.re,
45c67d6573Sopenharmony_ci                text.into(),
46c67d6573Sopenharmony_ci                start as c_int,
47c67d6573Sopenharmony_ci                text.len() as c_int,
48c67d6573Sopenharmony_ci                &mut s,
49c67d6573Sopenharmony_ci                &mut e,
50c67d6573Sopenharmony_ci            )
51c67d6573Sopenharmony_ci        };
52c67d6573Sopenharmony_ci        if matched {
53c67d6573Sopenharmony_ci            Some((s as usize, e as usize))
54c67d6573Sopenharmony_ci        } else {
55c67d6573Sopenharmony_ci            None
56c67d6573Sopenharmony_ci        }
57c67d6573Sopenharmony_ci    }
58c67d6573Sopenharmony_ci}
59c67d6573Sopenharmony_ci
60c67d6573Sopenharmony_cipub struct FindMatches<'r, 't> {
61c67d6573Sopenharmony_ci    re: &'r Regex,
62c67d6573Sopenharmony_ci    text: &'t str,
63c67d6573Sopenharmony_ci    last_end: usize,
64c67d6573Sopenharmony_ci    last_match: Option<usize>,
65c67d6573Sopenharmony_ci}
66c67d6573Sopenharmony_ci
67c67d6573Sopenharmony_ci// This implementation is identical to the one Rust uses, since both Rust's
68c67d6573Sopenharmony_ci// regex engine and RE2 handle empty matches in the same way.
69c67d6573Sopenharmony_ciimpl<'r, 't> Iterator for FindMatches<'r, 't> {
70c67d6573Sopenharmony_ci    type Item = (usize, usize);
71c67d6573Sopenharmony_ci
72c67d6573Sopenharmony_ci    fn next(&mut self) -> Option<(usize, usize)> {
73c67d6573Sopenharmony_ci        fn next_after_empty(text: &str, i: usize) -> usize {
74c67d6573Sopenharmony_ci            let b = match text.as_bytes().get(i) {
75c67d6573Sopenharmony_ci                None => return text.len() + 1,
76c67d6573Sopenharmony_ci                Some(&b) => b,
77c67d6573Sopenharmony_ci            };
78c67d6573Sopenharmony_ci            let inc = if b <= 0x7F {
79c67d6573Sopenharmony_ci                1
80c67d6573Sopenharmony_ci            } else if b <= 0b110_11111 {
81c67d6573Sopenharmony_ci                2
82c67d6573Sopenharmony_ci            } else if b <= 0b1110_1111 {
83c67d6573Sopenharmony_ci                3
84c67d6573Sopenharmony_ci            } else {
85c67d6573Sopenharmony_ci                4
86c67d6573Sopenharmony_ci            };
87c67d6573Sopenharmony_ci            i + inc
88c67d6573Sopenharmony_ci        }
89c67d6573Sopenharmony_ci
90c67d6573Sopenharmony_ci        if self.last_end > self.text.len() {
91c67d6573Sopenharmony_ci            return None;
92c67d6573Sopenharmony_ci        }
93c67d6573Sopenharmony_ci        let (s, e) = match self.re.find_at(self.text, self.last_end) {
94c67d6573Sopenharmony_ci            None => return None,
95c67d6573Sopenharmony_ci            Some((s, e)) => (s, e),
96c67d6573Sopenharmony_ci        };
97c67d6573Sopenharmony_ci        assert!(s >= self.last_end);
98c67d6573Sopenharmony_ci        if s == e {
99c67d6573Sopenharmony_ci            // This is an empty match. To ensure we make progress, start
100c67d6573Sopenharmony_ci            // the next search at the smallest possible starting position
101c67d6573Sopenharmony_ci            // of the next match following this one.
102c67d6573Sopenharmony_ci            self.last_end = next_after_empty(&self.text, e);
103c67d6573Sopenharmony_ci            // Don't accept empty matches immediately following a match.
104c67d6573Sopenharmony_ci            // Just move on to the next match.
105c67d6573Sopenharmony_ci            if Some(e) == self.last_match {
106c67d6573Sopenharmony_ci                return self.next();
107c67d6573Sopenharmony_ci            }
108c67d6573Sopenharmony_ci        } else {
109c67d6573Sopenharmony_ci            self.last_end = e;
110c67d6573Sopenharmony_ci        }
111c67d6573Sopenharmony_ci        self.last_match = Some(self.last_end);
112c67d6573Sopenharmony_ci        Some((s, e))
113c67d6573Sopenharmony_ci    }
114c67d6573Sopenharmony_ci}
115c67d6573Sopenharmony_ci
// RE2 FFI is below. Note that this uses a hand-rolled C API that is defined
// in re2.cpp.
118c67d6573Sopenharmony_ci
/// Opaque stand-in for the regex object owned by the C API; it is only ever
/// handled through `*mut re2_regexp` pointers.
type re2_regexp = c_void;
120c67d6573Sopenharmony_ci
/// Pointer-plus-length view of a string, laid out for the C API.
///
/// The value borrows nothing as far as the type system is concerned, so it
/// must not be used after the `&str` it was built from has gone away.
#[repr(C)]
struct re2_string {
    /// Address of the first byte of the string.
    text: *const c_uchar,
    /// Length of the string in bytes.
    len: c_int,
}

impl<'a> From<&'a str> for re2_string {
    /// Builds the C view of `s` from its pointer and byte length.
    ///
    /// # Panics
    ///
    /// Panics if `s` is longer than `c_int::MAX` bytes. A plain `as` cast
    /// would wrap instead, handing the C side a truncated or negative
    /// length.
    fn from(s: &'a str) -> re2_string {
        let byte_len = s.len();
        assert!(
            byte_len <= c_int::MAX as usize,
            "string of {} bytes does not fit in the C API's int length",
            byte_len
        );
        re2_string { text: s.as_ptr(), len: byte_len as c_int }
    }
}
132c67d6573Sopenharmony_ci
133c67d6573Sopenharmony_ciextern "C" {
134c67d6573Sopenharmony_ci    fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp;
135c67d6573Sopenharmony_ci    fn re2_regexp_free(re: *mut re2_regexp);
136c67d6573Sopenharmony_ci    fn re2_regexp_match(
137c67d6573Sopenharmony_ci        re: *mut re2_regexp,
138c67d6573Sopenharmony_ci        text: re2_string,
139c67d6573Sopenharmony_ci        startpos: c_int,
140c67d6573Sopenharmony_ci        endpos: c_int,
141c67d6573Sopenharmony_ci    ) -> bool;
142c67d6573Sopenharmony_ci    fn re2_regexp_find(
143c67d6573Sopenharmony_ci        re: *mut re2_regexp,
144c67d6573Sopenharmony_ci        text: re2_string,
145c67d6573Sopenharmony_ci        startpos: c_int,
146c67d6573Sopenharmony_ci        endpos: c_int,
147c67d6573Sopenharmony_ci        match_start: *mut c_int,
148c67d6573Sopenharmony_ci        match_end: *mut c_int,
149c67d6573Sopenharmony_ci    ) -> bool;
150c67d6573Sopenharmony_ci}
151