xref: /third_party/rust/crates/regex/bench/src/ffi/re2.rs (revision c67d6573)
1#![allow(non_camel_case_types)]
2
3use libc::{c_int, c_uchar, c_void};
4
5/// Regex wraps an RE2 regular expression.
6///
7/// It cannot be used safely from multiple threads simultaneously.
8pub struct Regex {
9    re: *mut re2_regexp,
10}
11
12unsafe impl Send for Regex {}
13
14impl Drop for Regex {
15    fn drop(&mut self) {
16        unsafe {
17            re2_regexp_free(self.re);
18        }
19    }
20}
21
22#[derive(Debug)]
23pub struct Error(());
24
25impl Regex {
26    pub fn new(pattern: &str) -> Result<Regex, Error> {
27        unsafe { Ok(Regex { re: re2_regexp_new(pattern.into()) }) }
28    }
29
30    pub fn is_match(&self, text: &str) -> bool {
31        unsafe {
32            re2_regexp_match(self.re, text.into(), 0, text.len() as c_int)
33        }
34    }
35
36    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
37        FindMatches { re: self, text: text, last_end: 0, last_match: None }
38    }
39
40    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
41        let (mut s, mut e): (c_int, c_int) = (0, 0);
42        let matched = unsafe {
43            re2_regexp_find(
44                self.re,
45                text.into(),
46                start as c_int,
47                text.len() as c_int,
48                &mut s,
49                &mut e,
50            )
51        };
52        if matched {
53            Some((s as usize, e as usize))
54        } else {
55            None
56        }
57    }
58}
59
60pub struct FindMatches<'r, 't> {
61    re: &'r Regex,
62    text: &'t str,
63    last_end: usize,
64    last_match: Option<usize>,
65}
66
67// This implementation is identical to the one Rust uses, since both Rust's
68// regex engine and RE2 handle empty matches in the same way.
69impl<'r, 't> Iterator for FindMatches<'r, 't> {
70    type Item = (usize, usize);
71
72    fn next(&mut self) -> Option<(usize, usize)> {
73        fn next_after_empty(text: &str, i: usize) -> usize {
74            let b = match text.as_bytes().get(i) {
75                None => return text.len() + 1,
76                Some(&b) => b,
77            };
78            let inc = if b <= 0x7F {
79                1
80            } else if b <= 0b110_11111 {
81                2
82            } else if b <= 0b1110_1111 {
83                3
84            } else {
85                4
86            };
87            i + inc
88        }
89
90        if self.last_end > self.text.len() {
91            return None;
92        }
93        let (s, e) = match self.re.find_at(self.text, self.last_end) {
94            None => return None,
95            Some((s, e)) => (s, e),
96        };
97        assert!(s >= self.last_end);
98        if s == e {
99            // This is an empty match. To ensure we make progress, start
100            // the next search at the smallest possible starting position
101            // of the next match following this one.
102            self.last_end = next_after_empty(&self.text, e);
103            // Don't accept empty matches immediately following a match.
104            // Just move on to the next match.
105            if Some(e) == self.last_match {
106                return self.next();
107            }
108        } else {
109            self.last_end = e;
110        }
111        self.last_match = Some(self.last_end);
112        Some((s, e))
113    }
114}
115
116// RE2 FFI is below. Note that this uses a hand-rolled C API that is defined
117// in re2.cpp.
118
119type re2_regexp = c_void;
120
121#[repr(C)]
122struct re2_string {
123    text: *const c_uchar,
124    len: c_int,
125}
126
127impl<'a> From<&'a str> for re2_string {
128    fn from(s: &'a str) -> re2_string {
129        re2_string { text: s.as_ptr(), len: s.len() as c_int }
130    }
131}
132
133extern "C" {
134    fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp;
135    fn re2_regexp_free(re: *mut re2_regexp);
136    fn re2_regexp_match(
137        re: *mut re2_regexp,
138        text: re2_string,
139        startpos: c_int,
140        endpos: c_int,
141    ) -> bool;
142    fn re2_regexp_find(
143        re: *mut re2_regexp,
144        text: re2_string,
145        startpos: c_int,
146        endpos: c_int,
147        match_start: *mut c_int,
148        match_end: *mut c_int,
149    ) -> bool;
150}
151