1c67d6573Sopenharmony_ci#![allow(non_camel_case_types)] 2c67d6573Sopenharmony_ci 3c67d6573Sopenharmony_ciuse libc::{c_int, c_uchar, c_void}; 4c67d6573Sopenharmony_ci 5c67d6573Sopenharmony_ci/// Regex wraps an RE2 regular expression. 6c67d6573Sopenharmony_ci/// 7c67d6573Sopenharmony_ci/// It cannot be used safely from multiple threads simultaneously. 8c67d6573Sopenharmony_cipub struct Regex { 9c67d6573Sopenharmony_ci re: *mut re2_regexp, 10c67d6573Sopenharmony_ci} 11c67d6573Sopenharmony_ci 12c67d6573Sopenharmony_ciunsafe impl Send for Regex {} 13c67d6573Sopenharmony_ci 14c67d6573Sopenharmony_ciimpl Drop for Regex { 15c67d6573Sopenharmony_ci fn drop(&mut self) { 16c67d6573Sopenharmony_ci unsafe { 17c67d6573Sopenharmony_ci re2_regexp_free(self.re); 18c67d6573Sopenharmony_ci } 19c67d6573Sopenharmony_ci } 20c67d6573Sopenharmony_ci} 21c67d6573Sopenharmony_ci 22c67d6573Sopenharmony_ci#[derive(Debug)] 23c67d6573Sopenharmony_cipub struct Error(()); 24c67d6573Sopenharmony_ci 25c67d6573Sopenharmony_ciimpl Regex { 26c67d6573Sopenharmony_ci pub fn new(pattern: &str) -> Result<Regex, Error> { 27c67d6573Sopenharmony_ci unsafe { Ok(Regex { re: re2_regexp_new(pattern.into()) }) } 28c67d6573Sopenharmony_ci } 29c67d6573Sopenharmony_ci 30c67d6573Sopenharmony_ci pub fn is_match(&self, text: &str) -> bool { 31c67d6573Sopenharmony_ci unsafe { 32c67d6573Sopenharmony_ci re2_regexp_match(self.re, text.into(), 0, text.len() as c_int) 33c67d6573Sopenharmony_ci } 34c67d6573Sopenharmony_ci } 35c67d6573Sopenharmony_ci 36c67d6573Sopenharmony_ci pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { 37c67d6573Sopenharmony_ci FindMatches { re: self, text: text, last_end: 0, last_match: None } 38c67d6573Sopenharmony_ci } 39c67d6573Sopenharmony_ci 40c67d6573Sopenharmony_ci fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 41c67d6573Sopenharmony_ci let (mut s, mut e): (c_int, c_int) = (0, 0); 42c67d6573Sopenharmony_ci let matched = unsafe { 43c67d6573Sopenharmony_ci re2_regexp_find( 44c67d6573Sopenharmony_ci self.re, 45c67d6573Sopenharmony_ci text.into(), 46c67d6573Sopenharmony_ci start as c_int, 47c67d6573Sopenharmony_ci text.len() as c_int, 48c67d6573Sopenharmony_ci &mut s, 49c67d6573Sopenharmony_ci &mut e, 50c67d6573Sopenharmony_ci ) 51c67d6573Sopenharmony_ci }; 52c67d6573Sopenharmony_ci if matched { 53c67d6573Sopenharmony_ci Some((s as usize, e as usize)) 54c67d6573Sopenharmony_ci } else { 55c67d6573Sopenharmony_ci None 56c67d6573Sopenharmony_ci } 57c67d6573Sopenharmony_ci } 58c67d6573Sopenharmony_ci} 59c67d6573Sopenharmony_ci 60c67d6573Sopenharmony_cipub struct FindMatches<'r, 't> { 61c67d6573Sopenharmony_ci re: &'r Regex, 62c67d6573Sopenharmony_ci text: &'t str, 63c67d6573Sopenharmony_ci last_end: usize, 64c67d6573Sopenharmony_ci last_match: Option<usize>, 65c67d6573Sopenharmony_ci} 66c67d6573Sopenharmony_ci 67c67d6573Sopenharmony_ci// This implementation is identical to the one Rust uses, since both Rust's 68c67d6573Sopenharmony_ci// regex engine and RE2 handle empty matches in the same way. 69c67d6573Sopenharmony_ciimpl<'r, 't> Iterator for FindMatches<'r, 't> { 70c67d6573Sopenharmony_ci type Item = (usize, usize); 71c67d6573Sopenharmony_ci 72c67d6573Sopenharmony_ci fn next(&mut self) -> Option<(usize, usize)> { 73c67d6573Sopenharmony_ci fn next_after_empty(text: &str, i: usize) -> usize { 74c67d6573Sopenharmony_ci let b = match text.as_bytes().get(i) { 75c67d6573Sopenharmony_ci None => return text.len() + 1, 76c67d6573Sopenharmony_ci Some(&b) => b, 77c67d6573Sopenharmony_ci }; 78c67d6573Sopenharmony_ci let inc = if b <= 0x7F { 79c67d6573Sopenharmony_ci 1 80c67d6573Sopenharmony_ci } else if b <= 0b110_11111 { 81c67d6573Sopenharmony_ci 2 82c67d6573Sopenharmony_ci } else if b <= 0b1110_1111 { 83c67d6573Sopenharmony_ci 3 84c67d6573Sopenharmony_ci } else { 85c67d6573Sopenharmony_ci 4 86c67d6573Sopenharmony_ci }; 87c67d6573Sopenharmony_ci i + inc 88c67d6573Sopenharmony_ci } 89c67d6573Sopenharmony_ci 90c67d6573Sopenharmony_ci if self.last_end > self.text.len() { 91c67d6573Sopenharmony_ci return None; 92c67d6573Sopenharmony_ci } 93c67d6573Sopenharmony_ci let (s, e) = match self.re.find_at(self.text, self.last_end) { 94c67d6573Sopenharmony_ci None => return None, 95c67d6573Sopenharmony_ci Some((s, e)) => (s, e), 96c67d6573Sopenharmony_ci }; 97c67d6573Sopenharmony_ci assert!(s >= self.last_end); 98c67d6573Sopenharmony_ci if s == e { 99c67d6573Sopenharmony_ci // This is an empty match. To ensure we make progress, start 100c67d6573Sopenharmony_ci // the next search at the smallest possible starting position 101c67d6573Sopenharmony_ci // of the next match following this one. 102c67d6573Sopenharmony_ci self.last_end = next_after_empty(&self.text, e); 103c67d6573Sopenharmony_ci // Don't accept empty matches immediately following a match. 104c67d6573Sopenharmony_ci // Just move on to the next match. 105c67d6573Sopenharmony_ci if Some(e) == self.last_match { 106c67d6573Sopenharmony_ci return self.next(); 107c67d6573Sopenharmony_ci } 108c67d6573Sopenharmony_ci } else { 109c67d6573Sopenharmony_ci self.last_end = e; 110c67d6573Sopenharmony_ci } 111c67d6573Sopenharmony_ci self.last_match = Some(self.last_end); 112c67d6573Sopenharmony_ci Some((s, e)) 113c67d6573Sopenharmony_ci } 114c67d6573Sopenharmony_ci} 115c67d6573Sopenharmony_ci 116c67d6573Sopenharmony_ci// RE2 FFI is below. Note that this uses a hand-rolled C API that is defined 117c67d6573Sopenharmony_ci// in re2.cpp. 118c67d6573Sopenharmony_ci 119c67d6573Sopenharmony_citype re2_regexp = c_void; 120c67d6573Sopenharmony_ci 121c67d6573Sopenharmony_ci#[repr(C)] 122c67d6573Sopenharmony_cistruct re2_string { 123c67d6573Sopenharmony_ci text: *const c_uchar, 124c67d6573Sopenharmony_ci len: c_int, 125c67d6573Sopenharmony_ci} 126c67d6573Sopenharmony_ci 127c67d6573Sopenharmony_ciimpl<'a> From<&'a str> for re2_string { 128c67d6573Sopenharmony_ci fn from(s: &'a str) -> re2_string { 129c67d6573Sopenharmony_ci re2_string { text: s.as_ptr(), len: s.len() as c_int } 130c67d6573Sopenharmony_ci } 131c67d6573Sopenharmony_ci} 132c67d6573Sopenharmony_ci 133c67d6573Sopenharmony_ciextern "C" { 134c67d6573Sopenharmony_ci fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp; 135c67d6573Sopenharmony_ci fn re2_regexp_free(re: *mut re2_regexp); 136c67d6573Sopenharmony_ci fn re2_regexp_match( 137c67d6573Sopenharmony_ci re: *mut re2_regexp, 138c67d6573Sopenharmony_ci text: re2_string, 139c67d6573Sopenharmony_ci startpos: c_int, 140c67d6573Sopenharmony_ci endpos: c_int, 141c67d6573Sopenharmony_ci ) -> bool; 142c67d6573Sopenharmony_ci fn re2_regexp_find( 143c67d6573Sopenharmony_ci re: *mut re2_regexp, 144c67d6573Sopenharmony_ci text: re2_string, 145c67d6573Sopenharmony_ci startpos: c_int, 146c67d6573Sopenharmony_ci endpos: c_int, 147c67d6573Sopenharmony_ci match_start: *mut c_int, 148c67d6573Sopenharmony_ci match_end: *mut c_int, 149c67d6573Sopenharmony_ci ) -> bool; 150c67d6573Sopenharmony_ci} 151