1#![allow(non_camel_case_types)] 2 3use libc::{c_int, c_uchar, c_void}; 4 5/// Regex wraps an RE2 regular expression. 6/// 7/// It cannot be used safely from multiple threads simultaneously. 8pub struct Regex { 9 re: *mut re2_regexp, 10} 11 12unsafe impl Send for Regex {} 13 14impl Drop for Regex { 15 fn drop(&mut self) { 16 unsafe { 17 re2_regexp_free(self.re); 18 } 19 } 20} 21 22#[derive(Debug)] 23pub struct Error(()); 24 25impl Regex { 26 pub fn new(pattern: &str) -> Result<Regex, Error> { 27 unsafe { Ok(Regex { re: re2_regexp_new(pattern.into()) }) } 28 } 29 30 pub fn is_match(&self, text: &str) -> bool { 31 unsafe { 32 re2_regexp_match(self.re, text.into(), 0, text.len() as c_int) 33 } 34 } 35 36 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { 37 FindMatches { re: self, text: text, last_end: 0, last_match: None } 38 } 39 40 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 41 let (mut s, mut e): (c_int, c_int) = (0, 0); 42 let matched = unsafe { 43 re2_regexp_find( 44 self.re, 45 text.into(), 46 start as c_int, 47 text.len() as c_int, 48 &mut s, 49 &mut e, 50 ) 51 }; 52 if matched { 53 Some((s as usize, e as usize)) 54 } else { 55 None 56 } 57 } 58} 59 60pub struct FindMatches<'r, 't> { 61 re: &'r Regex, 62 text: &'t str, 63 last_end: usize, 64 last_match: Option<usize>, 65} 66 67// This implementation is identical to the one Rust uses, since both Rust's 68// regex engine and RE2 handle empty matches in the same way. 69impl<'r, 't> Iterator for FindMatches<'r, 't> { 70 type Item = (usize, usize); 71 72 fn next(&mut self) -> Option<(usize, usize)> { 73 fn next_after_empty(text: &str, i: usize) -> usize { 74 let b = match text.as_bytes().get(i) { 75 None => return text.len() + 1, 76 Some(&b) => b, 77 }; 78 let inc = if b <= 0x7F { 79 1 80 } else if b <= 0b110_11111 { 81 2 82 } else if b <= 0b1110_1111 { 83 3 84 } else { 85 4 86 }; 87 i + inc 88 } 89 90 if self.last_end > self.text.len() { 91 return None; 92 } 93 let (s, e) = match self.re.find_at(self.text, self.last_end) { 94 None => return None, 95 Some((s, e)) => (s, e), 96 }; 97 assert!(s >= self.last_end); 98 if s == e { 99 // This is an empty match. To ensure we make progress, start 100 // the next search at the smallest possible starting position 101 // of the next match following this one. 102 self.last_end = next_after_empty(&self.text, e); 103 // Don't accept empty matches immediately following a match. 104 // Just move on to the next match. 105 if Some(e) == self.last_match { 106 return self.next(); 107 } 108 } else { 109 self.last_end = e; 110 } 111 self.last_match = Some(self.last_end); 112 Some((s, e)) 113 } 114} 115 116// RE2 FFI is below. Note that this uses a hand-rolled C API that is defined 117// in re2.cpp. 118 119type re2_regexp = c_void; 120 121#[repr(C)] 122struct re2_string { 123 text: *const c_uchar, 124 len: c_int, 125} 126 127impl<'a> From<&'a str> for re2_string { 128 fn from(s: &'a str) -> re2_string { 129 re2_string { text: s.as_ptr(), len: s.len() as c_int } 130 } 131} 132 133extern "C" { 134 fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp; 135 fn re2_regexp_free(re: *mut re2_regexp); 136 fn re2_regexp_match( 137 re: *mut re2_regexp, 138 text: re2_string, 139 startpos: c_int, 140 endpos: c_int, 141 ) -> bool; 142 fn re2_regexp_find( 143 re: *mut re2_regexp, 144 text: re2_string, 145 startpos: c_int, 146 endpos: c_int, 147 match_start: *mut c_int, 148 match_end: *mut c_int, 149 ) -> bool; 150} 151