1c67d6573Sopenharmony_ci#![allow(non_camel_case_types)] 2c67d6573Sopenharmony_ci 3c67d6573Sopenharmony_ciuse std::fmt; 4c67d6573Sopenharmony_ciuse std::ptr; 5c67d6573Sopenharmony_ciuse std::str; 6c67d6573Sopenharmony_ci 7c67d6573Sopenharmony_ciuse libc::{c_int, c_void, size_t}; 8c67d6573Sopenharmony_ci 9c67d6573Sopenharmony_cipub struct Regex { 10c67d6573Sopenharmony_ci code: *mut code, 11c67d6573Sopenharmony_ci match_data: *mut match_data, 12c67d6573Sopenharmony_ci ovector: *mut size_t, 13c67d6573Sopenharmony_ci} 14c67d6573Sopenharmony_ci 15c67d6573Sopenharmony_ciunsafe impl Send for Regex {} 16c67d6573Sopenharmony_ci 17c67d6573Sopenharmony_ciimpl Drop for Regex { 18c67d6573Sopenharmony_ci fn drop(&mut self) { 19c67d6573Sopenharmony_ci unsafe { 20c67d6573Sopenharmony_ci pcre2_match_data_free_8(self.match_data); 21c67d6573Sopenharmony_ci pcre2_code_free_8(self.code); 22c67d6573Sopenharmony_ci } 23c67d6573Sopenharmony_ci } 24c67d6573Sopenharmony_ci} 25c67d6573Sopenharmony_ci 26c67d6573Sopenharmony_cipub struct Error { 27c67d6573Sopenharmony_ci code: c_int, 28c67d6573Sopenharmony_ci offset: size_t, 29c67d6573Sopenharmony_ci} 30c67d6573Sopenharmony_ci 31c67d6573Sopenharmony_ciimpl Regex { 32c67d6573Sopenharmony_ci pub fn new(pattern: &str) -> Result<Regex, Error> { 33c67d6573Sopenharmony_ci let mut error_code: c_int = 0; 34c67d6573Sopenharmony_ci let mut error_offset: size_t = 0; 35c67d6573Sopenharmony_ci let code = unsafe { 36c67d6573Sopenharmony_ci pcre2_compile_8( 37c67d6573Sopenharmony_ci pattern.as_ptr(), 38c67d6573Sopenharmony_ci pattern.len(), 39c67d6573Sopenharmony_ci // PCRE2 can get significantly faster in some cases depending 40c67d6573Sopenharmony_ci // on the permutation of these options (in particular, dropping 41c67d6573Sopenharmony_ci // UCP). We should endeavor to have a separate "ASCII compatible" 42c67d6573Sopenharmony_ci // benchmark. 43c67d6573Sopenharmony_ci PCRE2_UCP | PCRE2_UTF, 44c67d6573Sopenharmony_ci &mut error_code, 45c67d6573Sopenharmony_ci &mut error_offset, 46c67d6573Sopenharmony_ci ptr::null_mut(), 47c67d6573Sopenharmony_ci ) 48c67d6573Sopenharmony_ci }; 49c67d6573Sopenharmony_ci if code.is_null() { 50c67d6573Sopenharmony_ci return Err(Error { code: error_code, offset: error_offset }); 51c67d6573Sopenharmony_ci } 52c67d6573Sopenharmony_ci let err = unsafe { pcre2_jit_compile_8(code, PCRE2_JIT_COMPLETE) }; 53c67d6573Sopenharmony_ci if err < 0 { 54c67d6573Sopenharmony_ci panic!("pcre2_jit_compile_8 failed with error: {:?}", err); 55c67d6573Sopenharmony_ci } 56c67d6573Sopenharmony_ci let match_data = unsafe { 57c67d6573Sopenharmony_ci pcre2_match_data_create_from_pattern_8(code, ptr::null_mut()) 58c67d6573Sopenharmony_ci }; 59c67d6573Sopenharmony_ci if match_data.is_null() { 60c67d6573Sopenharmony_ci panic!("could not allocate match_data"); 61c67d6573Sopenharmony_ci } 62c67d6573Sopenharmony_ci let ovector = unsafe { pcre2_get_ovector_pointer_8(match_data) }; 63c67d6573Sopenharmony_ci if ovector.is_null() { 64c67d6573Sopenharmony_ci panic!("could not get ovector"); 65c67d6573Sopenharmony_ci } 66c67d6573Sopenharmony_ci Ok(Regex { code: code, match_data: match_data, ovector: ovector }) 67c67d6573Sopenharmony_ci } 68c67d6573Sopenharmony_ci 69c67d6573Sopenharmony_ci pub fn is_match(&self, text: &str) -> bool { 70c67d6573Sopenharmony_ci self.find_at(text, 0).is_some() 71c67d6573Sopenharmony_ci } 72c67d6573Sopenharmony_ci 73c67d6573Sopenharmony_ci pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { 74c67d6573Sopenharmony_ci FindMatches { re: self, text: text, last_match_end: 0 } 75c67d6573Sopenharmony_ci } 76c67d6573Sopenharmony_ci 77c67d6573Sopenharmony_ci fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 78c67d6573Sopenharmony_ci // The man pages for PCRE2 say that pcre2_jit_match is the fastest 79c67d6573Sopenharmony_ci // way to execute a JIT match because it skips sanity checks. We also 80c67d6573Sopenharmony_ci // explicitly disable the UTF-8 validity check, but it's probably not 81c67d6573Sopenharmony_ci // necessary. 82c67d6573Sopenharmony_ci let err = unsafe { 83c67d6573Sopenharmony_ci pcre2_jit_match_8( 84c67d6573Sopenharmony_ci self.code, 85c67d6573Sopenharmony_ci text.as_ptr(), 86c67d6573Sopenharmony_ci text.len(), 87c67d6573Sopenharmony_ci start, 88c67d6573Sopenharmony_ci PCRE2_NO_UTF_CHECK, 89c67d6573Sopenharmony_ci self.match_data, 90c67d6573Sopenharmony_ci ptr::null_mut(), 91c67d6573Sopenharmony_ci ) 92c67d6573Sopenharmony_ci }; 93c67d6573Sopenharmony_ci if err == PCRE2_ERROR_NOMATCH { 94c67d6573Sopenharmony_ci None 95c67d6573Sopenharmony_ci } else if err < 0 { 96c67d6573Sopenharmony_ci panic!("unknown error code: {:?}", err) 97c67d6573Sopenharmony_ci } else { 98c67d6573Sopenharmony_ci Some(unsafe { (*self.ovector, *self.ovector.offset(1)) }) 99c67d6573Sopenharmony_ci } 100c67d6573Sopenharmony_ci } 101c67d6573Sopenharmony_ci} 102c67d6573Sopenharmony_ci 103c67d6573Sopenharmony_cipub struct FindMatches<'r, 't> { 104c67d6573Sopenharmony_ci re: &'r Regex, 105c67d6573Sopenharmony_ci text: &'t str, 106c67d6573Sopenharmony_ci last_match_end: usize, 107c67d6573Sopenharmony_ci} 108c67d6573Sopenharmony_ci 109c67d6573Sopenharmony_ciimpl<'r, 't> Iterator for FindMatches<'r, 't> { 110c67d6573Sopenharmony_ci type Item = (usize, usize); 111c67d6573Sopenharmony_ci 112c67d6573Sopenharmony_ci fn next(&mut self) -> Option<(usize, usize)> { 113c67d6573Sopenharmony_ci match self.re.find_at(self.text, self.last_match_end) { 114c67d6573Sopenharmony_ci None => None, 115c67d6573Sopenharmony_ci Some((s, e)) => { 116c67d6573Sopenharmony_ci self.last_match_end = e; 117c67d6573Sopenharmony_ci Some((s, e)) 118c67d6573Sopenharmony_ci } 119c67d6573Sopenharmony_ci } 120c67d6573Sopenharmony_ci } 121c67d6573Sopenharmony_ci} 122c67d6573Sopenharmony_ci 123c67d6573Sopenharmony_ciimpl fmt::Debug for Error { 124c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 125c67d6573Sopenharmony_ci const BUF_LEN: size_t = 256; 126c67d6573Sopenharmony_ci let mut buf = [0; BUF_LEN]; 127c67d6573Sopenharmony_ci let len = unsafe { 128c67d6573Sopenharmony_ci pcre2_get_error_message_8(self.code, buf.as_mut_ptr(), BUF_LEN) 129c67d6573Sopenharmony_ci }; 130c67d6573Sopenharmony_ci if len < 0 { 131c67d6573Sopenharmony_ci write!( 132c67d6573Sopenharmony_ci f, 133c67d6573Sopenharmony_ci "Unknown PCRE error. (code: {:?}, offset: {:?})", 134c67d6573Sopenharmony_ci self.code, self.offset 135c67d6573Sopenharmony_ci ) 136c67d6573Sopenharmony_ci } else { 137c67d6573Sopenharmony_ci let msg = str::from_utf8(&buf[..len as usize]).unwrap(); 138c67d6573Sopenharmony_ci write!(f, "error at {:?}: {}", self.offset, msg) 139c67d6573Sopenharmony_ci } 140c67d6573Sopenharmony_ci } 141c67d6573Sopenharmony_ci} 142c67d6573Sopenharmony_ci 143c67d6573Sopenharmony_ci// PCRE2 FFI. We only wrap the bits we need. 144c67d6573Sopenharmony_ci 145c67d6573Sopenharmony_ciconst PCRE2_UCP: u32 = 0x00020000; 146c67d6573Sopenharmony_ciconst PCRE2_UTF: u32 = 0x00080000; 147c67d6573Sopenharmony_ciconst PCRE2_NO_UTF_CHECK: u32 = 0x40000000; 148c67d6573Sopenharmony_ciconst PCRE2_JIT_COMPLETE: u32 = 0x00000001; 149c67d6573Sopenharmony_ciconst PCRE2_ERROR_NOMATCH: c_int = -1; 150c67d6573Sopenharmony_ci 151c67d6573Sopenharmony_citype code = c_void; 152c67d6573Sopenharmony_ci 153c67d6573Sopenharmony_citype match_data = c_void; 154c67d6573Sopenharmony_ci 155c67d6573Sopenharmony_citype compile_context = c_void; // unused 156c67d6573Sopenharmony_ci 157c67d6573Sopenharmony_citype general_context = c_void; // unused 158c67d6573Sopenharmony_ci 159c67d6573Sopenharmony_citype match_context = c_void; // unused 160c67d6573Sopenharmony_ci 161c67d6573Sopenharmony_ciextern "C" { 162c67d6573Sopenharmony_ci fn pcre2_compile_8( 163c67d6573Sopenharmony_ci pattern: *const u8, 164c67d6573Sopenharmony_ci len: size_t, 165c67d6573Sopenharmony_ci options: u32, 166c67d6573Sopenharmony_ci error_code: *mut c_int, 167c67d6573Sopenharmony_ci error_offset: *mut size_t, 168c67d6573Sopenharmony_ci context: *mut compile_context, 169c67d6573Sopenharmony_ci ) -> *mut code; 170c67d6573Sopenharmony_ci 171c67d6573Sopenharmony_ci fn pcre2_code_free_8(code: *mut code); 172c67d6573Sopenharmony_ci 173c67d6573Sopenharmony_ci fn pcre2_match_data_create_from_pattern_8( 174c67d6573Sopenharmony_ci code: *const code, 175c67d6573Sopenharmony_ci context: *mut general_context, 176c67d6573Sopenharmony_ci ) -> *mut match_data; 177c67d6573Sopenharmony_ci 178c67d6573Sopenharmony_ci fn pcre2_match_data_free_8(match_data: *mut match_data); 179c67d6573Sopenharmony_ci 180c67d6573Sopenharmony_ci fn pcre2_get_ovector_pointer_8(match_data: *mut match_data) 181c67d6573Sopenharmony_ci -> *mut size_t; 182c67d6573Sopenharmony_ci 183c67d6573Sopenharmony_ci fn pcre2_jit_compile_8(code: *const code, options: u32) -> c_int; 184c67d6573Sopenharmony_ci 185c67d6573Sopenharmony_ci fn pcre2_jit_match_8( 186c67d6573Sopenharmony_ci code: *const code, 187c67d6573Sopenharmony_ci subject: *const u8, 188c67d6573Sopenharmony_ci length: size_t, 189c67d6573Sopenharmony_ci startoffset: size_t, 190c67d6573Sopenharmony_ci options: u32, 191c67d6573Sopenharmony_ci match_data: *mut match_data, 192c67d6573Sopenharmony_ci match_context: *mut match_context, 193c67d6573Sopenharmony_ci ) -> c_int; 194c67d6573Sopenharmony_ci 195c67d6573Sopenharmony_ci fn pcre2_get_error_message_8( 196c67d6573Sopenharmony_ci error_code: c_int, 197c67d6573Sopenharmony_ci buf: *mut u8, 198c67d6573Sopenharmony_ci buflen: size_t, 199c67d6573Sopenharmony_ci ) -> c_int; 200c67d6573Sopenharmony_ci} 201