1c67d6573Sopenharmony_ci#![allow(non_camel_case_types)]
2c67d6573Sopenharmony_ci
3c67d6573Sopenharmony_ciuse std::fmt;
4c67d6573Sopenharmony_ciuse std::ptr;
5c67d6573Sopenharmony_ciuse std::str;
6c67d6573Sopenharmony_ci
7c67d6573Sopenharmony_ciuse libc::{c_int, c_void, size_t};
8c67d6573Sopenharmony_ci
9c67d6573Sopenharmony_cipub struct Regex {
10c67d6573Sopenharmony_ci    code: *mut code,
11c67d6573Sopenharmony_ci    match_data: *mut match_data,
12c67d6573Sopenharmony_ci    ovector: *mut size_t,
13c67d6573Sopenharmony_ci}
14c67d6573Sopenharmony_ci
15c67d6573Sopenharmony_ciunsafe impl Send for Regex {}
16c67d6573Sopenharmony_ci
17c67d6573Sopenharmony_ciimpl Drop for Regex {
18c67d6573Sopenharmony_ci    fn drop(&mut self) {
19c67d6573Sopenharmony_ci        unsafe {
20c67d6573Sopenharmony_ci            pcre2_match_data_free_8(self.match_data);
21c67d6573Sopenharmony_ci            pcre2_code_free_8(self.code);
22c67d6573Sopenharmony_ci        }
23c67d6573Sopenharmony_ci    }
24c67d6573Sopenharmony_ci}
25c67d6573Sopenharmony_ci
26c67d6573Sopenharmony_cipub struct Error {
27c67d6573Sopenharmony_ci    code: c_int,
28c67d6573Sopenharmony_ci    offset: size_t,
29c67d6573Sopenharmony_ci}
30c67d6573Sopenharmony_ci
31c67d6573Sopenharmony_ciimpl Regex {
32c67d6573Sopenharmony_ci    pub fn new(pattern: &str) -> Result<Regex, Error> {
33c67d6573Sopenharmony_ci        let mut error_code: c_int = 0;
34c67d6573Sopenharmony_ci        let mut error_offset: size_t = 0;
35c67d6573Sopenharmony_ci        let code = unsafe {
36c67d6573Sopenharmony_ci            pcre2_compile_8(
37c67d6573Sopenharmony_ci                pattern.as_ptr(),
38c67d6573Sopenharmony_ci                pattern.len(),
39c67d6573Sopenharmony_ci                // PCRE2 can get significantly faster in some cases depending
40c67d6573Sopenharmony_ci                // on the permutation of these options (in particular, dropping
41c67d6573Sopenharmony_ci                // UCP). We should endeavor to have a separate "ASCII compatible"
42c67d6573Sopenharmony_ci                // benchmark.
43c67d6573Sopenharmony_ci                PCRE2_UCP | PCRE2_UTF,
44c67d6573Sopenharmony_ci                &mut error_code,
45c67d6573Sopenharmony_ci                &mut error_offset,
46c67d6573Sopenharmony_ci                ptr::null_mut(),
47c67d6573Sopenharmony_ci            )
48c67d6573Sopenharmony_ci        };
49c67d6573Sopenharmony_ci        if code.is_null() {
50c67d6573Sopenharmony_ci            return Err(Error { code: error_code, offset: error_offset });
51c67d6573Sopenharmony_ci        }
52c67d6573Sopenharmony_ci        let err = unsafe { pcre2_jit_compile_8(code, PCRE2_JIT_COMPLETE) };
53c67d6573Sopenharmony_ci        if err < 0 {
54c67d6573Sopenharmony_ci            panic!("pcre2_jit_compile_8 failed with error: {:?}", err);
55c67d6573Sopenharmony_ci        }
56c67d6573Sopenharmony_ci        let match_data = unsafe {
57c67d6573Sopenharmony_ci            pcre2_match_data_create_from_pattern_8(code, ptr::null_mut())
58c67d6573Sopenharmony_ci        };
59c67d6573Sopenharmony_ci        if match_data.is_null() {
60c67d6573Sopenharmony_ci            panic!("could not allocate match_data");
61c67d6573Sopenharmony_ci        }
62c67d6573Sopenharmony_ci        let ovector = unsafe { pcre2_get_ovector_pointer_8(match_data) };
63c67d6573Sopenharmony_ci        if ovector.is_null() {
64c67d6573Sopenharmony_ci            panic!("could not get ovector");
65c67d6573Sopenharmony_ci        }
66c67d6573Sopenharmony_ci        Ok(Regex { code: code, match_data: match_data, ovector: ovector })
67c67d6573Sopenharmony_ci    }
68c67d6573Sopenharmony_ci
69c67d6573Sopenharmony_ci    pub fn is_match(&self, text: &str) -> bool {
70c67d6573Sopenharmony_ci        self.find_at(text, 0).is_some()
71c67d6573Sopenharmony_ci    }
72c67d6573Sopenharmony_ci
73c67d6573Sopenharmony_ci    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
74c67d6573Sopenharmony_ci        FindMatches { re: self, text: text, last_match_end: 0 }
75c67d6573Sopenharmony_ci    }
76c67d6573Sopenharmony_ci
77c67d6573Sopenharmony_ci    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
78c67d6573Sopenharmony_ci        // The man pages for PCRE2 say that pcre2_jit_match is the fastest
79c67d6573Sopenharmony_ci        // way to execute a JIT match because it skips sanity checks. We also
80c67d6573Sopenharmony_ci        // explicitly disable the UTF-8 validity check, but it's probably not
81c67d6573Sopenharmony_ci        // necessary.
82c67d6573Sopenharmony_ci        let err = unsafe {
83c67d6573Sopenharmony_ci            pcre2_jit_match_8(
84c67d6573Sopenharmony_ci                self.code,
85c67d6573Sopenharmony_ci                text.as_ptr(),
86c67d6573Sopenharmony_ci                text.len(),
87c67d6573Sopenharmony_ci                start,
88c67d6573Sopenharmony_ci                PCRE2_NO_UTF_CHECK,
89c67d6573Sopenharmony_ci                self.match_data,
90c67d6573Sopenharmony_ci                ptr::null_mut(),
91c67d6573Sopenharmony_ci            )
92c67d6573Sopenharmony_ci        };
93c67d6573Sopenharmony_ci        if err == PCRE2_ERROR_NOMATCH {
94c67d6573Sopenharmony_ci            None
95c67d6573Sopenharmony_ci        } else if err < 0 {
96c67d6573Sopenharmony_ci            panic!("unknown error code: {:?}", err)
97c67d6573Sopenharmony_ci        } else {
98c67d6573Sopenharmony_ci            Some(unsafe { (*self.ovector, *self.ovector.offset(1)) })
99c67d6573Sopenharmony_ci        }
100c67d6573Sopenharmony_ci    }
101c67d6573Sopenharmony_ci}
102c67d6573Sopenharmony_ci
103c67d6573Sopenharmony_cipub struct FindMatches<'r, 't> {
104c67d6573Sopenharmony_ci    re: &'r Regex,
105c67d6573Sopenharmony_ci    text: &'t str,
106c67d6573Sopenharmony_ci    last_match_end: usize,
107c67d6573Sopenharmony_ci}
108c67d6573Sopenharmony_ci
109c67d6573Sopenharmony_ciimpl<'r, 't> Iterator for FindMatches<'r, 't> {
110c67d6573Sopenharmony_ci    type Item = (usize, usize);
111c67d6573Sopenharmony_ci
112c67d6573Sopenharmony_ci    fn next(&mut self) -> Option<(usize, usize)> {
113c67d6573Sopenharmony_ci        match self.re.find_at(self.text, self.last_match_end) {
114c67d6573Sopenharmony_ci            None => None,
115c67d6573Sopenharmony_ci            Some((s, e)) => {
116c67d6573Sopenharmony_ci                self.last_match_end = e;
117c67d6573Sopenharmony_ci                Some((s, e))
118c67d6573Sopenharmony_ci            }
119c67d6573Sopenharmony_ci        }
120c67d6573Sopenharmony_ci    }
121c67d6573Sopenharmony_ci}
122c67d6573Sopenharmony_ci
123c67d6573Sopenharmony_ciimpl fmt::Debug for Error {
124c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
125c67d6573Sopenharmony_ci        const BUF_LEN: size_t = 256;
126c67d6573Sopenharmony_ci        let mut buf = [0; BUF_LEN];
127c67d6573Sopenharmony_ci        let len = unsafe {
128c67d6573Sopenharmony_ci            pcre2_get_error_message_8(self.code, buf.as_mut_ptr(), BUF_LEN)
129c67d6573Sopenharmony_ci        };
130c67d6573Sopenharmony_ci        if len < 0 {
131c67d6573Sopenharmony_ci            write!(
132c67d6573Sopenharmony_ci                f,
133c67d6573Sopenharmony_ci                "Unknown PCRE error. (code: {:?}, offset: {:?})",
134c67d6573Sopenharmony_ci                self.code, self.offset
135c67d6573Sopenharmony_ci            )
136c67d6573Sopenharmony_ci        } else {
137c67d6573Sopenharmony_ci            let msg = str::from_utf8(&buf[..len as usize]).unwrap();
138c67d6573Sopenharmony_ci            write!(f, "error at {:?}: {}", self.offset, msg)
139c67d6573Sopenharmony_ci        }
140c67d6573Sopenharmony_ci    }
141c67d6573Sopenharmony_ci}
142c67d6573Sopenharmony_ci
143c67d6573Sopenharmony_ci// PCRE2 FFI. We only wrap the bits we need.
144c67d6573Sopenharmony_ci
145c67d6573Sopenharmony_ciconst PCRE2_UCP: u32 = 0x00020000;
146c67d6573Sopenharmony_ciconst PCRE2_UTF: u32 = 0x00080000;
147c67d6573Sopenharmony_ciconst PCRE2_NO_UTF_CHECK: u32 = 0x40000000;
148c67d6573Sopenharmony_ciconst PCRE2_JIT_COMPLETE: u32 = 0x00000001;
149c67d6573Sopenharmony_ciconst PCRE2_ERROR_NOMATCH: c_int = -1;
150c67d6573Sopenharmony_ci
151c67d6573Sopenharmony_citype code = c_void;
152c67d6573Sopenharmony_ci
153c67d6573Sopenharmony_citype match_data = c_void;
154c67d6573Sopenharmony_ci
155c67d6573Sopenharmony_citype compile_context = c_void; // unused
156c67d6573Sopenharmony_ci
157c67d6573Sopenharmony_citype general_context = c_void; // unused
158c67d6573Sopenharmony_ci
159c67d6573Sopenharmony_citype match_context = c_void; // unused
160c67d6573Sopenharmony_ci
161c67d6573Sopenharmony_ciextern "C" {
162c67d6573Sopenharmony_ci    fn pcre2_compile_8(
163c67d6573Sopenharmony_ci        pattern: *const u8,
164c67d6573Sopenharmony_ci        len: size_t,
165c67d6573Sopenharmony_ci        options: u32,
166c67d6573Sopenharmony_ci        error_code: *mut c_int,
167c67d6573Sopenharmony_ci        error_offset: *mut size_t,
168c67d6573Sopenharmony_ci        context: *mut compile_context,
169c67d6573Sopenharmony_ci    ) -> *mut code;
170c67d6573Sopenharmony_ci
171c67d6573Sopenharmony_ci    fn pcre2_code_free_8(code: *mut code);
172c67d6573Sopenharmony_ci
173c67d6573Sopenharmony_ci    fn pcre2_match_data_create_from_pattern_8(
174c67d6573Sopenharmony_ci        code: *const code,
175c67d6573Sopenharmony_ci        context: *mut general_context,
176c67d6573Sopenharmony_ci    ) -> *mut match_data;
177c67d6573Sopenharmony_ci
178c67d6573Sopenharmony_ci    fn pcre2_match_data_free_8(match_data: *mut match_data);
179c67d6573Sopenharmony_ci
180c67d6573Sopenharmony_ci    fn pcre2_get_ovector_pointer_8(match_data: *mut match_data)
181c67d6573Sopenharmony_ci        -> *mut size_t;
182c67d6573Sopenharmony_ci
183c67d6573Sopenharmony_ci    fn pcre2_jit_compile_8(code: *const code, options: u32) -> c_int;
184c67d6573Sopenharmony_ci
185c67d6573Sopenharmony_ci    fn pcre2_jit_match_8(
186c67d6573Sopenharmony_ci        code: *const code,
187c67d6573Sopenharmony_ci        subject: *const u8,
188c67d6573Sopenharmony_ci        length: size_t,
189c67d6573Sopenharmony_ci        startoffset: size_t,
190c67d6573Sopenharmony_ci        options: u32,
191c67d6573Sopenharmony_ci        match_data: *mut match_data,
192c67d6573Sopenharmony_ci        match_context: *mut match_context,
193c67d6573Sopenharmony_ci    ) -> c_int;
194c67d6573Sopenharmony_ci
195c67d6573Sopenharmony_ci    fn pcre2_get_error_message_8(
196c67d6573Sopenharmony_ci        error_code: c_int,
197c67d6573Sopenharmony_ci        buf: *mut u8,
198c67d6573Sopenharmony_ci        buflen: size_t,
199c67d6573Sopenharmony_ci    ) -> c_int;
200c67d6573Sopenharmony_ci}
201