1use std::collections::HashMap; 2use std::ffi::{CStr, CString}; 3use std::ops::Deref; 4use std::ptr; 5use std::slice; 6use std::str; 7 8use libc::{c_char, size_t}; 9use regex::bytes; 10 11use crate::error::{Error, ErrorKind}; 12 13const RURE_FLAG_CASEI: u32 = 1 << 0; 14const RURE_FLAG_MULTI: u32 = 1 << 1; 15const RURE_FLAG_DOTNL: u32 = 1 << 2; 16const RURE_FLAG_SWAP_GREED: u32 = 1 << 3; 17const RURE_FLAG_SPACE: u32 = 1 << 4; 18const RURE_FLAG_UNICODE: u32 = 1 << 5; 19const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE; 20 21pub struct Regex { 22 re: bytes::Regex, 23 capture_names: HashMap<String, i32>, 24} 25 26pub struct Options { 27 size_limit: usize, 28 dfa_size_limit: usize, 29} 30 31// The `RegexSet` is not exposed with option support or matching at an 32// arbitrary position with a crate just yet. To circumvent this, we use 33// the `Exec` structure directly. 34pub struct RegexSet { 35 re: bytes::RegexSet, 36} 37 38#[repr(C)] 39pub struct rure_match { 40 pub start: size_t, 41 pub end: size_t, 42} 43 44pub struct Captures(bytes::Locations); 45 46pub struct Iter { 47 re: *const Regex, 48 last_end: usize, 49 last_match: Option<usize>, 50} 51 52pub struct IterCaptureNames { 53 capture_names: bytes::CaptureNames<'static>, 54 name_ptrs: Vec<*mut c_char>, 55} 56 57impl Deref for Regex { 58 type Target = bytes::Regex; 59 fn deref(&self) -> &bytes::Regex { 60 &self.re 61 } 62} 63 64impl Deref for RegexSet { 65 type Target = bytes::RegexSet; 66 fn deref(&self) -> &bytes::RegexSet { 67 &self.re 68 } 69} 70 71impl Default for Options { 72 fn default() -> Options { 73 Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) } 74 } 75} 76 77ffi_fn! { 78 fn rure_compile_must(pattern: *const c_char) -> *const Regex { 79 let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; 80 let pat = pattern as *const u8; 81 let mut err = Error::new(ErrorKind::None); 82 let re = rure_compile( 83 pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err); 84 if err.is_err() { 85 let _ = writeln!(&mut io::stderr(), "{}", err); 86 let _ = writeln!( 87 &mut io::stderr(), "aborting from rure_compile_must"); 88 unsafe { abort() } 89 } 90 re 91 } 92} 93 94ffi_fn! { 95 fn rure_compile( 96 pattern: *const u8, 97 length: size_t, 98 flags: u32, 99 options: *const Options, 100 error: *mut Error, 101 ) -> *const Regex { 102 let pat = unsafe { slice::from_raw_parts(pattern, length) }; 103 let pat = match str::from_utf8(pat) { 104 Ok(pat) => pat, 105 Err(err) => { 106 unsafe { 107 if !error.is_null() { 108 *error = Error::new(ErrorKind::Str(err)); 109 } 110 return ptr::null(); 111 } 112 } 113 }; 114 let mut builder = bytes::RegexBuilder::new(pat); 115 if !options.is_null() { 116 let options = unsafe { &*options }; 117 builder.size_limit(options.size_limit); 118 builder.dfa_size_limit(options.dfa_size_limit); 119 } 120 builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); 121 builder.multi_line(flags & RURE_FLAG_MULTI > 0); 122 builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); 123 builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); 124 builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); 125 builder.unicode(flags & RURE_FLAG_UNICODE > 0); 126 match builder.build() { 127 Ok(re) => { 128 let mut capture_names = HashMap::new(); 129 for (i, name) in re.capture_names().enumerate() { 130 if let Some(name) = name { 131 capture_names.insert(name.to_owned(), i as i32); 132 } 133 } 134 let re = Regex { 135 re: re, 136 capture_names: capture_names, 137 }; 138 Box::into_raw(Box::new(re)) 139 } 140 Err(err) => { 141 unsafe { 142 if !error.is_null() { 143 *error = Error::new(ErrorKind::Regex(err)); 144 } 145 ptr::null() 146 } 147 } 148 } 149 } 150} 151 152ffi_fn! { 153 fn rure_free(re: *const Regex) { 154 unsafe { drop(Box::from_raw(re as *mut Regex)); } 155 } 156} 157 158ffi_fn! { 159 fn rure_is_match( 160 re: *const Regex, 161 haystack: *const u8, 162 len: size_t, 163 start: size_t, 164 ) -> bool { 165 let re = unsafe { &*re }; 166 let haystack = unsafe { slice::from_raw_parts(haystack, len) }; 167 re.is_match_at(haystack, start) 168 } 169} 170 171ffi_fn! { 172 fn rure_find( 173 re: *const Regex, 174 haystack: *const u8, 175 len: size_t, 176 start: size_t, 177 match_info: *mut rure_match, 178 ) -> bool { 179 let re = unsafe { &*re }; 180 let haystack = unsafe { slice::from_raw_parts(haystack, len) }; 181 re.find_at(haystack, start).map(|m| unsafe { 182 if !match_info.is_null() { 183 (*match_info).start = m.start(); 184 (*match_info).end = m.end(); 185 } 186 }).is_some() 187 } 188} 189 190ffi_fn! { 191 fn rure_find_captures( 192 re: *const Regex, 193 haystack: *const u8, 194 len: size_t, 195 start: size_t, 196 captures: *mut Captures, 197 ) -> bool { 198 let re = unsafe { &*re }; 199 let haystack = unsafe { slice::from_raw_parts(haystack, len) }; 200 let slots = unsafe { &mut (*captures).0 }; 201 re.read_captures_at(slots, haystack, start).is_some() 202 } 203} 204 205ffi_fn! { 206 fn rure_shortest_match( 207 re: *const Regex, 208 haystack: *const u8, 209 len: size_t, 210 start: size_t, 211 end: *mut usize, 212 ) -> bool { 213 let re = unsafe { &*re }; 214 let haystack = unsafe { slice::from_raw_parts(haystack, len) }; 215 match re.shortest_match_at(haystack, start) { 216 None => false, 217 Some(i) => { 218 if !end.is_null() { 219 unsafe { 220 *end = i; 221 } 222 } 223 true 224 } 225 } 226 } 227} 228 229ffi_fn! { 230 fn rure_capture_name_index( 231 re: *const Regex, 232 name: *const c_char, 233 ) -> i32 { 234 let re = unsafe { &*re }; 235 let name = unsafe { CStr::from_ptr(name) }; 236 let name = match name.to_str() { 237 Err(_) => return -1, 238 Ok(name) => name, 239 }; 240 re.capture_names.get(name).map(|&i|i).unwrap_or(-1) 241 } 242} 243 244ffi_fn! { 245 fn rure_iter_capture_names_new( 246 re: *const Regex, 247 ) -> *mut IterCaptureNames { 248 let re = unsafe { &*re }; 249 Box::into_raw(Box::new(IterCaptureNames { 250 capture_names: re.re.capture_names(), 251 name_ptrs: Vec::new(), 252 })) 253 } 254} 255 256ffi_fn! { 257 fn rure_iter_capture_names_free(it: *mut IterCaptureNames) { 258 unsafe { 259 let it = &mut *it; 260 while let Some(ptr) = it.name_ptrs.pop() { 261 drop(CString::from_raw(ptr)); 262 } 263 drop(Box::from_raw(it)); 264 } 265 } 266} 267 268ffi_fn! { 269 fn rure_iter_capture_names_next( 270 it: *mut IterCaptureNames, 271 capture_name: *mut *mut c_char, 272 ) -> bool { 273 if capture_name.is_null() { 274 return false; 275 } 276 277 let it = unsafe { &mut *it }; 278 let cn = match it.capture_names.next() { 279 // Top-level iterator ran out of capture groups 280 None => return false, 281 Some(val) => { 282 let name = match val { 283 // inner Option didn't have a name 284 None => "", 285 Some(name) => name 286 }; 287 name 288 } 289 }; 290 291 unsafe { 292 let cs = match CString::new(cn.as_bytes()) { 293 Result::Ok(val) => val, 294 Result::Err(_) => return false 295 }; 296 let ptr = cs.into_raw(); 297 it.name_ptrs.push(ptr); 298 *capture_name = ptr; 299 } 300 true 301 302 } 303} 304 305ffi_fn! { 306 fn rure_iter_new( 307 re: *const Regex, 308 ) -> *mut Iter { 309 Box::into_raw(Box::new(Iter { 310 re: re, 311 last_end: 0, 312 last_match: None, 313 })) 314 } 315} 316 317ffi_fn! { 318 fn rure_iter_free(it: *mut Iter) { 319 unsafe { drop(Box::from_raw(it)); } 320 } 321} 322 323ffi_fn! { 324 fn rure_iter_next( 325 it: *mut Iter, 326 haystack: *const u8, 327 len: size_t, 328 match_info: *mut rure_match, 329 ) -> bool { 330 let it = unsafe { &mut *it }; 331 let re = unsafe { &*it.re }; 332 let text = unsafe { slice::from_raw_parts(haystack, len) }; 333 if it.last_end > text.len() { 334 return false; 335 } 336 let (s, e) = match re.find_at(text, it.last_end) { 337 None => return false, 338 Some(m) => (m.start(), m.end()), 339 }; 340 if s == e { 341 // This is an empty match. To ensure we make progress, start 342 // the next search at the smallest possible starting position 343 // of the next match following this one. 344 it.last_end += 1; 345 // Don't accept empty matches immediately following a match. 346 // Just move on to the next match. 347 if Some(e) == it.last_match { 348 return rure_iter_next(it, haystack, len, match_info); 349 } 350 } else { 351 it.last_end = e; 352 } 353 it.last_match = Some(e); 354 if !match_info.is_null() { 355 unsafe { 356 (*match_info).start = s; 357 (*match_info).end = e; 358 } 359 } 360 true 361 } 362} 363 364ffi_fn! { 365 fn rure_iter_next_captures( 366 it: *mut Iter, 367 haystack: *const u8, 368 len: size_t, 369 captures: *mut Captures, 370 ) -> bool { 371 let it = unsafe { &mut *it }; 372 let re = unsafe { &*it.re }; 373 let slots = unsafe { &mut (*captures).0 }; 374 let text = unsafe { slice::from_raw_parts(haystack, len) }; 375 if it.last_end > text.len() { 376 return false; 377 } 378 let (s, e) = match re.read_captures_at(slots, text, it.last_end) { 379 None => return false, 380 Some(m) => (m.start(), m.end()), 381 }; 382 if s == e { 383 // This is an empty match. To ensure we make progress, start 384 // the next search at the smallest possible starting position 385 // of the next match following this one. 386 it.last_end += 1; 387 // Don't accept empty matches immediately following a match. 388 // Just move on to the next match. 389 if Some(e) == it.last_match { 390 return rure_iter_next_captures(it, haystack, len, captures); 391 } 392 } else { 393 it.last_end = e; 394 } 395 it.last_match = Some(e); 396 true 397 } 398} 399 400ffi_fn! { 401 fn rure_captures_new(re: *const Regex) -> *mut Captures { 402 let re = unsafe { &*re }; 403 let captures = Captures(re.locations()); 404 Box::into_raw(Box::new(captures)) 405 } 406} 407 408ffi_fn! { 409 fn rure_captures_free(captures: *const Captures) { 410 unsafe { drop(Box::from_raw(captures as *mut Captures)); } 411 } 412} 413 414ffi_fn! { 415 fn rure_captures_at( 416 captures: *const Captures, 417 i: size_t, 418 match_info: *mut rure_match, 419 ) -> bool { 420 let locs = unsafe { &(*captures).0 }; 421 match locs.pos(i) { 422 Some((start, end)) => { 423 if !match_info.is_null() { 424 unsafe { 425 (*match_info).start = start; 426 (*match_info).end = end; 427 } 428 } 429 true 430 } 431 _ => false 432 } 433 } 434} 435 436ffi_fn! { 437 fn rure_captures_len(captures: *const Captures) -> size_t { 438 unsafe { (*captures).0.len() } 439 } 440} 441 442ffi_fn! { 443 fn rure_options_new() -> *mut Options { 444 Box::into_raw(Box::new(Options::default())) 445 } 446} 447 448ffi_fn! { 449 fn rure_options_free(options: *mut Options) { 450 unsafe { drop(Box::from_raw(options)); } 451 } 452} 453 454ffi_fn! { 455 fn rure_options_size_limit(options: *mut Options, limit: size_t) { 456 let options = unsafe { &mut *options }; 457 options.size_limit = limit; 458 } 459} 460 461ffi_fn! { 462 fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) { 463 let options = unsafe { &mut *options }; 464 options.dfa_size_limit = limit; 465 } 466} 467 468ffi_fn! { 469 fn rure_compile_set( 470 patterns: *const *const u8, 471 patterns_lengths: *const size_t, 472 patterns_count: size_t, 473 flags: u32, 474 options: *const Options, 475 error: *mut Error 476 ) -> *const RegexSet { 477 let (raw_pats, raw_patsl) = unsafe { 478 ( 479 slice::from_raw_parts(patterns, patterns_count), 480 slice::from_raw_parts(patterns_lengths, patterns_count) 481 ) 482 }; 483 484 let mut pats = Vec::with_capacity(patterns_count); 485 for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) { 486 let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) }; 487 pats.push(match str::from_utf8(pat) { 488 Ok(pat) => pat, 489 Err(err) => { 490 unsafe { 491 if !error.is_null() { 492 *error = Error::new(ErrorKind::Str(err)); 493 } 494 return ptr::null(); 495 } 496 } 497 }); 498 } 499 500 let mut builder = bytes::RegexSetBuilder::new(pats); 501 if !options.is_null() { 502 let options = unsafe { &*options }; 503 builder.size_limit(options.size_limit); 504 builder.dfa_size_limit(options.dfa_size_limit); 505 } 506 builder.case_insensitive(flags & RURE_FLAG_CASEI > 0); 507 builder.multi_line(flags & RURE_FLAG_MULTI > 0); 508 builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0); 509 builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0); 510 builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0); 511 builder.unicode(flags & RURE_FLAG_UNICODE > 0); 512 match builder.build() { 513 Ok(re) => { 514 Box::into_raw(Box::new(RegexSet { re: re })) 515 } 516 Err(err) => { 517 unsafe { 518 if !error.is_null() { 519 *error = Error::new(ErrorKind::Regex(err)) 520 } 521 ptr::null() 522 } 523 } 524 } 525 } 526} 527 528ffi_fn! { 529 fn rure_set_free(re: *const RegexSet) { 530 unsafe { drop(Box::from_raw(re as *mut RegexSet)); } 531 } 532} 533 534ffi_fn! { 535 fn rure_set_is_match( 536 re: *const RegexSet, 537 haystack: *const u8, 538 len: size_t, 539 start: size_t 540 ) -> bool { 541 let re = unsafe { &*re }; 542 let haystack = unsafe { slice::from_raw_parts(haystack, len) }; 543 re.is_match_at(haystack, start) 544 } 545} 546 547ffi_fn! { 548 fn rure_set_matches( 549 re: *const RegexSet, 550 haystack: *const u8, 551 len: size_t, 552 start: size_t, 553 matches: *mut bool 554 ) -> bool { 555 let re = unsafe { &*re }; 556 let mut matches = unsafe { 557 slice::from_raw_parts_mut(matches, re.len()) 558 }; 559 let haystack = unsafe { slice::from_raw_parts(haystack, len) }; 560 561 // read_matches_at isn't guaranteed to set non-matches to false 562 for item in matches.iter_mut() { 563 *item = false; 564 } 565 re.read_matches_at(&mut matches, haystack, start) 566 } 567} 568 569ffi_fn! { 570 fn rure_set_len(re: *const RegexSet) -> size_t { 571 unsafe { (*re).len() } 572 } 573} 574 575ffi_fn! { 576 fn rure_escape_must(pattern: *const c_char) -> *const c_char { 577 let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() }; 578 let pat = pattern as *const u8; 579 let mut err = Error::new(ErrorKind::None); 580 let esc = rure_escape(pat, len, &mut err); 581 if err.is_err() { 582 let _ = writeln!(&mut io::stderr(), "{}", err); 583 let _ = writeln!( 584 &mut io::stderr(), "aborting from rure_escape_must"); 585 unsafe { abort() } 586 } 587 esc 588 } 589} 590 591/// A helper function that implements fallible escaping in a way that returns 592/// an error if escaping failed. 593/// 594/// This should ideally be exposed, but it needs API design work. In 595/// particular, this should not return a C string, but a `const uint8_t *` 596/// instead, since it may contain a NUL byte. 597fn rure_escape( 598 pattern: *const u8, 599 length: size_t, 600 error: *mut Error, 601) -> *const c_char { 602 let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) }; 603 let str_pat = match str::from_utf8(pat) { 604 Ok(val) => val, 605 Err(err) => unsafe { 606 if !error.is_null() { 607 *error = Error::new(ErrorKind::Str(err)); 608 } 609 return ptr::null(); 610 }, 611 }; 612 let esc_pat = regex::escape(str_pat); 613 let c_esc_pat = match CString::new(esc_pat) { 614 Ok(val) => val, 615 Err(err) => unsafe { 616 if !error.is_null() { 617 *error = Error::new(ErrorKind::Nul(err)); 618 } 619 return ptr::null(); 620 }, 621 }; 622 c_esc_pat.into_raw() as *const c_char 623} 624 625ffi_fn! { 626 fn rure_cstring_free(s: *mut c_char) { 627 unsafe { drop(CString::from_raw(s)); } 628 } 629} 630