1use std::collections::HashMap; 2use std::fmt; 3use std::iter; 4use std::result; 5use std::sync::Arc; 6 7use regex_syntax::hir::{self, Hir}; 8use regex_syntax::is_word_byte; 9use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; 10 11use crate::prog::{ 12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, 13 InstSave, InstSplit, Program, 14}; 15 16use crate::Error; 17 18type Result = result::Result<Patch, Error>; 19type ResultOrEmpty = result::Result<Option<Patch>, Error>; 20 21#[derive(Debug)] 22struct Patch { 23 hole: Hole, 24 entry: InstPtr, 25} 26 27/// A compiler translates a regular expression AST to a sequence of 28/// instructions. The sequence of instructions represents an NFA. 29// `Compiler` is only public via the `internal` module, so avoid deriving 30// `Debug`. 31#[allow(missing_debug_implementations)] 32pub struct Compiler { 33 insts: Vec<MaybeInst>, 34 compiled: Program, 35 capture_name_idx: HashMap<String, usize>, 36 num_exprs: usize, 37 size_limit: usize, 38 suffix_cache: SuffixCache, 39 utf8_seqs: Option<Utf8Sequences>, 40 byte_classes: ByteClassSet, 41 // This keeps track of extra bytes allocated while compiling the regex 42 // program. Currently, this corresponds to two things. First is the heap 43 // memory allocated by Unicode character classes ('InstRanges'). Second is 44 // a "fake" amount of memory used by empty sub-expressions, so that enough 45 // empty sub-expressions will ultimately trigger the compiler to bail 46 // because of a size limit restriction. (That empty sub-expressions don't 47 // add to heap memory usage is more-or-less an implementation detail.) In 48 // the second case, if we don't bail, then an excessively large repetition 49 // on an empty sub-expression can result in the compiler using a very large 50 // amount of CPU time. 51 extra_inst_bytes: usize, 52} 53 54impl Compiler { 55 /// Create a new regular expression compiler. 
56 /// 57 /// Various options can be set before calling `compile` on an expression. 58 pub fn new() -> Self { 59 Compiler { 60 insts: vec![], 61 compiled: Program::new(), 62 capture_name_idx: HashMap::new(), 63 num_exprs: 0, 64 size_limit: 10 * (1 << 20), 65 suffix_cache: SuffixCache::new(1000), 66 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), 67 byte_classes: ByteClassSet::new(), 68 extra_inst_bytes: 0, 69 } 70 } 71 72 /// The size of the resulting program is limited by size_limit. If 73 /// the program approximately exceeds the given size (in bytes), then 74 /// compilation will stop and return an error. 75 pub fn size_limit(mut self, size_limit: usize) -> Self { 76 self.size_limit = size_limit; 77 self 78 } 79 80 /// If bytes is true, then the program is compiled as a byte based 81 /// automaton, which incorporates UTF-8 decoding into the machine. If it's 82 /// false, then the automaton is Unicode scalar value based, e.g., an 83 /// engine utilizing such an automaton is responsible for UTF-8 decoding. 84 /// 85 /// The specific invariant is that when returning a byte based machine, 86 /// the neither the `Char` nor `Ranges` instructions are produced. 87 /// Conversely, when producing a Unicode scalar value machine, the `Bytes` 88 /// instruction is never produced. 89 /// 90 /// Note that `dfa(true)` implies `bytes(true)`. 91 pub fn bytes(mut self, yes: bool) -> Self { 92 self.compiled.is_bytes = yes; 93 self 94 } 95 96 /// When disabled, the program compiled may match arbitrary bytes. 97 /// 98 /// When enabled (the default), all compiled programs exclusively match 99 /// valid UTF-8 bytes. 100 pub fn only_utf8(mut self, yes: bool) -> Self { 101 self.compiled.only_utf8 = yes; 102 self 103 } 104 105 /// When set, the machine returned is suitable for use in the DFA matching 106 /// engine. 107 /// 108 /// In particular, this ensures that if the regex is not anchored in the 109 /// beginning, then a preceding `.*?` is included in the program. 
(The NFA 110 /// based engines handle the preceding `.*?` explicitly, which is difficult 111 /// or impossible in the DFA engine.) 112 pub fn dfa(mut self, yes: bool) -> Self { 113 self.compiled.is_dfa = yes; 114 self 115 } 116 117 /// When set, the machine returned is suitable for matching text in 118 /// reverse. In particular, all concatenations are flipped. 119 pub fn reverse(mut self, yes: bool) -> Self { 120 self.compiled.is_reverse = yes; 121 self 122 } 123 124 /// Compile a regular expression given its AST. 125 /// 126 /// The compiler is guaranteed to succeed unless the program exceeds the 127 /// specified size limit. If the size limit is exceeded, then compilation 128 /// stops and returns an error. 129 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { 130 debug_assert!(!exprs.is_empty()); 131 self.num_exprs = exprs.len(); 132 if exprs.len() == 1 { 133 self.compile_one(&exprs[0]) 134 } else { 135 self.compile_many(exprs) 136 } 137 } 138 139 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { 140 // If we're compiling a forward DFA and we aren't anchored, then 141 // add a `.*?` before the first capture group. 142 // Other matching engines handle this by baking the logic into the 143 // matching engine itself. 
144 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; 145 self.compiled.is_anchored_start = expr.is_anchored_start(); 146 self.compiled.is_anchored_end = expr.is_anchored_end(); 147 if self.compiled.needs_dotstar() { 148 dotstar_patch = self.c_dotstar()?; 149 self.compiled.start = dotstar_patch.entry; 150 } 151 self.compiled.captures = vec![None]; 152 let patch = 153 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); 154 if self.compiled.needs_dotstar() { 155 self.fill(dotstar_patch.hole, patch.entry); 156 } else { 157 self.compiled.start = patch.entry; 158 } 159 self.fill_to_next(patch.hole); 160 self.compiled.matches = vec![self.insts.len()]; 161 self.push_compiled(Inst::Match(0)); 162 self.compile_finish() 163 } 164 165 fn compile_many( 166 mut self, 167 exprs: &[Hir], 168 ) -> result::Result<Program, Error> { 169 debug_assert!(exprs.len() > 1); 170 171 self.compiled.is_anchored_start = 172 exprs.iter().all(|e| e.is_anchored_start()); 173 self.compiled.is_anchored_end = 174 exprs.iter().all(|e| e.is_anchored_end()); 175 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; 176 if self.compiled.needs_dotstar() { 177 dotstar_patch = self.c_dotstar()?; 178 self.compiled.start = dotstar_patch.entry; 179 } else { 180 self.compiled.start = 0; // first instruction is always split 181 } 182 self.fill_to_next(dotstar_patch.hole); 183 184 let mut prev_hole = Hole::None; 185 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { 186 self.fill_to_next(prev_hole); 187 let split = self.push_split_hole(); 188 let Patch { hole, entry } = 189 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); 190 self.fill_to_next(hole); 191 self.compiled.matches.push(self.insts.len()); 192 self.push_compiled(Inst::Match(i)); 193 prev_hole = self.fill_split(split, Some(entry), None); 194 } 195 let i = exprs.len() - 1; 196 let Patch { hole, entry } = 197 self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); 198 self.fill(prev_hole, 
entry); 199 self.fill_to_next(hole); 200 self.compiled.matches.push(self.insts.len()); 201 self.push_compiled(Inst::Match(i)); 202 self.compile_finish() 203 } 204 205 fn compile_finish(mut self) -> result::Result<Program, Error> { 206 self.compiled.insts = 207 self.insts.into_iter().map(|inst| inst.unwrap()).collect(); 208 self.compiled.byte_classes = self.byte_classes.byte_classes(); 209 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); 210 Ok(self.compiled) 211 } 212 213 /// Compile expr into self.insts, returning a patch on success, 214 /// or an error if we run out of memory. 215 /// 216 /// All of the c_* methods of the compiler share the contract outlined 217 /// here. 218 /// 219 /// The main thing that a c_* method does is mutate `self.insts` 220 /// to add a list of mostly compiled instructions required to execute 221 /// the given expression. `self.insts` contains MaybeInsts rather than 222 /// Insts because there is some backpatching required. 223 /// 224 /// The `Patch` value returned by each c_* method provides metadata 225 /// about the compiled instructions emitted to `self.insts`. The 226 /// `entry` member of the patch refers to the first instruction 227 /// (the entry point), while the `hole` member contains zero or 228 /// more offsets to partial instructions that need to be backpatched. 229 /// The c_* routine can't know where its list of instructions are going to 230 /// jump to after execution, so it is up to the caller to patch 231 /// these jumps to point to the right place. So compiling some 232 /// expression, e, we would end up with a situation that looked like: 233 /// 234 /// ```text 235 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] 
236 /// ^ ^ ^ 237 /// | \ / 238 /// entry \ / 239 /// hole 240 /// ``` 241 /// 242 /// To compile two expressions, e1 and e2, concatenated together we 243 /// would do: 244 /// 245 /// ```ignore 246 /// let patch1 = self.c(e1); 247 /// let patch2 = self.c(e2); 248 /// ``` 249 /// 250 /// while leaves us with a situation that looks like 251 /// 252 /// ```text 253 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] 254 /// ^ ^ ^ ^ 255 /// | | | | 256 /// entry1 hole1 entry2 hole2 257 /// ``` 258 /// 259 /// Then to merge the two patches together into one we would backpatch 260 /// hole1 with entry2 and return a new patch that enters at entry1 261 /// and has hole2 for a hole. In fact, if you look at the c_concat 262 /// method you will see that it does exactly this, though it handles 263 /// a list of expressions rather than just the two that we use for 264 /// an example. 265 /// 266 /// Ok(None) is returned when an expression is compiled to no 267 /// instruction, and so no patch.entry value makes sense. 
268 fn c(&mut self, expr: &Hir) -> ResultOrEmpty { 269 use crate::prog; 270 use regex_syntax::hir::HirKind::*; 271 272 self.check_size()?; 273 match *expr.kind() { 274 Empty => self.c_empty(), 275 Literal(hir::Literal::Unicode(c)) => self.c_char(c), 276 Literal(hir::Literal::Byte(b)) => { 277 assert!(self.compiled.uses_bytes()); 278 self.c_byte(b) 279 } 280 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), 281 Class(hir::Class::Bytes(ref cls)) => { 282 if self.compiled.uses_bytes() { 283 self.c_class_bytes(cls.ranges()) 284 } else { 285 assert!(cls.is_all_ascii()); 286 let mut char_ranges = vec![]; 287 for r in cls.iter() { 288 let (s, e) = (r.start() as char, r.end() as char); 289 char_ranges.push(hir::ClassUnicodeRange::new(s, e)); 290 } 291 self.c_class(&char_ranges) 292 } 293 } 294 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { 295 self.byte_classes.set_range(b'\n', b'\n'); 296 self.c_empty_look(prog::EmptyLook::EndLine) 297 } 298 Anchor(hir::Anchor::StartLine) => { 299 self.byte_classes.set_range(b'\n', b'\n'); 300 self.c_empty_look(prog::EmptyLook::StartLine) 301 } 302 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { 303 self.byte_classes.set_range(b'\n', b'\n'); 304 self.c_empty_look(prog::EmptyLook::StartLine) 305 } 306 Anchor(hir::Anchor::EndLine) => { 307 self.byte_classes.set_range(b'\n', b'\n'); 308 self.c_empty_look(prog::EmptyLook::EndLine) 309 } 310 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { 311 self.c_empty_look(prog::EmptyLook::EndText) 312 } 313 Anchor(hir::Anchor::StartText) => { 314 self.c_empty_look(prog::EmptyLook::StartText) 315 } 316 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { 317 self.c_empty_look(prog::EmptyLook::StartText) 318 } 319 Anchor(hir::Anchor::EndText) => { 320 self.c_empty_look(prog::EmptyLook::EndText) 321 } 322 WordBoundary(hir::WordBoundary::Unicode) => { 323 if !cfg!(feature = "unicode-perl") { 324 return Err(Error::Syntax( 325 "Unicode 
word boundaries are unavailable when \ 326 the unicode-perl feature is disabled" 327 .to_string(), 328 )); 329 } 330 self.compiled.has_unicode_word_boundary = true; 331 self.byte_classes.set_word_boundary(); 332 // We also make sure that all ASCII bytes are in a different 333 // class from non-ASCII bytes. Otherwise, it's possible for 334 // ASCII bytes to get lumped into the same class as non-ASCII 335 // bytes. This in turn may cause the lazy DFA to falsely start 336 // when it sees an ASCII byte that maps to a byte class with 337 // non-ASCII bytes. This ensures that never happens. 338 self.byte_classes.set_range(0, 0x7F); 339 self.c_empty_look(prog::EmptyLook::WordBoundary) 340 } 341 WordBoundary(hir::WordBoundary::UnicodeNegate) => { 342 if !cfg!(feature = "unicode-perl") { 343 return Err(Error::Syntax( 344 "Unicode word boundaries are unavailable when \ 345 the unicode-perl feature is disabled" 346 .to_string(), 347 )); 348 } 349 self.compiled.has_unicode_word_boundary = true; 350 self.byte_classes.set_word_boundary(); 351 // See comments above for why we set the ASCII range here. 
352 self.byte_classes.set_range(0, 0x7F); 353 self.c_empty_look(prog::EmptyLook::NotWordBoundary) 354 } 355 WordBoundary(hir::WordBoundary::Ascii) => { 356 self.byte_classes.set_word_boundary(); 357 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) 358 } 359 WordBoundary(hir::WordBoundary::AsciiNegate) => { 360 self.byte_classes.set_word_boundary(); 361 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) 362 } 363 Group(ref g) => match g.kind { 364 hir::GroupKind::NonCapturing => self.c(&g.hir), 365 hir::GroupKind::CaptureIndex(index) => { 366 if index as usize >= self.compiled.captures.len() { 367 self.compiled.captures.push(None); 368 } 369 self.c_capture(2 * index as usize, &g.hir) 370 } 371 hir::GroupKind::CaptureName { index, ref name } => { 372 if index as usize >= self.compiled.captures.len() { 373 let n = name.to_string(); 374 self.compiled.captures.push(Some(n.clone())); 375 self.capture_name_idx.insert(n, index as usize); 376 } 377 self.c_capture(2 * index as usize, &g.hir) 378 } 379 }, 380 Concat(ref es) => { 381 if self.compiled.is_reverse { 382 self.c_concat(es.iter().rev()) 383 } else { 384 self.c_concat(es) 385 } 386 } 387 Alternation(ref es) => self.c_alternate(&**es), 388 Repetition(ref rep) => self.c_repeat(rep), 389 } 390 } 391 392 fn c_empty(&mut self) -> ResultOrEmpty { 393 // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 394 // See: CVE-2022-24713 395 // 396 // Since 'empty' sub-expressions don't increase the size of 397 // the actual compiled object, we "fake" an increase in its 398 // size so that our 'check_size_limit' routine will eventually 399 // stop compilation if there are too many empty sub-expressions 400 // (e.g., via a large repetition). 
401 self.extra_inst_bytes += std::mem::size_of::<Inst>(); 402 Ok(None) 403 } 404 405 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { 406 if self.num_exprs > 1 || self.compiled.is_dfa { 407 // Don't ever compile Save instructions for regex sets because 408 // they are never used. They are also never used in DFA programs 409 // because DFAs can't handle captures. 410 self.c(expr) 411 } else { 412 let entry = self.insts.len(); 413 let hole = self.push_hole(InstHole::Save { slot: first_slot }); 414 let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); 415 self.fill(hole, patch.entry); 416 self.fill_to_next(patch.hole); 417 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); 418 Ok(Some(Patch { hole, entry })) 419 } 420 } 421 422 fn c_dotstar(&mut self) -> Result { 423 Ok(if !self.compiled.only_utf8() { 424 self.c(&Hir::repetition(hir::Repetition { 425 kind: hir::RepetitionKind::ZeroOrMore, 426 greedy: false, 427 hir: Box::new(Hir::any(true)), 428 }))? 429 .unwrap() 430 } else { 431 self.c(&Hir::repetition(hir::Repetition { 432 kind: hir::RepetitionKind::ZeroOrMore, 433 greedy: false, 434 hir: Box::new(Hir::any(false)), 435 }))? 
436 .unwrap() 437 }) 438 } 439 440 fn c_char(&mut self, c: char) -> ResultOrEmpty { 441 if self.compiled.uses_bytes() { 442 if c.is_ascii() { 443 let b = c as u8; 444 let hole = 445 self.push_hole(InstHole::Bytes { start: b, end: b }); 446 self.byte_classes.set_range(b, b); 447 Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 448 } else { 449 self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) 450 } 451 } else { 452 let hole = self.push_hole(InstHole::Char { c }); 453 Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 454 } 455 } 456 457 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { 458 use std::mem::size_of; 459 460 assert!(!ranges.is_empty()); 461 if self.compiled.uses_bytes() { 462 Ok(Some(CompileClass { c: self, ranges }.compile()?)) 463 } else { 464 let ranges: Vec<(char, char)> = 465 ranges.iter().map(|r| (r.start(), r.end())).collect(); 466 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { 467 self.push_hole(InstHole::Char { c: ranges[0].0 }) 468 } else { 469 self.extra_inst_bytes += 470 ranges.len() * (size_of::<char>() * 2); 471 self.push_hole(InstHole::Ranges { ranges }) 472 }; 473 Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 474 } 475 } 476 477 fn c_byte(&mut self, b: u8) -> ResultOrEmpty { 478 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) 479 } 480 481 fn c_class_bytes( 482 &mut self, 483 ranges: &[hir::ClassBytesRange], 484 ) -> ResultOrEmpty { 485 debug_assert!(!ranges.is_empty()); 486 487 let first_split_entry = self.insts.len(); 488 let mut holes = vec![]; 489 let mut prev_hole = Hole::None; 490 for r in &ranges[0..ranges.len() - 1] { 491 self.fill_to_next(prev_hole); 492 let split = self.push_split_hole(); 493 let next = self.insts.len(); 494 self.byte_classes.set_range(r.start(), r.end()); 495 holes.push(self.push_hole(InstHole::Bytes { 496 start: r.start(), 497 end: r.end(), 498 })); 499 prev_hole = self.fill_split(split, Some(next), None); 500 } 501 let next = self.insts.len(); 
502 let r = &ranges[ranges.len() - 1]; 503 self.byte_classes.set_range(r.start(), r.end()); 504 holes.push( 505 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), 506 ); 507 self.fill(prev_hole, next); 508 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) 509 } 510 511 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { 512 let hole = self.push_hole(InstHole::EmptyLook { look }); 513 Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 514 } 515 516 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty 517 where 518 I: IntoIterator<Item = &'a Hir>, 519 { 520 let mut exprs = exprs.into_iter(); 521 let Patch { mut hole, entry } = loop { 522 match exprs.next() { 523 None => return self.c_empty(), 524 Some(e) => { 525 if let Some(p) = self.c(e)? { 526 break p; 527 } 528 } 529 } 530 }; 531 for e in exprs { 532 if let Some(p) = self.c(e)? { 533 self.fill(hole, p.entry); 534 hole = p.hole; 535 } 536 } 537 Ok(Some(Patch { hole, entry })) 538 } 539 540 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { 541 debug_assert!( 542 exprs.len() >= 2, 543 "alternates must have at least 2 exprs" 544 ); 545 546 // Initial entry point is always the first split. 547 let first_split_entry = self.insts.len(); 548 549 // Save up all of the holes from each alternate. They will all get 550 // patched to point to the same location. 551 let mut holes = vec![]; 552 553 // true indicates that the hole is a split where we want to fill 554 // the second branch. 555 let mut prev_hole = (Hole::None, false); 556 for e in &exprs[0..exprs.len() - 1] { 557 if prev_hole.1 { 558 let next = self.insts.len(); 559 self.fill_split(prev_hole.0, None, Some(next)); 560 } else { 561 self.fill_to_next(prev_hole.0); 562 } 563 let split = self.push_split_hole(); 564 if let Some(Patch { hole, entry }) = self.c(e)? 
{ 565 holes.push(hole); 566 prev_hole = (self.fill_split(split, Some(entry), None), false); 567 } else { 568 let (split1, split2) = split.dup_one(); 569 holes.push(split1); 570 prev_hole = (split2, true); 571 } 572 } 573 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { 574 holes.push(hole); 575 if prev_hole.1 { 576 self.fill_split(prev_hole.0, None, Some(entry)); 577 } else { 578 self.fill(prev_hole.0, entry); 579 } 580 } else { 581 // We ignore prev_hole.1. When it's true, it means we have two 582 // empty branches both pushing prev_hole.0 into holes, so both 583 // branches will go to the same place anyway. 584 holes.push(prev_hole.0); 585 } 586 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) 587 } 588 589 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { 590 use regex_syntax::hir::RepetitionKind::*; 591 match rep.kind { 592 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), 593 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), 594 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), 595 Range(hir::RepetitionRange::Exactly(min_max)) => { 596 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) 597 } 598 Range(hir::RepetitionRange::AtLeast(min)) => { 599 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) 600 } 601 Range(hir::RepetitionRange::Bounded(min, max)) => { 602 self.c_repeat_range(&rep.hir, rep.greedy, min, max) 603 } 604 } 605 } 606 607 fn c_repeat_zero_or_one( 608 &mut self, 609 expr: &Hir, 610 greedy: bool, 611 ) -> ResultOrEmpty { 612 let split_entry = self.insts.len(); 613 let split = self.push_split_hole(); 614 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? 
{ 615 Some(p) => p, 616 None => return self.pop_split_hole(), 617 }; 618 let split_hole = if greedy { 619 self.fill_split(split, Some(entry_rep), None) 620 } else { 621 self.fill_split(split, None, Some(entry_rep)) 622 }; 623 let holes = vec![hole_rep, split_hole]; 624 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) 625 } 626 627 fn c_repeat_zero_or_more( 628 &mut self, 629 expr: &Hir, 630 greedy: bool, 631 ) -> ResultOrEmpty { 632 let split_entry = self.insts.len(); 633 let split = self.push_split_hole(); 634 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { 635 Some(p) => p, 636 None => return self.pop_split_hole(), 637 }; 638 639 self.fill(hole_rep, split_entry); 640 let split_hole = if greedy { 641 self.fill_split(split, Some(entry_rep), None) 642 } else { 643 self.fill_split(split, None, Some(entry_rep)) 644 }; 645 Ok(Some(Patch { hole: split_hole, entry: split_entry })) 646 } 647 648 fn c_repeat_one_or_more( 649 &mut self, 650 expr: &Hir, 651 greedy: bool, 652 ) -> ResultOrEmpty { 653 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { 654 Some(p) => p, 655 None => return Ok(None), 656 }; 657 self.fill_to_next(hole_rep); 658 let split = self.push_split_hole(); 659 660 let split_hole = if greedy { 661 self.fill_split(split, Some(entry_rep), None) 662 } else { 663 self.fill_split(split, None, Some(entry_rep)) 664 }; 665 Ok(Some(Patch { hole: split_hole, entry: entry_rep })) 666 } 667 668 fn c_repeat_range_min_or_more( 669 &mut self, 670 expr: &Hir, 671 greedy: bool, 672 min: u32, 673 ) -> ResultOrEmpty { 674 let min = u32_to_usize(min); 675 // Using next_inst() is ok, because we can't return it (concat would 676 // have to return Some(_) while c_repeat_range_min_or_more returns 677 // None). 678 let patch_concat = self 679 .c_concat(iter::repeat(expr).take(min))? 680 .unwrap_or_else(|| self.next_inst()); 681 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? 
{ 682 self.fill(patch_concat.hole, patch_rep.entry); 683 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) 684 } else { 685 Ok(None) 686 } 687 } 688 689 fn c_repeat_range( 690 &mut self, 691 expr: &Hir, 692 greedy: bool, 693 min: u32, 694 max: u32, 695 ) -> ResultOrEmpty { 696 let (min, max) = (u32_to_usize(min), u32_to_usize(max)); 697 debug_assert!(min <= max); 698 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; 699 if min == max { 700 return Ok(patch_concat); 701 } 702 // Same reasoning as in c_repeat_range_min_or_more (we know that min < 703 // max at this point). 704 let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); 705 let initial_entry = patch_concat.entry; 706 // It is much simpler to compile, e.g., `a{2,5}` as: 707 // 708 // aaa?a?a? 709 // 710 // But you end up with a sequence of instructions like this: 711 // 712 // 0: 'a' 713 // 1: 'a', 714 // 2: split(3, 4) 715 // 3: 'a' 716 // 4: split(5, 6) 717 // 5: 'a' 718 // 6: split(7, 8) 719 // 7: 'a' 720 // 8: MATCH 721 // 722 // This is *incredibly* inefficient because the splits end 723 // up forming a chain, which has to be resolved everything a 724 // transition is followed. 725 let mut holes = vec![]; 726 let mut prev_hole = patch_concat.hole; 727 for _ in min..max { 728 self.fill_to_next(prev_hole); 729 let split = self.push_split_hole(); 730 let Patch { hole, entry } = match self.c(expr)? 
{ 731 Some(p) => p, 732 None => return self.pop_split_hole(), 733 }; 734 prev_hole = hole; 735 if greedy { 736 holes.push(self.fill_split(split, Some(entry), None)); 737 } else { 738 holes.push(self.fill_split(split, None, Some(entry))); 739 } 740 } 741 holes.push(prev_hole); 742 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) 743 } 744 745 /// Can be used as a default value for the c_* functions when the call to 746 /// c_function is followed by inserting at least one instruction that is 747 /// always executed after the ones written by the c* function. 748 fn next_inst(&self) -> Patch { 749 Patch { hole: Hole::None, entry: self.insts.len() } 750 } 751 752 fn fill(&mut self, hole: Hole, goto: InstPtr) { 753 match hole { 754 Hole::None => {} 755 Hole::One(pc) => { 756 self.insts[pc].fill(goto); 757 } 758 Hole::Many(holes) => { 759 for hole in holes { 760 self.fill(hole, goto); 761 } 762 } 763 } 764 } 765 766 fn fill_to_next(&mut self, hole: Hole) { 767 let next = self.insts.len(); 768 self.fill(hole, next); 769 } 770 771 fn fill_split( 772 &mut self, 773 hole: Hole, 774 goto1: Option<InstPtr>, 775 goto2: Option<InstPtr>, 776 ) -> Hole { 777 match hole { 778 Hole::None => Hole::None, 779 Hole::One(pc) => match (goto1, goto2) { 780 (Some(goto1), Some(goto2)) => { 781 self.insts[pc].fill_split(goto1, goto2); 782 Hole::None 783 } 784 (Some(goto1), None) => { 785 self.insts[pc].half_fill_split_goto1(goto1); 786 Hole::One(pc) 787 } 788 (None, Some(goto2)) => { 789 self.insts[pc].half_fill_split_goto2(goto2); 790 Hole::One(pc) 791 } 792 (None, None) => unreachable!( 793 "at least one of the split \ 794 holes must be filled" 795 ), 796 }, 797 Hole::Many(holes) => { 798 let mut new_holes = vec![]; 799 for hole in holes { 800 new_holes.push(self.fill_split(hole, goto1, goto2)); 801 } 802 if new_holes.is_empty() { 803 Hole::None 804 } else if new_holes.len() == 1 { 805 new_holes.pop().unwrap() 806 } else { 807 Hole::Many(new_holes) 808 } 809 } 810 } 811 } 
812 813 fn push_compiled(&mut self, inst: Inst) { 814 self.insts.push(MaybeInst::Compiled(inst)); 815 } 816 817 fn push_hole(&mut self, inst: InstHole) -> Hole { 818 let hole = self.insts.len(); 819 self.insts.push(MaybeInst::Uncompiled(inst)); 820 Hole::One(hole) 821 } 822 823 fn push_split_hole(&mut self) -> Hole { 824 let hole = self.insts.len(); 825 self.insts.push(MaybeInst::Split); 826 Hole::One(hole) 827 } 828 829 fn pop_split_hole(&mut self) -> ResultOrEmpty { 830 self.insts.pop(); 831 Ok(None) 832 } 833 834 fn check_size(&self) -> result::Result<(), Error> { 835 use std::mem::size_of; 836 837 let size = 838 self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); 839 if size > self.size_limit { 840 Err(Error::CompiledTooBig(self.size_limit)) 841 } else { 842 Ok(()) 843 } 844 } 845} 846 847#[derive(Debug)] 848enum Hole { 849 None, 850 One(InstPtr), 851 Many(Vec<Hole>), 852} 853 854impl Hole { 855 fn dup_one(self) -> (Self, Self) { 856 match self { 857 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), 858 Hole::None | Hole::Many(_) => { 859 unreachable!("must be called on single hole") 860 } 861 } 862 } 863} 864 865#[derive(Clone, Debug)] 866enum MaybeInst { 867 Compiled(Inst), 868 Uncompiled(InstHole), 869 Split, 870 Split1(InstPtr), 871 Split2(InstPtr), 872} 873 874impl MaybeInst { 875 fn fill(&mut self, goto: InstPtr) { 876 let maybeinst = match *self { 877 MaybeInst::Split => MaybeInst::Split1(goto), 878 MaybeInst::Uncompiled(ref inst) => { 879 MaybeInst::Compiled(inst.fill(goto)) 880 } 881 MaybeInst::Split1(goto1) => { 882 MaybeInst::Compiled(Inst::Split(InstSplit { 883 goto1, 884 goto2: goto, 885 })) 886 } 887 MaybeInst::Split2(goto2) => { 888 MaybeInst::Compiled(Inst::Split(InstSplit { 889 goto1: goto, 890 goto2, 891 })) 892 } 893 _ => unreachable!( 894 "not all instructions were compiled! 
                 found uncompiled instruction: {:?}",
                self
            ),
        };
        *self = maybeinst;
    }

    /// Complete a `Split` placeholder by supplying both branch targets at
    /// once, turning it into a fully compiled `Inst::Split`.
    ///
    /// Calling this on anything other than `MaybeInst::Split` is a bug
    /// (`unreachable!`).
    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
        let filled = match *self {
            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Compiled(filled);
    }

    /// Record only the first branch target of a `Split` placeholder. The
    /// instruction becomes half-filled (`Split1`) until `goto2` is known.
    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto1,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split1(half_filled);
    }

    /// Record only the second branch target of a `Split` placeholder. The
    /// instruction becomes half-filled (`Split2`) until `goto1` is known.
    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto2,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split2(half_filled);
    }

    /// Extract the finished instruction. Calling this before the instruction
    /// has been fully compiled is a bug (`unreachable!`).
    fn unwrap(self) -> Inst {
        match self {
            MaybeInst::Compiled(inst) => inst,
            _ => unreachable!(
                "must be called on a compiled instruction, \
                 instead it was called on: {:?}",
                self
            ),
        }
    }
}

/// An instruction that is complete except for its `goto` pointer. A hole is
/// created while the address of the instruction it should jump to is not yet
/// known, and is later materialized into a real `Inst` via `fill`.
#[derive(Clone, Debug)]
enum InstHole {
    Save { slot: usize },
    EmptyLook { look: EmptyLook },
    Char { c: char },
    Ranges { ranges: Vec<(char, char)> },
    Bytes { start: u8, end: u8 },
}

impl InstHole {
    /// Materialize this hole into a concrete instruction by supplying the
    /// `goto` pointer it was waiting for.
    fn fill(&self, goto: InstPtr) -> Inst {
        match *self {
            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
            InstHole::EmptyLook { look } => {
                Inst::EmptyLook(InstEmptyLook { goto, look })
            }
            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
                goto,
                // `InstRanges` owns its ranges, so filling a class hole
                // clones the range list into a boxed slice.
                ranges: ranges.clone().into_boxed_slice(),
            }),
            InstHole::Bytes { start, end } => {
                Inst::Bytes(InstBytes { goto, start, end })
            }
        }
    }
}

/// Helper for compiling a Unicode character class into a byte-based program.
/// Each scalar value range is expanded into its UTF-8 byte sequences, and
/// identical suffixes across alternates are shared via the compiler's
/// `suffix_cache`.
struct CompileClass<'a, 'b> {
    c: &'a mut Compiler,
    ranges: &'b [hir::ClassUnicodeRange],
}

impl<'a, 'b> CompileClass<'a, 'b> {
    /// Compile all ranges of the class into an alternation of UTF-8
    /// sequences. Returns a `Patch` whose `entry` is the first alternate and
    /// whose holes are the unfilled exits of every alternate.
    fn compile(mut self) -> Result {
        let mut holes = vec![];
        let mut initial_entry = None;
        let mut last_split = Hole::None;
        // Borrow the compiler's scratch UTF-8 sequence iterator; it is put
        // back before returning.
        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
        // Suffix sharing is only valid within a single class compilation.
        self.c.suffix_cache.clear();

        for (i, range) in self.ranges.iter().enumerate() {
            let is_last_range = i + 1 == self.ranges.len();
            utf8_seqs.reset(range.start(), range.end());
            let mut it = (&mut utf8_seqs).peekable();
            loop {
                let utf8_seq = match it.next() {
                    None => break,
                    Some(utf8_seq) => utf8_seq,
                };
                if is_last_range && it.peek().is_none() {
                    // Very last alternate: no trailing split is needed, so
                    // the pending split hole (if any) is pointed directly at
                    // this sequence's entry.
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    self.c.fill(last_split, entry);
                    last_split = Hole::None;
                    if initial_entry.is_none() {
                        initial_entry = Some(entry);
                    }
                } else {
                    // Any other alternate: emit a split whose first branch
                    // enters this sequence and whose second branch is left
                    // open for the next alternate.
                    if initial_entry.is_none() {
                        initial_entry = Some(self.c.insts.len());
                    }
                    self.c.fill_to_next(last_split);
                    last_split = self.c.push_split_hole();
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    last_split =
                        self.c.fill_split(last_split, Some(entry), None);
                }
            }
        }
        self.c.utf8_seqs = Some(utf8_seqs);
        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
    }

    /// Compile one UTF-8 sequence. For forward programs, the byte ranges are
    /// compiled last-byte-first so that each byte instruction can use the
    /// previously emitted instruction as its `goto`; reverse programs take
    /// the ranges in the given order.
    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
        if self.c.compiled.is_reverse {
            self.c_utf8_seq_(seq)
        } else {
            self.c_utf8_seq_(seq.into_iter().rev())
        }
    }

    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
    where
        I: IntoIterator<Item = &'r Utf8Range>,
    {
        // The initial instruction for each UTF-8 sequence should be the same.
        // `usize::MAX` is a sentinel meaning "no instruction emitted yet",
        // i.e., we are at the final byte of the sequence, whose goto is still
        // unknown (it becomes the returned hole).
        let mut from_inst = ::std::usize::MAX;
        let mut last_hole = Hole::None;
        for byte_range in seq {
            let key = SuffixCacheKey {
                from_inst,
                start: byte_range.start,
                end: byte_range.end,
            };
            {
                let pc = self.c.insts.len();
                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
                    // An identical suffix was already compiled in an earlier
                    // alternate; reuse that instruction instead of emitting
                    // a duplicate.
                    from_inst = cached_pc;
                    continue;
                }
            }
            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
            if from_inst == ::std::usize::MAX {
                // First emitted instruction (the sequence's final byte): its
                // goto is unknown, so push a hole for the caller to fill.
                last_hole = self.c.push_hole(InstHole::Bytes {
                    start: byte_range.start,
                    end: byte_range.end,
                });
            } else {
                // Interior byte: it jumps to the previously emitted
                // instruction for the byte that follows it in the sequence.
                self.c.push_compiled(Inst::Bytes(InstBytes {
                    goto: from_inst,
                    start: byte_range.start,
                    end: byte_range.end,
                }));
            }
            from_inst = self.c.insts.len().checked_sub(1).unwrap();
            debug_assert!(from_inst < ::std::usize::MAX);
        }
        // An empty sequence would leave the sentinel in place; the UTF-8
        // sequences fed in above are never empty.
        debug_assert!(from_inst < ::std::usize::MAX);
        Ok(Patch { hole: last_hole, entry: from_inst })
    }
}

/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
/// The set of byte ranges looks like this:
///
/// [0-7F]
/// [C2-DF][80-BF]
/// [E0][A0-BF][80-BF]
/// [E1-EC][80-BF][80-BF]
/// [ED][80-9F][80-BF]
/// [EE-EF][80-BF][80-BF]
///
/// Each line above translates to one alternate in the compiled regex program.
/// However, all but one of the alternates end in the same suffix, which is
/// a waste of an instruction. The suffix cache facilitates reusing them across
/// alternates.
///
/// Note that a HashMap could be trivially used for this, but we don't need its
/// overhead. Some small bounded space (LRU style) is more than enough.
1092/// 1093/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), 1094/// except it uses hashes as original indices and then compares full keys for 1095/// validation against `dense` array. 1096#[derive(Debug)] 1097struct SuffixCache { 1098 sparse: Box<[usize]>, 1099 dense: Vec<SuffixCacheEntry>, 1100} 1101 1102#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] 1103struct SuffixCacheEntry { 1104 key: SuffixCacheKey, 1105 pc: InstPtr, 1106} 1107 1108#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] 1109struct SuffixCacheKey { 1110 from_inst: InstPtr, 1111 start: u8, 1112 end: u8, 1113} 1114 1115impl SuffixCache { 1116 fn new(size: usize) -> Self { 1117 SuffixCache { 1118 sparse: vec![0usize; size].into(), 1119 dense: Vec::with_capacity(size), 1120 } 1121 } 1122 1123 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { 1124 let hash = self.hash(&key); 1125 let pos = &mut self.sparse[hash]; 1126 if let Some(entry) = self.dense.get(*pos) { 1127 if entry.key == key { 1128 return Some(entry.pc); 1129 } 1130 } 1131 *pos = self.dense.len(); 1132 self.dense.push(SuffixCacheEntry { key, pc }); 1133 None 1134 } 1135 1136 fn clear(&mut self) { 1137 self.dense.clear(); 1138 } 1139 1140 fn hash(&self, suffix: &SuffixCacheKey) -> usize { 1141 // Basic FNV-1a hash as described: 1142 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function 1143 const FNV_PRIME: u64 = 1_099_511_628_211; 1144 let mut h = 14_695_981_039_346_656_037; 1145 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); 1146 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); 1147 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); 1148 (h as usize) % self.sparse.len() 1149 } 1150} 1151 1152struct ByteClassSet([bool; 256]); 1153 1154impl ByteClassSet { 1155 fn new() -> Self { 1156 ByteClassSet([false; 256]) 1157 } 1158 1159 fn set_range(&mut self, start: u8, end: u8) { 1160 debug_assert!(start <= end); 1161 if start 
> 0 { 1162 self.0[start as usize - 1] = true; 1163 } 1164 self.0[end as usize] = true; 1165 } 1166 1167 fn set_word_boundary(&mut self) { 1168 // We need to mark all ranges of bytes whose pairs result in 1169 // evaluating \b differently. 1170 let iswb = is_word_byte; 1171 let mut b1: u16 = 0; 1172 let mut b2: u16; 1173 while b1 <= 255 { 1174 b2 = b1 + 1; 1175 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { 1176 b2 += 1; 1177 } 1178 self.set_range(b1 as u8, (b2 - 1) as u8); 1179 b1 = b2; 1180 } 1181 } 1182 1183 fn byte_classes(&self) -> Vec<u8> { 1184 // N.B. If you're debugging the DFA, it's useful to simply return 1185 // `(0..256).collect()`, which effectively removes the byte classes 1186 // and makes the transitions easier to read. 1187 // (0usize..256).map(|x| x as u8).collect() 1188 let mut byte_classes = vec![0; 256]; 1189 let mut class = 0u8; 1190 let mut i = 0; 1191 loop { 1192 byte_classes[i] = class as u8; 1193 if i >= 255 { 1194 break; 1195 } 1196 if self.0[i] { 1197 class = class.checked_add(1).unwrap(); 1198 } 1199 i += 1; 1200 } 1201 byte_classes 1202 } 1203} 1204 1205impl fmt::Debug for ByteClassSet { 1206 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1207 f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() 1208 } 1209} 1210 1211fn u32_to_usize(n: u32) -> usize { 1212 // In case usize is less than 32 bits, we need to guard against overflow. 1213 // On most platforms this compiles to nothing. 1214 // TODO Use `std::convert::TryFrom` once it's stable. 
1215 if (n as u64) > (::std::usize::MAX as u64) { 1216 panic!("BUG: {} is too big to be pointer sized", n) 1217 } 1218 n as usize 1219} 1220 1221#[cfg(test)] 1222mod tests { 1223 use super::ByteClassSet; 1224 1225 #[test] 1226 fn byte_classes() { 1227 let mut set = ByteClassSet::new(); 1228 set.set_range(b'a', b'z'); 1229 let classes = set.byte_classes(); 1230 assert_eq!(classes[0], 0); 1231 assert_eq!(classes[1], 0); 1232 assert_eq!(classes[2], 0); 1233 assert_eq!(classes[b'a' as usize - 1], 0); 1234 assert_eq!(classes[b'a' as usize], 1); 1235 assert_eq!(classes[b'm' as usize], 1); 1236 assert_eq!(classes[b'z' as usize], 1); 1237 assert_eq!(classes[b'z' as usize + 1], 2); 1238 assert_eq!(classes[254], 2); 1239 assert_eq!(classes[255], 2); 1240 1241 let mut set = ByteClassSet::new(); 1242 set.set_range(0, 2); 1243 set.set_range(4, 6); 1244 let classes = set.byte_classes(); 1245 assert_eq!(classes[0], 0); 1246 assert_eq!(classes[1], 0); 1247 assert_eq!(classes[2], 0); 1248 assert_eq!(classes[3], 1); 1249 assert_eq!(classes[4], 2); 1250 assert_eq!(classes[5], 2); 1251 assert_eq!(classes[6], 2); 1252 assert_eq!(classes[7], 3); 1253 assert_eq!(classes[255], 3); 1254 } 1255 1256 #[test] 1257 fn full_byte_classes() { 1258 let mut set = ByteClassSet::new(); 1259 for i in 0..256u16 { 1260 set.set_range(i as u8, i as u8); 1261 } 1262 assert_eq!(set.byte_classes().len(), 256); 1263 } 1264} 1265