1c67d6573Sopenharmony_ciuse std::collections::HashMap; 2c67d6573Sopenharmony_ciuse std::fmt; 3c67d6573Sopenharmony_ciuse std::iter; 4c67d6573Sopenharmony_ciuse std::result; 5c67d6573Sopenharmony_ciuse std::sync::Arc; 6c67d6573Sopenharmony_ci 7c67d6573Sopenharmony_ciuse regex_syntax::hir::{self, Hir}; 8c67d6573Sopenharmony_ciuse regex_syntax::is_word_byte; 9c67d6573Sopenharmony_ciuse regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; 10c67d6573Sopenharmony_ci 11c67d6573Sopenharmony_ciuse crate::prog::{ 12c67d6573Sopenharmony_ci EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, 13c67d6573Sopenharmony_ci InstSave, InstSplit, Program, 14c67d6573Sopenharmony_ci}; 15c67d6573Sopenharmony_ci 16c67d6573Sopenharmony_ciuse crate::Error; 17c67d6573Sopenharmony_ci 18c67d6573Sopenharmony_citype Result = result::Result<Patch, Error>; 19c67d6573Sopenharmony_citype ResultOrEmpty = result::Result<Option<Patch>, Error>; 20c67d6573Sopenharmony_ci 21c67d6573Sopenharmony_ci#[derive(Debug)] 22c67d6573Sopenharmony_cistruct Patch { 23c67d6573Sopenharmony_ci hole: Hole, 24c67d6573Sopenharmony_ci entry: InstPtr, 25c67d6573Sopenharmony_ci} 26c67d6573Sopenharmony_ci 27c67d6573Sopenharmony_ci/// A compiler translates a regular expression AST to a sequence of 28c67d6573Sopenharmony_ci/// instructions. The sequence of instructions represents an NFA. 29c67d6573Sopenharmony_ci// `Compiler` is only public via the `internal` module, so avoid deriving 30c67d6573Sopenharmony_ci// `Debug`. 31c67d6573Sopenharmony_ci#[allow(missing_debug_implementations)] 32c67d6573Sopenharmony_cipub struct Compiler { 33c67d6573Sopenharmony_ci insts: Vec<MaybeInst>, 34c67d6573Sopenharmony_ci compiled: Program, 35c67d6573Sopenharmony_ci capture_name_idx: HashMap<String, usize>, 36c67d6573Sopenharmony_ci num_exprs: usize, 37c67d6573Sopenharmony_ci size_limit: usize, 38c67d6573Sopenharmony_ci suffix_cache: SuffixCache, 39c67d6573Sopenharmony_ci utf8_seqs: Option<Utf8Sequences>, 40c67d6573Sopenharmony_ci byte_classes: ByteClassSet, 41c67d6573Sopenharmony_ci // This keeps track of extra bytes allocated while compiling the regex 42c67d6573Sopenharmony_ci // program. Currently, this corresponds to two things. First is the heap 43c67d6573Sopenharmony_ci // memory allocated by Unicode character classes ('InstRanges'). Second is 44c67d6573Sopenharmony_ci // a "fake" amount of memory used by empty sub-expressions, so that enough 45c67d6573Sopenharmony_ci // empty sub-expressions will ultimately trigger the compiler to bail 46c67d6573Sopenharmony_ci // because of a size limit restriction. (That empty sub-expressions don't 47c67d6573Sopenharmony_ci // add to heap memory usage is more-or-less an implementation detail.) In 48c67d6573Sopenharmony_ci // the second case, if we don't bail, then an excessively large repetition 49c67d6573Sopenharmony_ci // on an empty sub-expression can result in the compiler using a very large 50c67d6573Sopenharmony_ci // amount of CPU time. 51c67d6573Sopenharmony_ci extra_inst_bytes: usize, 52c67d6573Sopenharmony_ci} 53c67d6573Sopenharmony_ci 54c67d6573Sopenharmony_ciimpl Compiler { 55c67d6573Sopenharmony_ci /// Create a new regular expression compiler. 56c67d6573Sopenharmony_ci /// 57c67d6573Sopenharmony_ci /// Various options can be set before calling `compile` on an expression. 58c67d6573Sopenharmony_ci pub fn new() -> Self { 59c67d6573Sopenharmony_ci Compiler { 60c67d6573Sopenharmony_ci insts: vec![], 61c67d6573Sopenharmony_ci compiled: Program::new(), 62c67d6573Sopenharmony_ci capture_name_idx: HashMap::new(), 63c67d6573Sopenharmony_ci num_exprs: 0, 64c67d6573Sopenharmony_ci size_limit: 10 * (1 << 20), 65c67d6573Sopenharmony_ci suffix_cache: SuffixCache::new(1000), 66c67d6573Sopenharmony_ci utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), 67c67d6573Sopenharmony_ci byte_classes: ByteClassSet::new(), 68c67d6573Sopenharmony_ci extra_inst_bytes: 0, 69c67d6573Sopenharmony_ci } 70c67d6573Sopenharmony_ci } 71c67d6573Sopenharmony_ci 72c67d6573Sopenharmony_ci /// The size of the resulting program is limited by size_limit. If 73c67d6573Sopenharmony_ci /// the program approximately exceeds the given size (in bytes), then 74c67d6573Sopenharmony_ci /// compilation will stop and return an error. 75c67d6573Sopenharmony_ci pub fn size_limit(mut self, size_limit: usize) -> Self { 76c67d6573Sopenharmony_ci self.size_limit = size_limit; 77c67d6573Sopenharmony_ci self 78c67d6573Sopenharmony_ci } 79c67d6573Sopenharmony_ci 80c67d6573Sopenharmony_ci /// If bytes is true, then the program is compiled as a byte based 81c67d6573Sopenharmony_ci /// automaton, which incorporates UTF-8 decoding into the machine. If it's 82c67d6573Sopenharmony_ci /// false, then the automaton is Unicode scalar value based, e.g., an 83c67d6573Sopenharmony_ci /// engine utilizing such an automaton is responsible for UTF-8 decoding. 84c67d6573Sopenharmony_ci /// 85c67d6573Sopenharmony_ci /// The specific invariant is that when returning a byte based machine, 86c67d6573Sopenharmony_ci /// the neither the `Char` nor `Ranges` instructions are produced. 87c67d6573Sopenharmony_ci /// Conversely, when producing a Unicode scalar value machine, the `Bytes` 88c67d6573Sopenharmony_ci /// instruction is never produced. 89c67d6573Sopenharmony_ci /// 90c67d6573Sopenharmony_ci /// Note that `dfa(true)` implies `bytes(true)`. 91c67d6573Sopenharmony_ci pub fn bytes(mut self, yes: bool) -> Self { 92c67d6573Sopenharmony_ci self.compiled.is_bytes = yes; 93c67d6573Sopenharmony_ci self 94c67d6573Sopenharmony_ci } 95c67d6573Sopenharmony_ci 96c67d6573Sopenharmony_ci /// When disabled, the program compiled may match arbitrary bytes. 97c67d6573Sopenharmony_ci /// 98c67d6573Sopenharmony_ci /// When enabled (the default), all compiled programs exclusively match 99c67d6573Sopenharmony_ci /// valid UTF-8 bytes. 100c67d6573Sopenharmony_ci pub fn only_utf8(mut self, yes: bool) -> Self { 101c67d6573Sopenharmony_ci self.compiled.only_utf8 = yes; 102c67d6573Sopenharmony_ci self 103c67d6573Sopenharmony_ci } 104c67d6573Sopenharmony_ci 105c67d6573Sopenharmony_ci /// When set, the machine returned is suitable for use in the DFA matching 106c67d6573Sopenharmony_ci /// engine. 107c67d6573Sopenharmony_ci /// 108c67d6573Sopenharmony_ci /// In particular, this ensures that if the regex is not anchored in the 109c67d6573Sopenharmony_ci /// beginning, then a preceding `.*?` is included in the program. (The NFA 110c67d6573Sopenharmony_ci /// based engines handle the preceding `.*?` explicitly, which is difficult 111c67d6573Sopenharmony_ci /// or impossible in the DFA engine.) 112c67d6573Sopenharmony_ci pub fn dfa(mut self, yes: bool) -> Self { 113c67d6573Sopenharmony_ci self.compiled.is_dfa = yes; 114c67d6573Sopenharmony_ci self 115c67d6573Sopenharmony_ci } 116c67d6573Sopenharmony_ci 117c67d6573Sopenharmony_ci /// When set, the machine returned is suitable for matching text in 118c67d6573Sopenharmony_ci /// reverse. In particular, all concatenations are flipped. 119c67d6573Sopenharmony_ci pub fn reverse(mut self, yes: bool) -> Self { 120c67d6573Sopenharmony_ci self.compiled.is_reverse = yes; 121c67d6573Sopenharmony_ci self 122c67d6573Sopenharmony_ci } 123c67d6573Sopenharmony_ci 124c67d6573Sopenharmony_ci /// Compile a regular expression given its AST. 125c67d6573Sopenharmony_ci /// 126c67d6573Sopenharmony_ci /// The compiler is guaranteed to succeed unless the program exceeds the 127c67d6573Sopenharmony_ci /// specified size limit. If the size limit is exceeded, then compilation 128c67d6573Sopenharmony_ci /// stops and returns an error. 129c67d6573Sopenharmony_ci pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> { 130c67d6573Sopenharmony_ci debug_assert!(!exprs.is_empty()); 131c67d6573Sopenharmony_ci self.num_exprs = exprs.len(); 132c67d6573Sopenharmony_ci if exprs.len() == 1 { 133c67d6573Sopenharmony_ci self.compile_one(&exprs[0]) 134c67d6573Sopenharmony_ci } else { 135c67d6573Sopenharmony_ci self.compile_many(exprs) 136c67d6573Sopenharmony_ci } 137c67d6573Sopenharmony_ci } 138c67d6573Sopenharmony_ci 139c67d6573Sopenharmony_ci fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { 140c67d6573Sopenharmony_ci // If we're compiling a forward DFA and we aren't anchored, then 141c67d6573Sopenharmony_ci // add a `.*?` before the first capture group. 142c67d6573Sopenharmony_ci // Other matching engines handle this by baking the logic into the 143c67d6573Sopenharmony_ci // matching engine itself. 144c67d6573Sopenharmony_ci let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; 145c67d6573Sopenharmony_ci self.compiled.is_anchored_start = expr.is_anchored_start(); 146c67d6573Sopenharmony_ci self.compiled.is_anchored_end = expr.is_anchored_end(); 147c67d6573Sopenharmony_ci if self.compiled.needs_dotstar() { 148c67d6573Sopenharmony_ci dotstar_patch = self.c_dotstar()?; 149c67d6573Sopenharmony_ci self.compiled.start = dotstar_patch.entry; 150c67d6573Sopenharmony_ci } 151c67d6573Sopenharmony_ci self.compiled.captures = vec![None]; 152c67d6573Sopenharmony_ci let patch = 153c67d6573Sopenharmony_ci self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); 154c67d6573Sopenharmony_ci if self.compiled.needs_dotstar() { 155c67d6573Sopenharmony_ci self.fill(dotstar_patch.hole, patch.entry); 156c67d6573Sopenharmony_ci } else { 157c67d6573Sopenharmony_ci self.compiled.start = patch.entry; 158c67d6573Sopenharmony_ci } 159c67d6573Sopenharmony_ci self.fill_to_next(patch.hole); 160c67d6573Sopenharmony_ci self.compiled.matches = vec![self.insts.len()]; 161c67d6573Sopenharmony_ci self.push_compiled(Inst::Match(0)); 162c67d6573Sopenharmony_ci self.compile_finish() 163c67d6573Sopenharmony_ci } 164c67d6573Sopenharmony_ci 165c67d6573Sopenharmony_ci fn compile_many( 166c67d6573Sopenharmony_ci mut self, 167c67d6573Sopenharmony_ci exprs: &[Hir], 168c67d6573Sopenharmony_ci ) -> result::Result<Program, Error> { 169c67d6573Sopenharmony_ci debug_assert!(exprs.len() > 1); 170c67d6573Sopenharmony_ci 171c67d6573Sopenharmony_ci self.compiled.is_anchored_start = 172c67d6573Sopenharmony_ci exprs.iter().all(|e| e.is_anchored_start()); 173c67d6573Sopenharmony_ci self.compiled.is_anchored_end = 174c67d6573Sopenharmony_ci exprs.iter().all(|e| e.is_anchored_end()); 175c67d6573Sopenharmony_ci let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; 176c67d6573Sopenharmony_ci if self.compiled.needs_dotstar() { 177c67d6573Sopenharmony_ci dotstar_patch = self.c_dotstar()?; 178c67d6573Sopenharmony_ci self.compiled.start = dotstar_patch.entry; 179c67d6573Sopenharmony_ci } else { 180c67d6573Sopenharmony_ci self.compiled.start = 0; // first instruction is always split 181c67d6573Sopenharmony_ci } 182c67d6573Sopenharmony_ci self.fill_to_next(dotstar_patch.hole); 183c67d6573Sopenharmony_ci 184c67d6573Sopenharmony_ci let mut prev_hole = Hole::None; 185c67d6573Sopenharmony_ci for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { 186c67d6573Sopenharmony_ci self.fill_to_next(prev_hole); 187c67d6573Sopenharmony_ci let split = self.push_split_hole(); 188c67d6573Sopenharmony_ci let Patch { hole, entry } = 189c67d6573Sopenharmony_ci self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); 190c67d6573Sopenharmony_ci self.fill_to_next(hole); 191c67d6573Sopenharmony_ci self.compiled.matches.push(self.insts.len()); 192c67d6573Sopenharmony_ci self.push_compiled(Inst::Match(i)); 193c67d6573Sopenharmony_ci prev_hole = self.fill_split(split, Some(entry), None); 194c67d6573Sopenharmony_ci } 195c67d6573Sopenharmony_ci let i = exprs.len() - 1; 196c67d6573Sopenharmony_ci let Patch { hole, entry } = 197c67d6573Sopenharmony_ci self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); 198c67d6573Sopenharmony_ci self.fill(prev_hole, entry); 199c67d6573Sopenharmony_ci self.fill_to_next(hole); 200c67d6573Sopenharmony_ci self.compiled.matches.push(self.insts.len()); 201c67d6573Sopenharmony_ci self.push_compiled(Inst::Match(i)); 202c67d6573Sopenharmony_ci self.compile_finish() 203c67d6573Sopenharmony_ci } 204c67d6573Sopenharmony_ci 205c67d6573Sopenharmony_ci fn compile_finish(mut self) -> result::Result<Program, Error> { 206c67d6573Sopenharmony_ci self.compiled.insts = 207c67d6573Sopenharmony_ci self.insts.into_iter().map(|inst| inst.unwrap()).collect(); 208c67d6573Sopenharmony_ci self.compiled.byte_classes = self.byte_classes.byte_classes(); 209c67d6573Sopenharmony_ci self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); 210c67d6573Sopenharmony_ci Ok(self.compiled) 211c67d6573Sopenharmony_ci } 212c67d6573Sopenharmony_ci 213c67d6573Sopenharmony_ci /// Compile expr into self.insts, returning a patch on success, 214c67d6573Sopenharmony_ci /// or an error if we run out of memory. 215c67d6573Sopenharmony_ci /// 216c67d6573Sopenharmony_ci /// All of the c_* methods of the compiler share the contract outlined 217c67d6573Sopenharmony_ci /// here. 218c67d6573Sopenharmony_ci /// 219c67d6573Sopenharmony_ci /// The main thing that a c_* method does is mutate `self.insts` 220c67d6573Sopenharmony_ci /// to add a list of mostly compiled instructions required to execute 221c67d6573Sopenharmony_ci /// the given expression. `self.insts` contains MaybeInsts rather than 222c67d6573Sopenharmony_ci /// Insts because there is some backpatching required. 223c67d6573Sopenharmony_ci /// 224c67d6573Sopenharmony_ci /// The `Patch` value returned by each c_* method provides metadata 225c67d6573Sopenharmony_ci /// about the compiled instructions emitted to `self.insts`. The 226c67d6573Sopenharmony_ci /// `entry` member of the patch refers to the first instruction 227c67d6573Sopenharmony_ci /// (the entry point), while the `hole` member contains zero or 228c67d6573Sopenharmony_ci /// more offsets to partial instructions that need to be backpatched. 229c67d6573Sopenharmony_ci /// The c_* routine can't know where its list of instructions are going to 230c67d6573Sopenharmony_ci /// jump to after execution, so it is up to the caller to patch 231c67d6573Sopenharmony_ci /// these jumps to point to the right place. So compiling some 232c67d6573Sopenharmony_ci /// expression, e, we would end up with a situation that looked like: 233c67d6573Sopenharmony_ci /// 234c67d6573Sopenharmony_ci /// ```text 235c67d6573Sopenharmony_ci /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] 236c67d6573Sopenharmony_ci /// ^ ^ ^ 237c67d6573Sopenharmony_ci /// | \ / 238c67d6573Sopenharmony_ci /// entry \ / 239c67d6573Sopenharmony_ci /// hole 240c67d6573Sopenharmony_ci /// ``` 241c67d6573Sopenharmony_ci /// 242c67d6573Sopenharmony_ci /// To compile two expressions, e1 and e2, concatenated together we 243c67d6573Sopenharmony_ci /// would do: 244c67d6573Sopenharmony_ci /// 245c67d6573Sopenharmony_ci /// ```ignore 246c67d6573Sopenharmony_ci /// let patch1 = self.c(e1); 247c67d6573Sopenharmony_ci /// let patch2 = self.c(e2); 248c67d6573Sopenharmony_ci /// ``` 249c67d6573Sopenharmony_ci /// 250c67d6573Sopenharmony_ci /// while leaves us with a situation that looks like 251c67d6573Sopenharmony_ci /// 252c67d6573Sopenharmony_ci /// ```text 253c67d6573Sopenharmony_ci /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] 254c67d6573Sopenharmony_ci /// ^ ^ ^ ^ 255c67d6573Sopenharmony_ci /// | | | | 256c67d6573Sopenharmony_ci /// entry1 hole1 entry2 hole2 257c67d6573Sopenharmony_ci /// ``` 258c67d6573Sopenharmony_ci /// 259c67d6573Sopenharmony_ci /// Then to merge the two patches together into one we would backpatch 260c67d6573Sopenharmony_ci /// hole1 with entry2 and return a new patch that enters at entry1 261c67d6573Sopenharmony_ci /// and has hole2 for a hole. In fact, if you look at the c_concat 262c67d6573Sopenharmony_ci /// method you will see that it does exactly this, though it handles 263c67d6573Sopenharmony_ci /// a list of expressions rather than just the two that we use for 264c67d6573Sopenharmony_ci /// an example. 265c67d6573Sopenharmony_ci /// 266c67d6573Sopenharmony_ci /// Ok(None) is returned when an expression is compiled to no 267c67d6573Sopenharmony_ci /// instruction, and so no patch.entry value makes sense. 268c67d6573Sopenharmony_ci fn c(&mut self, expr: &Hir) -> ResultOrEmpty { 269c67d6573Sopenharmony_ci use crate::prog; 270c67d6573Sopenharmony_ci use regex_syntax::hir::HirKind::*; 271c67d6573Sopenharmony_ci 272c67d6573Sopenharmony_ci self.check_size()?; 273c67d6573Sopenharmony_ci match *expr.kind() { 274c67d6573Sopenharmony_ci Empty => self.c_empty(), 275c67d6573Sopenharmony_ci Literal(hir::Literal::Unicode(c)) => self.c_char(c), 276c67d6573Sopenharmony_ci Literal(hir::Literal::Byte(b)) => { 277c67d6573Sopenharmony_ci assert!(self.compiled.uses_bytes()); 278c67d6573Sopenharmony_ci self.c_byte(b) 279c67d6573Sopenharmony_ci } 280c67d6573Sopenharmony_ci Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), 281c67d6573Sopenharmony_ci Class(hir::Class::Bytes(ref cls)) => { 282c67d6573Sopenharmony_ci if self.compiled.uses_bytes() { 283c67d6573Sopenharmony_ci self.c_class_bytes(cls.ranges()) 284c67d6573Sopenharmony_ci } else { 285c67d6573Sopenharmony_ci assert!(cls.is_all_ascii()); 286c67d6573Sopenharmony_ci let mut char_ranges = vec![]; 287c67d6573Sopenharmony_ci for r in cls.iter() { 288c67d6573Sopenharmony_ci let (s, e) = (r.start() as char, r.end() as char); 289c67d6573Sopenharmony_ci char_ranges.push(hir::ClassUnicodeRange::new(s, e)); 290c67d6573Sopenharmony_ci } 291c67d6573Sopenharmony_ci self.c_class(&char_ranges) 292c67d6573Sopenharmony_ci } 293c67d6573Sopenharmony_ci } 294c67d6573Sopenharmony_ci Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { 295c67d6573Sopenharmony_ci self.byte_classes.set_range(b'\n', b'\n'); 296c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::EndLine) 297c67d6573Sopenharmony_ci } 298c67d6573Sopenharmony_ci Anchor(hir::Anchor::StartLine) => { 299c67d6573Sopenharmony_ci self.byte_classes.set_range(b'\n', b'\n'); 300c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::StartLine) 301c67d6573Sopenharmony_ci } 302c67d6573Sopenharmony_ci Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { 303c67d6573Sopenharmony_ci self.byte_classes.set_range(b'\n', b'\n'); 304c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::StartLine) 305c67d6573Sopenharmony_ci } 306c67d6573Sopenharmony_ci Anchor(hir::Anchor::EndLine) => { 307c67d6573Sopenharmony_ci self.byte_classes.set_range(b'\n', b'\n'); 308c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::EndLine) 309c67d6573Sopenharmony_ci } 310c67d6573Sopenharmony_ci Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { 311c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::EndText) 312c67d6573Sopenharmony_ci } 313c67d6573Sopenharmony_ci Anchor(hir::Anchor::StartText) => { 314c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::StartText) 315c67d6573Sopenharmony_ci } 316c67d6573Sopenharmony_ci Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { 317c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::StartText) 318c67d6573Sopenharmony_ci } 319c67d6573Sopenharmony_ci Anchor(hir::Anchor::EndText) => { 320c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::EndText) 321c67d6573Sopenharmony_ci } 322c67d6573Sopenharmony_ci WordBoundary(hir::WordBoundary::Unicode) => { 323c67d6573Sopenharmony_ci if !cfg!(feature = "unicode-perl") { 324c67d6573Sopenharmony_ci return Err(Error::Syntax( 325c67d6573Sopenharmony_ci "Unicode word boundaries are unavailable when \ 326c67d6573Sopenharmony_ci the unicode-perl feature is disabled" 327c67d6573Sopenharmony_ci .to_string(), 328c67d6573Sopenharmony_ci )); 329c67d6573Sopenharmony_ci } 330c67d6573Sopenharmony_ci self.compiled.has_unicode_word_boundary = true; 331c67d6573Sopenharmony_ci self.byte_classes.set_word_boundary(); 332c67d6573Sopenharmony_ci // We also make sure that all ASCII bytes are in a different 333c67d6573Sopenharmony_ci // class from non-ASCII bytes. Otherwise, it's possible for 334c67d6573Sopenharmony_ci // ASCII bytes to get lumped into the same class as non-ASCII 335c67d6573Sopenharmony_ci // bytes. This in turn may cause the lazy DFA to falsely start 336c67d6573Sopenharmony_ci // when it sees an ASCII byte that maps to a byte class with 337c67d6573Sopenharmony_ci // non-ASCII bytes. This ensures that never happens. 338c67d6573Sopenharmony_ci self.byte_classes.set_range(0, 0x7F); 339c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::WordBoundary) 340c67d6573Sopenharmony_ci } 341c67d6573Sopenharmony_ci WordBoundary(hir::WordBoundary::UnicodeNegate) => { 342c67d6573Sopenharmony_ci if !cfg!(feature = "unicode-perl") { 343c67d6573Sopenharmony_ci return Err(Error::Syntax( 344c67d6573Sopenharmony_ci "Unicode word boundaries are unavailable when \ 345c67d6573Sopenharmony_ci the unicode-perl feature is disabled" 346c67d6573Sopenharmony_ci .to_string(), 347c67d6573Sopenharmony_ci )); 348c67d6573Sopenharmony_ci } 349c67d6573Sopenharmony_ci self.compiled.has_unicode_word_boundary = true; 350c67d6573Sopenharmony_ci self.byte_classes.set_word_boundary(); 351c67d6573Sopenharmony_ci // See comments above for why we set the ASCII range here. 352c67d6573Sopenharmony_ci self.byte_classes.set_range(0, 0x7F); 353c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::NotWordBoundary) 354c67d6573Sopenharmony_ci } 355c67d6573Sopenharmony_ci WordBoundary(hir::WordBoundary::Ascii) => { 356c67d6573Sopenharmony_ci self.byte_classes.set_word_boundary(); 357c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) 358c67d6573Sopenharmony_ci } 359c67d6573Sopenharmony_ci WordBoundary(hir::WordBoundary::AsciiNegate) => { 360c67d6573Sopenharmony_ci self.byte_classes.set_word_boundary(); 361c67d6573Sopenharmony_ci self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) 362c67d6573Sopenharmony_ci } 363c67d6573Sopenharmony_ci Group(ref g) => match g.kind { 364c67d6573Sopenharmony_ci hir::GroupKind::NonCapturing => self.c(&g.hir), 365c67d6573Sopenharmony_ci hir::GroupKind::CaptureIndex(index) => { 366c67d6573Sopenharmony_ci if index as usize >= self.compiled.captures.len() { 367c67d6573Sopenharmony_ci self.compiled.captures.push(None); 368c67d6573Sopenharmony_ci } 369c67d6573Sopenharmony_ci self.c_capture(2 * index as usize, &g.hir) 370c67d6573Sopenharmony_ci } 371c67d6573Sopenharmony_ci hir::GroupKind::CaptureName { index, ref name } => { 372c67d6573Sopenharmony_ci if index as usize >= self.compiled.captures.len() { 373c67d6573Sopenharmony_ci let n = name.to_string(); 374c67d6573Sopenharmony_ci self.compiled.captures.push(Some(n.clone())); 375c67d6573Sopenharmony_ci self.capture_name_idx.insert(n, index as usize); 376c67d6573Sopenharmony_ci } 377c67d6573Sopenharmony_ci self.c_capture(2 * index as usize, &g.hir) 378c67d6573Sopenharmony_ci } 379c67d6573Sopenharmony_ci }, 380c67d6573Sopenharmony_ci Concat(ref es) => { 381c67d6573Sopenharmony_ci if self.compiled.is_reverse { 382c67d6573Sopenharmony_ci self.c_concat(es.iter().rev()) 383c67d6573Sopenharmony_ci } else { 384c67d6573Sopenharmony_ci self.c_concat(es) 385c67d6573Sopenharmony_ci } 386c67d6573Sopenharmony_ci } 387c67d6573Sopenharmony_ci Alternation(ref es) => self.c_alternate(&**es), 388c67d6573Sopenharmony_ci Repetition(ref rep) => self.c_repeat(rep), 389c67d6573Sopenharmony_ci } 390c67d6573Sopenharmony_ci } 391c67d6573Sopenharmony_ci 392c67d6573Sopenharmony_ci fn c_empty(&mut self) -> ResultOrEmpty { 393c67d6573Sopenharmony_ci // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 394c67d6573Sopenharmony_ci // See: CVE-2022-24713 395c67d6573Sopenharmony_ci // 396c67d6573Sopenharmony_ci // Since 'empty' sub-expressions don't increase the size of 397c67d6573Sopenharmony_ci // the actual compiled object, we "fake" an increase in its 398c67d6573Sopenharmony_ci // size so that our 'check_size_limit' routine will eventually 399c67d6573Sopenharmony_ci // stop compilation if there are too many empty sub-expressions 400c67d6573Sopenharmony_ci // (e.g., via a large repetition). 401c67d6573Sopenharmony_ci self.extra_inst_bytes += std::mem::size_of::<Inst>(); 402c67d6573Sopenharmony_ci Ok(None) 403c67d6573Sopenharmony_ci } 404c67d6573Sopenharmony_ci 405c67d6573Sopenharmony_ci fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { 406c67d6573Sopenharmony_ci if self.num_exprs > 1 || self.compiled.is_dfa { 407c67d6573Sopenharmony_ci // Don't ever compile Save instructions for regex sets because 408c67d6573Sopenharmony_ci // they are never used. They are also never used in DFA programs 409c67d6573Sopenharmony_ci // because DFAs can't handle captures. 410c67d6573Sopenharmony_ci self.c(expr) 411c67d6573Sopenharmony_ci } else { 412c67d6573Sopenharmony_ci let entry = self.insts.len(); 413c67d6573Sopenharmony_ci let hole = self.push_hole(InstHole::Save { slot: first_slot }); 414c67d6573Sopenharmony_ci let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); 415c67d6573Sopenharmony_ci self.fill(hole, patch.entry); 416c67d6573Sopenharmony_ci self.fill_to_next(patch.hole); 417c67d6573Sopenharmony_ci let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); 418c67d6573Sopenharmony_ci Ok(Some(Patch { hole, entry })) 419c67d6573Sopenharmony_ci } 420c67d6573Sopenharmony_ci } 421c67d6573Sopenharmony_ci 422c67d6573Sopenharmony_ci fn c_dotstar(&mut self) -> Result { 423c67d6573Sopenharmony_ci Ok(if !self.compiled.only_utf8() { 424c67d6573Sopenharmony_ci self.c(&Hir::repetition(hir::Repetition { 425c67d6573Sopenharmony_ci kind: hir::RepetitionKind::ZeroOrMore, 426c67d6573Sopenharmony_ci greedy: false, 427c67d6573Sopenharmony_ci hir: Box::new(Hir::any(true)), 428c67d6573Sopenharmony_ci }))? 429c67d6573Sopenharmony_ci .unwrap() 430c67d6573Sopenharmony_ci } else { 431c67d6573Sopenharmony_ci self.c(&Hir::repetition(hir::Repetition { 432c67d6573Sopenharmony_ci kind: hir::RepetitionKind::ZeroOrMore, 433c67d6573Sopenharmony_ci greedy: false, 434c67d6573Sopenharmony_ci hir: Box::new(Hir::any(false)), 435c67d6573Sopenharmony_ci }))? 436c67d6573Sopenharmony_ci .unwrap() 437c67d6573Sopenharmony_ci }) 438c67d6573Sopenharmony_ci } 439c67d6573Sopenharmony_ci 440c67d6573Sopenharmony_ci fn c_char(&mut self, c: char) -> ResultOrEmpty { 441c67d6573Sopenharmony_ci if self.compiled.uses_bytes() { 442c67d6573Sopenharmony_ci if c.is_ascii() { 443c67d6573Sopenharmony_ci let b = c as u8; 444c67d6573Sopenharmony_ci let hole = 445c67d6573Sopenharmony_ci self.push_hole(InstHole::Bytes { start: b, end: b }); 446c67d6573Sopenharmony_ci self.byte_classes.set_range(b, b); 447c67d6573Sopenharmony_ci Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 448c67d6573Sopenharmony_ci } else { 449c67d6573Sopenharmony_ci self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) 450c67d6573Sopenharmony_ci } 451c67d6573Sopenharmony_ci } else { 452c67d6573Sopenharmony_ci let hole = self.push_hole(InstHole::Char { c }); 453c67d6573Sopenharmony_ci Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 454c67d6573Sopenharmony_ci } 455c67d6573Sopenharmony_ci } 456c67d6573Sopenharmony_ci 457c67d6573Sopenharmony_ci fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { 458c67d6573Sopenharmony_ci use std::mem::size_of; 459c67d6573Sopenharmony_ci 460c67d6573Sopenharmony_ci assert!(!ranges.is_empty()); 461c67d6573Sopenharmony_ci if self.compiled.uses_bytes() { 462c67d6573Sopenharmony_ci Ok(Some(CompileClass { c: self, ranges }.compile()?)) 463c67d6573Sopenharmony_ci } else { 464c67d6573Sopenharmony_ci let ranges: Vec<(char, char)> = 465c67d6573Sopenharmony_ci ranges.iter().map(|r| (r.start(), r.end())).collect(); 466c67d6573Sopenharmony_ci let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { 467c67d6573Sopenharmony_ci self.push_hole(InstHole::Char { c: ranges[0].0 }) 468c67d6573Sopenharmony_ci } else { 469c67d6573Sopenharmony_ci self.extra_inst_bytes += 470c67d6573Sopenharmony_ci ranges.len() * (size_of::<char>() * 2); 471c67d6573Sopenharmony_ci self.push_hole(InstHole::Ranges { ranges }) 472c67d6573Sopenharmony_ci }; 473c67d6573Sopenharmony_ci Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 474c67d6573Sopenharmony_ci } 475c67d6573Sopenharmony_ci } 476c67d6573Sopenharmony_ci 477c67d6573Sopenharmony_ci fn c_byte(&mut self, b: u8) -> ResultOrEmpty { 478c67d6573Sopenharmony_ci self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) 479c67d6573Sopenharmony_ci } 480c67d6573Sopenharmony_ci 481c67d6573Sopenharmony_ci fn c_class_bytes( 482c67d6573Sopenharmony_ci &mut self, 483c67d6573Sopenharmony_ci ranges: &[hir::ClassBytesRange], 484c67d6573Sopenharmony_ci ) -> ResultOrEmpty { 485c67d6573Sopenharmony_ci debug_assert!(!ranges.is_empty()); 486c67d6573Sopenharmony_ci 487c67d6573Sopenharmony_ci let first_split_entry = self.insts.len(); 488c67d6573Sopenharmony_ci let mut holes = vec![]; 489c67d6573Sopenharmony_ci let mut prev_hole = Hole::None; 490c67d6573Sopenharmony_ci for r in &ranges[0..ranges.len() - 1] { 491c67d6573Sopenharmony_ci self.fill_to_next(prev_hole); 492c67d6573Sopenharmony_ci let split = self.push_split_hole(); 493c67d6573Sopenharmony_ci let next = self.insts.len(); 494c67d6573Sopenharmony_ci self.byte_classes.set_range(r.start(), r.end()); 495c67d6573Sopenharmony_ci holes.push(self.push_hole(InstHole::Bytes { 496c67d6573Sopenharmony_ci start: r.start(), 497c67d6573Sopenharmony_ci end: r.end(), 498c67d6573Sopenharmony_ci })); 499c67d6573Sopenharmony_ci prev_hole = self.fill_split(split, Some(next), None); 500c67d6573Sopenharmony_ci } 501c67d6573Sopenharmony_ci let next = self.insts.len(); 502c67d6573Sopenharmony_ci let r = &ranges[ranges.len() - 1]; 503c67d6573Sopenharmony_ci self.byte_classes.set_range(r.start(), r.end()); 504c67d6573Sopenharmony_ci holes.push( 505c67d6573Sopenharmony_ci self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), 506c67d6573Sopenharmony_ci ); 507c67d6573Sopenharmony_ci self.fill(prev_hole, next); 508c67d6573Sopenharmony_ci Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) 509c67d6573Sopenharmony_ci } 510c67d6573Sopenharmony_ci 511c67d6573Sopenharmony_ci fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { 512c67d6573Sopenharmony_ci let hole = self.push_hole(InstHole::EmptyLook { look }); 513c67d6573Sopenharmony_ci Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) 514c67d6573Sopenharmony_ci } 515c67d6573Sopenharmony_ci 516c67d6573Sopenharmony_ci fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty 517c67d6573Sopenharmony_ci where 518c67d6573Sopenharmony_ci I: IntoIterator<Item = &'a Hir>, 519c67d6573Sopenharmony_ci { 520c67d6573Sopenharmony_ci let mut exprs = exprs.into_iter(); 521c67d6573Sopenharmony_ci let Patch { mut hole, entry } = loop { 522c67d6573Sopenharmony_ci match exprs.next() { 523c67d6573Sopenharmony_ci None => return self.c_empty(), 524c67d6573Sopenharmony_ci Some(e) => { 525c67d6573Sopenharmony_ci if let Some(p) = self.c(e)? { 526c67d6573Sopenharmony_ci break p; 527c67d6573Sopenharmony_ci } 528c67d6573Sopenharmony_ci } 529c67d6573Sopenharmony_ci } 530c67d6573Sopenharmony_ci }; 531c67d6573Sopenharmony_ci for e in exprs { 532c67d6573Sopenharmony_ci if let Some(p) = self.c(e)? { 533c67d6573Sopenharmony_ci self.fill(hole, p.entry); 534c67d6573Sopenharmony_ci hole = p.hole; 535c67d6573Sopenharmony_ci } 536c67d6573Sopenharmony_ci } 537c67d6573Sopenharmony_ci Ok(Some(Patch { hole, entry })) 538c67d6573Sopenharmony_ci } 539c67d6573Sopenharmony_ci 540c67d6573Sopenharmony_ci fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { 541c67d6573Sopenharmony_ci debug_assert!( 542c67d6573Sopenharmony_ci exprs.len() >= 2, 543c67d6573Sopenharmony_ci "alternates must have at least 2 exprs" 544c67d6573Sopenharmony_ci ); 545c67d6573Sopenharmony_ci 546c67d6573Sopenharmony_ci // Initial entry point is always the first split. 547c67d6573Sopenharmony_ci let first_split_entry = self.insts.len(); 548c67d6573Sopenharmony_ci 549c67d6573Sopenharmony_ci // Save up all of the holes from each alternate. They will all get 550c67d6573Sopenharmony_ci // patched to point to the same location. 551c67d6573Sopenharmony_ci let mut holes = vec![]; 552c67d6573Sopenharmony_ci 553c67d6573Sopenharmony_ci // true indicates that the hole is a split where we want to fill 554c67d6573Sopenharmony_ci // the second branch. 555c67d6573Sopenharmony_ci let mut prev_hole = (Hole::None, false); 556c67d6573Sopenharmony_ci for e in &exprs[0..exprs.len() - 1] { 557c67d6573Sopenharmony_ci if prev_hole.1 { 558c67d6573Sopenharmony_ci let next = self.insts.len(); 559c67d6573Sopenharmony_ci self.fill_split(prev_hole.0, None, Some(next)); 560c67d6573Sopenharmony_ci } else { 561c67d6573Sopenharmony_ci self.fill_to_next(prev_hole.0); 562c67d6573Sopenharmony_ci } 563c67d6573Sopenharmony_ci let split = self.push_split_hole(); 564c67d6573Sopenharmony_ci if let Some(Patch { hole, entry }) = self.c(e)? { 565c67d6573Sopenharmony_ci holes.push(hole); 566c67d6573Sopenharmony_ci prev_hole = (self.fill_split(split, Some(entry), None), false); 567c67d6573Sopenharmony_ci } else { 568c67d6573Sopenharmony_ci let (split1, split2) = split.dup_one(); 569c67d6573Sopenharmony_ci holes.push(split1); 570c67d6573Sopenharmony_ci prev_hole = (split2, true); 571c67d6573Sopenharmony_ci } 572c67d6573Sopenharmony_ci } 573c67d6573Sopenharmony_ci if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { 574c67d6573Sopenharmony_ci holes.push(hole); 575c67d6573Sopenharmony_ci if prev_hole.1 { 576c67d6573Sopenharmony_ci self.fill_split(prev_hole.0, None, Some(entry)); 577c67d6573Sopenharmony_ci } else { 578c67d6573Sopenharmony_ci self.fill(prev_hole.0, entry); 579c67d6573Sopenharmony_ci } 580c67d6573Sopenharmony_ci } else { 581c67d6573Sopenharmony_ci // We ignore prev_hole.1. When it's true, it means we have two 582c67d6573Sopenharmony_ci // empty branches both pushing prev_hole.0 into holes, so both 583c67d6573Sopenharmony_ci // branches will go to the same place anyway. 584c67d6573Sopenharmony_ci holes.push(prev_hole.0); 585c67d6573Sopenharmony_ci } 586c67d6573Sopenharmony_ci Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) 587c67d6573Sopenharmony_ci } 588c67d6573Sopenharmony_ci 589c67d6573Sopenharmony_ci fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { 590c67d6573Sopenharmony_ci use regex_syntax::hir::RepetitionKind::*; 591c67d6573Sopenharmony_ci match rep.kind { 592c67d6573Sopenharmony_ci ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), 593c67d6573Sopenharmony_ci ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), 594c67d6573Sopenharmony_ci OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), 595c67d6573Sopenharmony_ci Range(hir::RepetitionRange::Exactly(min_max)) => { 596c67d6573Sopenharmony_ci self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) 597c67d6573Sopenharmony_ci } 598c67d6573Sopenharmony_ci Range(hir::RepetitionRange::AtLeast(min)) => { 599c67d6573Sopenharmony_ci self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) 600c67d6573Sopenharmony_ci } 601c67d6573Sopenharmony_ci Range(hir::RepetitionRange::Bounded(min, max)) => { 602c67d6573Sopenharmony_ci self.c_repeat_range(&rep.hir, rep.greedy, min, max) 603c67d6573Sopenharmony_ci } 604c67d6573Sopenharmony_ci } 605c67d6573Sopenharmony_ci } 606c67d6573Sopenharmony_ci 607c67d6573Sopenharmony_ci fn c_repeat_zero_or_one( 608c67d6573Sopenharmony_ci &mut self, 609c67d6573Sopenharmony_ci expr: &Hir, 610c67d6573Sopenharmony_ci greedy: bool, 611c67d6573Sopenharmony_ci ) -> ResultOrEmpty { 612c67d6573Sopenharmony_ci let split_entry = self.insts.len(); 613c67d6573Sopenharmony_ci let split = self.push_split_hole(); 614c67d6573Sopenharmony_ci let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { 615c67d6573Sopenharmony_ci Some(p) => p, 616c67d6573Sopenharmony_ci None => return self.pop_split_hole(), 617c67d6573Sopenharmony_ci }; 618c67d6573Sopenharmony_ci let split_hole = if greedy { 619c67d6573Sopenharmony_ci self.fill_split(split, Some(entry_rep), None) 620c67d6573Sopenharmony_ci } else { 621c67d6573Sopenharmony_ci self.fill_split(split, None, Some(entry_rep)) 622c67d6573Sopenharmony_ci }; 623c67d6573Sopenharmony_ci let holes = vec![hole_rep, split_hole]; 624c67d6573Sopenharmony_ci Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) 625c67d6573Sopenharmony_ci } 626c67d6573Sopenharmony_ci 627c67d6573Sopenharmony_ci fn c_repeat_zero_or_more( 628c67d6573Sopenharmony_ci &mut self, 629c67d6573Sopenharmony_ci expr: &Hir, 630c67d6573Sopenharmony_ci greedy: bool, 631c67d6573Sopenharmony_ci ) -> ResultOrEmpty { 632c67d6573Sopenharmony_ci let split_entry = self.insts.len(); 633c67d6573Sopenharmony_ci let split = self.push_split_hole(); 634c67d6573Sopenharmony_ci let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { 635c67d6573Sopenharmony_ci Some(p) => p, 636c67d6573Sopenharmony_ci None => return self.pop_split_hole(), 637c67d6573Sopenharmony_ci }; 638c67d6573Sopenharmony_ci 639c67d6573Sopenharmony_ci self.fill(hole_rep, split_entry); 640c67d6573Sopenharmony_ci let split_hole = if greedy { 641c67d6573Sopenharmony_ci self.fill_split(split, Some(entry_rep), None) 642c67d6573Sopenharmony_ci } else { 643c67d6573Sopenharmony_ci self.fill_split(split, None, Some(entry_rep)) 644c67d6573Sopenharmony_ci }; 645c67d6573Sopenharmony_ci Ok(Some(Patch { hole: split_hole, entry: split_entry })) 646c67d6573Sopenharmony_ci } 647c67d6573Sopenharmony_ci 648c67d6573Sopenharmony_ci fn c_repeat_one_or_more( 649c67d6573Sopenharmony_ci &mut self, 650c67d6573Sopenharmony_ci expr: &Hir, 651c67d6573Sopenharmony_ci greedy: bool, 652c67d6573Sopenharmony_ci ) -> ResultOrEmpty { 653c67d6573Sopenharmony_ci let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { 654c67d6573Sopenharmony_ci Some(p) => p, 655c67d6573Sopenharmony_ci None => return Ok(None), 656c67d6573Sopenharmony_ci }; 657c67d6573Sopenharmony_ci self.fill_to_next(hole_rep); 658c67d6573Sopenharmony_ci let split = self.push_split_hole(); 659c67d6573Sopenharmony_ci 660c67d6573Sopenharmony_ci let split_hole = if greedy { 661c67d6573Sopenharmony_ci self.fill_split(split, Some(entry_rep), None) 662c67d6573Sopenharmony_ci } else { 663c67d6573Sopenharmony_ci self.fill_split(split, None, Some(entry_rep)) 664c67d6573Sopenharmony_ci }; 665c67d6573Sopenharmony_ci Ok(Some(Patch { hole: split_hole, entry: entry_rep })) 666c67d6573Sopenharmony_ci } 667c67d6573Sopenharmony_ci 668c67d6573Sopenharmony_ci fn c_repeat_range_min_or_more( 669c67d6573Sopenharmony_ci &mut self, 670c67d6573Sopenharmony_ci expr: &Hir, 671c67d6573Sopenharmony_ci greedy: bool, 672c67d6573Sopenharmony_ci min: u32, 673c67d6573Sopenharmony_ci ) -> ResultOrEmpty { 674c67d6573Sopenharmony_ci let min = u32_to_usize(min); 675c67d6573Sopenharmony_ci // Using next_inst() is ok, because we can't return it (concat would 676c67d6573Sopenharmony_ci // have to return Some(_) while c_repeat_range_min_or_more returns 677c67d6573Sopenharmony_ci // None). 678c67d6573Sopenharmony_ci let patch_concat = self 679c67d6573Sopenharmony_ci .c_concat(iter::repeat(expr).take(min))? 680c67d6573Sopenharmony_ci .unwrap_or_else(|| self.next_inst()); 681c67d6573Sopenharmony_ci if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { 682c67d6573Sopenharmony_ci self.fill(patch_concat.hole, patch_rep.entry); 683c67d6573Sopenharmony_ci Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) 684c67d6573Sopenharmony_ci } else { 685c67d6573Sopenharmony_ci Ok(None) 686c67d6573Sopenharmony_ci } 687c67d6573Sopenharmony_ci } 688c67d6573Sopenharmony_ci 689c67d6573Sopenharmony_ci fn c_repeat_range( 690c67d6573Sopenharmony_ci &mut self, 691c67d6573Sopenharmony_ci expr: &Hir, 692c67d6573Sopenharmony_ci greedy: bool, 693c67d6573Sopenharmony_ci min: u32, 694c67d6573Sopenharmony_ci max: u32, 695c67d6573Sopenharmony_ci ) -> ResultOrEmpty { 696c67d6573Sopenharmony_ci let (min, max) = (u32_to_usize(min), u32_to_usize(max)); 697c67d6573Sopenharmony_ci debug_assert!(min <= max); 698c67d6573Sopenharmony_ci let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; 699c67d6573Sopenharmony_ci if min == max { 700c67d6573Sopenharmony_ci return Ok(patch_concat); 701c67d6573Sopenharmony_ci } 702c67d6573Sopenharmony_ci // Same reasoning as in c_repeat_range_min_or_more (we know that min < 703c67d6573Sopenharmony_ci // max at this point). 704c67d6573Sopenharmony_ci let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); 705c67d6573Sopenharmony_ci let initial_entry = patch_concat.entry; 706c67d6573Sopenharmony_ci // It is much simpler to compile, e.g., `a{2,5}` as: 707c67d6573Sopenharmony_ci // 708c67d6573Sopenharmony_ci // aaa?a?a? 709c67d6573Sopenharmony_ci // 710c67d6573Sopenharmony_ci // But you end up with a sequence of instructions like this: 711c67d6573Sopenharmony_ci // 712c67d6573Sopenharmony_ci // 0: 'a' 713c67d6573Sopenharmony_ci // 1: 'a', 714c67d6573Sopenharmony_ci // 2: split(3, 4) 715c67d6573Sopenharmony_ci // 3: 'a' 716c67d6573Sopenharmony_ci // 4: split(5, 6) 717c67d6573Sopenharmony_ci // 5: 'a' 718c67d6573Sopenharmony_ci // 6: split(7, 8) 719c67d6573Sopenharmony_ci // 7: 'a' 720c67d6573Sopenharmony_ci // 8: MATCH 721c67d6573Sopenharmony_ci // 722c67d6573Sopenharmony_ci // This is *incredibly* inefficient because the splits end 723c67d6573Sopenharmony_ci // up forming a chain, which has to be resolved everything a 724c67d6573Sopenharmony_ci // transition is followed. 725c67d6573Sopenharmony_ci let mut holes = vec![]; 726c67d6573Sopenharmony_ci let mut prev_hole = patch_concat.hole; 727c67d6573Sopenharmony_ci for _ in min..max { 728c67d6573Sopenharmony_ci self.fill_to_next(prev_hole); 729c67d6573Sopenharmony_ci let split = self.push_split_hole(); 730c67d6573Sopenharmony_ci let Patch { hole, entry } = match self.c(expr)? { 731c67d6573Sopenharmony_ci Some(p) => p, 732c67d6573Sopenharmony_ci None => return self.pop_split_hole(), 733c67d6573Sopenharmony_ci }; 734c67d6573Sopenharmony_ci prev_hole = hole; 735c67d6573Sopenharmony_ci if greedy { 736c67d6573Sopenharmony_ci holes.push(self.fill_split(split, Some(entry), None)); 737c67d6573Sopenharmony_ci } else { 738c67d6573Sopenharmony_ci holes.push(self.fill_split(split, None, Some(entry))); 739c67d6573Sopenharmony_ci } 740c67d6573Sopenharmony_ci } 741c67d6573Sopenharmony_ci holes.push(prev_hole); 742c67d6573Sopenharmony_ci Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) 743c67d6573Sopenharmony_ci } 744c67d6573Sopenharmony_ci 745c67d6573Sopenharmony_ci /// Can be used as a default value for the c_* functions when the call to 746c67d6573Sopenharmony_ci /// c_function is followed by inserting at least one instruction that is 747c67d6573Sopenharmony_ci /// always executed after the ones written by the c* function. 748c67d6573Sopenharmony_ci fn next_inst(&self) -> Patch { 749c67d6573Sopenharmony_ci Patch { hole: Hole::None, entry: self.insts.len() } 750c67d6573Sopenharmony_ci } 751c67d6573Sopenharmony_ci 752c67d6573Sopenharmony_ci fn fill(&mut self, hole: Hole, goto: InstPtr) { 753c67d6573Sopenharmony_ci match hole { 754c67d6573Sopenharmony_ci Hole::None => {} 755c67d6573Sopenharmony_ci Hole::One(pc) => { 756c67d6573Sopenharmony_ci self.insts[pc].fill(goto); 757c67d6573Sopenharmony_ci } 758c67d6573Sopenharmony_ci Hole::Many(holes) => { 759c67d6573Sopenharmony_ci for hole in holes { 760c67d6573Sopenharmony_ci self.fill(hole, goto); 761c67d6573Sopenharmony_ci } 762c67d6573Sopenharmony_ci } 763c67d6573Sopenharmony_ci } 764c67d6573Sopenharmony_ci } 765c67d6573Sopenharmony_ci 766c67d6573Sopenharmony_ci fn fill_to_next(&mut self, hole: Hole) { 767c67d6573Sopenharmony_ci let next = self.insts.len(); 768c67d6573Sopenharmony_ci self.fill(hole, next); 769c67d6573Sopenharmony_ci } 770c67d6573Sopenharmony_ci 771c67d6573Sopenharmony_ci fn fill_split( 772c67d6573Sopenharmony_ci &mut self, 773c67d6573Sopenharmony_ci hole: Hole, 774c67d6573Sopenharmony_ci goto1: Option<InstPtr>, 775c67d6573Sopenharmony_ci goto2: Option<InstPtr>, 776c67d6573Sopenharmony_ci ) -> Hole { 777c67d6573Sopenharmony_ci match hole { 778c67d6573Sopenharmony_ci Hole::None => Hole::None, 779c67d6573Sopenharmony_ci Hole::One(pc) => match (goto1, goto2) { 780c67d6573Sopenharmony_ci (Some(goto1), Some(goto2)) => { 781c67d6573Sopenharmony_ci self.insts[pc].fill_split(goto1, goto2); 782c67d6573Sopenharmony_ci Hole::None 783c67d6573Sopenharmony_ci } 784c67d6573Sopenharmony_ci (Some(goto1), None) => { 785c67d6573Sopenharmony_ci self.insts[pc].half_fill_split_goto1(goto1); 786c67d6573Sopenharmony_ci Hole::One(pc) 787c67d6573Sopenharmony_ci } 788c67d6573Sopenharmony_ci (None, Some(goto2)) => { 789c67d6573Sopenharmony_ci self.insts[pc].half_fill_split_goto2(goto2); 790c67d6573Sopenharmony_ci Hole::One(pc) 791c67d6573Sopenharmony_ci } 792c67d6573Sopenharmony_ci (None, None) => unreachable!( 793c67d6573Sopenharmony_ci "at least one of the split \ 794c67d6573Sopenharmony_ci holes must be filled" 795c67d6573Sopenharmony_ci ), 796c67d6573Sopenharmony_ci }, 797c67d6573Sopenharmony_ci Hole::Many(holes) => { 798c67d6573Sopenharmony_ci let mut new_holes = vec![]; 799c67d6573Sopenharmony_ci for hole in holes { 800c67d6573Sopenharmony_ci new_holes.push(self.fill_split(hole, goto1, goto2)); 801c67d6573Sopenharmony_ci } 802c67d6573Sopenharmony_ci if new_holes.is_empty() { 803c67d6573Sopenharmony_ci Hole::None 804c67d6573Sopenharmony_ci } else if new_holes.len() == 1 { 805c67d6573Sopenharmony_ci new_holes.pop().unwrap() 806c67d6573Sopenharmony_ci } else { 807c67d6573Sopenharmony_ci Hole::Many(new_holes) 808c67d6573Sopenharmony_ci } 809c67d6573Sopenharmony_ci } 810c67d6573Sopenharmony_ci } 811c67d6573Sopenharmony_ci } 812c67d6573Sopenharmony_ci 813c67d6573Sopenharmony_ci fn push_compiled(&mut self, inst: Inst) { 814c67d6573Sopenharmony_ci self.insts.push(MaybeInst::Compiled(inst)); 815c67d6573Sopenharmony_ci } 816c67d6573Sopenharmony_ci 817c67d6573Sopenharmony_ci fn push_hole(&mut self, inst: InstHole) -> Hole { 818c67d6573Sopenharmony_ci let hole = self.insts.len(); 819c67d6573Sopenharmony_ci self.insts.push(MaybeInst::Uncompiled(inst)); 820c67d6573Sopenharmony_ci Hole::One(hole) 821c67d6573Sopenharmony_ci } 822c67d6573Sopenharmony_ci 823c67d6573Sopenharmony_ci fn push_split_hole(&mut self) -> Hole { 824c67d6573Sopenharmony_ci let hole = self.insts.len(); 825c67d6573Sopenharmony_ci self.insts.push(MaybeInst::Split); 826c67d6573Sopenharmony_ci Hole::One(hole) 827c67d6573Sopenharmony_ci } 828c67d6573Sopenharmony_ci 829c67d6573Sopenharmony_ci fn pop_split_hole(&mut self) -> ResultOrEmpty { 830c67d6573Sopenharmony_ci self.insts.pop(); 831c67d6573Sopenharmony_ci Ok(None) 832c67d6573Sopenharmony_ci } 833c67d6573Sopenharmony_ci 834c67d6573Sopenharmony_ci fn check_size(&self) -> result::Result<(), Error> { 835c67d6573Sopenharmony_ci use std::mem::size_of; 836c67d6573Sopenharmony_ci 837c67d6573Sopenharmony_ci let size = 838c67d6573Sopenharmony_ci self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>()); 839c67d6573Sopenharmony_ci if size > self.size_limit { 840c67d6573Sopenharmony_ci Err(Error::CompiledTooBig(self.size_limit)) 841c67d6573Sopenharmony_ci } else { 842c67d6573Sopenharmony_ci Ok(()) 843c67d6573Sopenharmony_ci } 844c67d6573Sopenharmony_ci } 845c67d6573Sopenharmony_ci} 846c67d6573Sopenharmony_ci 847c67d6573Sopenharmony_ci#[derive(Debug)] 848c67d6573Sopenharmony_cienum Hole { 849c67d6573Sopenharmony_ci None, 850c67d6573Sopenharmony_ci One(InstPtr), 851c67d6573Sopenharmony_ci Many(Vec<Hole>), 852c67d6573Sopenharmony_ci} 853c67d6573Sopenharmony_ci 854c67d6573Sopenharmony_ciimpl Hole { 855c67d6573Sopenharmony_ci fn dup_one(self) -> (Self, Self) { 856c67d6573Sopenharmony_ci match self { 857c67d6573Sopenharmony_ci Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), 858c67d6573Sopenharmony_ci Hole::None | Hole::Many(_) => { 859c67d6573Sopenharmony_ci unreachable!("must be called on single hole") 860c67d6573Sopenharmony_ci } 861c67d6573Sopenharmony_ci } 862c67d6573Sopenharmony_ci } 863c67d6573Sopenharmony_ci} 864c67d6573Sopenharmony_ci 865c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 866c67d6573Sopenharmony_cienum MaybeInst { 867c67d6573Sopenharmony_ci Compiled(Inst), 868c67d6573Sopenharmony_ci Uncompiled(InstHole), 869c67d6573Sopenharmony_ci Split, 870c67d6573Sopenharmony_ci Split1(InstPtr), 871c67d6573Sopenharmony_ci Split2(InstPtr), 872c67d6573Sopenharmony_ci} 873c67d6573Sopenharmony_ci 874c67d6573Sopenharmony_ciimpl MaybeInst { 875c67d6573Sopenharmony_ci fn fill(&mut self, goto: InstPtr) { 876c67d6573Sopenharmony_ci let maybeinst = match *self { 877c67d6573Sopenharmony_ci MaybeInst::Split => MaybeInst::Split1(goto), 878c67d6573Sopenharmony_ci MaybeInst::Uncompiled(ref inst) => { 879c67d6573Sopenharmony_ci MaybeInst::Compiled(inst.fill(goto)) 880c67d6573Sopenharmony_ci } 881c67d6573Sopenharmony_ci MaybeInst::Split1(goto1) => { 882c67d6573Sopenharmony_ci MaybeInst::Compiled(Inst::Split(InstSplit { 883c67d6573Sopenharmony_ci goto1, 884c67d6573Sopenharmony_ci goto2: goto, 885c67d6573Sopenharmony_ci })) 886c67d6573Sopenharmony_ci } 887c67d6573Sopenharmony_ci MaybeInst::Split2(goto2) => { 888c67d6573Sopenharmony_ci MaybeInst::Compiled(Inst::Split(InstSplit { 889c67d6573Sopenharmony_ci goto1: goto, 890c67d6573Sopenharmony_ci goto2, 891c67d6573Sopenharmony_ci })) 892c67d6573Sopenharmony_ci } 893c67d6573Sopenharmony_ci _ => unreachable!( 894c67d6573Sopenharmony_ci "not all instructions were compiled! \ 895c67d6573Sopenharmony_ci found uncompiled instruction: {:?}", 896c67d6573Sopenharmony_ci self 897c67d6573Sopenharmony_ci ), 898c67d6573Sopenharmony_ci }; 899c67d6573Sopenharmony_ci *self = maybeinst; 900c67d6573Sopenharmony_ci } 901c67d6573Sopenharmony_ci 902c67d6573Sopenharmony_ci fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { 903c67d6573Sopenharmony_ci let filled = match *self { 904c67d6573Sopenharmony_ci MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), 905c67d6573Sopenharmony_ci _ => unreachable!( 906c67d6573Sopenharmony_ci "must be called on Split instruction, \ 907c67d6573Sopenharmony_ci instead it was called on: {:?}", 908c67d6573Sopenharmony_ci self 909c67d6573Sopenharmony_ci ), 910c67d6573Sopenharmony_ci }; 911c67d6573Sopenharmony_ci *self = MaybeInst::Compiled(filled); 912c67d6573Sopenharmony_ci } 913c67d6573Sopenharmony_ci 914c67d6573Sopenharmony_ci fn half_fill_split_goto1(&mut self, goto1: InstPtr) { 915c67d6573Sopenharmony_ci let half_filled = match *self { 916c67d6573Sopenharmony_ci MaybeInst::Split => goto1, 917c67d6573Sopenharmony_ci _ => unreachable!( 918c67d6573Sopenharmony_ci "must be called on Split instruction, \ 919c67d6573Sopenharmony_ci instead it was called on: {:?}", 920c67d6573Sopenharmony_ci self 921c67d6573Sopenharmony_ci ), 922c67d6573Sopenharmony_ci }; 923c67d6573Sopenharmony_ci *self = MaybeInst::Split1(half_filled); 924c67d6573Sopenharmony_ci } 925c67d6573Sopenharmony_ci 926c67d6573Sopenharmony_ci fn half_fill_split_goto2(&mut self, goto2: InstPtr) { 927c67d6573Sopenharmony_ci let half_filled = match *self { 928c67d6573Sopenharmony_ci MaybeInst::Split => goto2, 929c67d6573Sopenharmony_ci _ => unreachable!( 930c67d6573Sopenharmony_ci "must be called on Split instruction, \ 931c67d6573Sopenharmony_ci instead it was called on: {:?}", 932c67d6573Sopenharmony_ci self 933c67d6573Sopenharmony_ci ), 934c67d6573Sopenharmony_ci }; 935c67d6573Sopenharmony_ci *self = MaybeInst::Split2(half_filled); 936c67d6573Sopenharmony_ci } 937c67d6573Sopenharmony_ci 938c67d6573Sopenharmony_ci fn unwrap(self) -> Inst { 939c67d6573Sopenharmony_ci match self { 940c67d6573Sopenharmony_ci MaybeInst::Compiled(inst) => inst, 941c67d6573Sopenharmony_ci _ => unreachable!( 942c67d6573Sopenharmony_ci "must be called on a compiled instruction, \ 943c67d6573Sopenharmony_ci instead it was called on: {:?}", 944c67d6573Sopenharmony_ci self 945c67d6573Sopenharmony_ci ), 946c67d6573Sopenharmony_ci } 947c67d6573Sopenharmony_ci } 948c67d6573Sopenharmony_ci} 949c67d6573Sopenharmony_ci 950c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 951c67d6573Sopenharmony_cienum InstHole { 952c67d6573Sopenharmony_ci Save { slot: usize }, 953c67d6573Sopenharmony_ci EmptyLook { look: EmptyLook }, 954c67d6573Sopenharmony_ci Char { c: char }, 955c67d6573Sopenharmony_ci Ranges { ranges: Vec<(char, char)> }, 956c67d6573Sopenharmony_ci Bytes { start: u8, end: u8 }, 957c67d6573Sopenharmony_ci} 958c67d6573Sopenharmony_ci 959c67d6573Sopenharmony_ciimpl InstHole { 960c67d6573Sopenharmony_ci fn fill(&self, goto: InstPtr) -> Inst { 961c67d6573Sopenharmony_ci match *self { 962c67d6573Sopenharmony_ci InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), 963c67d6573Sopenharmony_ci InstHole::EmptyLook { look } => { 964c67d6573Sopenharmony_ci Inst::EmptyLook(InstEmptyLook { goto, look }) 965c67d6573Sopenharmony_ci } 966c67d6573Sopenharmony_ci InstHole::Char { c } => Inst::Char(InstChar { goto, c }), 967c67d6573Sopenharmony_ci InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { 968c67d6573Sopenharmony_ci goto, 969c67d6573Sopenharmony_ci ranges: ranges.clone().into_boxed_slice(), 970c67d6573Sopenharmony_ci }), 971c67d6573Sopenharmony_ci InstHole::Bytes { start, end } => { 972c67d6573Sopenharmony_ci Inst::Bytes(InstBytes { goto, start, end }) 973c67d6573Sopenharmony_ci } 974c67d6573Sopenharmony_ci } 975c67d6573Sopenharmony_ci } 976c67d6573Sopenharmony_ci} 977c67d6573Sopenharmony_ci 978c67d6573Sopenharmony_cistruct CompileClass<'a, 'b> { 979c67d6573Sopenharmony_ci c: &'a mut Compiler, 980c67d6573Sopenharmony_ci ranges: &'b [hir::ClassUnicodeRange], 981c67d6573Sopenharmony_ci} 982c67d6573Sopenharmony_ci 983c67d6573Sopenharmony_ciimpl<'a, 'b> CompileClass<'a, 'b> { 984c67d6573Sopenharmony_ci fn compile(mut self) -> Result { 985c67d6573Sopenharmony_ci let mut holes = vec![]; 986c67d6573Sopenharmony_ci let mut initial_entry = None; 987c67d6573Sopenharmony_ci let mut last_split = Hole::None; 988c67d6573Sopenharmony_ci let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); 989c67d6573Sopenharmony_ci self.c.suffix_cache.clear(); 990c67d6573Sopenharmony_ci 991c67d6573Sopenharmony_ci for (i, range) in self.ranges.iter().enumerate() { 992c67d6573Sopenharmony_ci let is_last_range = i + 1 == self.ranges.len(); 993c67d6573Sopenharmony_ci utf8_seqs.reset(range.start(), range.end()); 994c67d6573Sopenharmony_ci let mut it = (&mut utf8_seqs).peekable(); 995c67d6573Sopenharmony_ci loop { 996c67d6573Sopenharmony_ci let utf8_seq = match it.next() { 997c67d6573Sopenharmony_ci None => break, 998c67d6573Sopenharmony_ci Some(utf8_seq) => utf8_seq, 999c67d6573Sopenharmony_ci }; 1000c67d6573Sopenharmony_ci if is_last_range && it.peek().is_none() { 1001c67d6573Sopenharmony_ci let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; 1002c67d6573Sopenharmony_ci holes.push(hole); 1003c67d6573Sopenharmony_ci self.c.fill(last_split, entry); 1004c67d6573Sopenharmony_ci last_split = Hole::None; 1005c67d6573Sopenharmony_ci if initial_entry.is_none() { 1006c67d6573Sopenharmony_ci initial_entry = Some(entry); 1007c67d6573Sopenharmony_ci } 1008c67d6573Sopenharmony_ci } else { 1009c67d6573Sopenharmony_ci if initial_entry.is_none() { 1010c67d6573Sopenharmony_ci initial_entry = Some(self.c.insts.len()); 1011c67d6573Sopenharmony_ci } 1012c67d6573Sopenharmony_ci self.c.fill_to_next(last_split); 1013c67d6573Sopenharmony_ci last_split = self.c.push_split_hole(); 1014c67d6573Sopenharmony_ci let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; 1015c67d6573Sopenharmony_ci holes.push(hole); 1016c67d6573Sopenharmony_ci last_split = 1017c67d6573Sopenharmony_ci self.c.fill_split(last_split, Some(entry), None); 1018c67d6573Sopenharmony_ci } 1019c67d6573Sopenharmony_ci } 1020c67d6573Sopenharmony_ci } 1021c67d6573Sopenharmony_ci self.c.utf8_seqs = Some(utf8_seqs); 1022c67d6573Sopenharmony_ci Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) 1023c67d6573Sopenharmony_ci } 1024c67d6573Sopenharmony_ci 1025c67d6573Sopenharmony_ci fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { 1026c67d6573Sopenharmony_ci if self.c.compiled.is_reverse { 1027c67d6573Sopenharmony_ci self.c_utf8_seq_(seq) 1028c67d6573Sopenharmony_ci } else { 1029c67d6573Sopenharmony_ci self.c_utf8_seq_(seq.into_iter().rev()) 1030c67d6573Sopenharmony_ci } 1031c67d6573Sopenharmony_ci } 1032c67d6573Sopenharmony_ci 1033c67d6573Sopenharmony_ci fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result 1034c67d6573Sopenharmony_ci where 1035c67d6573Sopenharmony_ci I: IntoIterator<Item = &'r Utf8Range>, 1036c67d6573Sopenharmony_ci { 1037c67d6573Sopenharmony_ci // The initial instruction for each UTF-8 sequence should be the same. 1038c67d6573Sopenharmony_ci let mut from_inst = ::std::usize::MAX; 1039c67d6573Sopenharmony_ci let mut last_hole = Hole::None; 1040c67d6573Sopenharmony_ci for byte_range in seq { 1041c67d6573Sopenharmony_ci let key = SuffixCacheKey { 1042c67d6573Sopenharmony_ci from_inst, 1043c67d6573Sopenharmony_ci start: byte_range.start, 1044c67d6573Sopenharmony_ci end: byte_range.end, 1045c67d6573Sopenharmony_ci }; 1046c67d6573Sopenharmony_ci { 1047c67d6573Sopenharmony_ci let pc = self.c.insts.len(); 1048c67d6573Sopenharmony_ci if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { 1049c67d6573Sopenharmony_ci from_inst = cached_pc; 1050c67d6573Sopenharmony_ci continue; 1051c67d6573Sopenharmony_ci } 1052c67d6573Sopenharmony_ci } 1053c67d6573Sopenharmony_ci self.c.byte_classes.set_range(byte_range.start, byte_range.end); 1054c67d6573Sopenharmony_ci if from_inst == ::std::usize::MAX { 1055c67d6573Sopenharmony_ci last_hole = self.c.push_hole(InstHole::Bytes { 1056c67d6573Sopenharmony_ci start: byte_range.start, 1057c67d6573Sopenharmony_ci end: byte_range.end, 1058c67d6573Sopenharmony_ci }); 1059c67d6573Sopenharmony_ci } else { 1060c67d6573Sopenharmony_ci self.c.push_compiled(Inst::Bytes(InstBytes { 1061c67d6573Sopenharmony_ci goto: from_inst, 1062c67d6573Sopenharmony_ci start: byte_range.start, 1063c67d6573Sopenharmony_ci end: byte_range.end, 1064c67d6573Sopenharmony_ci })); 1065c67d6573Sopenharmony_ci } 1066c67d6573Sopenharmony_ci from_inst = self.c.insts.len().checked_sub(1).unwrap(); 1067c67d6573Sopenharmony_ci debug_assert!(from_inst < ::std::usize::MAX); 1068c67d6573Sopenharmony_ci } 1069c67d6573Sopenharmony_ci debug_assert!(from_inst < ::std::usize::MAX); 1070c67d6573Sopenharmony_ci Ok(Patch { hole: last_hole, entry: from_inst }) 1071c67d6573Sopenharmony_ci } 1072c67d6573Sopenharmony_ci} 1073c67d6573Sopenharmony_ci 1074c67d6573Sopenharmony_ci/// `SuffixCache` is a simple bounded hash map for caching suffix entries in 1075c67d6573Sopenharmony_ci/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. 1076c67d6573Sopenharmony_ci/// The set of byte ranges looks like this: 1077c67d6573Sopenharmony_ci/// 1078c67d6573Sopenharmony_ci/// [0-7F] 1079c67d6573Sopenharmony_ci/// [C2-DF][80-BF] 1080c67d6573Sopenharmony_ci/// [E0][A0-BF][80-BF] 1081c67d6573Sopenharmony_ci/// [E1-EC][80-BF][80-BF] 1082c67d6573Sopenharmony_ci/// [ED][80-9F][80-BF] 1083c67d6573Sopenharmony_ci/// [EE-EF][80-BF][80-BF] 1084c67d6573Sopenharmony_ci/// 1085c67d6573Sopenharmony_ci/// Each line above translates to one alternate in the compiled regex program. 1086c67d6573Sopenharmony_ci/// However, all but one of the alternates end in the same suffix, which is 1087c67d6573Sopenharmony_ci/// a waste of an instruction. The suffix cache facilitates reusing them across 1088c67d6573Sopenharmony_ci/// alternates. 1089c67d6573Sopenharmony_ci/// 1090c67d6573Sopenharmony_ci/// Note that a HashMap could be trivially used for this, but we don't need its 1091c67d6573Sopenharmony_ci/// overhead. Some small bounded space (LRU style) is more than enough. 1092c67d6573Sopenharmony_ci/// 1093c67d6573Sopenharmony_ci/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), 1094c67d6573Sopenharmony_ci/// except it uses hashes as original indices and then compares full keys for 1095c67d6573Sopenharmony_ci/// validation against `dense` array. 1096c67d6573Sopenharmony_ci#[derive(Debug)] 1097c67d6573Sopenharmony_cistruct SuffixCache { 1098c67d6573Sopenharmony_ci sparse: Box<[usize]>, 1099c67d6573Sopenharmony_ci dense: Vec<SuffixCacheEntry>, 1100c67d6573Sopenharmony_ci} 1101c67d6573Sopenharmony_ci 1102c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] 1103c67d6573Sopenharmony_cistruct SuffixCacheEntry { 1104c67d6573Sopenharmony_ci key: SuffixCacheKey, 1105c67d6573Sopenharmony_ci pc: InstPtr, 1106c67d6573Sopenharmony_ci} 1107c67d6573Sopenharmony_ci 1108c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] 1109c67d6573Sopenharmony_cistruct SuffixCacheKey { 1110c67d6573Sopenharmony_ci from_inst: InstPtr, 1111c67d6573Sopenharmony_ci start: u8, 1112c67d6573Sopenharmony_ci end: u8, 1113c67d6573Sopenharmony_ci} 1114c67d6573Sopenharmony_ci 1115c67d6573Sopenharmony_ciimpl SuffixCache { 1116c67d6573Sopenharmony_ci fn new(size: usize) -> Self { 1117c67d6573Sopenharmony_ci SuffixCache { 1118c67d6573Sopenharmony_ci sparse: vec![0usize; size].into(), 1119c67d6573Sopenharmony_ci dense: Vec::with_capacity(size), 1120c67d6573Sopenharmony_ci } 1121c67d6573Sopenharmony_ci } 1122c67d6573Sopenharmony_ci 1123c67d6573Sopenharmony_ci fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> { 1124c67d6573Sopenharmony_ci let hash = self.hash(&key); 1125c67d6573Sopenharmony_ci let pos = &mut self.sparse[hash]; 1126c67d6573Sopenharmony_ci if let Some(entry) = self.dense.get(*pos) { 1127c67d6573Sopenharmony_ci if entry.key == key { 1128c67d6573Sopenharmony_ci return Some(entry.pc); 1129c67d6573Sopenharmony_ci } 1130c67d6573Sopenharmony_ci } 1131c67d6573Sopenharmony_ci *pos = self.dense.len(); 1132c67d6573Sopenharmony_ci self.dense.push(SuffixCacheEntry { key, pc }); 1133c67d6573Sopenharmony_ci None 1134c67d6573Sopenharmony_ci } 1135c67d6573Sopenharmony_ci 1136c67d6573Sopenharmony_ci fn clear(&mut self) { 1137c67d6573Sopenharmony_ci self.dense.clear(); 1138c67d6573Sopenharmony_ci } 1139c67d6573Sopenharmony_ci 1140c67d6573Sopenharmony_ci fn hash(&self, suffix: &SuffixCacheKey) -> usize { 1141c67d6573Sopenharmony_ci // Basic FNV-1a hash as described: 1142c67d6573Sopenharmony_ci // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function 1143c67d6573Sopenharmony_ci const FNV_PRIME: u64 = 1_099_511_628_211; 1144c67d6573Sopenharmony_ci let mut h = 14_695_981_039_346_656_037; 1145c67d6573Sopenharmony_ci h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); 1146c67d6573Sopenharmony_ci h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); 1147c67d6573Sopenharmony_ci h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); 1148c67d6573Sopenharmony_ci (h as usize) % self.sparse.len() 1149c67d6573Sopenharmony_ci } 1150c67d6573Sopenharmony_ci} 1151c67d6573Sopenharmony_ci 1152c67d6573Sopenharmony_cistruct ByteClassSet([bool; 256]); 1153c67d6573Sopenharmony_ci 1154c67d6573Sopenharmony_ciimpl ByteClassSet { 1155c67d6573Sopenharmony_ci fn new() -> Self { 1156c67d6573Sopenharmony_ci ByteClassSet([false; 256]) 1157c67d6573Sopenharmony_ci } 1158c67d6573Sopenharmony_ci 1159c67d6573Sopenharmony_ci fn set_range(&mut self, start: u8, end: u8) { 1160c67d6573Sopenharmony_ci debug_assert!(start <= end); 1161c67d6573Sopenharmony_ci if start > 0 { 1162c67d6573Sopenharmony_ci self.0[start as usize - 1] = true; 1163c67d6573Sopenharmony_ci } 1164c67d6573Sopenharmony_ci self.0[end as usize] = true; 1165c67d6573Sopenharmony_ci } 1166c67d6573Sopenharmony_ci 1167c67d6573Sopenharmony_ci fn set_word_boundary(&mut self) { 1168c67d6573Sopenharmony_ci // We need to mark all ranges of bytes whose pairs result in 1169c67d6573Sopenharmony_ci // evaluating \b differently. 1170c67d6573Sopenharmony_ci let iswb = is_word_byte; 1171c67d6573Sopenharmony_ci let mut b1: u16 = 0; 1172c67d6573Sopenharmony_ci let mut b2: u16; 1173c67d6573Sopenharmony_ci while b1 <= 255 { 1174c67d6573Sopenharmony_ci b2 = b1 + 1; 1175c67d6573Sopenharmony_ci while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { 1176c67d6573Sopenharmony_ci b2 += 1; 1177c67d6573Sopenharmony_ci } 1178c67d6573Sopenharmony_ci self.set_range(b1 as u8, (b2 - 1) as u8); 1179c67d6573Sopenharmony_ci b1 = b2; 1180c67d6573Sopenharmony_ci } 1181c67d6573Sopenharmony_ci } 1182c67d6573Sopenharmony_ci 1183c67d6573Sopenharmony_ci fn byte_classes(&self) -> Vec<u8> { 1184c67d6573Sopenharmony_ci // N.B. If you're debugging the DFA, it's useful to simply return 1185c67d6573Sopenharmony_ci // `(0..256).collect()`, which effectively removes the byte classes 1186c67d6573Sopenharmony_ci // and makes the transitions easier to read. 1187c67d6573Sopenharmony_ci // (0usize..256).map(|x| x as u8).collect() 1188c67d6573Sopenharmony_ci let mut byte_classes = vec![0; 256]; 1189c67d6573Sopenharmony_ci let mut class = 0u8; 1190c67d6573Sopenharmony_ci let mut i = 0; 1191c67d6573Sopenharmony_ci loop { 1192c67d6573Sopenharmony_ci byte_classes[i] = class as u8; 1193c67d6573Sopenharmony_ci if i >= 255 { 1194c67d6573Sopenharmony_ci break; 1195c67d6573Sopenharmony_ci } 1196c67d6573Sopenharmony_ci if self.0[i] { 1197c67d6573Sopenharmony_ci class = class.checked_add(1).unwrap(); 1198c67d6573Sopenharmony_ci } 1199c67d6573Sopenharmony_ci i += 1; 1200c67d6573Sopenharmony_ci } 1201c67d6573Sopenharmony_ci byte_classes 1202c67d6573Sopenharmony_ci } 1203c67d6573Sopenharmony_ci} 1204c67d6573Sopenharmony_ci 1205c67d6573Sopenharmony_ciimpl fmt::Debug for ByteClassSet { 1206c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1207c67d6573Sopenharmony_ci f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() 1208c67d6573Sopenharmony_ci } 1209c67d6573Sopenharmony_ci} 1210c67d6573Sopenharmony_ci 1211c67d6573Sopenharmony_cifn u32_to_usize(n: u32) -> usize { 1212c67d6573Sopenharmony_ci // In case usize is less than 32 bits, we need to guard against overflow. 1213c67d6573Sopenharmony_ci // On most platforms this compiles to nothing. 1214c67d6573Sopenharmony_ci // TODO Use `std::convert::TryFrom` once it's stable. 1215c67d6573Sopenharmony_ci if (n as u64) > (::std::usize::MAX as u64) { 1216c67d6573Sopenharmony_ci panic!("BUG: {} is too big to be pointer sized", n) 1217c67d6573Sopenharmony_ci } 1218c67d6573Sopenharmony_ci n as usize 1219c67d6573Sopenharmony_ci} 1220c67d6573Sopenharmony_ci 1221c67d6573Sopenharmony_ci#[cfg(test)] 1222c67d6573Sopenharmony_cimod tests { 1223c67d6573Sopenharmony_ci use super::ByteClassSet; 1224c67d6573Sopenharmony_ci 1225c67d6573Sopenharmony_ci #[test] 1226c67d6573Sopenharmony_ci fn byte_classes() { 1227c67d6573Sopenharmony_ci let mut set = ByteClassSet::new(); 1228c67d6573Sopenharmony_ci set.set_range(b'a', b'z'); 1229c67d6573Sopenharmony_ci let classes = set.byte_classes(); 1230c67d6573Sopenharmony_ci assert_eq!(classes[0], 0); 1231c67d6573Sopenharmony_ci assert_eq!(classes[1], 0); 1232c67d6573Sopenharmony_ci assert_eq!(classes[2], 0); 1233c67d6573Sopenharmony_ci assert_eq!(classes[b'a' as usize - 1], 0); 1234c67d6573Sopenharmony_ci assert_eq!(classes[b'a' as usize], 1); 1235c67d6573Sopenharmony_ci assert_eq!(classes[b'm' as usize], 1); 1236c67d6573Sopenharmony_ci assert_eq!(classes[b'z' as usize], 1); 1237c67d6573Sopenharmony_ci assert_eq!(classes[b'z' as usize + 1], 2); 1238c67d6573Sopenharmony_ci assert_eq!(classes[254], 2); 1239c67d6573Sopenharmony_ci assert_eq!(classes[255], 2); 1240c67d6573Sopenharmony_ci 1241c67d6573Sopenharmony_ci let mut set = ByteClassSet::new(); 1242c67d6573Sopenharmony_ci set.set_range(0, 2); 1243c67d6573Sopenharmony_ci set.set_range(4, 6); 1244c67d6573Sopenharmony_ci let classes = set.byte_classes(); 1245c67d6573Sopenharmony_ci assert_eq!(classes[0], 0); 1246c67d6573Sopenharmony_ci assert_eq!(classes[1], 0); 1247c67d6573Sopenharmony_ci assert_eq!(classes[2], 0); 1248c67d6573Sopenharmony_ci assert_eq!(classes[3], 1); 1249c67d6573Sopenharmony_ci assert_eq!(classes[4], 2); 1250c67d6573Sopenharmony_ci assert_eq!(classes[5], 2); 1251c67d6573Sopenharmony_ci assert_eq!(classes[6], 2); 1252c67d6573Sopenharmony_ci assert_eq!(classes[7], 3); 1253c67d6573Sopenharmony_ci assert_eq!(classes[255], 3); 1254c67d6573Sopenharmony_ci } 1255c67d6573Sopenharmony_ci 1256c67d6573Sopenharmony_ci #[test] 1257c67d6573Sopenharmony_ci fn full_byte_classes() { 1258c67d6573Sopenharmony_ci let mut set = ByteClassSet::new(); 1259c67d6573Sopenharmony_ci for i in 0..256u16 { 1260c67d6573Sopenharmony_ci set.set_range(i as u8, i as u8); 1261c67d6573Sopenharmony_ci } 1262c67d6573Sopenharmony_ci assert_eq!(set.byte_classes().len(), 256); 1263c67d6573Sopenharmony_ci } 1264c67d6573Sopenharmony_ci} 1265