1c67d6573Sopenharmony_ciuse std::cmp::Ordering; 2c67d6573Sopenharmony_ciuse std::collections::HashMap; 3c67d6573Sopenharmony_ciuse std::fmt; 4c67d6573Sopenharmony_ciuse std::mem; 5c67d6573Sopenharmony_ciuse std::ops::Deref; 6c67d6573Sopenharmony_ciuse std::slice; 7c67d6573Sopenharmony_ciuse std::sync::Arc; 8c67d6573Sopenharmony_ci 9c67d6573Sopenharmony_ciuse crate::input::Char; 10c67d6573Sopenharmony_ciuse crate::literal::LiteralSearcher; 11c67d6573Sopenharmony_ci 12c67d6573Sopenharmony_ci/// `InstPtr` represents the index of an instruction in a regex program. 13c67d6573Sopenharmony_cipub type InstPtr = usize; 14c67d6573Sopenharmony_ci 15c67d6573Sopenharmony_ci/// Program is a sequence of instructions and various facts about thos 16c67d6573Sopenharmony_ci/// instructions. 17c67d6573Sopenharmony_ci#[derive(Clone)] 18c67d6573Sopenharmony_cipub struct Program { 19c67d6573Sopenharmony_ci /// A sequence of instructions that represents an NFA. 20c67d6573Sopenharmony_ci pub insts: Vec<Inst>, 21c67d6573Sopenharmony_ci /// Pointers to each Match instruction in the sequence. 22c67d6573Sopenharmony_ci /// 23c67d6573Sopenharmony_ci /// This is always length 1 unless this program represents a regex set. 24c67d6573Sopenharmony_ci pub matches: Vec<InstPtr>, 25c67d6573Sopenharmony_ci /// The ordered sequence of all capture groups extracted from the AST. 26c67d6573Sopenharmony_ci /// Unnamed groups are `None`. 27c67d6573Sopenharmony_ci pub captures: Vec<Option<String>>, 28c67d6573Sopenharmony_ci /// Pointers to all named capture groups into `captures`. 29c67d6573Sopenharmony_ci pub capture_name_idx: Arc<HashMap<String, usize>>, 30c67d6573Sopenharmony_ci /// A pointer to the start instruction. This can vary depending on how 31c67d6573Sopenharmony_ci /// the program was compiled. For example, programs for use with the DFA 32c67d6573Sopenharmony_ci /// engine have a `.*?` inserted at the beginning of unanchored regular 33c67d6573Sopenharmony_ci /// expressions. The actual starting point of the program is after the 34c67d6573Sopenharmony_ci /// `.*?`. 35c67d6573Sopenharmony_ci pub start: InstPtr, 36c67d6573Sopenharmony_ci /// A set of equivalence classes for discriminating bytes in the compiled 37c67d6573Sopenharmony_ci /// program. 38c67d6573Sopenharmony_ci pub byte_classes: Vec<u8>, 39c67d6573Sopenharmony_ci /// When true, this program can only match valid UTF-8. 40c67d6573Sopenharmony_ci pub only_utf8: bool, 41c67d6573Sopenharmony_ci /// When true, this program uses byte range instructions instead of Unicode 42c67d6573Sopenharmony_ci /// range instructions. 43c67d6573Sopenharmony_ci pub is_bytes: bool, 44c67d6573Sopenharmony_ci /// When true, the program is compiled for DFA matching. For example, this 45c67d6573Sopenharmony_ci /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored 46c67d6573Sopenharmony_ci /// regexes. 47c67d6573Sopenharmony_ci pub is_dfa: bool, 48c67d6573Sopenharmony_ci /// When true, the program matches text in reverse (for use only in the 49c67d6573Sopenharmony_ci /// DFA). 50c67d6573Sopenharmony_ci pub is_reverse: bool, 51c67d6573Sopenharmony_ci /// Whether the regex must match from the start of the input. 52c67d6573Sopenharmony_ci pub is_anchored_start: bool, 53c67d6573Sopenharmony_ci /// Whether the regex must match at the end of the input. 54c67d6573Sopenharmony_ci pub is_anchored_end: bool, 55c67d6573Sopenharmony_ci /// Whether this program contains a Unicode word boundary instruction. 56c67d6573Sopenharmony_ci pub has_unicode_word_boundary: bool, 57c67d6573Sopenharmony_ci /// A possibly empty machine for very quickly matching prefix literals. 58c67d6573Sopenharmony_ci pub prefixes: LiteralSearcher, 59c67d6573Sopenharmony_ci /// A limit on the size of the cache that the DFA is allowed to use while 60c67d6573Sopenharmony_ci /// matching. 61c67d6573Sopenharmony_ci /// 62c67d6573Sopenharmony_ci /// The cache limit specifies approximately how much space we're willing to 63c67d6573Sopenharmony_ci /// give to the state cache. Once the state cache exceeds the size, it is 64c67d6573Sopenharmony_ci /// wiped and all states must be re-computed. 65c67d6573Sopenharmony_ci /// 66c67d6573Sopenharmony_ci /// Note that this value does not impact correctness. It can be set to 0 67c67d6573Sopenharmony_ci /// and the DFA will run just fine. (It will only ever store exactly one 68c67d6573Sopenharmony_ci /// state in the cache, and will likely run very slowly, but it will work.) 69c67d6573Sopenharmony_ci /// 70c67d6573Sopenharmony_ci /// Also note that this limit is *per thread of execution*. That is, 71c67d6573Sopenharmony_ci /// if the same regex is used to search text across multiple threads 72c67d6573Sopenharmony_ci /// simultaneously, then the DFA cache is not shared. Instead, copies are 73c67d6573Sopenharmony_ci /// made. 74c67d6573Sopenharmony_ci pub dfa_size_limit: usize, 75c67d6573Sopenharmony_ci} 76c67d6573Sopenharmony_ci 77c67d6573Sopenharmony_ciimpl Program { 78c67d6573Sopenharmony_ci /// Creates an empty instruction sequence. Fields are given default 79c67d6573Sopenharmony_ci /// values. 80c67d6573Sopenharmony_ci pub fn new() -> Self { 81c67d6573Sopenharmony_ci Program { 82c67d6573Sopenharmony_ci insts: vec![], 83c67d6573Sopenharmony_ci matches: vec![], 84c67d6573Sopenharmony_ci captures: vec![], 85c67d6573Sopenharmony_ci capture_name_idx: Arc::new(HashMap::new()), 86c67d6573Sopenharmony_ci start: 0, 87c67d6573Sopenharmony_ci byte_classes: vec![0; 256], 88c67d6573Sopenharmony_ci only_utf8: true, 89c67d6573Sopenharmony_ci is_bytes: false, 90c67d6573Sopenharmony_ci is_dfa: false, 91c67d6573Sopenharmony_ci is_reverse: false, 92c67d6573Sopenharmony_ci is_anchored_start: false, 93c67d6573Sopenharmony_ci is_anchored_end: false, 94c67d6573Sopenharmony_ci has_unicode_word_boundary: false, 95c67d6573Sopenharmony_ci prefixes: LiteralSearcher::empty(), 96c67d6573Sopenharmony_ci dfa_size_limit: 2 * (1 << 20), 97c67d6573Sopenharmony_ci } 98c67d6573Sopenharmony_ci } 99c67d6573Sopenharmony_ci 100c67d6573Sopenharmony_ci /// If pc is an index to a no-op instruction (like Save), then return the 101c67d6573Sopenharmony_ci /// next pc that is not a no-op instruction. 102c67d6573Sopenharmony_ci pub fn skip(&self, mut pc: usize) -> usize { 103c67d6573Sopenharmony_ci loop { 104c67d6573Sopenharmony_ci match self[pc] { 105c67d6573Sopenharmony_ci Inst::Save(ref i) => pc = i.goto, 106c67d6573Sopenharmony_ci _ => return pc, 107c67d6573Sopenharmony_ci } 108c67d6573Sopenharmony_ci } 109c67d6573Sopenharmony_ci } 110c67d6573Sopenharmony_ci 111c67d6573Sopenharmony_ci /// Return true if and only if an execution engine at instruction `pc` will 112c67d6573Sopenharmony_ci /// always lead to a match. 113c67d6573Sopenharmony_ci pub fn leads_to_match(&self, pc: usize) -> bool { 114c67d6573Sopenharmony_ci if self.matches.len() > 1 { 115c67d6573Sopenharmony_ci // If we have a regex set, then we have more than one ending 116c67d6573Sopenharmony_ci // state, so leading to one of those states is generally 117c67d6573Sopenharmony_ci // meaningless. 118c67d6573Sopenharmony_ci return false; 119c67d6573Sopenharmony_ci } 120c67d6573Sopenharmony_ci match self[self.skip(pc)] { 121c67d6573Sopenharmony_ci Inst::Match(_) => true, 122c67d6573Sopenharmony_ci _ => false, 123c67d6573Sopenharmony_ci } 124c67d6573Sopenharmony_ci } 125c67d6573Sopenharmony_ci 126c67d6573Sopenharmony_ci /// Returns true if the current configuration demands that an implicit 127c67d6573Sopenharmony_ci /// `.*?` be prepended to the instruction sequence. 128c67d6573Sopenharmony_ci pub fn needs_dotstar(&self) -> bool { 129c67d6573Sopenharmony_ci self.is_dfa && !self.is_reverse && !self.is_anchored_start 130c67d6573Sopenharmony_ci } 131c67d6573Sopenharmony_ci 132c67d6573Sopenharmony_ci /// Returns true if this program uses Byte instructions instead of 133c67d6573Sopenharmony_ci /// Char/Range instructions. 134c67d6573Sopenharmony_ci pub fn uses_bytes(&self) -> bool { 135c67d6573Sopenharmony_ci self.is_bytes || self.is_dfa 136c67d6573Sopenharmony_ci } 137c67d6573Sopenharmony_ci 138c67d6573Sopenharmony_ci /// Returns true if this program exclusively matches valid UTF-8 bytes. 139c67d6573Sopenharmony_ci /// 140c67d6573Sopenharmony_ci /// That is, if an invalid UTF-8 byte is seen, then no match is possible. 141c67d6573Sopenharmony_ci pub fn only_utf8(&self) -> bool { 142c67d6573Sopenharmony_ci self.only_utf8 143c67d6573Sopenharmony_ci } 144c67d6573Sopenharmony_ci 145c67d6573Sopenharmony_ci /// Return the approximate heap usage of this instruction sequence in 146c67d6573Sopenharmony_ci /// bytes. 147c67d6573Sopenharmony_ci pub fn approximate_size(&self) -> usize { 148c67d6573Sopenharmony_ci // The only instruction that uses heap space is Ranges (for 149c67d6573Sopenharmony_ci // Unicode codepoint programs) to store non-overlapping codepoint 150c67d6573Sopenharmony_ci // ranges. To keep this operation constant time, we ignore them. 151c67d6573Sopenharmony_ci (self.len() * mem::size_of::<Inst>()) 152c67d6573Sopenharmony_ci + (self.matches.len() * mem::size_of::<InstPtr>()) 153c67d6573Sopenharmony_ci + (self.captures.len() * mem::size_of::<Option<String>>()) 154c67d6573Sopenharmony_ci + (self.capture_name_idx.len() 155c67d6573Sopenharmony_ci * (mem::size_of::<String>() + mem::size_of::<usize>())) 156c67d6573Sopenharmony_ci + (self.byte_classes.len() * mem::size_of::<u8>()) 157c67d6573Sopenharmony_ci + self.prefixes.approximate_size() 158c67d6573Sopenharmony_ci } 159c67d6573Sopenharmony_ci} 160c67d6573Sopenharmony_ci 161c67d6573Sopenharmony_ciimpl Deref for Program { 162c67d6573Sopenharmony_ci type Target = [Inst]; 163c67d6573Sopenharmony_ci 164c67d6573Sopenharmony_ci #[cfg_attr(feature = "perf-inline", inline(always))] 165c67d6573Sopenharmony_ci fn deref(&self) -> &Self::Target { 166c67d6573Sopenharmony_ci &*self.insts 167c67d6573Sopenharmony_ci } 168c67d6573Sopenharmony_ci} 169c67d6573Sopenharmony_ci 170c67d6573Sopenharmony_ciimpl fmt::Debug for Program { 171c67d6573Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 172c67d6573Sopenharmony_ci use self::Inst::*; 173c67d6573Sopenharmony_ci 174c67d6573Sopenharmony_ci fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { 175c67d6573Sopenharmony_ci if goto == cur + 1 { 176c67d6573Sopenharmony_ci fmtd 177c67d6573Sopenharmony_ci } else { 178c67d6573Sopenharmony_ci format!("{} (goto: {})", fmtd, goto) 179c67d6573Sopenharmony_ci } 180c67d6573Sopenharmony_ci } 181c67d6573Sopenharmony_ci 182c67d6573Sopenharmony_ci fn visible_byte(b: u8) -> String { 183c67d6573Sopenharmony_ci use std::ascii::escape_default; 184c67d6573Sopenharmony_ci let escaped = escape_default(b).collect::<Vec<u8>>(); 185c67d6573Sopenharmony_ci String::from_utf8_lossy(&escaped).into_owned() 186c67d6573Sopenharmony_ci } 187c67d6573Sopenharmony_ci 188c67d6573Sopenharmony_ci for (pc, inst) in self.iter().enumerate() { 189c67d6573Sopenharmony_ci match *inst { 190c67d6573Sopenharmony_ci Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, 191c67d6573Sopenharmony_ci Save(ref inst) => { 192c67d6573Sopenharmony_ci let s = format!("{:04} Save({})", pc, inst.slot); 193c67d6573Sopenharmony_ci write!(f, "{}", with_goto(pc, inst.goto, s))?; 194c67d6573Sopenharmony_ci } 195c67d6573Sopenharmony_ci Split(ref inst) => { 196c67d6573Sopenharmony_ci write!( 197c67d6573Sopenharmony_ci f, 198c67d6573Sopenharmony_ci "{:04} Split({}, {})", 199c67d6573Sopenharmony_ci pc, inst.goto1, inst.goto2 200c67d6573Sopenharmony_ci )?; 201c67d6573Sopenharmony_ci } 202c67d6573Sopenharmony_ci EmptyLook(ref inst) => { 203c67d6573Sopenharmony_ci let s = format!("{:?}", inst.look); 204c67d6573Sopenharmony_ci write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; 205c67d6573Sopenharmony_ci } 206c67d6573Sopenharmony_ci Char(ref inst) => { 207c67d6573Sopenharmony_ci let s = format!("{:?}", inst.c); 208c67d6573Sopenharmony_ci write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; 209c67d6573Sopenharmony_ci } 210c67d6573Sopenharmony_ci Ranges(ref inst) => { 211c67d6573Sopenharmony_ci let ranges = inst 212c67d6573Sopenharmony_ci .ranges 213c67d6573Sopenharmony_ci .iter() 214c67d6573Sopenharmony_ci .map(|r| format!("{:?}-{:?}", r.0, r.1)) 215c67d6573Sopenharmony_ci .collect::<Vec<String>>() 216c67d6573Sopenharmony_ci .join(", "); 217c67d6573Sopenharmony_ci write!( 218c67d6573Sopenharmony_ci f, 219c67d6573Sopenharmony_ci "{:04} {}", 220c67d6573Sopenharmony_ci pc, 221c67d6573Sopenharmony_ci with_goto(pc, inst.goto, ranges) 222c67d6573Sopenharmony_ci )?; 223c67d6573Sopenharmony_ci } 224c67d6573Sopenharmony_ci Bytes(ref inst) => { 225c67d6573Sopenharmony_ci let s = format!( 226c67d6573Sopenharmony_ci "Bytes({}, {})", 227c67d6573Sopenharmony_ci visible_byte(inst.start), 228c67d6573Sopenharmony_ci visible_byte(inst.end) 229c67d6573Sopenharmony_ci ); 230c67d6573Sopenharmony_ci write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; 231c67d6573Sopenharmony_ci } 232c67d6573Sopenharmony_ci } 233c67d6573Sopenharmony_ci if pc == self.start { 234c67d6573Sopenharmony_ci write!(f, " (start)")?; 235c67d6573Sopenharmony_ci } 236c67d6573Sopenharmony_ci writeln!(f)?; 237c67d6573Sopenharmony_ci } 238c67d6573Sopenharmony_ci Ok(()) 239c67d6573Sopenharmony_ci } 240c67d6573Sopenharmony_ci} 241c67d6573Sopenharmony_ci 242c67d6573Sopenharmony_ciimpl<'a> IntoIterator for &'a Program { 243c67d6573Sopenharmony_ci type Item = &'a Inst; 244c67d6573Sopenharmony_ci type IntoIter = slice::Iter<'a, Inst>; 245c67d6573Sopenharmony_ci fn into_iter(self) -> Self::IntoIter { 246c67d6573Sopenharmony_ci self.iter() 247c67d6573Sopenharmony_ci } 248c67d6573Sopenharmony_ci} 249c67d6573Sopenharmony_ci 250c67d6573Sopenharmony_ci/// Inst is an instruction code in a Regex program. 251c67d6573Sopenharmony_ci/// 252c67d6573Sopenharmony_ci/// Regrettably, a regex program either contains Unicode codepoint 253c67d6573Sopenharmony_ci/// instructions (Char and Ranges) or it contains byte instructions (Bytes). 254c67d6573Sopenharmony_ci/// A regex program can never contain both. 255c67d6573Sopenharmony_ci/// 256c67d6573Sopenharmony_ci/// It would be worth investigating splitting this into two distinct types and 257c67d6573Sopenharmony_ci/// then figuring out how to make the matching engines polymorphic over those 258c67d6573Sopenharmony_ci/// types without sacrificing performance. 259c67d6573Sopenharmony_ci/// 260c67d6573Sopenharmony_ci/// Other than the benefit of moving invariants into the type system, another 261c67d6573Sopenharmony_ci/// benefit is the decreased size. If we remove the `Char` and `Ranges` 262c67d6573Sopenharmony_ci/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to 263c67d6573Sopenharmony_ci/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` 264c67d6573Sopenharmony_ci/// variant.) Given that byte based machines are typically much bigger than 265c67d6573Sopenharmony_ci/// their Unicode analogues (because they can decode UTF-8 directly), this ends 266c67d6573Sopenharmony_ci/// up being a pretty significant savings. 267c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 268c67d6573Sopenharmony_cipub enum Inst { 269c67d6573Sopenharmony_ci /// Match indicates that the program has reached a match state. 270c67d6573Sopenharmony_ci /// 271c67d6573Sopenharmony_ci /// The number in the match corresponds to the Nth logical regular 272c67d6573Sopenharmony_ci /// expression in this program. This index is always 0 for normal regex 273c67d6573Sopenharmony_ci /// programs. Values greater than 0 appear when compiling regex sets, and 274c67d6573Sopenharmony_ci /// each match instruction gets its own unique value. The value corresponds 275c67d6573Sopenharmony_ci /// to the Nth regex in the set. 276c67d6573Sopenharmony_ci Match(usize), 277c67d6573Sopenharmony_ci /// Save causes the program to save the current location of the input in 278c67d6573Sopenharmony_ci /// the slot indicated by InstSave. 279c67d6573Sopenharmony_ci Save(InstSave), 280c67d6573Sopenharmony_ci /// Split causes the program to diverge to one of two paths in the 281c67d6573Sopenharmony_ci /// program, preferring goto1 in InstSplit. 282c67d6573Sopenharmony_ci Split(InstSplit), 283c67d6573Sopenharmony_ci /// EmptyLook represents a zero-width assertion in a regex program. A 284c67d6573Sopenharmony_ci /// zero-width assertion does not consume any of the input text. 285c67d6573Sopenharmony_ci EmptyLook(InstEmptyLook), 286c67d6573Sopenharmony_ci /// Char requires the regex program to match the character in InstChar at 287c67d6573Sopenharmony_ci /// the current position in the input. 288c67d6573Sopenharmony_ci Char(InstChar), 289c67d6573Sopenharmony_ci /// Ranges requires the regex program to match the character at the current 290c67d6573Sopenharmony_ci /// position in the input with one of the ranges specified in InstRanges. 291c67d6573Sopenharmony_ci Ranges(InstRanges), 292c67d6573Sopenharmony_ci /// Bytes is like Ranges, except it expresses a single byte range. It is 293c67d6573Sopenharmony_ci /// used in conjunction with Split instructions to implement multi-byte 294c67d6573Sopenharmony_ci /// character classes. 295c67d6573Sopenharmony_ci Bytes(InstBytes), 296c67d6573Sopenharmony_ci} 297c67d6573Sopenharmony_ci 298c67d6573Sopenharmony_ciimpl Inst { 299c67d6573Sopenharmony_ci /// Returns true if and only if this is a match instruction. 300c67d6573Sopenharmony_ci pub fn is_match(&self) -> bool { 301c67d6573Sopenharmony_ci match *self { 302c67d6573Sopenharmony_ci Inst::Match(_) => true, 303c67d6573Sopenharmony_ci _ => false, 304c67d6573Sopenharmony_ci } 305c67d6573Sopenharmony_ci } 306c67d6573Sopenharmony_ci} 307c67d6573Sopenharmony_ci 308c67d6573Sopenharmony_ci/// Representation of the Save instruction. 309c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 310c67d6573Sopenharmony_cipub struct InstSave { 311c67d6573Sopenharmony_ci /// The next location to execute in the program. 312c67d6573Sopenharmony_ci pub goto: InstPtr, 313c67d6573Sopenharmony_ci /// The capture slot (there are two slots for every capture in a regex, 314c67d6573Sopenharmony_ci /// including the zeroth capture for the entire match). 315c67d6573Sopenharmony_ci pub slot: usize, 316c67d6573Sopenharmony_ci} 317c67d6573Sopenharmony_ci 318c67d6573Sopenharmony_ci/// Representation of the Split instruction. 319c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 320c67d6573Sopenharmony_cipub struct InstSplit { 321c67d6573Sopenharmony_ci /// The first instruction to try. A match resulting from following goto1 322c67d6573Sopenharmony_ci /// has precedence over a match resulting from following goto2. 323c67d6573Sopenharmony_ci pub goto1: InstPtr, 324c67d6573Sopenharmony_ci /// The second instruction to try. A match resulting from following goto1 325c67d6573Sopenharmony_ci /// has precedence over a match resulting from following goto2. 326c67d6573Sopenharmony_ci pub goto2: InstPtr, 327c67d6573Sopenharmony_ci} 328c67d6573Sopenharmony_ci 329c67d6573Sopenharmony_ci/// Representation of the `EmptyLook` instruction. 330c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 331c67d6573Sopenharmony_cipub struct InstEmptyLook { 332c67d6573Sopenharmony_ci /// The next location to execute in the program if this instruction 333c67d6573Sopenharmony_ci /// succeeds. 334c67d6573Sopenharmony_ci pub goto: InstPtr, 335c67d6573Sopenharmony_ci /// The type of zero-width assertion to check. 336c67d6573Sopenharmony_ci pub look: EmptyLook, 337c67d6573Sopenharmony_ci} 338c67d6573Sopenharmony_ci 339c67d6573Sopenharmony_ci/// The set of zero-width match instructions. 340c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, PartialEq, Eq)] 341c67d6573Sopenharmony_cipub enum EmptyLook { 342c67d6573Sopenharmony_ci /// Start of line or input. 343c67d6573Sopenharmony_ci StartLine, 344c67d6573Sopenharmony_ci /// End of line or input. 345c67d6573Sopenharmony_ci EndLine, 346c67d6573Sopenharmony_ci /// Start of input. 347c67d6573Sopenharmony_ci StartText, 348c67d6573Sopenharmony_ci /// End of input. 349c67d6573Sopenharmony_ci EndText, 350c67d6573Sopenharmony_ci /// Word character on one side and non-word character on other. 351c67d6573Sopenharmony_ci WordBoundary, 352c67d6573Sopenharmony_ci /// Word character on both sides or non-word character on both sides. 353c67d6573Sopenharmony_ci NotWordBoundary, 354c67d6573Sopenharmony_ci /// ASCII word boundary. 355c67d6573Sopenharmony_ci WordBoundaryAscii, 356c67d6573Sopenharmony_ci /// Not ASCII word boundary. 357c67d6573Sopenharmony_ci NotWordBoundaryAscii, 358c67d6573Sopenharmony_ci} 359c67d6573Sopenharmony_ci 360c67d6573Sopenharmony_ci/// Representation of the Char instruction. 361c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 362c67d6573Sopenharmony_cipub struct InstChar { 363c67d6573Sopenharmony_ci /// The next location to execute in the program if this instruction 364c67d6573Sopenharmony_ci /// succeeds. 365c67d6573Sopenharmony_ci pub goto: InstPtr, 366c67d6573Sopenharmony_ci /// The character to test. 367c67d6573Sopenharmony_ci pub c: char, 368c67d6573Sopenharmony_ci} 369c67d6573Sopenharmony_ci 370c67d6573Sopenharmony_ci/// Representation of the Ranges instruction. 371c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 372c67d6573Sopenharmony_cipub struct InstRanges { 373c67d6573Sopenharmony_ci /// The next location to execute in the program if this instruction 374c67d6573Sopenharmony_ci /// succeeds. 375c67d6573Sopenharmony_ci pub goto: InstPtr, 376c67d6573Sopenharmony_ci /// The set of Unicode scalar value ranges to test. 377c67d6573Sopenharmony_ci pub ranges: Box<[(char, char)]>, 378c67d6573Sopenharmony_ci} 379c67d6573Sopenharmony_ci 380c67d6573Sopenharmony_ciimpl InstRanges { 381c67d6573Sopenharmony_ci /// Tests whether the given input character matches this instruction. 382c67d6573Sopenharmony_ci pub fn matches(&self, c: Char) -> bool { 383c67d6573Sopenharmony_ci // This speeds up the `match_class_unicode` benchmark by checking 384c67d6573Sopenharmony_ci // some common cases quickly without binary search. e.g., Matching 385c67d6573Sopenharmony_ci // a Unicode class on predominantly ASCII text. 386c67d6573Sopenharmony_ci for r in self.ranges.iter().take(4) { 387c67d6573Sopenharmony_ci if c < r.0 { 388c67d6573Sopenharmony_ci return false; 389c67d6573Sopenharmony_ci } 390c67d6573Sopenharmony_ci if c <= r.1 { 391c67d6573Sopenharmony_ci return true; 392c67d6573Sopenharmony_ci } 393c67d6573Sopenharmony_ci } 394c67d6573Sopenharmony_ci self.ranges 395c67d6573Sopenharmony_ci .binary_search_by(|r| { 396c67d6573Sopenharmony_ci if r.1 < c { 397c67d6573Sopenharmony_ci Ordering::Less 398c67d6573Sopenharmony_ci } else if r.0 > c { 399c67d6573Sopenharmony_ci Ordering::Greater 400c67d6573Sopenharmony_ci } else { 401c67d6573Sopenharmony_ci Ordering::Equal 402c67d6573Sopenharmony_ci } 403c67d6573Sopenharmony_ci }) 404c67d6573Sopenharmony_ci .is_ok() 405c67d6573Sopenharmony_ci } 406c67d6573Sopenharmony_ci 407c67d6573Sopenharmony_ci /// Return the number of distinct characters represented by all of the 408c67d6573Sopenharmony_ci /// ranges. 409c67d6573Sopenharmony_ci pub fn num_chars(&self) -> usize { 410c67d6573Sopenharmony_ci self.ranges 411c67d6573Sopenharmony_ci .iter() 412c67d6573Sopenharmony_ci .map(|&(s, e)| 1 + (e as u32) - (s as u32)) 413c67d6573Sopenharmony_ci .sum::<u32>() as usize 414c67d6573Sopenharmony_ci } 415c67d6573Sopenharmony_ci} 416c67d6573Sopenharmony_ci 417c67d6573Sopenharmony_ci/// Representation of the Bytes instruction. 418c67d6573Sopenharmony_ci#[derive(Clone, Debug)] 419c67d6573Sopenharmony_cipub struct InstBytes { 420c67d6573Sopenharmony_ci /// The next location to execute in the program if this instruction 421c67d6573Sopenharmony_ci /// succeeds. 422c67d6573Sopenharmony_ci pub goto: InstPtr, 423c67d6573Sopenharmony_ci /// The start (inclusive) of this byte range. 424c67d6573Sopenharmony_ci pub start: u8, 425c67d6573Sopenharmony_ci /// The end (inclusive) of this byte range. 426c67d6573Sopenharmony_ci pub end: u8, 427c67d6573Sopenharmony_ci} 428c67d6573Sopenharmony_ci 429c67d6573Sopenharmony_ciimpl InstBytes { 430c67d6573Sopenharmony_ci /// Returns true if and only if the given byte is in this range. 431c67d6573Sopenharmony_ci pub fn matches(&self, byte: u8) -> bool { 432c67d6573Sopenharmony_ci self.start <= byte && byte <= self.end 433c67d6573Sopenharmony_ci } 434c67d6573Sopenharmony_ci} 435c67d6573Sopenharmony_ci 436c67d6573Sopenharmony_ci#[cfg(test)] 437c67d6573Sopenharmony_cimod test { 438c67d6573Sopenharmony_ci #[test] 439c67d6573Sopenharmony_ci #[cfg(target_pointer_width = "64")] 440c67d6573Sopenharmony_ci fn test_size_of_inst() { 441c67d6573Sopenharmony_ci use std::mem::size_of; 442c67d6573Sopenharmony_ci 443c67d6573Sopenharmony_ci use super::Inst; 444c67d6573Sopenharmony_ci 445c67d6573Sopenharmony_ci assert_eq!(32, size_of::<Inst>()); 446c67d6573Sopenharmony_ci } 447c67d6573Sopenharmony_ci} 448