1c67d6573Sopenharmony_ciuse std::cmp::Ordering;
2c67d6573Sopenharmony_ciuse std::collections::HashMap;
3c67d6573Sopenharmony_ciuse std::fmt;
4c67d6573Sopenharmony_ciuse std::mem;
5c67d6573Sopenharmony_ciuse std::ops::Deref;
6c67d6573Sopenharmony_ciuse std::slice;
7c67d6573Sopenharmony_ciuse std::sync::Arc;
8c67d6573Sopenharmony_ci
9c67d6573Sopenharmony_ciuse crate::input::Char;
10c67d6573Sopenharmony_ciuse crate::literal::LiteralSearcher;
11c67d6573Sopenharmony_ci
12c67d6573Sopenharmony_ci/// `InstPtr` represents the index of an instruction in a regex program.
13c67d6573Sopenharmony_cipub type InstPtr = usize;
14c67d6573Sopenharmony_ci
15c67d6573Sopenharmony_ci/// Program is a sequence of instructions and various facts about thos
16c67d6573Sopenharmony_ci/// instructions.
17c67d6573Sopenharmony_ci#[derive(Clone)]
18c67d6573Sopenharmony_cipub struct Program {
19c67d6573Sopenharmony_ci    /// A sequence of instructions that represents an NFA.
20c67d6573Sopenharmony_ci    pub insts: Vec<Inst>,
21c67d6573Sopenharmony_ci    /// Pointers to each Match instruction in the sequence.
22c67d6573Sopenharmony_ci    ///
23c67d6573Sopenharmony_ci    /// This is always length 1 unless this program represents a regex set.
24c67d6573Sopenharmony_ci    pub matches: Vec<InstPtr>,
25c67d6573Sopenharmony_ci    /// The ordered sequence of all capture groups extracted from the AST.
26c67d6573Sopenharmony_ci    /// Unnamed groups are `None`.
27c67d6573Sopenharmony_ci    pub captures: Vec<Option<String>>,
28c67d6573Sopenharmony_ci    /// Pointers to all named capture groups into `captures`.
29c67d6573Sopenharmony_ci    pub capture_name_idx: Arc<HashMap<String, usize>>,
30c67d6573Sopenharmony_ci    /// A pointer to the start instruction. This can vary depending on how
31c67d6573Sopenharmony_ci    /// the program was compiled. For example, programs for use with the DFA
32c67d6573Sopenharmony_ci    /// engine have a `.*?` inserted at the beginning of unanchored regular
33c67d6573Sopenharmony_ci    /// expressions. The actual starting point of the program is after the
34c67d6573Sopenharmony_ci    /// `.*?`.
35c67d6573Sopenharmony_ci    pub start: InstPtr,
36c67d6573Sopenharmony_ci    /// A set of equivalence classes for discriminating bytes in the compiled
37c67d6573Sopenharmony_ci    /// program.
38c67d6573Sopenharmony_ci    pub byte_classes: Vec<u8>,
39c67d6573Sopenharmony_ci    /// When true, this program can only match valid UTF-8.
40c67d6573Sopenharmony_ci    pub only_utf8: bool,
41c67d6573Sopenharmony_ci    /// When true, this program uses byte range instructions instead of Unicode
42c67d6573Sopenharmony_ci    /// range instructions.
43c67d6573Sopenharmony_ci    pub is_bytes: bool,
44c67d6573Sopenharmony_ci    /// When true, the program is compiled for DFA matching. For example, this
45c67d6573Sopenharmony_ci    /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
46c67d6573Sopenharmony_ci    /// regexes.
47c67d6573Sopenharmony_ci    pub is_dfa: bool,
48c67d6573Sopenharmony_ci    /// When true, the program matches text in reverse (for use only in the
49c67d6573Sopenharmony_ci    /// DFA).
50c67d6573Sopenharmony_ci    pub is_reverse: bool,
51c67d6573Sopenharmony_ci    /// Whether the regex must match from the start of the input.
52c67d6573Sopenharmony_ci    pub is_anchored_start: bool,
53c67d6573Sopenharmony_ci    /// Whether the regex must match at the end of the input.
54c67d6573Sopenharmony_ci    pub is_anchored_end: bool,
55c67d6573Sopenharmony_ci    /// Whether this program contains a Unicode word boundary instruction.
56c67d6573Sopenharmony_ci    pub has_unicode_word_boundary: bool,
57c67d6573Sopenharmony_ci    /// A possibly empty machine for very quickly matching prefix literals.
58c67d6573Sopenharmony_ci    pub prefixes: LiteralSearcher,
59c67d6573Sopenharmony_ci    /// A limit on the size of the cache that the DFA is allowed to use while
60c67d6573Sopenharmony_ci    /// matching.
61c67d6573Sopenharmony_ci    ///
62c67d6573Sopenharmony_ci    /// The cache limit specifies approximately how much space we're willing to
63c67d6573Sopenharmony_ci    /// give to the state cache. Once the state cache exceeds the size, it is
64c67d6573Sopenharmony_ci    /// wiped and all states must be re-computed.
65c67d6573Sopenharmony_ci    ///
66c67d6573Sopenharmony_ci    /// Note that this value does not impact correctness. It can be set to 0
67c67d6573Sopenharmony_ci    /// and the DFA will run just fine. (It will only ever store exactly one
68c67d6573Sopenharmony_ci    /// state in the cache, and will likely run very slowly, but it will work.)
69c67d6573Sopenharmony_ci    ///
70c67d6573Sopenharmony_ci    /// Also note that this limit is *per thread of execution*. That is,
71c67d6573Sopenharmony_ci    /// if the same regex is used to search text across multiple threads
72c67d6573Sopenharmony_ci    /// simultaneously, then the DFA cache is not shared. Instead, copies are
73c67d6573Sopenharmony_ci    /// made.
74c67d6573Sopenharmony_ci    pub dfa_size_limit: usize,
75c67d6573Sopenharmony_ci}
76c67d6573Sopenharmony_ci
77c67d6573Sopenharmony_ciimpl Program {
78c67d6573Sopenharmony_ci    /// Creates an empty instruction sequence. Fields are given default
79c67d6573Sopenharmony_ci    /// values.
80c67d6573Sopenharmony_ci    pub fn new() -> Self {
81c67d6573Sopenharmony_ci        Program {
82c67d6573Sopenharmony_ci            insts: vec![],
83c67d6573Sopenharmony_ci            matches: vec![],
84c67d6573Sopenharmony_ci            captures: vec![],
85c67d6573Sopenharmony_ci            capture_name_idx: Arc::new(HashMap::new()),
86c67d6573Sopenharmony_ci            start: 0,
87c67d6573Sopenharmony_ci            byte_classes: vec![0; 256],
88c67d6573Sopenharmony_ci            only_utf8: true,
89c67d6573Sopenharmony_ci            is_bytes: false,
90c67d6573Sopenharmony_ci            is_dfa: false,
91c67d6573Sopenharmony_ci            is_reverse: false,
92c67d6573Sopenharmony_ci            is_anchored_start: false,
93c67d6573Sopenharmony_ci            is_anchored_end: false,
94c67d6573Sopenharmony_ci            has_unicode_word_boundary: false,
95c67d6573Sopenharmony_ci            prefixes: LiteralSearcher::empty(),
96c67d6573Sopenharmony_ci            dfa_size_limit: 2 * (1 << 20),
97c67d6573Sopenharmony_ci        }
98c67d6573Sopenharmony_ci    }
99c67d6573Sopenharmony_ci
100c67d6573Sopenharmony_ci    /// If pc is an index to a no-op instruction (like Save), then return the
101c67d6573Sopenharmony_ci    /// next pc that is not a no-op instruction.
102c67d6573Sopenharmony_ci    pub fn skip(&self, mut pc: usize) -> usize {
103c67d6573Sopenharmony_ci        loop {
104c67d6573Sopenharmony_ci            match self[pc] {
105c67d6573Sopenharmony_ci                Inst::Save(ref i) => pc = i.goto,
106c67d6573Sopenharmony_ci                _ => return pc,
107c67d6573Sopenharmony_ci            }
108c67d6573Sopenharmony_ci        }
109c67d6573Sopenharmony_ci    }
110c67d6573Sopenharmony_ci
111c67d6573Sopenharmony_ci    /// Return true if and only if an execution engine at instruction `pc` will
112c67d6573Sopenharmony_ci    /// always lead to a match.
113c67d6573Sopenharmony_ci    pub fn leads_to_match(&self, pc: usize) -> bool {
114c67d6573Sopenharmony_ci        if self.matches.len() > 1 {
115c67d6573Sopenharmony_ci            // If we have a regex set, then we have more than one ending
116c67d6573Sopenharmony_ci            // state, so leading to one of those states is generally
117c67d6573Sopenharmony_ci            // meaningless.
118c67d6573Sopenharmony_ci            return false;
119c67d6573Sopenharmony_ci        }
120c67d6573Sopenharmony_ci        match self[self.skip(pc)] {
121c67d6573Sopenharmony_ci            Inst::Match(_) => true,
122c67d6573Sopenharmony_ci            _ => false,
123c67d6573Sopenharmony_ci        }
124c67d6573Sopenharmony_ci    }
125c67d6573Sopenharmony_ci
126c67d6573Sopenharmony_ci    /// Returns true if the current configuration demands that an implicit
127c67d6573Sopenharmony_ci    /// `.*?` be prepended to the instruction sequence.
128c67d6573Sopenharmony_ci    pub fn needs_dotstar(&self) -> bool {
129c67d6573Sopenharmony_ci        self.is_dfa && !self.is_reverse && !self.is_anchored_start
130c67d6573Sopenharmony_ci    }
131c67d6573Sopenharmony_ci
132c67d6573Sopenharmony_ci    /// Returns true if this program uses Byte instructions instead of
133c67d6573Sopenharmony_ci    /// Char/Range instructions.
134c67d6573Sopenharmony_ci    pub fn uses_bytes(&self) -> bool {
135c67d6573Sopenharmony_ci        self.is_bytes || self.is_dfa
136c67d6573Sopenharmony_ci    }
137c67d6573Sopenharmony_ci
138c67d6573Sopenharmony_ci    /// Returns true if this program exclusively matches valid UTF-8 bytes.
139c67d6573Sopenharmony_ci    ///
140c67d6573Sopenharmony_ci    /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
141c67d6573Sopenharmony_ci    pub fn only_utf8(&self) -> bool {
142c67d6573Sopenharmony_ci        self.only_utf8
143c67d6573Sopenharmony_ci    }
144c67d6573Sopenharmony_ci
145c67d6573Sopenharmony_ci    /// Return the approximate heap usage of this instruction sequence in
146c67d6573Sopenharmony_ci    /// bytes.
147c67d6573Sopenharmony_ci    pub fn approximate_size(&self) -> usize {
148c67d6573Sopenharmony_ci        // The only instruction that uses heap space is Ranges (for
149c67d6573Sopenharmony_ci        // Unicode codepoint programs) to store non-overlapping codepoint
150c67d6573Sopenharmony_ci        // ranges. To keep this operation constant time, we ignore them.
151c67d6573Sopenharmony_ci        (self.len() * mem::size_of::<Inst>())
152c67d6573Sopenharmony_ci            + (self.matches.len() * mem::size_of::<InstPtr>())
153c67d6573Sopenharmony_ci            + (self.captures.len() * mem::size_of::<Option<String>>())
154c67d6573Sopenharmony_ci            + (self.capture_name_idx.len()
155c67d6573Sopenharmony_ci                * (mem::size_of::<String>() + mem::size_of::<usize>()))
156c67d6573Sopenharmony_ci            + (self.byte_classes.len() * mem::size_of::<u8>())
157c67d6573Sopenharmony_ci            + self.prefixes.approximate_size()
158c67d6573Sopenharmony_ci    }
159c67d6573Sopenharmony_ci}
160c67d6573Sopenharmony_ci
161c67d6573Sopenharmony_ciimpl Deref for Program {
162c67d6573Sopenharmony_ci    type Target = [Inst];
163c67d6573Sopenharmony_ci
164c67d6573Sopenharmony_ci    #[cfg_attr(feature = "perf-inline", inline(always))]
165c67d6573Sopenharmony_ci    fn deref(&self) -> &Self::Target {
166c67d6573Sopenharmony_ci        &*self.insts
167c67d6573Sopenharmony_ci    }
168c67d6573Sopenharmony_ci}
169c67d6573Sopenharmony_ci
170c67d6573Sopenharmony_ciimpl fmt::Debug for Program {
171c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
172c67d6573Sopenharmony_ci        use self::Inst::*;
173c67d6573Sopenharmony_ci
174c67d6573Sopenharmony_ci        fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
175c67d6573Sopenharmony_ci            if goto == cur + 1 {
176c67d6573Sopenharmony_ci                fmtd
177c67d6573Sopenharmony_ci            } else {
178c67d6573Sopenharmony_ci                format!("{} (goto: {})", fmtd, goto)
179c67d6573Sopenharmony_ci            }
180c67d6573Sopenharmony_ci        }
181c67d6573Sopenharmony_ci
182c67d6573Sopenharmony_ci        fn visible_byte(b: u8) -> String {
183c67d6573Sopenharmony_ci            use std::ascii::escape_default;
184c67d6573Sopenharmony_ci            let escaped = escape_default(b).collect::<Vec<u8>>();
185c67d6573Sopenharmony_ci            String::from_utf8_lossy(&escaped).into_owned()
186c67d6573Sopenharmony_ci        }
187c67d6573Sopenharmony_ci
188c67d6573Sopenharmony_ci        for (pc, inst) in self.iter().enumerate() {
189c67d6573Sopenharmony_ci            match *inst {
190c67d6573Sopenharmony_ci                Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
191c67d6573Sopenharmony_ci                Save(ref inst) => {
192c67d6573Sopenharmony_ci                    let s = format!("{:04} Save({})", pc, inst.slot);
193c67d6573Sopenharmony_ci                    write!(f, "{}", with_goto(pc, inst.goto, s))?;
194c67d6573Sopenharmony_ci                }
195c67d6573Sopenharmony_ci                Split(ref inst) => {
196c67d6573Sopenharmony_ci                    write!(
197c67d6573Sopenharmony_ci                        f,
198c67d6573Sopenharmony_ci                        "{:04} Split({}, {})",
199c67d6573Sopenharmony_ci                        pc, inst.goto1, inst.goto2
200c67d6573Sopenharmony_ci                    )?;
201c67d6573Sopenharmony_ci                }
202c67d6573Sopenharmony_ci                EmptyLook(ref inst) => {
203c67d6573Sopenharmony_ci                    let s = format!("{:?}", inst.look);
204c67d6573Sopenharmony_ci                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
205c67d6573Sopenharmony_ci                }
206c67d6573Sopenharmony_ci                Char(ref inst) => {
207c67d6573Sopenharmony_ci                    let s = format!("{:?}", inst.c);
208c67d6573Sopenharmony_ci                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
209c67d6573Sopenharmony_ci                }
210c67d6573Sopenharmony_ci                Ranges(ref inst) => {
211c67d6573Sopenharmony_ci                    let ranges = inst
212c67d6573Sopenharmony_ci                        .ranges
213c67d6573Sopenharmony_ci                        .iter()
214c67d6573Sopenharmony_ci                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
215c67d6573Sopenharmony_ci                        .collect::<Vec<String>>()
216c67d6573Sopenharmony_ci                        .join(", ");
217c67d6573Sopenharmony_ci                    write!(
218c67d6573Sopenharmony_ci                        f,
219c67d6573Sopenharmony_ci                        "{:04} {}",
220c67d6573Sopenharmony_ci                        pc,
221c67d6573Sopenharmony_ci                        with_goto(pc, inst.goto, ranges)
222c67d6573Sopenharmony_ci                    )?;
223c67d6573Sopenharmony_ci                }
224c67d6573Sopenharmony_ci                Bytes(ref inst) => {
225c67d6573Sopenharmony_ci                    let s = format!(
226c67d6573Sopenharmony_ci                        "Bytes({}, {})",
227c67d6573Sopenharmony_ci                        visible_byte(inst.start),
228c67d6573Sopenharmony_ci                        visible_byte(inst.end)
229c67d6573Sopenharmony_ci                    );
230c67d6573Sopenharmony_ci                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
231c67d6573Sopenharmony_ci                }
232c67d6573Sopenharmony_ci            }
233c67d6573Sopenharmony_ci            if pc == self.start {
234c67d6573Sopenharmony_ci                write!(f, " (start)")?;
235c67d6573Sopenharmony_ci            }
236c67d6573Sopenharmony_ci            writeln!(f)?;
237c67d6573Sopenharmony_ci        }
238c67d6573Sopenharmony_ci        Ok(())
239c67d6573Sopenharmony_ci    }
240c67d6573Sopenharmony_ci}
241c67d6573Sopenharmony_ci
242c67d6573Sopenharmony_ciimpl<'a> IntoIterator for &'a Program {
243c67d6573Sopenharmony_ci    type Item = &'a Inst;
244c67d6573Sopenharmony_ci    type IntoIter = slice::Iter<'a, Inst>;
245c67d6573Sopenharmony_ci    fn into_iter(self) -> Self::IntoIter {
246c67d6573Sopenharmony_ci        self.iter()
247c67d6573Sopenharmony_ci    }
248c67d6573Sopenharmony_ci}
249c67d6573Sopenharmony_ci
250c67d6573Sopenharmony_ci/// Inst is an instruction code in a Regex program.
251c67d6573Sopenharmony_ci///
252c67d6573Sopenharmony_ci/// Regrettably, a regex program either contains Unicode codepoint
253c67d6573Sopenharmony_ci/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
254c67d6573Sopenharmony_ci/// A regex program can never contain both.
255c67d6573Sopenharmony_ci///
256c67d6573Sopenharmony_ci/// It would be worth investigating splitting this into two distinct types and
257c67d6573Sopenharmony_ci/// then figuring out how to make the matching engines polymorphic over those
258c67d6573Sopenharmony_ci/// types without sacrificing performance.
259c67d6573Sopenharmony_ci///
260c67d6573Sopenharmony_ci/// Other than the benefit of moving invariants into the type system, another
261c67d6573Sopenharmony_ci/// benefit is the decreased size. If we remove the `Char` and `Ranges`
262c67d6573Sopenharmony_ci/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
263c67d6573Sopenharmony_ci/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
264c67d6573Sopenharmony_ci/// variant.) Given that byte based machines are typically much bigger than
265c67d6573Sopenharmony_ci/// their Unicode analogues (because they can decode UTF-8 directly), this ends
266c67d6573Sopenharmony_ci/// up being a pretty significant savings.
267c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
268c67d6573Sopenharmony_cipub enum Inst {
269c67d6573Sopenharmony_ci    /// Match indicates that the program has reached a match state.
270c67d6573Sopenharmony_ci    ///
271c67d6573Sopenharmony_ci    /// The number in the match corresponds to the Nth logical regular
272c67d6573Sopenharmony_ci    /// expression in this program. This index is always 0 for normal regex
273c67d6573Sopenharmony_ci    /// programs. Values greater than 0 appear when compiling regex sets, and
274c67d6573Sopenharmony_ci    /// each match instruction gets its own unique value. The value corresponds
275c67d6573Sopenharmony_ci    /// to the Nth regex in the set.
276c67d6573Sopenharmony_ci    Match(usize),
277c67d6573Sopenharmony_ci    /// Save causes the program to save the current location of the input in
278c67d6573Sopenharmony_ci    /// the slot indicated by InstSave.
279c67d6573Sopenharmony_ci    Save(InstSave),
280c67d6573Sopenharmony_ci    /// Split causes the program to diverge to one of two paths in the
281c67d6573Sopenharmony_ci    /// program, preferring goto1 in InstSplit.
282c67d6573Sopenharmony_ci    Split(InstSplit),
283c67d6573Sopenharmony_ci    /// EmptyLook represents a zero-width assertion in a regex program. A
284c67d6573Sopenharmony_ci    /// zero-width assertion does not consume any of the input text.
285c67d6573Sopenharmony_ci    EmptyLook(InstEmptyLook),
286c67d6573Sopenharmony_ci    /// Char requires the regex program to match the character in InstChar at
287c67d6573Sopenharmony_ci    /// the current position in the input.
288c67d6573Sopenharmony_ci    Char(InstChar),
289c67d6573Sopenharmony_ci    /// Ranges requires the regex program to match the character at the current
290c67d6573Sopenharmony_ci    /// position in the input with one of the ranges specified in InstRanges.
291c67d6573Sopenharmony_ci    Ranges(InstRanges),
292c67d6573Sopenharmony_ci    /// Bytes is like Ranges, except it expresses a single byte range. It is
293c67d6573Sopenharmony_ci    /// used in conjunction with Split instructions to implement multi-byte
294c67d6573Sopenharmony_ci    /// character classes.
295c67d6573Sopenharmony_ci    Bytes(InstBytes),
296c67d6573Sopenharmony_ci}
297c67d6573Sopenharmony_ci
298c67d6573Sopenharmony_ciimpl Inst {
299c67d6573Sopenharmony_ci    /// Returns true if and only if this is a match instruction.
300c67d6573Sopenharmony_ci    pub fn is_match(&self) -> bool {
301c67d6573Sopenharmony_ci        match *self {
302c67d6573Sopenharmony_ci            Inst::Match(_) => true,
303c67d6573Sopenharmony_ci            _ => false,
304c67d6573Sopenharmony_ci        }
305c67d6573Sopenharmony_ci    }
306c67d6573Sopenharmony_ci}
307c67d6573Sopenharmony_ci
308c67d6573Sopenharmony_ci/// Representation of the Save instruction.
309c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
310c67d6573Sopenharmony_cipub struct InstSave {
311c67d6573Sopenharmony_ci    /// The next location to execute in the program.
312c67d6573Sopenharmony_ci    pub goto: InstPtr,
313c67d6573Sopenharmony_ci    /// The capture slot (there are two slots for every capture in a regex,
314c67d6573Sopenharmony_ci    /// including the zeroth capture for the entire match).
315c67d6573Sopenharmony_ci    pub slot: usize,
316c67d6573Sopenharmony_ci}
317c67d6573Sopenharmony_ci
318c67d6573Sopenharmony_ci/// Representation of the Split instruction.
319c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
320c67d6573Sopenharmony_cipub struct InstSplit {
321c67d6573Sopenharmony_ci    /// The first instruction to try. A match resulting from following goto1
322c67d6573Sopenharmony_ci    /// has precedence over a match resulting from following goto2.
323c67d6573Sopenharmony_ci    pub goto1: InstPtr,
324c67d6573Sopenharmony_ci    /// The second instruction to try. A match resulting from following goto1
325c67d6573Sopenharmony_ci    /// has precedence over a match resulting from following goto2.
326c67d6573Sopenharmony_ci    pub goto2: InstPtr,
327c67d6573Sopenharmony_ci}
328c67d6573Sopenharmony_ci
329c67d6573Sopenharmony_ci/// Representation of the `EmptyLook` instruction.
330c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
331c67d6573Sopenharmony_cipub struct InstEmptyLook {
332c67d6573Sopenharmony_ci    /// The next location to execute in the program if this instruction
333c67d6573Sopenharmony_ci    /// succeeds.
334c67d6573Sopenharmony_ci    pub goto: InstPtr,
335c67d6573Sopenharmony_ci    /// The type of zero-width assertion to check.
336c67d6573Sopenharmony_ci    pub look: EmptyLook,
337c67d6573Sopenharmony_ci}
338c67d6573Sopenharmony_ci
339c67d6573Sopenharmony_ci/// The set of zero-width match instructions.
340c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, PartialEq, Eq)]
341c67d6573Sopenharmony_cipub enum EmptyLook {
342c67d6573Sopenharmony_ci    /// Start of line or input.
343c67d6573Sopenharmony_ci    StartLine,
344c67d6573Sopenharmony_ci    /// End of line or input.
345c67d6573Sopenharmony_ci    EndLine,
346c67d6573Sopenharmony_ci    /// Start of input.
347c67d6573Sopenharmony_ci    StartText,
348c67d6573Sopenharmony_ci    /// End of input.
349c67d6573Sopenharmony_ci    EndText,
350c67d6573Sopenharmony_ci    /// Word character on one side and non-word character on other.
351c67d6573Sopenharmony_ci    WordBoundary,
352c67d6573Sopenharmony_ci    /// Word character on both sides or non-word character on both sides.
353c67d6573Sopenharmony_ci    NotWordBoundary,
354c67d6573Sopenharmony_ci    /// ASCII word boundary.
355c67d6573Sopenharmony_ci    WordBoundaryAscii,
356c67d6573Sopenharmony_ci    /// Not ASCII word boundary.
357c67d6573Sopenharmony_ci    NotWordBoundaryAscii,
358c67d6573Sopenharmony_ci}
359c67d6573Sopenharmony_ci
360c67d6573Sopenharmony_ci/// Representation of the Char instruction.
361c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
362c67d6573Sopenharmony_cipub struct InstChar {
363c67d6573Sopenharmony_ci    /// The next location to execute in the program if this instruction
364c67d6573Sopenharmony_ci    /// succeeds.
365c67d6573Sopenharmony_ci    pub goto: InstPtr,
366c67d6573Sopenharmony_ci    /// The character to test.
367c67d6573Sopenharmony_ci    pub c: char,
368c67d6573Sopenharmony_ci}
369c67d6573Sopenharmony_ci
370c67d6573Sopenharmony_ci/// Representation of the Ranges instruction.
371c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
372c67d6573Sopenharmony_cipub struct InstRanges {
373c67d6573Sopenharmony_ci    /// The next location to execute in the program if this instruction
374c67d6573Sopenharmony_ci    /// succeeds.
375c67d6573Sopenharmony_ci    pub goto: InstPtr,
376c67d6573Sopenharmony_ci    /// The set of Unicode scalar value ranges to test.
377c67d6573Sopenharmony_ci    pub ranges: Box<[(char, char)]>,
378c67d6573Sopenharmony_ci}
379c67d6573Sopenharmony_ci
380c67d6573Sopenharmony_ciimpl InstRanges {
381c67d6573Sopenharmony_ci    /// Tests whether the given input character matches this instruction.
382c67d6573Sopenharmony_ci    pub fn matches(&self, c: Char) -> bool {
383c67d6573Sopenharmony_ci        // This speeds up the `match_class_unicode` benchmark by checking
384c67d6573Sopenharmony_ci        // some common cases quickly without binary search. e.g., Matching
385c67d6573Sopenharmony_ci        // a Unicode class on predominantly ASCII text.
386c67d6573Sopenharmony_ci        for r in self.ranges.iter().take(4) {
387c67d6573Sopenharmony_ci            if c < r.0 {
388c67d6573Sopenharmony_ci                return false;
389c67d6573Sopenharmony_ci            }
390c67d6573Sopenharmony_ci            if c <= r.1 {
391c67d6573Sopenharmony_ci                return true;
392c67d6573Sopenharmony_ci            }
393c67d6573Sopenharmony_ci        }
394c67d6573Sopenharmony_ci        self.ranges
395c67d6573Sopenharmony_ci            .binary_search_by(|r| {
396c67d6573Sopenharmony_ci                if r.1 < c {
397c67d6573Sopenharmony_ci                    Ordering::Less
398c67d6573Sopenharmony_ci                } else if r.0 > c {
399c67d6573Sopenharmony_ci                    Ordering::Greater
400c67d6573Sopenharmony_ci                } else {
401c67d6573Sopenharmony_ci                    Ordering::Equal
402c67d6573Sopenharmony_ci                }
403c67d6573Sopenharmony_ci            })
404c67d6573Sopenharmony_ci            .is_ok()
405c67d6573Sopenharmony_ci    }
406c67d6573Sopenharmony_ci
407c67d6573Sopenharmony_ci    /// Return the number of distinct characters represented by all of the
408c67d6573Sopenharmony_ci    /// ranges.
409c67d6573Sopenharmony_ci    pub fn num_chars(&self) -> usize {
410c67d6573Sopenharmony_ci        self.ranges
411c67d6573Sopenharmony_ci            .iter()
412c67d6573Sopenharmony_ci            .map(|&(s, e)| 1 + (e as u32) - (s as u32))
413c67d6573Sopenharmony_ci            .sum::<u32>() as usize
414c67d6573Sopenharmony_ci    }
415c67d6573Sopenharmony_ci}
416c67d6573Sopenharmony_ci
417c67d6573Sopenharmony_ci/// Representation of the Bytes instruction.
418c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
419c67d6573Sopenharmony_cipub struct InstBytes {
420c67d6573Sopenharmony_ci    /// The next location to execute in the program if this instruction
421c67d6573Sopenharmony_ci    /// succeeds.
422c67d6573Sopenharmony_ci    pub goto: InstPtr,
423c67d6573Sopenharmony_ci    /// The start (inclusive) of this byte range.
424c67d6573Sopenharmony_ci    pub start: u8,
425c67d6573Sopenharmony_ci    /// The end (inclusive) of this byte range.
426c67d6573Sopenharmony_ci    pub end: u8,
427c67d6573Sopenharmony_ci}
428c67d6573Sopenharmony_ci
429c67d6573Sopenharmony_ciimpl InstBytes {
430c67d6573Sopenharmony_ci    /// Returns true if and only if the given byte is in this range.
431c67d6573Sopenharmony_ci    pub fn matches(&self, byte: u8) -> bool {
432c67d6573Sopenharmony_ci        self.start <= byte && byte <= self.end
433c67d6573Sopenharmony_ci    }
434c67d6573Sopenharmony_ci}
435c67d6573Sopenharmony_ci
436c67d6573Sopenharmony_ci#[cfg(test)]
437c67d6573Sopenharmony_cimod test {
438c67d6573Sopenharmony_ci    #[test]
439c67d6573Sopenharmony_ci    #[cfg(target_pointer_width = "64")]
440c67d6573Sopenharmony_ci    fn test_size_of_inst() {
441c67d6573Sopenharmony_ci        use std::mem::size_of;
442c67d6573Sopenharmony_ci
443c67d6573Sopenharmony_ci        use super::Inst;
444c67d6573Sopenharmony_ci
445c67d6573Sopenharmony_ci        assert_eq!(32, size_of::<Inst>());
446c67d6573Sopenharmony_ci    }
447c67d6573Sopenharmony_ci}
448