1c67d6573Sopenharmony_ciuse std::collections::HashMap;
2c67d6573Sopenharmony_ciuse std::fmt;
3c67d6573Sopenharmony_ciuse std::iter;
4c67d6573Sopenharmony_ciuse std::result;
5c67d6573Sopenharmony_ciuse std::sync::Arc;
6c67d6573Sopenharmony_ci
7c67d6573Sopenharmony_ciuse regex_syntax::hir::{self, Hir};
8c67d6573Sopenharmony_ciuse regex_syntax::is_word_byte;
9c67d6573Sopenharmony_ciuse regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10c67d6573Sopenharmony_ci
11c67d6573Sopenharmony_ciuse crate::prog::{
12c67d6573Sopenharmony_ci    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13c67d6573Sopenharmony_ci    InstSave, InstSplit, Program,
14c67d6573Sopenharmony_ci};
15c67d6573Sopenharmony_ci
16c67d6573Sopenharmony_ciuse crate::Error;
17c67d6573Sopenharmony_ci
18c67d6573Sopenharmony_citype Result = result::Result<Patch, Error>;
19c67d6573Sopenharmony_citype ResultOrEmpty = result::Result<Option<Patch>, Error>;
20c67d6573Sopenharmony_ci
21c67d6573Sopenharmony_ci#[derive(Debug)]
22c67d6573Sopenharmony_cistruct Patch {
23c67d6573Sopenharmony_ci    hole: Hole,
24c67d6573Sopenharmony_ci    entry: InstPtr,
25c67d6573Sopenharmony_ci}
26c67d6573Sopenharmony_ci
27c67d6573Sopenharmony_ci/// A compiler translates a regular expression AST to a sequence of
28c67d6573Sopenharmony_ci/// instructions. The sequence of instructions represents an NFA.
29c67d6573Sopenharmony_ci// `Compiler` is only public via the `internal` module, so avoid deriving
30c67d6573Sopenharmony_ci// `Debug`.
31c67d6573Sopenharmony_ci#[allow(missing_debug_implementations)]
32c67d6573Sopenharmony_cipub struct Compiler {
33c67d6573Sopenharmony_ci    insts: Vec<MaybeInst>,
34c67d6573Sopenharmony_ci    compiled: Program,
35c67d6573Sopenharmony_ci    capture_name_idx: HashMap<String, usize>,
36c67d6573Sopenharmony_ci    num_exprs: usize,
37c67d6573Sopenharmony_ci    size_limit: usize,
38c67d6573Sopenharmony_ci    suffix_cache: SuffixCache,
39c67d6573Sopenharmony_ci    utf8_seqs: Option<Utf8Sequences>,
40c67d6573Sopenharmony_ci    byte_classes: ByteClassSet,
41c67d6573Sopenharmony_ci    // This keeps track of extra bytes allocated while compiling the regex
42c67d6573Sopenharmony_ci    // program. Currently, this corresponds to two things. First is the heap
43c67d6573Sopenharmony_ci    // memory allocated by Unicode character classes ('InstRanges'). Second is
44c67d6573Sopenharmony_ci    // a "fake" amount of memory used by empty sub-expressions, so that enough
45c67d6573Sopenharmony_ci    // empty sub-expressions will ultimately trigger the compiler to bail
46c67d6573Sopenharmony_ci    // because of a size limit restriction. (That empty sub-expressions don't
47c67d6573Sopenharmony_ci    // add to heap memory usage is more-or-less an implementation detail.) In
48c67d6573Sopenharmony_ci    // the second case, if we don't bail, then an excessively large repetition
49c67d6573Sopenharmony_ci    // on an empty sub-expression can result in the compiler using a very large
50c67d6573Sopenharmony_ci    // amount of CPU time.
51c67d6573Sopenharmony_ci    extra_inst_bytes: usize,
52c67d6573Sopenharmony_ci}
53c67d6573Sopenharmony_ci
54c67d6573Sopenharmony_ciimpl Compiler {
55c67d6573Sopenharmony_ci    /// Create a new regular expression compiler.
56c67d6573Sopenharmony_ci    ///
57c67d6573Sopenharmony_ci    /// Various options can be set before calling `compile` on an expression.
58c67d6573Sopenharmony_ci    pub fn new() -> Self {
59c67d6573Sopenharmony_ci        Compiler {
60c67d6573Sopenharmony_ci            insts: vec![],
61c67d6573Sopenharmony_ci            compiled: Program::new(),
62c67d6573Sopenharmony_ci            capture_name_idx: HashMap::new(),
63c67d6573Sopenharmony_ci            num_exprs: 0,
64c67d6573Sopenharmony_ci            size_limit: 10 * (1 << 20),
65c67d6573Sopenharmony_ci            suffix_cache: SuffixCache::new(1000),
66c67d6573Sopenharmony_ci            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
67c67d6573Sopenharmony_ci            byte_classes: ByteClassSet::new(),
68c67d6573Sopenharmony_ci            extra_inst_bytes: 0,
69c67d6573Sopenharmony_ci        }
70c67d6573Sopenharmony_ci    }
71c67d6573Sopenharmony_ci
72c67d6573Sopenharmony_ci    /// The size of the resulting program is limited by size_limit. If
73c67d6573Sopenharmony_ci    /// the program approximately exceeds the given size (in bytes), then
74c67d6573Sopenharmony_ci    /// compilation will stop and return an error.
75c67d6573Sopenharmony_ci    pub fn size_limit(mut self, size_limit: usize) -> Self {
76c67d6573Sopenharmony_ci        self.size_limit = size_limit;
77c67d6573Sopenharmony_ci        self
78c67d6573Sopenharmony_ci    }
79c67d6573Sopenharmony_ci
80c67d6573Sopenharmony_ci    /// If bytes is true, then the program is compiled as a byte based
81c67d6573Sopenharmony_ci    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
82c67d6573Sopenharmony_ci    /// false, then the automaton is Unicode scalar value based, e.g., an
83c67d6573Sopenharmony_ci    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
84c67d6573Sopenharmony_ci    ///
85c67d6573Sopenharmony_ci    /// The specific invariant is that when returning a byte based machine,
86c67d6573Sopenharmony_ci    /// the neither the `Char` nor `Ranges` instructions are produced.
87c67d6573Sopenharmony_ci    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88c67d6573Sopenharmony_ci    /// instruction is never produced.
89c67d6573Sopenharmony_ci    ///
90c67d6573Sopenharmony_ci    /// Note that `dfa(true)` implies `bytes(true)`.
91c67d6573Sopenharmony_ci    pub fn bytes(mut self, yes: bool) -> Self {
92c67d6573Sopenharmony_ci        self.compiled.is_bytes = yes;
93c67d6573Sopenharmony_ci        self
94c67d6573Sopenharmony_ci    }
95c67d6573Sopenharmony_ci
96c67d6573Sopenharmony_ci    /// When disabled, the program compiled may match arbitrary bytes.
97c67d6573Sopenharmony_ci    ///
98c67d6573Sopenharmony_ci    /// When enabled (the default), all compiled programs exclusively match
99c67d6573Sopenharmony_ci    /// valid UTF-8 bytes.
100c67d6573Sopenharmony_ci    pub fn only_utf8(mut self, yes: bool) -> Self {
101c67d6573Sopenharmony_ci        self.compiled.only_utf8 = yes;
102c67d6573Sopenharmony_ci        self
103c67d6573Sopenharmony_ci    }
104c67d6573Sopenharmony_ci
105c67d6573Sopenharmony_ci    /// When set, the machine returned is suitable for use in the DFA matching
106c67d6573Sopenharmony_ci    /// engine.
107c67d6573Sopenharmony_ci    ///
108c67d6573Sopenharmony_ci    /// In particular, this ensures that if the regex is not anchored in the
109c67d6573Sopenharmony_ci    /// beginning, then a preceding `.*?` is included in the program. (The NFA
110c67d6573Sopenharmony_ci    /// based engines handle the preceding `.*?` explicitly, which is difficult
111c67d6573Sopenharmony_ci    /// or impossible in the DFA engine.)
112c67d6573Sopenharmony_ci    pub fn dfa(mut self, yes: bool) -> Self {
113c67d6573Sopenharmony_ci        self.compiled.is_dfa = yes;
114c67d6573Sopenharmony_ci        self
115c67d6573Sopenharmony_ci    }
116c67d6573Sopenharmony_ci
117c67d6573Sopenharmony_ci    /// When set, the machine returned is suitable for matching text in
118c67d6573Sopenharmony_ci    /// reverse. In particular, all concatenations are flipped.
119c67d6573Sopenharmony_ci    pub fn reverse(mut self, yes: bool) -> Self {
120c67d6573Sopenharmony_ci        self.compiled.is_reverse = yes;
121c67d6573Sopenharmony_ci        self
122c67d6573Sopenharmony_ci    }
123c67d6573Sopenharmony_ci
124c67d6573Sopenharmony_ci    /// Compile a regular expression given its AST.
125c67d6573Sopenharmony_ci    ///
126c67d6573Sopenharmony_ci    /// The compiler is guaranteed to succeed unless the program exceeds the
127c67d6573Sopenharmony_ci    /// specified size limit. If the size limit is exceeded, then compilation
128c67d6573Sopenharmony_ci    /// stops and returns an error.
129c67d6573Sopenharmony_ci    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130c67d6573Sopenharmony_ci        debug_assert!(!exprs.is_empty());
131c67d6573Sopenharmony_ci        self.num_exprs = exprs.len();
132c67d6573Sopenharmony_ci        if exprs.len() == 1 {
133c67d6573Sopenharmony_ci            self.compile_one(&exprs[0])
134c67d6573Sopenharmony_ci        } else {
135c67d6573Sopenharmony_ci            self.compile_many(exprs)
136c67d6573Sopenharmony_ci        }
137c67d6573Sopenharmony_ci    }
138c67d6573Sopenharmony_ci
139c67d6573Sopenharmony_ci    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140c67d6573Sopenharmony_ci        // If we're compiling a forward DFA and we aren't anchored, then
141c67d6573Sopenharmony_ci        // add a `.*?` before the first capture group.
142c67d6573Sopenharmony_ci        // Other matching engines handle this by baking the logic into the
143c67d6573Sopenharmony_ci        // matching engine itself.
144c67d6573Sopenharmony_ci        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
145c67d6573Sopenharmony_ci        self.compiled.is_anchored_start = expr.is_anchored_start();
146c67d6573Sopenharmony_ci        self.compiled.is_anchored_end = expr.is_anchored_end();
147c67d6573Sopenharmony_ci        if self.compiled.needs_dotstar() {
148c67d6573Sopenharmony_ci            dotstar_patch = self.c_dotstar()?;
149c67d6573Sopenharmony_ci            self.compiled.start = dotstar_patch.entry;
150c67d6573Sopenharmony_ci        }
151c67d6573Sopenharmony_ci        self.compiled.captures = vec![None];
152c67d6573Sopenharmony_ci        let patch =
153c67d6573Sopenharmony_ci            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
154c67d6573Sopenharmony_ci        if self.compiled.needs_dotstar() {
155c67d6573Sopenharmony_ci            self.fill(dotstar_patch.hole, patch.entry);
156c67d6573Sopenharmony_ci        } else {
157c67d6573Sopenharmony_ci            self.compiled.start = patch.entry;
158c67d6573Sopenharmony_ci        }
159c67d6573Sopenharmony_ci        self.fill_to_next(patch.hole);
160c67d6573Sopenharmony_ci        self.compiled.matches = vec![self.insts.len()];
161c67d6573Sopenharmony_ci        self.push_compiled(Inst::Match(0));
162c67d6573Sopenharmony_ci        self.compile_finish()
163c67d6573Sopenharmony_ci    }
164c67d6573Sopenharmony_ci
165c67d6573Sopenharmony_ci    fn compile_many(
166c67d6573Sopenharmony_ci        mut self,
167c67d6573Sopenharmony_ci        exprs: &[Hir],
168c67d6573Sopenharmony_ci    ) -> result::Result<Program, Error> {
169c67d6573Sopenharmony_ci        debug_assert!(exprs.len() > 1);
170c67d6573Sopenharmony_ci
171c67d6573Sopenharmony_ci        self.compiled.is_anchored_start =
172c67d6573Sopenharmony_ci            exprs.iter().all(|e| e.is_anchored_start());
173c67d6573Sopenharmony_ci        self.compiled.is_anchored_end =
174c67d6573Sopenharmony_ci            exprs.iter().all(|e| e.is_anchored_end());
175c67d6573Sopenharmony_ci        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
176c67d6573Sopenharmony_ci        if self.compiled.needs_dotstar() {
177c67d6573Sopenharmony_ci            dotstar_patch = self.c_dotstar()?;
178c67d6573Sopenharmony_ci            self.compiled.start = dotstar_patch.entry;
179c67d6573Sopenharmony_ci        } else {
180c67d6573Sopenharmony_ci            self.compiled.start = 0; // first instruction is always split
181c67d6573Sopenharmony_ci        }
182c67d6573Sopenharmony_ci        self.fill_to_next(dotstar_patch.hole);
183c67d6573Sopenharmony_ci
184c67d6573Sopenharmony_ci        let mut prev_hole = Hole::None;
185c67d6573Sopenharmony_ci        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
186c67d6573Sopenharmony_ci            self.fill_to_next(prev_hole);
187c67d6573Sopenharmony_ci            let split = self.push_split_hole();
188c67d6573Sopenharmony_ci            let Patch { hole, entry } =
189c67d6573Sopenharmony_ci                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
190c67d6573Sopenharmony_ci            self.fill_to_next(hole);
191c67d6573Sopenharmony_ci            self.compiled.matches.push(self.insts.len());
192c67d6573Sopenharmony_ci            self.push_compiled(Inst::Match(i));
193c67d6573Sopenharmony_ci            prev_hole = self.fill_split(split, Some(entry), None);
194c67d6573Sopenharmony_ci        }
195c67d6573Sopenharmony_ci        let i = exprs.len() - 1;
196c67d6573Sopenharmony_ci        let Patch { hole, entry } =
197c67d6573Sopenharmony_ci            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
198c67d6573Sopenharmony_ci        self.fill(prev_hole, entry);
199c67d6573Sopenharmony_ci        self.fill_to_next(hole);
200c67d6573Sopenharmony_ci        self.compiled.matches.push(self.insts.len());
201c67d6573Sopenharmony_ci        self.push_compiled(Inst::Match(i));
202c67d6573Sopenharmony_ci        self.compile_finish()
203c67d6573Sopenharmony_ci    }
204c67d6573Sopenharmony_ci
205c67d6573Sopenharmony_ci    fn compile_finish(mut self) -> result::Result<Program, Error> {
206c67d6573Sopenharmony_ci        self.compiled.insts =
207c67d6573Sopenharmony_ci            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
208c67d6573Sopenharmony_ci        self.compiled.byte_classes = self.byte_classes.byte_classes();
209c67d6573Sopenharmony_ci        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
210c67d6573Sopenharmony_ci        Ok(self.compiled)
211c67d6573Sopenharmony_ci    }
212c67d6573Sopenharmony_ci
213c67d6573Sopenharmony_ci    /// Compile expr into self.insts, returning a patch on success,
214c67d6573Sopenharmony_ci    /// or an error if we run out of memory.
215c67d6573Sopenharmony_ci    ///
216c67d6573Sopenharmony_ci    /// All of the c_* methods of the compiler share the contract outlined
217c67d6573Sopenharmony_ci    /// here.
218c67d6573Sopenharmony_ci    ///
219c67d6573Sopenharmony_ci    /// The main thing that a c_* method does is mutate `self.insts`
220c67d6573Sopenharmony_ci    /// to add a list of mostly compiled instructions required to execute
221c67d6573Sopenharmony_ci    /// the given expression. `self.insts` contains MaybeInsts rather than
222c67d6573Sopenharmony_ci    /// Insts because there is some backpatching required.
223c67d6573Sopenharmony_ci    ///
224c67d6573Sopenharmony_ci    /// The `Patch` value returned by each c_* method provides metadata
225c67d6573Sopenharmony_ci    /// about the compiled instructions emitted to `self.insts`. The
226c67d6573Sopenharmony_ci    /// `entry` member of the patch refers to the first instruction
227c67d6573Sopenharmony_ci    /// (the entry point), while the `hole` member contains zero or
228c67d6573Sopenharmony_ci    /// more offsets to partial instructions that need to be backpatched.
229c67d6573Sopenharmony_ci    /// The c_* routine can't know where its list of instructions are going to
230c67d6573Sopenharmony_ci    /// jump to after execution, so it is up to the caller to patch
231c67d6573Sopenharmony_ci    /// these jumps to point to the right place. So compiling some
232c67d6573Sopenharmony_ci    /// expression, e, we would end up with a situation that looked like:
233c67d6573Sopenharmony_ci    ///
234c67d6573Sopenharmony_ci    /// ```text
235c67d6573Sopenharmony_ci    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
236c67d6573Sopenharmony_ci    ///                     ^              ^             ^
237c67d6573Sopenharmony_ci    ///                     |                \         /
238c67d6573Sopenharmony_ci    ///                   entry                \     /
239c67d6573Sopenharmony_ci    ///                                         hole
240c67d6573Sopenharmony_ci    /// ```
241c67d6573Sopenharmony_ci    ///
242c67d6573Sopenharmony_ci    /// To compile two expressions, e1 and e2, concatenated together we
243c67d6573Sopenharmony_ci    /// would do:
244c67d6573Sopenharmony_ci    ///
245c67d6573Sopenharmony_ci    /// ```ignore
246c67d6573Sopenharmony_ci    /// let patch1 = self.c(e1);
247c67d6573Sopenharmony_ci    /// let patch2 = self.c(e2);
248c67d6573Sopenharmony_ci    /// ```
249c67d6573Sopenharmony_ci    ///
250c67d6573Sopenharmony_ci    /// while leaves us with a situation that looks like
251c67d6573Sopenharmony_ci    ///
252c67d6573Sopenharmony_ci    /// ```text
253c67d6573Sopenharmony_ci    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
254c67d6573Sopenharmony_ci    ///                     ^        ^            ^        ^
255c67d6573Sopenharmony_ci    ///                     |        |            |        |
256c67d6573Sopenharmony_ci    ///                entry1        hole1   entry2        hole2
257c67d6573Sopenharmony_ci    /// ```
258c67d6573Sopenharmony_ci    ///
259c67d6573Sopenharmony_ci    /// Then to merge the two patches together into one we would backpatch
260c67d6573Sopenharmony_ci    /// hole1 with entry2 and return a new patch that enters at entry1
261c67d6573Sopenharmony_ci    /// and has hole2 for a hole. In fact, if you look at the c_concat
262c67d6573Sopenharmony_ci    /// method you will see that it does exactly this, though it handles
263c67d6573Sopenharmony_ci    /// a list of expressions rather than just the two that we use for
264c67d6573Sopenharmony_ci    /// an example.
265c67d6573Sopenharmony_ci    ///
266c67d6573Sopenharmony_ci    /// Ok(None) is returned when an expression is compiled to no
267c67d6573Sopenharmony_ci    /// instruction, and so no patch.entry value makes sense.
268c67d6573Sopenharmony_ci    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
269c67d6573Sopenharmony_ci        use crate::prog;
270c67d6573Sopenharmony_ci        use regex_syntax::hir::HirKind::*;
271c67d6573Sopenharmony_ci
272c67d6573Sopenharmony_ci        self.check_size()?;
273c67d6573Sopenharmony_ci        match *expr.kind() {
274c67d6573Sopenharmony_ci            Empty => self.c_empty(),
275c67d6573Sopenharmony_ci            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
276c67d6573Sopenharmony_ci            Literal(hir::Literal::Byte(b)) => {
277c67d6573Sopenharmony_ci                assert!(self.compiled.uses_bytes());
278c67d6573Sopenharmony_ci                self.c_byte(b)
279c67d6573Sopenharmony_ci            }
280c67d6573Sopenharmony_ci            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
281c67d6573Sopenharmony_ci            Class(hir::Class::Bytes(ref cls)) => {
282c67d6573Sopenharmony_ci                if self.compiled.uses_bytes() {
283c67d6573Sopenharmony_ci                    self.c_class_bytes(cls.ranges())
284c67d6573Sopenharmony_ci                } else {
285c67d6573Sopenharmony_ci                    assert!(cls.is_all_ascii());
286c67d6573Sopenharmony_ci                    let mut char_ranges = vec![];
287c67d6573Sopenharmony_ci                    for r in cls.iter() {
288c67d6573Sopenharmony_ci                        let (s, e) = (r.start() as char, r.end() as char);
289c67d6573Sopenharmony_ci                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
290c67d6573Sopenharmony_ci                    }
291c67d6573Sopenharmony_ci                    self.c_class(&char_ranges)
292c67d6573Sopenharmony_ci                }
293c67d6573Sopenharmony_ci            }
294c67d6573Sopenharmony_ci            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
295c67d6573Sopenharmony_ci                self.byte_classes.set_range(b'\n', b'\n');
296c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::EndLine)
297c67d6573Sopenharmony_ci            }
298c67d6573Sopenharmony_ci            Anchor(hir::Anchor::StartLine) => {
299c67d6573Sopenharmony_ci                self.byte_classes.set_range(b'\n', b'\n');
300c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::StartLine)
301c67d6573Sopenharmony_ci            }
302c67d6573Sopenharmony_ci            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
303c67d6573Sopenharmony_ci                self.byte_classes.set_range(b'\n', b'\n');
304c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::StartLine)
305c67d6573Sopenharmony_ci            }
306c67d6573Sopenharmony_ci            Anchor(hir::Anchor::EndLine) => {
307c67d6573Sopenharmony_ci                self.byte_classes.set_range(b'\n', b'\n');
308c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::EndLine)
309c67d6573Sopenharmony_ci            }
310c67d6573Sopenharmony_ci            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
311c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::EndText)
312c67d6573Sopenharmony_ci            }
313c67d6573Sopenharmony_ci            Anchor(hir::Anchor::StartText) => {
314c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::StartText)
315c67d6573Sopenharmony_ci            }
316c67d6573Sopenharmony_ci            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
317c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::StartText)
318c67d6573Sopenharmony_ci            }
319c67d6573Sopenharmony_ci            Anchor(hir::Anchor::EndText) => {
320c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::EndText)
321c67d6573Sopenharmony_ci            }
322c67d6573Sopenharmony_ci            WordBoundary(hir::WordBoundary::Unicode) => {
323c67d6573Sopenharmony_ci                if !cfg!(feature = "unicode-perl") {
324c67d6573Sopenharmony_ci                    return Err(Error::Syntax(
325c67d6573Sopenharmony_ci                        "Unicode word boundaries are unavailable when \
326c67d6573Sopenharmony_ci                         the unicode-perl feature is disabled"
327c67d6573Sopenharmony_ci                            .to_string(),
328c67d6573Sopenharmony_ci                    ));
329c67d6573Sopenharmony_ci                }
330c67d6573Sopenharmony_ci                self.compiled.has_unicode_word_boundary = true;
331c67d6573Sopenharmony_ci                self.byte_classes.set_word_boundary();
332c67d6573Sopenharmony_ci                // We also make sure that all ASCII bytes are in a different
333c67d6573Sopenharmony_ci                // class from non-ASCII bytes. Otherwise, it's possible for
334c67d6573Sopenharmony_ci                // ASCII bytes to get lumped into the same class as non-ASCII
335c67d6573Sopenharmony_ci                // bytes. This in turn may cause the lazy DFA to falsely start
336c67d6573Sopenharmony_ci                // when it sees an ASCII byte that maps to a byte class with
337c67d6573Sopenharmony_ci                // non-ASCII bytes. This ensures that never happens.
338c67d6573Sopenharmony_ci                self.byte_classes.set_range(0, 0x7F);
339c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::WordBoundary)
340c67d6573Sopenharmony_ci            }
341c67d6573Sopenharmony_ci            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
342c67d6573Sopenharmony_ci                if !cfg!(feature = "unicode-perl") {
343c67d6573Sopenharmony_ci                    return Err(Error::Syntax(
344c67d6573Sopenharmony_ci                        "Unicode word boundaries are unavailable when \
345c67d6573Sopenharmony_ci                         the unicode-perl feature is disabled"
346c67d6573Sopenharmony_ci                            .to_string(),
347c67d6573Sopenharmony_ci                    ));
348c67d6573Sopenharmony_ci                }
349c67d6573Sopenharmony_ci                self.compiled.has_unicode_word_boundary = true;
350c67d6573Sopenharmony_ci                self.byte_classes.set_word_boundary();
351c67d6573Sopenharmony_ci                // See comments above for why we set the ASCII range here.
352c67d6573Sopenharmony_ci                self.byte_classes.set_range(0, 0x7F);
353c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
354c67d6573Sopenharmony_ci            }
355c67d6573Sopenharmony_ci            WordBoundary(hir::WordBoundary::Ascii) => {
356c67d6573Sopenharmony_ci                self.byte_classes.set_word_boundary();
357c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
358c67d6573Sopenharmony_ci            }
359c67d6573Sopenharmony_ci            WordBoundary(hir::WordBoundary::AsciiNegate) => {
360c67d6573Sopenharmony_ci                self.byte_classes.set_word_boundary();
361c67d6573Sopenharmony_ci                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
362c67d6573Sopenharmony_ci            }
363c67d6573Sopenharmony_ci            Group(ref g) => match g.kind {
364c67d6573Sopenharmony_ci                hir::GroupKind::NonCapturing => self.c(&g.hir),
365c67d6573Sopenharmony_ci                hir::GroupKind::CaptureIndex(index) => {
366c67d6573Sopenharmony_ci                    if index as usize >= self.compiled.captures.len() {
367c67d6573Sopenharmony_ci                        self.compiled.captures.push(None);
368c67d6573Sopenharmony_ci                    }
369c67d6573Sopenharmony_ci                    self.c_capture(2 * index as usize, &g.hir)
370c67d6573Sopenharmony_ci                }
371c67d6573Sopenharmony_ci                hir::GroupKind::CaptureName { index, ref name } => {
372c67d6573Sopenharmony_ci                    if index as usize >= self.compiled.captures.len() {
373c67d6573Sopenharmony_ci                        let n = name.to_string();
374c67d6573Sopenharmony_ci                        self.compiled.captures.push(Some(n.clone()));
375c67d6573Sopenharmony_ci                        self.capture_name_idx.insert(n, index as usize);
376c67d6573Sopenharmony_ci                    }
377c67d6573Sopenharmony_ci                    self.c_capture(2 * index as usize, &g.hir)
378c67d6573Sopenharmony_ci                }
379c67d6573Sopenharmony_ci            },
380c67d6573Sopenharmony_ci            Concat(ref es) => {
381c67d6573Sopenharmony_ci                if self.compiled.is_reverse {
382c67d6573Sopenharmony_ci                    self.c_concat(es.iter().rev())
383c67d6573Sopenharmony_ci                } else {
384c67d6573Sopenharmony_ci                    self.c_concat(es)
385c67d6573Sopenharmony_ci                }
386c67d6573Sopenharmony_ci            }
387c67d6573Sopenharmony_ci            Alternation(ref es) => self.c_alternate(&**es),
388c67d6573Sopenharmony_ci            Repetition(ref rep) => self.c_repeat(rep),
389c67d6573Sopenharmony_ci        }
390c67d6573Sopenharmony_ci    }
391c67d6573Sopenharmony_ci
392c67d6573Sopenharmony_ci    fn c_empty(&mut self) -> ResultOrEmpty {
393c67d6573Sopenharmony_ci        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
394c67d6573Sopenharmony_ci        // See: CVE-2022-24713
395c67d6573Sopenharmony_ci        //
396c67d6573Sopenharmony_ci        // Since 'empty' sub-expressions don't increase the size of
397c67d6573Sopenharmony_ci        // the actual compiled object, we "fake" an increase in its
398c67d6573Sopenharmony_ci        // size so that our 'check_size_limit' routine will eventually
399c67d6573Sopenharmony_ci        // stop compilation if there are too many empty sub-expressions
400c67d6573Sopenharmony_ci        // (e.g., via a large repetition).
401c67d6573Sopenharmony_ci        self.extra_inst_bytes += std::mem::size_of::<Inst>();
402c67d6573Sopenharmony_ci        Ok(None)
403c67d6573Sopenharmony_ci    }
404c67d6573Sopenharmony_ci
405c67d6573Sopenharmony_ci    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
406c67d6573Sopenharmony_ci        if self.num_exprs > 1 || self.compiled.is_dfa {
407c67d6573Sopenharmony_ci            // Don't ever compile Save instructions for regex sets because
408c67d6573Sopenharmony_ci            // they are never used. They are also never used in DFA programs
409c67d6573Sopenharmony_ci            // because DFAs can't handle captures.
410c67d6573Sopenharmony_ci            self.c(expr)
411c67d6573Sopenharmony_ci        } else {
412c67d6573Sopenharmony_ci            let entry = self.insts.len();
413c67d6573Sopenharmony_ci            let hole = self.push_hole(InstHole::Save { slot: first_slot });
414c67d6573Sopenharmony_ci            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
415c67d6573Sopenharmony_ci            self.fill(hole, patch.entry);
416c67d6573Sopenharmony_ci            self.fill_to_next(patch.hole);
417c67d6573Sopenharmony_ci            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
418c67d6573Sopenharmony_ci            Ok(Some(Patch { hole, entry }))
419c67d6573Sopenharmony_ci        }
420c67d6573Sopenharmony_ci    }
421c67d6573Sopenharmony_ci
422c67d6573Sopenharmony_ci    fn c_dotstar(&mut self) -> Result {
423c67d6573Sopenharmony_ci        Ok(if !self.compiled.only_utf8() {
424c67d6573Sopenharmony_ci            self.c(&Hir::repetition(hir::Repetition {
425c67d6573Sopenharmony_ci                kind: hir::RepetitionKind::ZeroOrMore,
426c67d6573Sopenharmony_ci                greedy: false,
427c67d6573Sopenharmony_ci                hir: Box::new(Hir::any(true)),
428c67d6573Sopenharmony_ci            }))?
429c67d6573Sopenharmony_ci            .unwrap()
430c67d6573Sopenharmony_ci        } else {
431c67d6573Sopenharmony_ci            self.c(&Hir::repetition(hir::Repetition {
432c67d6573Sopenharmony_ci                kind: hir::RepetitionKind::ZeroOrMore,
433c67d6573Sopenharmony_ci                greedy: false,
434c67d6573Sopenharmony_ci                hir: Box::new(Hir::any(false)),
435c67d6573Sopenharmony_ci            }))?
436c67d6573Sopenharmony_ci            .unwrap()
437c67d6573Sopenharmony_ci        })
438c67d6573Sopenharmony_ci    }
439c67d6573Sopenharmony_ci
440c67d6573Sopenharmony_ci    fn c_char(&mut self, c: char) -> ResultOrEmpty {
441c67d6573Sopenharmony_ci        if self.compiled.uses_bytes() {
442c67d6573Sopenharmony_ci            if c.is_ascii() {
443c67d6573Sopenharmony_ci                let b = c as u8;
444c67d6573Sopenharmony_ci                let hole =
445c67d6573Sopenharmony_ci                    self.push_hole(InstHole::Bytes { start: b, end: b });
446c67d6573Sopenharmony_ci                self.byte_classes.set_range(b, b);
447c67d6573Sopenharmony_ci                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
448c67d6573Sopenharmony_ci            } else {
449c67d6573Sopenharmony_ci                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
450c67d6573Sopenharmony_ci            }
451c67d6573Sopenharmony_ci        } else {
452c67d6573Sopenharmony_ci            let hole = self.push_hole(InstHole::Char { c });
453c67d6573Sopenharmony_ci            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
454c67d6573Sopenharmony_ci        }
455c67d6573Sopenharmony_ci    }
456c67d6573Sopenharmony_ci
457c67d6573Sopenharmony_ci    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
458c67d6573Sopenharmony_ci        use std::mem::size_of;
459c67d6573Sopenharmony_ci
460c67d6573Sopenharmony_ci        assert!(!ranges.is_empty());
461c67d6573Sopenharmony_ci        if self.compiled.uses_bytes() {
462c67d6573Sopenharmony_ci            Ok(Some(CompileClass { c: self, ranges }.compile()?))
463c67d6573Sopenharmony_ci        } else {
464c67d6573Sopenharmony_ci            let ranges: Vec<(char, char)> =
465c67d6573Sopenharmony_ci                ranges.iter().map(|r| (r.start(), r.end())).collect();
466c67d6573Sopenharmony_ci            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
467c67d6573Sopenharmony_ci                self.push_hole(InstHole::Char { c: ranges[0].0 })
468c67d6573Sopenharmony_ci            } else {
469c67d6573Sopenharmony_ci                self.extra_inst_bytes +=
470c67d6573Sopenharmony_ci                    ranges.len() * (size_of::<char>() * 2);
471c67d6573Sopenharmony_ci                self.push_hole(InstHole::Ranges { ranges })
472c67d6573Sopenharmony_ci            };
473c67d6573Sopenharmony_ci            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
474c67d6573Sopenharmony_ci        }
475c67d6573Sopenharmony_ci    }
476c67d6573Sopenharmony_ci
477c67d6573Sopenharmony_ci    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
478c67d6573Sopenharmony_ci        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
479c67d6573Sopenharmony_ci    }
480c67d6573Sopenharmony_ci
481c67d6573Sopenharmony_ci    fn c_class_bytes(
482c67d6573Sopenharmony_ci        &mut self,
483c67d6573Sopenharmony_ci        ranges: &[hir::ClassBytesRange],
484c67d6573Sopenharmony_ci    ) -> ResultOrEmpty {
485c67d6573Sopenharmony_ci        debug_assert!(!ranges.is_empty());
486c67d6573Sopenharmony_ci
487c67d6573Sopenharmony_ci        let first_split_entry = self.insts.len();
488c67d6573Sopenharmony_ci        let mut holes = vec![];
489c67d6573Sopenharmony_ci        let mut prev_hole = Hole::None;
490c67d6573Sopenharmony_ci        for r in &ranges[0..ranges.len() - 1] {
491c67d6573Sopenharmony_ci            self.fill_to_next(prev_hole);
492c67d6573Sopenharmony_ci            let split = self.push_split_hole();
493c67d6573Sopenharmony_ci            let next = self.insts.len();
494c67d6573Sopenharmony_ci            self.byte_classes.set_range(r.start(), r.end());
495c67d6573Sopenharmony_ci            holes.push(self.push_hole(InstHole::Bytes {
496c67d6573Sopenharmony_ci                start: r.start(),
497c67d6573Sopenharmony_ci                end: r.end(),
498c67d6573Sopenharmony_ci            }));
499c67d6573Sopenharmony_ci            prev_hole = self.fill_split(split, Some(next), None);
500c67d6573Sopenharmony_ci        }
501c67d6573Sopenharmony_ci        let next = self.insts.len();
502c67d6573Sopenharmony_ci        let r = &ranges[ranges.len() - 1];
503c67d6573Sopenharmony_ci        self.byte_classes.set_range(r.start(), r.end());
504c67d6573Sopenharmony_ci        holes.push(
505c67d6573Sopenharmony_ci            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
506c67d6573Sopenharmony_ci        );
507c67d6573Sopenharmony_ci        self.fill(prev_hole, next);
508c67d6573Sopenharmony_ci        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
509c67d6573Sopenharmony_ci    }
510c67d6573Sopenharmony_ci
511c67d6573Sopenharmony_ci    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
512c67d6573Sopenharmony_ci        let hole = self.push_hole(InstHole::EmptyLook { look });
513c67d6573Sopenharmony_ci        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
514c67d6573Sopenharmony_ci    }
515c67d6573Sopenharmony_ci
516c67d6573Sopenharmony_ci    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
517c67d6573Sopenharmony_ci    where
518c67d6573Sopenharmony_ci        I: IntoIterator<Item = &'a Hir>,
519c67d6573Sopenharmony_ci    {
520c67d6573Sopenharmony_ci        let mut exprs = exprs.into_iter();
521c67d6573Sopenharmony_ci        let Patch { mut hole, entry } = loop {
522c67d6573Sopenharmony_ci            match exprs.next() {
523c67d6573Sopenharmony_ci                None => return self.c_empty(),
524c67d6573Sopenharmony_ci                Some(e) => {
525c67d6573Sopenharmony_ci                    if let Some(p) = self.c(e)? {
526c67d6573Sopenharmony_ci                        break p;
527c67d6573Sopenharmony_ci                    }
528c67d6573Sopenharmony_ci                }
529c67d6573Sopenharmony_ci            }
530c67d6573Sopenharmony_ci        };
531c67d6573Sopenharmony_ci        for e in exprs {
532c67d6573Sopenharmony_ci            if let Some(p) = self.c(e)? {
533c67d6573Sopenharmony_ci                self.fill(hole, p.entry);
534c67d6573Sopenharmony_ci                hole = p.hole;
535c67d6573Sopenharmony_ci            }
536c67d6573Sopenharmony_ci        }
537c67d6573Sopenharmony_ci        Ok(Some(Patch { hole, entry }))
538c67d6573Sopenharmony_ci    }
539c67d6573Sopenharmony_ci
540c67d6573Sopenharmony_ci    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
541c67d6573Sopenharmony_ci        debug_assert!(
542c67d6573Sopenharmony_ci            exprs.len() >= 2,
543c67d6573Sopenharmony_ci            "alternates must have at least 2 exprs"
544c67d6573Sopenharmony_ci        );
545c67d6573Sopenharmony_ci
546c67d6573Sopenharmony_ci        // Initial entry point is always the first split.
547c67d6573Sopenharmony_ci        let first_split_entry = self.insts.len();
548c67d6573Sopenharmony_ci
549c67d6573Sopenharmony_ci        // Save up all of the holes from each alternate. They will all get
550c67d6573Sopenharmony_ci        // patched to point to the same location.
551c67d6573Sopenharmony_ci        let mut holes = vec![];
552c67d6573Sopenharmony_ci
553c67d6573Sopenharmony_ci        // true indicates that the hole is a split where we want to fill
554c67d6573Sopenharmony_ci        // the second branch.
555c67d6573Sopenharmony_ci        let mut prev_hole = (Hole::None, false);
556c67d6573Sopenharmony_ci        for e in &exprs[0..exprs.len() - 1] {
557c67d6573Sopenharmony_ci            if prev_hole.1 {
558c67d6573Sopenharmony_ci                let next = self.insts.len();
559c67d6573Sopenharmony_ci                self.fill_split(prev_hole.0, None, Some(next));
560c67d6573Sopenharmony_ci            } else {
561c67d6573Sopenharmony_ci                self.fill_to_next(prev_hole.0);
562c67d6573Sopenharmony_ci            }
563c67d6573Sopenharmony_ci            let split = self.push_split_hole();
564c67d6573Sopenharmony_ci            if let Some(Patch { hole, entry }) = self.c(e)? {
565c67d6573Sopenharmony_ci                holes.push(hole);
566c67d6573Sopenharmony_ci                prev_hole = (self.fill_split(split, Some(entry), None), false);
567c67d6573Sopenharmony_ci            } else {
568c67d6573Sopenharmony_ci                let (split1, split2) = split.dup_one();
569c67d6573Sopenharmony_ci                holes.push(split1);
570c67d6573Sopenharmony_ci                prev_hole = (split2, true);
571c67d6573Sopenharmony_ci            }
572c67d6573Sopenharmony_ci        }
573c67d6573Sopenharmony_ci        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
574c67d6573Sopenharmony_ci            holes.push(hole);
575c67d6573Sopenharmony_ci            if prev_hole.1 {
576c67d6573Sopenharmony_ci                self.fill_split(prev_hole.0, None, Some(entry));
577c67d6573Sopenharmony_ci            } else {
578c67d6573Sopenharmony_ci                self.fill(prev_hole.0, entry);
579c67d6573Sopenharmony_ci            }
580c67d6573Sopenharmony_ci        } else {
581c67d6573Sopenharmony_ci            // We ignore prev_hole.1. When it's true, it means we have two
582c67d6573Sopenharmony_ci            // empty branches both pushing prev_hole.0 into holes, so both
583c67d6573Sopenharmony_ci            // branches will go to the same place anyway.
584c67d6573Sopenharmony_ci            holes.push(prev_hole.0);
585c67d6573Sopenharmony_ci        }
586c67d6573Sopenharmony_ci        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
587c67d6573Sopenharmony_ci    }
588c67d6573Sopenharmony_ci
589c67d6573Sopenharmony_ci    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
590c67d6573Sopenharmony_ci        use regex_syntax::hir::RepetitionKind::*;
591c67d6573Sopenharmony_ci        match rep.kind {
592c67d6573Sopenharmony_ci            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
593c67d6573Sopenharmony_ci            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
594c67d6573Sopenharmony_ci            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
595c67d6573Sopenharmony_ci            Range(hir::RepetitionRange::Exactly(min_max)) => {
596c67d6573Sopenharmony_ci                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
597c67d6573Sopenharmony_ci            }
598c67d6573Sopenharmony_ci            Range(hir::RepetitionRange::AtLeast(min)) => {
599c67d6573Sopenharmony_ci                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
600c67d6573Sopenharmony_ci            }
601c67d6573Sopenharmony_ci            Range(hir::RepetitionRange::Bounded(min, max)) => {
602c67d6573Sopenharmony_ci                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
603c67d6573Sopenharmony_ci            }
604c67d6573Sopenharmony_ci        }
605c67d6573Sopenharmony_ci    }
606c67d6573Sopenharmony_ci
607c67d6573Sopenharmony_ci    fn c_repeat_zero_or_one(
608c67d6573Sopenharmony_ci        &mut self,
609c67d6573Sopenharmony_ci        expr: &Hir,
610c67d6573Sopenharmony_ci        greedy: bool,
611c67d6573Sopenharmony_ci    ) -> ResultOrEmpty {
612c67d6573Sopenharmony_ci        let split_entry = self.insts.len();
613c67d6573Sopenharmony_ci        let split = self.push_split_hole();
614c67d6573Sopenharmony_ci        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
615c67d6573Sopenharmony_ci            Some(p) => p,
616c67d6573Sopenharmony_ci            None => return self.pop_split_hole(),
617c67d6573Sopenharmony_ci        };
618c67d6573Sopenharmony_ci        let split_hole = if greedy {
619c67d6573Sopenharmony_ci            self.fill_split(split, Some(entry_rep), None)
620c67d6573Sopenharmony_ci        } else {
621c67d6573Sopenharmony_ci            self.fill_split(split, None, Some(entry_rep))
622c67d6573Sopenharmony_ci        };
623c67d6573Sopenharmony_ci        let holes = vec![hole_rep, split_hole];
624c67d6573Sopenharmony_ci        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
625c67d6573Sopenharmony_ci    }
626c67d6573Sopenharmony_ci
627c67d6573Sopenharmony_ci    fn c_repeat_zero_or_more(
628c67d6573Sopenharmony_ci        &mut self,
629c67d6573Sopenharmony_ci        expr: &Hir,
630c67d6573Sopenharmony_ci        greedy: bool,
631c67d6573Sopenharmony_ci    ) -> ResultOrEmpty {
632c67d6573Sopenharmony_ci        let split_entry = self.insts.len();
633c67d6573Sopenharmony_ci        let split = self.push_split_hole();
634c67d6573Sopenharmony_ci        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
635c67d6573Sopenharmony_ci            Some(p) => p,
636c67d6573Sopenharmony_ci            None => return self.pop_split_hole(),
637c67d6573Sopenharmony_ci        };
638c67d6573Sopenharmony_ci
639c67d6573Sopenharmony_ci        self.fill(hole_rep, split_entry);
640c67d6573Sopenharmony_ci        let split_hole = if greedy {
641c67d6573Sopenharmony_ci            self.fill_split(split, Some(entry_rep), None)
642c67d6573Sopenharmony_ci        } else {
643c67d6573Sopenharmony_ci            self.fill_split(split, None, Some(entry_rep))
644c67d6573Sopenharmony_ci        };
645c67d6573Sopenharmony_ci        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
646c67d6573Sopenharmony_ci    }
647c67d6573Sopenharmony_ci
648c67d6573Sopenharmony_ci    fn c_repeat_one_or_more(
649c67d6573Sopenharmony_ci        &mut self,
650c67d6573Sopenharmony_ci        expr: &Hir,
651c67d6573Sopenharmony_ci        greedy: bool,
652c67d6573Sopenharmony_ci    ) -> ResultOrEmpty {
653c67d6573Sopenharmony_ci        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
654c67d6573Sopenharmony_ci            Some(p) => p,
655c67d6573Sopenharmony_ci            None => return Ok(None),
656c67d6573Sopenharmony_ci        };
657c67d6573Sopenharmony_ci        self.fill_to_next(hole_rep);
658c67d6573Sopenharmony_ci        let split = self.push_split_hole();
659c67d6573Sopenharmony_ci
660c67d6573Sopenharmony_ci        let split_hole = if greedy {
661c67d6573Sopenharmony_ci            self.fill_split(split, Some(entry_rep), None)
662c67d6573Sopenharmony_ci        } else {
663c67d6573Sopenharmony_ci            self.fill_split(split, None, Some(entry_rep))
664c67d6573Sopenharmony_ci        };
665c67d6573Sopenharmony_ci        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
666c67d6573Sopenharmony_ci    }
667c67d6573Sopenharmony_ci
668c67d6573Sopenharmony_ci    fn c_repeat_range_min_or_more(
669c67d6573Sopenharmony_ci        &mut self,
670c67d6573Sopenharmony_ci        expr: &Hir,
671c67d6573Sopenharmony_ci        greedy: bool,
672c67d6573Sopenharmony_ci        min: u32,
673c67d6573Sopenharmony_ci    ) -> ResultOrEmpty {
674c67d6573Sopenharmony_ci        let min = u32_to_usize(min);
675c67d6573Sopenharmony_ci        // Using next_inst() is ok, because we can't return it (concat would
676c67d6573Sopenharmony_ci        // have to return Some(_) while c_repeat_range_min_or_more returns
677c67d6573Sopenharmony_ci        // None).
678c67d6573Sopenharmony_ci        let patch_concat = self
679c67d6573Sopenharmony_ci            .c_concat(iter::repeat(expr).take(min))?
680c67d6573Sopenharmony_ci            .unwrap_or_else(|| self.next_inst());
681c67d6573Sopenharmony_ci        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
682c67d6573Sopenharmony_ci            self.fill(patch_concat.hole, patch_rep.entry);
683c67d6573Sopenharmony_ci            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
684c67d6573Sopenharmony_ci        } else {
685c67d6573Sopenharmony_ci            Ok(None)
686c67d6573Sopenharmony_ci        }
687c67d6573Sopenharmony_ci    }
688c67d6573Sopenharmony_ci
689c67d6573Sopenharmony_ci    fn c_repeat_range(
690c67d6573Sopenharmony_ci        &mut self,
691c67d6573Sopenharmony_ci        expr: &Hir,
692c67d6573Sopenharmony_ci        greedy: bool,
693c67d6573Sopenharmony_ci        min: u32,
694c67d6573Sopenharmony_ci        max: u32,
695c67d6573Sopenharmony_ci    ) -> ResultOrEmpty {
696c67d6573Sopenharmony_ci        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
697c67d6573Sopenharmony_ci        debug_assert!(min <= max);
698c67d6573Sopenharmony_ci        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
699c67d6573Sopenharmony_ci        if min == max {
700c67d6573Sopenharmony_ci            return Ok(patch_concat);
701c67d6573Sopenharmony_ci        }
702c67d6573Sopenharmony_ci        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
703c67d6573Sopenharmony_ci        // max at this point).
704c67d6573Sopenharmony_ci        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
705c67d6573Sopenharmony_ci        let initial_entry = patch_concat.entry;
706c67d6573Sopenharmony_ci        // It is much simpler to compile, e.g., `a{2,5}` as:
707c67d6573Sopenharmony_ci        //
708c67d6573Sopenharmony_ci        //     aaa?a?a?
709c67d6573Sopenharmony_ci        //
710c67d6573Sopenharmony_ci        // But you end up with a sequence of instructions like this:
711c67d6573Sopenharmony_ci        //
712c67d6573Sopenharmony_ci        //     0: 'a'
713c67d6573Sopenharmony_ci        //     1: 'a',
714c67d6573Sopenharmony_ci        //     2: split(3, 4)
715c67d6573Sopenharmony_ci        //     3: 'a'
716c67d6573Sopenharmony_ci        //     4: split(5, 6)
717c67d6573Sopenharmony_ci        //     5: 'a'
718c67d6573Sopenharmony_ci        //     6: split(7, 8)
719c67d6573Sopenharmony_ci        //     7: 'a'
720c67d6573Sopenharmony_ci        //     8: MATCH
721c67d6573Sopenharmony_ci        //
722c67d6573Sopenharmony_ci        // This is *incredibly* inefficient because the splits end
723c67d6573Sopenharmony_ci        // up forming a chain, which has to be resolved everything a
724c67d6573Sopenharmony_ci        // transition is followed.
725c67d6573Sopenharmony_ci        let mut holes = vec![];
726c67d6573Sopenharmony_ci        let mut prev_hole = patch_concat.hole;
727c67d6573Sopenharmony_ci        for _ in min..max {
728c67d6573Sopenharmony_ci            self.fill_to_next(prev_hole);
729c67d6573Sopenharmony_ci            let split = self.push_split_hole();
730c67d6573Sopenharmony_ci            let Patch { hole, entry } = match self.c(expr)? {
731c67d6573Sopenharmony_ci                Some(p) => p,
732c67d6573Sopenharmony_ci                None => return self.pop_split_hole(),
733c67d6573Sopenharmony_ci            };
734c67d6573Sopenharmony_ci            prev_hole = hole;
735c67d6573Sopenharmony_ci            if greedy {
736c67d6573Sopenharmony_ci                holes.push(self.fill_split(split, Some(entry), None));
737c67d6573Sopenharmony_ci            } else {
738c67d6573Sopenharmony_ci                holes.push(self.fill_split(split, None, Some(entry)));
739c67d6573Sopenharmony_ci            }
740c67d6573Sopenharmony_ci        }
741c67d6573Sopenharmony_ci        holes.push(prev_hole);
742c67d6573Sopenharmony_ci        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
743c67d6573Sopenharmony_ci    }
744c67d6573Sopenharmony_ci
745c67d6573Sopenharmony_ci    /// Can be used as a default value for the c_* functions when the call to
746c67d6573Sopenharmony_ci    /// c_function is followed by inserting at least one instruction that is
747c67d6573Sopenharmony_ci    /// always executed after the ones written by the c* function.
748c67d6573Sopenharmony_ci    fn next_inst(&self) -> Patch {
749c67d6573Sopenharmony_ci        Patch { hole: Hole::None, entry: self.insts.len() }
750c67d6573Sopenharmony_ci    }
751c67d6573Sopenharmony_ci
752c67d6573Sopenharmony_ci    fn fill(&mut self, hole: Hole, goto: InstPtr) {
753c67d6573Sopenharmony_ci        match hole {
754c67d6573Sopenharmony_ci            Hole::None => {}
755c67d6573Sopenharmony_ci            Hole::One(pc) => {
756c67d6573Sopenharmony_ci                self.insts[pc].fill(goto);
757c67d6573Sopenharmony_ci            }
758c67d6573Sopenharmony_ci            Hole::Many(holes) => {
759c67d6573Sopenharmony_ci                for hole in holes {
760c67d6573Sopenharmony_ci                    self.fill(hole, goto);
761c67d6573Sopenharmony_ci                }
762c67d6573Sopenharmony_ci            }
763c67d6573Sopenharmony_ci        }
764c67d6573Sopenharmony_ci    }
765c67d6573Sopenharmony_ci
766c67d6573Sopenharmony_ci    fn fill_to_next(&mut self, hole: Hole) {
767c67d6573Sopenharmony_ci        let next = self.insts.len();
768c67d6573Sopenharmony_ci        self.fill(hole, next);
769c67d6573Sopenharmony_ci    }
770c67d6573Sopenharmony_ci
771c67d6573Sopenharmony_ci    fn fill_split(
772c67d6573Sopenharmony_ci        &mut self,
773c67d6573Sopenharmony_ci        hole: Hole,
774c67d6573Sopenharmony_ci        goto1: Option<InstPtr>,
775c67d6573Sopenharmony_ci        goto2: Option<InstPtr>,
776c67d6573Sopenharmony_ci    ) -> Hole {
777c67d6573Sopenharmony_ci        match hole {
778c67d6573Sopenharmony_ci            Hole::None => Hole::None,
779c67d6573Sopenharmony_ci            Hole::One(pc) => match (goto1, goto2) {
780c67d6573Sopenharmony_ci                (Some(goto1), Some(goto2)) => {
781c67d6573Sopenharmony_ci                    self.insts[pc].fill_split(goto1, goto2);
782c67d6573Sopenharmony_ci                    Hole::None
783c67d6573Sopenharmony_ci                }
784c67d6573Sopenharmony_ci                (Some(goto1), None) => {
785c67d6573Sopenharmony_ci                    self.insts[pc].half_fill_split_goto1(goto1);
786c67d6573Sopenharmony_ci                    Hole::One(pc)
787c67d6573Sopenharmony_ci                }
788c67d6573Sopenharmony_ci                (None, Some(goto2)) => {
789c67d6573Sopenharmony_ci                    self.insts[pc].half_fill_split_goto2(goto2);
790c67d6573Sopenharmony_ci                    Hole::One(pc)
791c67d6573Sopenharmony_ci                }
792c67d6573Sopenharmony_ci                (None, None) => unreachable!(
793c67d6573Sopenharmony_ci                    "at least one of the split \
794c67d6573Sopenharmony_ci                     holes must be filled"
795c67d6573Sopenharmony_ci                ),
796c67d6573Sopenharmony_ci            },
797c67d6573Sopenharmony_ci            Hole::Many(holes) => {
798c67d6573Sopenharmony_ci                let mut new_holes = vec![];
799c67d6573Sopenharmony_ci                for hole in holes {
800c67d6573Sopenharmony_ci                    new_holes.push(self.fill_split(hole, goto1, goto2));
801c67d6573Sopenharmony_ci                }
802c67d6573Sopenharmony_ci                if new_holes.is_empty() {
803c67d6573Sopenharmony_ci                    Hole::None
804c67d6573Sopenharmony_ci                } else if new_holes.len() == 1 {
805c67d6573Sopenharmony_ci                    new_holes.pop().unwrap()
806c67d6573Sopenharmony_ci                } else {
807c67d6573Sopenharmony_ci                    Hole::Many(new_holes)
808c67d6573Sopenharmony_ci                }
809c67d6573Sopenharmony_ci            }
810c67d6573Sopenharmony_ci        }
811c67d6573Sopenharmony_ci    }
812c67d6573Sopenharmony_ci
813c67d6573Sopenharmony_ci    fn push_compiled(&mut self, inst: Inst) {
814c67d6573Sopenharmony_ci        self.insts.push(MaybeInst::Compiled(inst));
815c67d6573Sopenharmony_ci    }
816c67d6573Sopenharmony_ci
817c67d6573Sopenharmony_ci    fn push_hole(&mut self, inst: InstHole) -> Hole {
818c67d6573Sopenharmony_ci        let hole = self.insts.len();
819c67d6573Sopenharmony_ci        self.insts.push(MaybeInst::Uncompiled(inst));
820c67d6573Sopenharmony_ci        Hole::One(hole)
821c67d6573Sopenharmony_ci    }
822c67d6573Sopenharmony_ci
823c67d6573Sopenharmony_ci    fn push_split_hole(&mut self) -> Hole {
824c67d6573Sopenharmony_ci        let hole = self.insts.len();
825c67d6573Sopenharmony_ci        self.insts.push(MaybeInst::Split);
826c67d6573Sopenharmony_ci        Hole::One(hole)
827c67d6573Sopenharmony_ci    }
828c67d6573Sopenharmony_ci
829c67d6573Sopenharmony_ci    fn pop_split_hole(&mut self) -> ResultOrEmpty {
830c67d6573Sopenharmony_ci        self.insts.pop();
831c67d6573Sopenharmony_ci        Ok(None)
832c67d6573Sopenharmony_ci    }
833c67d6573Sopenharmony_ci
834c67d6573Sopenharmony_ci    fn check_size(&self) -> result::Result<(), Error> {
835c67d6573Sopenharmony_ci        use std::mem::size_of;
836c67d6573Sopenharmony_ci
837c67d6573Sopenharmony_ci        let size =
838c67d6573Sopenharmony_ci            self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
839c67d6573Sopenharmony_ci        if size > self.size_limit {
840c67d6573Sopenharmony_ci            Err(Error::CompiledTooBig(self.size_limit))
841c67d6573Sopenharmony_ci        } else {
842c67d6573Sopenharmony_ci            Ok(())
843c67d6573Sopenharmony_ci        }
844c67d6573Sopenharmony_ci    }
845c67d6573Sopenharmony_ci}
846c67d6573Sopenharmony_ci
847c67d6573Sopenharmony_ci#[derive(Debug)]
848c67d6573Sopenharmony_cienum Hole {
849c67d6573Sopenharmony_ci    None,
850c67d6573Sopenharmony_ci    One(InstPtr),
851c67d6573Sopenharmony_ci    Many(Vec<Hole>),
852c67d6573Sopenharmony_ci}
853c67d6573Sopenharmony_ci
854c67d6573Sopenharmony_ciimpl Hole {
855c67d6573Sopenharmony_ci    fn dup_one(self) -> (Self, Self) {
856c67d6573Sopenharmony_ci        match self {
857c67d6573Sopenharmony_ci            Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
858c67d6573Sopenharmony_ci            Hole::None | Hole::Many(_) => {
859c67d6573Sopenharmony_ci                unreachable!("must be called on single hole")
860c67d6573Sopenharmony_ci            }
861c67d6573Sopenharmony_ci        }
862c67d6573Sopenharmony_ci    }
863c67d6573Sopenharmony_ci}
864c67d6573Sopenharmony_ci
865c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
866c67d6573Sopenharmony_cienum MaybeInst {
867c67d6573Sopenharmony_ci    Compiled(Inst),
868c67d6573Sopenharmony_ci    Uncompiled(InstHole),
869c67d6573Sopenharmony_ci    Split,
870c67d6573Sopenharmony_ci    Split1(InstPtr),
871c67d6573Sopenharmony_ci    Split2(InstPtr),
872c67d6573Sopenharmony_ci}
873c67d6573Sopenharmony_ci
874c67d6573Sopenharmony_ciimpl MaybeInst {
875c67d6573Sopenharmony_ci    fn fill(&mut self, goto: InstPtr) {
876c67d6573Sopenharmony_ci        let maybeinst = match *self {
877c67d6573Sopenharmony_ci            MaybeInst::Split => MaybeInst::Split1(goto),
878c67d6573Sopenharmony_ci            MaybeInst::Uncompiled(ref inst) => {
879c67d6573Sopenharmony_ci                MaybeInst::Compiled(inst.fill(goto))
880c67d6573Sopenharmony_ci            }
881c67d6573Sopenharmony_ci            MaybeInst::Split1(goto1) => {
882c67d6573Sopenharmony_ci                MaybeInst::Compiled(Inst::Split(InstSplit {
883c67d6573Sopenharmony_ci                    goto1,
884c67d6573Sopenharmony_ci                    goto2: goto,
885c67d6573Sopenharmony_ci                }))
886c67d6573Sopenharmony_ci            }
887c67d6573Sopenharmony_ci            MaybeInst::Split2(goto2) => {
888c67d6573Sopenharmony_ci                MaybeInst::Compiled(Inst::Split(InstSplit {
889c67d6573Sopenharmony_ci                    goto1: goto,
890c67d6573Sopenharmony_ci                    goto2,
891c67d6573Sopenharmony_ci                }))
892c67d6573Sopenharmony_ci            }
893c67d6573Sopenharmony_ci            _ => unreachable!(
894c67d6573Sopenharmony_ci                "not all instructions were compiled! \
895c67d6573Sopenharmony_ci                 found uncompiled instruction: {:?}",
896c67d6573Sopenharmony_ci                self
897c67d6573Sopenharmony_ci            ),
898c67d6573Sopenharmony_ci        };
899c67d6573Sopenharmony_ci        *self = maybeinst;
900c67d6573Sopenharmony_ci    }
901c67d6573Sopenharmony_ci
902c67d6573Sopenharmony_ci    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
903c67d6573Sopenharmony_ci        let filled = match *self {
904c67d6573Sopenharmony_ci            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
905c67d6573Sopenharmony_ci            _ => unreachable!(
906c67d6573Sopenharmony_ci                "must be called on Split instruction, \
907c67d6573Sopenharmony_ci                 instead it was called on: {:?}",
908c67d6573Sopenharmony_ci                self
909c67d6573Sopenharmony_ci            ),
910c67d6573Sopenharmony_ci        };
911c67d6573Sopenharmony_ci        *self = MaybeInst::Compiled(filled);
912c67d6573Sopenharmony_ci    }
913c67d6573Sopenharmony_ci
914c67d6573Sopenharmony_ci    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
915c67d6573Sopenharmony_ci        let half_filled = match *self {
916c67d6573Sopenharmony_ci            MaybeInst::Split => goto1,
917c67d6573Sopenharmony_ci            _ => unreachable!(
918c67d6573Sopenharmony_ci                "must be called on Split instruction, \
919c67d6573Sopenharmony_ci                 instead it was called on: {:?}",
920c67d6573Sopenharmony_ci                self
921c67d6573Sopenharmony_ci            ),
922c67d6573Sopenharmony_ci        };
923c67d6573Sopenharmony_ci        *self = MaybeInst::Split1(half_filled);
924c67d6573Sopenharmony_ci    }
925c67d6573Sopenharmony_ci
926c67d6573Sopenharmony_ci    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
927c67d6573Sopenharmony_ci        let half_filled = match *self {
928c67d6573Sopenharmony_ci            MaybeInst::Split => goto2,
929c67d6573Sopenharmony_ci            _ => unreachable!(
930c67d6573Sopenharmony_ci                "must be called on Split instruction, \
931c67d6573Sopenharmony_ci                 instead it was called on: {:?}",
932c67d6573Sopenharmony_ci                self
933c67d6573Sopenharmony_ci            ),
934c67d6573Sopenharmony_ci        };
935c67d6573Sopenharmony_ci        *self = MaybeInst::Split2(half_filled);
936c67d6573Sopenharmony_ci    }
937c67d6573Sopenharmony_ci
938c67d6573Sopenharmony_ci    fn unwrap(self) -> Inst {
939c67d6573Sopenharmony_ci        match self {
940c67d6573Sopenharmony_ci            MaybeInst::Compiled(inst) => inst,
941c67d6573Sopenharmony_ci            _ => unreachable!(
942c67d6573Sopenharmony_ci                "must be called on a compiled instruction, \
943c67d6573Sopenharmony_ci                 instead it was called on: {:?}",
944c67d6573Sopenharmony_ci                self
945c67d6573Sopenharmony_ci            ),
946c67d6573Sopenharmony_ci        }
947c67d6573Sopenharmony_ci    }
948c67d6573Sopenharmony_ci}
949c67d6573Sopenharmony_ci
950c67d6573Sopenharmony_ci#[derive(Clone, Debug)]
951c67d6573Sopenharmony_cienum InstHole {
952c67d6573Sopenharmony_ci    Save { slot: usize },
953c67d6573Sopenharmony_ci    EmptyLook { look: EmptyLook },
954c67d6573Sopenharmony_ci    Char { c: char },
955c67d6573Sopenharmony_ci    Ranges { ranges: Vec<(char, char)> },
956c67d6573Sopenharmony_ci    Bytes { start: u8, end: u8 },
957c67d6573Sopenharmony_ci}
958c67d6573Sopenharmony_ci
959c67d6573Sopenharmony_ciimpl InstHole {
960c67d6573Sopenharmony_ci    fn fill(&self, goto: InstPtr) -> Inst {
961c67d6573Sopenharmony_ci        match *self {
962c67d6573Sopenharmony_ci            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
963c67d6573Sopenharmony_ci            InstHole::EmptyLook { look } => {
964c67d6573Sopenharmony_ci                Inst::EmptyLook(InstEmptyLook { goto, look })
965c67d6573Sopenharmony_ci            }
966c67d6573Sopenharmony_ci            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
967c67d6573Sopenharmony_ci            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
968c67d6573Sopenharmony_ci                goto,
969c67d6573Sopenharmony_ci                ranges: ranges.clone().into_boxed_slice(),
970c67d6573Sopenharmony_ci            }),
971c67d6573Sopenharmony_ci            InstHole::Bytes { start, end } => {
972c67d6573Sopenharmony_ci                Inst::Bytes(InstBytes { goto, start, end })
973c67d6573Sopenharmony_ci            }
974c67d6573Sopenharmony_ci        }
975c67d6573Sopenharmony_ci    }
976c67d6573Sopenharmony_ci}
977c67d6573Sopenharmony_ci
978c67d6573Sopenharmony_cistruct CompileClass<'a, 'b> {
979c67d6573Sopenharmony_ci    c: &'a mut Compiler,
980c67d6573Sopenharmony_ci    ranges: &'b [hir::ClassUnicodeRange],
981c67d6573Sopenharmony_ci}
982c67d6573Sopenharmony_ci
983c67d6573Sopenharmony_ciimpl<'a, 'b> CompileClass<'a, 'b> {
984c67d6573Sopenharmony_ci    fn compile(mut self) -> Result {
985c67d6573Sopenharmony_ci        let mut holes = vec![];
986c67d6573Sopenharmony_ci        let mut initial_entry = None;
987c67d6573Sopenharmony_ci        let mut last_split = Hole::None;
988c67d6573Sopenharmony_ci        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
989c67d6573Sopenharmony_ci        self.c.suffix_cache.clear();
990c67d6573Sopenharmony_ci
991c67d6573Sopenharmony_ci        for (i, range) in self.ranges.iter().enumerate() {
992c67d6573Sopenharmony_ci            let is_last_range = i + 1 == self.ranges.len();
993c67d6573Sopenharmony_ci            utf8_seqs.reset(range.start(), range.end());
994c67d6573Sopenharmony_ci            let mut it = (&mut utf8_seqs).peekable();
995c67d6573Sopenharmony_ci            loop {
996c67d6573Sopenharmony_ci                let utf8_seq = match it.next() {
997c67d6573Sopenharmony_ci                    None => break,
998c67d6573Sopenharmony_ci                    Some(utf8_seq) => utf8_seq,
999c67d6573Sopenharmony_ci                };
1000c67d6573Sopenharmony_ci                if is_last_range && it.peek().is_none() {
1001c67d6573Sopenharmony_ci                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1002c67d6573Sopenharmony_ci                    holes.push(hole);
1003c67d6573Sopenharmony_ci                    self.c.fill(last_split, entry);
1004c67d6573Sopenharmony_ci                    last_split = Hole::None;
1005c67d6573Sopenharmony_ci                    if initial_entry.is_none() {
1006c67d6573Sopenharmony_ci                        initial_entry = Some(entry);
1007c67d6573Sopenharmony_ci                    }
1008c67d6573Sopenharmony_ci                } else {
1009c67d6573Sopenharmony_ci                    if initial_entry.is_none() {
1010c67d6573Sopenharmony_ci                        initial_entry = Some(self.c.insts.len());
1011c67d6573Sopenharmony_ci                    }
1012c67d6573Sopenharmony_ci                    self.c.fill_to_next(last_split);
1013c67d6573Sopenharmony_ci                    last_split = self.c.push_split_hole();
1014c67d6573Sopenharmony_ci                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1015c67d6573Sopenharmony_ci                    holes.push(hole);
1016c67d6573Sopenharmony_ci                    last_split =
1017c67d6573Sopenharmony_ci                        self.c.fill_split(last_split, Some(entry), None);
1018c67d6573Sopenharmony_ci                }
1019c67d6573Sopenharmony_ci            }
1020c67d6573Sopenharmony_ci        }
1021c67d6573Sopenharmony_ci        self.c.utf8_seqs = Some(utf8_seqs);
1022c67d6573Sopenharmony_ci        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1023c67d6573Sopenharmony_ci    }
1024c67d6573Sopenharmony_ci
1025c67d6573Sopenharmony_ci    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1026c67d6573Sopenharmony_ci        if self.c.compiled.is_reverse {
1027c67d6573Sopenharmony_ci            self.c_utf8_seq_(seq)
1028c67d6573Sopenharmony_ci        } else {
1029c67d6573Sopenharmony_ci            self.c_utf8_seq_(seq.into_iter().rev())
1030c67d6573Sopenharmony_ci        }
1031c67d6573Sopenharmony_ci    }
1032c67d6573Sopenharmony_ci
1033c67d6573Sopenharmony_ci    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1034c67d6573Sopenharmony_ci    where
1035c67d6573Sopenharmony_ci        I: IntoIterator<Item = &'r Utf8Range>,
1036c67d6573Sopenharmony_ci    {
1037c67d6573Sopenharmony_ci        // The initial instruction for each UTF-8 sequence should be the same.
1038c67d6573Sopenharmony_ci        let mut from_inst = ::std::usize::MAX;
1039c67d6573Sopenharmony_ci        let mut last_hole = Hole::None;
1040c67d6573Sopenharmony_ci        for byte_range in seq {
1041c67d6573Sopenharmony_ci            let key = SuffixCacheKey {
1042c67d6573Sopenharmony_ci                from_inst,
1043c67d6573Sopenharmony_ci                start: byte_range.start,
1044c67d6573Sopenharmony_ci                end: byte_range.end,
1045c67d6573Sopenharmony_ci            };
1046c67d6573Sopenharmony_ci            {
1047c67d6573Sopenharmony_ci                let pc = self.c.insts.len();
1048c67d6573Sopenharmony_ci                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1049c67d6573Sopenharmony_ci                    from_inst = cached_pc;
1050c67d6573Sopenharmony_ci                    continue;
1051c67d6573Sopenharmony_ci                }
1052c67d6573Sopenharmony_ci            }
1053c67d6573Sopenharmony_ci            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1054c67d6573Sopenharmony_ci            if from_inst == ::std::usize::MAX {
1055c67d6573Sopenharmony_ci                last_hole = self.c.push_hole(InstHole::Bytes {
1056c67d6573Sopenharmony_ci                    start: byte_range.start,
1057c67d6573Sopenharmony_ci                    end: byte_range.end,
1058c67d6573Sopenharmony_ci                });
1059c67d6573Sopenharmony_ci            } else {
1060c67d6573Sopenharmony_ci                self.c.push_compiled(Inst::Bytes(InstBytes {
1061c67d6573Sopenharmony_ci                    goto: from_inst,
1062c67d6573Sopenharmony_ci                    start: byte_range.start,
1063c67d6573Sopenharmony_ci                    end: byte_range.end,
1064c67d6573Sopenharmony_ci                }));
1065c67d6573Sopenharmony_ci            }
1066c67d6573Sopenharmony_ci            from_inst = self.c.insts.len().checked_sub(1).unwrap();
1067c67d6573Sopenharmony_ci            debug_assert!(from_inst < ::std::usize::MAX);
1068c67d6573Sopenharmony_ci        }
1069c67d6573Sopenharmony_ci        debug_assert!(from_inst < ::std::usize::MAX);
1070c67d6573Sopenharmony_ci        Ok(Patch { hole: last_hole, entry: from_inst })
1071c67d6573Sopenharmony_ci    }
1072c67d6573Sopenharmony_ci}
1073c67d6573Sopenharmony_ci
1074c67d6573Sopenharmony_ci/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1075c67d6573Sopenharmony_ci/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1076c67d6573Sopenharmony_ci/// The set of byte ranges looks like this:
1077c67d6573Sopenharmony_ci///
1078c67d6573Sopenharmony_ci/// [0-7F]
1079c67d6573Sopenharmony_ci/// [C2-DF][80-BF]
1080c67d6573Sopenharmony_ci/// [E0][A0-BF][80-BF]
1081c67d6573Sopenharmony_ci/// [E1-EC][80-BF][80-BF]
1082c67d6573Sopenharmony_ci/// [ED][80-9F][80-BF]
1083c67d6573Sopenharmony_ci/// [EE-EF][80-BF][80-BF]
1084c67d6573Sopenharmony_ci///
1085c67d6573Sopenharmony_ci/// Each line above translates to one alternate in the compiled regex program.
1086c67d6573Sopenharmony_ci/// However, all but one of the alternates end in the same suffix, which is
1087c67d6573Sopenharmony_ci/// a waste of an instruction. The suffix cache facilitates reusing them across
1088c67d6573Sopenharmony_ci/// alternates.
1089c67d6573Sopenharmony_ci///
1090c67d6573Sopenharmony_ci/// Note that a HashMap could be trivially used for this, but we don't need its
1091c67d6573Sopenharmony_ci/// overhead. Some small bounded space (LRU style) is more than enough.
1092c67d6573Sopenharmony_ci///
1093c67d6573Sopenharmony_ci/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1094c67d6573Sopenharmony_ci/// except it uses hashes as original indices and then compares full keys for
1095c67d6573Sopenharmony_ci/// validation against `dense` array.
1096c67d6573Sopenharmony_ci#[derive(Debug)]
1097c67d6573Sopenharmony_cistruct SuffixCache {
1098c67d6573Sopenharmony_ci    sparse: Box<[usize]>,
1099c67d6573Sopenharmony_ci    dense: Vec<SuffixCacheEntry>,
1100c67d6573Sopenharmony_ci}
1101c67d6573Sopenharmony_ci
1102c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1103c67d6573Sopenharmony_cistruct SuffixCacheEntry {
1104c67d6573Sopenharmony_ci    key: SuffixCacheKey,
1105c67d6573Sopenharmony_ci    pc: InstPtr,
1106c67d6573Sopenharmony_ci}
1107c67d6573Sopenharmony_ci
1108c67d6573Sopenharmony_ci#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1109c67d6573Sopenharmony_cistruct SuffixCacheKey {
1110c67d6573Sopenharmony_ci    from_inst: InstPtr,
1111c67d6573Sopenharmony_ci    start: u8,
1112c67d6573Sopenharmony_ci    end: u8,
1113c67d6573Sopenharmony_ci}
1114c67d6573Sopenharmony_ci
1115c67d6573Sopenharmony_ciimpl SuffixCache {
1116c67d6573Sopenharmony_ci    fn new(size: usize) -> Self {
1117c67d6573Sopenharmony_ci        SuffixCache {
1118c67d6573Sopenharmony_ci            sparse: vec![0usize; size].into(),
1119c67d6573Sopenharmony_ci            dense: Vec::with_capacity(size),
1120c67d6573Sopenharmony_ci        }
1121c67d6573Sopenharmony_ci    }
1122c67d6573Sopenharmony_ci
1123c67d6573Sopenharmony_ci    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1124c67d6573Sopenharmony_ci        let hash = self.hash(&key);
1125c67d6573Sopenharmony_ci        let pos = &mut self.sparse[hash];
1126c67d6573Sopenharmony_ci        if let Some(entry) = self.dense.get(*pos) {
1127c67d6573Sopenharmony_ci            if entry.key == key {
1128c67d6573Sopenharmony_ci                return Some(entry.pc);
1129c67d6573Sopenharmony_ci            }
1130c67d6573Sopenharmony_ci        }
1131c67d6573Sopenharmony_ci        *pos = self.dense.len();
1132c67d6573Sopenharmony_ci        self.dense.push(SuffixCacheEntry { key, pc });
1133c67d6573Sopenharmony_ci        None
1134c67d6573Sopenharmony_ci    }
1135c67d6573Sopenharmony_ci
1136c67d6573Sopenharmony_ci    fn clear(&mut self) {
1137c67d6573Sopenharmony_ci        self.dense.clear();
1138c67d6573Sopenharmony_ci    }
1139c67d6573Sopenharmony_ci
1140c67d6573Sopenharmony_ci    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1141c67d6573Sopenharmony_ci        // Basic FNV-1a hash as described:
1142c67d6573Sopenharmony_ci        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1143c67d6573Sopenharmony_ci        const FNV_PRIME: u64 = 1_099_511_628_211;
1144c67d6573Sopenharmony_ci        let mut h = 14_695_981_039_346_656_037;
1145c67d6573Sopenharmony_ci        h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1146c67d6573Sopenharmony_ci        h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1147c67d6573Sopenharmony_ci        h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1148c67d6573Sopenharmony_ci        (h as usize) % self.sparse.len()
1149c67d6573Sopenharmony_ci    }
1150c67d6573Sopenharmony_ci}
1151c67d6573Sopenharmony_ci
1152c67d6573Sopenharmony_cistruct ByteClassSet([bool; 256]);
1153c67d6573Sopenharmony_ci
1154c67d6573Sopenharmony_ciimpl ByteClassSet {
1155c67d6573Sopenharmony_ci    fn new() -> Self {
1156c67d6573Sopenharmony_ci        ByteClassSet([false; 256])
1157c67d6573Sopenharmony_ci    }
1158c67d6573Sopenharmony_ci
1159c67d6573Sopenharmony_ci    fn set_range(&mut self, start: u8, end: u8) {
1160c67d6573Sopenharmony_ci        debug_assert!(start <= end);
1161c67d6573Sopenharmony_ci        if start > 0 {
1162c67d6573Sopenharmony_ci            self.0[start as usize - 1] = true;
1163c67d6573Sopenharmony_ci        }
1164c67d6573Sopenharmony_ci        self.0[end as usize] = true;
1165c67d6573Sopenharmony_ci    }
1166c67d6573Sopenharmony_ci
1167c67d6573Sopenharmony_ci    fn set_word_boundary(&mut self) {
1168c67d6573Sopenharmony_ci        // We need to mark all ranges of bytes whose pairs result in
1169c67d6573Sopenharmony_ci        // evaluating \b differently.
1170c67d6573Sopenharmony_ci        let iswb = is_word_byte;
1171c67d6573Sopenharmony_ci        let mut b1: u16 = 0;
1172c67d6573Sopenharmony_ci        let mut b2: u16;
1173c67d6573Sopenharmony_ci        while b1 <= 255 {
1174c67d6573Sopenharmony_ci            b2 = b1 + 1;
1175c67d6573Sopenharmony_ci            while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1176c67d6573Sopenharmony_ci                b2 += 1;
1177c67d6573Sopenharmony_ci            }
1178c67d6573Sopenharmony_ci            self.set_range(b1 as u8, (b2 - 1) as u8);
1179c67d6573Sopenharmony_ci            b1 = b2;
1180c67d6573Sopenharmony_ci        }
1181c67d6573Sopenharmony_ci    }
1182c67d6573Sopenharmony_ci
1183c67d6573Sopenharmony_ci    fn byte_classes(&self) -> Vec<u8> {
1184c67d6573Sopenharmony_ci        // N.B. If you're debugging the DFA, it's useful to simply return
1185c67d6573Sopenharmony_ci        // `(0..256).collect()`, which effectively removes the byte classes
1186c67d6573Sopenharmony_ci        // and makes the transitions easier to read.
1187c67d6573Sopenharmony_ci        // (0usize..256).map(|x| x as u8).collect()
1188c67d6573Sopenharmony_ci        let mut byte_classes = vec![0; 256];
1189c67d6573Sopenharmony_ci        let mut class = 0u8;
1190c67d6573Sopenharmony_ci        let mut i = 0;
1191c67d6573Sopenharmony_ci        loop {
1192c67d6573Sopenharmony_ci            byte_classes[i] = class as u8;
1193c67d6573Sopenharmony_ci            if i >= 255 {
1194c67d6573Sopenharmony_ci                break;
1195c67d6573Sopenharmony_ci            }
1196c67d6573Sopenharmony_ci            if self.0[i] {
1197c67d6573Sopenharmony_ci                class = class.checked_add(1).unwrap();
1198c67d6573Sopenharmony_ci            }
1199c67d6573Sopenharmony_ci            i += 1;
1200c67d6573Sopenharmony_ci        }
1201c67d6573Sopenharmony_ci        byte_classes
1202c67d6573Sopenharmony_ci    }
1203c67d6573Sopenharmony_ci}
1204c67d6573Sopenharmony_ci
1205c67d6573Sopenharmony_ciimpl fmt::Debug for ByteClassSet {
1206c67d6573Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1207c67d6573Sopenharmony_ci        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1208c67d6573Sopenharmony_ci    }
1209c67d6573Sopenharmony_ci}
1210c67d6573Sopenharmony_ci
1211c67d6573Sopenharmony_cifn u32_to_usize(n: u32) -> usize {
1212c67d6573Sopenharmony_ci    // In case usize is less than 32 bits, we need to guard against overflow.
1213c67d6573Sopenharmony_ci    // On most platforms this compiles to nothing.
1214c67d6573Sopenharmony_ci    // TODO Use `std::convert::TryFrom` once it's stable.
1215c67d6573Sopenharmony_ci    if (n as u64) > (::std::usize::MAX as u64) {
1216c67d6573Sopenharmony_ci        panic!("BUG: {} is too big to be pointer sized", n)
1217c67d6573Sopenharmony_ci    }
1218c67d6573Sopenharmony_ci    n as usize
1219c67d6573Sopenharmony_ci}
1220c67d6573Sopenharmony_ci
1221c67d6573Sopenharmony_ci#[cfg(test)]
1222c67d6573Sopenharmony_cimod tests {
1223c67d6573Sopenharmony_ci    use super::ByteClassSet;
1224c67d6573Sopenharmony_ci
1225c67d6573Sopenharmony_ci    #[test]
1226c67d6573Sopenharmony_ci    fn byte_classes() {
1227c67d6573Sopenharmony_ci        let mut set = ByteClassSet::new();
1228c67d6573Sopenharmony_ci        set.set_range(b'a', b'z');
1229c67d6573Sopenharmony_ci        let classes = set.byte_classes();
1230c67d6573Sopenharmony_ci        assert_eq!(classes[0], 0);
1231c67d6573Sopenharmony_ci        assert_eq!(classes[1], 0);
1232c67d6573Sopenharmony_ci        assert_eq!(classes[2], 0);
1233c67d6573Sopenharmony_ci        assert_eq!(classes[b'a' as usize - 1], 0);
1234c67d6573Sopenharmony_ci        assert_eq!(classes[b'a' as usize], 1);
1235c67d6573Sopenharmony_ci        assert_eq!(classes[b'm' as usize], 1);
1236c67d6573Sopenharmony_ci        assert_eq!(classes[b'z' as usize], 1);
1237c67d6573Sopenharmony_ci        assert_eq!(classes[b'z' as usize + 1], 2);
1238c67d6573Sopenharmony_ci        assert_eq!(classes[254], 2);
1239c67d6573Sopenharmony_ci        assert_eq!(classes[255], 2);
1240c67d6573Sopenharmony_ci
1241c67d6573Sopenharmony_ci        let mut set = ByteClassSet::new();
1242c67d6573Sopenharmony_ci        set.set_range(0, 2);
1243c67d6573Sopenharmony_ci        set.set_range(4, 6);
1244c67d6573Sopenharmony_ci        let classes = set.byte_classes();
1245c67d6573Sopenharmony_ci        assert_eq!(classes[0], 0);
1246c67d6573Sopenharmony_ci        assert_eq!(classes[1], 0);
1247c67d6573Sopenharmony_ci        assert_eq!(classes[2], 0);
1248c67d6573Sopenharmony_ci        assert_eq!(classes[3], 1);
1249c67d6573Sopenharmony_ci        assert_eq!(classes[4], 2);
1250c67d6573Sopenharmony_ci        assert_eq!(classes[5], 2);
1251c67d6573Sopenharmony_ci        assert_eq!(classes[6], 2);
1252c67d6573Sopenharmony_ci        assert_eq!(classes[7], 3);
1253c67d6573Sopenharmony_ci        assert_eq!(classes[255], 3);
1254c67d6573Sopenharmony_ci    }
1255c67d6573Sopenharmony_ci
1256c67d6573Sopenharmony_ci    #[test]
1257c67d6573Sopenharmony_ci    fn full_byte_classes() {
1258c67d6573Sopenharmony_ci        let mut set = ByteClassSet::new();
1259c67d6573Sopenharmony_ci        for i in 0..256u16 {
1260c67d6573Sopenharmony_ci            set.set_range(i as u8, i as u8);
1261c67d6573Sopenharmony_ci        }
1262c67d6573Sopenharmony_ci        assert_eq!(set.byte_classes().len(), 256);
1263c67d6573Sopenharmony_ci    }
1264c67d6573Sopenharmony_ci}
1265