1use crate::ast;
2use crate::hir;
3
4use crate::Result;
5
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
///
/// This type combines the builder options for both the
/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
/// and the
/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
///
/// Each setter on this type forwards its option to exactly one of the two
/// underlying builders; `build` then builds both and pairs the results in a
/// [`Parser`](struct.Parser.html).
#[derive(Clone, Debug, Default)]
pub struct ParserBuilder {
    /// Configuration for the AST parser. Receives the `nest_limit`, `octal`
    /// and `ignore_whitespace` options.
    ast: ast::parse::ParserBuilder,
    /// Configuration for the HIR translator. Receives the
    /// `allow_invalid_utf8`, `case_insensitive`, `multi_line`,
    /// `dot_matches_new_line`, `swap_greed` and `unicode` options.
    hir: hir::translate::TranslatorBuilder,
}
19
20impl ParserBuilder {
21    /// Create a new parser builder with a default configuration.
22    pub fn new() -> ParserBuilder {
23        ParserBuilder::default()
24    }
25
26    /// Build a parser from this configuration with the given pattern.
27    pub fn build(&self) -> Parser {
28        Parser { ast: self.ast.build(), hir: self.hir.build() }
29    }
30
31    /// Set the nesting limit for this parser.
32    ///
33    /// The nesting limit controls how deep the abstract syntax tree is allowed
34    /// to be. If the AST exceeds the given limit (e.g., with too many nested
35    /// groups), then an error is returned by the parser.
36    ///
37    /// The purpose of this limit is to act as a heuristic to prevent stack
38    /// overflow for consumers that do structural induction on an `Ast` using
39    /// explicit recursion. While this crate never does this (instead using
40    /// constant stack space and moving the call stack to the heap), other
41    /// crates may.
42    ///
43    /// This limit is not checked until the entire Ast is parsed. Therefore,
44    /// if callers want to put a limit on the amount of heap space used, then
45    /// they should impose a limit on the length, in bytes, of the concrete
46    /// pattern string. In particular, this is viable since this parser
47    /// implementation will limit itself to heap space proportional to the
48    /// length of the pattern string.
49    ///
50    /// Note that a nest limit of `0` will return a nest limit error for most
51    /// patterns but not all. For example, a nest limit of `0` permits `a` but
52    /// not `ab`, since `ab` requires a concatenation, which results in a nest
53    /// depth of `1`. In general, a nest limit is not something that manifests
54    /// in an obvious way in the concrete syntax, therefore, it should not be
55    /// used in a granular way.
56    pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
57        self.ast.nest_limit(limit);
58        self
59    }
60
61    /// Whether to support octal syntax or not.
62    ///
63    /// Octal syntax is a little-known way of uttering Unicode codepoints in
64    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
65    /// `\141` are all equivalent regular expressions, where the last example
66    /// shows octal syntax.
67    ///
68    /// While supporting octal syntax isn't in and of itself a problem, it does
69    /// make good error messages harder. That is, in PCRE based regex engines,
70    /// syntax like `\0` invokes a backreference, which is explicitly
71    /// unsupported in Rust's regex engine. However, many users expect it to
72    /// be supported. Therefore, when octal support is disabled, the error
73    /// message will explicitly mention that backreferences aren't supported.
74    ///
75    /// Octal syntax is disabled by default.
76    pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
77        self.ast.octal(yes);
78        self
79    }
80
81    /// When enabled, the parser will permit the construction of a regular
82    /// expression that may match invalid UTF-8.
83    ///
84    /// When disabled (the default), the parser is guaranteed to produce
85    /// an expression that will only ever match valid UTF-8 (otherwise, the
86    /// parser will return an error).
87    ///
88    /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
89    /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
90    /// the parser to return an error. Namely, a negated ASCII word boundary
91    /// can result in matching positions that aren't valid UTF-8 boundaries.
92    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
93        self.hir.allow_invalid_utf8(yes);
94        self
95    }
96
97    /// Enable verbose mode in the regular expression.
98    ///
99    /// When enabled, verbose mode permits insignificant whitespace in many
100    /// places in the regular expression, as well as comments. Comments are
101    /// started using `#` and continue until the end of the line.
102    ///
103    /// By default, this is disabled. It may be selectively enabled in the
104    /// regular expression by using the `x` flag regardless of this setting.
105    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
106        self.ast.ignore_whitespace(yes);
107        self
108    }
109
110    /// Enable or disable the case insensitive flag by default.
111    ///
112    /// By default this is disabled. It may alternatively be selectively
113    /// enabled in the regular expression itself via the `i` flag.
114    pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
115        self.hir.case_insensitive(yes);
116        self
117    }
118
119    /// Enable or disable the multi-line matching flag by default.
120    ///
121    /// By default this is disabled. It may alternatively be selectively
122    /// enabled in the regular expression itself via the `m` flag.
123    pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
124        self.hir.multi_line(yes);
125        self
126    }
127
128    /// Enable or disable the "dot matches any character" flag by default.
129    ///
130    /// By default this is disabled. It may alternatively be selectively
131    /// enabled in the regular expression itself via the `s` flag.
132    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
133        self.hir.dot_matches_new_line(yes);
134        self
135    }
136
137    /// Enable or disable the "swap greed" flag by default.
138    ///
139    /// By default this is disabled. It may alternatively be selectively
140    /// enabled in the regular expression itself via the `U` flag.
141    pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
142        self.hir.swap_greed(yes);
143        self
144    }
145
146    /// Enable or disable the Unicode flag (`u`) by default.
147    ///
148    /// By default this is **enabled**. It may alternatively be selectively
149    /// disabled in the regular expression itself via the `u` flag.
150    ///
151    /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
152    /// default), a regular expression will fail to parse if Unicode mode is
153    /// disabled and a sub-expression could possibly match invalid UTF-8.
154    pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
155        self.hir.unicode(yes);
156        self
157    }
158}
159
/// A convenience parser for regular expressions.
///
/// This parser takes as input a regular expression pattern string (the
/// "concrete syntax") and returns a high-level intermediate representation
/// (the HIR) suitable for most types of analysis. In particular, this parser
/// hides the intermediate state of producing an AST (the "abstract syntax").
/// The AST is itself far more complex than the HIR, so this parser serves as a
/// convenience for never having to deal with it at all.
///
/// If callers have more fine grained use cases that need an AST, then please
/// see the [`ast::parse`](ast/parse/index.html) module.
///
/// A `Parser` can be configured in more detail via a
/// [`ParserBuilder`](struct.ParserBuilder.html).
#[derive(Clone, Debug)]
pub struct Parser {
    /// First stage: parses the pattern string into an AST.
    ast: ast::parse::Parser,
    /// Second stage: translates the AST (together with the original pattern
    /// string) into the HIR.
    hir: hir::translate::Translator,
}
179
180impl Parser {
181    /// Create a new parser with a default configuration.
182    ///
183    /// The parser can be run with `parse` method. The parse method returns
184    /// a high level intermediate representation of the given regular
185    /// expression.
186    ///
187    /// To set configuration options on the parser, use
188    /// [`ParserBuilder`](struct.ParserBuilder.html).
189    pub fn new() -> Parser {
190        ParserBuilder::new().build()
191    }
192
193    /// Parse the regular expression into a high level intermediate
194    /// representation.
195    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
196        let ast = self.ast.parse(pattern)?;
197        let hir = self.hir.translate(pattern, &ast)?;
198        Ok(hir)
199    }
200}
201