xref: /third_party/rust/crates/regex/src/lib.rs (revision c67d6573)
1c67d6573Sopenharmony_ci/*!
2c67d6573Sopenharmony_ciThis crate provides a library for parsing, compiling, and executing regular
3c67d6573Sopenharmony_ciexpressions. Its syntax is similar to Perl-style regular expressions, but lacks
4c67d6573Sopenharmony_cia few features like look around and backreferences. In exchange, all searches
5c67d6573Sopenharmony_ciexecute in linear time with respect to the size of the regular expression and
6c67d6573Sopenharmony_cisearch text.
7c67d6573Sopenharmony_ci
8c67d6573Sopenharmony_ciThis crate's documentation provides some simple examples, describes
9c67d6573Sopenharmony_ci[Unicode support](#unicode) and exhaustively lists the
10c67d6573Sopenharmony_ci[supported syntax](#syntax).
11c67d6573Sopenharmony_ci
12c67d6573Sopenharmony_ciFor more specific details on the API for regular expressions, please see the
13c67d6573Sopenharmony_cidocumentation for the [`Regex`](struct.Regex.html) type.
14c67d6573Sopenharmony_ci
15c67d6573Sopenharmony_ci# Usage
16c67d6573Sopenharmony_ci
17c67d6573Sopenharmony_ciThis crate is [on crates.io](https://crates.io/crates/regex) and can be
18c67d6573Sopenharmony_ciused by adding `regex` to your dependencies in your project's `Cargo.toml`.
19c67d6573Sopenharmony_ci
20c67d6573Sopenharmony_ci```toml
21c67d6573Sopenharmony_ci[dependencies]
22c67d6573Sopenharmony_ciregex = "1"
23c67d6573Sopenharmony_ci```
24c67d6573Sopenharmony_ci
25c67d6573Sopenharmony_ci# Example: find a date
26c67d6573Sopenharmony_ci
27c67d6573Sopenharmony_ciGeneral use of regular expressions in this package involves compiling an
28c67d6573Sopenharmony_ciexpression and then using it to search, split or replace text. For example,
29c67d6573Sopenharmony_cito confirm that some text resembles a date:
30c67d6573Sopenharmony_ci
31c67d6573Sopenharmony_ci```rust
32c67d6573Sopenharmony_ciuse regex::Regex;
33c67d6573Sopenharmony_cilet re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
34c67d6573Sopenharmony_ciassert!(re.is_match("2014-01-01"));
35c67d6573Sopenharmony_ci```
36c67d6573Sopenharmony_ci
37c67d6573Sopenharmony_ciNotice the use of the `^` and `$` anchors. In this crate, every expression
38c67d6573Sopenharmony_ciis executed with an implicit `.*?` at the beginning and end, which allows
39c67d6573Sopenharmony_ciit to match anywhere in the text. Anchors can be used to ensure that the
40c67d6573Sopenharmony_cifull text matches an expression.
41c67d6573Sopenharmony_ci
42c67d6573Sopenharmony_ciThis example also demonstrates the utility of
43c67d6573Sopenharmony_ci[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals)
44c67d6573Sopenharmony_ciin Rust, which
45c67d6573Sopenharmony_ciare just like regular strings except they are prefixed with an `r` and do
46c67d6573Sopenharmony_cinot process any escape sequences. For example, `"\\d"` is the same
47c67d6573Sopenharmony_ciexpression as `r"\d"`.
48c67d6573Sopenharmony_ci
49c67d6573Sopenharmony_ci# Example: Avoid compiling the same regex in a loop
50c67d6573Sopenharmony_ci
51c67d6573Sopenharmony_ciIt is an anti-pattern to compile the same regular expression in a loop
52c67d6573Sopenharmony_cisince compilation is typically expensive. (It takes anywhere from a few
53c67d6573Sopenharmony_cimicroseconds to a few **milliseconds** depending on the size of the
54c67d6573Sopenharmony_ciregex.) Not only is compilation itself expensive, but this also prevents
55c67d6573Sopenharmony_cioptimizations that reuse allocations internally to the matching engines.
56c67d6573Sopenharmony_ci
57c67d6573Sopenharmony_ciIn Rust, it can sometimes be a pain to pass regular expressions around if
58c67d6573Sopenharmony_cithey're used from inside a helper function. Instead, we recommend using the
59c67d6573Sopenharmony_ci[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that
60c67d6573Sopenharmony_ciregular expressions are compiled exactly once.
61c67d6573Sopenharmony_ci
62c67d6573Sopenharmony_ciFor example:
63c67d6573Sopenharmony_ci
64c67d6573Sopenharmony_ci```rust
65c67d6573Sopenharmony_ciuse lazy_static::lazy_static;
66c67d6573Sopenharmony_ciuse regex::Regex;
67c67d6573Sopenharmony_ci
68c67d6573Sopenharmony_cifn some_helper_function(text: &str) -> bool {
69c67d6573Sopenharmony_ci    lazy_static! {
70c67d6573Sopenharmony_ci        static ref RE: Regex = Regex::new("...").unwrap();
71c67d6573Sopenharmony_ci    }
72c67d6573Sopenharmony_ci    RE.is_match(text)
73c67d6573Sopenharmony_ci}
74c67d6573Sopenharmony_ci
75c67d6573Sopenharmony_cifn main() {}
76c67d6573Sopenharmony_ci```
77c67d6573Sopenharmony_ci
78c67d6573Sopenharmony_ciSpecifically, in this example, the regex will be compiled when it is used for
79c67d6573Sopenharmony_cithe first time. On subsequent uses, it will reuse the previous compilation.
80c67d6573Sopenharmony_ci
81c67d6573Sopenharmony_ci# Example: iterating over capture groups
82c67d6573Sopenharmony_ci
83c67d6573Sopenharmony_ciThis crate provides convenient iterators for matching an expression
84c67d6573Sopenharmony_cirepeatedly against a search string to find successive non-overlapping
85c67d6573Sopenharmony_cimatches. For example, to find all dates in a string and be able to access
86c67d6573Sopenharmony_cithem by their component pieces:
87c67d6573Sopenharmony_ci
88c67d6573Sopenharmony_ci```rust
89c67d6573Sopenharmony_ci# use regex::Regex;
90c67d6573Sopenharmony_ci# fn main() {
91c67d6573Sopenharmony_cilet re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
92c67d6573Sopenharmony_cilet text = "2012-03-14, 2013-01-01 and 2014-07-05";
93c67d6573Sopenharmony_cifor cap in re.captures_iter(text) {
94c67d6573Sopenharmony_ci    println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]);
95c67d6573Sopenharmony_ci}
96c67d6573Sopenharmony_ci// Output:
97c67d6573Sopenharmony_ci// Month: 03 Day: 14 Year: 2012
98c67d6573Sopenharmony_ci// Month: 01 Day: 01 Year: 2013
99c67d6573Sopenharmony_ci// Month: 07 Day: 05 Year: 2014
100c67d6573Sopenharmony_ci# }
101c67d6573Sopenharmony_ci```
102c67d6573Sopenharmony_ci
103c67d6573Sopenharmony_ciNotice that the year is in the capture group indexed at `1`. This is
104c67d6573Sopenharmony_cibecause the *entire match* is stored in the capture group at index `0`.
105c67d6573Sopenharmony_ci
106c67d6573Sopenharmony_ci# Example: replacement with named capture groups
107c67d6573Sopenharmony_ci
108c67d6573Sopenharmony_ciBuilding on the previous example, perhaps we'd like to rearrange the date
109c67d6573Sopenharmony_ciformats. This can be done with text replacement. But to make the code
110c67d6573Sopenharmony_ciclearer, we can *name* our capture groups and use those names as variables
111c67d6573Sopenharmony_ciin our replacement text:
112c67d6573Sopenharmony_ci
113c67d6573Sopenharmony_ci```rust
114c67d6573Sopenharmony_ci# use regex::Regex;
115c67d6573Sopenharmony_ci# fn main() {
116c67d6573Sopenharmony_cilet re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
117c67d6573Sopenharmony_cilet before = "2012-03-14, 2013-01-01 and 2014-07-05";
118c67d6573Sopenharmony_cilet after = re.replace_all(before, "$m/$d/$y");
119c67d6573Sopenharmony_ciassert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
120c67d6573Sopenharmony_ci# }
121c67d6573Sopenharmony_ci```
122c67d6573Sopenharmony_ci
123c67d6573Sopenharmony_ciThe `replace` methods are actually polymorphic in the replacement, which
124c67d6573Sopenharmony_ciprovides more flexibility than is seen here. (See the documentation for
125c67d6573Sopenharmony_ci`Regex::replace` for more details.)
126c67d6573Sopenharmony_ci
127c67d6573Sopenharmony_ciNote that if your regex gets complicated, you can use the `x` flag to
128c67d6573Sopenharmony_cienable insignificant whitespace mode, which also lets you write comments:
129c67d6573Sopenharmony_ci
130c67d6573Sopenharmony_ci```rust
131c67d6573Sopenharmony_ci# use regex::Regex;
132c67d6573Sopenharmony_ci# fn main() {
133c67d6573Sopenharmony_cilet re = Regex::new(r"(?x)
134c67d6573Sopenharmony_ci  (?P<y>\d{4}) # the year
135c67d6573Sopenharmony_ci  -
136c67d6573Sopenharmony_ci  (?P<m>\d{2}) # the month
137c67d6573Sopenharmony_ci  -
138c67d6573Sopenharmony_ci  (?P<d>\d{2}) # the day
139c67d6573Sopenharmony_ci").unwrap();
140c67d6573Sopenharmony_cilet before = "2012-03-14, 2013-01-01 and 2014-07-05";
141c67d6573Sopenharmony_cilet after = re.replace_all(before, "$m/$d/$y");
142c67d6573Sopenharmony_ciassert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
143c67d6573Sopenharmony_ci# }
144c67d6573Sopenharmony_ci```
145c67d6573Sopenharmony_ci
146c67d6573Sopenharmony_ciIf you wish to match against whitespace in this mode, you can still use `\s`,
147c67d6573Sopenharmony_ci`\n`, `\t`, etc. For escaping a single space character, you can escape it
148c67d6573Sopenharmony_cidirectly with `\ `, use its hex character code `\x20` or temporarily disable
149c67d6573Sopenharmony_cithe `x` flag, e.g., `(?-x: )`.
150c67d6573Sopenharmony_ci
151c67d6573Sopenharmony_ci# Example: match multiple regular expressions simultaneously
152c67d6573Sopenharmony_ci
153c67d6573Sopenharmony_ciThis demonstrates how to use a `RegexSet` to match multiple (possibly
154c67d6573Sopenharmony_cioverlapping) regular expressions in a single scan of the search text:
155c67d6573Sopenharmony_ci
156c67d6573Sopenharmony_ci```rust
157c67d6573Sopenharmony_ciuse regex::RegexSet;
158c67d6573Sopenharmony_ci
159c67d6573Sopenharmony_cilet set = RegexSet::new(&[
160c67d6573Sopenharmony_ci    r"\w+",
161c67d6573Sopenharmony_ci    r"\d+",
162c67d6573Sopenharmony_ci    r"\pL+",
163c67d6573Sopenharmony_ci    r"foo",
164c67d6573Sopenharmony_ci    r"bar",
165c67d6573Sopenharmony_ci    r"barfoo",
166c67d6573Sopenharmony_ci    r"foobar",
167c67d6573Sopenharmony_ci]).unwrap();
168c67d6573Sopenharmony_ci
169c67d6573Sopenharmony_ci// Iterate over and collect all of the matches.
170c67d6573Sopenharmony_cilet matches: Vec<_> = set.matches("foobar").into_iter().collect();
171c67d6573Sopenharmony_ciassert_eq!(matches, vec![0, 2, 3, 4, 6]);
172c67d6573Sopenharmony_ci
173c67d6573Sopenharmony_ci// You can also test whether a particular regex matched:
174c67d6573Sopenharmony_cilet matches = set.matches("foobar");
175c67d6573Sopenharmony_ciassert!(!matches.matched(5));
176c67d6573Sopenharmony_ciassert!(matches.matched(6));
177c67d6573Sopenharmony_ci```
178c67d6573Sopenharmony_ci
179c67d6573Sopenharmony_ci# Pay for what you use
180c67d6573Sopenharmony_ci
181c67d6573Sopenharmony_ciWith respect to searching text with a regular expression, there are three
182c67d6573Sopenharmony_ciquestions that can be asked:
183c67d6573Sopenharmony_ci
184c67d6573Sopenharmony_ci1. Does the text match this expression?
185c67d6573Sopenharmony_ci2. If so, where does it match?
186c67d6573Sopenharmony_ci3. Where did the capturing groups match?
187c67d6573Sopenharmony_ci
188c67d6573Sopenharmony_ciGenerally speaking, this crate could provide a function to answer only #3,
189c67d6573Sopenharmony_ciwhich would subsume #1 and #2 automatically. However, it can be significantly
190c67d6573Sopenharmony_cimore expensive to compute the location of capturing group matches, so it's best
191c67d6573Sopenharmony_cinot to do it if you don't need to.
192c67d6573Sopenharmony_ci
193c67d6573Sopenharmony_ciTherefore, only use what you need. For example, don't use `find` if you
194c67d6573Sopenharmony_cionly need to test if an expression matches a string. (Use `is_match`
195c67d6573Sopenharmony_ciinstead.)
196c67d6573Sopenharmony_ci
197c67d6573Sopenharmony_ci# Unicode
198c67d6573Sopenharmony_ci
199c67d6573Sopenharmony_ciThis implementation executes regular expressions **only** on valid UTF-8
200c67d6573Sopenharmony_ciwhile exposing match locations as byte indices into the search string. (To
201c67d6573Sopenharmony_cirelax this restriction, use the [`bytes`](bytes/index.html) sub-module.)
202c67d6573Sopenharmony_ci
203c67d6573Sopenharmony_ciOnly simple case folding is supported. Namely, when matching
204c67d6573Sopenharmony_cicase-insensitively, the characters are first mapped using the "simple" case
205c67d6573Sopenharmony_cifolding rules defined by Unicode.
206c67d6573Sopenharmony_ci
207c67d6573Sopenharmony_ciRegular expressions themselves are **only** interpreted as a sequence of
208c67d6573Sopenharmony_ciUnicode scalar values. This means you can use Unicode characters directly
209c67d6573Sopenharmony_ciin your expression:
210c67d6573Sopenharmony_ci
211c67d6573Sopenharmony_ci```rust
212c67d6573Sopenharmony_ci# use regex::Regex;
213c67d6573Sopenharmony_ci# fn main() {
214c67d6573Sopenharmony_cilet re = Regex::new(r"(?i)Δ+").unwrap();
215c67d6573Sopenharmony_cilet mat = re.find("ΔδΔ").unwrap();
216c67d6573Sopenharmony_ciassert_eq!((mat.start(), mat.end()), (0, 6));
217c67d6573Sopenharmony_ci# }
218c67d6573Sopenharmony_ci```
219c67d6573Sopenharmony_ci
220c67d6573Sopenharmony_ciMost features of the regular expressions in this crate are Unicode aware. Here
221c67d6573Sopenharmony_ciare some examples:
222c67d6573Sopenharmony_ci
223c67d6573Sopenharmony_ci* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`.
224c67d6573Sopenharmony_ci  (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.)
225c67d6573Sopenharmony_ci* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms
226c67d6573Sopenharmony_ci  of whitespace categorized by Unicode.
227c67d6573Sopenharmony_ci* `\b` matches a Unicode word boundary.
228c67d6573Sopenharmony_ci* Negated character classes like `[^a]` match all Unicode scalar values except
229c67d6573Sopenharmony_ci  for `a`.
230c67d6573Sopenharmony_ci* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only
231c67d6573Sopenharmony_ci  recognize `\n` and not any of the other forms of line terminators defined
232c67d6573Sopenharmony_ci  by Unicode.
233c67d6573Sopenharmony_ci
234c67d6573Sopenharmony_ciUnicode general categories, scripts, script extensions, ages and a smattering
235c67d6573Sopenharmony_ciof boolean properties are available as character classes. For example, you can
236c67d6573Sopenharmony_cimatch a sequence of numerals, Greek or Cherokee letters:
237c67d6573Sopenharmony_ci
238c67d6573Sopenharmony_ci```rust
239c67d6573Sopenharmony_ci# use regex::Regex;
240c67d6573Sopenharmony_ci# fn main() {
241c67d6573Sopenharmony_cilet re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap();
242c67d6573Sopenharmony_cilet mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap();
243c67d6573Sopenharmony_ciassert_eq!((mat.start(), mat.end()), (3, 23));
244c67d6573Sopenharmony_ci# }
245c67d6573Sopenharmony_ci```
246c67d6573Sopenharmony_ci
247c67d6573Sopenharmony_ciFor a more detailed breakdown of Unicode support with respect to
248c67d6573Sopenharmony_ci[UTS#18](https://unicode.org/reports/tr18/),
249c67d6573Sopenharmony_ciplease see the
250c67d6573Sopenharmony_ci[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md)
251c67d6573Sopenharmony_cidocument in the root of the regex repository.
252c67d6573Sopenharmony_ci
253c67d6573Sopenharmony_ci# Opt out of Unicode support
254c67d6573Sopenharmony_ci
255c67d6573Sopenharmony_ciThe `bytes` sub-module provides a `Regex` type that can be used to match
256c67d6573Sopenharmony_cion `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
257c67d6573Sopenharmony_cithe main `Regex` type. However, this behavior can be disabled by turning
258c67d6573Sopenharmony_cioff the `u` flag, even if doing so could result in matching invalid UTF-8.
259c67d6573Sopenharmony_ciFor example, when the `u` flag is disabled, `.` will match any byte instead
260c67d6573Sopenharmony_ciof any Unicode scalar value.
261c67d6573Sopenharmony_ci
262c67d6573Sopenharmony_ciDisabling the `u` flag is also possible with the standard `&str`-based `Regex`
263c67d6573Sopenharmony_citype, but it is only allowed where the UTF-8 invariant is maintained. For
264c67d6573Sopenharmony_ciexample, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an
265c67d6573Sopenharmony_ci`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte
266c67d6573Sopenharmony_ci`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based
267c67d6573Sopenharmony_ciregexes.
268c67d6573Sopenharmony_ci
269c67d6573Sopenharmony_ciFinally, since Unicode support requires bundling large Unicode data
270c67d6573Sopenharmony_citables, this crate exposes knobs to disable the compilation of those
271c67d6573Sopenharmony_cidata tables, which can be useful for shrinking binary size and reducing
272c67d6573Sopenharmony_cicompilation times. For details on how to do that, see the section on [crate
273c67d6573Sopenharmony_cifeatures](#crate-features).
274c67d6573Sopenharmony_ci
275c67d6573Sopenharmony_ci# Syntax
276c67d6573Sopenharmony_ci
277c67d6573Sopenharmony_ciThe syntax supported in this crate is documented below.
278c67d6573Sopenharmony_ci
279c67d6573Sopenharmony_ciNote that the regular expression parser and abstract syntax are exposed in
280c67d6573Sopenharmony_cia separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
281c67d6573Sopenharmony_ci
282c67d6573Sopenharmony_ci## Matching one character
283c67d6573Sopenharmony_ci
284c67d6573Sopenharmony_ci<pre class="rust">
285c67d6573Sopenharmony_ci.             any character except new line (includes new line with s flag)
286c67d6573Sopenharmony_ci\d            digit (\p{Nd})
287c67d6573Sopenharmony_ci\D            not digit
288c67d6573Sopenharmony_ci\pN           One-letter name Unicode character class
289c67d6573Sopenharmony_ci\p{Greek}     Unicode character class (general category or script)
290c67d6573Sopenharmony_ci\PN           Negated one-letter name Unicode character class
291c67d6573Sopenharmony_ci\P{Greek}     negated Unicode character class (general category or script)
292c67d6573Sopenharmony_ci</pre>
293c67d6573Sopenharmony_ci
294c67d6573Sopenharmony_ci### Character classes
295c67d6573Sopenharmony_ci
296c67d6573Sopenharmony_ci<pre class="rust">
297c67d6573Sopenharmony_ci[xyz]         A character class matching either x, y or z (union).
298c67d6573Sopenharmony_ci[^xyz]        A character class matching any character except x, y and z.
299c67d6573Sopenharmony_ci[a-z]         A character class matching any character in range a-z.
300c67d6573Sopenharmony_ci[[:alpha:]]   ASCII character class ([A-Za-z])
301c67d6573Sopenharmony_ci[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
302c67d6573Sopenharmony_ci[x[^xyz]]     Nested/grouping character class (matching any character except y and z)
303c67d6573Sopenharmony_ci[a-y&&xyz]    Intersection (matching x or y)
304c67d6573Sopenharmony_ci[0-9&&[^4]]   Subtraction using intersection and negation (matching 0-9 except 4)
305c67d6573Sopenharmony_ci[0-9--4]      Direct subtraction (matching 0-9 except 4)
306c67d6573Sopenharmony_ci[a-g~~b-h]    Symmetric difference (matching a and h only)
307c67d6573Sopenharmony_ci[\[\]]        Escaping in character classes (matching [ or ])
308c67d6573Sopenharmony_ci</pre>
309c67d6573Sopenharmony_ci
310c67d6573Sopenharmony_ciAny named character class may appear inside a bracketed `[...]` character
311c67d6573Sopenharmony_ciclass. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII
312c67d6573Sopenharmony_cidigit. `[\p{Greek}&&\pL]` matches Greek letters.
313c67d6573Sopenharmony_ci
314c67d6573Sopenharmony_ciPrecedence in character classes, from most binding to least:
315c67d6573Sopenharmony_ci
316c67d6573Sopenharmony_ci1. Ranges: `a-cd` == `[a-c]d`
317c67d6573Sopenharmony_ci2. Union: `ab&&bc` == `[ab]&&[bc]`
318c67d6573Sopenharmony_ci3. Intersection: `^a-z&&b` == `^[a-z&&b]`
319c67d6573Sopenharmony_ci4. Negation
320c67d6573Sopenharmony_ci
321c67d6573Sopenharmony_ci## Composites
322c67d6573Sopenharmony_ci
323c67d6573Sopenharmony_ci<pre class="rust">
324c67d6573Sopenharmony_cixy    concatenation (x followed by y)
325c67d6573Sopenharmony_cix|y   alternation (x or y, prefer x)
326c67d6573Sopenharmony_ci</pre>
327c67d6573Sopenharmony_ci
328c67d6573Sopenharmony_ci## Repetitions
329c67d6573Sopenharmony_ci
330c67d6573Sopenharmony_ci<pre class="rust">
331c67d6573Sopenharmony_cix*        zero or more of x (greedy)
332c67d6573Sopenharmony_cix+        one or more of x (greedy)
333c67d6573Sopenharmony_cix?        zero or one of x (greedy)
334c67d6573Sopenharmony_cix*?       zero or more of x (ungreedy/lazy)
335c67d6573Sopenharmony_cix+?       one or more of x (ungreedy/lazy)
336c67d6573Sopenharmony_cix??       zero or one of x (ungreedy/lazy)
337c67d6573Sopenharmony_cix{n,m}    at least n x and at most m x (greedy)
338c67d6573Sopenharmony_cix{n,}     at least n x (greedy)
339c67d6573Sopenharmony_cix{n}      exactly n x
340c67d6573Sopenharmony_cix{n,m}?   at least n x and at most m x (ungreedy/lazy)
341c67d6573Sopenharmony_cix{n,}?    at least n x (ungreedy/lazy)
342c67d6573Sopenharmony_cix{n}?     exactly n x
343c67d6573Sopenharmony_ci</pre>
344c67d6573Sopenharmony_ci
345c67d6573Sopenharmony_ci## Empty matches
346c67d6573Sopenharmony_ci
347c67d6573Sopenharmony_ci<pre class="rust">
348c67d6573Sopenharmony_ci^     the beginning of text (or start-of-line with multi-line mode)
349c67d6573Sopenharmony_ci$     the end of text (or end-of-line with multi-line mode)
350c67d6573Sopenharmony_ci\A    only the beginning of text (even with multi-line mode enabled)
351c67d6573Sopenharmony_ci\z    only the end of text (even with multi-line mode enabled)
352c67d6573Sopenharmony_ci\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
353c67d6573Sopenharmony_ci\B    not a Unicode word boundary
354c67d6573Sopenharmony_ci</pre>
355c67d6573Sopenharmony_ci
356c67d6573Sopenharmony_ciThe empty regex is valid and matches the empty string. For example, the empty
357c67d6573Sopenharmony_ciregex matches `abc` at positions `0`, `1`, `2` and `3`.
358c67d6573Sopenharmony_ci
359c67d6573Sopenharmony_ci## Grouping and flags
360c67d6573Sopenharmony_ci
361c67d6573Sopenharmony_ci<pre class="rust">
362c67d6573Sopenharmony_ci(exp)          numbered capture group (indexed by opening parenthesis)
363c67d6573Sopenharmony_ci(?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
364c67d6573Sopenharmony_ci(?:exp)        non-capturing group
365c67d6573Sopenharmony_ci(?flags)       set flags within current group
366c67d6573Sopenharmony_ci(?flags:exp)   set flags for exp (non-capturing)
367c67d6573Sopenharmony_ci</pre>
368c67d6573Sopenharmony_ci
369c67d6573Sopenharmony_ciFlags are each a single character. For example, `(?x)` sets the flag `x`
370c67d6573Sopenharmony_ciand `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
371c67d6573Sopenharmony_cithe same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
372c67d6573Sopenharmony_cithe `x` flag and clears the `y` flag.
373c67d6573Sopenharmony_ci
374c67d6573Sopenharmony_ciAll flags are by default disabled unless stated otherwise. They are:
375c67d6573Sopenharmony_ci
376c67d6573Sopenharmony_ci<pre class="rust">
377c67d6573Sopenharmony_cii     case-insensitive: letters match both upper and lower case
378c67d6573Sopenharmony_cim     multi-line mode: ^ and $ match begin/end of line
379c67d6573Sopenharmony_cis     allow . to match \n
380c67d6573Sopenharmony_ciU     swap the meaning of x* and x*?
381c67d6573Sopenharmony_ciu     Unicode support (enabled by default)
382c67d6573Sopenharmony_cix     ignore whitespace and allow line comments (starting with `#`)
383c67d6573Sopenharmony_ci</pre>
384c67d6573Sopenharmony_ci
385c67d6573Sopenharmony_ciFlags can be toggled within a pattern. Here's an example that matches
386c67d6573Sopenharmony_cicase-insensitively for the first part but case-sensitively for the second part:
387c67d6573Sopenharmony_ci
388c67d6573Sopenharmony_ci```rust
389c67d6573Sopenharmony_ci# use regex::Regex;
390c67d6573Sopenharmony_ci# fn main() {
391c67d6573Sopenharmony_cilet re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
392c67d6573Sopenharmony_cilet cap = re.captures("AaAaAbbBBBb").unwrap();
393c67d6573Sopenharmony_ciassert_eq!(&cap[0], "AaAaAbb");
394c67d6573Sopenharmony_ci# }
395c67d6573Sopenharmony_ci```
396c67d6573Sopenharmony_ci
397c67d6573Sopenharmony_ciNotice that the `a+` matches either `a` or `A`, but the `b+` only matches
398c67d6573Sopenharmony_ci`b`.
399c67d6573Sopenharmony_ci
400c67d6573Sopenharmony_ciMulti-line mode means `^` and `$` no longer match just at the beginning/end of
401c67d6573Sopenharmony_cithe input, but at the beginning/end of lines:
402c67d6573Sopenharmony_ci
403c67d6573Sopenharmony_ci```
404c67d6573Sopenharmony_ci# use regex::Regex;
405c67d6573Sopenharmony_cilet re = Regex::new(r"(?m)^line \d+").unwrap();
406c67d6573Sopenharmony_cilet m = re.find("line one\nline 2\n").unwrap();
407c67d6573Sopenharmony_ciassert_eq!(m.as_str(), "line 2");
408c67d6573Sopenharmony_ci```
409c67d6573Sopenharmony_ci
410c67d6573Sopenharmony_ciNote that `^` matches after new lines, even at the end of input:
411c67d6573Sopenharmony_ci
412c67d6573Sopenharmony_ci```
413c67d6573Sopenharmony_ci# use regex::Regex;
414c67d6573Sopenharmony_cilet re = Regex::new(r"(?m)^").unwrap();
415c67d6573Sopenharmony_cilet m = re.find_iter("test\n").last().unwrap();
416c67d6573Sopenharmony_ciassert_eq!((m.start(), m.end()), (5, 5));
417c67d6573Sopenharmony_ci```
418c67d6573Sopenharmony_ci
419c67d6573Sopenharmony_ciHere is an example that uses an ASCII word boundary instead of a Unicode
420c67d6573Sopenharmony_ciword boundary:
421c67d6573Sopenharmony_ci
422c67d6573Sopenharmony_ci```rust
423c67d6573Sopenharmony_ci# use regex::Regex;
424c67d6573Sopenharmony_ci# fn main() {
425c67d6573Sopenharmony_cilet re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
426c67d6573Sopenharmony_cilet cap = re.captures("$$abc$$").unwrap();
427c67d6573Sopenharmony_ciassert_eq!(&cap[0], "abc");
428c67d6573Sopenharmony_ci# }
429c67d6573Sopenharmony_ci```
430c67d6573Sopenharmony_ci
431c67d6573Sopenharmony_ci## Escape sequences
432c67d6573Sopenharmony_ci
433c67d6573Sopenharmony_ci<pre class="rust">
434c67d6573Sopenharmony_ci\*          literal *, works for any punctuation character: \.+*?()|[]{}^$
435c67d6573Sopenharmony_ci\a          bell (\x07)
436c67d6573Sopenharmony_ci\f          form feed (\x0C)
437c67d6573Sopenharmony_ci\t          horizontal tab
438c67d6573Sopenharmony_ci\n          new line
439c67d6573Sopenharmony_ci\r          carriage return
440c67d6573Sopenharmony_ci\v          vertical tab (\x0B)
441c67d6573Sopenharmony_ci\123        octal character code (up to three digits) (when enabled)
442c67d6573Sopenharmony_ci\x7F        hex character code (exactly two digits)
443c67d6573Sopenharmony_ci\x{10FFFF}  any hex character code corresponding to a Unicode code point
444c67d6573Sopenharmony_ci\u007F      hex character code (exactly four digits)
445c67d6573Sopenharmony_ci\u{7F}      any hex character code corresponding to a Unicode code point
446c67d6573Sopenharmony_ci\U0000007F  hex character code (exactly eight digits)
447c67d6573Sopenharmony_ci\U{7F}      any hex character code corresponding to a Unicode code point
448c67d6573Sopenharmony_ci</pre>
449c67d6573Sopenharmony_ci
450c67d6573Sopenharmony_ci## Perl character classes (Unicode friendly)
451c67d6573Sopenharmony_ci
452c67d6573Sopenharmony_ciThese classes are based on the definitions provided in
453c67d6573Sopenharmony_ci[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
454c67d6573Sopenharmony_ci
455c67d6573Sopenharmony_ci<pre class="rust">
456c67d6573Sopenharmony_ci\d     digit (\p{Nd})
457c67d6573Sopenharmony_ci\D     not digit
458c67d6573Sopenharmony_ci\s     whitespace (\p{White_Space})
459c67d6573Sopenharmony_ci\S     not whitespace
460c67d6573Sopenharmony_ci\w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
461c67d6573Sopenharmony_ci\W     not word character
462c67d6573Sopenharmony_ci</pre>
463c67d6573Sopenharmony_ci
464c67d6573Sopenharmony_ci## ASCII character classes
465c67d6573Sopenharmony_ci
466c67d6573Sopenharmony_ci<pre class="rust">
467c67d6573Sopenharmony_ci[[:alnum:]]    alphanumeric ([0-9A-Za-z])
468c67d6573Sopenharmony_ci[[:alpha:]]    alphabetic ([A-Za-z])
469c67d6573Sopenharmony_ci[[:ascii:]]    ASCII ([\x00-\x7F])
470c67d6573Sopenharmony_ci[[:blank:]]    blank ([\t ])
471c67d6573Sopenharmony_ci[[:cntrl:]]    control ([\x00-\x1F\x7F])
472c67d6573Sopenharmony_ci[[:digit:]]    digits ([0-9])
473c67d6573Sopenharmony_ci[[:graph:]]    graphical ([!-~])
474c67d6573Sopenharmony_ci[[:lower:]]    lower case ([a-z])
475c67d6573Sopenharmony_ci[[:print:]]    printable ([ -~])
476c67d6573Sopenharmony_ci[[:punct:]]    punctuation ([!-/:-@\[-`{-~])
477c67d6573Sopenharmony_ci[[:space:]]    whitespace ([\t\n\v\f\r ])
478c67d6573Sopenharmony_ci[[:upper:]]    upper case ([A-Z])
479c67d6573Sopenharmony_ci[[:word:]]     word characters ([0-9A-Za-z_])
480c67d6573Sopenharmony_ci[[:xdigit:]]   hex digit ([0-9A-Fa-f])
481c67d6573Sopenharmony_ci</pre>
482c67d6573Sopenharmony_ci
483c67d6573Sopenharmony_ci# Crate features
484c67d6573Sopenharmony_ci
485c67d6573Sopenharmony_ciBy default, this crate tries pretty hard to make regex matching both as fast
486c67d6573Sopenharmony_cias possible and as correct as it can be, within reason. This means that there
487c67d6573Sopenharmony_ciis a lot of code dedicated to performance, the handling of Unicode data and the
488c67d6573Sopenharmony_ciUnicode data itself. Overall, this leads to more dependencies, larger binaries
489c67d6573Sopenharmony_ciand longer compile times. This trade off may not be appropriate in all cases,
490c67d6573Sopenharmony_ciand indeed, even when all Unicode and performance features are disabled, one
491c67d6573Sopenharmony_ciis still left with a perfectly serviceable regex engine that will work well
492c67d6573Sopenharmony_ciin many cases.
493c67d6573Sopenharmony_ci
494c67d6573Sopenharmony_ciThis crate exposes a number of features for controlling that trade off. Some
495c67d6573Sopenharmony_ciof these features are strictly performance oriented, such that disabling them
496c67d6573Sopenharmony_ciwon't result in a loss of functionality, but may result in worse performance.
497c67d6573Sopenharmony_ciOther features, such as the ones controlling the presence or absence of Unicode
498c67d6573Sopenharmony_cidata, can result in a loss of functionality. For example, if one disables the
499c67d6573Sopenharmony_ci`unicode-case` feature (described below), then compiling the regex `(?i)a`
500c67d6573Sopenharmony_ciwill fail since Unicode case insensitivity is enabled by default. Instead,
501c67d6573Sopenharmony_cicallers must use `(?i-u)a` to disable Unicode case folding. Stated
502c67d6573Sopenharmony_cidifferently, enabling or disabling any of the features below can only add or
503c67d6573Sopenharmony_cisubtract from the total set of valid regular expressions. Enabling or disabling
504c67d6573Sopenharmony_cia feature will never modify the match semantics of a regular expression.
505c67d6573Sopenharmony_ci
506c67d6573Sopenharmony_ciAll features below are enabled by default.
507c67d6573Sopenharmony_ci
508c67d6573Sopenharmony_ci### Ecosystem features
509c67d6573Sopenharmony_ci
510c67d6573Sopenharmony_ci* **std** -
511c67d6573Sopenharmony_ci  When enabled, this will cause `regex` to use the standard library. Currently,
512c67d6573Sopenharmony_ci  disabling this feature will always result in a compilation error. It is
513c67d6573Sopenharmony_ci  intended to add `alloc`-only support to regex in the future.
514c67d6573Sopenharmony_ci
515c67d6573Sopenharmony_ci### Performance features
516c67d6573Sopenharmony_ci
517c67d6573Sopenharmony_ci* **perf** -
518c67d6573Sopenharmony_ci  Enables all performance related features. This feature is enabled by default
519c67d6573Sopenharmony_ci  and will always cover all features that improve performance, even if more
520c67d6573Sopenharmony_ci  are added in the future.
521c67d6573Sopenharmony_ci* **perf-dfa** -
522c67d6573Sopenharmony_ci  Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
523c67d6573Sopenharmony_ci  portions of a regex to a very fast DFA on an as-needed basis. This can
524c67d6573Sopenharmony_ci  result in substantial speedups, usually by an order of magnitude on large
525c67d6573Sopenharmony_ci  haystacks. The lazy DFA does not bring in any new dependencies, but it can
526c67d6573Sopenharmony_ci  make compile times longer.
527c67d6573Sopenharmony_ci* **perf-inline** -
528c67d6573Sopenharmony_ci  Enables the use of aggressive inlining inside match routines. This reduces
529c67d6573Sopenharmony_ci  the overhead of each match. The aggressive inlining, however, increases
530c67d6573Sopenharmony_ci  compile times and binary size.
531c67d6573Sopenharmony_ci* **perf-literal** -
532c67d6573Sopenharmony_ci  Enables the use of literal optimizations for speeding up matches. In some
533c67d6573Sopenharmony_ci  cases, literal optimizations can result in speedups of _several_ orders of
534c67d6573Sopenharmony_ci  magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies.
535c67d6573Sopenharmony_ci* **perf-cache** -
536c67d6573Sopenharmony_ci  This feature used to enable a faster internal cache at the cost of using
537c67d6573Sopenharmony_ci  additional dependencies, but this is no longer an option. A fast internal
538c67d6573Sopenharmony_ci  cache is now used unconditionally with no additional dependencies. This may
539c67d6573Sopenharmony_ci  change in the future.
540c67d6573Sopenharmony_ci
541c67d6573Sopenharmony_ci### Unicode features
542c67d6573Sopenharmony_ci
543c67d6573Sopenharmony_ci* **unicode** -
544c67d6573Sopenharmony_ci  Enables all Unicode features. This feature is enabled by default, and will
545c67d6573Sopenharmony_ci  always cover all Unicode features, even if more are added in the future.
546c67d6573Sopenharmony_ci* **unicode-age** -
547c67d6573Sopenharmony_ci  Provide the data for the
548c67d6573Sopenharmony_ci  [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
549c67d6573Sopenharmony_ci  This makes it possible to use classes like `\p{Age:6.0}` to refer to all
550c67d6573Sopenharmony_ci  codepoints first introduced in Unicode 6.0.
551c67d6573Sopenharmony_ci* **unicode-bool** -
552c67d6573Sopenharmony_ci  Provide the data for numerous Unicode boolean properties. The full list
553c67d6573Sopenharmony_ci  is not included here, but contains properties like `Alphabetic`, `Emoji`,
554c67d6573Sopenharmony_ci  `Lowercase`, `Math`, `Uppercase` and `White_Space`.
555c67d6573Sopenharmony_ci* **unicode-case** -
556c67d6573Sopenharmony_ci  Provide the data for case insensitive matching using
557c67d6573Sopenharmony_ci  [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
558c67d6573Sopenharmony_ci* **unicode-gencat** -
559c67d6573Sopenharmony_ci  Provide the data for
560c67d6573Sopenharmony_ci  [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
561c67d6573Sopenharmony_ci  This includes, but is not limited to, `Decimal_Number`, `Letter`,
562c67d6573Sopenharmony_ci  `Math_Symbol`, `Number` and `Punctuation`.
563c67d6573Sopenharmony_ci* **unicode-perl** -
564c67d6573Sopenharmony_ci  Provide the data for supporting the Unicode-aware Perl character classes,
565c67d6573Sopenharmony_ci  corresponding to `\w`, `\s` and `\d`. This is also necessary for using
566c67d6573Sopenharmony_ci  Unicode-aware word boundary assertions. Note that if this feature is
567c67d6573Sopenharmony_ci  disabled, the `\s` and `\d` character classes are still available if the
568c67d6573Sopenharmony_ci  `unicode-bool` and `unicode-gencat` features are enabled, respectively.
569c67d6573Sopenharmony_ci* **unicode-script** -
570c67d6573Sopenharmony_ci  Provide the data for
571c67d6573Sopenharmony_ci  [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
572c67d6573Sopenharmony_ci  This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
573c67d6573Sopenharmony_ci  `Latin` and `Thai`.
574c67d6573Sopenharmony_ci* **unicode-segment** -
575c67d6573Sopenharmony_ci  Provide the data for the properties used to implement the
576c67d6573Sopenharmony_ci  [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
577c67d6573Sopenharmony_ci  This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
578c67d6573Sopenharmony_ci  `\p{sb=ATerm}`.
579c67d6573Sopenharmony_ci
580c67d6573Sopenharmony_ci
581c67d6573Sopenharmony_ci# Untrusted input
582c67d6573Sopenharmony_ci
583c67d6573Sopenharmony_ciThis crate can handle both untrusted regular expressions and untrusted
584c67d6573Sopenharmony_cisearch text.
585c67d6573Sopenharmony_ci
586c67d6573Sopenharmony_ciUntrusted regular expressions are handled by capping the size of a compiled
587c67d6573Sopenharmony_ciregular expression.
588c67d6573Sopenharmony_ci(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).)
589c67d6573Sopenharmony_ciWithout this, it would be trivial for an attacker to exhaust your system's
590c67d6573Sopenharmony_cimemory with expressions like `a{100}{100}{100}`.
591c67d6573Sopenharmony_ci
592c67d6573Sopenharmony_ciUntrusted search text is allowed because the matching engine(s) in this
593c67d6573Sopenharmony_cicrate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search
594c67d6573Sopenharmony_citext`), which means there's no way to cause exponential blow-up like with
595c67d6573Sopenharmony_cisome other regular expression engines. (We pay for this by disallowing
596c67d6573Sopenharmony_cifeatures like arbitrary look-ahead and backreferences.)
597c67d6573Sopenharmony_ci
598c67d6573Sopenharmony_ciWhen a DFA is used, pathological cases with exponential state blow-up are
599c67d6573Sopenharmony_ciavoided by constructing the DFA lazily or in an "online" manner. Therefore,
600c67d6573Sopenharmony_ciat most one new state can be created for each byte of input. This satisfies
601c67d6573Sopenharmony_ciour time complexity guarantees, but can lead to memory growth
602c67d6573Sopenharmony_ciproportional to the size of the input. As a stopgap, the DFA is only
603c67d6573Sopenharmony_ciallowed to store a fixed number of states. When the limit is reached, its
604c67d6573Sopenharmony_cistates are wiped and matching continues, possibly duplicating previous work. If
605c67d6573Sopenharmony_cithe limit is reached too frequently, it gives up and hands control off to
606c67d6573Sopenharmony_cianother matching engine with fixed memory requirements.
607c67d6573Sopenharmony_ci(The DFA size limit can also be tweaked. See
608c67d6573Sopenharmony_ci[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).)
609c67d6573Sopenharmony_ci*/
610c67d6573Sopenharmony_ci
611c67d6573Sopenharmony_ci#![deny(missing_docs)]
612c67d6573Sopenharmony_ci#![cfg_attr(feature = "pattern", feature(pattern))]
613c67d6573Sopenharmony_ci#![warn(missing_debug_implementations)]
614c67d6573Sopenharmony_ci#![allow(clippy::if_same_then_else)]
615c67d6573Sopenharmony_ci#[cfg(not(feature = "std"))]
616c67d6573Sopenharmony_cicompile_error!("`std` feature is currently required to build this crate");
617c67d6573Sopenharmony_ci
618c67d6573Sopenharmony_ci// To check README's example
619c67d6573Sopenharmony_ci// TODO: Re-enable this once the MSRV is 1.43 or greater.
620c67d6573Sopenharmony_ci// See: https://github.com/rust-lang/regex/issues/684
621c67d6573Sopenharmony_ci// See: https://github.com/rust-lang/regex/issues/685
622c67d6573Sopenharmony_ci// #[cfg(doctest)]
623c67d6573Sopenharmony_ci// doc_comment::doctest!("../README.md");
624c67d6573Sopenharmony_ci
625c67d6573Sopenharmony_ci#[cfg(feature = "std")]
626c67d6573Sopenharmony_cipub use crate::error::Error;
627c67d6573Sopenharmony_ci#[cfg(feature = "std")]
628c67d6573Sopenharmony_cipub use crate::re_builder::set_unicode::*;
629c67d6573Sopenharmony_ci#[cfg(feature = "std")]
630c67d6573Sopenharmony_cipub use crate::re_builder::unicode::*;
631c67d6573Sopenharmony_ci#[cfg(feature = "std")]
632c67d6573Sopenharmony_cipub use crate::re_set::unicode::*;
633c67d6573Sopenharmony_ci#[cfg(feature = "std")]
634c67d6573Sopenharmony_cipub use crate::re_unicode::{
635c67d6573Sopenharmony_ci    escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
636c67d6573Sopenharmony_ci    Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
637c67d6573Sopenharmony_ci    SplitN, SubCaptureMatches,
638c67d6573Sopenharmony_ci};
639c67d6573Sopenharmony_ci
640c67d6573Sopenharmony_ci/**
641c67d6573Sopenharmony_ciMatch regular expressions on arbitrary bytes.
642c67d6573Sopenharmony_ci
643c67d6573Sopenharmony_ciThis module provides a nearly identical API to the one found in the
644c67d6573Sopenharmony_citop-level of this crate. There are two important differences:
645c67d6573Sopenharmony_ci
646c67d6573Sopenharmony_ci1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
647c67d6573Sopenharmony_ciis used where `String` would have been used.
648c67d6573Sopenharmony_ci2. Unicode support can be disabled even when disabling it would result in
649c67d6573Sopenharmony_cimatching invalid UTF-8 bytes.
650c67d6573Sopenharmony_ci
651c67d6573Sopenharmony_ci# Example: match null terminated string
652c67d6573Sopenharmony_ci
653c67d6573Sopenharmony_ciThis shows how to find all null-terminated strings in a slice of bytes:
654c67d6573Sopenharmony_ci
655c67d6573Sopenharmony_ci```rust
656c67d6573Sopenharmony_ci# use regex::bytes::Regex;
657c67d6573Sopenharmony_cilet re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
658c67d6573Sopenharmony_cilet text = b"foo\x00bar\x00baz\x00";
659c67d6573Sopenharmony_ci
660c67d6573Sopenharmony_ci// Extract all of the strings without the null terminator from each match.
661c67d6573Sopenharmony_ci// The unwrap is OK here since a match requires the `cstr` capture to match.
662c67d6573Sopenharmony_cilet cstrs: Vec<&[u8]> =
663c67d6573Sopenharmony_ci    re.captures_iter(text)
664c67d6573Sopenharmony_ci      .map(|c| c.name("cstr").unwrap().as_bytes())
665c67d6573Sopenharmony_ci      .collect();
666c67d6573Sopenharmony_ciassert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs);
667c67d6573Sopenharmony_ci```
668c67d6573Sopenharmony_ci
669c67d6573Sopenharmony_ci# Example: selectively enable Unicode support
670c67d6573Sopenharmony_ci
671c67d6573Sopenharmony_ciThis shows how to match an arbitrary byte pattern followed by a UTF-8 encoded
672c67d6573Sopenharmony_cistring (e.g., to extract a title from a Matroska file):
673c67d6573Sopenharmony_ci
674c67d6573Sopenharmony_ci```rust
675c67d6573Sopenharmony_ci# use std::str;
676c67d6573Sopenharmony_ci# use regex::bytes::Regex;
677c67d6573Sopenharmony_cilet re = Regex::new(
678c67d6573Sopenharmony_ci    r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
679c67d6573Sopenharmony_ci).unwrap();
680c67d6573Sopenharmony_cilet text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
681c67d6573Sopenharmony_cilet caps = re.captures(text).unwrap();
682c67d6573Sopenharmony_ci
683c67d6573Sopenharmony_ci// Notice that despite the `.*` at the end, it will only match valid UTF-8
684c67d6573Sopenharmony_ci// because Unicode mode was enabled with the `u` flag. Without the `u` flag,
685c67d6573Sopenharmony_ci// the `.*` would match the rest of the bytes.
686c67d6573Sopenharmony_cilet mat = caps.get(1).unwrap();
687c67d6573Sopenharmony_ciassert_eq!((7, 10), (mat.start(), mat.end()));
688c67d6573Sopenharmony_ci
689c67d6573Sopenharmony_ci// If there was a match, Unicode mode guarantees that `title` is valid UTF-8.
690c67d6573Sopenharmony_cilet title = str::from_utf8(&caps[1]).unwrap();
691c67d6573Sopenharmony_ciassert_eq!("☃", title);
692c67d6573Sopenharmony_ci```
693c67d6573Sopenharmony_ci
694c67d6573Sopenharmony_ciIn general, if the Unicode flag is enabled in a capture group and that capture
695c67d6573Sopenharmony_ciis part of the overall match, then the capture is *guaranteed* to be valid
696c67d6573Sopenharmony_ciUTF-8.
697c67d6573Sopenharmony_ci
698c67d6573Sopenharmony_ci# Syntax
699c67d6573Sopenharmony_ci
700c67d6573Sopenharmony_ciThe supported syntax is pretty much the same as the syntax for Unicode
701c67d6573Sopenharmony_ciregular expressions with a few changes that make sense for matching arbitrary
702c67d6573Sopenharmony_cibytes:
703c67d6573Sopenharmony_ci
704c67d6573Sopenharmony_ci1. The `u` flag can be disabled even when disabling it might cause the regex to
705c67d6573Sopenharmony_cimatch invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
706c67d6573Sopenharmony_ci"ASCII compatible" mode.
707c67d6573Sopenharmony_ci2. In ASCII compatible mode, neither Unicode scalar values nor Unicode
708c67d6573Sopenharmony_cicharacter classes are allowed.
709c67d6573Sopenharmony_ci3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
710c67d6573Sopenharmony_cirevert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
711c67d6573Sopenharmony_cito `[[:digit:]]` and `\s` maps to `[[:space:]]`.
712c67d6573Sopenharmony_ci4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to
713c67d6573Sopenharmony_cidetermine whether a byte is a word byte or not.
714c67d6573Sopenharmony_ci5. Hexadecimal notation can be used to specify arbitrary bytes instead of
715c67d6573Sopenharmony_ciUnicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
716c67d6573Sopenharmony_ciliteral byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
717c67d6573Sopenharmony_cimatches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
718c67d6573Sopenharmony_cienabled.
719c67d6573Sopenharmony_ci6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
720c67d6573Sopenharmony_ci`s` flag is additionally enabled, `.` matches any byte.
721c67d6573Sopenharmony_ci
722c67d6573Sopenharmony_ci# Performance
723c67d6573Sopenharmony_ci
724c67d6573Sopenharmony_ciIn general, one should expect performance on `&[u8]` to be roughly similar to
725c67d6573Sopenharmony_ciperformance on `&str`.
726c67d6573Sopenharmony_ci*/
727c67d6573Sopenharmony_ci#[cfg(feature = "std")] // compiled only when the `std` feature is on
728c67d6573Sopenharmony_cipub mod bytes { // public facade: glob re-exports from private modules
729c67d6573Sopenharmony_ci    pub use crate::re_builder::bytes::*;
730c67d6573Sopenharmony_ci    pub use crate::re_builder::set_bytes::*;
731c67d6573Sopenharmony_ci    pub use crate::re_bytes::*;
732c67d6573Sopenharmony_ci    pub use crate::re_set::bytes::*;
733c67d6573Sopenharmony_ci}
734c67d6573Sopenharmony_ci
735c67d6573Sopenharmony_cimod backtrack;
736c67d6573Sopenharmony_cimod compile;
737c67d6573Sopenharmony_ci#[cfg(feature = "perf-dfa")]
738c67d6573Sopenharmony_cimod dfa;
739c67d6573Sopenharmony_cimod error;
740c67d6573Sopenharmony_cimod exec;
741c67d6573Sopenharmony_cimod expand;
742c67d6573Sopenharmony_cimod find_byte;
743c67d6573Sopenharmony_cimod input;
744c67d6573Sopenharmony_cimod literal;
745c67d6573Sopenharmony_ci#[cfg(feature = "pattern")]
746c67d6573Sopenharmony_cimod pattern;
747c67d6573Sopenharmony_cimod pikevm;
748c67d6573Sopenharmony_cimod pool;
749c67d6573Sopenharmony_cimod prog;
750c67d6573Sopenharmony_cimod re_builder;
751c67d6573Sopenharmony_cimod re_bytes;
752c67d6573Sopenharmony_cimod re_set;
753c67d6573Sopenharmony_cimod re_trait;
754c67d6573Sopenharmony_cimod re_unicode;
755c67d6573Sopenharmony_cimod sparse;
756c67d6573Sopenharmony_cimod utf8;
757c67d6573Sopenharmony_ci
758c67d6573Sopenharmony_ci/// The `internal` module re-exports crate-private items for unusual uses,
759c67d6573Sopenharmony_ci/// such as testing different matching engines and supporting the
760c67d6573Sopenharmony_ci/// `regex-debug` CLI utility. `#[doc(hidden)]` keeps it out of rendered docs.
761c67d6573Sopenharmony_ci#[doc(hidden)]
762c67d6573Sopenharmony_ci#[cfg(feature = "std")]
763c67d6573Sopenharmony_cipub mod internal {
764c67d6573Sopenharmony_ci    pub use crate::compile::Compiler;
765c67d6573Sopenharmony_ci    pub use crate::exec::{Exec, ExecBuilder};
766c67d6573Sopenharmony_ci    pub use crate::input::{Char, CharInput, Input, InputAt};
767c67d6573Sopenharmony_ci    pub use crate::literal::LiteralSearcher;
768c67d6573Sopenharmony_ci    pub use crate::prog::{EmptyLook, Inst, InstRanges, Program};
769c67d6573Sopenharmony_ci}
770