1#ifndef _RURE_H
2#define _RURE_H
3
4#include <stdbool.h>
5#include <stdint.h>
6#include <stdlib.h>
7
8#ifdef __cplusplus
9extern "C" {
10#endif
11
12/*
13 * rure is the type of a compiled regular expression.
14 *
15 * An rure can be safely used from multiple threads simultaneously.
16 */
17typedef struct rure rure;
18
19/*
20 * rure_set is the type of a set of compiled regular expressions.
21 *
22 * A rure can be safely used from multiple threads simultaneously.
23 */
24typedef struct rure_set rure_set;
25
26/*
27 * rure_options is the set of non-flag configuration options for compiling
28 * a regular expression. Currently, only two options are available: setting
29 * the size limit of the compiled program and setting the size limit of the
30 * cache of states that the DFA uses while searching.
31 *
32 * For most uses, the default settings will work fine, and NULL can be passed
33 * wherever a *rure_options is expected.
34*/
35typedef struct rure_options rure_options;
36
37/*
38 * The flags listed below can be used in rure_compile to set the default
39 * flags. All flags can otherwise be toggled in the expression itself using
40 * standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)`
41 * disables it.
42 */
43/* The case insensitive (i) flag. */
44#define RURE_FLAG_CASEI (1 << 0)
45/* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */
46#define RURE_FLAG_MULTI (1 << 1)
47/* The any character (s) flag. (. matches new line.) */
48#define RURE_FLAG_DOTNL (1 << 2)
49/* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */
50#define RURE_FLAG_SWAP_GREED (1 << 3)
51/* The ignore whitespace (x) flag. */
52#define RURE_FLAG_SPACE (1 << 4)
53/* The Unicode (u) flag. */
54#define RURE_FLAG_UNICODE (1 << 5)
55/* The default set of flags enabled when no flags are set. */
56#define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE
57
58/*
59 * rure_match corresponds to the location of a single match in a haystack.
60 */
61typedef struct rure_match {
62    /* The start position. */
63    size_t start;
64    /* The end position. */
65    size_t end;
66} rure_match;
67
68/*
69 * rure_captures represents storage for sub-capture locations of a match.
70 *
71 * Computing the capture groups of a match can carry a significant performance
72 * penalty, so their use in the API is optional.
73 *
74 * An rure_captures value can be reused in multiple calls to rure_find_captures,
75 * so long as it is used with the compiled regular expression that created
76 * it.
77 *
78 * An rure_captures value may outlive its corresponding rure and can be freed
79 * independently.
80 *
81 * It is not safe to use from multiple threads simultaneously.
82 */
83typedef struct rure_captures rure_captures;
84
85/*
86 * rure_iter is an iterator over successive non-overlapping matches in a
87 * particular haystack.
88 *
89 * An rure_iter value may not outlive its corresponding rure and should be freed
90 * before its corresponding rure is freed.
91 *
92 * It is not safe to use from multiple threads simultaneously.
93 */
94typedef struct rure_iter rure_iter;
95
96/*
97 * rure_iter_capture_names is an iterator over the list of capture group names
98 * in this particular rure.
99 *
100 * An rure_iter_capture_names value may not outlive its corresponding rure,
101 * and should be freed before its corresponding rure is freed.
102 *
103 * It is not safe to use from multiple threads simultaneously.
104 */
105typedef struct rure_iter_capture_names rure_iter_capture_names;
106
107/*
108 * rure_error is an error that caused compilation to fail.
109 *
110 * Most errors are syntax errors but an error can be returned if the compiled
111 * regular expression would be too big.
112 *
113 * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But
114 * you will not get access to the error if one occurred.)
115 *
116 * It is not safe to use from multiple threads simultaneously.
117 */
118typedef struct rure_error rure_error;
119
120/*
121 * rure_compile_must compiles the given pattern into a regular expression. If
122 * compilation fails for any reason, an error message is printed to stderr and
123 * the process is aborted.
124 *
125 * The pattern given should be in UTF-8. For convenience, this accepts a C
126 * string, which means the pattern cannot usefully contain NUL. If your pattern
127 * may contain NUL, consider using a regular expression escape sequence, or
128 * just use rure_compile.
129 *
130 * This uses RURE_DEFAULT_FLAGS.
131 *
132 * The compiled expression returned may be used from multiple threads
133 * simultaneously.
134 */
135rure *rure_compile_must(const char *pattern);
136
137/*
138 * rure_compile compiles the given pattern into a regular expression. The
139 * pattern must be valid UTF-8 and the length corresponds to the number of
140 * bytes in the pattern.
141 *
142 * flags is a bitfield. Valid values are constants declared with prefix
143 * RURE_FLAG_.
144 *
145 * options contains non-flag configuration settings. If it's NULL, default
146 * settings are used. options may be freed immediately after a call to
147 * rure_compile.
148 *
149 * error is set if there was a problem compiling the pattern (including if the
150 * pattern is not valid UTF-8). If error is NULL, then no error information
151 * is returned. In all cases, if an error occurs, NULL is returned.
152 *
153 * The compiled expression returned may be used from multiple threads
154 * simultaneously.
155 */
156rure *rure_compile(const uint8_t *pattern, size_t length,
157                   uint32_t flags, rure_options *options,
158                   rure_error *error);
159
160/*
161 * rure_free frees the given compiled regular expression.
162 *
163 * This must be called at most once for any rure.
164 */
165void rure_free(rure *re);
166
167/*
168 * rure_is_match returns true if and only if re matches anywhere in haystack.
169 *
170 * haystack may contain arbitrary bytes, but ASCII compatible text is more
171 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
172 * length should be the number of bytes in haystack.
173 *
174 * start is the position at which to start searching. Note that setting the
175 * start position is distinct from incrementing the pointer, since the regex
176 * engine may look at bytes before the start position to determine match
177 * information. For example, if the start position is greater than 0, then the
178 * \A ("begin text") anchor can never match.
179 *
180 * rure_is_match should be preferred to rure_find since it may be faster.
181 *
182 * N.B. The performance of this search is not impacted by the presence of
183 * capturing groups in your regular expression.
184 */
185bool rure_is_match(rure *re, const uint8_t *haystack, size_t length,
186                   size_t start);
187
188/*
189 * rure_find returns true if and only if re matches anywhere in haystack.
190 * If a match is found, then its start and end offsets (in bytes) are set
191 * on the match pointer given.
192 *
193 * haystack may contain arbitrary bytes, but ASCII compatible text is more
194 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
195 * length should be the number of bytes in haystack.
196 *
197 * start is the position at which to start searching. Note that setting the
198 * start position is distinct from incrementing the pointer, since the regex
199 * engine may look at bytes before the start position to determine match
200 * information. For example, if the start position is greater than 0, then the
201 * \A ("begin text") anchor can never match.
202 *
203 * rure_find should be preferred to rure_find_captures since it may be faster.
204 *
205 * N.B. The performance of this search is not impacted by the presence of
206 * capturing groups in your regular expression.
207 */
208bool rure_find(rure *re, const uint8_t *haystack, size_t length,
209               size_t start, rure_match *match);
210
211/*
212 * rure_find_captures returns true if and only if re matches anywhere in
213 * haystack. If a match is found, then all of its capture locations are stored
214 * in the captures pointer given.
215 *
216 * haystack may contain arbitrary bytes, but ASCII compatible text is more
217 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
218 * length should be the number of bytes in haystack.
219 *
220 * start is the position at which to start searching. Note that setting the
221 * start position is distinct from incrementing the pointer, since the regex
222 * engine may look at bytes before the start position to determine match
223 * information. For example, if the start position is greater than 0, then the
224 * \A ("begin text") anchor can never match.
225 *
226 * Only use this function if you specifically need access to capture locations.
227 * It is not necessary to use this function just because your regular
228 * expression contains capturing groups.
229 *
230 * Capture locations can be accessed using the rure_captures_* functions.
231 *
232 * N.B. The performance of this search can be impacted by the number of
233 * capturing groups. If you're using this function, it may be beneficial to
234 * use non-capturing groups (e.g., `(?:re)`) where possible.
235 */
236bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length,
237                        size_t start, rure_captures *captures);
238
239/*
240 * rure_shortest_match returns true if and only if re matches anywhere in
241 * haystack. If a match is found, then its end location is stored in the
242 * pointer given. The end location is the place at which the regex engine
243 * determined that a match exists, but may occur before the end of the proper
244 * leftmost-first match.
245 *
246 * haystack may contain arbitrary bytes, but ASCII compatible text is more
247 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
248 * length should be the number of bytes in haystack.
249 *
250 * start is the position at which to start searching. Note that setting the
251 * start position is distinct from incrementing the pointer, since the regex
252 * engine may look at bytes before the start position to determine match
253 * information. For example, if the start position is greater than 0, then the
254 * \A ("begin text") anchor can never match.
255 *
256 * rure_shortest_match should be preferred to rure_find since it may be faster.
257 *
258 * N.B. The performance of this search is not impacted by the presence of
259 * capturing groups in your regular expression.
260 */
261bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length,
262                         size_t start, size_t *end);
263
264/*
265 * rure_capture_name_index returns the capture index for the name given. If
266 * no such named capturing group exists in re, then -1 is returned.
267 *
268 * The capture index may be used with rure_captures_at.
269 *
270 * This function never returns 0 since the first capture group always
271 * corresponds to the entire match and is always unnamed.
272 */
273int32_t rure_capture_name_index(rure *re, const char *name);
274
275/*
276 * rure_iter_capture_names_new creates a new capture_names iterator.
277 *
278 * An iterator will report all successive capture group names of re.
279 */
280rure_iter_capture_names *rure_iter_capture_names_new(rure *re);
281
282/*
283 * rure_iter_capture_names_free frees the iterator given.
284 *
285 * It must be called at most once.
286 */
287void rure_iter_capture_names_free(rure_iter_capture_names *it);
288
289/*
290 * rure_iter_capture_names_next advances the iterator and returns true
291 * if and only if another capture group name exists.
292 *
293 * The value of the capture group name is written to the provided pointer.
294 */
295bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
296
297/*
298 * rure_iter_new creates a new iterator.
299 *
300 * An iterator will report all successive non-overlapping matches of re.
301 * When calling iterator functions, the same haystack and length must be
302 * supplied to all invocations. (Strict pointer equality is, however, not
303 * required.)
304 */
305rure_iter *rure_iter_new(rure *re);
306
307/*
308 * rure_iter_free frees the iterator given.
309 *
310 * It must be called at most once.
311 */
312void rure_iter_free(rure_iter *it);
313
314/*
315 * rure_iter_next advances the iterator and returns true if and only if a
316 * match was found. If a match is found, then the match pointer is set with the
317 * start and end location of the match, in bytes.
318 *
319 * If no match is found, then subsequent calls will return false indefinitely.
320 *
321 * haystack may contain arbitrary bytes, but ASCII compatible text is more
322 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
323 * length should be the number of bytes in haystack. The given haystack must
324 * be logically equivalent to all other haystacks given to this iterator.
325 *
326 * rure_iter_next should be preferred to rure_iter_next_captures since it may
327 * be faster.
328 *
329 * N.B. The performance of this search is not impacted by the presence of
330 * capturing groups in your regular expression.
331 */
332bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length,
333                    rure_match *match);
334
335/*
336 * rure_iter_next_captures advances the iterator and returns true if and only if a
337 * match was found. If a match is found, then all of its capture locations are
338 * stored in the captures pointer given.
339 *
340 * If no match is found, then subsequent calls will return false indefinitely.
341 *
342 * haystack may contain arbitrary bytes, but ASCII compatible text is more
343 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
344 * length should be the number of bytes in haystack. The given haystack must
345 * be logically equivalent to all other haystacks given to this iterator.
346 *
347 * Only use this function if you specifically need access to capture locations.
348 * It is not necessary to use this function just because your regular
349 * expression contains capturing groups.
350 *
351 * Capture locations can be accessed using the rure_captures_* functions.
352 *
353 * N.B. The performance of this search can be impacted by the number of
354 * capturing groups. If you're using this function, it may be beneficial to
355 * use non-capturing groups (e.g., `(?:re)`) where possible.
356 */
357bool rure_iter_next_captures(rure_iter *it,
358                             const uint8_t *haystack, size_t length,
359                             rure_captures *captures);
360
361/*
362 * rure_captures_new allocates storage for all capturing groups in re.
363 *
364 * An rure_captures value may be reused on subsequent calls to
365 * rure_find_captures or rure_iter_next_captures.
366 *
367 * An rure_captures value may be freed independently of re, although any
368 * particular rure_captures should be used only with the re given here.
369 *
370 * It is not safe to use an rure_captures value from multiple threads
371 * simultaneously.
372 */
373rure_captures *rure_captures_new(rure *re);
374
375/*
376 * rure_captures_free frees the given captures.
377 *
378 * This must be called at most once.
379 */
380void rure_captures_free(rure_captures *captures);
381
382/*
383 * rure_captures_at returns true if and only if the capturing group at the
384 * index given was part of a match. If so, the given match pointer is populated
385 * with the start and end location (in bytes) of the capturing group.
386 *
387 * If no capture group with the index i exists, then false is
388 * returned. (A capturing group exists if and only if i is less than
389 * rure_captures_len(captures).)
390 *
391 * Note that index 0 corresponds to the full match.
392 */
393bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match);
394
395/*
396 * rure_captures_len returns the number of capturing groups in the given
397 * captures.
398 */
399size_t rure_captures_len(rure_captures *captures);
400
401/*
402 * rure_options_new allocates space for options.
403 *
404 * Options may be freed immediately after a call to rure_compile, but otherwise
405 * may be freely used in multiple calls to rure_compile.
406 *
407 * It is not safe to set options from multiple threads simultaneously. It is
408 * safe to call rure_compile from multiple threads simultaneously using the
409 * same options pointer.
410 */
411rure_options *rure_options_new();
412
413/*
414 * rure_options_free frees the given options.
415 *
416 * This must be called at most once.
417 */
418void rure_options_free(rure_options *options);
419
420/*
421 * rure_options_size_limit sets the appoximate size limit of the compiled
422 * regular expression.
423 *
424 * This size limit roughly corresponds to the number of bytes occupied by a
425 * single compiled program. If the program would exceed this number, then a
426 * compilation error will be returned from rure_compile.
427 */
428void rure_options_size_limit(rure_options *options, size_t limit);
429
430/*
431 * rure_options_dfa_size_limit sets the approximate size of the cache used by
432 * the DFA during search.
433 *
434 * This roughly corresponds to the number of bytes that the DFA will use while
435 * searching.
436 *
437 * Note that this is a *per thread* limit. There is no way to set a global
438 * limit. In particular, if a regular expression is used from multiple threads
439 * simultaneously, then each thread may use up to the number of bytes
440 * specified here.
441 */
442void rure_options_dfa_size_limit(rure_options *options, size_t limit);
443
444/*
445 * rure_compile_set compiles the given list of patterns into a single regular
446 * expression which can be matched in a linear-scan. Each pattern in patterns
447 * must be valid UTF-8 and the length of each pattern in patterns corresponds
448 * to a byte length in patterns_lengths.
449 *
450 * The number of patterns to compile is specified by patterns_count. patterns
451 * must contain at least this many entries.
452 *
453 * flags is a bitfield. Valid values are constants declared with prefix
454 * RURE_FLAG_.
455 *
456 * options contains non-flag configuration settings. If it's NULL, default
457 * settings are used. options may be freed immediately after a call to
458 * rure_compile.
459 *
460 * error is set if there was a problem compiling the pattern.
461 *
462 * The compiled expression set returned may be used from multiple threads.
463 */
464rure_set *rure_compile_set(const uint8_t **patterns,
465                           const size_t *patterns_lengths,
466                           size_t patterns_count,
467                           uint32_t flags,
468                           rure_options *options,
469                           rure_error *error);
470
471/*
472 * rure_set_free frees the given compiled regular expression set.
473 *
474 * This must be called at most once for any rure_set.
475 */
476void rure_set_free(rure_set *re);
477
478/*
479 * rure_is_match returns true if and only if any regexes within the set
480 * match anywhere in the haystack. Once a match has been located, the
481 * matching engine will quit immediately.
482 *
483 * haystack may contain arbitrary bytes, but ASCII compatible text is more
484 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
485 * length should be the number of bytes in haystack.
486 *
487 * start is the position at which to start searching. Note that setting the
488 * start position is distinct from incrementing the pointer, since the regex
489 * engine may look at bytes before the start position to determine match
490 * information. For example, if the start position is greater than 0, then the
491 * \A ("begin text") anchor can never match.
492 */
493bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
494                       size_t start);
495
496/*
497 * rure_set_matches compares each regex in the set against the haystack and
498 * modifies matches with the match result of each pattern. Match results are
499 * ordered in the same way as the rure_set was compiled. For example,
500 * index 0 of matches corresponds to the first pattern passed to
501 * `rure_compile_set`.
502 *
503 * haystack may contain arbitrary bytes, but ASCII compatible text is more
504 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
505 * length should be the number of bytes in haystack.
506 *
507 * start is the position at which to start searching. Note that setting the
508 * start position is distinct from incrementing the pointer, since the regex
509 * engine may look at bytes before the start position to determine match
510 * information. For example, if the start position is greater than 0, then the
511 * \A ("begin text") anchor can never match.
512 *
513 * matches must be greater than or equal to the number of patterns the
514 * rure_set was compiled with.
515 *
516 * Only use this function if you specifically need to know which regexes
517 * matched within the set. To determine if any of the regexes matched without
518 * caring which, use rure_set_is_match.
519 */
520bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
521                      size_t start, bool *matches);
522
523/*
524 * rure_set_len returns the number of patterns rure_set was compiled with.
525 */
526size_t rure_set_len(rure_set *re);
527
528/*
529 * rure_error_new allocates space for an error.
530 *
531 * If error information is desired, then rure_error_new should be called
532 * to create an rure_error pointer, and that pointer can be passed to
533 * rure_compile. If an error occurred, then rure_compile will return NULL and
534 * the error pointer will be set. A message can then be extracted.
535 *
536 * It is not safe to use errors from multiple threads simultaneously. An error
537 * value may be reused on subsequent calls to rure_compile.
538 */
539rure_error *rure_error_new();
540
541/*
542 * rure_error_free frees the error given.
543 *
544 * This must be called at most once.
545 */
546void rure_error_free(rure_error *err);
547
548/*
549 * rure_error_message returns a NUL terminated string that describes the error
550 * message.
551 *
552 * The pointer returned must not be freed. Instead, it will be freed when
553 * rure_error_free is called. If err is used in subsequent calls to
554 * rure_compile, then this pointer may change or become invalid.
555 */
556const char *rure_error_message(rure_error *err);
557
558/*
559 * rure_escape_must returns a NUL terminated string where all meta characters
560 * have been escaped. If escaping fails for any reason, an error message is
561 * printed to stderr and the process is aborted.
562 *
563 * The pattern given should be in UTF-8. For convenience, this accepts a C
564 * string, which means the pattern cannot contain a NUL byte. These correspond
565 * to the only two failure conditions of this function. That is, if the caller
566 * guarantees that the given pattern is valid UTF-8 and does not contain a
567 * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors).
568 *
569 * The pointer returned must not be freed directly. Instead, it should be freed
570 * by calling rure_cstring_free.
571 */
572const char *rure_escape_must(const char *pattern);
573
574/*
575 * rure_cstring_free frees the string given.
576 *
577 * This must be called at most once per string.
578 */
579void rure_cstring_free(char *s);
580
581#ifdef __cplusplus
582}
583#endif
584
585#endif
586