1/*
2 * This example code shows how to iterate over all regex matches in a file,
3 * emit the match location and print the contents of a capturing group.
4 */
5
6#include <fcntl.h>
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10#include <sys/mman.h>
11#include <sys/stat.h>
12#include <sys/types.h>
13#include <unistd.h>
14
15#include "rure.h"
16
17int main() {
18    /* Open a file and mmap it. */
19    int fd = open("sherlock.txt", O_RDONLY);
20    if (fd == -1) {
21        perror("failed to open sherlock.txt");
22        exit(1);
23    }
24    struct stat status;
25    if (fstat(fd, &status) == -1) {
26        perror("failed to stat sherlock.txt");
27        exit(1);
28    }
29    if ((uintmax_t)status.st_size > SIZE_MAX) {
30        perror("file too big");
31        exit(1);
32    }
33    if (status.st_size == 0) {
34        perror("file empty");
35        exit(1);
36    }
37    size_t sherlock_len = (size_t)status.st_size;
38    const uint8_t *sherlock = (const uint8_t *)mmap(
39        NULL, status.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
40    close(fd);
41    if (sherlock == MAP_FAILED) {
42        perror("could not mmap file");
43        exit(1);
44    }
45
46    /*
47     * Compile the regular expression. A more convenient routine,
48     * rure_compile_must, is also available, which will abort the process if
49     * and print an error message to stderr if the regex compilation fails.
50     * We show the full gory details here as an example.
51     */
52    const char *pattern = "(\\w+)\\s+Holmes";
53    size_t pattern_len = strlen(pattern);
54    rure_error *err = rure_error_new();
55    rure *re = rure_compile((const uint8_t *)pattern, pattern_len,
56                            RURE_FLAG_UNICODE | RURE_FLAG_CASEI, NULL, err);
57    if (NULL == re) {
58        /* A null regex means compilation failed and an error exists. */
59        printf("compilation of %s failed: %s\n",
60               pattern, rure_error_message(err));
61        rure_error_free(err);
62        munmap((char*)sherlock, sherlock_len);
63        exit(1);
64    }
65    rure_error_free(err);
66
67    /*
68     * Create an iterator to find all successive non-overlapping matches.
69     * For each match, we extract the location of the capturing group.
70     */
71    rure_match group0 = {0};
72    rure_match group1 = {0};
73    rure_captures *caps = rure_captures_new(re);
74    rure_iter *it = rure_iter_new(re);
75
76    while (rure_iter_next_captures(it, sherlock, sherlock_len, caps)) {
77        /*
78         * Get the location of the full match and the capturing group.
79         * We know that both accesses are successful since the body of the
80         * loop only executes if there is a match and both capture groups
81         * must match in order for the entire regex to match.
82         *
83         * N.B. The zeroth group corresponds to the full match of the regex.
84         */
85        rure_captures_at(caps, 0, &group0);
86        rure_captures_at(caps, 1, &group1);
87        printf("%.*s (match at: %zu, %zu)\n",
88               (int)(group1.end - group1.start),
89               sherlock + group1.start,
90               group0.start, group0.end);
91    }
92
93    /* Free all our resources. */
94    munmap((char*)sherlock, sherlock_len);
95    rure_captures_free(caps);
96    rure_iter_free(it);
97    rure_free(re);
98    return 0;
99}
100