1e18e3516Sopenharmony_ci<html>
2e18e3516Sopenharmony_ci<head>
3e18e3516Sopenharmony_ci<title>pcre2demo specification</title>
4e18e3516Sopenharmony_ci</head>
5e18e3516Sopenharmony_ci<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
6e18e3516Sopenharmony_ci<h1>pcre2demo man page</h1>
7e18e3516Sopenharmony_ci<p>
8e18e3516Sopenharmony_ciReturn to the <a href="index.html">PCRE2 index page</a>.
9e18e3516Sopenharmony_ci</p>
10e18e3516Sopenharmony_ci<p>
11e18e3516Sopenharmony_ciThis page is part of the PCRE2 HTML documentation. It was generated
12e18e3516Sopenharmony_ciautomatically from the original man page. If there is any nonsense in it,
13e18e3516Sopenharmony_ciplease consult the man page, in case the conversion went wrong.
14e18e3516Sopenharmony_ci<br>
15e18e3516Sopenharmony_ci<ul>
16e18e3516Sopenharmony_ci</ul>
17e18e3516Sopenharmony_ci<PRE>
18e18e3516Sopenharmony_ci/*************************************************
19e18e3516Sopenharmony_ci*           PCRE2 DEMONSTRATION PROGRAM          *
20e18e3516Sopenharmony_ci*************************************************/
21e18e3516Sopenharmony_ci
22e18e3516Sopenharmony_ci/* This is a demonstration program to illustrate a straightforward way of
23e18e3516Sopenharmony_ciusing the PCRE2 regular expression library from a C program. See the
24e18e3516Sopenharmony_cipcre2sample documentation for a short discussion ("man pcre2sample" if you have
25e18e3516Sopenharmony_cithe PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
26e18e3516Sopenharmony_ciincompatible with the original PCRE API.
27e18e3516Sopenharmony_ci
28e18e3516Sopenharmony_ciThere are actually three libraries, each supporting a different code unit
29e18e3516Sopenharmony_ciwidth. This demonstration program uses the 8-bit library. The default is to
30e18e3516Sopenharmony_ciprocess each code unit as a separate character, but if the pattern begins with
31e18e3516Sopenharmony_ci"(*UTF)", both it and the subject are treated as UTF-8 strings, where
32e18e3516Sopenharmony_cicharacters may occupy multiple code units.
33e18e3516Sopenharmony_ci
34e18e3516Sopenharmony_ciIn Unix-like environments, if PCRE2 is installed in your standard system
35e18e3516Sopenharmony_cilibraries, you should be able to compile this program using this command:
36e18e3516Sopenharmony_ci
37e18e3516Sopenharmony_cicc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
38e18e3516Sopenharmony_ci
39e18e3516Sopenharmony_ciIf PCRE2 is not installed in a standard place, it is likely to be installed
40e18e3516Sopenharmony_ciwith support for the pkg-config mechanism. If you have pkg-config, you can
41e18e3516Sopenharmony_cicompile this program using this command:
42e18e3516Sopenharmony_ci
43e18e3516Sopenharmony_cicc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
44e18e3516Sopenharmony_ci
45e18e3516Sopenharmony_ciIf you do not have pkg-config, you may have to use something like this:
46e18e3516Sopenharmony_ci
47e18e3516Sopenharmony_cicc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
48e18e3516Sopenharmony_ci  -R/usr/local/lib -lpcre2-8 -o pcre2demo
49e18e3516Sopenharmony_ci
50e18e3516Sopenharmony_ciReplace "/usr/local/include" and "/usr/local/lib" with wherever the include and
51e18e3516Sopenharmony_cilibrary files for PCRE2 are installed on your system. Only some operating
52e18e3516Sopenharmony_cisystems (Solaris is one) use the -R option.
53e18e3516Sopenharmony_ci
54e18e3516Sopenharmony_ciBuilding under Windows:
55e18e3516Sopenharmony_ci
56e18e3516Sopenharmony_ciIf you want to statically link this program against a non-dll .a file, you must
57e18e3516Sopenharmony_cidefine PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
58e18e3516Sopenharmony_cithe following line. */
59e18e3516Sopenharmony_ci
60e18e3516Sopenharmony_ci/* #define PCRE2_STATIC */
61e18e3516Sopenharmony_ci
62e18e3516Sopenharmony_ci/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
63e18e3516Sopenharmony_ciFor a program that uses only one code unit width, setting it to 8, 16, or 32
64e18e3516Sopenharmony_cimakes it possible to use generic function names such as pcre2_compile(). Note
65e18e3516Sopenharmony_cithat just changing 8 to 16 (for example) is not sufficient to convert this
66e18e3516Sopenharmony_ciprogram to process 16-bit characters. Even in a fully 16-bit environment, where
67e18e3516Sopenharmony_cistring-handling functions such as strcmp() and printf() work with 16-bit
68e18e3516Sopenharmony_cicharacters, the code for handling the table of named substrings will still need
69e18e3516Sopenharmony_cito be modified. */
70e18e3516Sopenharmony_ci
71e18e3516Sopenharmony_ci#define PCRE2_CODE_UNIT_WIDTH 8
72e18e3516Sopenharmony_ci
73e18e3516Sopenharmony_ci#include &lt;stdio.h&gt;
74e18e3516Sopenharmony_ci#include &lt;string.h&gt;
75e18e3516Sopenharmony_ci#include &lt;pcre2.h&gt;
76e18e3516Sopenharmony_ci
77e18e3516Sopenharmony_ci
78e18e3516Sopenharmony_ci/**************************************************************************
79e18e3516Sopenharmony_ci* Here is the program. The API includes the concept of "contexts" for     *
80e18e3516Sopenharmony_ci* setting up unusual interface requirements for compiling and matching,   *
81e18e3516Sopenharmony_ci* such as custom memory managers and non-standard newline definitions.    *
82e18e3516Sopenharmony_ci* This program does not do any of this, so it makes no use of contexts,   *
83e18e3516Sopenharmony_ci* always passing NULL where a context could be given.                     *
84e18e3516Sopenharmony_ci**************************************************************************/
85e18e3516Sopenharmony_ci
86e18e3516Sopenharmony_ciint main(int argc, char **argv)
87e18e3516Sopenharmony_ci{  /* Returns 0 after successful matching; 1 on a usage error or any failure. */
88e18e3516Sopenharmony_cipcre2_code *re;         /* The compiled pattern. */
89e18e3516Sopenharmony_ciPCRE2_SPTR pattern;     /* PCRE2_SPTR is a pointer to unsigned code units of */
90e18e3516Sopenharmony_ciPCRE2_SPTR subject;     /* the appropriate width (in this case, 8 bits). */
91e18e3516Sopenharmony_ciPCRE2_SPTR name_table;  /* Table translating group names to numbers. */
92e18e3516Sopenharmony_ci
93e18e3516Sopenharmony_ciint crlf_is_newline;    /* Non-zero if CRLF is a valid newline sequence. */
94e18e3516Sopenharmony_ciint errornumber;        /* Error code set by pcre2_compile(). */
95e18e3516Sopenharmony_ciint find_all;           /* Non-zero if the -g option was given. */
96e18e3516Sopenharmony_ciint i;                  /* General-purpose index. */
97e18e3516Sopenharmony_ciint rc;                 /* Return code from pcre2_match(). */
98e18e3516Sopenharmony_ciint utf8;               /* Non-zero if the pattern is in UTF mode. */
99e18e3516Sopenharmony_ci
100e18e3516Sopenharmony_ciuint32_t option_bits;   /* Options with which the pattern was compiled. */
101e18e3516Sopenharmony_ciuint32_t namecount;     /* Number of named substrings in the pattern. */
102e18e3516Sopenharmony_ciuint32_t name_entry_size; /* Size of each entry in the name table. */
103e18e3516Sopenharmony_ciuint32_t newline;       /* Newline convention of the compiled pattern. */
104e18e3516Sopenharmony_ci
105e18e3516Sopenharmony_ciPCRE2_SIZE erroroffset; /* Pattern offset at which compilation failed. */
106e18e3516Sopenharmony_ciPCRE2_SIZE *ovector;    /* Output vector holding the match offsets. */
107e18e3516Sopenharmony_ciPCRE2_SIZE subject_length; /* Length of the subject string. */
108e18e3516Sopenharmony_ci
109e18e3516Sopenharmony_cipcre2_match_data *match_data; /* Block for storing the match result. */
110e18e3516Sopenharmony_ci
111e18e3516Sopenharmony_ci
112e18e3516Sopenharmony_ci/**************************************************************************
113e18e3516Sopenharmony_ci* First, sort out the command line. There is only one possible option at  *
114e18e3516Sopenharmony_ci* the moment, "-g" to request repeated matching to find all occurrences,  *
115e18e3516Sopenharmony_ci* like Perl's /g option. We set the variable find_all to a non-zero value *
116e18e3516Sopenharmony_ci* if the -g option is present.                                            *
117e18e3516Sopenharmony_ci**************************************************************************/
118e18e3516Sopenharmony_ci
119e18e3516Sopenharmony_cifind_all = 0;
120e18e3516Sopenharmony_cifor (i = 1; i &lt; argc; i++)
121e18e3516Sopenharmony_ci  {
122e18e3516Sopenharmony_ci  if (strcmp(argv[i], "-g") == 0) find_all = 1;
123e18e3516Sopenharmony_ci  else if (argv[i][0] == '-')
124e18e3516Sopenharmony_ci    {
125e18e3516Sopenharmony_ci    printf("Unrecognised option %s\n", argv[i]);
126e18e3516Sopenharmony_ci    return 1;
127e18e3516Sopenharmony_ci    }
128e18e3516Sopenharmony_ci  else break;
129e18e3516Sopenharmony_ci  }
130e18e3516Sopenharmony_ci
131e18e3516Sopenharmony_ci/* After the options, we require exactly two arguments, which are the pattern,
132e18e3516Sopenharmony_ciand the subject string. */
133e18e3516Sopenharmony_ci
134e18e3516Sopenharmony_ciif (argc - i != 2)
135e18e3516Sopenharmony_ci  {
136e18e3516Sopenharmony_ci  printf("Exactly two arguments required: a regex and a subject string\n");
137e18e3516Sopenharmony_ci  return 1;
138e18e3516Sopenharmony_ci  }
139e18e3516Sopenharmony_ci
140e18e3516Sopenharmony_ci/* Pattern and subject are char arguments, so they can be straightforwardly
141e18e3516Sopenharmony_cicast to PCRE2_SPTR because we are working in 8-bit code units. The subject
142e18e3516Sopenharmony_cilength is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
143e18e3516Sopenharmony_cidefined to be size_t. */
144e18e3516Sopenharmony_ci
145e18e3516Sopenharmony_cipattern = (PCRE2_SPTR)argv[i];
146e18e3516Sopenharmony_cisubject = (PCRE2_SPTR)argv[i+1];
147e18e3516Sopenharmony_cisubject_length = (PCRE2_SIZE)strlen((char *)subject);
148e18e3516Sopenharmony_ci
149e18e3516Sopenharmony_ci
150e18e3516Sopenharmony_ci/*************************************************************************
151e18e3516Sopenharmony_ci* Now we are going to compile the regular expression pattern, and handle *
152e18e3516Sopenharmony_ci* any errors that are detected.                                          *
153e18e3516Sopenharmony_ci*************************************************************************/
154e18e3516Sopenharmony_ci
155e18e3516Sopenharmony_cire = pcre2_compile(
156e18e3516Sopenharmony_ci  pattern,               /* the pattern */
157e18e3516Sopenharmony_ci  PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
158e18e3516Sopenharmony_ci  0,                     /* default options */
159e18e3516Sopenharmony_ci  &amp;errornumber,          /* for error number */
160e18e3516Sopenharmony_ci  &amp;erroroffset,          /* for error offset */
161e18e3516Sopenharmony_ci  NULL);                 /* use default compile context */
162e18e3516Sopenharmony_ci
163e18e3516Sopenharmony_ci/* Compilation failed: print the error message and exit. */
164e18e3516Sopenharmony_ci
165e18e3516Sopenharmony_ciif (re == NULL)
166e18e3516Sopenharmony_ci  {
167e18e3516Sopenharmony_ci  PCRE2_UCHAR buffer[256];
168e18e3516Sopenharmony_ci  pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
169e18e3516Sopenharmony_ci  printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
170e18e3516Sopenharmony_ci    buffer);
171e18e3516Sopenharmony_ci  return 1;
172e18e3516Sopenharmony_ci  }
173e18e3516Sopenharmony_ci
174e18e3516Sopenharmony_ci
175e18e3516Sopenharmony_ci/*************************************************************************
176e18e3516Sopenharmony_ci* If the compilation succeeded, we call PCRE2 again, in order to do a    *
177e18e3516Sopenharmony_ci* pattern match against the subject string. This does just ONE match. If *
178e18e3516Sopenharmony_ci* further matching is needed, it will be done below. Before running the  *
179e18e3516Sopenharmony_ci* match we must set up a match_data block for holding the result. Using  *
180e18e3516Sopenharmony_ci* pcre2_match_data_create_from_pattern() ensures that the block is       *
181e18e3516Sopenharmony_ci* exactly the right size for the number of capturing parentheses in the  *
182e18e3516Sopenharmony_ci* pattern. If you need to know the actual size of a match_data block as  *
183e18e3516Sopenharmony_ci* a number of bytes, you can find it like this:                          *
184e18e3516Sopenharmony_ci*                                                                        *
185e18e3516Sopenharmony_ci* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data);    *
186e18e3516Sopenharmony_ci*************************************************************************/
187e18e3516Sopenharmony_ci
188e18e3516Sopenharmony_cimatch_data = pcre2_match_data_create_from_pattern(re, NULL);
189e18e3516Sopenharmony_ci
190e18e3516Sopenharmony_ci/* Now run the match. */
191e18e3516Sopenharmony_ci
192e18e3516Sopenharmony_circ = pcre2_match(
193e18e3516Sopenharmony_ci  re,                   /* the compiled pattern */
194e18e3516Sopenharmony_ci  subject,              /* the subject string */
195e18e3516Sopenharmony_ci  subject_length,       /* the length of the subject */
196e18e3516Sopenharmony_ci  0,                    /* start at offset 0 in the subject */
197e18e3516Sopenharmony_ci  0,                    /* default options */
198e18e3516Sopenharmony_ci  match_data,           /* block for storing the result */
199e18e3516Sopenharmony_ci  NULL);                /* use default match context */
200e18e3516Sopenharmony_ci
201e18e3516Sopenharmony_ci/* Matching failed: handle error cases */
202e18e3516Sopenharmony_ci
203e18e3516Sopenharmony_ciif (rc &lt; 0)
204e18e3516Sopenharmony_ci  {
205e18e3516Sopenharmony_ci  switch(rc)
206e18e3516Sopenharmony_ci    {
207e18e3516Sopenharmony_ci    case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
208e18e3516Sopenharmony_ci    /*
209e18e3516Sopenharmony_ci    Handle other special cases if you like
210e18e3516Sopenharmony_ci    */
211e18e3516Sopenharmony_ci    default: printf("Matching error %d\n", rc); break;
212e18e3516Sopenharmony_ci    }
213e18e3516Sopenharmony_ci  pcre2_match_data_free(match_data);   /* Release memory used for the match */
214e18e3516Sopenharmony_ci  pcre2_code_free(re);                 /*   data and the compiled pattern. */
215e18e3516Sopenharmony_ci  return 1;
216e18e3516Sopenharmony_ci  }
217e18e3516Sopenharmony_ci
218e18e3516Sopenharmony_ci/* Match succeeded. Get a pointer to the output vector, where string offsets
219e18e3516Sopenharmony_ciare stored. */
220e18e3516Sopenharmony_ci
221e18e3516Sopenharmony_ciovector = pcre2_get_ovector_pointer(match_data);
222e18e3516Sopenharmony_ciprintf("Match succeeded at offset %d\n", (int)ovector[0]);
223e18e3516Sopenharmony_ci
224e18e3516Sopenharmony_ci
225e18e3516Sopenharmony_ci/*************************************************************************
226e18e3516Sopenharmony_ci* We have found the first match within the subject string. If the output *
227e18e3516Sopenharmony_ci* vector wasn't big enough, say so. Then output any substrings that were *
228e18e3516Sopenharmony_ci* captured.                                                              *
229e18e3516Sopenharmony_ci*************************************************************************/
230e18e3516Sopenharmony_ci
231e18e3516Sopenharmony_ci/* The output vector wasn't big enough. This should not happen, because we used
232e18e3516Sopenharmony_cipcre2_match_data_create_from_pattern() above. */
233e18e3516Sopenharmony_ci
234e18e3516Sopenharmony_ciif (rc == 0)
235e18e3516Sopenharmony_ci  printf("ovector was not big enough for all the captured substrings\n");
236e18e3516Sopenharmony_ci
237e18e3516Sopenharmony_ci/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
238e18e3516Sopenharmony_ciassertions. However, there is an option to re-enable the old behaviour. If that
239e18e3516Sopenharmony_ciis set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
240e18e3516Sopenharmony_ciassertion to set the start of a match later than its end. In this demonstration
241e18e3516Sopenharmony_ciprogram, we show how to detect this case, but it shouldn't arise because the
242e18e3516Sopenharmony_cioption is never set. */
243e18e3516Sopenharmony_ci
244e18e3516Sopenharmony_ciif (ovector[0] &gt; ovector[1])
245e18e3516Sopenharmony_ci  {
246e18e3516Sopenharmony_ci  printf("\\K was used in an assertion to set the match start after its end.\n"
247e18e3516Sopenharmony_ci    "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
248e18e3516Sopenharmony_ci      (char *)(subject + ovector[1]));
249e18e3516Sopenharmony_ci  printf("Run abandoned\n");
250e18e3516Sopenharmony_ci  pcre2_match_data_free(match_data);
251e18e3516Sopenharmony_ci  pcre2_code_free(re);
252e18e3516Sopenharmony_ci  return 1;
253e18e3516Sopenharmony_ci  }
254e18e3516Sopenharmony_ci
255e18e3516Sopenharmony_ci/* Show substrings stored in the output vector by number. Obviously, in a real
256e18e3516Sopenharmony_ciapplication you might want to do things other than print them. */
257e18e3516Sopenharmony_ci
258e18e3516Sopenharmony_cifor (i = 0; i &lt; rc; i++)
259e18e3516Sopenharmony_ci  {
260e18e3516Sopenharmony_ci  PCRE2_SPTR substring_start = subject + ovector[2*i];
261e18e3516Sopenharmony_ci  PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
262e18e3516Sopenharmony_ci  printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
263e18e3516Sopenharmony_ci  }
264e18e3516Sopenharmony_ci
265e18e3516Sopenharmony_ci
266e18e3516Sopenharmony_ci/**************************************************************************
267e18e3516Sopenharmony_ci* That concludes the basic part of this demonstration program. We have    *
268e18e3516Sopenharmony_ci* compiled a pattern, and performed a single match. The code that follows *
269e18e3516Sopenharmony_ci* shows first how to access named substrings, and then how to code for    *
270e18e3516Sopenharmony_ci* repeated matches on the same subject.                                   *
271e18e3516Sopenharmony_ci**************************************************************************/
272e18e3516Sopenharmony_ci
273e18e3516Sopenharmony_ci/* See if there are any named substrings, and if so, show them by name. First
274e18e3516Sopenharmony_ciwe have to extract the count of named parentheses from the pattern. */
275e18e3516Sopenharmony_ci
276e18e3516Sopenharmony_ci(void)pcre2_pattern_info(
277e18e3516Sopenharmony_ci  re,                   /* the compiled pattern */
278e18e3516Sopenharmony_ci  PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
279e18e3516Sopenharmony_ci  &amp;namecount);          /* where to put the answer */
280e18e3516Sopenharmony_ci
281e18e3516Sopenharmony_ciif (namecount == 0) printf("No named substrings\n"); else
282e18e3516Sopenharmony_ci  {
283e18e3516Sopenharmony_ci  PCRE2_SPTR tabptr;
284e18e3516Sopenharmony_ci  printf("Named substrings\n");
285e18e3516Sopenharmony_ci
286e18e3516Sopenharmony_ci  /* Before we can access the substrings, we must extract the table for
287e18e3516Sopenharmony_ci  translating names to numbers, and the size of each entry in the table. */
288e18e3516Sopenharmony_ci
289e18e3516Sopenharmony_ci  (void)pcre2_pattern_info(
290e18e3516Sopenharmony_ci    re,                       /* the compiled pattern */
291e18e3516Sopenharmony_ci    PCRE2_INFO_NAMETABLE,     /* address of the table */
292e18e3516Sopenharmony_ci    &amp;name_table);             /* where to put the answer */
293e18e3516Sopenharmony_ci
294e18e3516Sopenharmony_ci  (void)pcre2_pattern_info(
295e18e3516Sopenharmony_ci    re,                       /* the compiled pattern */
296e18e3516Sopenharmony_ci    PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
297e18e3516Sopenharmony_ci    &amp;name_entry_size);        /* where to put the answer */
298e18e3516Sopenharmony_ci
299e18e3516Sopenharmony_ci  /* Scan the table and, for each entry, print the group number, the name, and
300e18e3516Sopenharmony_ci  the substring. In the 8-bit library the number is held in the first two bytes,
301e18e3516Sopenharmony_ci  most significant first; the name follows it and is printed right-justified. */
302e18e3516Sopenharmony_ci
303e18e3516Sopenharmony_ci  tabptr = name_table;
304e18e3516Sopenharmony_ci  for (i = 0; i &lt; namecount; i++)
305e18e3516Sopenharmony_ci    {
306e18e3516Sopenharmony_ci    int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];  /* group number */
307e18e3516Sopenharmony_ci    printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
308e18e3516Sopenharmony_ci      (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
309e18e3516Sopenharmony_ci    tabptr += name_entry_size;  /* step to the next table entry */
310e18e3516Sopenharmony_ci    }
311e18e3516Sopenharmony_ci  }
312e18e3516Sopenharmony_ci
313e18e3516Sopenharmony_ci
314e18e3516Sopenharmony_ci/*************************************************************************
315e18e3516Sopenharmony_ci* If the "-g" option was given on the command line, we want to continue  *
316e18e3516Sopenharmony_ci* to search for additional matches in the subject string, in a similar   *
317e18e3516Sopenharmony_ci* way to the /g option in Perl. This turns out to be trickier than you   *
318e18e3516Sopenharmony_ci* might think because of the possibility of matching an empty string.    *
319e18e3516Sopenharmony_ci* What happens is as follows:                                            *
320e18e3516Sopenharmony_ci*                                                                        *
321e18e3516Sopenharmony_ci* If the previous match was NOT for an empty string, we can just start   *
322e18e3516Sopenharmony_ci* the next match at the end of the previous one.                         *
323e18e3516Sopenharmony_ci*                                                                        *
324e18e3516Sopenharmony_ci* If the previous match WAS for an empty string, we can't do that, as it *
325e18e3516Sopenharmony_ci* would lead to an infinite loop. Instead, a call of pcre2_match() is    *
326e18e3516Sopenharmony_ci* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
327e18e3516Sopenharmony_ci* first of these tells PCRE2 that an empty string at the start of the    *
328e18e3516Sopenharmony_ci* subject is not a valid match; other possibilities must be tried. The   *
329e18e3516Sopenharmony_ci* second flag restricts PCRE2 to one match attempt at the initial string *
330e18e3516Sopenharmony_ci* position. If this match succeeds, an alternative to the empty string   *
331e18e3516Sopenharmony_ci* match has been found, and we can print it and proceed round the loop,  *
332e18e3516Sopenharmony_ci* advancing by the length of whatever was found. If this match does not  *
333e18e3516Sopenharmony_ci* succeed, we still stay in the loop, advancing by just one character.   *
334e18e3516Sopenharmony_ci* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be  *
335e18e3516Sopenharmony_ci* more than one byte.                                                    *
336e18e3516Sopenharmony_ci*                                                                        *
337e18e3516Sopenharmony_ci* However, there is a complication concerned with newlines. When the     *
338e18e3516Sopenharmony_ci* newline convention is such that CRLF is a valid newline, we must       *
339e18e3516Sopenharmony_ci* advance by two characters rather than one. The newline convention can  *
340e18e3516Sopenharmony_ci* be set in the regex by (*CR), etc.; if not, we must find the default.  *
341e18e3516Sopenharmony_ci*************************************************************************/
342e18e3516Sopenharmony_ci
343e18e3516Sopenharmony_ciif (!find_all)     /* Check for -g */
344e18e3516Sopenharmony_ci  {
345e18e3516Sopenharmony_ci  pcre2_match_data_free(match_data);  /* Release the memory that was used */
346e18e3516Sopenharmony_ci  pcre2_code_free(re);                /* for the match data and the pattern. */
347e18e3516Sopenharmony_ci  return 0;                           /* Exit the program. */
348e18e3516Sopenharmony_ci  }
349e18e3516Sopenharmony_ci
350e18e3516Sopenharmony_ci/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
351e18e3516Sopenharmony_cisequence. First, find the options with which the regex was compiled and extract
352e18e3516Sopenharmony_cithe UTF state. */
353e18e3516Sopenharmony_ci
354e18e3516Sopenharmony_ci(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &amp;option_bits);
355e18e3516Sopenharmony_ciutf8 = (option_bits &amp; PCRE2_UTF) != 0;
356e18e3516Sopenharmony_ci
357e18e3516Sopenharmony_ci/* Now find the newline convention and see whether CRLF is a valid newline
358e18e3516Sopenharmony_cisequence. */
359e18e3516Sopenharmony_ci
360e18e3516Sopenharmony_ci(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &amp;newline);
361e18e3516Sopenharmony_cicrlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
362e18e3516Sopenharmony_ci                  newline == PCRE2_NEWLINE_CRLF ||
363e18e3516Sopenharmony_ci                  newline == PCRE2_NEWLINE_ANYCRLF;
364e18e3516Sopenharmony_ci
365e18e3516Sopenharmony_ci/* Loop for second and subsequent matches */
366e18e3516Sopenharmony_ci
367e18e3516Sopenharmony_cifor (;;)
368e18e3516Sopenharmony_ci  {
369e18e3516Sopenharmony_ci  uint32_t options = 0;                   /* Normally no options */
370e18e3516Sopenharmony_ci  PCRE2_SIZE start_offset = ovector[1];   /* Start at end of previous match */
371e18e3516Sopenharmony_ci
372e18e3516Sopenharmony_ci  /* If the previous match was for an empty string, we are finished if we are
373e18e3516Sopenharmony_ci  at the end of the subject. Otherwise, arrange to run another match at the
374e18e3516Sopenharmony_ci  same point to see if a non-empty match can be found. */
375e18e3516Sopenharmony_ci
376e18e3516Sopenharmony_ci  if (ovector[0] == ovector[1])
377e18e3516Sopenharmony_ci    {
378e18e3516Sopenharmony_ci    if (ovector[0] == subject_length) break;
379e18e3516Sopenharmony_ci    options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
380e18e3516Sopenharmony_ci    }
381e18e3516Sopenharmony_ci
382e18e3516Sopenharmony_ci  /* If the previous match was not an empty string, there is one tricky case to
383e18e3516Sopenharmony_ci  consider. If a pattern contains \K within a lookbehind assertion at the
384e18e3516Sopenharmony_ci  start, the end of the matched string can be at the offset where the match
385e18e3516Sopenharmony_ci  started. Without special action, this leads to a loop that keeps on matching
386e18e3516Sopenharmony_ci  the same substring. We must detect this case and arrange to move the start on
387e18e3516Sopenharmony_ci  by one character. The pcre2_get_startchar() function returns the offset of
388e18e3516Sopenharmony_ci  the character at which the previous match actually started. */
389e18e3516Sopenharmony_ci
390e18e3516Sopenharmony_ci  else
391e18e3516Sopenharmony_ci    {
392e18e3516Sopenharmony_ci    PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
393e18e3516Sopenharmony_ci    if (start_offset &lt;= startchar)
394e18e3516Sopenharmony_ci      {
395e18e3516Sopenharmony_ci      if (startchar &gt;= subject_length) break;   /* Reached end of subject.   */
396e18e3516Sopenharmony_ci      start_offset = startchar + 1;             /* Advance by one character. */
397e18e3516Sopenharmony_ci      if (utf8)                                 /* If UTF-8, it may be more  */
398e18e3516Sopenharmony_ci        {                                       /*   than one code unit.     */
399e18e3516Sopenharmony_ci        for (; start_offset &lt; subject_length; start_offset++)
400e18e3516Sopenharmony_ci          if ((subject[start_offset] &amp; 0xc0) != 0x80) break;
401e18e3516Sopenharmony_ci        }
402e18e3516Sopenharmony_ci      }
403e18e3516Sopenharmony_ci    }
404e18e3516Sopenharmony_ci
405e18e3516Sopenharmony_ci  /* Run the next matching operation */
406e18e3516Sopenharmony_ci
407e18e3516Sopenharmony_ci  rc = pcre2_match(
408e18e3516Sopenharmony_ci    re,                   /* the compiled pattern */
409e18e3516Sopenharmony_ci    subject,              /* the subject string */
410e18e3516Sopenharmony_ci    subject_length,       /* the length of the subject */
411e18e3516Sopenharmony_ci    start_offset,         /* starting offset in the subject */
412e18e3516Sopenharmony_ci    options,              /* options */
413e18e3516Sopenharmony_ci    match_data,           /* block for storing the result */
414e18e3516Sopenharmony_ci    NULL);                /* use default match context */
415e18e3516Sopenharmony_ci
416e18e3516Sopenharmony_ci  /* This time, a result of NOMATCH isn't an error. If the value in "options"
417e18e3516Sopenharmony_ci  is zero, it just means we have found all possible matches, so the loop ends.
418e18e3516Sopenharmony_ci  Otherwise, it means we have failed to find a non-empty-string match at a
419e18e3516Sopenharmony_ci  point where there was a previous empty-string match. In this case, we do what
420e18e3516Sopenharmony_ci  Perl does: advance the matching position by one character, and continue. We
421e18e3516Sopenharmony_ci  do this by setting the "end of previous match" offset, because that is picked
422e18e3516Sopenharmony_ci  up at the top of the loop as the point at which to start again.
423e18e3516Sopenharmony_ci
424e18e3516Sopenharmony_ci  There are two complications: (a) When CRLF is a valid newline sequence, and
425e18e3516Sopenharmony_ci  the current position is just before it, advance by an extra byte. (b)
426e18e3516Sopenharmony_ci  Otherwise we must ensure that we skip an entire UTF character if we are in
427e18e3516Sopenharmony_ci  UTF mode. */
428e18e3516Sopenharmony_ci
429e18e3516Sopenharmony_ci  if (rc == PCRE2_ERROR_NOMATCH)
430e18e3516Sopenharmony_ci    {
431e18e3516Sopenharmony_ci    if (options == 0) break;                    /* All matches found */
432e18e3516Sopenharmony_ci    ovector[1] = start_offset + 1;              /* Advance one code unit */
433e18e3516Sopenharmony_ci    if (crlf_is_newline &amp;&amp;                      /* If CRLF is a newline &amp; */
434e18e3516Sopenharmony_ci        start_offset &lt; subject_length - 1 &amp;&amp;    /* we are at CRLF, */
435e18e3516Sopenharmony_ci        subject[start_offset] == '\r' &amp;&amp;
436e18e3516Sopenharmony_ci        subject[start_offset + 1] == '\n')
437e18e3516Sopenharmony_ci      ovector[1] += 1;                          /* Advance by one more. */
438e18e3516Sopenharmony_ci    else if (utf8)                              /* Otherwise, ensure we */
439e18e3516Sopenharmony_ci      {                                         /* advance a whole UTF-8 */
440e18e3516Sopenharmony_ci      while (ovector[1] &lt; subject_length)       /* character. */
441e18e3516Sopenharmony_ci        {
442e18e3516Sopenharmony_ci        if ((subject[ovector[1]] &amp; 0xc0) != 0x80) break;
443e18e3516Sopenharmony_ci        ovector[1] += 1;
444e18e3516Sopenharmony_ci        }
445e18e3516Sopenharmony_ci      }
446e18e3516Sopenharmony_ci    continue;    /* Go round the loop again */
447e18e3516Sopenharmony_ci    }
448e18e3516Sopenharmony_ci
449e18e3516Sopenharmony_ci  /* Other matching errors are not recoverable. */
450e18e3516Sopenharmony_ci
451e18e3516Sopenharmony_ci  if (rc &lt; 0)
452e18e3516Sopenharmony_ci    {
453e18e3516Sopenharmony_ci    printf("Matching error %d\n", rc);
454e18e3516Sopenharmony_ci    pcre2_match_data_free(match_data);
455e18e3516Sopenharmony_ci    pcre2_code_free(re);
456e18e3516Sopenharmony_ci    return 1;
457e18e3516Sopenharmony_ci    }
458e18e3516Sopenharmony_ci
459e18e3516Sopenharmony_ci  /* Match succeeded */
460e18e3516Sopenharmony_ci
461e18e3516Sopenharmony_ci  printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
462e18e3516Sopenharmony_ci
463e18e3516Sopenharmony_ci  /* The match succeeded, but the output vector wasn't big enough. This
464e18e3516Sopenharmony_ci  should not happen. */
465e18e3516Sopenharmony_ci
466e18e3516Sopenharmony_ci  if (rc == 0)
467e18e3516Sopenharmony_ci    printf("ovector was not big enough for all the captured substrings\n");
468e18e3516Sopenharmony_ci
469e18e3516Sopenharmony_ci  /* We must guard against patterns such as /(?=.\K)/ that use \K in an
470e18e3516Sopenharmony_ci  assertion to set the start of a match later than its end. In this
471e18e3516Sopenharmony_ci  demonstration program, we just detect this case and give up. */
472e18e3516Sopenharmony_ci
473e18e3516Sopenharmony_ci  if (ovector[0] &gt; ovector[1])
474e18e3516Sopenharmony_ci    {
475e18e3516Sopenharmony_ci    printf("\\K was used in an assertion to set the match start after its end.\n"
476e18e3516Sopenharmony_ci      "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
477e18e3516Sopenharmony_ci        (char *)(subject + ovector[1]));
478e18e3516Sopenharmony_ci    printf("Run abandoned\n");
479e18e3516Sopenharmony_ci    pcre2_match_data_free(match_data);
480e18e3516Sopenharmony_ci    pcre2_code_free(re);
481e18e3516Sopenharmony_ci    return 1;
482e18e3516Sopenharmony_ci    }
483e18e3516Sopenharmony_ci
484e18e3516Sopenharmony_ci  /* As before, show substrings stored in the output vector by number, and then
485e18e3516Sopenharmony_ci  also any named substrings. */
486e18e3516Sopenharmony_ci
487e18e3516Sopenharmony_ci  for (i = 0; i &lt; rc; i++)
488e18e3516Sopenharmony_ci    {
489e18e3516Sopenharmony_ci    PCRE2_SPTR substring_start = subject + ovector[2*i];
490e18e3516Sopenharmony_ci    size_t substring_length = ovector[2*i+1] - ovector[2*i];
491e18e3516Sopenharmony_ci    printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
492e18e3516Sopenharmony_ci    }
493e18e3516Sopenharmony_ci
494e18e3516Sopenharmony_ci  if (namecount == 0) printf("No named substrings\n"); else
495e18e3516Sopenharmony_ci    {
496e18e3516Sopenharmony_ci    PCRE2_SPTR tabptr = name_table;
497e18e3516Sopenharmony_ci    printf("Named substrings\n");
498e18e3516Sopenharmony_ci    for (i = 0; i &lt; namecount; i++)
499e18e3516Sopenharmony_ci      {
500e18e3516Sopenharmony_ci      int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];  /* group number */
501e18e3516Sopenharmony_ci      printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
502e18e3516Sopenharmony_ci        (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
503e18e3516Sopenharmony_ci      tabptr += name_entry_size;  /* step to the next table entry */
504e18e3516Sopenharmony_ci      }
505e18e3516Sopenharmony_ci    }
506e18e3516Sopenharmony_ci  }      /* End of loop to find second and subsequent matches */
507e18e3516Sopenharmony_ci
508e18e3516Sopenharmony_ciprintf("\n");
509e18e3516Sopenharmony_cipcre2_match_data_free(match_data);
510e18e3516Sopenharmony_cipcre2_code_free(re);
511e18e3516Sopenharmony_cireturn 0;
512e18e3516Sopenharmony_ci}
513e18e3516Sopenharmony_ci
514e18e3516Sopenharmony_ci/* End of pcre2demo.c */
</PRE>
515e18e3516Sopenharmony_ci<p>
516e18e3516Sopenharmony_ciReturn to the <a href="index.html">PCRE2 index page</a>.
517e18e3516Sopenharmony_ci</p>
</body>
</html>
518