<html>
<head>
<title>pcre2demo specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre2demo man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
<ul>
</ul>
<PRE>
/*************************************************
*           PCRE2 DEMONSTRATION PROGRAM          *
*************************************************/

/* This is a demonstration program to illustrate a straightforward way of
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.

There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.

In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:

  cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo

If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:

  cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo

If you do not have pkg-config, you may have to use something like this:

  cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
    -R/usr/local/lib -lpcre2-8 -o pcre2demo

Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE2 are installed on your system. Only some operating
systems (Solaris is one) use the -R option.

Building under Windows:

If you want to statically link this program against a non-dll .a file, you must
define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
the following line. */

/* #define PCRE2_STATIC */

/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */

#define PCRE2_CODE_UNIT_WIDTH 8

#include <stdio.h>
#include <string.h>
#include <pcre2.h>


/**************************************************************************
* Here is the program. The API includes the concept of "contexts" for     *
* setting up unusual interface requirements for compiling and matching,   *
* such as custom memory managers and non-standard newline definitions.    *
* This program does not do any of this, so it makes no use of contexts,   *
* always passing NULL where a context could be given.
* 84e18e3516Sopenharmony_ci**************************************************************************/ 85e18e3516Sopenharmony_ci 86e18e3516Sopenharmony_ciint main(int argc, char **argv) 87e18e3516Sopenharmony_ci{ 88e18e3516Sopenharmony_cipcre2_code *re; 89e18e3516Sopenharmony_ciPCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ 90e18e3516Sopenharmony_ciPCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */ 91e18e3516Sopenharmony_ciPCRE2_SPTR name_table; 92e18e3516Sopenharmony_ci 93e18e3516Sopenharmony_ciint crlf_is_newline; 94e18e3516Sopenharmony_ciint errornumber; 95e18e3516Sopenharmony_ciint find_all; 96e18e3516Sopenharmony_ciint i; 97e18e3516Sopenharmony_ciint rc; 98e18e3516Sopenharmony_ciint utf8; 99e18e3516Sopenharmony_ci 100e18e3516Sopenharmony_ciuint32_t option_bits; 101e18e3516Sopenharmony_ciuint32_t namecount; 102e18e3516Sopenharmony_ciuint32_t name_entry_size; 103e18e3516Sopenharmony_ciuint32_t newline; 104e18e3516Sopenharmony_ci 105e18e3516Sopenharmony_ciPCRE2_SIZE erroroffset; 106e18e3516Sopenharmony_ciPCRE2_SIZE *ovector; 107e18e3516Sopenharmony_ciPCRE2_SIZE subject_length; 108e18e3516Sopenharmony_ci 109e18e3516Sopenharmony_cipcre2_match_data *match_data; 110e18e3516Sopenharmony_ci 111e18e3516Sopenharmony_ci 112e18e3516Sopenharmony_ci/************************************************************************** 113e18e3516Sopenharmony_ci* First, sort out the command line. There is only one possible option at * 114e18e3516Sopenharmony_ci* the moment, "-g" to request repeated matching to find all occurrences, * 115e18e3516Sopenharmony_ci* like Perl's /g option. We set the variable find_all to a non-zero value * 116e18e3516Sopenharmony_ci* if the -g option is present. 
* 117e18e3516Sopenharmony_ci**************************************************************************/ 118e18e3516Sopenharmony_ci 119e18e3516Sopenharmony_cifind_all = 0; 120e18e3516Sopenharmony_cifor (i = 1; i < argc; i++) 121e18e3516Sopenharmony_ci { 122e18e3516Sopenharmony_ci if (strcmp(argv[i], "-g") == 0) find_all = 1; 123e18e3516Sopenharmony_ci else if (argv[i][0] == '-') 124e18e3516Sopenharmony_ci { 125e18e3516Sopenharmony_ci printf("Unrecognised option %s\n", argv[i]); 126e18e3516Sopenharmony_ci return 1; 127e18e3516Sopenharmony_ci } 128e18e3516Sopenharmony_ci else break; 129e18e3516Sopenharmony_ci } 130e18e3516Sopenharmony_ci 131e18e3516Sopenharmony_ci/* After the options, we require exactly two arguments, which are the pattern, 132e18e3516Sopenharmony_ciand the subject string. */ 133e18e3516Sopenharmony_ci 134e18e3516Sopenharmony_ciif (argc - i != 2) 135e18e3516Sopenharmony_ci { 136e18e3516Sopenharmony_ci printf("Exactly two arguments required: a regex and a subject string\n"); 137e18e3516Sopenharmony_ci return 1; 138e18e3516Sopenharmony_ci } 139e18e3516Sopenharmony_ci 140e18e3516Sopenharmony_ci/* Pattern and subject are char arguments, so they can be straightforwardly 141e18e3516Sopenharmony_cicast to PCRE2_SPTR because we are working in 8-bit code units. The subject 142e18e3516Sopenharmony_cilength is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact 143e18e3516Sopenharmony_cidefined to be size_t. */ 144e18e3516Sopenharmony_ci 145e18e3516Sopenharmony_cipattern = (PCRE2_SPTR)argv[i]; 146e18e3516Sopenharmony_cisubject = (PCRE2_SPTR)argv[i+1]; 147e18e3516Sopenharmony_cisubject_length = (PCRE2_SIZE)strlen((char *)subject); 148e18e3516Sopenharmony_ci 149e18e3516Sopenharmony_ci 150e18e3516Sopenharmony_ci/************************************************************************* 151e18e3516Sopenharmony_ci* Now we are going to compile the regular expression pattern, and handle * 152e18e3516Sopenharmony_ci* any errors that are detected. 
* 153e18e3516Sopenharmony_ci*************************************************************************/ 154e18e3516Sopenharmony_ci 155e18e3516Sopenharmony_cire = pcre2_compile( 156e18e3516Sopenharmony_ci pattern, /* the pattern */ 157e18e3516Sopenharmony_ci PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ 158e18e3516Sopenharmony_ci 0, /* default options */ 159e18e3516Sopenharmony_ci &errornumber, /* for error number */ 160e18e3516Sopenharmony_ci &erroroffset, /* for error offset */ 161e18e3516Sopenharmony_ci NULL); /* use default compile context */ 162e18e3516Sopenharmony_ci 163e18e3516Sopenharmony_ci/* Compilation failed: print the error message and exit. */ 164e18e3516Sopenharmony_ci 165e18e3516Sopenharmony_ciif (re == NULL) 166e18e3516Sopenharmony_ci { 167e18e3516Sopenharmony_ci PCRE2_UCHAR buffer[256]; 168e18e3516Sopenharmony_ci pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); 169e18e3516Sopenharmony_ci printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, 170e18e3516Sopenharmony_ci buffer); 171e18e3516Sopenharmony_ci return 1; 172e18e3516Sopenharmony_ci } 173e18e3516Sopenharmony_ci 174e18e3516Sopenharmony_ci 175e18e3516Sopenharmony_ci/************************************************************************* 176e18e3516Sopenharmony_ci* If the compilation succeeded, we call PCRE2 again, in order to do a * 177e18e3516Sopenharmony_ci* pattern match against the subject string. This does just ONE match. If * 178e18e3516Sopenharmony_ci* further matching is needed, it will be done below. Before running the * 179e18e3516Sopenharmony_ci* match we must set up a match_data block for holding the result. Using * 180e18e3516Sopenharmony_ci* pcre2_match_data_create_from_pattern() ensures that the block is * 181e18e3516Sopenharmony_ci* exactly the right size for the number of capturing parentheses in the * 182e18e3516Sopenharmony_ci* pattern. 
If you need to know the actual size of a match_data block as * 183e18e3516Sopenharmony_ci* a number of bytes, you can find it like this: * 184e18e3516Sopenharmony_ci* * 185e18e3516Sopenharmony_ci* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); * 186e18e3516Sopenharmony_ci*************************************************************************/ 187e18e3516Sopenharmony_ci 188e18e3516Sopenharmony_cimatch_data = pcre2_match_data_create_from_pattern(re, NULL); 189e18e3516Sopenharmony_ci 190e18e3516Sopenharmony_ci/* Now run the match. */ 191e18e3516Sopenharmony_ci 192e18e3516Sopenharmony_circ = pcre2_match( 193e18e3516Sopenharmony_ci re, /* the compiled pattern */ 194e18e3516Sopenharmony_ci subject, /* the subject string */ 195e18e3516Sopenharmony_ci subject_length, /* the length of the subject */ 196e18e3516Sopenharmony_ci 0, /* start at offset 0 in the subject */ 197e18e3516Sopenharmony_ci 0, /* default options */ 198e18e3516Sopenharmony_ci match_data, /* block for storing the result */ 199e18e3516Sopenharmony_ci NULL); /* use default match context */ 200e18e3516Sopenharmony_ci 201e18e3516Sopenharmony_ci/* Matching failed: handle error cases */ 202e18e3516Sopenharmony_ci 203e18e3516Sopenharmony_ciif (rc < 0) 204e18e3516Sopenharmony_ci { 205e18e3516Sopenharmony_ci switch(rc) 206e18e3516Sopenharmony_ci { 207e18e3516Sopenharmony_ci case PCRE2_ERROR_NOMATCH: printf("No match\n"); break; 208e18e3516Sopenharmony_ci /* 209e18e3516Sopenharmony_ci Handle other special cases if you like 210e18e3516Sopenharmony_ci */ 211e18e3516Sopenharmony_ci default: printf("Matching error %d\n", rc); break; 212e18e3516Sopenharmony_ci } 213e18e3516Sopenharmony_ci pcre2_match_data_free(match_data); /* Release memory used for the match */ 214e18e3516Sopenharmony_ci pcre2_code_free(re); /* data and the compiled pattern. */ 215e18e3516Sopenharmony_ci return 1; 216e18e3516Sopenharmony_ci } 217e18e3516Sopenharmony_ci 218e18e3516Sopenharmony_ci/* Match succeeded. 
Get a pointer to the output vector, where string offsets 219e18e3516Sopenharmony_ciare stored. */ 220e18e3516Sopenharmony_ci 221e18e3516Sopenharmony_ciovector = pcre2_get_ovector_pointer(match_data); 222e18e3516Sopenharmony_ciprintf("Match succeeded at offset %d\n", (int)ovector[0]); 223e18e3516Sopenharmony_ci 224e18e3516Sopenharmony_ci 225e18e3516Sopenharmony_ci/************************************************************************* 226e18e3516Sopenharmony_ci* We have found the first match within the subject string. If the output * 227e18e3516Sopenharmony_ci* vector wasn't big enough, say so. Then output any substrings that were * 228e18e3516Sopenharmony_ci* captured. * 229e18e3516Sopenharmony_ci*************************************************************************/ 230e18e3516Sopenharmony_ci 231e18e3516Sopenharmony_ci/* The output vector wasn't big enough. This should not happen, because we used 232e18e3516Sopenharmony_cipcre2_match_data_create_from_pattern() above. */ 233e18e3516Sopenharmony_ci 234e18e3516Sopenharmony_ciif (rc == 0) 235e18e3516Sopenharmony_ci printf("ovector was not big enough for all the captured substrings\n"); 236e18e3516Sopenharmony_ci 237e18e3516Sopenharmony_ci/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround 238e18e3516Sopenharmony_ciassertions. However, there is an option to re-enable the old behaviour. If that 239e18e3516Sopenharmony_ciis set, it is possible to run patterns such as /(?=.\K)/ that use \K in an 240e18e3516Sopenharmony_ciassertion to set the start of a match later than its end. In this demonstration 241e18e3516Sopenharmony_ciprogram, we show how to detect this case, but it shouldn't arise because the 242e18e3516Sopenharmony_cioption is never set. 
*/ 243e18e3516Sopenharmony_ci 244e18e3516Sopenharmony_ciif (ovector[0] > ovector[1]) 245e18e3516Sopenharmony_ci { 246e18e3516Sopenharmony_ci printf("\\K was used in an assertion to set the match start after its end.\n" 247e18e3516Sopenharmony_ci "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), 248e18e3516Sopenharmony_ci (char *)(subject + ovector[1])); 249e18e3516Sopenharmony_ci printf("Run abandoned\n"); 250e18e3516Sopenharmony_ci pcre2_match_data_free(match_data); 251e18e3516Sopenharmony_ci pcre2_code_free(re); 252e18e3516Sopenharmony_ci return 1; 253e18e3516Sopenharmony_ci } 254e18e3516Sopenharmony_ci 255e18e3516Sopenharmony_ci/* Show substrings stored in the output vector by number. Obviously, in a real 256e18e3516Sopenharmony_ciapplication you might want to do things other than print them. */ 257e18e3516Sopenharmony_ci 258e18e3516Sopenharmony_cifor (i = 0; i < rc; i++) 259e18e3516Sopenharmony_ci { 260e18e3516Sopenharmony_ci PCRE2_SPTR substring_start = subject + ovector[2*i]; 261e18e3516Sopenharmony_ci PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i]; 262e18e3516Sopenharmony_ci printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); 263e18e3516Sopenharmony_ci } 264e18e3516Sopenharmony_ci 265e18e3516Sopenharmony_ci 266e18e3516Sopenharmony_ci/************************************************************************** 267e18e3516Sopenharmony_ci* That concludes the basic part of this demonstration program. We have * 268e18e3516Sopenharmony_ci* compiled a pattern, and performed a single match. The code that follows * 269e18e3516Sopenharmony_ci* shows first how to access named substrings, and then how to code for * 270e18e3516Sopenharmony_ci* repeated matches on the same subject. * 271e18e3516Sopenharmony_ci**************************************************************************/ 272e18e3516Sopenharmony_ci 273e18e3516Sopenharmony_ci/* See if there are any named substrings, and if so, show them by name. 
First 274e18e3516Sopenharmony_ciwe have to extract the count of named parentheses from the pattern. */ 275e18e3516Sopenharmony_ci 276e18e3516Sopenharmony_ci(void)pcre2_pattern_info( 277e18e3516Sopenharmony_ci re, /* the compiled pattern */ 278e18e3516Sopenharmony_ci PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ 279e18e3516Sopenharmony_ci &namecount); /* where to put the answer */ 280e18e3516Sopenharmony_ci 281e18e3516Sopenharmony_ciif (namecount == 0) printf("No named substrings\n"); else 282e18e3516Sopenharmony_ci { 283e18e3516Sopenharmony_ci PCRE2_SPTR tabptr; 284e18e3516Sopenharmony_ci printf("Named substrings\n"); 285e18e3516Sopenharmony_ci 286e18e3516Sopenharmony_ci /* Before we can access the substrings, we must extract the table for 287e18e3516Sopenharmony_ci translating names to numbers, and the size of each entry in the table. */ 288e18e3516Sopenharmony_ci 289e18e3516Sopenharmony_ci (void)pcre2_pattern_info( 290e18e3516Sopenharmony_ci re, /* the compiled pattern */ 291e18e3516Sopenharmony_ci PCRE2_INFO_NAMETABLE, /* address of the table */ 292e18e3516Sopenharmony_ci &name_table); /* where to put the answer */ 293e18e3516Sopenharmony_ci 294e18e3516Sopenharmony_ci (void)pcre2_pattern_info( 295e18e3516Sopenharmony_ci re, /* the compiled pattern */ 296e18e3516Sopenharmony_ci PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ 297e18e3516Sopenharmony_ci &name_entry_size); /* where to put the answer */ 298e18e3516Sopenharmony_ci 299e18e3516Sopenharmony_ci /* Now we can scan the table and, for each entry, print the number, the name, 300e18e3516Sopenharmony_ci and the substring itself. In the 8-bit library the number is held in two 301e18e3516Sopenharmony_ci bytes, most significant first. 
*/ 302e18e3516Sopenharmony_ci 303e18e3516Sopenharmony_ci tabptr = name_table; 304e18e3516Sopenharmony_ci for (i = 0; i < namecount; i++) 305e18e3516Sopenharmony_ci { 306e18e3516Sopenharmony_ci int n = (tabptr[0] << 8) | tabptr[1]; 307e18e3516Sopenharmony_ci printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 308e18e3516Sopenharmony_ci (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); 309e18e3516Sopenharmony_ci tabptr += name_entry_size; 310e18e3516Sopenharmony_ci } 311e18e3516Sopenharmony_ci } 312e18e3516Sopenharmony_ci 313e18e3516Sopenharmony_ci 314e18e3516Sopenharmony_ci/************************************************************************* 315e18e3516Sopenharmony_ci* If the "-g" option was given on the command line, we want to continue * 316e18e3516Sopenharmony_ci* to search for additional matches in the subject string, in a similar * 317e18e3516Sopenharmony_ci* way to the /g option in Perl. This turns out to be trickier than you * 318e18e3516Sopenharmony_ci* might think because of the possibility of matching an empty string. * 319e18e3516Sopenharmony_ci* What happens is as follows: * 320e18e3516Sopenharmony_ci* * 321e18e3516Sopenharmony_ci* If the previous match was NOT for an empty string, we can just start * 322e18e3516Sopenharmony_ci* the next match at the end of the previous one. * 323e18e3516Sopenharmony_ci* * 324e18e3516Sopenharmony_ci* If the previous match WAS for an empty string, we can't do that, as it * 325e18e3516Sopenharmony_ci* would lead to an infinite loop. Instead, a call of pcre2_match() is * 326e18e3516Sopenharmony_ci* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The * 327e18e3516Sopenharmony_ci* first of these tells PCRE2 that an empty string at the start of the * 328e18e3516Sopenharmony_ci* subject is not a valid match; other possibilities must be tried. 
The * 329e18e3516Sopenharmony_ci* second flag restricts PCRE2 to one match attempt at the initial string * 330e18e3516Sopenharmony_ci* position. If this match succeeds, an alternative to the empty string * 331e18e3516Sopenharmony_ci* match has been found, and we can print it and proceed round the loop, * 332e18e3516Sopenharmony_ci* advancing by the length of whatever was found. If this match does not * 333e18e3516Sopenharmony_ci* succeed, we still stay in the loop, advancing by just one character. * 334e18e3516Sopenharmony_ci* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be * 335e18e3516Sopenharmony_ci* more than one byte. * 336e18e3516Sopenharmony_ci* * 337e18e3516Sopenharmony_ci* However, there is a complication concerned with newlines. When the * 338e18e3516Sopenharmony_ci* newline convention is such that CRLF is a valid newline, we must * 339e18e3516Sopenharmony_ci* advance by two characters rather than one. The newline convention can * 340e18e3516Sopenharmony_ci* be set in the regex by (*CR), etc.; if not, we must find the default. * 341e18e3516Sopenharmony_ci*************************************************************************/ 342e18e3516Sopenharmony_ci 343e18e3516Sopenharmony_ciif (!find_all) /* Check for -g */ 344e18e3516Sopenharmony_ci { 345e18e3516Sopenharmony_ci pcre2_match_data_free(match_data); /* Release the memory that was used */ 346e18e3516Sopenharmony_ci pcre2_code_free(re); /* for the match data and the pattern. */ 347e18e3516Sopenharmony_ci return 0; /* Exit the program. */ 348e18e3516Sopenharmony_ci } 349e18e3516Sopenharmony_ci 350e18e3516Sopenharmony_ci/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline 351e18e3516Sopenharmony_cisequence. First, find the options with which the regex was compiled and extract 352e18e3516Sopenharmony_cithe UTF state. 
*/ 353e18e3516Sopenharmony_ci 354e18e3516Sopenharmony_ci(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits); 355e18e3516Sopenharmony_ciutf8 = (option_bits & PCRE2_UTF) != 0; 356e18e3516Sopenharmony_ci 357e18e3516Sopenharmony_ci/* Now find the newline convention and see whether CRLF is a valid newline 358e18e3516Sopenharmony_cisequence. */ 359e18e3516Sopenharmony_ci 360e18e3516Sopenharmony_ci(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline); 361e18e3516Sopenharmony_cicrlf_is_newline = newline == PCRE2_NEWLINE_ANY || 362e18e3516Sopenharmony_ci newline == PCRE2_NEWLINE_CRLF || 363e18e3516Sopenharmony_ci newline == PCRE2_NEWLINE_ANYCRLF; 364e18e3516Sopenharmony_ci 365e18e3516Sopenharmony_ci/* Loop for second and subsequent matches */ 366e18e3516Sopenharmony_ci 367e18e3516Sopenharmony_cifor (;;) 368e18e3516Sopenharmony_ci { 369e18e3516Sopenharmony_ci uint32_t options = 0; /* Normally no options */ 370e18e3516Sopenharmony_ci PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ 371e18e3516Sopenharmony_ci 372e18e3516Sopenharmony_ci /* If the previous match was for an empty string, we are finished if we are 373e18e3516Sopenharmony_ci at the end of the subject. Otherwise, arrange to run another match at the 374e18e3516Sopenharmony_ci same point to see if a non-empty match can be found. */ 375e18e3516Sopenharmony_ci 376e18e3516Sopenharmony_ci if (ovector[0] == ovector[1]) 377e18e3516Sopenharmony_ci { 378e18e3516Sopenharmony_ci if (ovector[0] == subject_length) break; 379e18e3516Sopenharmony_ci options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; 380e18e3516Sopenharmony_ci } 381e18e3516Sopenharmony_ci 382e18e3516Sopenharmony_ci /* If the previous match was not an empty string, there is one tricky case to 383e18e3516Sopenharmony_ci consider. 
If a pattern contains \K within a lookbehind assertion at the 384e18e3516Sopenharmony_ci start, the end of the matched string can be at the offset where the match 385e18e3516Sopenharmony_ci started. Without special action, this leads to a loop that keeps on matching 386e18e3516Sopenharmony_ci the same substring. We must detect this case and arrange to move the start on 387e18e3516Sopenharmony_ci by one character. The pcre2_get_startchar() function returns the starting 388e18e3516Sopenharmony_ci offset that was passed to pcre2_match(). */ 389e18e3516Sopenharmony_ci 390e18e3516Sopenharmony_ci else 391e18e3516Sopenharmony_ci { 392e18e3516Sopenharmony_ci PCRE2_SIZE startchar = pcre2_get_startchar(match_data); 393e18e3516Sopenharmony_ci if (start_offset <= startchar) 394e18e3516Sopenharmony_ci { 395e18e3516Sopenharmony_ci if (startchar >= subject_length) break; /* Reached end of subject. */ 396e18e3516Sopenharmony_ci start_offset = startchar + 1; /* Advance by one character. */ 397e18e3516Sopenharmony_ci if (utf8) /* If UTF-8, it may be more */ 398e18e3516Sopenharmony_ci { /* than one code unit. 
*/ 399e18e3516Sopenharmony_ci for (; start_offset < subject_length; start_offset++) 400e18e3516Sopenharmony_ci if ((subject[start_offset] & 0xc0) != 0x80) break; 401e18e3516Sopenharmony_ci } 402e18e3516Sopenharmony_ci } 403e18e3516Sopenharmony_ci } 404e18e3516Sopenharmony_ci 405e18e3516Sopenharmony_ci /* Run the next matching operation */ 406e18e3516Sopenharmony_ci 407e18e3516Sopenharmony_ci rc = pcre2_match( 408e18e3516Sopenharmony_ci re, /* the compiled pattern */ 409e18e3516Sopenharmony_ci subject, /* the subject string */ 410e18e3516Sopenharmony_ci subject_length, /* the length of the subject */ 411e18e3516Sopenharmony_ci start_offset, /* starting offset in the subject */ 412e18e3516Sopenharmony_ci options, /* options */ 413e18e3516Sopenharmony_ci match_data, /* block for storing the result */ 414e18e3516Sopenharmony_ci NULL); /* use default match context */ 415e18e3516Sopenharmony_ci 416e18e3516Sopenharmony_ci /* This time, a result of NOMATCH isn't an error. If the value in "options" 417e18e3516Sopenharmony_ci is zero, it just means we have found all possible matches, so the loop ends. 418e18e3516Sopenharmony_ci Otherwise, it means we have failed to find a non-empty-string match at a 419e18e3516Sopenharmony_ci point where there was a previous empty-string match. In this case, we do what 420e18e3516Sopenharmony_ci Perl does: advance the matching position by one character, and continue. We 421e18e3516Sopenharmony_ci do this by setting the "end of previous match" offset, because that is picked 422e18e3516Sopenharmony_ci up at the top of the loop as the point at which to start again. 423e18e3516Sopenharmony_ci 424e18e3516Sopenharmony_ci There are two complications: (a) When CRLF is a valid newline sequence, and 425e18e3516Sopenharmony_ci the current position is just before it, advance by an extra byte. (b) 426e18e3516Sopenharmony_ci Otherwise we must ensure that we skip an entire UTF character if we are in 427e18e3516Sopenharmony_ci UTF mode. 
*/ 428e18e3516Sopenharmony_ci 429e18e3516Sopenharmony_ci if (rc == PCRE2_ERROR_NOMATCH) 430e18e3516Sopenharmony_ci { 431e18e3516Sopenharmony_ci if (options == 0) break; /* All matches found */ 432e18e3516Sopenharmony_ci ovector[1] = start_offset + 1; /* Advance one code unit */ 433e18e3516Sopenharmony_ci if (crlf_is_newline && /* If CRLF is a newline & */ 434e18e3516Sopenharmony_ci start_offset < subject_length - 1 && /* we are at CRLF, */ 435e18e3516Sopenharmony_ci subject[start_offset] == '\r' && 436e18e3516Sopenharmony_ci subject[start_offset + 1] == '\n') 437e18e3516Sopenharmony_ci ovector[1] += 1; /* Advance by one more. */ 438e18e3516Sopenharmony_ci else if (utf8) /* Otherwise, ensure we */ 439e18e3516Sopenharmony_ci { /* advance a whole UTF-8 */ 440e18e3516Sopenharmony_ci while (ovector[1] < subject_length) /* character. */ 441e18e3516Sopenharmony_ci { 442e18e3516Sopenharmony_ci if ((subject[ovector[1]] & 0xc0) != 0x80) break; 443e18e3516Sopenharmony_ci ovector[1] += 1; 444e18e3516Sopenharmony_ci } 445e18e3516Sopenharmony_ci } 446e18e3516Sopenharmony_ci continue; /* Go round the loop again */ 447e18e3516Sopenharmony_ci } 448e18e3516Sopenharmony_ci 449e18e3516Sopenharmony_ci /* Other matching errors are not recoverable. */ 450e18e3516Sopenharmony_ci 451e18e3516Sopenharmony_ci if (rc < 0) 452e18e3516Sopenharmony_ci { 453e18e3516Sopenharmony_ci printf("Matching error %d\n", rc); 454e18e3516Sopenharmony_ci pcre2_match_data_free(match_data); 455e18e3516Sopenharmony_ci pcre2_code_free(re); 456e18e3516Sopenharmony_ci return 1; 457e18e3516Sopenharmony_ci } 458e18e3516Sopenharmony_ci 459e18e3516Sopenharmony_ci /* Match succeeded */ 460e18e3516Sopenharmony_ci 461e18e3516Sopenharmony_ci printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]); 462e18e3516Sopenharmony_ci 463e18e3516Sopenharmony_ci /* The match succeeded, but the output vector wasn't big enough. This 464e18e3516Sopenharmony_ci should not happen. 
*/ 465e18e3516Sopenharmony_ci 466e18e3516Sopenharmony_ci if (rc == 0) 467e18e3516Sopenharmony_ci printf("ovector was not big enough for all the captured substrings\n"); 468e18e3516Sopenharmony_ci 469e18e3516Sopenharmony_ci /* We must guard against patterns such as /(?=.\K)/ that use \K in an 470e18e3516Sopenharmony_ci assertion to set the start of a match later than its end. In this 471e18e3516Sopenharmony_ci demonstration program, we just detect this case and give up. */ 472e18e3516Sopenharmony_ci 473e18e3516Sopenharmony_ci if (ovector[0] > ovector[1]) 474e18e3516Sopenharmony_ci { 475e18e3516Sopenharmony_ci printf("\\K was used in an assertion to set the match start after its end.\n" 476e18e3516Sopenharmony_ci "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), 477e18e3516Sopenharmony_ci (char *)(subject + ovector[1])); 478e18e3516Sopenharmony_ci printf("Run abandoned\n"); 479e18e3516Sopenharmony_ci pcre2_match_data_free(match_data); 480e18e3516Sopenharmony_ci pcre2_code_free(re); 481e18e3516Sopenharmony_ci return 1; 482e18e3516Sopenharmony_ci } 483e18e3516Sopenharmony_ci 484e18e3516Sopenharmony_ci /* As before, show substrings stored in the output vector by number, and then 485e18e3516Sopenharmony_ci also any named substrings. 
*/ 486e18e3516Sopenharmony_ci 487e18e3516Sopenharmony_ci for (i = 0; i < rc; i++) 488e18e3516Sopenharmony_ci { 489e18e3516Sopenharmony_ci PCRE2_SPTR substring_start = subject + ovector[2*i]; 490e18e3516Sopenharmony_ci size_t substring_length = ovector[2*i+1] - ovector[2*i]; 491e18e3516Sopenharmony_ci printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); 492e18e3516Sopenharmony_ci } 493e18e3516Sopenharmony_ci 494e18e3516Sopenharmony_ci if (namecount == 0) printf("No named substrings\n"); else 495e18e3516Sopenharmony_ci { 496e18e3516Sopenharmony_ci PCRE2_SPTR tabptr = name_table; 497e18e3516Sopenharmony_ci printf("Named substrings\n"); 498e18e3516Sopenharmony_ci for (i = 0; i < namecount; i++) 499e18e3516Sopenharmony_ci { 500e18e3516Sopenharmony_ci int n = (tabptr[0] << 8) | tabptr[1]; 501e18e3516Sopenharmony_ci printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 502e18e3516Sopenharmony_ci (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); 503e18e3516Sopenharmony_ci tabptr += name_entry_size; 504e18e3516Sopenharmony_ci } 505e18e3516Sopenharmony_ci } 506e18e3516Sopenharmony_ci } /* End of loop to find second and subsequent matches */ 507e18e3516Sopenharmony_ci 508e18e3516Sopenharmony_ciprintf("\n"); 509e18e3516Sopenharmony_cipcre2_match_data_free(match_data); 510e18e3516Sopenharmony_cipcre2_code_free(re); 511e18e3516Sopenharmony_cireturn 0; 512e18e3516Sopenharmony_ci} 513e18e3516Sopenharmony_ci 514e18e3516Sopenharmony_ci/* End of pcre2demo.c */ 515e18e3516Sopenharmony_ci<p> 516e18e3516Sopenharmony_ciReturn to the <a href="index.html">PCRE2 index page</a>. 517e18e3516Sopenharmony_ci</p> 518