1/************************************************* 2* Perl-Compatible Regular Expressions * 3*************************************************/ 4 5/* PCRE is a library of functions to support regular expressions whose syntax 6and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016-2022 University of Cambridge 11 12----------------------------------------------------------------------------- 13Redistribution and use in source and binary forms, with or without 14modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37POSSIBILITY OF SUCH DAMAGE. 38----------------------------------------------------------------------------- 39*/ 40 41 42#ifdef HAVE_CONFIG_H 43#include "config.h" 44#endif 45 46#include "pcre2_internal.h" 47 48#define PTR_STACK_SIZE 20 49 50#define SUBSTITUTE_OPTIONS \ 51 (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ 52 PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \ 53 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \ 54 PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY) 55 56 57 58/************************************************* 59* Find end of substitute text * 60*************************************************/ 61 62/* In extended mode, we recognize ${name:+set text:unset text} and similar 63constructions. This requires the identification of unescaped : and } 64characters. This function scans for such. It must deal with nested ${ 65constructions. The pointer to the text is updated, either to the required end 66character, or to where an error was detected. 67 68Arguments: 69 code points to the compiled expression (for options) 70 ptrptr points to the pointer to the start of the text (updated) 71 ptrend end of the whole string 72 last TRUE if the last expected string (only } recognized) 73 74Returns: 0 on success 75 negative error code on failure 76*/ 77 78static int 79find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, 80 BOOL last) 81{ 82int rc = 0; 83uint32_t nestlevel = 0; 84BOOL literal = FALSE; 85PCRE2_SPTR ptr = *ptrptr; 86 87for (; ptr < ptrend; ptr++) 88 { 89 if (literal) 90 { 91 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) 92 { 93 literal = FALSE; 94 ptr += 1; 95 } 96 } 97 98 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 99 { 100 if (nestlevel == 0) goto EXIT; 101 nestlevel--; 102 } 103 104 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; 105 106 else if (*ptr == CHAR_DOLLAR_SIGN) 107 { 108 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) 109 { 110 nestlevel++; 111 ptr += 1; 112 } 113 } 114 115 else if (*ptr == CHAR_BACKSLASH) 116 { 117 int erc; 118 int errorcode; 119 uint32_t ch; 120 121 if (ptr < ptrend - 1) switch (ptr[1]) 122 { 123 case CHAR_L: 124 case CHAR_l: 125 case CHAR_U: 126 case CHAR_u: 127 ptr += 1; 128 continue; 129 } 130 131 ptr += 1; /* Must point after \ */ 132 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, 133 code->overall_options, code->extra_options, FALSE, NULL); 134 ptr -= 1; /* Back to last code unit of escape */ 135 if (errorcode != 0) 136 { 137 rc = errorcode; 138 goto EXIT; 139 } 140 141 switch(erc) 142 { 143 case 0: /* Data character */ 144 case ESC_E: /* Isolated \E is ignored */ 145 break; 146 147 case ESC_Q: 148 literal = TRUE; 149 break; 150 151 default: 152 rc = PCRE2_ERROR_BADREPESCAPE; 153 goto EXIT; 154 } 155 } 156 } 157 158rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ 159 160EXIT: 161*ptrptr = ptr; 162return rc; 163} 164 165 166 167/************************************************* 168* Match and substitute * 169*************************************************/ 170 171/* This function applies a compiled re to a subject string and creates a new 172string with substitutions. The first 7 arguments are the same as for 173pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. 174 175Arguments: 176 code points to the compiled expression 177 subject points to the subject string 178 length length of subject string (may contain binary zeros) 179 start_offset where to start in the subject string 180 options option bits 181 match_data points to a match_data block, or is NULL 182 context points a PCRE2 context 183 replacement points to the replacement string 184 rlength length of replacement string 185 buffer where to put the substituted string 186 blength points to length of buffer; updated to length of string 187 188Returns: >= 0 number of substitutions made 189 < 0 an error code 190 PCRE2_ERROR_BADREPLACEMENT means invalid use of $ 191*/ 192 193/* This macro checks for space in the buffer before copying into it. On 194overflow, either give an error immediately, or keep on, accumulating the 195length. */ 196 197#define CHECKMEMCPY(from,length) \ 198 { \ 199 if (!overflowed && lengthleft < length) \ 200 { \ 201 if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ 202 overflowed = TRUE; \ 203 extra_needed = length - lengthleft; \ 204 } \ 205 else if (overflowed) \ 206 { \ 207 extra_needed += length; \ 208 } \ 209 else \ 210 { \ 211 memcpy(buffer + buff_offset, from, CU2BYTES(length)); \ 212 buff_offset += length; \ 213 lengthleft -= length; \ 214 } \ 215 } 216 217/* Here's the function */ 218 219PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 220pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, 221 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, 222 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, 223 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) 224{ 225int rc; 226int subs; 227int forcecase = 0; 228int forcecasereset = 0; 229uint32_t ovector_count; 230uint32_t goptions = 0; 231uint32_t suboptions; 232pcre2_match_data *internal_match_data = NULL; 233BOOL escaped_literal = FALSE; 234BOOL overflowed = FALSE; 235BOOL use_existing_match; 236BOOL replacement_only; 237#ifdef SUPPORT_UNICODE 238BOOL utf = (code->overall_options & PCRE2_UTF) != 0; 239BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; 240#endif 241PCRE2_UCHAR temp[6]; 242PCRE2_SPTR ptr; 243PCRE2_SPTR repend; 244PCRE2_SIZE extra_needed = 0; 245PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; 246PCRE2_SIZE *ovector; 247PCRE2_SIZE ovecsave[3]; 248pcre2_substitute_callout_block scb; 249 250/* General initialization */ 251 252buff_offset = 0; 253lengthleft = buff_length = *blength; 254*blength = PCRE2_UNSET; 255ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; 256 257/* Partial matching is not valid. This must come after setting *blength to 258PCRE2_UNSET, so as not to imply an offset in the replacement. */ 259 260if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) 261 return PCRE2_ERROR_BADOPTION; 262 263/* Validate length and find the end of the replacement. A NULL replacement of 264zero length is interpreted as an empty string. */ 265 266if (replacement == NULL) 267 { 268 if (rlength != 0) return PCRE2_ERROR_NULL; 269 replacement = (PCRE2_SPTR)""; 270 } 271 272if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); 273repend = replacement + rlength; 274 275/* Check for using a match that has already happened. Note that the subject 276pointer in the match data may be NULL after a no-match. */ 277 278use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); 279replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); 280 281/* If starting from an existing match, there must be an externally provided 282match data block. We create an internal match_data block in two cases: (a) an 283external one is not supplied (and we are not starting from an existing match); 284(b) an existing match is to be used for the first substitution. In the latter 285case, we copy the existing match into the internal block, except for any cached 286heap frame size and pointer. This ensures that no changes are made to the 287external match data block. */ 288 289if (match_data == NULL) 290 { 291 pcre2_general_context *gcontext; 292 if (use_existing_match) return PCRE2_ERROR_NULL; 293 gcontext = (mcontext == NULL)? 294 (pcre2_general_context *)code : 295 (pcre2_general_context *)mcontext; 296 match_data = internal_match_data = 297 pcre2_match_data_create_from_pattern(code, gcontext); 298 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; 299 } 300 301else if (use_existing_match) 302 { 303 pcre2_general_context *gcontext = (mcontext == NULL)? 304 (pcre2_general_context *)code : 305 (pcre2_general_context *)mcontext; 306 int pairs = (code->top_bracket + 1 < match_data->oveccount)? 307 code->top_bracket + 1 : match_data->oveccount; 308 internal_match_data = pcre2_match_data_create(match_data->oveccount, 309 gcontext); 310 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; 311 memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) 312 + 2*pairs*sizeof(PCRE2_SIZE)); 313 internal_match_data->heapframes = NULL; 314 internal_match_data->heapframes_size = 0; 315 match_data = internal_match_data; 316 } 317 318/* Remember ovector details */ 319 320ovector = pcre2_get_ovector_pointer(match_data); 321ovector_count = pcre2_get_ovector_count(match_data); 322 323/* Fixed things in the callout block */ 324 325scb.version = 0; 326scb.input = subject; 327scb.output = (PCRE2_SPTR)buffer; 328scb.ovector = ovector; 329 330/* A NULL subject of zero length is treated as an empty string. */ 331 332if (subject == NULL) 333 { 334 if (length != 0) return PCRE2_ERROR_NULL; 335 subject = (PCRE2_SPTR)""; 336 } 337 338/* Find length of zero-terminated subject */ 339 340if (length == PCRE2_ZERO_TERMINATED) 341 length = subject? PRIV(strlen)(subject) : 0; 342 343/* Check UTF replacement string if necessary. */ 344 345#ifdef SUPPORT_UNICODE 346if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) 347 { 348 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); 349 if (rc != 0) 350 { 351 match_data->leftchar = 0; 352 goto EXIT; 353 } 354 } 355#endif /* SUPPORT_UNICODE */ 356 357/* Save the substitute options and remove them from the match options. */ 358 359suboptions = options & SUBSTITUTE_OPTIONS; 360options &= ~SUBSTITUTE_OPTIONS; 361 362/* Error if the start match offset is greater than the length of the subject. */ 363 364if (start_offset > length) 365 { 366 match_data->leftchar = 0; 367 rc = PCRE2_ERROR_BADOFFSET; 368 goto EXIT; 369 } 370 371/* Copy up to the start offset, unless only the replacement is required. */ 372 373if (!replacement_only) CHECKMEMCPY(subject, start_offset); 374 375/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first 376match is taken from the match_data that was passed in. */ 377 378subs = 0; 379do 380 { 381 PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; 382 uint32_t ptrstackptr = 0; 383 384 if (use_existing_match) 385 { 386 rc = match_data->rc; 387 use_existing_match = FALSE; 388 } 389 else rc = pcre2_match(code, subject, length, start_offset, options|goptions, 390 match_data, mcontext); 391 392#ifdef SUPPORT_UNICODE 393 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ 394#endif 395 396 /* Any error other than no match returns the error code. No match when not 397 doing the special after-empty-match global rematch, or when at the end of the 398 subject, breaks the global loop. Otherwise, advance the starting point by one 399 character, copying it to the output, and try again. */ 400 401 if (rc < 0) 402 { 403 PCRE2_SIZE save_start; 404 405 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; 406 if (goptions == 0 || start_offset >= length) break; 407 408 /* Advance by one code point. Then, if CRLF is a valid newline sequence and 409 we have advanced into the middle of it, advance one more code point. In 410 other words, do not start in the middle of CRLF, even if CR and LF on their 411 own are valid newlines. */ 412 413 save_start = start_offset++; 414 if (subject[start_offset-1] == CHAR_CR && 415 code->newline_convention != PCRE2_NEWLINE_CR && 416 code->newline_convention != PCRE2_NEWLINE_LF && 417 start_offset < length && 418 subject[start_offset] == CHAR_LF) 419 start_offset++; 420 421 /* Otherwise, in UTF mode, advance past any secondary code points. */ 422 423 else if ((code->overall_options & PCRE2_UTF) != 0) 424 { 425#if PCRE2_CODE_UNIT_WIDTH == 8 426 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) 427 start_offset++; 428#elif PCRE2_CODE_UNIT_WIDTH == 16 429 while (start_offset < length && 430 (subject[start_offset] & 0xfc00) == 0xdc00) 431 start_offset++; 432#endif 433 } 434 435 /* Copy what we have advanced past (unless not required), reset the special 436 global options, and continue to the next match. */ 437 438 fraglength = start_offset - save_start; 439 if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); 440 goptions = 0; 441 continue; 442 } 443 444 /* Handle a successful match. Matches that use \K to end before they start 445 or start before the current point in the subject are not supported. */ 446 447 if (ovector[1] < ovector[0] || ovector[0] < start_offset) 448 { 449 rc = PCRE2_ERROR_BADSUBSPATTERN; 450 goto EXIT; 451 } 452 453 /* Check for the same match as previous. This is legitimate after matching an 454 empty string that starts after the initial match offset. We have tried again 455 at the match point in case the pattern is one like /(?<=\G.)/ which can never 456 match at its starting point, so running the match achieves the bumpalong. If 457 we do get the same (null) match at the original match point, it isn't such a 458 pattern, so we now do the empty string magic. In all other cases, a repeat 459 match should never occur. */ 460 461 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) 462 { 463 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) 464 { 465 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; 466 ovecsave[2] = start_offset; 467 continue; /* Back to the top of the loop */ 468 } 469 rc = PCRE2_ERROR_INTERNAL_DUPMATCH; 470 goto EXIT; 471 } 472 473 /* Count substitutions with a paranoid check for integer overflow; surely no 474 real call to this function would ever hit this! */ 475 476 if (subs == INT_MAX) 477 { 478 rc = PCRE2_ERROR_TOOMANYREPLACE; 479 goto EXIT; 480 } 481 subs++; 482 483 /* Copy the text leading up to the match (unless not required), and remember 484 where the insert begins and how many ovector pairs are set. */ 485 486 if (rc == 0) rc = ovector_count; 487 fraglength = ovector[0] - start_offset; 488 if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); 489 scb.output_offsets[0] = buff_offset; 490 scb.oveccount = rc; 491 492 /* Process the replacement string. If the entire replacement is literal, just 493 copy it with length check. */ 494 495 ptr = replacement; 496 if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) 497 { 498 CHECKMEMCPY(ptr, rlength); 499 } 500 501 /* Within a non-literal replacement, which must be scanned character by 502 character, local literal mode can be set by \Q, but only in extended mode 503 when backslashes are being interpreted. In extended mode we must handle 504 nested substrings that are to be reprocessed. */ 505 506 else for (;;) 507 { 508 uint32_t ch; 509 unsigned int chlen; 510 511 /* If at the end of a nested substring, pop the stack. */ 512 513 if (ptr >= repend) 514 { 515 if (ptrstackptr == 0) break; /* End of replacement string */ 516 repend = ptrstack[--ptrstackptr]; 517 ptr = ptrstack[--ptrstackptr]; 518 continue; 519 } 520 521 /* Handle the next character */ 522 523 if (escaped_literal) 524 { 525 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) 526 { 527 escaped_literal = FALSE; 528 ptr += 2; 529 continue; 530 } 531 goto LOADLITERAL; 532 } 533 534 /* Not in literal mode. */ 535 536 if (*ptr == CHAR_DOLLAR_SIGN) 537 { 538 int group, n; 539 uint32_t special = 0; 540 BOOL inparens; 541 BOOL star; 542 PCRE2_SIZE sublength; 543 PCRE2_SPTR text1_start = NULL; 544 PCRE2_SPTR text1_end = NULL; 545 PCRE2_SPTR text2_start = NULL; 546 PCRE2_SPTR text2_end = NULL; 547 PCRE2_UCHAR next; 548 PCRE2_UCHAR name[33]; 549 550 if (++ptr >= repend) goto BAD; 551 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; 552 553 group = -1; 554 n = 0; 555 inparens = FALSE; 556 star = FALSE; 557 558 if (next == CHAR_LEFT_CURLY_BRACKET) 559 { 560 if (++ptr >= repend) goto BAD; 561 next = *ptr; 562 inparens = TRUE; 563 } 564 565 if (next == CHAR_ASTERISK) 566 { 567 if (++ptr >= repend) goto BAD; 568 next = *ptr; 569 star = TRUE; 570 } 571 572 if (!star && next >= CHAR_0 && next <= CHAR_9) 573 { 574 group = next - CHAR_0; 575 while (++ptr < repend) 576 { 577 next = *ptr; 578 if (next < CHAR_0 || next > CHAR_9) break; 579 group = group * 10 + next - CHAR_0; 580 581 /* A check for a number greater than the hightest captured group 582 is sufficient here; no need for a separate overflow check. If unknown 583 groups are to be treated as unset, just skip over any remaining 584 digits and carry on. */ 585 586 if (group > code->top_bracket) 587 { 588 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) 589 { 590 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); 591 break; 592 } 593 else 594 { 595 rc = PCRE2_ERROR_NOSUBSTRING; 596 goto PTREXIT; 597 } 598 } 599 } 600 } 601 else 602 { 603 const uint8_t *ctypes = code->tables + ctypes_offset; 604 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) 605 { 606 name[n++] = next; 607 if (n > 32) goto BAD; 608 if (++ptr >= repend) break; 609 next = *ptr; 610 } 611 if (n == 0) goto BAD; 612 name[n] = 0; 613 } 614 615 /* In extended mode we recognize ${name:+set text:unset text} and 616 ${name:-default text}. */ 617 618 if (inparens) 619 { 620 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && 621 !star && ptr < repend - 2 && next == CHAR_COLON) 622 { 623 special = *(++ptr); 624 if (special != CHAR_PLUS && special != CHAR_MINUS) 625 { 626 rc = PCRE2_ERROR_BADSUBSTITUTION; 627 goto PTREXIT; 628 } 629 630 text1_start = ++ptr; 631 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); 632 if (rc != 0) goto PTREXIT; 633 text1_end = ptr; 634 635 if (special == CHAR_PLUS && *ptr == CHAR_COLON) 636 { 637 text2_start = ++ptr; 638 rc = find_text_end(code, &ptr, repend, TRUE); 639 if (rc != 0) goto PTREXIT; 640 text2_end = ptr; 641 } 642 } 643 644 else 645 { 646 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) 647 { 648 rc = PCRE2_ERROR_REPMISSINGBRACE; 649 goto PTREXIT; 650 } 651 } 652 653 ptr++; 654 } 655 656 /* Have found a syntactically correct group number or name, or *name. 657 Only *MARK is currently recognized. */ 658 659 if (star) 660 { 661 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) 662 { 663 PCRE2_SPTR mark = pcre2_get_mark(match_data); 664 if (mark != NULL) 665 { 666 PCRE2_SPTR mark_start = mark; 667 while (*mark != 0) mark++; 668 fraglength = mark - mark_start; 669 CHECKMEMCPY(mark_start, fraglength); 670 } 671 } 672 else goto BAD; 673 } 674 675 /* Substitute the contents of a group. We don't use substring_copy 676 functions any more, in order to support case forcing. */ 677 678 else 679 { 680 PCRE2_SPTR subptr, subptrend; 681 682 /* Find a number for a named group. In case there are duplicate names, 683 search for the first one that is set. If the name is not found when 684 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a 685 non-existent group. */ 686 687 if (group < 0) 688 { 689 PCRE2_SPTR first, last, entry; 690 rc = pcre2_substring_nametable_scan(code, name, &first, &last); 691 if (rc == PCRE2_ERROR_NOSUBSTRING && 692 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) 693 { 694 group = code->top_bracket + 1; 695 } 696 else 697 { 698 if (rc < 0) goto PTREXIT; 699 for (entry = first; entry <= last; entry += rc) 700 { 701 uint32_t ng = GET2(entry, 0); 702 if (ng < ovector_count) 703 { 704 if (group < 0) group = ng; /* First in ovector */ 705 if (ovector[ng*2] != PCRE2_UNSET) 706 { 707 group = ng; /* First that is set */ 708 break; 709 } 710 } 711 } 712 713 /* If group is still negative, it means we did not find a group 714 that is in the ovector. Just set the first group. */ 715 716 if (group < 0) group = GET2(first, 0); 717 } 718 } 719 720 /* We now have a group that is identified by number. Find the length of 721 the captured string. If a group in a non-special substitution is unset 722 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ 723 724 rc = pcre2_substring_length_bynumber(match_data, group, &sublength); 725 if (rc < 0) 726 { 727 if (rc == PCRE2_ERROR_NOSUBSTRING && 728 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) 729 { 730 rc = PCRE2_ERROR_UNSET; 731 } 732 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ 733 if (special == 0) /* Plain substitution */ 734 { 735 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; 736 goto PTREXIT; /* Else error */ 737 } 738 } 739 740 /* If special is '+' we have a 'set' and possibly an 'unset' text, 741 both of which are reprocessed when used. If special is '-' we have a 742 default text for when the group is unset; it must be reprocessed. */ 743 744 if (special != 0) 745 { 746 if (special == CHAR_MINUS) 747 { 748 if (rc == 0) goto LITERAL_SUBSTITUTE; 749 text2_start = text1_start; 750 text2_end = text1_end; 751 } 752 753 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; 754 ptrstack[ptrstackptr++] = ptr; 755 ptrstack[ptrstackptr++] = repend; 756 757 if (rc == 0) 758 { 759 ptr = text1_start; 760 repend = text1_end; 761 } 762 else 763 { 764 ptr = text2_start; 765 repend = text2_end; 766 } 767 continue; 768 } 769 770 /* Otherwise we have a literal substitution of a group's contents. */ 771 772 LITERAL_SUBSTITUTE: 773 subptr = subject + ovector[group*2]; 774 subptrend = subject + ovector[group*2 + 1]; 775 776 /* Substitute a literal string, possibly forcing alphabetic case. */ 777 778 while (subptr < subptrend) 779 { 780 GETCHARINCTEST(ch, subptr); 781 if (forcecase != 0) 782 { 783#ifdef SUPPORT_UNICODE 784 if (utf || ucp) 785 { 786 uint32_t type = UCD_CHARTYPE(ch); 787 if (PRIV(ucp_gentype)[type] == ucp_L && 788 type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) 789 ch = UCD_OTHERCASE(ch); 790 } 791 else 792#endif 793 { 794 if (((code->tables + cbits_offset + 795 ((forcecase > 0)? cbit_upper:cbit_lower) 796 )[ch/8] & (1u << (ch%8))) == 0) 797 ch = (code->tables + fcc_offset)[ch]; 798 } 799 forcecase = forcecasereset; 800 } 801 802#ifdef SUPPORT_UNICODE 803 if (utf) chlen = PRIV(ord2utf)(ch, temp); else 804#endif 805 { 806 temp[0] = ch; 807 chlen = 1; 808 } 809 CHECKMEMCPY(temp, chlen); 810 } 811 } 812 } 813 814 /* Handle an escape sequence in extended mode. We can use check_escape() 815 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but 816 the case-forcing escapes are not supported in pcre2_compile() so must be 817 recognized here. */ 818 819 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && 820 *ptr == CHAR_BACKSLASH) 821 { 822 int errorcode; 823 824 if (ptr < repend - 1) switch (ptr[1]) 825 { 826 case CHAR_L: 827 forcecase = forcecasereset = -1; 828 ptr += 2; 829 continue; 830 831 case CHAR_l: 832 forcecase = -1; 833 forcecasereset = 0; 834 ptr += 2; 835 continue; 836 837 case CHAR_U: 838 forcecase = forcecasereset = 1; 839 ptr += 2; 840 continue; 841 842 case CHAR_u: 843 forcecase = 1; 844 forcecasereset = 0; 845 ptr += 2; 846 continue; 847 848 default: 849 break; 850 } 851 852 ptr++; /* Point after \ */ 853 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, 854 code->overall_options, code->extra_options, FALSE, NULL); 855 if (errorcode != 0) goto BADESCAPE; 856 857 switch(rc) 858 { 859 case ESC_E: 860 forcecase = forcecasereset = 0; 861 continue; 862 863 case ESC_Q: 864 escaped_literal = TRUE; 865 continue; 866 867 case 0: /* Data character */ 868 goto LITERAL; 869 870 default: 871 goto BADESCAPE; 872 } 873 } 874 875 /* Handle a literal code unit */ 876 877 else 878 { 879 LOADLITERAL: 880 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ 881 882 LITERAL: 883 if (forcecase != 0) 884 { 885#ifdef SUPPORT_UNICODE 886 if (utf || ucp) 887 { 888 uint32_t type = UCD_CHARTYPE(ch); 889 if (PRIV(ucp_gentype)[type] == ucp_L && 890 type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) 891 ch = UCD_OTHERCASE(ch); 892 } 893 else 894#endif 895 { 896 if (((code->tables + cbits_offset + 897 ((forcecase > 0)? cbit_upper:cbit_lower) 898 )[ch/8] & (1u << (ch%8))) == 0) 899 ch = (code->tables + fcc_offset)[ch]; 900 } 901 forcecase = forcecasereset; 902 } 903 904#ifdef SUPPORT_UNICODE 905 if (utf) chlen = PRIV(ord2utf)(ch, temp); else 906#endif 907 { 908 temp[0] = ch; 909 chlen = 1; 910 } 911 CHECKMEMCPY(temp, chlen); 912 } /* End handling a literal code unit */ 913 } /* End of loop for scanning the replacement. */ 914 915 /* The replacement has been copied to the output, or its size has been 916 remembered. Do the callout if there is one and we have done an actual 917 replacement. */ 918 919 if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) 920 { 921 scb.subscount = subs; 922 scb.output_offsets[1] = buff_offset; 923 rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); 924 925 /* A non-zero return means cancel this substitution. Instead, copy the 926 matched string fragment. */ 927 928 if (rc != 0) 929 { 930 PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; 931 PCRE2_SIZE oldlength = ovector[1] - ovector[0]; 932 933 buff_offset -= newlength; 934 lengthleft += newlength; 935 if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); 936 937 /* A negative return means do not do any more. */ 938 939 if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); 940 } 941 } 942 943 /* Save the details of this match. See above for how this data is used. If we 944 matched an empty string, do the magic for global matches. Update the start 945 offset to point to the rest of the subject string. If we re-used an existing 946 match for the first match, switch to the internal match data block. */ 947 948 ovecsave[0] = ovector[0]; 949 ovecsave[1] = ovector[1]; 950 ovecsave[2] = start_offset; 951 952 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : 953 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; 954 start_offset = ovector[1]; 955 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ 956 957/* Copy the rest of the subject unless not required, and terminate the output 958with a binary zero. */ 959 960if (!replacement_only) 961 { 962 fraglength = length - start_offset; 963 CHECKMEMCPY(subject + start_offset, fraglength); 964 } 965 966temp[0] = 0; 967CHECKMEMCPY(temp, 1); 968 969/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, 970and matching has carried on after a full buffer, in order to compute the length 971needed. Otherwise, an overflow generates an immediate error return. */ 972 973if (overflowed) 974 { 975 rc = PCRE2_ERROR_NOMEMORY; 976 *blength = buff_length + extra_needed; 977 } 978 979/* After a successful execution, return the number of substitutions and set the 980length of buffer used, excluding the trailing zero. */ 981 982else 983 { 984 rc = subs; 985 *blength = buff_offset - 1; 986 } 987 988EXIT: 989if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); 990 else match_data->rc = rc; 991return rc; 992 993NOROOM: 994rc = PCRE2_ERROR_NOMEMORY; 995goto EXIT; 996 997BAD: 998rc = PCRE2_ERROR_BADREPLACEMENT; 999goto PTREXIT; 1000 1001BADESCAPE: 1002rc = PCRE2_ERROR_BADREPESCAPE; 1003 1004PTREXIT: 1005*blength = (PCRE2_SIZE)(ptr - replacement); 1006goto EXIT; 1007} 1008 1009/* End of pcre2_substitute.c */ 1010