1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9     Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2016-2022 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46#include "pcre2_internal.h"
47
/* Size of the pointer stack that pcre2_substitute() uses while processing a
replacement string. Whenever a nested "set", "unset" or default text from a
${name:+...:...} or ${name:-...} item has to be reprocessed, two pointers are
pushed (the place to resume and the end of the enclosing text), so this value
allows for PTR_STACK_SIZE/2 levels of nesting. */

#define PTR_STACK_SIZE 20

/* All the option bits that are specific to pcre2_substitute(). They are saved
separately and removed from the options word before it is passed on to
pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
   PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55
56
57
58/*************************************************
59*           Find end of substitute text          *
60*************************************************/
61
62/* In extended mode, we recognize ${name:+set text:unset text} and similar
63constructions. This requires the identification of unescaped : and }
64characters. This function scans for such. It must deal with nested ${
65constructions. The pointer to the text is updated, either to the required end
66character, or to where an error was detected.
67
68Arguments:
69  code      points to the compiled expression (for options)
70  ptrptr    points to the pointer to the start of the text (updated)
71  ptrend    end of the whole string
72  last      TRUE if the last expected string (only } recognized)
73
74Returns:    0 on success
75            negative error code on failure
76*/
77
78static int
79find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80  BOOL last)
81{
82int rc = 0;
83uint32_t nestlevel = 0;
84BOOL literal = FALSE;
85PCRE2_SPTR ptr = *ptrptr;
86
87for (; ptr < ptrend; ptr++)
88  {
89  if (literal)
90    {
91    if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92      {
93      literal = FALSE;
94      ptr += 1;
95      }
96    }
97
98  else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99    {
100    if (nestlevel == 0) goto EXIT;
101    nestlevel--;
102    }
103
104  else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106  else if (*ptr == CHAR_DOLLAR_SIGN)
107    {
108    if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109      {
110      nestlevel++;
111      ptr += 1;
112      }
113    }
114
115  else if (*ptr == CHAR_BACKSLASH)
116    {
117    int erc;
118    int errorcode;
119    uint32_t ch;
120
121    if (ptr < ptrend - 1) switch (ptr[1])
122      {
123      case CHAR_L:
124      case CHAR_l:
125      case CHAR_U:
126      case CHAR_u:
127      ptr += 1;
128      continue;
129      }
130
131    ptr += 1;  /* Must point after \ */
132    erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133      code->overall_options, code->extra_options, FALSE, NULL);
134    ptr -= 1;  /* Back to last code unit of escape */
135    if (errorcode != 0)
136      {
137      rc = errorcode;
138      goto EXIT;
139      }
140
141    switch(erc)
142      {
143      case 0:      /* Data character */
144      case ESC_E:  /* Isolated \E is ignored */
145      break;
146
147      case ESC_Q:
148      literal = TRUE;
149      break;
150
151      default:
152      rc = PCRE2_ERROR_BADREPESCAPE;
153      goto EXIT;
154      }
155    }
156  }
157
158rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
159
160EXIT:
161*ptrptr = ptr;
162return rc;
163}
164
165
166
/*************************************************
*              Match and substitute              *
*************************************************/

/* This function applies a compiled regular expression to a subject string and
creates a new string with substitutions. The first 7 arguments are the same as
for pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.

Arguments:
  code            points to the compiled expression
  subject         points to the subject string
  length          length of subject string (may contain binary zeros)
  start_offset    where to start in the subject string
  options         option bits
  match_data      points to a match_data block, or is NULL
  mcontext        points to a PCRE2 match context, or is NULL
  replacement     points to the replacement string
  rlength         length of replacement string
  buffer          where to put the substituted string
  blength         points to length of buffer; updated to length of string

Returns:          >= 0 number of substitutions made
                  < 0 an error code
                  PCRE2_ERROR_BADREPLACEMENT means invalid use of $
*/
192
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately (by jumping to the NOROOM label),
or, when PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, keep on, accumulating in
extra_needed the number of additional code units that would have been needed.

The macro relies on the following names in the scope where it is invoked:
buffer, buff_offset, lengthleft, overflowed, extra_needed, suboptions, and the
label NOROOM. It is wrapped in do { } while (0) so that it behaves as a single
statement (it is safe in an unbraced if/else), and the length argument is
evaluated exactly once. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    PCRE2_SIZE checkmemcpy_length = (length); \
    if (overflowed) \
      { \
      extra_needed += checkmemcpy_length; \
      } \
    else if (lengthleft < checkmemcpy_length) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = checkmemcpy_length - lengthleft; \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(checkmemcpy_length)); \
      buff_offset += checkmemcpy_length; \
      lengthleft -= checkmemcpy_length; \
      } \
    } \
  while (0)
216
217/* Here's the function */
218
219PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
220pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222  pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223  PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224{
225int rc;
226int subs;
227int forcecase = 0;
228int forcecasereset = 0;
229uint32_t ovector_count;
230uint32_t goptions = 0;
231uint32_t suboptions;
232pcre2_match_data *internal_match_data = NULL;
233BOOL escaped_literal = FALSE;
234BOOL overflowed = FALSE;
235BOOL use_existing_match;
236BOOL replacement_only;
237#ifdef SUPPORT_UNICODE
238BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240#endif
241PCRE2_UCHAR temp[6];
242PCRE2_SPTR ptr;
243PCRE2_SPTR repend;
244PCRE2_SIZE extra_needed = 0;
245PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246PCRE2_SIZE *ovector;
247PCRE2_SIZE ovecsave[3];
248pcre2_substitute_callout_block scb;
249
250/* General initialization */
251
252buff_offset = 0;
253lengthleft = buff_length = *blength;
254*blength = PCRE2_UNSET;
255ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256
257/* Partial matching is not valid. This must come after setting *blength to
258PCRE2_UNSET, so as not to imply an offset in the replacement. */
259
260if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261  return PCRE2_ERROR_BADOPTION;
262
263/* Validate length and find the end of the replacement. A NULL replacement of
264zero length is interpreted as an empty string. */
265
266if (replacement == NULL)
267  {
268  if (rlength != 0) return PCRE2_ERROR_NULL;
269  replacement = (PCRE2_SPTR)"";
270  }
271
272if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
273repend = replacement + rlength;
274
275/* Check for using a match that has already happened. Note that the subject
276pointer in the match data may be NULL after a no-match. */
277
278use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
279replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
280
281/* If starting from an existing match, there must be an externally provided
282match data block. We create an internal match_data block in two cases: (a) an
283external one is not supplied (and we are not starting from an existing match);
284(b) an existing match is to be used for the first substitution. In the latter
285case, we copy the existing match into the internal block, except for any cached
286heap frame size and pointer. This ensures that no changes are made to the
287external match data block. */
288
289if (match_data == NULL)
290  {
291  pcre2_general_context *gcontext;
292  if (use_existing_match) return PCRE2_ERROR_NULL;
293  gcontext = (mcontext == NULL)?
294    (pcre2_general_context *)code :
295    (pcre2_general_context *)mcontext;
296  match_data = internal_match_data =
297    pcre2_match_data_create_from_pattern(code, gcontext);
298  if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
299  }
300
301else if (use_existing_match)
302  {
303  pcre2_general_context *gcontext = (mcontext == NULL)?
304    (pcre2_general_context *)code :
305    (pcre2_general_context *)mcontext;
306  int pairs = (code->top_bracket + 1 < match_data->oveccount)?
307    code->top_bracket + 1 : match_data->oveccount;
308  internal_match_data = pcre2_match_data_create(match_data->oveccount,
309    gcontext);
310  if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
311  memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
312    + 2*pairs*sizeof(PCRE2_SIZE));
313  internal_match_data->heapframes = NULL;
314  internal_match_data->heapframes_size = 0;
315  match_data = internal_match_data;
316  }
317
318/* Remember ovector details */
319
320ovector = pcre2_get_ovector_pointer(match_data);
321ovector_count = pcre2_get_ovector_count(match_data);
322
323/* Fixed things in the callout block */
324
325scb.version = 0;
326scb.input = subject;
327scb.output = (PCRE2_SPTR)buffer;
328scb.ovector = ovector;
329
330/* A NULL subject of zero length is treated as an empty string. */
331
332if (subject == NULL)
333  {
334  if (length != 0) return PCRE2_ERROR_NULL;
335  subject = (PCRE2_SPTR)"";
336  }
337
338/* Find length of zero-terminated subject */
339
340if (length == PCRE2_ZERO_TERMINATED)
341  length = subject? PRIV(strlen)(subject) : 0;
342
343/* Check UTF replacement string if necessary. */
344
345#ifdef SUPPORT_UNICODE
346if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
347  {
348  rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
349  if (rc != 0)
350    {
351    match_data->leftchar = 0;
352    goto EXIT;
353    }
354  }
355#endif  /* SUPPORT_UNICODE */
356
357/* Save the substitute options and remove them from the match options. */
358
359suboptions = options & SUBSTITUTE_OPTIONS;
360options &= ~SUBSTITUTE_OPTIONS;
361
362/* Error if the start match offset is greater than the length of the subject. */
363
364if (start_offset > length)
365  {
366  match_data->leftchar = 0;
367  rc = PCRE2_ERROR_BADOFFSET;
368  goto EXIT;
369  }
370
371/* Copy up to the start offset, unless only the replacement is required. */
372
373if (!replacement_only) CHECKMEMCPY(subject, start_offset);
374
375/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
376match is taken from the match_data that was passed in. */
377
378subs = 0;
379do
380  {
381  PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
382  uint32_t ptrstackptr = 0;
383
384  if (use_existing_match)
385    {
386    rc = match_data->rc;
387    use_existing_match = FALSE;
388    }
389  else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
390    match_data, mcontext);
391
392#ifdef SUPPORT_UNICODE
393  if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
394#endif
395
396  /* Any error other than no match returns the error code. No match when not
397  doing the special after-empty-match global rematch, or when at the end of the
398  subject, breaks the global loop. Otherwise, advance the starting point by one
399  character, copying it to the output, and try again. */
400
401  if (rc < 0)
402    {
403    PCRE2_SIZE save_start;
404
405    if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
406    if (goptions == 0 || start_offset >= length) break;
407
408    /* Advance by one code point. Then, if CRLF is a valid newline sequence and
409    we have advanced into the middle of it, advance one more code point. In
410    other words, do not start in the middle of CRLF, even if CR and LF on their
411    own are valid newlines. */
412
413    save_start = start_offset++;
414    if (subject[start_offset-1] == CHAR_CR &&
415        code->newline_convention != PCRE2_NEWLINE_CR &&
416        code->newline_convention != PCRE2_NEWLINE_LF &&
417        start_offset < length &&
418        subject[start_offset] == CHAR_LF)
419      start_offset++;
420
421    /* Otherwise, in UTF mode, advance past any secondary code points. */
422
423    else if ((code->overall_options & PCRE2_UTF) != 0)
424      {
425#if PCRE2_CODE_UNIT_WIDTH == 8
426      while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
427        start_offset++;
428#elif PCRE2_CODE_UNIT_WIDTH == 16
429      while (start_offset < length &&
430            (subject[start_offset] & 0xfc00) == 0xdc00)
431        start_offset++;
432#endif
433      }
434
435    /* Copy what we have advanced past (unless not required), reset the special
436    global options, and continue to the next match. */
437
438    fraglength = start_offset - save_start;
439    if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
440    goptions = 0;
441    continue;
442    }
443
444  /* Handle a successful match. Matches that use \K to end before they start
445  or start before the current point in the subject are not supported. */
446
447  if (ovector[1] < ovector[0] || ovector[0] < start_offset)
448    {
449    rc = PCRE2_ERROR_BADSUBSPATTERN;
450    goto EXIT;
451    }
452
453  /* Check for the same match as previous. This is legitimate after matching an
454  empty string that starts after the initial match offset. We have tried again
455  at the match point in case the pattern is one like /(?<=\G.)/ which can never
456  match at its starting point, so running the match achieves the bumpalong. If
457  we do get the same (null) match at the original match point, it isn't such a
458  pattern, so we now do the empty string magic. In all other cases, a repeat
459  match should never occur. */
460
461  if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
462    {
463    if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
464      {
465      goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
466      ovecsave[2] = start_offset;
467      continue;    /* Back to the top of the loop */
468      }
469    rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
470    goto EXIT;
471    }
472
473  /* Count substitutions with a paranoid check for integer overflow; surely no
474  real call to this function would ever hit this! */
475
476  if (subs == INT_MAX)
477    {
478    rc = PCRE2_ERROR_TOOMANYREPLACE;
479    goto EXIT;
480    }
481  subs++;
482
483  /* Copy the text leading up to the match (unless not required), and remember
484  where the insert begins and how many ovector pairs are set. */
485
486  if (rc == 0) rc = ovector_count;
487  fraglength = ovector[0] - start_offset;
488  if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
489  scb.output_offsets[0] = buff_offset;
490  scb.oveccount = rc;
491
492  /* Process the replacement string. If the entire replacement is literal, just
493  copy it with length check. */
494
495  ptr = replacement;
496  if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
497    {
498    CHECKMEMCPY(ptr, rlength);
499    }
500
501  /* Within a non-literal replacement, which must be scanned character by
502  character, local literal mode can be set by \Q, but only in extended mode
503  when backslashes are being interpreted. In extended mode we must handle
504  nested substrings that are to be reprocessed. */
505
506  else for (;;)
507    {
508    uint32_t ch;
509    unsigned int chlen;
510
511    /* If at the end of a nested substring, pop the stack. */
512
513    if (ptr >= repend)
514      {
515      if (ptrstackptr == 0) break;       /* End of replacement string */
516      repend = ptrstack[--ptrstackptr];
517      ptr = ptrstack[--ptrstackptr];
518      continue;
519      }
520
521    /* Handle the next character */
522
523    if (escaped_literal)
524      {
525      if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
526        {
527        escaped_literal = FALSE;
528        ptr += 2;
529        continue;
530        }
531      goto LOADLITERAL;
532      }
533
534    /* Not in literal mode. */
535
536    if (*ptr == CHAR_DOLLAR_SIGN)
537      {
538      int group, n;
539      uint32_t special = 0;
540      BOOL inparens;
541      BOOL star;
542      PCRE2_SIZE sublength;
543      PCRE2_SPTR text1_start = NULL;
544      PCRE2_SPTR text1_end = NULL;
545      PCRE2_SPTR text2_start = NULL;
546      PCRE2_SPTR text2_end = NULL;
547      PCRE2_UCHAR next;
548      PCRE2_UCHAR name[33];
549
550      if (++ptr >= repend) goto BAD;
551      if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
552
553      group = -1;
554      n = 0;
555      inparens = FALSE;
556      star = FALSE;
557
558      if (next == CHAR_LEFT_CURLY_BRACKET)
559        {
560        if (++ptr >= repend) goto BAD;
561        next = *ptr;
562        inparens = TRUE;
563        }
564
565      if (next == CHAR_ASTERISK)
566        {
567        if (++ptr >= repend) goto BAD;
568        next = *ptr;
569        star = TRUE;
570        }
571
572      if (!star && next >= CHAR_0 && next <= CHAR_9)
573        {
574        group = next - CHAR_0;
575        while (++ptr < repend)
576          {
577          next = *ptr;
578          if (next < CHAR_0 || next > CHAR_9) break;
579          group = group * 10 + next - CHAR_0;
580
581          /* A check for a number greater than the hightest captured group
582          is sufficient here; no need for a separate overflow check. If unknown
583          groups are to be treated as unset, just skip over any remaining
584          digits and carry on. */
585
586          if (group > code->top_bracket)
587            {
588            if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
589              {
590              while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
591              break;
592              }
593            else
594              {
595              rc = PCRE2_ERROR_NOSUBSTRING;
596              goto PTREXIT;
597              }
598            }
599          }
600        }
601      else
602        {
603        const uint8_t *ctypes = code->tables + ctypes_offset;
604        while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
605          {
606          name[n++] = next;
607          if (n > 32) goto BAD;
608          if (++ptr >= repend) break;
609          next = *ptr;
610          }
611        if (n == 0) goto BAD;
612        name[n] = 0;
613        }
614
615      /* In extended mode we recognize ${name:+set text:unset text} and
616      ${name:-default text}. */
617
618      if (inparens)
619        {
620        if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
621             !star && ptr < repend - 2 && next == CHAR_COLON)
622          {
623          special = *(++ptr);
624          if (special != CHAR_PLUS && special != CHAR_MINUS)
625            {
626            rc = PCRE2_ERROR_BADSUBSTITUTION;
627            goto PTREXIT;
628            }
629
630          text1_start = ++ptr;
631          rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
632          if (rc != 0) goto PTREXIT;
633          text1_end = ptr;
634
635          if (special == CHAR_PLUS && *ptr == CHAR_COLON)
636            {
637            text2_start = ++ptr;
638            rc = find_text_end(code, &ptr, repend, TRUE);
639            if (rc != 0) goto PTREXIT;
640            text2_end = ptr;
641            }
642          }
643
644        else
645          {
646          if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
647            {
648            rc = PCRE2_ERROR_REPMISSINGBRACE;
649            goto PTREXIT;
650            }
651          }
652
653        ptr++;
654        }
655
656      /* Have found a syntactically correct group number or name, or *name.
657      Only *MARK is currently recognized. */
658
659      if (star)
660        {
661        if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
662          {
663          PCRE2_SPTR mark = pcre2_get_mark(match_data);
664          if (mark != NULL)
665            {
666            PCRE2_SPTR mark_start = mark;
667            while (*mark != 0) mark++;
668            fraglength = mark - mark_start;
669            CHECKMEMCPY(mark_start, fraglength);
670            }
671          }
672        else goto BAD;
673        }
674
675      /* Substitute the contents of a group. We don't use substring_copy
676      functions any more, in order to support case forcing. */
677
678      else
679        {
680        PCRE2_SPTR subptr, subptrend;
681
682        /* Find a number for a named group. In case there are duplicate names,
683        search for the first one that is set. If the name is not found when
684        PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
685        non-existent group. */
686
687        if (group < 0)
688          {
689          PCRE2_SPTR first, last, entry;
690          rc = pcre2_substring_nametable_scan(code, name, &first, &last);
691          if (rc == PCRE2_ERROR_NOSUBSTRING &&
692              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
693            {
694            group = code->top_bracket + 1;
695            }
696          else
697            {
698            if (rc < 0) goto PTREXIT;
699            for (entry = first; entry <= last; entry += rc)
700              {
701              uint32_t ng = GET2(entry, 0);
702              if (ng < ovector_count)
703                {
704                if (group < 0) group = ng;          /* First in ovector */
705                if (ovector[ng*2] != PCRE2_UNSET)
706                  {
707                  group = ng;                       /* First that is set */
708                  break;
709                  }
710                }
711              }
712
713            /* If group is still negative, it means we did not find a group
714            that is in the ovector. Just set the first group. */
715
716            if (group < 0) group = GET2(first, 0);
717            }
718          }
719
720        /* We now have a group that is identified by number. Find the length of
721        the captured string. If a group in a non-special substitution is unset
722        when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
723
724        rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
725        if (rc < 0)
726          {
727          if (rc == PCRE2_ERROR_NOSUBSTRING &&
728              (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
729            {
730            rc = PCRE2_ERROR_UNSET;
731            }
732          if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
733          if (special == 0)                           /* Plain substitution */
734            {
735            if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
736            goto PTREXIT;                             /* Else error */
737            }
738          }
739
740        /* If special is '+' we have a 'set' and possibly an 'unset' text,
741        both of which are reprocessed when used. If special is '-' we have a
742        default text for when the group is unset; it must be reprocessed. */
743
744        if (special != 0)
745          {
746          if (special == CHAR_MINUS)
747            {
748            if (rc == 0) goto LITERAL_SUBSTITUTE;
749            text2_start = text1_start;
750            text2_end = text1_end;
751            }
752
753          if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
754          ptrstack[ptrstackptr++] = ptr;
755          ptrstack[ptrstackptr++] = repend;
756
757          if (rc == 0)
758            {
759            ptr = text1_start;
760            repend = text1_end;
761            }
762          else
763            {
764            ptr = text2_start;
765            repend = text2_end;
766            }
767          continue;
768          }
769
770        /* Otherwise we have a literal substitution of a group's contents. */
771
772        LITERAL_SUBSTITUTE:
773        subptr = subject + ovector[group*2];
774        subptrend = subject + ovector[group*2 + 1];
775
776        /* Substitute a literal string, possibly forcing alphabetic case. */
777
778        while (subptr < subptrend)
779          {
780          GETCHARINCTEST(ch, subptr);
781          if (forcecase != 0)
782            {
783#ifdef SUPPORT_UNICODE
784            if (utf || ucp)
785              {
786              uint32_t type = UCD_CHARTYPE(ch);
787              if (PRIV(ucp_gentype)[type] == ucp_L &&
788                  type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
789                ch = UCD_OTHERCASE(ch);
790              }
791            else
792#endif
793              {
794              if (((code->tables + cbits_offset +
795                  ((forcecase > 0)? cbit_upper:cbit_lower)
796                  )[ch/8] & (1u << (ch%8))) == 0)
797                ch = (code->tables + fcc_offset)[ch];
798              }
799            forcecase = forcecasereset;
800            }
801
802#ifdef SUPPORT_UNICODE
803          if (utf) chlen = PRIV(ord2utf)(ch, temp); else
804#endif
805            {
806            temp[0] = ch;
807            chlen = 1;
808            }
809          CHECKMEMCPY(temp, chlen);
810          }
811        }
812      }
813
814    /* Handle an escape sequence in extended mode. We can use check_escape()
815    to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
816    the case-forcing escapes are not supported in pcre2_compile() so must be
817    recognized here. */
818
819    else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
820              *ptr == CHAR_BACKSLASH)
821      {
822      int errorcode;
823
824      if (ptr < repend - 1) switch (ptr[1])
825        {
826        case CHAR_L:
827        forcecase = forcecasereset = -1;
828        ptr += 2;
829        continue;
830
831        case CHAR_l:
832        forcecase = -1;
833        forcecasereset = 0;
834        ptr += 2;
835        continue;
836
837        case CHAR_U:
838        forcecase = forcecasereset = 1;
839        ptr += 2;
840        continue;
841
842        case CHAR_u:
843        forcecase = 1;
844        forcecasereset = 0;
845        ptr += 2;
846        continue;
847
848        default:
849        break;
850        }
851
852      ptr++;  /* Point after \ */
853      rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
854        code->overall_options, code->extra_options, FALSE, NULL);
855      if (errorcode != 0) goto BADESCAPE;
856
857      switch(rc)
858        {
859        case ESC_E:
860        forcecase = forcecasereset = 0;
861        continue;
862
863        case ESC_Q:
864        escaped_literal = TRUE;
865        continue;
866
867        case 0:      /* Data character */
868        goto LITERAL;
869
870        default:
871        goto BADESCAPE;
872        }
873      }
874
875    /* Handle a literal code unit */
876
877    else
878      {
879      LOADLITERAL:
880      GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
881
882      LITERAL:
883      if (forcecase != 0)
884        {
885#ifdef SUPPORT_UNICODE
886        if (utf || ucp)
887          {
888          uint32_t type = UCD_CHARTYPE(ch);
889          if (PRIV(ucp_gentype)[type] == ucp_L &&
890              type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
891            ch = UCD_OTHERCASE(ch);
892          }
893        else
894#endif
895          {
896          if (((code->tables + cbits_offset +
897              ((forcecase > 0)? cbit_upper:cbit_lower)
898              )[ch/8] & (1u << (ch%8))) == 0)
899            ch = (code->tables + fcc_offset)[ch];
900          }
901        forcecase = forcecasereset;
902        }
903
904#ifdef SUPPORT_UNICODE
905      if (utf) chlen = PRIV(ord2utf)(ch, temp); else
906#endif
907        {
908        temp[0] = ch;
909        chlen = 1;
910        }
911      CHECKMEMCPY(temp, chlen);
912      } /* End handling a literal code unit */
913    }   /* End of loop for scanning the replacement. */
914
915  /* The replacement has been copied to the output, or its size has been
916  remembered. Do the callout if there is one and we have done an actual
917  replacement. */
918
919  if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
920    {
921    scb.subscount = subs;
922    scb.output_offsets[1] = buff_offset;
923    rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
924
925    /* A non-zero return means cancel this substitution. Instead, copy the
926    matched string fragment. */
927
928    if (rc != 0)
929      {
930      PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
931      PCRE2_SIZE oldlength = ovector[1] - ovector[0];
932
933      buff_offset -= newlength;
934      lengthleft += newlength;
935      if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
936
937      /* A negative return means do not do any more. */
938
939      if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
940      }
941    }
942
943  /* Save the details of this match. See above for how this data is used. If we
944  matched an empty string, do the magic for global matches. Update the start
945  offset to point to the rest of the subject string. If we re-used an existing
946  match for the first match, switch to the internal match data block. */
947
948  ovecsave[0] = ovector[0];
949  ovecsave[1] = ovector[1];
950  ovecsave[2] = start_offset;
951
952  goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
953    PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
954  start_offset = ovector[1];
955  } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
956
957/* Copy the rest of the subject unless not required, and terminate the output
958with a binary zero. */
959
960if (!replacement_only)
961  {
962  fraglength = length - start_offset;
963  CHECKMEMCPY(subject + start_offset, fraglength);
964  }
965
966temp[0] = 0;
967CHECKMEMCPY(temp, 1);
968
969/* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
970and matching has carried on after a full buffer, in order to compute the length
971needed. Otherwise, an overflow generates an immediate error return. */
972
973if (overflowed)
974  {
975  rc = PCRE2_ERROR_NOMEMORY;
976  *blength = buff_length + extra_needed;
977  }
978
979/* After a successful execution, return the number of substitutions and set the
980length of buffer used, excluding the trailing zero. */
981
982else
983  {
984  rc = subs;
985  *blength = buff_offset - 1;
986  }
987
988EXIT:
989if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
990  else match_data->rc = rc;
991return rc;
992
993NOROOM:
994rc = PCRE2_ERROR_NOMEMORY;
995goto EXIT;
996
997BAD:
998rc = PCRE2_ERROR_BADREPLACEMENT;
999goto PTREXIT;
1000
1001BADESCAPE:
1002rc = PCRE2_ERROR_BADREPESCAPE;
1003
1004PTREXIT:
1005*blength = (PCRE2_SIZE)(ptr - replacement);
1006goto EXIT;
1007}
1008
1009/* End of pcre2_substitute.c */
1010