xref: /third_party/curl/src/tool_urlglob.c (revision 13498266)
1/***************************************************************************
2 *                                  _   _ ____  _
3 *  Project                     ___| | | |  _ \| |
4 *                             / __| | | | |_) | |
5 *                            | (__| |_| |  _ <| |___
6 *                             \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 * SPDX-License-Identifier: curl
22 *
23 ***************************************************************************/
24#include "tool_setup.h"
25
26#define ENABLE_CURLX_PRINTF
27/* use our own printf() functions */
28#include "curlx.h"
29#include "tool_cfgable.h"
30#include "tool_doswin.h"
31#include "tool_urlglob.h"
32#include "tool_vms.h"
33#include "dynbuf.h"
34
35#include "memdebug.h" /* keep this as LAST include */
36
/*
 * GLOBERROR() records an error description and the URL column it refers to
 * in the 'glob' variable that must be in scope where the macro is used, and
 * evaluates to 'code'. It is meant to be used as "return GLOBERROR(...)".
 *
 * The arguments and the whole expansion are parenthesized so that the macro
 * is a single expression regardless of what is passed in or where it is
 * used.
 */
#define GLOBERROR(string, column, code) \
  (glob->error = (string), glob->pos = (column), (code))
39
40static CURLcode glob_fixed(struct URLGlob *glob, char *fixed, size_t len)
41{
42  struct URLPattern *pat = &glob->pattern[glob->size];
43  pat->type = UPTSet;
44  pat->content.Set.size = 1;
45  pat->content.Set.ptr_s = 0;
46  pat->globindex = -1;
47
48  pat->content.Set.elements = malloc(sizeof(char *));
49
50  if(!pat->content.Set.elements)
51    return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
52
53  pat->content.Set.elements[0] = malloc(len + 1);
54  if(!pat->content.Set.elements[0])
55    return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
56
57  memcpy(pat->content.Set.elements[0], fixed, len);
58  pat->content.Set.elements[0][len] = 0;
59
60  return CURLE_OK;
61}
62
63/* multiply
64 *
65 * Multiplies and checks for overflow.
66 */
67static int multiply(curl_off_t *amount, curl_off_t with)
68{
69  curl_off_t sum;
70  DEBUGASSERT(*amount >= 0);
71  DEBUGASSERT(with >= 0);
72  if((with <= 0) || (*amount <= 0)) {
73    sum = 0;
74  }
75  else {
76#if defined(__GNUC__) && \
77  ((__GNUC__ > 5) || ((__GNUC__ == 5) && (__GNUC_MINOR__ >= 1)))
78    if(__builtin_mul_overflow(*amount, with, &sum))
79      return 1;
80#else
81    sum = *amount * with;
82    if(sum/with != *amount)
83      return 1; /* didn't fit, bail out */
84#endif
85  }
86  *amount = sum;
87  return 0;
88}
89
90static CURLcode glob_set(struct URLGlob *glob, char **patternp,
91                         size_t *posp, curl_off_t *amount,
92                         int globindex)
93{
94  /* processes a set expression with the point behind the opening '{'
95     ','-separated elements are collected until the next closing '}'
96  */
97  struct URLPattern *pat;
98  bool done = FALSE;
99  char *buf = glob->glob_buffer;
100  char *pattern = *patternp;
101  char *opattern = pattern;
102  size_t opos = *posp-1;
103
104  pat = &glob->pattern[glob->size];
105  /* patterns 0,1,2,... correspond to size=1,3,5,... */
106  pat->type = UPTSet;
107  pat->content.Set.size = 0;
108  pat->content.Set.ptr_s = 0;
109  pat->content.Set.elements = NULL;
110  pat->globindex = globindex;
111
112  while(!done) {
113    switch(*pattern) {
114    case '\0':                  /* URL ended while set was still open */
115      return GLOBERROR("unmatched brace", opos, CURLE_URL_MALFORMAT);
116
117    case '{':
118    case '[':                   /* no nested expressions at this time */
119      return GLOBERROR("nested brace", *posp, CURLE_URL_MALFORMAT);
120
121    case '}':                           /* set element completed */
122      if(opattern == pattern)
123        return GLOBERROR("empty string within braces", *posp,
124                         CURLE_URL_MALFORMAT);
125
126      /* add 1 to size since it'll be incremented below */
127      if(multiply(amount, pat->content.Set.size + 1))
128        return GLOBERROR("range overflow", 0, CURLE_URL_MALFORMAT);
129
130      FALLTHROUGH();
131    case ',':
132
133      *buf = '\0';
134      if(pat->content.Set.elements) {
135        char **new_arr = realloc(pat->content.Set.elements,
136                                 (size_t)(pat->content.Set.size + 1) *
137                                 sizeof(char *));
138        if(!new_arr)
139          return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
140
141        pat->content.Set.elements = new_arr;
142      }
143      else
144        pat->content.Set.elements = malloc(sizeof(char *));
145
146      if(!pat->content.Set.elements)
147        return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
148
149      pat->content.Set.elements[pat->content.Set.size] =
150        strdup(glob->glob_buffer);
151      if(!pat->content.Set.elements[pat->content.Set.size])
152        return GLOBERROR("out of memory", 0, CURLE_OUT_OF_MEMORY);
153      ++pat->content.Set.size;
154
155      if(*pattern == '}') {
156        pattern++; /* pass the closing brace */
157        done = TRUE;
158        continue;
159      }
160
161      buf = glob->glob_buffer;
162      ++pattern;
163      ++(*posp);
164      break;
165
166    case ']':                           /* illegal closing bracket */
167      return GLOBERROR("unexpected close bracket", *posp, CURLE_URL_MALFORMAT);
168
169    case '\\':                          /* escaped character, skip '\' */
170      if(pattern[1]) {
171        ++pattern;
172        ++(*posp);
173      }
174      FALLTHROUGH();
175    default:
176      *buf++ = *pattern++;              /* copy character to set element */
177      ++(*posp);
178    }
179  }
180
181  *patternp = pattern; /* return with the new position */
182  return CURLE_OK;
183}
184
185static CURLcode glob_range(struct URLGlob *glob, char **patternp,
186                           size_t *posp, curl_off_t *amount,
187                           int globindex)
188{
189  /* processes a range expression with the point behind the opening '['
190     - char range: e.g. "a-z]", "B-Q]"
191     - num range: e.g. "0-9]", "17-2000]"
192     - num range with leading zeros: e.g. "001-999]"
193     expression is checked for well-formedness and collected until the next ']'
194  */
195  struct URLPattern *pat;
196  int rc;
197  char *pattern = *patternp;
198  char *c;
199
200  pat = &glob->pattern[glob->size];
201  pat->globindex = globindex;
202
203  if(ISALPHA(*pattern)) {
204    /* character range detected */
205    char min_c;
206    char max_c;
207    char end_c;
208    unsigned long step = 1;
209
210    pat->type = UPTCharRange;
211
212    rc = sscanf(pattern, "%c-%c%c", &min_c, &max_c, &end_c);
213
214    if(rc == 3) {
215      if(end_c == ':') {
216        char *endp;
217        errno = 0;
218        step = strtoul(&pattern[4], &endp, 10);
219        if(errno || &pattern[4] == endp || *endp != ']')
220          step = 0;
221        else
222          pattern = endp + 1;
223      }
224      else if(end_c != ']')
225        /* then this is wrong */
226        rc = 0;
227      else
228        /* end_c == ']' */
229        pattern += 4;
230    }
231
232    *posp += (pattern - *patternp);
233
234    if(rc != 3 || !step || step > (unsigned)INT_MAX ||
235       (min_c == max_c && step != 1) ||
236       (min_c != max_c && (min_c > max_c || step > (unsigned)(max_c - min_c) ||
237                           (max_c - min_c) > ('z' - 'a'))))
238      /* the pattern is not well-formed */
239      return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT);
240
241    /* if there was a ":[num]" thing, use that as step or else use 1 */
242    pat->content.CharRange.step = (int)step;
243    pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c;
244    pat->content.CharRange.max_c = max_c;
245
246    if(multiply(amount, ((pat->content.CharRange.max_c -
247                          pat->content.CharRange.min_c) /
248                         pat->content.CharRange.step + 1)))
249      return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT);
250  }
251  else if(ISDIGIT(*pattern)) {
252    /* numeric range detected */
253    unsigned long min_n;
254    unsigned long max_n = 0;
255    unsigned long step_n = 0;
256    char *endp;
257
258    pat->type = UPTNumRange;
259    pat->content.NumRange.padlength = 0;
260
261    if(*pattern == '0') {
262      /* leading zero specified, count them! */
263      c = pattern;
264      while(ISDIGIT(*c)) {
265        c++;
266        ++pat->content.NumRange.padlength; /* padding length is set for all
267                                              instances of this pattern */
268      }
269    }
270
271    errno = 0;
272    min_n = strtoul(pattern, &endp, 10);
273    if(errno || (endp == pattern))
274      endp = NULL;
275    else {
276      if(*endp != '-')
277        endp = NULL;
278      else {
279        pattern = endp + 1;
280        while(*pattern && ISBLANK(*pattern))
281          pattern++;
282        if(!ISDIGIT(*pattern)) {
283          endp = NULL;
284          goto fail;
285        }
286        errno = 0;
287        max_n = strtoul(pattern, &endp, 10);
288        if(errno)
289          /* overflow */
290          endp = NULL;
291        else if(*endp == ':') {
292          pattern = endp + 1;
293          errno = 0;
294          step_n = strtoul(pattern, &endp, 10);
295          if(errno)
296            /* over/underflow situation */
297            endp = NULL;
298        }
299        else
300          step_n = 1;
301        if(endp && (*endp == ']')) {
302          pattern = endp + 1;
303        }
304        else
305          endp = NULL;
306      }
307    }
308
309fail:
310    *posp += (pattern - *patternp);
311
312    if(!endp || !step_n ||
313       (min_n == max_n && step_n != 1) ||
314       (min_n != max_n && (min_n > max_n || step_n > (max_n - min_n))))
315      /* the pattern is not well-formed */
316      return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT);
317
318    /* typecasting to ints are fine here since we make sure above that we
319       are within 31 bits */
320    pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n;
321    pat->content.NumRange.max_n = max_n;
322    pat->content.NumRange.step = step_n;
323
324    if(multiply(amount, ((pat->content.NumRange.max_n -
325                          pat->content.NumRange.min_n) /
326                         pat->content.NumRange.step + 1)))
327      return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT);
328  }
329  else
330    return GLOBERROR("bad range specification", *posp, CURLE_URL_MALFORMAT);
331
332  *patternp = pattern;
333  return CURLE_OK;
334}
335
336#define MAX_IP6LEN 128
337
338static bool peek_ipv6(const char *str, size_t *skip)
339{
340  /*
341   * Scan for a potential IPv6 literal.
342   * - Valid globs contain a hyphen and <= 1 colon.
343   * - IPv6 literals contain no hyphens and >= 2 colons.
344   */
345  char hostname[MAX_IP6LEN];
346  CURLU *u;
347  char *endbr = strchr(str, ']');
348  size_t hlen;
349  CURLUcode rc;
350  if(!endbr)
351    return FALSE;
352
353  hlen = endbr - str + 1;
354  if(hlen >= MAX_IP6LEN)
355    return FALSE;
356
357  u = curl_url();
358  if(!u)
359    return FALSE;
360
361  memcpy(hostname, str, hlen);
362  hostname[hlen] = 0;
363
364  /* ask to "guess scheme" as then it works without an https:// prefix */
365  rc = curl_url_set(u, CURLUPART_URL, hostname, CURLU_GUESS_SCHEME);
366
367  curl_url_cleanup(u);
368  if(!rc)
369    *skip = hlen;
370  return rc ? FALSE : TRUE;
371}
372
373static CURLcode glob_parse(struct URLGlob *glob, char *pattern,
374                           size_t pos, curl_off_t *amount)
375{
376  /* processes a literal string component of a URL
377     special characters '{' and '[' branch to set/range processing functions
378   */
379  CURLcode res = CURLE_OK;
380  int globindex = 0; /* count "actual" globs */
381
382  *amount = 1;
383
384  while(*pattern && !res) {
385    char *buf = glob->glob_buffer;
386    size_t sublen = 0;
387    while(*pattern && *pattern != '{') {
388      if(*pattern == '[') {
389        /* skip over IPv6 literals and [] */
390        size_t skip = 0;
391        if(!peek_ipv6(pattern, &skip) && (pattern[1] == ']'))
392          skip = 2;
393        if(skip) {
394          memcpy(buf, pattern, skip);
395          buf += skip;
396          pattern += skip;
397          sublen += skip;
398          continue;
399        }
400        break;
401      }
402      if(*pattern == '}' || *pattern == ']')
403        return GLOBERROR("unmatched close brace/bracket", pos,
404                         CURLE_URL_MALFORMAT);
405
406      /* only allow \ to escape known "special letters" */
407      if(*pattern == '\\' &&
408         (*(pattern + 1) == '{' || *(pattern + 1) == '[' ||
409          *(pattern + 1) == '}' || *(pattern + 1) == ']') ) {
410
411        /* escape character, skip '\' */
412        ++pattern;
413        ++pos;
414      }
415      *buf++ = *pattern++; /* copy character to literal */
416      ++pos;
417      sublen++;
418    }
419    if(sublen) {
420      /* we got a literal string, add it as a single-item list */
421      *buf = '\0';
422      res = glob_fixed(glob, glob->glob_buffer, sublen);
423    }
424    else {
425      switch(*pattern) {
426      case '\0': /* done  */
427        break;
428
429      case '{':
430        /* process set pattern */
431        pattern++;
432        pos++;
433        res = glob_set(glob, &pattern, &pos, amount, globindex++);
434        break;
435
436      case '[':
437        /* process range pattern */
438        pattern++;
439        pos++;
440        res = glob_range(glob, &pattern, &pos, amount, globindex++);
441        break;
442      }
443    }
444
445    if(++glob->size >= GLOB_PATTERN_NUM)
446      return GLOBERROR("too many globs", pos, CURLE_URL_MALFORMAT);
447  }
448  return res;
449}
450
451CURLcode glob_url(struct URLGlob **glob, char *url, curl_off_t *urlnum,
452                  FILE *error)
453{
454  /*
455   * We can deal with any-size, just make a buffer with the same length
456   * as the specified URL!
457   */
458  struct URLGlob *glob_expand;
459  curl_off_t amount = 0;
460  char *glob_buffer;
461  CURLcode res;
462
463  *glob = NULL;
464
465  glob_buffer = malloc(strlen(url) + 1);
466  if(!glob_buffer)
467    return CURLE_OUT_OF_MEMORY;
468  glob_buffer[0] = 0;
469
470  glob_expand = calloc(1, sizeof(struct URLGlob));
471  if(!glob_expand) {
472    Curl_safefree(glob_buffer);
473    return CURLE_OUT_OF_MEMORY;
474  }
475  glob_expand->urllen = strlen(url);
476  glob_expand->glob_buffer = glob_buffer;
477
478  res = glob_parse(glob_expand, url, 1, &amount);
479  if(!res)
480    *urlnum = amount;
481  else {
482    if(error && glob_expand->error) {
483      char text[512];
484      const char *t;
485      if(glob_expand->pos) {
486        msnprintf(text, sizeof(text), "%s in URL position %zu:\n%s\n%*s^",
487                  glob_expand->error,
488                  glob_expand->pos, url, (int)glob_expand->pos - 1, " ");
489        t = text;
490      }
491      else
492        t = glob_expand->error;
493
494      /* send error description to the error-stream */
495      fprintf(error, "curl: (%d) %s\n", res, t);
496    }
497    /* it failed, we cleanup */
498    glob_cleanup(glob_expand);
499    *urlnum = 1;
500    return res;
501  }
502
503  *glob = glob_expand;
504  return CURLE_OK;
505}
506
507void glob_cleanup(struct URLGlob *glob)
508{
509  size_t i;
510  curl_off_t elem;
511
512  if(!glob)
513    return;
514
515  for(i = 0; i < glob->size; i++) {
516    if((glob->pattern[i].type == UPTSet) &&
517       (glob->pattern[i].content.Set.elements)) {
518      for(elem = glob->pattern[i].content.Set.size - 1;
519          elem >= 0;
520          --elem) {
521        Curl_safefree(glob->pattern[i].content.Set.elements[elem]);
522      }
523      Curl_safefree(glob->pattern[i].content.Set.elements);
524    }
525  }
526  Curl_safefree(glob->glob_buffer);
527  Curl_safefree(glob);
528}
529
530CURLcode glob_next_url(char **globbed, struct URLGlob *glob)
531{
532  struct URLPattern *pat;
533  size_t i;
534  size_t len;
535  size_t buflen = glob->urllen + 1;
536  char *buf = glob->glob_buffer;
537
538  *globbed = NULL;
539
540  if(!glob->beenhere)
541    glob->beenhere = 1;
542  else {
543    bool carry = TRUE;
544
545    /* implement a counter over the index ranges of all patterns, starting
546       with the rightmost pattern */
547    for(i = 0; carry && (i < glob->size); i++) {
548      carry = FALSE;
549      pat = &glob->pattern[glob->size - 1 - i];
550      switch(pat->type) {
551      case UPTSet:
552        if((pat->content.Set.elements) &&
553           (++pat->content.Set.ptr_s == pat->content.Set.size)) {
554          pat->content.Set.ptr_s = 0;
555          carry = TRUE;
556        }
557        break;
558      case UPTCharRange:
559        pat->content.CharRange.ptr_c =
560          (char)(pat->content.CharRange.step +
561                 (int)((unsigned char)pat->content.CharRange.ptr_c));
562        if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) {
563          pat->content.CharRange.ptr_c = pat->content.CharRange.min_c;
564          carry = TRUE;
565        }
566        break;
567      case UPTNumRange:
568        pat->content.NumRange.ptr_n += pat->content.NumRange.step;
569        if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) {
570          pat->content.NumRange.ptr_n = pat->content.NumRange.min_n;
571          carry = TRUE;
572        }
573        break;
574      default:
575        printf("internal error: invalid pattern type (%d)\n", (int)pat->type);
576        return CURLE_FAILED_INIT;
577      }
578    }
579    if(carry) {         /* first pattern ptr has run into overflow, done! */
580      return CURLE_OK;
581    }
582  }
583
584  for(i = 0; i < glob->size; ++i) {
585    pat = &glob->pattern[i];
586    switch(pat->type) {
587    case UPTSet:
588      if(pat->content.Set.elements) {
589        msnprintf(buf, buflen, "%s",
590                  pat->content.Set.elements[pat->content.Set.ptr_s]);
591        len = strlen(buf);
592        buf += len;
593        buflen -= len;
594      }
595      break;
596    case UPTCharRange:
597      if(buflen) {
598        *buf++ = pat->content.CharRange.ptr_c;
599        *buf = '\0';
600        buflen--;
601      }
602      break;
603    case UPTNumRange:
604      msnprintf(buf, buflen, "%0*" CURL_FORMAT_CURL_OFF_T,
605                pat->content.NumRange.padlength,
606                pat->content.NumRange.ptr_n);
607      len = strlen(buf);
608      buf += len;
609      buflen -= len;
610      break;
611    default:
612      printf("internal error: invalid pattern type (%d)\n", (int)pat->type);
613      return CURLE_FAILED_INIT;
614    }
615  }
616
617  *globbed = strdup(glob->glob_buffer);
618  if(!*globbed)
619    return CURLE_OUT_OF_MEMORY;
620
621  return CURLE_OK;
622}
623
624#define MAX_OUTPUT_GLOB_LENGTH (10*1024)
625
626CURLcode glob_match_url(char **result, char *filename, struct URLGlob *glob)
627{
628  char numbuf[18];
629  char *appendthis = (char *)"";
630  size_t appendlen = 0;
631  struct curlx_dynbuf dyn;
632
633  *result = NULL;
634
635  /* We cannot use the glob_buffer for storage since the filename may be
636   * longer than the URL we use.
637   */
638  curlx_dyn_init(&dyn, MAX_OUTPUT_GLOB_LENGTH);
639
640  while(*filename) {
641    if(*filename == '#' && ISDIGIT(filename[1])) {
642      char *ptr = filename;
643      unsigned long num = strtoul(&filename[1], &filename, 10);
644      struct URLPattern *pat = NULL;
645
646      if(num && (num < glob->size)) {
647        unsigned long i;
648        num--; /* make it zero based */
649        /* find the correct glob entry */
650        for(i = 0; i<glob->size; i++) {
651          if(glob->pattern[i].globindex == (int)num) {
652            pat = &glob->pattern[i];
653            break;
654          }
655        }
656      }
657
658      if(pat) {
659        switch(pat->type) {
660        case UPTSet:
661          if(pat->content.Set.elements) {
662            appendthis = pat->content.Set.elements[pat->content.Set.ptr_s];
663            appendlen =
664              strlen(pat->content.Set.elements[pat->content.Set.ptr_s]);
665          }
666          break;
667        case UPTCharRange:
668          numbuf[0] = pat->content.CharRange.ptr_c;
669          numbuf[1] = 0;
670          appendthis = numbuf;
671          appendlen = 1;
672          break;
673        case UPTNumRange:
674          msnprintf(numbuf, sizeof(numbuf), "%0*" CURL_FORMAT_CURL_OFF_T,
675                    pat->content.NumRange.padlength,
676                    pat->content.NumRange.ptr_n);
677          appendthis = numbuf;
678          appendlen = strlen(numbuf);
679          break;
680        default:
681          fprintf(tool_stderr, "internal error: invalid pattern type (%d)\n",
682                  (int)pat->type);
683          curlx_dyn_free(&dyn);
684          return CURLE_FAILED_INIT;
685        }
686      }
687      else {
688        /* #[num] out of range, use the #[num] in the output */
689        filename = ptr;
690        appendthis = filename++;
691        appendlen = 1;
692      }
693    }
694    else {
695      appendthis = filename++;
696      appendlen = 1;
697    }
698    if(curlx_dyn_addn(&dyn, appendthis, appendlen))
699      return CURLE_OUT_OF_MEMORY;
700  }
701
702  if(curlx_dyn_addn(&dyn, "", 0))
703    return CURLE_OUT_OF_MEMORY;
704
705#if defined(_WIN32) || defined(MSDOS)
706  {
707    char *sanitized;
708    SANITIZEcode sc = sanitize_file_name(&sanitized, curlx_dyn_ptr(&dyn),
709                                         (SANITIZE_ALLOW_PATH |
710                                          SANITIZE_ALLOW_RESERVED));
711    curlx_dyn_free(&dyn);
712    if(sc)
713      return CURLE_URL_MALFORMAT;
714    *result = sanitized;
715    return CURLE_OK;
716  }
717#else
718  *result = curlx_dyn_ptr(&dyn);
719  return CURLE_OK;
720#endif /* _WIN32 || MSDOS */
721}
722