xref: /third_party/toybox/toys/posix/sed.c (revision 0f66f451)
1/* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
8 * but N and s///
9 * TODO: make y// handle unicode, unicode delimiters
10 * TODO: handle error return from emit(), error_msg/exit consistently
11 *       What's the right thing to do for -i when write fails? Skip to next?
12 * test '//q' with no previous regex, also repeat previous regex?
13
14USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
15
16config SED
17  bool "sed"
18  default y
19  help
20    usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
21
22    Stream editor. Apply one or more editing SCRIPTs to each line of input
23    (from FILE or stdin) producing output (by default to stdout).
24
25    -e	Add SCRIPT to list
26    -f	Add contents of SCRIPT_FILE to list
27    -i	Edit each file in place (-iEXT keeps backup file with extension EXT)
28    -n	No default output (use the p command to output matched lines)
29    -r	Use extended regular expression syntax
30    -E	POSIX alias for -r
31    -s	Treat input files separately (implied by -i)
32    -z	Use \0 rather than \n as the input line separator
33
34    A SCRIPT is a series of one or more COMMANDs separated by newlines or
35    semicolons. All -e SCRIPTs are concatenated together as if separated
36    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
37    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
38
39    Each COMMAND may be preceded by an address which limits the command to
40    apply only to the specified line(s). Commands without an address apply to
41    every line. Addresses are of the form:
42
43      [ADDRESS[,ADDRESS]][!]COMMAND
44
45    The ADDRESS may be a decimal line number (starting at 1), a /regular
46    expression/ within a pair of forward slashes, or the character "$" which
47    matches the last line of input. (In -s or -i mode this matches the last
48    line of each file, otherwise just the last line of the last file.) A single
49    address matches one line, a pair of comma separated addresses match
50    everything from the first address to the second address (inclusive). If
51    both addresses are regular expressions, more than one range of lines in
52    each file can match. The second address can be +N to end N lines later.
53
54    REGULAR EXPRESSIONS in sed are started and ended by the same character
55    (traditionally / but anything except a backslash or a newline works).
56    Backslashes may be used to escape the delimiter if it occurs in the
57    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
58    and unicode). An empty regex repeats the previous one. ADDRESS regexes
59    (above) require the first delimiter to be escaped with a backslash when
60    it isn't a forward slash (to distinguish it from the COMMANDs below).
61
62    Sed mostly operates on individual lines one at a time. It reads each line,
63    processes it, and either writes it to the output or discards it before
64    reading the next line. Sed can remember one additional line in a separate
65    buffer (using the h, H, g, G, and x commands), and can read the next line
66    of input early (using the n and N command), but other than that command
67    scripts operate on individual lines of text.
68
69    Each COMMAND starts with a single character. The following commands take
70    no arguments:
71
72      !  Run this command when the test _didn't_ match.
73
74      {  Start a new command block, continuing until a corresponding "}".
75         Command blocks may nest. If the block has an address, commands within
76         the block are only run for lines within the block's address range.
77
78      }  End command block (this command cannot have an address)
79
80      d  Delete this line and move on to the next one
81         (ignores remaining COMMANDs)
82
83      D  Delete one line of input and restart command SCRIPT (same as "d"
84         unless you've glued lines together with "N" or similar)
85
86      g  Get remembered line (overwriting current line)
87
88      G  Get remembered line (appending to current line)
89
90      h  Remember this line (overwriting remembered line)
91
92      H  Remember this line (appending to remembered line, if any)
93
94      l  Print line, escaping \abfrtv (but not newline), octal escaping other
95         nonprintable characters, wrapping lines to terminal width with a
96         backslash, and appending $ to actual end of line.
97
98      n  Print default output and read next line, replacing current line
99         (If no next line available, quit processing script)
100
101      N  Append next line of input to this line, separated by a newline
102         (This advances the line counter for address matching and "=", if no
103         next line available quit processing script without default output)
104
105      p  Print this line
106
107      P  Print this line up to first newline (from "N")
108
109      q  Quit (print default output, no more commands processed or lines read)
110
111      x  Exchange this line with remembered line (overwrite in both directions)
112
113      =  Print the current line number (followed by a newline)
114
115    The following commands (may) take an argument. The "text" arguments (to
116    the "a", "i", and "c" commands) may end with an unescaped "\" to append
117    the next line (for which leading whitespace is not skipped), and also
118    treat ";" as a literal character (use "\;" instead).
119
120      a [text]   Append text to output before attempting to read next line
121
122      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
123
124      c [text]   Delete line, output text at end of matching address range
125                 (ignores remaining COMMANDs)
126
127      i [text]   Print text
128
129      r [file]   Append contents of file to output before attempting to read
130                 next line.
131
132      s/S/R/F    Search for regex S, replace matched text with R using flags F.
133                 The first character after the "s" (anything but newline or
134                 backslash) is the delimiter, escape with \ to use normally.
135
136                 The replacement text may contain "&" to substitute the matched
137                 text (escape it with backslash for a literal &), or \1 through
138                 \9 to substitute a parenthetical subexpression in the regex.
139                 You can also use the normal backslash escapes such as \n and
140                 a backslash at the end of the line appends the next line.
141
142                 The flags are:
143
144                 [0-9]    A number, substitute only that occurrence of pattern
145                 g        Global, substitute all occurrences of pattern
146                 i        Ignore case when matching
147                 p        Print the line if match was found and replaced
148                 w [file] Write (append) line to file if match replaced
149
150      t [label]  Test, jump to :label only if an "s" command found a match in
151                 this line since last test (replacing with same text counts)
152
153      T [label]  Test false, jump only if "s" hasn't found a match.
154
155      w [file]   Write (append) line to file
156
157      y/old/new/ Change each character in 'old' to corresponding character
158                 in 'new' (with standard backslash escapes, delimiter can be
159                 any repeated character except \ or \n)
160
161      : [label]  Labeled target for jump commands
162
163      #  Comment, ignore rest of this line of SCRIPT
164
165    Deviations from POSIX: allow extended regular expressions with -r,
166    editing in place with -i, separate with -s, NUL-separated input with -z,
167    printf escapes in text, line continuations, semicolons after all commands,
168    2-address anywhere an address is allowed, "T" command, multiline
169    continuations for [abc], \; to end [abc] argument before end of line.
170*/
171
172#define FOR_sed
173#include "toys.h"
174
// Command-wide state, reached through TT.
175GLOBALS(
  // Argument of -i: suffix for the backup file when editing in place
  // (may be null or empty, in which case no backup is kept)
176  char *i;
  // Arguments collected from -f (script files) and -e (script text)
177  struct arg_list *f, *e;
178
179  // processed pattern list (walked as a list of struct sedcmd)
180  struct double_list *pattern;
181
  // nextline/nextlen: input line stashed by sed_line() for deferred processing
  // (nextlen doubles as the { } nesting depth counter while parsing).
  // remember/rememberlen: the "remembered line" used by g, G, h, H and x
  // (remember also temporarily holds the s/// regex text while parsing).
182  char *nextline, *remember;
  // restart: where to resume the script on the next line after n/N, stored
  // plus one so it is never NULL. lastregex: most recent regex handed out by
  // get_regex(), reused when a command has an empty regex.
183  void *restart, *lastregex;
  // count: number of the line currently being processed (printed by "=")
184  long nextlen, rememberlen, count;
  // fdout: file descriptor emit() writes to. noeol: set when the last emit()
  // left off its newline, so the next emit() writes it first.
185  int fdout, noeol;
  // Column at which the "l" command wraps its output (0 until first use)
186  unsigned xx;
  // Input line delimiter handed to do_lines()
187  char delim;
188)
189
190// Linked list of parsed sed commands. Offset fields indicate location where
191// regex or string starts, ala offset+(char *)struct, because we remalloc()
192// these to expand them for multiline inputs, and pointers would have to be
193// individually adjusted.
194
// One parsed command. Regexes and string arguments live in the same
// allocation after the struct and are located by byte offset from its start.
195struct sedcmd {
196  struct sedcmd *next, *prev;
197
198  // Begin and end of each match (index 0 is the first address, 1 the second)
199  long lmatch[2]; // line number of match, -1 for "$", -2-N for a "+N" end
200  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
201  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
  // not: set when the address is followed by "!". hit: while running, the line
  // number at which the address (range) became active, 0 when inactive; while
  // parsing it instead holds line continuation state.
202  unsigned not, hit;
203  unsigned sflags; // s///flag bits: i=1, g=2, p=4, occurrence number in >>3
204  char c; // action
205};
206
207// Write out line with potential embedded NUL, handling eol/noeol
208static int emit(char *line, long len, int eol)
209{
210  int l, old = line[len];
211
212  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
213  TT.noeol = !eol;
214  if (eol) line[len++] = '\n';
215  if (!len) return 0;
216  l = writeall(TT.fdout, line, len);
217  if (eol) line[len-1] = old;
218  if (l != len) {
219    if (TT.fdout != 1) perror_msg("short write");
220
221    return 1;
222  }
223
224  return 0;
225}
226
// Grow the allocation at *old (currently holding oldlen bytes) and append
// new to it. A negative newlen means "add a newline separator first, then
// append -newlen bytes". The result is NUL terminated and *old is updated to
// the (possibly moved) allocation. Returns a pointer just past the terminator.
static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *tail;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail[newlen] = 0;

  return tail+newlen+1;
}
242
243// An empty regex repeats the previous one
244static void *get_regex(void *command, int offset)
245{
246  if (!offset) {
247    if (!TT.lastregex) error_exit("no previous regex");
248    return TT.lastregex;
249  }
250
251  return TT.lastregex = offset+(char *)command;
252}
253
254// Apply pattern to line from input file
255static void sed_line(char **pline, long plen)
256{
257  struct append {
258    struct append *next, *prev;
259    int file;
260    char *str;
261  } *append = 0;
262  char *line = TT.nextline;
263  long len = TT.nextlen;
264  struct sedcmd *command;
265  int eol = 0, tea = 0;
266
267  // Ignore EOF for all files before last unless -i
268  if (!pline && !FLAG(i)) return;
269
270  // Grab next line for deferred processing (EOF detection: we get a NULL
271  // pline at EOF to flush last line). Note that only end of _last_ input
272  // file matches $ (unless we're doing -i).
273  TT.nextline = 0;
274  TT.nextlen = 0;
275  if (pline) {
276    TT.nextline = *pline;
277    TT.nextlen = plen;
278    *pline = 0;
279  }
280
281  if (!line || !len) return;
282  if (line[len-1] == '\n') line[--len] = eol++;
283  TT.count++;
284
285  // The restart-1 is because we added one to make sure it wasn't NULL,
286  // otherwise N as last command would restart script
287  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
288  TT.restart = 0;
289
290  while (command) {
291    char *str, c = command->c;
292
293    // Have we got a line or regex matching range for this rule?
294    if (*command->lmatch || *command->rmatch) {
295      int miss = 0;
296      long lm;
297
298      // In a match that might end?
299      if (command->hit) {
300        if (!(lm = command->lmatch[1])) {
301          if (!command->rmatch[1]) command->hit = 0;
302          else {
303            void *rm = get_regex(command, command->rmatch[1]);
304
305            // regex match end includes matching line, so defer deactivation
306            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
307          }
308        } else if (lm > 0 && lm < TT.count) command->hit = 0;
309        else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
310
311      // Start a new match?
312      } else {
313        if (!(lm = *command->lmatch)) {
314          void *rm = get_regex(command, *command->rmatch);
315
316          if (line && !regexec0(rm, line, len, 0, 0, 0))
317            command->hit = TT.count;
318        } else if (lm == TT.count || (lm == -1 && !pline))
319          command->hit = TT.count;
320
321        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
322      }
323
324      // Didn't match?
325      lm = !(command->not^!!command->hit);
326
327      // Deferred disable from regex end match
328      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
329
330      if (lm) {
331        // Handle skipping curly bracket command group
332        if (c == '{') {
333          int curly = 1;
334
335          while (curly) {
336            command = command->next;
337            if (command->c == '{') curly++;
338            if (command->c == '}') curly--;
339          }
340        }
341        command = command->next;
342        continue;
343      }
344    }
345
346    // A deleted line can still update line match state for later commands
347    if (!line) {
348      command = command->next;
349      continue;
350    }
351
352    // Process command
353
354    if (c=='a' || c=='r') {
355      struct append *a = xzalloc(sizeof(struct append));
356      if (command->arg1) a->str = command->arg1+(char *)command;
357      a->file = c=='r';
358      dlist_add_nomalloc((void *)&append, (void *)a);
359    } else if (c=='b' || c=='t' || c=='T') {
360      int t = tea;
361
362      if (c != 'b') tea = 0;
363      if (c=='b' || t^(c=='T')) {
364        if (!command->arg1) break;
365        str = command->arg1+(char *)command;
366        for (command = (void *)TT.pattern; command; command = command->next)
367          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
368            break;
369        if (!command) error_exit("no :%s", str);
370      }
371    } else if (c=='c') {
372      str = command->arg1+(char *)command;
373      if (!command->hit) emit(str, strlen(str), 1);
374      free(line);
375      line = 0;
376      continue;
377    } else if (c=='d') {
378      free(line);
379      line = 0;
380      continue;
381    } else if (c=='D') {
382      // Delete up to \n or end of buffer
383      str = line;
384      while ((str-line)<len) if (*(str++) == '\n') break;
385      len -= str - line;
386      memmove(line, str, len);
387
388      // if "delete" blanks line, disable further processing
389      // otherwise trim and restart script
390      if (!len) {
391        free(line);
392        line = 0;
393      } else {
394        line[len] = 0;
395        command = (void *)TT.pattern;
396      }
397      continue;
398    } else if (c=='g') {
399      free(line);
400      line = xstrdup(TT.remember);
401      len = TT.rememberlen;
402    } else if (c=='G') {
403      line = xrealloc(line, len+TT.rememberlen+2);
404      line[len++] = '\n';
405      memcpy(line+len, TT.remember, TT.rememberlen);
406      line[len += TT.rememberlen] = 0;
407    } else if (c=='h') {
408      free(TT.remember);
409      TT.remember = xstrdup(line);
410      TT.rememberlen = len;
411    } else if (c=='H') {
412      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
413      TT.remember[TT.rememberlen++] = '\n';
414      memcpy(TT.remember+TT.rememberlen, line, len);
415      TT.remember[TT.rememberlen += len] = 0;
416    } else if (c=='i') {
417      str = command->arg1+(char *)command;
418      emit(str, strlen(str), 1);
419    } else if (c=='l') {
420      int i, x, off;
421
422      if (!TT.xx) {
423        terminal_size(&TT.xx, 0);
424        if (!TT.xx) TT.xx = 80;
425        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
426        if (TT.xx > 4) TT.xx -= 4;
427      }
428
429      for (i = off = 0; i<len; i++) {
430        if (off >= TT.xx) {
431          toybuf[off++] = '\\';
432          emit(toybuf, off, 1);
433          off = 0;
434        }
435        x = stridx("\\\a\b\f\r\t\v", line[i]);
436        if (x != -1) {
437          toybuf[off++] = '\\';
438          toybuf[off++] = "\\abfrtv"[x];
439        } else if (line[i] >= ' ') toybuf[off++] = line[i];
440        else off += sprintf(toybuf+off, "\\%03o", line[i]);
441      }
442      toybuf[off++] = '$';
443      emit(toybuf, off, 1);
444    } else if (c=='n') {
445      TT.restart = command->next+1;
446
447      break;
448    } else if (c=='N') {
449      // Can't just grab next line because we could have multiple N and
450      // we need to actually read ahead to get N;$p EOF detection right.
451      if (pline) {
452        TT.restart = command->next+1;
453        extend_string(&line, TT.nextline, len, -TT.nextlen);
454        free(TT.nextline);
455        TT.nextline = line;
456        TT.nextlen += len + 1;
457        line = 0;
458      }
459
460      // Pending append goes out right after N
461      goto done;
462    } else if (c=='p' || c=='P') {
463      char *l = (c=='P') ? strchr(line, '\n') : 0;
464
465      if (emit(line, l ? l-line : len, eol)) break;
466    } else if (c=='q' || c=='Q') {
467      if (pline) *pline = (void *)1;
468      free(TT.nextline);
469      if (!toys.exitval && command->arg1)
470        toys.exitval = atoi(command->arg1+(char *)command);
471      TT.nextline = 0;
472      TT.nextlen = 0;
473      if (c=='Q') line = 0;
474
475      break;
476    } else if (c=='s') {
477      char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
478      regmatch_t *match = (void *)toybuf;
479      regex_t *reg = get_regex(command, command->arg1);
480      int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
481        mlen, off, newlen;
482
483      // Loop finding match in remaining line (up to remaining len)
484      while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
485        mflags = REG_NOTBOL;
486
487        // Zero length matches don't count immediately after a previous match
488        mlen = match[0].rm_eo-match[0].rm_so;
489        if (!mlen && !zmatch) {
490          if (rline-line == len) break;
491          l2[l2used++] = *rline++;
492          zmatch++;
493          continue;
494        } else zmatch = 0;
495
496        // If we're replacing only a specific match, skip if this isn't it
497        off = command->sflags>>3;
498        if (off && off != ++count) {
499          memcpy(l2+l2used, rline, match[0].rm_eo);
500          l2used += match[0].rm_eo;
501          rline += match[0].rm_eo;
502
503          continue;
504        }
505        // The fact getline() can allocate unbounded amounts of memory is
506        // a bigger issue, but while we're here check for integer overflow
507        if (match[0].rm_eo > INT_MAX) perror_exit(0);
508
509        // newlen = strlen(new) but with \1 and & and printf escapes
510        for (off = newlen = 0; new[off]; off++) {
511          int cc = -1;
512
513          if (new[off] == '&') cc = 0;
514          else if (new[off] == '\\') cc = new[++off] - '0';
515          if (cc < 0 || cc > 9) {
516            newlen++;
517            continue;
518          }
519          newlen += match[cc].rm_eo-match[cc].rm_so;
520        }
521
522        // Copy changed data to new string
523
524        // Adjust allocation size of new string, copy data we know we'll keep
525        l2l += newlen-mlen;
526        if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
527        if (match[0].rm_so) {
528          memcpy(l2+l2used, rline, match[0].rm_so);
529          l2used += match[0].rm_so;
530        }
531
532        // copy in new replacement text
533        for (off = mlen = 0; new[off]; off++) {
534          int cc = 0, ll;
535
536          if (new[off] == '\\') {
537            cc = new[++off] - '0';
538            if (cc<0 || cc>9) {
539              if (!(l2[l2used+mlen++] = unescape(new[off])))
540                l2[l2used+mlen-1] = new[off];
541
542              continue;
543            } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
544          } else if (new[off] != '&') {
545            l2[l2used+mlen++] = new[off];
546
547            continue;
548          }
549
550          if (match[cc].rm_so != -1) {
551            ll = match[cc].rm_eo-match[cc].rm_so;
552            memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
553            mlen += ll;
554          }
555        }
556        l2used += newlen;
557        rline += match[0].rm_eo;
558
559        // Stop after first substitution unless we have flag g
560        if (!(command->sflags & 2)) break;
561      }
562
563      // If we made any changes, finish off l2 and swap it for line
564      if (l2) {
565        // grab trailing unmatched data and null terminator, swap with original
566        mlen = len-(rline-line);
567        memcpy(l2+l2used, rline, mlen+1);
568        len = l2used + mlen;
569        free(line);
570        line = l2;
571      }
572
573      if (mflags) {
574        // flag p
575        if (command->sflags & 4) emit(line, len, eol);
576
577        tea = 1;
578        if (command->w) goto writenow;
579      }
580    } else if (c=='w') {
581      int fd, noeol;
582      char *name;
583
584writenow:
585      // Swap out emit() context
586      fd = TT.fdout;
587      noeol = TT.noeol;
588
589      // We save filehandle and newline status before filename
590      name = command->w + (char *)command;
591      memcpy(&TT.fdout, name, 4);
592      name += 4;
593      TT.noeol = *(name++);
594
595      // write, then save/restore context
596      if (emit(line, len, eol))
597        perror_exit("w '%s'", command->arg1+(char *)command);
598      *(--name) = TT.noeol;
599      TT.noeol = noeol;
600      TT.fdout = fd;
601    } else if (c=='x') {
602      long swap = TT.rememberlen;
603
604      str = TT.remember;
605      TT.remember = line;
606      line = str;
607      TT.rememberlen = len;
608      len = swap;
609    } else if (c=='y') {
610      char *from, *to = (char *)command;
611      int i, j;
612
613      from = to+command->arg1;
614      to += command->arg2;
615
616      for (i = 0; i < len; i++) {
617        j = stridx(from, line[i]);
618        if (j != -1) line[i] = to[j];
619      }
620    } else if (c=='=') {
621      sprintf(toybuf, "%ld", TT.count);
622      if (emit(toybuf, strlen(toybuf), 1)) break;
623    }
624
625    command = command->next;
626  }
627
628  if (line && !FLAG(n)) emit(line, len, eol);
629
630done:
631  if (dlist_terminate(append)) while (append) {
632    struct append *a = append->next;
633
634    if (append->file) {
635      int fd = open(append->str, O_RDONLY);
636
637      // Force newline if noeol pending
638      if (fd != -1) {
639        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
640        TT.noeol = 0;
641        xsendfile(fd, TT.fdout);
642        close(fd);
643      }
644    } else if (append->str) emit(append->str, strlen(append->str), 1);
645    else emit(line, 0, 0);
646    free(append);
647    append = a;
648  }
649  free(line);
650}
651
652// Callback called on each input file
653static void do_sed_file(int fd, char *name)
654{
655  char *tmp;
656
657  if (FLAG(i)) {
658    struct sedcmd *command;
659
660    if (!fd) return error_msg("-i on stdin");
661    TT.fdout = copy_tempfile(fd, name, &tmp);
662    TT.count = 0;
663    for (command = (void *)TT.pattern; command; command = command->next)
664      command->hit = 0;
665  }
666  do_lines(fd, TT.delim, sed_line);
667  if (FLAG(i)) {
668    if (TT.i && *TT.i) {
669      char *s = xmprintf("%s%s", name, TT.i);
670
671      xrename(name, s);
672      free(s);
673    }
674    replace_tempfile(-1, TT.fdout, &tmp);
675    TT.fdout = 1;
676    TT.nextline = 0;
677    TT.nextlen = TT.noeol = 0;
678  }
679}
680
// Copy chunk of string between two delimiters, converting printf escapes.
//
// pstr:  in: start of the chunk. out: advanced to the next unused char (just
//        past the closing delimiter). Left unchanged on error.
// delim: delimiter to stop at. If delim (or *delim) is 0 the starting char is
//        used as the delimiter (after skipping one leading backslash) and
//        saved to *delim when delim isn't null.
//
// A delimiter inside a regex [bracket expression] does not end the chunk.
//
// Returns a newly allocated processed copy of the string, or 0 on error
// (missing or backslash delimiter, unterminated chunk, trailing backslash).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  // mode: 0 outside brackets, ']' inside [...], or the '.', '=', or ':' that
  // will close a [. .] [= =] [: :] element within a bracket expression
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // Output never gets longer than input
  to = out = xmalloc(strlen(*pstr)+1);

  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      // strchr() also matches the NUL terminator, so check for end of string
      // first or the copy below would step past it
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partial copy
  free(out);

  return 0;
}
745
746// Translate pattern strings into command structures. Each command structure
747// is a single allocation (which requires some math and remalloc at times).
748static void parse_pattern(char **pline, long len)
749{
750  struct sedcmd *command = (void *)TT.pattern;
751  char *line, *reg, c, *errstart;
752  int i;
753
754  line = errstart = pline ? *pline : "";
755  if (len && line[len-1]=='\n') line[--len] = 0;
756
757  // Append this line to previous multiline command? (hit indicates type.)
758  // During parsing "hit" stores data about line continuations, but in
759  // sed_line() it means the match range attached to this command
760  // is active, so processing the continuation must zero it again.
761  if (command && command->prev->hit) {
762    // Remove half-finished entry from list so remalloc() doesn't confuse it
763    TT.pattern = TT.pattern->prev;
764    command = dlist_pop(&TT.pattern);
765    c = command->c;
766    reg = (char *)command;
767    reg += command->arg1 + strlen(reg + command->arg1);
768
769    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
770    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
771    // a unicode character.
772    if (command->hit < 256) goto resume_s;
773    else goto resume_a;
774  }
775
776  // Loop through commands in this line.
777
778  command = 0;
779  for (;;) {
780    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
781
782    // If there's no more data on this line, return.
783    for (;;) {
784      while (isspace(*line) || *line == ';') line++;
785      if (*line == '#') while (*line && *line != '\n') line++;
786      else break;
787    }
788    if (!*line) return;
789
790    // Start by writing data into toybuf.
791
792    errstart = line;
793    memset(toybuf, 0, sizeof(struct sedcmd));
794    command = (void *)toybuf;
795    reg = toybuf + sizeof(struct sedcmd);
796
797    // Parse address range (if any)
798    for (i = 0; i < 2; i++) {
799      if (*line == ',') line++;
800      else if (i) break;
801
802      if (i && *line == '+' && isdigit(line[1])) {
803        line++;
804        command->lmatch[i] = -2-strtol(line, &line, 0);
805      } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
806      else if (*line == '$') {
807        command->lmatch[i] = -1;
808        line++;
809      } else if (*line == '/' || *line == '\\') {
810        char *s = line;
811
812        if (!(s = unescape_delimited_string(&line, 0))) goto error;
813        if (!*s) command->rmatch[i] = 0;
814        else {
815          xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
816          command->rmatch[i] = reg-toybuf;
817          reg += sizeof(regex_t);
818        }
819        free(s);
820      } else break;
821    }
822
823    while (isspace(*line)) line++;
824    if (!*line) break;
825
826    if (*line == '!') {
827      command->not = 1;
828      line++;
829    }
830    while (isspace(*line)) line++;
831    if (!*line) break;
832
833    c = command->c = *(line++);
834    if (strchr("}:", c) && i) break;
835    if (strchr("aiqQr=", c) && i>1) break;
836
837    // Allocate memory and copy out of toybuf now that we know how big it is
838    command = xmemdup(toybuf, reg-toybuf);
839    reg = (reg-toybuf) + (char *)command;
840
841    // Parse arguments by command type
842    if (c == '{') TT.nextlen++;
843    else if (c == '}') {
844      if (!TT.nextlen--) break;
845    } else if (c == 's') {
846      char *end, delim = 0;
847
848      // s/pattern/replacement/flags
849
850      // line continuations use arg1 (back at the start of the function),
851      // so let's fill out arg2 first (since the regex part can't be multiple
852      // lines) and swap them back later.
853
854      // get pattern (just record, we parse it later)
855      command->arg2 = reg - (char *)command;
856      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
857        goto error;
858
859      reg += sizeof(regex_t);
860      command->arg1 = reg-(char *)command;
861      command->hit = delim;
862resume_s:
863      // get replacement - don't replace escapes yet because \1 and \& need
864      // processing later, after we replace \\ with \ we can't tell \\1 from \1
865      end = line;
866      while (*end != command->hit) {
867        if (!*end) goto error;
868        if (*end++ == '\\') {
869          if (!*end || *end == '\n') {
870            end[-1] = '\n';
871            break;
872          }
873          end++;
874        }
875      }
876
877      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
878      line = end;
879      // line continuation? (note: '\n' can't be a valid delim).
880      if (*line == command->hit) command->hit = 0;
881      else {
882        if (!*line) continue;
883        reg--;
884        line++;
885        goto resume_s;
886      }
887
888      // swap arg1/arg2 so they're back in order arguments occur.
889      i = command->arg1;
890      command->arg1 = command->arg2;
891      command->arg2 = i;
892
893      // get flags
894      for (line++; *line; line++) {
895        long l;
896
897        if (isspace(*line) && *line != '\n') continue;
898
899        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
900        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
901          command->sflags |= l << 3;
902          line--;
903        } else break;
904      }
905
906      // We deferred actually parsing the regex until we had the s///i flag
907      // allocating the space was done by extend_string() above
908      if (!*TT.remember) command->arg1 = 0;
909      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
910        (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
911      free(TT.remember);
912      TT.remember = 0;
913      if (*line == 'w') {
914        line++;
915        goto writenow;
916      }
917    } else if (c == 'w') {
918      int fd, delim;
919      char *cc;
920
921      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
922      // eol status, and to retain the filename for error messages, we'd need
923      // to go up to arg5 just for this. Compromise: dynamically allocate the
924      // filehandle and eol status.
925
926writenow:
927      while (isspace(*line)) line++;
928      if (!*line) goto error;
929      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
930      delim = *cc;
931      *cc = 0;
932      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
933      *cc = delim;
934
935      command->w = reg - (char *)command;
936      command = xrealloc(command, command->w+(cc-line)+6);
937      reg = command->w + (char *)command;
938
939      memcpy(reg, &fd, 4);
940      reg += 4;
941      *(reg++) = 0;
942      memcpy(reg, line, delim);
943      reg += delim;
944      *(reg++) = 0;
945
946      line = cc;
947      if (delim) line += 2;
948    } else if (c == 'y') {
949      char *s, delim = 0;
950      int len;
951
952      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
953      command->arg1 = reg-(char *)command;
954      len = strlen(s);
955      reg = extend_string((void *)&command, s, reg-(char *)command, len);
956      free(s);
957      command->arg2 = reg-(char *)command;
958      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
959      if (len != strlen(s)) goto error;
960      reg = extend_string((void *)&command, s, reg-(char*)command, len);
961      free(s);
962    } else if (strchr("abcirtTqQw:", c)) {
963      int end;
964
965      // trim leading spaces
966      while (isspace(*line) && *line != '\n') line++;
967
968      // Resume logic differs from 's' case because we don't add a newline
969      // unless it's after something, so we add it on return instead.
970resume_a:
971      command->hit = 0;
972
973      // btTqQ: end with space or semicolon, aicrw continue to newline.
974      if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
975        // Argument's optional for btTqQ
976        if (strchr("btTqQ", c)) continue;
977        else if (!command->arg1) break;
978      }
979      // Error checking: qQ can only have digits after them
980      if (c=='q' || c=='Q') {
981        for (i = 0; i<end && isdigit(line[i]); i++);
982        if (i != end) {
983          line += i;
984          break;
985        }
986      }
987
988      // Extend allocation to include new string. We use offsets instead of
989      // pointers so realloc() moving stuff doesn't break things. Ok to write
990      // \n over NUL terminator because call to extend_string() adds it back.
991      if (!command->arg1) command->arg1 = reg - (char*)command;
992      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
993      else if (!pline) {
994        command->arg1 = 0;
995        continue;
996      }
997      reg = extend_string((void *)&command, line, reg - (char *)command, end);
998
999      // Recopy data to remove escape sequences and handle line continuation.
1000      if (strchr("aci", c)) {
1001        reg -= end+1;
1002        for (i = end; i; i--) {
1003          if ((*reg++ = *line++)=='\\') {
1004
1005            // escape at end of line: resume if -e escaped literal newline,
1006            // else request callback and resume with next line
1007            if (!--i) {
1008              *--reg = 0;
1009              if (*line) {
1010                line++;
1011                goto resume_a;
1012              }
1013              command->hit = 256;
1014              break;
1015            }
1016            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1017            line++;
1018          }
1019        }
1020        *reg = 0;
1021      } else line += end;
1022
1023    // Commands that take no arguments
1024    } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
1025  }
1026
1027error:
1028  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1029}
1030
1031void sed_main(void)
1032{
1033  struct arg_list *al;
1034  char **args = toys.optargs;
1035
1036  if (!FLAG(z)) TT.delim = '\n';
1037
1038  // Lie to autoconf when it asks stupid questions, so configure regexes
1039  // that look for "GNU sed version %f" greater than some old buggy number
1040  // don't fail us for not matching their narrow expectations.
1041  if (FLAG(version)) {
1042    xprintf("This is not GNU sed version 9.0\n");
1043    return;
1044  }
1045
1046  // Handling our own --version means we handle our own --help too.
1047  if (FLAG(help)) help_exit(0);
1048
1049  // Parse pattern into commands.
1050
1051  // If no -e or -f, first argument is the pattern.
1052  if (!TT.e && !TT.f) {
1053    if (!*toys.optargs) error_exit("no pattern");
1054    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1055  }
1056
1057  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1058  // so handle all -e, then all -f. (At least the behavior's consistent.)
1059
1060  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1061  parse_pattern(0, 0);
1062  for (al = TT.f; al; al = al->next)
1063    do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1064  dlist_terminate(TT.pattern);
1065  if (TT.nextlen) error_exit("no }");
1066
1067  TT.fdout = 1;
1068  TT.remember = xstrdup("");
1069
1070  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1071  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1072
1073  // Provide EOF flush at end of cumulative input for non-i mode.
1074  if (!FLAG(i)) {
1075    toys.optflags |= FLAG_i;
1076    sed_line(0, 0);
1077  }
1078
1079  // todo: need to close fd when done for TOYBOX_FREE?
1080}
1081