1/* sed.c - stream editor. Thing that does s/// and other stuff. 2 * 3 * Copyright 2014 Rob Landley <rob@landley.net> 4 * 5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html 6 * 7 * TODO: lines > 2G could wrap signed int length counters. Not just getline() 8 * but N and s/// 9 * TODO: make y// handle unicode, unicode delimiters 10 * TODO: handle error return from emit(), error_msg/exit consistently 11 * What's the right thing to do for -i when write fails? Skip to next? 12 * test '//q' with no previous regex, also repeat previous regex? 13 14USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP)) 15 16config SED 17 bool "sed" 18 default y 19 help 20 usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...] 21 22 Stream editor. Apply one or more editing SCRIPTs to each line of input 23 (from FILE or stdin) producing output (by default to stdout). 24 25 -e Add SCRIPT to list 26 -f Add contents of SCRIPT_FILE to list 27 -i Edit each file in place (-iEXT keeps backup file with extension EXT) 28 -n No default output (use the p command to output matched lines) 29 -r Use extended regular expression syntax 30 -E POSIX alias for -r 31 -s Treat input files separately (implied by -i) 32 -z Use \0 rather than \n as the input line separator 33 34 A SCRIPT is a series of one or more COMMANDs separated by newlines or 35 semicolons. All -e SCRIPTs are concatenated together as if separated 36 by newlines, followed by all lines from -f SCRIPT_FILEs, in order. 37 If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT. 38 39 Each COMMAND may be preceded by an address which limits the command to 40 apply only to the specified line(s). Commands without an address apply to 41 every line. 
Addresses are of the form: 42 43 [ADDRESS[,ADDRESS]][!]COMMAND 44 45 The ADDRESS may be a decimal line number (starting at 1), a /regular 46 expression/ within a pair of forward slashes, or the character "$" which 47 matches the last line of input. (In -s or -i mode this matches the last 48 line of each file, otherwise just the last line of the last file.) A single 49 address matches one line, a pair of comma separated addresses match 50 everything from the first address to the second address (inclusive). If 51 both addresses are regular expressions, more than one range of lines in 52 each file can match. The second address can be +N to end N lines later. 53 54 REGULAR EXPRESSIONS in sed are started and ended by the same character 55 (traditionally / but anything except a backslash or a newline works). 56 Backslashes may be used to escape the delimiter if it occurs in the 57 regex, and for the usual printf escapes (\abcefnrtv and octal, hex, 58 and unicode). An empty regex repeats the previous one. ADDRESS regexes 59 (above) require the first delimiter to be escaped with a backslash when 60 it isn't a forward slash (to distinguish it from the COMMANDs below). 61 62 Sed mostly operates on individual lines one at a time. It reads each line, 63 processes it, and either writes it to the output or discards it before 64 reading the next line. Sed can remember one additional line in a separate 65 buffer (using the h, H, g, G, and x commands), and can read the next line 66 of input early (using the n and N command), but other than that command 67 scripts operate on individual lines of text. 68 69 Each COMMAND starts with a single character. The following commands take 70 no arguments: 71 72 ! Run this command when the test _didn't_ match. 73 74 { Start a new command block, continuing until a corresponding "}". 75 Command blocks may nest. If the block has an address, commands within 76 the block are only run for lines within the block's address range. 
77 78 } End command block (this command cannot have an address) 79 80 d Delete this line and move on to the next one 81 (ignores remaining COMMANDs) 82 83 D Delete one line of input and restart command SCRIPT (same as "d" 84 unless you've glued lines together with "N" or similar) 85 86 g Get remembered line (overwriting current line) 87 88 G Get remembered line (appending to current line) 89 90 h Remember this line (overwriting remembered line) 91 92 H Remember this line (appending to remembered line, if any) 93 94 l Print line, escaping \abfrtv (but not newline), octal escaping other 95 nonprintable characters, wrapping lines to terminal width with a 96 backslash, and appending $ to actual end of line. 97 98 n Print default output and read next line, replacing current line 99 (If no next line available, quit processing script) 100 101 N Append next line of input to this line, separated by a newline 102 (This advances the line counter for address matching and "=", if no 103 next line available quit processing script without default output) 104 105 p Print this line 106 107 P Print this line up to first newline (from "N") 108 109 q Quit (print default output, no more commands processed or lines read) 110 111 x Exchange this line with remembered line (overwrite in both directions) 112 113 = Print the current line number (followed by a newline) 114 115 The following commands (may) take an argument. The "text" arguments (to 116 the "a", "b", and "c" commands) may end with an unescaped "\" to append 117 the next line (for which leading whitespace is not skipped), and also 118 treat ";" as a literal character (use "\;" instead). 
119 120 a [text] Append text to output before attempting to read next line 121 122 b [label] Branch, jumps to :label (or with no label, to end of SCRIPT) 123 124 c [text] Delete line, output text at end of matching address range 125 (ignores remaining COMMANDs) 126 127 i [text] Print text 128 129 r [file] Append contents of file to output before attempting to read 130 next line. 131 132 s/S/R/F Search for regex S, replace matched text with R using flags F. 133 The first character after the "s" (anything but newline or 134 backslash) is the delimiter, escape with \ to use normally. 135 136 The replacement text may contain "&" to substitute the matched 137 text (escape it with backslash for a literal &), or \1 through 138 \9 to substitute a parenthetical subexpression in the regex. 139 You can also use the normal backslash escapes such as \n and 140 a backslash at the end of the line appends the next line. 141 142 The flags are: 143 144 [0-9] A number, substitute only that occurrence of pattern 145 g Global, substitute all occurrences of pattern 146 i Ignore case when matching 147 p Print the line if match was found and replaced 148 w [file] Write (append) line to file if match replaced 149 150 t [label] Test, jump to :label only if an "s" command found a match in 151 this line since last test (replacing with same text counts) 152 153 T [label] Test false, jump only if "s" hasn't found a match. 

    w [file]    Write (append) line to file

    y/old/new/  Change each character in 'old' to corresponding character
                in 'new' (with standard backslash escapes, delimiter can be
                any repeated character except \ or \n)

    : [label]   Labeled target for jump commands

    #  Comment, ignore rest of this line of SCRIPT

    Deviations from POSIX: allow extended regular expressions with -r,
    editing in place with -i, separate with -s, NUL-separated input with -z,
    printf escapes in text, line continuations, semicolons after all commands,
    2-address anywhere an address is allowed, "T" command, multiline
    continuations for [abc], \; to end [abc] argument before end of line.
*/

#define FOR_sed
#include "toys.h"

// Command state, accessed as TT.<field> throughout this file.
GLOBALS(
  // -i backup suffix (appended to the file name when non-empty)
  char *i;
  // -f script file names and -e script strings, parsed by sed_main()
  struct arg_list *f, *e;

  // processed pattern list
  struct double_list *pattern;

  // nextline: input line read ahead but not yet processed (sed_line() runs
  // one line behind so it can tell when it is handling the last line).
  // remember: the line saved by h/H and fetched by g/G/x; also borrowed as
  // a scratch pointer for the s/// regex text while parsing.
  char *nextline, *remember;
  // restart: command to resume at on the next line (stored +1 so a resume
  // point of "end of script" is still non-NULL). lastregex: most recently
  // used regex, reused when a command has an empty regex.
  void *restart, *lastregex;
  // Lengths of nextline/remember, and the current input line number.
  // nextlen doubles as the open-{ depth counter while parsing.
  long nextlen, rememberlen, count;
  // Output file descriptor, and whether the last emit() left off a newline.
  int fdout, noeol;
  // Cached wrap width for the "l" command (0 until first use)
  unsigned xx;
  // Input line separator handed to do_lines(): '\n', or 0 with -z
  char delim;
)

// Linked list of parsed sed commands. Offset fields indicate location where
// regex or string starts, ala offset+(char *)struct, because we remalloc()
// these to expand them for multiline inputs, and pointers would have to be
// individually adjusted.
194 195struct sedcmd { 196 struct sedcmd *next, *prev; 197 198 // Begin and end of each match 199 long lmatch[2]; // line number of match 200 int rmatch[2]; // offset of regex struct for prefix matches (/abc/,/def/p) 201 int arg1, arg2, w; // offset of two arguments per command, plus s//w filename 202 unsigned not, hit; 203 unsigned sflags; // s///flag bits: i=1, g=2, p=4 204 char c; // action 205}; 206 207// Write out line with potential embedded NUL, handling eol/noeol 208static int emit(char *line, long len, int eol) 209{ 210 int l, old = line[len]; 211 212 if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1; 213 TT.noeol = !eol; 214 if (eol) line[len++] = '\n'; 215 if (!len) return 0; 216 l = writeall(TT.fdout, line, len); 217 if (eol) line[len-1] = old; 218 if (l != len) { 219 if (TT.fdout != 1) perror_msg("short write"); 220 221 return 1; 222 } 223 224 return 0; 225} 226 227// Extend allocation to include new string, with newline between if newlen<0 228 229static char *extend_string(char **old, char *new, int oldlen, int newlen) 230{ 231 int newline = newlen < 0; 232 char *s; 233 234 if (newline) newlen = -newlen; 235 s = *old = xrealloc(*old, oldlen+newlen+newline+1); 236 if (newline) s[oldlen++] = '\n'; 237 memcpy(s+oldlen, new, newlen); 238 s[oldlen+newlen] = 0; 239 240 return s+oldlen+newlen+1; 241} 242 243// An empty regex repeats the previous one 244static void *get_regex(void *command, int offset) 245{ 246 if (!offset) { 247 if (!TT.lastregex) error_exit("no previous regex"); 248 return TT.lastregex; 249 } 250 251 return TT.lastregex = offset+(char *)command; 252} 253 254// Apply pattern to line from input file 255static void sed_line(char **pline, long plen) 256{ 257 struct append { 258 struct append *next, *prev; 259 int file; 260 char *str; 261 } *append = 0; 262 char *line = TT.nextline; 263 long len = TT.nextlen; 264 struct sedcmd *command; 265 int eol = 0, tea = 0; 266 267 // Ignore EOF for all files before last unless -i 268 if (!pline && 
!FLAG(i)) return; 269 270 // Grab next line for deferred processing (EOF detection: we get a NULL 271 // pline at EOF to flush last line). Note that only end of _last_ input 272 // file matches $ (unless we're doing -i). 273 TT.nextline = 0; 274 TT.nextlen = 0; 275 if (pline) { 276 TT.nextline = *pline; 277 TT.nextlen = plen; 278 *pline = 0; 279 } 280 281 if (!line || !len) return; 282 if (line[len-1] == '\n') line[--len] = eol++; 283 TT.count++; 284 285 // The restart-1 is because we added one to make sure it wasn't NULL, 286 // otherwise N as last command would restart script 287 command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern; 288 TT.restart = 0; 289 290 while (command) { 291 char *str, c = command->c; 292 293 // Have we got a line or regex matching range for this rule? 294 if (*command->lmatch || *command->rmatch) { 295 int miss = 0; 296 long lm; 297 298 // In a match that might end? 299 if (command->hit) { 300 if (!(lm = command->lmatch[1])) { 301 if (!command->rmatch[1]) command->hit = 0; 302 else { 303 void *rm = get_regex(command, command->rmatch[1]); 304 305 // regex match end includes matching line, so defer deactivation 306 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1; 307 } 308 } else if (lm > 0 && lm < TT.count) command->hit = 0; 309 else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0; 310 311 // Start a new match? 312 } else { 313 if (!(lm = *command->lmatch)) { 314 void *rm = get_regex(command, *command->rmatch); 315 316 if (line && !regexec0(rm, line, len, 0, 0, 0)) 317 command->hit = TT.count; 318 } else if (lm == TT.count || (lm == -1 && !pline)) 319 command->hit = TT.count; 320 321 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1; 322 } 323 324 // Didn't match? 
325 lm = !(command->not^!!command->hit); 326 327 // Deferred disable from regex end match 328 if (miss || command->lmatch[1] == TT.count) command->hit = 0; 329 330 if (lm) { 331 // Handle skipping curly bracket command group 332 if (c == '{') { 333 int curly = 1; 334 335 while (curly) { 336 command = command->next; 337 if (command->c == '{') curly++; 338 if (command->c == '}') curly--; 339 } 340 } 341 command = command->next; 342 continue; 343 } 344 } 345 346 // A deleted line can still update line match state for later commands 347 if (!line) { 348 command = command->next; 349 continue; 350 } 351 352 // Process command 353 354 if (c=='a' || c=='r') { 355 struct append *a = xzalloc(sizeof(struct append)); 356 if (command->arg1) a->str = command->arg1+(char *)command; 357 a->file = c=='r'; 358 dlist_add_nomalloc((void *)&append, (void *)a); 359 } else if (c=='b' || c=='t' || c=='T') { 360 int t = tea; 361 362 if (c != 'b') tea = 0; 363 if (c=='b' || t^(c=='T')) { 364 if (!command->arg1) break; 365 str = command->arg1+(char *)command; 366 for (command = (void *)TT.pattern; command; command = command->next) 367 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str)) 368 break; 369 if (!command) error_exit("no :%s", str); 370 } 371 } else if (c=='c') { 372 str = command->arg1+(char *)command; 373 if (!command->hit) emit(str, strlen(str), 1); 374 free(line); 375 line = 0; 376 continue; 377 } else if (c=='d') { 378 free(line); 379 line = 0; 380 continue; 381 } else if (c=='D') { 382 // Delete up to \n or end of buffer 383 str = line; 384 while ((str-line)<len) if (*(str++) == '\n') break; 385 len -= str - line; 386 memmove(line, str, len); 387 388 // if "delete" blanks line, disable further processing 389 // otherwise trim and restart script 390 if (!len) { 391 free(line); 392 line = 0; 393 } else { 394 line[len] = 0; 395 command = (void *)TT.pattern; 396 } 397 continue; 398 } else if (c=='g') { 399 free(line); 400 line = xstrdup(TT.remember); 401 len = 
TT.rememberlen; 402 } else if (c=='G') { 403 line = xrealloc(line, len+TT.rememberlen+2); 404 line[len++] = '\n'; 405 memcpy(line+len, TT.remember, TT.rememberlen); 406 line[len += TT.rememberlen] = 0; 407 } else if (c=='h') { 408 free(TT.remember); 409 TT.remember = xstrdup(line); 410 TT.rememberlen = len; 411 } else if (c=='H') { 412 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2); 413 TT.remember[TT.rememberlen++] = '\n'; 414 memcpy(TT.remember+TT.rememberlen, line, len); 415 TT.remember[TT.rememberlen += len] = 0; 416 } else if (c=='i') { 417 str = command->arg1+(char *)command; 418 emit(str, strlen(str), 1); 419 } else if (c=='l') { 420 int i, x, off; 421 422 if (!TT.xx) { 423 terminal_size(&TT.xx, 0); 424 if (!TT.xx) TT.xx = 80; 425 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10; 426 if (TT.xx > 4) TT.xx -= 4; 427 } 428 429 for (i = off = 0; i<len; i++) { 430 if (off >= TT.xx) { 431 toybuf[off++] = '\\'; 432 emit(toybuf, off, 1); 433 off = 0; 434 } 435 x = stridx("\\\a\b\f\r\t\v", line[i]); 436 if (x != -1) { 437 toybuf[off++] = '\\'; 438 toybuf[off++] = "\\abfrtv"[x]; 439 } else if (line[i] >= ' ') toybuf[off++] = line[i]; 440 else off += sprintf(toybuf+off, "\\%03o", line[i]); 441 } 442 toybuf[off++] = '$'; 443 emit(toybuf, off, 1); 444 } else if (c=='n') { 445 TT.restart = command->next+1; 446 447 break; 448 } else if (c=='N') { 449 // Can't just grab next line because we could have multiple N and 450 // we need to actually read ahead to get N;$p EOF detection right. 451 if (pline) { 452 TT.restart = command->next+1; 453 extend_string(&line, TT.nextline, len, -TT.nextlen); 454 free(TT.nextline); 455 TT.nextline = line; 456 TT.nextlen += len + 1; 457 line = 0; 458 } 459 460 // Pending append goes out right after N 461 goto done; 462 } else if (c=='p' || c=='P') { 463 char *l = (c=='P') ? strchr(line, '\n') : 0; 464 465 if (emit(line, l ? 
l-line : len, eol)) break; 466 } else if (c=='q' || c=='Q') { 467 if (pline) *pline = (void *)1; 468 free(TT.nextline); 469 if (!toys.exitval && command->arg1) 470 toys.exitval = atoi(command->arg1+(char *)command); 471 TT.nextline = 0; 472 TT.nextlen = 0; 473 if (c=='Q') line = 0; 474 475 break; 476 } else if (c=='s') { 477 char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0; 478 regmatch_t *match = (void *)toybuf; 479 regex_t *reg = get_regex(command, command->arg1); 480 int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0, 481 mlen, off, newlen; 482 483 // Loop finding match in remaining line (up to remaining len) 484 while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) { 485 mflags = REG_NOTBOL; 486 487 // Zero length matches don't count immediately after a previous match 488 mlen = match[0].rm_eo-match[0].rm_so; 489 if (!mlen && !zmatch) { 490 if (rline-line == len) break; 491 l2[l2used++] = *rline++; 492 zmatch++; 493 continue; 494 } else zmatch = 0; 495 496 // If we're replacing only a specific match, skip if this isn't it 497 off = command->sflags>>3; 498 if (off && off != ++count) { 499 memcpy(l2+l2used, rline, match[0].rm_eo); 500 l2used += match[0].rm_eo; 501 rline += match[0].rm_eo; 502 503 continue; 504 } 505 // The fact getline() can allocate unbounded amounts of memory is 506 // a bigger issue, but while we're here check for integer overflow 507 if (match[0].rm_eo > INT_MAX) perror_exit(0); 508 509 // newlen = strlen(new) but with \1 and & and printf escapes 510 for (off = newlen = 0; new[off]; off++) { 511 int cc = -1; 512 513 if (new[off] == '&') cc = 0; 514 else if (new[off] == '\\') cc = new[++off] - '0'; 515 if (cc < 0 || cc > 9) { 516 newlen++; 517 continue; 518 } 519 newlen += match[cc].rm_eo-match[cc].rm_so; 520 } 521 522 // Copy changed data to new string 523 524 // Adjust allocation size of new string, copy data we know we'll keep 525 l2l += newlen-mlen; 526 if ((l2l|0xfff) > l2old) l2 = 
xrealloc(l2, l2old = (l2l|0xfff)+1); 527 if (match[0].rm_so) { 528 memcpy(l2+l2used, rline, match[0].rm_so); 529 l2used += match[0].rm_so; 530 } 531 532 // copy in new replacement text 533 for (off = mlen = 0; new[off]; off++) { 534 int cc = 0, ll; 535 536 if (new[off] == '\\') { 537 cc = new[++off] - '0'; 538 if (cc<0 || cc>9) { 539 if (!(l2[l2used+mlen++] = unescape(new[off]))) 540 l2[l2used+mlen-1] = new[off]; 541 542 continue; 543 } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc); 544 } else if (new[off] != '&') { 545 l2[l2used+mlen++] = new[off]; 546 547 continue; 548 } 549 550 if (match[cc].rm_so != -1) { 551 ll = match[cc].rm_eo-match[cc].rm_so; 552 memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll); 553 mlen += ll; 554 } 555 } 556 l2used += newlen; 557 rline += match[0].rm_eo; 558 559 // Stop after first substitution unless we have flag g 560 if (!(command->sflags & 2)) break; 561 } 562 563 // If we made any changes, finish off l2 and swap it for line 564 if (l2) { 565 // grab trailing unmatched data and null terminator, swap with original 566 mlen = len-(rline-line); 567 memcpy(l2+l2used, rline, mlen+1); 568 len = l2used + mlen; 569 free(line); 570 line = l2; 571 } 572 573 if (mflags) { 574 // flag p 575 if (command->sflags & 4) emit(line, len, eol); 576 577 tea = 1; 578 if (command->w) goto writenow; 579 } 580 } else if (c=='w') { 581 int fd, noeol; 582 char *name; 583 584writenow: 585 // Swap out emit() context 586 fd = TT.fdout; 587 noeol = TT.noeol; 588 589 // We save filehandle and newline status before filename 590 name = command->w + (char *)command; 591 memcpy(&TT.fdout, name, 4); 592 name += 4; 593 TT.noeol = *(name++); 594 595 // write, then save/restore context 596 if (emit(line, len, eol)) 597 perror_exit("w '%s'", command->arg1+(char *)command); 598 *(--name) = TT.noeol; 599 TT.noeol = noeol; 600 TT.fdout = fd; 601 } else if (c=='x') { 602 long swap = TT.rememberlen; 603 604 str = TT.remember; 605 TT.remember = line; 606 line = str; 
607 TT.rememberlen = len; 608 len = swap; 609 } else if (c=='y') { 610 char *from, *to = (char *)command; 611 int i, j; 612 613 from = to+command->arg1; 614 to += command->arg2; 615 616 for (i = 0; i < len; i++) { 617 j = stridx(from, line[i]); 618 if (j != -1) line[i] = to[j]; 619 } 620 } else if (c=='=') { 621 sprintf(toybuf, "%ld", TT.count); 622 if (emit(toybuf, strlen(toybuf), 1)) break; 623 } 624 625 command = command->next; 626 } 627 628 if (line && !FLAG(n)) emit(line, len, eol); 629 630done: 631 if (dlist_terminate(append)) while (append) { 632 struct append *a = append->next; 633 634 if (append->file) { 635 int fd = open(append->str, O_RDONLY); 636 637 // Force newline if noeol pending 638 if (fd != -1) { 639 if (TT.noeol) xwrite(TT.fdout, "\n", 1); 640 TT.noeol = 0; 641 xsendfile(fd, TT.fdout); 642 close(fd); 643 } 644 } else if (append->str) emit(append->str, strlen(append->str), 1); 645 else emit(line, 0, 0); 646 free(append); 647 append = a; 648 } 649 free(line); 650} 651 652// Callback called on each input file 653static void do_sed_file(int fd, char *name) 654{ 655 char *tmp; 656 657 if (FLAG(i)) { 658 struct sedcmd *command; 659 660 if (!fd) return error_msg("-i on stdin"); 661 TT.fdout = copy_tempfile(fd, name, &tmp); 662 TT.count = 0; 663 for (command = (void *)TT.pattern; command; command = command->next) 664 command->hit = 0; 665 } 666 do_lines(fd, TT.delim, sed_line); 667 if (FLAG(i)) { 668 if (TT.i && *TT.i) { 669 char *s = xmprintf("%s%s", name, TT.i); 670 671 xrename(name, s); 672 free(s); 673 } 674 replace_tempfile(-1, TT.fdout, &tmp); 675 TT.fdout = 1; 676 TT.nextline = 0; 677 TT.nextlen = TT.noeol = 0; 678 } 679} 680 681// Copy chunk of string between two delimiters, converting printf escapes. 682// returns processed copy of string (0 if error), *pstr advances to next 683// unused char. 
// If delim (or *delim) is 0, uses/saves starting char as delimiter.
// If regex, ignore delimiter in [ranges].
//
// pstr: in/out cursor into the script text. Only advanced on success.
// delim: NULL, or pointer to the delimiter to use (0 = take it from *pstr
//        and store it back through the pointer).
// Returns a newly allocated string the caller must free(), or 0 on error
// (missing/invalid delimiter, unterminated string, trailing backslash).
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  // Output can never be longer than the input, so one allocation is enough
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside a bracket expression, ']' inside one, or one of
  // '.', '=', ':' inside a [.x.] [=x=] [:x:] element within the brackets.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      } else if (mode == ']' && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  // Don't leak the partially filled copy when the input is malformed
  free(out);

  return 0;
}

// Translate pattern strings into command structures. Each command structure
// is a single allocation (which requires some math and remalloc at times).
748static void parse_pattern(char **pline, long len) 749{ 750 struct sedcmd *command = (void *)TT.pattern; 751 char *line, *reg, c, *errstart; 752 int i; 753 754 line = errstart = pline ? *pline : ""; 755 if (len && line[len-1]=='\n') line[--len] = 0; 756 757 // Append this line to previous multiline command? (hit indicates type.) 758 // During parsing "hit" stores data about line continuations, but in 759 // sed_line() it means the match range attached to this command 760 // is active, so processing the continuation must zero it again. 761 if (command && command->prev->hit) { 762 // Remove half-finished entry from list so remalloc() doesn't confuse it 763 TT.pattern = TT.pattern->prev; 764 command = dlist_pop(&TT.pattern); 765 c = command->c; 766 reg = (char *)command; 767 reg += command->arg1 + strlen(reg + command->arg1); 768 769 // Resume parsing for 'a' or 's' command. (Only two that can do this.) 770 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be 771 // a unicode character. 772 if (command->hit < 256) goto resume_s; 773 else goto resume_a; 774 } 775 776 // Loop through commands in this line. 777 778 command = 0; 779 for (;;) { 780 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command); 781 782 // If there's no more data on this line, return. 783 for (;;) { 784 while (isspace(*line) || *line == ';') line++; 785 if (*line == '#') while (*line && *line != '\n') line++; 786 else break; 787 } 788 if (!*line) return; 789 790 // Start by writing data into toybuf. 
791 792 errstart = line; 793 memset(toybuf, 0, sizeof(struct sedcmd)); 794 command = (void *)toybuf; 795 reg = toybuf + sizeof(struct sedcmd); 796 797 // Parse address range (if any) 798 for (i = 0; i < 2; i++) { 799 if (*line == ',') line++; 800 else if (i) break; 801 802 if (i && *line == '+' && isdigit(line[1])) { 803 line++; 804 command->lmatch[i] = -2-strtol(line, &line, 0); 805 } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0); 806 else if (*line == '$') { 807 command->lmatch[i] = -1; 808 line++; 809 } else if (*line == '/' || *line == '\\') { 810 char *s = line; 811 812 if (!(s = unescape_delimited_string(&line, 0))) goto error; 813 if (!*s) command->rmatch[i] = 0; 814 else { 815 xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r)); 816 command->rmatch[i] = reg-toybuf; 817 reg += sizeof(regex_t); 818 } 819 free(s); 820 } else break; 821 } 822 823 while (isspace(*line)) line++; 824 if (!*line) break; 825 826 if (*line == '!') { 827 command->not = 1; 828 line++; 829 } 830 while (isspace(*line)) line++; 831 if (!*line) break; 832 833 c = command->c = *(line++); 834 if (strchr("}:", c) && i) break; 835 if (strchr("aiqQr=", c) && i>1) break; 836 837 // Allocate memory and copy out of toybuf now that we know how big it is 838 command = xmemdup(toybuf, reg-toybuf); 839 reg = (reg-toybuf) + (char *)command; 840 841 // Parse arguments by command type 842 if (c == '{') TT.nextlen++; 843 else if (c == '}') { 844 if (!TT.nextlen--) break; 845 } else if (c == 's') { 846 char *end, delim = 0; 847 848 // s/pattern/replacement/flags 849 850 // line continuations use arg1 (back at the start of the function), 851 // so let's fill out arg2 first (since the regex part can't be multiple 852 // lines) and swap them back later. 
853 854 // get pattern (just record, we parse it later) 855 command->arg2 = reg - (char *)command; 856 if (!(TT.remember = unescape_delimited_string(&line, &delim))) 857 goto error; 858 859 reg += sizeof(regex_t); 860 command->arg1 = reg-(char *)command; 861 command->hit = delim; 862resume_s: 863 // get replacement - don't replace escapes yet because \1 and \& need 864 // processing later, after we replace \\ with \ we can't tell \\1 from \1 865 end = line; 866 while (*end != command->hit) { 867 if (!*end) goto error; 868 if (*end++ == '\\') { 869 if (!*end || *end == '\n') { 870 end[-1] = '\n'; 871 break; 872 } 873 end++; 874 } 875 } 876 877 reg = extend_string((void *)&command, line, reg-(char *)command,end-line); 878 line = end; 879 // line continuation? (note: '\n' can't be a valid delim). 880 if (*line == command->hit) command->hit = 0; 881 else { 882 if (!*line) continue; 883 reg--; 884 line++; 885 goto resume_s; 886 } 887 888 // swap arg1/arg2 so they're back in order arguments occur. 
889 i = command->arg1; 890 command->arg1 = command->arg2; 891 command->arg2 = i; 892 893 // get flags 894 for (line++; *line; line++) { 895 long l; 896 897 if (isspace(*line) && *line != '\n') continue; 898 899 if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l; 900 else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) { 901 command->sflags |= l << 3; 902 line--; 903 } else break; 904 } 905 906 // We deferred actually parsing the regex until we had the s///i flag 907 // allocating the space was done by extend_string() above 908 if (!*TT.remember) command->arg1 = 0; 909 else xregcomp((void *)(command->arg1 + (char *)command), TT.remember, 910 (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE)); 911 free(TT.remember); 912 TT.remember = 0; 913 if (*line == 'w') { 914 line++; 915 goto writenow; 916 } 917 } else if (c == 'w') { 918 int fd, delim; 919 char *cc; 920 921 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and 922 // eol status, and to retain the filename for error messages, we'd need 923 // to go up to arg5 just for this. Compromise: dynamically allocate the 924 // filehandle and eol status. 
925 926writenow: 927 while (isspace(*line)) line++; 928 if (!*line) goto error; 929 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break; 930 delim = *cc; 931 *cc = 0; 932 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644); 933 *cc = delim; 934 935 command->w = reg - (char *)command; 936 command = xrealloc(command, command->w+(cc-line)+6); 937 reg = command->w + (char *)command; 938 939 memcpy(reg, &fd, 4); 940 reg += 4; 941 *(reg++) = 0; 942 memcpy(reg, line, delim); 943 reg += delim; 944 *(reg++) = 0; 945 946 line = cc; 947 if (delim) line += 2; 948 } else if (c == 'y') { 949 char *s, delim = 0; 950 int len; 951 952 if (!(s = unescape_delimited_string(&line, &delim))) goto error; 953 command->arg1 = reg-(char *)command; 954 len = strlen(s); 955 reg = extend_string((void *)&command, s, reg-(char *)command, len); 956 free(s); 957 command->arg2 = reg-(char *)command; 958 if (!(s = unescape_delimited_string(&line, &delim))) goto error; 959 if (len != strlen(s)) goto error; 960 reg = extend_string((void *)&command, s, reg-(char*)command, len); 961 free(s); 962 } else if (strchr("abcirtTqQw:", c)) { 963 int end; 964 965 // trim leading spaces 966 while (isspace(*line) && *line != '\n') line++; 967 968 // Resume logic differs from 's' case because we don't add a newline 969 // unless it's after something, so we add it on return instead. 970resume_a: 971 command->hit = 0; 972 973 // btTqQ: end with space or semicolon, aicrw continue to newline. 974 if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){ 975 // Argument's optional for btTqQ 976 if (strchr("btTqQ", c)) continue; 977 else if (!command->arg1) break; 978 } 979 // Error checking: qQ can only have digits after them 980 if (c=='q' || c=='Q') { 981 for (i = 0; i<end && isdigit(line[i]); i++); 982 if (i != end) { 983 line += i; 984 break; 985 } 986 } 987 988 // Extend allocation to include new string. 
We use offsets instead of 989 // pointers so realloc() moving stuff doesn't break things. Ok to write 990 // \n over NUL terminator because call to extend_string() adds it back. 991 if (!command->arg1) command->arg1 = reg - (char*)command; 992 else if (*(command->arg1+(char *)command)) *(reg++) = '\n'; 993 else if (!pline) { 994 command->arg1 = 0; 995 continue; 996 } 997 reg = extend_string((void *)&command, line, reg - (char *)command, end); 998 999 // Recopy data to remove escape sequences and handle line continuation. 1000 if (strchr("aci", c)) { 1001 reg -= end+1; 1002 for (i = end; i; i--) { 1003 if ((*reg++ = *line++)=='\\') { 1004 1005 // escape at end of line: resume if -e escaped literal newline, 1006 // else request callback and resume with next line 1007 if (!--i) { 1008 *--reg = 0; 1009 if (*line) { 1010 line++; 1011 goto resume_a; 1012 } 1013 command->hit = 256; 1014 break; 1015 } 1016 if (!(reg[-1] = unescape(*line))) reg[-1] = *line; 1017 line++; 1018 } 1019 } 1020 *reg = 0; 1021 } else line += end; 1022 1023 // Commands that take no arguments 1024 } else if (!strchr("{dDgGhHlnNpPx=", c)) break; 1025 } 1026 1027error: 1028 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line); 1029} 1030 1031void sed_main(void) 1032{ 1033 struct arg_list *al; 1034 char **args = toys.optargs; 1035 1036 if (!FLAG(z)) TT.delim = '\n'; 1037 1038 // Lie to autoconf when it asks stupid questions, so configure regexes 1039 // that look for "GNU sed version %f" greater than some old buggy number 1040 // don't fail us for not matching their narrow expectations. 1041 if (FLAG(version)) { 1042 xprintf("This is not GNU sed version 9.0\n"); 1043 return; 1044 } 1045 1046 // Handling our own --version means we handle our own --help too. 1047 if (FLAG(help)) help_exit(0); 1048 1049 // Parse pattern into commands. 1050 1051 // If no -e or -f, first argument is the pattern. 
1052 if (!TT.e && !TT.f) { 1053 if (!*toys.optargs) error_exit("no pattern"); 1054 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++); 1055 } 1056 1057 // Option parsing infrastructure can't interlace "-e blah -f blah -e blah" 1058 // so handle all -e, then all -f. (At least the behavior's consistent.) 1059 1060 for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg)); 1061 parse_pattern(0, 0); 1062 for (al = TT.f; al; al = al->next) 1063 do_lines(xopenro(al->arg), TT.delim, parse_pattern); 1064 dlist_terminate(TT.pattern); 1065 if (TT.nextlen) error_exit("no }"); 1066 1067 TT.fdout = 1; 1068 TT.remember = xstrdup(""); 1069 1070 // Inflict pattern upon input files. Long version because !O_CLOEXEC 1071 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file); 1072 1073 // Provide EOF flush at end of cumulative input for non-i mode. 1074 if (!FLAG(i)) { 1075 toys.optflags |= FLAG_i; 1076 sed_line(0, 0); 1077 } 1078 1079 // todo: need to close fd when done for TOYBOX_FREE? 1080} 1081