1/* awk.c - An awk implementation. 2 * vi: tabstop=2 softtabstop=2 shiftwidth=2 3 * 4 * Copyright 2024 Ray Gardner <raygard@gmail.com> 5 * 6 * See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html 7 8USE_AWK(NEWTOY(awk, "F:v*f*bc", TOYFLAG_USR|TOYFLAG_BIN)) 9 10config AWK 11 bool "awk" 12 default n 13 help 14 usage: awk [-F sepstring] [-v assignment]... program [argument...] 15 or: 16 awk [-F sepstring] -f progfile [-f progfile]... [-v assignment]... 17 [argument...] 18 also: 19 -b : use bytes, not characters 20 -c : compile only, do not run 21*/ 22 23#define FOR_awk 24#include "toys.h" 25 26GLOBALS( 27 struct arg_list *f; 28 struct arg_list *v; 29 char *F; 30 31 struct scanner_state { 32 char *p; 33 char *progstring; 34 struct arg_list *prog_args; 35 char *filename; 36 char *line; 37 size_t line_size; 38 ssize_t line_len; 39 int line_num; 40 int ch; 41 FILE *fp; 42 // state includes latest token seen 43 int tok; 44 int tokbuiltin; 45 int toktype; 46 char *tokstr; 47 size_t maxtok; 48 size_t toklen; 49 double numval; 50 int error; // Set if lexical error. 51 } *scs; 52 char *tokstr; 53 int prevtok; 54 55 struct compiler_globals { 56 int in_print_stmt; 57 int paren_level; 58 int in_function_body; 59 int funcnum; 60 int nparms; 61 int compile_error_count; 62 int first_begin; 63 int last_begin; 64 int first_end; 65 int last_end; 66 int first_recrule; 67 int last_recrule; 68 int break_dest; 69 int continue_dest; 70 int stack_offset_to_fix; // fixup stack if return in for(e in a) 71 int range_pattern_num; 72 int rule_type; // tkbegin, tkend, or 0 73 } cgl; 74 75 // zvalue: the main awk value type 76 // Can be number or string or both, or else map (array) or regex 77 struct zvalue { 78 unsigned flags; 79 double num; 80 union { // anonymous union not in C99; not going to fix it now. 
81 struct zstring *vst; 82 struct zmap *map; 83 regex_t *rx; 84 }; 85 } nozvalue; // to shut up compiler warning TODO FIXME 86 87 struct runtime_globals { 88 struct zvalue cur_arg; 89 //char *filename; // UNUSED 90 FILE *fp; // current data file 91 int narg; // cmdline arg index 92 int nfiles; // num of cmdline data file args processed 93 int eof; // all cmdline files (incl. stdin) read 94 char *recptr; 95 char *recbuf; 96 size_t recbufsize; 97 char *recbuf_multx; 98 size_t recbufsize_multx; 99 struct zstring *zspr; // Global to receive sprintf() string value 100 } rgl; 101 102 // Expanding sequential list 103 struct zlist { 104 char *base, *limit, *avail; 105 size_t size; 106 } globals_table, // global symbol table 107 locals_table, // local symbol table 108 func_def_table; // function symbol table 109 // runtime lists 110 struct zlist literals, fields, zcode, stack; 111 112 char *progname; 113 114 int spec_var_limit; 115 int zcode_last; 116 struct zvalue *stackp; // top of stack ptr 117 118 char *pbuf; // Used for number formatting in num_to_zstring() 119#define RS_MAX 64 120 char rs_last[RS_MAX]; 121 regex_t rx_rs_default, rx_rs_last; 122 regex_t rx_default, rx_last, rx_printf_fmt; 123#define FS_MAX 64 124 char fs_last[FS_MAX]; 125 char one_char_fs[4]; 126 int nf_internal; // should match NF 127 char range_sw[64]; // FIXME TODO quick and dirty set of range switches 128 int file_cnt, std_file_cnt; 129 130 struct zfile { 131 struct zfile *next; 132 char *fn; 133 FILE *fp; 134 char mode; // w, a, or r 135 char file_or_pipe; // f or p 136 char is_std_file; 137 char *recbuf; 138 size_t recbufsize; 139 char *recbuf_multi; 140 size_t recbufsize_multi; 141 char *recbuf_multx; 142 size_t recbufsize_multx; 143 int recoffs, endoffs; 144 } *zfiles, *cfile, *zstdout; 145) 146 147#ifdef __GNUC__ 148#define ATTR_FALLTHROUGH_INTENDED __attribute__ ((fallthrough)) 149#else 150#define ATTR_FALLTHROUGH_INTENDED 151#endif 152 153//////////////////// 154//// declarations 
////////////////////

#define PBUFSIZE  512 // For num_to_zstring()

// Token classes stored in scanner_state.toktype.
enum toktypes {
    // EOF (use -1 from stdio.h)
    ERROR = 2, NEWLINE, VAR, NUMBER, STRING, REGEX, USERFUNC, BUILTIN, TOKEN,
    KEYWORD
  };

// Token numbers stored in scanner_state.tok.
// Must align with lbp_table[]
enum tokens {
    tkunusedtoken, tkeof, tkerr, tknl,
    tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,

// Operators, in the same order as the ops[] lookup string (3 columns each):
// static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -  .. "
//        "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";
    tksemi, tkcomma, tklbracket, tkrbracket, tklparen, tkrparen, tklbrace,
    tkrbrace, tkfield, tkincr, tkdecr, tkpow, tknot, tkmul, tkdiv, tkmod,
    tkplus, tkminus,
    tkcat, // !!! Fake operator for concatenation (just adjacent string exprs)
           // It occupies the ".." slot of ops[].
    tklt, tkle, tkne, tkeq, tkgt, tkge, tkmatchop, tknotmatch, tkand, tkor,
    tkternif, tkternelse, tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, tkappend, tkpipe,

// Keywords, in the same order as the keywords[] lookup string (10 columns each):
// static char *keywords = " in        BEGIN     END       if        else      "
//    "while     for       do        break     continue  exit      function  "
//    "return    next      nextfile  delete    print     printf    getline   ";
    tkin, tkbegin, tkend, tkif, tkelse,
    tkwhile, tkfor, tkdo, tkbreak, tkcontinue, tkexit, tkfunction,
    tkreturn, tknext, tknextfile, tkdelete, tkprint, tkprintf, tkgetline,

// Builtins, in the same order as the builtins[] lookup string:
// static char *builtins = " atan2     cos       sin       exp       "
//    "log       sqrt      int       rand      srand     length    "
//    "tolower   toupper   system    fflush    "
//    "and       or        xor       lshift    rshift    ";
    tkatan2, tkcos, tksin, tkexp, tklog, tksqrt, tkint, tkrand, tksrand,
    tklength, tktolower, tktoupper, tksystem, tkfflush,
    tkband, tkbor, tkbxor, tklshift, tkrshift,

// static char *specialfuncs = " close     index     match     split     "
//    "sub       gsub      sprintf   substr    ";
    tkclose, tkindex, tkmatch, tksplit,
    tksub, tkgsub, tksprintf, tksubstr, tklasttk
  };

// Opcodes are numbered after the tokens so the two ranges do not overlap.
enum opcodes {
    opunusedop = tklasttk,
    opvarref, opmapref, opfldref, oppush, opdrop, opdrop_n, opnotnot,
    oppreincr, oppredecr, oppostincr, oppostdecr, opnegate, opjump, opjumptrue,
    opjumpfalse, opprepcall, opmap, opmapiternext, opmapdelete, opmatchrec,
    opquit, opprintrec, oprange1, oprange2, oprange3, oplastop
};

// Special variables (POSIX). Must align with char *spec_vars[]
enum spec_var_names { ARGC=1, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
    NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP };

struct symtab_slot {    // global symbol table entry
  unsigned flags;
  int slotnum;
  char *name;
};

// zstring: flexible string type.
// Capacity must be > size because we insert a NUL byte.
struct zstring {
  int refcnt;           // number of additional references; 0 means one owner
  unsigned size;        // bytes in str, not counting the NUL
  unsigned capacity;    // bytes allocated for str
  char str[];   // C99 flexible array member
};

// Flag bits for zvalue and symbol tables
#define ZF_MAYBEMAP (1u << 1)
#define ZF_MAP      (1u << 2)
#define ZF_SCALAR   (1u << 3)
#define ZF_NUM      (1u << 4)
#define ZF_RX       (1u << 5)
#define ZF_STR      (1u << 6)
#define ZF_NUMSTR   (1u << 7)   // "numeric string" per posix
#define ZF_REF      (1u << 9)   // for lvalues
#define ZF_MAPREF   (1u << 10)  // for lvalues
#define ZF_FIELDREF (1u << 11)  // for lvalues
#define ZF_EMPTY_RX (1u << 12)
#define ZF_ANYMAP   (ZF_MAP | ZF_MAYBEMAP)

// Macro to help facilitate possible future change in zvalue layout.
243#define ZVINIT(flags, num, ptr) {(flags), (double)(num), {(ptr)}} 244 245#define IS_STR(zvalp) ((zvalp)->flags & ZF_STR) 246#define IS_RX(zvalp) ((zvalp)->flags & ZF_RX) 247#define IS_NUM(zvalp) ((zvalp)->flags & ZF_NUM) 248#define IS_MAP(zvalp) ((zvalp)->flags & ZF_MAP) 249#define IS_EMPTY_RX(zvalp) ((zvalp)->flags & ZF_EMPTY_RX) 250 251#define GLOBAL ((struct symtab_slot *)TT.globals_table.base) 252#define LOCAL ((struct symtab_slot *)TT.locals_table.base) 253#define FUNC_DEF ((struct functab_slot *)TT.func_def_table.base) 254 255#define LITERAL ((struct zvalue *)TT.literals.base) 256#define STACK ((struct zvalue *)TT.stack.base) 257#define FIELD ((struct zvalue *)TT.fields.base) 258 259#define ZCODE ((int *)TT.zcode.base) 260 261#define FUNC_DEFINED (1u) 262#define FUNC_CALLED (2u) 263 264#define MIN_STACK_LEFT 1024 265 266struct functab_slot { // function symbol table entry 267 unsigned flags; 268 int slotnum; 269 char *name; 270 struct zlist function_locals; 271 int zcode_addr; 272}; 273 274// Elements of the hash table (key/value pairs) 275struct zmap_slot { 276 int hash; // store hash key to speed hash table expansion 277 struct zstring *key; 278 struct zvalue val; 279}; 280#define ZMSLOTINIT(hash, key, val) {hash, key, val} 281 282// zmap: Mapping data type for arrays; a hash table. Values in hash are either 283// 0 (unused), -1 (marked deleted), or one plus the number of the zmap slot 284// containing a key/value pair. The zlist slot entries are numbered from 0 to 285// count-1, so need to add one to distinguish from unused. The probe sequence 286// is borrowed from Python dict, using the "perturb" idea to mix in upper bits 287// of the original hash value. 
288struct zmap { 289 unsigned mask; // tablesize - 1; tablesize is 2 ** n 290 int *hash; // (mask + 1) elements 291 int limit; // 80% of table size ((mask+1)*8/10) 292 int count; // number of occupied slots in hash 293 int deleted; // number of deleted slots 294 struct zlist slot; // expanding list of zmap_slot elements 295}; 296 297#define MAPSLOT ((struct zmap_slot *)(m->slot).base) 298#define FFATAL(format, ...) zzerr("$" format, __VA_ARGS__) 299#define FATAL(...) zzerr("$%s\n", __VA_ARGS__) 300#define XERR(format, ...) zzerr(format, __VA_ARGS__) 301 302#define NO_EXIT_STATUS (9999987) // value unlikely to appear in exit stmt 303 304ssize_t getline(char **lineptr, size_t *n, FILE *stream); 305ssize_t getdelim(char ** restrict lineptr, size_t * restrict n, int delimiter, FILE *stream); 306 307 308 309//////////////////// 310//// lib 311//////////////////// 312 313static void xfree(void *p) 314{ 315 free(p); 316} 317 318static int hexval(int c) 319{ 320 // Assumes c is valid hex digit 321 return isdigit(c) ? c - '0' : (c | 040) - 'a' + 10; 322} 323 324//////////////////// 325//// common defs 326//////////////////// 327 328// These (ops, keywords, builtins) must align with enum tokens 329static char *ops = " ; , [ ] ( ) { } $ ++ -- ^ ! * / % + - .. " 330 "< <= != == > >= ~ !~ && || ? : ^= %= *= /= += -= = >> | "; 331 332static char *keywords = " in BEGIN END if else " 333 "while for do break continue exit function " 334 "return next nextfile delete print printf getline "; 335 336static char *builtins = " atan2 cos sin exp log " 337 "sqrt int rand srand length " 338 "tolower toupper system fflush " 339 "and or xor lshift rshift " 340 "close index match split " 341 "sub gsub sprintf substr "; 342 343static void zzerr(char *format, ...) 
344{ 345 va_list args; 346 int fatal_sw = 0; 347 fprintf(stderr, "%s: ", TT.progname); 348 if (format[0] == '$') { 349 fprintf(stderr, "FATAL: "); 350 format++; 351 fatal_sw = 1; 352 } 353 fprintf(stderr, "file %s line %d: ", TT.scs->filename, TT.scs->line_num); 354 va_start(args, format); 355 vfprintf(stderr, format, args); 356 va_end(args); 357 if (format[strlen(format)-1] != '\n') fputc('\n', stderr); // TEMP FIXME !!! 358 fflush(stderr); 359 if (fatal_sw) exit(2); 360 // Don't bump error count for warnings 361 else if (!strstr(format, "arning")) TT.cgl.compile_error_count++; 362} 363 364static void get_token_text(char *op, int tk) 365{ 366 // This MUST ? be changed if ops string or tk... assignments change! 367 memmove(op, ops + 3 * (tk - tksemi) + 1, 2); 368 op[ op[1] == ' ' ? 1 : 2 ] = 0; 369} 370 371//////////////////// 372/// UTF-8 373//////////////////// 374 375// Return number of bytes in 'cnt' utf8 codepoints 376static int bytesinutf8(char *str, size_t len, size_t cnt) 377{ 378 if (FLAG(b)) return cnt; 379 unsigned wch; 380 char *lim = str + len, *s0 = str; 381 while (cnt-- && str < lim) { 382 int r = utf8towc(&wch, str, lim - str); 383 str += r > 0 ? r : 1; 384 } 385 return str - s0; 386} 387 388// Return number of utf8 codepoints in str 389static int utf8cnt(char *str, size_t len) 390{ 391 unsigned wch; 392 int cnt = 0; 393 char *lim; 394 if (!len || FLAG(b)) return len; 395 for (lim = str + len; str < lim; cnt++) { 396 int r = utf8towc(&wch, str, lim - str); 397 str += r > 0 ? 
r : 1; 398 } 399 return cnt; 400} 401 402//////////////////// 403//// zlist 404//////////////////// 405 406static struct zlist *zlist_initx(struct zlist *p, size_t size, size_t count) 407{ 408 p->base = p->avail = xzalloc(count * size); 409 p->limit = p->base + size * count; 410 p->size = size; 411 return p; 412} 413 414static struct zlist *zlist_init(struct zlist *p, size_t size) 415{ 416#define SLIST_MAX_INIT_BYTES 128 417 return zlist_initx(p, size, SLIST_MAX_INIT_BYTES / size); 418} 419 420// This is called from zlist_append() and add_stack() in run 421static void zlist_expand(struct zlist *p) 422{ 423 size_t offset = p->avail - p->base; 424 size_t cap = p->limit - p->base; 425 size_t newcap = maxof(cap + p->size, ((cap / p->size) * 3 / 2) * p->size); 426 if (newcap <= cap) error_exit("mem req error"); 427 char *base = xrealloc(p->base, newcap); 428 p->base = base; 429 p->limit = base + newcap; 430 p->avail = base + offset; 431} 432 433static size_t zlist_append(struct zlist *p, void *obj) 434{ 435 // Insert obj (p->size bytes) at end of list, expand as needed. 436 // Return scaled offset to newly inserted obj; i.e. the 437 // "slot number" 0, 1, 2,... 438 void *objtemp = 0; 439 if (p->avail > p->limit - p->size) { 440 objtemp = xmalloc(p->size); // Copy obj in case it is in 441 memmove(objtemp, obj, p->size); // the area realloc might free! 
442 obj = objtemp; 443 zlist_expand(p); 444 } 445 memmove(p->avail, obj, p->size); 446 if (objtemp) xfree(objtemp); 447 p->avail += p->size; 448 return (p->avail - p->base - p->size) / p->size; // offset of updated slot 449} 450 451static int zlist_len(struct zlist *p) 452{ 453 return (p->avail - p->base) / p->size; 454} 455 456//////////////////// 457//// zstring 458//////////////////// 459 460static void zstring_release(struct zstring **s) 461{ 462 if (*s && (**s).refcnt-- == 0) xfree(*s); //free_zstring(s); 463 *s = 0; 464} 465 466static void zstring_incr_refcnt(struct zstring *s) 467{ 468 if (s) s->refcnt++; 469} 470 471// !! Use only if 'to' is NULL or its refcnt is 0. 472static struct zstring *zstring_modify(struct zstring *to, size_t at, char *s, size_t n) 473{ 474 size_t cap = at + n + 1; 475 if (!to || to->capacity < cap) { 476 to = xrealloc(to, sizeof(*to) + cap); 477 to->capacity = cap; 478 to->refcnt = 0; 479 } 480 memcpy(to->str + at, s, n); 481 to->size = at + n; 482 to->str[to->size] = '\0'; 483 return to; 484} 485 486// The 'to' pointer may move by realloc, so return (maybe updated) pointer. 487// If refcnt is nonzero then there is another pointer to this zstring, 488// so copy this one and release it. If refcnt is zero we can mutate this. 
489static struct zstring *zstring_update(struct zstring *to, size_t at, char *s, size_t n) 490{ 491 if (to && to->refcnt) { 492 struct zstring *to_before = to; 493 to = zstring_modify(0, 0, to->str, to->size); 494 zstring_release(&to_before); 495 } 496 return zstring_modify(to, at, s, n); 497} 498 499static struct zstring *zstring_copy(struct zstring *to, struct zstring *from) 500{ 501 return zstring_update(to, 0, from->str, from->size); 502} 503 504static struct zstring *zstring_extend(struct zstring *to, struct zstring *from) 505{ 506 return zstring_update(to, to->size, from->str, from->size); 507} 508 509static struct zstring *new_zstring(char *s, size_t size) 510{ 511 return zstring_modify(0, 0, s, size); 512} 513 514//////////////////// 515//// zvalue 516//////////////////// 517 518static struct zvalue uninit_zvalue = ZVINIT(0, 0.0, 0); 519 520// This will be reassigned in init_globals() with an empty string. 521// It's a special value used for "uninitialized" field vars 522// referenced past $NF. See push_field(). 523static struct zvalue uninit_string_zvalue = ZVINIT(0, 0.0, 0); 524 525static struct zvalue new_str_val(char *s) 526{ 527 // Only if no nul inside string! 528 struct zvalue v = ZVINIT(ZF_STR, 0.0, new_zstring(s, strlen(s))); 529 return v; 530} 531 532static void zvalue_release_zstring(struct zvalue *v) 533{ 534 if (v && ! (v->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&v->vst); 535} 536 537// push_val() is used for initializing globals (see init_compiler()) 538// but mostly used in runtime 539// WARNING: push_val may change location of v, so do NOT depend on it after! 540// Note the incr refcnt used to be after the zlist_append, but that caused a 541// heap-use-after-free error when the zlist_append relocated the zvalue being 542// pushed, invalidating the v pointer. 
543static void push_val(struct zvalue *v) 544{ 545 if (IS_STR(v) && v->vst) v->vst->refcnt++; // inlined zstring_incr_refcnt() 546 *++TT.stackp = *v; 547} 548 549static void zvalue_copy(struct zvalue *to, struct zvalue *from) 550{ 551 if (IS_RX(from)) *to = *from; 552 else { 553 zvalue_release_zstring(to); 554 *to = *from; 555 zstring_incr_refcnt(to->vst); 556 } 557} 558 559static void zvalue_dup_zstring(struct zvalue *v) 560{ 561 struct zstring *z = new_zstring(v->vst->str, v->vst->size); 562 zstring_release(&v->vst); 563 v->vst = z; 564} 565 566//////////////////// 567//// zmap (array) implementation 568//////////////////// 569 570static int zstring_match(struct zstring *a, struct zstring *b) 571{ 572 return a->size == b->size && memcmp(a->str, b->str, a->size) == 0; 573} 574 575static int zstring_hash(struct zstring *s) 576{ // djb2 -- small, fast, good enough for this 577 unsigned h = 5381; 578 char *p = s->str, *lim = p + s->size; 579 while (p < lim) 580 h = (h << 5) + h + *p++; 581 return h; 582} 583 584enum { PSHIFT = 5 }; // "perturb" shift -- see find_mapslot() below 585 586static struct zmap_slot *find_mapslot(struct zmap *m, struct zstring *key, int *hash, int *probe) 587{ 588 struct zmap_slot *x = 0; 589 unsigned perturb = *hash = zstring_hash(key); 590 *probe = *hash & m->mask; 591 int n, first_deleted = -1; 592 while ((n = m->hash[*probe])) { 593 if (n > 0) { 594 x = &MAPSLOT[n-1]; 595 if (*hash == x->hash && zstring_match(key, x->key)) { 596 return x; 597 } 598 } else if (first_deleted < 0) first_deleted = *probe; 599 // Based on technique in Python dict implementation. Comment there 600 // (https://github.com/python/cpython/blob/3.10/Objects/dictobject.c) 601 // says 602 // 603 // j = ((5*j) + 1) mod 2**i 604 // For any initial j in range(2**i), repeating that 2**i times generates 605 // each int in range(2**i) exactly once (see any text on random-number 606 // generation for proof). 
607 // 608 // The addition of 'perturb' greatly improves the probe sequence. See 609 // the Python dict implementation for more details. 610 *probe = (*probe * 5 + 1 + (perturb >>= PSHIFT)) & m->mask; 611 } 612 if (first_deleted >= 0) *probe = first_deleted; 613 return 0; 614} 615 616static struct zvalue *zmap_find(struct zmap *m, struct zstring *key) 617{ 618 int hash, probe; 619 struct zmap_slot *x = find_mapslot(m, key, &hash, &probe); 620 return x ? &x->val : 0; 621} 622 623static void zmap_init(struct zmap *m) 624{ 625 enum {INIT_SIZE = 8}; 626 m->mask = INIT_SIZE - 1; 627 m->hash = xzalloc(INIT_SIZE * sizeof(*m->hash)); 628 m->limit = INIT_SIZE * 8 / 10; 629 m->count = 0; 630 m->deleted = 0; 631 zlist_init(&m->slot, sizeof(struct zmap_slot)); 632} 633 634static void zvalue_map_init(struct zvalue *v) 635{ 636 struct zmap *m = xmalloc(sizeof(*m)); 637 zmap_init(m); 638 v->map = m; 639 v->flags |= ZF_MAP; 640} 641 642static void zmap_delete_map_incl_slotdata(struct zmap *m) 643{ 644 for (struct zmap_slot *p = &MAPSLOT[0]; p < &MAPSLOT[zlist_len(&m->slot)]; p++) { 645 if (p->key) zstring_release(&p->key); 646 if (p->val.vst) zstring_release(&p->val.vst); 647 } 648 xfree(m->slot.base); 649 xfree(m->hash); 650} 651 652static void zmap_delete_map(struct zmap *m) 653{ 654 zmap_delete_map_incl_slotdata(m); 655 zmap_init(m); 656} 657 658static void zmap_rehash(struct zmap *m) 659{ 660 // New table is twice the size of old. 661 int size = m->mask + 1; 662 unsigned mask = 2 * size - 1; 663 int *h = xzalloc(2 * size * sizeof(*m->hash)); 664 // Step through the old hash table, set up location in new table. 
665 for (int i = 0; i < size; i++) { 666 int n = m->hash[i]; 667 if (n > 0) { 668 int hash = MAPSLOT[n-1].hash; 669 unsigned perturb = hash; 670 int p = hash & mask; 671 while (h[p]) { 672 p = (p * 5 + 1 + (perturb >>= PSHIFT)) & mask; 673 } 674 h[p] = n; 675 } 676 } 677 m->mask = mask; 678 xfree(m->hash); 679 m->hash = h; 680 m->limit = 2 * size * 8 / 10; 681} 682 683static struct zmap_slot *zmap_find_or_insert_key(struct zmap *m, struct zstring *key) 684{ 685 int hash, probe; 686 struct zmap_slot *x = find_mapslot(m, key, &hash, &probe); 687 if (x) return x; 688 // not found; insert it. 689 if (m->count == m->limit) { 690 zmap_rehash(m); // rehash if getting too full. 691 // rerun find_mapslot to get new probe index 692 x = find_mapslot(m, key, &hash, &probe); 693 } 694 // Assign key to new slot entry and bump refcnt. 695 struct zmap_slot zs = ZMSLOTINIT(hash, key, (struct zvalue)ZVINIT(0, 0.0, 0)); 696 zstring_incr_refcnt(key); 697 int n = zlist_append(&m->slot, &zs); 698 m->count++; 699 m->hash[probe] = n + 1; 700 return &MAPSLOT[n]; 701} 702 703static void zmap_delete(struct zmap *m, struct zstring *key) 704{ 705 int hash, probe; 706 struct zmap_slot *x = find_mapslot(m, key, &hash, &probe); 707 if (!x) return; 708 zstring_release(&MAPSLOT[m->hash[probe] - 1].key); 709 m->hash[probe] = -1; 710 m->deleted++; 711} 712 713//////////////////// 714//// scan (lexical analyzer) 715//////////////////// 716 717// TODO: 718// IS line_num getting incr correctly? Newline counts as start of line!? 719// Handle nuls in file better. 720// Open files "rb" and handle CRs in program. 721// Roll gch() into get_char() ? 722// Deal with signed char (at EOF? elsewhere?) 723// 724// 2023-01-11: Allow nul bytes inside strings? regexes? 
725 726static void progfile_open(void) 727{ 728 TT.scs->filename = TT.scs->prog_args->arg; 729 TT.scs->prog_args = TT.scs->prog_args->next; 730 TT.scs->fp = stdin; 731 if (strcmp(TT.scs->filename, "-")) TT.scs->fp = fopen(TT.scs->filename, "r"); 732 if (!TT.scs->fp) error_exit("Can't open %s", TT.scs->filename); 733 TT.scs->line_num = 0; 734} 735 736static int get_char(void) 737{ 738 static char *nl = "\n"; 739 // On first entry, TT.scs->p points to progstring if any, or null string. 740 for (;;) { 741 int c = *(TT.scs->p)++; 742 if (c) { 743 return c; 744 } 745 if (TT.scs->progstring) { // Fake newline at end of progstring. 746 if (TT.scs->progstring == nl) return EOF; 747 TT.scs->p = TT.scs->progstring = nl; 748 continue; 749 } 750 // Here if getting from progfile(s). 751 if (TT.scs->line == nl) return EOF; 752 if (!TT.scs->fp) { 753 progfile_open(); 754 // The " " + 1 is to set p to null string but allow ref to prev char for 755 // "lastchar" test below. 756 } 757 // Save last char to allow faking final newline. 758 int lastchar = (TT.scs->p)[-2]; 759 TT.scs->line_len = getline(&TT.scs->line, &TT.scs->line_size, TT.scs->fp); 760 if (TT.scs->line_len > 0) { 761 TT.scs->line_num++; 762 TT.scs->p = TT.scs->line; 763 continue; 764 } 765 // EOF 766 // FIXME TODO or check for error? feof() vs. ferror() 767 fclose(TT.scs->fp); 768 TT.scs->fp = 0; 769 TT.scs->p = " " + 2; 770 if (!TT.scs->prog_args) { 771 xfree(TT.scs->line); 772 if (lastchar == '\n') return EOF; 773 // Fake final newline 774 TT.scs->line = TT.scs->p = nl; 775 } 776 } 777} 778 779static void append_this_char(int c) 780{ 781 if (TT.scs->toklen == TT.scs->maxtok - 1) { 782 TT.scs->maxtok *= 2; 783 TT.scs->tokstr = xrealloc(TT.scs->tokstr, TT.scs->maxtok); 784 } 785 TT.scs->tokstr[TT.scs->toklen++] = c; 786 TT.scs->tokstr[TT.scs->toklen] = 0; 787} 788 789static void gch(void) 790{ 791 // FIXME probably not right place to skip CRs. 
792 do { 793 TT.scs->ch = get_char(); 794 } while (TT.scs->ch == '\r'); 795} 796 797static void append_char(void) 798{ 799 append_this_char(TT.scs->ch); 800 gch(); 801} 802 803static int find_keyword_or_builtin(char *table, 804 int first_tok_in_table) 805{ 806 char s[16] = " ", *p; 807 // keywords and builtin functions are spaced 10 apart for strstr() lookup, 808 // so must be less than that long. 809 if (TT.scs->toklen >= 10) return 0; 810 strcat(s, TT.scs->tokstr); 811 strcat(s, " "); 812 p = strstr(table, s); 813 if (!p) return 0; 814 return first_tok_in_table + (p - table) / 10; 815} 816 817static int find_token(void) 818{ 819 char s[6] = " ", *p; 820 // tokens are spaced 3 apart for strstr() lookup, so must be less than 821 // that long. 822 strcat(s, TT.scs->tokstr); 823 strcat(s, " "); 824 p = strstr(ops, s); 825 if (!p) return 0; 826 return tksemi + (p - ops) / 3; 827} 828 829static int find_keyword(void) 830{ 831 return find_keyword_or_builtin(keywords, tkin); 832} 833 834static int find_builtin(void) 835{ 836 return find_keyword_or_builtin(builtins, tkatan2); 837} 838 839static void get_number(void) 840{ 841 // Assumes TT.scs->ch is digit or dot on entry. 842 // TT.scs->p points to the following character. 843 // OK formats: 1 1. 1.2 1.2E3 1.2E+3 1.2E-3 1.E2 1.E+2 1.E-2 1E2 .1 .1E2 844 // .1E+2 .1E-2 845 // NOT OK: . .E .E1 .E+ .E+1 ; 1E .1E 1.E 1.E+ 1.E- parse as number 846 // followed by variable E. 847 // gawk accepts 12.E+ and 12.E- as 12; nawk & mawk say syntax error. 
848 char *leftover; 849 int len; 850 TT.scs->numval = strtod(TT.scs->p - 1, &leftover); 851 len = leftover - TT.scs->p + 1; 852 if (len == 0) { 853 append_char(); 854 TT.scs->toktype = ERROR; 855 TT.scs->tok = tkerr; 856 TT.scs->error = 1; 857 FFATAL("Unexpected token '%s'\n", TT.scs->tokstr); 858 return; 859 } 860 while (len--) 861 append_char(); 862} 863 864static void get_string_or_regex(int endchar) 865{ 866 gch(); 867 while (TT.scs->ch != endchar) { 868 if (TT.scs->ch == '\n') { 869 // FIXME Handle unterminated string or regex. Is this OK? 870 // FIXME TODO better diagnostic here? 871 XERR("%s\n", "unterminated string or regex"); 872 break; 873 } else if (TT.scs->ch == '\\') { 874 // \\ \a \b \f \n \r \t \v \" \/ \ddd 875 char *p, *escapes = "\\abfnrtv\"/"; 876 gch(); 877 if (TT.scs->ch == '\n') { // backslash newline is continuation 878 gch(); 879 continue; 880 } else if ((p = strchr(escapes, TT.scs->ch))) { 881 // posix regex does not use these escapes, 882 // but awk does, so do them. 
883 int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes]; 884 append_this_char(c); 885 // Need to double up \ inside literal regex 886 if (endchar == '/' && c == '\\') append_this_char('\\'); 887 gch(); 888 } else if (TT.scs->ch == 'x') { 889 gch(); 890 if (isxdigit(TT.scs->ch)) { 891 int c = hexval(TT.scs->ch); 892 gch(); 893 if (isxdigit(TT.scs->ch)) { 894 c = c * 16 + hexval(TT.scs->ch); 895 gch(); 896 } 897 append_this_char(c); 898 } else append_this_char('x'); 899 } else if (TT.scs->ch == 'u') { 900 gch(); 901 if (isxdigit(TT.scs->ch)) { 902 int i = 0, j = 0, c = 0; 903 char codep[9] = {0}; 904 do { 905 codep[j++] = TT.scs->ch; 906 gch(); 907 } while (j < 8 && isxdigit(TT.scs->ch)); 908 c = strtol(codep, 0, 16); 909 for (i = wctoutf8(codep, c), j = 0; j < i; j++) 910 append_this_char(codep[j]); 911 } else append_this_char('u'); 912 } else if (isdigit(TT.scs->ch)) { 913 if (TT.scs->ch < '8') { 914 int k, c = 0; 915 for (k = 0; k < 3; k++) { 916 if (isdigit(TT.scs->ch) && TT.scs->ch < '8') { 917 c = c * 8 + TT.scs->ch - '0'; 918 gch(); 919 } else 920 break; 921 } 922 append_this_char(c); 923 } else { 924 append_char(); 925 } 926 } else { 927 if (endchar == '/') { 928 // pass \ unmolested if not awk escape, 929 // so that regex routines can see it. 
930 if (!strchr(".[]()*+?{}|^$-", TT.scs->ch)) { 931 XERR("warning: '\\%c' -- unknown regex escape\n", TT.scs->ch); 932 } 933 append_this_char('\\'); 934 } else { 935 XERR("warning: '\\%c' treated as plain '%c'\n", TT.scs->ch, TT.scs->ch); 936 } 937 } 938 } else if (TT.scs->ch == EOF) { 939 FATAL("EOF in string or regex\n"); 940 } else { 941 append_char(); 942 } 943 } 944 gch(); 945} 946 947static void ascan_opt_div(int div_op_allowed_here) 948{ 949 int n; 950 for (;;) { 951 TT.scs->tokbuiltin = 0; 952 TT.scs->toklen = 0; 953 TT.scs->tokstr[0] = 0; 954 while (TT.scs->ch == ' ' || TT.scs->ch == '\t') 955 gch(); 956 if (TT.scs->ch == '\\') { 957 append_char(); 958 if (TT.scs->ch == '\n') { 959 gch(); 960 continue; 961 } 962 TT.scs->toktype = ERROR; // \ not last char in line. 963 TT.scs->tok = tkerr; 964 TT.scs->error = 3; 965 FATAL("backslash not last char in line\n"); 966 return; 967 } 968 break; 969 } 970 // Note \<NEWLINE> in comment does not continue it. 971 if (TT.scs->ch == '#') { 972 gch(); 973 while (TT.scs->ch != '\n') 974 gch(); 975 // Need to fall through here to pick up newline. 
976 } 977 if (TT.scs->ch == '\n') { 978 TT.scs->toktype = NEWLINE; 979 TT.scs->tok = tknl; 980 append_char(); 981 } else if (isalpha(TT.scs->ch) || TT.scs->ch == '_') { 982 append_char(); 983 while (isalnum(TT.scs->ch) || TT.scs->ch == '_') { 984 append_char(); 985 } 986 if ((n = find_keyword()) != 0) { 987 TT.scs->toktype = KEYWORD; 988 TT.scs->tok = n; 989 } else if ((n = find_builtin()) != 0) { 990 TT.scs->toktype = BUILTIN; 991 TT.scs->tok = tkbuiltin; 992 TT.scs->tokbuiltin = n; 993 } else if ((TT.scs->ch == '(')) { 994 TT.scs->toktype = USERFUNC; 995 TT.scs->tok = tkfunc; 996 } else { 997 TT.scs->toktype = VAR; 998 TT.scs->tok = tkvar; 999 // skip whitespace to be able to check for , or ) 1000 while (TT.scs->ch == ' ' || TT.scs->ch == '\t') 1001 gch(); 1002 } 1003 return; 1004 } else if (TT.scs->ch == '"') { 1005 TT.scs->toktype = STRING; 1006 TT.scs->tok = tkstring; 1007 get_string_or_regex('"'); 1008 } else if (isdigit(TT.scs->ch) || TT.scs->ch == '.') { 1009 TT.scs->toktype = NUMBER; 1010 TT.scs->tok = tknumber; 1011 get_number(); 1012 } else if (TT.scs->ch == '/' && ! div_op_allowed_here) { 1013 TT.scs->toktype = REGEX; 1014 TT.scs->tok = tkregex; 1015 get_string_or_regex('/'); 1016 } else if (TT.scs->ch == EOF) { 1017 TT.scs->toktype = EOF; 1018 TT.scs->tok = tkeof; 1019 } else if (TT.scs->ch == '\0') { 1020 append_char(); 1021 TT.scs->toktype = ERROR; 1022 TT.scs->tok = tkerr; 1023 TT.scs->error = 5; 1024 FATAL("null char\n"); 1025 } else { 1026 // All other tokens. 1027 TT.scs->toktype = TT.scs->ch; 1028 append_char(); 1029 // Special case for **= and ** tokens 1030 if (TT.scs->toktype == '*' && TT.scs->ch == '*') { 1031 append_char(); 1032 if (TT.scs->ch == '=') { 1033 append_char(); 1034 TT.scs->tok = tkpowasgn; 1035 } else TT.scs->tok = tkpow; 1036 TT.scs->toktype = TT.scs->tok + 200; 1037 return; 1038 } 1039 // Is it a 2-character token? 
1040 if (TT.scs->ch != ' ' && TT.scs->ch != '\n') { 1041 append_this_char(TT.scs->ch); 1042 if (find_token()) { 1043 TT.scs->tok = find_token(); 1044 TT.scs->toktype = TT.scs->tok + 200; 1045 gch(); // Eat second char of token. 1046 return; 1047 } 1048 TT.scs->toklen--; // Not 2-character token; back off. 1049 TT.scs->tokstr[TT.scs->toklen] = 0; 1050 } 1051 TT.scs->tok = find_token(); 1052 if (TT.scs->tok) return; 1053 TT.scs->toktype = ERROR; 1054 TT.scs->tok = tkerr; 1055 TT.scs->error = 4; 1056 FFATAL("Unexpected token '%s'\n", TT.scs->tokstr); 1057 } 1058} 1059 1060static void scan_opt_div(int div_op_allowed_here) 1061{ 1062 // TODO FIXME need better diags for bad tokens! 1063 // TODO Also set global syntax error flag. 1064 do ascan_opt_div(div_op_allowed_here); while (TT.scs->tok == tkerr); 1065} 1066 1067static void init_scanner(void) 1068{ 1069 TT.prevtok = tkeof; 1070 gch(); 1071} 1072 1073// POSIX says '/' does not begin a regex wherever '/' or '/=' can mean divide. 1074// Pretty sure if / or /= comes after these, it means divide: 1075static char div_preceders[] = {tknumber, tkstring, tkvar, tkgetline, tkrparen, tkrbracket, tkincr, tkdecr, 0}; 1076 1077// For checking end of prev statement for termination and if '/' can come next 1078 1079static void scan(void) 1080{ 1081 TT.prevtok = TT.scs->tok; 1082 if (TT.prevtok && strchr(div_preceders, TT.prevtok)) scan_opt_div(1); 1083 else scan_opt_div(0); 1084 TT.tokstr = TT.scs->tokstr; 1085} 1086 1087//////////////////// 1088//// compile 1089//////////////////// 1090 1091// NOTES: 1092// NL ok after , { && || do else OR after right paren after if/while/for 1093// TODO: 1094// see case tkgetline -- test more 1095// case tkmatchop, tknotmatch -- fix ~ (/re/) 1096 1097// Forward declarations -- for mutually recursive parsing functions 1098static int expr(int rbp); 1099static void lvalue(void); 1100static int primary(void); 1101static void stmt(void); 1102static void action(int action_type); 1103 1104#define 
CURTOK() (TT.scs->tok) 1105#define ISTOK(toknum) (TT.scs->tok == (toknum)) 1106 1107static int havetok(int tk) 1108{ 1109 if (!ISTOK(tk)) return 0; 1110 scan(); 1111 return 1; 1112} 1113 1114//// code and "literal" emitters 1115static void gen2cd(int op, int n) 1116{ 1117 zlist_append(&TT.zcode, &op); 1118 TT.zcode_last = zlist_append(&TT.zcode, &n); 1119} 1120 1121static void gencd(int op) 1122{ 1123 TT.zcode_last = zlist_append(&TT.zcode, &op); 1124} 1125 1126static int make_literal_str_val(char *s) 1127{ 1128 // Only if no nul inside string! 1129 struct zvalue v = new_str_val(s); 1130 return zlist_append(&TT.literals, &v); 1131} 1132 1133static int make_literal_regex_val(char *s) 1134{ 1135 regex_t *rx; 1136 rx = xmalloc(sizeof(*rx)); 1137 xregcomp(rx, s, REG_EXTENDED); 1138 struct zvalue v = ZVINIT(ZF_RX, 0, 0); 1139 v.rx = rx; 1140 // Flag empty rx to make it easy to identify for split() special case 1141 if (!*s) v.flags |= ZF_EMPTY_RX; 1142 return zlist_append(&TT.literals, &v); 1143} 1144 1145static int make_literal_num_val(double num) 1146{ 1147 struct zvalue v = ZVINIT(ZF_NUM, num, 0); 1148 return zlist_append(&TT.literals, &v); 1149} 1150 1151static int make_uninit_val(void) 1152{ 1153 struct zvalue v = uninit_zvalue; 1154 return zlist_append(&TT.literals, &v); 1155} 1156//// END code and "literal" emitters 1157 1158//// Symbol tables functions 1159static int find_func_def_entry(char *s) 1160{ 1161 for (int k = 1; k < zlist_len(&TT.func_def_table); k++) 1162 if (!strcmp(s, FUNC_DEF[k].name)) return k; 1163 return 0; 1164} 1165 1166static int add_func_def_entry(char *s) 1167{ 1168 struct functab_slot ent = {0, 0, 0, {0, 0, 0, 0}, 0}; 1169 ent.name = xstrdup(s); 1170 int slotnum = zlist_append(&TT.func_def_table, &ent); 1171 FUNC_DEF[slotnum].slotnum = slotnum; 1172 return slotnum; 1173} 1174 1175static int find_global(char *s) 1176{ 1177 for (int k = 1; k < zlist_len(&TT.globals_table); k++) 1178 if (!strcmp(s, GLOBAL[k].name)) return k; 1179 return 0; 
1180} 1181 1182static int add_global(char *s) 1183{ 1184 struct symtab_slot ent = {0, 0, 0}; 1185 ent.name = xstrdup(s); 1186 int slotnum = zlist_append(&TT.globals_table, &ent); 1187 GLOBAL[slotnum].slotnum = slotnum; 1188 return slotnum; 1189} 1190 1191static int find_local_entry(char *s) 1192{ 1193 for (int k = 1; k < zlist_len(&TT.locals_table); k++) 1194 if (!strcmp(s, LOCAL[k].name)) return k; 1195 return 0; 1196} 1197 1198static int add_local_entry(char *s) 1199{ 1200 struct symtab_slot ent = {0, 0, 0}; 1201 ent.name = xstrdup(s); 1202 int slotnum = zlist_append(&TT.locals_table, &ent); 1203 LOCAL[slotnum].slotnum = slotnum; 1204 return slotnum; 1205} 1206 1207static int find_or_add_var_name(void) 1208{ 1209 int slotnum = 0; // + means global; - means local to function 1210 int globals_ent = 0; 1211 int locals_ent = find_local_entry(TT.tokstr); // in local symbol table? 1212 if (locals_ent) { 1213 slotnum = -LOCAL[locals_ent].slotnum; 1214 } else { 1215 globals_ent = find_global(TT.tokstr); 1216 if (!globals_ent) globals_ent = add_global(TT.tokstr); 1217 slotnum = GLOBAL[globals_ent].slotnum; 1218 if (find_func_def_entry(TT.tokstr)) 1219 // POSIX: The same name shall not be used both as a variable name 1220 // with global scope and as the name of a function. 1221 XERR("var '%s' used as function name\n", TT.tokstr); 1222 } 1223 return slotnum; 1224} 1225 1226//// END Symbol tables functions 1227 1228//// Initialization 1229static void init_locals_table(void) 1230{ 1231 static struct symtab_slot locals_ent; 1232 zlist_init(&TT.locals_table, sizeof(struct symtab_slot)); 1233 zlist_append(&TT.locals_table, &locals_ent); 1234} 1235 1236static void init_tables(void) 1237{ 1238 static struct symtab_slot global_ent; 1239 static struct functab_slot func_ent; 1240 1241 // Append dummy elements in lists to force valid offsets nonzero. 
// Globals and function tables each start with one zeroed placeholder entry.
  zlist_init(&TT.globals_table, sizeof(struct symtab_slot));
  zlist_append(&TT.globals_table, &global_ent);
  zlist_init(&TT.func_def_table, sizeof(struct functab_slot));
  zlist_append(&TT.func_def_table, &func_ent);
  init_locals_table();
  zlist_init(&TT.zcode, sizeof(int));
  gencd(tkeof);   // to ensure zcode offsets are non-zero
  zlist_init(&TT.literals, sizeof(struct zvalue));
  // Init stack size at twice MIN_STACK_LEFT.  MIN_STACK_LEFT is at least as
  // many entries as any statement may ever take.  Currently there is no diag
  // if this is exceeded; prog. will probably crash. 1024 should be plenty?
  zlist_initx(&TT.stack, sizeof(struct zvalue), 2 * MIN_STACK_LEFT);
  TT.stackp = (struct zvalue *)TT.stack.base;
  zlist_init(&TT.fields, sizeof(struct zvalue));
  // Literals, stack and fields each get an uninitialized value as element 0.
  zlist_append(&TT.literals, &uninit_zvalue);
  zlist_append(&TT.stack, &uninit_zvalue);
  zlist_append(&TT.fields, &uninit_zvalue);
  // Field 0 starts out as an empty string.
  FIELD[0].vst = new_zstring("", 0);
}

// Initialize all tables, then enter the special variables as the first
// globals.  TT.spec_var_limit ends up one past the slot of the last one.
static void init_compiler(void)
{
  // Special variables (POSIX). Must align with enum spec_var_names
  static char *spec_vars[] = { "ARGC", "ARGV", "CONVFMT", "ENVIRON", "FILENAME",
      "FNR", "FS", "NF", "NR", "OFMT", "OFS", "ORS", "RLENGTH", "RS", "RSTART",
      "SUBSEP", 0};

  init_tables();
  for (int k = 0; spec_vars[k]; k++) {
    TT.spec_var_limit = add_global(spec_vars[k]);
    // Indexes 1 and 3 (ARGV, ENVIRON) are flagged as maps, the rest as scalars.
    GLOBAL[TT.spec_var_limit++].flags |= (k == 1 || k == 3) ?
ZF_MAP : ZF_SCALAR; 1273 push_val(&uninit_zvalue); 1274 } 1275} 1276//// END Initialization 1277 1278//// Parsing and compiling to TT.zcode 1279// Left binding powers 1280static int lbp_table[] = { // Must align with enum Toks 1281 0, 0, 0, 0, // tkunusedtoken, tkeof, tkerr, tknl, 1282 250, 250, 250, // tkvar, tknumber, tkstring, 1283 250, 250, 250, // tkregex, tkfunc, tkbuiltin, 1284 0, 0, 210, 0, // tksemi, tkcomma, tklbracket, tkrbracket, 1285 200, 0, 0, 0, // tklparen, tkrparen, tklbrace, tkrbrace, 1286 190, 180, 180, 170, 160, // tkfield, tkincr, tkdecr, tkpow, tknot, 1287 150, 150, 150, 140, 140, // tkmul, tkdiv, tkmod, tkplus, tkminus, 1288 130, // tkcat, // FAKE (?) optor for concatenation (adjacent string exprs) 1289 110, 110, 110, 110, 110, 110, // tklt, tkle, tkne, tkeq, tkgt, tkge, 1290 100, 100, // tkmatchop, tknotmatch, 1291 80, 70, // tkand, tkor, 1292 60, 0, // tkternif, tkternelse, 1293 50, 50, 50, 50, // tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn, 1294 50, 50, 50, // tkaddasgn, tksubasgn, tkasgn, 1295 0, 120, // tkappend, tkpipe, 1296 90 // tkin 1297}; 1298 1299static int getlbp(int tok) 1300{ 1301 // FIXME: should tkappend be here too? is tkpipe needed? 1302 // In print statement outside parens: make '>' end an expression 1303 if (TT.cgl.in_print_stmt && ! TT.cgl.paren_level && (tok == tkgt || tok == tkpipe)) 1304 return 0; 1305 return (0 <= tok && tok <= tkin) ? lbp_table[tok] : 1306 // getline is special, not a normal builtin. 1307 // close, index, match, split, sub, gsub, sprintf, substr 1308 // are really builtin functions though bwk treats them as keywords. 1309 (tkgetline <= tok && tok <= tksubstr) ? 240 : 0; // FIXME 240 is temp? 1310} 1311 1312// Get right binding power. Same as left except for right associative optors 1313static int getrbp(int tok) 1314{ 1315 int lbp = getlbp(tok); 1316 // ternary (?:), assignment, power ops are right associative 1317 return (lbp <= 60 || lbp == 170) ? 
lbp - 1 : lbp; 1318} 1319 1320static void unexpected_eof(void) 1321{ 1322 error_exit("terminated with error(s)"); 1323} 1324 1325//// syntax error diagnostic and recovery (Turner's method) 1326// D.A. Turner, Error diagnosis and recovery in one pass compilers, 1327// Information Processing Letters, Volume 6, Issue 4, 1977, Pages 113-115 1328static int recovering = 0; 1329 1330static void complain(int tk) 1331{ 1332 char op[3], tkstr[10]; 1333 if (recovering) return; 1334 recovering = 1; 1335 if (!strcmp(TT.tokstr, "\n")) TT.tokstr = "<newline>"; 1336 if (tksemi <= tk && tk <= tkpipe) { 1337 get_token_text(op, tk); 1338 XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, op); 1339 } else if (tk >= tkin && tk <= tksubstr) { 1340 if (tk < tkatan2) memmove(tkstr, keywords + 1 + 10 * (tk - tkin), 10); 1341 else memmove(tkstr, builtins + 1 + 10 * (tk - tkatan2), 10); 1342 *strchr(tkstr, ' ') = 0; 1343 XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, tkstr); 1344 } else XERR("syntax near '%s'\n", TT.tokstr); 1345} 1346 1347static void expect(int tk) 1348{ 1349 if (recovering) { 1350 while (!ISTOK(tkeof) && !ISTOK(tk)) 1351 scan(); 1352 if (ISTOK(tkeof)) unexpected_eof(); 1353 scan(); // consume expected token 1354 recovering = 0; 1355 } else if (!havetok(tk)) complain(tk); 1356} 1357 1358static void skip_to(char *tklist) 1359{ 1360 do scan(); while (!ISTOK(tkeof) && !strchr(tklist, CURTOK())); 1361 if (ISTOK(tkeof)) unexpected_eof(); 1362} 1363 1364//// END syntax error diagnostic and recovery (Turner's method) 1365 1366static void optional_nl_or_semi(void) 1367{ 1368 while (havetok(tknl) || havetok(tksemi)) 1369 ; 1370} 1371 1372static void optional_nl(void) 1373{ 1374 while (havetok(tknl)) 1375 ; 1376} 1377 1378static void rparen(void) 1379{ 1380 expect(tkrparen); 1381 optional_nl(); 1382} 1383 1384static int have_comma(void) 1385{ 1386 if (!havetok(tkcomma)) return 0; 1387 optional_nl(); 1388 return 1; 1389} 1390 1391static void check_set_map(int slotnum) 
1392{ 1393 // POSIX: The same name shall not be used within the same scope both as 1394 // a scalar variable and as an array. 1395 if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_SCALAR) 1396 XERR("scalar param '%s' used as array\n", LOCAL[-slotnum].name); 1397 if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_SCALAR) 1398 XERR("scalar var '%s' used as array\n", GLOBAL[slotnum].name); 1399 if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_MAP; 1400 if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_MAP; 1401} 1402 1403static void check_set_scalar(int slotnum) 1404{ 1405 if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_MAP) 1406 XERR("array param '%s' used as scalar\n", LOCAL[-slotnum].name); 1407 if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_MAP) 1408 XERR("array var '%s' used as scalar\n", GLOBAL[slotnum].name); 1409 if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_SCALAR; 1410 if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_SCALAR; 1411} 1412 1413static void map_name(void) 1414{ 1415 int slotnum; 1416 check_set_map(slotnum = find_or_add_var_name()); 1417 gen2cd(tkvar, slotnum); 1418} 1419 1420static void check_builtin_arg_counts(int tk, int num_args, char *fname) 1421{ 1422 static char builtin_1_arg[] = { tkcos, tksin, tkexp, tklog, tksqrt, tkint, 1423 tktolower, tktoupper, tkclose, tksystem, 0}; 1424 static char builtin_2_arg[] = { tkatan2, tkmatch, tkindex, tklshift, tkrshift, 0}; 1425 static char builtin_al_2_arg[] = { tkband, tkbor, tkbxor, 0}; 1426 static char builtin_2_3_arg[] = { tksub, tkgsub, tksplit, tksubstr, 0}; 1427 static char builtin_0_1_arg[] = { tksrand, tklength, tkfflush, 0}; 1428 1429 if (tk == tkrand && num_args) 1430 XERR("function '%s' expected no args, got %d\n", fname, num_args); 1431 else if (strchr(builtin_1_arg, tk) && num_args != 1) 1432 XERR("function '%s' expected 1 arg, got %d\n", fname, num_args); 1433 else if (strchr(builtin_2_arg, tk) && num_args != 2) 1434 XERR("function '%s' expected 2 args, got %d\n", fname, num_args); 1435 else if 
(strchr(builtin_al_2_arg, tk) && num_args < 2)
    XERR("function '%s' expected at least 2 args, got %d\n", fname, num_args);
  else if (strchr(builtin_2_3_arg, tk) && num_args != 2 && num_args != 3)
    XERR("function '%s' expected 2 or 3 args, got %d\n", fname, num_args);
  else if (strchr(builtin_0_1_arg, tk) && num_args != 0 && num_args != 1)
    XERR("function '%s' expected no arg or 1 arg, got %d\n", fname, num_args);
}

// Parse the parenthesized argument list of builtin tk and emit code for the
// arguments followed by (tk, argument count).  builtin_name is used only
// for the argument-count diagnostics.
static void builtin_call(int tk, char *builtin_name)
{
  int num_args = 0;
  expect(tklparen);
  TT.cgl.paren_level++;
  switch (tk) {
    case tksub:
    case tkgsub:
      // A regex literal as first argument is emitted as a regex value;
      // anything else is parsed as an ordinary expression.
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr(0);
      expect(tkcomma);
      optional_nl();
      expr(0);
      if (have_comma()) {
        lvalue();
      } else {
        // No third argument given: emit a field reference to field number 0.
        gen2cd(tknumber, make_literal_num_val(0));
        gen2cd(opfldref, tkeof);
      }
      // Always reported as three arguments, explicit or defaulted.
      num_args = 3;
      break;

    case tkmatch:
      expr(0);
      expect(tkcomma);
      optional_nl();
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr(0);
      num_args = 2;
      break;

    case tksplit:
      expr(0);
      expect(tkcomma);
      optional_nl();
      // The second argument must be a bare variable name: a tkvar whose next
      // input character is ',' or ')'.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        map_name();
        scan();
      } else {
        XERR("%s\n", "expected array name as split() 2nd arg");
        expr(0);
      }
      // FIXME some recovery needed here!?
// split(): two arguments so far; an optional third argument may follow.
      num_args = 2;
      if (have_comma()) {
        if (ISTOK(tkregex)) {
          gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
          scan();
        } else expr(0);
        num_args++;
      }
      break;

    case tklength:
      // A bare variable name as argument is pushed without marking it as
      // scalar or map; anything else falls through to the generic
      // argument parsing below.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        gen2cd(tkvar, find_or_add_var_name());
        scan();
        num_args++;
      }
      ATTR_FALLTHROUGH_INTENDED;

    default:
      // Generic case: zero or more comma-separated expressions.
      if (ISTOK(tkrparen)) break;
      do {
        expr(0);
        num_args++;
      } while (have_comma());
      break;
  }
  expect(tkrparen);
  TT.cgl.paren_level--;

  check_builtin_arg_counts(tk, num_args, builtin_name);

  gen2cd(tk, num_args);
}

// Parse a call to a builtin or user-defined function; the current token is
// tkbuiltin or tkfunc and its text is in TT.tokstr.
static void function_call(void)
{
  // Function call: generate TT.zcode to:
  //  push placeholder for return value, push placeholder for return addr,
  //  push args, then push number of args, then:
  //      for builtins: gen opcode (e.g. tkgsub)
  //      for user func: gen (tkfunc, function location)
  //      if function not yet defined, location will be filled in when defined
  //          the location slots will be chained from the symbol table
  int functk = 0, funcnum = 0;
  char builtin_name[16];  // be sure it's long enough for all builtins
  if (ISTOK(tkbuiltin)) {
    functk = TT.scs->tokbuiltin;
    // Copy the name now: scan() below replaces TT.tokstr.
    strcpy(builtin_name, TT.tokstr);
  } else if (ISTOK(tkfunc)) { // user function
    funcnum = find_func_def_entry(TT.tokstr);
    if (!funcnum) funcnum = add_func_def_entry(TT.tokstr);
    FUNC_DEF[funcnum].flags |= FUNC_CALLED;
    gen2cd(opprepcall, funcnum);
  } else error_exit("bad function %s!", TT.tokstr);
  scan();
  // length() can appear without parens
  int num_args = 0;
  if (functk == tklength && !ISTOK(tklparen)) {
    gen2cd(functk, 0);
    return;
  }
  if (functk) {   // builtin
    builtin_call(functk, builtin_name);
    return;
  }
  // User function: parse the argument list here.
  expect(tklparen);
  TT.cgl.paren_level++;
  if
(ISTOK(tkrparen)) { 1558 scan(); 1559 } else { 1560 do { 1561 if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) { 1562 // Function call arg that is a lone variable. Cannot tell in this 1563 // context if it is a scalar or map. Just add it to symbol table. 1564 gen2cd(tkvar, find_or_add_var_name()); 1565 scan(); 1566 } else expr(0); 1567 num_args++; 1568 } while (have_comma()); 1569 expect(tkrparen); 1570 } 1571 TT.cgl.paren_level--; 1572 gen2cd(tkfunc, num_args); 1573} 1574 1575static void var(void) 1576{ 1577 // var name is in TT.tokstr 1578 // slotnum: + means global; - means local to function 1579 int slotnum = find_or_add_var_name(); 1580 scan(); 1581 if (havetok(tklbracket)) { 1582 check_set_map(slotnum); 1583 int num_subscripts = 0; 1584 do { 1585 expr(0); 1586 num_subscripts++; 1587 } while (have_comma()); 1588 expect(tkrbracket); 1589 if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts); 1590 gen2cd(opmap, slotnum); 1591 } else { 1592 check_set_scalar(slotnum); 1593 gen2cd(tkvar, slotnum); 1594 } 1595} 1596 1597// Dollar $ tkfield can be followed by "any" expresson, but 1598// the way it binds varies. 1599// The following are valid lvalues: 1600// $ ( expr ) 1601// $ tkvar $ tknumber $ tkstring $ tkregex 1602// $ tkfunc(...) 1603// $ tkbuiltin(...) 1604// $ length # with no parens after 1605// $ tkclose(), ... $ tksubstr 1606// $ tkgetline FIXME TODO TEST THIS 1607// $ ++ lvalue 1608// $ -- lvalue 1609// $ + expression_up_to_exponentiation (also -, ! 
prefix ops) 1610// $ $ whatever_can_follow_and_bind_to_dollar 1611// 1612// tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus, 1613// tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, 1614// tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr 1615// 1616// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k*k }' 1617// 18 1618// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k*k }' 1619// 18 1620// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k^k }' 1621// 81 1622// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k^k }' 1623// 8 1624 1625static void field_op(void) 1626{ 1627 // CURTOK() must be $ here. 1628 expect(tkfield); 1629 // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus, 1630 // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex, 1631 // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr 1632 if (ISTOK(tkfield)) field_op(); 1633 else if (ISTOK(tkvar)) var(); 1634 else primary(); 1635 // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void) 1636 // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]). 1637 gen2cd(tkfield, tkeof); 1638} 1639 1640// Tokens that can start expression 1641static char exprstartsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc, 1642 tkbuiltin, tkfield, tkminus, tkplus, tknot, tkincr, tkdecr, tklparen, 1643 tkgetline, tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, 1644 tksubstr, tkband, tkbor, tkbxor, tkrshift, tklshift, 0}; 1645 1646// Tokens that can end statement 1647static char stmtendsy[] = {tknl, tksemi, tkrbrace, 0}; 1648 1649// Tokens that can follow expressions of a print statement 1650static char printexprendsy[] = {tkgt, tkappend, tkpipe, tknl, tksemi, tkrbrace, 0}; 1651 1652// !! Ensure this: 1653// ternary op is right associative, so 1654// a ? b : c ? d : e evaluates as 1655// a ? b : (c ? d : e) not as 1656// (a ? b : c) ? 
d : e 1657 1658static void convert_push_to_reference(void) 1659{ 1660 if (ZCODE[TT.zcode_last - 1] == tkvar) ZCODE[TT.zcode_last-1] = opvarref; 1661 else if (ZCODE[TT.zcode_last - 1] == opmap) ZCODE[TT.zcode_last - 1] = opmapref; 1662 else if (ZCODE[TT.zcode_last - 1] == tkfield) ZCODE[TT.zcode_last - 1] = opfldref; 1663 else error_exit("bad lvalue?"); 1664} 1665 1666static void lvalue(void) 1667{ 1668 if (ISTOK(tkfield)) { 1669 field_op(); 1670 convert_push_to_reference(); 1671 } else if (ISTOK(tkvar)) { 1672 var(); 1673 convert_push_to_reference(); 1674 } else { 1675 XERR("syntax near '%s' (bad lvalue)\n", TT.tokstr); 1676 } 1677} 1678 1679static int primary(void) 1680{ 1681 // On entry: CURTOK() is first token of expression 1682 // On exit: CURTOK() is infix operator (for binary_op() to handle) or next 1683 // token after end of expression. 1684 // return -1 for field or var (potential lvalue); 1685 // 2 or more for comma-separated expr list 1686 // as in "multiple subscript expression in array" 1687 // e.g. 
//          (1, 2) in array_name, or a print/printf list;
  //      otherwise return 0
  //
  //  expr can start with:
  //      tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
  //      tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
  //      tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
  //
  //  bwk treats these as keywords, not builtins: close index match split sub gsub
  //      sprintf substr
  //
  //  bwk builtins are: atan2 cos sin exp log sqrt int rand srand length tolower
  //      toupper system fflush
  //  NOTE: fflush() is NOT in POSIX awk
  //
  //  primary() must consume prefix and postfix operators as well as
  //      num, string, regex, var, var with subscripts, and function calls

  int num_exprs = 0;
  int nargs, modifier;
  // Remember the starting token: CURTOK() changes as soon as scan() runs.
  int tok = CURTOK();
  switch (tok) {
    case tkvar:
    case tkfield:
      if (ISTOK(tkvar)) var();
      else field_op();
      // A following ++ or -- is the postfix operator: turn the push into a
      // reference and emit the operator token itself as the opcode.
      if (ISTOK(tkincr) || ISTOK(tkdecr)) {
        convert_push_to_reference();
        gencd(CURTOK());
        scan();
      } else return -1;
      break;

    case tknumber:
      gen2cd(tknumber, make_literal_num_val(TT.scs->numval));
      scan();
      break;

    case tkstring:
      gen2cd(tkstring, make_literal_str_val(TT.tokstr));
      scan();
      break;

    case tkregex:
      // When an ERE token appears as an expression in any context other
      // than as the right-hand of the '~' or "!~" operator or as one of
      // the built-in function arguments described below, the value of
      // the resulting expression shall be the equivalent of: $0 ~ /ere/
      // FIXME TODO
      gen2cd(opmatchrec, make_literal_regex_val(TT.tokstr));
      scan();
      break;

    case tkbuiltin: // various builtins
    case tkfunc:    // user-defined function
      function_call();
      break;

    // Unary prefix !
//   (also unary prefix + and -)
    case tknot:
    case tkminus:
    case tkplus:
      scan();
      expr(getlbp(tknot));   // unary +/- same precedence as !
      if (tok == tknot) gencd(tknot);
      else gencd(opnegate);               // forces to number
      // Unary plus negates a second time, leaving the original value.
      if (tok == tkplus) gencd(opnegate); // forces to number
      break;

      // Unary prefix ++ -- MUST take lvalue
    case tkincr:
    case tkdecr:
      scan();
      lvalue();
      if (tok == tkincr) gencd(oppreincr);
      else gencd(oppredecr);
      break;

    case tklparen:
      // Parenthesized expression, or a parenthesized comma-separated list.
      scan();
      TT.cgl.paren_level++;
      num_exprs = 0;
      do {
        expr(0);
        num_exprs++;
      } while (have_comma());
      expect(tkrparen);
      TT.cgl.paren_level--;
      // A list of two or more is reported to the caller by its count.
      if (num_exprs > 1) return num_exprs;
      break;

    case tkgetline:
      // getline may be (according to awk book):
      // getline [var [<file]]
      // getline <file
      // cmd | getline [var]
      // var must be lvalue (can be any lvalue?)
      scan();
      nargs = 0;
      modifier = tkeof;
      if (ISTOK(tkfield) || ISTOK(tkvar)) {
        lvalue();
        nargs++;
      }
      if (havetok(tklt)) {
        expr(getrbp(tkcat));    // bwk "historical practice" precedence
        nargs++;
        modifier = tklt;
      }
      // Emitted as (tkgetline, nargs) followed by a modifier word:
      // tklt when reading from a file, tkeof otherwise.
      gen2cd(tkgetline, nargs);
      gencd(modifier);
      break;

    default:
      XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
      skip_to(stmtendsy);
      break;
  }
  return 0;
}

// Parse the right operand of infix operator optor and emit code for the
// operation.  The left operand's code has already been generated.
static void binary_op(int optor)  // Also for ternary ?: optor.
{
  int nargs, cdx = 0;  // index in TT.zcode list
  int rbp = getrbp(optor);
  // tkcat is not a real token (it stands for adjacent expressions), so
  // there is nothing to consume for it.
  if (optor != tkcat) scan();
  // CURTOK() holds first token of right operand.
  switch (optor) {
    case tkin:
      // right side of 'in' must be (only) an array name
      map_name();
      gencd(tkin);
      scan();
      // FIXME TODO 20230109 x = y in a && 2 works OK?
// (continuing the FIXME note on 'in' above)
      // x = y in a + 2 does not; it's parsed as x = (y in a) + 2
      // The +2 is not cat'ed with (y in a) as in bwk's OTA.
      // Other awks see y in a + 2 as a syntax error. They (may)
      // not want anything after y in a except a lower binding operator
      // (&& || ?:) or end of expression, i.e. ')' ';' '}'
      break;

    case tkpipe:
      // cmd | getline [lvalue]
      expect(tkgetline);
      nargs = 1;
      if (ISTOK(tkfield) || ISTOK(tkvar)) {
        lvalue();
        nargs++;
      }
      gen2cd(tkgetline, nargs);
      gencd(tkpipe);
      break;

    case tkand:
    case tkor:
      optional_nl();
      gen2cd(optor, -1);  // tkand: jump if false, else drop
      cdx = TT.zcode_last;   // tkor:  jump if true, else drop
      expr(rbp);
      gencd(opnotnot);    // replace TT.stack top with truth value
      // Patch the -1 placeholder with the distance to the end of the
      // right operand's code.
      ZCODE[cdx] = TT.zcode_last - cdx;
      break;

    case tkternif:
      gen2cd(optor, -1);
      cdx = TT.zcode_last;
      expr(0);
      expect(tkternelse);
      gen2cd(tkternelse, -1);
      // First placeholder: distance from '?' to just past the ':' instruction.
      ZCODE[cdx] = TT.zcode_last - cdx;
      cdx = TT.zcode_last;
      expr(rbp);
      // Second placeholder: distance from ':' to the end of the last operand.
      ZCODE[cdx] = TT.zcode_last - cdx;
      break;

    case tkmatchop:
    case tknotmatch:
      expr(rbp);
      // A regex literal on the right was emitted by primary() as opmatchrec;
      // as an operand of ~ or !~ it is changed back to a plain regex push.
      if (ZCODE[TT.zcode_last - 1] == opmatchrec) ZCODE[TT.zcode_last - 1] = tkregex;
      gencd(optor);
      break;

    default:
      expr(rbp);
      gencd(optor);
  }
}

// Return nonzero if tok can begin an expression that is concatenated to
// the expression before it.
static int cat_start_concated_expr(int tok)
{
  // concat'ed expr can start w/ var number string func builtin $ ! ( (or ++ if prev was not lvalue)
  static char exprstarttermsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,
    tkfield, tknot, tkincr, tkdecr, tklparen, tkgetline, 0};

  // NOTE this depends on builtins (close etc) being >= tkgetline
  return !!
strchr(exprstarttermsy, tok) || tok >= tkgetline;
}

#define CALLED_BY_PRINT 99987 // Arbitrary, different from any real rbp value

// Parse an expression whose operators bind more tightly than rbp and emit
// code for it.  Returns 0, except when called with CALLED_BY_PRINT and the
// whole expression is a parenthesized list ending the print statement:
// then the number of expressions in the list is returned.
static int expr(int rbp)
{
  // On entry: TT.scs has first symbol of expression, e.g. var, number, string,
  // regex, func, getline, left paren, prefix op ($ ++ -- ! unary + or -) etc.
  static char asgnops[] = {tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, 0};
  // prim_st: -1 for a var or field (potential lvalue), 2 or more for a
  // parenthesized expression list, otherwise 0.
  int prim_st = primary();
  // If called directly by print_stmt(), and found a parenthesized expression list
  //    followed by an end of print statement: any of > >> | ; } <newline>
  //    Then: return the count of expressions in list
  //    Else: continue parsing an expression
  if (rbp == CALLED_BY_PRINT) {
    if (prim_st > 0 && strchr(printexprendsy, CURTOK())) return prim_st;
    else rbp = 0;
  }

  // mult_expr_list in parens must be followed by 'in' unless it
  // immediately follows print or printf, where it may still be followed
  // by 'in' ... unless at end of statement
  if (prim_st > 0 && ! ISTOK(tkin))
    XERR("syntax near '%s'; expected 'in'\n", TT.tokstr);
  // Combine the list into one value, as for multiple array subscripts.
  if (prim_st > 0) gen2cd(tkrbracket, prim_st);
  // primary() has eaten subscripts, function args, postfix ops.
  // CURTOK() should be a binary op.
  int optor = CURTOK();
  if (strchr(asgnops, optor)) {

    // TODO FIXME ?  NOT SURE IF THIS WORKS RIGHT!
    // awk does not parse according to POSIX spec in some odd cases.
    // When an assignment (lvalue =) is on the right of certain operators,
    // it is not treated as a bad lvalue (as it is in C).
    // Example: (1 && a=2) # no error; the assignment is performed.
// (continuing the note on oddly placed assignments above)
    // This happens for ?: || && ~ !~ < <= ~= == > >=
    //
    static char odd_assignment_rbp[] = {59, 60, 70, 80, 100, 110, 0};
    // The assignment is accepted when the left side is a potential lvalue
    // and either the caller's rbp permits it or rbp is one of the values
    // listed above.
    if (prim_st < 0 && (rbp <= getrbp(optor) || strchr(odd_assignment_rbp, rbp))) {
      convert_push_to_reference();
      scan();
      expr(getrbp(optor));
      gencd(optor);
      return 0;
    }
    XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
    skip_to(stmtendsy);
  }
  // A token that can start an expression here means concatenation.
  if (cat_start_concated_expr(optor)) optor = tkcat;
  // Consume operators for as long as they bind more tightly than rbp.
  while (rbp < getlbp(optor)) {
    binary_op(optor);
    // HERE tok s/b an operator or expression terminator ( ; etc.).
    optor = CURTOK();
    if (cat_start_concated_expr(optor)) optor = tkcat;
  }
  return 0;
}

// Parse a print or printf statement (tk says which) and emit
// (tk, number of expressions) followed by an output-mode word.
static void print_stmt(int tk)
{
  static char outmodes[] = {tkgt, tkappend, tkpipe, 0};
  int num_exprs = 0, outmode;
  // While set, getlbp() makes '>' and '|' outside parens end an expression.
  TT.cgl.in_print_stmt = 1;
  expect(tk); // tkprint or tkprintf
  if ((tk == tkprintf) || !strchr(printexprendsy, CURTOK())) {
    // printf always needs expression
    // print non-empty statement needs expression
    num_exprs = expr(CALLED_BY_PRINT);
    if (num_exprs > 0 && !strchr(printexprendsy, CURTOK())) FATAL("print stmt bug");
    // expr() returned 0: one expression was parsed; count it and parse any
    // further comma-separated expressions.
    if (!num_exprs) {
      for (num_exprs++; have_comma(); num_exprs++)
        expr(0);
    }
  }
  outmode = CURTOK();
  if (strchr(outmodes, outmode)) {
    scan();
    expr(0); // FIXME s/b only bwk term?
check POSIX 1961 num_exprs++; 1962 } else outmode = 0; 1963 gen2cd(tk, num_exprs); 1964 gencd(outmode); 1965 TT.cgl.in_print_stmt = 0; 1966} 1967 1968static void delete_stmt(void) 1969{ 1970 expect(tkdelete); 1971 if (ISTOK(tkvar)) { 1972 int slotnum = find_or_add_var_name(); 1973 check_set_map(slotnum); 1974 scan(); 1975 if (havetok(tklbracket)) { 1976 int num_subscripts = 0; 1977 do { 1978 expr(0); 1979 num_subscripts++; 1980 } while (have_comma()); 1981 expect(tkrbracket); 1982 if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts); 1983 gen2cd(opmapref, slotnum); 1984 gencd(tkdelete); 1985 } else { 1986 // delete entire map (elements only; var is still a map) 1987 gen2cd(opmapref, slotnum); 1988 gencd(opmapdelete); 1989 } 1990 } else expect(tkvar); 1991} 1992 1993static void simple_stmt(void) 1994{ 1995 if (strchr(exprstartsy, CURTOK())) { 1996 expr(0); 1997 gencd(opdrop); 1998 return; 1999 } 2000 switch (CURTOK()) { 2001 case tkprint: 2002 case tkprintf: 2003 print_stmt(CURTOK()); 2004 break; 2005 2006 case tkdelete: 2007 delete_stmt(); 2008 break; 2009 2010 default: 2011 XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? 
"\\n" : TT.tokstr); 2012 skip_to(stmtendsy); 2013 } 2014} 2015 2016static int prev_was_terminated(void) 2017{ 2018 return !!strchr(stmtendsy, TT.prevtok); 2019} 2020 2021static int is_nl_semi(void) 2022{ 2023 return ISTOK(tknl) || ISTOK(tksemi); 2024} 2025 2026static void if_stmt(void) 2027{ 2028 expect(tkif); 2029 expect(tklparen); 2030 expr(0); 2031 rparen(); 2032 gen2cd(tkif, -1); 2033 int cdx = TT.zcode_last; 2034 stmt(); 2035 if (!prev_was_terminated() && is_nl_semi()) { 2036 scan(); 2037 optional_nl(); 2038 } 2039 if (prev_was_terminated()) { 2040 optional_nl(); 2041 if (havetok(tkelse)) { 2042 gen2cd(tkelse, -1); 2043 ZCODE[cdx] = TT.zcode_last - cdx; 2044 cdx = TT.zcode_last; 2045 optional_nl(); 2046 stmt(); 2047 } 2048 } 2049 ZCODE[cdx] = TT.zcode_last - cdx; 2050} 2051 2052static void save_break_continue(int *brk, int *cont) 2053{ 2054 *brk = TT.cgl.break_dest; 2055 *cont = TT.cgl.continue_dest; 2056} 2057 2058static void restore_break_continue(int *brk, int *cont) 2059{ 2060 TT.cgl.break_dest = *brk; 2061 TT.cgl.continue_dest = *cont; 2062} 2063 2064static void while_stmt(void) 2065{ 2066 int brk, cont; 2067 save_break_continue(&brk, &cont); 2068 expect(tkwhile); 2069 expect(tklparen); 2070 TT.cgl.continue_dest = TT.zcode_last + 1; 2071 expr(0); 2072 rparen(); 2073 gen2cd(tkwhile, 2); // drop, jump if true 2074 TT.cgl.break_dest = TT.zcode_last + 1; 2075 gen2cd(opjump, -1); // jump here to break 2076 stmt(); 2077 gen2cd(opjump, -1); // jump to continue 2078 ZCODE[TT.zcode_last] = TT.cgl.continue_dest - TT.zcode_last - 1; 2079 ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1; 2080 restore_break_continue(&brk, &cont); 2081} 2082 2083static void do_stmt(void) 2084{ 2085 int brk, cont; 2086 save_break_continue(&brk, &cont); 2087 expect(tkdo); 2088 optional_nl(); 2089 gen2cd(opjump, 4); // jump over jumps, to statement 2090 TT.cgl.continue_dest = TT.zcode_last + 1; 2091 gen2cd(opjump, -1); // here on continue 2092 TT.cgl.break_dest = 
TT.zcode_last + 1; 2093 gen2cd(opjump, -1); // here on break 2094 stmt(); 2095 if (!prev_was_terminated()) { 2096 if (is_nl_semi()) { 2097 scan(); 2098 optional_nl(); 2099 } else { 2100 XERR("syntax near '%s' -- ';' or newline expected\n", TT.tokstr); 2101 // FIXME 2102 } 2103 } 2104 ZCODE[TT.cgl.continue_dest + 1] = TT.zcode_last - TT.cgl.continue_dest - 1; 2105 optional_nl(); 2106 expect(tkwhile); 2107 expect(tklparen); 2108 expr(0); 2109 rparen(); 2110 gen2cd(tkwhile, TT.cgl.break_dest - TT.zcode_last - 1); 2111 ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1; 2112 restore_break_continue(&brk, &cont); 2113} 2114 2115static void for_not_map_iter(void) 2116{ 2117 // Here after loop initialization, if any; loop condition 2118 int condition_loc = TT.zcode_last + 1; 2119 if (havetok(tksemi)) { 2120 // "endless" loop variant; no condition 2121 // no NL allowed here in OTA 2122 gen2cd(opjump, -1); // jump to statement 2123 } else { 2124 optional_nl(); // NOT posix or awk book; in OTA 2125 expr(0); // loop while true 2126 expect(tksemi); 2127 gen2cd(tkwhile, -1); // drop, jump to statement if true 2128 } 2129 optional_nl(); // NOT posix or awk book; in OTA 2130 TT.cgl.break_dest = TT.zcode_last + 1; 2131 gen2cd(opjump, -1); 2132 TT.cgl.continue_dest = TT.zcode_last + 1; 2133 if (!ISTOK(tkrparen)) simple_stmt(); // "increment" 2134 gen2cd(opjump, condition_loc - TT.zcode_last - 3); 2135 rparen(); 2136 ZCODE[TT.cgl.break_dest - 1] = TT.zcode_last - TT.cgl.break_dest + 1; 2137 stmt(); 2138 gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3); 2139 ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1; 2140} 2141 2142static int valid_for_array_iteration(int first, int last) 2143{ 2144 return ZCODE[first] == tkvar && ZCODE[first + 2] == tkvar 2145 && ZCODE[first + 4] == tkin && ZCODE[first + 5] == opdrop 2146 && first + 5 == last; 2147} 2148 2149static void for_stmt(void) 2150{ 2151 int brk, cont; 2152 save_break_continue(&brk, &cont); 
2153 expect(tkfor); 2154 expect(tklparen); 2155 if (havetok(tksemi)) { 2156 // No "initialization" part 2157 for_not_map_iter(); 2158 } else { 2159 int loop_start_loc = TT.zcode_last + 1; 2160 simple_stmt(); // initializaton part, OR varname in arrayname form 2161 if (!havetok(tkrparen)) { 2162 expect(tksemi); 2163 for_not_map_iter(); 2164 } else { 2165 // Must be map iteration 2166 // Check here for varname in varname! 2167 // FIXME TODO must examine generated TT.zcode for var in array? 2168 if (!valid_for_array_iteration(loop_start_loc, TT.zcode_last)) 2169 XERR("%s", "bad 'for (var in array)' loop\n"); 2170 else { 2171 ZCODE[TT.zcode_last-5] = opvarref; 2172 ZCODE[TT.zcode_last-1] = tknumber; 2173 ZCODE[TT.zcode_last] = make_literal_num_val(-1); 2174 TT.cgl.continue_dest = TT.zcode_last + 1; 2175 gen2cd(opmapiternext, 2); 2176 TT.cgl.break_dest = TT.zcode_last + 1; 2177 gen2cd(opjump, -1); // fill in with loc after stmt 2178 } 2179 optional_nl(); 2180 // fixup TT.stack if return or exit inside for (var in array) 2181 TT.cgl.stack_offset_to_fix += 3; 2182 stmt(); 2183 TT.cgl.stack_offset_to_fix -= 3; 2184 gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3); 2185 ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1; 2186 gencd(opdrop); 2187 gencd(opdrop); 2188 gencd(opdrop); 2189 } 2190 } 2191 restore_break_continue(&brk, &cont); 2192} 2193 2194static void stmt(void) 2195{ 2196 switch (CURTOK()) { 2197 case tkeof: 2198 break; // FIXME ERROR? 
2199 2200 case tkbreak: 2201 scan(); 2202 if (TT.cgl.break_dest) gen2cd(tkbreak, TT.cgl.break_dest - TT.zcode_last - 3); 2203 else XERR("%s", "break not in a loop\n"); 2204 break; 2205 2206 case tkcontinue: 2207 scan(); 2208 if (TT.cgl.continue_dest) 2209 gen2cd(tkcontinue, TT.cgl.continue_dest - TT.zcode_last - 3); 2210 else XERR("%s", "continue not in a loop\n"); 2211 break; 2212 2213 case tknext: 2214 scan(); 2215 gencd(tknext); 2216 if (TT.cgl.rule_type) XERR("%s", "next inside BEGIN or END\n"); 2217 if (TT.cgl.in_function_body) XERR("%s", "next inside function def\n"); 2218 break; 2219 2220 case tknextfile: 2221 scan(); 2222 gencd(tknextfile); 2223 if (TT.cgl.rule_type) XERR("%s", "nextfile inside BEGIN or END\n"); 2224 if (TT.cgl.in_function_body) XERR("%s", "nextfile inside function def\n"); 2225 break; 2226 2227 case tkexit: 2228 scan(); 2229 if (strchr(exprstartsy, CURTOK())) { 2230 expr(0); 2231 } else gen2cd(tknumber, make_literal_num_val(NO_EXIT_STATUS)); 2232 gencd(tkexit); 2233 break; 2234 2235 case tkreturn: 2236 scan(); 2237 if (TT.cgl.stack_offset_to_fix) gen2cd(opdrop_n, TT.cgl.stack_offset_to_fix); 2238 if (strchr(exprstartsy, CURTOK())) { 2239 expr(0); 2240 } else gen2cd(tknumber, make_literal_num_val(0.0)); 2241 gen2cd(tkreturn, TT.cgl.nparms); 2242 if (!TT.cgl.in_function_body) XERR("%s", "return outside function def\n"); 2243 break; 2244 2245 case tklbrace: 2246 action(tklbrace); 2247 break; 2248 2249 case tkif: 2250 if_stmt(); 2251 break; 2252 2253 case tkwhile: 2254 while_stmt(); 2255 break; 2256 2257 case tkdo: 2258 do_stmt(); 2259 break; 2260 2261 case tkfor: 2262 for_stmt(); 2263 break; 2264 2265 case tksemi: 2266 scan(); 2267 break; 2268 default: 2269 simple_stmt(); // expression print printf delete 2270 } 2271} 2272 2273static void add_param(int funcnum, char *s) 2274{ 2275 if (!find_local_entry(s)) add_local_entry(s); 2276 else XERR("function '%s' dup param '%s'\n", FUNC_DEF[funcnum].name, s); 2277 TT.cgl.nparms++; 2278 2279 // POSIX: 
The same name shall not be used as both a function parameter name 2280 // and as the name of a function or a special awk variable. 2281 // !!! NOTE seems implementations exc. mawk only compare param names with 2282 // builtin funcs; use same name as userfunc is OK! 2283 if (!strcmp(s, FUNC_DEF[funcnum].name)) 2284 XERR("function '%s' param '%s' matches func name\n", 2285 FUNC_DEF[funcnum].name, s); 2286 if (find_global(s) && find_global(s) < TT.spec_var_limit) 2287 XERR("function '%s' param '%s' matches special var\n", 2288 FUNC_DEF[funcnum].name, s); 2289} 2290 2291static void function_def(void) 2292{ 2293 expect(tkfunction); 2294 int funcnum = find_func_def_entry(TT.tokstr); 2295 if (!funcnum) { 2296 funcnum = add_func_def_entry(TT.tokstr); 2297 } else if (FUNC_DEF[funcnum].flags & FUNC_DEFINED) { 2298 XERR("dup defined function '%s'\n", TT.tokstr); 2299 } 2300 FUNC_DEF[funcnum].flags |= FUNC_DEFINED; 2301 if (find_global(TT.tokstr)) { 2302 // POSIX: The same name shall not be used both as a variable name with 2303 // global scope and as the name of a function. 2304 XERR("function name '%s' previously defined\n", TT.tokstr); 2305 } 2306 2307 gen2cd(tkfunction, funcnum); 2308 FUNC_DEF[funcnum].zcode_addr = TT.zcode_last - 1; 2309 TT.cgl.funcnum = funcnum; 2310 TT.cgl.nparms = 0; 2311 if (ISTOK(tkfunc)) expect(tkfunc); // func name with no space before ( 2312 else expect(tkvar); // func name with space before ( 2313 expect(tklparen); 2314 if (ISTOK(tkvar)) { 2315 add_param(funcnum, TT.tokstr); 2316 scan(); 2317 // FIXME is the the best way? what if TT.tokstr not a tkvar? 2318 while (have_comma()) { 2319 add_param(funcnum, TT.tokstr); 2320 expect(tkvar); 2321 } 2322 } 2323 rparen(); 2324 if (ISTOK(tklbrace)) { 2325 TT.cgl.in_function_body = 1; 2326 action(tkfunc); 2327 TT.cgl.in_function_body = 0; 2328 // Need to return uninit value if falling off end of function. 
2329 gen2cd(tknumber, make_uninit_val()); 2330 gen2cd(tkreturn, TT.cgl.nparms); 2331 } else { 2332 XERR("syntax near '%s'\n", TT.tokstr); 2333 // FIXME some recovery needed here!? 2334 } 2335 // Do not re-init locals table for dup function. 2336 // Avoids memory leak detected by LeakSanitizer. 2337 if (!FUNC_DEF[funcnum].function_locals.base) { 2338 FUNC_DEF[funcnum].function_locals = TT.locals_table; 2339 init_locals_table(); 2340 } 2341} 2342 2343static void action(int action_type) 2344{ 2345(void)action_type; 2346 // action_type is tkbegin, tkend, tkdo (every line), tkif (if pattern), 2347 // tkfunc (function body), tklbrace (compound statement) 2348 // Should have lbrace on entry. 2349 expect(tklbrace); 2350 for (;;) { 2351 if (ISTOK(tkeof)) unexpected_eof(); 2352 optional_nl_or_semi(); 2353 if (havetok(tkrbrace)) { 2354 break; 2355 } 2356 stmt(); 2357 // stmt() is normally unterminated here, but may be terminated if we 2358 // have if with no else (had to consume terminator looking for else) 2359 // !!! 
if (ISTOK(tkrbrace) || prev_was_terminated()) 2360 if (prev_was_terminated()) continue; 2361 if (!is_nl_semi() && !ISTOK(tkrbrace)) { 2362 XERR("syntax near '%s' -- newline, ';', or '}' expected\n", TT.tokstr); 2363 while (!is_nl_semi() && !ISTOK(tkrbrace) && !ISTOK(tkeof)) scan(); 2364 if (ISTOK(tkeof)) unexpected_eof(); 2365 } 2366 if (havetok(tkrbrace)) break; 2367 // Must be semicolon or newline 2368 scan(); 2369 } 2370} 2371 2372static void rule(void) 2373{ 2374 // pa_pat 2375 // | pa_pat lbrace stmtlist '}' 2376 // | pa_pat ',' opt_nl pa_pat 2377 // | pa_pat ',' opt_nl pa_pat lbrace stmtlist '}' 2378 // | lbrace stmtlist '}' 2379 // | XBEGIN lbrace stmtlist '}' 2380 // | XEND lbrace stmtlist '}' 2381 // | FUNC funcname '(' varlist rparen lbrace stmtlist '}' 2382 2383 switch (CURTOK()) { 2384 case tkbegin: 2385 scan(); 2386 if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin] = TT.zcode_last - TT.cgl.last_begin; 2387 else TT.cgl.first_begin = TT.zcode_last + 1; 2388 2389 TT.cgl.rule_type = tkbegin; 2390 action(tkbegin); 2391 TT.cgl.rule_type = 0; 2392 gen2cd(opjump, -1); 2393 TT.cgl.last_begin = TT.zcode_last; 2394 break; 2395 2396 case tkend: 2397 scan(); 2398 if (TT.cgl.last_end) ZCODE[TT.cgl.last_end] = TT.zcode_last - TT.cgl.last_end; 2399 else TT.cgl.first_end = TT.zcode_last + 1; 2400 2401 TT.cgl.rule_type = tkbegin; 2402 action(tkend); 2403 TT.cgl.rule_type = 0; 2404 gen2cd(opjump, -1); 2405 TT.cgl.last_end = TT.zcode_last; 2406 break; 2407 2408 case tklbrace: 2409 if (TT.cgl.last_recrule) 2410 ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule; 2411 else TT.cgl.first_recrule = TT.zcode_last + 1; 2412 action(tkdo); 2413 gen2cd(opjump, -1); 2414 TT.cgl.last_recrule = TT.zcode_last; 2415 break; 2416 2417 case tkfunction: 2418 function_def(); 2419 break; 2420 default: 2421 if (TT.cgl.last_recrule) 2422 ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule; 2423 else TT.cgl.first_recrule = TT.zcode_last + 1; 2424 gen2cd(opjump, 1); 2425 
gencd(tkeof); 2426 int cdx = 0, saveloc = TT.zcode_last; 2427 expr(0); 2428 if (!have_comma()) { 2429 gen2cd(tkif, -1); 2430 cdx = TT.zcode_last; 2431 } else { 2432 gen2cd(oprange2, ++TT.cgl.range_pattern_num); 2433 gencd(-1); 2434 cdx = TT.zcode_last; 2435 ZCODE[saveloc-2] = oprange1; 2436 ZCODE[saveloc-1] = TT.cgl.range_pattern_num; 2437 ZCODE[saveloc] = TT.zcode_last - saveloc; 2438 expr(0); 2439 gen2cd(oprange3, TT.cgl.range_pattern_num); 2440 } 2441 if (ISTOK(tklbrace)) { 2442 action(tkif); 2443 ZCODE[cdx] = TT.zcode_last - cdx; 2444 } else { 2445 gencd(opprintrec); // print $0 ? 2446 ZCODE[cdx] = TT.zcode_last - cdx; 2447 } 2448 gen2cd(opjump, -1); 2449 TT.cgl.last_recrule = TT.zcode_last; 2450 } 2451} 2452 2453static void diag_func_def_ref(void) 2454{ 2455 int n = zlist_len(&TT.func_def_table); 2456 for (int k = 1; k < n; k++) { 2457 if ((FUNC_DEF[k].flags & FUNC_CALLED) && 2458 !(FUNC_DEF[k].flags & FUNC_DEFINED)) { 2459 // Sorry, we can't tell where this was called from, for now at least. 2460 XERR("Undefined function '%s'", FUNC_DEF[k].name); 2461 } 2462 } 2463} 2464 2465static void compile(void) 2466{ 2467 init_compiler(); 2468 init_scanner(); 2469 scan(); 2470 optional_nl_or_semi(); // Does posix allow NL or ; before first rule? 2471 while (! ISTOK(tkeof)) { 2472 rule(); 2473 optional_nl_or_semi(); // NOT POSIX 2474 } 2475 2476 2477 if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin-1] = opquit; 2478 if (TT.cgl.last_end) ZCODE[TT.cgl.last_end-1] = opquit; 2479 if (TT.cgl.last_recrule) ZCODE[TT.cgl.last_recrule-1] = opquit; 2480 2481 gen2cd(tknumber, make_literal_num_val(0.0)); 2482 gencd(tkexit); 2483 gencd(opquit); 2484 // If there are only BEGIN and END or only END actions, generate actions to 2485 // read all input before END. 2486 if (TT.cgl.first_end && !TT.cgl.first_recrule) { 2487 gencd(opquit); 2488 TT.cgl.first_recrule = TT.zcode_last; 2489 } 2490 gencd(opquit); // One more opcode to keep ip in bounds in run code. 
2491 diag_func_def_ref(); 2492} 2493 2494//////////////////// 2495//// runtime 2496//////////////////// 2497 2498static void check_numeric_string(struct zvalue *v) 2499{ 2500 if (v->vst) { 2501 char *end, *s = v->vst->str; 2502 // Significant speed gain with this test: 2503 // num string must begin space, +, -, ., or digit. 2504 if (strchr("+-.1234567890 ", *s)) { 2505 double num = strtod(s, &end); 2506 if (s == end || end[strspn(end, " ")]) return; 2507 v->num = num; 2508 v->flags |= ZF_NUM | ZF_STR | ZF_NUMSTR; 2509 } 2510 } 2511} 2512 2513static struct zstring *num_to_zstring(double n, char *fmt) 2514{ 2515 int k; 2516 if (n == (long long)n) k = snprintf(TT.pbuf, PBUFSIZE, "%lld", (long long)n); 2517 else k = snprintf(TT.pbuf, PBUFSIZE, fmt, n); 2518 if (k < 0 || k >= PBUFSIZE) FFATAL("error encoding %f via '%s'", n, fmt); 2519 return new_zstring(TT.pbuf, k); 2520} 2521 2522//////////////////// 2523//// regex routines 2524//////////////////// 2525 2526static char *escape_str(char *s, int is_regex) 2527{ 2528 char *p, *escapes = is_regex ? "abfnrtv\"/" : "\\abfnrtv\"/"; 2529 // FIXME TODO should / be in there? 
2530 char *s0 = s, *to = s; 2531 while ((*to = *s)) { 2532 if (*s != '\\') { to++, s++; 2533 } else if ((p = strchr(escapes, *++s))) { 2534 // checking char after \ for known escapes 2535 int c = (is_regex?"\a\b\f\n\r\t\v\"/":"\\\a\b\f\n\r\t\v\"/")[p-escapes]; 2536 if (c) *to = c, s++; // else final backslash 2537 to++; 2538 } else if ('0' <= *s && *s <= '9') { 2539 int k, c = *s++ - '0'; 2540 for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++) 2541 c = c * 8 + *s++ - '0'; 2542 *to++ = c; 2543 } else if (*s == 'x') { 2544 if (isxdigit(s[1])) { 2545 int c = hexval(*++s); 2546 if (isxdigit(s[1])) c = c * 16 + hexval(*++s); 2547 *to++ = c, s++; 2548 } 2549 } else { 2550 if (is_regex) *to++ = '\\'; 2551 *to++ = *s++; 2552 } 2553 } 2554 return s0; 2555} 2556 2557static void force_maybemap_to_scalar(struct zvalue *v) 2558{ 2559 if (!(v->flags & ZF_ANYMAP)) return; 2560 if (v->flags & ZF_MAP || v->map->count) 2561 FATAL("array in scalar context"); 2562 v->flags = 0; 2563 v->map = 0; // v->flags = v->map = 0 gets warning 2564} 2565 2566static void force_maybemap_to_map(struct zvalue *v) 2567{ 2568 if (v->flags & ZF_MAYBEMAP) v->flags = ZF_MAP; 2569} 2570 2571// fmt_offs is either CONVFMT or OFMT (offset in stack to zvalue) 2572static struct zvalue *to_str_fmt(struct zvalue *v, int fmt_offs) 2573{ 2574 force_maybemap_to_scalar(v); 2575 // TODO: consider handling numstring differently 2576 if (v->flags & ZF_NUMSTR) v->flags = ZF_STR; 2577 if (IS_STR(v)) return v; 2578 else if (!v->flags) { // uninitialized 2579 v->vst = new_zstring("", 0); 2580 } else if (IS_NUM(v)) { 2581 zvalue_release_zstring(v); 2582 if (!IS_STR(&STACK[fmt_offs])) { 2583 zstring_release(&STACK[fmt_offs].vst); 2584 STACK[fmt_offs].vst = num_to_zstring(STACK[fmt_offs].num, "%.6g"); 2585 STACK[fmt_offs].flags = ZF_STR; 2586 } 2587 v->vst = num_to_zstring(v->num, STACK[fmt_offs].vst->str); 2588 } else { 2589 FATAL("Wrong or unknown type in to_str_fmt\n"); 2590 } 2591 v->flags = ZF_STR; 2592 return v; 2593} 
2594 2595static struct zvalue *to_str(struct zvalue *v) 2596{ 2597 return to_str_fmt(v, CONVFMT); 2598} 2599 2600// TODO FIXME Is this needed? (YES -- investigate) Just use to_str()? 2601#define ENSURE_STR(v) (IS_STR(v) ? (v) : to_str(v)) 2602 2603static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat) 2604{ 2605 if (IS_RX(pat)) *rx = pat->rx; 2606 else { 2607 zvalue_dup_zstring(to_str(pat)); 2608 escape_str(pat->vst->str, 1); 2609 xregcomp(*rx, pat->vst->str, REG_EXTENDED); 2610 } 2611} 2612 2613static void rx_zvalue_free(regex_t *rx, struct zvalue *pat) 2614{ 2615 if (!IS_RX(pat) || rx != pat->rx) regfree(rx); 2616} 2617 2618// Used by the match/not match ops (~ !~) and implicit $0 match (/regex/) 2619static int match(struct zvalue *zvsubject, struct zvalue *zvpat) 2620{ 2621 int r; 2622 regex_t rx, *rxp = ℞ 2623 rx_zvalue_compile(&rxp, zvpat); 2624 if ((r = regexec(rxp, to_str(zvsubject)->vst->str, 0, 0, 0)) != 0) { 2625 if (r != REG_NOMATCH) { 2626 char errbuf[256]; 2627 regerror(r, &rx, errbuf, sizeof(errbuf)); 2628 // FIXME TODO better diagnostic here 2629 error_exit("regex match error %d: %s", r, errbuf); 2630 } 2631 rx_zvalue_free(rxp, zvpat); 2632 return 1; 2633 } 2634 rx_zvalue_free(rxp, zvpat); 2635 return 0; 2636} 2637 2638static int rx_find(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags) 2639{ 2640 regmatch_t matches[1]; 2641 int r = regexec(rx, s, 1, matches, eflags); 2642 if (r == REG_NOMATCH) return r; 2643 if (r) FATAL("regexec error"); // TODO ? use regerr() to meaningful msg 2644 *start = matches[0].rm_so; 2645 *end = matches[0].rm_eo; 2646 return 0; 2647} 2648 2649// Differs from rx_find() in that FS cannot match null (empty) string. 2650// See https://www.austingroupbugs.net/view.php?id=1468. 
2651static int rx_find_FS(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags) 2652{ 2653 int r = rx_find(rx, s, start, end, eflags); 2654 if (r || *start != *end) return r; // not found, or found non-empty match 2655 // Found empty match, retry starting past the match 2656 char *p = s + *end; 2657 if (!*p) return REG_NOMATCH; // End of string, no non-empty match found 2658 // Empty match not at EOS, move ahead and try again 2659 while (!r && *start == *end && *++p) 2660 r = rx_find(rx, p, start, end, eflags); 2661 if (r || !*p) return REG_NOMATCH; // no non-empty match found 2662 *start += p - s; // offsets from original string 2663 *end += p - s; 2664 return 0; 2665} 2666 2667//////////////////// 2668//// fields 2669//////////////////// 2670 2671#define FIELDS_MAX 102400 // Was 1024; need more for toybox awk test 2672#define THIS_MEANS_SET_NF 999999999 2673 2674static int get_int_val(struct zvalue *v) 2675{ 2676 if (IS_NUM(v)) return (int)v->num; 2677 if (IS_STR(v) && v->vst) return (int)atof(v->vst->str); 2678 return 0; 2679} 2680 2681// A single-char FS is never a regex, so make it a [<char>] regex to 2682// match only that one char in case FS is a regex metachar. 2683// If regex FS is needed, must use > 1 char. If a '.' regex 2684// is needed, use e.g. '.|.' (unlikely case). 
2685static char *fmt_one_char_fs(char *fs) 2686{ 2687 if (strlen(fs) != 1) return fs; 2688 snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "[%c]", fs[0]); 2689 return TT.one_char_fs; 2690} 2691 2692static regex_t *rx_fs_prep(char *fs) 2693{ 2694 if (!strcmp(fs, " ")) return &TT.rx_default; 2695 if (!strcmp(fs, TT.fs_last)) return &TT.rx_last; 2696 if (strlen(fs) >= FS_MAX) FATAL("FS too long"); 2697 strcpy(TT.fs_last, fs); 2698 regfree(&TT.rx_last); 2699 xregcomp(&TT.rx_last, fmt_one_char_fs(fs), REG_EXTENDED); 2700 return &TT.rx_last; 2701} 2702 2703// Only for use by split() builtin 2704static void set_map_element(struct zmap *m, int k, char *val, size_t len) 2705{ 2706 // Do not need format here b/c k is integer, uses "%lld" format. 2707 struct zstring *key = num_to_zstring(k, "");// "" vs 0 format avoids warning 2708 struct zmap_slot *zs = zmap_find_or_insert_key(m, key); 2709 zstring_release(&key); 2710 zs->val.vst = zstring_update(zs->val.vst, 0, val, len); 2711 zs->val.flags = ZF_STR; 2712 check_numeric_string(&zs->val); 2713} 2714 2715static void set_zvalue_str(struct zvalue *v, char *s, size_t size) 2716{ 2717 v->vst = zstring_update(v->vst, 0, s, size); 2718 v->flags = ZF_STR; 2719} 2720 2721// All changes to NF go through here! 2722static void set_nf(int nf) 2723{ 2724 STACK[NF].num = TT.nf_internal = nf; 2725 STACK[NF].flags = ZF_NUM; 2726} 2727 2728static void set_field(struct zmap *unused, int fnum, char *s, size_t size) 2729{ (void)unused; 2730 if (fnum < 0 || fnum > FIELDS_MAX) FFATAL("bad field num %d\n", fnum); 2731 int nfields = zlist_len(&TT.fields); 2732 // Need nfields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields 2733 while (nfields <= fnum) 2734 nfields = zlist_append(&TT.fields, &uninit_zvalue) + 1; 2735 set_zvalue_str(&FIELD[fnum], s, size); 2736 set_nf(fnum); 2737 check_numeric_string(&FIELD[fnum]); 2738} 2739 2740// Split s via fs, using setter; return number of TT.fields. 
2741// This is used to split TT.fields and also for split() builtin. 2742static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct zmap *m, char *s, struct zvalue *zvfs) 2743{ 2744 regex_t *rx; 2745 regoff_t offs, end; 2746 if (!IS_RX(zvfs)) to_str(zvfs); 2747 char *fs = IS_STR(zvfs) ? zvfs->vst->str : ""; 2748 int nf = 0, r = 0, eflag = 0; 2749 // Empty string or empty fs (regex). 2750 // Need to include !*s b/c empty string, otherwise 2751 // split("", a, "x") splits to a 1-element (empty element) array 2752 if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) { 2753 for ( ; *s; s++) setter(m, ++nf, s, 1); 2754 return nf; 2755 } 2756 if (IS_RX(zvfs)) rx = zvfs->rx; 2757 else rx = rx_fs_prep(fs); 2758 while (*s) { 2759 // Find the next occurrence of FS. 2760 // rx_find_FS() returns 0 if found. If nonzero, the field will 2761 // be the rest of the record (all of it if first time through). 2762 if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s); 2763 else { 2764 int k = strcspn(s, "\n"); 2765 if (k < offs) offs = k, end = k + 1; 2766 } 2767 eflag |= REG_NOTBOL; 2768 2769 // Field will be s up to (not including) the offset. If offset 2770 // is zero and FS is found and FS is ' ' (TT.rx_default "[ \t]+"), 2771 // then the find is the leading or trailing spaces and/or tabs. 2772 // If so, skip this (empty) field, otherwise set field, length is offs. 2773 if (offs || r || rx != &TT.rx_default) setter(m, ++nf, s, offs); 2774 s += end; 2775 } 2776 if (!r && rx != &TT.rx_default) setter(m, ++nf, "", 0); 2777 return nf; 2778} 2779 2780static void build_fields(void) 2781{ 2782 char *rec = FIELD[0].vst->str; 2783 // TODO test this -- why did I not want to split empty $0? 2784 // Maybe don't split empty $0 b/c non-default FS gets NF==1 with splitter()? 2785 set_nf(*rec ? 
splitter(set_field, 0, rec, to_str(&STACK[FS])) : 0); 2786} 2787 2788static void rebuild_field0(void) 2789{ 2790 struct zstring *s = FIELD[0].vst; 2791 int nf = TT.nf_internal; 2792 // uninit value needed for eventual reference to .vst in zstring_release() 2793 struct zvalue tempv = uninit_zvalue; 2794 zvalue_copy(&tempv, to_str(&STACK[OFS])); 2795 for (int i = 1; i <= nf; i++) { 2796 if (i > 1) { 2797 s = s ? zstring_extend(s, tempv.vst) : zstring_copy(s, tempv.vst); 2798 } 2799 if (FIELD[i].flags) to_str(&FIELD[i]); 2800 if (FIELD[i].vst) { 2801 if (i > 1) s = zstring_extend(s, FIELD[i].vst); 2802 else s = zstring_copy(s, FIELD[i].vst); 2803 } 2804 } 2805 FIELD[0].vst = s; 2806 FIELD[0].flags |= ZF_STR; 2807 zvalue_release_zstring(&tempv); 2808} 2809 2810// get field ref (lvalue ref) in prep for assignment to field. 2811// [... assigning to a nonexistent field (for example, $(NF+2)=5) shall 2812// increase the value of NF; create any intervening TT.fields with the 2813// uninitialized value; and cause the value of $0 to be recomputed, with the 2814// TT.fields being separated by the value of OFS.] 2815// Called by setup_lvalue() 2816static struct zvalue *get_field_ref(int fnum) 2817{ 2818 if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum); 2819 if (fnum > TT.nf_internal) { 2820 // Ensure TT.fields list is large enough for fnum 2821 // Need len of TT.fields to be > fnum b/c e.g. 
fnum==1 implies 2 TT.fields 2822 for (int i = TT.nf_internal + 1; i <= fnum; i++) { 2823 if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue); 2824 zvalue_copy(&FIELD[i], &uninit_string_zvalue); 2825 } 2826 set_nf(fnum); 2827 } 2828 return &FIELD[fnum]; 2829} 2830 2831// Called by tksplit op 2832static int split(struct zstring *s, struct zvalue *a, struct zvalue *fs) 2833{ 2834 return splitter(set_map_element, a->map, s->str, fs); 2835} 2836 2837// Called by getrec_f0_f() and getrec_f0() 2838static void copy_to_field0(char *buf, size_t k) 2839{ 2840 set_zvalue_str(&FIELD[0], buf, k); 2841 check_numeric_string(&FIELD[0]); 2842 build_fields(); 2843} 2844 2845// After changing $0, must rebuild TT.fields & reset NF 2846// Changing other field must rebuild $0 2847// Called by gsub() and assignment ops. 2848static void fixup_fields(int fnum) 2849{ 2850 if (fnum == THIS_MEANS_SET_NF) { // NF was assigned to 2851 int new_nf = get_int_val(&STACK[NF]); 2852 // Ensure TT.fields list is large enough for fnum 2853 // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields 2854 for (int i = TT.nf_internal + 1; i <= new_nf; i++) { 2855 if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue); 2856 zvalue_copy(&FIELD[i], &uninit_string_zvalue); 2857 } 2858 set_nf(TT.nf_internal = STACK[NF].num); 2859 rebuild_field0(); 2860 return; 2861 } 2862 // fnum is # of field that was just updated. 2863 // If it's 0, need to rebuild the TT.fields 1... n. 2864 // If it's non-0, need to rebuild field 0. 2865 to_str(&FIELD[fnum]); 2866 if (fnum) check_numeric_string(&FIELD[fnum]); 2867 if (fnum) rebuild_field0(); 2868 else build_fields(); 2869} 2870 2871// Fetching non-existent field gets uninit string value; no change to NF! 2872// Called by tkfield op // TODO inline it? 
// Push field fnum onto the stack. A field number above NF pushes the
// uninitialized-string value and does not change NF.
static void push_field(int fnum)
{
  if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
  // Contrary to posix, awk evaluates TT.fields beyond $NF as empty strings.
  if (fnum > TT.nf_internal) push_val(&uninit_string_zvalue);
  else push_val(&FIELD[fnum]);
}

////////////////////
////   END fields
////////////////////

#define STKP    TT.stackp   // pointer to top of stack

// Reseed random() with the truncated seed and return the seed that was
// passed on the previous call (0 on the first call).
static double seedrand(double seed)
{
  static double prev_seed;
  double r = prev_seed;
  srandom(trunc(prev_seed = seed));
  return r;
}

// Pop the top of stack and return its numeric member converted to int.
// No string is released here.
static int popnumval(void)
{
  return STKP-- -> num;
}

// Pop the top of stack, releasing its string unless the slot is flagged as
// a map or a regex.
static void drop(void)
{
  if (!(STKP->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&STKP->vst);
  STKP--;
}

// Pop n values.
static void drop_n(int n)
{
  while (n--) drop();
}

// Exchange the top two stack values.
static void swap(void)
{
  struct zvalue tmp = STKP[-1];
  STKP[-1] = STKP[0];
  STKP[0] = tmp;
}

// Set and return logical (0/1) val of top TT.stack value; flag value as NUM.
// A number is true when nonzero, a string when non-empty; anything else is
// false.
static int get_set_logical(void)
{
  struct zvalue *v = STKP;
  force_maybemap_to_scalar(v);
  int r = 0;
  if (IS_NUM(v)) r = !! v->num;
  else if (IS_STR(v)) r = (v->vst && v->vst->str[0]);
  zvalue_release_zstring(v);
  v->num = r;
  v->flags = ZF_NUM;
  return r;
}


// Convert v in place to a pure number and return that number.
// A value flagged ZF_NUMSTR keeps its existing num; a value not flagged as
// a number is parsed with atof() (0 if it has no string). The string is
// released in both of those cases.
static double to_num(struct zvalue *v)
{
  force_maybemap_to_scalar(v);
  if (v->flags & ZF_NUMSTR) zvalue_release_zstring(v);
  else if (!IS_NUM(v)) {
    v->num = 0.0;
    if (IS_STR(v) && v->vst) v->num = atof(v->vst->str);
    zvalue_release_zstring(v);
  }
  v->flags = ZF_NUM;
  return v->num;
}

// Make v the number n, releasing any string it held.
static void set_num(struct zvalue *v, double n)
{
  zstring_release(&v->vst);
  v->num = n;
  v->flags = ZF_NUM;
}

// Convert v to a number, truncate it toward zero, and add one.
static void incr_zvalue(struct zvalue *v)
{
  v->num = trunc(to_num(v)) + 1;
}

// Push n as a numeric value.
static void push_int_val(ptrdiff_t n)
{
  struct zvalue v = ZVINIT(ZF_NUM, n, 0);
  push_val(&v);
}

// Return the value slot for key in map value v, creating the slot if the key
// is absent. key is converted to a string in place.
static struct zvalue *get_map_val(struct zvalue *v, struct zvalue *key)
{
  struct zmap_slot *x = zmap_find_or_insert_key(v->map, to_str(key)->vst);
  return &x->val;
}

// Resolve the reference found ref_stack_ptr slots below the top of stack to
// the zvalue to be assigned, and return a pointer to it.
// *field_num is set to the field number for a field reference, to
// THIS_MEANS_SET_NF when the target is the NF variable, and to -1 otherwise.
// For a map reference the subscript slot is removed from the stack.
static struct zvalue *setup_lvalue(int ref_stack_ptr, int parmbase, int *field_num)
{
  // ref_stack_ptr is number of slots down in stack the ref is
  // for +=, *=, etc
  // Stack is: ... scalar_ref value_to_op_by
  // or ... subscript_val map_ref value_to_op_by
  // or ... fieldref value_to_op_by
  // for =, ++, --
  // Stack is: ... scalar_ref
  // or ... subscript_val map_ref
  // or ... fieldnum fieldref
  int k;
  struct zvalue *ref, *v = 0; // init v to mute "may be uninit" warning
  *field_num = -1;
  ref = STKP - ref_stack_ptr;
  if (ref->flags & ZF_FIELDREF) return get_field_ref(*field_num = ref->num);
  // A non-negative ref number is used directly as a stack index; a negative
  // one is taken relative to parmbase.
  k = ref->num >= 0 ? ref->num : parmbase - ref->num;
  if (k == NF) *field_num = THIS_MEANS_SET_NF;
  v = &STACK[k];
  if (ref->flags & ZF_REF) {
    force_maybemap_to_scalar(v);
  } else if (ref->flags & ZF_MAPREF) {
    force_maybemap_to_map(v);
    if (!IS_MAP(v)) FATAL("scalar in array context");
    // The subscript sits one slot below the map reference.
    v = get_map_val(v, STKP - ref_stack_ptr - 1);
    swap();
    drop();
  } else FATAL("assignment to bad lvalue");
  return v; // order FATAL() and return to mute warning
}


// Allocate a file-table entry for fn/fp, link it at the head of TT.zfiles,
// and return it. The name is duplicated; all remaining members start at 0.
static struct zfile *new_file(char *fn, FILE *fp, char mode, char file_or_pipe)
{
  struct zfile *f = xzalloc(sizeof(struct zfile));
  *f = (struct zfile){TT.zfiles, xstrdup(fn), fp, mode, file_or_pipe,
                0, 0, 0, 0, 0, 0, 0, 0, 0};
  return TT.zfiles = f;
}

// Flush every file in the table. Returns 0, or -1 if any fflush() failed.
static int fflush_all(void)
{
  int ret = 0;
  for (struct zfile *p = TT.zfiles; p; p = p->next)
    if (fflush(p->fp)) ret = -1;
  return ret;
}

// Flush the file named at the top of stack, or every file when nargs is 0
// or the name is the empty string. The name is left on the stack.
// Returns 0 on success and -1 on error or if the name is not in the table.
static int fflush_file(int nargs)
{
  if (!nargs) return fflush_all();

  to_str(STKP);   // filename at top of TT.stack
  // Null string means flush all
  if (!STKP[0].vst->str[0]) return fflush_all();

  // is it open in file table?
  for (struct zfile *p = TT.zfiles; p; p = p->next)
    if (!strcmp(STKP[0].vst->str, p->fn))
      if (!fflush(p->fp)) return 0;
  return -1;    // error, or file not found in table
}
// Close and unlink the table entry named fn and return the result of its
// fclose()/pclose() (-1 if it had no stream). With fn == 0 every entry not
// marked is_std_file is closed and -1 is returned; -1 is also returned when
// fn is not in the table.
static int close_file(char *fn)
{
  // !fn (null ptr) means close all (exc. stdin/stdout/stderr)
  int r = 0;
  struct zfile *np, **pp = &TT.zfiles;
  for (struct zfile *p = TT.zfiles; p; p = np) {
    np = p->next;   // save in case unlinking file (invalidates p->next)
    // Don't close std files -- wrecks print/printf (can be fixed though TODO)
    if ((!p->is_std_file) && (!fn || !strcmp(fn, p->fn))) {
      xfree(p->recbuf);
      xfree(p->recbuf_multi);
      xfree(p->recbuf_multx);
      xfree(p->fn);
      r = (p->fp) ? (p->file_or_pipe ? fclose : pclose)(p->fp) : -1;
      *pp = p->next;
      xfree(p);
      if (fn) return r;
    } else pp = &p->next; // only if not unlinking zfile
  }
  return -1;  // file not in table, or closed all files
}

// Returned by setup_file() when a file cannot be opened for reading.
static struct zfile badfile_obj, *badfile = &badfile_obj;

// FIXME TODO check if file/pipe/mode matches what's in the table already.
// Apparently gawk/mawk/nawk are OK with different mode, but just use the file
// in whatever mode it's already in; i.e. > after >> still appends.
// Takes the file name from the top of stack (and pops it). Returns the
// existing table entry with that name, or opens it with fopen() when
// file_or_pipe is nonzero and popen() when it is zero. A failed open is
// fatal unless mode starts with 'r', in which case badfile is returned.
static struct zfile *setup_file(char file_or_pipe, char *mode)
{
  to_str(STKP);   // filename at top of TT.stack
  char *fn = STKP[0].vst->str;
  // is it already open in file table?
  for (struct zfile *p = TT.zfiles; p; p = p->next)
    if (!strcmp(fn, p->fn)) {
      drop();
      return p;   // open; return it
    }
  FILE *fp = (file_or_pipe ? fopen : popen)(fn, mode);
  if (fp) {
    struct zfile *p = new_file(fn, fp, *mode, file_or_pipe);
    drop();
    return p;
  }
  if (*mode != 'r') FFATAL("cannot open '%s'\n", fn);
  drop();
  return badfile;
}

// TODO FIXME should be a function?
// Index, within the stack array, of the slot n below the top of stack.
#define stkn(n) ((int)(TT.stackp - (n) - (struct zvalue *)TT.stack.base))

// Fetch the integer for a '*' width/precision from stack slot k. Fatal if k
// is not below the top of stack, since the converted value must still follow.
static int getcnt(int k)
{
  if (k >= stkn(0)) FATAL("too few args for printf\n");
  return (int)to_num(&STACK[k]);
}

// fprintf()-shaped function that appends the formatted text to the string
// TT.rgl.zspr instead of writing to a stream; the FILE argument is unused.
// Always returns 0.
static int fsprintf(FILE *ignored, const char *fmt, ...)
{
  (void)ignored;
  va_list args, args2;
  va_start(args, fmt);
  va_copy(args2, args);
  int len = vsnprintf(0, 0, fmt, args);  // size needed
  va_end(args);
  if (len < 0) FATAL("Bad sprintf format");
  // Unfortunately we have to mess with zstring internals here.
  if (TT.rgl.zspr->size + len + 1 > TT.rgl.zspr->capacity) {
      // This should always work b/c capacity > size
      unsigned cap = 2 * TT.rgl.zspr->capacity + len;
      TT.rgl.zspr = xrealloc(TT.rgl.zspr, sizeof(*TT.rgl.zspr) + cap);
      TT.rgl.zspr->capacity = cap;
    }
  vsnprintf(TT.rgl.zspr->str + TT.rgl.zspr->size, len+1, fmt, args2);
  TT.rgl.zspr->size += len;
  TT.rgl.zspr->str[TT.rgl.zspr->size] = 0;
  va_end(args2);
  return 0;
}

// Format the top nargs stack values printf-style through fpvar.
// The deepest of the nargs values is the format string; the values above it
// are consumed in order by the conversions. The format is processed one
// conversion at a time: literal text is emitted via "%s", then each
// conversion spec is cut out by temporarily NUL-terminating the format
// string in place (the overwritten byte is kept in holdc and restored).
// fpvar is a printf-shaped output function, presumably fprintf or
// fsprintf() -- confirm against callers.
static void varprint(int(*fpvar)(FILE *, const char *, ...), FILE *outfp, int nargs)
{
  int k, nn, nnc, fmtc, holdc, cnt1 = 0, cnt2 = 0;
  char *s = 0;  // to shut up spurious warning
  regoff_t offs = -1, e = -1;
  char *pfmt, *fmt = to_str(STKP-nargs+1)->vst->str;
  // k indexes the next value to consume: the slot just above the format.
  k = stkn(nargs - 2);
  while (*fmt) {
    double n = 0;
    // Emit literal text up to the next '%'.
    nn = strcspn(fmt, "%");
    if (nn) {
      holdc = fmt[nn];
      fmt[nn] = 0;
      fpvar(outfp, "%s", fmt);
      fmt[nn] = holdc;
    }
    fmt += nn;
    if (!*(pfmt = fmt)) break;
    // nnc is the number of chars between '%' and the conversion letter.
    nnc = strcspn(fmt+1, "aAdiouxXfFeEgGcs%");
    fmtc = fmt[nnc+1];
    if (!fmtc) FFATAL("bad printf format '%s'", fmt);
    // Isolate this one conversion spec as a NUL-terminated string.
    holdc = fmt[nnc+2];
    fmt[nnc+2] = 0;
    if (rx_find(&TT.rx_printf_fmt, fmt, &offs, &e, 0))
      FFATAL("bad printf format <%s>\n", fmt);
    // One value for the conversion itself plus one per '*'; "%%" takes none.
    int nargsneeded = 1;
    for (char *p = strchr(fmt, '*'); p; p = strchr(p+1, '*'))
      nargsneeded++;
    nargsneeded -= fmtc == '%';

    switch (nargsneeded) {
      case 0:
        fpvar(outfp, fmt);
        break;
      case 3:
        cnt1 = getcnt(k++);
        ATTR_FALLTHROUGH_INTENDED;
      case 2:
        cnt2 = getcnt(k++);
        ATTR_FALLTHROUGH_INTENDED;
      case 1:
        if (k > stkn(0)) FATAL("too few args for printf\n");
        if (fmtc == 's') {
          s = to_str(&STACK[k++])->vst->str;
        } else if (fmtc == 'c' && !IS_NUM(&STACK[k])) {
          // %c with a non-numeric value: use the first character of the
          // string, decoded by utf8towc(); 0xfffd if decoding fails.
          unsigned wch;
          struct zvalue *z = &STACK[k++];
          if (z->vst && z->vst->str[0])
            n = utf8towc(&wch, z->vst->str, z->vst->size) < 1 ? 0xfffd : wch;
        } else {
          n = to_num(&STACK[k++]);
        }
        if (strchr("cdiouxX", fmtc)) {
          // Integer and char conversions are passed long / wint_t below, so
          // copy the spec to TT.pbuf and insert an 'l' before the conversion
          // letter unless one is already there.
          pfmt = strcpy(TT.pbuf, fmt);
          if (pfmt[nnc] != 'l') {
            strcpy(pfmt+nnc+1, "l_");
            pfmt[nnc+2] = fmtc;
          }
        }
        if (fmtc == 'c' && n > 0x10ffff) n = 0xfffd;  // musl won't take larger "wchar"
        // Call fpvar with the C argument types the (possibly patched) spec
        // expects, preceded by any '*' counts.
        switch (nargsneeded) {
          case 1:
            if (fmtc == 's') fpvar(outfp, pfmt, s);
            else if (fmtc == 'c') fpvar(outfp, pfmt, (wint_t)n);
            else if (strchr("di", fmtc)) fpvar(outfp, pfmt, (long)n);
            else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, (unsigned long)n);
            else fpvar(outfp, pfmt, n);
            break;
          case 2:
            if (fmtc == 's') fpvar(outfp, pfmt, cnt2, s);
            else if (fmtc == 'c') fpvar(outfp, pfmt, cnt2, (wint_t)n);
            else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt2, (long)n);
            else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt2, (unsigned long)n);
            else fpvar(outfp, pfmt, cnt2, n);
            break;
          case 3:
            if (fmtc == 's') fpvar(outfp, pfmt, cnt1, cnt2, s);
            else if (fmtc == 'c') fpvar(outfp, pfmt, cnt1, cnt2, (wint_t)n);
            else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (long)n);
            else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (unsigned long)n);
            else fpvar(outfp, pfmt, cnt1, cnt2, n);
            break;
        }
        break;
      default:
        FATAL("bad printf format\n");
    }
    // Step past this spec and restore the byte that was overwritten.
    fmt += nnc + 2;
    *fmt = holdc;
  }
}

// Return 1 if v is non-empty, starts with a letter or underscore, and
// continues with only letters, digits, or underscores; otherwise 0.
static int is_ok_varname(char *v)
{
  char *ok = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
  if (!*v) return 0;
  // ok + 10 skips the digits, which are not allowed as the first character.
  for (int i = 0; v[i]; i++)
    if (i ? !strchr(ok, v[i]) : !strchr(ok + 10, v[i])) return 0;
  return 1;
}

// FIXME TODO return value never used. What if assign to var not in globals?
3214static int assign_global(char *var, char *value) 3215{ 3216 if (!is_ok_varname(var)) FFATAL("Invalid variable name '%s'\n", var); 3217 int globals_ent = find_global(var); 3218 if (globals_ent) { 3219 struct zvalue *v = &STACK[globals_ent]; 3220 if (IS_MAP(v)) error_exit("-v assignment to array"); // Maybe not needed? 3221 3222// The compile phase may insert a var in global table with flag of zero. Then 3223// init_globals() will assign a ZF_MAYBEMAP flag to it. If it is then assigned 3224// via -v option or by assignment_arg() it will here be assigned a string value. 3225// So first, remove all map data to prevent memory leak. BUG FIX // 2024-02-13. 3226 if (v->flags & ZF_ANYMAP) { 3227 zmap_delete_map_incl_slotdata(v->map); 3228 xfree(v->map); 3229 v->map = 0; 3230 v->flags &= ~ZF_ANYMAP; 3231 } 3232 3233 zvalue_release_zstring(v); 3234 value = xstrdup(value); 3235 *v = new_str_val(escape_str(value, 0)); 3236 xfree(value); 3237 check_numeric_string(v); 3238 return 1; 3239 } 3240 return 0; 3241} 3242 3243// If valid assignment arg, assign the global and return 1; 3244// otherwise return 0. 3245// TODO FIXME This does not check the format of the variable per posix. 3246// Needs to start w/ _A-Za-z then _A-Za-z0-9 3247// If not valid assignment form, then nextfilearg needs to treat as filename. 
3248static int assignment_arg(char *arg) 3249{ 3250 char *val = strchr(arg, '='); 3251 if (val) { 3252 *val++ = 0; 3253 if (!is_ok_varname(arg)) { 3254 *--val = '='; 3255 return 0; 3256 } 3257 assign_global(arg, val); 3258 *--val = '='; 3259 return 1; 3260 } else return 0; 3261} 3262 3263static char *nextfilearg(void) 3264{ 3265 char *arg; 3266 do { 3267 if (++TT.rgl.narg >= (int)to_num(&STACK[ARGC])) return 0; 3268 struct zvalue *v = &STACK[ARGV]; 3269 struct zvalue zkey = ZVINIT(ZF_STR, 0, 3270 num_to_zstring(TT.rgl.narg, to_str(&STACK[CONVFMT])->vst->str)); 3271 arg = ""; 3272 if (zmap_find(v->map, zkey.vst)) { 3273 zvalue_copy(&TT.rgl.cur_arg, to_str(get_map_val(v, &zkey))); 3274 arg = TT.rgl.cur_arg.vst->str; 3275 } 3276 zvalue_release_zstring(&zkey); 3277 } while (!*arg || assignment_arg(arg)); 3278 TT.rgl.nfiles++; 3279 return arg; 3280} 3281 3282static int next_fp(void) 3283{ 3284 char *fn = nextfilearg(); 3285 if (TT.cfile->fp && TT.cfile->fp != stdin) fclose(TT.cfile->fp); 3286 if ((!fn && !TT.rgl.nfiles && TT.cfile->fp != stdin) || (fn && !strcmp(fn, "-"))) { 3287 TT.cfile->fp = stdin; 3288 zvalue_release_zstring(&STACK[FILENAME]); 3289 STACK[FILENAME].vst = new_zstring("<stdin>", 7); 3290 } else if (fn) { 3291 if (!(TT.cfile->fp = fopen(fn, "r"))) FFATAL("can't open %s\n", fn); 3292 zvalue_copy(&STACK[FILENAME], &TT.rgl.cur_arg); 3293 set_num(&STACK[FNR], 0); 3294 } else { 3295 TT.rgl.eof = 1; 3296 return 0; 3297 } 3298 return 1; 3299} 3300 3301static ssize_t getrec_multiline(struct zfile *zfp) 3302{ 3303 ssize_t k, kk; 3304 do { 3305 k = getdelim(&zfp->recbuf_multi, &zfp->recbufsize_multi, '\n', zfp->fp); 3306 } while (k > 0 && zfp->recbuf_multi[0] == '\n'); 3307 TT.rgl.recptr = zfp->recbuf_multi; 3308 if (k < 0) return k; 3309 // k > 0 and recbuf_multi is not only a \n. Prob. 
ends w/ \n 3310 // but may not at EOF (last line w/o newline) 3311 for (;;) { 3312 kk = getdelim(&zfp->recbuf_multx, &zfp->recbufsize_multx, '\n', zfp->fp); 3313 if (kk < 0 || zfp->recbuf_multx[0] == '\n') break; 3314 // data is in zfp->recbuf_multi[0..k-1]; append to it 3315 if ((size_t)(k + kk + 1) > zfp->recbufsize_multi) 3316 zfp->recbuf_multi = 3317 xrealloc(zfp->recbuf_multi, zfp->recbufsize_multi = k + kk + 1); 3318 memmove(zfp->recbuf_multi + k, zfp->recbuf_multx, kk+1); 3319 k += kk; 3320 } 3321 if (k > 1 && zfp->recbuf_multi[k-1] == '\n') zfp->recbuf_multi[--k] = 0; 3322 TT.rgl.recptr = zfp->recbuf_multi; 3323 return k; 3324} 3325 3326static int rx_findx(regex_t *rx, char *s, long len, regoff_t *start, regoff_t *end, int eflags) 3327{ 3328 regmatch_t matches[1]; 3329 int r = regexec0(rx, s, len, 1, matches, eflags); 3330 if (r == REG_NOMATCH) return r; 3331 if (r) FATAL("regexec error"); // TODO ? use regerr() to meaningful msg 3332 *start = matches[0].rm_so; 3333 *end = matches[0].rm_eo; 3334 return 0; 3335} 3336 3337static ssize_t getrec_f(struct zfile *zfp) 3338{ 3339 int r = 0, rs = ENSURE_STR(&STACK[RS])->vst->str[0] & 0xff; 3340 if (!rs) return getrec_multiline(zfp); 3341 regex_t rsrx, *rsrxp = &rsrx; 3342 // TEMP!! FIXME Need to cache and avoid too-frequent rx compiles 3343 rx_zvalue_compile(&rsrxp, &STACK[RS]); 3344 regoff_t so = 0, eo = 0; 3345 long ret = -1; 3346 for ( ;; ) { 3347 if (zfp->recoffs == zfp->endoffs) { 3348#define INIT_RECBUF_LEN 8192 3349#define RS_LENGTH_MARGIN (INIT_RECBUF_LEN / 8) 3350 if (!zfp->recbuf) 3351 zfp->recbuf = xmalloc((zfp->recbufsize = INIT_RECBUF_LEN) + 1); 3352 zfp->endoffs = fread(zfp->recbuf, 1, zfp->recbufsize, zfp->fp); 3353 zfp->recoffs = 0; 3354 zfp->recbuf[zfp->endoffs] = 0; 3355 if (!zfp->endoffs) break; 3356 } 3357 TT.rgl.recptr = zfp->recbuf + zfp->recoffs; 3358 r = rx_findx(rsrxp, TT.rgl.recptr, zfp->endoffs - zfp->recoffs, &so, &eo, 0); 3359 // if not found, or found "near" end of buffer... 
3360 if (r || zfp->recoffs + eo > (int)zfp->recbufsize - RS_LENGTH_MARGIN) { 3361 // if at end of data, and (not found or found at end of data) 3362 if (zfp->endoffs < (int)zfp->recbufsize && 3363 (r || zfp->recoffs + eo == zfp->endoffs)) { 3364 ret = zfp->endoffs - zfp->recoffs; 3365 zfp->recoffs = zfp->endoffs; 3366 break; 3367 } 3368 if (zfp->recoffs) { 3369 memmove(zfp->recbuf, TT.rgl.recptr, zfp->endoffs - zfp->recoffs); 3370 zfp->endoffs -= zfp->recoffs; 3371 zfp->recoffs = 0; 3372 } else zfp->recbuf = 3373 xrealloc(zfp->recbuf, (zfp->recbufsize = zfp->recbufsize * 3 / 2) + 1); 3374 zfp->endoffs += fread(zfp->recbuf + zfp->endoffs, 3375 1, zfp->recbufsize - zfp->endoffs, zfp->fp); 3376 zfp->recbuf[zfp->endoffs] = 0; 3377 } else { 3378 // found and not too near end of data 3379 ret = so; 3380 TT.rgl.recptr[so] = 0; 3381 zfp->recoffs += eo; 3382 break; 3383 } 3384 } 3385 regfree(rsrxp); 3386 return ret; 3387} 3388 3389static ssize_t getrec(void) 3390{ 3391 ssize_t k; 3392 if (TT.rgl.eof) return -1; 3393 if (!TT.cfile->fp) next_fp(); 3394 do { 3395 if ((k = getrec_f(TT.cfile)) >= 0) return k; 3396 } while (next_fp()); 3397 return -1; 3398} 3399 3400static ssize_t getrec_f0_f(struct zfile *zfp) 3401{ 3402 ssize_t k = getrec_f(zfp); 3403 if (k >= 0) { 3404 copy_to_field0(TT.rgl.recptr, k); 3405 } 3406 return k; 3407} 3408 3409static ssize_t getrec_f0(void) 3410{ 3411 ssize_t k = getrec(); 3412 if (k >= 0) { 3413 copy_to_field0(TT.rgl.recptr, k); 3414 incr_zvalue(&STACK[NR]); 3415 incr_zvalue(&STACK[FNR]); 3416 } 3417 return k; 3418} 3419 3420// source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe) 3421// fp is file or pipe (is NULL if file/pipe could not be opened) 3422// FIXME TODO should -1 return be replaced by test at caller? 
3423// v is NULL or an lvalue ref 3424static int awk_getline(int source, struct zfile *zfp, struct zvalue *v) 3425{ 3426 ssize_t k; 3427 int is_stream = source != tkeof; 3428 if (is_stream && !zfp->fp) return -1; 3429 if (v) { 3430 if ((k = is_stream ? getrec_f(zfp) : getrec()) < 0) return 0; 3431 zstring_release(&v->vst); 3432 v->vst = new_zstring(TT.rgl.recptr, k); 3433 v->flags = ZF_STR; 3434 check_numeric_string(v); // bug fix 20240514 3435 if (!is_stream) { 3436 incr_zvalue(&STACK[NR]); 3437 incr_zvalue(&STACK[FNR]); 3438 } 3439 } else k = is_stream ? getrec_f0_f(zfp) : getrec_f0(); 3440 return k < 0 ? 0 : 1; 3441} 3442 3443// Define GAWK_SUB to get the same behavior with sub()/gsub() replacement text 3444// as with gawk, goawk, and recent bwk awk (nawk) versions. Undefine GAWK_SUB 3445// to get the simpler POSIX behavior, but I think most users will prefer the 3446// gawk behavior. See the gawk (GNU Awk) manual, 3447// sec. 9.1.4.1 // More about '\' and '&' with sub(), gsub(), and gensub() 3448// for details on the differences. 3449// 3450#undef GAWK_SUB 3451#define GAWK_SUB 3452 3453// sub(ere, repl[, in]) Substitute the string repl in place of the 3454// first instance of the extended regular expression ERE in string 'in' 3455// and return the number of substitutions. An <ampersand> ( '&' ) 3456// appearing in the string repl shall be replaced by the string from in 3457// that matches the ERE. (partial spec... 
there's more) 3458static void gsub(int opcode, int nargs, int parmbase) 3459{ (void)nargs; 3460 int field_num = -1; 3461 // compile ensures 3 args 3462 struct zvalue *v = setup_lvalue(0, parmbase, &field_num); 3463 struct zvalue *ere = STKP-2; 3464 struct zvalue *repl = STKP-1; 3465 regex_t rx, *rxp = ℞ 3466 rx_zvalue_compile(&rxp, ere); 3467 to_str(repl); 3468 to_str(v); 3469 3470#define SLEN(zvalp) ((zvalp)->vst->size) 3471 char *p, *rp0 = repl->vst->str, *rp = rp0, *s = v->vst->str; 3472 int namps = 0, nhits = 0, is_sub = (opcode == tksub), eflags = 0; 3473 regoff_t so = -1, eo; 3474 // Count ampersands in repl string; may be overcount due to \& escapes. 3475 for (rp = rp0; *rp; rp++) namps += *rp == '&'; 3476 p = s; 3477 regoff_t need = SLEN(v) + 1; // capacity needed for result string 3478 // A pass just to determine needed destination (result) string size. 3479 while(!rx_find(rxp, p, &so, &eo, eflags)) { 3480 need += SLEN(repl) + (eo - so) * (namps - 1); 3481 if (!*p) break; 3482 p += eo ? 
eo : 1; // ensure progress if empty hit at start 3483 if (is_sub) break; 3484 eflags |= REG_NOTBOL; 3485 } 3486 3487 if (so >= 0) { // at least one hit 3488 struct zstring *z = xzalloc(sizeof(*z) + need); 3489 z->capacity = need; 3490 3491 char *e = z->str; // result destination pointer 3492 p = s; 3493 eflags = 0; 3494 char *ep0 = p, *sp, *ep; 3495 while(!rx_find(rxp, p, &so, &eo, eflags)) { 3496 sp = p + so; 3497 ep = p + eo; 3498 memmove(e, ep0, sp - ep0); // copy unchanged part 3499 e += sp - ep0; 3500 // Skip match if not at start and just after prev match and this is empty 3501 if (p == s || sp - ep0 || eo - so) { 3502 nhits++; 3503 for (rp = rp0; *rp; rp++) { // copy replacement 3504 if (*rp == '&') { 3505 memmove(e, sp, eo - so); //copy match 3506 e += eo - so; 3507 } else if (*rp == '\\') { 3508 if (rp[1] == '&') *e++ = *++rp; 3509 else if (rp[1] != '\\') *e++ = *rp; 3510 else { 3511#ifdef GAWK_SUB 3512 if (rp[2] == '\\' && rp[3] == '&') { 3513 rp += 2; 3514 *e++ = *rp; 3515 } else if (rp[2] != '&') *e++ = '\\'; 3516#endif 3517 *e++ = *++rp; 3518 } 3519 } else *e++ = *rp; 3520 } 3521 } 3522 ep0 = ep; 3523 if (!*p) break; 3524 p += eo ? eo : 1; // ensure progress if empty hit at start 3525 if (is_sub) break; 3526 eflags |= REG_NOTBOL; 3527 } 3528 // copy remaining subject string 3529 memmove(e, ep0, s + SLEN(v) - ep0); 3530 e += s + SLEN(v) - ep0; 3531 *e = 0; 3532 z->size = e - z->str; 3533 zstring_release(&v->vst); 3534 v->vst = z; 3535 } 3536 rx_zvalue_free(rxp, ere); 3537 if (!IS_RX(STKP-2)) zstring_release(&STKP[-2].vst); 3538 drop_n(3); 3539 push_int_val(nhits); 3540 if (field_num >= 0) fixup_fields(field_num); 3541} 3542 3543static long millinow(void) 3544{ 3545 struct timespec ts; 3546 clock_gettime(CLOCK_REALTIME, &ts); 3547 return ts.tv_sec*1000+ts.tv_nsec/1000000; 3548} 3549 3550// Initially set stackp_needmore at MIN_STACK_LEFT before limit. 
3551// When stackp > stackp_needmore, then expand and reset stackp_needmore 3552static void add_stack(struct zvalue **stackp_needmore) 3553{ 3554 int k = stkn(0); // stack elements in use 3555 zlist_expand(&TT.stack); 3556 STKP = (struct zvalue *)TT.stack.base + k; 3557 *stackp_needmore = (struct zvalue *)TT.stack.limit - MIN_STACK_LEFT; 3558} 3559 3560#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) 3561 3562// Main loop of interpreter. Run this once for all BEGIN rules (which 3563// have had their instructions chained in compile), all END rules (also 3564// chained in compile), and once for each record of the data file(s). 3565static int interpx(int start, int *status) 3566{ 3567 int *ip = &ZCODE[start]; 3568 int opcode, op2, k, r, nargs, nsubscrs, range_num, parmbase = 0; 3569 int field_num; 3570 double nleft, nright, d; 3571 double (*mathfunc[])(double) = {cos, sin, exp, log, sqrt, trunc}; 3572 struct zvalue *v, vv, 3573 *stackp_needmore = (struct zvalue*)TT.stack.limit - MIN_STACK_LEFT; 3574 while ((opcode = *ip++)) { 3575 3576 switch (opcode) { 3577 case opquit: 3578 return opquit; 3579 3580 case tknot: 3581 (STKP)->num = ! 
get_set_logical(); 3582 break; 3583 3584 case opnotnot: 3585 get_set_logical(); 3586 break; 3587 3588 case opnegate: 3589 STKP->num = -to_num(STKP); 3590 break; 3591 3592 case tkpow: // FALLTHROUGH intentional here 3593 case tkmul: // FALLTHROUGH intentional here 3594 case tkdiv: // FALLTHROUGH intentional here 3595 case tkmod: // FALLTHROUGH intentional here 3596 case tkplus: // FALLTHROUGH intentional here 3597 case tkminus: 3598 nleft = to_num(STKP-1); 3599 nright = to_num(STKP); 3600 switch (opcode) { 3601 case tkpow: nleft = pow(nleft, nright); break; 3602 case tkmul: nleft *= nright; break; 3603 case tkdiv: nleft /= nright; break; 3604 case tkmod: nleft = fmod(nleft, nright); break; 3605 case tkplus: nleft += nright; break; 3606 case tkminus: nleft -= nright; break; 3607 } 3608 drop(); 3609 STKP->num = nleft; 3610 break; 3611 3612 // FIXME REDO REDO ? 3613 case tkcat: 3614 to_str(STKP-1); 3615 to_str(STKP); 3616 STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP[0].vst); 3617 drop(); 3618 break; 3619 3620 // Comparisons (with the '<', "<=", "!=", "==", '>', and ">=" 3621 // operators) shall be made numerically if both operands are numeric, 3622 // if one is numeric and the other has a string value that is a numeric 3623 // string, or if one is numeric and the other has the uninitialized 3624 // value. Otherwise, operands shall be converted to strings as required 3625 // and a string comparison shall be made as follows: 3626 // 3627 // For the "!=" and "==" operators, the strings should be compared to 3628 // check if they are identical but may be compared using the 3629 // locale-specific collation sequence to check if they collate equally. 3630 // 3631 // For the other operators, the strings shall be compared using the 3632 // locale-specific collation sequence. 3633 // 3634 // The value of the comparison expression shall be 1 if the relation is 3635 // true, or 0 if the relation is false. 
3636 case tklt: // FALLTHROUGH intentional here 3637 case tkle: // FALLTHROUGH intentional here 3638 case tkne: // FALLTHROUGH intentional here 3639 case tkeq: // FALLTHROUGH intentional here 3640 case tkgt: // FALLTHROUGH intentional here 3641 case tkge: 3642 ; int cmp = 31416; 3643 3644 if ( (IS_NUM(&STKP[-1]) && 3645 (STKP[0].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[0].flags)) || 3646 (IS_NUM(&STKP[0]) && 3647 (STKP[-1].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[-1].flags))) { 3648 switch (opcode) { 3649 case tklt: cmp = STKP[-1].num < STKP[0].num; break; 3650 case tkle: cmp = STKP[-1].num <= STKP[0].num; break; 3651 case tkne: cmp = STKP[-1].num != STKP[0].num; break; 3652 case tkeq: cmp = STKP[-1].num == STKP[0].num; break; 3653 case tkgt: cmp = STKP[-1].num > STKP[0].num; break; 3654 case tkge: cmp = STKP[-1].num >= STKP[0].num; break; 3655 } 3656 } else { 3657 cmp = strcmp(to_str(STKP-1)->vst->str, to_str(STKP)->vst->str); 3658 switch (opcode) { 3659 case tklt: cmp = cmp < 0; break; 3660 case tkle: cmp = cmp <= 0; break; 3661 case tkne: cmp = cmp != 0; break; 3662 case tkeq: cmp = cmp == 0; break; 3663 case tkgt: cmp = cmp > 0; break; 3664 case tkge: cmp = cmp >= 0; break; 3665 } 3666 } 3667 drop(); 3668 drop(); 3669 push_int_val(cmp); 3670 break; 3671 3672 case opmatchrec: 3673 op2 = *ip++; 3674 int mret = match(&FIELD[0], &LITERAL[op2]); 3675 push_int_val(!mret); 3676 break; 3677 3678 case tkmatchop: 3679 case tknotmatch: 3680 mret = match(STKP-1, STKP); // mret == 0 if match 3681 drop(); 3682 drop(); 3683 push_int_val(!mret == (opcode == tkmatchop)); 3684 break; 3685 3686 case tkpowasgn: // FALLTHROUGH intentional here 3687 case tkmodasgn: // FALLTHROUGH intentional here 3688 case tkmulasgn: // FALLTHROUGH intentional here 3689 case tkdivasgn: // FALLTHROUGH intentional here 3690 case tkaddasgn: // FALLTHROUGH intentional here 3691 case tksubasgn: 3692 // Stack is: ... scalar_ref value_to_op_by 3693 // or ... subscript_val map_ref value_to_op_by 3694 // or ... 
fieldref value_to_op_by 3695 v = setup_lvalue(1, parmbase, &field_num); 3696 to_num(v); 3697 to_num(STKP); 3698 switch (opcode) { 3699 case tkpowasgn: 3700 // TODO 3701 v->num = pow(v->num, STKP->num); 3702 break; 3703 case tkmodasgn: 3704 // TODO 3705 v->num = fmod(v->num, STKP->num); 3706 break; 3707 case tkmulasgn: 3708 v->num *= STKP->num; 3709 break; 3710 case tkdivasgn: 3711 v->num /= STKP->num; 3712 break; 3713 case tkaddasgn: 3714 v->num += STKP->num; 3715 break; 3716 case tksubasgn: 3717 v->num -= STKP->num; 3718 break; 3719 } 3720 3721 drop_n(2); 3722 v->flags = ZF_NUM; 3723 push_val(v); 3724 if (field_num >= 0) fixup_fields(field_num); 3725 break; 3726 3727 case tkasgn: 3728 // Stack is: ... scalar_ref value_to_assign 3729 // or ... subscript_val map_ref value_to_assign 3730 // or ... fieldref value_to_assign 3731 v = setup_lvalue(1, parmbase, &field_num); 3732 force_maybemap_to_scalar(STKP); 3733 zvalue_copy(v, STKP); 3734 swap(); 3735 drop(); 3736 if (field_num >= 0) fixup_fields(field_num); 3737 break; 3738 3739 case tkincr: // FALLTHROUGH intentional here 3740 case tkdecr: // FALLTHROUGH intentional here 3741 case oppreincr: // FALLTHROUGH intentional here 3742 case oppredecr: 3743 // Stack is: ... scalar_ref 3744 // or ... subscript_val map_ref 3745 // or ... fieldnum fieldref 3746 v = setup_lvalue(0, parmbase, &field_num); 3747 to_num(v); 3748 switch (opcode) { 3749 case tkincr: case tkdecr: 3750 // Must be done in this order because push_val(v) may move v, 3751 // invalidating the pointer. 3752 v->num += (opcode == tkincr) ? 1 : -1; 3753 push_val(v); 3754 // Now reverse the incr/decr on the top TT.stack val. 3755 STKP->num -= (opcode == tkincr) ? 1 : -1; 3756 break; 3757 case oppreincr: case oppredecr: 3758 v->num += (opcode == oppreincr) ? 
1 : -1; 3759 push_val(v); 3760 break; 3761 } 3762 swap(); 3763 drop(); 3764 if (field_num >= 0) fixup_fields(field_num); 3765 break; 3766 3767 case tknumber: // FALLTHROUGH intentional here 3768 case tkstring: // FALLTHROUGH intentional here 3769 case tkregex: 3770 push_val(&LITERAL[*ip++]); 3771 break; 3772 3773 case tkprint: 3774 case tkprintf: 3775 nargs = *ip++; 3776 int outmode = *ip++; 3777 struct zfile *outfp = TT.zstdout; 3778 switch (outmode) { 3779 case tkgt: outfp = setup_file(1, "w"); break; // file 3780 case tkappend: outfp = setup_file(1, "a"); break; // file 3781 case tkpipe: outfp = setup_file(0, "w"); break; // pipe 3782 default: nargs++; break; 3783 } 3784 nargs--; 3785 if (opcode == tkprintf) { 3786 varprint(fprintf, outfp->fp, nargs); 3787 drop_n(nargs); 3788 break; 3789 } 3790 if (!nargs) { 3791 fprintf(outfp->fp, "%s", to_str(&FIELD[0])->vst->str); 3792 } else { 3793 struct zvalue tempv = uninit_zvalue; 3794 zvalue_copy(&tempv, &STACK[OFS]); 3795 to_str(&tempv); 3796 for (int k = 0; k < nargs; k++) { 3797 if (k) fprintf(outfp->fp, "%s", tempv.vst->str); 3798 int sp = stkn(nargs - 1 - k); 3799 ////// FIXME refcnt -- prob. don't need to copy from TT.stack? 3800 v = &STACK[sp]; 3801 to_str_fmt(v, OFMT); 3802 struct zstring *zs = v->vst; 3803 fprintf(outfp->fp, "%s", zs ? 
zs->str : ""); 3804 } 3805 zvalue_release_zstring(&tempv); 3806 drop_n(nargs); 3807 } 3808 fputs(ENSURE_STR(&STACK[ORS])->vst->str, outfp->fp); 3809 break; 3810 3811 case opdrop: 3812 drop(); 3813 break; 3814 3815 case opdrop_n: 3816 drop_n(*ip++); 3817 break; 3818 3819 // Stack frame layout relative to parmbase: 3820#define RETURN_VALUE -4 3821#define RETURN_ADDR -3 3822#define PREV_PARMBASE -2 3823#define ARG_CNT -1 3824#define FUNCTION_NUM 0 3825 // Actual args follow, starting at parmbase + 1 3826 case tkfunction: // function definition 3827 op2 = *ip++; // func table num 3828 struct functab_slot *pfdef = &FUNC_DEF[op2]; 3829 struct zlist *loctab = &pfdef->function_locals; 3830 int nparms = zlist_len(loctab)-1; 3831 3832 nargs = popnumval(); 3833 int newparmbase = stkn(nargs); 3834 STACK[newparmbase + PREV_PARMBASE].num = parmbase; 3835 parmbase = newparmbase; 3836 for ( ;nargs > nparms; nargs--) 3837 drop(); 3838 for ( ;nargs < nparms; nargs++) { 3839 // Push additional "args" that were not passed by the caller, to 3840 // match the formal parameters (parms) defined in the function 3841 // definition. In the local var table we may have the type as scalar 3842 // or map if it is used as such within the function. In that case we 3843 // init the pushed arg from the type of the locals table. 3844 // But if a var appears only as a bare arg in a function call it will 3845 // not be typed in the locals table. In that case we can only say it 3846 // "may be" a map, but we have to assume the possibility and attach a 3847 // map to the var. When/if the var is used as a map or scalar in the 3848 // called function it will be converted to a map or scalar as 3849 // required. 3850 // See force_maybemap_to_scalar(). 
3851 struct symtab_slot *q = &((struct symtab_slot *)loctab->base)[nargs+1]; 3852 vv = (struct zvalue)ZVINIT(q->flags, 0, 0); 3853 if (vv.flags == 0) { 3854 zvalue_map_init(&vv); 3855 vv.flags = ZF_MAYBEMAP; 3856 } else if (IS_MAP(&vv)) { 3857 zvalue_map_init(&vv); 3858 } else { 3859 vv.flags = 0; 3860 } 3861 push_val(&vv); 3862 } 3863 break; 3864 3865 case tkreturn: 3866 nparms = *ip++; 3867 nargs = STACK[parmbase+ARG_CNT].num; 3868 force_maybemap_to_scalar(STKP); // Unneeded? 3869 zvalue_copy(&STACK[parmbase+RETURN_VALUE], STKP); 3870 drop(); 3871 // Remove the local args (not supplied by caller) from TT.stack, check to 3872 // release any map data created. 3873 while (stkn(0) > parmbase + nargs) { 3874 if ((STKP)->flags & ZF_ANYMAP) { 3875 zmap_delete_map_incl_slotdata((STKP)->map); 3876 xfree((STKP)->map); 3877 } 3878 drop(); 3879 } 3880 while (stkn(0) > parmbase + RETURN_VALUE) 3881 drop(); 3882 ip = &ZCODE[(int)STACK[parmbase+RETURN_ADDR].num]; 3883 parmbase = STACK[parmbase+PREV_PARMBASE].num; 3884 break; 3885 3886 case opprepcall: // function call prep 3887 if (STKP > stackp_needmore) add_stack(&stackp_needmore); 3888 push_int_val(0); // return value placeholder 3889 push_int_val(0); // return addr 3890 push_int_val(0); // parmbase 3891 push_int_val(0); // arg count 3892 push_int_val(*ip++); // function tbl ref 3893 break; 3894 3895 case tkfunc: // function call 3896 nargs = *ip++; 3897 newparmbase = stkn(nargs); 3898 STACK[newparmbase+RETURN_ADDR].num = ip - &ZCODE[0]; 3899 STACK[newparmbase+ARG_CNT].num = nargs; 3900 push_int_val(nargs); // FIXME TODO pass this in a zregister? 
3901 ip = &ZCODE[FUNC_DEF[(int)STACK[newparmbase+FUNCTION_NUM].num].zcode_addr]; 3902 break; 3903 3904 case tkrbracket: // concat multiple map subscripts 3905 nsubscrs = *ip++; 3906 while (--nsubscrs) { 3907 swap(); 3908 to_str(STKP); 3909 push_val(&STACK[SUBSEP]); 3910 to_str(STKP); 3911 STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst); 3912 drop(); 3913 swap(); 3914 to_str(STKP); 3915 STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst); 3916 drop(); 3917 } 3918 break; 3919 3920 case opmapdelete: 3921 case tkdelete: 3922 k = STKP->num; 3923 if (k < 0) k = parmbase - k; // loc of var on TT.stack 3924 v = &STACK[k]; 3925 force_maybemap_to_map(v); 3926 if (opcode == opmapdelete) { 3927 zmap_delete_map(v->map); 3928 } else { 3929 drop(); 3930 zmap_delete(v->map, to_str(STKP)->vst); 3931 } 3932 drop(); 3933 break; 3934 3935 case opmap: 3936 op2 = *ip++; 3937 k = op2 < 0 ? parmbase - op2 : op2; 3938 v = &STACK[k]; 3939 force_maybemap_to_map(v); 3940 if (!IS_MAP(v)) FATAL("scalar in array context"); 3941 v = get_map_val(v, STKP); 3942 drop(); // drop subscript 3943 push_val(v); 3944 break; 3945 3946 case tkin: 3947 if (!(STKP->flags & ZF_ANYMAP)) FATAL("scalar in array context"); 3948 v = zmap_find(STKP->map, to_str(STKP-1)->vst); 3949 drop(); 3950 drop(); 3951 push_int_val(v ? 
1 : 0); 3952 break; 3953 3954 case opmapiternext: 3955 op2 = *ip++; 3956 v = STKP-1; 3957 force_maybemap_to_map(v); 3958 if (!IS_MAP(v)) FATAL("scalar in array context"); 3959 struct zmap *m = v->map; // Need for MAPSLOT macro 3960 int zlen = zlist_len(&m->slot); 3961 int kk = STKP->num + 1; 3962 while (kk < zlen && !(MAPSLOT[kk].key)) // skip deleted slots 3963 kk++; 3964 STKP->num = kk; // save index for next iteration 3965 if (kk < zlen) { 3966 struct zvalue *var = setup_lvalue(2, parmbase, &field_num); 3967 var->flags = ZF_STR; 3968 zstring_release(&var->vst); 3969 var->vst = MAPSLOT[kk].key; 3970 zstring_incr_refcnt(var->vst); 3971 ip += op2; 3972 } 3973 break; 3974 3975 case tkvar: 3976 op2 = *ip++; 3977 k = op2 < 0 ? parmbase - op2 : op2; 3978 v = &STACK[k]; 3979 push_val(v); 3980 break; 3981 3982 case tkfield: 3983 // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void) 3984 // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]). 3985 ip++; // skip dummy "operand" instruction field 3986 push_field((int)(to_num(STKP))); 3987 3988 swap(); 3989 drop(); 3990 break; 3991 3992 case oppush: 3993 push_int_val(*ip++); 3994 break; 3995 3996 case tkand: 3997 op2 = *ip++; 3998 if (get_set_logical()) drop(); 3999 else ip += op2; 4000 break; 4001 4002 case tkor: 4003 op2 = *ip++; 4004 if (!get_set_logical()) drop(); 4005 else ip += op2; 4006 break; 4007 4008 case tkwhile: 4009 (STKP)->num = ! 
get_set_logical(); 4010 ATTR_FALLTHROUGH_INTENDED; 4011 // FALLTHROUGH to tkternif 4012 case tkif: 4013 // FALLTHROUGH to tkternif 4014 case tkternif: 4015 op2 = *ip++; 4016 int t = get_set_logical(); // FIXME only need to get, not set 4017 drop(); 4018 if (!t) ip += op2; 4019 break; 4020 4021 case tkelse: // FALLTHROUGH intentional here 4022 case tkternelse: // FALLTHROUGH intentional here 4023 case tkbreak: // FALLTHROUGH intentional here 4024 case tkcontinue: // FALLTHROUGH intentional here 4025 case opjump: 4026 op2 = *ip++; 4027 ip += op2; 4028 break; 4029 4030 case opvarref: 4031 op2 = *ip++; 4032 vv = (struct zvalue)ZVINIT(ZF_REF, op2, 0); 4033 push_val(&vv); 4034 break; 4035 4036 case opmapref: 4037 op2 = *ip++; 4038 vv = (struct zvalue)ZVINIT(ZF_MAPREF, op2, 0); 4039 push_val(&vv); 4040 break; 4041 4042 case opfldref: 4043 to_num(STKP); 4044 (STKP)->flags |= ZF_FIELDREF; 4045 ip++; // skip dummy "operand" instruction field 4046 break; 4047 4048 case opprintrec: 4049 puts(to_str(&FIELD[0])->vst->str); 4050 break; 4051 4052 case oprange1: 4053 range_num = *ip++; 4054 op2 = *ip++; 4055 if (TT.range_sw[range_num]) ip += op2; 4056 break; 4057 4058 case oprange2: 4059 range_num = *ip++; 4060 op2 = *ip++; 4061 t = get_set_logical(); // FIXME only need to get, not set 4062 drop(); 4063 if (t) TT.range_sw[range_num] = 1; 4064 else ip += op2; 4065 break; 4066 4067 case oprange3: 4068 range_num = *ip++; 4069 t = get_set_logical(); // FIXME only need to get, not set 4070 drop(); 4071 if (t) TT.range_sw[range_num] = 0; 4072 break; 4073 4074 case tkexit: 4075 r = popnumval(); 4076 if (r != NO_EXIT_STATUS) *status = (int)r & 255; 4077 // TODO FIXME do we need NO_EXIT_STATUS at all? Just use 0? 
4078 ATTR_FALLTHROUGH_INTENDED; 4079 case tknext: 4080 case tknextfile: 4081 return opcode; 4082 4083 case tkgetline: 4084 nargs = *ip++; 4085 int source = *ip++; 4086 // TT.stack is: 4087 // if tkgetline 0 tkeof: (nothing stacked; plain getline) 4088 // if tkgetline 1 tkeof: (lvalue) 4089 // if tkgetline 1 tklt: (filename_string) 4090 // if tkgetline 2 tklt: (lvalue) (filename_string) 4091 // if tkgetline 1 tkpipe: (pipe_command_string) 4092 // if tkgetline 2 tkpipe: (pipe_command_string) (lvalue) 4093 // effect is to set: 4094 // if tkgetline 0 tkeof: $0 NF NR FNR 4095 // if tkgetline 1 tkeof: var NR FNR 4096 // if tkgetline 1 tklt: $0 NF 4097 // if tkgetline 2 tklt: var 4098 // if tkgetline 1 tkpipe: $0 NF 4099 // if tkgetline 2 tkpipe: var 4100 // Ensure pipe cmd on top 4101 if (nargs == 2 && source == tkpipe) swap(); 4102 struct zfile *zfp = 0; 4103 if (source == tklt || source == tkpipe) { 4104 zfp = setup_file(source == tklt, "r"); 4105 nargs--; 4106 } 4107 // now cases are: 4108 // nargs source TT.stack 4109 // 0 tkeof: (nothing; plain getline) from current data file 4110 // 1 tkeof: (lvalue) from current data file 4111 // 0 tklt: (nothing) from named file in 'stream' 4112 // 1 tklt: (lvalue) from named file in 'stream' 4113 // 0 tkpipe: (nothing) from piped command in 'stream' 4114 // 1 tkpipe: (lvalue) from piped command in 'stream' 4115 v = nargs ? 
setup_lvalue(0, parmbase, &field_num) : 0; 4116 if (v) drop(); 4117 // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe) 4118 // stream is name of file or pipe 4119 // v is NULL or an lvalue ref 4120 if (zfp != badfile) push_int_val(awk_getline(source, zfp, v)); 4121 else push_int_val(-1); 4122 4123 // fake return value for now 4124 break; 4125 4126 ////// builtin functions /////// 4127 4128 case tksplit: 4129 nargs = *ip++; 4130 if (nargs == 2) push_val(&STACK[FS]); 4131 struct zstring *s = to_str(STKP-2)->vst; 4132 force_maybemap_to_map(STKP-1); 4133 struct zvalue *a = STKP-1; 4134 struct zvalue *fs = STKP; 4135 zmap_delete_map(a->map); 4136 k = split(s, a, fs); 4137 drop_n(3); 4138 push_int_val(k); 4139 break; 4140 4141 case tkmatch: 4142 nargs = *ip++; 4143 if (!IS_RX(STKP)) to_str(STKP); 4144 regex_t rx_pat, *rxp = &rx_pat; 4145 rx_zvalue_compile(&rxp, STKP); 4146 regoff_t rso = 0, reo = 0; // shut up warning (may be uninit) 4147 k = rx_find(rxp, to_str(STKP-1)->vst->str, &rso, &reo, 0); 4148 rx_zvalue_free(rxp, STKP); 4149 // Force these to num before setting. 4150 to_num(&STACK[RSTART]); 4151 to_num(&STACK[RLENGTH]); 4152 if (k) STACK[RSTART].num = 0, STACK[RLENGTH].num = -1; 4153 else { 4154 reo = utf8cnt(STKP[-1].vst->str, reo); 4155 rso = utf8cnt(STKP[-1].vst->str, rso); 4156 STACK[RSTART].num = rso + 1, STACK[RLENGTH].num = reo - rso; 4157 } 4158 drop(); 4159 drop(); 4160 push_int_val(k ? 
0 : rso + 1); 4161 break; 4162 4163 case tksub: 4164 case tkgsub: 4165 gsub(opcode, *ip++, parmbase); // tksub/tkgsub, args 4166 break; 4167 4168 case tksubstr: 4169 nargs = *ip++; 4170 struct zstring *zz = to_str(STKP - nargs + 1)->vst; 4171 int nchars = utf8cnt(zz->str, zz->size); // number of utf8 codepoints 4172 // Offset of start of string (in chars not bytes); convert 1-based to 0-based 4173 ssize_t mm = CLAMP(trunc(to_num(STKP - nargs + 2)) - 1, 0, nchars); 4174 ssize_t nn = nchars - mm; // max possible substring length (chars) 4175 if (nargs == 3) nn = CLAMP(trunc(to_num(STKP)), 0, nn); 4176 mm = bytesinutf8(zz->str, zz->size, mm); 4177 nn = bytesinutf8(zz->str + mm, zz->size - mm, nn); 4178 struct zstring *zzz = new_zstring(zz->str + mm, nn); 4179 zstring_release(&(STKP - nargs + 1)->vst); 4180 (STKP - nargs + 1)->vst = zzz; 4181 drop_n(nargs - 1); 4182 break; 4183 4184 case tkindex: 4185 nargs = *ip++; 4186 char *s1 = to_str(STKP-1)->vst->str; 4187 char *s3 = strstr(s1, to_str(STKP)->vst->str); 4188 ptrdiff_t offs = s3 ? 
utf8cnt(s1, s3 - s1) + 1 : 0; 4189 drop(); 4190 drop(); 4191 push_int_val(offs); 4192 break; 4193 4194 case tkband: 4195 case tkbor: 4196 case tkbxor: 4197 case tklshift: 4198 case tkrshift: 4199 ; size_t acc = to_num(STKP); 4200 nargs = *ip++; 4201 for (int i = 1; i < nargs; i++) switch (opcode) { 4202 case tkband: acc &= (size_t)to_num(STKP-i); break; 4203 case tkbor: acc |= (size_t)to_num(STKP-i); break; 4204 case tkbxor: acc ^= (size_t)to_num(STKP-i); break; 4205 case tklshift: acc = (size_t)to_num(STKP-i) << acc; break; 4206 case tkrshift: acc = (size_t)to_num(STKP-i) >> acc; break; 4207 } 4208 drop_n(nargs); 4209 push_int_val(acc); 4210 break; 4211 4212 case tktolower: 4213 case tktoupper: 4214 nargs = *ip++; 4215 struct zstring *z = to_str(STKP)->vst; 4216 unsigned zzlen = z->size + 4; // Allow for expansion 4217 zz = zstring_update(0, zzlen, "", 0); 4218 char *p = z->str, *e = z->str + z->size, *q = zz->str; 4219 // Similar logic to toybox strlower(), but fixed. 4220 while (p < e) { 4221 unsigned wch; 4222 int len = utf8towc(&wch, p, e-p); 4223 if (len < 1) { // nul byte, error, or truncated code 4224 *q++ = *p++; 4225 continue; 4226 } 4227 p += len; 4228 wch = (opcode == tktolower ? towlower : towupper)(wch); 4229 len = wctoutf8(q, wch); 4230 q += len; 4231 // Need realloc here if overflow possible 4232 if ((len = q - zz->str) + 4 < (int)zzlen) continue; 4233 zz = zstring_update(zz, zzlen = len + 16, "", 0); 4234 q = zz->str + len; 4235 } 4236 *q = 0; 4237 zz->size = q - zz->str; 4238 zstring_release(&z); 4239 STKP->vst = zz; 4240 break; 4241 4242 case tklength: 4243 nargs = *ip++; 4244 v = nargs ? 
STKP : &FIELD[0]; 4245 force_maybemap_to_map(v); 4246 if (IS_MAP(v)) k = v->map->count - v->map->deleted; 4247 else { 4248 to_str(v); 4249 k = utf8cnt(v->vst->str, v->vst->size); 4250 } 4251 if (nargs) drop(); 4252 push_int_val(k); 4253 break; 4254 4255 case tksystem: 4256 nargs = *ip++; 4257 fflush(stdout); 4258 fflush(stderr); 4259 r = system(to_str(STKP)->vst->str); 4260#ifdef WEXITSTATUS 4261 // WEXITSTATUS is in sys/wait.h, but I'm not including that. 4262 // It seems to also be in stdlib.h in gcc and musl-gcc. 4263 // No idea how portable this is! 4264 if (WIFEXITED(r)) r = WEXITSTATUS(r); 4265#endif 4266 drop(); 4267 push_int_val(r); 4268 break; 4269 4270 case tkfflush: 4271 nargs = *ip++; 4272 r = fflush_file(nargs); 4273 if (nargs) drop(); 4274 push_int_val(r); 4275 break; 4276 4277 case tkclose: 4278 nargs = *ip++; 4279 r = close_file(to_str(STKP)->vst->str); 4280 drop(); 4281 push_int_val(r); 4282 break; 4283 4284 case tksprintf: 4285 nargs = *ip++; 4286 zstring_release(&TT.rgl.zspr); 4287 TT.rgl.zspr = new_zstring("", 0); 4288 varprint(fsprintf, 0, nargs); 4289 drop_n(nargs); 4290 vv = (struct zvalue)ZVINIT(ZF_STR, 0, TT.rgl.zspr); 4291 push_val(&vv); 4292 break; 4293 4294 // Math builtins -- move here (per Oliver Webb suggestion) 4295 case tkatan2: 4296 nargs = *ip++; 4297 d = atan2(to_num(STKP-1), to_num(STKP)); 4298 drop(); 4299 STKP->num = d; 4300 break; 4301 case tkrand: 4302 nargs = *ip++; 4303 push_int_val(0); 4304 // Get all 53 mantissa bits in play: 4305 // (upper 26 bits * 2^27 + upper 27 bits) / 2^53 4306 STKP->num = 4307 ((random() >> 5) * 134217728.0 + (random() >> 4)) / 9007199254740992.0; 4308 break; 4309 case tksrand: 4310 nargs = *ip++; 4311 if (nargs == 1) { 4312 STKP->num = seedrand(to_num(STKP)); 4313 } else push_int_val(seedrand(millinow())); 4314 break; 4315 case tkcos: case tksin: case tkexp: case tklog: case tksqrt: case tkint: 4316 nargs = *ip++; 4317 STKP->num = mathfunc[opcode-tkcos](to_num(STKP)); 4318 break; 4319 4320 
default: 4321 // This should never happen: 4322 error_exit("!!! Unimplemented opcode %d", opcode); 4323 } 4324 } 4325 return opquit; 4326} 4327 4328// interp() wraps the main interpreter loop interpx(). The main purpose 4329// is to allow the TT.stack to be readjusted after an 'exit' from a function. 4330// Also catches errors, as the normal operation should leave the TT.stack 4331// depth unchanged after each run through the rules. 4332static int interp(int start, int *status) 4333{ 4334 int stkptrbefore = stkn(0); 4335 int r = interpx(start, status); 4336 // If exit from function, TT.stack will be loaded with args etc. Clean it. 4337 if (r == tkexit) { 4338 // TODO FIXME is this safe? Just remove extra entries? 4339 STKP = &STACK[stkptrbefore]; 4340 } 4341 if (stkn(0) - stkptrbefore) 4342 error_exit("!!AWK BUG stack pointer offset: %d", stkn(0) - stkptrbefore); 4343 return r; 4344} 4345 4346static void insert_argv_map(struct zvalue *map, int key, char *value) 4347{ 4348 struct zvalue zkey = ZVINIT(ZF_STR, 0, num_to_zstring(key, ENSURE_STR(&STACK[CONVFMT])->vst->str)); 4349 struct zvalue *v = get_map_val(map, &zkey); 4350 zvalue_release_zstring(&zkey); 4351 zvalue_release_zstring(v); 4352 *v = new_str_val(value); 4353 check_numeric_string(v); 4354} 4355 4356static void init_globals(int optind, int argc, char **argv, char *sepstring, 4357 struct arg_list *assign_args) 4358{ 4359 // Global variables reside at the bottom of the TT.stack. Start with the awk 4360 // "special variables": ARGC, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF, 4361 // NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP 4362 4363 STACK[CONVFMT] = new_str_val("%.6g"); 4364 // Init ENVIRON map. 
4365 struct zvalue m = ZVINIT(ZF_MAP, 0, 0); 4366 zvalue_map_init(&m); 4367 STACK[ENVIRON] = m; 4368 for (char **pkey = environ; *pkey; pkey++) { 4369 char *pval = strchr(*pkey, '='); 4370 if (!pval) continue; 4371 struct zvalue zkey = ZVINIT(ZF_STR, 0, new_zstring(*pkey, pval - *pkey)); 4372 struct zvalue *v = get_map_val(&m, &zkey); 4373 zstring_release(&zkey.vst); 4374 if (v->vst) FFATAL("env var dup? (%s)", pkey); 4375 *v = new_str_val(++pval); // FIXME refcnt 4376 check_numeric_string(v); 4377 } 4378 4379 // Init ARGV map. 4380 m = (struct zvalue)ZVINIT(ZF_MAP, 0, 0); 4381 zvalue_map_init(&m); 4382 STACK[ARGV] = m; 4383 insert_argv_map(&m, 0, TT.progname); 4384 int nargc = 1; 4385 for (int k = optind; k < argc; k++) { 4386 insert_argv_map(&m, nargc, argv[k]); 4387 nargc++; 4388 } 4389 4390 // Init rest of the awk special variables. 4391 STACK[ARGC] = (struct zvalue)ZVINIT(ZF_NUM, nargc, 0); 4392 STACK[FILENAME] = new_str_val(""); 4393 STACK[FNR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); 4394 STACK[FS] = new_str_val(sepstring); 4395 STACK[NF] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); 4396 STACK[NR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); 4397 STACK[OFMT] = new_str_val("%.6g"); 4398 STACK[OFS] = new_str_val(" "); 4399 STACK[ORS] = new_str_val("\n"); 4400 STACK[RLENGTH] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); 4401 STACK[RS] = new_str_val("\n"); 4402 STACK[RSTART] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0); 4403 STACK[SUBSEP] = new_str_val("\034"); 4404 4405 // Init program globals. 4406 // 4407 // Push global variables on the TT.stack at offsets matching their index in the 4408 // global var table. In the global var table we may have the type as scalar 4409 // or map if it is used as such in the program. In that case we init the 4410 // pushed arg from the type of the globals table. 4411 // But if a global var appears only as a bare arg in a function call it will 4412 // not be typed in the globals table. 
In that case we can only say it "may be" 4413 // a map, but we have to assume the possibility and attach a map to the 4414 // var. When/if the var is used as a map or scalar in the called function it 4415 // will be converted to a map or scalar as required. 4416 // See force_maybemap_to_scalar(), and the similar comment in 4417 // 'case tkfunction:' above. 4418 // 4419 int gstx, len = zlist_len(&TT.globals_table); 4420 for (gstx = TT.spec_var_limit; gstx < len; gstx++) { 4421 struct symtab_slot gs = GLOBAL[gstx]; 4422 struct zvalue v = ZVINIT(gs.flags, 0, 0); 4423 if (v.flags == 0) { 4424 zvalue_map_init(&v); 4425 v.flags = ZF_MAYBEMAP; 4426 } else if (IS_MAP(&v)) { 4427 zvalue_map_init(&v); 4428 } else { 4429 // Set SCALAR flag 0 to create "uninitialized" scalar. 4430 v.flags = 0; 4431 } 4432 push_val(&v); 4433 } 4434 4435 // Init -v assignment options. 4436 for (struct arg_list *p = assign_args; p; p = p->next) { 4437 char *asgn = p->arg; 4438 char *val = strchr(asgn, '='); 4439 if (!val) error_exit("bad -v assignment format"); 4440 *val++ = 0; 4441 assign_global(asgn, val); 4442 } 4443 4444 TT.rgl.cur_arg = new_str_val("<cmdline>"); 4445 uninit_string_zvalue = new_str_val(""); 4446 zvalue_copy(&FIELD[0], &uninit_string_zvalue); 4447} 4448 4449static void run_files(int *status) 4450{ 4451 int r = 0; 4452 while (r != tkexit && *status < 0 && getrec_f0() >= 0) 4453 if ((r = interp(TT.cgl.first_recrule, status)) == tknextfile) next_fp(); 4454} 4455 4456static void free_literal_regex(void) 4457{ 4458 int len = zlist_len(&TT.literals); 4459 for (int k = 1; k < len; k++) 4460 if (IS_RX(&LITERAL[k])) regfree(LITERAL[k].rx); 4461} 4462 4463static void run(int optind, int argc, char **argv, char *sepstring, 4464 struct arg_list *assign_args) 4465{ 4466 char *printf_fmt_rx = "%[-+ #0']*([*]|[0-9]*)([.]([*]|[0-9]*))?l?[aAdiouxXfFeEgGcs%]"; 4467 init_globals(optind, argc, argv, sepstring, assign_args); 4468 TT.cfile = xzalloc(sizeof(struct zfile)); 4469 
xregcomp(&TT.rx_default, "[ \t\n]+", REG_EXTENDED); 4470 xregcomp(&TT.rx_last, "[ \t\n]+", REG_EXTENDED); 4471 xregcomp(&TT.rx_printf_fmt, printf_fmt_rx, REG_EXTENDED); 4472 new_file("-", stdin, 'r', 'f')->is_std_file = 1; 4473 new_file("/dev/stdin", stdin, 'r', 'f')->is_std_file = 1; 4474 new_file("/dev/stdout", stdout, 'w', 'f')->is_std_file = 1; 4475 TT.zstdout = TT.zfiles; 4476 new_file("/dev/stderr", stderr, 'w', 'f')->is_std_file = 1; 4477 seedrand(123); 4478 int status = -1, r = 0; 4479 if (TT.cgl.first_begin) r = interp(TT.cgl.first_begin, &status); 4480 if (r != tkexit) 4481 if (TT.cgl.first_recrule) run_files(&status); 4482 if (TT.cgl.first_end) r = interp(TT.cgl.first_end, &status); 4483 regfree(&TT.rx_printf_fmt); 4484 regfree(&TT.rx_default); 4485 regfree(&TT.rx_last); 4486 free_literal_regex(); 4487 close_file(0); // close all files 4488 if (status >= 0) exit(status); 4489} 4490 4491//////////////////// 4492//// main 4493//////////////////// 4494 4495static void progfiles_init(char *progstring, struct arg_list *prog_args) 4496{ 4497 TT.scs->p = progstring ? progstring : " " + 2; 4498 TT.scs->progstring = progstring; 4499 TT.scs->prog_args = prog_args; 4500 TT.scs->filename = "(cmdline)"; 4501 TT.scs->maxtok = 256; 4502 TT.scs->tokstr = xzalloc(TT.scs->maxtok); 4503} 4504 4505static int awk(char *sepstring, char *progstring, struct arg_list *prog_args, 4506 struct arg_list *assign_args, int optind, int argc, char **argv, 4507 int opt_run_prog) 4508{ 4509 struct scanner_state ss = {0}; 4510 TT.scs = &ss; 4511 4512 setlocale(LC_NUMERIC, ""); 4513 progfiles_init(progstring, prog_args); 4514 compile(); 4515 4516 if (TT.cgl.compile_error_count) 4517 error_exit("%d syntax error(s)", TT.cgl.compile_error_count); 4518 else { 4519 if (opt_run_prog) 4520 run(optind, argc, argv, sepstring, assign_args); 4521 } 4522 4523 return TT.cgl.compile_error_count; 4524} 4525 4526void awk_main(void) 4527{ 4528 char *sepstring = TT.F ? 
escape_str(TT.F, 0) : " "; 4529 int optind = 0; 4530 char *progstring = NULL; 4531 4532 TT.pbuf = toybuf; 4533 toys.exitval = 2; 4534 if (!TT.f) { 4535 if (*toys.optargs) progstring = toys.optargs[optind++]; 4536 else error_exit("No program string\n"); 4537 } 4538 TT.progname = toys.which->name; 4539 toys.exitval = awk(sepstring, progstring, TT.f, TT.v, 4540 optind, toys.optc, toys.optargs, !FLAG(c)); 4541} 4542