xref: /third_party/toybox/toys/pending/awk.c (revision 0f66f451)
1/* awk.c - An awk implementation.
2 * vi: tabstop=2 softtabstop=2 shiftwidth=2
3 *
4 * Copyright 2024 Ray Gardner <raygard@gmail.com>
5 *
6 * See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html
7
8USE_AWK(NEWTOY(awk, "F:v*f*bc", TOYFLAG_USR|TOYFLAG_BIN))
9
10config AWK
11  bool "awk"
12  default n
13  help
14    usage:  awk [-F sepstring] [-v assignment]... program [argument...]
15      or:
16            awk [-F sepstring] -f progfile [-f progfile]... [-v assignment]...
17                  [argument...]
18      also:
19      -b : use bytes, not characters
20      -c : compile only, do not run
21*/
22
23#define FOR_awk
24#include "toys.h"
25
GLOBALS(
  // Option arguments. Presumably these receive the -f, -v and -F arguments
  // named in the NEWTOY() option string -- TODO confirm against toys.h.
  struct arg_list *f;
  struct arg_list *v;
  char *F;

  // Lexical scanner state; used by the functions in the "scan" section.
  struct scanner_state {
      char *p;                    // read position; advanced by get_char()
      char *progstring;           // non-null when get_char() reads a program string
      struct arg_list *prog_args; // program files not yet opened (progfile_open())
      char *filename;             // current program file; printed by zzerr()
      char *line;                 // getline() buffer holding the current line
      size_t line_size;           // allocated size of line, managed by getline()
      ssize_t line_len;           // result of the latest getline()
      int line_num;               // zeroed by progfile_open(), bumped per line
      int ch;                     // current character, set by gch()
      FILE *fp;                   // open program file; 0 between files
      // state includes latest token seen
      int tok;                    // enum tokens value
      int tokbuiltin;             // builtin's token number when tok is tkbuiltin
      int toktype;                // enum toktypes value, EOF, or a char/tok+200
      char *tokstr;               // token text, kept NUL terminated
      size_t maxtok;              // allocated size of tokstr
      size_t toklen;              // bytes currently in tokstr
      double numval;              // value of the latest number token
      int error;  // Set if lexical error.
  } *scs;
  char *tokstr;
  int prevtok;

  // Compiler (parser / code generator) state.
  struct compiler_globals {
    int in_print_stmt;
    int paren_level;
    int in_function_body;
    int funcnum;
    int nparms;
    int compile_error_count;  // bumped by zzerr() for non-fatal, non-warning messages
    int first_begin;
    int last_begin;
    int first_end;
    int last_end;
    int first_recrule;
    int last_recrule;
    int break_dest;
    int continue_dest;
    int stack_offset_to_fix;  // fixup stack if return in for(e in a)
    int range_pattern_num;
    int rule_type;  // tkbegin, tkend, or 0
  } cgl;

  // zvalue: the main awk value type
  // Can be number or string or both, or else map (array) or regex
  struct zvalue {
    unsigned flags;   // ZF_* bits
    double num;
    union { // anonymous union not in C99; not going to fix it now.
      struct zstring *vst;
      struct zmap *map;
      regex_t *rx;
    };
  } nozvalue;   // to shut up compiler warning TODO FIXME

  // Runtime (interpreter) state.
  struct runtime_globals {
    struct zvalue cur_arg;
    //char *filename;     // UNUSED
    FILE *fp;           // current data file
    int narg;           // cmdline arg index
    int nfiles;         // num of cmdline data file args processed
    int eof;            // all cmdline files (incl. stdin) read
    char *recptr;
    char *recbuf;
    size_t recbufsize;
    char *recbuf_multx;
    size_t recbufsize_multx;
    struct zstring *zspr;      // Global to receive sprintf() string value
  } rgl;

  // Expanding sequential list
  // base..limit is the allocated area, avail is the first unused byte and
  // size is the size of one element (see the zlist_* functions).
  struct zlist {
    char *base, *limit, *avail;
    size_t size;
  } globals_table,  // global symbol table
    locals_table,     // local symbol table
    func_def_table;  // function symbol table
  // runtime lists
  struct zlist literals, fields, zcode, stack;

  char *progname;   // printed as the prefix of every zzerr() message

  int spec_var_limit;
  int zcode_last;
  struct zvalue *stackp;  // top of stack ptr

  char *pbuf;   // Used for number formatting in num_to_zstring()
#define RS_MAX  64
  char rs_last[RS_MAX];
  regex_t rx_rs_default, rx_rs_last;
  regex_t rx_default, rx_last, rx_printf_fmt;
#define FS_MAX  64
  char fs_last[FS_MAX];
  char one_char_fs[4];
  int nf_internal;  // should match NF
  char range_sw[64];   // FIXME TODO quick and dirty set of range switches
  int file_cnt, std_file_cnt;

  // One entry per open file or pipe, linked through next.
  struct zfile {
    struct zfile *next;
    char *fn;
    FILE *fp;
    char mode;  // w, a, or r
    char file_or_pipe;  // f or p
    char is_std_file;
    char *recbuf;
    size_t recbufsize;
    char *recbuf_multi;
    size_t recbufsize_multi;
    char *recbuf_multx;
    size_t recbufsize_multx;
    int recoffs, endoffs;
  } *zfiles, *cfile, *zstdout;
)
146
// ATTR_FALLTHROUGH_INTENDED marks a deliberate fall-through between switch
// cases. The attribute is emitted only when the compiler reports support for
// it: defining __GNUC__ alone is not enough, since GCC releases before 7
// define it but lack the fallthrough statement attribute. Everywhere else
// the macro expands to nothing.
#if defined(__has_attribute)
# if __has_attribute(fallthrough)
#  define ATTR_FALLTHROUGH_INTENDED __attribute__ ((fallthrough))
# endif
#endif
#ifndef ATTR_FALLTHROUGH_INTENDED
# define ATTR_FALLTHROUGH_INTENDED
#endif
152
153////////////////////
154////   declarations
155////////////////////
156
157#define PBUFSIZE  512 // For num_to_zstring()
158
// Coarse token classes stored in scanner_state.toktype. The scanner also
// stores EOF, a raw operator character, or (token number + 200) there for
// operator tokens, so these values are not the only ones toktype can hold.
enum toktypes {
    // EOF (use -1 from stdio.h)
    ERROR = 2, NEWLINE, VAR, NUMBER, STRING, REGEX, USERFUNC, BUILTIN, TOKEN,
    KEYWORD
    };
164
165// Must align with lbp_table[]
// Token numbers. The order matters: the scanner converts a position in the
// ops, keywords and builtins strings straight into one of these values
// (tksemi, tkin and tkatan2 are the respective first entries), so each group
// below must list its members in the same order as the matching string.
enum tokens {
    tkunusedtoken, tkeof, tkerr, tknl,
    tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,

// static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -     "
//        "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";
    tksemi, tkcomma, tklbracket, tkrbracket, tklparen, tkrparen, tklbrace,
    tkrbrace, tkfield, tkincr, tkdecr, tkpow, tknot, tkmul, tkdiv, tkmod,
    tkplus, tkminus,
    tkcat, // !!! Fake operator for concatenation (just adjacent string exprs)
    tklt, tkle, tkne, tkeq, tkgt, tkge, tkmatchop, tknotmatch, tkand, tkor,
    tkternif, tkternelse, tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, tkappend, tkpipe,

// static char *keywords = " in        BEGIN     END       if        else      "
//    "while     for       do        break     continue  exit      function  "
//    "return    next      nextfile  delete    print     printf    getline   ";
    tkin, tkbegin, tkend, tkif, tkelse,
    tkwhile, tkfor, tkdo, tkbreak, tkcontinue, tkexit, tkfunction,
    tkreturn, tknext, tknextfile, tkdelete, tkprint, tkprintf, tkgetline,

// static char *builtins = " atan2     cos       sin       exp       "
//    "log       sqrt      int       rand      srand     length    "
//    "tolower   toupper   system    fflush    "
//    "and       or        xor       lshift    rshift    ";
    tkatan2, tkcos, tksin, tkexp, tklog, tksqrt, tkint, tkrand, tksrand,
    tklength, tktolower, tktoupper, tksystem, tkfflush,
    tkband, tkbor, tkbxor, tklshift, tkrshift,

// static char *specialfuncs = " close     index     match     split     "
//    "sub       gsub      sprintf   substr    ";
    tkclose, tkindex, tkmatch, tksplit,
    tksub, tkgsub, tksprintf, tksubstr, tklasttk
    };
200
// Opcodes that have no token counterpart. Numbering starts at tklasttk, so
// every opcode value is distinct from every enum tokens value.
enum opcodes {
    opunusedop = tklasttk,
    opvarref, opmapref, opfldref, oppush, opdrop, opdrop_n, opnotnot,
    oppreincr, oppredecr, oppostincr, oppostdecr, opnegate, opjump, opjumptrue,
    opjumpfalse, opprepcall, opmap, opmapiternext, opmapdelete, opmatchrec,
    opquit, opprintrec, oprange1, oprange2, oprange3, oplastop
};
208
209// Special variables (POSIX). Must align with char *spec_vars[]
// Numbering starts at 1 (ARGC), not 0.
enum spec_var_names { ARGC=1, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
    NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP };
212
// Element type of the globals and locals tables (see the GLOBAL and LOCAL
// accessor macros). struct functab_slot starts with the same three members.
struct symtab_slot {    // global symbol table entry
  unsigned flags;       // ZF_* bits
  int slotnum;
  char *name;
};
218
219// zstring: flexible string type.
220// Capacity must be > size because we insert a NUL byte.
struct zstring {
  int refcnt;         // references beyond the first; freed on release when 0
  unsigned size;      // bytes in str, not counting the terminating NUL
  unsigned capacity;  // bytes allocated for str
  char str[];   // C99 flexible array member
};
227
228// Flag bits for zvalue and symbol tables
#define ZF_MAYBEMAP (1u << 1)
#define ZF_MAP      (1u << 2)
#define ZF_SCALAR   (1u << 3)
#define ZF_NUM      (1u << 4)
#define ZF_RX       (1u << 5)
#define ZF_STR      (1u << 6)
#define ZF_NUMSTR   (1u << 7)   // "numeric string" per posix
#define ZF_REF      (1u << 9)   // for lvalues
#define ZF_MAPREF   (1u << 10)  // for lvalues
#define ZF_FIELDREF (1u << 11)  // for lvalues
#define ZF_EMPTY_RX (1u << 12)
#define ZF_ANYMAP   (ZF_MAP | ZF_MAYBEMAP)

// Macro to help facilitate possible future change in zvalue layout.
// Expands to a brace initializer: flags, num, then the first union member.
#define ZVINIT(flags, num, ptr) {(flags), (double)(num), {(ptr)}}

// Flag tests on a zvalue pointer. Each yields the masked flag bits, i.e.
// zero or nonzero, not necessarily 1.
#define IS_STR(zvalp) ((zvalp)->flags & ZF_STR)
#define IS_RX(zvalp) ((zvalp)->flags & ZF_RX)
#define IS_NUM(zvalp) ((zvalp)->flags & ZF_NUM)
#define IS_MAP(zvalp) ((zvalp)->flags & ZF_MAP)
#define IS_EMPTY_RX(zvalp) ((zvalp)->flags & ZF_EMPTY_RX)

// Typed views of the zlist storage areas. The base pointer can change when
// a list is expanded, so a pointer obtained through these macros must not
// be held across an append to the same list.
#define GLOBAL      ((struct symtab_slot *)TT.globals_table.base)
#define LOCAL       ((struct symtab_slot *)TT.locals_table.base)
#define FUNC_DEF    ((struct functab_slot *)TT.func_def_table.base)

#define LITERAL     ((struct zvalue *)TT.literals.base)
#define STACK       ((struct zvalue *)TT.stack.base)
#define FIELD       ((struct zvalue *)TT.fields.base)

#define ZCODE       ((int *)TT.zcode.base)

#define FUNC_DEFINED    (1u)
#define FUNC_CALLED     (2u)

#define MIN_STACK_LEFT 1024
265
// Element type of the function table (see the FUNC_DEF accessor macro).
struct functab_slot {    // function symbol table entry
  unsigned flags;        // presumably FUNC_DEFINED / FUNC_CALLED -- TODO confirm
  int slotnum;
  char *name;
  struct zlist function_locals;
  int zcode_addr;
};
273
274// Elements of the hash table (key/value pairs)
struct zmap_slot {
  int hash;       // store hash key to speed hash table expansion
  struct zstring *key;  // zmap_delete() releases this, which also nulls it
  struct zvalue val;
};
// Brace initializer for a zmap_slot, members in declaration order.
#define ZMSLOTINIT(hash, key, val) {hash, key, val}
281
282// zmap: Mapping data type for arrays; a hash table. Values in hash are either
283// 0 (unused), -1 (marked deleted), or one plus the number of the zmap slot
284// containing a key/value pair. The zlist slot entries are numbered from 0 to
285// count-1, so need to add one to distinguish from unused.  The probe sequence
286// is borrowed from Python dict, using the "perturb" idea to mix in upper bits
287// of the original hash value.
struct zmap {
  unsigned mask;  // tablesize - 1; tablesize is 2 ** n
  int *hash;      // (mask + 1) elements
  int limit;      // 80% of table size ((mask+1)*8/10)
  int count;      // number of occupied slots in hash
                  // (incremented per insert; zmap_delete() does not lower it)
  int deleted;    // number of deleted slots
  struct zlist slot;     // expanding list of zmap_slot elements
};
296
// Slot array of the map named 'm'; usable only where a struct zmap *m is in
// scope.
#define MAPSLOT    ((struct zmap_slot *)(m->slot).base)
// Diagnostics, all routed through zzerr(). The leading '$' that FFATAL and
// FATAL put in front of the format makes zzerr() treat the message as fatal.
// FFATAL and XERR need at least one argument after the format.
#define FFATAL(format, ...) zzerr("$" format, __VA_ARGS__)
#define FATAL(...) zzerr("$%s\n", __VA_ARGS__)
#define XERR(format, ...) zzerr(format, __VA_ARGS__)

#define NO_EXIT_STATUS  (9999987)  // value unlikely to appear in exit stmt
303
304ssize_t getline(char **lineptr, size_t *n, FILE *stream);
305ssize_t getdelim(char ** restrict lineptr, size_t * restrict n, int delimiter, FILE *stream);
306
307
308
309////////////////////
310//// lib
311////////////////////
312
// Release memory obtained from the allocator. Simply forwards to free(), so
// a null pointer is accepted.
static void xfree(void *p)
{
  free(p);
  return;
}
317
// Numeric value (0..15) of hex digit c. The caller must pass a valid hex
// digit; no checking is done here.
static int hexval(int c)
{
  if (isdigit(c)) return c - '0';
  // Setting bit 040 folds 'A'..'F' onto 'a'..'f'.
  c |= 040;
  return c - 'a' + 10;
}
323
324////////////////////
325//// common defs
326////////////////////
327
328// These (ops, keywords, builtins) must align with enum tokens
// Operator table: 3 bytes per entry -- a blank, then the operator text
// padded with blanks to two characters. find_token() and get_token_text()
// turn a byte offset into a token number with tksemi as entry 0. The ".."
// entry occupies the position of tkcat, the fake concatenation operator.
static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -  .. "
        "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";

// Keyword table: 10 bytes per entry (a blank, then the blank-padded word);
// entry 0 corresponds to tkin. See find_keyword_or_builtin().
static char *keywords = " in        BEGIN     END       if        else      "
    "while     for       do        break     continue  exit      function  "
    "return    next      nextfile  delete    print     printf    getline   ";

// Builtin function table, same 10-byte layout; entry 0 corresponds to
// tkatan2.
static char *builtins = " atan2     cos       sin       exp       log       "
    "sqrt      int       rand      srand     length    "
    "tolower   toupper   system    fflush    "
    "and       or        xor       lshift    rshift    "
    "close     index     match     split     "
    "sub       gsub      sprintf   substr    ";
342
343static void zzerr(char *format, ...)
344{
345  va_list args;
346  int fatal_sw = 0;
347  fprintf(stderr, "%s: ", TT.progname);
348  if (format[0] == '$') {
349    fprintf(stderr, "FATAL: ");
350    format++;
351    fatal_sw = 1;
352  }
353  fprintf(stderr, "file %s line %d: ", TT.scs->filename, TT.scs->line_num);
354  va_start(args, format);
355  vfprintf(stderr, format, args);
356  va_end(args);
357  if (format[strlen(format)-1] != '\n') fputc('\n', stderr); // TEMP FIXME !!!
358  fflush(stderr);
359  if (fatal_sw) exit(2);
360        // Don't bump error count for warnings
361  else if (!strstr(format, "arning")) TT.cgl.compile_error_count++;
362}
363
364static void get_token_text(char *op, int tk)
365{
366  // This MUST ? be changed if ops string or tk... assignments change!
367  memmove(op, ops + 3 * (tk - tksemi) + 1, 2);
368  op[ op[1] == ' ' ? 1 : 2 ] = 0;
369}
370
371////////////////////
372/// UTF-8
373////////////////////
374
375// Return number of bytes in 'cnt' utf8 codepoints
376static int bytesinutf8(char *str, size_t len, size_t cnt)
377{
378  if (FLAG(b)) return cnt;
379  unsigned wch;
380  char *lim = str + len, *s0 = str;
381  while (cnt-- && str < lim) {
382    int r = utf8towc(&wch, str, lim - str);
383    str += r > 0 ? r : 1;
384  }
385  return str - s0;
386}
387
388// Return number of utf8 codepoints in str
389static int utf8cnt(char *str, size_t len)
390{
391  unsigned wch;
392  int cnt = 0;
393  char *lim;
394  if (!len || FLAG(b)) return len;
395  for (lim = str + len; str < lim; cnt++) {
396    int r = utf8towc(&wch, str, lim - str);
397    str += r > 0 ? r : 1;
398  }
399  return cnt;
400}
401
402////////////////////
403////   zlist
404////////////////////
405
406static struct zlist *zlist_initx(struct zlist *p, size_t size, size_t count)
407{
408  p->base = p->avail = xzalloc(count * size);
409  p->limit = p->base + size * count;
410  p->size = size;
411  return p;
412}
413
// Set up list p for elements of 'size' bytes, with as many initial elements
// as fit in SLIST_MAX_INIT_BYTES, and return p.
static struct zlist *zlist_init(struct zlist *p, size_t size)
{
#define SLIST_MAX_INIT_BYTES 128
  size_t count = SLIST_MAX_INIT_BYTES / size;

  return zlist_initx(p, size, count);
}
419
420// This is called from zlist_append() and add_stack() in run
421static void zlist_expand(struct zlist *p)
422{
423  size_t offset = p->avail - p->base;
424  size_t cap = p->limit - p->base;
425  size_t newcap = maxof(cap + p->size, ((cap / p->size) * 3 / 2) * p->size);
426  if (newcap <= cap) error_exit("mem req error");
427  char *base = xrealloc(p->base, newcap);
428  p->base = base;
429  p->limit = base + newcap;
430  p->avail = base + offset;
431}
432
433static size_t zlist_append(struct zlist *p, void *obj)
434{
435  // Insert obj (p->size bytes) at end of list, expand as needed.
436  // Return scaled offset to newly inserted obj; i.e. the
437  // "slot number" 0, 1, 2,...
438  void *objtemp = 0;
439  if (p->avail > p->limit - p->size) {
440    objtemp = xmalloc(p->size);     // Copy obj in case it is in
441    memmove(objtemp, obj, p->size); // the area realloc might free!
442    obj = objtemp;
443    zlist_expand(p);
444  }
445  memmove(p->avail, obj, p->size);
446  if (objtemp) xfree(objtemp);
447  p->avail += p->size;
448  return (p->avail - p->base - p->size) / p->size;  // offset of updated slot
449}
450
451static int zlist_len(struct zlist *p)
452{
453  return (p->avail - p->base) / p->size;
454}
455
456////////////////////
457////   zstring
458////////////////////
459
460static void zstring_release(struct zstring **s)
461{
462  if (*s && (**s).refcnt-- == 0) xfree(*s); //free_zstring(s);
463  *s = 0;
464}
465
466static void zstring_incr_refcnt(struct zstring *s)
467{
468  if (s) s->refcnt++;
469}
470
471// !! Use only if 'to' is NULL or its refcnt is 0.
472static struct zstring *zstring_modify(struct zstring *to, size_t at, char *s, size_t n)
473{
474  size_t cap = at + n + 1;
475  if (!to || to->capacity < cap) {
476    to = xrealloc(to, sizeof(*to) + cap);
477    to->capacity = cap;
478    to->refcnt = 0;
479  }
480  memcpy(to->str + at, s, n);
481  to->size = at + n;
482  to->str[to->size] = '\0';
483  return to;
484}
485
486// The 'to' pointer may move by realloc, so return (maybe updated) pointer.
487// If refcnt is nonzero then there is another pointer to this zstring,
488// so copy this one and release it. If refcnt is zero we can mutate this.
489static struct zstring *zstring_update(struct zstring *to, size_t at, char *s, size_t n)
490{
491  if (to && to->refcnt) {
492    struct zstring *to_before = to;
493    to = zstring_modify(0, 0, to->str, to->size);
494    zstring_release(&to_before);
495  }
496  return zstring_modify(to, at, s, n);
497}
498
499static struct zstring *zstring_copy(struct zstring *to, struct zstring *from)
500{
501  return zstring_update(to, 0, from->str, from->size);
502}
503
504static struct zstring *zstring_extend(struct zstring *to, struct zstring *from)
505{
506  return zstring_update(to, to->size, from->str, from->size);
507}
508
// Allocate a new zstring (refcnt 0) holding 'size' bytes copied from s.
static struct zstring *new_zstring(char *s, size_t size)
{
  struct zstring *fresh = zstring_modify(0, 0, s, size);

  return fresh;
}
513
514////////////////////
515////   zvalue
516////////////////////
517
// Value with no flags set, number 0 and a null pointer.
static struct zvalue uninit_zvalue = ZVINIT(0, 0.0, 0);

// This will be reassigned in init_globals() with an empty string.
// It's a special value used for "uninitialized" field vars
// referenced past $NF. See push_field().
static struct zvalue uninit_string_zvalue = ZVINIT(0, 0.0, 0);
524
525static struct zvalue new_str_val(char *s)
526{
527  // Only if no nul inside string!
528  struct zvalue v = ZVINIT(ZF_STR, 0.0, new_zstring(s, strlen(s)));
529  return v;
530}
531
532static void zvalue_release_zstring(struct zvalue *v)
533{
534  if (v && ! (v->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&v->vst);
535}
536
537// push_val() is used for initializing globals (see init_compiler())
538// but mostly used in runtime
539// WARNING: push_val may change location of v, so do NOT depend on it after!
540// Note the incr refcnt used to be after the zlist_append, but that caused a
541// heap-use-after-free error when the zlist_append relocated the zvalue being
542// pushed, invalidating the v pointer.
543static void push_val(struct zvalue *v)
544{
545  if (IS_STR(v) && v->vst) v->vst->refcnt++;  // inlined zstring_incr_refcnt()
546  *++TT.stackp = *v;
547}
548
549static void zvalue_copy(struct zvalue *to, struct zvalue *from)
550{
551  if (IS_RX(from)) *to = *from;
552  else {
553    zvalue_release_zstring(to);
554    *to = *from;
555    zstring_incr_refcnt(to->vst);
556  }
557}
558
559static void zvalue_dup_zstring(struct zvalue *v)
560{
561  struct zstring *z = new_zstring(v->vst->str, v->vst->size);
562  zstring_release(&v->vst);
563  v->vst = z;
564}
565
566////////////////////
567////   zmap (array) implementation
568////////////////////
569
570static int zstring_match(struct zstring *a, struct zstring *b)
571{
572  return a->size == b->size && memcmp(a->str, b->str, a->size) == 0;
573}
574
575static int zstring_hash(struct zstring *s)
576{   // djb2 -- small, fast, good enough for this
577  unsigned h = 5381;
578  char *p = s->str, *lim = p + s->size;
579  while (p < lim)
580    h = (h << 5) + h + *p++;
581  return h;
582}
583
584enum { PSHIFT = 5 };  // "perturb" shift -- see find_mapslot() below
585
586static struct zmap_slot *find_mapslot(struct zmap *m, struct zstring *key, int *hash, int *probe)
587{
588  struct zmap_slot *x = 0;
589  unsigned perturb = *hash = zstring_hash(key);
590  *probe = *hash & m->mask;
591  int n, first_deleted = -1;
592  while ((n = m->hash[*probe])) {
593    if (n > 0) {
594      x = &MAPSLOT[n-1];
595      if (*hash == x->hash && zstring_match(key, x->key)) {
596        return x;
597      }
598    } else if (first_deleted < 0) first_deleted = *probe;
599    // Based on technique in Python dict implementation. Comment there
600    // (https://github.com/python/cpython/blob/3.10/Objects/dictobject.c)
601    // says
602    //
603    // j = ((5*j) + 1) mod 2**i
604    // For any initial j in range(2**i), repeating that 2**i times generates
605    // each int in range(2**i) exactly once (see any text on random-number
606    // generation for proof).
607    //
608    // The addition of 'perturb' greatly improves the probe sequence. See
609    // the Python dict implementation for more details.
610    *probe = (*probe * 5 + 1 + (perturb >>= PSHIFT)) & m->mask;
611  }
612  if (first_deleted >= 0) *probe = first_deleted;
613  return 0;
614}
615
616static struct zvalue *zmap_find(struct zmap *m, struct zstring *key)
617{
618  int hash, probe;
619  struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
620  return x ? &x->val : 0;
621}
622
623static void zmap_init(struct zmap *m)
624{
625  enum {INIT_SIZE = 8};
626  m->mask = INIT_SIZE - 1;
627  m->hash = xzalloc(INIT_SIZE * sizeof(*m->hash));
628  m->limit = INIT_SIZE * 8 / 10;
629  m->count = 0;
630  m->deleted = 0;
631  zlist_init(&m->slot, sizeof(struct zmap_slot));
632}
633
634static void zvalue_map_init(struct zvalue *v)
635{
636  struct zmap *m = xmalloc(sizeof(*m));
637  zmap_init(m);
638  v->map = m;
639  v->flags |= ZF_MAP;
640}
641
642static void zmap_delete_map_incl_slotdata(struct zmap *m)
643{
644  for (struct zmap_slot *p = &MAPSLOT[0]; p < &MAPSLOT[zlist_len(&m->slot)]; p++) {
645    if (p->key) zstring_release(&p->key);
646    if (p->val.vst) zstring_release(&p->val.vst);
647  }
648  xfree(m->slot.base);
649  xfree(m->hash);
650}
651
// Empty the map: free everything it holds, then set it up again as a fresh
// empty map in the same struct.
static void zmap_delete_map(struct zmap *m)
{
  zmap_delete_map_incl_slotdata(m);   // frees the storage, leaves m stale
  zmap_init(m);                       // makes m valid again
}
657
658static void zmap_rehash(struct zmap *m)
659{
660  // New table is twice the size of old.
661  int size = m->mask + 1;
662  unsigned mask = 2 * size - 1;
663  int *h = xzalloc(2 * size * sizeof(*m->hash));
664  // Step through the old hash table, set up location in new table.
665  for (int i = 0; i < size; i++) {
666    int n = m->hash[i];
667    if (n > 0) {
668      int hash = MAPSLOT[n-1].hash;
669      unsigned perturb = hash;
670      int p = hash & mask;
671      while (h[p]) {
672        p = (p * 5 + 1 + (perturb >>= PSHIFT)) & mask;
673      }
674      h[p] = n;
675    }
676  }
677  m->mask = mask;
678  xfree(m->hash);
679  m->hash = h;
680  m->limit = 2 * size * 8 / 10;
681}
682
683static struct zmap_slot *zmap_find_or_insert_key(struct zmap *m, struct zstring *key)
684{
685  int hash, probe;
686  struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
687  if (x) return x;
688  // not found; insert it.
689  if (m->count == m->limit) {
690    zmap_rehash(m);         // rehash if getting too full.
691    // rerun find_mapslot to get new probe index
692    x = find_mapslot(m, key, &hash, &probe);
693  }
694  // Assign key to new slot entry and bump refcnt.
695  struct zmap_slot zs = ZMSLOTINIT(hash, key, (struct zvalue)ZVINIT(0, 0.0, 0));
696  zstring_incr_refcnt(key);
697  int n = zlist_append(&m->slot, &zs);
698  m->count++;
699  m->hash[probe] = n + 1;
700  return &MAPSLOT[n];
701}
702
703static void zmap_delete(struct zmap *m, struct zstring *key)
704{
705  int hash, probe;
706  struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
707  if (!x) return;
708  zstring_release(&MAPSLOT[m->hash[probe] - 1].key);
709  m->hash[probe] = -1;
710  m->deleted++;
711}
712
713////////////////////
714//// scan (lexical analyzer)
715////////////////////
716
717// TODO:
718// IS line_num getting incr correctly? Newline counts as start of line!?
719// Handle nuls in file better.
720// Open files "rb" and handle CRs in program.
721// Roll gch() into get_char() ?
722// Deal with signed char (at EOF? elsewhere?)
723//
724// 2023-01-11: Allow nul bytes inside strings? regexes?
725
726static void progfile_open(void)
727{
728  TT.scs->filename = TT.scs->prog_args->arg;
729  TT.scs->prog_args = TT.scs->prog_args->next;
730  TT.scs->fp = stdin;
731  if (strcmp(TT.scs->filename, "-")) TT.scs->fp = fopen(TT.scs->filename, "r");
732  if (!TT.scs->fp) error_exit("Can't open %s", TT.scs->filename);
733  TT.scs->line_num = 0;
734}
735
736static int get_char(void)
737{
738  static char *nl = "\n";
739  // On first entry, TT.scs->p points to progstring if any, or null string.
740  for (;;) {
741    int c = *(TT.scs->p)++;
742    if (c) {
743      return c;
744    }
745    if (TT.scs->progstring) {  // Fake newline at end of progstring.
746      if (TT.scs->progstring == nl) return EOF;
747      TT.scs->p = TT.scs->progstring = nl;
748      continue;
749    }
750    // Here if getting from progfile(s).
751    if (TT.scs->line == nl) return EOF;
752    if (!TT.scs->fp) {
753      progfile_open();
754    // The "  " + 1 is to set p to null string but allow ref to prev char for
755    // "lastchar" test below.
756    }
757    // Save last char to allow faking final newline.
758    int lastchar = (TT.scs->p)[-2];
759    TT.scs->line_len = getline(&TT.scs->line, &TT.scs->line_size, TT.scs->fp);
760    if (TT.scs->line_len > 0) {
761      TT.scs->line_num++;
762      TT.scs->p = TT.scs->line;
763      continue;
764    }
765    // EOF
766    // FIXME TODO or check for error? feof() vs. ferror()
767    fclose(TT.scs->fp);
768    TT.scs->fp = 0;
769    TT.scs->p = "  " + 2;
770    if (!TT.scs->prog_args) {
771      xfree(TT.scs->line);
772      if (lastchar == '\n') return EOF;
773      // Fake final newline
774      TT.scs->line = TT.scs->p = nl;
775    }
776  }
777}
778
779static void append_this_char(int c)
780{
781  if (TT.scs->toklen == TT.scs->maxtok - 1) {
782    TT.scs->maxtok *= 2;
783    TT.scs->tokstr = xrealloc(TT.scs->tokstr, TT.scs->maxtok);
784  }
785  TT.scs->tokstr[TT.scs->toklen++] = c;
786  TT.scs->tokstr[TT.scs->toklen] = 0;
787}
788
789static void gch(void)
790{
791  // FIXME probably not right place to skip CRs.
792  do {
793    TT.scs->ch = get_char();
794  } while (TT.scs->ch == '\r');
795}
796
797static void append_char(void)
798{
799  append_this_char(TT.scs->ch);
800  gch();
801}
802
803static int find_keyword_or_builtin(char *table,
804    int first_tok_in_table)
805{
806  char s[16] = " ", *p;
807  // keywords and builtin functions are spaced 10 apart for strstr() lookup,
808  // so must be less than that long.
809  if (TT.scs->toklen >= 10) return 0;
810  strcat(s, TT.scs->tokstr);
811  strcat(s, " ");
812  p = strstr(table, s);
813  if (!p) return 0;
814  return first_tok_in_table + (p - table) / 10;
815}
816
817static int find_token(void)
818{
819  char s[6] = " ", *p;
820  // tokens are spaced 3 apart for strstr() lookup, so must be less than
821  // that long.
822  strcat(s, TT.scs->tokstr);
823  strcat(s, " ");
824  p = strstr(ops, s);
825  if (!p) return 0;
826  return tksemi + (p - ops) / 3;
827}
828
829static int find_keyword(void)
830{
831  return find_keyword_or_builtin(keywords, tkin);
832}
833
834static int find_builtin(void)
835{
836  return find_keyword_or_builtin(builtins, tkatan2);
837}
838
839static void get_number(void)
840{
841  // Assumes TT.scs->ch is digit or dot on entry.
842  // TT.scs->p points to the following character.
843  // OK formats: 1 1. 1.2 1.2E3 1.2E+3 1.2E-3 1.E2 1.E+2 1.E-2 1E2 .1 .1E2
844  // .1E+2 .1E-2
845  // NOT OK: . .E .E1 .E+ .E+1 ; 1E .1E 1.E 1.E+ 1.E- parse as number
846  // followed by variable E.
847  // gawk accepts 12.E+ and 12.E- as 12; nawk & mawk say syntax error.
848  char *leftover;
849  int len;
850  TT.scs->numval = strtod(TT.scs->p - 1, &leftover);
851  len = leftover - TT.scs->p + 1;
852  if (len == 0) {
853    append_char();
854    TT.scs->toktype = ERROR;
855    TT.scs->tok = tkerr;
856    TT.scs->error = 1;
857    FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
858    return;
859  }
860  while (len--)
861    append_char();
862}
863
864static void get_string_or_regex(int endchar)
865{
866  gch();
867  while (TT.scs->ch != endchar) {
868    if (TT.scs->ch == '\n') {
869      // FIXME Handle unterminated string or regex. Is this OK?
870      // FIXME TODO better diagnostic here?
871      XERR("%s\n", "unterminated string or regex");
872      break;
873    } else if (TT.scs->ch == '\\') {
874      // \\ \a \b \f \n \r \t \v \" \/ \ddd
875      char *p, *escapes = "\\abfnrtv\"/";
876      gch();
877      if (TT.scs->ch == '\n') {  // backslash newline is continuation
878        gch();
879        continue;
880      } else if ((p = strchr(escapes, TT.scs->ch))) {
881        // posix regex does not use these escapes,
882        // but awk does, so do them.
883        int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes];
884        append_this_char(c);
885        // Need to double up \ inside literal regex
886        if (endchar == '/' && c == '\\') append_this_char('\\');
887        gch();
888      } else if (TT.scs->ch == 'x') {
889        gch();
890        if (isxdigit(TT.scs->ch)) {
891          int c = hexval(TT.scs->ch);
892          gch();
893          if (isxdigit(TT.scs->ch)) {
894            c = c * 16 + hexval(TT.scs->ch);
895            gch();
896          }
897          append_this_char(c);
898        } else append_this_char('x');
899      } else if (TT.scs->ch == 'u') {
900        gch();
901        if (isxdigit(TT.scs->ch)) {
902          int i = 0, j = 0, c = 0;
903          char codep[9] = {0};
904          do {
905            codep[j++] = TT.scs->ch;
906            gch();
907          } while (j < 8 && isxdigit(TT.scs->ch));
908          c = strtol(codep, 0, 16);
909          for (i = wctoutf8(codep, c), j = 0; j < i; j++)
910            append_this_char(codep[j]);
911        } else append_this_char('u');
912      } else if (isdigit(TT.scs->ch)) {
913        if (TT.scs->ch < '8') {
914          int k, c = 0;
915          for (k = 0; k < 3; k++) {
916            if (isdigit(TT.scs->ch) && TT.scs->ch < '8') {
917              c = c * 8 + TT.scs->ch - '0';
918              gch();
919            } else
920              break;
921          }
922          append_this_char(c);
923        } else {
924          append_char();
925        }
926      } else {
927        if (endchar == '/') {
928          // pass \ unmolested if not awk escape,
929          // so that regex routines can see it.
930          if (!strchr(".[]()*+?{}|^$-", TT.scs->ch)) {
931            XERR("warning: '\\%c' -- unknown regex escape\n", TT.scs->ch);
932          }
933          append_this_char('\\');
934        } else {
935          XERR("warning: '\\%c' treated as plain '%c'\n", TT.scs->ch, TT.scs->ch);
936        }
937      }
938    } else if (TT.scs->ch == EOF) {
939      FATAL("EOF in string or regex\n");
940    } else {
941      append_char();
942    }
943  }
944  gch();
945}
946
947static void ascan_opt_div(int div_op_allowed_here)
948{
949  int n;
950  for (;;) {
951    TT.scs->tokbuiltin = 0;
952    TT.scs->toklen = 0;
953    TT.scs->tokstr[0] = 0;
954    while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
955      gch();
956    if (TT.scs->ch == '\\') {
957      append_char();
958      if (TT.scs->ch == '\n') {
959        gch();
960        continue;
961      }
962      TT.scs->toktype = ERROR;   // \ not last char in line.
963      TT.scs->tok = tkerr;
964      TT.scs->error = 3;
965      FATAL("backslash not last char in line\n");
966      return;
967    }
968    break;
969  }
970  // Note \<NEWLINE> in comment does not continue it.
971  if (TT.scs->ch == '#') {
972    gch();
973    while (TT.scs->ch != '\n')
974      gch();
975    // Need to fall through here to pick up newline.
976  }
977  if (TT.scs->ch == '\n') {
978    TT.scs->toktype = NEWLINE;
979    TT.scs->tok = tknl;
980    append_char();
981  } else if (isalpha(TT.scs->ch) || TT.scs->ch == '_') {
982    append_char();
983    while (isalnum(TT.scs->ch) || TT.scs->ch == '_') {
984      append_char();
985    }
986    if ((n = find_keyword()) != 0) {
987      TT.scs->toktype = KEYWORD;
988      TT.scs->tok = n;
989    } else if ((n = find_builtin()) != 0) {
990      TT.scs->toktype = BUILTIN;
991      TT.scs->tok = tkbuiltin;
992      TT.scs->tokbuiltin = n;
993    } else if ((TT.scs->ch == '(')) {
994      TT.scs->toktype = USERFUNC;
995      TT.scs->tok = tkfunc;
996    } else {
997      TT.scs->toktype = VAR;
998      TT.scs->tok = tkvar;
999      // skip whitespace to be able to check for , or )
1000      while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
1001        gch();
1002    }
1003    return;
1004  } else if (TT.scs->ch == '"') {
1005    TT.scs->toktype = STRING;
1006    TT.scs->tok = tkstring;
1007    get_string_or_regex('"');
1008  } else if (isdigit(TT.scs->ch) || TT.scs->ch == '.') {
1009    TT.scs->toktype = NUMBER;
1010    TT.scs->tok = tknumber;
1011    get_number();
1012  } else if (TT.scs->ch == '/' && ! div_op_allowed_here) {
1013    TT.scs->toktype = REGEX;
1014    TT.scs->tok = tkregex;
1015    get_string_or_regex('/');
1016  } else if (TT.scs->ch == EOF) {
1017    TT.scs->toktype = EOF;
1018    TT.scs->tok = tkeof;
1019  } else if (TT.scs->ch == '\0') {
1020    append_char();
1021    TT.scs->toktype = ERROR;
1022    TT.scs->tok = tkerr;
1023    TT.scs->error = 5;
1024    FATAL("null char\n");
1025  } else {
1026    // All other tokens.
1027    TT.scs->toktype = TT.scs->ch;
1028    append_char();
1029    // Special case for **= and ** tokens
1030    if (TT.scs->toktype == '*' && TT.scs->ch == '*') {
1031      append_char();
1032      if (TT.scs->ch == '=') {
1033        append_char();
1034        TT.scs->tok = tkpowasgn;
1035      } else TT.scs->tok = tkpow;
1036      TT.scs->toktype = TT.scs->tok + 200;
1037      return;
1038    }
1039    // Is it a 2-character token?
1040    if (TT.scs->ch != ' ' && TT.scs->ch != '\n') {
1041      append_this_char(TT.scs->ch);
1042      if (find_token()) {
1043        TT.scs->tok = find_token();
1044        TT.scs->toktype = TT.scs->tok + 200;
1045        gch();  // Eat second char of token.
1046        return;
1047      }
1048      TT.scs->toklen--;  // Not 2-character token; back off.
1049      TT.scs->tokstr[TT.scs->toklen] = 0;
1050    }
1051    TT.scs->tok = find_token();
1052    if (TT.scs->tok) return;
1053    TT.scs->toktype = ERROR;
1054    TT.scs->tok = tkerr;
1055    TT.scs->error = 4;
1056    FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
1057  }
1058}
1059
1060static void scan_opt_div(int div_op_allowed_here)
1061{
1062  // TODO FIXME need better diags for bad tokens!
1063  // TODO Also set global syntax error flag.
1064  do ascan_opt_div(div_op_allowed_here); while (TT.scs->tok == tkerr);
1065}
1066
// Prepare the scanner: record "end of file" as the previous token and
// call gch() once so the first character is available as lookahead.
static void init_scanner(void)
{
  TT.prevtok = tkeof;
  gch();
}
1072
// POSIX says '/' does not begin a regex wherever '/' or '/=' can mean divide.
// Pretty sure if / or /= comes after these, it means divide:
// (Zero-terminated so scan() can search it with strchr().)
static char div_preceders[] = {tknumber, tkstring, tkvar, tkgetline, tkrparen, tkrbracket, tkincr, tkdecr, 0};
1076
1077// For checking end of prev statement for termination and if '/' can come next
1078
1079static void scan(void)
1080{
1081  TT.prevtok = TT.scs->tok;
1082  if (TT.prevtok && strchr(div_preceders, TT.prevtok)) scan_opt_div(1);
1083  else scan_opt_div(0);
1084  TT.tokstr = TT.scs->tokstr;
1085}
1086
1087////////////////////
1088//// compile
1089////////////////////
1090
1091//  NOTES:
1092//  NL ok after , { && || do else OR after right paren after if/while/for
1093//  TODO:
1094//    see case tkgetline -- test more
1095//    case tkmatchop, tknotmatch -- fix ~ (/re/)
1096
// Forward declarations -- for mutually recursive parsing functions
static int expr(int rbp);
static void lvalue(void);
static int primary(void);
static void stmt(void);
static void action(int action_type);

// CURTOK(): the token code most recently produced by the scanner.
// ISTOK(toknum): true if the current token code equals toknum.
#define CURTOK() (TT.scs->tok)
#define ISTOK(toknum) (TT.scs->tok == (toknum))
1106
// If the current token is tk, consume it (scan the next one) and return 1;
// otherwise leave the token stream untouched and return 0.
static int havetok(int tk)
{
  int matched = ISTOK(tk);

  if (matched) scan();
  return matched;
}
1113
1114//// code and "literal" emitters
// Append a two-word instruction (opcode op followed by operand n) to the
// TT.zcode list. TT.zcode_last is set from the append of the operand word.
static void gen2cd(int op, int n)
{
  zlist_append(&TT.zcode, &op);
  TT.zcode_last = zlist_append(&TT.zcode, &n);
}
1120
// Append a single-word instruction (opcode op) to the TT.zcode list and
// record the append result in TT.zcode_last.
static void gencd(int op)
{
  TT.zcode_last = zlist_append(&TT.zcode, &op);
}
1125
1126static int make_literal_str_val(char *s)
1127{
1128  // Only if no nul inside string!
1129  struct zvalue v = new_str_val(s);
1130  return zlist_append(&TT.literals, &v);
1131}
1132
1133static int make_literal_regex_val(char *s)
1134{
1135  regex_t *rx;
1136  rx = xmalloc(sizeof(*rx));
1137  xregcomp(rx, s, REG_EXTENDED);
1138  struct zvalue v = ZVINIT(ZF_RX, 0, 0);
1139  v.rx = rx;
1140  // Flag empty rx to make it easy to identify for split() special case
1141  if (!*s) v.flags |= ZF_EMPTY_RX;
1142  return zlist_append(&TT.literals, &v);
1143}
1144
1145static int make_literal_num_val(double num)
1146{
1147  struct zvalue v = ZVINIT(ZF_NUM, num, 0);
1148  return zlist_append(&TT.literals, &v);
1149}
1150
1151static int make_uninit_val(void)
1152{
1153  struct zvalue v = uninit_zvalue;
1154  return zlist_append(&TT.literals, &v);
1155}
1156//// END code and "literal" emitters
1157
1158//// Symbol tables functions
1159static int find_func_def_entry(char *s)
1160{
1161  for (int k = 1; k < zlist_len(&TT.func_def_table); k++)
1162    if (!strcmp(s, FUNC_DEF[k].name)) return k;
1163  return 0;
1164}
1165
1166static int add_func_def_entry(char *s)
1167{
1168  struct functab_slot ent = {0, 0, 0, {0, 0, 0, 0}, 0};
1169  ent.name = xstrdup(s);
1170  int slotnum = zlist_append(&TT.func_def_table, &ent);
1171  FUNC_DEF[slotnum].slotnum = slotnum;
1172  return slotnum;
1173}
1174
1175static int find_global(char *s)
1176{
1177  for (int k = 1; k < zlist_len(&TT.globals_table); k++)
1178    if (!strcmp(s, GLOBAL[k].name)) return k;
1179  return 0;
1180}
1181
1182static int add_global(char *s)
1183{
1184  struct symtab_slot ent = {0, 0, 0};
1185  ent.name = xstrdup(s);
1186  int slotnum = zlist_append(&TT.globals_table, &ent);
1187  GLOBAL[slotnum].slotnum = slotnum;
1188  return slotnum;
1189}
1190
1191static int find_local_entry(char *s)
1192{
1193  for (int k = 1; k < zlist_len(&TT.locals_table); k++)
1194    if (!strcmp(s, LOCAL[k].name)) return k;
1195  return 0;
1196}
1197
1198static int add_local_entry(char *s)
1199{
1200  struct symtab_slot ent = {0, 0, 0};
1201  ent.name = xstrdup(s);
1202  int slotnum = zlist_append(&TT.locals_table, &ent);
1203  LOCAL[slotnum].slotnum = slotnum;
1204  return slotnum;
1205}
1206
1207static int find_or_add_var_name(void)
1208{
1209  int slotnum = 0;    // + means global; - means local to function
1210  int globals_ent = 0;
1211  int locals_ent = find_local_entry(TT.tokstr);   // in local symbol table?
1212  if (locals_ent) {
1213    slotnum = -LOCAL[locals_ent].slotnum;
1214  } else {
1215    globals_ent = find_global(TT.tokstr);
1216    if (!globals_ent) globals_ent = add_global(TT.tokstr);
1217    slotnum = GLOBAL[globals_ent].slotnum;
1218    if (find_func_def_entry(TT.tokstr))
1219      // POSIX: The same name shall not be used both as a variable name
1220      // with global scope and as the name of a function.
1221      XERR("var '%s' used as function name\n", TT.tokstr);
1222  }
1223  return slotnum;
1224}
1225
1226//// END Symbol tables functions
1227
1228//// Initialization
// (Re)initialize the locals table and append one zeroed dummy entry, so
// that entries added afterwards get nonzero indexes.
static void init_locals_table(void)
{
  static struct symtab_slot locals_ent;
  zlist_init(&TT.locals_table, sizeof(struct symtab_slot));
  zlist_append(&TT.locals_table, &locals_ent);
}
1235
// Initialize the compiler/runtime lists: globals, function definitions,
// locals, zcode, literals, value stack and fields. Each list gets a dummy
// first element so that real entries have nonzero indexes.
static void init_tables(void)
{
  static struct symtab_slot global_ent;
  static struct functab_slot func_ent;

  // Append dummy elements in lists to force valid offsets nonzero.
  zlist_init(&TT.globals_table, sizeof(struct symtab_slot));
  zlist_append(&TT.globals_table, &global_ent);
  zlist_init(&TT.func_def_table, sizeof(struct functab_slot));
  zlist_append(&TT.func_def_table, &func_ent);
  init_locals_table();
  zlist_init(&TT.zcode, sizeof(int));
  gencd(tkeof);   // to ensure zcode offsets are non-zero
  zlist_init(&TT.literals, sizeof(struct zvalue));
  // Init stack size at twice MIN_STACK_LEFT. MIN_STACK_LEFT is at least as
  // many entries as any statement may ever take.  Currently there is no diag
  // if this is exceeded; prog. will probably crash. 1024 should be plenty?
  zlist_initx(&TT.stack, sizeof(struct zvalue), 2 * MIN_STACK_LEFT);
  // Stack pointer starts at the base of the stack storage.
  TT.stackp = (struct zvalue *)TT.stack.base;
  zlist_init(&TT.fields, sizeof(struct zvalue));
  zlist_append(&TT.literals, &uninit_zvalue);
  zlist_append(&TT.stack, &uninit_zvalue);
  zlist_append(&TT.fields, &uninit_zvalue);
  // Field 0 starts out as an empty string.
  FIELD[0].vst = new_zstring("", 0);
}
1261
1262static void init_compiler(void)
1263{
1264  // Special variables (POSIX). Must align with enum spec_var_names
1265  static char *spec_vars[] = { "ARGC", "ARGV", "CONVFMT", "ENVIRON", "FILENAME",
1266      "FNR", "FS", "NF", "NR", "OFMT", "OFS", "ORS", "RLENGTH", "RS", "RSTART",
1267      "SUBSEP", 0};
1268
1269  init_tables();
1270  for (int k = 0; spec_vars[k]; k++) {
1271    TT.spec_var_limit = add_global(spec_vars[k]);
1272    GLOBAL[TT.spec_var_limit++].flags |= (k == 1 || k == 3) ? ZF_MAP : ZF_SCALAR;
1273    push_val(&uninit_zvalue);
1274  }
1275}
1276//// END Initialization
1277
1278//// Parsing and compiling to TT.zcode
// Left binding powers
// Indexed by token code (tkunusedtoken .. tkin); read via getlbp().
// A larger value binds more tightly; 0 means the token does not continue
// an expression.
static int lbp_table[] = {  // Must align with enum Toks
  0, 0, 0, 0,     // tkunusedtoken, tkeof, tkerr, tknl,
  250, 250, 250,  // tkvar, tknumber, tkstring,
  250, 250, 250,  // tkregex, tkfunc, tkbuiltin,
  0, 0, 210, 0, // tksemi, tkcomma, tklbracket, tkrbracket,
  200, 0, 0, 0, // tklparen, tkrparen, tklbrace, tkrbrace,
  190, 180, 180, 170, 160, // tkfield, tkincr, tkdecr, tkpow, tknot,
  150, 150, 150, 140, 140, // tkmul, tkdiv, tkmod, tkplus, tkminus,
  130, // tkcat, // FAKE (?) optor for concatenation (adjacent string exprs)
  110, 110, 110, 110, 110, 110, // tklt, tkle, tkne, tkeq, tkgt, tkge,
  100, 100, // tkmatchop, tknotmatch,
  80, 70, // tkand, tkor,
  60, 0, // tkternif, tkternelse,
  50, 50, 50, 50,   // tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
  50, 50, 50, // tkaddasgn, tksubasgn, tkasgn,
  0, 120, // tkappend, tkpipe,
  90 // tkin
};
1298
1299static int getlbp(int tok)
1300{
1301  // FIXME: should tkappend be here too? is tkpipe needed?
1302  // In print statement outside parens: make '>' end an expression
1303  if (TT.cgl.in_print_stmt && ! TT.cgl.paren_level && (tok == tkgt || tok == tkpipe))
1304    return 0;
1305  return (0 <= tok && tok <= tkin) ? lbp_table[tok] :
1306    // getline is special, not a normal builtin.
1307    // close, index, match, split, sub, gsub, sprintf, substr
1308    // are really builtin functions though bwk treats them as keywords.
1309    (tkgetline <= tok && tok <= tksubstr) ? 240 : 0;     // FIXME 240 is temp?
1310}
1311
1312// Get right binding power. Same as left except for right associative optors
// Return the right binding power of token tok: the left binding power,
// lowered by one when that power is <= 60 or equal to 170 (ternary ?:,
// assignment and power operators are right associative).
static int getrbp(int tok)
{
  int bp = getlbp(tok);

  if (bp <= 60 || bp == 170) bp--;
  return bp;
}
1319
// Called when end of input is reached while skipping tokens during error
// recovery; reports the failure through error_exit().
static void unexpected_eof(void)
{
  error_exit("terminated with error(s)");
}
1324
1325//// syntax error diagnostic and recovery (Turner's method)
1326// D.A. Turner, Error diagnosis and recovery in one pass compilers,
1327// Information Processing Letters, Volume 6, Issue 4, 1977, Pages 113-115
1328static int recovering = 0;
1329
1330static void complain(int tk)
1331{
1332  char op[3], tkstr[10];
1333  if (recovering) return;
1334  recovering = 1;
1335  if (!strcmp(TT.tokstr, "\n")) TT.tokstr = "<newline>";
1336  if (tksemi <= tk && tk <= tkpipe) {
1337    get_token_text(op, tk);
1338    XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, op);
1339  } else if (tk >= tkin && tk <= tksubstr) {
1340    if (tk < tkatan2) memmove(tkstr, keywords + 1 + 10 * (tk - tkin), 10);
1341    else memmove(tkstr, builtins + 1 + 10 * (tk - tkatan2), 10);
1342    *strchr(tkstr, ' ') = 0;
1343    XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, tkstr);
1344  } else XERR("syntax near '%s'\n", TT.tokstr);
1345}
1346
1347static void expect(int tk)
1348{
1349  if (recovering) {
1350    while (!ISTOK(tkeof) && !ISTOK(tk))
1351      scan();
1352    if (ISTOK(tkeof)) unexpected_eof();
1353    scan(); // consume expected token
1354    recovering = 0;
1355  } else if (!havetok(tk)) complain(tk);
1356}
1357
1358static void skip_to(char *tklist)
1359{
1360  do scan(); while (!ISTOK(tkeof) && !strchr(tklist, CURTOK()));
1361  if (ISTOK(tkeof)) unexpected_eof();
1362}
1363
1364//// END syntax error diagnostic and recovery (Turner's method)
1365
1366static void optional_nl_or_semi(void)
1367{
1368  while (havetok(tknl) || havetok(tksemi))
1369    ;
1370}
1371
1372static void optional_nl(void)
1373{
1374  while (havetok(tknl))
1375    ;
1376}
1377
// Require a right parenthesis, then skip any newlines that follow it.
static void rparen(void)
{
  expect(tkrparen);
  optional_nl();
}
1383
1384static int have_comma(void)
1385{
1386  if (!havetok(tkcomma)) return 0;
1387  optional_nl();
1388  return 1;
1389}
1390
1391static void check_set_map(int slotnum)
1392{
1393  // POSIX: The same name shall not be used within the same scope both as
1394  // a scalar variable and as an array.
1395  if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_SCALAR)
1396    XERR("scalar param '%s' used as array\n", LOCAL[-slotnum].name);
1397  if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_SCALAR)
1398    XERR("scalar var '%s' used as array\n", GLOBAL[slotnum].name);
1399  if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_MAP;
1400  if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_MAP;
1401}
1402
1403static void check_set_scalar(int slotnum)
1404{
1405  if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_MAP)
1406    XERR("array param '%s' used as scalar\n", LOCAL[-slotnum].name);
1407  if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_MAP)
1408    XERR("array var '%s' used as scalar\n", GLOBAL[slotnum].name);
1409  if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_SCALAR;
1410  if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_SCALAR;
1411}
1412
1413static void map_name(void)
1414{
1415  int slotnum;
1416  check_set_map(slotnum = find_or_add_var_name());
1417  gen2cd(tkvar, slotnum);
1418}
1419
1420static void check_builtin_arg_counts(int tk, int num_args, char *fname)
1421{
1422  static char builtin_1_arg[] = { tkcos, tksin, tkexp, tklog, tksqrt, tkint,
1423                                  tktolower, tktoupper, tkclose, tksystem, 0};
1424  static char builtin_2_arg[] = { tkatan2, tkmatch, tkindex, tklshift, tkrshift, 0};
1425  static char builtin_al_2_arg[] = { tkband, tkbor, tkbxor, 0};
1426  static char builtin_2_3_arg[] = { tksub, tkgsub, tksplit, tksubstr, 0};
1427  static char builtin_0_1_arg[] = { tksrand, tklength, tkfflush, 0};
1428
1429  if (tk == tkrand && num_args)
1430    XERR("function '%s' expected no args, got %d\n", fname, num_args);
1431  else if (strchr(builtin_1_arg, tk) && num_args != 1)
1432    XERR("function '%s' expected 1 arg, got %d\n", fname, num_args);
1433  else if (strchr(builtin_2_arg, tk) && num_args != 2)
1434    XERR("function '%s' expected 2 args, got %d\n", fname, num_args);
1435  else if (strchr(builtin_al_2_arg, tk) && num_args < 2)
1436    XERR("function '%s' expected at least 2 args, got %d\n", fname, num_args);
1437  else if (strchr(builtin_2_3_arg, tk) && num_args != 2 && num_args != 3)
1438    XERR("function '%s' expected 2 or 3 args, got %d\n", fname, num_args);
1439  else if (strchr(builtin_0_1_arg, tk) && num_args != 0 && num_args != 1)
1440    XERR("function '%s' expected no arg or 1 arg, got %d\n", fname, num_args);
1441}
1442
// Parse the parenthesized argument list of builtin tk and emit code for
// the call. On entry the current token should be '('; builtin_name is used
// only for diagnostics. Emits (tk, num_args) after the argument code.
static void builtin_call(int tk, char *builtin_name)
{
  int num_args = 0;
  expect(tklparen);
  TT.cgl.paren_level++;
  switch (tk) {
    case tksub:
    case tkgsub:
      // First arg: a regex literal is emitted as a regex value rather than
      // being parsed as an expression.
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr(0);
      expect(tkcomma);
      optional_nl();
      expr(0);
      if (have_comma()) {
        lvalue();
      } else {
        // No target given: emit literal 0 and a field reference op
        // (presumably a reference to $0).
        gen2cd(tknumber, make_literal_num_val(0));
        gen2cd(opfldref, tkeof);
      }
      // Always emitted as a 3-argument call.
      num_args = 3;
      break;

    case tkmatch:
      expr(0);
      expect(tkcomma);
      optional_nl();
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr(0);
      num_args = 2;
      break;

    case tksplit:
      expr(0);
      expect(tkcomma);
      optional_nl();
      // Second arg must be a lone variable name: a tkvar whose next
      // character is ',' or ')'.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        map_name();
        scan();
      } else {
        XERR("%s\n", "expected array name as split() 2nd arg");
        expr(0);
      }
      // FIXME some recovery needed here!?
      num_args = 2;
      if (have_comma()) {
        if (ISTOK(tkregex)) {
          gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
          scan();
        } else expr(0);
        num_args++;
      }
      break;

    case tklength:
      // A lone variable argument is emitted without scalar/map checking.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        gen2cd(tkvar, find_or_add_var_name());
        scan();
        num_args++;
      }
      ATTR_FALLTHROUGH_INTENDED;

    default:
      // Generic comma-separated expression list (possibly empty).
      if (ISTOK(tkrparen)) break;
      do {
        expr(0);
        num_args++;
      } while (have_comma());
      break;
  }
  expect(tkrparen);
  TT.cgl.paren_level--;

  check_builtin_arg_counts(tk, num_args, builtin_name);

  gen2cd(tk, num_args);
}
1523
1524static void function_call(void)
1525{
1526  // Function call: generate TT.zcode to:
1527  //  push placeholder for return value, push placeholder for return addr,
1528  //  push args, then push number of args, then:
1529  //      for builtins: gen opcode (e.g. tkgsub)
1530  //      for user func: gen (tkfunc, function location)
1531  //      if function not yet defined, location will be filled in when defined
1532  //          the location slots will be chained from the symbol table
1533  int functk = 0, funcnum = 0;
1534  char builtin_name[16];  // be sure it's long enough for all builtins
1535  if (ISTOK(tkbuiltin)) {
1536    functk = TT.scs->tokbuiltin;
1537    strcpy(builtin_name, TT.tokstr);
1538  } else if (ISTOK(tkfunc)) { // user function
1539    funcnum = find_func_def_entry(TT.tokstr);
1540    if (!funcnum) funcnum = add_func_def_entry(TT.tokstr);
1541    FUNC_DEF[funcnum].flags |= FUNC_CALLED;
1542    gen2cd(opprepcall, funcnum);
1543  } else error_exit("bad function %s!", TT.tokstr);
1544  scan();
1545  // length() can appear without parens
1546  int num_args = 0;
1547  if (functk == tklength && !ISTOK(tklparen)) {
1548    gen2cd(functk, 0);
1549    return;
1550  }
1551  if (functk) {   // builtin
1552    builtin_call(functk, builtin_name);
1553    return;
1554  }
1555  expect(tklparen);
1556  TT.cgl.paren_level++;
1557  if (ISTOK(tkrparen)) {
1558    scan();
1559  } else {
1560    do {
1561      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1562        // Function call arg that is a lone variable. Cannot tell in this
1563        // context if it is a scalar or map. Just add it to symbol table.
1564        gen2cd(tkvar, find_or_add_var_name());
1565        scan();
1566      } else expr(0);
1567      num_args++;
1568    } while (have_comma());
1569    expect(tkrparen);
1570  }
1571  TT.cgl.paren_level--;
1572  gen2cd(tkfunc, num_args);
1573}
1574
1575static void var(void)
1576{
1577  // var name is in TT.tokstr
1578  // slotnum: + means global; - means local to function
1579  int slotnum = find_or_add_var_name();
1580  scan();
1581  if (havetok(tklbracket)) {
1582    check_set_map(slotnum);
1583    int num_subscripts = 0;
1584    do {
1585      expr(0);
1586      num_subscripts++;
1587    } while (have_comma());
1588    expect(tkrbracket);
1589    if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1590    gen2cd(opmap, slotnum);
1591  } else {
1592    check_set_scalar(slotnum);
1593    gen2cd(tkvar, slotnum);
1594  }
1595}
1596
//   Dollar $ tkfield can be followed by "any" expression, but
1598//   the way it binds varies.
1599//   The following are valid lvalues:
1600//   $ ( expr )
1601//   $ tkvar $ tknumber $ tkstring $ tkregex
1602//   $ tkfunc(...)
1603//   $ tkbuiltin(...)
1604//   $ length   # with no parens after
1605//   $ tkclose(), ... $ tksubstr
1606//   $ tkgetline FIXME TODO TEST THIS
1607//   $ ++ lvalue
1608//   $ -- lvalue
1609//   $ + expression_up_to_exponentiation (also -, ! prefix ops)
1610//   $ $ whatever_can_follow_and_bind_to_dollar
1611//
1612//     tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1613//     tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline,
1614//     tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1615//
1616// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k*k }'
1617// 18
1618// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k*k }'
1619// 18
1620// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k^k }'
1621// 81
1622// ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k^k }'
1623// 8
1624
1625static void field_op(void)
1626{
1627  // CURTOK() must be $ here.
1628  expect(tkfield);
1629  // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1630  // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
1631  // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1632  if (ISTOK(tkfield)) field_op();
1633  else if (ISTOK(tkvar)) var();
1634  else primary();
1635  // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
1636  // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
1637  gen2cd(tkfield, tkeof);
1638}
1639
// Zero-terminated token lists, searched with strchr().

// Tokens that can start expression
static char exprstartsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc,
  tkbuiltin, tkfield, tkminus, tkplus, tknot, tkincr, tkdecr, tklparen,
  tkgetline, tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf,
  tksubstr, tkband, tkbor, tkbxor, tkrshift, tklshift, 0};

// Tokens that can end statement
static char stmtendsy[] = {tknl, tksemi, tkrbrace, 0};

// Tokens that can follow expressions of a print statement
static char printexprendsy[] = {tkgt, tkappend, tkpipe, tknl, tksemi, tkrbrace, 0};
1651
1652// !! Ensure this:
1653// ternary op is right associative, so
1654// a ? b : c ? d : e        evaluates as
1655// a ? b : (c ? d : e)      not as
1656// (a ? b : c) ? d : e
1657
1658static void convert_push_to_reference(void)
1659{
1660  if (ZCODE[TT.zcode_last - 1] == tkvar) ZCODE[TT.zcode_last-1] = opvarref;
1661  else if (ZCODE[TT.zcode_last - 1] == opmap) ZCODE[TT.zcode_last - 1] = opmapref;
1662  else if (ZCODE[TT.zcode_last - 1] == tkfield) ZCODE[TT.zcode_last - 1] = opfldref;
1663  else error_exit("bad lvalue?");
1664}
1665
1666static void lvalue(void)
1667{
1668  if (ISTOK(tkfield)) {
1669    field_op();
1670    convert_push_to_reference();
1671  } else if (ISTOK(tkvar)) {
1672    var();
1673    convert_push_to_reference();
1674  } else {
1675    XERR("syntax near '%s' (bad lvalue)\n", TT.tokstr);
1676  }
1677}
1678
// Parse a primary expression (operand with its prefix/postfix operators)
// and emit its code. See the notes below for the return value.
static int primary(void)
{
  //  On entry: CURTOK() is first token of expression
  //  On exit: CURTOK() is infix operator (for binary_op() to handle) or next
  //   token after end of expression.
  //  return -1 for field or var (potential lvalue);
  //      2 or more for comma-separated expr list
  //          as in "multiple subscript expression in array"
  //          e.g. (1, 2) in array_name, or a print/printf list;
  //      otherwise return 0
  //
  //  expr can start with:
  //      tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
  //      tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
  //      tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
  //
  //  bwk treats these as keywords, not builtins: close index match split sub gsub
  //      sprintf substr
  //
  //  bwk builtins are: atan2 cos sin exp log sqrt int rand srand length tolower
  //      toupper system fflush
  //  NOTE: fflush() is NOT in POSIX awk
  //
  //  primary() must consume prefix and postfix operators as well as
  //      num, string, regex, var, var with subscripts, and function calls

  int num_exprs = 0;
  int nargs, modifier;
  int tok = CURTOK();
  switch (tok) {
    case tkvar:
    case tkfield:
      if (ISTOK(tkvar)) var();
      else field_op();
      // Postfix ++/--: turn the push into a reference and emit the op.
      // Without a postfix op, report a potential lvalue to the caller.
      if (ISTOK(tkincr) || ISTOK(tkdecr)) {
        convert_push_to_reference();
        gencd(CURTOK());
        scan();
      } else return -1;
      break;

    case tknumber:
      gen2cd(tknumber, make_literal_num_val(TT.scs->numval));
      scan();
      break;

    case tkstring:
      gen2cd(tkstring, make_literal_str_val(TT.tokstr));
      scan();
      break;

    case tkregex:
      // When an ERE token appears as an expression in any context other
      // than as the right-hand of the '~' or "!~" operator or as one of
      // the built-in function arguments described below, the value of
      // the resulting expression shall be the equivalent of: $0 ~ /ere/
      // FIXME TODO
      gen2cd(opmatchrec, make_literal_regex_val(TT.tokstr));
      scan();
      break;

    case tkbuiltin: // various builtins
    case tkfunc:    // user-defined function
      function_call();
      break;

    // Unary prefix ! + -
    case tknot:
    case tkminus:
    case tkplus:
      scan();
      expr(getlbp(tknot));   // unary +/- same precedence as !
      // Unary plus is emitted as two negations.
      if (tok == tknot) gencd(tknot);
      else gencd(opnegate);               // forces to number
      if (tok == tkplus) gencd(opnegate); // forces to number
      break;

      // Unary prefix ++ -- MUST take lvalue
    case tkincr:
    case tkdecr:
      scan();
      lvalue();
      if (tok == tkincr) gencd(oppreincr);
      else gencd(oppredecr);
      break;

    case tklparen:
      // Parenthesized expression or comma-separated expression list.
      scan();
      TT.cgl.paren_level++;
      num_exprs = 0;
      do {
        expr(0);
        num_exprs++;
      } while (have_comma());
      expect(tkrparen);
      TT.cgl.paren_level--;
      if (num_exprs > 1) return num_exprs;
      break;

    case tkgetline:
      // getline may be (according to awk book):
      // getline [var [<file]]
      // getline <file
      // cmd | getline [var]
      // var must be lvalue (can be any lvalue?)
      scan();
      nargs = 0;
      modifier = tkeof;
      if (ISTOK(tkfield) || ISTOK(tkvar)) {
        lvalue();
        nargs++;
      }
      if (havetok(tklt)) {
        expr(getrbp(tkcat));   // bwk "historical practice" precedence
        nargs++;
        modifier = tklt;
      }
      // Emitted as (tkgetline, nargs) followed by the modifier word
      // (tklt for "< file", tkeof for none).
      gen2cd(tkgetline, nargs);
      gencd(modifier);
      break;

    default:
      // Not a valid expression start: report and skip to statement end.
      XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
      skip_to(stmtendsy);
      break;
  }
  return 0;
}
1807
// Parse the right operand(s) of infix operator optor and emit code for the
// operation. The left operand's code has already been emitted. For every
// operator except tkcat (which has no token of its own) the operator token
// is consumed first.
static void binary_op(int optor)  // Also for ternary ?: optor.
{
  int nargs, cdx = 0;  // index in TT.zcode list
  int rbp = getrbp(optor);
  if (optor != tkcat) scan();
  // CURTOK() holds first token of right operand.
  switch (optor) {
    case tkin:
      // right side of 'in' must be (only) an array name
      map_name();
      gencd(tkin);
      scan();
      // FIXME TODO 20230109 x = y in a && 2 works OK?
      // x = y in a + 2 does not; it's parsed as x = (y in a) + 2
      // The +2 is not cat'ed with (y in a) as in bwk's OTA.
      // Other awks see y in a + 2 as a syntax error. They (may)
      // not want anything after y in a except a lower binding operator
      // (&& || ?:) or end of expression, i.e. ')' ';' '}'
      break;

  case tkpipe:
      // cmd | getline [lvalue]
      expect(tkgetline);
      nargs = 1;
      if (ISTOK(tkfield) || ISTOK(tkvar)) {
        lvalue();
        nargs++;
      }
      gen2cd(tkgetline, nargs);
      gencd(tkpipe);
      break;

  case tkand:
  case tkor:
      optional_nl();
      // Emit the op with a placeholder operand (-1), then patch it with
      // the offset from that operand to the end of the right operand code.
      gen2cd(optor, -1);  // tkand: jump if false, else drop
      cdx = TT.zcode_last;   // tkor:  jump if true, else drop
      expr(rbp);
      gencd(opnotnot);    // replace TT.stack top with truth value
      ZCODE[cdx] = TT.zcode_last - cdx;
      break;

  case tkternif:
      // cond ? a : b -- two placeholder operands, each patched with the
      // offset to the end of the branch that follows it.
      gen2cd(optor, -1);
      cdx = TT.zcode_last;
      expr(0);
      expect(tkternelse);
      gen2cd(tkternelse, -1);
      ZCODE[cdx] = TT.zcode_last - cdx;
      cdx = TT.zcode_last;
      expr(rbp);
      ZCODE[cdx] = TT.zcode_last - cdx;
      break;

  case tkmatchop:
  case tknotmatch:
      expr(rbp);
      // A regex literal on the right was emitted as "match against record"
      // (opmatchrec); turn it back into a plain regex operand.
      if (ZCODE[TT.zcode_last - 1] == opmatchrec) ZCODE[TT.zcode_last - 1] = tkregex;
      gencd(optor);
      break;

  default:
      expr(rbp);
      gencd(optor);
  }
}
1873
1874static int cat_start_concated_expr(int tok)
1875{
1876  // concat'ed expr can start w/ var number string func builtin $ ! ( (or ++ if prev was not lvalue)
1877  static char exprstarttermsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,
1878    tkfield, tknot, tkincr, tkdecr, tklparen, tkgetline, 0};
1879
1880  // NOTE this depends on builtins (close etc) being >= tkgetline
1881  return !! strchr(exprstarttermsy, tok) || tok >= tkgetline;
1882}
1883
#define CALLED_BY_PRINT 99987 // Arbitrary, different from any real rbp value

// Parse an expression by precedence climbing and emit its code.
// rbp: binding power of the operator to the left; parsing continues while
// the next operator's left binding power exceeds it. The special value
// CALLED_BY_PRINT marks a call made directly from print_stmt().
// Returns the number of expressions in a parenthesized list when called
// with CALLED_BY_PRINT and that list is followed by a print terminator;
// otherwise returns 0.
static int expr(int rbp)
{
  // On entry: TT.scs has first symbol of expression, e.g. var, number, string,
  // regex, func, getline, left paren, prefix op ($ ++ -- ! unary + or -) etc.
  static char asgnops[] = {tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, 0};
  int prim_st = primary();
  // If called directly by print_stmt(), and found a parenthesized expression list
  //    followed by an end of print statement: any of > >> | ; } <newline>
  //    Then: return the count of expressions in list
  //    Else: continue parsing an expression
  if (rbp == CALLED_BY_PRINT) {
    if (prim_st > 0 && strchr(printexprendsy, CURTOK())) return prim_st;
    else rbp = 0;
  }

  // mult_expr_list in parens must be followed by 'in' unless it
  // immediately follows print or printf, where it may still be followed
  // by 'in' ... unless at end of statement
  if (prim_st > 0 && ! ISTOK(tkin))
    XERR("syntax near '%s'; expected 'in'\n", TT.tokstr);
  if (prim_st > 0) gen2cd(tkrbracket, prim_st);
  // primary() has eaten subscripts, function args, postfix ops.
  // CURTOK() should be a binary op.
  int optor = CURTOK();
  if (strchr(asgnops, optor)) {

    // TODO FIXME ?  NOT SURE IF THIS WORKS RIGHT!
    // awk does not parse according to POSIX spec in some odd cases.
    // When an assignment (lvalue =) is on the right of certain operators,
    // it is not treated as a bad lvalue (as it is in C).
    // Example: (1 && a=2) # no error; the assignment is performed.
    // This happens for ?: || && ~ !~ < <= ~= == > >=
    //
    static char odd_assignment_rbp[] = {59, 60, 70, 80, 100, 110, 0};
    // Assignment is accepted only when the left side was a potential
    // lvalue (prim_st < 0) and the context's binding power permits it.
    if (prim_st < 0 && (rbp <= getrbp(optor) || strchr(odd_assignment_rbp, rbp))) {
      convert_push_to_reference();
      scan();
      expr(getrbp(optor));
      gencd(optor);
      return 0;
    }
    XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
    skip_to(stmtendsy);
  }
  // A token that can start an operand here means implicit concatenation.
  if (cat_start_concated_expr(optor)) optor = tkcat;
  while (rbp < getlbp(optor)) {
    binary_op(optor);
    // HERE tok s/b an operator or expression terminator ( ; etc.).
    optor = CURTOK();
    if (cat_start_concated_expr(optor)) optor = tkcat;
  }
  return 0;
}
1940
1941static void print_stmt(int tk)
1942{
1943  static char outmodes[] = {tkgt, tkappend, tkpipe, 0};
1944  int num_exprs = 0, outmode;
1945  TT.cgl.in_print_stmt = 1;
1946  expect(tk); // tkprint or tkprintf
1947  if ((tk == tkprintf) || !strchr(printexprendsy, CURTOK())) {
1948    // printf always needs expression
1949    // print non-empty statement needs expression
1950    num_exprs = expr(CALLED_BY_PRINT);
1951    if (num_exprs > 0 && !strchr(printexprendsy, CURTOK())) FATAL("print stmt bug");
1952    if (!num_exprs) {
1953      for (num_exprs++; have_comma(); num_exprs++)
1954        expr(0);
1955    }
1956  }
1957  outmode = CURTOK();
1958  if (strchr(outmodes, outmode)) {
1959    scan();
1960    expr(0); // FIXME s/b only bwk term? check POSIX
1961    num_exprs++;
1962  } else outmode = 0;
1963  gen2cd(tk, num_exprs);
1964  gencd(outmode);
1965  TT.cgl.in_print_stmt = 0;
1966}
1967
1968static void delete_stmt(void)
1969{
1970  expect(tkdelete);
1971  if (ISTOK(tkvar)) {
1972    int slotnum = find_or_add_var_name();
1973    check_set_map(slotnum);
1974    scan();
1975    if (havetok(tklbracket)) {
1976      int num_subscripts = 0;
1977      do {
1978        expr(0);
1979        num_subscripts++;
1980      } while (have_comma());
1981      expect(tkrbracket);
1982      if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1983      gen2cd(opmapref, slotnum);
1984      gencd(tkdelete);
1985    } else {
1986      // delete entire map (elements only; var is still a map)
1987      gen2cd(opmapref, slotnum);
1988      gencd(opmapdelete);
1989    }
1990  } else expect(tkvar);
1991}
1992
1993static void simple_stmt(void)
1994{
1995  if (strchr(exprstartsy, CURTOK())) {
1996    expr(0);
1997    gencd(opdrop);
1998    return;
1999  }
2000  switch (CURTOK()) {
2001    case tkprint:
2002    case tkprintf:
2003      print_stmt(CURTOK());
2004      break;
2005
2006    case tkdelete:
2007      delete_stmt();
2008      break;
2009
2010    default:
2011      XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
2012      skip_to(stmtendsy);
2013  }
2014}
2015
2016static int prev_was_terminated(void)
2017{
2018  return !!strchr(stmtendsy, TT.prevtok);
2019}
2020
2021static int is_nl_semi(void)
2022{
2023  return ISTOK(tknl) || ISTOK(tksemi);
2024}
2025
2026static void if_stmt(void)
2027{
2028  expect(tkif);
2029  expect(tklparen);
2030  expr(0);
2031  rparen();
2032  gen2cd(tkif, -1);
2033  int cdx = TT.zcode_last;
2034  stmt();
2035  if (!prev_was_terminated() && is_nl_semi()) {
2036    scan();
2037    optional_nl();
2038  }
2039  if (prev_was_terminated()) {
2040    optional_nl();
2041    if (havetok(tkelse)) {
2042      gen2cd(tkelse, -1);
2043      ZCODE[cdx] = TT.zcode_last - cdx;
2044      cdx = TT.zcode_last;
2045      optional_nl();
2046      stmt();
2047    }
2048  }
2049  ZCODE[cdx] = TT.zcode_last - cdx;
2050}
2051
2052static void save_break_continue(int *brk, int *cont)
2053{
2054  *brk = TT.cgl.break_dest;
2055  *cont = TT.cgl.continue_dest;
2056}
2057
2058static void restore_break_continue(int *brk, int *cont)
2059{
2060  TT.cgl.break_dest = *brk;
2061  TT.cgl.continue_dest = *cont;
2062}
2063
2064static void while_stmt(void)
2065{
2066  int brk, cont;
2067  save_break_continue(&brk, &cont);
2068  expect(tkwhile);
2069  expect(tklparen);
2070  TT.cgl.continue_dest = TT.zcode_last + 1;
2071  expr(0);
2072  rparen();
2073  gen2cd(tkwhile, 2);    // drop, jump if true
2074  TT.cgl.break_dest = TT.zcode_last + 1;
2075  gen2cd(opjump, -1);     // jump here to break
2076  stmt();
2077  gen2cd(opjump, -1);     // jump to continue
2078  ZCODE[TT.zcode_last] = TT.cgl.continue_dest - TT.zcode_last - 1;
2079  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2080  restore_break_continue(&brk, &cont);
2081}
2082
2083static void do_stmt(void)
2084{
2085  int brk, cont;
2086  save_break_continue(&brk, &cont);
2087  expect(tkdo);
2088  optional_nl();
2089  gen2cd(opjump, 4);   // jump over jumps, to statement
2090  TT.cgl.continue_dest = TT.zcode_last + 1;
2091  gen2cd(opjump, -1);   // here on continue
2092  TT.cgl.break_dest = TT.zcode_last + 1;
2093  gen2cd(opjump, -1);   // here on break
2094  stmt();
2095  if (!prev_was_terminated()) {
2096    if (is_nl_semi()) {
2097      scan();
2098      optional_nl();
2099    } else {
2100      XERR("syntax near '%s' -- ';' or newline expected\n", TT.tokstr);
2101      // FIXME
2102    }
2103  }
2104  ZCODE[TT.cgl.continue_dest + 1] = TT.zcode_last - TT.cgl.continue_dest - 1;
2105  optional_nl();
2106  expect(tkwhile);
2107  expect(tklparen);
2108  expr(0);
2109  rparen();
2110  gen2cd(tkwhile, TT.cgl.break_dest - TT.zcode_last - 1);
2111  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2112  restore_break_continue(&brk, &cont);
2113}
2114
2115static void for_not_map_iter(void)
2116{
2117  // Here after loop initialization, if any; loop condition
2118  int condition_loc = TT.zcode_last + 1;
2119  if (havetok(tksemi)) {
2120    // "endless" loop variant; no condition
2121    // no NL allowed here in OTA
2122    gen2cd(opjump, -1);     // jump to statement
2123  } else {
2124    optional_nl();                // NOT posix or awk book; in OTA
2125    expr(0);                 // loop while true
2126    expect(tksemi);
2127    gen2cd(tkwhile, -1);    // drop, jump to statement if true
2128  }
2129  optional_nl();                    // NOT posix or awk book; in OTA
2130  TT.cgl.break_dest = TT.zcode_last + 1;
2131  gen2cd(opjump, -1);
2132  TT.cgl.continue_dest = TT.zcode_last + 1;
2133  if (!ISTOK(tkrparen)) simple_stmt();  // "increment"
2134  gen2cd(opjump, condition_loc - TT.zcode_last - 3);
2135  rparen();
2136  ZCODE[TT.cgl.break_dest - 1] = TT.zcode_last - TT.cgl.break_dest + 1;
2137  stmt();
2138  gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
2139  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2140}
2141
2142static int valid_for_array_iteration(int first, int last)
2143{
2144  return ZCODE[first] == tkvar && ZCODE[first + 2] == tkvar
2145      && ZCODE[first + 4] == tkin && ZCODE[first + 5] == opdrop
2146      && first + 5 == last;
2147}
2148
2149static void for_stmt(void)
2150{
2151  int brk, cont;
2152  save_break_continue(&brk, &cont);
2153  expect(tkfor);
2154  expect(tklparen);
2155  if (havetok(tksemi)) {
2156    // No "initialization" part
2157    for_not_map_iter();
2158  } else {
2159    int loop_start_loc = TT.zcode_last + 1;
2160    simple_stmt();  // initializaton part, OR varname in arrayname form
2161    if (!havetok(tkrparen)) {
2162      expect(tksemi);
2163      for_not_map_iter();
2164    } else {
2165      // Must be map iteration
2166      // Check here for varname in varname!
2167      // FIXME TODO must examine generated TT.zcode for var in array?
2168      if (!valid_for_array_iteration(loop_start_loc, TT.zcode_last))
2169        XERR("%s", "bad 'for (var in array)' loop\n");
2170      else {
2171        ZCODE[TT.zcode_last-5] = opvarref;
2172        ZCODE[TT.zcode_last-1] = tknumber;
2173        ZCODE[TT.zcode_last] = make_literal_num_val(-1);
2174        TT.cgl.continue_dest = TT.zcode_last + 1;
2175        gen2cd(opmapiternext, 2);
2176        TT.cgl.break_dest = TT.zcode_last + 1;
2177        gen2cd(opjump, -1);   // fill in with loc after stmt
2178      }
2179      optional_nl();
2180      // fixup TT.stack if return or exit inside for (var in array)
2181      TT.cgl.stack_offset_to_fix += 3;
2182      stmt();
2183      TT.cgl.stack_offset_to_fix -= 3;
2184      gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
2185      ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
2186      gencd(opdrop);
2187      gencd(opdrop);
2188      gencd(opdrop);
2189    }
2190  }
2191  restore_break_continue(&brk, &cont);
2192}
2193
2194static void stmt(void)
2195{
2196  switch (CURTOK()) {
2197    case tkeof:
2198      break;     // FIXME ERROR?
2199
2200    case tkbreak:
2201      scan();
2202      if (TT.cgl.break_dest) gen2cd(tkbreak, TT.cgl.break_dest - TT.zcode_last - 3);
2203      else XERR("%s", "break not in a loop\n");
2204      break;
2205
2206    case tkcontinue:
2207      scan();
2208      if (TT.cgl.continue_dest)
2209        gen2cd(tkcontinue, TT.cgl.continue_dest - TT.zcode_last - 3);
2210      else XERR("%s", "continue not in a loop\n");
2211      break;
2212
2213    case tknext:
2214      scan();
2215      gencd(tknext);
2216      if (TT.cgl.rule_type) XERR("%s", "next inside BEGIN or END\n");
2217      if (TT.cgl.in_function_body) XERR("%s", "next inside function def\n");
2218      break;
2219
2220    case tknextfile:
2221      scan();
2222      gencd(tknextfile);
2223      if (TT.cgl.rule_type) XERR("%s", "nextfile inside BEGIN or END\n");
2224      if (TT.cgl.in_function_body) XERR("%s", "nextfile inside function def\n");
2225      break;
2226
2227    case tkexit:
2228      scan();
2229      if (strchr(exprstartsy, CURTOK())) {
2230        expr(0);
2231      } else gen2cd(tknumber, make_literal_num_val(NO_EXIT_STATUS));
2232      gencd(tkexit);
2233      break;
2234
2235    case tkreturn:
2236      scan();
2237      if (TT.cgl.stack_offset_to_fix) gen2cd(opdrop_n, TT.cgl.stack_offset_to_fix);
2238      if (strchr(exprstartsy, CURTOK())) {
2239        expr(0);
2240      } else gen2cd(tknumber, make_literal_num_val(0.0));
2241      gen2cd(tkreturn, TT.cgl.nparms);
2242      if (!TT.cgl.in_function_body) XERR("%s", "return outside function def\n");
2243      break;
2244
2245    case tklbrace:
2246      action(tklbrace);
2247      break;
2248
2249    case tkif:
2250      if_stmt();
2251      break;
2252
2253    case tkwhile:
2254      while_stmt();
2255      break;
2256
2257    case tkdo:
2258      do_stmt();
2259      break;
2260
2261    case tkfor:
2262      for_stmt();
2263      break;
2264
2265    case tksemi:
2266      scan();
2267      break;
2268    default:
2269      simple_stmt();      // expression print printf delete
2270  }
2271}
2272
2273static void add_param(int funcnum, char *s)
2274{
2275  if (!find_local_entry(s)) add_local_entry(s);
2276  else XERR("function '%s' dup param '%s'\n", FUNC_DEF[funcnum].name, s);
2277  TT.cgl.nparms++;
2278
2279  // POSIX: The same name shall not be used as both a function parameter name
2280  // and as the name of a function or a special awk variable.
2281  // !!! NOTE seems implementations exc. mawk only compare param names with
2282  // builtin funcs; use same name as userfunc is OK!
2283  if (!strcmp(s, FUNC_DEF[funcnum].name))
2284    XERR("function '%s' param '%s' matches func name\n",
2285        FUNC_DEF[funcnum].name, s);
2286  if (find_global(s) && find_global(s) < TT.spec_var_limit)
2287    XERR("function '%s' param '%s' matches special var\n",
2288        FUNC_DEF[funcnum].name, s);
2289}
2290
2291static void function_def(void)
2292{
2293  expect(tkfunction);
2294  int funcnum = find_func_def_entry(TT.tokstr);
2295  if (!funcnum) {
2296    funcnum = add_func_def_entry(TT.tokstr);
2297  } else if (FUNC_DEF[funcnum].flags & FUNC_DEFINED) {
2298    XERR("dup defined function '%s'\n", TT.tokstr);
2299  }
2300  FUNC_DEF[funcnum].flags |= FUNC_DEFINED;
2301  if (find_global(TT.tokstr)) {
2302    // POSIX: The same name shall not be used both as a variable name with
2303    // global scope and as the name of a function.
2304    XERR("function name '%s' previously defined\n", TT.tokstr);
2305  }
2306
2307  gen2cd(tkfunction, funcnum);
2308  FUNC_DEF[funcnum].zcode_addr = TT.zcode_last - 1;
2309  TT.cgl.funcnum = funcnum;
2310  TT.cgl.nparms = 0;
2311  if (ISTOK(tkfunc)) expect(tkfunc); // func name with no space before (
2312  else expect(tkvar);  // func name with space before (
2313  expect(tklparen);
2314  if (ISTOK(tkvar)) {
2315    add_param(funcnum, TT.tokstr);
2316    scan();
2317    // FIXME is the the best way? what if TT.tokstr not a tkvar?
2318    while (have_comma()) {
2319      add_param(funcnum, TT.tokstr);
2320      expect(tkvar);
2321    }
2322  }
2323  rparen();
2324  if (ISTOK(tklbrace)) {
2325    TT.cgl.in_function_body = 1;
2326    action(tkfunc);
2327    TT.cgl.in_function_body = 0;
2328    // Need to return uninit value if falling off end of function.
2329    gen2cd(tknumber, make_uninit_val());
2330    gen2cd(tkreturn, TT.cgl.nparms);
2331  } else {
2332    XERR("syntax near '%s'\n", TT.tokstr);
2333    // FIXME some recovery needed here!?
2334  }
2335  // Do not re-init locals table for dup function.
2336  // Avoids memory leak detected by LeakSanitizer.
2337  if (!FUNC_DEF[funcnum].function_locals.base) {
2338    FUNC_DEF[funcnum].function_locals = TT.locals_table;
2339    init_locals_table();
2340  }
2341}
2342
2343static void action(int action_type)
2344{
2345(void)action_type;
2346  // action_type is tkbegin, tkend, tkdo (every line), tkif (if pattern),
2347  //                  tkfunc (function body), tklbrace (compound statement)
2348  // Should have lbrace on entry.
2349  expect(tklbrace);
2350  for (;;) {
2351    if (ISTOK(tkeof)) unexpected_eof();
2352    optional_nl_or_semi();
2353    if (havetok(tkrbrace)) {
2354      break;
2355    }
2356    stmt();
2357    // stmt() is normally unterminated here, but may be terminated if we
2358    // have if with no else (had to consume terminator looking for else)
2359    //   !!!   if (ISTOK(tkrbrace) || prev_was_terminated())
2360    if (prev_was_terminated()) continue;
2361    if (!is_nl_semi() && !ISTOK(tkrbrace)) {
2362      XERR("syntax near '%s' -- newline, ';', or '}' expected\n", TT.tokstr);
2363      while (!is_nl_semi() && !ISTOK(tkrbrace) && !ISTOK(tkeof)) scan();
2364      if (ISTOK(tkeof)) unexpected_eof();
2365    }
2366    if (havetok(tkrbrace)) break;
2367    // Must be semicolon or newline
2368    scan();
2369  }
2370}
2371
2372static void rule(void)
2373{
2374  //       pa_pat
2375  //     | pa_pat lbrace stmtlist '}'
2376  //     | pa_pat ',' opt_nl pa_pat
2377  //     | pa_pat ',' opt_nl pa_pat lbrace stmtlist '}'
2378  //     | lbrace stmtlist '}'
2379  //     | XBEGIN lbrace stmtlist '}'
2380  //     | XEND lbrace stmtlist '}'
2381  //     | FUNC funcname '(' varlist rparen  lbrace stmtlist '}'
2382
2383  switch (CURTOK()) {
2384    case tkbegin:
2385      scan();
2386      if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin] = TT.zcode_last - TT.cgl.last_begin;
2387      else TT.cgl.first_begin = TT.zcode_last + 1;
2388
2389      TT.cgl.rule_type = tkbegin;
2390      action(tkbegin);
2391      TT.cgl.rule_type = 0;
2392      gen2cd(opjump, -1);
2393      TT.cgl.last_begin = TT.zcode_last;
2394      break;
2395
2396    case tkend:
2397      scan();
2398      if (TT.cgl.last_end) ZCODE[TT.cgl.last_end] = TT.zcode_last - TT.cgl.last_end;
2399      else TT.cgl.first_end = TT.zcode_last + 1;
2400
2401      TT.cgl.rule_type = tkbegin;
2402      action(tkend);
2403      TT.cgl.rule_type = 0;
2404      gen2cd(opjump, -1);
2405      TT.cgl.last_end = TT.zcode_last;
2406      break;
2407
2408    case tklbrace:
2409      if (TT.cgl.last_recrule)
2410        ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2411      else TT.cgl.first_recrule = TT.zcode_last + 1;
2412      action(tkdo);
2413      gen2cd(opjump, -1);
2414      TT.cgl.last_recrule = TT.zcode_last;
2415      break;
2416
2417    case tkfunction:
2418      function_def();
2419      break;
2420    default:
2421      if (TT.cgl.last_recrule)
2422        ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2423      else TT.cgl.first_recrule = TT.zcode_last + 1;
2424      gen2cd(opjump, 1);
2425      gencd(tkeof);
2426      int cdx = 0, saveloc = TT.zcode_last;
2427      expr(0);
2428      if (!have_comma()) {
2429        gen2cd(tkif, -1);
2430        cdx = TT.zcode_last;
2431      } else {
2432        gen2cd(oprange2, ++TT.cgl.range_pattern_num);
2433        gencd(-1);
2434        cdx = TT.zcode_last;
2435        ZCODE[saveloc-2] = oprange1;
2436        ZCODE[saveloc-1] = TT.cgl.range_pattern_num;
2437        ZCODE[saveloc] = TT.zcode_last - saveloc;
2438        expr(0);
2439        gen2cd(oprange3, TT.cgl.range_pattern_num);
2440      }
2441      if (ISTOK(tklbrace)) {
2442        action(tkif);
2443        ZCODE[cdx] = TT.zcode_last - cdx;
2444      } else {
2445        gencd(opprintrec);   // print $0 ?
2446        ZCODE[cdx] = TT.zcode_last - cdx;
2447      }
2448      gen2cd(opjump, -1);
2449      TT.cgl.last_recrule = TT.zcode_last;
2450  }
2451}
2452
2453static void diag_func_def_ref(void)
2454{
2455  int n = zlist_len(&TT.func_def_table);
2456  for (int k = 1; k < n; k++) {
2457    if ((FUNC_DEF[k].flags & FUNC_CALLED) &&
2458            !(FUNC_DEF[k].flags & FUNC_DEFINED)) {
2459      // Sorry, we can't tell where this was called from, for now at least.
2460      XERR("Undefined function '%s'", FUNC_DEF[k].name);
2461    }
2462  }
2463}
2464
2465static void compile(void)
2466{
2467  init_compiler();
2468  init_scanner();
2469  scan();
2470  optional_nl_or_semi();        // Does posix allow NL or ; before first rule?
2471  while (! ISTOK(tkeof)) {
2472    rule();
2473    optional_nl_or_semi();        // NOT POSIX
2474  }
2475
2476
2477  if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin-1] = opquit;
2478  if (TT.cgl.last_end) ZCODE[TT.cgl.last_end-1] = opquit;
2479  if (TT.cgl.last_recrule) ZCODE[TT.cgl.last_recrule-1] = opquit;
2480
2481  gen2cd(tknumber, make_literal_num_val(0.0));
2482  gencd(tkexit);
2483  gencd(opquit);
2484  // If there are only BEGIN and END or only END actions, generate actions to
2485  // read all input before END.
2486  if (TT.cgl.first_end && !TT.cgl.first_recrule) {
2487    gencd(opquit);
2488    TT.cgl.first_recrule = TT.zcode_last;
2489  }
2490  gencd(opquit);  // One more opcode to keep ip in bounds in run code.
2491  diag_func_def_ref();
2492}
2493
2494////////////////////
2495//// runtime
2496////////////////////
2497
2498static void check_numeric_string(struct zvalue *v)
2499{
2500  if (v->vst) {
2501    char *end, *s = v->vst->str;
2502    // Significant speed gain with this test:
2503    // num string must begin space, +, -, ., or digit.
2504    if (strchr("+-.1234567890 ", *s)) {
2505      double num = strtod(s, &end);
2506      if (s == end || end[strspn(end, " ")]) return;
2507      v->num = num;
2508      v->flags |= ZF_NUM | ZF_STR | ZF_NUMSTR;
2509    }
2510  }
2511}
2512
2513static struct zstring *num_to_zstring(double n, char *fmt)
2514{
2515  int k;
2516  if (n == (long long)n) k = snprintf(TT.pbuf, PBUFSIZE, "%lld", (long long)n);
2517  else k = snprintf(TT.pbuf, PBUFSIZE, fmt, n);
2518  if (k < 0 || k >= PBUFSIZE) FFATAL("error encoding %f via '%s'", n, fmt);
2519  return new_zstring(TT.pbuf, k);
2520}
2521
2522////////////////////
2523//// regex routines
2524////////////////////
2525
// Process backslash escapes in s, in place, and return s.
// is_regex: nonzero when s is regex text. In that case "\\" is not
//   collapsed and an unrecognized escape keeps its backslash; otherwise an
//   unrecognized escape loses the backslash.
// Handled: \a \b \f \n \r \t \v \" \/ (and \\ when !is_regex), \ddd with up
// to three digits, and \x followed by one or two hex digits. A backslash at
// the very end of the string is kept.
// NOTE: the digit test accepts '8' and '9' as well as octal digits.
static char *escape_str(char *s, int is_regex)
{
  char *p, *escapes = is_regex ? "abfnrtv\"/" : "\\abfnrtv\"/";
  // FIXME TODO should / be in there?
  char *s0 = s, *to = s;
  // Each pass copies *s to *to first; the branches then either accept that
  // byte or overwrite it with the decoded character.
  while ((*to = *s)) {
    if (*s != '\\') { to++, s++;
    } else if ((p = strchr(escapes, *++s))) {
      // checking char after \ for known escapes
      int c = (is_regex?"\a\b\f\n\r\t\v\"/":"\\\a\b\f\n\r\t\v\"/")[p-escapes];
      if (c) *to = c, s++;  // else final backslash
      to++;
    } else if ('0' <= *s && *s <= '9') {
      int k, c = *s++ - '0';
      for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++)
        c = c * 8 + *s++ - '0';
      *to++ = c;
    } else if (*s == 'x') {
      // The <ctype.h> functions need an unsigned char value; a plain char
      // may be negative.
      if (isxdigit((unsigned char)s[1])) {
        int c = hexval(*++s);
        if (isxdigit((unsigned char)s[1])) c = c * 16 + hexval(*++s);
        *to++ = c, s++;
      }
      // With no hex digit after \x nothing is consumed here: the next pass
      // copies the 'x' over the backslash.
    } else {
      if (is_regex) *to++ = '\\';
      *to++ = *s++;
    }
  }
  return s0;
}
2556
2557static void force_maybemap_to_scalar(struct zvalue *v)
2558{
2559  if (!(v->flags & ZF_ANYMAP)) return;
2560  if (v->flags & ZF_MAP || v->map->count)
2561    FATAL("array in scalar context");
2562  v->flags = 0;
2563  v->map = 0; // v->flags = v->map = 0 gets warning
2564}
2565
2566static void force_maybemap_to_map(struct zvalue *v)
2567{
2568  if (v->flags & ZF_MAYBEMAP) v->flags = ZF_MAP;
2569}
2570
2571// fmt_offs is either CONVFMT or OFMT (offset in stack to zvalue)
2572static struct zvalue *to_str_fmt(struct zvalue *v, int fmt_offs)
2573{
2574  force_maybemap_to_scalar(v);
2575  // TODO: consider handling numstring differently
2576  if (v->flags & ZF_NUMSTR) v->flags = ZF_STR;
2577  if (IS_STR(v)) return v;
2578  else if (!v->flags) { // uninitialized
2579    v->vst = new_zstring("", 0);
2580  } else if (IS_NUM(v)) {
2581    zvalue_release_zstring(v);
2582    if (!IS_STR(&STACK[fmt_offs])) {
2583      zstring_release(&STACK[fmt_offs].vst);
2584      STACK[fmt_offs].vst = num_to_zstring(STACK[fmt_offs].num, "%.6g");
2585      STACK[fmt_offs].flags = ZF_STR;
2586    }
2587    v->vst = num_to_zstring(v->num, STACK[fmt_offs].vst->str);
2588  } else {
2589    FATAL("Wrong or unknown type in to_str_fmt\n");
2590  }
2591  v->flags = ZF_STR;
2592  return v;
2593}
2594
2595static struct zvalue *to_str(struct zvalue *v)
2596{
2597  return to_str_fmt(v, CONVFMT);
2598}
2599
2600// TODO FIXME Is this needed? (YES -- investigate) Just use to_str()?
2601#define ENSURE_STR(v) (IS_STR(v) ? (v) : to_str(v))
2602
2603static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat)
2604{
2605  if (IS_RX(pat)) *rx = pat->rx;
2606  else {
2607    zvalue_dup_zstring(to_str(pat));
2608    escape_str(pat->vst->str, 1);
2609    xregcomp(*rx, pat->vst->str, REG_EXTENDED);
2610  }
2611}
2612
2613static void rx_zvalue_free(regex_t *rx, struct zvalue *pat)
2614{
2615  if (!IS_RX(pat) || rx != pat->rx) regfree(rx);
2616}
2617
2618// Used by the match/not match ops (~ !~) and implicit $0 match (/regex/)
2619static int match(struct zvalue *zvsubject, struct zvalue *zvpat)
2620{
2621  int r;
2622  regex_t rx, *rxp = &rx;
2623  rx_zvalue_compile(&rxp, zvpat);
2624  if ((r = regexec(rxp, to_str(zvsubject)->vst->str, 0, 0, 0)) != 0) {
2625    if (r != REG_NOMATCH) {
2626      char errbuf[256];
2627      regerror(r, &rx, errbuf, sizeof(errbuf));
2628      // FIXME TODO better diagnostic here
2629      error_exit("regex match error %d: %s", r, errbuf);
2630    }
2631    rx_zvalue_free(rxp, zvpat);
2632    return 1;
2633  }
2634  rx_zvalue_free(rxp, zvpat);
2635  return 0;
2636}
2637
// Search s for rx. On a match, store the match offsets in *start and *end
// and return 0. Returns REG_NOMATCH when there is no match; any other
// regexec() failure is fatal.
static int rx_find(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  regmatch_t m;
  int rc = regexec(rx, s, 1, &m, eflags);

  if (rc == REG_NOMATCH) return rc;
  if (rc) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *start = m.rm_so;
  *end = m.rm_eo;
  return 0;
}
2648
// Differs from rx_find() in that FS cannot match null (empty) string.
// See https://www.austingroupbugs.net/view.php?id=1468.
// Returns 0 with *start and *end set (as offsets from s) for a non-empty
// match, or REG_NOMATCH if there is none.
static int rx_find_FS(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  int rc = rx_find(rx, s, start, end, eflags);
  char *p;

  if (rc) return rc;                  // not found
  if (*start != *end) return 0;       // found non-empty match
  // Found empty match, retry starting past the match
  p = s + *end;
  if (!*p) return REG_NOMATCH;  // End of string, no non-empty match found
  // Empty match not at EOS, move ahead one char at a time and try again
  for (;;) {
    if (rc || *start != *end) break;
    if (!*++p) break;
    rc = rx_find(rx, p, start, end, eflags);
  }
  if (rc || !*p) return REG_NOMATCH;  // no non-empty match found
  *start += p - s;  // offsets from original string
  *end += p - s;
  return 0;
}
2666
2667////////////////////
2668////   fields
2669////////////////////
2670
2671#define FIELDS_MAX  102400 // Was 1024; need more for toybox awk test
2672#define THIS_MEANS_SET_NF 999999999
2673
2674static int get_int_val(struct zvalue *v)
2675{
2676  if (IS_NUM(v)) return (int)v->num;
2677  if (IS_STR(v) && v->vst) return (int)atof(v->vst->str);
2678  return 0;
2679}
2680
2681// A single-char FS is never a regex, so make it a [<char>] regex to
2682// match only that one char in case FS is a regex metachar.
2683// If regex FS is needed, must use > 1 char. If a '.' regex
2684// is needed, use e.g. '.|.' (unlikely case).
2685static char *fmt_one_char_fs(char *fs)
2686{
2687  if (strlen(fs) != 1) return fs;
2688  snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "[%c]", fs[0]);
2689  return TT.one_char_fs;
2690}
2691
2692static regex_t *rx_fs_prep(char *fs)
2693{
2694  if (!strcmp(fs, " ")) return &TT.rx_default;
2695  if (!strcmp(fs, TT.fs_last)) return &TT.rx_last;
2696  if (strlen(fs) >= FS_MAX) FATAL("FS too long");
2697  strcpy(TT.fs_last, fs);
2698  regfree(&TT.rx_last);
2699  xregcomp(&TT.rx_last, fmt_one_char_fs(fs), REG_EXTENDED);
2700  return &TT.rx_last;
2701}
2702
2703// Only for use by split() builtin
2704static void set_map_element(struct zmap *m, int k, char *val, size_t len)
2705{
2706  // Do not need format here b/c k is integer, uses "%lld" format.
2707  struct zstring *key = num_to_zstring(k, "");// "" vs 0 format avoids warning
2708  struct zmap_slot *zs = zmap_find_or_insert_key(m, key);
2709  zstring_release(&key);
2710  zs->val.vst = zstring_update(zs->val.vst, 0, val, len);
2711  zs->val.flags = ZF_STR;
2712  check_numeric_string(&zs->val);
2713}
2714
2715static void set_zvalue_str(struct zvalue *v, char *s, size_t size)
2716{
2717  v->vst = zstring_update(v->vst, 0, s, size);
2718  v->flags = ZF_STR;
2719}
2720
2721// All changes to NF go through here!
2722static void set_nf(int nf)
2723{
2724  STACK[NF].num = TT.nf_internal = nf;
2725  STACK[NF].flags = ZF_NUM;
2726}
2727
2728static void set_field(struct zmap *unused, int fnum, char *s, size_t size)
2729{ (void)unused;
2730  if (fnum < 0 || fnum > FIELDS_MAX) FFATAL("bad field num %d\n", fnum);
2731  int nfields = zlist_len(&TT.fields);
2732  // Need nfields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2733  while (nfields <= fnum)
2734    nfields = zlist_append(&TT.fields, &uninit_zvalue) + 1;
2735  set_zvalue_str(&FIELD[fnum], s, size);
2736  set_nf(fnum);
2737  check_numeric_string(&FIELD[fnum]);
2738}
2739
2740// Split s via fs, using setter; return number of TT.fields.
2741// This is used to split TT.fields and also for split() builtin.
2742static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct zmap *m, char *s, struct zvalue *zvfs)
2743{
2744  regex_t *rx;
2745  regoff_t offs, end;
2746  if (!IS_RX(zvfs)) to_str(zvfs);
2747  char *fs = IS_STR(zvfs) ? zvfs->vst->str : "";
2748  int nf = 0, r = 0, eflag = 0;
2749  // Empty string or empty fs (regex).
2750  // Need to include !*s b/c empty string, otherwise
2751  // split("", a, "x") splits to a 1-element (empty element) array
2752  if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) {
2753    for ( ; *s; s++) setter(m, ++nf, s, 1);
2754    return nf;
2755  }
2756  if (IS_RX(zvfs)) rx = zvfs->rx;
2757  else rx = rx_fs_prep(fs);
2758  while (*s) {
2759    // Find the next occurrence of FS.
2760    // rx_find_FS() returns 0 if found. If nonzero, the field will
2761    // be the rest of the record (all of it if first time through).
2762    if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s);
2763    else {
2764      int k = strcspn(s, "\n");
2765      if (k < offs) offs = k, end = k + 1;
2766    }
2767    eflag |= REG_NOTBOL;
2768
2769    // Field will be s up to (not including) the offset. If offset
2770    // is zero and FS is found and FS is ' ' (TT.rx_default "[ \t]+"),
2771    // then the find is the leading or trailing spaces and/or tabs.
2772    // If so, skip this (empty) field, otherwise set field, length is offs.
2773    if (offs || r || rx != &TT.rx_default) setter(m, ++nf, s, offs);
2774    s += end;
2775  }
2776  if (!r && rx != &TT.rx_default) setter(m, ++nf, "", 0);
2777  return nf;
2778}
2779
2780static void build_fields(void)
2781{
2782  char *rec = FIELD[0].vst->str;
2783  // TODO test this -- why did I not want to split empty $0?
2784  // Maybe don't split empty $0 b/c non-default FS gets NF==1 with splitter()?
2785  set_nf(*rec ? splitter(set_field, 0, rec, to_str(&STACK[FS])) : 0);
2786}
2787
2788static void rebuild_field0(void)
2789{
2790  struct zstring *s = FIELD[0].vst;
2791  int nf = TT.nf_internal;
2792  // uninit value needed for eventual reference to .vst in zstring_release()
2793  struct zvalue tempv = uninit_zvalue;
2794  zvalue_copy(&tempv, to_str(&STACK[OFS]));
2795  for (int i = 1; i <= nf; i++) {
2796    if (i > 1) {
2797      s = s ? zstring_extend(s, tempv.vst) : zstring_copy(s, tempv.vst);
2798    }
2799    if (FIELD[i].flags) to_str(&FIELD[i]);
2800    if (FIELD[i].vst) {
2801      if (i > 1) s = zstring_extend(s, FIELD[i].vst);
2802      else s = zstring_copy(s, FIELD[i].vst);
2803    }
2804  }
2805  FIELD[0].vst = s;
2806  FIELD[0].flags |= ZF_STR;
2807  zvalue_release_zstring(&tempv);
2808}
2809
2810// get field ref (lvalue ref) in prep for assignment to field.
2811// [... assigning to a nonexistent field (for example, $(NF+2)=5) shall
2812// increase the value of NF; create any intervening TT.fields with the
2813// uninitialized value; and cause the value of $0 to be recomputed, with the
2814// TT.fields being separated by the value of OFS.]
2815// Called by setup_lvalue()
2816static struct zvalue *get_field_ref(int fnum)
2817{
2818  if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2819  if (fnum > TT.nf_internal) {
2820    // Ensure TT.fields list is large enough for fnum
2821    // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2822    for (int i = TT.nf_internal + 1; i <= fnum; i++) {
2823      if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2824      zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2825    }
2826    set_nf(fnum);
2827  }
2828  return &FIELD[fnum];
2829}
2830
2831// Called by tksplit op
2832static int split(struct zstring *s, struct zvalue *a, struct zvalue *fs)
2833{
2834  return splitter(set_map_element, a->map, s->str, fs);
2835}
2836
2837// Called by getrec_f0_f() and getrec_f0()
2838static void copy_to_field0(char *buf, size_t k)
2839{
2840  set_zvalue_str(&FIELD[0], buf, k);
2841  check_numeric_string(&FIELD[0]);
2842  build_fields();
2843}
2844
2845// After changing $0, must rebuild TT.fields & reset NF
2846// Changing other field must rebuild $0
2847// Called by gsub() and assignment ops.
2848static void fixup_fields(int fnum)
2849{
2850  if (fnum == THIS_MEANS_SET_NF) {  // NF was assigned to
2851    int new_nf = get_int_val(&STACK[NF]);
2852    // Ensure TT.fields list is large enough for fnum
2853    // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2854    for (int i = TT.nf_internal + 1; i <= new_nf; i++) {
2855      if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2856      zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2857    }
2858    set_nf(TT.nf_internal = STACK[NF].num);
2859    rebuild_field0();
2860    return;
2861  }
2862  // fnum is # of field that was just updated.
2863  // If it's 0, need to rebuild the TT.fields 1... n.
2864  // If it's non-0, need to rebuild field 0.
2865  to_str(&FIELD[fnum]);
2866  if (fnum) check_numeric_string(&FIELD[fnum]);
2867  if (fnum) rebuild_field0();
2868  else build_fields();
2869}
2870
2871// Fetching non-existent field gets uninit string value; no change to NF!
2872// Called by tkfield op       // TODO inline it?
2873static void push_field(int fnum)
2874{
2875  if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2876  // Contrary to posix, awk evaluates TT.fields beyond $NF as empty strings.
2877  if (fnum > TT.nf_internal) push_val(&uninit_string_zvalue);
2878  else push_val(&FIELD[fnum]);
2879}
2880
2881////////////////////
2882////   END fields
2883////////////////////
2884
// STKP expands to TT.stackp, so it can be read, assigned, and indexed
// (STKP[0] is the top value, STKP[-1] the one below it).
#define STKP    TT.stackp   // pointer to top of stack
2886
2887static double seedrand(double seed)
2888{
2889  static double prev_seed;
2890  double r = prev_seed;
2891  srandom(trunc(prev_seed = seed));
2892  return r;
2893}
2894
2895static int popnumval(void)
2896{
2897  return STKP-- -> num;
2898}
2899
2900static void drop(void)
2901{
2902  if (!(STKP->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&STKP->vst);
2903  STKP--;
2904}
2905
// Discard the top n stack values, one drop() per value.
static void drop_n(int n)
{
  for (; n; n--) drop();
}
2910
2911static void swap(void)
2912{
2913  struct zvalue tmp = STKP[-1];
2914  STKP[-1] = STKP[0];
2915  STKP[0] = tmp;
2916}
2917
2918// Set and return logical (0/1) val of top TT.stack value; flag value as NUM.
2919static int get_set_logical(void)
2920{
2921  struct zvalue *v = STKP;
2922  force_maybemap_to_scalar(v);
2923  int r = 0;
2924  if (IS_NUM(v)) r = !! v->num;
2925  else if (IS_STR(v)) r = (v->vst && v->vst->str[0]);
2926  zvalue_release_zstring(v);
2927  v->num = r;
2928  v->flags = ZF_NUM;
2929  return r;
2930}
2931
2932
2933static double to_num(struct zvalue *v)
2934{
2935  force_maybemap_to_scalar(v);
2936  if (v->flags & ZF_NUMSTR) zvalue_release_zstring(v);
2937  else if (!IS_NUM(v)) {
2938    v->num = 0.0;
2939    if (IS_STR(v) && v->vst) v->num = atof(v->vst->str);
2940    zvalue_release_zstring(v);
2941  }
2942  v->flags = ZF_NUM;
2943  return v->num;
2944}
2945
2946static void set_num(struct zvalue *v, double n)
2947{
2948  zstring_release(&v->vst);
2949  v->num = n;
2950  v->flags = ZF_NUM;
2951}
2952
2953static void incr_zvalue(struct zvalue *v)
2954{
2955  v->num = trunc(to_num(v)) + 1;
2956}
2957
2958static void push_int_val(ptrdiff_t n)
2959{
2960  struct zvalue v = ZVINIT(ZF_NUM, n, 0);
2961  push_val(&v);
2962}
2963
2964static struct zvalue *get_map_val(struct zvalue *v, struct zvalue *key)
2965{
2966  struct zmap_slot *x = zmap_find_or_insert_key(v->map, to_str(key)->vst);
2967  return &x->val;
2968}
2969
2970static struct zvalue *setup_lvalue(int ref_stack_ptr, int parmbase, int *field_num)
2971{
2972  // ref_stack_ptr is number of slots down in stack the ref is
2973  // for +=, *=, etc
2974  // Stack is: ... scalar_ref value_to_op_by
2975  // or ... subscript_val map_ref value_to_op_by
2976  // or ... fieldref value_to_op_by
2977  // for =, ++, --
2978  // Stack is: ... scalar_ref
2979  // or ... subscript_val map_ref
2980  // or ... fieldnum fieldref
2981  int k;
2982  struct zvalue *ref, *v = 0; // init v to mute "may be uninit" warning
2983  *field_num = -1;
2984  ref = STKP - ref_stack_ptr;
2985  if (ref->flags & ZF_FIELDREF) return get_field_ref(*field_num = ref->num);
2986  k = ref->num >= 0 ? ref->num : parmbase - ref->num;
2987  if (k == NF) *field_num = THIS_MEANS_SET_NF;
2988  v = &STACK[k];
2989  if (ref->flags & ZF_REF) {
2990    force_maybemap_to_scalar(v);
2991  } else if (ref->flags & ZF_MAPREF) {
2992    force_maybemap_to_map(v);
2993    if (!IS_MAP(v)) FATAL("scalar in array context");
2994    v = get_map_val(v, STKP - ref_stack_ptr - 1);
2995    swap();
2996    drop();
2997  } else FATAL("assignment to bad lvalue");
2998  return v; // order FATAL() and return to mute warning
2999}
3000
3001
3002static struct zfile *new_file(char *fn, FILE *fp, char mode, char file_or_pipe)
3003{
3004  struct zfile *f = xzalloc(sizeof(struct zfile));
3005  *f = (struct zfile){TT.zfiles, xstrdup(fn), fp, mode, file_or_pipe,
3006                        0, 0, 0, 0, 0, 0, 0, 0, 0};
3007  return TT.zfiles = f;
3008}
3009
3010static int fflush_all(void)
3011{
3012  int ret = 0;
3013  for (struct zfile *p = TT.zfiles; p; p = p->next)
3014    if (fflush(p->fp)) ret = -1;
3015  return ret;
3016}
3017
3018static int fflush_file(int nargs)
3019{
3020  if (!nargs) return fflush_all();
3021
3022  to_str(STKP);   // filename at top of TT.stack
3023  // Null string means flush all
3024  if (!STKP[0].vst->str[0]) return fflush_all();
3025
3026  // is it open in file table?
3027  for (struct zfile *p = TT.zfiles; p; p = p->next)
3028    if (!strcmp(STKP[0].vst->str, p->fn))
3029      if (!fflush(p->fp)) return 0;
3030  return -1;    // error, or file not found in table
3031}
3032static int close_file(char *fn)
3033{
3034  // !fn (null ptr) means close all (exc. stdin/stdout/stderr)
3035  int r = 0;
3036  struct zfile *np, **pp = &TT.zfiles;
3037  for (struct zfile *p = TT.zfiles; p; p = np) {
3038    np = p->next;   // save in case unlinking file (invalidates p->next)
3039    // Don't close std files -- wrecks print/printf (can be fixed though TODO)
3040    if ((!p->is_std_file) && (!fn || !strcmp(fn, p->fn))) {
3041      xfree(p->recbuf);
3042      xfree(p->recbuf_multi);
3043      xfree(p->recbuf_multx);
3044      xfree(p->fn);
3045      r = (p->fp) ? (p->file_or_pipe ? fclose : pclose)(p->fp) : -1;
3046      *pp = p->next;
3047      xfree(p);
3048      if (fn) return r;
3049    } else pp = &p->next; // only if not unlinking zfile
3050  }
3051  return -1;  // file not in table, or closed all files
3052}
3053
// Placeholder entry that setup_file() returns when a file or pipe cannot be
// opened for reading. It is a static object, so all members (including fp)
// start out zero.
static struct zfile badfile_obj, *badfile = &badfile_obj;
3055
3056// FIXME TODO check if file/pipe/mode matches what's in the table already.
3057// Apparently gawk/mawk/nawk are OK with different mode, but just use the file
3058// in whatever mode it's already in; i.e. > after >> still appends.
3059static struct zfile *setup_file(char file_or_pipe, char *mode)
3060{
3061  to_str(STKP);   // filename at top of TT.stack
3062  char *fn = STKP[0].vst->str;
3063  // is it already open in file table?
3064  for (struct zfile *p = TT.zfiles; p; p = p->next)
3065    if (!strcmp(fn, p->fn)) {
3066      drop();
3067      return p;   // open; return it
3068    }
3069  FILE *fp = (file_or_pipe ? fopen : popen)(fn, mode);
3070  if (fp) {
3071    struct zfile *p = new_file(fn, fp, *mode, file_or_pipe);
3072    drop();
3073    return p;
3074  }
3075  if (*mode != 'r') FFATAL("cannot open '%s'\n", fn);
3076  drop();
3077  return badfile;
3078}
3079
// TODO FIXME should be a function?
// stkn(n): index, counted from the base of TT.stack, of the value that lies
// n slots below the stack top; stkn(0) is the index of the top value itself.
#define stkn(n) ((int)(TT.stackp - (n) - (struct zvalue *)TT.stack.base))
3082
3083static int getcnt(int k)
3084{
3085  if (k >= stkn(0)) FATAL("too few args for printf\n");
3086  return (int)to_num(&STACK[k]);
3087}
3088
3089static int fsprintf(FILE *ignored, const char *fmt, ...)
3090{
3091  (void)ignored;
3092  va_list args, args2;
3093  va_start(args, fmt);
3094  va_copy(args2, args);
3095  int len = vsnprintf(0, 0, fmt, args); // size needed
3096  va_end(args);
3097  if (len < 0) FATAL("Bad sprintf format");
3098  // Unfortunately we have to mess with zstring internals here.
3099  if (TT.rgl.zspr->size + len + 1 > TT.rgl.zspr->capacity) {
3100      // This should always work b/c capacity > size
3101      unsigned cap = 2 * TT.rgl.zspr->capacity + len;
3102      TT.rgl.zspr = xrealloc(TT.rgl.zspr, sizeof(*TT.rgl.zspr) + cap);
3103      TT.rgl.zspr->capacity = cap;
3104    }
3105  vsnprintf(TT.rgl.zspr->str + TT.rgl.zspr->size, len+1, fmt, args2);
3106  TT.rgl.zspr->size += len;
3107  TT.rgl.zspr->str[TT.rgl.zspr->size] = 0;
3108  va_end(args2);
3109  return 0;
3110}
3111
// Engine shared by printf and sprintf-style formatting.
//   fpvar : fprintf-compatible output routine (the tkprintf op passes
//           fprintf; fsprintf has the same shape)
//   outfp : passed through to fpvar as its first argument
//   nargs : number of values on the stack for this call; the format string
//           is the deepest of them (STKP-nargs+1), its arguments follow
// The format is walked one conversion at a time. Literal text is emitted via
// "%s"; each conversion spec is temporarily NUL-terminated in place (the
// overwritten byte is kept in holdc and put back afterwards), validated
// against TT.rx_printf_fmt, and then handed to fpvar with the right number
// and C type of arguments. The stack is not popped here.
static void varprint(int(*fpvar)(FILE *, const char *, ...), FILE *outfp, int nargs)
{
  int k, nn, nnc, fmtc, holdc, cnt1 = 0, cnt2 = 0;
  char *s = 0;  // to shut up spurious warning
  regoff_t offs = -1, e = -1;
  char *pfmt, *fmt = to_str(STKP-nargs+1)->vst->str;
  k = stkn(nargs - 2);  // stack index of the first argument after the format
  while (*fmt) {
    double n = 0;
    // Emit the literal run up to the next '%' (if any).
    nn = strcspn(fmt, "%");
    if (nn) {
      holdc = fmt[nn];
      fmt[nn] = 0;
      fpvar(outfp, "%s", fmt);
      fmt[nn] = holdc;
    }
    fmt += nn;
    if (!*(pfmt = fmt)) break;
    // fmt is at '%'. nnc counts the characters between '%' and the
    // conversion character fmtc; running off the end is an error.
    nnc = strcspn(fmt+1, "aAdiouxXfFeEgGcs%");
    fmtc = fmt[nnc+1];
    if (!fmtc) FFATAL("bad printf format '%s'", fmt);
    // Cut the string right after the conversion character so fmt holds
    // exactly one spec while it is checked and used.
    holdc = fmt[nnc+2];
    fmt[nnc+2] = 0;
    if (rx_find(&TT.rx_printf_fmt, fmt, &offs, &e, 0))
      FFATAL("bad printf format <%s>\n", fmt);
    // One argument for the conversion itself plus one per '*'; "%%" needs
    // none for the conversion.
    int nargsneeded = 1;
    for (char *p = strchr(fmt, '*'); p; p = strchr(p+1, '*'))
      nargsneeded++;
    nargsneeded -= fmtc == '%';

    switch (nargsneeded) {
      case 0:
        fpvar(outfp, fmt);
        break;
      case 3:
        cnt1 = getcnt(k++);
        ATTR_FALLTHROUGH_INTENDED;
      case 2:
        cnt2 = getcnt(k++);
        ATTR_FALLTHROUGH_INTENDED;
      case 1:
        if (k > stkn(0)) FATAL("too few args for printf\n");
        if (fmtc == 's') {
          s = to_str(&STACK[k++])->vst->str;
        } else if (fmtc == 'c' && !IS_NUM(&STACK[k])) {
          // %c with a non-numeric argument: use the first character of its
          // string, decoded by utf8towc(); U+FFFD if decoding fails.
          unsigned wch;
          struct zvalue *z = &STACK[k++];
          if (z->vst && z->vst->str[0])
            n = utf8towc(&wch, z->vst->str, z->vst->size) < 1 ? 0xfffd : wch;
        } else {
          n = to_num(&STACK[k++]);
        }
        if (strchr("cdiouxX", fmtc)) {
          // These conversions get long / wint_t arguments below, so work on
          // a copy of the spec in TT.pbuf with an 'l' length modifier
          // inserted before the conversion character unless one is present.
          pfmt = strcpy(TT.pbuf, fmt);
          if (pfmt[nnc] != 'l') {
            strcpy(pfmt+nnc+1, "l_");
            pfmt[nnc+2] = fmtc;
          }
        }
        if (fmtc == 'c' && n > 0x10ffff) n = 0xfffd;  // musl won't take larger "wchar"
        // Same dispatch three times over, differing only in how many '*'
        // counts precede the value.
        switch (nargsneeded) {
          case 1:
            if (fmtc == 's') fpvar(outfp, pfmt, s);
            else if (fmtc == 'c') fpvar(outfp, pfmt, (wint_t)n);
            else if (strchr("di", fmtc)) fpvar(outfp, pfmt, (long)n);
            else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, (unsigned long)n);
            else fpvar(outfp, pfmt, n);
            break;
          case 2:
            if (fmtc == 's') fpvar(outfp, pfmt, cnt2, s);
            else if (fmtc == 'c') fpvar(outfp, pfmt, cnt2, (wint_t)n);
            else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt2, (long)n);
            else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt2, (unsigned long)n);
            else fpvar(outfp, pfmt, cnt2, n);
            break;
          case 3:
            if (fmtc == 's') fpvar(outfp, pfmt, cnt1, cnt2, s);
            else if (fmtc == 'c') fpvar(outfp, pfmt, cnt1, cnt2, (wint_t)n);
            else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (long)n);
            else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (unsigned long)n);
            else fpvar(outfp, pfmt, cnt1, cnt2, n);
            break;
        }
        break;
      default:
        FATAL("bad printf format\n");
    }
    // Step past this spec and restore the byte that was overwritten.
    fmt += nnc + 2;
    *fmt = holdc;
  }
}
3203
// Return 1 if v is a valid variable name: a non-empty string whose first
// character is an ASCII letter or underscore and whose remaining characters
// are ASCII letters, digits, or underscores. Return 0 otherwise.
static int is_ok_varname(char *v)
{
  char *name_chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
  char *start_chars = name_chars + 10;  // same set without the ten digits

  // The first character may not be a digit; this also rejects "".
  if (!*v || !strchr(start_chars, *v)) return 0;
  for (char *p = v + 1; *p; p++)
    if (!strchr(name_chars, *p)) return 0;
  return 1;
}
3212
3213// FIXME TODO return value never used. What if assign to var not in globals?
3214static int assign_global(char *var, char *value)
3215{
3216  if (!is_ok_varname(var)) FFATAL("Invalid variable name '%s'\n", var);
3217  int globals_ent = find_global(var);
3218  if (globals_ent) {
3219    struct zvalue *v = &STACK[globals_ent];
3220    if (IS_MAP(v)) error_exit("-v assignment to array");  // Maybe not needed?
3221
3222// The compile phase may insert a var in global table with flag of zero.  Then
3223// init_globals() will assign a ZF_MAYBEMAP flag to it. If it is then assigned
3224// via -v option or by assignment_arg() it will here be assigned a string value.
3225// So first, remove all map data to prevent memory leak. BUG FIX // 2024-02-13.
3226    if (v->flags & ZF_ANYMAP) {
3227      zmap_delete_map_incl_slotdata(v->map);
3228      xfree(v->map);
3229      v->map = 0;
3230      v->flags &= ~ZF_ANYMAP;
3231    }
3232
3233    zvalue_release_zstring(v);
3234    value = xstrdup(value);
3235    *v = new_str_val(escape_str(value, 0));
3236    xfree(value);
3237    check_numeric_string(v);
3238    return 1;
3239  }
3240  return 0;
3241}
3242
// Treat arg as a possible "name=value" operand.
// If arg contains '=' and the text before the first '=' passes
// is_ok_varname(), hand name and value to assign_global() and return 1.
// Otherwise return 0, in which case nextfilearg() treats arg as a filename.
// arg is split in place while it is examined and is restored before return.
static int assignment_arg(char *arg)
{
  char *eq = strchr(arg, '=');
  if (!eq) return 0;

  *eq = 0;                  // arg now holds just the name
  int valid = is_ok_varname(arg);
  if (valid) assign_global(arg, eq + 1);
  *eq = '=';                // put the operand back together
  return valid;
}
3262
3263static char *nextfilearg(void)
3264{
3265  char *arg;
3266  do {
3267    if (++TT.rgl.narg >= (int)to_num(&STACK[ARGC])) return 0;
3268    struct zvalue *v = &STACK[ARGV];
3269    struct zvalue zkey = ZVINIT(ZF_STR, 0,
3270        num_to_zstring(TT.rgl.narg, to_str(&STACK[CONVFMT])->vst->str));
3271    arg = "";
3272    if (zmap_find(v->map, zkey.vst)) {
3273      zvalue_copy(&TT.rgl.cur_arg, to_str(get_map_val(v, &zkey)));
3274      arg = TT.rgl.cur_arg.vst->str;
3275    }
3276    zvalue_release_zstring(&zkey);
3277  } while (!*arg || assignment_arg(arg));
3278  TT.rgl.nfiles++;
3279  return arg;
3280}
3281
3282static int next_fp(void)
3283{
3284  char *fn = nextfilearg();
3285  if (TT.cfile->fp && TT.cfile->fp != stdin) fclose(TT.cfile->fp);
3286  if ((!fn && !TT.rgl.nfiles && TT.cfile->fp != stdin) || (fn && !strcmp(fn, "-"))) {
3287    TT.cfile->fp = stdin;
3288    zvalue_release_zstring(&STACK[FILENAME]);
3289    STACK[FILENAME].vst = new_zstring("<stdin>", 7);
3290  } else if (fn) {
3291    if (!(TT.cfile->fp = fopen(fn, "r"))) FFATAL("can't open %s\n", fn);
3292    zvalue_copy(&STACK[FILENAME], &TT.rgl.cur_arg);
3293    set_num(&STACK[FNR], 0);
3294  } else {
3295    TT.rgl.eof = 1;
3296    return 0;
3297  }
3298  return 1;
3299}
3300
3301static ssize_t getrec_multiline(struct zfile *zfp)
3302{
3303  ssize_t k, kk;
3304  do {
3305    k = getdelim(&zfp->recbuf_multi, &zfp->recbufsize_multi, '\n', zfp->fp);
3306  } while (k > 0 && zfp->recbuf_multi[0] == '\n');
3307  TT.rgl.recptr = zfp->recbuf_multi;
3308  if (k < 0) return k;
3309  // k > 0 and recbuf_multi is not only a \n. Prob. ends w/ \n
3310  // but may not at EOF (last line w/o newline)
3311  for (;;) {
3312    kk = getdelim(&zfp->recbuf_multx, &zfp->recbufsize_multx, '\n', zfp->fp);
3313    if (kk < 0 || zfp->recbuf_multx[0] == '\n') break;
3314    // data is in zfp->recbuf_multi[0..k-1]; append to it
3315    if ((size_t)(k + kk + 1) > zfp->recbufsize_multi)
3316      zfp->recbuf_multi =
3317          xrealloc(zfp->recbuf_multi, zfp->recbufsize_multi = k + kk + 1);
3318    memmove(zfp->recbuf_multi + k, zfp->recbuf_multx, kk+1);
3319    k += kk;
3320  }
3321  if (k > 1 && zfp->recbuf_multi[k-1] == '\n') zfp->recbuf_multi[--k] = 0;
3322  TT.rgl.recptr = zfp->recbuf_multi;
3323  return k;
3324}
3325
// Search the len bytes at s for regex rx using regexec0().
// On a match, store the match offsets in *start and *end and return 0.
// Returns REG_NOMATCH when there is no match; any other regexec0() error is
// fatal.
static int rx_findx(regex_t *rx, char *s, long len, regoff_t *start, regoff_t *end, int eflags)
{
  regmatch_t hit;
  int rc = regexec0(rx, s, len, 1, &hit, eflags);

  if (rc == REG_NOMATCH) return rc;
  if (rc) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *end = hit.rm_eo;
  *start = hit.rm_so;
  return 0;
}
3336
3337static ssize_t getrec_f(struct zfile *zfp)
3338{
3339  int r = 0, rs = ENSURE_STR(&STACK[RS])->vst->str[0] & 0xff;
3340  if (!rs) return getrec_multiline(zfp);
3341  regex_t rsrx, *rsrxp = &rsrx;
3342  // TEMP!! FIXME Need to cache and avoid too-frequent rx compiles
3343  rx_zvalue_compile(&rsrxp, &STACK[RS]);
3344  regoff_t so = 0, eo = 0;
3345  long ret = -1;
3346  for ( ;; ) {
3347    if (zfp->recoffs == zfp->endoffs) {
3348#define INIT_RECBUF_LEN     8192
3349#define RS_LENGTH_MARGIN    (INIT_RECBUF_LEN / 8)
3350      if (!zfp->recbuf)
3351        zfp->recbuf = xmalloc((zfp->recbufsize = INIT_RECBUF_LEN) + 1);
3352      zfp->endoffs = fread(zfp->recbuf, 1, zfp->recbufsize, zfp->fp);
3353      zfp->recoffs = 0;
3354      zfp->recbuf[zfp->endoffs] = 0;
3355      if (!zfp->endoffs) break;
3356    }
3357    TT.rgl.recptr = zfp->recbuf + zfp->recoffs;
3358    r = rx_findx(rsrxp, TT.rgl.recptr, zfp->endoffs - zfp->recoffs, &so, &eo, 0);
3359    // if not found, or found "near" end of buffer...
3360    if (r || zfp->recoffs + eo > (int)zfp->recbufsize - RS_LENGTH_MARGIN) {
3361      // if at end of data, and (not found or found at end of data)
3362      if (zfp->endoffs < (int)zfp->recbufsize &&
3363          (r || zfp->recoffs + eo == zfp->endoffs)) {
3364        ret = zfp->endoffs - zfp->recoffs;
3365        zfp->recoffs = zfp->endoffs;
3366        break;
3367      }
3368      if (zfp->recoffs) {
3369        memmove(zfp->recbuf, TT.rgl.recptr, zfp->endoffs - zfp->recoffs);
3370        zfp->endoffs -= zfp->recoffs;
3371        zfp->recoffs = 0;
3372      } else zfp->recbuf =
3373        xrealloc(zfp->recbuf, (zfp->recbufsize = zfp->recbufsize * 3 / 2) + 1);
3374      zfp->endoffs += fread(zfp->recbuf + zfp->endoffs,
3375                      1, zfp->recbufsize - zfp->endoffs, zfp->fp);
3376      zfp->recbuf[zfp->endoffs] = 0;
3377    } else {
3378      // found and not too near end of data
3379      ret = so;
3380      TT.rgl.recptr[so] = 0;
3381      zfp->recoffs += eo;
3382      break;
3383    }
3384  }
3385  regfree(rsrxp);
3386  return ret;
3387}
3388
3389static ssize_t getrec(void)
3390{
3391  ssize_t k;
3392  if (TT.rgl.eof) return -1;
3393  if (!TT.cfile->fp) next_fp();
3394  do {
3395    if ((k = getrec_f(TT.cfile)) >= 0) return k;
3396  } while (next_fp());
3397  return -1;
3398}
3399
3400static ssize_t getrec_f0_f(struct zfile *zfp)
3401{
3402  ssize_t k = getrec_f(zfp);
3403  if (k >= 0) {
3404    copy_to_field0(TT.rgl.recptr, k);
3405  }
3406  return k;
3407}
3408
3409static ssize_t getrec_f0(void)
3410{
3411  ssize_t k = getrec();
3412  if (k >= 0) {
3413    copy_to_field0(TT.rgl.recptr, k);
3414    incr_zvalue(&STACK[NR]);
3415    incr_zvalue(&STACK[FNR]);
3416  }
3417  return k;
3418}
3419
3420// source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
3421// fp is file or pipe (is NULL if file/pipe could not be opened)
3422// FIXME TODO should -1 return be replaced by test at caller?
3423// v is NULL or an lvalue ref
3424static int awk_getline(int source, struct zfile *zfp, struct zvalue *v)
3425{
3426  ssize_t k;
3427  int is_stream = source != tkeof;
3428  if (is_stream && !zfp->fp) return -1;
3429  if (v) {
3430    if ((k = is_stream ? getrec_f(zfp) : getrec()) < 0) return 0;
3431    zstring_release(&v->vst);
3432    v->vst = new_zstring(TT.rgl.recptr, k);
3433    v->flags = ZF_STR;
3434    check_numeric_string(v);    // bug fix 20240514
3435    if (!is_stream) {
3436      incr_zvalue(&STACK[NR]);
3437      incr_zvalue(&STACK[FNR]);
3438    }
3439  } else k = is_stream ? getrec_f0_f(zfp) : getrec_f0();
3440  return k < 0 ? 0 : 1;
3441}
3442
3443// Define GAWK_SUB to get the same behavior with sub()/gsub() replacement text
3444// as with gawk, goawk, and recent bwk awk (nawk) versions. Undefine GAWK_SUB
3445// to get the simpler POSIX behavior, but I think most users will prefer the
3446// gawk behavior. See the gawk (GNU Awk) manual,
3447// sec. 9.1.4.1 // More about '\' and '&' with sub(), gsub(), and gensub()
3448// for details on the differences.
3449//
3450#undef GAWK_SUB
3451#define GAWK_SUB
3452
// sub(ere, repl[, in]) Substitute the string repl in place of the
// first instance of the extended regular expression ERE in string 'in'
// and return the number of substitutions.  An <ampersand> ( '&' )
// appearing in the string repl shall be replaced by the string from in
// that matches the ERE. (partial spec... there's more)
//
// Shared implementation of sub() and gsub(); opcode == tksub selects the
// single-substitution behavior. On entry the stack holds: ere, repl, and the
// target reference on top. On exit those three are dropped and the number of
// substitutions is pushed. nargs is unused.
// Works in two passes over the subject: the first only computes an upper
// bound on the result size, the second builds the result string.
static void gsub(int opcode, int nargs, int parmbase)
{ (void)nargs;
  int field_num = -1;
  // compile ensures 3 args
  struct zvalue *v = setup_lvalue(0, parmbase, &field_num);
  struct zvalue *ere = STKP-2;
  struct zvalue *repl = STKP-1;
  regex_t rx, *rxp = &rx;
  rx_zvalue_compile(&rxp, ere);
  to_str(repl);
  to_str(v);

#define SLEN(zvalp) ((zvalp)->vst->size)
  char *p, *rp0 = repl->vst->str, *rp = rp0, *s = v->vst->str;
  int namps = 0, nhits = 0, is_sub = (opcode == tksub), eflags = 0;
  regoff_t so = -1, eo;
  // Count ampersands in repl string; may be overcount due to \& escapes.
  for (rp = rp0; *rp; rp++) namps += *rp == '&';
  p = s;
  regoff_t need = SLEN(v) + 1;  // capacity needed for result string
  // A pass just to determine needed destination (result) string size.
  // Each hit swaps (eo - so) matched bytes for repl with every '&' expanded
  // to the match, hence the (namps - 1) factor.
  while(!rx_find(rxp, p, &so, &eo, eflags)) {
    need += SLEN(repl) + (eo - so) * (namps - 1);
    if (!*p) break;
    p += eo ? eo : 1; // ensure progress if empty hit at start
    if (is_sub) break;
    eflags |= REG_NOTBOL;
  }

  if (so >= 0) {  // at least one hit
    struct zstring *z = xzalloc(sizeof(*z) + need);
    z->capacity = need;

    char *e = z->str; // result destination pointer
    p = s;
    eflags = 0;
    // ep0: end of the previous match (start of text not yet copied);
    // sp/ep: start/end of the current match within the subject.
    char *ep0 = p, *sp, *ep;
    while(!rx_find(rxp, p, &so, &eo, eflags)) {
      sp = p + so;
      ep = p + eo;
      memmove(e, ep0, sp - ep0);  // copy unchanged part
      e += sp - ep0;
      // Skip match if not at start and just after prev match and this is empty
      if (p == s || sp - ep0 || eo - so) {
        nhits++;
        for (rp = rp0; *rp; rp++) { // copy replacement
          if (*rp == '&') {
            memmove(e, sp, eo - so);  //copy match
            e += eo - so;
          } else if (*rp == '\\') {
            // "\&" yields a literal '&'; a backslash before anything other
            // than '&' or '\' is copied as is.
            if (rp[1] == '&') *e++ = *++rp;
            else if (rp[1] != '\\') *e++ = *rp;
            else {
              // Two backslashes: handling depends on GAWK_SUB (see the
              // comment above the GAWK_SUB definition).
#ifdef GAWK_SUB
              if (rp[2] == '\\' && rp[3] == '&') {
                rp += 2;
                *e++ = *rp;
              } else if (rp[2] != '&') *e++ = '\\';
#endif
              *e++ = *++rp;
            }
          } else *e++ = *rp;
        }
      }
      ep0 = ep;
      if (!*p) break;
      p += eo ? eo : 1; // ensure progress if empty hit at start
      if (is_sub) break;
      eflags |= REG_NOTBOL;
    }
    // copy remaining subject string
    memmove(e, ep0, s + SLEN(v) - ep0);
    e += s + SLEN(v) - ep0;
    *e = 0;
    z->size = e - z->str;
    // Replace the target's string with the newly built one.
    zstring_release(&v->vst);
    v->vst = z;
  }
  rx_zvalue_free(rxp, ere);
  if (!IS_RX(STKP-2)) zstring_release(&STKP[-2].vst);
  drop_n(3);
  push_int_val(nhits);
  // If the target was a field or NF, bring $0 / fields / NF back in sync.
  if (field_num >= 0) fixup_fields(field_num);
}
3542
3543static long millinow(void)
3544{
3545  struct timespec ts;
3546  clock_gettime(CLOCK_REALTIME, &ts);
3547  return ts.tv_sec*1000+ts.tv_nsec/1000000;
3548}
3549
3550// Initially set stackp_needmore at MIN_STACK_LEFT before limit.
3551// When stackp > stackp_needmore, then expand and reset stackp_needmore
3552static void add_stack(struct zvalue **stackp_needmore)
3553{
3554  int k = stkn(0);  // stack elements in use
3555  zlist_expand(&TT.stack);
3556  STKP = (struct zvalue *)TT.stack.base + k;
3557  *stackp_needmore = (struct zvalue *)TT.stack.limit - MIN_STACK_LEFT;
3558}
3559
// CLAMP(x, lo, hi): lo if x < lo, else hi if x > hi, else x.
// Function-like macro: x may be evaluated up to three times and lo/hi up to
// twice, so pass only expressions without side effects.
#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
3561
3562// Main loop of interpreter. Run this once for all BEGIN rules (which
3563// have had their instructions chained in compile), all END rules (also
3564// chained in compile), and once for each record of the data file(s).
3565static int interpx(int start, int *status)
3566{
3567  int *ip = &ZCODE[start];
3568  int opcode, op2, k, r, nargs, nsubscrs, range_num, parmbase = 0;
3569  int field_num;
3570  double nleft, nright, d;
3571  double (*mathfunc[])(double) = {cos, sin, exp, log, sqrt, trunc};
3572  struct zvalue *v, vv,
3573        *stackp_needmore = (struct zvalue*)TT.stack.limit - MIN_STACK_LEFT;
3574  while ((opcode = *ip++)) {
3575
3576    switch (opcode) {
3577      case opquit:
3578        return opquit;
3579
3580      case tknot:
3581        (STKP)->num = ! get_set_logical();
3582        break;
3583
3584      case opnotnot:
3585        get_set_logical();
3586        break;
3587
3588      case opnegate:
3589        STKP->num = -to_num(STKP);
3590        break;
3591
3592      case tkpow:         // FALLTHROUGH intentional here
3593      case tkmul:         // FALLTHROUGH intentional here
3594      case tkdiv:         // FALLTHROUGH intentional here
3595      case tkmod:         // FALLTHROUGH intentional here
3596      case tkplus:        // FALLTHROUGH intentional here
3597      case tkminus:
3598        nleft = to_num(STKP-1);
3599        nright = to_num(STKP);
3600        switch (opcode) {
3601          case tkpow: nleft = pow(nleft, nright); break;
3602          case tkmul: nleft *= nright; break;
3603          case tkdiv: nleft /= nright; break;
3604          case tkmod: nleft = fmod(nleft, nright); break;
3605          case tkplus: nleft += nright; break;
3606          case tkminus: nleft -= nright; break;
3607        }
3608        drop();
3609        STKP->num = nleft;
3610        break;
3611
3612      // FIXME REDO REDO ?
3613      case tkcat:
3614        to_str(STKP-1);
3615        to_str(STKP);
3616        STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP[0].vst);
3617        drop();
3618        break;
3619
3620        // Comparisons (with the '<', "<=", "!=", "==", '>', and ">="
3621        // operators) shall be made numerically if both operands are numeric,
3622        // if one is numeric and the other has a string value that is a numeric
3623        // string, or if one is numeric and the other has the uninitialized
3624        // value. Otherwise, operands shall be converted to strings as required
3625        // and a string comparison shall be made as follows:
3626        //
3627        // For the "!=" and "==" operators, the strings should be compared to
3628        // check if they are identical but may be compared using the
3629        // locale-specific collation sequence to check if they collate equally.
3630        //
3631        // For the other operators, the strings shall be compared using the
3632        // locale-specific collation sequence.
3633        //
3634        // The value of the comparison expression shall be 1 if the relation is
3635        // true, or 0 if the relation is false.
3636      case tklt:          // FALLTHROUGH intentional here
3637      case tkle:          // FALLTHROUGH intentional here
3638      case tkne:          // FALLTHROUGH intentional here
3639      case tkeq:          // FALLTHROUGH intentional here
3640      case tkgt:          // FALLTHROUGH intentional here
3641      case tkge:
3642        ; int cmp = 31416;
3643
3644        if (  (IS_NUM(&STKP[-1]) &&
3645              (STKP[0].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[0].flags)) ||
3646              (IS_NUM(&STKP[0]) &&
3647              (STKP[-1].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[-1].flags))) {
3648          switch (opcode) {
3649            case tklt: cmp = STKP[-1].num < STKP[0].num; break;
3650            case tkle: cmp = STKP[-1].num <= STKP[0].num; break;
3651            case tkne: cmp = STKP[-1].num != STKP[0].num; break;
3652            case tkeq: cmp = STKP[-1].num == STKP[0].num; break;
3653            case tkgt: cmp = STKP[-1].num > STKP[0].num; break;
3654            case tkge: cmp = STKP[-1].num >= STKP[0].num; break;
3655          }
3656        } else {
3657          cmp = strcmp(to_str(STKP-1)->vst->str, to_str(STKP)->vst->str);
3658          switch (opcode) {
3659            case tklt: cmp = cmp < 0; break;
3660            case tkle: cmp = cmp <= 0; break;
3661            case tkne: cmp = cmp != 0; break;
3662            case tkeq: cmp = cmp == 0; break;
3663            case tkgt: cmp = cmp > 0; break;
3664            case tkge: cmp = cmp >= 0; break;
3665          }
3666        }
3667        drop();
3668        drop();
3669        push_int_val(cmp);
3670        break;
3671
3672      case opmatchrec:
3673        op2 = *ip++;
3674        int mret = match(&FIELD[0], &LITERAL[op2]);
3675        push_int_val(!mret);
3676        break;
3677
3678      case tkmatchop:
3679      case tknotmatch:
3680        mret = match(STKP-1, STKP); // mret == 0 if match
3681        drop();
3682        drop();
3683        push_int_val(!mret == (opcode == tkmatchop));
3684        break;
3685
3686      case tkpowasgn:     // FALLTHROUGH intentional here
3687      case tkmodasgn:     // FALLTHROUGH intentional here
3688      case tkmulasgn:     // FALLTHROUGH intentional here
3689      case tkdivasgn:     // FALLTHROUGH intentional here
3690      case tkaddasgn:     // FALLTHROUGH intentional here
3691      case tksubasgn:
3692        // Stack is: ... scalar_ref value_to_op_by
3693        // or ... subscript_val map_ref value_to_op_by
3694        // or ... fieldref value_to_op_by
3695        v = setup_lvalue(1, parmbase, &field_num);
3696        to_num(v);
3697        to_num(STKP);
3698        switch (opcode) {
3699          case tkpowasgn:
3700            // TODO
3701            v->num = pow(v->num, STKP->num);
3702            break;
3703          case tkmodasgn:
3704            // TODO
3705            v->num = fmod(v->num, STKP->num);
3706            break;
3707          case tkmulasgn:
3708            v->num *= STKP->num;
3709            break;
3710          case tkdivasgn:
3711            v->num /= STKP->num;
3712            break;
3713          case tkaddasgn:
3714            v->num += STKP->num;
3715            break;
3716          case tksubasgn:
3717            v->num -= STKP->num;
3718            break;
3719        }
3720
3721        drop_n(2);
3722        v->flags = ZF_NUM;
3723        push_val(v);
3724        if (field_num >= 0) fixup_fields(field_num);
3725        break;
3726
3727      case tkasgn:
3728        // Stack is: ... scalar_ref value_to_assign
3729        // or ... subscript_val map_ref value_to_assign
3730        // or ... fieldref value_to_assign
3731        v = setup_lvalue(1, parmbase, &field_num);
3732        force_maybemap_to_scalar(STKP);
3733        zvalue_copy(v, STKP);
3734        swap();
3735        drop();
3736        if (field_num >= 0) fixup_fields(field_num);
3737        break;
3738
3739      case tkincr:        // FALLTHROUGH intentional here
3740      case tkdecr:        // FALLTHROUGH intentional here
3741      case oppreincr:     // FALLTHROUGH intentional here
3742      case oppredecr:
3743        // Stack is: ... scalar_ref
3744        // or ... subscript_val map_ref
3745        // or ... fieldnum fieldref
3746        v = setup_lvalue(0, parmbase, &field_num);
3747        to_num(v);
3748        switch (opcode) {
3749          case tkincr: case tkdecr:
3750            // Must be done in this order because push_val(v) may move v,
3751            // invalidating the pointer.
3752            v->num += (opcode == tkincr) ? 1 : -1;
3753            push_val(v);
3754            // Now reverse the incr/decr on the top TT.stack val.
3755            STKP->num -= (opcode == tkincr) ? 1 : -1;
3756            break;
3757          case oppreincr: case oppredecr:
3758            v->num += (opcode == oppreincr) ? 1 : -1;
3759            push_val(v);
3760            break;
3761        }
3762        swap();
3763        drop();
3764        if (field_num >= 0) fixup_fields(field_num);
3765        break;
3766
3767      case tknumber:      // FALLTHROUGH intentional here
3768      case tkstring:      // FALLTHROUGH intentional here
3769      case tkregex:
3770        push_val(&LITERAL[*ip++]);
3771        break;
3772
3773      case tkprint:
3774      case tkprintf:
3775        nargs = *ip++;
3776        int outmode = *ip++;
3777        struct zfile *outfp = TT.zstdout;
3778        switch (outmode) {
3779          case tkgt: outfp = setup_file(1, "w"); break;     // file
3780          case tkappend: outfp = setup_file(1, "a"); break; // file
3781          case tkpipe: outfp = setup_file(0, "w"); break;   // pipe
3782          default: nargs++; break;
3783        }
3784        nargs--;
3785        if (opcode == tkprintf) {
3786          varprint(fprintf, outfp->fp, nargs);
3787          drop_n(nargs);
3788          break;
3789        }
3790        if (!nargs) {
3791          fprintf(outfp->fp, "%s", to_str(&FIELD[0])->vst->str);
3792        } else {
3793          struct zvalue tempv = uninit_zvalue;
3794          zvalue_copy(&tempv, &STACK[OFS]);
3795          to_str(&tempv);
3796          for (int k = 0; k < nargs; k++) {
3797            if (k) fprintf(outfp->fp, "%s", tempv.vst->str);
3798            int sp = stkn(nargs - 1 - k);
3799            ////// FIXME refcnt -- prob. don't need to copy from TT.stack?
3800            v = &STACK[sp];
3801            to_str_fmt(v, OFMT);
3802            struct zstring *zs = v->vst;
3803            fprintf(outfp->fp, "%s", zs ? zs->str : "");
3804          }
3805          zvalue_release_zstring(&tempv);
3806          drop_n(nargs);
3807        }
3808        fputs(ENSURE_STR(&STACK[ORS])->vst->str, outfp->fp);
3809        break;
3810
3811      case opdrop:
3812        drop();
3813        break;
3814
3815      case opdrop_n:
3816        drop_n(*ip++);
3817        break;
3818
3819        // Stack frame layout relative to parmbase:
3820#define RETURN_VALUE    -4
3821#define RETURN_ADDR     -3
3822#define PREV_PARMBASE   -2
3823#define ARG_CNT         -1
3824#define FUNCTION_NUM    0
3825        // Actual args follow, starting at parmbase + 1
3826      case tkfunction:    // function definition
3827        op2 = *ip++;    // func table num
3828        struct functab_slot *pfdef = &FUNC_DEF[op2];
3829        struct zlist *loctab = &pfdef->function_locals;
3830        int nparms = zlist_len(loctab)-1;
3831
3832        nargs = popnumval();
3833        int newparmbase = stkn(nargs);
3834        STACK[newparmbase + PREV_PARMBASE].num = parmbase;
3835        parmbase = newparmbase;
3836        for ( ;nargs > nparms; nargs--)
3837          drop();
3838        for ( ;nargs < nparms; nargs++) {
3839          // Push additional "args" that were not passed by the caller, to
3840          // match the formal parameters (parms) defined in the function
3841          // definition. In the local var table we may have the type as scalar
3842          // or map if it is used as such within the function. In that case we
3843          // init the pushed arg from the type of the locals table.
3844          // But if a var appears only as a bare arg in a function call it will
3845          // not be typed in the locals table. In that case we can only say it
3846          // "may be" a map, but we have to assume the possibility and attach a
3847          // map to the var. When/if the var is used as a map or scalar in the
3848          // called function it will be converted to a map or scalar as
3849          // required.
3850          // See force_maybemap_to_scalar().
3851          struct symtab_slot *q = &((struct symtab_slot *)loctab->base)[nargs+1];
3852          vv = (struct zvalue)ZVINIT(q->flags, 0, 0);
3853          if (vv.flags == 0) {
3854            zvalue_map_init(&vv);
3855            vv.flags = ZF_MAYBEMAP;
3856          } else if (IS_MAP(&vv)) {
3857            zvalue_map_init(&vv);
3858          } else {
3859            vv.flags = 0;
3860          }
3861          push_val(&vv);
3862        }
3863        break;
3864
3865      case tkreturn:
3866        nparms = *ip++;
3867        nargs = STACK[parmbase+ARG_CNT].num;
3868        force_maybemap_to_scalar(STKP); // Unneeded?
3869        zvalue_copy(&STACK[parmbase+RETURN_VALUE], STKP);
3870        drop();
3871        // Remove the local args (not supplied by caller) from TT.stack, check to
3872        // release any map data created.
3873        while (stkn(0) > parmbase + nargs) {
3874          if ((STKP)->flags & ZF_ANYMAP) {
3875            zmap_delete_map_incl_slotdata((STKP)->map);
3876            xfree((STKP)->map);
3877          }
3878          drop();
3879        }
3880        while (stkn(0) > parmbase + RETURN_VALUE)
3881          drop();
3882        ip = &ZCODE[(int)STACK[parmbase+RETURN_ADDR].num];
3883        parmbase = STACK[parmbase+PREV_PARMBASE].num;
3884        break;
3885
3886      case opprepcall:    // function call prep
3887        if (STKP > stackp_needmore) add_stack(&stackp_needmore);
3888        push_int_val(0);      // return value placeholder
3889        push_int_val(0);      // return addr
3890        push_int_val(0);      // parmbase
3891        push_int_val(0);      // arg count
3892        push_int_val(*ip++);  // function tbl ref
3893        break;
3894
3895      case tkfunc:        // function call
3896        nargs = *ip++;
3897        newparmbase = stkn(nargs);
3898        STACK[newparmbase+RETURN_ADDR].num = ip - &ZCODE[0];
3899        STACK[newparmbase+ARG_CNT].num = nargs;
3900        push_int_val(nargs);      // FIXME TODO pass this in a zregister?
3901        ip = &ZCODE[FUNC_DEF[(int)STACK[newparmbase+FUNCTION_NUM].num].zcode_addr];
3902        break;
3903
3904      case tkrbracket:    // concat multiple map subscripts
3905        nsubscrs = *ip++;
3906        while (--nsubscrs) {
3907          swap();
3908          to_str(STKP);
3909          push_val(&STACK[SUBSEP]);
3910          to_str(STKP);
3911          STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3912          drop();
3913          swap();
3914          to_str(STKP);
3915          STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3916          drop();
3917        }
3918        break;
3919
3920      case opmapdelete:
3921      case tkdelete:
3922        k = STKP->num;
3923        if (k < 0) k = parmbase - k;    // loc of var on TT.stack
3924        v = &STACK[k];
3925        force_maybemap_to_map(v);
3926        if (opcode == opmapdelete) {
3927          zmap_delete_map(v->map);
3928        } else {
3929          drop();
3930          zmap_delete(v->map, to_str(STKP)->vst);
3931        }
3932        drop();
3933        break;
3934
3935      case opmap:
3936        op2 = *ip++;
3937        k = op2 < 0 ? parmbase - op2 : op2;
3938        v = &STACK[k];
3939        force_maybemap_to_map(v);
3940        if (!IS_MAP(v)) FATAL("scalar in array context");
3941        v = get_map_val(v, STKP);
3942        drop();     // drop subscript
3943        push_val(v);
3944        break;
3945
3946      case tkin:
3947        if (!(STKP->flags & ZF_ANYMAP)) FATAL("scalar in array context");
3948        v = zmap_find(STKP->map, to_str(STKP-1)->vst);
3949        drop();
3950        drop();
3951        push_int_val(v ? 1 : 0);
3952        break;
3953
3954      case opmapiternext:
3955        op2 = *ip++;
3956        v = STKP-1;
3957        force_maybemap_to_map(v);
3958        if (!IS_MAP(v)) FATAL("scalar in array context");
3959        struct zmap *m = v->map;   // Need for MAPSLOT macro
3960        int zlen = zlist_len(&m->slot);
3961        int kk = STKP->num + 1;
3962        while (kk < zlen && !(MAPSLOT[kk].key)) // skip deleted slots
3963          kk++;
3964        STKP->num = kk; // save index for next iteration
3965        if (kk < zlen) {
3966          struct zvalue *var = setup_lvalue(2, parmbase, &field_num);
3967          var->flags = ZF_STR;
3968          zstring_release(&var->vst);
3969          var->vst = MAPSLOT[kk].key;
3970          zstring_incr_refcnt(var->vst);
3971          ip += op2;
3972        }
3973        break;
3974
3975      case tkvar:
3976        op2 = *ip++;
3977        k = op2 < 0 ? parmbase - op2 : op2;
3978        v = &STACK[k];
3979        push_val(v);
3980        break;
3981
3982      case tkfield:
3983        // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
3984        // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
3985        ip++; // skip dummy "operand" instruction field
3986        push_field((int)(to_num(STKP)));
3987
3988        swap();
3989        drop();
3990        break;
3991
3992      case oppush:
3993        push_int_val(*ip++);
3994        break;
3995
3996      case tkand:
3997        op2 = *ip++;
3998        if (get_set_logical()) drop();
3999        else ip += op2;
4000        break;
4001
4002      case tkor:
4003        op2 = *ip++;
4004        if (!get_set_logical()) drop();
4005        else ip += op2;
4006        break;
4007
4008      case tkwhile:
4009        (STKP)->num = ! get_set_logical();
4010        ATTR_FALLTHROUGH_INTENDED;
4011        // FALLTHROUGH to tkternif
4012      case tkif:
4013        // FALLTHROUGH to tkternif
4014      case tkternif:
4015        op2 = *ip++;
4016        int t = get_set_logical();  // FIXME only need to get, not set
4017        drop();
4018        if (!t) ip += op2;
4019        break;
4020
4021      case tkelse:        // FALLTHROUGH intentional here
4022      case tkternelse:    // FALLTHROUGH intentional here
4023      case tkbreak:       // FALLTHROUGH intentional here
4024      case tkcontinue:    // FALLTHROUGH intentional here
4025      case opjump:
4026        op2 = *ip++;
4027        ip += op2;
4028        break;
4029
4030      case opvarref:
4031        op2 = *ip++;
4032        vv = (struct zvalue)ZVINIT(ZF_REF, op2, 0);
4033        push_val(&vv);
4034        break;
4035
4036      case opmapref:
4037        op2 = *ip++;
4038        vv = (struct zvalue)ZVINIT(ZF_MAPREF, op2, 0);
4039        push_val(&vv);
4040        break;
4041
4042      case opfldref:
4043        to_num(STKP);
4044        (STKP)->flags |= ZF_FIELDREF;
4045        ip++; // skip dummy "operand" instruction field
4046        break;
4047
4048      case opprintrec:
4049        puts(to_str(&FIELD[0])->vst->str);
4050        break;
4051
4052      case oprange1:
4053        range_num = *ip++;
4054        op2 = *ip++;
4055        if (TT.range_sw[range_num]) ip += op2;
4056        break;
4057
4058      case oprange2:
4059        range_num = *ip++;
4060        op2 = *ip++;
4061        t = get_set_logical();  // FIXME only need to get, not set
4062        drop();
4063        if (t) TT.range_sw[range_num] = 1;
4064        else ip += op2;
4065        break;
4066
4067      case oprange3:
4068        range_num = *ip++;
4069        t = get_set_logical();  // FIXME only need to get, not set
4070        drop();
4071        if (t) TT.range_sw[range_num] = 0;
4072        break;
4073
4074      case tkexit:
4075        r = popnumval();
4076        if (r != NO_EXIT_STATUS) *status = (int)r & 255;
4077        // TODO FIXME do we need NO_EXIT_STATUS at all? Just use 0?
4078        ATTR_FALLTHROUGH_INTENDED;
4079      case tknext:
4080      case tknextfile:
4081        return opcode;
4082
4083      case tkgetline:
4084        nargs = *ip++;
4085        int source = *ip++;
4086        // TT.stack is:
4087        // if tkgetline 0 tkeof:   (nothing stacked; plain getline)
4088        // if tkgetline 1 tkeof:   (lvalue)
4089        // if tkgetline 1 tklt:    (filename_string)
4090        // if tkgetline 2 tklt:    (lvalue) (filename_string)
4091        // if tkgetline 1 tkpipe:  (pipe_command_string)
4092        // if tkgetline 2 tkpipe:  (pipe_command_string) (lvalue)
4093        // effect is to set:
4094        // if tkgetline 0 tkeof:   $0 NF NR FNR
4095        // if tkgetline 1 tkeof:   var NR FNR
4096        // if tkgetline 1 tklt:    $0 NF
4097        // if tkgetline 2 tklt:    var
4098        // if tkgetline 1 tkpipe:  $0 NF
4099        // if tkgetline 2 tkpipe:  var
4100        // Ensure pipe cmd on top
4101        if (nargs == 2 && source == tkpipe) swap();
4102        struct zfile *zfp = 0;
4103        if (source == tklt || source == tkpipe) {
4104          zfp = setup_file(source == tklt, "r");
4105          nargs--;
4106        }
4107        // now cases are:
4108        // nargs source  TT.stack
4109        //  0 tkeof:   (nothing; plain getline) from current data file
4110        //  1 tkeof:   (lvalue)  from current data file
4111        //  0 tklt:    (nothing) from named file in 'stream'
4112        //  1 tklt:    (lvalue)  from  named file in 'stream'
4113        //  0 tkpipe:  (nothing) from piped command in 'stream'
4114        //  1 tkpipe:  (lvalue)  from piped command in 'stream'
4115        v = nargs ? setup_lvalue(0, parmbase, &field_num) : 0;
4116        if (v) drop();
4117        // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
4118        // stream is name of file or pipe
4119        // v is NULL or an lvalue ref
4120        if (zfp != badfile) push_int_val(awk_getline(source, zfp, v));
4121        else push_int_val(-1);
4122
4123        // fake return value for now
4124        break;
4125
4126        ////// builtin functions ///////
4127
4128      case tksplit:
4129        nargs = *ip++;
4130        if (nargs == 2) push_val(&STACK[FS]);
4131        struct zstring *s = to_str(STKP-2)->vst;
4132        force_maybemap_to_map(STKP-1);
4133        struct zvalue *a = STKP-1;
4134        struct zvalue *fs = STKP;
4135        zmap_delete_map(a->map);
4136        k = split(s, a, fs);
4137        drop_n(3);
4138        push_int_val(k);
4139        break;
4140
4141      case tkmatch:
4142        nargs = *ip++;
4143        if (!IS_RX(STKP)) to_str(STKP);
4144        regex_t rx_pat, *rxp = &rx_pat;
4145        rx_zvalue_compile(&rxp, STKP);
4146        regoff_t rso = 0, reo = 0;  // shut up warning (may be uninit)
4147        k = rx_find(rxp, to_str(STKP-1)->vst->str, &rso, &reo, 0);
4148        rx_zvalue_free(rxp, STKP);
4149        // Force these to num before setting.
4150        to_num(&STACK[RSTART]);
4151        to_num(&STACK[RLENGTH]);
4152        if (k) STACK[RSTART].num = 0, STACK[RLENGTH].num = -1;
4153        else {
4154          reo = utf8cnt(STKP[-1].vst->str, reo);
4155          rso = utf8cnt(STKP[-1].vst->str, rso);
4156          STACK[RSTART].num = rso + 1, STACK[RLENGTH].num = reo - rso;
4157        }
4158        drop();
4159        drop();
4160        push_int_val(k ? 0 : rso + 1);
4161        break;
4162
4163      case tksub:
4164      case tkgsub:
4165        gsub(opcode, *ip++, parmbase);  // tksub/tkgsub, args
4166        break;
4167
4168      case tksubstr:
4169        nargs = *ip++;
4170        struct zstring *zz = to_str(STKP - nargs + 1)->vst;
4171        int nchars = utf8cnt(zz->str, zz->size);  // number of utf8 codepoints
4172        // Offset of start of string (in chars not bytes); convert 1-based to 0-based
4173        ssize_t mm = CLAMP(trunc(to_num(STKP - nargs + 2)) - 1, 0, nchars);
4174        ssize_t nn = nchars - mm;   // max possible substring length (chars)
4175        if (nargs == 3) nn = CLAMP(trunc(to_num(STKP)), 0, nn);
4176        mm = bytesinutf8(zz->str, zz->size, mm);
4177        nn = bytesinutf8(zz->str + mm, zz->size - mm, nn);
4178        struct zstring *zzz = new_zstring(zz->str + mm, nn);
4179        zstring_release(&(STKP - nargs + 1)->vst);
4180        (STKP - nargs + 1)->vst = zzz;
4181        drop_n(nargs - 1);
4182        break;
4183
4184      case tkindex:
4185        nargs = *ip++;
4186        char *s1 = to_str(STKP-1)->vst->str;
4187        char *s3 = strstr(s1, to_str(STKP)->vst->str);
4188        ptrdiff_t offs = s3 ? utf8cnt(s1, s3 - s1) + 1 : 0;
4189        drop();
4190        drop();
4191        push_int_val(offs);
4192        break;
4193
4194      case tkband:
4195      case tkbor:
4196      case tkbxor:
4197      case tklshift:
4198      case tkrshift:
4199        ; size_t acc = to_num(STKP);
4200        nargs = *ip++;
4201        for (int i = 1; i < nargs; i++) switch (opcode) {
4202          case tkband: acc &= (size_t)to_num(STKP-i); break;
4203          case tkbor:  acc |= (size_t)to_num(STKP-i); break;
4204          case tkbxor: acc ^= (size_t)to_num(STKP-i); break;
4205          case tklshift: acc = (size_t)to_num(STKP-i) << acc; break;
4206          case tkrshift: acc = (size_t)to_num(STKP-i) >> acc; break;
4207        }
4208        drop_n(nargs);
4209        push_int_val(acc);
4210        break;
4211
4212      case tktolower:
4213      case tktoupper:
4214        nargs = *ip++;
4215        struct zstring *z = to_str(STKP)->vst;
4216        unsigned zzlen = z->size + 4; // Allow for expansion
4217        zz = zstring_update(0, zzlen, "", 0);
4218        char *p = z->str, *e = z->str + z->size, *q = zz->str;
4219        // Similar logic to toybox strlower(), but fixed.
4220        while (p < e) {
4221          unsigned wch;
4222          int len = utf8towc(&wch, p, e-p);
4223          if (len < 1) {  // nul byte, error, or truncated code
4224            *q++ = *p++;
4225            continue;
4226          }
4227          p += len;
4228          wch = (opcode == tktolower ? towlower : towupper)(wch);
4229          len = wctoutf8(q, wch);
4230          q += len;
4231          // Need realloc here if overflow possible
4232          if ((len = q - zz->str) + 4 < (int)zzlen) continue;
4233          zz = zstring_update(zz, zzlen = len + 16, "", 0);
4234          q = zz->str + len;
4235        }
4236        *q = 0;
4237        zz->size = q - zz->str;
4238        zstring_release(&z);
4239        STKP->vst = zz;
4240        break;
4241
4242      case tklength:
4243        nargs = *ip++;
4244        v = nargs ? STKP : &FIELD[0];
4245        force_maybemap_to_map(v);
4246        if (IS_MAP(v)) k = v->map->count - v->map->deleted;
4247        else {
4248          to_str(v);
4249          k = utf8cnt(v->vst->str, v->vst->size);
4250        }
4251        if (nargs) drop();
4252        push_int_val(k);
4253        break;
4254
4255      case tksystem:
4256        nargs = *ip++;
4257        fflush(stdout);
4258        fflush(stderr);
4259        r = system(to_str(STKP)->vst->str);
4260#ifdef WEXITSTATUS
4261        // WEXITSTATUS is in sys/wait.h, but I'm not including that.
4262        // It seems to also be in stdlib.h in gcc and musl-gcc.
4263        // No idea how portable this is!
4264        if (WIFEXITED(r)) r = WEXITSTATUS(r);
4265#endif
4266        drop();
4267        push_int_val(r);
4268        break;
4269
4270      case tkfflush:
4271        nargs = *ip++;
4272        r = fflush_file(nargs);
4273        if (nargs) drop();
4274        push_int_val(r);
4275        break;
4276
4277      case tkclose:
4278        nargs = *ip++;
4279        r = close_file(to_str(STKP)->vst->str);
4280        drop();
4281        push_int_val(r);
4282        break;
4283
4284      case tksprintf:
4285        nargs = *ip++;
4286        zstring_release(&TT.rgl.zspr);
4287        TT.rgl.zspr = new_zstring("", 0);
4288        varprint(fsprintf, 0, nargs);
4289        drop_n(nargs);
4290        vv = (struct zvalue)ZVINIT(ZF_STR, 0, TT.rgl.zspr);
4291        push_val(&vv);
4292        break;
4293
4294      // Math builtins -- move here (per Oliver Webb suggestion)
4295      case tkatan2:
4296        nargs = *ip++;
4297        d = atan2(to_num(STKP-1), to_num(STKP));
4298        drop();
4299        STKP->num = d;
4300        break;
4301      case tkrand:
4302        nargs = *ip++;
4303        push_int_val(0);
4304        // Get all 53 mantissa bits in play:
4305        // (upper 26 bits * 2^27 + upper 27 bits) / 2^53
4306        STKP->num =
4307          ((random() >> 5) * 134217728.0 + (random() >> 4)) / 9007199254740992.0;
4308        break;
4309      case tksrand:
4310        nargs = *ip++;
4311        if (nargs == 1) {
4312          STKP->num = seedrand(to_num(STKP));
4313        } else push_int_val(seedrand(millinow()));
4314        break;
4315      case tkcos: case tksin: case tkexp: case tklog: case tksqrt: case tkint:
4316        nargs = *ip++;
4317        STKP->num = mathfunc[opcode-tkcos](to_num(STKP));
4318        break;
4319
4320      default:
4321        // This should never happen:
4322        error_exit("!!! Unimplemented opcode %d", opcode);
4323    }
4324  }
4325  return opquit;
4326}
4327
4328// interp() wraps the main interpreter loop interpx(). The main purpose
4329// is to allow the TT.stack to be readjusted after an 'exit' from a function.
4330// Also catches errors, as the normal operation should leave the TT.stack
4331// depth unchanged after each run through the rules.
4332static int interp(int start, int *status)
4333{
4334  int stkptrbefore = stkn(0);
4335  int r = interpx(start, status);
4336  // If exit from function, TT.stack will be loaded with args etc. Clean it.
4337  if (r == tkexit) {
4338    // TODO FIXME is this safe? Just remove extra entries?
4339    STKP = &STACK[stkptrbefore];
4340  }
4341  if (stkn(0) - stkptrbefore)
4342    error_exit("!!AWK BUG stack pointer offset: %d", stkn(0) - stkptrbefore);
4343  return r;
4344}
4345
4346static void insert_argv_map(struct zvalue *map, int key, char *value)
4347{
4348  struct zvalue zkey = ZVINIT(ZF_STR, 0, num_to_zstring(key, ENSURE_STR(&STACK[CONVFMT])->vst->str));
4349  struct zvalue *v = get_map_val(map, &zkey);
4350  zvalue_release_zstring(&zkey);
4351  zvalue_release_zstring(v);
4352  *v = new_str_val(value);
4353  check_numeric_string(v);
4354}
4355
4356static void init_globals(int optind, int argc, char **argv, char *sepstring,
4357    struct arg_list *assign_args)
4358{
4359  // Global variables reside at the bottom of the TT.stack. Start with the awk
4360  // "special variables":  ARGC, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
4361  // NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP
4362
4363  STACK[CONVFMT] = new_str_val("%.6g");
4364  // Init ENVIRON map.
4365  struct zvalue m = ZVINIT(ZF_MAP, 0, 0);
4366  zvalue_map_init(&m);
4367  STACK[ENVIRON] = m;
4368  for (char **pkey = environ; *pkey; pkey++) {
4369    char *pval = strchr(*pkey, '=');
4370    if (!pval) continue;
4371    struct zvalue zkey = ZVINIT(ZF_STR, 0, new_zstring(*pkey, pval - *pkey));
4372    struct zvalue *v = get_map_val(&m, &zkey);
4373    zstring_release(&zkey.vst);
4374    if (v->vst) FFATAL("env var dup? (%s)", pkey);
4375    *v = new_str_val(++pval);    // FIXME refcnt
4376    check_numeric_string(v);
4377  }
4378
4379  // Init ARGV map.
4380  m = (struct zvalue)ZVINIT(ZF_MAP, 0, 0);
4381  zvalue_map_init(&m);
4382  STACK[ARGV] = m;
4383  insert_argv_map(&m, 0, TT.progname);
4384  int nargc = 1;
4385  for (int k = optind; k < argc; k++) {
4386    insert_argv_map(&m, nargc, argv[k]);
4387    nargc++;
4388  }
4389
4390  // Init rest of the awk special variables.
4391  STACK[ARGC] = (struct zvalue)ZVINIT(ZF_NUM, nargc, 0);
4392  STACK[FILENAME] = new_str_val("");
4393  STACK[FNR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4394  STACK[FS] = new_str_val(sepstring);
4395  STACK[NF] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4396  STACK[NR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4397  STACK[OFMT] = new_str_val("%.6g");
4398  STACK[OFS] = new_str_val(" ");
4399  STACK[ORS] = new_str_val("\n");
4400  STACK[RLENGTH] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4401  STACK[RS] = new_str_val("\n");
4402  STACK[RSTART] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4403  STACK[SUBSEP] = new_str_val("\034");
4404
4405  // Init program globals.
4406  //
4407  // Push global variables on the TT.stack at offsets matching their index in the
4408  // global var table.  In the global var table we may have the type as scalar
4409  // or map if it is used as such in the program. In that case we init the
4410  // pushed arg from the type of the globals table.
4411  // But if a global var appears only as a bare arg in a function call it will
4412  // not be typed in the globals table. In that case we can only say it "may be"
4413  // a map, but we have to assume the possibility and attach a map to the
4414  // var. When/if the var is used as a map or scalar in the called function it
4415  // will be converted to a map or scalar as required.
4416  // See force_maybemap_to_scalar(), and the similar comment in
4417  // 'case tkfunction:' above.
4418  //
4419  int gstx, len = zlist_len(&TT.globals_table);
4420  for (gstx = TT.spec_var_limit; gstx < len; gstx++) {
4421    struct symtab_slot gs = GLOBAL[gstx];
4422    struct zvalue v = ZVINIT(gs.flags, 0, 0);
4423    if (v.flags == 0) {
4424      zvalue_map_init(&v);
4425      v.flags = ZF_MAYBEMAP;
4426    } else if (IS_MAP(&v)) {
4427      zvalue_map_init(&v);
4428    } else {
4429      // Set SCALAR flag 0 to create "uninitialized" scalar.
4430      v.flags = 0;
4431    }
4432    push_val(&v);
4433  }
4434
4435  // Init -v assignment options.
4436  for (struct arg_list *p = assign_args; p; p = p->next) {
4437    char *asgn = p->arg;
4438    char *val = strchr(asgn, '=');
4439    if (!val) error_exit("bad -v assignment format");
4440    *val++ = 0;
4441    assign_global(asgn, val);
4442  }
4443
4444  TT.rgl.cur_arg = new_str_val("<cmdline>");
4445  uninit_string_zvalue = new_str_val("");
4446  zvalue_copy(&FIELD[0], &uninit_string_zvalue);
4447}
4448
4449static void run_files(int *status)
4450{
4451  int r = 0;
4452  while (r != tkexit && *status < 0 && getrec_f0() >= 0)
4453    if ((r = interp(TT.cgl.first_recrule, status)) == tknextfile) next_fp();
4454}
4455
4456static void free_literal_regex(void)
4457{
4458  int len = zlist_len(&TT.literals);
4459  for (int k = 1; k < len; k++)
4460    if (IS_RX(&LITERAL[k])) regfree(LITERAL[k].rx);
4461}
4462
4463static void run(int optind, int argc, char **argv, char *sepstring,
4464    struct arg_list *assign_args)
4465{
4466  char *printf_fmt_rx = "%[-+ #0']*([*]|[0-9]*)([.]([*]|[0-9]*))?l?[aAdiouxXfFeEgGcs%]";
4467  init_globals(optind, argc, argv, sepstring, assign_args);
4468  TT.cfile = xzalloc(sizeof(struct zfile));
4469  xregcomp(&TT.rx_default, "[ \t\n]+", REG_EXTENDED);
4470  xregcomp(&TT.rx_last, "[ \t\n]+", REG_EXTENDED);
4471  xregcomp(&TT.rx_printf_fmt, printf_fmt_rx, REG_EXTENDED);
4472  new_file("-", stdin, 'r', 'f')->is_std_file = 1;
4473  new_file("/dev/stdin", stdin, 'r', 'f')->is_std_file = 1;
4474  new_file("/dev/stdout", stdout, 'w', 'f')->is_std_file = 1;
4475  TT.zstdout = TT.zfiles;
4476  new_file("/dev/stderr", stderr, 'w', 'f')->is_std_file = 1;
4477  seedrand(123);
4478  int status = -1, r = 0;
4479  if (TT.cgl.first_begin) r = interp(TT.cgl.first_begin, &status);
4480  if (r != tkexit)
4481    if (TT.cgl.first_recrule) run_files(&status);
4482  if (TT.cgl.first_end) r = interp(TT.cgl.first_end, &status);
4483  regfree(&TT.rx_printf_fmt);
4484  regfree(&TT.rx_default);
4485  regfree(&TT.rx_last);
4486  free_literal_regex();
4487  close_file(0);    // close all files
4488  if (status >= 0) exit(status);
4489}
4490
4491////////////////////
4492//// main
4493////////////////////
4494
4495static void progfiles_init(char *progstring, struct arg_list *prog_args)
4496{
4497  TT.scs->p = progstring ? progstring : "  " + 2;
4498  TT.scs->progstring = progstring;
4499  TT.scs->prog_args = prog_args;
4500  TT.scs->filename = "(cmdline)";
4501  TT.scs->maxtok = 256;
4502  TT.scs->tokstr = xzalloc(TT.scs->maxtok);
4503}
4504
4505static int awk(char *sepstring, char *progstring, struct arg_list *prog_args,
4506    struct arg_list *assign_args, int optind, int argc, char **argv,
4507    int opt_run_prog)
4508{
4509  struct scanner_state ss = {0};
4510  TT.scs = &ss;
4511
4512  setlocale(LC_NUMERIC, "");
4513  progfiles_init(progstring, prog_args);
4514  compile();
4515
4516  if (TT.cgl.compile_error_count)
4517    error_exit("%d syntax error(s)", TT.cgl.compile_error_count);
4518  else {
4519    if (opt_run_prog)
4520      run(optind, argc, argv, sepstring, assign_args);
4521  }
4522
4523  return TT.cgl.compile_error_count;
4524}
4525
4526void awk_main(void)
4527{
4528  char *sepstring = TT.F ? escape_str(TT.F, 0) : " ";
4529  int optind = 0;
4530  char *progstring = NULL;
4531
4532  TT.pbuf = toybuf;
4533  toys.exitval = 2;
4534  if (!TT.f) {
4535    if (*toys.optargs) progstring = toys.optargs[optind++];
4536    else error_exit("No program string\n");
4537  }
4538  TT.progname = toys.which->name;
4539  toys.exitval = awk(sepstring, progstring, TT.f, TT.v,
4540      optind, toys.optc, toys.optargs, !FLAG(c));
4541}
4542