xref: /third_party/toybox/toys/posix/cut.c (revision 0f66f451)
10f66f451Sopenharmony_ci/* cut.c - print selected ranges from a file
20f66f451Sopenharmony_ci *
30f66f451Sopenharmony_ci * Copyright 2016 Rob Landley <rob@landley.net>
40f66f451Sopenharmony_ci *
50f66f451Sopenharmony_ci * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
60f66f451Sopenharmony_ci *
70f66f451Sopenharmony_ci * Deviations from posix: added -DF. We can only accept 512 selections, and
80f66f451Sopenharmony_ci * "-" counts as start to end. Using spaces to separate a comma-separated list
90f66f451Sopenharmony_ci * is silly and inconsistent with dd, ps, cp, and mount.
100f66f451Sopenharmony_ci *
110f66f451Sopenharmony_ci * todo: -n, -s with -c
120f66f451Sopenharmony_ci
130f66f451Sopenharmony_ciUSE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
140f66f451Sopenharmony_ci
150f66f451Sopenharmony_ciconfig CUT
160f66f451Sopenharmony_ci  bool "cut"
170f66f451Sopenharmony_ci  default y
180f66f451Sopenharmony_ci  help
190f66f451Sopenharmony_ci    usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
200f66f451Sopenharmony_ci
210f66f451Sopenharmony_ci    Print selected parts of lines from each FILE to standard output.
220f66f451Sopenharmony_ci
230f66f451Sopenharmony_ci    Each selection LIST is comma separated, either numbers (counting from 1)
240f66f451Sopenharmony_ci    or dash separated ranges (inclusive, with X- meaning to end of line and -X
250f66f451Sopenharmony_ci    from start). By default selection ranges are sorted and collated, use -D
260f66f451Sopenharmony_ci    to prevent that.
270f66f451Sopenharmony_ci
280f66f451Sopenharmony_ci    -b	Select bytes
290f66f451Sopenharmony_ci    -c	Select UTF-8 characters
300f66f451Sopenharmony_ci    -C	Select unicode columns
310f66f451Sopenharmony_ci    -d	Use DELIM (default is TAB for -f, run of whitespace for -F)
320f66f451Sopenharmony_ci    -D	Don't sort/collate selections or match -fF lines without delimiter
330f66f451Sopenharmony_ci    -f	Select fields (words) separated by single DELIM character
340f66f451Sopenharmony_ci    -F	Select fields separated by DELIM regex
350f66f451Sopenharmony_ci    -O	Output delimiter (default one space for -F, input delim for -f)
360f66f451Sopenharmony_ci    -s	Skip lines without delimiters
370f66f451Sopenharmony_ci*/
380f66f451Sopenharmony_ci#define FOR_cut
390f66f451Sopenharmony_ci#include "toys.h"
400f66f451Sopenharmony_ci
410f66f451Sopenharmony_ciGLOBALS(
420f66f451Sopenharmony_ci  char *d, *O;
430f66f451Sopenharmony_ci  struct arg_list *select[5]; // we treat them the same, so loop through
440f66f451Sopenharmony_ci
450f66f451Sopenharmony_ci  int pairs;
460f66f451Sopenharmony_ci  regex_t reg;
470f66f451Sopenharmony_ci)
480f66f451Sopenharmony_ci
// Measure how many bytes at the start of a string fit in a given number of
// display columns.
//
// start:   string to measure; scanning stops when utf8towc() returns 0
//          (presumably at the terminating NUL - confirm against utf8towc)
// columns: column budget
//
// Returns the byte offset just past the last character with a positive
// wcwidth() that still fits. A character that would overshoot the budget is
// left out (rounding down). Bytes utf8towc() rejects are stepped over one at
// a time, and characters with a non-positive width consume no columns.
int unicolumns(char *start, unsigned columns)
{
  char *pos = start, *fit = start;
  unsigned wc;
  int used = 0;

  for (;;) {
    int bytes, width;

    if (used >= columns) break;
    if (!(bytes = utf8towc(&wc, pos, 4))) break;

    // Undecodable byte: skip it without charging any columns
    if (bytes < 0) {
      pos++;
      continue;
    }
    pos += bytes;

    // Only characters that occupy columns can move the "fits" marker
    width = wcwidth(wc);
    if (width <= 0) continue;
    used += width;
    if (used > columns) break;
    fit = pos;
  }

  return fit-start;
}
710f66f451Sopenharmony_ci
// Apply the selections stored in toybuf[] to one input line and write the
// result to stdout.
//
// pline: pointer to the line buffer; when it is NULL the function returns
//        immediately without output.
// len:   number of bytes in the line; one trailing '\n' is stripped (and
//        overwritten with a NUL) before processing.
//
// Selections are read from toybuf[] as TT.pairs consecutive (start, end)
// unsigned pairs. A nonzero start is 1-based, 0 means the beginning, and an
// end of 0 or beyond len is clamped to len. Depending on the option flags the
// positions count bytes (-b), display columns (-C), UTF-8 characters (-c) or
// delimiter separated fields (the remaining case). Every emitted selection
// other than index 0 is preceded by TT.O when that is set, and the output is
// finished with '\n'.
720f66f451Sopenharmony_ci// Apply selections to an input line, producing output
730f66f451Sopenharmony_cistatic void cut_line(char **pline, long len)
740f66f451Sopenharmony_ci{
750f66f451Sopenharmony_ci  unsigned *pairs = (void *)toybuf;
760f66f451Sopenharmony_ci  char *line;
770f66f451Sopenharmony_ci  int i, j;
780f66f451Sopenharmony_ci
790f66f451Sopenharmony_ci  if (!pline) return;
800f66f451Sopenharmony_ci  line = *pline;
  // Drop the trailing newline so it is never part of a selection
810f66f451Sopenharmony_ci  if (len && line[len-1]=='\n') line[--len] = 0;
820f66f451Sopenharmony_ci
830f66f451Sopenharmony_ci  // Loop through selections
840f66f451Sopenharmony_ci  for (i=0; i<TT.pairs; i++) {
850f66f451Sopenharmony_ci    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
860f66f451Sopenharmony_ci    char *s = line, *ss;
870f66f451Sopenharmony_ci
880f66f451Sopenharmony_ci    // input: start/end position, count=difference between them
890f66f451Sopenharmony_ci    // output: s = start of string, len = bytes to output
900f66f451Sopenharmony_ci
    // Convert start to a 0-based offset, skip selections that begin at or
    // beyond len, and clamp an open or oversized end to len.
910f66f451Sopenharmony_ci    if (start) start--;
920f66f451Sopenharmony_ci    if (start>=len) continue;
930f66f451Sopenharmony_ci    if (!end || end>len) end = len;
940f66f451Sopenharmony_ci    count = end-start;
950f66f451Sopenharmony_ci
960f66f451Sopenharmony_ci    // Find start and end of output string for the relevant selection type
    // -b: positions are byte offsets, so count is already the byte length
970f66f451Sopenharmony_ci    if (toys.optflags&FLAG_b) s += start;
980f66f451Sopenharmony_ci    else if (toys.optflags&FLAG_C) {
990f66f451Sopenharmony_ci      // crunch_str() currently assumes that combining characters get
1000f66f451Sopenharmony_ci      // escaped, to provide an unambiguous visual representation.
1010f66f451Sopenharmony_ci      // This assumes the input string is null terminated.
1020f66f451Sopenharmony_ci      //if (start) crunch_str(&s, start, 0, 0, 0);
1030f66f451Sopenharmony_ci      //if (!*s) continue;
1040f66f451Sopenharmony_ci      //start = s-line;
1050f66f451Sopenharmony_ci      //ss = s;
1060f66f451Sopenharmony_ci      //crunch_str(&ss, count, 0, 0, 0);
1070f66f451Sopenharmony_ci      //count = ss-s;
1080f66f451Sopenharmony_ci
      // -C: skip the bytes covering the first `start` columns, then measure
      // how many bytes fit in the remaining end-start columns
1090f66f451Sopenharmony_ci      s += unicolumns(s, start);
1100f66f451Sopenharmony_ci      count = unicolumns(s, end-start);
1110f66f451Sopenharmony_ci    } else if (toys.optflags&FLAG_c) {
1120f66f451Sopenharmony_ci      unsigned wc;
1130f66f451Sopenharmony_ci      char *sss;
1140f66f451Sopenharmony_ci
1150f66f451Sopenharmony_ci      // Find start
      // Advance s past `start` characters. A negative utf8towc() result is
      // stepped over one byte at a time without being counted.
1160f66f451Sopenharmony_ci      ss = line+len;
1170f66f451Sopenharmony_ci      while (start && s<ss) {
1180f66f451Sopenharmony_ci        if (0<=(j = utf8towc(&wc, s, len))) start--;
1190f66f451Sopenharmony_ci        s += (j<1) ? 1 : j;
1200f66f451Sopenharmony_ci      }
      // Ran off the end of the line before reaching the selection
1210f66f451Sopenharmony_ci      if (s == ss) continue;
1220f66f451Sopenharmony_ci
1230f66f451Sopenharmony_ci      // Find end
      // Same walk from s for `count` characters; sss ends up one past the
      // last selected byte
1240f66f451Sopenharmony_ci      end = count;
1250f66f451Sopenharmony_ci      sss = s;
1260f66f451Sopenharmony_ci      while (end && sss<ss) {
1270f66f451Sopenharmony_ci        if (0<=(j = utf8towc(&wc, sss, len))) end--;
1280f66f451Sopenharmony_ci        sss += (j<1) ? 1 : j;
1290f66f451Sopenharmony_ci      }
1300f66f451Sopenharmony_ci      count = sss-s;
1310f66f451Sopenharmony_ci    } else {
1320f66f451Sopenharmony_ci      regmatch_t match;
1330f66f451Sopenharmony_ci
1340f66f451Sopenharmony_ci      // Loop through skipping appropriate number of fields
      // Two passes over the same scanner. Pass j==0 consumes `start`
      // delimiters to locate the beginning of the selection (the original
      // start is saved in `end`). Pass j==1 consumes `count` delimiters from
      // there, but stops in front of the final one so it is not output.
1350f66f451Sopenharmony_ci      for (j = 0; j<2; j++) {
1360f66f451Sopenharmony_ci        ss = s;
1370f66f451Sopenharmony_ci        if (j) start = count;
1380f66f451Sopenharmony_ci        else end = start;
1390f66f451Sopenharmony_ci        while (*ss && start) {
1400f66f451Sopenharmony_ci          if (toys.optflags&FLAG_f) {
            // -f: any byte found in TT.d counts as one delimiter
1410f66f451Sopenharmony_ci            if (!strchr(TT.d, *ss++)) continue;
1420f66f451Sopenharmony_ci            if (!--start && j) ss--;
1430f66f451Sopenharmony_ci          } else {
            // Regex delimiter: when nothing matches, jump to end of line
1440f66f451Sopenharmony_ci            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
1450f66f451Sopenharmony_ci              ss = line+len;
1460f66f451Sopenharmony_ci              continue;
1470f66f451Sopenharmony_ci            }
1480f66f451Sopenharmony_ci            if (!match.rm_eo) break; // zero length match == no delimiter
1490f66f451Sopenharmony_ci            ss += (!--start && j) ? match.rm_so : match.rm_eo;
1500f66f451Sopenharmony_ci          }
1510f66f451Sopenharmony_ci        }
        // After pass 0 the selection starts at ss; stop (leaving j==0) when
        // that is already the end of the line
1520f66f451Sopenharmony_ci        if (!j && !*(s = ss)) break;
1530f66f451Sopenharmony_ci      }
1540f66f451Sopenharmony_ci
1550f66f451Sopenharmony_ci      // If we never encountered even one separator, print whole line (posix!)
      // j==0 means pass 0 hit end of line; end==start means it did so without
      // consuming a delimiter. -D leaves the selection loop with no output for
      // this selection, -s returns without writing the trailing newline,
      // otherwise the entire line is written once and the loop ends.
1560f66f451Sopenharmony_ci      if (!j && end == start) {
1570f66f451Sopenharmony_ci        if (toys.optflags&FLAG_D) break;
1580f66f451Sopenharmony_ci        if (toys.optflags&FLAG_s) return;
1590f66f451Sopenharmony_ci        fwrite(line, len, 1, stdout);
1600f66f451Sopenharmony_ci        break;
1610f66f451Sopenharmony_ci      } else if (!*s) continue;
1620f66f451Sopenharmony_ci      count = ss-s;
1630f66f451Sopenharmony_ci    }
    // Separator goes before every emitted selection except index 0
1640f66f451Sopenharmony_ci    if (i && TT.O) fputs(TT.O, stdout);
1650f66f451Sopenharmony_ci    fwrite(s, count, 1, stdout);
1660f66f451Sopenharmony_ci  }
1670f66f451Sopenharmony_ci  xputc('\n');
1680f66f451Sopenharmony_ci}
1690f66f451Sopenharmony_ci
// qsort() comparator for selections. Each element is a pair of unsigned
// values (start, end); pairs are ordered by start, then by end.
//
// Takes the standard comparator signature so qsort() calls it through a
// compatible function pointer type (calling through a mismatched type is
// undefined behaviour and trips indirect-call CFI checks).
//
// Returns -1, 0 or 1 when a sorts before, equal to, or after b. Comparisons
// are done directly rather than by subtraction so large values cannot wrap.
static int compar(const void *a, const void *b)
{
  const unsigned *left = a, *right = b;
  int i;

  for (i = 0; i<2; i++) {
    if (left[i]<right[i]) return -1;
    if (left[i]>right[i]) return 1;
  }

  return 0;
}
1790f66f451Sopenharmony_ci
1800f66f451Sopenharmony_ci// parse A or A-B or A- or -B
1810f66f451Sopenharmony_cistatic char *get_range(void *data, char *str, int len)
1820f66f451Sopenharmony_ci{
1830f66f451Sopenharmony_ci  char *end = str;
1840f66f451Sopenharmony_ci  unsigned *pairs = (void *)toybuf, i;
1850f66f451Sopenharmony_ci
1860f66f451Sopenharmony_ci  // Using toybuf[] to store ranges means we can have 512 selections max.
1870f66f451Sopenharmony_ci  if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
1880f66f451Sopenharmony_ci  pairs += 2*TT.pairs++;
1890f66f451Sopenharmony_ci
1900f66f451Sopenharmony_ci  pairs[1] = UINT_MAX;
1910f66f451Sopenharmony_ci  for (i = 0; ;i++) {
1920f66f451Sopenharmony_ci    if (i==2) return end;
1930f66f451Sopenharmony_ci    if (isdigit(*end)) {
1940f66f451Sopenharmony_ci      long long ll = estrtol(end, &end, 10);
1950f66f451Sopenharmony_ci
1960f66f451Sopenharmony_ci      if (ll<1 || ll>UINT_MAX || errno) return end;
1970f66f451Sopenharmony_ci      pairs[i] = ll;
1980f66f451Sopenharmony_ci    }
1990f66f451Sopenharmony_ci    if (*end++ != '-') break;
2000f66f451Sopenharmony_ci  }
2010f66f451Sopenharmony_ci  if (!i) pairs[1] = pairs[0];
2020f66f451Sopenharmony_ci  if ((end-str)<len) return end;
2030f66f451Sopenharmony_ci  if (pairs[0]>pairs[1]) return str;
2040f66f451Sopenharmony_ci
2050f66f451Sopenharmony_ci  // No error
2060f66f451Sopenharmony_ci  return 0;
2070f66f451Sopenharmony_ci}
2080f66f451Sopenharmony_ci
2090f66f451Sopenharmony_civoid cut_main(void)
2100f66f451Sopenharmony_ci{
2110f66f451Sopenharmony_ci  int i;
2120f66f451Sopenharmony_ci  char buf[8];
2130f66f451Sopenharmony_ci
2140f66f451Sopenharmony_ci  // Parse command line arguments
2150f66f451Sopenharmony_ci  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
2160f66f451Sopenharmony_ci    error_exit("-s needs -Ff");
2170f66f451Sopenharmony_ci  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
2180f66f451Sopenharmony_ci    error_exit("-d needs -Ff");
2190f66f451Sopenharmony_ci  if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
2200f66f451Sopenharmony_ci  if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
2210f66f451Sopenharmony_ci  if (!TT.O) {
2220f66f451Sopenharmony_ci    if (toys.optflags&FLAG_F) TT.O = " ";
2230f66f451Sopenharmony_ci    else if (toys.optflags&FLAG_f) TT.O = TT.d;
2240f66f451Sopenharmony_ci  }
2250f66f451Sopenharmony_ci
2260f66f451Sopenharmony_ci  // Parse ranges, which are attached to a selection type (only one can be set)
2270f66f451Sopenharmony_ci  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
2280f66f451Sopenharmony_ci    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
2290f66f451Sopenharmony_ci    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
2300f66f451Sopenharmony_ci  }
2310f66f451Sopenharmony_ci  if (!TT.pairs) error_exit("no selections");
2320f66f451Sopenharmony_ci
2330f66f451Sopenharmony_ci  // Sort and collate selections
2340f66f451Sopenharmony_ci  if (!(toys.optflags&FLAG_D)) {
2350f66f451Sopenharmony_ci    int from, to;
2360f66f451Sopenharmony_ci    unsigned *pairs = (void *)toybuf;
2370f66f451Sopenharmony_ci
2380f66f451Sopenharmony_ci    qsort(toybuf, TT.pairs, 8, (void *)compar);
2390f66f451Sopenharmony_ci    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
2400f66f451Sopenharmony_ci      if (pairs[from] > pairs[to+1]) {
2410f66f451Sopenharmony_ci        to += 2;
2420f66f451Sopenharmony_ci        memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
2430f66f451Sopenharmony_ci      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
2440f66f451Sopenharmony_ci    }
2450f66f451Sopenharmony_ci    TT.pairs = (to/2)+1;
2460f66f451Sopenharmony_ci  }
2470f66f451Sopenharmony_ci
2480f66f451Sopenharmony_ci  // For each argument, loop through lines of file and call cut_line() on each
2490f66f451Sopenharmony_ci  loopfiles_lines(toys.optargs, cut_line);
2500f66f451Sopenharmony_ci}
251