10f66f451Sopenharmony_ci/* cut.c - print selected ranges from a file 20f66f451Sopenharmony_ci * 30f66f451Sopenharmony_ci * Copyright 2016 Rob Landley <rob@landley.net> 40f66f451Sopenharmony_ci * 50f66f451Sopenharmony_ci * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html 60f66f451Sopenharmony_ci * 70f66f451Sopenharmony_ci * Deviations from posix: added -DF. We can only accept 512 selections, and 80f66f451Sopenharmony_ci * "-" counts as start to end. Using spaces to separate a comma-separated list 90f66f451Sopenharmony_ci * is silly and inconsistent with dd, ps, cp, and mount. 100f66f451Sopenharmony_ci * 110f66f451Sopenharmony_ci * todo: -n, -s with -c 120f66f451Sopenharmony_ci 130f66f451Sopenharmony_ciUSE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN)) 140f66f451Sopenharmony_ci 150f66f451Sopenharmony_ciconfig CUT 160f66f451Sopenharmony_ci bool "cut" 170f66f451Sopenharmony_ci default y 180f66f451Sopenharmony_ci help 190f66f451Sopenharmony_ci usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...] 200f66f451Sopenharmony_ci 210f66f451Sopenharmony_ci Print selected parts of lines from each FILE to standard output. 220f66f451Sopenharmony_ci 230f66f451Sopenharmony_ci Each selection LIST is comma separated, either numbers (counting from 1) 240f66f451Sopenharmony_ci or dash separated ranges (inclusive, with X- meaning to end of line and -X 250f66f451Sopenharmony_ci from start). By default selection ranges are sorted and collated, use -D 260f66f451Sopenharmony_ci to prevent that. 270f66f451Sopenharmony_ci 280f66f451Sopenharmony_ci -b Select bytes 290f66f451Sopenharmony_ci -c Select UTF-8 characters 300f66f451Sopenharmony_ci -C Select unicode columns 310f66f451Sopenharmony_ci -d Use DELIM (default is TAB for -f, run of whitespace for -F) 320f66f451Sopenharmony_ci -D Don't sort/collate selections or match -fF lines without delimiter 330f66f451Sopenharmony_ci -f Select fields (words) separated by single DELIM character 340f66f451Sopenharmony_ci -F Select fields separated by DELIM regex 350f66f451Sopenharmony_ci -O Output delimiter (default one space for -F, input delim for -f) 360f66f451Sopenharmony_ci -s Skip lines without delimiters 370f66f451Sopenharmony_ci*/ 380f66f451Sopenharmony_ci#define FOR_cut 390f66f451Sopenharmony_ci#include "toys.h" 400f66f451Sopenharmony_ci 410f66f451Sopenharmony_ciGLOBALS( 420f66f451Sopenharmony_ci char *d, *O; 430f66f451Sopenharmony_ci struct arg_list *select[5]; // we treat them the same, so loop through 440f66f451Sopenharmony_ci 450f66f451Sopenharmony_ci int pairs; 460f66f451Sopenharmony_ci regex_t reg; 470f66f451Sopenharmony_ci) 480f66f451Sopenharmony_ci 490f66f451Sopenharmony_ci// Return number of bytes to start of first column fitting in columns 500f66f451Sopenharmony_ci// invalid sequences are skipped/ignored 510f66f451Sopenharmony_ciint unicolumns(char *start, unsigned columns) 520f66f451Sopenharmony_ci{ 530f66f451Sopenharmony_ci int i, j = 0; 540f66f451Sopenharmony_ci unsigned wc; 550f66f451Sopenharmony_ci char *s = start, *ss = start; 560f66f451Sopenharmony_ci 570f66f451Sopenharmony_ci // Skip start, rounding down if we hit a multicolumn char 580f66f451Sopenharmony_ci while (j<columns && (i = utf8towc(&wc, s, 4))) { 590f66f451Sopenharmony_ci if (i<0) s++; 600f66f451Sopenharmony_ci else { 610f66f451Sopenharmony_ci s += i; 620f66f451Sopenharmony_ci if (0<(i = wcwidth(wc))) { 630f66f451Sopenharmony_ci if ((j += i)>columns) break; 640f66f451Sopenharmony_ci ss = s; 650f66f451Sopenharmony_ci } 660f66f451Sopenharmony_ci } 670f66f451Sopenharmony_ci } 680f66f451Sopenharmony_ci 690f66f451Sopenharmony_ci return ss-start; 700f66f451Sopenharmony_ci} 710f66f451Sopenharmony_ci 720f66f451Sopenharmony_ci// Apply selections to an input line, producing output 730f66f451Sopenharmony_cistatic void cut_line(char **pline, long len) 740f66f451Sopenharmony_ci{ 750f66f451Sopenharmony_ci unsigned *pairs = (void *)toybuf; 760f66f451Sopenharmony_ci char *line; 770f66f451Sopenharmony_ci int i, j; 780f66f451Sopenharmony_ci 790f66f451Sopenharmony_ci if (!pline) return; 800f66f451Sopenharmony_ci line = *pline; 810f66f451Sopenharmony_ci if (len && line[len-1]=='\n') line[--len] = 0; 820f66f451Sopenharmony_ci 830f66f451Sopenharmony_ci // Loop through selections 840f66f451Sopenharmony_ci for (i=0; i<TT.pairs; i++) { 850f66f451Sopenharmony_ci unsigned start = pairs[2*i], end = pairs[(2*i)+1], count; 860f66f451Sopenharmony_ci char *s = line, *ss; 870f66f451Sopenharmony_ci 880f66f451Sopenharmony_ci // input: start/end position, count=difference between them 890f66f451Sopenharmony_ci // output: s = start of string, len = bytes to output 900f66f451Sopenharmony_ci 910f66f451Sopenharmony_ci if (start) start--; 920f66f451Sopenharmony_ci if (start>=len) continue; 930f66f451Sopenharmony_ci if (!end || end>len) end = len; 940f66f451Sopenharmony_ci count = end-start; 950f66f451Sopenharmony_ci 960f66f451Sopenharmony_ci // Find start and end of output string for the relevant selection type 970f66f451Sopenharmony_ci if (toys.optflags&FLAG_b) s += start; 980f66f451Sopenharmony_ci else if (toys.optflags&FLAG_C) { 990f66f451Sopenharmony_ci // crunch_str() currently assumes that combining characters get 1000f66f451Sopenharmony_ci // escaped, to provide an unambiguous visual representation. 1010f66f451Sopenharmony_ci // This assumes the input string is null terminated. 1020f66f451Sopenharmony_ci //if (start) crunch_str(&s, start, 0, 0, 0); 1030f66f451Sopenharmony_ci //if (!*s) continue; 1040f66f451Sopenharmony_ci //start = s-line; 1050f66f451Sopenharmony_ci //ss = s; 1060f66f451Sopenharmony_ci //crunch_str(&ss, count, 0, 0, 0); 1070f66f451Sopenharmony_ci //count = ss-s; 1080f66f451Sopenharmony_ci 1090f66f451Sopenharmony_ci s += unicolumns(s, start); 1100f66f451Sopenharmony_ci count = unicolumns(s, end-start); 1110f66f451Sopenharmony_ci } else if (toys.optflags&FLAG_c) { 1120f66f451Sopenharmony_ci unsigned wc; 1130f66f451Sopenharmony_ci char *sss; 1140f66f451Sopenharmony_ci 1150f66f451Sopenharmony_ci // Find start 1160f66f451Sopenharmony_ci ss = line+len; 1170f66f451Sopenharmony_ci while (start && s<ss) { 1180f66f451Sopenharmony_ci if (0<=(j = utf8towc(&wc, s, len))) start--; 1190f66f451Sopenharmony_ci s += (j<1) ? 1 : j; 1200f66f451Sopenharmony_ci } 1210f66f451Sopenharmony_ci if (s == ss) continue; 1220f66f451Sopenharmony_ci 1230f66f451Sopenharmony_ci // Find end 1240f66f451Sopenharmony_ci end = count; 1250f66f451Sopenharmony_ci sss = s; 1260f66f451Sopenharmony_ci while (end && sss<ss) { 1270f66f451Sopenharmony_ci if (0<=(j = utf8towc(&wc, sss, len))) end--; 1280f66f451Sopenharmony_ci sss += (j<1) ? 1 : j; 1290f66f451Sopenharmony_ci } 1300f66f451Sopenharmony_ci count = sss-s; 1310f66f451Sopenharmony_ci } else { 1320f66f451Sopenharmony_ci regmatch_t match; 1330f66f451Sopenharmony_ci 1340f66f451Sopenharmony_ci // Loop through skipping appropriate number of fields 1350f66f451Sopenharmony_ci for (j = 0; j<2; j++) { 1360f66f451Sopenharmony_ci ss = s; 1370f66f451Sopenharmony_ci if (j) start = count; 1380f66f451Sopenharmony_ci else end = start; 1390f66f451Sopenharmony_ci while (*ss && start) { 1400f66f451Sopenharmony_ci if (toys.optflags&FLAG_f) { 1410f66f451Sopenharmony_ci if (!strchr(TT.d, *ss++)) continue; 1420f66f451Sopenharmony_ci if (!--start && j) ss--; 1430f66f451Sopenharmony_ci } else { 1440f66f451Sopenharmony_ci if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) { 1450f66f451Sopenharmony_ci ss = line+len; 1460f66f451Sopenharmony_ci continue; 1470f66f451Sopenharmony_ci } 1480f66f451Sopenharmony_ci if (!match.rm_eo) break; // zero length match == no delimiter 1490f66f451Sopenharmony_ci ss += (!--start && j) ? match.rm_so : match.rm_eo; 1500f66f451Sopenharmony_ci } 1510f66f451Sopenharmony_ci } 1520f66f451Sopenharmony_ci if (!j && !*(s = ss)) break; 1530f66f451Sopenharmony_ci } 1540f66f451Sopenharmony_ci 1550f66f451Sopenharmony_ci // If we never encountered even one separator, print whole line (posix!) 1560f66f451Sopenharmony_ci if (!j && end == start) { 1570f66f451Sopenharmony_ci if (toys.optflags&FLAG_D) break; 1580f66f451Sopenharmony_ci if (toys.optflags&FLAG_s) return; 1590f66f451Sopenharmony_ci fwrite(line, len, 1, stdout); 1600f66f451Sopenharmony_ci break; 1610f66f451Sopenharmony_ci } else if (!*s) continue; 1620f66f451Sopenharmony_ci count = ss-s; 1630f66f451Sopenharmony_ci } 1640f66f451Sopenharmony_ci if (i && TT.O) fputs(TT.O, stdout); 1650f66f451Sopenharmony_ci fwrite(s, count, 1, stdout); 1660f66f451Sopenharmony_ci } 1670f66f451Sopenharmony_ci xputc('\n'); 1680f66f451Sopenharmony_ci} 1690f66f451Sopenharmony_ci 1700f66f451Sopenharmony_cistatic int compar(unsigned *a, unsigned *b) 1710f66f451Sopenharmony_ci{ 1720f66f451Sopenharmony_ci if (*a<*b) return -1; 1730f66f451Sopenharmony_ci if (*a>*b) return 1; 1740f66f451Sopenharmony_ci if (a[1]<b[1]) return -1; 1750f66f451Sopenharmony_ci if (a[1]>b[1]) return 1; 1760f66f451Sopenharmony_ci 1770f66f451Sopenharmony_ci return 0; 1780f66f451Sopenharmony_ci} 1790f66f451Sopenharmony_ci 1800f66f451Sopenharmony_ci// parse A or A-B or A- or -B 1810f66f451Sopenharmony_cistatic char *get_range(void *data, char *str, int len) 1820f66f451Sopenharmony_ci{ 1830f66f451Sopenharmony_ci char *end = str; 1840f66f451Sopenharmony_ci unsigned *pairs = (void *)toybuf, i; 1850f66f451Sopenharmony_ci 1860f66f451Sopenharmony_ci // Using toybuf[] to store ranges means we can have 512 selections max. 1870f66f451Sopenharmony_ci if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit"); 1880f66f451Sopenharmony_ci pairs += 2*TT.pairs++; 1890f66f451Sopenharmony_ci 1900f66f451Sopenharmony_ci pairs[1] = UINT_MAX; 1910f66f451Sopenharmony_ci for (i = 0; ;i++) { 1920f66f451Sopenharmony_ci if (i==2) return end; 1930f66f451Sopenharmony_ci if (isdigit(*end)) { 1940f66f451Sopenharmony_ci long long ll = estrtol(end, &end, 10); 1950f66f451Sopenharmony_ci 1960f66f451Sopenharmony_ci if (ll<1 || ll>UINT_MAX || errno) return end; 1970f66f451Sopenharmony_ci pairs[i] = ll; 1980f66f451Sopenharmony_ci } 1990f66f451Sopenharmony_ci if (*end++ != '-') break; 2000f66f451Sopenharmony_ci } 2010f66f451Sopenharmony_ci if (!i) pairs[1] = pairs[0]; 2020f66f451Sopenharmony_ci if ((end-str)<len) return end; 2030f66f451Sopenharmony_ci if (pairs[0]>pairs[1]) return str; 2040f66f451Sopenharmony_ci 2050f66f451Sopenharmony_ci // No error 2060f66f451Sopenharmony_ci return 0; 2070f66f451Sopenharmony_ci} 2080f66f451Sopenharmony_ci 2090f66f451Sopenharmony_civoid cut_main(void) 2100f66f451Sopenharmony_ci{ 2110f66f451Sopenharmony_ci int i; 2120f66f451Sopenharmony_ci char buf[8]; 2130f66f451Sopenharmony_ci 2140f66f451Sopenharmony_ci // Parse command line arguments 2150f66f451Sopenharmony_ci if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s) 2160f66f451Sopenharmony_ci error_exit("-s needs -Ff"); 2170f66f451Sopenharmony_ci if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d) 2180f66f451Sopenharmony_ci error_exit("-d needs -Ff"); 2190f66f451Sopenharmony_ci if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t"; 2200f66f451Sopenharmony_ci if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED); 2210f66f451Sopenharmony_ci if (!TT.O) { 2220f66f451Sopenharmony_ci if (toys.optflags&FLAG_F) TT.O = " "; 2230f66f451Sopenharmony_ci else if (toys.optflags&FLAG_f) TT.O = TT.d; 2240f66f451Sopenharmony_ci } 2250f66f451Sopenharmony_ci 2260f66f451Sopenharmony_ci // Parse ranges, which are attached to a selection type (only one can be set) 2270f66f451Sopenharmony_ci for (i = 0; i<ARRAY_LEN(TT.select); i++) { 2280f66f451Sopenharmony_ci sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr 2290f66f451Sopenharmony_ci if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range); 2300f66f451Sopenharmony_ci } 2310f66f451Sopenharmony_ci if (!TT.pairs) error_exit("no selections"); 2320f66f451Sopenharmony_ci 2330f66f451Sopenharmony_ci // Sort and collate selections 2340f66f451Sopenharmony_ci if (!(toys.optflags&FLAG_D)) { 2350f66f451Sopenharmony_ci int from, to; 2360f66f451Sopenharmony_ci unsigned *pairs = (void *)toybuf; 2370f66f451Sopenharmony_ci 2380f66f451Sopenharmony_ci qsort(toybuf, TT.pairs, 8, (void *)compar); 2390f66f451Sopenharmony_ci for (to = 0, from = 2; from/2 < TT.pairs; from += 2) { 2400f66f451Sopenharmony_ci if (pairs[from] > pairs[to+1]) { 2410f66f451Sopenharmony_ci to += 2; 2420f66f451Sopenharmony_ci memcpy(pairs+to, pairs+from, 2*sizeof(unsigned)); 2430f66f451Sopenharmony_ci } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1]; 2440f66f451Sopenharmony_ci } 2450f66f451Sopenharmony_ci TT.pairs = (to/2)+1; 2460f66f451Sopenharmony_ci } 2470f66f451Sopenharmony_ci 2480f66f451Sopenharmony_ci // For each argument, loop through lines of file and call cut_line() on each 2490f66f451Sopenharmony_ci loopfiles_lines(toys.optargs, cut_line); 2500f66f451Sopenharmony_ci} 251