1/* cut.c - print selected ranges from a file 2 * 3 * Copyright 2016 Rob Landley <rob@landley.net> 4 * 5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html 6 * 7 * Deviations from posix: added -DF. We can only accept 512 selections, and 8 * "-" counts as start to end. Using spaces to separate a comma-separated list 9 * is silly and inconsistent with dd, ps, cp, and mount. 10 * 11 * todo: -n, -s with -c 12 13USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN)) 14 15config CUT 16 bool "cut" 17 default y 18 help 19 usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...] 20 21 Print selected parts of lines from each FILE to standard output. 22 23 Each selection LIST is comma separated, either numbers (counting from 1) 24 or dash separated ranges (inclusive, with X- meaning to end of line and -X 25 from start). By default selection ranges are sorted and collated, use -D 26 to prevent that. 27 28 -b Select bytes 29 -c Select UTF-8 characters 30 -C Select unicode columns 31 -d Use DELIM (default is TAB for -f, run of whitespace for -F) 32 -D Don't sort/collate selections or match -fF lines without delimiter 33 -f Select fields (words) separated by single DELIM character 34 -F Select fields separated by DELIM regex 35 -O Output delimiter (default one space for -F, input delim for -f) 36 -s Skip lines without delimiters 37*/ 38#define FOR_cut 39#include "toys.h" 40 41GLOBALS( 42 char *d, *O; 43 struct arg_list *select[5]; // we treat them the same, so loop through 44 45 int pairs; 46 regex_t reg; 47) 48 49// Return number of bytes to start of first column fitting in columns 50// invalid sequences are skipped/ignored 51int unicolumns(char *start, unsigned columns) 52{ 53 int i, j = 0; 54 unsigned wc; 55 char *s = start, *ss = start; 56 57 // Skip start, rounding down if we hit a multicolumn char 58 while (j<columns && (i = utf8towc(&wc, s, 4))) { 59 if (i<0) s++; 60 else { 61 s += i; 62 if (0<(i = wcwidth(wc))) { 63 if ((j += i)>columns) break; 64 ss = s; 65 } 66 } 67 } 68 69 return ss-start; 70} 71 72// Apply selections to an input line, producing output 73static void cut_line(char **pline, long len) 74{ 75 unsigned *pairs = (void *)toybuf; 76 char *line; 77 int i, j; 78 79 if (!pline) return; 80 line = *pline; 81 if (len && line[len-1]=='\n') line[--len] = 0; 82 83 // Loop through selections 84 for (i=0; i<TT.pairs; i++) { 85 unsigned start = pairs[2*i], end = pairs[(2*i)+1], count; 86 char *s = line, *ss; 87 88 // input: start/end position, count=difference between them 89 // output: s = start of string, len = bytes to output 90 91 if (start) start--; 92 if (start>=len) continue; 93 if (!end || end>len) end = len; 94 count = end-start; 95 96 // Find start and end of output string for the relevant selection type 97 if (toys.optflags&FLAG_b) s += start; 98 else if (toys.optflags&FLAG_C) { 99 // crunch_str() currently assumes that combining characters get 100 // escaped, to provide an unambiguous visual representation. 101 // This assumes the input string is null terminated. 102 //if (start) crunch_str(&s, start, 0, 0, 0); 103 //if (!*s) continue; 104 //start = s-line; 105 //ss = s; 106 //crunch_str(&ss, count, 0, 0, 0); 107 //count = ss-s; 108 109 s += unicolumns(s, start); 110 count = unicolumns(s, end-start); 111 } else if (toys.optflags&FLAG_c) { 112 unsigned wc; 113 char *sss; 114 115 // Find start 116 ss = line+len; 117 while (start && s<ss) { 118 if (0<=(j = utf8towc(&wc, s, len))) start--; 119 s += (j<1) ? 1 : j; 120 } 121 if (s == ss) continue; 122 123 // Find end 124 end = count; 125 sss = s; 126 while (end && sss<ss) { 127 if (0<=(j = utf8towc(&wc, sss, len))) end--; 128 sss += (j<1) ? 1 : j; 129 } 130 count = sss-s; 131 } else { 132 regmatch_t match; 133 134 // Loop through skipping appropriate number of fields 135 for (j = 0; j<2; j++) { 136 ss = s; 137 if (j) start = count; 138 else end = start; 139 while (*ss && start) { 140 if (toys.optflags&FLAG_f) { 141 if (!strchr(TT.d, *ss++)) continue; 142 if (!--start && j) ss--; 143 } else { 144 if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) { 145 ss = line+len; 146 continue; 147 } 148 if (!match.rm_eo) break; // zero length match == no delimiter 149 ss += (!--start && j) ? match.rm_so : match.rm_eo; 150 } 151 } 152 if (!j && !*(s = ss)) break; 153 } 154 155 // If we never encountered even one separator, print whole line (posix!) 156 if (!j && end == start) { 157 if (toys.optflags&FLAG_D) break; 158 if (toys.optflags&FLAG_s) return; 159 fwrite(line, len, 1, stdout); 160 break; 161 } else if (!*s) continue; 162 count = ss-s; 163 } 164 if (i && TT.O) fputs(TT.O, stdout); 165 fwrite(s, count, 1, stdout); 166 } 167 xputc('\n'); 168} 169 170static int compar(unsigned *a, unsigned *b) 171{ 172 if (*a<*b) return -1; 173 if (*a>*b) return 1; 174 if (a[1]<b[1]) return -1; 175 if (a[1]>b[1]) return 1; 176 177 return 0; 178} 179 180// parse A or A-B or A- or -B 181static char *get_range(void *data, char *str, int len) 182{ 183 char *end = str; 184 unsigned *pairs = (void *)toybuf, i; 185 186 // Using toybuf[] to store ranges means we can have 512 selections max. 187 if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit"); 188 pairs += 2*TT.pairs++; 189 190 pairs[1] = UINT_MAX; 191 for (i = 0; ;i++) { 192 if (i==2) return end; 193 if (isdigit(*end)) { 194 long long ll = estrtol(end, &end, 10); 195 196 if (ll<1 || ll>UINT_MAX || errno) return end; 197 pairs[i] = ll; 198 } 199 if (*end++ != '-') break; 200 } 201 if (!i) pairs[1] = pairs[0]; 202 if ((end-str)<len) return end; 203 if (pairs[0]>pairs[1]) return str; 204 205 // No error 206 return 0; 207} 208 209void cut_main(void) 210{ 211 int i; 212 char buf[8]; 213 214 // Parse command line arguments 215 if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s) 216 error_exit("-s needs -Ff"); 217 if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d) 218 error_exit("-d needs -Ff"); 219 if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t"; 220 if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED); 221 if (!TT.O) { 222 if (toys.optflags&FLAG_F) TT.O = " "; 223 else if (toys.optflags&FLAG_f) TT.O = TT.d; 224 } 225 226 // Parse ranges, which are attached to a selection type (only one can be set) 227 for (i = 0; i<ARRAY_LEN(TT.select); i++) { 228 sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr 229 if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range); 230 } 231 if (!TT.pairs) error_exit("no selections"); 232 233 // Sort and collate selections 234 if (!(toys.optflags&FLAG_D)) { 235 int from, to; 236 unsigned *pairs = (void *)toybuf; 237 238 qsort(toybuf, TT.pairs, 8, (void *)compar); 239 for (to = 0, from = 2; from/2 < TT.pairs; from += 2) { 240 if (pairs[from] > pairs[to+1]) { 241 to += 2; 242 memcpy(pairs+to, pairs+from, 2*sizeof(unsigned)); 243 } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1]; 244 } 245 TT.pairs = (to/2)+1; 246 } 247 248 // For each argument, loop through lines of file and call cut_line() on each 249 loopfiles_lines(toys.optargs, cut_line); 250} 251