1/* deflate.c - deflate/inflate code for gzip and friends 2 * 3 * Copyright 2014 Rob Landley <rob@landley.net> 4 * 5 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip) 6 * LSB 4.1 has gzip, gunzip, and zcat 7 * 8 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout 9 */ 10 11#include "toys.h" 12 13struct deflate { 14 // Huffman codes: base offset and extra bits tables (length and distance) 15 char lenbits[29], distbits[30]; 16 unsigned short lenbase[29], distbase[30]; 17 void *fixdisthuff, *fixlithuff; 18 19 // CRC 20 void (*crcfunc)(struct deflate *dd, char *data, unsigned len); 21 unsigned crctable[256], crc; 22 23 24 // Tables only used for deflation 25 unsigned short *hashhead, *hashchain; 26 27 // Compressed data buffer (extra space malloced at end) 28 unsigned pos, len; 29 int infd, outfd; 30 char data[]; 31}; 32 33// little endian bit buffer 34struct bitbuf { 35 int fd, bitpos, len, max; 36 char buf[]; 37}; 38 39// malloc a struct bitbuf 40struct bitbuf *bitbuf_init(int fd, int size) 41{ 42 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size); 43 44 bb->max = size; 45 bb->fd = fd; 46 47 return bb; 48} 49 50// Advance bitpos without the overhead of recording bits 51// Loads more data when input buffer empty 52// call with 0 to just load data, returns 0 at EOF 53int bitbuf_skip(struct bitbuf *bb, int bits) 54{ 55 int pos = bb->bitpos + bits + (bits<0), len; 56 57 while (pos >= (len = bb->len<<3)) { 58 pos -= len; 59 if (1 > (bb->len = read(bb->fd, bb->buf, bb->max))) { 60 if (!bb->len && !bits) break; 61 error_exit("inflate EOF"); 62 } 63 } 64 bb->bitpos = pos; 65 66 return pos<len; 67} 68 69// Optimized single bit inlined version 70static inline int bitbuf_bit(struct bitbuf *bb) 71{ 72 int bufpos = bb->bitpos>>3; 73 74 if (bufpos == bb->len) { 75 bitbuf_skip(bb, -1); 76 bufpos = 0; 77 } 78 79 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1; 80} 81 82// Fetch the next X bits from the bitbuf, little endian 83unsigned bitbuf_get(struct bitbuf *bb, int bits) 84{ 85 int result = 0, offset = 0; 86 87 while (bits) { 88 int click = bb->bitpos >> 3, blow, blen; 89 90 // Load more data if buffer empty 91 if (click == bb->len) { 92 bitbuf_skip(bb, -1); 93 click = 0; 94 } 95 96 // grab bits from next byte 97 blow = bb->bitpos & 7; 98 blen = 8-blow; 99 if (blen > bits) blen = bits; 100 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset; 101 offset += blen; 102 bits -= blen; 103 bb->bitpos += blen; 104 } 105 106 return result; 107} 108 109void bitbuf_flush(struct bitbuf *bb) 110{ 111 if (!bb->bitpos) return; 112 113 xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3); 114 memset(bb->buf, 0, bb->max); 115 bb->bitpos = 0; 116} 117 118void bitbuf_put(struct bitbuf *bb, int data, int len) 119{ 120 while (len) { 121 int click = bb->bitpos >> 3, blow, blen; 122 123 // Flush buffer if necessary 124 if (click == bb->max) { 125 bitbuf_flush(bb); 126 click = 0; 127 } 128 blow = bb->bitpos & 7; 129 blen = 8-blow; 130 if (blen > len) blen = len; 131 bb->buf[click] |= data << blow; 132 bb->bitpos += blen; 133 data >>= blen; 134 len -= blen; 135 } 136} 137 138static void output_byte(struct deflate *dd, char sym) 139{ 140 int pos = dd->pos++ & 32767; 141 142 dd->data[pos] = sym; 143 144 if (pos == 32767) { 145 xwrite(dd->outfd, dd->data, 32768); 146 if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768); 147 } 148} 149 150// Huffman coding uses bits to traverse a binary tree to a leaf node, 151// By placing frequently occurring symbols at shorter paths, frequently 152// used symbols may be represented in fewer bits than uncommon symbols. 153// (length[0] isn't used but code's clearer if it's there.) 154 155struct huff { 156 unsigned short length[16]; // How many symbols have this bit length? 157 unsigned short symbol[288]; // sorted by bit length, then ascending order 158}; 159 160// Create simple huffman tree from array of bit lengths. 161 162// The symbols in the huffman trees are sorted (first by bit length 163// of the code to reach them, then by symbol number). This means that given 164// the bit length of each symbol, we can construct a unique tree. 165static void len2huff(struct huff *huff, char bitlen[], int len) 166{ 167 int offset[16]; 168 int i; 169 170 // Count number of codes at each bit length 171 memset(huff, 0, sizeof(struct huff)); 172 for (i = 0; i<len; i++) huff->length[bitlen[i]]++; 173 174 // Sort symbols by bit length, then symbol. Get list of starting positions 175 // for each group, then write each symbol to next position within its group. 176 *huff->length = *offset = 0; 177 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1]; 178 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i; 179} 180 181// Fetch and decode next huffman coded symbol from bitbuf. 182// This takes advantage of the sorting to navigate the tree as an array: 183// each time we fetch a bit we have all the codes at that bit level in 184// order with no gaps. 185static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff) 186{ 187 unsigned short *length = huff->length; 188 int start = 0, offset = 0; 189 190 // Traverse through the bit lengths until our code is in this range 191 for (;;) { 192 offset = (offset << 1) | bitbuf_bit(bb); 193 start += *++length; 194 if ((offset -= *length) < 0) break; 195 if ((length - huff->length) & 16) error_exit("bad symbol"); 196 } 197 198 return huff->symbol[start + offset]; 199} 200 201// Decompress deflated data from bitbuf to dd->outfd. 202static void inflate(struct deflate *dd, struct bitbuf *bb) 203{ 204 dd->crc = ~0; 205 206 // repeat until spanked 207 for (;;) { 208 int final, type; 209 210 final = bitbuf_get(bb, 1); 211 type = bitbuf_get(bb, 2); 212 213 if (type == 3) error_exit("bad type"); 214 215 // Uncompressed block? 216 if (!type) { 217 int len, nlen; 218 219 // Align to byte, read length 220 bitbuf_skip(bb, (8-bb->bitpos)&7); 221 len = bitbuf_get(bb, 16); 222 nlen = bitbuf_get(bb, 16); 223 if (len != (0xffff & ~nlen)) error_exit("bad len"); 224 225 // Dump literal output data 226 while (len) { 227 int pos = bb->bitpos >> 3, bblen = bb->len - pos; 228 char *p = bb->buf+pos; 229 230 // dump bytes until done or end of current bitbuf contents 231 if (bblen > len) bblen = len; 232 pos = bblen; 233 while (pos--) output_byte(dd, *(p++)); 234 bitbuf_skip(bb, bblen << 3); 235 len -= bblen; 236 } 237 238 // Compressed block 239 } else { 240 struct huff *disthuff, *lithuff; 241 242 // Dynamic huffman codes? 243 if (type == 2) { 244 struct huff *h2 = ((struct huff *)libbuf)+1; 245 int i, litlen, distlen, hufflen; 246 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b" 247 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits; 248 249 // The huffman trees are stored as a series of bit lengths 250 litlen = bitbuf_get(bb, 5)+257; // max 288 251 distlen = bitbuf_get(bb, 5)+1; // max 32 252 hufflen = bitbuf_get(bb, 4)+4; // max 19 253 254 // The literal and distance codes are themselves compressed, in 255 // a complicated way: an array of bit lengths (hufflen many 256 // entries, each 3 bits) is used to fill out an array of 19 entries 257 // in a magic order, leaving the rest 0. Then make a tree out of it: 258 memset(bits = libbuf+1, 0, 19); 259 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3); 260 len2huff(h2, bits, 19); 261 262 // Use that tree to read in the literal and distance bit lengths 263 for (i = 0; i < litlen + distlen;) { 264 int sym = huff_and_puff(bb, h2); 265 266 // 0-15 are literals, 16 = repeat previous code 3-6 times, 267 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit) 268 if (sym < 16) bits[i++] = sym; 269 else { 270 int len = sym & 2; 271 272 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2); 273 memset(bits+i, bits[i-1] * !(sym&3), len); 274 i += len; 275 } 276 } 277 if (i > litlen+distlen) error_exit("bad tree"); 278 279 len2huff(lithuff = h2, bits, litlen); 280 len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen); 281 282 // Static huffman codes 283 } else { 284 lithuff = dd->fixlithuff; 285 disthuff = dd->fixdisthuff; 286 } 287 288 // Use huffman tables to decode block of compressed symbols 289 for (;;) { 290 int sym = huff_and_puff(bb, lithuff); 291 292 // Literal? 293 if (sym < 256) output_byte(dd, sym); 294 295 // Copy range? 296 else if (sym > 256) { 297 int len, dist; 298 299 sym -= 257; 300 len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]); 301 sym = huff_and_puff(bb, disthuff); 302 dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]); 303 sym = dd->pos & 32767; 304 305 while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]); 306 307 // End of block 308 } else break; 309 } 310 } 311 312 // Was that the last block? 313 if (final) break; 314 } 315 316 if (dd->pos & 32767) { 317 xwrite(dd->outfd, dd->data, dd->pos&32767); 318 if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767); 319 } 320} 321 322// Deflate from dd->infd to bitbuf 323// For deflate, dd->len = input read, dd->pos = input consumed 324static void deflate(struct deflate *dd, struct bitbuf *bb) 325{ 326 char *data = dd->data; 327 int len, final = 0; 328 329 dd->crc = ~0; 330 331 while (!final) { 332 // Read next half-window of data if we haven't hit EOF yet. 333 len = readall(dd->infd, data+(dd->len&32768), 32768); 334 if (len < 0) perror_exit("read"); // todo: add filename 335 if (len != 32768) final++; 336 if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len); 337 // dd->len += len; crcfunc advances len TODO 338 339 // store block as literal 340 bitbuf_put(bb, final, 1); 341 bitbuf_put(bb, 0, 1); 342 343 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 344 bitbuf_put(bb, len, 16); 345 bitbuf_put(bb, 0xffff & ~len, 16); 346 347 // repeat until spanked 348 while (dd->pos != dd->len) { 349 unsigned pos = dd->pos&65535; 350 351 bitbuf_put(bb, data[pos], 8); 352 353 // need to refill buffer? 354 if (!(32767 & ++dd->pos) && !final) break; 355 } 356 } 357 bitbuf_flush(bb); 358} 359 360// Allocate memory for deflate/inflate. 361static struct deflate *init_deflate(int compress) 362{ 363 int i, n = 1; 364 struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1)); 365 366 memset(dd, 0, sizeof(struct deflate)); 367 // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain 368 if (compress) { 369 dd->hashhead = (unsigned short *)(dd->data+65536); 370 dd->hashchain = (unsigned short *)(dd->data+65536+32768); 371 } 372 373 // Calculate lenbits, lenbase, distbits, distbase 374 *dd->lenbase = 3; 375 for (i = 0; i<sizeof(dd->lenbits)-1; i++) { 376 if (i>4) { 377 if (!(i&3)) { 378 dd->lenbits[i]++; 379 n <<= 1; 380 } 381 if (i == 27) n--; 382 else dd->lenbits[i+1] = dd->lenbits[i]; 383 } 384 dd->lenbase[i+1] = n + dd->lenbase[i]; 385 } 386 n = 0; 387 for (i = 0; i<sizeof(dd->distbits); i++) { 388 dd->distbase[i] = 1<<n; 389 if (i) dd->distbase[i] += dd->distbase[i-1]; 390 if (i>3 && !(i&1)) n++; 391 dd->distbits[i] = n; 392 } 393 394// TODO layout and lifetime of this? 395 // Init fixed huffman tables 396 for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279); 397 len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288); 398 memset(libbuf, 5, 30); 399 len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30); 400 401 return dd; 402} 403 404// Return true/false whether we consumed a gzip header. 405static int is_gzip(struct bitbuf *bb) 406{ 407 int flags; 408 409 // Confirm signature 410 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31) 411 return 0; 412 bitbuf_skip(bb, 6*8); 413 414 // Skip extra, name, comment, header CRC fields 415 if (flags & 4) bitbuf_skip(bb, 16); 416 if (flags & 8) while (bitbuf_get(bb, 8)); 417 if (flags & 16) while (bitbuf_get(bb, 8)); 418 if (flags & 2) bitbuf_skip(bb, 16); 419 420 return 1; 421} 422 423void gzip_crc(struct deflate *dd, char *data, unsigned len) 424{ 425 int i; 426 unsigned crc, *crc_table = dd->crctable; 427 428 crc = dd->crc; 429 for (i = 0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8); 430 dd->crc = crc; 431 dd->len += len; 432} 433 434/* 435// Start with crc = 1, or pass in last crc to append more data 436unsigned adler32(char *buf, unsigned len, unsigned crc) 437{ 438 unsigned aa = crc&((1<<16)-1), bb = crc>>16; 439 while (len--) { 440 aa = (aa+*buf)%65521; 441 bb = (bb+aa)%65521; 442 } 443 return (bb<16)+aa; 444} 445*/ 446 447long long gzip_fd(int infd, int outfd) 448{ 449 struct bitbuf *bb = bitbuf_init(outfd, 4096); 450 struct deflate *dd = init_deflate(1); 451 long long rc; 452 453 // Header from RFC 1952 section 2.2: 454 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none), 455 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression), 456 // Operating System (FF=unknown) 457 458 dd->infd = infd; 459 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10); 460 461 // Little endian crc table 462 crc_init(dd->crctable, 1); 463 dd->crcfunc = gzip_crc; 464 465 deflate(dd, bb); 466 467 // tail: crc32, len32 468 469 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 470 bitbuf_put(bb, ~dd->crc, 32); 471 bitbuf_put(bb, dd->len, 32); 472 rc = dd->len; 473 474 bitbuf_flush(bb); 475 free(bb); 476 free(dd); 477 478 return rc; 479} 480 481long long gunzip_fd(int infd, int outfd) 482{ 483 struct bitbuf *bb = bitbuf_init(infd, 4096); 484 struct deflate *dd = init_deflate(0); 485 long long rc = 0; 486 487 // Little endian crc table 488 crc_init(dd->crctable, 1); 489 dd->crcfunc = gzip_crc; 490 dd->outfd = outfd; 491 492 do { 493 if (!is_gzip(bb)) error_exit("not gzip"); 494 495 inflate(dd, bb); 496 497 // tail: crc32, len32 498 bitbuf_skip(bb, (8-bb->bitpos)&7); 499 if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32)) 500 error_exit("bad crc"); 501 rc += dd->len; 502 503 bitbuf_skip(bb, (8-bb->bitpos)&7); 504 dd->pos = dd->len = 0; 505 } while (bitbuf_skip(bb, 0)); 506 free(bb); 507 free(dd); 508 509 return rc; 510} 511