xref: /third_party/toybox/lib/deflate.c (revision 0f66f451)
1/* deflate.c - deflate/inflate code for gzip and friends
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
6 * LSB 4.1 has gzip, gunzip, and zcat
7 *
8 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
9 */
10
11#include "toys.h"
12
13struct deflate {
14  // Huffman codes: base offset and extra bits tables (length and distance)
15  char lenbits[29], distbits[30];
16  unsigned short lenbase[29], distbase[30];
17  void *fixdisthuff, *fixlithuff;
18
19  // CRC
20  void (*crcfunc)(struct deflate *dd, char *data, unsigned len);
21  unsigned crctable[256], crc;
22
23
24  // Tables only used for deflation
25  unsigned short *hashhead, *hashchain;
26
27  // Compressed data buffer (extra space malloced at end)
28  unsigned pos, len;
29  int infd, outfd;
30  char data[];
31};
32
33// little endian bit buffer
34struct bitbuf {
35  int fd, bitpos, len, max;
36  char buf[];
37};
38
39// malloc a struct bitbuf
40struct bitbuf *bitbuf_init(int fd, int size)
41{
42  struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
43
44  bb->max = size;
45  bb->fd = fd;
46
47  return bb;
48}
49
50// Advance bitpos without the overhead of recording bits
51// Loads more data when input buffer empty
52// call with 0 to just load data, returns 0 at EOF
53int bitbuf_skip(struct bitbuf *bb, int bits)
54{
55  int pos = bb->bitpos + bits + (bits<0), len;
56
57  while (pos >= (len = bb->len<<3)) {
58    pos -= len;
59    if (1 > (bb->len = read(bb->fd, bb->buf, bb->max))) {
60      if (!bb->len && !bits) break;
61      error_exit("inflate EOF");
62    }
63  }
64  bb->bitpos = pos;
65
66  return pos<len;
67}
68
69// Optimized single bit inlined version
70static inline int bitbuf_bit(struct bitbuf *bb)
71{
72  int bufpos = bb->bitpos>>3;
73
74  if (bufpos == bb->len) {
75    bitbuf_skip(bb, -1);
76    bufpos = 0;
77  }
78
79  return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
80}
81
82// Fetch the next X bits from the bitbuf, little endian
83unsigned bitbuf_get(struct bitbuf *bb, int bits)
84{
85  int result = 0, offset = 0;
86
87  while (bits) {
88    int click = bb->bitpos >> 3, blow, blen;
89
90    // Load more data if buffer empty
91    if (click == bb->len) {
92      bitbuf_skip(bb, -1);
93      click = 0;
94    }
95
96    // grab bits from next byte
97    blow = bb->bitpos & 7;
98    blen = 8-blow;
99    if (blen > bits) blen = bits;
100    result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
101    offset += blen;
102    bits -= blen;
103    bb->bitpos += blen;
104  }
105
106  return result;
107}
108
109void bitbuf_flush(struct bitbuf *bb)
110{
111  if (!bb->bitpos) return;
112
113  xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
114  memset(bb->buf, 0, bb->max);
115  bb->bitpos = 0;
116}
117
118void bitbuf_put(struct bitbuf *bb, int data, int len)
119{
120  while (len) {
121    int click = bb->bitpos >> 3, blow, blen;
122
123    // Flush buffer if necessary
124    if (click == bb->max) {
125      bitbuf_flush(bb);
126      click = 0;
127    }
128    blow = bb->bitpos & 7;
129    blen = 8-blow;
130    if (blen > len) blen = len;
131    bb->buf[click] |= data << blow;
132    bb->bitpos += blen;
133    data >>= blen;
134    len -= blen;
135  }
136}
137
138static void output_byte(struct deflate *dd, char sym)
139{
140  int pos = dd->pos++ & 32767;
141
142  dd->data[pos] = sym;
143
144  if (pos == 32767) {
145    xwrite(dd->outfd, dd->data, 32768);
146    if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768);
147  }
148}
149
150// Huffman coding uses bits to traverse a binary tree to a leaf node,
151// By placing frequently occurring symbols at shorter paths, frequently
152// used symbols may be represented in fewer bits than uncommon symbols.
153// (length[0] isn't used but code's clearer if it's there.)
154
155struct huff {
156  unsigned short length[16];  // How many symbols have this bit length?
157  unsigned short symbol[288]; // sorted by bit length, then ascending order
158};
159
160// Create simple huffman tree from array of bit lengths.
161
162// The symbols in the huffman trees are sorted (first by bit length
163// of the code to reach them, then by symbol number). This means that given
164// the bit length of each symbol, we can construct a unique tree.
165static void len2huff(struct huff *huff, char bitlen[], int len)
166{
167  int offset[16];
168  int i;
169
170  // Count number of codes at each bit length
171  memset(huff, 0, sizeof(struct huff));
172  for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
173
174  // Sort symbols by bit length, then symbol. Get list of starting positions
175  // for each group, then write each symbol to next position within its group.
176  *huff->length = *offset = 0;
177  for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
178  for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
179}
180
181// Fetch and decode next huffman coded symbol from bitbuf.
182// This takes advantage of the sorting to navigate the tree as an array:
183// each time we fetch a bit we have all the codes at that bit level in
184// order with no gaps.
185static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
186{
187  unsigned short *length = huff->length;
188  int start = 0, offset = 0;
189
190  // Traverse through the bit lengths until our code is in this range
191  for (;;) {
192    offset = (offset << 1) | bitbuf_bit(bb);
193    start += *++length;
194    if ((offset -= *length) < 0) break;
195    if ((length - huff->length) & 16) error_exit("bad symbol");
196  }
197
198  return huff->symbol[start + offset];
199}
200
201// Decompress deflated data from bitbuf to dd->outfd.
202static void inflate(struct deflate *dd, struct bitbuf *bb)
203{
204  dd->crc = ~0;
205
206  // repeat until spanked
207  for (;;) {
208    int final, type;
209
210    final = bitbuf_get(bb, 1);
211    type = bitbuf_get(bb, 2);
212
213    if (type == 3) error_exit("bad type");
214
215    // Uncompressed block?
216    if (!type) {
217      int len, nlen;
218
219      // Align to byte, read length
220      bitbuf_skip(bb, (8-bb->bitpos)&7);
221      len = bitbuf_get(bb, 16);
222      nlen = bitbuf_get(bb, 16);
223      if (len != (0xffff & ~nlen)) error_exit("bad len");
224
225      // Dump literal output data
226      while (len) {
227        int pos = bb->bitpos >> 3, bblen = bb->len - pos;
228        char *p = bb->buf+pos;
229
230        // dump bytes until done or end of current bitbuf contents
231        if (bblen > len) bblen = len;
232        pos = bblen;
233        while (pos--) output_byte(dd, *(p++));
234        bitbuf_skip(bb, bblen << 3);
235        len -= bblen;
236      }
237
238    // Compressed block
239    } else {
240      struct huff *disthuff, *lithuff;
241
242      // Dynamic huffman codes?
243      if (type == 2) {
244        struct huff *h2 = ((struct huff *)libbuf)+1;
245        int i, litlen, distlen, hufflen;
246        char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
247                              "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
248
249        // The huffman trees are stored as a series of bit lengths
250        litlen = bitbuf_get(bb, 5)+257;  // max 288
251        distlen = bitbuf_get(bb, 5)+1;   // max 32
252        hufflen = bitbuf_get(bb, 4)+4;   // max 19
253
254        // The literal and distance codes are themselves compressed, in
255        // a complicated way: an array of bit lengths (hufflen many
256        // entries, each 3 bits) is used to fill out an array of 19 entries
257        // in a magic order, leaving the rest 0. Then make a tree out of it:
258        memset(bits = libbuf+1, 0, 19);
259        for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
260        len2huff(h2, bits, 19);
261
262        // Use that tree to read in the literal and distance bit lengths
263        for (i = 0; i < litlen + distlen;) {
264          int sym = huff_and_puff(bb, h2);
265
266          // 0-15 are literals, 16 = repeat previous code 3-6 times,
267          // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
268          if (sym < 16) bits[i++] = sym;
269          else {
270            int len = sym & 2;
271
272            len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
273            memset(bits+i, bits[i-1] * !(sym&3), len);
274            i += len;
275          }
276        }
277        if (i > litlen+distlen) error_exit("bad tree");
278
279        len2huff(lithuff = h2, bits, litlen);
280        len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen);
281
282      // Static huffman codes
283      } else {
284        lithuff = dd->fixlithuff;
285        disthuff = dd->fixdisthuff;
286      }
287
288      // Use huffman tables to decode block of compressed symbols
289      for (;;) {
290        int sym = huff_and_puff(bb, lithuff);
291
292        // Literal?
293        if (sym < 256) output_byte(dd, sym);
294
295        // Copy range?
296        else if (sym > 256) {
297          int len, dist;
298
299          sym -= 257;
300          len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
301          sym = huff_and_puff(bb, disthuff);
302          dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
303          sym = dd->pos & 32767;
304
305          while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
306
307        // End of block
308        } else break;
309      }
310    }
311
312    // Was that the last block?
313    if (final) break;
314  }
315
316  if (dd->pos & 32767) {
317    xwrite(dd->outfd, dd->data, dd->pos&32767);
318    if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767);
319  }
320}
321
322// Deflate from dd->infd to bitbuf
323// For deflate, dd->len = input read, dd->pos = input consumed
324static void deflate(struct deflate *dd, struct bitbuf *bb)
325{
326  char *data = dd->data;
327  int len, final = 0;
328
329  dd->crc = ~0;
330
331  while (!final) {
332    // Read next half-window of data if we haven't hit EOF yet.
333    len = readall(dd->infd, data+(dd->len&32768), 32768);
334    if (len < 0) perror_exit("read"); // todo: add filename
335    if (len != 32768) final++;
336    if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
337    // dd->len += len;  crcfunc advances len TODO
338
339    // store block as literal
340    bitbuf_put(bb, final, 1);
341    bitbuf_put(bb, 0, 1);
342
343    bitbuf_put(bb, 0, (8-bb->bitpos)&7);
344    bitbuf_put(bb, len, 16);
345    bitbuf_put(bb, 0xffff & ~len, 16);
346
347    // repeat until spanked
348    while (dd->pos != dd->len) {
349      unsigned pos = dd->pos&65535;
350
351      bitbuf_put(bb, data[pos], 8);
352
353      // need to refill buffer?
354      if (!(32767 & ++dd->pos) && !final) break;
355    }
356  }
357  bitbuf_flush(bb);
358}
359
360// Allocate memory for deflate/inflate.
361static struct deflate *init_deflate(int compress)
362{
363  int i, n = 1;
364  struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
365
366  memset(dd, 0, sizeof(struct deflate));
367  // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
368  if (compress) {
369    dd->hashhead = (unsigned short *)(dd->data+65536);
370    dd->hashchain = (unsigned short *)(dd->data+65536+32768);
371  }
372
373  // Calculate lenbits, lenbase, distbits, distbase
374  *dd->lenbase = 3;
375  for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
376    if (i>4) {
377      if (!(i&3)) {
378        dd->lenbits[i]++;
379        n <<= 1;
380      }
381      if (i == 27) n--;
382      else dd->lenbits[i+1] = dd->lenbits[i];
383    }
384    dd->lenbase[i+1] = n + dd->lenbase[i];
385  }
386  n = 0;
387  for (i = 0; i<sizeof(dd->distbits); i++) {
388    dd->distbase[i] = 1<<n;
389    if (i) dd->distbase[i] += dd->distbase[i-1];
390    if (i>3 && !(i&1)) n++;
391    dd->distbits[i] = n;
392  }
393
394// TODO layout and lifetime of this?
395  // Init fixed huffman tables
396  for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
397  len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288);
398  memset(libbuf, 5, 30);
399  len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30);
400
401  return dd;
402}
403
404// Return true/false whether we consumed a gzip header.
405static int is_gzip(struct bitbuf *bb)
406{
407  int flags;
408
409  // Confirm signature
410  if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
411    return 0;
412  bitbuf_skip(bb, 6*8);
413
414  // Skip extra, name, comment, header CRC fields
415  if (flags & 4) bitbuf_skip(bb, 16);
416  if (flags & 8) while (bitbuf_get(bb, 8));
417  if (flags & 16) while (bitbuf_get(bb, 8));
418  if (flags & 2) bitbuf_skip(bb, 16);
419
420  return 1;
421}
422
423void gzip_crc(struct deflate *dd, char *data, unsigned len)
424{
425  int i;
426  unsigned crc, *crc_table = dd->crctable;
427
428  crc = dd->crc;
429  for (i = 0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
430  dd->crc = crc;
431  dd->len += len;
432}
433
434/*
435// Start with crc = 1, or pass in last crc to append more data
436unsigned adler32(char *buf, unsigned len, unsigned crc)
437{
438  unsigned aa = crc&((1<<16)-1), bb = crc>>16;
439  while (len--) {
440    aa = (aa+*buf)%65521;
441    bb = (bb+aa)%65521;
442  }
443  return (bb<16)+aa;
444}
445*/
446
447long long gzip_fd(int infd, int outfd)
448{
449  struct bitbuf *bb = bitbuf_init(outfd, 4096);
450  struct deflate *dd = init_deflate(1);
451  long long rc;
452
453  // Header from RFC 1952 section 2.2:
454  // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
455  // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
456  // Operating System (FF=unknown)
457
458  dd->infd = infd;
459  xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
460
461  // Little endian crc table
462  crc_init(dd->crctable, 1);
463  dd->crcfunc = gzip_crc;
464
465  deflate(dd, bb);
466
467  // tail: crc32, len32
468
469  bitbuf_put(bb, 0, (8-bb->bitpos)&7);
470  bitbuf_put(bb, ~dd->crc, 32);
471  bitbuf_put(bb, dd->len, 32);
472  rc = dd->len;
473
474  bitbuf_flush(bb);
475  free(bb);
476  free(dd);
477
478  return rc;
479}
480
481long long gunzip_fd(int infd, int outfd)
482{
483  struct bitbuf *bb = bitbuf_init(infd, 4096);
484  struct deflate *dd = init_deflate(0);
485  long long rc = 0;
486
487  // Little endian crc table
488  crc_init(dd->crctable, 1);
489  dd->crcfunc = gzip_crc;
490  dd->outfd = outfd;
491
492  do {
493    if (!is_gzip(bb)) error_exit("not gzip");
494
495    inflate(dd, bb);
496
497    // tail: crc32, len32
498    bitbuf_skip(bb, (8-bb->bitpos)&7);
499    if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
500      error_exit("bad crc");
501    rc += dd->len;
502
503    bitbuf_skip(bb, (8-bb->bitpos)&7);
504    dd->pos = dd->len = 0;
505  } while (bitbuf_skip(bb, 0));
506  free(bb);
507  free(dd);
508
509  return rc;
510}
511