1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31 
32  /*
33   A CRC of a message is computed on N braids of words in the message, where
34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
35   running sparse CRCs are calculated respectively on each braid, at these
36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37   This is done starting at a word boundary, and continues until as many blocks
38   of N * W bytes as are available have been processed. The results are combined
39   into a single CRC at the end. For this code, N must be in the range 1..6 and
40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
41   more #if blocks, extending the patterns apparent in the code. In addition,
42   crc32.h would need to be regenerated, if the maximum N value is increased.
43 
44   N and W are chosen empirically by benchmarking the execution time on a given
45   processor. The choices for N and W below were based on testing on Intel Kaby
46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49   They were all tested with either gcc or clang, all using the -O3 optimization
50   level. Your mileage may vary.
51  */
52 
53 /* Define N */
54 #ifdef Z_TESTN
55 #  define N Z_TESTN
56 #else
57 #  define N 5
58 #endif
59 #if N < 1 || N > 6
60 #  error N must be in 1..6
61 #endif
62 
63 /*
64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66   that bytes are eight bits.
67  */
68 
69 /*
70   Define W and the associated z_word_t type. If W is not defined, then a
71   braided calculation is not used, and the associated tables and code are not
72   compiled.
73  */
74 #ifdef Z_TESTW
75 #  if Z_TESTW-1 != -1
76 #    define W Z_TESTW
77 #  endif
78 #else
79 #  ifdef MAKECRCH
80 #    define W 8         /* required for MAKECRCH */
81 #  else
82 #    if defined(__x86_64__) || defined(__aarch64__)
83 #      define W 8
84 #    else
85 #      define W 4
86 #    endif
87 #  endif
88 #endif
89 #ifdef W
90 #  if W == 8 && defined(Z_U8)
91      typedef Z_U8 z_word_t;
92 #  elif defined(Z_U4)
93 #    undef W
94 #    define W 4
95      typedef Z_U4 z_word_t;
96 #  else
97 #    undef W
98 #  endif
99 #endif
100 
101 /* If available, use the ARM processor CRC32 instruction. */
102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
103 #  define ARMCRC32
104 #endif
105 
106 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
107 /*
108   Swap the bytes in a z_word_t to convert between little and big endian. Any
109   self-respecting compiler will optimize this to a single machine byte-swap
110   instruction, if one is available. This assumes that word_t is either 32 bits
111   or 64 bits.
112  */
byte_swap(z_word_t word)113 local z_word_t byte_swap(z_word_t word)
114 {
115 #  if W == 8
116     return
117         (word & 0xff00000000000000) >> 56 |
118         (word & 0xff000000000000) >> 40 |
119         (word & 0xff0000000000) >> 24 |
120         (word & 0xff00000000) >> 8 |
121         (word & 0xff000000) << 8 |
122         (word & 0xff0000) << 24 |
123         (word & 0xff00) << 40 |
124         (word & 0xff) << 56;
125 #  else   /* W == 4 */
126     return
127         (word & 0xff000000) >> 24 |
128         (word & 0xff0000) >> 8 |
129         (word & 0xff00) << 8 |
130         (word & 0xff) << 24;
131 #  endif
132 }
133 #endif
134 
135 #ifdef DYNAMIC_CRC_TABLE
136 /* =========================================================================
137  * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
138  * below.
139  */
140    local z_crc_t FAR x2n_table[32];
141 #else
142 /* =========================================================================
143  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
144  * of x for combining CRC-32s, all made by make_crc_table().
145  */
146 #  include "crc32.h"
147 #endif
148 
149 /* CRC polynomial. */
150 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
151 
152 /*
153   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
154   reflected. For speed, this requires that a not be zero.
155  */
multmodp(z_crc_t a, z_crc_t b)156 local z_crc_t multmodp(z_crc_t a, z_crc_t b)
157 {
158     z_crc_t m, p;
159 
160     m = (z_crc_t)1 << 31;
161     p = 0;
162     for (;;) {
163         if (a & m) {
164             p ^= b;
165             if ((a & (m - 1)) == 0)
166                 break;
167         }
168         m >>= 1;
169         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
170     }
171     return p;
172 }
173 
174 /*
175   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
176   initialized.
177  */
x2nmodp(z_off64_t n, unsigned k)178 local z_crc_t x2nmodp(z_off64_t n, unsigned k)
179 {
180     z_crc_t p;
181 
182     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
183     while (n) {
184         if (n & 1)
185             p = multmodp(x2n_table[k & 31], p);
186         n >>= 1;
187         k++;
188     }
189     return p;
190 }
191 
192 #ifdef DYNAMIC_CRC_TABLE
193 /* =========================================================================
194  * Build the tables for byte-wise and braided CRC-32 calculations, and a table
195  * of powers of x for combining CRC-32s.
196  */
197 local z_crc_t FAR crc_table[256];
198 #ifdef W
199    local z_word_t FAR crc_big_table[256];
200    local z_crc_t FAR crc_braid_table[W][256];
201    local z_word_t FAR crc_braid_big_table[W][256];
202    local void braid(z_crc_t [][256], z_word_t [][256], int, int);
203 #endif
204 #ifdef MAKECRCH
205    local void write_table(FILE *, const z_crc_t FAR *, int);
206    local void write_table32hi(FILE *, const z_word_t FAR *, int);
207    local void write_table64(FILE *, const z_word_t FAR *, int);
208 #endif /* MAKECRCH */
209 
210 /*
211   Define a once() function depending on the availability of atomics. If this is
212   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
213   multiple threads, and if atomics are not available, then get_crc_table() must
214   be called to initialize the tables and must return before any threads are
215   allowed to compute or combine CRCs.
216  */
217 
218 /* Definition of once functionality. */
219 typedef struct once_s once_t;
220 
221 /* Check for the availability of atomics. */
222 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
223     !defined(__STDC_NO_ATOMICS__)
224 
225 #include <stdatomic.h>
226 
227 /* Structure for once(), which must be initialized with ONCE_INIT. */
228 struct once_s {
229     atomic_flag begun;
230     atomic_int done;
231 };
232 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
233 
234 /*
235   Run the provided init() function exactly once, even if multiple threads
236   invoke once() at the same time. The state must be a once_t initialized with
237   ONCE_INIT.
238  */
once(once_t *state, void (*init)(void))239 local void once(once_t *state, void (*init)(void))
240 {
241     if (!atomic_load(&state->done)) {
242         if (atomic_flag_test_and_set(&state->begun))
243             while (!atomic_load(&state->done))
244                 ;
245         else {
246             init();
247             atomic_store(&state->done, 1);
248         }
249     }
250 }
251 
252 #else   /* no atomics */
253 
254 /* Structure for once(), which must be initialized with ONCE_INIT. */
255 struct once_s {
256     volatile int begun;
257     volatile int done;
258 };
259 #define ONCE_INIT {0, 0}
260 
261 /* Test and set. Alas, not atomic, but tries to minimize the period of
262    vulnerability. */
test_and_set(int volatile *flag)263 local int test_and_set(int volatile *flag)
264 {
265     int was;
266 
267     was = *flag;
268     *flag = 1;
269     return was;
270 }
271 
272 /* Run the provided init() function once. This is not thread-safe. */
once(once_t *state, void (*init)(void))273 local void once(once_t *state, void (*init)(void))
274 {
275     if (!state->done) {
276         if (test_and_set(&state->begun))
277             while (!state->done)
278                 ;
279         else {
280             init();
281             state->done = 1;
282         }
283     }
284 }
285 
286 #endif
287 
288 /* State for once(). */
289 local once_t made = ONCE_INIT;
290 
291 /*
292   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
293   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
294 
295   Polynomials over GF(2) are represented in binary, one bit per coefficient,
296   with the lowest powers in the most significant bit. Then adding polynomials
297   is just exclusive-or, and multiplying a polynomial by x is a right shift by
298   one. If we call the above polynomial p, and represent a byte as the
299   polynomial q, also with the lowest power in the most significant bit (so the
300   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
301   where a mod b means the remainder after dividing a by b.
302 
303   This calculation is done using the shift-register method of multiplying and
304   taking the remainder. The register is initialized to zero, and for each
305   incoming bit, x^32 is added mod p to the register if the bit is a one (where
306   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
307   (which is shifting right by one and adding x^32 mod p if the bit shifted out
308   is a one). We start with the highest power (least significant bit) of q and
309   repeat for all eight bits of q.
310 
311   The table is simply the CRC of all possible eight bit values. This is all the
312   information needed to generate CRCs on data a byte at a time for all
313   combinations of CRC register values and incoming bytes.
314  */
315 
make_crc_table(void)316 local void make_crc_table(void)
317 {
318     unsigned i, j, n;
319     z_crc_t p;
320 
321     /* initialize the CRC of bytes tables */
322     for (i = 0; i < 256; i++) {
323         p = i;
324         for (j = 0; j < 8; j++)
325             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
326         crc_table[i] = p;
327 #ifdef W
328         crc_big_table[i] = byte_swap(p);
329 #endif
330     }
331 
332     /* initialize the x^2^n mod p(x) table */
333     p = (z_crc_t)1 << 30;         /* x^1 */
334     x2n_table[0] = p;
335     for (n = 1; n < 32; n++)
336         x2n_table[n] = p = multmodp(p, p);
337 
338 #ifdef W
339     /* initialize the braiding tables -- needs x2n_table[] */
340     braid(crc_braid_table, crc_braid_big_table, N, W);
341 #endif
342 
343 #ifdef MAKECRCH
344     {
345         /*
346           The crc32.h header file contains tables for both 32-bit and 64-bit
347           z_word_t's, and so requires a 64-bit type be available. In that case,
348           z_word_t must be defined to be 64-bits. This code then also generates
349           and writes out the tables for the case that z_word_t is 32 bits.
350          */
351 #if !defined(W) || W != 8
352 #  error Need a 64-bit integer type in order to generate crc32.h.
353 #endif
354         FILE *out;
355         int k, n;
356         z_crc_t ltl[8][256];
357         z_word_t big[8][256];
358 
359         out = fopen("crc32.h", "w");
360         if (out == NULL) return;
361 
362         /* write out little-endian CRC table to crc32.h */
363         fprintf(out,
364             "/* crc32.h -- tables for rapid CRC calculation\n"
365             " * Generated automatically by crc32.c\n */\n"
366             "\n"
367             "local const z_crc_t FAR crc_table[] = {\n"
368             "    ");
369         write_table(out, crc_table, 256);
370         fprintf(out,
371             "};\n");
372 
373         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
374         fprintf(out,
375             "\n"
376             "#ifdef W\n"
377             "\n"
378             "#if W == 8\n"
379             "\n"
380             "local const z_word_t FAR crc_big_table[] = {\n"
381             "    ");
382         write_table64(out, crc_big_table, 256);
383         fprintf(out,
384             "};\n");
385 
386         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
387         fprintf(out,
388             "\n"
389             "#else /* W == 4 */\n"
390             "\n"
391             "local const z_word_t FAR crc_big_table[] = {\n"
392             "    ");
393         write_table32hi(out, crc_big_table, 256);
394         fprintf(out,
395             "};\n"
396             "\n"
397             "#endif\n");
398 
399         /* write out braid tables for each value of N */
400         for (n = 1; n <= 6; n++) {
401             fprintf(out,
402             "\n"
403             "#if N == %d\n", n);
404 
405             /* compute braid tables for this N and 64-bit word_t */
406             braid(ltl, big, n, 8);
407 
408             /* write out braid tables for 64-bit z_word_t to crc32.h */
409             fprintf(out,
410             "\n"
411             "#if W == 8\n"
412             "\n"
413             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
414             for (k = 0; k < 8; k++) {
415                 fprintf(out, "   {");
416                 write_table(out, ltl[k], 256);
417                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
418             }
419             fprintf(out,
420             "};\n"
421             "\n"
422             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
423             for (k = 0; k < 8; k++) {
424                 fprintf(out, "   {");
425                 write_table64(out, big[k], 256);
426                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
427             }
428             fprintf(out,
429             "};\n");
430 
431             /* compute braid tables for this N and 32-bit word_t */
432             braid(ltl, big, n, 4);
433 
434             /* write out braid tables for 32-bit z_word_t to crc32.h */
435             fprintf(out,
436             "\n"
437             "#else /* W == 4 */\n"
438             "\n"
439             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
440             for (k = 0; k < 4; k++) {
441                 fprintf(out, "   {");
442                 write_table(out, ltl[k], 256);
443                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
444             }
445             fprintf(out,
446             "};\n"
447             "\n"
448             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
449             for (k = 0; k < 4; k++) {
450                 fprintf(out, "   {");
451                 write_table32hi(out, big[k], 256);
452                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
453             }
454             fprintf(out,
455             "};\n"
456             "\n"
457             "#endif\n"
458             "\n"
459             "#endif\n");
460         }
461         fprintf(out,
462             "\n"
463             "#endif\n");
464 
465         /* write out zeros operator table to crc32.h */
466         fprintf(out,
467             "\n"
468             "local const z_crc_t FAR x2n_table[] = {\n"
469             "    ");
470         write_table(out, x2n_table, 32);
471         fprintf(out,
472             "};\n");
473         fclose(out);
474     }
475 #endif /* MAKECRCH */
476 }
477 
478 #ifdef MAKECRCH
479 
480 /*
481    Write the 32-bit values in table[0..k-1] to out, five per line in
482    hexadecimal separated by commas.
483  */
write_table(FILE *out, const z_crc_t FAR *table, int k)484 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
485     int n;
486 
487     for (n = 0; n < k; n++)
488         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
489                 (unsigned long)(table[n]),
490                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
491 }
492 
493 /*
494    Write the high 32-bits of each value in table[0..k-1] to out, five per line
495    in hexadecimal separated by commas.
496  */
write_table32hi(FILE *out, const z_word_t FAR *table, int k)497 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
498     int n;
499 
500     for (n = 0; n < k; n++)
501         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
502                 (unsigned long)(table[n] >> 32),
503                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
504 }
505 
506 /*
507   Write the 64-bit values in table[0..k-1] to out, three per line in
508   hexadecimal separated by commas. This assumes that if there is a 64-bit
509   type, then there is also a long long integer type, and it is at least 64
510   bits. If not, then the type cast and format string can be adjusted
511   accordingly.
512  */
write_table64(FILE *out, const z_word_t FAR *table, int k)513 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
514     int n;
515 
516     for (n = 0; n < k; n++)
517         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
518                 (unsigned long long)(table[n]),
519                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
520 }
521 
522 /* Actually do the deed. */
main(void)523 int main(void)
524 {
525     make_crc_table();
526     return 0;
527 }
528 
529 #endif /* MAKECRCH */
530 
531 #ifdef W
532 /*
533   Generate the little and big-endian braid tables for the given n and z_word_t
534   size w. Each array must have room for w blocks of 256 elements.
535  */
braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w)536 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w)
537 {
538     int k;
539     z_crc_t i, p, q;
540     for (k = 0; k < w; k++) {
541         p = x2nmodp((n * w + 3 - k) << 3, 0);
542         ltl[k][0] = 0;
543         big[w - 1 - k][0] = 0;
544         for (i = 1; i < 256; i++) {
545             ltl[k][i] = q = multmodp(i << 24, p);
546             big[w - 1 - k][i] = byte_swap(q);
547         }
548     }
549 }
550 #endif
551 
552 #endif /* DYNAMIC_CRC_TABLE */
553 
554 /* =========================================================================
555  * This function can be used by asm versions of crc32(), and to force the
556  * generation of the CRC tables in a threaded application.
557  */
get_crc_table(void)558 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
559 #ifdef DYNAMIC_CRC_TABLE
560     once(&made, make_crc_table);
561 #endif /* DYNAMIC_CRC_TABLE */
562     return (const z_crc_t FAR *)crc_table;
563 }
564 
565 /* =========================================================================
566  * Use ARM machine instructions if available. This will compute the CRC about
567  * ten times faster than the braided calculation. This code does not check for
568  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
569  * only be defined if the compilation specifies an ARM processor architecture
570  * that has the instructions. For example, compiling with -march=armv8.1-a or
571  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
572  * instructions.
573  */
574 #ifdef ARMCRC32
575 
576 /*
577    Constants empirically determined to maximize speed. These values are from
578    measurements on a Cortex-A57. Your mileage may vary.
579  */
580 #define Z_BATCH 3990                /* number of words in a batch */
581 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
582 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
583 
crc32_z(unsigned long crc, const unsigned char FAR *buf, z_size_t len)584 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
585                               z_size_t len) {
586     z_crc_t val;
587     z_word_t crc1, crc2;
588     const z_word_t *word;
589     z_word_t val0, val1, val2;
590     z_size_t last, last2, i;
591     z_size_t num;
592 
593     /* Return initial CRC, if requested. */
594     if (buf == Z_NULL) return 0;
595 
596 #ifdef DYNAMIC_CRC_TABLE
597     once(&made, make_crc_table);
598 #endif /* DYNAMIC_CRC_TABLE */
599 
600     /* Pre-condition the CRC */
601     crc = (~crc) & 0xffffffff;
602 
603     /* Compute the CRC up to a word boundary. */
604     while (len && ((z_size_t)buf & 7) != 0) {
605         len--;
606         val = *buf++;
607         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
608     }
609 
610     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
611     word = (z_word_t const *)buf;
612     num = len >> 3;
613     len &= 7;
614 
615     /* Do three interleaved CRCs to realize the throughput of one crc32x
616        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
617        three CRCs are combined into a single CRC after each set of batches. */
618     while (num >= 3 * Z_BATCH) {
619         crc1 = 0;
620         crc2 = 0;
621         for (i = 0; i < Z_BATCH; i++) {
622             val0 = word[i];
623             val1 = word[i + Z_BATCH];
624             val2 = word[i + 2 * Z_BATCH];
625             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
626             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
627             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
628         }
629         word += 3 * Z_BATCH;
630         num -= 3 * Z_BATCH;
631         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
632         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
633     }
634 
635     /* Do one last smaller batch with the remaining words, if there are enough
636        to pay for the combination of CRCs. */
637     last = num / 3;
638     if (last >= Z_BATCH_MIN) {
639         last2 = last << 1;
640         crc1 = 0;
641         crc2 = 0;
642         for (i = 0; i < last; i++) {
643             val0 = word[i];
644             val1 = word[i + last];
645             val2 = word[i + last2];
646             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
647             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
648             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
649         }
650         word += 3 * last;
651         num -= 3 * last;
652         val = x2nmodp(last, 6);
653         crc = multmodp(val, crc) ^ crc1;
654         crc = multmodp(val, crc) ^ crc2;
655     }
656 
657     /* Compute the CRC on any remaining words. */
658     for (i = 0; i < num; i++) {
659         val0 = word[i];
660         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
661     }
662     word += num;
663 
664     /* Complete the CRC on any remaining bytes. */
665     buf = (const unsigned char FAR *)word;
666     while (len) {
667         len--;
668         val = *buf++;
669         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
670     }
671 
672     /* Return the CRC, post-conditioned. */
673     return crc ^ 0xffffffff;
674 }
675 
676 #else
677 
678 #ifdef W
679 
680 /*
681   Return the CRC of the W bytes in the word_t data, taking the
682   least-significant byte of the word as the first byte of data, without any pre
683   or post conditioning. This is used to combine the CRCs of each braid.
684  */
crc_word(z_word_t data)685 local z_crc_t crc_word(z_word_t data)
686 {
687     int k;
688     for (k = 0; k < W; k++)
689         data = (data >> 8) ^ crc_table[data & 0xff];
690     return (z_crc_t)data;
691 }
692 
crc_word_big(z_word_t data)693 local z_word_t crc_word_big(z_word_t data)
694 {
695     int k;
696     for (k = 0; k < W; k++)
697         data = (data << 8) ^
698             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
699     return data;
700 }
701 
702 #endif
703 
704 /* ========================================================================= */
crc32_z(unsigned long crc, const unsigned char FAR *buf, z_size_t len)705 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
706                               z_size_t len) {
707     /* Return initial CRC, if requested. */
708     if (buf == Z_NULL) return 0;
709 
710 #ifdef DYNAMIC_CRC_TABLE
711     once(&made, make_crc_table);
712 #endif /* DYNAMIC_CRC_TABLE */
713 
714     /* Pre-condition the CRC */
715     crc = (~crc) & 0xffffffff;
716 
717 #ifdef W
718 
719     /* If provided enough bytes, do a braided CRC calculation. */
720     if (len >= N * W + W - 1) {
721         z_size_t blks;
722         z_word_t const *words;
723         unsigned endian;
724         int k;
725 
726         /* Compute the CRC up to a z_word_t boundary. */
727         while (len && ((z_size_t)buf & (W - 1)) != 0) {
728             len--;
729             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
730         }
731 
732         /* Compute the CRC on as many N z_word_t blocks as are available. */
733         blks = len / (N * W);
734         len -= blks * N * W;
735         words = (z_word_t const *)buf;
736 
737         /* Do endian check at execution time instead of compile time, since ARM
738            processors can change the endianness at execution time. If the
739            compiler knows what the endianness will be, it can optimize out the
740            check and the unused branch. */
741         endian = 1;
742         if (*(unsigned char *)&endian) {
743             /* Little endian. */
744 
745             z_crc_t crc0;
746             z_word_t word0;
747 #if N > 1
748             z_crc_t crc1;
749             z_word_t word1;
750 #if N > 2
751             z_crc_t crc2;
752             z_word_t word2;
753 #if N > 3
754             z_crc_t crc3;
755             z_word_t word3;
756 #if N > 4
757             z_crc_t crc4;
758             z_word_t word4;
759 #if N > 5
760             z_crc_t crc5;
761             z_word_t word5;
762 #endif
763 #endif
764 #endif
765 #endif
766 #endif
767 
768             /* Initialize the CRC for each braid. */
769             crc0 = crc;
770 #if N > 1
771             crc1 = 0;
772 #if N > 2
773             crc2 = 0;
774 #if N > 3
775             crc3 = 0;
776 #if N > 4
777             crc4 = 0;
778 #if N > 5
779             crc5 = 0;
780 #endif
781 #endif
782 #endif
783 #endif
784 #endif
785 
786             /*
787               Process the first blks-1 blocks, computing the CRCs on each braid
788               independently.
789              */
790             while (--blks) {
791                 /* Load the word for each braid into registers. */
792                 word0 = crc0 ^ words[0];
793 #if N > 1
794                 word1 = crc1 ^ words[1];
795 #if N > 2
796                 word2 = crc2 ^ words[2];
797 #if N > 3
798                 word3 = crc3 ^ words[3];
799 #if N > 4
800                 word4 = crc4 ^ words[4];
801 #if N > 5
802                 word5 = crc5 ^ words[5];
803 #endif
804 #endif
805 #endif
806 #endif
807 #endif
808                 words += N;
809 
810                 /* Compute and update the CRC for each word. The loop should
811                    get unrolled. */
812                 crc0 = crc_braid_table[0][word0 & 0xff];
813 #if N > 1
814                 crc1 = crc_braid_table[0][word1 & 0xff];
815 #if N > 2
816                 crc2 = crc_braid_table[0][word2 & 0xff];
817 #if N > 3
818                 crc3 = crc_braid_table[0][word3 & 0xff];
819 #if N > 4
820                 crc4 = crc_braid_table[0][word4 & 0xff];
821 #if N > 5
822                 crc5 = crc_braid_table[0][word5 & 0xff];
823 #endif
824 #endif
825 #endif
826 #endif
827 #endif
828                 for (k = 1; k < W; k++) {
829                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
830 #if N > 1
831                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
832 #if N > 2
833                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
834 #if N > 3
835                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
836 #if N > 4
837                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
838 #if N > 5
839                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
840 #endif
841 #endif
842 #endif
843 #endif
844 #endif
845                 }
846             }
847 
848             /*
849               Process the last block, combining the CRCs of the N braids at the
850               same time.
851              */
852             crc = crc_word(crc0 ^ words[0]);
853 #if N > 1
854             crc = crc_word(crc1 ^ words[1] ^ crc);
855 #if N > 2
856             crc = crc_word(crc2 ^ words[2] ^ crc);
857 #if N > 3
858             crc = crc_word(crc3 ^ words[3] ^ crc);
859 #if N > 4
860             crc = crc_word(crc4 ^ words[4] ^ crc);
861 #if N > 5
862             crc = crc_word(crc5 ^ words[5] ^ crc);
863 #endif
864 #endif
865 #endif
866 #endif
867 #endif
868             words += N;
869         }
870         else {
871             /* Big endian. */
872 
873             z_word_t crc0, word0, comb;
874 #if N > 1
875             z_word_t crc1, word1;
876 #if N > 2
877             z_word_t crc2, word2;
878 #if N > 3
879             z_word_t crc3, word3;
880 #if N > 4
881             z_word_t crc4, word4;
882 #if N > 5
883             z_word_t crc5, word5;
884 #endif
885 #endif
886 #endif
887 #endif
888 #endif
889 
890             /* Initialize the CRC for each braid. */
891             crc0 = byte_swap(crc);
892 #if N > 1
893             crc1 = 0;
894 #if N > 2
895             crc2 = 0;
896 #if N > 3
897             crc3 = 0;
898 #if N > 4
899             crc4 = 0;
900 #if N > 5
901             crc5 = 0;
902 #endif
903 #endif
904 #endif
905 #endif
906 #endif
907 
908             /*
909               Process the first blks-1 blocks, computing the CRCs on each braid
910               independently.
911              */
912             while (--blks) {
913                 /* Load the word for each braid into registers. */
914                 word0 = crc0 ^ words[0];
915 #if N > 1
916                 word1 = crc1 ^ words[1];
917 #if N > 2
918                 word2 = crc2 ^ words[2];
919 #if N > 3
920                 word3 = crc3 ^ words[3];
921 #if N > 4
922                 word4 = crc4 ^ words[4];
923 #if N > 5
924                 word5 = crc5 ^ words[5];
925 #endif
926 #endif
927 #endif
928 #endif
929 #endif
930                 words += N;
931 
932                 /* Compute and update the CRC for each word. The loop should
933                    get unrolled. */
934                 crc0 = crc_braid_big_table[0][word0 & 0xff];
935 #if N > 1
936                 crc1 = crc_braid_big_table[0][word1 & 0xff];
937 #if N > 2
938                 crc2 = crc_braid_big_table[0][word2 & 0xff];
939 #if N > 3
940                 crc3 = crc_braid_big_table[0][word3 & 0xff];
941 #if N > 4
942                 crc4 = crc_braid_big_table[0][word4 & 0xff];
943 #if N > 5
944                 crc5 = crc_braid_big_table[0][word5 & 0xff];
945 #endif
946 #endif
947 #endif
948 #endif
949 #endif
950                 for (k = 1; k < W; k++) {
951                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
952 #if N > 1
953                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
954 #if N > 2
955                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
956 #if N > 3
957                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
958 #if N > 4
959                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
960 #if N > 5
961                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
962 #endif
963 #endif
964 #endif
965 #endif
966 #endif
967                 }
968             }
969 
970             /*
971               Process the last block, combining the CRCs of the N braids at the
972               same time.
973              */
974             comb = crc_word_big(crc0 ^ words[0]);
975 #if N > 1
976             comb = crc_word_big(crc1 ^ words[1] ^ comb);
977 #if N > 2
978             comb = crc_word_big(crc2 ^ words[2] ^ comb);
979 #if N > 3
980             comb = crc_word_big(crc3 ^ words[3] ^ comb);
981 #if N > 4
982             comb = crc_word_big(crc4 ^ words[4] ^ comb);
983 #if N > 5
984             comb = crc_word_big(crc5 ^ words[5] ^ comb);
985 #endif
986 #endif
987 #endif
988 #endif
989 #endif
990             words += N;
991             crc = byte_swap(comb);
992         }
993 
994         /*
995           Update the pointer to the remaining bytes to process.
996          */
997         buf = (unsigned char const *)words;
998     }
999 
1000 #endif /* W */
1001 
1002     /* Complete the computation of the CRC on any remaining bytes. */
1003     while (len >= 8) {
1004         len -= 8;
1005         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1006         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1007         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1008         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1009         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1010         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1011         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1012         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1013     }
1014     while (len) {
1015         len--;
1016         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1017     }
1018 
1019     /* Return the CRC, post-conditioned. */
1020     return crc ^ 0xffffffff;
1021 }
1022 
1023 #endif
1024 
1025 /* ========================================================================= */
crc32(unsigned long crc, const unsigned char FAR *buf, uInt len)1026 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1027                             uInt len) {
1028     return crc32_z(crc, buf, len);
1029 }
1030 
1031 /* ========================================================================= */
crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2)1032 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1033 #ifdef DYNAMIC_CRC_TABLE
1034     once(&made, make_crc_table);
1035 #endif /* DYNAMIC_CRC_TABLE */
1036     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1037 }
1038 
1039 /* ========================================================================= */
crc32_combine(uLong crc1, uLong crc2, z_off_t len2)1040 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1041     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1042 }
1043 
1044 /* ========================================================================= */
crc32_combine_gen64(z_off64_t len2)1045 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1046 #ifdef DYNAMIC_CRC_TABLE
1047     once(&made, make_crc_table);
1048 #endif /* DYNAMIC_CRC_TABLE */
1049     return x2nmodp(len2, 3);
1050 }
1051 
1052 /* ========================================================================= */
crc32_combine_gen(z_off_t len2)1053 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1054     return crc32_combine_gen64((z_off64_t)len2);
1055 }
1056 
1057 /* ========================================================================= */
crc32_combine_op(uLong crc1, uLong crc2, uLong op)1058 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1059     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1060 }
1061