1 /* crc32.c -- compute the CRC-32 of a data stream
2 * Copyright (C) 1995-2022 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 *
5 * This interleaved implementation of a CRC makes use of pipelined multiple
6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8 */
9
10 /* @(#) $Id$ */
11
12 /*
13 Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14 protection on the static variables used to control the first-use generation
15 of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16 first call get_crc_table() to initialize the tables before allowing more than
17 one thread to use crc32().
18
19 MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20 produced, so that this one source file can be compiled to an executable.
21 */
22
23 #ifdef MAKECRCH
24 # include <stdio.h>
25 # ifndef DYNAMIC_CRC_TABLE
26 # define DYNAMIC_CRC_TABLE
27 # endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29
30 #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31
32 /*
33 A CRC of a message is computed on N braids of words in the message, where
34 each word consists of W bytes (4 or 8). If N is 3, for example, then three
35 running sparse CRCs are calculated respectively on each braid, at these
36 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37 This is done starting at a word boundary, and continues until as many blocks
38 of N * W bytes as are available have been processed. The results are combined
39 into a single CRC at the end. For this code, N must be in the range 1..6 and
40 W must be 4 or 8. The upper limit on N can be increased if desired by adding
41 more #if blocks, extending the patterns apparent in the code. In addition,
42 crc32.h would need to be regenerated, if the maximum N value is increased.
43
44 N and W are chosen empirically by benchmarking the execution time on a given
45 processor. The choices for N and W below were based on testing on Intel Kaby
46 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47 Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49 They were all tested with either gcc or clang, all using the -O3 optimization
50 level. Your mileage may vary.
51 */
52
53 /* Define N */
54 #ifdef Z_TESTN
55 # define N Z_TESTN
56 #else
57 # define N 5
58 #endif
59 #if N < 1 || N > 6
60 # error N must be in 1..6
61 #endif
62
63 /*
64 z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65 z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66 that bytes are eight bits.
67 */
68
69 /*
70 Define W and the associated z_word_t type. If W is not defined, then a
71 braided calculation is not used, and the associated tables and code are not
72 compiled.
73 */
74 #ifdef Z_TESTW
75 # if Z_TESTW-1 != -1
76 # define W Z_TESTW
77 # endif
78 #else
79 # ifdef MAKECRCH
80 # define W 8 /* required for MAKECRCH */
81 # else
82 # if defined(__x86_64__) || defined(__aarch64__)
83 # define W 8
84 # else
85 # define W 4
86 # endif
87 # endif
88 #endif
89 #ifdef W
90 # if W == 8 && defined(Z_U8)
91 typedef Z_U8 z_word_t;
92 # elif defined(Z_U4)
93 # undef W
94 # define W 4
95 typedef Z_U4 z_word_t;
96 # else
97 # undef W
98 # endif
99 #endif
100
101 /* If available, use the ARM processor CRC32 instruction. */
102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
103 # define ARMCRC32
104 #endif
105
106 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
107 /*
108 Swap the bytes in a z_word_t to convert between little and big endian. Any
109 self-respecting compiler will optimize this to a single machine byte-swap
110 instruction, if one is available. This assumes that word_t is either 32 bits
111 or 64 bits.
112 */
byte_swap(z_word_t word)113 local z_word_t byte_swap(z_word_t word)
114 {
115 # if W == 8
116 return
117 (word & 0xff00000000000000) >> 56 |
118 (word & 0xff000000000000) >> 40 |
119 (word & 0xff0000000000) >> 24 |
120 (word & 0xff00000000) >> 8 |
121 (word & 0xff000000) << 8 |
122 (word & 0xff0000) << 24 |
123 (word & 0xff00) << 40 |
124 (word & 0xff) << 56;
125 # else /* W == 4 */
126 return
127 (word & 0xff000000) >> 24 |
128 (word & 0xff0000) >> 8 |
129 (word & 0xff00) << 8 |
130 (word & 0xff) << 24;
131 # endif
132 }
133 #endif
134
135 #ifdef DYNAMIC_CRC_TABLE
136 /* =========================================================================
137 * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
138 * below.
139 */
140 local z_crc_t FAR x2n_table[32];
141 #else
142 /* =========================================================================
143 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
144 * of x for combining CRC-32s, all made by make_crc_table().
145 */
146 # include "crc32.h"
147 #endif
148
149 /* CRC polynomial. */
150 #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
151
152 /*
153 Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
154 reflected. For speed, this requires that a not be zero.
155 */
multmodp(z_crc_t a, z_crc_t b)156 local z_crc_t multmodp(z_crc_t a, z_crc_t b)
157 {
158 z_crc_t m, p;
159
160 m = (z_crc_t)1 << 31;
161 p = 0;
162 for (;;) {
163 if (a & m) {
164 p ^= b;
165 if ((a & (m - 1)) == 0)
166 break;
167 }
168 m >>= 1;
169 b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
170 }
171 return p;
172 }
173
174 /*
175 Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
176 initialized.
177 */
x2nmodp(z_off64_t n, unsigned k)178 local z_crc_t x2nmodp(z_off64_t n, unsigned k)
179 {
180 z_crc_t p;
181
182 p = (z_crc_t)1 << 31; /* x^0 == 1 */
183 while (n) {
184 if (n & 1)
185 p = multmodp(x2n_table[k & 31], p);
186 n >>= 1;
187 k++;
188 }
189 return p;
190 }
191
192 #ifdef DYNAMIC_CRC_TABLE
193 /* =========================================================================
194 * Build the tables for byte-wise and braided CRC-32 calculations, and a table
195 * of powers of x for combining CRC-32s.
196 */
197 local z_crc_t FAR crc_table[256];
198 #ifdef W
199 local z_word_t FAR crc_big_table[256];
200 local z_crc_t FAR crc_braid_table[W][256];
201 local z_word_t FAR crc_braid_big_table[W][256];
202 local void braid(z_crc_t [][256], z_word_t [][256], int, int);
203 #endif
204 #ifdef MAKECRCH
205 local void write_table(FILE *, const z_crc_t FAR *, int);
206 local void write_table32hi(FILE *, const z_word_t FAR *, int);
207 local void write_table64(FILE *, const z_word_t FAR *, int);
208 #endif /* MAKECRCH */
209
210 /*
211 Define a once() function depending on the availability of atomics. If this is
212 compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
213 multiple threads, and if atomics are not available, then get_crc_table() must
214 be called to initialize the tables and must return before any threads are
215 allowed to compute or combine CRCs.
216 */
217
218 /* Definition of once functionality. */
219 typedef struct once_s once_t;
220
221 /* Check for the availability of atomics. */
222 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
223 !defined(__STDC_NO_ATOMICS__)
224
225 #include <stdatomic.h>
226
227 /* Structure for once(), which must be initialized with ONCE_INIT. */
228 struct once_s {
229 atomic_flag begun;
230 atomic_int done;
231 };
232 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
233
234 /*
235 Run the provided init() function exactly once, even if multiple threads
236 invoke once() at the same time. The state must be a once_t initialized with
237 ONCE_INIT.
238 */
once(once_t *state, void (*init)(void))239 local void once(once_t *state, void (*init)(void))
240 {
241 if (!atomic_load(&state->done)) {
242 if (atomic_flag_test_and_set(&state->begun))
243 while (!atomic_load(&state->done))
244 ;
245 else {
246 init();
247 atomic_store(&state->done, 1);
248 }
249 }
250 }
251
252 #else /* no atomics */
253
254 /* Structure for once(), which must be initialized with ONCE_INIT. */
255 struct once_s {
256 volatile int begun;
257 volatile int done;
258 };
259 #define ONCE_INIT {0, 0}
260
261 /* Test and set. Alas, not atomic, but tries to minimize the period of
262 vulnerability. */
test_and_set(int volatile *flag)263 local int test_and_set(int volatile *flag)
264 {
265 int was;
266
267 was = *flag;
268 *flag = 1;
269 return was;
270 }
271
272 /* Run the provided init() function once. This is not thread-safe. */
once(once_t *state, void (*init)(void))273 local void once(once_t *state, void (*init)(void))
274 {
275 if (!state->done) {
276 if (test_and_set(&state->begun))
277 while (!state->done)
278 ;
279 else {
280 init();
281 state->done = 1;
282 }
283 }
284 }
285
286 #endif
287
288 /* State for once(). */
289 local once_t made = ONCE_INIT;
290
291 /*
292 Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
293 x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
294
295 Polynomials over GF(2) are represented in binary, one bit per coefficient,
296 with the lowest powers in the most significant bit. Then adding polynomials
297 is just exclusive-or, and multiplying a polynomial by x is a right shift by
298 one. If we call the above polynomial p, and represent a byte as the
299 polynomial q, also with the lowest power in the most significant bit (so the
300 byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
301 where a mod b means the remainder after dividing a by b.
302
303 This calculation is done using the shift-register method of multiplying and
304 taking the remainder. The register is initialized to zero, and for each
305 incoming bit, x^32 is added mod p to the register if the bit is a one (where
306 x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
307 (which is shifting right by one and adding x^32 mod p if the bit shifted out
308 is a one). We start with the highest power (least significant bit) of q and
309 repeat for all eight bits of q.
310
311 The table is simply the CRC of all possible eight bit values. This is all the
312 information needed to generate CRCs on data a byte at a time for all
313 combinations of CRC register values and incoming bytes.
314 */
315
make_crc_table(void)316 local void make_crc_table(void)
317 {
318 unsigned i, j, n;
319 z_crc_t p;
320
321 /* initialize the CRC of bytes tables */
322 for (i = 0; i < 256; i++) {
323 p = i;
324 for (j = 0; j < 8; j++)
325 p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
326 crc_table[i] = p;
327 #ifdef W
328 crc_big_table[i] = byte_swap(p);
329 #endif
330 }
331
332 /* initialize the x^2^n mod p(x) table */
333 p = (z_crc_t)1 << 30; /* x^1 */
334 x2n_table[0] = p;
335 for (n = 1; n < 32; n++)
336 x2n_table[n] = p = multmodp(p, p);
337
338 #ifdef W
339 /* initialize the braiding tables -- needs x2n_table[] */
340 braid(crc_braid_table, crc_braid_big_table, N, W);
341 #endif
342
343 #ifdef MAKECRCH
344 {
345 /*
346 The crc32.h header file contains tables for both 32-bit and 64-bit
347 z_word_t's, and so requires a 64-bit type be available. In that case,
348 z_word_t must be defined to be 64-bits. This code then also generates
349 and writes out the tables for the case that z_word_t is 32 bits.
350 */
351 #if !defined(W) || W != 8
352 # error Need a 64-bit integer type in order to generate crc32.h.
353 #endif
354 FILE *out;
355 int k, n;
356 z_crc_t ltl[8][256];
357 z_word_t big[8][256];
358
359 out = fopen("crc32.h", "w");
360 if (out == NULL) return;
361
362 /* write out little-endian CRC table to crc32.h */
363 fprintf(out,
364 "/* crc32.h -- tables for rapid CRC calculation\n"
365 " * Generated automatically by crc32.c\n */\n"
366 "\n"
367 "local const z_crc_t FAR crc_table[] = {\n"
368 " ");
369 write_table(out, crc_table, 256);
370 fprintf(out,
371 "};\n");
372
373 /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
374 fprintf(out,
375 "\n"
376 "#ifdef W\n"
377 "\n"
378 "#if W == 8\n"
379 "\n"
380 "local const z_word_t FAR crc_big_table[] = {\n"
381 " ");
382 write_table64(out, crc_big_table, 256);
383 fprintf(out,
384 "};\n");
385
386 /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
387 fprintf(out,
388 "\n"
389 "#else /* W == 4 */\n"
390 "\n"
391 "local const z_word_t FAR crc_big_table[] = {\n"
392 " ");
393 write_table32hi(out, crc_big_table, 256);
394 fprintf(out,
395 "};\n"
396 "\n"
397 "#endif\n");
398
399 /* write out braid tables for each value of N */
400 for (n = 1; n <= 6; n++) {
401 fprintf(out,
402 "\n"
403 "#if N == %d\n", n);
404
405 /* compute braid tables for this N and 64-bit word_t */
406 braid(ltl, big, n, 8);
407
408 /* write out braid tables for 64-bit z_word_t to crc32.h */
409 fprintf(out,
410 "\n"
411 "#if W == 8\n"
412 "\n"
413 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
414 for (k = 0; k < 8; k++) {
415 fprintf(out, " {");
416 write_table(out, ltl[k], 256);
417 fprintf(out, "}%s", k < 7 ? ",\n" : "");
418 }
419 fprintf(out,
420 "};\n"
421 "\n"
422 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
423 for (k = 0; k < 8; k++) {
424 fprintf(out, " {");
425 write_table64(out, big[k], 256);
426 fprintf(out, "}%s", k < 7 ? ",\n" : "");
427 }
428 fprintf(out,
429 "};\n");
430
431 /* compute braid tables for this N and 32-bit word_t */
432 braid(ltl, big, n, 4);
433
434 /* write out braid tables for 32-bit z_word_t to crc32.h */
435 fprintf(out,
436 "\n"
437 "#else /* W == 4 */\n"
438 "\n"
439 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
440 for (k = 0; k < 4; k++) {
441 fprintf(out, " {");
442 write_table(out, ltl[k], 256);
443 fprintf(out, "}%s", k < 3 ? ",\n" : "");
444 }
445 fprintf(out,
446 "};\n"
447 "\n"
448 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
449 for (k = 0; k < 4; k++) {
450 fprintf(out, " {");
451 write_table32hi(out, big[k], 256);
452 fprintf(out, "}%s", k < 3 ? ",\n" : "");
453 }
454 fprintf(out,
455 "};\n"
456 "\n"
457 "#endif\n"
458 "\n"
459 "#endif\n");
460 }
461 fprintf(out,
462 "\n"
463 "#endif\n");
464
465 /* write out zeros operator table to crc32.h */
466 fprintf(out,
467 "\n"
468 "local const z_crc_t FAR x2n_table[] = {\n"
469 " ");
470 write_table(out, x2n_table, 32);
471 fprintf(out,
472 "};\n");
473 fclose(out);
474 }
475 #endif /* MAKECRCH */
476 }
477
478 #ifdef MAKECRCH
479
480 /*
481 Write the 32-bit values in table[0..k-1] to out, five per line in
482 hexadecimal separated by commas.
483 */
write_table(FILE *out, const z_crc_t FAR *table, int k)484 local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
485 int n;
486
487 for (n = 0; n < k; n++)
488 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
489 (unsigned long)(table[n]),
490 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
491 }
492
493 /*
494 Write the high 32-bits of each value in table[0..k-1] to out, five per line
495 in hexadecimal separated by commas.
496 */
write_table32hi(FILE *out, const z_word_t FAR *table, int k)497 local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
498 int n;
499
500 for (n = 0; n < k; n++)
501 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
502 (unsigned long)(table[n] >> 32),
503 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
504 }
505
506 /*
507 Write the 64-bit values in table[0..k-1] to out, three per line in
508 hexadecimal separated by commas. This assumes that if there is a 64-bit
509 type, then there is also a long long integer type, and it is at least 64
510 bits. If not, then the type cast and format string can be adjusted
511 accordingly.
512 */
write_table64(FILE *out, const z_word_t FAR *table, int k)513 local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
514 int n;
515
516 for (n = 0; n < k; n++)
517 fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ",
518 (unsigned long long)(table[n]),
519 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
520 }
521
522 /* Actually do the deed. */
main(void)523 int main(void)
524 {
525 make_crc_table();
526 return 0;
527 }
528
529 #endif /* MAKECRCH */
530
531 #ifdef W
532 /*
533 Generate the little and big-endian braid tables for the given n and z_word_t
534 size w. Each array must have room for w blocks of 256 elements.
535 */
braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w)536 local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w)
537 {
538 int k;
539 z_crc_t i, p, q;
540 for (k = 0; k < w; k++) {
541 p = x2nmodp((n * w + 3 - k) << 3, 0);
542 ltl[k][0] = 0;
543 big[w - 1 - k][0] = 0;
544 for (i = 1; i < 256; i++) {
545 ltl[k][i] = q = multmodp(i << 24, p);
546 big[w - 1 - k][i] = byte_swap(q);
547 }
548 }
549 }
550 #endif
551
552 #endif /* DYNAMIC_CRC_TABLE */
553
554 /* =========================================================================
555 * This function can be used by asm versions of crc32(), and to force the
556 * generation of the CRC tables in a threaded application.
557 */
get_crc_table(void)558 const z_crc_t FAR * ZEXPORT get_crc_table(void) {
559 #ifdef DYNAMIC_CRC_TABLE
560 once(&made, make_crc_table);
561 #endif /* DYNAMIC_CRC_TABLE */
562 return (const z_crc_t FAR *)crc_table;
563 }
564
565 /* =========================================================================
566 * Use ARM machine instructions if available. This will compute the CRC about
567 * ten times faster than the braided calculation. This code does not check for
568 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
569 * only be defined if the compilation specifies an ARM processor architecture
570 * that has the instructions. For example, compiling with -march=armv8.1-a or
571 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
572 * instructions.
573 */
574 #ifdef ARMCRC32
575
576 /*
577 Constants empirically determined to maximize speed. These values are from
578 measurements on a Cortex-A57. Your mileage may vary.
579 */
580 #define Z_BATCH 3990 /* number of words in a batch */
581 #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
582 #define Z_BATCH_MIN 800 /* fewest words in a final batch */
583
crc32_z(unsigned long crc, const unsigned char FAR *buf, z_size_t len)584 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
585 z_size_t len) {
586 z_crc_t val;
587 z_word_t crc1, crc2;
588 const z_word_t *word;
589 z_word_t val0, val1, val2;
590 z_size_t last, last2, i;
591 z_size_t num;
592
593 /* Return initial CRC, if requested. */
594 if (buf == Z_NULL) return 0;
595
596 #ifdef DYNAMIC_CRC_TABLE
597 once(&made, make_crc_table);
598 #endif /* DYNAMIC_CRC_TABLE */
599
600 /* Pre-condition the CRC */
601 crc = (~crc) & 0xffffffff;
602
603 /* Compute the CRC up to a word boundary. */
604 while (len && ((z_size_t)buf & 7) != 0) {
605 len--;
606 val = *buf++;
607 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
608 }
609
610 /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
611 word = (z_word_t const *)buf;
612 num = len >> 3;
613 len &= 7;
614
615 /* Do three interleaved CRCs to realize the throughput of one crc32x
616 instruction per cycle. Each CRC is calculated on Z_BATCH words. The
617 three CRCs are combined into a single CRC after each set of batches. */
618 while (num >= 3 * Z_BATCH) {
619 crc1 = 0;
620 crc2 = 0;
621 for (i = 0; i < Z_BATCH; i++) {
622 val0 = word[i];
623 val1 = word[i + Z_BATCH];
624 val2 = word[i + 2 * Z_BATCH];
625 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
626 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
627 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
628 }
629 word += 3 * Z_BATCH;
630 num -= 3 * Z_BATCH;
631 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
632 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
633 }
634
635 /* Do one last smaller batch with the remaining words, if there are enough
636 to pay for the combination of CRCs. */
637 last = num / 3;
638 if (last >= Z_BATCH_MIN) {
639 last2 = last << 1;
640 crc1 = 0;
641 crc2 = 0;
642 for (i = 0; i < last; i++) {
643 val0 = word[i];
644 val1 = word[i + last];
645 val2 = word[i + last2];
646 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
647 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
648 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
649 }
650 word += 3 * last;
651 num -= 3 * last;
652 val = x2nmodp(last, 6);
653 crc = multmodp(val, crc) ^ crc1;
654 crc = multmodp(val, crc) ^ crc2;
655 }
656
657 /* Compute the CRC on any remaining words. */
658 for (i = 0; i < num; i++) {
659 val0 = word[i];
660 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
661 }
662 word += num;
663
664 /* Complete the CRC on any remaining bytes. */
665 buf = (const unsigned char FAR *)word;
666 while (len) {
667 len--;
668 val = *buf++;
669 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
670 }
671
672 /* Return the CRC, post-conditioned. */
673 return crc ^ 0xffffffff;
674 }
675
676 #else
677
678 #ifdef W
679
680 /*
681 Return the CRC of the W bytes in the word_t data, taking the
682 least-significant byte of the word as the first byte of data, without any pre
683 or post conditioning. This is used to combine the CRCs of each braid.
684 */
crc_word(z_word_t data)685 local z_crc_t crc_word(z_word_t data)
686 {
687 int k;
688 for (k = 0; k < W; k++)
689 data = (data >> 8) ^ crc_table[data & 0xff];
690 return (z_crc_t)data;
691 }
692
crc_word_big(z_word_t data)693 local z_word_t crc_word_big(z_word_t data)
694 {
695 int k;
696 for (k = 0; k < W; k++)
697 data = (data << 8) ^
698 crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
699 return data;
700 }
701
702 #endif
703
704 /* ========================================================================= */
crc32_z(unsigned long crc, const unsigned char FAR *buf, z_size_t len)705 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
706 z_size_t len) {
707 /* Return initial CRC, if requested. */
708 if (buf == Z_NULL) return 0;
709
710 #ifdef DYNAMIC_CRC_TABLE
711 once(&made, make_crc_table);
712 #endif /* DYNAMIC_CRC_TABLE */
713
714 /* Pre-condition the CRC */
715 crc = (~crc) & 0xffffffff;
716
717 #ifdef W
718
719 /* If provided enough bytes, do a braided CRC calculation. */
720 if (len >= N * W + W - 1) {
721 z_size_t blks;
722 z_word_t const *words;
723 unsigned endian;
724 int k;
725
726 /* Compute the CRC up to a z_word_t boundary. */
727 while (len && ((z_size_t)buf & (W - 1)) != 0) {
728 len--;
729 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
730 }
731
732 /* Compute the CRC on as many N z_word_t blocks as are available. */
733 blks = len / (N * W);
734 len -= blks * N * W;
735 words = (z_word_t const *)buf;
736
737 /* Do endian check at execution time instead of compile time, since ARM
738 processors can change the endianness at execution time. If the
739 compiler knows what the endianness will be, it can optimize out the
740 check and the unused branch. */
741 endian = 1;
742 if (*(unsigned char *)&endian) {
743 /* Little endian. */
744
745 z_crc_t crc0;
746 z_word_t word0;
747 #if N > 1
748 z_crc_t crc1;
749 z_word_t word1;
750 #if N > 2
751 z_crc_t crc2;
752 z_word_t word2;
753 #if N > 3
754 z_crc_t crc3;
755 z_word_t word3;
756 #if N > 4
757 z_crc_t crc4;
758 z_word_t word4;
759 #if N > 5
760 z_crc_t crc5;
761 z_word_t word5;
762 #endif
763 #endif
764 #endif
765 #endif
766 #endif
767
768 /* Initialize the CRC for each braid. */
769 crc0 = crc;
770 #if N > 1
771 crc1 = 0;
772 #if N > 2
773 crc2 = 0;
774 #if N > 3
775 crc3 = 0;
776 #if N > 4
777 crc4 = 0;
778 #if N > 5
779 crc5 = 0;
780 #endif
781 #endif
782 #endif
783 #endif
784 #endif
785
786 /*
787 Process the first blks-1 blocks, computing the CRCs on each braid
788 independently.
789 */
790 while (--blks) {
791 /* Load the word for each braid into registers. */
792 word0 = crc0 ^ words[0];
793 #if N > 1
794 word1 = crc1 ^ words[1];
795 #if N > 2
796 word2 = crc2 ^ words[2];
797 #if N > 3
798 word3 = crc3 ^ words[3];
799 #if N > 4
800 word4 = crc4 ^ words[4];
801 #if N > 5
802 word5 = crc5 ^ words[5];
803 #endif
804 #endif
805 #endif
806 #endif
807 #endif
808 words += N;
809
810 /* Compute and update the CRC for each word. The loop should
811 get unrolled. */
812 crc0 = crc_braid_table[0][word0 & 0xff];
813 #if N > 1
814 crc1 = crc_braid_table[0][word1 & 0xff];
815 #if N > 2
816 crc2 = crc_braid_table[0][word2 & 0xff];
817 #if N > 3
818 crc3 = crc_braid_table[0][word3 & 0xff];
819 #if N > 4
820 crc4 = crc_braid_table[0][word4 & 0xff];
821 #if N > 5
822 crc5 = crc_braid_table[0][word5 & 0xff];
823 #endif
824 #endif
825 #endif
826 #endif
827 #endif
828 for (k = 1; k < W; k++) {
829 crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
830 #if N > 1
831 crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
832 #if N > 2
833 crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
834 #if N > 3
835 crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
836 #if N > 4
837 crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
838 #if N > 5
839 crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
840 #endif
841 #endif
842 #endif
843 #endif
844 #endif
845 }
846 }
847
848 /*
849 Process the last block, combining the CRCs of the N braids at the
850 same time.
851 */
852 crc = crc_word(crc0 ^ words[0]);
853 #if N > 1
854 crc = crc_word(crc1 ^ words[1] ^ crc);
855 #if N > 2
856 crc = crc_word(crc2 ^ words[2] ^ crc);
857 #if N > 3
858 crc = crc_word(crc3 ^ words[3] ^ crc);
859 #if N > 4
860 crc = crc_word(crc4 ^ words[4] ^ crc);
861 #if N > 5
862 crc = crc_word(crc5 ^ words[5] ^ crc);
863 #endif
864 #endif
865 #endif
866 #endif
867 #endif
868 words += N;
869 }
870 else {
871 /* Big endian. */
872
873 z_word_t crc0, word0, comb;
874 #if N > 1
875 z_word_t crc1, word1;
876 #if N > 2
877 z_word_t crc2, word2;
878 #if N > 3
879 z_word_t crc3, word3;
880 #if N > 4
881 z_word_t crc4, word4;
882 #if N > 5
883 z_word_t crc5, word5;
884 #endif
885 #endif
886 #endif
887 #endif
888 #endif
889
890 /* Initialize the CRC for each braid. */
891 crc0 = byte_swap(crc);
892 #if N > 1
893 crc1 = 0;
894 #if N > 2
895 crc2 = 0;
896 #if N > 3
897 crc3 = 0;
898 #if N > 4
899 crc4 = 0;
900 #if N > 5
901 crc5 = 0;
902 #endif
903 #endif
904 #endif
905 #endif
906 #endif
907
908 /*
909 Process the first blks-1 blocks, computing the CRCs on each braid
910 independently.
911 */
912 while (--blks) {
913 /* Load the word for each braid into registers. */
914 word0 = crc0 ^ words[0];
915 #if N > 1
916 word1 = crc1 ^ words[1];
917 #if N > 2
918 word2 = crc2 ^ words[2];
919 #if N > 3
920 word3 = crc3 ^ words[3];
921 #if N > 4
922 word4 = crc4 ^ words[4];
923 #if N > 5
924 word5 = crc5 ^ words[5];
925 #endif
926 #endif
927 #endif
928 #endif
929 #endif
930 words += N;
931
932 /* Compute and update the CRC for each word. The loop should
933 get unrolled. */
934 crc0 = crc_braid_big_table[0][word0 & 0xff];
935 #if N > 1
936 crc1 = crc_braid_big_table[0][word1 & 0xff];
937 #if N > 2
938 crc2 = crc_braid_big_table[0][word2 & 0xff];
939 #if N > 3
940 crc3 = crc_braid_big_table[0][word3 & 0xff];
941 #if N > 4
942 crc4 = crc_braid_big_table[0][word4 & 0xff];
943 #if N > 5
944 crc5 = crc_braid_big_table[0][word5 & 0xff];
945 #endif
946 #endif
947 #endif
948 #endif
949 #endif
950 for (k = 1; k < W; k++) {
951 crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
952 #if N > 1
953 crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
954 #if N > 2
955 crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
956 #if N > 3
957 crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
958 #if N > 4
959 crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
960 #if N > 5
961 crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
962 #endif
963 #endif
964 #endif
965 #endif
966 #endif
967 }
968 }
969
970 /*
971 Process the last block, combining the CRCs of the N braids at the
972 same time.
973 */
974 comb = crc_word_big(crc0 ^ words[0]);
975 #if N > 1
976 comb = crc_word_big(crc1 ^ words[1] ^ comb);
977 #if N > 2
978 comb = crc_word_big(crc2 ^ words[2] ^ comb);
979 #if N > 3
980 comb = crc_word_big(crc3 ^ words[3] ^ comb);
981 #if N > 4
982 comb = crc_word_big(crc4 ^ words[4] ^ comb);
983 #if N > 5
984 comb = crc_word_big(crc5 ^ words[5] ^ comb);
985 #endif
986 #endif
987 #endif
988 #endif
989 #endif
990 words += N;
991 crc = byte_swap(comb);
992 }
993
994 /*
995 Update the pointer to the remaining bytes to process.
996 */
997 buf = (unsigned char const *)words;
998 }
999
1000 #endif /* W */
1001
1002 /* Complete the computation of the CRC on any remaining bytes. */
1003 while (len >= 8) {
1004 len -= 8;
1005 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1006 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1007 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1008 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1009 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1010 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1011 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1012 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1013 }
1014 while (len) {
1015 len--;
1016 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1017 }
1018
1019 /* Return the CRC, post-conditioned. */
1020 return crc ^ 0xffffffff;
1021 }
1022
1023 #endif
1024
1025 /* ========================================================================= */
crc32(unsigned long crc, const unsigned char FAR *buf, uInt len)1026 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1027 uInt len) {
1028 return crc32_z(crc, buf, len);
1029 }
1030
1031 /* ========================================================================= */
crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2)1032 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1033 #ifdef DYNAMIC_CRC_TABLE
1034 once(&made, make_crc_table);
1035 #endif /* DYNAMIC_CRC_TABLE */
1036 return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1037 }
1038
1039 /* ========================================================================= */
crc32_combine(uLong crc1, uLong crc2, z_off_t len2)1040 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1041 return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1042 }
1043
1044 /* ========================================================================= */
crc32_combine_gen64(z_off64_t len2)1045 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1046 #ifdef DYNAMIC_CRC_TABLE
1047 once(&made, make_crc_table);
1048 #endif /* DYNAMIC_CRC_TABLE */
1049 return x2nmodp(len2, 3);
1050 }
1051
1052 /* ========================================================================= */
crc32_combine_gen(z_off_t len2)1053 uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1054 return crc32_combine_gen64((z_off64_t)len2);
1055 }
1056
1057 /* ========================================================================= */
crc32_combine_op(uLong crc1, uLong crc2, uLong op)1058 uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1059 return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1060 }
1061