11cb0ef41Sopenharmony_ci/* 21cb0ef41Sopenharmony_ci * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ 31cb0ef41Sopenharmony_ci * instruction. 41cb0ef41Sopenharmony_ci * 51cb0ef41Sopenharmony_ci * A white paper describing this algorithm can be found at: 61cb0ef41Sopenharmony_ci * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf 71cb0ef41Sopenharmony_ci * 81cb0ef41Sopenharmony_ci * Copyright (C) 2013 Intel Corporation. All rights reserved. 91cb0ef41Sopenharmony_ci * Authors: 101cb0ef41Sopenharmony_ci * Wajdi Feghali <wajdi.k.feghali@intel.com> 111cb0ef41Sopenharmony_ci * Jim Guilford <james.guilford@intel.com> 121cb0ef41Sopenharmony_ci * Vinodh Gopal <vinodh.gopal@intel.com> 131cb0ef41Sopenharmony_ci * Erdinc Ozturk <erdinc.ozturk@intel.com> 141cb0ef41Sopenharmony_ci * Jim Kukunas <james.t.kukunas@linux.intel.com> 151cb0ef41Sopenharmony_ci * 161cb0ef41Sopenharmony_ci * For conditions of distribution and use, see copyright notice in zlib.h 171cb0ef41Sopenharmony_ci */ 181cb0ef41Sopenharmony_ci 191cb0ef41Sopenharmony_ci#include "deflate.h" 201cb0ef41Sopenharmony_ci 211cb0ef41Sopenharmony_ci#ifdef CRC32_SIMD_SSE42_PCLMUL 221cb0ef41Sopenharmony_ci 231cb0ef41Sopenharmony_ci#include <inttypes.h> 241cb0ef41Sopenharmony_ci#include <emmintrin.h> 251cb0ef41Sopenharmony_ci#include <immintrin.h> 261cb0ef41Sopenharmony_ci#include <wmmintrin.h> 271cb0ef41Sopenharmony_ci 281cb0ef41Sopenharmony_ci#define CRC_LOAD(s) \ 291cb0ef41Sopenharmony_ci do { \ 301cb0ef41Sopenharmony_ci __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\ 311cb0ef41Sopenharmony_ci __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\ 321cb0ef41Sopenharmony_ci __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\ 331cb0ef41Sopenharmony_ci __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\ 341cb0ef41Sopenharmony_ci __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4); 351cb0ef41Sopenharmony_ci 361cb0ef41Sopenharmony_ci#define CRC_SAVE(s) \ 371cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\ 381cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\ 391cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\ 401cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\ 411cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\ 421cb0ef41Sopenharmony_ci } while (0); 431cb0ef41Sopenharmony_ci 441cb0ef41Sopenharmony_ciZLIB_INTERNAL void crc_fold_init(deflate_state *const s) 451cb0ef41Sopenharmony_ci{ 461cb0ef41Sopenharmony_ci CRC_LOAD(s) 471cb0ef41Sopenharmony_ci 481cb0ef41Sopenharmony_ci xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); 491cb0ef41Sopenharmony_ci xmm_crc1 = _mm_setzero_si128(); 501cb0ef41Sopenharmony_ci xmm_crc2 = _mm_setzero_si128(); 511cb0ef41Sopenharmony_ci xmm_crc3 = _mm_setzero_si128(); 521cb0ef41Sopenharmony_ci 531cb0ef41Sopenharmony_ci CRC_SAVE(s) 541cb0ef41Sopenharmony_ci 551cb0ef41Sopenharmony_ci s->strm->adler = 0; 561cb0ef41Sopenharmony_ci} 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_cilocal void fold_1(deflate_state *const s, 591cb0ef41Sopenharmony_ci __m128i *xmm_crc0, __m128i *xmm_crc1, 601cb0ef41Sopenharmony_ci __m128i *xmm_crc2, __m128i *xmm_crc3) 611cb0ef41Sopenharmony_ci{ 621cb0ef41Sopenharmony_ci const __m128i xmm_fold4 = _mm_set_epi32( 631cb0ef41Sopenharmony_ci 0x00000001, 0x54442bd4, 641cb0ef41Sopenharmony_ci 0x00000001, 0xc6e41596); 651cb0ef41Sopenharmony_ci 661cb0ef41Sopenharmony_ci __m128i x_tmp3; 671cb0ef41Sopenharmony_ci __m128 ps_crc0, ps_crc3, ps_res; 681cb0ef41Sopenharmony_ci 691cb0ef41Sopenharmony_ci x_tmp3 = *xmm_crc3; 701cb0ef41Sopenharmony_ci 711cb0ef41Sopenharmony_ci *xmm_crc3 = *xmm_crc0; 721cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); 731cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); 741cb0ef41Sopenharmony_ci ps_crc0 = _mm_castsi128_ps(*xmm_crc0); 751cb0ef41Sopenharmony_ci ps_crc3 = _mm_castsi128_ps(*xmm_crc3); 761cb0ef41Sopenharmony_ci ps_res = _mm_xor_ps(ps_crc0, ps_crc3); 771cb0ef41Sopenharmony_ci 781cb0ef41Sopenharmony_ci *xmm_crc0 = *xmm_crc1; 791cb0ef41Sopenharmony_ci *xmm_crc1 = *xmm_crc2; 801cb0ef41Sopenharmony_ci *xmm_crc2 = x_tmp3; 811cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_castps_si128(ps_res); 821cb0ef41Sopenharmony_ci} 831cb0ef41Sopenharmony_ci 841cb0ef41Sopenharmony_cilocal void fold_2(deflate_state *const s, 851cb0ef41Sopenharmony_ci __m128i *xmm_crc0, __m128i *xmm_crc1, 861cb0ef41Sopenharmony_ci __m128i *xmm_crc2, __m128i *xmm_crc3) 871cb0ef41Sopenharmony_ci{ 881cb0ef41Sopenharmony_ci const __m128i xmm_fold4 = _mm_set_epi32( 891cb0ef41Sopenharmony_ci 0x00000001, 0x54442bd4, 901cb0ef41Sopenharmony_ci 0x00000001, 0xc6e41596); 911cb0ef41Sopenharmony_ci 921cb0ef41Sopenharmony_ci __m128i x_tmp3, x_tmp2; 931cb0ef41Sopenharmony_ci __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20; 941cb0ef41Sopenharmony_ci 951cb0ef41Sopenharmony_ci x_tmp3 = *xmm_crc3; 961cb0ef41Sopenharmony_ci x_tmp2 = *xmm_crc2; 971cb0ef41Sopenharmony_ci 981cb0ef41Sopenharmony_ci *xmm_crc3 = *xmm_crc1; 991cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); 1001cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); 1011cb0ef41Sopenharmony_ci ps_crc3 = _mm_castsi128_ps(*xmm_crc3); 1021cb0ef41Sopenharmony_ci ps_crc1 = _mm_castsi128_ps(*xmm_crc1); 1031cb0ef41Sopenharmony_ci ps_res31= _mm_xor_ps(ps_crc3, ps_crc1); 1041cb0ef41Sopenharmony_ci 1051cb0ef41Sopenharmony_ci *xmm_crc2 = *xmm_crc0; 1061cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); 1071cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); 1081cb0ef41Sopenharmony_ci ps_crc0 = _mm_castsi128_ps(*xmm_crc0); 1091cb0ef41Sopenharmony_ci ps_crc2 = _mm_castsi128_ps(*xmm_crc2); 1101cb0ef41Sopenharmony_ci ps_res20= _mm_xor_ps(ps_crc0, ps_crc2); 1111cb0ef41Sopenharmony_ci 1121cb0ef41Sopenharmony_ci *xmm_crc0 = x_tmp2; 1131cb0ef41Sopenharmony_ci *xmm_crc1 = x_tmp3; 1141cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_castps_si128(ps_res20); 1151cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_castps_si128(ps_res31); 1161cb0ef41Sopenharmony_ci} 1171cb0ef41Sopenharmony_ci 1181cb0ef41Sopenharmony_cilocal void fold_3(deflate_state *const s, 1191cb0ef41Sopenharmony_ci __m128i *xmm_crc0, __m128i *xmm_crc1, 1201cb0ef41Sopenharmony_ci __m128i *xmm_crc2, __m128i *xmm_crc3) 1211cb0ef41Sopenharmony_ci{ 1221cb0ef41Sopenharmony_ci const __m128i xmm_fold4 = _mm_set_epi32( 1231cb0ef41Sopenharmony_ci 0x00000001, 0x54442bd4, 1241cb0ef41Sopenharmony_ci 0x00000001, 0xc6e41596); 1251cb0ef41Sopenharmony_ci 1261cb0ef41Sopenharmony_ci __m128i x_tmp3; 1271cb0ef41Sopenharmony_ci __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10; 1281cb0ef41Sopenharmony_ci 1291cb0ef41Sopenharmony_ci x_tmp3 = *xmm_crc3; 1301cb0ef41Sopenharmony_ci 1311cb0ef41Sopenharmony_ci *xmm_crc3 = *xmm_crc2; 1321cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); 1331cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); 1341cb0ef41Sopenharmony_ci ps_crc2 = _mm_castsi128_ps(*xmm_crc2); 1351cb0ef41Sopenharmony_ci ps_crc3 = _mm_castsi128_ps(*xmm_crc3); 1361cb0ef41Sopenharmony_ci ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3); 1371cb0ef41Sopenharmony_ci 1381cb0ef41Sopenharmony_ci *xmm_crc2 = *xmm_crc1; 1391cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); 1401cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); 1411cb0ef41Sopenharmony_ci ps_crc1 = _mm_castsi128_ps(*xmm_crc1); 1421cb0ef41Sopenharmony_ci ps_crc2 = _mm_castsi128_ps(*xmm_crc2); 1431cb0ef41Sopenharmony_ci ps_res21= _mm_xor_ps(ps_crc1, ps_crc2); 1441cb0ef41Sopenharmony_ci 1451cb0ef41Sopenharmony_ci *xmm_crc1 = *xmm_crc0; 1461cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); 1471cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); 1481cb0ef41Sopenharmony_ci ps_crc0 = _mm_castsi128_ps(*xmm_crc0); 1491cb0ef41Sopenharmony_ci ps_crc1 = _mm_castsi128_ps(*xmm_crc1); 1501cb0ef41Sopenharmony_ci ps_res10= _mm_xor_ps(ps_crc0, ps_crc1); 1511cb0ef41Sopenharmony_ci 1521cb0ef41Sopenharmony_ci *xmm_crc0 = x_tmp3; 1531cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_castps_si128(ps_res10); 1541cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_castps_si128(ps_res21); 1551cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_castps_si128(ps_res32); 1561cb0ef41Sopenharmony_ci} 1571cb0ef41Sopenharmony_ci 1581cb0ef41Sopenharmony_cilocal void fold_4(deflate_state *const s, 1591cb0ef41Sopenharmony_ci __m128i *xmm_crc0, __m128i *xmm_crc1, 1601cb0ef41Sopenharmony_ci __m128i *xmm_crc2, __m128i *xmm_crc3) 1611cb0ef41Sopenharmony_ci{ 1621cb0ef41Sopenharmony_ci const __m128i xmm_fold4 = _mm_set_epi32( 1631cb0ef41Sopenharmony_ci 0x00000001, 0x54442bd4, 1641cb0ef41Sopenharmony_ci 0x00000001, 0xc6e41596); 1651cb0ef41Sopenharmony_ci 1661cb0ef41Sopenharmony_ci __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3; 1671cb0ef41Sopenharmony_ci __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3; 1681cb0ef41Sopenharmony_ci __m128 ps_t0, ps_t1, ps_t2, ps_t3; 1691cb0ef41Sopenharmony_ci __m128 ps_res0, ps_res1, ps_res2, ps_res3; 1701cb0ef41Sopenharmony_ci 1711cb0ef41Sopenharmony_ci x_tmp0 = *xmm_crc0; 1721cb0ef41Sopenharmony_ci x_tmp1 = *xmm_crc1; 1731cb0ef41Sopenharmony_ci x_tmp2 = *xmm_crc2; 1741cb0ef41Sopenharmony_ci x_tmp3 = *xmm_crc3; 1751cb0ef41Sopenharmony_ci 1761cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); 1771cb0ef41Sopenharmony_ci x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10); 1781cb0ef41Sopenharmony_ci ps_crc0 = _mm_castsi128_ps(*xmm_crc0); 1791cb0ef41Sopenharmony_ci ps_t0 = _mm_castsi128_ps(x_tmp0); 1801cb0ef41Sopenharmony_ci ps_res0 = _mm_xor_ps(ps_crc0, ps_t0); 1811cb0ef41Sopenharmony_ci 1821cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); 1831cb0ef41Sopenharmony_ci x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10); 1841cb0ef41Sopenharmony_ci ps_crc1 = _mm_castsi128_ps(*xmm_crc1); 1851cb0ef41Sopenharmony_ci ps_t1 = _mm_castsi128_ps(x_tmp1); 1861cb0ef41Sopenharmony_ci ps_res1 = _mm_xor_ps(ps_crc1, ps_t1); 1871cb0ef41Sopenharmony_ci 1881cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); 1891cb0ef41Sopenharmony_ci x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10); 1901cb0ef41Sopenharmony_ci ps_crc2 = _mm_castsi128_ps(*xmm_crc2); 1911cb0ef41Sopenharmony_ci ps_t2 = _mm_castsi128_ps(x_tmp2); 1921cb0ef41Sopenharmony_ci ps_res2 = _mm_xor_ps(ps_crc2, ps_t2); 1931cb0ef41Sopenharmony_ci 1941cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01); 1951cb0ef41Sopenharmony_ci x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10); 1961cb0ef41Sopenharmony_ci ps_crc3 = _mm_castsi128_ps(*xmm_crc3); 1971cb0ef41Sopenharmony_ci ps_t3 = _mm_castsi128_ps(x_tmp3); 1981cb0ef41Sopenharmony_ci ps_res3 = _mm_xor_ps(ps_crc3, ps_t3); 1991cb0ef41Sopenharmony_ci 2001cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_castps_si128(ps_res0); 2011cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_castps_si128(ps_res1); 2021cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_castps_si128(ps_res2); 2031cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_castps_si128(ps_res3); 2041cb0ef41Sopenharmony_ci} 2051cb0ef41Sopenharmony_ci 2061cb0ef41Sopenharmony_cilocal const unsigned zalign(32) pshufb_shf_table[60] = { 2071cb0ef41Sopenharmony_ci 0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */ 2081cb0ef41Sopenharmony_ci 0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */ 2091cb0ef41Sopenharmony_ci 0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */ 2101cb0ef41Sopenharmony_ci 0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */ 2111cb0ef41Sopenharmony_ci 0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */ 2121cb0ef41Sopenharmony_ci 0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */ 2131cb0ef41Sopenharmony_ci 0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */ 2141cb0ef41Sopenharmony_ci 0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */ 2151cb0ef41Sopenharmony_ci 0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */ 2161cb0ef41Sopenharmony_ci 0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/ 2171cb0ef41Sopenharmony_ci 0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/ 2181cb0ef41Sopenharmony_ci 0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/ 2191cb0ef41Sopenharmony_ci 0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/ 2201cb0ef41Sopenharmony_ci 0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/ 2211cb0ef41Sopenharmony_ci 0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/ 2221cb0ef41Sopenharmony_ci}; 2231cb0ef41Sopenharmony_ci 2241cb0ef41Sopenharmony_cilocal void partial_fold(deflate_state *const s, const size_t len, 2251cb0ef41Sopenharmony_ci __m128i *xmm_crc0, __m128i *xmm_crc1, 2261cb0ef41Sopenharmony_ci __m128i *xmm_crc2, __m128i *xmm_crc3, 2271cb0ef41Sopenharmony_ci __m128i *xmm_crc_part) 2281cb0ef41Sopenharmony_ci{ 2291cb0ef41Sopenharmony_ci 2301cb0ef41Sopenharmony_ci const __m128i xmm_fold4 = _mm_set_epi32( 2311cb0ef41Sopenharmony_ci 0x00000001, 0x54442bd4, 2321cb0ef41Sopenharmony_ci 0x00000001, 0xc6e41596); 2331cb0ef41Sopenharmony_ci const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080); 2341cb0ef41Sopenharmony_ci 2351cb0ef41Sopenharmony_ci __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3; 2361cb0ef41Sopenharmony_ci __m128i xmm_a0_0, xmm_a0_1; 2371cb0ef41Sopenharmony_ci __m128 ps_crc3, psa0_0, psa0_1, ps_res; 2381cb0ef41Sopenharmony_ci 2391cb0ef41Sopenharmony_ci xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1)); 2401cb0ef41Sopenharmony_ci xmm_shr = xmm_shl; 2411cb0ef41Sopenharmony_ci xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3); 2421cb0ef41Sopenharmony_ci 2431cb0ef41Sopenharmony_ci xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl); 2441cb0ef41Sopenharmony_ci 2451cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr); 2461cb0ef41Sopenharmony_ci xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl); 2471cb0ef41Sopenharmony_ci *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1); 2481cb0ef41Sopenharmony_ci 2491cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr); 2501cb0ef41Sopenharmony_ci xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl); 2511cb0ef41Sopenharmony_ci *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2); 2521cb0ef41Sopenharmony_ci 2531cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr); 2541cb0ef41Sopenharmony_ci xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl); 2551cb0ef41Sopenharmony_ci *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3); 2561cb0ef41Sopenharmony_ci 2571cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr); 2581cb0ef41Sopenharmony_ci *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl); 2591cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part); 2601cb0ef41Sopenharmony_ci 2611cb0ef41Sopenharmony_ci xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10); 2621cb0ef41Sopenharmony_ci xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01); 2631cb0ef41Sopenharmony_ci 2641cb0ef41Sopenharmony_ci ps_crc3 = _mm_castsi128_ps(*xmm_crc3); 2651cb0ef41Sopenharmony_ci psa0_0 = _mm_castsi128_ps(xmm_a0_0); 2661cb0ef41Sopenharmony_ci psa0_1 = _mm_castsi128_ps(xmm_a0_1); 2671cb0ef41Sopenharmony_ci 2681cb0ef41Sopenharmony_ci ps_res = _mm_xor_ps(ps_crc3, psa0_0); 2691cb0ef41Sopenharmony_ci ps_res = _mm_xor_ps(ps_res, psa0_1); 2701cb0ef41Sopenharmony_ci 2711cb0ef41Sopenharmony_ci *xmm_crc3 = _mm_castps_si128(ps_res); 2721cb0ef41Sopenharmony_ci} 2731cb0ef41Sopenharmony_ci 2741cb0ef41Sopenharmony_ciZLIB_INTERNAL void crc_fold_copy(deflate_state *const s, 2751cb0ef41Sopenharmony_ci unsigned char *dst, const unsigned char *src, long len) 2761cb0ef41Sopenharmony_ci{ 2771cb0ef41Sopenharmony_ci unsigned long algn_diff; 2781cb0ef41Sopenharmony_ci __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; 2791cb0ef41Sopenharmony_ci 2801cb0ef41Sopenharmony_ci CRC_LOAD(s) 2811cb0ef41Sopenharmony_ci 2821cb0ef41Sopenharmony_ci if (len < 16) { 2831cb0ef41Sopenharmony_ci if (len == 0) 2841cb0ef41Sopenharmony_ci return; 2851cb0ef41Sopenharmony_ci goto partial; 2861cb0ef41Sopenharmony_ci } 2871cb0ef41Sopenharmony_ci 2881cb0ef41Sopenharmony_ci algn_diff = (0 - (uintptr_t)src) & 0xF; 2891cb0ef41Sopenharmony_ci if (algn_diff) { 2901cb0ef41Sopenharmony_ci xmm_crc_part = _mm_loadu_si128((__m128i *)src); 2911cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst, xmm_crc_part); 2921cb0ef41Sopenharmony_ci 2931cb0ef41Sopenharmony_ci dst += algn_diff; 2941cb0ef41Sopenharmony_ci src += algn_diff; 2951cb0ef41Sopenharmony_ci len -= algn_diff; 2961cb0ef41Sopenharmony_ci 2971cb0ef41Sopenharmony_ci partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, 2981cb0ef41Sopenharmony_ci &xmm_crc_part); 2991cb0ef41Sopenharmony_ci } 3001cb0ef41Sopenharmony_ci 3011cb0ef41Sopenharmony_ci while ((len -= 64) >= 0) { 3021cb0ef41Sopenharmony_ci xmm_t0 = _mm_load_si128((__m128i *)src); 3031cb0ef41Sopenharmony_ci xmm_t1 = _mm_load_si128((__m128i *)src + 1); 3041cb0ef41Sopenharmony_ci xmm_t2 = _mm_load_si128((__m128i *)src + 2); 3051cb0ef41Sopenharmony_ci xmm_t3 = _mm_load_si128((__m128i *)src + 3); 3061cb0ef41Sopenharmony_ci 3071cb0ef41Sopenharmony_ci fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); 3081cb0ef41Sopenharmony_ci 3091cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst, xmm_t0); 3101cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); 3111cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); 3121cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); 3131cb0ef41Sopenharmony_ci 3141cb0ef41Sopenharmony_ci xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); 3151cb0ef41Sopenharmony_ci xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); 3161cb0ef41Sopenharmony_ci xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); 3171cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); 3181cb0ef41Sopenharmony_ci 3191cb0ef41Sopenharmony_ci src += 64; 3201cb0ef41Sopenharmony_ci dst += 64; 3211cb0ef41Sopenharmony_ci } 3221cb0ef41Sopenharmony_ci 3231cb0ef41Sopenharmony_ci /* 3241cb0ef41Sopenharmony_ci * len = num bytes left - 64 3251cb0ef41Sopenharmony_ci */ 3261cb0ef41Sopenharmony_ci if (len + 16 >= 0) { 3271cb0ef41Sopenharmony_ci len += 16; 3281cb0ef41Sopenharmony_ci 3291cb0ef41Sopenharmony_ci xmm_t0 = _mm_load_si128((__m128i *)src); 3301cb0ef41Sopenharmony_ci xmm_t1 = _mm_load_si128((__m128i *)src + 1); 3311cb0ef41Sopenharmony_ci xmm_t2 = _mm_load_si128((__m128i *)src + 2); 3321cb0ef41Sopenharmony_ci 3331cb0ef41Sopenharmony_ci fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); 3341cb0ef41Sopenharmony_ci 3351cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst, xmm_t0); 3361cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); 3371cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); 3381cb0ef41Sopenharmony_ci 3391cb0ef41Sopenharmony_ci xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); 3401cb0ef41Sopenharmony_ci xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); 3411cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); 3421cb0ef41Sopenharmony_ci 3431cb0ef41Sopenharmony_ci if (len == 0) 3441cb0ef41Sopenharmony_ci goto done; 3451cb0ef41Sopenharmony_ci 3461cb0ef41Sopenharmony_ci dst += 48; 3471cb0ef41Sopenharmony_ci src += 48; 3481cb0ef41Sopenharmony_ci } else if (len + 32 >= 0) { 3491cb0ef41Sopenharmony_ci len += 32; 3501cb0ef41Sopenharmony_ci 3511cb0ef41Sopenharmony_ci xmm_t0 = _mm_load_si128((__m128i *)src); 3521cb0ef41Sopenharmony_ci xmm_t1 = _mm_load_si128((__m128i *)src + 1); 3531cb0ef41Sopenharmony_ci 3541cb0ef41Sopenharmony_ci fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); 3551cb0ef41Sopenharmony_ci 3561cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst, xmm_t0); 3571cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); 3581cb0ef41Sopenharmony_ci 3591cb0ef41Sopenharmony_ci xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); 3601cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); 3611cb0ef41Sopenharmony_ci 3621cb0ef41Sopenharmony_ci if (len == 0) 3631cb0ef41Sopenharmony_ci goto done; 3641cb0ef41Sopenharmony_ci 3651cb0ef41Sopenharmony_ci dst += 32; 3661cb0ef41Sopenharmony_ci src += 32; 3671cb0ef41Sopenharmony_ci } else if (len + 48 >= 0) { 3681cb0ef41Sopenharmony_ci len += 48; 3691cb0ef41Sopenharmony_ci 3701cb0ef41Sopenharmony_ci xmm_t0 = _mm_load_si128((__m128i *)src); 3711cb0ef41Sopenharmony_ci 3721cb0ef41Sopenharmony_ci fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); 3731cb0ef41Sopenharmony_ci 3741cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst, xmm_t0); 3751cb0ef41Sopenharmony_ci 3761cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); 3771cb0ef41Sopenharmony_ci 3781cb0ef41Sopenharmony_ci if (len == 0) 3791cb0ef41Sopenharmony_ci goto done; 3801cb0ef41Sopenharmony_ci 3811cb0ef41Sopenharmony_ci dst += 16; 3821cb0ef41Sopenharmony_ci src += 16; 3831cb0ef41Sopenharmony_ci } else { 3841cb0ef41Sopenharmony_ci len += 64; 3851cb0ef41Sopenharmony_ci if (len == 0) 3861cb0ef41Sopenharmony_ci goto done; 3871cb0ef41Sopenharmony_ci } 3881cb0ef41Sopenharmony_ci 3891cb0ef41Sopenharmony_cipartial: 3901cb0ef41Sopenharmony_ci 3911cb0ef41Sopenharmony_ci#if defined(_MSC_VER) 3921cb0ef41Sopenharmony_ci /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */ 3931cb0ef41Sopenharmony_ci { 3941cb0ef41Sopenharmony_ci int32_t parts[4] = {0, 0, 0, 0}; 3951cb0ef41Sopenharmony_ci memcpy(&parts, src, len); 3961cb0ef41Sopenharmony_ci xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]); 3971cb0ef41Sopenharmony_ci } 3981cb0ef41Sopenharmony_ci#else 3991cb0ef41Sopenharmony_ci { 4001cb0ef41Sopenharmony_ci int64_t parts[2] = {0, 0}; 4011cb0ef41Sopenharmony_ci memcpy(&parts, src, len); 4021cb0ef41Sopenharmony_ci xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]); 4031cb0ef41Sopenharmony_ci } 4041cb0ef41Sopenharmony_ci#endif 4051cb0ef41Sopenharmony_ci 4061cb0ef41Sopenharmony_ci _mm_storeu_si128((__m128i *)dst, xmm_crc_part); 4071cb0ef41Sopenharmony_ci partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, 4081cb0ef41Sopenharmony_ci &xmm_crc_part); 4091cb0ef41Sopenharmony_cidone: 4101cb0ef41Sopenharmony_ci CRC_SAVE(s) 4111cb0ef41Sopenharmony_ci} 4121cb0ef41Sopenharmony_ci 4131cb0ef41Sopenharmony_cilocal const unsigned zalign(16) crc_k[] = { 4141cb0ef41Sopenharmony_ci 0xccaa009e, 0x00000000, /* rk1 */ 4151cb0ef41Sopenharmony_ci 0x751997d0, 0x00000001, /* rk2 */ 4161cb0ef41Sopenharmony_ci 0xccaa009e, 0x00000000, /* rk5 */ 4171cb0ef41Sopenharmony_ci 0x63cd6124, 0x00000001, /* rk6 */ 4181cb0ef41Sopenharmony_ci 0xf7011640, 0x00000001, /* rk7 */ 4191cb0ef41Sopenharmony_ci 0xdb710640, 0x00000001 /* rk8 */ 4201cb0ef41Sopenharmony_ci}; 4211cb0ef41Sopenharmony_ci 4221cb0ef41Sopenharmony_cilocal const unsigned zalign(16) crc_mask[4] = { 4231cb0ef41Sopenharmony_ci 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 4241cb0ef41Sopenharmony_ci}; 4251cb0ef41Sopenharmony_ci 4261cb0ef41Sopenharmony_cilocal const unsigned zalign(16) crc_mask2[4] = { 4271cb0ef41Sopenharmony_ci 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF 4281cb0ef41Sopenharmony_ci}; 4291cb0ef41Sopenharmony_ci 4301cb0ef41Sopenharmony_ciunsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) 4311cb0ef41Sopenharmony_ci{ 4321cb0ef41Sopenharmony_ci const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask); 4331cb0ef41Sopenharmony_ci const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2); 4341cb0ef41Sopenharmony_ci 4351cb0ef41Sopenharmony_ci unsigned crc; 4361cb0ef41Sopenharmony_ci __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold; 4371cb0ef41Sopenharmony_ci 4381cb0ef41Sopenharmony_ci __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0); 4391cb0ef41Sopenharmony_ci __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1); 4401cb0ef41Sopenharmony_ci __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2); 4411cb0ef41Sopenharmony_ci __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3); 4421cb0ef41Sopenharmony_ci 4431cb0ef41Sopenharmony_ci /* 4441cb0ef41Sopenharmony_ci * k1 4451cb0ef41Sopenharmony_ci */ 4461cb0ef41Sopenharmony_ci crc_fold = _mm_load_si128((__m128i *)crc_k); 4471cb0ef41Sopenharmony_ci 4481cb0ef41Sopenharmony_ci x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10); 4491cb0ef41Sopenharmony_ci xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01); 4501cb0ef41Sopenharmony_ci xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0); 4511cb0ef41Sopenharmony_ci xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0); 4521cb0ef41Sopenharmony_ci 4531cb0ef41Sopenharmony_ci x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10); 4541cb0ef41Sopenharmony_ci xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01); 4551cb0ef41Sopenharmony_ci xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1); 4561cb0ef41Sopenharmony_ci xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1); 4571cb0ef41Sopenharmony_ci 4581cb0ef41Sopenharmony_ci x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10); 4591cb0ef41Sopenharmony_ci xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01); 4601cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2); 4611cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); 4621cb0ef41Sopenharmony_ci 4631cb0ef41Sopenharmony_ci /* 4641cb0ef41Sopenharmony_ci * k5 4651cb0ef41Sopenharmony_ci */ 4661cb0ef41Sopenharmony_ci crc_fold = _mm_load_si128((__m128i *)crc_k + 1); 4671cb0ef41Sopenharmony_ci 4681cb0ef41Sopenharmony_ci xmm_crc0 = xmm_crc3; 4691cb0ef41Sopenharmony_ci xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0); 4701cb0ef41Sopenharmony_ci xmm_crc0 = _mm_srli_si128(xmm_crc0, 8); 4711cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0); 4721cb0ef41Sopenharmony_ci 4731cb0ef41Sopenharmony_ci xmm_crc0 = xmm_crc3; 4741cb0ef41Sopenharmony_ci xmm_crc3 = _mm_slli_si128(xmm_crc3, 4); 4751cb0ef41Sopenharmony_ci xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10); 4761cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0); 4771cb0ef41Sopenharmony_ci xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2); 4781cb0ef41Sopenharmony_ci 4791cb0ef41Sopenharmony_ci /* 4801cb0ef41Sopenharmony_ci * k7 4811cb0ef41Sopenharmony_ci */ 4821cb0ef41Sopenharmony_ci xmm_crc1 = xmm_crc3; 4831cb0ef41Sopenharmony_ci xmm_crc2 = xmm_crc3; 4841cb0ef41Sopenharmony_ci crc_fold = _mm_load_si128((__m128i *)crc_k + 2); 4851cb0ef41Sopenharmony_ci 4861cb0ef41Sopenharmony_ci xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0); 4871cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); 4881cb0ef41Sopenharmony_ci xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask); 4891cb0ef41Sopenharmony_ci 4901cb0ef41Sopenharmony_ci xmm_crc2 = xmm_crc3; 4911cb0ef41Sopenharmony_ci xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10); 4921cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); 4931cb0ef41Sopenharmony_ci xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1); 4941cb0ef41Sopenharmony_ci 4951cb0ef41Sopenharmony_ci crc = _mm_extract_epi32(xmm_crc3, 2); 4961cb0ef41Sopenharmony_ci return ~crc; 4971cb0ef41Sopenharmony_ci} 4981cb0ef41Sopenharmony_ci 4991cb0ef41Sopenharmony_ci#endif /* CRC32_SIMD_SSE42_PCLMUL */ 500