11cb0ef41Sopenharmony_ciFrom 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001 21cb0ef41Sopenharmony_ciFrom: Jia Liu <jia3.liu@intel.com> 31cb0ef41Sopenharmony_ciDate: Thu, 30 Mar 2023 11:13:16 +0800 41cb0ef41Sopenharmony_ciSubject: [PATCH] Enabled AVX512 for CRC32 51cb0ef41Sopenharmony_ci 61cb0ef41Sopenharmony_ciEnabled AVX512 for CRC32 that provide best of known performance 71cb0ef41Sopenharmony_cibeyond current SSE SIMD optimization. It enables multiple folding 81cb0ef41Sopenharmony_cioperations and AVX512 new instructions, providing ~3.5X CRC32 91cb0ef41Sopenharmony_ciperformance and ~3.7% gain on Zlib_bench gzip performance. 101cb0ef41Sopenharmony_ci--- 111cb0ef41Sopenharmony_ci CMakeLists.txt | 8 +- 121cb0ef41Sopenharmony_ci cpu_features.c | 9 +++ 131cb0ef41Sopenharmony_ci cpu_features.h | 1 + 141cb0ef41Sopenharmony_ci crc32.c | 14 +++- 151cb0ef41Sopenharmony_ci crc32_simd.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++- 161cb0ef41Sopenharmony_ci crc32_simd.h | 6 ++ 171cb0ef41Sopenharmony_ci 6 files changed, 230 insertions(+), 6 deletions(-) 181cb0ef41Sopenharmony_ci 191cb0ef41Sopenharmony_cidiff --git a/CMakeLists.txt b/CMakeLists.txt 201cb0ef41Sopenharmony_ciindex f06e193..d45b902 100644 211cb0ef41Sopenharmony_ci--- a/CMakeLists.txt 221cb0ef41Sopenharmony_ci+++ b/CMakeLists.txt 231cb0ef41Sopenharmony_ci@@ -22,6 +22,7 @@ check_include_file(stdint.h HAVE_STDINT_H) 241cb0ef41Sopenharmony_ci check_include_file(stddef.h HAVE_STDDEF_H) 251cb0ef41Sopenharmony_ci 261cb0ef41Sopenharmony_ci option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF) 271cb0ef41Sopenharmony_ci+option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF) 281cb0ef41Sopenharmony_ci 291cb0ef41Sopenharmony_ci # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx) 301cb0ef41Sopenharmony_ci # and architectures (e.g. Arm). 311cb0ef41Sopenharmony_ci@@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS) 321cb0ef41Sopenharmony_ci add_definitions(-DADLER32_SIMD_SSSE3) 331cb0ef41Sopenharmony_ci add_definitions(-DINFLATE_CHUNK_READ_64LE) 341cb0ef41Sopenharmony_ci add_definitions(-DCRC32_SIMD_SSE42_PCLMUL) 351cb0ef41Sopenharmony_ci+ if (ENABLE_SIMD_AVX512) 361cb0ef41Sopenharmony_ci+ add_definitions(-DCRC32_SIMD_AVX512_PCLMUL) 371cb0ef41Sopenharmony_ci+ add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul) 381cb0ef41Sopenharmony_ci+ else() 391cb0ef41Sopenharmony_ci+ add_compile_options(-msse4.2 -mpclmul) 401cb0ef41Sopenharmony_ci+ endif() 411cb0ef41Sopenharmony_ci add_definitions(-DDEFLATE_SLIDE_HASH_SSE2) 421cb0ef41Sopenharmony_ci- add_compile_options(-msse4.2 -mpclmul) 431cb0ef41Sopenharmony_ci # Required by CPU features detection code. 441cb0ef41Sopenharmony_ci add_definitions(-DX86_NOT_WINDOWS) 451cb0ef41Sopenharmony_ci # Apparently some environments (e.g. CentOS) require to explicitly link 461cb0ef41Sopenharmony_cidiff --git a/cpu_features.c b/cpu_features.c 471cb0ef41Sopenharmony_ciindex 877d5f2..ac6ee88 100644 481cb0ef41Sopenharmony_ci--- a/cpu_features.c 491cb0ef41Sopenharmony_ci+++ b/cpu_features.c 501cb0ef41Sopenharmony_ci@@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0; 511cb0ef41Sopenharmony_ci int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0; 521cb0ef41Sopenharmony_ci int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0; 531cb0ef41Sopenharmony_ci int ZLIB_INTERNAL x86_cpu_enable_simd = 0; 541cb0ef41Sopenharmony_ci+int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0; 551cb0ef41Sopenharmony_ci 561cb0ef41Sopenharmony_ci #ifndef CPU_NO_SIMD 571cb0ef41Sopenharmony_ci 581cb0ef41Sopenharmony_ci@@ -138,6 +139,10 @@ static void _cpu_check_features(void) 591cb0ef41Sopenharmony_ci /* On x86 we simply use a instruction to check the CPU features. 601cb0ef41Sopenharmony_ci * (i.e. CPUID). 611cb0ef41Sopenharmony_ci */ 621cb0ef41Sopenharmony_ci+#ifdef CRC32_SIMD_AVX512_PCLMUL 631cb0ef41Sopenharmony_ci+#include <immintrin.h> 641cb0ef41Sopenharmony_ci+#include <xsaveintrin.h> 651cb0ef41Sopenharmony_ci+#endif 661cb0ef41Sopenharmony_ci static void _cpu_check_features(void) 671cb0ef41Sopenharmony_ci { 681cb0ef41Sopenharmony_ci int x86_cpu_has_sse2; 691cb0ef41Sopenharmony_ci@@ -164,6 +169,10 @@ static void _cpu_check_features(void) 701cb0ef41Sopenharmony_ci x86_cpu_enable_simd = x86_cpu_has_sse2 && 711cb0ef41Sopenharmony_ci x86_cpu_has_sse42 && 721cb0ef41Sopenharmony_ci x86_cpu_has_pclmulqdq; 731cb0ef41Sopenharmony_ci+ 741cb0ef41Sopenharmony_ci+#ifdef CRC32_SIMD_AVX512_PCLMUL 751cb0ef41Sopenharmony_ci+ x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040; 761cb0ef41Sopenharmony_ci+#endif 771cb0ef41Sopenharmony_ci } 781cb0ef41Sopenharmony_ci #endif 791cb0ef41Sopenharmony_ci #endif 801cb0ef41Sopenharmony_cidiff --git a/cpu_features.h b/cpu_features.h 811cb0ef41Sopenharmony_ciindex 279246c..aed3e83 100644 821cb0ef41Sopenharmony_ci--- a/cpu_features.h 831cb0ef41Sopenharmony_ci+++ b/cpu_features.h 841cb0ef41Sopenharmony_ci@@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull; 851cb0ef41Sopenharmony_ci extern int x86_cpu_enable_sse2; 861cb0ef41Sopenharmony_ci extern int x86_cpu_enable_ssse3; 871cb0ef41Sopenharmony_ci extern int x86_cpu_enable_simd; 881cb0ef41Sopenharmony_ci+extern int x86_cpu_enable_avx512; 891cb0ef41Sopenharmony_ci 901cb0ef41Sopenharmony_ci void cpu_check_features(void); 911cb0ef41Sopenharmony_cidiff --git a/crc32.c b/crc32.c 921cb0ef41Sopenharmony_ciindex 4486098..acb6972 100644 931cb0ef41Sopenharmony_ci--- a/crc32.c 941cb0ef41Sopenharmony_ci+++ b/crc32.c 951cb0ef41Sopenharmony_ci@@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) 961cb0ef41Sopenharmony_ci } 971cb0ef41Sopenharmony_ci 981cb0ef41Sopenharmony_ci #endif 991cb0ef41Sopenharmony_ci-#if defined(CRC32_SIMD_SSE42_PCLMUL) 1001cb0ef41Sopenharmony_ci+#if defined(CRC32_SIMD_AVX512_PCLMUL) 1011cb0ef41Sopenharmony_ci+ if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) { 1021cb0ef41Sopenharmony_ci+ /* crc32 64-byte chunks */ 1031cb0ef41Sopenharmony_ci+ z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK; 1041cb0ef41Sopenharmony_ci+ crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc); 1051cb0ef41Sopenharmony_ci+ /* check remaining data */ 1061cb0ef41Sopenharmony_ci+ len -= chunk_size; 1071cb0ef41Sopenharmony_ci+ if (!len) 1081cb0ef41Sopenharmony_ci+ return crc; 1091cb0ef41Sopenharmony_ci+ /* Fall into the default crc32 for the remaining data. */ 1101cb0ef41Sopenharmony_ci+ buf += chunk_size; 1111cb0ef41Sopenharmony_ci+ } 1121cb0ef41Sopenharmony_ci+#elif defined(CRC32_SIMD_SSE42_PCLMUL) 1131cb0ef41Sopenharmony_ci if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) { 1141cb0ef41Sopenharmony_ci /* crc32 16-byte chunks */ 1151cb0ef41Sopenharmony_ci z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK; 1161cb0ef41Sopenharmony_cidiff --git a/crc32_simd.c b/crc32_simd.c 1171cb0ef41Sopenharmony_ciindex d80beba..7428270 100644 1181cb0ef41Sopenharmony_ci--- a/crc32_simd.c 1191cb0ef41Sopenharmony_ci+++ b/crc32_simd.c 1201cb0ef41Sopenharmony_ci@@ -6,17 +6,207 @@ 1211cb0ef41Sopenharmony_ci */ 1221cb0ef41Sopenharmony_ci 1231cb0ef41Sopenharmony_ci #include "crc32_simd.h" 1241cb0ef41Sopenharmony_ci- 1251cb0ef41Sopenharmony_ci-#if defined(CRC32_SIMD_SSE42_PCLMUL) 1261cb0ef41Sopenharmony_ci+#if defined(CRC32_SIMD_AVX512_PCLMUL) 1271cb0ef41Sopenharmony_ci 1281cb0ef41Sopenharmony_ci /* 1291cb0ef41Sopenharmony_ci- * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer 1301cb0ef41Sopenharmony_ci- * length must be at least 64, and a multiple of 16. Based on: 1311cb0ef41Sopenharmony_ci+ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer 1321cb0ef41Sopenharmony_ci+ * length must be at least 256, and a multiple of 64. Based on: 1331cb0ef41Sopenharmony_ci * 1341cb0ef41Sopenharmony_ci * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" 1351cb0ef41Sopenharmony_ci * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 1361cb0ef41Sopenharmony_ci */ 1371cb0ef41Sopenharmony_ci 1381cb0ef41Sopenharmony_ci+#include <emmintrin.h> 1391cb0ef41Sopenharmony_ci+#include <smmintrin.h> 1401cb0ef41Sopenharmony_ci+#include <wmmintrin.h> 1411cb0ef41Sopenharmony_ci+#include <immintrin.h> 1421cb0ef41Sopenharmony_ci+ 1431cb0ef41Sopenharmony_ci+uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */ 1441cb0ef41Sopenharmony_ci+ const unsigned char *buf, 1451cb0ef41Sopenharmony_ci+ z_size_t len, 1461cb0ef41Sopenharmony_ci+ uint32_t crc) 1471cb0ef41Sopenharmony_ci+{ 1481cb0ef41Sopenharmony_ci+ /* 1491cb0ef41Sopenharmony_ci+ * Definitions of the bit-reflected domain constants k1,k2,k3,k4 1501cb0ef41Sopenharmony_ci+ * are similar to those given at the end of the paper, and remaining 1511cb0ef41Sopenharmony_ci+ * constants and CRC32+Barrett polynomials remain unchanged. 1521cb0ef41Sopenharmony_ci+ * 1531cb0ef41Sopenharmony_ci+ * Replace the index of x from 128 to 512. As follows: 1541cb0ef41Sopenharmony_ci+ * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a 1551cb0ef41Sopenharmony_ci+ * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430 1561cb0ef41Sopenharmony_ci+ * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 1571cb0ef41Sopenharmony_ci+ * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 1581cb0ef41Sopenharmony_ci+ */ 1591cb0ef41Sopenharmony_ci+ static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430, 1601cb0ef41Sopenharmony_ci+ 0x011542778a, 0x01322d1430, 1611cb0ef41Sopenharmony_ci+ 0x011542778a, 0x01322d1430, 1621cb0ef41Sopenharmony_ci+ 0x011542778a, 0x01322d1430 }; 1631cb0ef41Sopenharmony_ci+ static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596, 1641cb0ef41Sopenharmony_ci+ 0x0154442bd4, 0x01c6e41596, 1651cb0ef41Sopenharmony_ci+ 0x0154442bd4, 0x01c6e41596, 1661cb0ef41Sopenharmony_ci+ 0x0154442bd4, 0x01c6e41596 }; 1671cb0ef41Sopenharmony_ci+ static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e }; 1681cb0ef41Sopenharmony_ci+ static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 }; 1691cb0ef41Sopenharmony_ci+ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; 1701cb0ef41Sopenharmony_ci+ __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; 1711cb0ef41Sopenharmony_ci+ __m128i a0, a1, a2, a3; 1721cb0ef41Sopenharmony_ci+ 1731cb0ef41Sopenharmony_ci+ /* 1741cb0ef41Sopenharmony_ci+ * There's at least one block of 256. 1751cb0ef41Sopenharmony_ci+ */ 1761cb0ef41Sopenharmony_ci+ x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); 1771cb0ef41Sopenharmony_ci+ x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); 1781cb0ef41Sopenharmony_ci+ x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); 1791cb0ef41Sopenharmony_ci+ x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); 1801cb0ef41Sopenharmony_ci+ 1811cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); 1821cb0ef41Sopenharmony_ci+ 1831cb0ef41Sopenharmony_ci+ x0 = _mm512_load_si512((__m512i *)k1k2); 1841cb0ef41Sopenharmony_ci+ 1851cb0ef41Sopenharmony_ci+ buf += 256; 1861cb0ef41Sopenharmony_ci+ len -= 256; 1871cb0ef41Sopenharmony_ci+ 1881cb0ef41Sopenharmony_ci+ /* 1891cb0ef41Sopenharmony_ci+ * Parallel fold blocks of 256, if any. 1901cb0ef41Sopenharmony_ci+ */ 1911cb0ef41Sopenharmony_ci+ while (len >= 256) 1921cb0ef41Sopenharmony_ci+ { 1931cb0ef41Sopenharmony_ci+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 1941cb0ef41Sopenharmony_ci+ x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); 1951cb0ef41Sopenharmony_ci+ x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); 1961cb0ef41Sopenharmony_ci+ x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); 1971cb0ef41Sopenharmony_ci+ 1981cb0ef41Sopenharmony_ci+ 1991cb0ef41Sopenharmony_ci+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 2001cb0ef41Sopenharmony_ci+ x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); 2011cb0ef41Sopenharmony_ci+ x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); 2021cb0ef41Sopenharmony_ci+ x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); 2031cb0ef41Sopenharmony_ci+ 2041cb0ef41Sopenharmony_ci+ y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); 2051cb0ef41Sopenharmony_ci+ y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); 2061cb0ef41Sopenharmony_ci+ y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); 2071cb0ef41Sopenharmony_ci+ y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); 2081cb0ef41Sopenharmony_ci+ 2091cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x5); 2101cb0ef41Sopenharmony_ci+ x2 = _mm512_xor_si512(x2, x6); 2111cb0ef41Sopenharmony_ci+ x3 = _mm512_xor_si512(x3, x7); 2121cb0ef41Sopenharmony_ci+ x4 = _mm512_xor_si512(x4, x8); 2131cb0ef41Sopenharmony_ci+ 2141cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, y5); 2151cb0ef41Sopenharmony_ci+ x2 = _mm512_xor_si512(x2, y6); 2161cb0ef41Sopenharmony_ci+ x3 = _mm512_xor_si512(x3, y7); 2171cb0ef41Sopenharmony_ci+ x4 = _mm512_xor_si512(x4, y8); 2181cb0ef41Sopenharmony_ci+ 2191cb0ef41Sopenharmony_ci+ buf += 256; 2201cb0ef41Sopenharmony_ci+ len -= 256; 2211cb0ef41Sopenharmony_ci+ } 2221cb0ef41Sopenharmony_ci+ 2231cb0ef41Sopenharmony_ci+ /* 2241cb0ef41Sopenharmony_ci+ * Fold into 512-bits. 2251cb0ef41Sopenharmony_ci+ */ 2261cb0ef41Sopenharmony_ci+ x0 = _mm512_load_si512((__m512i *)k3k4); 2271cb0ef41Sopenharmony_ci+ 2281cb0ef41Sopenharmony_ci+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 2291cb0ef41Sopenharmony_ci+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 2301cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x2); 2311cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x5); 2321cb0ef41Sopenharmony_ci+ 2331cb0ef41Sopenharmony_ci+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 2341cb0ef41Sopenharmony_ci+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 2351cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x3); 2361cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x5); 2371cb0ef41Sopenharmony_ci+ 2381cb0ef41Sopenharmony_ci+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 2391cb0ef41Sopenharmony_ci+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 2401cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x4); 2411cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x5); 2421cb0ef41Sopenharmony_ci+ 2431cb0ef41Sopenharmony_ci+ /* 2441cb0ef41Sopenharmony_ci+ * Single fold blocks of 64, if any. 2451cb0ef41Sopenharmony_ci+ */ 2461cb0ef41Sopenharmony_ci+ while (len >= 64) 2471cb0ef41Sopenharmony_ci+ { 2481cb0ef41Sopenharmony_ci+ x2 = _mm512_loadu_si512((__m512i *)buf); 2491cb0ef41Sopenharmony_ci+ 2501cb0ef41Sopenharmony_ci+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 2511cb0ef41Sopenharmony_ci+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 2521cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x2); 2531cb0ef41Sopenharmony_ci+ x1 = _mm512_xor_si512(x1, x5); 2541cb0ef41Sopenharmony_ci+ 2551cb0ef41Sopenharmony_ci+ buf += 64; 2561cb0ef41Sopenharmony_ci+ len -= 64; 2571cb0ef41Sopenharmony_ci+ } 2581cb0ef41Sopenharmony_ci+ 2591cb0ef41Sopenharmony_ci+ /* 2601cb0ef41Sopenharmony_ci+ * Fold 512-bits to 384-bits. 2611cb0ef41Sopenharmony_ci+ */ 2621cb0ef41Sopenharmony_ci+ a0 = _mm_load_si128((__m128i *)k5k6); 2631cb0ef41Sopenharmony_ci+ 2641cb0ef41Sopenharmony_ci+ a1 = _mm512_extracti32x4_epi32(x1, 0); 2651cb0ef41Sopenharmony_ci+ a2 = _mm512_extracti32x4_epi32(x1, 1); 2661cb0ef41Sopenharmony_ci+ 2671cb0ef41Sopenharmony_ci+ a3 = _mm_clmulepi64_si128(a1, a0, 0x00); 2681cb0ef41Sopenharmony_ci+ a1 = _mm_clmulepi64_si128(a1, a0, 0x11); 2691cb0ef41Sopenharmony_ci+ 2701cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a3); 2711cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a2); 2721cb0ef41Sopenharmony_ci+ 2731cb0ef41Sopenharmony_ci+ /* 2741cb0ef41Sopenharmony_ci+ * Fold 384-bits to 256-bits. 2751cb0ef41Sopenharmony_ci+ */ 2761cb0ef41Sopenharmony_ci+ a2 = _mm512_extracti32x4_epi32(x1, 2); 2771cb0ef41Sopenharmony_ci+ a3 = _mm_clmulepi64_si128(a1, a0, 0x00); 2781cb0ef41Sopenharmony_ci+ a1 = _mm_clmulepi64_si128(a1, a0, 0x11); 2791cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a3); 2801cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a2); 2811cb0ef41Sopenharmony_ci+ 2821cb0ef41Sopenharmony_ci+ /* 2831cb0ef41Sopenharmony_ci+ * Fold 256-bits to 128-bits. 2841cb0ef41Sopenharmony_ci+ */ 2851cb0ef41Sopenharmony_ci+ a2 = _mm512_extracti32x4_epi32(x1, 3); 2861cb0ef41Sopenharmony_ci+ a3 = _mm_clmulepi64_si128(a1, a0, 0x00); 2871cb0ef41Sopenharmony_ci+ a1 = _mm_clmulepi64_si128(a1, a0, 0x11); 2881cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a3); 2891cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a2); 2901cb0ef41Sopenharmony_ci+ 2911cb0ef41Sopenharmony_ci+ /* 2921cb0ef41Sopenharmony_ci+ * Fold 128-bits to 64-bits. 2931cb0ef41Sopenharmony_ci+ */ 2941cb0ef41Sopenharmony_ci+ a2 = _mm_clmulepi64_si128(a1, a0, 0x10); 2951cb0ef41Sopenharmony_ci+ a3 = _mm_setr_epi32(~0, 0, ~0, 0); 2961cb0ef41Sopenharmony_ci+ a1 = _mm_srli_si128(a1, 8); 2971cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a2); 2981cb0ef41Sopenharmony_ci+ 2991cb0ef41Sopenharmony_ci+ a0 = _mm_loadl_epi64((__m128i*)k7k8); 3001cb0ef41Sopenharmony_ci+ a2 = _mm_srli_si128(a1, 4); 3011cb0ef41Sopenharmony_ci+ a1 = _mm_and_si128(a1, a3); 3021cb0ef41Sopenharmony_ci+ a1 = _mm_clmulepi64_si128(a1, a0, 0x00); 3031cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a2); 3041cb0ef41Sopenharmony_ci+ 3051cb0ef41Sopenharmony_ci+ /* 3061cb0ef41Sopenharmony_ci+ * Barret reduce to 32-bits. 3071cb0ef41Sopenharmony_ci+ */ 3081cb0ef41Sopenharmony_ci+ a0 = _mm_load_si128((__m128i*)poly); 3091cb0ef41Sopenharmony_ci+ 3101cb0ef41Sopenharmony_ci+ a2 = _mm_and_si128(a1, a3); 3111cb0ef41Sopenharmony_ci+ a2 = _mm_clmulepi64_si128(a2, a0, 0x10); 3121cb0ef41Sopenharmony_ci+ a2 = _mm_and_si128(a2, a3); 3131cb0ef41Sopenharmony_ci+ a2 = _mm_clmulepi64_si128(a2, a0, 0x00); 3141cb0ef41Sopenharmony_ci+ a1 = _mm_xor_si128(a1, a2); 3151cb0ef41Sopenharmony_ci+ 3161cb0ef41Sopenharmony_ci+ /* 3171cb0ef41Sopenharmony_ci+ * Return the crc32. 3181cb0ef41Sopenharmony_ci+ */ 3191cb0ef41Sopenharmony_ci+ return _mm_extract_epi32(a1, 1); 3201cb0ef41Sopenharmony_ci+} 3211cb0ef41Sopenharmony_ci+ 3221cb0ef41Sopenharmony_ci+#elif defined(CRC32_SIMD_SSE42_PCLMUL) 3231cb0ef41Sopenharmony_ci+ 3241cb0ef41Sopenharmony_ci+/* 3251cb0ef41Sopenharmony_ci+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer 3261cb0ef41Sopenharmony_ci+ * length must be at least 64, and a multiple of 16. 3271cb0ef41Sopenharmony_ci+ */ 3281cb0ef41Sopenharmony_ci+ 3291cb0ef41Sopenharmony_ci #include <emmintrin.h> 3301cb0ef41Sopenharmony_ci #include <smmintrin.h> 3311cb0ef41Sopenharmony_ci #include <wmmintrin.h> 3321cb0ef41Sopenharmony_cidiff --git a/crc32_simd.h b/crc32_simd.h 3331cb0ef41Sopenharmony_ciindex c0346dc..8462464 100644 3341cb0ef41Sopenharmony_ci--- a/crc32_simd.h 3351cb0ef41Sopenharmony_ci+++ b/crc32_simd.h 3361cb0ef41Sopenharmony_ci@@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf, 3371cb0ef41Sopenharmony_ci z_size_t len, 3381cb0ef41Sopenharmony_ci uint32_t crc); 3391cb0ef41Sopenharmony_ci 3401cb0ef41Sopenharmony_ci+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf, 3411cb0ef41Sopenharmony_ci+ z_size_t len, 3421cb0ef41Sopenharmony_ci+ uint32_t crc); 3431cb0ef41Sopenharmony_ci+ 3441cb0ef41Sopenharmony_ci /* 3451cb0ef41Sopenharmony_ci * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c 3461cb0ef41Sopenharmony_ci * for computing the crc32 of an arbitrary length buffer. 3471cb0ef41Sopenharmony_ci */ 3481cb0ef41Sopenharmony_ci #define Z_CRC32_SSE42_MINIMUM_LENGTH 64 3491cb0ef41Sopenharmony_ci #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15 3501cb0ef41Sopenharmony_ci+#define Z_CRC32_AVX512_MINIMUM_LENGTH 256 3511cb0ef41Sopenharmony_ci+#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63 3521cb0ef41Sopenharmony_ci 3531cb0ef41Sopenharmony_ci /* 3541cb0ef41Sopenharmony_ci * CRC32 checksums using ARMv8-a crypto instructions. 3551cb0ef41Sopenharmony_ci-- 3561cb0ef41Sopenharmony_ci2.34.1 3571cb0ef41Sopenharmony_ci 358