11cb0ef41Sopenharmony_ciFrom 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001
21cb0ef41Sopenharmony_ciFrom: Jia Liu <jia3.liu@intel.com>
31cb0ef41Sopenharmony_ciDate: Thu, 30 Mar 2023 11:13:16 +0800
41cb0ef41Sopenharmony_ciSubject: [PATCH] Enabled AVX512 for CRC32
51cb0ef41Sopenharmony_ci
61cb0ef41Sopenharmony_ciEnabled AVX512 for CRC32 that provide best of known performance
71cb0ef41Sopenharmony_cibeyond current SSE SIMD optimization. It enables multiple folding
81cb0ef41Sopenharmony_cioperations and AVX512 new instructions, providing ~3.5X CRC32
91cb0ef41Sopenharmony_ciperformance and ~3.7% gain on Zlib_bench gzip performance.
101cb0ef41Sopenharmony_ci---
111cb0ef41Sopenharmony_ci CMakeLists.txt |   8 +-
121cb0ef41Sopenharmony_ci cpu_features.c |   9 +++
131cb0ef41Sopenharmony_ci cpu_features.h |   1 +
141cb0ef41Sopenharmony_ci crc32.c        |  14 +++-
151cb0ef41Sopenharmony_ci crc32_simd.c   | 198 ++++++++++++++++++++++++++++++++++++++++++++++++-
161cb0ef41Sopenharmony_ci crc32_simd.h   |   6 ++
171cb0ef41Sopenharmony_ci 6 files changed, 230 insertions(+), 6 deletions(-)
181cb0ef41Sopenharmony_ci
191cb0ef41Sopenharmony_cidiff --git a/CMakeLists.txt b/CMakeLists.txt
201cb0ef41Sopenharmony_ciindex f06e193..d45b902 100644
211cb0ef41Sopenharmony_ci--- a/CMakeLists.txt
221cb0ef41Sopenharmony_ci+++ b/CMakeLists.txt
231cb0ef41Sopenharmony_ci@@ -22,6 +22,7 @@ check_include_file(stdint.h    HAVE_STDINT_H)
241cb0ef41Sopenharmony_ci check_include_file(stddef.h    HAVE_STDDEF_H)
251cb0ef41Sopenharmony_ci 
261cb0ef41Sopenharmony_ci option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF)
271cb0ef41Sopenharmony_ci+option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF)
281cb0ef41Sopenharmony_ci 
291cb0ef41Sopenharmony_ci # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx)
301cb0ef41Sopenharmony_ci # and architectures (e.g. Arm).
311cb0ef41Sopenharmony_ci@@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
321cb0ef41Sopenharmony_ci    add_definitions(-DADLER32_SIMD_SSSE3)
331cb0ef41Sopenharmony_ci    add_definitions(-DINFLATE_CHUNK_READ_64LE)
341cb0ef41Sopenharmony_ci    add_definitions(-DCRC32_SIMD_SSE42_PCLMUL)
351cb0ef41Sopenharmony_ci+   if (ENABLE_SIMD_AVX512)
361cb0ef41Sopenharmony_ci+    add_definitions(-DCRC32_SIMD_AVX512_PCLMUL)
371cb0ef41Sopenharmony_ci+    add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul)
381cb0ef41Sopenharmony_ci+   else()
391cb0ef41Sopenharmony_ci+    add_compile_options(-msse4.2 -mpclmul)
401cb0ef41Sopenharmony_ci+   endif()
411cb0ef41Sopenharmony_ci    add_definitions(-DDEFLATE_SLIDE_HASH_SSE2)
421cb0ef41Sopenharmony_ci-   add_compile_options(-msse4.2 -mpclmul)
431cb0ef41Sopenharmony_ci    # Required by CPU features detection code.
441cb0ef41Sopenharmony_ci    add_definitions(-DX86_NOT_WINDOWS)
451cb0ef41Sopenharmony_ci    # Apparently some environments (e.g. CentOS) require to explicitly link
461cb0ef41Sopenharmony_cidiff --git a/cpu_features.c b/cpu_features.c
471cb0ef41Sopenharmony_ciindex 877d5f2..ac6ee88 100644
481cb0ef41Sopenharmony_ci--- a/cpu_features.c
491cb0ef41Sopenharmony_ci+++ b/cpu_features.c
501cb0ef41Sopenharmony_ci@@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
511cb0ef41Sopenharmony_ci int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
521cb0ef41Sopenharmony_ci int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
531cb0ef41Sopenharmony_ci int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
541cb0ef41Sopenharmony_ci+int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
551cb0ef41Sopenharmony_ci 
561cb0ef41Sopenharmony_ci #ifndef CPU_NO_SIMD
571cb0ef41Sopenharmony_ci 
581cb0ef41Sopenharmony_ci@@ -138,6 +139,10 @@ static void _cpu_check_features(void)
591cb0ef41Sopenharmony_ci /* On x86 we simply use a instruction to check the CPU features.
601cb0ef41Sopenharmony_ci  * (i.e. CPUID).
611cb0ef41Sopenharmony_ci  */
621cb0ef41Sopenharmony_ci+#ifdef CRC32_SIMD_AVX512_PCLMUL
631cb0ef41Sopenharmony_ci+#include <immintrin.h>
641cb0ef41Sopenharmony_ci+#include <xsaveintrin.h>
651cb0ef41Sopenharmony_ci+#endif
661cb0ef41Sopenharmony_ci static void _cpu_check_features(void)
671cb0ef41Sopenharmony_ci {
681cb0ef41Sopenharmony_ci     int x86_cpu_has_sse2;
691cb0ef41Sopenharmony_ci@@ -164,6 +169,10 @@ static void _cpu_check_features(void)
701cb0ef41Sopenharmony_ci     x86_cpu_enable_simd = x86_cpu_has_sse2 &&
711cb0ef41Sopenharmony_ci                           x86_cpu_has_sse42 &&
721cb0ef41Sopenharmony_ci                           x86_cpu_has_pclmulqdq;
731cb0ef41Sopenharmony_ci+
741cb0ef41Sopenharmony_ci+#ifdef CRC32_SIMD_AVX512_PCLMUL
751cb0ef41Sopenharmony_ci+    x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
761cb0ef41Sopenharmony_ci+#endif
771cb0ef41Sopenharmony_ci }
781cb0ef41Sopenharmony_ci #endif
791cb0ef41Sopenharmony_ci #endif
801cb0ef41Sopenharmony_cidiff --git a/cpu_features.h b/cpu_features.h
811cb0ef41Sopenharmony_ciindex 279246c..aed3e83 100644
821cb0ef41Sopenharmony_ci--- a/cpu_features.h
831cb0ef41Sopenharmony_ci+++ b/cpu_features.h
841cb0ef41Sopenharmony_ci@@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull;
851cb0ef41Sopenharmony_ci extern int x86_cpu_enable_sse2;
861cb0ef41Sopenharmony_ci extern int x86_cpu_enable_ssse3;
871cb0ef41Sopenharmony_ci extern int x86_cpu_enable_simd;
881cb0ef41Sopenharmony_ci+extern int x86_cpu_enable_avx512;
891cb0ef41Sopenharmony_ci 
901cb0ef41Sopenharmony_ci void cpu_check_features(void);
911cb0ef41Sopenharmony_cidiff --git a/crc32.c b/crc32.c
921cb0ef41Sopenharmony_ciindex 4486098..acb6972 100644
931cb0ef41Sopenharmony_ci--- a/crc32.c
941cb0ef41Sopenharmony_ci+++ b/crc32.c
951cb0ef41Sopenharmony_ci@@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
961cb0ef41Sopenharmony_ci     }
971cb0ef41Sopenharmony_ci 
981cb0ef41Sopenharmony_ci #endif
991cb0ef41Sopenharmony_ci-#if defined(CRC32_SIMD_SSE42_PCLMUL)
1001cb0ef41Sopenharmony_ci+#if defined(CRC32_SIMD_AVX512_PCLMUL)
1011cb0ef41Sopenharmony_ci+    if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
1021cb0ef41Sopenharmony_ci+        /* crc32 64-byte chunks */
1031cb0ef41Sopenharmony_ci+        z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
1041cb0ef41Sopenharmony_ci+        crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
1051cb0ef41Sopenharmony_ci+        /* check remaining data */
1061cb0ef41Sopenharmony_ci+        len -= chunk_size;
1071cb0ef41Sopenharmony_ci+        if (!len)
1081cb0ef41Sopenharmony_ci+            return crc;
1091cb0ef41Sopenharmony_ci+        /* Fall into the default crc32 for the remaining data. */
1101cb0ef41Sopenharmony_ci+        buf += chunk_size;
1111cb0ef41Sopenharmony_ci+    }
1121cb0ef41Sopenharmony_ci+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
1131cb0ef41Sopenharmony_ci     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
1141cb0ef41Sopenharmony_ci         /* crc32 16-byte chunks */
1151cb0ef41Sopenharmony_ci         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
1161cb0ef41Sopenharmony_cidiff --git a/crc32_simd.c b/crc32_simd.c
1171cb0ef41Sopenharmony_ciindex d80beba..7428270 100644
1181cb0ef41Sopenharmony_ci--- a/crc32_simd.c
1191cb0ef41Sopenharmony_ci+++ b/crc32_simd.c
1201cb0ef41Sopenharmony_ci@@ -6,17 +6,207 @@
1211cb0ef41Sopenharmony_ci  */
1221cb0ef41Sopenharmony_ci 
1231cb0ef41Sopenharmony_ci #include "crc32_simd.h"
1241cb0ef41Sopenharmony_ci-
1251cb0ef41Sopenharmony_ci-#if defined(CRC32_SIMD_SSE42_PCLMUL)
1261cb0ef41Sopenharmony_ci+#if defined(CRC32_SIMD_AVX512_PCLMUL)
1271cb0ef41Sopenharmony_ci 
1281cb0ef41Sopenharmony_ci /*
1291cb0ef41Sopenharmony_ci- * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
1301cb0ef41Sopenharmony_ci- * length must be at least 64, and a multiple of 16. Based on:
1311cb0ef41Sopenharmony_ci+ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
1321cb0ef41Sopenharmony_ci+ * length must be at least 256, and a multiple of 64. Based on:
1331cb0ef41Sopenharmony_ci  *
1341cb0ef41Sopenharmony_ci  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
1351cb0ef41Sopenharmony_ci  *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
1361cb0ef41Sopenharmony_ci  */
1371cb0ef41Sopenharmony_ci 
1381cb0ef41Sopenharmony_ci+#include <emmintrin.h>
1391cb0ef41Sopenharmony_ci+#include <smmintrin.h>
1401cb0ef41Sopenharmony_ci+#include <wmmintrin.h>
1411cb0ef41Sopenharmony_ci+#include <immintrin.h>
1421cb0ef41Sopenharmony_ci+
1431cb0ef41Sopenharmony_ci+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
1441cb0ef41Sopenharmony_ci+    const unsigned char *buf,
1451cb0ef41Sopenharmony_ci+    z_size_t len,
1461cb0ef41Sopenharmony_ci+    uint32_t crc)
1471cb0ef41Sopenharmony_ci+{
1481cb0ef41Sopenharmony_ci+    /*
1491cb0ef41Sopenharmony_ci+     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
1501cb0ef41Sopenharmony_ci+     * are similar to those given at the end of the paper, and remaining
1511cb0ef41Sopenharmony_ci+     * constants and CRC32+Barrett polynomials remain unchanged.
1521cb0ef41Sopenharmony_ci+     *
1531cb0ef41Sopenharmony_ci+     * Replace the index of x from 128 to 512. As follows:
1541cb0ef41Sopenharmony_ci+     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
1551cb0ef41Sopenharmony_ci+     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
1561cb0ef41Sopenharmony_ci+     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
1571cb0ef41Sopenharmony_ci+     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
1581cb0ef41Sopenharmony_ci+     */
1591cb0ef41Sopenharmony_ci+    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
1601cb0ef41Sopenharmony_ci+                                                0x011542778a, 0x01322d1430,
1611cb0ef41Sopenharmony_ci+                                                0x011542778a, 0x01322d1430,
1621cb0ef41Sopenharmony_ci+                                                0x011542778a, 0x01322d1430 };
1631cb0ef41Sopenharmony_ci+    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
1641cb0ef41Sopenharmony_ci+                                                0x0154442bd4, 0x01c6e41596,
1651cb0ef41Sopenharmony_ci+                                                0x0154442bd4, 0x01c6e41596,
1661cb0ef41Sopenharmony_ci+                                                0x0154442bd4, 0x01c6e41596 };
1671cb0ef41Sopenharmony_ci+    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
1681cb0ef41Sopenharmony_ci+    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
1691cb0ef41Sopenharmony_ci+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
1701cb0ef41Sopenharmony_ci+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
1711cb0ef41Sopenharmony_ci+    __m128i a0, a1, a2, a3;
1721cb0ef41Sopenharmony_ci+
1731cb0ef41Sopenharmony_ci+    /*
1741cb0ef41Sopenharmony_ci+     * There's at least one block of 256.
1751cb0ef41Sopenharmony_ci+     */
1761cb0ef41Sopenharmony_ci+    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
1771cb0ef41Sopenharmony_ci+    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
1781cb0ef41Sopenharmony_ci+    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
1791cb0ef41Sopenharmony_ci+    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
1801cb0ef41Sopenharmony_ci+
1811cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
1821cb0ef41Sopenharmony_ci+
1831cb0ef41Sopenharmony_ci+    x0 = _mm512_load_si512((__m512i *)k1k2);
1841cb0ef41Sopenharmony_ci+
1851cb0ef41Sopenharmony_ci+    buf += 256;
1861cb0ef41Sopenharmony_ci+    len -= 256;
1871cb0ef41Sopenharmony_ci+
1881cb0ef41Sopenharmony_ci+    /*
1891cb0ef41Sopenharmony_ci+     * Parallel fold blocks of 256, if any.
1901cb0ef41Sopenharmony_ci+     */
1911cb0ef41Sopenharmony_ci+    while (len >= 256)
1921cb0ef41Sopenharmony_ci+    {
1931cb0ef41Sopenharmony_ci+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
1941cb0ef41Sopenharmony_ci+        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
1951cb0ef41Sopenharmony_ci+        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
1961cb0ef41Sopenharmony_ci+        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
1971cb0ef41Sopenharmony_ci+
1981cb0ef41Sopenharmony_ci+
1991cb0ef41Sopenharmony_ci+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
2001cb0ef41Sopenharmony_ci+        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
2011cb0ef41Sopenharmony_ci+        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
2021cb0ef41Sopenharmony_ci+        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
2031cb0ef41Sopenharmony_ci+
2041cb0ef41Sopenharmony_ci+        y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
2051cb0ef41Sopenharmony_ci+        y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
2061cb0ef41Sopenharmony_ci+        y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
2071cb0ef41Sopenharmony_ci+        y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
2081cb0ef41Sopenharmony_ci+
2091cb0ef41Sopenharmony_ci+        x1 = _mm512_xor_si512(x1, x5);
2101cb0ef41Sopenharmony_ci+        x2 = _mm512_xor_si512(x2, x6);
2111cb0ef41Sopenharmony_ci+        x3 = _mm512_xor_si512(x3, x7);
2121cb0ef41Sopenharmony_ci+        x4 = _mm512_xor_si512(x4, x8);
2131cb0ef41Sopenharmony_ci+
2141cb0ef41Sopenharmony_ci+        x1 = _mm512_xor_si512(x1, y5);
2151cb0ef41Sopenharmony_ci+        x2 = _mm512_xor_si512(x2, y6);
2161cb0ef41Sopenharmony_ci+        x3 = _mm512_xor_si512(x3, y7);
2171cb0ef41Sopenharmony_ci+        x4 = _mm512_xor_si512(x4, y8);
2181cb0ef41Sopenharmony_ci+
2191cb0ef41Sopenharmony_ci+        buf += 256;
2201cb0ef41Sopenharmony_ci+        len -= 256;
2211cb0ef41Sopenharmony_ci+    }
2221cb0ef41Sopenharmony_ci+
2231cb0ef41Sopenharmony_ci+    /*
2241cb0ef41Sopenharmony_ci+     * Fold into 512-bits.
2251cb0ef41Sopenharmony_ci+     */
2261cb0ef41Sopenharmony_ci+    x0 = _mm512_load_si512((__m512i *)k3k4);
2271cb0ef41Sopenharmony_ci+
2281cb0ef41Sopenharmony_ci+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
2291cb0ef41Sopenharmony_ci+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
2301cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, x2);
2311cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, x5);
2321cb0ef41Sopenharmony_ci+
2331cb0ef41Sopenharmony_ci+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
2341cb0ef41Sopenharmony_ci+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
2351cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, x3);
2361cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, x5);
2371cb0ef41Sopenharmony_ci+
2381cb0ef41Sopenharmony_ci+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
2391cb0ef41Sopenharmony_ci+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
2401cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, x4);
2411cb0ef41Sopenharmony_ci+    x1 = _mm512_xor_si512(x1, x5);
2421cb0ef41Sopenharmony_ci+
2431cb0ef41Sopenharmony_ci+    /*
2441cb0ef41Sopenharmony_ci+     * Single fold blocks of 64, if any.
2451cb0ef41Sopenharmony_ci+     */
2461cb0ef41Sopenharmony_ci+    while (len >= 64)
2471cb0ef41Sopenharmony_ci+    {
2481cb0ef41Sopenharmony_ci+        x2 = _mm512_loadu_si512((__m512i *)buf);
2491cb0ef41Sopenharmony_ci+
2501cb0ef41Sopenharmony_ci+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
2511cb0ef41Sopenharmony_ci+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
2521cb0ef41Sopenharmony_ci+        x1 = _mm512_xor_si512(x1, x2);
2531cb0ef41Sopenharmony_ci+        x1 = _mm512_xor_si512(x1, x5);
2541cb0ef41Sopenharmony_ci+
2551cb0ef41Sopenharmony_ci+        buf += 64;
2561cb0ef41Sopenharmony_ci+        len -= 64;
2571cb0ef41Sopenharmony_ci+    }
2581cb0ef41Sopenharmony_ci+
2591cb0ef41Sopenharmony_ci+    /*
2601cb0ef41Sopenharmony_ci+     * Fold 512-bits to 384-bits.
2611cb0ef41Sopenharmony_ci+     */
2621cb0ef41Sopenharmony_ci+    a0 = _mm_load_si128((__m128i *)k5k6);
2631cb0ef41Sopenharmony_ci+
2641cb0ef41Sopenharmony_ci+    a1 = _mm512_extracti32x4_epi32(x1, 0);
2651cb0ef41Sopenharmony_ci+    a2 = _mm512_extracti32x4_epi32(x1, 1);
2661cb0ef41Sopenharmony_ci+
2671cb0ef41Sopenharmony_ci+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
2681cb0ef41Sopenharmony_ci+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
2691cb0ef41Sopenharmony_ci+
2701cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a3);
2711cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a2);
2721cb0ef41Sopenharmony_ci+
2731cb0ef41Sopenharmony_ci+    /*
2741cb0ef41Sopenharmony_ci+     * Fold 384-bits to 256-bits.
2751cb0ef41Sopenharmony_ci+     */
2761cb0ef41Sopenharmony_ci+    a2 = _mm512_extracti32x4_epi32(x1, 2);
2771cb0ef41Sopenharmony_ci+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
2781cb0ef41Sopenharmony_ci+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
2791cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a3);
2801cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a2);
2811cb0ef41Sopenharmony_ci+
2821cb0ef41Sopenharmony_ci+    /*
2831cb0ef41Sopenharmony_ci+     * Fold 256-bits to 128-bits.
2841cb0ef41Sopenharmony_ci+     */
2851cb0ef41Sopenharmony_ci+    a2 = _mm512_extracti32x4_epi32(x1, 3);
2861cb0ef41Sopenharmony_ci+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
2871cb0ef41Sopenharmony_ci+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
2881cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a3);
2891cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a2);
2901cb0ef41Sopenharmony_ci+
2911cb0ef41Sopenharmony_ci+    /*
2921cb0ef41Sopenharmony_ci+     * Fold 128-bits to 64-bits.
2931cb0ef41Sopenharmony_ci+     */
2941cb0ef41Sopenharmony_ci+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
2951cb0ef41Sopenharmony_ci+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
2961cb0ef41Sopenharmony_ci+    a1 = _mm_srli_si128(a1, 8);
2971cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a2);
2981cb0ef41Sopenharmony_ci+
2991cb0ef41Sopenharmony_ci+    a0 = _mm_loadl_epi64((__m128i*)k7k8);
3001cb0ef41Sopenharmony_ci+    a2 = _mm_srli_si128(a1, 4);
3011cb0ef41Sopenharmony_ci+    a1 = _mm_and_si128(a1, a3);
3021cb0ef41Sopenharmony_ci+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
3031cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a2);
3041cb0ef41Sopenharmony_ci+
3051cb0ef41Sopenharmony_ci+    /*
3061cb0ef41Sopenharmony_ci+     * Barret reduce to 32-bits.
3071cb0ef41Sopenharmony_ci+     */
3081cb0ef41Sopenharmony_ci+    a0 = _mm_load_si128((__m128i*)poly);
3091cb0ef41Sopenharmony_ci+
3101cb0ef41Sopenharmony_ci+    a2 = _mm_and_si128(a1, a3);
3111cb0ef41Sopenharmony_ci+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
3121cb0ef41Sopenharmony_ci+    a2 = _mm_and_si128(a2, a3);
3131cb0ef41Sopenharmony_ci+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
3141cb0ef41Sopenharmony_ci+    a1 = _mm_xor_si128(a1, a2);
3151cb0ef41Sopenharmony_ci+
3161cb0ef41Sopenharmony_ci+    /*
3171cb0ef41Sopenharmony_ci+     * Return the crc32.
3181cb0ef41Sopenharmony_ci+     */
3191cb0ef41Sopenharmony_ci+    return _mm_extract_epi32(a1, 1);
3201cb0ef41Sopenharmony_ci+}
3211cb0ef41Sopenharmony_ci+
3221cb0ef41Sopenharmony_ci+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
3231cb0ef41Sopenharmony_ci+
3241cb0ef41Sopenharmony_ci+/*
3251cb0ef41Sopenharmony_ci+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
3261cb0ef41Sopenharmony_ci+ * length must be at least 64, and a multiple of 16.
3271cb0ef41Sopenharmony_ci+ */
3281cb0ef41Sopenharmony_ci+
3291cb0ef41Sopenharmony_ci #include <emmintrin.h>
3301cb0ef41Sopenharmony_ci #include <smmintrin.h>
3311cb0ef41Sopenharmony_ci #include <wmmintrin.h>
3321cb0ef41Sopenharmony_cidiff --git a/crc32_simd.h b/crc32_simd.h
3331cb0ef41Sopenharmony_ciindex c0346dc..8462464 100644
3341cb0ef41Sopenharmony_ci--- a/crc32_simd.h
3351cb0ef41Sopenharmony_ci+++ b/crc32_simd.h
3361cb0ef41Sopenharmony_ci@@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
3371cb0ef41Sopenharmony_ci                                          z_size_t len,
3381cb0ef41Sopenharmony_ci                                          uint32_t crc);
3391cb0ef41Sopenharmony_ci 
3401cb0ef41Sopenharmony_ci+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
3411cb0ef41Sopenharmony_ci+                                          z_size_t len,
3421cb0ef41Sopenharmony_ci+                                          uint32_t crc);
3431cb0ef41Sopenharmony_ci+
3441cb0ef41Sopenharmony_ci /*
3451cb0ef41Sopenharmony_ci  * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
3461cb0ef41Sopenharmony_ci  * for computing the crc32 of an arbitrary length buffer.
3471cb0ef41Sopenharmony_ci  */
3481cb0ef41Sopenharmony_ci #define Z_CRC32_SSE42_MINIMUM_LENGTH 64
3491cb0ef41Sopenharmony_ci #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
3501cb0ef41Sopenharmony_ci+#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
3511cb0ef41Sopenharmony_ci+#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
3521cb0ef41Sopenharmony_ci 
3531cb0ef41Sopenharmony_ci /*
3541cb0ef41Sopenharmony_ci  * CRC32 checksums using ARMv8-a crypto instructions.
3551cb0ef41Sopenharmony_ci-- 
3561cb0ef41Sopenharmony_ci2.34.1
3571cb0ef41Sopenharmony_ci
358