18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* -*- linux-c -*- ------------------------------------------------------- * 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * Copyright (C) 2012 Intel Corporation 58c2ecf20Sopenharmony_ci * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com> 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * ----------------------------------------------------------------------- */ 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci/* 128c2ecf20Sopenharmony_ci * AVX2 implementation of RAID-6 syndrome functions 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci */ 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ci#include <linux/raid/pq.h> 178c2ecf20Sopenharmony_ci#include "x86.h" 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_cistatic const struct raid6_avx2_constants { 208c2ecf20Sopenharmony_ci u64 x1d[4]; 218c2ecf20Sopenharmony_ci} raid6_avx2_constants __aligned(32) = { 228c2ecf20Sopenharmony_ci { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 238c2ecf20Sopenharmony_ci 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, 248c2ecf20Sopenharmony_ci}; 258c2ecf20Sopenharmony_ci 268c2ecf20Sopenharmony_cistatic int raid6_have_avx2(void) 278c2ecf20Sopenharmony_ci{ 288c2ecf20Sopenharmony_ci return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); 298c2ecf20Sopenharmony_ci} 308c2ecf20Sopenharmony_ci 318c2ecf20Sopenharmony_ci/* 328c2ecf20Sopenharmony_ci * Plain AVX2 implementation 338c2ecf20Sopenharmony_ci */ 348c2ecf20Sopenharmony_cistatic void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) 358c2ecf20Sopenharmony_ci{ 368c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 378c2ecf20Sopenharmony_ci u8 *p, *q; 388c2ecf20Sopenharmony_ci int d, z, z0; 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 418c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 428c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 438c2ecf20Sopenharmony_ci 448c2ecf20Sopenharmony_ci kernel_fpu_begin(); 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); 478c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */ 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_ci for (d = 0; d < bytes; d += 32) { 508c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); 518c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ 528c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); 538c2ecf20Sopenharmony_ci asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ 548c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); 558c2ecf20Sopenharmony_ci for (z = z0-2; z >= 0; z--) { 568c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); 578c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); 588c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 598c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 608c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 618c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm2,%ymm2"); 628c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm4,%ymm4"); 638c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); 648c2ecf20Sopenharmony_ci } 658c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); 668c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 678c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 688c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 698c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm2,%ymm2"); 708c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm4,%ymm4"); 718c2ecf20Sopenharmony_ci 728c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); 738c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm2,%ymm2,%ymm2"); 748c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); 758c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm4,%ymm4,%ymm4"); 768c2ecf20Sopenharmony_ci } 778c2ecf20Sopenharmony_ci 788c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 798c2ecf20Sopenharmony_ci kernel_fpu_end(); 808c2ecf20Sopenharmony_ci} 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_cistatic void raid6_avx21_xor_syndrome(int disks, int start, int stop, 838c2ecf20Sopenharmony_ci size_t bytes, void **ptrs) 848c2ecf20Sopenharmony_ci{ 858c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 868c2ecf20Sopenharmony_ci u8 *p, *q; 878c2ecf20Sopenharmony_ci int d, z, z0; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 908c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 918c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci kernel_fpu_begin(); 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); 968c2ecf20Sopenharmony_ci 978c2ecf20Sopenharmony_ci for (d = 0 ; d < bytes ; d += 32) { 988c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); 998c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); 1008c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm4,%ymm2,%ymm2"); 1018c2ecf20Sopenharmony_ci /* P/Q data pages */ 1028c2ecf20Sopenharmony_ci for (z = z0-1 ; z >= start ; z--) { 1038c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm5,%ymm5"); 1048c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); 1058c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 1068c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 1078c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 1088c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); 1098c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm2,%ymm2"); 1108c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 1118c2ecf20Sopenharmony_ci } 1128c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 1138c2ecf20Sopenharmony_ci for (z = start-1 ; z >= 0 ; z--) { 1148c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm5,%ymm5"); 1158c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); 1168c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 1178c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 1188c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 1198c2ecf20Sopenharmony_ci } 1208c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); 1218c2ecf20Sopenharmony_ci /* Don't use movntdq for r/w memory area < cache line */ 1228c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); 1238c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); 1248c2ecf20Sopenharmony_ci } 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 1278c2ecf20Sopenharmony_ci kernel_fpu_end(); 1288c2ecf20Sopenharmony_ci} 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_avx2x1 = { 1318c2ecf20Sopenharmony_ci raid6_avx21_gen_syndrome, 1328c2ecf20Sopenharmony_ci raid6_avx21_xor_syndrome, 1338c2ecf20Sopenharmony_ci raid6_have_avx2, 1348c2ecf20Sopenharmony_ci "avx2x1", 1358c2ecf20Sopenharmony_ci 1 /* Has cache hints */ 1368c2ecf20Sopenharmony_ci}; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci/* 1398c2ecf20Sopenharmony_ci * Unrolled-by-2 AVX2 implementation 1408c2ecf20Sopenharmony_ci */ 1418c2ecf20Sopenharmony_cistatic void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) 1428c2ecf20Sopenharmony_ci{ 1438c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 1448c2ecf20Sopenharmony_ci u8 *p, *q; 1458c2ecf20Sopenharmony_ci int d, z, z0; 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 1488c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 1498c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); 1548c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci /* We uniformly assume a single prefetch covers at least 32 bytes */ 1578c2ecf20Sopenharmony_ci for (d = 0; d < bytes; d += 64) { 1588c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); 1598c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); 1608c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ 1618c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ 1628c2ecf20Sopenharmony_ci asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ 1638c2ecf20Sopenharmony_ci asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ 1648c2ecf20Sopenharmony_ci for (z = z0-1; z >= 0; z--) { 1658c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); 1668c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); 1678c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); 1688c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); 1698c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 1708c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); 1718c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 1728c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm7,%ymm7"); 1738c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 1748c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 1758c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); 1768c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); 1778c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm2,%ymm2"); 1788c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm3,%ymm3"); 1798c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 1808c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 1818c2ecf20Sopenharmony_ci } 1828c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); 1838c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); 1848c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); 1858c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); 1868c2ecf20Sopenharmony_ci } 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 1898c2ecf20Sopenharmony_ci kernel_fpu_end(); 1908c2ecf20Sopenharmony_ci} 1918c2ecf20Sopenharmony_ci 1928c2ecf20Sopenharmony_cistatic void raid6_avx22_xor_syndrome(int disks, int start, int stop, 1938c2ecf20Sopenharmony_ci size_t bytes, void **ptrs) 1948c2ecf20Sopenharmony_ci{ 1958c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 1968c2ecf20Sopenharmony_ci u8 *p, *q; 1978c2ecf20Sopenharmony_ci int d, z, z0; 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 2008c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 2018c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 2028c2ecf20Sopenharmony_ci 2038c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); 2068c2ecf20Sopenharmony_ci 2078c2ecf20Sopenharmony_ci for (d = 0 ; d < bytes ; d += 64) { 2088c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); 2098c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); 2108c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); 2118c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); 2128c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm4,%ymm2,%ymm2"); 2138c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm3,%ymm3"); 2148c2ecf20Sopenharmony_ci /* P/Q data pages */ 2158c2ecf20Sopenharmony_ci for (z = z0-1 ; z >= start ; z--) { 2168c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm5,%ymm5"); 2178c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm7,%ymm7"); 2188c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); 2198c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); 2208c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 2218c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); 2228c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 2238c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm7,%ymm7"); 2248c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 2258c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 2268c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); 2278c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm7" 2288c2ecf20Sopenharmony_ci :: "m" (dptr[z][d+32])); 2298c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm2,%ymm2"); 2308c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm3,%ymm3"); 2318c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 2328c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 2338c2ecf20Sopenharmony_ci } 2348c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 2358c2ecf20Sopenharmony_ci for (z = start-1 ; z >= 0 ; z--) { 2368c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm5,%ymm5"); 2378c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm7,%ymm7"); 2388c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); 2398c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); 2408c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 2418c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); 2428c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 2438c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm7,%ymm7"); 2448c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 2458c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 2468c2ecf20Sopenharmony_ci } 2478c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); 2488c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); 2498c2ecf20Sopenharmony_ci /* Don't use movntdq for r/w memory area < cache line */ 2508c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); 2518c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); 2528c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); 2538c2ecf20Sopenharmony_ci asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); 2548c2ecf20Sopenharmony_ci } 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 2578c2ecf20Sopenharmony_ci kernel_fpu_end(); 2588c2ecf20Sopenharmony_ci} 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_avx2x2 = { 2618c2ecf20Sopenharmony_ci raid6_avx22_gen_syndrome, 2628c2ecf20Sopenharmony_ci raid6_avx22_xor_syndrome, 2638c2ecf20Sopenharmony_ci raid6_have_avx2, 2648c2ecf20Sopenharmony_ci "avx2x2", 2658c2ecf20Sopenharmony_ci 1 /* Has cache hints */ 2668c2ecf20Sopenharmony_ci}; 2678c2ecf20Sopenharmony_ci 2688c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_64 2698c2ecf20Sopenharmony_ci 2708c2ecf20Sopenharmony_ci/* 2718c2ecf20Sopenharmony_ci * Unrolled-by-4 AVX2 implementation 2728c2ecf20Sopenharmony_ci */ 2738c2ecf20Sopenharmony_cistatic void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) 2748c2ecf20Sopenharmony_ci{ 2758c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 2768c2ecf20Sopenharmony_ci u8 *p, *q; 2778c2ecf20Sopenharmony_ci int d, z, z0; 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 2808c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 2818c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 2828c2ecf20Sopenharmony_ci 2838c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); 2868c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ 2878c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */ 2888c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */ 2898c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */ 2908c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */ 2918c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */ 2928c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */ 2938c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */ 2948c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */ 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_ci for (d = 0; d < bytes; d += 128) { 2978c2ecf20Sopenharmony_ci for (z = z0; z >= 0; z--) { 2988c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); 2998c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); 3008c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); 3018c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); 3028c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); 3038c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); 3048c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); 3058c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); 3068c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 3078c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); 3088c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); 3098c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); 3108c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 3118c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm7,%ymm7"); 3128c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm13,%ymm13"); 3138c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm15,%ymm15"); 3148c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 3158c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 3168c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm12,%ymm12"); 3178c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm14,%ymm14"); 3188c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); 3198c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); 3208c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); 3218c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); 3228c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm2,%ymm2"); 3238c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm3,%ymm3"); 3248c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm10,%ymm10"); 3258c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm11,%ymm11"); 3268c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 3278c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 3288c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm12,%ymm12"); 3298c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm14,%ymm14"); 3308c2ecf20Sopenharmony_ci } 3318c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); 3328c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm2,%ymm2,%ymm2"); 3338c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); 3348c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm3,%ymm3,%ymm3"); 3358c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); 3368c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm10,%ymm10,%ymm10"); 3378c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); 3388c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm11,%ymm11,%ymm11"); 3398c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); 3408c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm4,%ymm4,%ymm4"); 3418c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); 3428c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm6,%ymm6"); 3438c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); 3448c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm12,%ymm12,%ymm12"); 3458c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); 3468c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm14,%ymm14,%ymm14"); 3478c2ecf20Sopenharmony_ci } 3488c2ecf20Sopenharmony_ci 3498c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 3508c2ecf20Sopenharmony_ci kernel_fpu_end(); 3518c2ecf20Sopenharmony_ci} 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_cistatic void raid6_avx24_xor_syndrome(int disks, int start, int stop, 3548c2ecf20Sopenharmony_ci size_t bytes, void **ptrs) 3558c2ecf20Sopenharmony_ci{ 3568c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 3578c2ecf20Sopenharmony_ci u8 *p, *q; 3588c2ecf20Sopenharmony_ci int d, z, z0; 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 3618c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 3628c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 3638c2ecf20Sopenharmony_ci 3648c2ecf20Sopenharmony_ci kernel_fpu_begin(); 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); 3678c2ecf20Sopenharmony_ci 3688c2ecf20Sopenharmony_ci for (d = 0 ; d < bytes ; d += 128) { 3698c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); 3708c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); 3718c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); 3728c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); 3738c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); 3748c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); 3758c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); 3768c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); 3778c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm4,%ymm2,%ymm2"); 3788c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm6,%ymm3,%ymm3"); 3798c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm12,%ymm10,%ymm10"); 3808c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm14,%ymm11,%ymm11"); 3818c2ecf20Sopenharmony_ci /* P/Q data pages */ 3828c2ecf20Sopenharmony_ci for (z = z0-1 ; z >= start ; z--) { 3838c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); 3848c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); 3858c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm5,%ymm5"); 3868c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm7,%ymm7"); 3878c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm13,%ymm13"); 3888c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm15,%ymm15"); 3898c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); 3908c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); 3918c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); 3928c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); 3938c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 3948c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); 3958c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); 3968c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); 3978c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 3988c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm7,%ymm7"); 3998c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm13,%ymm13"); 4008c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm15,%ymm15"); 4018c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 4028c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 4038c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm12,%ymm12"); 4048c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm14,%ymm14"); 4058c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); 4068c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm7" 4078c2ecf20Sopenharmony_ci :: "m" (dptr[z][d+32])); 4088c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm13" 4098c2ecf20Sopenharmony_ci :: "m" (dptr[z][d+64])); 4108c2ecf20Sopenharmony_ci asm volatile("vmovdqa %0,%%ymm15" 4118c2ecf20Sopenharmony_ci :: "m" (dptr[z][d+96])); 4128c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm2,%ymm2"); 4138c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm3,%ymm3"); 4148c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm10,%ymm10"); 4158c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm11,%ymm11"); 4168c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 4178c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 4188c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm12,%ymm12"); 4198c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm14,%ymm14"); 4208c2ecf20Sopenharmony_ci } 4218c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (q[d])); 4228c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (q[d+64])); 4238c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 4248c2ecf20Sopenharmony_ci for (z = start-1 ; z >= 0 ; z--) { 4258c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm5,%ymm5"); 4268c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm7,%ymm7"); 4278c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm13,%ymm13"); 4288c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm15,%ymm15"); 4298c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); 4308c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); 4318c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); 4328c2ecf20Sopenharmony_ci asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); 4338c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); 4348c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); 4358c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); 4368c2ecf20Sopenharmony_ci asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); 4378c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm5,%ymm5"); 4388c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm7,%ymm7"); 4398c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm13,%ymm13"); 4408c2ecf20Sopenharmony_ci asm volatile("vpand %ymm0,%ymm15,%ymm15"); 4418c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm5,%ymm4,%ymm4"); 4428c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm7,%ymm6,%ymm6"); 4438c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm13,%ymm12,%ymm12"); 4448c2ecf20Sopenharmony_ci asm volatile("vpxor %ymm15,%ymm14,%ymm14"); 4458c2ecf20Sopenharmony_ci } 4468c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); 4478c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); 4488c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); 4498c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); 4508c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); 4518c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); 4528c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); 4538c2ecf20Sopenharmony_ci asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); 4548c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); 4558c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); 4568c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); 4578c2ecf20Sopenharmony_ci asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); 4588c2ecf20Sopenharmony_ci } 4598c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 4608c2ecf20Sopenharmony_ci kernel_fpu_end(); 4618c2ecf20Sopenharmony_ci} 4628c2ecf20Sopenharmony_ci 4638c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_avx2x4 = { 4648c2ecf20Sopenharmony_ci raid6_avx24_gen_syndrome, 4658c2ecf20Sopenharmony_ci raid6_avx24_xor_syndrome, 4668c2ecf20Sopenharmony_ci raid6_have_avx2, 4678c2ecf20Sopenharmony_ci "avx2x4", 4688c2ecf20Sopenharmony_ci 1 /* Has cache hints */ 4698c2ecf20Sopenharmony_ci}; 4708c2ecf20Sopenharmony_ci#endif 471