18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* -*- linux-c -*- ------------------------------------------------------- * 38c2ecf20Sopenharmony_ci * 48c2ecf20Sopenharmony_ci * Copyright 2002 H. Peter Anvin - All Rights Reserved 58c2ecf20Sopenharmony_ci * 68c2ecf20Sopenharmony_ci * ----------------------------------------------------------------------- */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci/* 98c2ecf20Sopenharmony_ci * raid6/sse2.c 108c2ecf20Sopenharmony_ci * 118c2ecf20Sopenharmony_ci * SSE-2 implementation of RAID-6 syndrome functions 128c2ecf20Sopenharmony_ci * 138c2ecf20Sopenharmony_ci */ 148c2ecf20Sopenharmony_ci 158c2ecf20Sopenharmony_ci#include <linux/raid/pq.h> 168c2ecf20Sopenharmony_ci#include "x86.h" 178c2ecf20Sopenharmony_ci 188c2ecf20Sopenharmony_cistatic const struct raid6_sse_constants { 198c2ecf20Sopenharmony_ci u64 x1d[2]; 208c2ecf20Sopenharmony_ci} raid6_sse_constants __attribute__((aligned(16))) = { 218c2ecf20Sopenharmony_ci { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, 228c2ecf20Sopenharmony_ci}; 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_cistatic int raid6_have_sse2(void) 258c2ecf20Sopenharmony_ci{ 268c2ecf20Sopenharmony_ci /* Not really boot_cpu but "all_cpus" */ 278c2ecf20Sopenharmony_ci return boot_cpu_has(X86_FEATURE_MMX) && 288c2ecf20Sopenharmony_ci boot_cpu_has(X86_FEATURE_FXSR) && 298c2ecf20Sopenharmony_ci boot_cpu_has(X86_FEATURE_XMM) && 308c2ecf20Sopenharmony_ci boot_cpu_has(X86_FEATURE_XMM2); 318c2ecf20Sopenharmony_ci} 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_ci/* 348c2ecf20Sopenharmony_ci * Plain SSE2 implementation 358c2ecf20Sopenharmony_ci */ 368c2ecf20Sopenharmony_cistatic void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) 378c2ecf20Sopenharmony_ci{ 388c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 398c2ecf20Sopenharmony_ci u8 *p, *q; 408c2ecf20Sopenharmony_ci int d, z, z0; 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 438c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 448c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_ci kernel_fpu_begin(); 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); 498c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ 508c2ecf20Sopenharmony_ci 518c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += 16 ) { 528c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); 538c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ 548c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); 558c2ecf20Sopenharmony_ci asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ 568c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); 578c2ecf20Sopenharmony_ci for ( z = z0-2 ; z >= 0 ; z-- ) { 588c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); 598c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 608c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 618c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 628c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 638c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 648c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm2"); 658c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm4"); 668c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); 678c2ecf20Sopenharmony_ci } 688c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 698c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 708c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 718c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 728c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 738c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm2"); 748c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm4"); 758c2ecf20Sopenharmony_ci 768c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); 778c2ecf20Sopenharmony_ci asm volatile("pxor %xmm2,%xmm2"); 788c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); 798c2ecf20Sopenharmony_ci asm volatile("pxor %xmm4,%xmm4"); 808c2ecf20Sopenharmony_ci } 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 838c2ecf20Sopenharmony_ci kernel_fpu_end(); 848c2ecf20Sopenharmony_ci} 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_cistatic void raid6_sse21_xor_syndrome(int disks, int start, int stop, 888c2ecf20Sopenharmony_ci size_t bytes, void **ptrs) 898c2ecf20Sopenharmony_ci{ 908c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 918c2ecf20Sopenharmony_ci u8 *p, *q; 928c2ecf20Sopenharmony_ci int d, z, z0; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 958c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 968c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_ci kernel_fpu_begin(); 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += 16 ) { 1038c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); 1048c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); 1058c2ecf20Sopenharmony_ci asm volatile("pxor %xmm4,%xmm2"); 1068c2ecf20Sopenharmony_ci /* P/Q data pages */ 1078c2ecf20Sopenharmony_ci for ( z = z0-1 ; z >= start ; z-- ) { 1088c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 1098c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 1108c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 1118c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 1128c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 1138c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); 1148c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm2"); 1158c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 1168c2ecf20Sopenharmony_ci } 1178c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 1188c2ecf20Sopenharmony_ci for ( z = start-1 ; z >= 0 ; z-- ) { 1198c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 1208c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 1218c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 1228c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 1238c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 1248c2ecf20Sopenharmony_ci } 1258c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); 1268c2ecf20Sopenharmony_ci /* Don't use movntdq for r/w memory area < cache line */ 1278c2ecf20Sopenharmony_ci asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); 1288c2ecf20Sopenharmony_ci asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); 1298c2ecf20Sopenharmony_ci } 1308c2ecf20Sopenharmony_ci 1318c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 1328c2ecf20Sopenharmony_ci kernel_fpu_end(); 1338c2ecf20Sopenharmony_ci} 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_sse2x1 = { 1368c2ecf20Sopenharmony_ci raid6_sse21_gen_syndrome, 1378c2ecf20Sopenharmony_ci raid6_sse21_xor_syndrome, 1388c2ecf20Sopenharmony_ci raid6_have_sse2, 1398c2ecf20Sopenharmony_ci "sse2x1", 1408c2ecf20Sopenharmony_ci 1 /* Has cache hints */ 1418c2ecf20Sopenharmony_ci}; 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci/* 1448c2ecf20Sopenharmony_ci * Unrolled-by-2 SSE2 implementation 1458c2ecf20Sopenharmony_ci */ 1468c2ecf20Sopenharmony_cistatic void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) 1478c2ecf20Sopenharmony_ci{ 1488c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 1498c2ecf20Sopenharmony_ci u8 *p, *q; 1508c2ecf20Sopenharmony_ci int d, z, z0; 1518c2ecf20Sopenharmony_ci 1528c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 1538c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 1548c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci kernel_fpu_begin(); 1578c2ecf20Sopenharmony_ci 1588c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); 1598c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ 1608c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ 1618c2ecf20Sopenharmony_ci 1628c2ecf20Sopenharmony_ci /* We uniformly assume a single prefetch covers at least 32 bytes */ 1638c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += 32 ) { 1648c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); 1658c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ 1668c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ 1678c2ecf20Sopenharmony_ci asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ 1688c2ecf20Sopenharmony_ci asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ 1698c2ecf20Sopenharmony_ci for ( z = z0-1 ; z >= 0 ; z-- ) { 1708c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); 1718c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 1728c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm6,%xmm7"); 1738c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 1748c2ecf20Sopenharmony_ci asm volatile("paddb %xmm6,%xmm6"); 1758c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 1768c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm7"); 1778c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 1788c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 1798c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); 1808c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); 1818c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm2"); 1828c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm3"); 1838c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 1848c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 1858c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 1868c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); 1878c2ecf20Sopenharmony_ci } 1888c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); 1898c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); 1908c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); 1918c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); 1928c2ecf20Sopenharmony_ci } 1938c2ecf20Sopenharmony_ci 1948c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 1958c2ecf20Sopenharmony_ci kernel_fpu_end(); 1968c2ecf20Sopenharmony_ci} 1978c2ecf20Sopenharmony_ci 1988c2ecf20Sopenharmony_cistatic void raid6_sse22_xor_syndrome(int disks, int start, int stop, 1998c2ecf20Sopenharmony_ci size_t bytes, void **ptrs) 2008c2ecf20Sopenharmony_ci{ 2018c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 2028c2ecf20Sopenharmony_ci u8 *p, *q; 2038c2ecf20Sopenharmony_ci int d, z, z0; 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 2068c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 2078c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2108c2ecf20Sopenharmony_ci 2118c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += 32 ) { 2148c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); 2158c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); 2168c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); 2178c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); 2188c2ecf20Sopenharmony_ci asm volatile("pxor %xmm4,%xmm2"); 2198c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm3"); 2208c2ecf20Sopenharmony_ci /* P/Q data pages */ 2218c2ecf20Sopenharmony_ci for ( z = z0-1 ; z >= start ; z-- ) { 2228c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 2238c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); 2248c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 2258c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm6,%xmm7"); 2268c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 2278c2ecf20Sopenharmony_ci asm volatile("paddb %xmm6,%xmm6"); 2288c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 2298c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm7"); 2308c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 2318c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 2328c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); 2338c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); 2348c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm2"); 2358c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm3"); 2368c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 2378c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 2388c2ecf20Sopenharmony_ci } 2398c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 2408c2ecf20Sopenharmony_ci for ( z = start-1 ; z >= 0 ; z-- ) { 2418c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 2428c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); 2438c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 2448c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm6,%xmm7"); 2458c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 2468c2ecf20Sopenharmony_ci asm volatile("paddb %xmm6,%xmm6"); 2478c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 2488c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm7"); 2498c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 2508c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 2518c2ecf20Sopenharmony_ci } 2528c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); 2538c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); 2548c2ecf20Sopenharmony_ci /* Don't use movntdq for r/w memory area < cache line */ 2558c2ecf20Sopenharmony_ci asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); 2568c2ecf20Sopenharmony_ci asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); 2578c2ecf20Sopenharmony_ci asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); 2588c2ecf20Sopenharmony_ci asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); 2598c2ecf20Sopenharmony_ci } 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 2628c2ecf20Sopenharmony_ci kernel_fpu_end(); 2638c2ecf20Sopenharmony_ci} 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_sse2x2 = { 2668c2ecf20Sopenharmony_ci raid6_sse22_gen_syndrome, 2678c2ecf20Sopenharmony_ci raid6_sse22_xor_syndrome, 2688c2ecf20Sopenharmony_ci raid6_have_sse2, 2698c2ecf20Sopenharmony_ci "sse2x2", 2708c2ecf20Sopenharmony_ci 1 /* Has cache hints */ 2718c2ecf20Sopenharmony_ci}; 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_64 2748c2ecf20Sopenharmony_ci 2758c2ecf20Sopenharmony_ci/* 2768c2ecf20Sopenharmony_ci * Unrolled-by-4 SSE2 implementation 2778c2ecf20Sopenharmony_ci */ 2788c2ecf20Sopenharmony_cistatic void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) 2798c2ecf20Sopenharmony_ci{ 2808c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 2818c2ecf20Sopenharmony_ci u8 *p, *q; 2828c2ecf20Sopenharmony_ci int d, z, z0; 2838c2ecf20Sopenharmony_ci 2848c2ecf20Sopenharmony_ci z0 = disks - 3; /* Highest data disk */ 2858c2ecf20Sopenharmony_ci p = dptr[z0+1]; /* XOR parity */ 2868c2ecf20Sopenharmony_ci q = dptr[z0+2]; /* RS syndrome */ 2878c2ecf20Sopenharmony_ci 2888c2ecf20Sopenharmony_ci kernel_fpu_begin(); 2898c2ecf20Sopenharmony_ci 2908c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); 2918c2ecf20Sopenharmony_ci asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ 2928c2ecf20Sopenharmony_ci asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ 2938c2ecf20Sopenharmony_ci asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ 2948c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ 2958c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ 2968c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ 2978c2ecf20Sopenharmony_ci asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ 2988c2ecf20Sopenharmony_ci asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ 2998c2ecf20Sopenharmony_ci asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ 3008c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ 3018c2ecf20Sopenharmony_ci asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ 3028c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ 3038c2ecf20Sopenharmony_ci 3048c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += 64 ) { 3058c2ecf20Sopenharmony_ci for ( z = z0 ; z >= 0 ; z-- ) { 3068c2ecf20Sopenharmony_ci /* The second prefetch seems to improve performance... */ 3078c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); 3088c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); 3098c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 3108c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm6,%xmm7"); 3118c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm12,%xmm13"); 3128c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm14,%xmm15"); 3138c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 3148c2ecf20Sopenharmony_ci asm volatile("paddb %xmm6,%xmm6"); 3158c2ecf20Sopenharmony_ci asm volatile("paddb %xmm12,%xmm12"); 3168c2ecf20Sopenharmony_ci asm volatile("paddb %xmm14,%xmm14"); 3178c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 3188c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm7"); 3198c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm13"); 3208c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm15"); 3218c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 3228c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 3238c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm12"); 3248c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm14"); 3258c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); 3268c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); 3278c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); 3288c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); 3298c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm2"); 3308c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm3"); 3318c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm10"); 3328c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm11"); 3338c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 3348c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 3358c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm12"); 3368c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm14"); 3378c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 3388c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); 3398c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm13"); 3408c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm15"); 3418c2ecf20Sopenharmony_ci } 3428c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); 3438c2ecf20Sopenharmony_ci asm volatile("pxor %xmm2,%xmm2"); 3448c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); 3458c2ecf20Sopenharmony_ci asm volatile("pxor %xmm3,%xmm3"); 3468c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); 3478c2ecf20Sopenharmony_ci asm volatile("pxor %xmm10,%xmm10"); 3488c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); 3498c2ecf20Sopenharmony_ci asm volatile("pxor %xmm11,%xmm11"); 3508c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); 3518c2ecf20Sopenharmony_ci asm volatile("pxor %xmm4,%xmm4"); 3528c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); 3538c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm6"); 3548c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); 3558c2ecf20Sopenharmony_ci asm volatile("pxor %xmm12,%xmm12"); 3568c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); 3578c2ecf20Sopenharmony_ci asm volatile("pxor %xmm14,%xmm14"); 3588c2ecf20Sopenharmony_ci } 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 3618c2ecf20Sopenharmony_ci kernel_fpu_end(); 3628c2ecf20Sopenharmony_ci} 3638c2ecf20Sopenharmony_ci 3648c2ecf20Sopenharmony_cistatic void raid6_sse24_xor_syndrome(int disks, int start, int stop, 3658c2ecf20Sopenharmony_ci size_t bytes, void **ptrs) 3668c2ecf20Sopenharmony_ci{ 3678c2ecf20Sopenharmony_ci u8 **dptr = (u8 **)ptrs; 3688c2ecf20Sopenharmony_ci u8 *p, *q; 3698c2ecf20Sopenharmony_ci int d, z, z0; 3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci z0 = stop; /* P/Q right side optimization */ 3728c2ecf20Sopenharmony_ci p = dptr[disks-2]; /* XOR parity */ 3738c2ecf20Sopenharmony_ci q = dptr[disks-1]; /* RS syndrome */ 3748c2ecf20Sopenharmony_ci 3758c2ecf20Sopenharmony_ci kernel_fpu_begin(); 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); 3788c2ecf20Sopenharmony_ci 3798c2ecf20Sopenharmony_ci for ( d = 0 ; d < bytes ; d += 64 ) { 3808c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); 3818c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); 3828c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); 3838c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); 3848c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); 3858c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); 3868c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); 3878c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); 3888c2ecf20Sopenharmony_ci asm volatile("pxor %xmm4,%xmm2"); 3898c2ecf20Sopenharmony_ci asm volatile("pxor %xmm6,%xmm3"); 3908c2ecf20Sopenharmony_ci asm volatile("pxor %xmm12,%xmm10"); 3918c2ecf20Sopenharmony_ci asm volatile("pxor %xmm14,%xmm11"); 3928c2ecf20Sopenharmony_ci /* P/Q data pages */ 3938c2ecf20Sopenharmony_ci for ( z = z0-1 ; z >= start ; z-- ) { 3948c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); 3958c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); 3968c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 3978c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); 3988c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm13"); 3998c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm15"); 4008c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 4018c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm6,%xmm7"); 4028c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm12,%xmm13"); 4038c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm14,%xmm15"); 4048c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 4058c2ecf20Sopenharmony_ci asm volatile("paddb %xmm6,%xmm6"); 4068c2ecf20Sopenharmony_ci asm volatile("paddb %xmm12,%xmm12"); 4078c2ecf20Sopenharmony_ci asm volatile("paddb %xmm14,%xmm14"); 4088c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 4098c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm7"); 4108c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm13"); 4118c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm15"); 4128c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 4138c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 4148c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm12"); 4158c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm14"); 4168c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); 4178c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); 4188c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); 4198c2ecf20Sopenharmony_ci asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); 4208c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm2"); 4218c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm3"); 4228c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm10"); 4238c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm11"); 4248c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 4258c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 4268c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm12"); 4278c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm14"); 4288c2ecf20Sopenharmony_ci } 4298c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (q[d])); 4308c2ecf20Sopenharmony_ci asm volatile("prefetchnta %0" :: "m" (q[d+32])); 4318c2ecf20Sopenharmony_ci /* P/Q left side optimization */ 4328c2ecf20Sopenharmony_ci for ( z = start-1 ; z >= 0 ; z-- ) { 4338c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm5"); 4348c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm7"); 4358c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm13"); 4368c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm15"); 4378c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm4,%xmm5"); 4388c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm6,%xmm7"); 4398c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm12,%xmm13"); 4408c2ecf20Sopenharmony_ci asm volatile("pcmpgtb %xmm14,%xmm15"); 4418c2ecf20Sopenharmony_ci asm volatile("paddb %xmm4,%xmm4"); 4428c2ecf20Sopenharmony_ci asm volatile("paddb %xmm6,%xmm6"); 4438c2ecf20Sopenharmony_ci asm volatile("paddb %xmm12,%xmm12"); 4448c2ecf20Sopenharmony_ci asm volatile("paddb %xmm14,%xmm14"); 4458c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm5"); 4468c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm7"); 4478c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm13"); 4488c2ecf20Sopenharmony_ci asm volatile("pand %xmm0,%xmm15"); 4498c2ecf20Sopenharmony_ci asm volatile("pxor %xmm5,%xmm4"); 4508c2ecf20Sopenharmony_ci asm volatile("pxor %xmm7,%xmm6"); 4518c2ecf20Sopenharmony_ci asm volatile("pxor %xmm13,%xmm12"); 4528c2ecf20Sopenharmony_ci asm volatile("pxor %xmm15,%xmm14"); 4538c2ecf20Sopenharmony_ci } 4548c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); 4558c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); 4568c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); 4578c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); 4588c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); 4598c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); 4608c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); 4618c2ecf20Sopenharmony_ci asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); 4628c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); 4638c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); 4648c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); 4658c2ecf20Sopenharmony_ci asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); 4668c2ecf20Sopenharmony_ci } 4678c2ecf20Sopenharmony_ci asm volatile("sfence" : : : "memory"); 4688c2ecf20Sopenharmony_ci kernel_fpu_end(); 4698c2ecf20Sopenharmony_ci} 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci 4728c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_sse2x4 = { 4738c2ecf20Sopenharmony_ci raid6_sse24_gen_syndrome, 4748c2ecf20Sopenharmony_ci raid6_sse24_xor_syndrome, 4758c2ecf20Sopenharmony_ci raid6_have_sse2, 4768c2ecf20Sopenharmony_ci "sse2x4", 4778c2ecf20Sopenharmony_ci 1 /* Has cache hints */ 4788c2ecf20Sopenharmony_ci}; 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci#endif /* CONFIG_X86_64 */ 481