// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */
148c2ecf20Sopenharmony_ci
158c2ecf20Sopenharmony_ci#include <linux/raid/pq.h>
168c2ecf20Sopenharmony_ci#include "x86.h"
178c2ecf20Sopenharmony_ci
188c2ecf20Sopenharmony_cistatic const struct raid6_sse_constants {
198c2ecf20Sopenharmony_ci	u64 x1d[2];
208c2ecf20Sopenharmony_ci} raid6_sse_constants  __attribute__((aligned(16))) = {
218c2ecf20Sopenharmony_ci	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
228c2ecf20Sopenharmony_ci};
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_cistatic int raid6_have_sse2(void)
258c2ecf20Sopenharmony_ci{
268c2ecf20Sopenharmony_ci	/* Not really boot_cpu but "all_cpus" */
278c2ecf20Sopenharmony_ci	return boot_cpu_has(X86_FEATURE_MMX) &&
288c2ecf20Sopenharmony_ci		boot_cpu_has(X86_FEATURE_FXSR) &&
298c2ecf20Sopenharmony_ci		boot_cpu_has(X86_FEATURE_XMM) &&
308c2ecf20Sopenharmony_ci		boot_cpu_has(X86_FEATURE_XMM2);
318c2ecf20Sopenharmony_ci}
328c2ecf20Sopenharmony_ci
/*
 * Plain SSE2 implementation (one 16-byte lane per loop iteration)
 */
368c2ecf20Sopenharmony_cistatic void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
378c2ecf20Sopenharmony_ci{
388c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
398c2ecf20Sopenharmony_ci	u8 *p, *q;
408c2ecf20Sopenharmony_ci	int d, z, z0;
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_ci	z0 = disks - 3;		/* Highest data disk */
438c2ecf20Sopenharmony_ci	p = dptr[z0+1];		/* XOR parity */
448c2ecf20Sopenharmony_ci	q = dptr[z0+2];		/* RS syndrome */
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	kernel_fpu_begin();
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_ci	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
498c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
508c2ecf20Sopenharmony_ci
518c2ecf20Sopenharmony_ci	for ( d = 0 ; d < bytes ; d += 16 ) {
528c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
538c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
548c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
558c2ecf20Sopenharmony_ci		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
568c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
578c2ecf20Sopenharmony_ci		for ( z = z0-2 ; z >= 0 ; z-- ) {
588c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
598c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
608c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
618c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
628c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
638c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
648c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm6,%xmm2");
658c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm6,%xmm4");
668c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
678c2ecf20Sopenharmony_ci		}
688c2ecf20Sopenharmony_ci		asm volatile("pcmpgtb %xmm4,%xmm5");
698c2ecf20Sopenharmony_ci		asm volatile("paddb %xmm4,%xmm4");
708c2ecf20Sopenharmony_ci		asm volatile("pand %xmm0,%xmm5");
718c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm5,%xmm4");
728c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm5,%xmm5");
738c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm6,%xmm2");
748c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm6,%xmm4");
758c2ecf20Sopenharmony_ci
768c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
778c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm2,%xmm2");
788c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
798c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm4,%xmm4");
808c2ecf20Sopenharmony_ci	}
818c2ecf20Sopenharmony_ci
828c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
838c2ecf20Sopenharmony_ci	kernel_fpu_end();
848c2ecf20Sopenharmony_ci}
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci
878c2ecf20Sopenharmony_cistatic void raid6_sse21_xor_syndrome(int disks, int start, int stop,
888c2ecf20Sopenharmony_ci				     size_t bytes, void **ptrs)
898c2ecf20Sopenharmony_ci{
908c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
918c2ecf20Sopenharmony_ci	u8 *p, *q;
928c2ecf20Sopenharmony_ci	int d, z, z0;
938c2ecf20Sopenharmony_ci
948c2ecf20Sopenharmony_ci	z0 = stop;		/* P/Q right side optimization */
958c2ecf20Sopenharmony_ci	p = dptr[disks-2];	/* XOR parity */
968c2ecf20Sopenharmony_ci	q = dptr[disks-1];	/* RS syndrome */
978c2ecf20Sopenharmony_ci
988c2ecf20Sopenharmony_ci	kernel_fpu_begin();
998c2ecf20Sopenharmony_ci
1008c2ecf20Sopenharmony_ci	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci	for ( d = 0 ; d < bytes ; d += 16 ) {
1038c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
1048c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
1058c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm4,%xmm2");
1068c2ecf20Sopenharmony_ci		/* P/Q data pages */
1078c2ecf20Sopenharmony_ci		for ( z = z0-1 ; z >= start ; z-- ) {
1088c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
1098c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
1108c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
1118c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
1128c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
1138c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
1148c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm2");
1158c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
1168c2ecf20Sopenharmony_ci		}
1178c2ecf20Sopenharmony_ci		/* P/Q left side optimization */
1188c2ecf20Sopenharmony_ci		for ( z = start-1 ; z >= 0 ; z-- ) {
1198c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
1208c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
1218c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
1228c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
1238c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
1248c2ecf20Sopenharmony_ci		}
1258c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
1268c2ecf20Sopenharmony_ci		/* Don't use movntdq for r/w memory area < cache line */
1278c2ecf20Sopenharmony_ci		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
1288c2ecf20Sopenharmony_ci		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
1298c2ecf20Sopenharmony_ci	}
1308c2ecf20Sopenharmony_ci
1318c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
1328c2ecf20Sopenharmony_ci	kernel_fpu_end();
1338c2ecf20Sopenharmony_ci}
1348c2ecf20Sopenharmony_ci
/*
 * Routine table for the plain SSE2 implementation.  The initializers are
 * positional; the field layout of struct raid6_calls is declared outside
 * this file (presumably in <linux/raid/pq.h> - confirm when reordering).
 */
const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,	/* compute P/Q from scratch */
	raid6_sse21_xor_syndrome,	/* update P/Q for a range of data disks */
	raid6_have_sse2,		/* CPU feature check */
	"sse2x1",
	1			/* Has cache hints */
};
1428c2ecf20Sopenharmony_ci
/*
 * Unrolled-by-2 SSE2 implementation (two 16-byte lanes per loop iteration)
 */
1468c2ecf20Sopenharmony_cistatic void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
1478c2ecf20Sopenharmony_ci{
1488c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
1498c2ecf20Sopenharmony_ci	u8 *p, *q;
1508c2ecf20Sopenharmony_ci	int d, z, z0;
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci	z0 = disks - 3;		/* Highest data disk */
1538c2ecf20Sopenharmony_ci	p = dptr[z0+1];		/* XOR parity */
1548c2ecf20Sopenharmony_ci	q = dptr[z0+2];		/* RS syndrome */
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1578c2ecf20Sopenharmony_ci
1588c2ecf20Sopenharmony_ci	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
1598c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
1608c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci	/* We uniformly assume a single prefetch covers at least 32 bytes */
1638c2ecf20Sopenharmony_ci	for ( d = 0 ; d < bytes ; d += 32 ) {
1648c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
1658c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
1668c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
1678c2ecf20Sopenharmony_ci		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
1688c2ecf20Sopenharmony_ci		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
1698c2ecf20Sopenharmony_ci		for ( z = z0-1 ; z >= 0 ; z-- ) {
1708c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
1718c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
1728c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm6,%xmm7");
1738c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
1748c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm6,%xmm6");
1758c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
1768c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm7");
1778c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
1788c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
1798c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
1808c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
1818c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm2");
1828c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm3");
1838c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
1848c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
1858c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
1868c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm7");
1878c2ecf20Sopenharmony_ci		}
1888c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
1898c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
1908c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
1918c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
1928c2ecf20Sopenharmony_ci	}
1938c2ecf20Sopenharmony_ci
1948c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
1958c2ecf20Sopenharmony_ci	kernel_fpu_end();
1968c2ecf20Sopenharmony_ci}
1978c2ecf20Sopenharmony_ci
1988c2ecf20Sopenharmony_cistatic void raid6_sse22_xor_syndrome(int disks, int start, int stop,
1998c2ecf20Sopenharmony_ci				     size_t bytes, void **ptrs)
2008c2ecf20Sopenharmony_ci{
2018c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
2028c2ecf20Sopenharmony_ci	u8 *p, *q;
2038c2ecf20Sopenharmony_ci	int d, z, z0;
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	z0 = stop;		/* P/Q right side optimization */
2068c2ecf20Sopenharmony_ci	p = dptr[disks-2];	/* XOR parity */
2078c2ecf20Sopenharmony_ci	q = dptr[disks-1];	/* RS syndrome */
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2108c2ecf20Sopenharmony_ci
2118c2ecf20Sopenharmony_ci	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci	for ( d = 0 ; d < bytes ; d += 32 ) {
2148c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
2158c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
2168c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
2178c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
2188c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm4,%xmm2");
2198c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm6,%xmm3");
2208c2ecf20Sopenharmony_ci		/* P/Q data pages */
2218c2ecf20Sopenharmony_ci		for ( z = z0-1 ; z >= start ; z-- ) {
2228c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
2238c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm7");
2248c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
2258c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm6,%xmm7");
2268c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
2278c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm6,%xmm6");
2288c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
2298c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm7");
2308c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
2318c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
2328c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
2338c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
2348c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm2");
2358c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm3");
2368c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
2378c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
2388c2ecf20Sopenharmony_ci		}
2398c2ecf20Sopenharmony_ci		/* P/Q left side optimization */
2408c2ecf20Sopenharmony_ci		for ( z = start-1 ; z >= 0 ; z-- ) {
2418c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
2428c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm7");
2438c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
2448c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm6,%xmm7");
2458c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
2468c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm6,%xmm6");
2478c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
2488c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm7");
2498c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
2508c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
2518c2ecf20Sopenharmony_ci		}
2528c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
2538c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
2548c2ecf20Sopenharmony_ci		/* Don't use movntdq for r/w memory area < cache line */
2558c2ecf20Sopenharmony_ci		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
2568c2ecf20Sopenharmony_ci		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
2578c2ecf20Sopenharmony_ci		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
2588c2ecf20Sopenharmony_ci		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
2598c2ecf20Sopenharmony_ci	}
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
2628c2ecf20Sopenharmony_ci	kernel_fpu_end();
2638c2ecf20Sopenharmony_ci}
2648c2ecf20Sopenharmony_ci
/*
 * Routine table for the unrolled-by-2 SSE2 implementation.  The
 * initializers are positional; the field layout of struct raid6_calls is
 * declared outside this file (presumably in <linux/raid/pq.h> - confirm
 * when reordering).
 */
const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,	/* compute P/Q from scratch */
	raid6_sse22_xor_syndrome,	/* update P/Q for a range of data disks */
	raid6_have_sse2,		/* CPU feature check */
	"sse2x2",
	1			/* Has cache hints */
};
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_64
2748c2ecf20Sopenharmony_ci
/*
 * Unrolled-by-4 SSE2 implementation (four 16-byte lanes per loop iteration;
 * uses xmm10-xmm15)
 */
2788c2ecf20Sopenharmony_cistatic void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
2798c2ecf20Sopenharmony_ci{
2808c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
2818c2ecf20Sopenharmony_ci	u8 *p, *q;
2828c2ecf20Sopenharmony_ci	int d, z, z0;
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci	z0 = disks - 3;		/* Highest data disk */
2858c2ecf20Sopenharmony_ci	p = dptr[z0+1];		/* XOR parity */
2868c2ecf20Sopenharmony_ci	q = dptr[z0+2];		/* RS syndrome */
2878c2ecf20Sopenharmony_ci
2888c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2898c2ecf20Sopenharmony_ci
2908c2ecf20Sopenharmony_ci	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
2918c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
2928c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
2938c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
2948c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
2958c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
2968c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
2978c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
2988c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
2998c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
3008c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
3018c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
3028c2ecf20Sopenharmony_ci	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */
3038c2ecf20Sopenharmony_ci
3048c2ecf20Sopenharmony_ci	for ( d = 0 ; d < bytes ; d += 64 ) {
3058c2ecf20Sopenharmony_ci		for ( z = z0 ; z >= 0 ; z-- ) {
3068c2ecf20Sopenharmony_ci			/* The second prefetch seems to improve performance... */
3078c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
3088c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
3098c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
3108c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm6,%xmm7");
3118c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm12,%xmm13");
3128c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm14,%xmm15");
3138c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
3148c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm6,%xmm6");
3158c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm12,%xmm12");
3168c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm14,%xmm14");
3178c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
3188c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm7");
3198c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm13");
3208c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm15");
3218c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
3228c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
3238c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm12");
3248c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm14");
3258c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
3268c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
3278c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
3288c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
3298c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm2");
3308c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm3");
3318c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm10");
3328c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm11");
3338c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
3348c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
3358c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm12");
3368c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm14");
3378c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
3388c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm7");
3398c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm13");
3408c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm15");
3418c2ecf20Sopenharmony_ci		}
3428c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
3438c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm2,%xmm2");
3448c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
3458c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm3,%xmm3");
3468c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
3478c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm10,%xmm10");
3488c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
3498c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm11,%xmm11");
3508c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
3518c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm4,%xmm4");
3528c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
3538c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm6,%xmm6");
3548c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
3558c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm12,%xmm12");
3568c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
3578c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm14,%xmm14");
3588c2ecf20Sopenharmony_ci	}
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
3618c2ecf20Sopenharmony_ci	kernel_fpu_end();
3628c2ecf20Sopenharmony_ci}
3638c2ecf20Sopenharmony_ci
3648c2ecf20Sopenharmony_cistatic void raid6_sse24_xor_syndrome(int disks, int start, int stop,
3658c2ecf20Sopenharmony_ci				     size_t bytes, void **ptrs)
3668c2ecf20Sopenharmony_ci{
3678c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
3688c2ecf20Sopenharmony_ci	u8 *p, *q;
3698c2ecf20Sopenharmony_ci	int d, z, z0;
3708c2ecf20Sopenharmony_ci
3718c2ecf20Sopenharmony_ci	z0 = stop;		/* P/Q right side optimization */
3728c2ecf20Sopenharmony_ci	p = dptr[disks-2];	/* XOR parity */
3738c2ecf20Sopenharmony_ci	q = dptr[disks-1];	/* RS syndrome */
3748c2ecf20Sopenharmony_ci
3758c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci	for ( d = 0 ; d < bytes ; d += 64 ) {
3808c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
3818c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
3828c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
3838c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
3848c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
3858c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
3868c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
3878c2ecf20Sopenharmony_ci		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
3888c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm4,%xmm2");
3898c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm6,%xmm3");
3908c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm12,%xmm10");
3918c2ecf20Sopenharmony_ci		asm volatile("pxor %xmm14,%xmm11");
3928c2ecf20Sopenharmony_ci		/* P/Q data pages */
3938c2ecf20Sopenharmony_ci		for ( z = z0-1 ; z >= start ; z-- ) {
3948c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
3958c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
3968c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
3978c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm7");
3988c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm13");
3998c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm15");
4008c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
4018c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm6,%xmm7");
4028c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm12,%xmm13");
4038c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm14,%xmm15");
4048c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
4058c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm6,%xmm6");
4068c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm12,%xmm12");
4078c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm14,%xmm14");
4088c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
4098c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm7");
4108c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm13");
4118c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm15");
4128c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
4138c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
4148c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm12");
4158c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm14");
4168c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
4178c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
4188c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
4198c2ecf20Sopenharmony_ci			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
4208c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm2");
4218c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm3");
4228c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm10");
4238c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm11");
4248c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
4258c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
4268c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm12");
4278c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm14");
4288c2ecf20Sopenharmony_ci		}
4298c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" :: "m" (q[d]));
4308c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
4318c2ecf20Sopenharmony_ci		/* P/Q left side optimization */
4328c2ecf20Sopenharmony_ci		for ( z = start-1 ; z >= 0 ; z-- ) {
4338c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm5");
4348c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm7");
4358c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm13");
4368c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm15");
4378c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm4,%xmm5");
4388c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm6,%xmm7");
4398c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm12,%xmm13");
4408c2ecf20Sopenharmony_ci			asm volatile("pcmpgtb %xmm14,%xmm15");
4418c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm4,%xmm4");
4428c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm6,%xmm6");
4438c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm12,%xmm12");
4448c2ecf20Sopenharmony_ci			asm volatile("paddb %xmm14,%xmm14");
4458c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm5");
4468c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm7");
4478c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm13");
4488c2ecf20Sopenharmony_ci			asm volatile("pand %xmm0,%xmm15");
4498c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm5,%xmm4");
4508c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm7,%xmm6");
4518c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm13,%xmm12");
4528c2ecf20Sopenharmony_ci			asm volatile("pxor %xmm15,%xmm14");
4538c2ecf20Sopenharmony_ci		}
4548c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
4558c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
4568c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
4578c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
4588c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
4598c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
4608c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
4618c2ecf20Sopenharmony_ci		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
4628c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
4638c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
4648c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
4658c2ecf20Sopenharmony_ci		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
4668c2ecf20Sopenharmony_ci	}
4678c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
4688c2ecf20Sopenharmony_ci	kernel_fpu_end();
4698c2ecf20Sopenharmony_ci}
4708c2ecf20Sopenharmony_ci
4718c2ecf20Sopenharmony_ci
/*
 * Routine table for the unrolled-by-4 SSE2 implementation (built only under
 * CONFIG_X86_64).  The initializers are positional; the field layout of
 * struct raid6_calls is declared outside this file (presumably in
 * <linux/raid/pq.h> - confirm when reordering).
 */
const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,	/* compute P/Q from scratch */
	raid6_sse24_xor_syndrome,	/* update P/Q for a range of data disks */
	raid6_have_sse2,		/* CPU feature check */
	"sse2x4",
	1			/* Has cache hints */
};
4798c2ecf20Sopenharmony_ci
4808c2ecf20Sopenharmony_ci#endif /* CONFIG_X86_64 */
481