18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
28c2ecf20Sopenharmony_ci#ifndef _ASM_X86_XOR_32_H
38c2ecf20Sopenharmony_ci#define _ASM_X86_XOR_32_H
48c2ecf20Sopenharmony_ci
58c2ecf20Sopenharmony_ci/*
68c2ecf20Sopenharmony_ci * Optimized RAID-5 checksumming functions for MMX.
78c2ecf20Sopenharmony_ci */
88c2ecf20Sopenharmony_ci
98c2ecf20Sopenharmony_ci/*
108c2ecf20Sopenharmony_ci * High-speed RAID5 checksumming functions utilizing MMX instructions.
118c2ecf20Sopenharmony_ci * Copyright (C) 1998 Ingo Molnar.
128c2ecf20Sopenharmony_ci */
138c2ecf20Sopenharmony_ci
148c2ecf20Sopenharmony_ci#define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
158c2ecf20Sopenharmony_ci#define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
168c2ecf20Sopenharmony_ci#define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
178c2ecf20Sopenharmony_ci#define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
188c2ecf20Sopenharmony_ci#define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
198c2ecf20Sopenharmony_ci#define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci#include <asm/fpu/api.h>
228c2ecf20Sopenharmony_ci
238c2ecf20Sopenharmony_cistatic void
248c2ecf20Sopenharmony_cixor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
258c2ecf20Sopenharmony_ci{
268c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 7;
278c2ecf20Sopenharmony_ci
288c2ecf20Sopenharmony_ci	kernel_fpu_begin();
298c2ecf20Sopenharmony_ci
308c2ecf20Sopenharmony_ci	asm volatile(
318c2ecf20Sopenharmony_ci#undef BLOCK
328c2ecf20Sopenharmony_ci#define BLOCK(i)				\
338c2ecf20Sopenharmony_ci	LD(i, 0)				\
348c2ecf20Sopenharmony_ci		LD(i + 1, 1)			\
358c2ecf20Sopenharmony_ci			LD(i + 2, 2)		\
368c2ecf20Sopenharmony_ci				LD(i + 3, 3)	\
378c2ecf20Sopenharmony_ci	XO1(i, 0)				\
388c2ecf20Sopenharmony_ci	ST(i, 0)				\
398c2ecf20Sopenharmony_ci		XO1(i+1, 1)			\
408c2ecf20Sopenharmony_ci		ST(i+1, 1)			\
418c2ecf20Sopenharmony_ci			XO1(i + 2, 2)		\
428c2ecf20Sopenharmony_ci			ST(i + 2, 2)		\
438c2ecf20Sopenharmony_ci				XO1(i + 3, 3)	\
448c2ecf20Sopenharmony_ci				ST(i + 3, 3)
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	" .align 32			;\n"
478c2ecf20Sopenharmony_ci	" 1:                            ;\n"
488c2ecf20Sopenharmony_ci
498c2ecf20Sopenharmony_ci	BLOCK(0)
508c2ecf20Sopenharmony_ci	BLOCK(4)
518c2ecf20Sopenharmony_ci	BLOCK(8)
528c2ecf20Sopenharmony_ci	BLOCK(12)
538c2ecf20Sopenharmony_ci
548c2ecf20Sopenharmony_ci	"       addl $128, %1         ;\n"
558c2ecf20Sopenharmony_ci	"       addl $128, %2         ;\n"
568c2ecf20Sopenharmony_ci	"       decl %0               ;\n"
578c2ecf20Sopenharmony_ci	"       jnz 1b                ;\n"
588c2ecf20Sopenharmony_ci	: "+r" (lines),
598c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2)
608c2ecf20Sopenharmony_ci	:
618c2ecf20Sopenharmony_ci	: "memory");
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci	kernel_fpu_end();
648c2ecf20Sopenharmony_ci}
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_cistatic void
678c2ecf20Sopenharmony_cixor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
688c2ecf20Sopenharmony_ci	      unsigned long *p3)
698c2ecf20Sopenharmony_ci{
708c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 7;
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci	kernel_fpu_begin();
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ci	asm volatile(
758c2ecf20Sopenharmony_ci#undef BLOCK
768c2ecf20Sopenharmony_ci#define BLOCK(i)				\
778c2ecf20Sopenharmony_ci	LD(i, 0)				\
788c2ecf20Sopenharmony_ci		LD(i + 1, 1)			\
798c2ecf20Sopenharmony_ci			LD(i + 2, 2)		\
808c2ecf20Sopenharmony_ci				LD(i + 3, 3)	\
818c2ecf20Sopenharmony_ci	XO1(i, 0)				\
828c2ecf20Sopenharmony_ci		XO1(i + 1, 1)			\
838c2ecf20Sopenharmony_ci			XO1(i + 2, 2)		\
848c2ecf20Sopenharmony_ci				XO1(i + 3, 3)	\
858c2ecf20Sopenharmony_ci	XO2(i, 0)				\
868c2ecf20Sopenharmony_ci	ST(i, 0)				\
878c2ecf20Sopenharmony_ci		XO2(i + 1, 1)			\
888c2ecf20Sopenharmony_ci		ST(i + 1, 1)			\
898c2ecf20Sopenharmony_ci			XO2(i + 2, 2)		\
908c2ecf20Sopenharmony_ci			ST(i + 2, 2)		\
918c2ecf20Sopenharmony_ci				XO2(i + 3, 3)	\
928c2ecf20Sopenharmony_ci				ST(i + 3, 3)
938c2ecf20Sopenharmony_ci
948c2ecf20Sopenharmony_ci	" .align 32			;\n"
958c2ecf20Sopenharmony_ci	" 1:                            ;\n"
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_ci	BLOCK(0)
988c2ecf20Sopenharmony_ci	BLOCK(4)
998c2ecf20Sopenharmony_ci	BLOCK(8)
1008c2ecf20Sopenharmony_ci	BLOCK(12)
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci	"       addl $128, %1         ;\n"
1038c2ecf20Sopenharmony_ci	"       addl $128, %2         ;\n"
1048c2ecf20Sopenharmony_ci	"       addl $128, %3         ;\n"
1058c2ecf20Sopenharmony_ci	"       decl %0               ;\n"
1068c2ecf20Sopenharmony_ci	"       jnz 1b                ;\n"
1078c2ecf20Sopenharmony_ci	: "+r" (lines),
1088c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2), "+r" (p3)
1098c2ecf20Sopenharmony_ci	:
1108c2ecf20Sopenharmony_ci	: "memory");
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ci	kernel_fpu_end();
1138c2ecf20Sopenharmony_ci}
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_cistatic void
1168c2ecf20Sopenharmony_cixor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
1178c2ecf20Sopenharmony_ci	      unsigned long *p3, unsigned long *p4)
1188c2ecf20Sopenharmony_ci{
1198c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 7;
1208c2ecf20Sopenharmony_ci
1218c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1228c2ecf20Sopenharmony_ci
1238c2ecf20Sopenharmony_ci	asm volatile(
1248c2ecf20Sopenharmony_ci#undef BLOCK
1258c2ecf20Sopenharmony_ci#define BLOCK(i)				\
1268c2ecf20Sopenharmony_ci	LD(i, 0)				\
1278c2ecf20Sopenharmony_ci		LD(i + 1, 1)			\
1288c2ecf20Sopenharmony_ci			LD(i + 2, 2)		\
1298c2ecf20Sopenharmony_ci				LD(i + 3, 3)	\
1308c2ecf20Sopenharmony_ci	XO1(i, 0)				\
1318c2ecf20Sopenharmony_ci		XO1(i + 1, 1)			\
1328c2ecf20Sopenharmony_ci			XO1(i + 2, 2)		\
1338c2ecf20Sopenharmony_ci				XO1(i + 3, 3)	\
1348c2ecf20Sopenharmony_ci	XO2(i, 0)				\
1358c2ecf20Sopenharmony_ci		XO2(i + 1, 1)			\
1368c2ecf20Sopenharmony_ci			XO2(i + 2, 2)		\
1378c2ecf20Sopenharmony_ci				XO2(i + 3, 3)	\
1388c2ecf20Sopenharmony_ci	XO3(i, 0)				\
1398c2ecf20Sopenharmony_ci	ST(i, 0)				\
1408c2ecf20Sopenharmony_ci		XO3(i + 1, 1)			\
1418c2ecf20Sopenharmony_ci		ST(i + 1, 1)			\
1428c2ecf20Sopenharmony_ci			XO3(i + 2, 2)		\
1438c2ecf20Sopenharmony_ci			ST(i + 2, 2)		\
1448c2ecf20Sopenharmony_ci				XO3(i + 3, 3)	\
1458c2ecf20Sopenharmony_ci				ST(i + 3, 3)
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci	" .align 32			;\n"
1488c2ecf20Sopenharmony_ci	" 1:                            ;\n"
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci	BLOCK(0)
1518c2ecf20Sopenharmony_ci	BLOCK(4)
1528c2ecf20Sopenharmony_ci	BLOCK(8)
1538c2ecf20Sopenharmony_ci	BLOCK(12)
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	"       addl $128, %1         ;\n"
1568c2ecf20Sopenharmony_ci	"       addl $128, %2         ;\n"
1578c2ecf20Sopenharmony_ci	"       addl $128, %3         ;\n"
1588c2ecf20Sopenharmony_ci	"       addl $128, %4         ;\n"
1598c2ecf20Sopenharmony_ci	"       decl %0               ;\n"
1608c2ecf20Sopenharmony_ci	"       jnz 1b                ;\n"
1618c2ecf20Sopenharmony_ci	: "+r" (lines),
1628c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
1638c2ecf20Sopenharmony_ci	:
1648c2ecf20Sopenharmony_ci	: "memory");
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci	kernel_fpu_end();
1678c2ecf20Sopenharmony_ci}
1688c2ecf20Sopenharmony_ci
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_cistatic void
1718c2ecf20Sopenharmony_cixor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
1728c2ecf20Sopenharmony_ci	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
1738c2ecf20Sopenharmony_ci{
1748c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 7;
1758c2ecf20Sopenharmony_ci
1768c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci	/* Make sure GCC forgets anything it knows about p4 or p5,
1798c2ecf20Sopenharmony_ci	   such that it won't pass to the asm volatile below a
1808c2ecf20Sopenharmony_ci	   register that is shared with any other variable.  That's
1818c2ecf20Sopenharmony_ci	   because we modify p4 and p5 there, but we can't mark them
1828c2ecf20Sopenharmony_ci	   as read/write, otherwise we'd overflow the 10-asm-operands
1838c2ecf20Sopenharmony_ci	   limit of GCC < 3.1.  */
1848c2ecf20Sopenharmony_ci	asm("" : "+r" (p4), "+r" (p5));
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_ci	asm volatile(
1878c2ecf20Sopenharmony_ci#undef BLOCK
1888c2ecf20Sopenharmony_ci#define BLOCK(i)				\
1898c2ecf20Sopenharmony_ci	LD(i, 0)				\
1908c2ecf20Sopenharmony_ci		LD(i + 1, 1)			\
1918c2ecf20Sopenharmony_ci			LD(i + 2, 2)		\
1928c2ecf20Sopenharmony_ci				LD(i + 3, 3)	\
1938c2ecf20Sopenharmony_ci	XO1(i, 0)				\
1948c2ecf20Sopenharmony_ci		XO1(i + 1, 1)			\
1958c2ecf20Sopenharmony_ci			XO1(i + 2, 2)		\
1968c2ecf20Sopenharmony_ci				XO1(i + 3, 3)	\
1978c2ecf20Sopenharmony_ci	XO2(i, 0)				\
1988c2ecf20Sopenharmony_ci		XO2(i + 1, 1)			\
1998c2ecf20Sopenharmony_ci			XO2(i + 2, 2)		\
2008c2ecf20Sopenharmony_ci				XO2(i + 3, 3)	\
2018c2ecf20Sopenharmony_ci	XO3(i, 0)				\
2028c2ecf20Sopenharmony_ci		XO3(i + 1, 1)			\
2038c2ecf20Sopenharmony_ci			XO3(i + 2, 2)		\
2048c2ecf20Sopenharmony_ci				XO3(i + 3, 3)	\
2058c2ecf20Sopenharmony_ci	XO4(i, 0)				\
2068c2ecf20Sopenharmony_ci	ST(i, 0)				\
2078c2ecf20Sopenharmony_ci		XO4(i + 1, 1)			\
2088c2ecf20Sopenharmony_ci		ST(i + 1, 1)			\
2098c2ecf20Sopenharmony_ci			XO4(i + 2, 2)		\
2108c2ecf20Sopenharmony_ci			ST(i + 2, 2)		\
2118c2ecf20Sopenharmony_ci				XO4(i + 3, 3)	\
2128c2ecf20Sopenharmony_ci				ST(i + 3, 3)
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci	" .align 32			;\n"
2158c2ecf20Sopenharmony_ci	" 1:                            ;\n"
2168c2ecf20Sopenharmony_ci
2178c2ecf20Sopenharmony_ci	BLOCK(0)
2188c2ecf20Sopenharmony_ci	BLOCK(4)
2198c2ecf20Sopenharmony_ci	BLOCK(8)
2208c2ecf20Sopenharmony_ci	BLOCK(12)
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_ci	"       addl $128, %1         ;\n"
2238c2ecf20Sopenharmony_ci	"       addl $128, %2         ;\n"
2248c2ecf20Sopenharmony_ci	"       addl $128, %3         ;\n"
2258c2ecf20Sopenharmony_ci	"       addl $128, %4         ;\n"
2268c2ecf20Sopenharmony_ci	"       addl $128, %5         ;\n"
2278c2ecf20Sopenharmony_ci	"       decl %0               ;\n"
2288c2ecf20Sopenharmony_ci	"       jnz 1b                ;\n"
2298c2ecf20Sopenharmony_ci	: "+r" (lines),
2308c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2), "+r" (p3)
2318c2ecf20Sopenharmony_ci	: "r" (p4), "r" (p5)
2328c2ecf20Sopenharmony_ci	: "memory");
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_ci	/* p4 and p5 were modified, and now the variables are dead.
2358c2ecf20Sopenharmony_ci	   Clobber them just to be sure nobody does something stupid
2368c2ecf20Sopenharmony_ci	   like assuming they have some legal value.  */
2378c2ecf20Sopenharmony_ci	asm("" : "=r" (p4), "=r" (p5));
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_ci	kernel_fpu_end();
2408c2ecf20Sopenharmony_ci}
2418c2ecf20Sopenharmony_ci
/* The fragment macros are only used by the xor_pII_mmx_* routines above. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

2508c2ecf20Sopenharmony_cistatic void
2518c2ecf20Sopenharmony_cixor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
2528c2ecf20Sopenharmony_ci{
2538c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 6;
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_ci	asm volatile(
2588c2ecf20Sopenharmony_ci	" .align 32	             ;\n"
2598c2ecf20Sopenharmony_ci	" 1:                         ;\n"
2608c2ecf20Sopenharmony_ci	"       movq   (%1), %%mm0   ;\n"
2618c2ecf20Sopenharmony_ci	"       movq  8(%1), %%mm1   ;\n"
2628c2ecf20Sopenharmony_ci	"       pxor   (%2), %%mm0   ;\n"
2638c2ecf20Sopenharmony_ci	"       movq 16(%1), %%mm2   ;\n"
2648c2ecf20Sopenharmony_ci	"       movq %%mm0,   (%1)   ;\n"
2658c2ecf20Sopenharmony_ci	"       pxor  8(%2), %%mm1   ;\n"
2668c2ecf20Sopenharmony_ci	"       movq 24(%1), %%mm3   ;\n"
2678c2ecf20Sopenharmony_ci	"       movq %%mm1,  8(%1)   ;\n"
2688c2ecf20Sopenharmony_ci	"       pxor 16(%2), %%mm2   ;\n"
2698c2ecf20Sopenharmony_ci	"       movq 32(%1), %%mm4   ;\n"
2708c2ecf20Sopenharmony_ci	"       movq %%mm2, 16(%1)   ;\n"
2718c2ecf20Sopenharmony_ci	"       pxor 24(%2), %%mm3   ;\n"
2728c2ecf20Sopenharmony_ci	"       movq 40(%1), %%mm5   ;\n"
2738c2ecf20Sopenharmony_ci	"       movq %%mm3, 24(%1)   ;\n"
2748c2ecf20Sopenharmony_ci	"       pxor 32(%2), %%mm4   ;\n"
2758c2ecf20Sopenharmony_ci	"       movq 48(%1), %%mm6   ;\n"
2768c2ecf20Sopenharmony_ci	"       movq %%mm4, 32(%1)   ;\n"
2778c2ecf20Sopenharmony_ci	"       pxor 40(%2), %%mm5   ;\n"
2788c2ecf20Sopenharmony_ci	"       movq 56(%1), %%mm7   ;\n"
2798c2ecf20Sopenharmony_ci	"       movq %%mm5, 40(%1)   ;\n"
2808c2ecf20Sopenharmony_ci	"       pxor 48(%2), %%mm6   ;\n"
2818c2ecf20Sopenharmony_ci	"       pxor 56(%2), %%mm7   ;\n"
2828c2ecf20Sopenharmony_ci	"       movq %%mm6, 48(%1)   ;\n"
2838c2ecf20Sopenharmony_ci	"       movq %%mm7, 56(%1)   ;\n"
2848c2ecf20Sopenharmony_ci
2858c2ecf20Sopenharmony_ci	"       addl $64, %1         ;\n"
2868c2ecf20Sopenharmony_ci	"       addl $64, %2         ;\n"
2878c2ecf20Sopenharmony_ci	"       decl %0              ;\n"
2888c2ecf20Sopenharmony_ci	"       jnz 1b               ;\n"
2898c2ecf20Sopenharmony_ci	: "+r" (lines),
2908c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2)
2918c2ecf20Sopenharmony_ci	:
2928c2ecf20Sopenharmony_ci	: "memory");
2938c2ecf20Sopenharmony_ci
2948c2ecf20Sopenharmony_ci	kernel_fpu_end();
2958c2ecf20Sopenharmony_ci}
2968c2ecf20Sopenharmony_ci
2978c2ecf20Sopenharmony_cistatic void
2988c2ecf20Sopenharmony_cixor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
2998c2ecf20Sopenharmony_ci	     unsigned long *p3)
3008c2ecf20Sopenharmony_ci{
3018c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 6;
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3048c2ecf20Sopenharmony_ci
3058c2ecf20Sopenharmony_ci	asm volatile(
3068c2ecf20Sopenharmony_ci	" .align 32,0x90             ;\n"
3078c2ecf20Sopenharmony_ci	" 1:                         ;\n"
3088c2ecf20Sopenharmony_ci	"       movq   (%1), %%mm0   ;\n"
3098c2ecf20Sopenharmony_ci	"       movq  8(%1), %%mm1   ;\n"
3108c2ecf20Sopenharmony_ci	"       pxor   (%2), %%mm0   ;\n"
3118c2ecf20Sopenharmony_ci	"       movq 16(%1), %%mm2   ;\n"
3128c2ecf20Sopenharmony_ci	"       pxor  8(%2), %%mm1   ;\n"
3138c2ecf20Sopenharmony_ci	"       pxor   (%3), %%mm0   ;\n"
3148c2ecf20Sopenharmony_ci	"       pxor 16(%2), %%mm2   ;\n"
3158c2ecf20Sopenharmony_ci	"       movq %%mm0,   (%1)   ;\n"
3168c2ecf20Sopenharmony_ci	"       pxor  8(%3), %%mm1   ;\n"
3178c2ecf20Sopenharmony_ci	"       pxor 16(%3), %%mm2   ;\n"
3188c2ecf20Sopenharmony_ci	"       movq 24(%1), %%mm3   ;\n"
3198c2ecf20Sopenharmony_ci	"       movq %%mm1,  8(%1)   ;\n"
3208c2ecf20Sopenharmony_ci	"       movq 32(%1), %%mm4   ;\n"
3218c2ecf20Sopenharmony_ci	"       movq 40(%1), %%mm5   ;\n"
3228c2ecf20Sopenharmony_ci	"       pxor 24(%2), %%mm3   ;\n"
3238c2ecf20Sopenharmony_ci	"       movq %%mm2, 16(%1)   ;\n"
3248c2ecf20Sopenharmony_ci	"       pxor 32(%2), %%mm4   ;\n"
3258c2ecf20Sopenharmony_ci	"       pxor 24(%3), %%mm3   ;\n"
3268c2ecf20Sopenharmony_ci	"       pxor 40(%2), %%mm5   ;\n"
3278c2ecf20Sopenharmony_ci	"       movq %%mm3, 24(%1)   ;\n"
3288c2ecf20Sopenharmony_ci	"       pxor 32(%3), %%mm4   ;\n"
3298c2ecf20Sopenharmony_ci	"       pxor 40(%3), %%mm5   ;\n"
3308c2ecf20Sopenharmony_ci	"       movq 48(%1), %%mm6   ;\n"
3318c2ecf20Sopenharmony_ci	"       movq %%mm4, 32(%1)   ;\n"
3328c2ecf20Sopenharmony_ci	"       movq 56(%1), %%mm7   ;\n"
3338c2ecf20Sopenharmony_ci	"       pxor 48(%2), %%mm6   ;\n"
3348c2ecf20Sopenharmony_ci	"       movq %%mm5, 40(%1)   ;\n"
3358c2ecf20Sopenharmony_ci	"       pxor 56(%2), %%mm7   ;\n"
3368c2ecf20Sopenharmony_ci	"       pxor 48(%3), %%mm6   ;\n"
3378c2ecf20Sopenharmony_ci	"       pxor 56(%3), %%mm7   ;\n"
3388c2ecf20Sopenharmony_ci	"       movq %%mm6, 48(%1)   ;\n"
3398c2ecf20Sopenharmony_ci	"       movq %%mm7, 56(%1)   ;\n"
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	"       addl $64, %1         ;\n"
3428c2ecf20Sopenharmony_ci	"       addl $64, %2         ;\n"
3438c2ecf20Sopenharmony_ci	"       addl $64, %3         ;\n"
3448c2ecf20Sopenharmony_ci	"       decl %0              ;\n"
3458c2ecf20Sopenharmony_ci	"       jnz 1b               ;\n"
3468c2ecf20Sopenharmony_ci	: "+r" (lines),
3478c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2), "+r" (p3)
3488c2ecf20Sopenharmony_ci	:
3498c2ecf20Sopenharmony_ci	: "memory" );
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_ci	kernel_fpu_end();
3528c2ecf20Sopenharmony_ci}
3538c2ecf20Sopenharmony_ci
3548c2ecf20Sopenharmony_cistatic void
3558c2ecf20Sopenharmony_cixor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
3568c2ecf20Sopenharmony_ci	     unsigned long *p3, unsigned long *p4)
3578c2ecf20Sopenharmony_ci{
3588c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 6;
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3618c2ecf20Sopenharmony_ci
3628c2ecf20Sopenharmony_ci	asm volatile(
3638c2ecf20Sopenharmony_ci	" .align 32,0x90             ;\n"
3648c2ecf20Sopenharmony_ci	" 1:                         ;\n"
3658c2ecf20Sopenharmony_ci	"       movq   (%1), %%mm0   ;\n"
3668c2ecf20Sopenharmony_ci	"       movq  8(%1), %%mm1   ;\n"
3678c2ecf20Sopenharmony_ci	"       pxor   (%2), %%mm0   ;\n"
3688c2ecf20Sopenharmony_ci	"       movq 16(%1), %%mm2   ;\n"
3698c2ecf20Sopenharmony_ci	"       pxor  8(%2), %%mm1   ;\n"
3708c2ecf20Sopenharmony_ci	"       pxor   (%3), %%mm0   ;\n"
3718c2ecf20Sopenharmony_ci	"       pxor 16(%2), %%mm2   ;\n"
3728c2ecf20Sopenharmony_ci	"       pxor  8(%3), %%mm1   ;\n"
3738c2ecf20Sopenharmony_ci	"       pxor   (%4), %%mm0   ;\n"
3748c2ecf20Sopenharmony_ci	"       movq 24(%1), %%mm3   ;\n"
3758c2ecf20Sopenharmony_ci	"       pxor 16(%3), %%mm2   ;\n"
3768c2ecf20Sopenharmony_ci	"       pxor  8(%4), %%mm1   ;\n"
3778c2ecf20Sopenharmony_ci	"       movq %%mm0,   (%1)   ;\n"
3788c2ecf20Sopenharmony_ci	"       movq 32(%1), %%mm4   ;\n"
3798c2ecf20Sopenharmony_ci	"       pxor 24(%2), %%mm3   ;\n"
3808c2ecf20Sopenharmony_ci	"       pxor 16(%4), %%mm2   ;\n"
3818c2ecf20Sopenharmony_ci	"       movq %%mm1,  8(%1)   ;\n"
3828c2ecf20Sopenharmony_ci	"       movq 40(%1), %%mm5   ;\n"
3838c2ecf20Sopenharmony_ci	"       pxor 32(%2), %%mm4   ;\n"
3848c2ecf20Sopenharmony_ci	"       pxor 24(%3), %%mm3   ;\n"
3858c2ecf20Sopenharmony_ci	"       movq %%mm2, 16(%1)   ;\n"
3868c2ecf20Sopenharmony_ci	"       pxor 40(%2), %%mm5   ;\n"
3878c2ecf20Sopenharmony_ci	"       pxor 32(%3), %%mm4   ;\n"
3888c2ecf20Sopenharmony_ci	"       pxor 24(%4), %%mm3   ;\n"
3898c2ecf20Sopenharmony_ci	"       movq %%mm3, 24(%1)   ;\n"
3908c2ecf20Sopenharmony_ci	"       movq 56(%1), %%mm7   ;\n"
3918c2ecf20Sopenharmony_ci	"       movq 48(%1), %%mm6   ;\n"
3928c2ecf20Sopenharmony_ci	"       pxor 40(%3), %%mm5   ;\n"
3938c2ecf20Sopenharmony_ci	"       pxor 32(%4), %%mm4   ;\n"
3948c2ecf20Sopenharmony_ci	"       pxor 48(%2), %%mm6   ;\n"
3958c2ecf20Sopenharmony_ci	"       movq %%mm4, 32(%1)   ;\n"
3968c2ecf20Sopenharmony_ci	"       pxor 56(%2), %%mm7   ;\n"
3978c2ecf20Sopenharmony_ci	"       pxor 40(%4), %%mm5   ;\n"
3988c2ecf20Sopenharmony_ci	"       pxor 48(%3), %%mm6   ;\n"
3998c2ecf20Sopenharmony_ci	"       pxor 56(%3), %%mm7   ;\n"
4008c2ecf20Sopenharmony_ci	"       movq %%mm5, 40(%1)   ;\n"
4018c2ecf20Sopenharmony_ci	"       pxor 48(%4), %%mm6   ;\n"
4028c2ecf20Sopenharmony_ci	"       pxor 56(%4), %%mm7   ;\n"
4038c2ecf20Sopenharmony_ci	"       movq %%mm6, 48(%1)   ;\n"
4048c2ecf20Sopenharmony_ci	"       movq %%mm7, 56(%1)   ;\n"
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci	"       addl $64, %1         ;\n"
4078c2ecf20Sopenharmony_ci	"       addl $64, %2         ;\n"
4088c2ecf20Sopenharmony_ci	"       addl $64, %3         ;\n"
4098c2ecf20Sopenharmony_ci	"       addl $64, %4         ;\n"
4108c2ecf20Sopenharmony_ci	"       decl %0              ;\n"
4118c2ecf20Sopenharmony_ci	"       jnz 1b               ;\n"
4128c2ecf20Sopenharmony_ci	: "+r" (lines),
4138c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
4148c2ecf20Sopenharmony_ci	:
4158c2ecf20Sopenharmony_ci	: "memory");
4168c2ecf20Sopenharmony_ci
4178c2ecf20Sopenharmony_ci	kernel_fpu_end();
4188c2ecf20Sopenharmony_ci}
4198c2ecf20Sopenharmony_ci
4208c2ecf20Sopenharmony_cistatic void
4218c2ecf20Sopenharmony_cixor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
4228c2ecf20Sopenharmony_ci	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
4238c2ecf20Sopenharmony_ci{
4248c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 6;
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	kernel_fpu_begin();
4278c2ecf20Sopenharmony_ci
4288c2ecf20Sopenharmony_ci	/* Make sure GCC forgets anything it knows about p4 or p5,
4298c2ecf20Sopenharmony_ci	   such that it won't pass to the asm volatile below a
4308c2ecf20Sopenharmony_ci	   register that is shared with any other variable.  That's
4318c2ecf20Sopenharmony_ci	   because we modify p4 and p5 there, but we can't mark them
4328c2ecf20Sopenharmony_ci	   as read/write, otherwise we'd overflow the 10-asm-operands
4338c2ecf20Sopenharmony_ci	   limit of GCC < 3.1.  */
4348c2ecf20Sopenharmony_ci	asm("" : "+r" (p4), "+r" (p5));
4358c2ecf20Sopenharmony_ci
4368c2ecf20Sopenharmony_ci	asm volatile(
4378c2ecf20Sopenharmony_ci	" .align 32,0x90             ;\n"
4388c2ecf20Sopenharmony_ci	" 1:                         ;\n"
4398c2ecf20Sopenharmony_ci	"       movq   (%1), %%mm0   ;\n"
4408c2ecf20Sopenharmony_ci	"       movq  8(%1), %%mm1   ;\n"
4418c2ecf20Sopenharmony_ci	"       pxor   (%2), %%mm0   ;\n"
4428c2ecf20Sopenharmony_ci	"       pxor  8(%2), %%mm1   ;\n"
4438c2ecf20Sopenharmony_ci	"       movq 16(%1), %%mm2   ;\n"
4448c2ecf20Sopenharmony_ci	"       pxor   (%3), %%mm0   ;\n"
4458c2ecf20Sopenharmony_ci	"       pxor  8(%3), %%mm1   ;\n"
4468c2ecf20Sopenharmony_ci	"       pxor 16(%2), %%mm2   ;\n"
4478c2ecf20Sopenharmony_ci	"       pxor   (%4), %%mm0   ;\n"
4488c2ecf20Sopenharmony_ci	"       pxor  8(%4), %%mm1   ;\n"
4498c2ecf20Sopenharmony_ci	"       pxor 16(%3), %%mm2   ;\n"
4508c2ecf20Sopenharmony_ci	"       movq 24(%1), %%mm3   ;\n"
4518c2ecf20Sopenharmony_ci	"       pxor   (%5), %%mm0   ;\n"
4528c2ecf20Sopenharmony_ci	"       pxor  8(%5), %%mm1   ;\n"
4538c2ecf20Sopenharmony_ci	"       movq %%mm0,   (%1)   ;\n"
4548c2ecf20Sopenharmony_ci	"       pxor 16(%4), %%mm2   ;\n"
4558c2ecf20Sopenharmony_ci	"       pxor 24(%2), %%mm3   ;\n"
4568c2ecf20Sopenharmony_ci	"       movq %%mm1,  8(%1)   ;\n"
4578c2ecf20Sopenharmony_ci	"       pxor 16(%5), %%mm2   ;\n"
4588c2ecf20Sopenharmony_ci	"       pxor 24(%3), %%mm3   ;\n"
4598c2ecf20Sopenharmony_ci	"       movq 32(%1), %%mm4   ;\n"
4608c2ecf20Sopenharmony_ci	"       movq %%mm2, 16(%1)   ;\n"
4618c2ecf20Sopenharmony_ci	"       pxor 24(%4), %%mm3   ;\n"
4628c2ecf20Sopenharmony_ci	"       pxor 32(%2), %%mm4   ;\n"
4638c2ecf20Sopenharmony_ci	"       movq 40(%1), %%mm5   ;\n"
4648c2ecf20Sopenharmony_ci	"       pxor 24(%5), %%mm3   ;\n"
4658c2ecf20Sopenharmony_ci	"       pxor 32(%3), %%mm4   ;\n"
4668c2ecf20Sopenharmony_ci	"       pxor 40(%2), %%mm5   ;\n"
4678c2ecf20Sopenharmony_ci	"       movq %%mm3, 24(%1)   ;\n"
4688c2ecf20Sopenharmony_ci	"       pxor 32(%4), %%mm4   ;\n"
4698c2ecf20Sopenharmony_ci	"       pxor 40(%3), %%mm5   ;\n"
4708c2ecf20Sopenharmony_ci	"       movq 48(%1), %%mm6   ;\n"
4718c2ecf20Sopenharmony_ci	"       movq 56(%1), %%mm7   ;\n"
4728c2ecf20Sopenharmony_ci	"       pxor 32(%5), %%mm4   ;\n"
4738c2ecf20Sopenharmony_ci	"       pxor 40(%4), %%mm5   ;\n"
4748c2ecf20Sopenharmony_ci	"       pxor 48(%2), %%mm6   ;\n"
4758c2ecf20Sopenharmony_ci	"       pxor 56(%2), %%mm7   ;\n"
4768c2ecf20Sopenharmony_ci	"       movq %%mm4, 32(%1)   ;\n"
4778c2ecf20Sopenharmony_ci	"       pxor 48(%3), %%mm6   ;\n"
4788c2ecf20Sopenharmony_ci	"       pxor 56(%3), %%mm7   ;\n"
4798c2ecf20Sopenharmony_ci	"       pxor 40(%5), %%mm5   ;\n"
4808c2ecf20Sopenharmony_ci	"       pxor 48(%4), %%mm6   ;\n"
4818c2ecf20Sopenharmony_ci	"       pxor 56(%4), %%mm7   ;\n"
4828c2ecf20Sopenharmony_ci	"       movq %%mm5, 40(%1)   ;\n"
4838c2ecf20Sopenharmony_ci	"       pxor 48(%5), %%mm6   ;\n"
4848c2ecf20Sopenharmony_ci	"       pxor 56(%5), %%mm7   ;\n"
4858c2ecf20Sopenharmony_ci	"       movq %%mm6, 48(%1)   ;\n"
4868c2ecf20Sopenharmony_ci	"       movq %%mm7, 56(%1)   ;\n"
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_ci	"       addl $64, %1         ;\n"
4898c2ecf20Sopenharmony_ci	"       addl $64, %2         ;\n"
4908c2ecf20Sopenharmony_ci	"       addl $64, %3         ;\n"
4918c2ecf20Sopenharmony_ci	"       addl $64, %4         ;\n"
4928c2ecf20Sopenharmony_ci	"       addl $64, %5         ;\n"
4938c2ecf20Sopenharmony_ci	"       decl %0              ;\n"
4948c2ecf20Sopenharmony_ci	"       jnz 1b               ;\n"
4958c2ecf20Sopenharmony_ci	: "+r" (lines),
4968c2ecf20Sopenharmony_ci	  "+r" (p1), "+r" (p2), "+r" (p3)
4978c2ecf20Sopenharmony_ci	: "r" (p4), "r" (p5)
4988c2ecf20Sopenharmony_ci	: "memory");
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ci	/* p4 and p5 were modified, and now the variables are dead.
5018c2ecf20Sopenharmony_ci	   Clobber them just to be sure nobody does something stupid
5028c2ecf20Sopenharmony_ci	   like assuming they have some legal value.  */
5038c2ecf20Sopenharmony_ci	asm("" : "=r" (p4), "=r" (p5));
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ci	kernel_fpu_end();
5068c2ecf20Sopenharmony_ci}
5078c2ecf20Sopenharmony_ci
5088c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_pII_mmx = {
5098c2ecf20Sopenharmony_ci	.name = "pII_mmx",
5108c2ecf20Sopenharmony_ci	.do_2 = xor_pII_mmx_2,
5118c2ecf20Sopenharmony_ci	.do_3 = xor_pII_mmx_3,
5128c2ecf20Sopenharmony_ci	.do_4 = xor_pII_mmx_4,
5138c2ecf20Sopenharmony_ci	.do_5 = xor_pII_mmx_5,
5148c2ecf20Sopenharmony_ci};
5158c2ecf20Sopenharmony_ci
5168c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_p5_mmx = {
5178c2ecf20Sopenharmony_ci	.name = "p5_mmx",
5188c2ecf20Sopenharmony_ci	.do_2 = xor_p5_mmx_2,
5198c2ecf20Sopenharmony_ci	.do_3 = xor_p5_mmx_3,
5208c2ecf20Sopenharmony_ci	.do_4 = xor_p5_mmx_4,
5218c2ecf20Sopenharmony_ci	.do_5 = xor_p5_mmx_5,
5228c2ecf20Sopenharmony_ci};
5238c2ecf20Sopenharmony_ci
5248c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_pIII_sse = {
5258c2ecf20Sopenharmony_ci	.name = "pIII_sse",
5268c2ecf20Sopenharmony_ci	.do_2 = xor_sse_2,
5278c2ecf20Sopenharmony_ci	.do_3 = xor_sse_3,
5288c2ecf20Sopenharmony_ci	.do_4 = xor_sse_4,
5298c2ecf20Sopenharmony_ci	.do_5 = xor_sse_5,
5308c2ecf20Sopenharmony_ci};
5318c2ecf20Sopenharmony_ci
5328c2ecf20Sopenharmony_ci/* Also try the AVX routines */
5338c2ecf20Sopenharmony_ci#include <asm/xor_avx.h>
5348c2ecf20Sopenharmony_ci
5358c2ecf20Sopenharmony_ci/* Also try the generic routines.  */
5368c2ecf20Sopenharmony_ci#include <asm-generic/xor.h>
5378c2ecf20Sopenharmony_ci
5388c2ecf20Sopenharmony_ci/* We force the use of the SSE xor block because it can write around L2.
5398c2ecf20Sopenharmony_ci   We may also be able to load into the L1 only depending on how the cpu
5408c2ecf20Sopenharmony_ci   deals with a load to a line that is being prefetched.  */
5418c2ecf20Sopenharmony_ci#undef XOR_TRY_TEMPLATES
5428c2ecf20Sopenharmony_ci#define XOR_TRY_TEMPLATES				\
5438c2ecf20Sopenharmony_cido {							\
5448c2ecf20Sopenharmony_ci	AVX_XOR_SPEED;					\
5458c2ecf20Sopenharmony_ci	if (boot_cpu_has(X86_FEATURE_XMM)) {				\
5468c2ecf20Sopenharmony_ci		xor_speed(&xor_block_pIII_sse);		\
5478c2ecf20Sopenharmony_ci		xor_speed(&xor_block_sse_pf64);		\
5488c2ecf20Sopenharmony_ci	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
5498c2ecf20Sopenharmony_ci		xor_speed(&xor_block_pII_mmx);		\
5508c2ecf20Sopenharmony_ci		xor_speed(&xor_block_p5_mmx);		\
5518c2ecf20Sopenharmony_ci	} else {					\
5528c2ecf20Sopenharmony_ci		xor_speed(&xor_block_8regs);		\
5538c2ecf20Sopenharmony_ci		xor_speed(&xor_block_8regs_p);		\
5548c2ecf20Sopenharmony_ci		xor_speed(&xor_block_32regs);		\
5558c2ecf20Sopenharmony_ci		xor_speed(&xor_block_32regs_p);		\
5568c2ecf20Sopenharmony_ci	}						\
5578c2ecf20Sopenharmony_ci} while (0)
5588c2ecf20Sopenharmony_ci
5598c2ecf20Sopenharmony_ci#endif /* _ASM_X86_XOR_32_H */
560