18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
28c2ecf20Sopenharmony_ci#ifndef _ASM_X86_XOR_H
38c2ecf20Sopenharmony_ci#define _ASM_X86_XOR_H
48c2ecf20Sopenharmony_ci
58c2ecf20Sopenharmony_ci/*
68c2ecf20Sopenharmony_ci * Optimized RAID-5 checksumming functions for SSE.
78c2ecf20Sopenharmony_ci */
88c2ecf20Sopenharmony_ci
98c2ecf20Sopenharmony_ci/*
108c2ecf20Sopenharmony_ci * Cache avoiding checksumming functions utilizing KNI instructions
118c2ecf20Sopenharmony_ci * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
128c2ecf20Sopenharmony_ci */
138c2ecf20Sopenharmony_ci
148c2ecf20Sopenharmony_ci/*
158c2ecf20Sopenharmony_ci * Based on
168c2ecf20Sopenharmony_ci * High-speed RAID5 checksumming functions utilizing SSE instructions.
178c2ecf20Sopenharmony_ci * Copyright (C) 1998 Ingo Molnar.
188c2ecf20Sopenharmony_ci */
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_ci/*
218c2ecf20Sopenharmony_ci * x86-64 changes / gcc fixes from Andi Kleen.
228c2ecf20Sopenharmony_ci * Copyright 2002 Andi Kleen, SuSE Labs.
238c2ecf20Sopenharmony_ci *
248c2ecf20Sopenharmony_ci * This hasn't been optimized for the hammer yet, but there are likely
258c2ecf20Sopenharmony_ci * no advantages to be gotten from x86-64 here anyways.
268c2ecf20Sopenharmony_ci */
278c2ecf20Sopenharmony_ci
288c2ecf20Sopenharmony_ci#include <asm/fpu/api.h>
298c2ecf20Sopenharmony_ci
/*
 * Inline-asm operand constraint used for the [inc] operand, i.e. the
 * per-iteration pointer increment (always 256UL in this file).
 *
 * 32-bit builds use "i": the increment is emitted as an immediate, so no
 * general-purpose register is spent on it.  64-bit builds use "re", which
 * lets the compiler pick either a register or a constant operand.
 * NOTE(review): "e" should be the GCC x86 constraint for a 32-bit
 * sign-extended constant -- confirm against the GCC machine constraints.
 */
308c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_32
318c2ecf20Sopenharmony_ci/* reduce register pressure */
328c2ecf20Sopenharmony_ci# define XOR_CONSTANT_CONSTRAINT "i"
338c2ecf20Sopenharmony_ci#else
348c2ecf20Sopenharmony_ci# define XOR_CONSTANT_CONSTRAINT "re"
358c2ecf20Sopenharmony_ci#endif
368c2ecf20Sopenharmony_ci
/*
 * String-building helpers for the inline asm in the functions below.
 * Each instruction macro expands to one line of assembly as a string
 * literal.  x is an index in units of 16 bytes (one XMM register worth of
 * data) and y is the XMM register number.
 *
 *   OFFS(x)     byte offset 16*x
 *   PF_OFFS(x)  byte offset 256+16*x, i.e. 256 bytes (one loop iteration
 *               of the functions below) further on than OFFS(x)
 *   PF0..PF4    prefetchnta at PF_OFFS(x) from p1..p5 respectively
 *   LD / ST     movaps load from / store to p1 at OFFS(x)
 *   XO1..XO4    xorps the data at OFFS(x) of p2..p5 into %xmm<y>
 *   NOP         expands to nothing; passed to BLK64 when no prefetch is
 *               wanted
 *   BLK64       one pf(i), then op on the 16-byte slots i..i+3 using
 *               %xmm0..%xmm3, i.e. 64 bytes of data per expansion
 *
 * The index is stringified with #x, so an argument such as "i + 1" is
 * pasted into the asm text and the arithmetic is left to the assembler.
 */
378c2ecf20Sopenharmony_ci#define OFFS(x)		"16*("#x")"
388c2ecf20Sopenharmony_ci#define PF_OFFS(x)	"256+16*("#x")"
398c2ecf20Sopenharmony_ci#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
408c2ecf20Sopenharmony_ci#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
418c2ecf20Sopenharmony_ci#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
428c2ecf20Sopenharmony_ci#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
438c2ecf20Sopenharmony_ci#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
448c2ecf20Sopenharmony_ci#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
458c2ecf20Sopenharmony_ci#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
468c2ecf20Sopenharmony_ci#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
478c2ecf20Sopenharmony_ci#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
488c2ecf20Sopenharmony_ci#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
498c2ecf20Sopenharmony_ci#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
508c2ecf20Sopenharmony_ci#define NOP(x)
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_ci#define BLK64(pf, op, i)				\
538c2ecf20Sopenharmony_ci		pf(i)					\
548c2ecf20Sopenharmony_ci		op(i, 0)				\
558c2ecf20Sopenharmony_ci			op(i + 1, 1)			\
568c2ecf20Sopenharmony_ci				op(i + 2, 2)		\
578c2ecf20Sopenharmony_ci					op(i + 3, 3)
588c2ecf20Sopenharmony_ci
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2).
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 *
 * Each loop iteration handles 256 bytes as four 64-byte BLOCKs that use
 * %xmm0-%xmm3.  prefetchnta is issued every 32 bytes for both buffers,
 * targeting data 256 bytes or more ahead of the current position; the two
 * PF0 lines before the loop label cover the start of p1's next chunk.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
598c2ecf20Sopenharmony_cistatic void
608c2ecf20Sopenharmony_cixor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
618c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
628c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
638c2ecf20Sopenharmony_ci
648c2ecf20Sopenharmony_ci	kernel_fpu_begin();
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	asm volatile(
678c2ecf20Sopenharmony_ci#undef BLOCK
688c2ecf20Sopenharmony_ci#define BLOCK(i)					\
698c2ecf20Sopenharmony_ci		LD(i, 0)				\
708c2ecf20Sopenharmony_ci			LD(i + 1, 1)			\
718c2ecf20Sopenharmony_ci		PF1(i)					\
728c2ecf20Sopenharmony_ci				PF1(i + 2)		\
738c2ecf20Sopenharmony_ci				LD(i + 2, 2)		\
748c2ecf20Sopenharmony_ci					LD(i + 3, 3)	\
758c2ecf20Sopenharmony_ci		PF0(i + 4)				\
768c2ecf20Sopenharmony_ci				PF0(i + 6)		\
778c2ecf20Sopenharmony_ci		XO1(i, 0)				\
788c2ecf20Sopenharmony_ci			XO1(i + 1, 1)			\
798c2ecf20Sopenharmony_ci				XO1(i + 2, 2)		\
808c2ecf20Sopenharmony_ci					XO1(i + 3, 3)	\
818c2ecf20Sopenharmony_ci		ST(i, 0)				\
828c2ecf20Sopenharmony_ci			ST(i + 1, 1)			\
838c2ecf20Sopenharmony_ci				ST(i + 2, 2)		\
848c2ecf20Sopenharmony_ci					ST(i + 3, 3)	\
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci
878c2ecf20Sopenharmony_ci		PF0(0)
888c2ecf20Sopenharmony_ci				PF0(2)
898c2ecf20Sopenharmony_ci
908c2ecf20Sopenharmony_ci	" .align 32			;\n"
918c2ecf20Sopenharmony_ci	" 1:                            ;\n"
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci		BLOCK(0)
948c2ecf20Sopenharmony_ci		BLOCK(4)
958c2ecf20Sopenharmony_ci		BLOCK(8)
968c2ecf20Sopenharmony_ci		BLOCK(12)
978c2ecf20Sopenharmony_ci
988c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
998c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
1008c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
1018c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
1028c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines),
1038c2ecf20Sopenharmony_ci	  [p1] "+r" (p1), [p2] "+r" (p2)
1048c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
1058c2ecf20Sopenharmony_ci	: "memory");
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci	kernel_fpu_end();
1088c2ecf20Sopenharmony_ci}
1098c2ecf20Sopenharmony_ci
/*
 * xor_sse_2_pf64 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2),
 *                  prefetching once per 64 bytes.
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 *
 * Each BLOCK is built from BLK64 groups: load 64 bytes of p1 into
 * %xmm0-%xmm3, xorps in 64 bytes of p2, store back to p1.  The load and
 * xor groups each start with a single prefetchnta 256 bytes ahead; the
 * store group passes NOP and prefetches nothing.  There is no prefetch
 * before the loop label.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
1108c2ecf20Sopenharmony_cistatic void
1118c2ecf20Sopenharmony_cixor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
1128c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
1138c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_ci	asm volatile(
1188c2ecf20Sopenharmony_ci#undef BLOCK
1198c2ecf20Sopenharmony_ci#define BLOCK(i)			\
1208c2ecf20Sopenharmony_ci		BLK64(PF0, LD, i)	\
1218c2ecf20Sopenharmony_ci		BLK64(PF1, XO1, i)	\
1228c2ecf20Sopenharmony_ci		BLK64(NOP, ST, i)	\
1238c2ecf20Sopenharmony_ci
1248c2ecf20Sopenharmony_ci	" .align 32			;\n"
1258c2ecf20Sopenharmony_ci	" 1:                            ;\n"
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_ci		BLOCK(0)
1288c2ecf20Sopenharmony_ci		BLOCK(4)
1298c2ecf20Sopenharmony_ci		BLOCK(8)
1308c2ecf20Sopenharmony_ci		BLOCK(12)
1318c2ecf20Sopenharmony_ci
1328c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
1338c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
1348c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
1358c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
1368c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines),
1378c2ecf20Sopenharmony_ci	  [p1] "+r" (p1), [p2] "+r" (p2)
1388c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
1398c2ecf20Sopenharmony_ci	: "memory");
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci	kernel_fpu_end();
1428c2ecf20Sopenharmony_ci}
1438c2ecf20Sopenharmony_ci
/*
 * xor_sse_3 - XOR the buffers at p2 and p3 into p1 (p1 ^= p2 ^ p3).
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 * @p3:    third source buffer (only read)
 *
 * Each loop iteration handles 256 bytes as four 64-byte BLOCKs that use
 * %xmm0-%xmm3: load from p1, xorps with p2, xorps with p3, store to p1.
 * prefetchnta is issued every 32 bytes for all three buffers, targeting
 * data 256 bytes or more ahead; the two PF0 lines before the loop label
 * cover the start of p1's next chunk.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
1448c2ecf20Sopenharmony_cistatic void
1458c2ecf20Sopenharmony_cixor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
1468c2ecf20Sopenharmony_ci	  unsigned long *p3)
1478c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
1488c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci	asm volatile(
1538c2ecf20Sopenharmony_ci#undef BLOCK
1548c2ecf20Sopenharmony_ci#define BLOCK(i) \
1558c2ecf20Sopenharmony_ci		PF1(i)					\
1568c2ecf20Sopenharmony_ci				PF1(i + 2)		\
1578c2ecf20Sopenharmony_ci		LD(i, 0)				\
1588c2ecf20Sopenharmony_ci			LD(i + 1, 1)			\
1598c2ecf20Sopenharmony_ci				LD(i + 2, 2)		\
1608c2ecf20Sopenharmony_ci					LD(i + 3, 3)	\
1618c2ecf20Sopenharmony_ci		PF2(i)					\
1628c2ecf20Sopenharmony_ci				PF2(i + 2)		\
1638c2ecf20Sopenharmony_ci		PF0(i + 4)				\
1648c2ecf20Sopenharmony_ci				PF0(i + 6)		\
1658c2ecf20Sopenharmony_ci		XO1(i, 0)				\
1668c2ecf20Sopenharmony_ci			XO1(i + 1, 1)			\
1678c2ecf20Sopenharmony_ci				XO1(i + 2, 2)		\
1688c2ecf20Sopenharmony_ci					XO1(i + 3, 3)	\
1698c2ecf20Sopenharmony_ci		XO2(i, 0)				\
1708c2ecf20Sopenharmony_ci			XO2(i + 1, 1)			\
1718c2ecf20Sopenharmony_ci				XO2(i + 2, 2)		\
1728c2ecf20Sopenharmony_ci					XO2(i + 3, 3)	\
1738c2ecf20Sopenharmony_ci		ST(i, 0)				\
1748c2ecf20Sopenharmony_ci			ST(i + 1, 1)			\
1758c2ecf20Sopenharmony_ci				ST(i + 2, 2)		\
1768c2ecf20Sopenharmony_ci					ST(i + 3, 3)	\
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci		PF0(0)
1808c2ecf20Sopenharmony_ci				PF0(2)
1818c2ecf20Sopenharmony_ci
1828c2ecf20Sopenharmony_ci	" .align 32			;\n"
1838c2ecf20Sopenharmony_ci	" 1:                            ;\n"
1848c2ecf20Sopenharmony_ci
1858c2ecf20Sopenharmony_ci		BLOCK(0)
1868c2ecf20Sopenharmony_ci		BLOCK(4)
1878c2ecf20Sopenharmony_ci		BLOCK(8)
1888c2ecf20Sopenharmony_ci		BLOCK(12)
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
1918c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
1928c2ecf20Sopenharmony_ci	"       add %[inc], %[p3]       ;\n"
1938c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
1948c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
1958c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines),
1968c2ecf20Sopenharmony_ci	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
1978c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
1988c2ecf20Sopenharmony_ci	: "memory");
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci	kernel_fpu_end();
2018c2ecf20Sopenharmony_ci}
2028c2ecf20Sopenharmony_ci
/*
 * xor_sse_3_pf64 - XOR the buffers at p2 and p3 into p1
 *                  (p1 ^= p2 ^ p3), prefetching once per 64 bytes.
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 * @p3:    third source buffer (only read)
 *
 * Each BLOCK is built from BLK64 groups: load 64 bytes of p1 into
 * %xmm0-%xmm3, xorps in p2, xorps in p3, store back to p1.  The load and
 * xor groups each start with a single prefetchnta 256 bytes ahead in the
 * buffer they read; the store group passes NOP and prefetches nothing.
 * There is no prefetch before the loop label.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
2038c2ecf20Sopenharmony_cistatic void
2048c2ecf20Sopenharmony_cixor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
2058c2ecf20Sopenharmony_ci	       unsigned long *p3)
2068c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
2078c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2108c2ecf20Sopenharmony_ci
2118c2ecf20Sopenharmony_ci	asm volatile(
2128c2ecf20Sopenharmony_ci#undef BLOCK
2138c2ecf20Sopenharmony_ci#define BLOCK(i)			\
2148c2ecf20Sopenharmony_ci		BLK64(PF0, LD, i)	\
2158c2ecf20Sopenharmony_ci		BLK64(PF1, XO1, i)	\
2168c2ecf20Sopenharmony_ci		BLK64(PF2, XO2, i)	\
2178c2ecf20Sopenharmony_ci		BLK64(NOP, ST, i)	\
2188c2ecf20Sopenharmony_ci
2198c2ecf20Sopenharmony_ci	" .align 32			;\n"
2208c2ecf20Sopenharmony_ci	" 1:                            ;\n"
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_ci		BLOCK(0)
2238c2ecf20Sopenharmony_ci		BLOCK(4)
2248c2ecf20Sopenharmony_ci		BLOCK(8)
2258c2ecf20Sopenharmony_ci		BLOCK(12)
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
2288c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
2298c2ecf20Sopenharmony_ci	"       add %[inc], %[p3]       ;\n"
2308c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
2318c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
2328c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines),
2338c2ecf20Sopenharmony_ci	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
2348c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
2358c2ecf20Sopenharmony_ci	: "memory");
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci	kernel_fpu_end();
2388c2ecf20Sopenharmony_ci}
2398c2ecf20Sopenharmony_ci
/*
 * xor_sse_4 - XOR the buffers at p2, p3 and p4 into p1
 *             (p1 ^= p2 ^ p3 ^ p4).
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 * @p3:    third source buffer (only read)
 * @p4:    fourth source buffer (only read)
 *
 * Each loop iteration handles 256 bytes as four 64-byte BLOCKs that use
 * %xmm0-%xmm3: load from p1, xorps with p2, p3 and p4 in turn, store to
 * p1.  prefetchnta is issued every 32 bytes for all four buffers,
 * targeting data 256 bytes or more ahead; the two PF0 lines before the
 * loop label cover the start of p1's next chunk.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
2408c2ecf20Sopenharmony_cistatic void
2418c2ecf20Sopenharmony_cixor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
2428c2ecf20Sopenharmony_ci	  unsigned long *p3, unsigned long *p4)
2438c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
2448c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
2458c2ecf20Sopenharmony_ci
2468c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2478c2ecf20Sopenharmony_ci
2488c2ecf20Sopenharmony_ci	asm volatile(
2498c2ecf20Sopenharmony_ci#undef BLOCK
2508c2ecf20Sopenharmony_ci#define BLOCK(i) \
2518c2ecf20Sopenharmony_ci		PF1(i)					\
2528c2ecf20Sopenharmony_ci				PF1(i + 2)		\
2538c2ecf20Sopenharmony_ci		LD(i, 0)				\
2548c2ecf20Sopenharmony_ci			LD(i + 1, 1)			\
2558c2ecf20Sopenharmony_ci				LD(i + 2, 2)		\
2568c2ecf20Sopenharmony_ci					LD(i + 3, 3)	\
2578c2ecf20Sopenharmony_ci		PF2(i)					\
2588c2ecf20Sopenharmony_ci				PF2(i + 2)		\
2598c2ecf20Sopenharmony_ci		XO1(i, 0)				\
2608c2ecf20Sopenharmony_ci			XO1(i + 1, 1)			\
2618c2ecf20Sopenharmony_ci				XO1(i + 2, 2)		\
2628c2ecf20Sopenharmony_ci					XO1(i + 3, 3)	\
2638c2ecf20Sopenharmony_ci		PF3(i)					\
2648c2ecf20Sopenharmony_ci				PF3(i + 2)		\
2658c2ecf20Sopenharmony_ci		PF0(i + 4)				\
2668c2ecf20Sopenharmony_ci				PF0(i + 6)		\
2678c2ecf20Sopenharmony_ci		XO2(i, 0)				\
2688c2ecf20Sopenharmony_ci			XO2(i + 1, 1)			\
2698c2ecf20Sopenharmony_ci				XO2(i + 2, 2)		\
2708c2ecf20Sopenharmony_ci					XO2(i + 3, 3)	\
2718c2ecf20Sopenharmony_ci		XO3(i, 0)				\
2728c2ecf20Sopenharmony_ci			XO3(i + 1, 1)			\
2738c2ecf20Sopenharmony_ci				XO3(i + 2, 2)		\
2748c2ecf20Sopenharmony_ci					XO3(i + 3, 3)	\
2758c2ecf20Sopenharmony_ci		ST(i, 0)				\
2768c2ecf20Sopenharmony_ci			ST(i + 1, 1)			\
2778c2ecf20Sopenharmony_ci				ST(i + 2, 2)		\
2788c2ecf20Sopenharmony_ci					ST(i + 3, 3)	\
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci
2818c2ecf20Sopenharmony_ci		PF0(0)
2828c2ecf20Sopenharmony_ci				PF0(2)
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci	" .align 32			;\n"
2858c2ecf20Sopenharmony_ci	" 1:                            ;\n"
2868c2ecf20Sopenharmony_ci
2878c2ecf20Sopenharmony_ci		BLOCK(0)
2888c2ecf20Sopenharmony_ci		BLOCK(4)
2898c2ecf20Sopenharmony_ci		BLOCK(8)
2908c2ecf20Sopenharmony_ci		BLOCK(12)
2918c2ecf20Sopenharmony_ci
2928c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
2938c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
2948c2ecf20Sopenharmony_ci	"       add %[inc], %[p3]       ;\n"
2958c2ecf20Sopenharmony_ci	"       add %[inc], %[p4]       ;\n"
2968c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
2978c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
2988c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines), [p1] "+r" (p1),
2998c2ecf20Sopenharmony_ci	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
3008c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
3018c2ecf20Sopenharmony_ci	: "memory");
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci	kernel_fpu_end();
3048c2ecf20Sopenharmony_ci}
3058c2ecf20Sopenharmony_ci
/*
 * xor_sse_4_pf64 - XOR the buffers at p2, p3 and p4 into p1
 *                  (p1 ^= p2 ^ p3 ^ p4), prefetching once per 64 bytes.
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 * @p3:    third source buffer (only read)
 * @p4:    fourth source buffer (only read)
 *
 * Each BLOCK is built from BLK64 groups: load 64 bytes of p1 into
 * %xmm0-%xmm3, xorps in p2, p3 and p4 in turn, store back to p1.  The load
 * and xor groups each start with a single prefetchnta 256 bytes ahead in
 * the buffer they read; the store group passes NOP and prefetches nothing.
 * There is no prefetch before the loop label.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
3068c2ecf20Sopenharmony_cistatic void
3078c2ecf20Sopenharmony_cixor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
3088c2ecf20Sopenharmony_ci	       unsigned long *p3, unsigned long *p4)
3098c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
3108c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
3118c2ecf20Sopenharmony_ci
3128c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3138c2ecf20Sopenharmony_ci
3148c2ecf20Sopenharmony_ci	asm volatile(
3158c2ecf20Sopenharmony_ci#undef BLOCK
3168c2ecf20Sopenharmony_ci#define BLOCK(i)			\
3178c2ecf20Sopenharmony_ci		BLK64(PF0, LD, i)	\
3188c2ecf20Sopenharmony_ci		BLK64(PF1, XO1, i)	\
3198c2ecf20Sopenharmony_ci		BLK64(PF2, XO2, i)	\
3208c2ecf20Sopenharmony_ci		BLK64(PF3, XO3, i)	\
3218c2ecf20Sopenharmony_ci		BLK64(NOP, ST, i)	\
3228c2ecf20Sopenharmony_ci
3238c2ecf20Sopenharmony_ci	" .align 32			;\n"
3248c2ecf20Sopenharmony_ci	" 1:                            ;\n"
3258c2ecf20Sopenharmony_ci
3268c2ecf20Sopenharmony_ci		BLOCK(0)
3278c2ecf20Sopenharmony_ci		BLOCK(4)
3288c2ecf20Sopenharmony_ci		BLOCK(8)
3298c2ecf20Sopenharmony_ci		BLOCK(12)
3308c2ecf20Sopenharmony_ci
3318c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
3328c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
3338c2ecf20Sopenharmony_ci	"       add %[inc], %[p3]       ;\n"
3348c2ecf20Sopenharmony_ci	"       add %[inc], %[p4]       ;\n"
3358c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
3368c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
3378c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines), [p1] "+r" (p1),
3388c2ecf20Sopenharmony_ci	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
3398c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
3408c2ecf20Sopenharmony_ci	: "memory");
3418c2ecf20Sopenharmony_ci
3428c2ecf20Sopenharmony_ci	kernel_fpu_end();
3438c2ecf20Sopenharmony_ci}
3448c2ecf20Sopenharmony_ci
/*
 * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into p1
 *             (p1 ^= p2 ^ p3 ^ p4 ^ p5).
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 * @p3:    third source buffer (only read)
 * @p4:    fourth source buffer (only read)
 * @p5:    fifth source buffer (only read)
 *
 * Each loop iteration handles 256 bytes as four 64-byte BLOCKs that use
 * %xmm0-%xmm3: load from p1, xorps with p2..p5 in turn, store to p1.
 * prefetchnta is issued every 32 bytes for all five buffers, targeting
 * data 256 bytes or more ahead; the two PF0 lines before the loop label
 * cover the start of p1's next chunk.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
3458c2ecf20Sopenharmony_cistatic void
3468c2ecf20Sopenharmony_cixor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
3478c2ecf20Sopenharmony_ci	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
3488c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
3498c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3528c2ecf20Sopenharmony_ci
3538c2ecf20Sopenharmony_ci	asm volatile(
3548c2ecf20Sopenharmony_ci#undef BLOCK
3558c2ecf20Sopenharmony_ci#define BLOCK(i) \
3568c2ecf20Sopenharmony_ci		PF1(i)					\
3578c2ecf20Sopenharmony_ci				PF1(i + 2)		\
3588c2ecf20Sopenharmony_ci		LD(i, 0)				\
3598c2ecf20Sopenharmony_ci			LD(i + 1, 1)			\
3608c2ecf20Sopenharmony_ci				LD(i + 2, 2)		\
3618c2ecf20Sopenharmony_ci					LD(i + 3, 3)	\
3628c2ecf20Sopenharmony_ci		PF2(i)					\
3638c2ecf20Sopenharmony_ci				PF2(i + 2)		\
3648c2ecf20Sopenharmony_ci		XO1(i, 0)				\
3658c2ecf20Sopenharmony_ci			XO1(i + 1, 1)			\
3668c2ecf20Sopenharmony_ci				XO1(i + 2, 2)		\
3678c2ecf20Sopenharmony_ci					XO1(i + 3, 3)	\
3688c2ecf20Sopenharmony_ci		PF3(i)					\
3698c2ecf20Sopenharmony_ci				PF3(i + 2)		\
3708c2ecf20Sopenharmony_ci		XO2(i, 0)				\
3718c2ecf20Sopenharmony_ci			XO2(i + 1, 1)			\
3728c2ecf20Sopenharmony_ci				XO2(i + 2, 2)		\
3738c2ecf20Sopenharmony_ci					XO2(i + 3, 3)	\
3748c2ecf20Sopenharmony_ci		PF4(i)					\
3758c2ecf20Sopenharmony_ci				PF4(i + 2)		\
3768c2ecf20Sopenharmony_ci		PF0(i + 4)				\
3778c2ecf20Sopenharmony_ci				PF0(i + 6)		\
3788c2ecf20Sopenharmony_ci		XO3(i, 0)				\
3798c2ecf20Sopenharmony_ci			XO3(i + 1, 1)			\
3808c2ecf20Sopenharmony_ci				XO3(i + 2, 2)		\
3818c2ecf20Sopenharmony_ci					XO3(i + 3, 3)	\
3828c2ecf20Sopenharmony_ci		XO4(i, 0)				\
3838c2ecf20Sopenharmony_ci			XO4(i + 1, 1)			\
3848c2ecf20Sopenharmony_ci				XO4(i + 2, 2)		\
3858c2ecf20Sopenharmony_ci					XO4(i + 3, 3)	\
3868c2ecf20Sopenharmony_ci		ST(i, 0)				\
3878c2ecf20Sopenharmony_ci			ST(i + 1, 1)			\
3888c2ecf20Sopenharmony_ci				ST(i + 2, 2)		\
3898c2ecf20Sopenharmony_ci					ST(i + 3, 3)	\
3908c2ecf20Sopenharmony_ci
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci		PF0(0)
3938c2ecf20Sopenharmony_ci				PF0(2)
3948c2ecf20Sopenharmony_ci
3958c2ecf20Sopenharmony_ci	" .align 32			;\n"
3968c2ecf20Sopenharmony_ci	" 1:                            ;\n"
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci		BLOCK(0)
3998c2ecf20Sopenharmony_ci		BLOCK(4)
4008c2ecf20Sopenharmony_ci		BLOCK(8)
4018c2ecf20Sopenharmony_ci		BLOCK(12)
4028c2ecf20Sopenharmony_ci
4038c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
4048c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
4058c2ecf20Sopenharmony_ci	"       add %[inc], %[p3]       ;\n"
4068c2ecf20Sopenharmony_ci	"       add %[inc], %[p4]       ;\n"
4078c2ecf20Sopenharmony_ci	"       add %[inc], %[p5]       ;\n"
4088c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
4098c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
4108c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
4118c2ecf20Sopenharmony_ci	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
4128c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
4138c2ecf20Sopenharmony_ci	: "memory");
4148c2ecf20Sopenharmony_ci
4158c2ecf20Sopenharmony_ci	kernel_fpu_end();
4168c2ecf20Sopenharmony_ci}
4178c2ecf20Sopenharmony_ci
/*
 * xor_sse_5_pf64 - XOR the buffers at p2, p3, p4 and p5 into p1
 *                  (p1 ^= p2 ^ p3 ^ p4 ^ p5), prefetching once per
 *                  64 bytes.
 * @bytes: buffer length in bytes; bytes >> 8 chunks of 256 bytes are
 *         processed and any remainder below 256 is ignored
 * @p1:    destination buffer, also read as the first source
 * @p2:    second source buffer (only read)
 * @p3:    third source buffer (only read)
 * @p4:    fourth source buffer (only read)
 * @p5:    fifth source buffer (only read)
 *
 * Each BLOCK is built from BLK64 groups: load 64 bytes of p1 into
 * %xmm0-%xmm3, xorps in p2..p5 in turn, store back to p1.  The load and
 * xor groups each start with a single prefetchnta 256 bytes ahead in the
 * buffer they read; the store group passes NOP and prefetches nothing.
 * There is no prefetch before the loop label.
 *
 * The loop is bottom-tested (dec/jnz after the body): with bytes < 256 the
 * body still runs once and the counter wraps, so callers must pass at
 * least 256.  The XMM registers are not listed as asm clobbers; the asm is
 * bracketed by kernel_fpu_begin()/kernel_fpu_end() instead.
 * NOTE(review): movaps is the alignment-checking SSE move, so the buffers
 * are presumably 16-byte aligned -- confirm against callers.
 */
4188c2ecf20Sopenharmony_cistatic void
4198c2ecf20Sopenharmony_cixor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
4208c2ecf20Sopenharmony_ci	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
4218c2ecf20Sopenharmony_ci{
	/* Loop count: one iteration per 256-byte chunk. */
4228c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 8;
4238c2ecf20Sopenharmony_ci
4248c2ecf20Sopenharmony_ci	kernel_fpu_begin();
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	asm volatile(
4278c2ecf20Sopenharmony_ci#undef BLOCK
4288c2ecf20Sopenharmony_ci#define BLOCK(i)			\
4298c2ecf20Sopenharmony_ci		BLK64(PF0, LD, i)	\
4308c2ecf20Sopenharmony_ci		BLK64(PF1, XO1, i)	\
4318c2ecf20Sopenharmony_ci		BLK64(PF2, XO2, i)	\
4328c2ecf20Sopenharmony_ci		BLK64(PF3, XO3, i)	\
4338c2ecf20Sopenharmony_ci		BLK64(PF4, XO4, i)	\
4348c2ecf20Sopenharmony_ci		BLK64(NOP, ST, i)	\
4358c2ecf20Sopenharmony_ci
4368c2ecf20Sopenharmony_ci	" .align 32			;\n"
4378c2ecf20Sopenharmony_ci	" 1:                            ;\n"
4388c2ecf20Sopenharmony_ci
4398c2ecf20Sopenharmony_ci		BLOCK(0)
4408c2ecf20Sopenharmony_ci		BLOCK(4)
4418c2ecf20Sopenharmony_ci		BLOCK(8)
4428c2ecf20Sopenharmony_ci		BLOCK(12)
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci	"       add %[inc], %[p1]       ;\n"
4458c2ecf20Sopenharmony_ci	"       add %[inc], %[p2]       ;\n"
4468c2ecf20Sopenharmony_ci	"       add %[inc], %[p3]       ;\n"
4478c2ecf20Sopenharmony_ci	"       add %[inc], %[p4]       ;\n"
4488c2ecf20Sopenharmony_ci	"       add %[inc], %[p5]       ;\n"
4498c2ecf20Sopenharmony_ci	"       dec %[cnt]              ;\n"
4508c2ecf20Sopenharmony_ci	"       jnz 1b                  ;\n"
4518c2ecf20Sopenharmony_ci	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
4528c2ecf20Sopenharmony_ci	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
4538c2ecf20Sopenharmony_ci	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
4548c2ecf20Sopenharmony_ci	: "memory");
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	kernel_fpu_end();
4578c2ecf20Sopenharmony_ci}
4588c2ecf20Sopenharmony_ci
/*
 * Template named "prefetch64-sse" that bundles the *_pf64 routines:
 * .do_2 .. .do_5 point at the 2- to 5-buffer variants respectively.
 * NOTE(review): struct xor_block_template is declared outside this file.
 * No template for the non-pf64 xor_sse_N routines is defined here;
 * presumably the xor_32.h / xor_64.h headers included further down
 * provide it -- confirm.
 */
4598c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_sse_pf64 = {
4608c2ecf20Sopenharmony_ci	.name = "prefetch64-sse",
4618c2ecf20Sopenharmony_ci	.do_2 = xor_sse_2_pf64,
4628c2ecf20Sopenharmony_ci	.do_3 = xor_sse_3_pf64,
4638c2ecf20Sopenharmony_ci	.do_4 = xor_sse_4_pf64,
4648c2ecf20Sopenharmony_ci	.do_5 = xor_sse_5_pf64,
4658c2ecf20Sopenharmony_ci};
4668c2ecf20Sopenharmony_ci
/*
 * Remove the asm helper macros and the operand-constraint macro now that
 * the SSE routines are defined, presumably so the short names (LD, ST,
 * BLOCK, ...) do not collide with the headers included below.
 * Note that OFFS, PF_OFFS and PF0..PF4 are not undefined here and remain
 * visible to whatever follows.
 */
4678c2ecf20Sopenharmony_ci#undef LD
4688c2ecf20Sopenharmony_ci#undef XO1
4698c2ecf20Sopenharmony_ci#undef XO2
4708c2ecf20Sopenharmony_ci#undef XO3
4718c2ecf20Sopenharmony_ci#undef XO4
4728c2ecf20Sopenharmony_ci#undef ST
4738c2ecf20Sopenharmony_ci#undef NOP
4748c2ecf20Sopenharmony_ci#undef BLK64
4758c2ecf20Sopenharmony_ci#undef BLOCK
4768c2ecf20Sopenharmony_ci
4778c2ecf20Sopenharmony_ci#undef XOR_CONSTANT_CONSTRAINT
4788c2ecf20Sopenharmony_ci
4798c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_32
4808c2ecf20Sopenharmony_ci# include <asm/xor_32.h>
4818c2ecf20Sopenharmony_ci#else
4828c2ecf20Sopenharmony_ci# include <asm/xor_64.h>
4838c2ecf20Sopenharmony_ci#endif
4848c2ecf20Sopenharmony_ci
/*
 * Template selection hook: forwards FASTEST unchanged to AVX_SELECT().
 * NOTE(review): AVX_SELECT is not defined in this file; presumably it is
 * provided via the xor_32.h / xor_64.h include above -- confirm.
 */
4858c2ecf20Sopenharmony_ci#define XOR_SELECT_TEMPLATE(FASTEST) \
4868c2ecf20Sopenharmony_ci	AVX_SELECT(FASTEST)
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_ci#endif /* _ASM_X86_XOR_H */
489