/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

/*
 * BLOCK4(i) - expand BLOCK() for four consecutive 32-byte chunks.
 *
 * @i: index of the first chunk; chunk k lives at byte offset 32 * k.
 *
 * Chunks i, i + 1, i + 2 and i + 3 are paired with register numbers 0..3
 * (the second BLOCK() argument).  BLOCK(offset, reg) is (re)defined by each
 * user right before the expansion and must expand to a complete statement,
 * because no separator is emitted between the four expansions.
 *
 * @i is parenthesized so that an expression argument such as "a + b" still
 * produces the intended offsets.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)

/*
 * BLOCK16() - expand BLOCK() for sixteen consecutive 32-byte chunks.
 *
 * Covers chunk indices 0..15 (byte offsets 0..480, i.e. 512 bytes in total)
 * in four BLOCK4() groups, so the register number passed to BLOCK() cycles
 * 0, 1, 2, 3 within each group.
 */
#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

298c2ecf20Sopenharmony_cistatic void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
308c2ecf20Sopenharmony_ci{
318c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 9;
328c2ecf20Sopenharmony_ci
338c2ecf20Sopenharmony_ci	kernel_fpu_begin();
348c2ecf20Sopenharmony_ci
358c2ecf20Sopenharmony_ci	while (lines--) {
368c2ecf20Sopenharmony_ci#undef BLOCK
378c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \
388c2ecf20Sopenharmony_cido { \
398c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
408c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
418c2ecf20Sopenharmony_ci		"m" (p0[i / sizeof(*p0)])); \
428c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
438c2ecf20Sopenharmony_ci		"=m" (p0[i / sizeof(*p0)])); \
448c2ecf20Sopenharmony_ci} while (0);
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci		BLOCK16()
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_ci		p0 = (unsigned long *)((uintptr_t)p0 + 512);
498c2ecf20Sopenharmony_ci		p1 = (unsigned long *)((uintptr_t)p1 + 512);
508c2ecf20Sopenharmony_ci	}
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_ci	kernel_fpu_end();
538c2ecf20Sopenharmony_ci}
548c2ecf20Sopenharmony_ci
558c2ecf20Sopenharmony_cistatic void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
568c2ecf20Sopenharmony_ci	unsigned long *p2)
578c2ecf20Sopenharmony_ci{
588c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 9;
598c2ecf20Sopenharmony_ci
608c2ecf20Sopenharmony_ci	kernel_fpu_begin();
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci	while (lines--) {
638c2ecf20Sopenharmony_ci#undef BLOCK
648c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \
658c2ecf20Sopenharmony_cido { \
668c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
678c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
688c2ecf20Sopenharmony_ci		"m" (p1[i / sizeof(*p1)])); \
698c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
708c2ecf20Sopenharmony_ci		"m" (p0[i / sizeof(*p0)])); \
718c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
728c2ecf20Sopenharmony_ci		"=m" (p0[i / sizeof(*p0)])); \
738c2ecf20Sopenharmony_ci} while (0);
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_ci		BLOCK16()
768c2ecf20Sopenharmony_ci
778c2ecf20Sopenharmony_ci		p0 = (unsigned long *)((uintptr_t)p0 + 512);
788c2ecf20Sopenharmony_ci		p1 = (unsigned long *)((uintptr_t)p1 + 512);
798c2ecf20Sopenharmony_ci		p2 = (unsigned long *)((uintptr_t)p2 + 512);
808c2ecf20Sopenharmony_ci	}
818c2ecf20Sopenharmony_ci
828c2ecf20Sopenharmony_ci	kernel_fpu_end();
838c2ecf20Sopenharmony_ci}
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_cistatic void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
868c2ecf20Sopenharmony_ci	unsigned long *p2, unsigned long *p3)
878c2ecf20Sopenharmony_ci{
888c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 9;
898c2ecf20Sopenharmony_ci
908c2ecf20Sopenharmony_ci	kernel_fpu_begin();
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci	while (lines--) {
938c2ecf20Sopenharmony_ci#undef BLOCK
948c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \
958c2ecf20Sopenharmony_cido { \
968c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
978c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
988c2ecf20Sopenharmony_ci		"m" (p2[i / sizeof(*p2)])); \
998c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
1008c2ecf20Sopenharmony_ci		"m" (p1[i / sizeof(*p1)])); \
1018c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
1028c2ecf20Sopenharmony_ci		"m" (p0[i / sizeof(*p0)])); \
1038c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
1048c2ecf20Sopenharmony_ci		"=m" (p0[i / sizeof(*p0)])); \
1058c2ecf20Sopenharmony_ci} while (0);
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci		BLOCK16();
1088c2ecf20Sopenharmony_ci
1098c2ecf20Sopenharmony_ci		p0 = (unsigned long *)((uintptr_t)p0 + 512);
1108c2ecf20Sopenharmony_ci		p1 = (unsigned long *)((uintptr_t)p1 + 512);
1118c2ecf20Sopenharmony_ci		p2 = (unsigned long *)((uintptr_t)p2 + 512);
1128c2ecf20Sopenharmony_ci		p3 = (unsigned long *)((uintptr_t)p3 + 512);
1138c2ecf20Sopenharmony_ci	}
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_ci	kernel_fpu_end();
1168c2ecf20Sopenharmony_ci}
1178c2ecf20Sopenharmony_ci
1188c2ecf20Sopenharmony_cistatic void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
1198c2ecf20Sopenharmony_ci	unsigned long *p2, unsigned long *p3, unsigned long *p4)
1208c2ecf20Sopenharmony_ci{
1218c2ecf20Sopenharmony_ci	unsigned long lines = bytes >> 9;
1228c2ecf20Sopenharmony_ci
1238c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ci	while (lines--) {
1268c2ecf20Sopenharmony_ci#undef BLOCK
1278c2ecf20Sopenharmony_ci#define BLOCK(i, reg) \
1288c2ecf20Sopenharmony_cido { \
1298c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
1308c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
1318c2ecf20Sopenharmony_ci		"m" (p3[i / sizeof(*p3)])); \
1328c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
1338c2ecf20Sopenharmony_ci		"m" (p2[i / sizeof(*p2)])); \
1348c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
1358c2ecf20Sopenharmony_ci		"m" (p1[i / sizeof(*p1)])); \
1368c2ecf20Sopenharmony_ci	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
1378c2ecf20Sopenharmony_ci		"m" (p0[i / sizeof(*p0)])); \
1388c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
1398c2ecf20Sopenharmony_ci		"=m" (p0[i / sizeof(*p0)])); \
1408c2ecf20Sopenharmony_ci} while (0);
1418c2ecf20Sopenharmony_ci
1428c2ecf20Sopenharmony_ci		BLOCK16()
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci		p0 = (unsigned long *)((uintptr_t)p0 + 512);
1458c2ecf20Sopenharmony_ci		p1 = (unsigned long *)((uintptr_t)p1 + 512);
1468c2ecf20Sopenharmony_ci		p2 = (unsigned long *)((uintptr_t)p2 + 512);
1478c2ecf20Sopenharmony_ci		p3 = (unsigned long *)((uintptr_t)p3 + 512);
1488c2ecf20Sopenharmony_ci		p4 = (unsigned long *)((uintptr_t)p4 + 512);
1498c2ecf20Sopenharmony_ci	}
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci	kernel_fpu_end();
1528c2ecf20Sopenharmony_ci}
1538c2ecf20Sopenharmony_ci
1548c2ecf20Sopenharmony_cistatic struct xor_block_template xor_block_avx = {
1558c2ecf20Sopenharmony_ci	.name = "avx",
1568c2ecf20Sopenharmony_ci	.do_2 = xor_avx_2,
1578c2ecf20Sopenharmony_ci	.do_3 = xor_avx_3,
1588c2ecf20Sopenharmony_ci	.do_4 = xor_avx_4,
1598c2ecf20Sopenharmony_ci	.do_5 = xor_avx_5,
1608c2ecf20Sopenharmony_ci};
1618c2ecf20Sopenharmony_ci
/*
 * AVX_XOR_SPEED - pass the AVX template to xor_speed(), but only when the
 * boot CPU reports both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE.
 *
 * Wrapped in do { } while (0) so it can be used as a single statement
 * followed by ';'.  xor_speed() is defined by the including code
 * (presumably the benchmark hook for template selection - confirm there).
 */
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

/*
 * AVX_SELECT(FASTEST) - evaluates to &xor_block_avx when the boot CPU reports
 * both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE, otherwise to FASTEST.
 *
 * FASTEST is parenthesized so that any expression (for example an
 * assignment) can be passed without changing how the ?: is parsed.
 */
#define AVX_SELECT(FASTEST) \
	((boot_cpu_has(X86_FEATURE_AVX) && \
	  boot_cpu_has(X86_FEATURE_OSXSAVE)) ? &xor_block_avx : (FASTEST))

#endif /* _ASM_X86_XOR_AVX_H */