/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

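/*
 * BLOCK(offset, reg) is (re)defined by each routine below to XOR one
 * 32-byte chunk through YMM register "reg".  BLOCK4() chains four such
 * chunks over ymm0-ymm3, and BLOCK16() chains sixteen, i.e. 512 bytes
 * per invocation, reusing the same four registers.
 */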
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

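/*
 * XOR the source buffer p1 into the destination buffer p0.  Each pass of
 * BLOCK16() consumes 512 bytes, hence lines = bytes >> 9.  The YMM usage
 * is bracketed by kernel_fpu_begin()/kernel_fpu_end() so that touching
 * the vector registers in kernel context is safe.
 */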
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

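/* Like xor_avx_2(), but folds two source buffers (p1, p2) into p0. */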
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

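/* Like xor_avx_2(), but folds three source buffers (p1, p2, p3) into p0. */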
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

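/* Like xor_avx_2(), but folds four source buffers (p1, p2, p3, p4) into p0. */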
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

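/*
 * Template handed to the generic XOR code: xor_speed() benchmarks it
 * against the other candidates, and the do_2..do_5 routines of the
 * winner are called with however many source blocks the RAID-5
 * checksumming needs.
 */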
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

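/*
 * AVX may only be used when the OS has enabled extended state saving
 * (OSXSAVE), hence the extra X86_FEATURE_OSXSAVE check alongside
 * X86_FEATURE_AVX in both helpers below.
 *
 * The helpers are intended for the architecture's <asm/xor.h>; a purely
 * illustrative sketch of how they are typically wired up follows, with
 * xor_block_sse standing in for whatever other templates the arch offers:
 *
 *	#define XOR_TRY_TEMPLATES		\
 *	do {					\
 *		AVX_XOR_SPEED;			\
 *		xor_speed(&xor_block_sse);	\
 *	} while (0)
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST)	\
 *		AVX_SELECT(FASTEST)
 */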
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif