/* xref: /kernel/linux/linux-5.10/lib/raid6/avx512.c (revision 8c2ecf20) */
// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 */
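
/*
 * Background for readers of the assembly below (an informal sketch, not
 * normative documentation): for data disks D_0 .. D_{n-1}, the two
 * syndromes maintained by this file are
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_{n-1}
 *	Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^{n-1}*D_{n-1}
 *
 * computed byte-wise in GF(2^8) with generator g = {02} and reduction
 * polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d), which is where the 0x1d
 * constant below comes from.  The gen_syndrome loops evaluate Q with
 * Horner's rule, multiplying the running value by {02} once per data disk.
 */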

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx512_constants {
	u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
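
/*
 * Illustrative scalar equivalent of one step of the inner loop above
 * (a sketch for readability only; gf2_mul2() is a hypothetical helper,
 * not part of this file or of lib/raid6):
 *
 *	static inline u8 gf2_mul2(u8 v)
 *	{
 *		return (v << 1) ^ ((v & 0x80) ? 0x1d : 0);
 *	}
 *
 *	p_acc ^= d;			p parity: plain XOR
 *	q_acc  = gf2_mul2(q_acc) ^ d;	q: multiply by {02}, then XOR
 *
 * The vpcmpgtb/vpmovm2b/vpaddb/vpandq/vpxorq sequence is exactly this
 * multiply-by-{02} applied to 64 bytes at a time: the signed compare
 * against zero picks out the bytes with the top bit set, and those
 * lanes get the 0x1d reduction constant XORed in after the left shift
 * (vpaddb of a register with itself).
 */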

static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm2\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
			     :
			     : "m" (dptr[z0][d]), "m" (p[d]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : "m" (dptr[z][d]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			/* Don't use movntdq for r/w memory area < cache line */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm2,%1"
			     :
			     : "m" (q[d]), "m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
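
/*
 * A reading aid for the function above (informal): xor_syndrome() folds
 * new data for disks [start, stop] into existing P and Q pages.  The
 * "right side" loop runs Horner's rule from disk stop down to start,
 * XORing the changed data into both accumulators; the "left side" loop
 * continues down to disk 0 but only multiplies the Q accumulator by
 * {02} per step, supplying the remaining powers of the generator even
 * though those disks contribute no new data.  Per step that is roughly
 *
 *	q_acc = gf2_mul2(q_acc);
 *
 * with gf2_mul2() as sketched after raid6_avx5121_gen_syndrome().
 */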

const struct raid6_calls raid6_avx512x1 = {
	raid6_avx5121_gen_syndrome,
	raid6_avx5121_xor_syndrome,
	raid6_have_avx512,
	"avx512x1",
	1                       /* Has cache hints */
};
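
/*
 * The initializer above is positional; with designated initializers it
 * would read as below (a sketch assuming the struct raid6_calls field
 * order in include/linux/raid/pq.h: gen_syndrome, xor_syndrome, valid,
 * name, prefer):
 *
 *	const struct raid6_calls raid6_avx512x1 = {
 *		.gen_syndrome	= raid6_avx5121_gen_syndrome,
 *		.xor_syndrome	= raid6_avx5121_xor_syndrome,
 *		.valid		= raid6_have_avx512,
 *		.name		= "avx512x1",
 *		.prefer		= 1,
 *	};
 *
 * At boot, lib/raid6/algos.c calls ->valid() on each registered entry
 * and benchmarks ->gen_syndrome() to pick the fastest implementation.
 */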

/*
 * Unrolled-by-2 AVX512 implementation
 */
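
/*
 * The unrolled variants run the same per-byte recurrence on two (and,
 * further below, four) independent 64-byte lanes per iteration, giving
 * the out-of-order core more independent dependency chains to overlap.
 */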
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     : : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm2\n\t"
			     "vmovdqa64 %3,%%zmm3\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (p[d]), "m" (p[d+64]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : );
		}
		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
			     /* Don't use movntdq for r/w
			      * memory area < cache line
			      */
			     "vmovdqa64 %%zmm4,%0\n\t"
			     "vmovdqa64 %%zmm6,%1\n\t"
			     "vmovdqa64 %%zmm2,%2\n\t"
			     "vmovdqa64 %%zmm3,%3"
			     :
			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
			       "m" (p[d+64]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
	raid6_avx5122_gen_syndrome,
	raid6_avx5122_xor_syndrome,
	raid6_have_avx512,
	"avx512x2",
	1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64
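
/*
 * The unrolled-by-4 kernels use zmm10-zmm15; registers beyond zmm7 are
 * only encodable in 64-bit mode, which is presumably why this variant
 * is guarded by CONFIG_X86_64.
 */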

/*
 * Unrolled-by-4 AVX512 implementation
 */
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "prefetchnta %2\n\t"
			     "prefetchnta %3\n\t"
			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpmovm2b %%k2,%%zmm7\n\t"
			     "vpmovm2b %%k3,%%zmm13\n\t"
			     "vpmovm2b %%k4,%%zmm15\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
			     "vmovdqa64 %0,%%zmm5\n\t"
			     "vmovdqa64 %1,%%zmm7\n\t"
			     "vmovdqa64 %2,%%zmm13\n\t"
			     "vmovdqa64 %3,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
			     :
			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
				       size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa64 %0,%%zmm0"
		     :: "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 256) {
		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
			     "vmovdqa64 %1,%%zmm6\n\t"
			     "vmovdqa64 %2,%%zmm12\n\t"
			     "vmovdqa64 %3,%%zmm14\n\t"
			     "vmovdqa64 %4,%%zmm2\n\t"
			     "vmovdqa64 %5,%%zmm3\n\t"
			     "vmovdqa64 %6,%%zmm10\n\t"
			     "vmovdqa64 %7,%%zmm11\n\t"
			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]));
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "prefetchnta %0\n\t"
				     "prefetchnta %2\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vmovdqa64 %2,%%zmm13\n\t"
				     "vmovdqa64 %3,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
				       "m" (dptr[z][d+128]),
				       "m" (dptr[z][d+192]));
		}
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     :
			     : "m" (q[d]), "m" (q[d+128]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpmovm2b %%k3,%%zmm13\n\t"
				     "vpmovm2b %%k4,%%zmm15\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
				     :
				     : );
		}
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vmovntdq %%zmm14,%7"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
	raid6_avx5124_gen_syndrome,
	raid6_avx5124_xor_syndrome,
	raid6_have_avx512,
	"avx512x4",
	1                       /* Has cache hints */
};
#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX512 */