1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
4 *
5 * Copyright 2023 WANG Xuerui <git@xen0n.name>
6 *
7 * Based on the generic RAID-6 code (int.uc):
8 *
9 * Copyright 2002-2004 H. Peter Anvin
10 */
11
12#include <linux/raid/pq.h>
13#include "loongarch.h"
14
/*
 * The vector algorithms are currently priority 0, which means the generic
 * scalar algorithms are not disabled when vector support is present.
 * This mirrors the similar LoongArch RAID5 XOR code, and the main reason is
 * repeated here: at this point in time it cannot be ruled out that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, perhaps because of errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */
24
25#ifdef CONFIG_CPU_HAS_LSX
26#define NSIZE 16
27
28static int raid6_has_lsx(void)
29{
30	return cpu_has_lsx;
31}
32
33static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
34{
35	u8 **dptr = (u8 **)ptrs;
36	u8 *p, *q;
37	int d, z, z0;
38
39	z0 = disks - 3;		/* Highest data disk */
40	p = dptr[z0+1];		/* XOR parity */
41	q = dptr[z0+2];		/* RS syndrome */
42
43	kernel_fpu_begin();
44
45	/*
46	 * $vr0, $vr1, $vr2, $vr3: wp
47	 * $vr4, $vr5, $vr6, $vr7: wq
48	 * $vr8, $vr9, $vr10, $vr11: wd
49	 * $vr12, $vr13, $vr14, $vr15: w2
50	 * $vr16, $vr17, $vr18, $vr19: w1
51	 */
52	for (d = 0; d < bytes; d += NSIZE*4) {
53		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
54		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
55		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
56		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
57		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
58		asm volatile("vori.b $vr4, $vr0, 0");
59		asm volatile("vori.b $vr5, $vr1, 0");
60		asm volatile("vori.b $vr6, $vr2, 0");
61		asm volatile("vori.b $vr7, $vr3, 0");
62		for (z = z0-1; z >= 0; z--) {
63			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
64			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
65			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
66			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
67			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
68			/* wp$$ ^= wd$$; */
69			asm volatile("vxor.v $vr0, $vr0, $vr8");
70			asm volatile("vxor.v $vr1, $vr1, $vr9");
71			asm volatile("vxor.v $vr2, $vr2, $vr10");
72			asm volatile("vxor.v $vr3, $vr3, $vr11");
73			/* w2$$ = MASK(wq$$); */
74			asm volatile("vslti.b $vr12, $vr4, 0");
75			asm volatile("vslti.b $vr13, $vr5, 0");
76			asm volatile("vslti.b $vr14, $vr6, 0");
77			asm volatile("vslti.b $vr15, $vr7, 0");
78			/* w1$$ = SHLBYTE(wq$$); */
79			asm volatile("vslli.b $vr16, $vr4, 1");
80			asm volatile("vslli.b $vr17, $vr5, 1");
81			asm volatile("vslli.b $vr18, $vr6, 1");
82			asm volatile("vslli.b $vr19, $vr7, 1");
83			/* w2$$ &= NBYTES(0x1d); */
84			asm volatile("vandi.b $vr12, $vr12, 0x1d");
85			asm volatile("vandi.b $vr13, $vr13, 0x1d");
86			asm volatile("vandi.b $vr14, $vr14, 0x1d");
87			asm volatile("vandi.b $vr15, $vr15, 0x1d");
88			/* w1$$ ^= w2$$; */
89			asm volatile("vxor.v $vr16, $vr16, $vr12");
90			asm volatile("vxor.v $vr17, $vr17, $vr13");
91			asm volatile("vxor.v $vr18, $vr18, $vr14");
92			asm volatile("vxor.v $vr19, $vr19, $vr15");
93			/* wq$$ = w1$$ ^ wd$$; */
94			asm volatile("vxor.v $vr4, $vr16, $vr8");
95			asm volatile("vxor.v $vr5, $vr17, $vr9");
96			asm volatile("vxor.v $vr6, $vr18, $vr10");
97			asm volatile("vxor.v $vr7, $vr19, $vr11");
98		}
99		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
100		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
101		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
102		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
103		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
104		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
105		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
106		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
107		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
108		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
109	}
110
111	kernel_fpu_end();
112}
113
114static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
115				   size_t bytes, void **ptrs)
116{
117	u8 **dptr = (u8 **)ptrs;
118	u8 *p, *q;
119	int d, z, z0;
120
121	z0 = stop;		/* P/Q right side optimization */
122	p = dptr[disks-2];	/* XOR parity */
123	q = dptr[disks-1];	/* RS syndrome */
124
125	kernel_fpu_begin();
126
127	/*
128	 * $vr0, $vr1, $vr2, $vr3: wp
129	 * $vr4, $vr5, $vr6, $vr7: wq
130	 * $vr8, $vr9, $vr10, $vr11: wd
131	 * $vr12, $vr13, $vr14, $vr15: w2
132	 * $vr16, $vr17, $vr18, $vr19: w1
133	 */
134	for (d = 0; d < bytes; d += NSIZE*4) {
135		/* P/Q data pages */
136		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
137		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
138		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
139		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
140		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
141		asm volatile("vori.b $vr4, $vr0, 0");
142		asm volatile("vori.b $vr5, $vr1, 0");
143		asm volatile("vori.b $vr6, $vr2, 0");
144		asm volatile("vori.b $vr7, $vr3, 0");
145		for (z = z0-1; z >= start; z--) {
146			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
147			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
148			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
149			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
150			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
151			/* wp$$ ^= wd$$; */
152			asm volatile("vxor.v $vr0, $vr0, $vr8");
153			asm volatile("vxor.v $vr1, $vr1, $vr9");
154			asm volatile("vxor.v $vr2, $vr2, $vr10");
155			asm volatile("vxor.v $vr3, $vr3, $vr11");
156			/* w2$$ = MASK(wq$$); */
157			asm volatile("vslti.b $vr12, $vr4, 0");
158			asm volatile("vslti.b $vr13, $vr5, 0");
159			asm volatile("vslti.b $vr14, $vr6, 0");
160			asm volatile("vslti.b $vr15, $vr7, 0");
161			/* w1$$ = SHLBYTE(wq$$); */
162			asm volatile("vslli.b $vr16, $vr4, 1");
163			asm volatile("vslli.b $vr17, $vr5, 1");
164			asm volatile("vslli.b $vr18, $vr6, 1");
165			asm volatile("vslli.b $vr19, $vr7, 1");
166			/* w2$$ &= NBYTES(0x1d); */
167			asm volatile("vandi.b $vr12, $vr12, 0x1d");
168			asm volatile("vandi.b $vr13, $vr13, 0x1d");
169			asm volatile("vandi.b $vr14, $vr14, 0x1d");
170			asm volatile("vandi.b $vr15, $vr15, 0x1d");
171			/* w1$$ ^= w2$$; */
172			asm volatile("vxor.v $vr16, $vr16, $vr12");
173			asm volatile("vxor.v $vr17, $vr17, $vr13");
174			asm volatile("vxor.v $vr18, $vr18, $vr14");
175			asm volatile("vxor.v $vr19, $vr19, $vr15");
176			/* wq$$ = w1$$ ^ wd$$; */
177			asm volatile("vxor.v $vr4, $vr16, $vr8");
178			asm volatile("vxor.v $vr5, $vr17, $vr9");
179			asm volatile("vxor.v $vr6, $vr18, $vr10");
180			asm volatile("vxor.v $vr7, $vr19, $vr11");
181		}
182
183		/* P/Q left side optimization */
184		for (z = start-1; z >= 0; z--) {
185			/* w2$$ = MASK(wq$$); */
186			asm volatile("vslti.b $vr12, $vr4, 0");
187			asm volatile("vslti.b $vr13, $vr5, 0");
188			asm volatile("vslti.b $vr14, $vr6, 0");
189			asm volatile("vslti.b $vr15, $vr7, 0");
190			/* w1$$ = SHLBYTE(wq$$); */
191			asm volatile("vslli.b $vr16, $vr4, 1");
192			asm volatile("vslli.b $vr17, $vr5, 1");
193			asm volatile("vslli.b $vr18, $vr6, 1");
194			asm volatile("vslli.b $vr19, $vr7, 1");
195			/* w2$$ &= NBYTES(0x1d); */
196			asm volatile("vandi.b $vr12, $vr12, 0x1d");
197			asm volatile("vandi.b $vr13, $vr13, 0x1d");
198			asm volatile("vandi.b $vr14, $vr14, 0x1d");
199			asm volatile("vandi.b $vr15, $vr15, 0x1d");
200			/* wq$$ = w1$$ ^ w2$$; */
201			asm volatile("vxor.v $vr4, $vr16, $vr12");
202			asm volatile("vxor.v $vr5, $vr17, $vr13");
203			asm volatile("vxor.v $vr6, $vr18, $vr14");
204			asm volatile("vxor.v $vr7, $vr19, $vr15");
205		}
206		/*
207		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
208		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
209		 */
210		asm volatile(
211			"vld $vr20, %0\n\t"
212			"vld $vr21, %1\n\t"
213			"vld $vr22, %2\n\t"
214			"vld $vr23, %3\n\t"
215			"vld $vr24, %4\n\t"
216			"vld $vr25, %5\n\t"
217			"vld $vr26, %6\n\t"
218			"vld $vr27, %7\n\t"
219			"vxor.v $vr20, $vr20, $vr0\n\t"
220			"vxor.v $vr21, $vr21, $vr1\n\t"
221			"vxor.v $vr22, $vr22, $vr2\n\t"
222			"vxor.v $vr23, $vr23, $vr3\n\t"
223			"vxor.v $vr24, $vr24, $vr4\n\t"
224			"vxor.v $vr25, $vr25, $vr5\n\t"
225			"vxor.v $vr26, $vr26, $vr6\n\t"
226			"vxor.v $vr27, $vr27, $vr7\n\t"
227			"vst $vr20, %0\n\t"
228			"vst $vr21, %1\n\t"
229			"vst $vr22, %2\n\t"
230			"vst $vr23, %3\n\t"
231			"vst $vr24, %4\n\t"
232			"vst $vr25, %5\n\t"
233			"vst $vr26, %6\n\t"
234			"vst $vr27, %7\n\t"
235			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
236			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
237			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
238			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
239		);
240	}
241
242	kernel_fpu_end();
243}
244
/*
 * Routine table for the LSX implementation.  The members are initialised
 * positionally; any member of struct raid6_calls not listed here is
 * zero-initialised (presumably including the priority that the comment at
 * the top of this file refers to - confirm against the struct declaration).
 */
const struct raid6_calls raid6_lsx = {
	raid6_lsx_gen_syndrome,	/* full P/Q computation */
	raid6_lsx_xor_syndrome,	/* partial update of existing P/Q */
	raid6_has_lsx,		/* availability check */
	"lsx",			/* name of this routine set */
};
251
252#undef NSIZE
253#endif /* CONFIG_CPU_HAS_LSX */
254
255#ifdef CONFIG_CPU_HAS_LASX
256#define NSIZE 32
257
258static int raid6_has_lasx(void)
259{
260	return cpu_has_lasx;
261}
262
263static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
264{
265	u8 **dptr = (u8 **)ptrs;
266	u8 *p, *q;
267	int d, z, z0;
268
269	z0 = disks - 3;		/* Highest data disk */
270	p = dptr[z0+1];		/* XOR parity */
271	q = dptr[z0+2];		/* RS syndrome */
272
273	kernel_fpu_begin();
274
275	/*
276	 * $xr0, $xr1: wp
277	 * $xr2, $xr3: wq
278	 * $xr4, $xr5: wd
279	 * $xr6, $xr7: w2
280	 * $xr8, $xr9: w1
281	 */
282	for (d = 0; d < bytes; d += NSIZE*2) {
283		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
284		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
285		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
286		asm volatile("xvori.b $xr2, $xr0, 0");
287		asm volatile("xvori.b $xr3, $xr1, 0");
288		for (z = z0-1; z >= 0; z--) {
289			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
290			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
291			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
292			/* wp$$ ^= wd$$; */
293			asm volatile("xvxor.v $xr0, $xr0, $xr4");
294			asm volatile("xvxor.v $xr1, $xr1, $xr5");
295			/* w2$$ = MASK(wq$$); */
296			asm volatile("xvslti.b $xr6, $xr2, 0");
297			asm volatile("xvslti.b $xr7, $xr3, 0");
298			/* w1$$ = SHLBYTE(wq$$); */
299			asm volatile("xvslli.b $xr8, $xr2, 1");
300			asm volatile("xvslli.b $xr9, $xr3, 1");
301			/* w2$$ &= NBYTES(0x1d); */
302			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
303			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
304			/* w1$$ ^= w2$$; */
305			asm volatile("xvxor.v $xr8, $xr8, $xr6");
306			asm volatile("xvxor.v $xr9, $xr9, $xr7");
307			/* wq$$ = w1$$ ^ wd$$; */
308			asm volatile("xvxor.v $xr2, $xr8, $xr4");
309			asm volatile("xvxor.v $xr3, $xr9, $xr5");
310		}
311		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
312		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
313		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
314		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
315		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
316		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
317	}
318
319	kernel_fpu_end();
320}
321
322static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
323				    size_t bytes, void **ptrs)
324{
325	u8 **dptr = (u8 **)ptrs;
326	u8 *p, *q;
327	int d, z, z0;
328
329	z0 = stop;		/* P/Q right side optimization */
330	p = dptr[disks-2];	/* XOR parity */
331	q = dptr[disks-1];	/* RS syndrome */
332
333	kernel_fpu_begin();
334
335	/*
336	 * $xr0, $xr1: wp
337	 * $xr2, $xr3: wq
338	 * $xr4, $xr5: wd
339	 * $xr6, $xr7: w2
340	 * $xr8, $xr9: w1
341	 */
342	for (d = 0; d < bytes; d += NSIZE*2) {
343		/* P/Q data pages */
344		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
345		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
346		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
347		asm volatile("xvori.b $xr2, $xr0, 0");
348		asm volatile("xvori.b $xr3, $xr1, 0");
349		for (z = z0-1; z >= start; z--) {
350			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
351			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
352			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
353			/* wp$$ ^= wd$$; */
354			asm volatile("xvxor.v $xr0, $xr0, $xr4");
355			asm volatile("xvxor.v $xr1, $xr1, $xr5");
356			/* w2$$ = MASK(wq$$); */
357			asm volatile("xvslti.b $xr6, $xr2, 0");
358			asm volatile("xvslti.b $xr7, $xr3, 0");
359			/* w1$$ = SHLBYTE(wq$$); */
360			asm volatile("xvslli.b $xr8, $xr2, 1");
361			asm volatile("xvslli.b $xr9, $xr3, 1");
362			/* w2$$ &= NBYTES(0x1d); */
363			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
364			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
365			/* w1$$ ^= w2$$; */
366			asm volatile("xvxor.v $xr8, $xr8, $xr6");
367			asm volatile("xvxor.v $xr9, $xr9, $xr7");
368			/* wq$$ = w1$$ ^ wd$$; */
369			asm volatile("xvxor.v $xr2, $xr8, $xr4");
370			asm volatile("xvxor.v $xr3, $xr9, $xr5");
371		}
372
373		/* P/Q left side optimization */
374		for (z = start-1; z >= 0; z--) {
375			/* w2$$ = MASK(wq$$); */
376			asm volatile("xvslti.b $xr6, $xr2, 0");
377			asm volatile("xvslti.b $xr7, $xr3, 0");
378			/* w1$$ = SHLBYTE(wq$$); */
379			asm volatile("xvslli.b $xr8, $xr2, 1");
380			asm volatile("xvslli.b $xr9, $xr3, 1");
381			/* w2$$ &= NBYTES(0x1d); */
382			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
383			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
384			/* wq$$ = w1$$ ^ w2$$; */
385			asm volatile("xvxor.v $xr2, $xr8, $xr6");
386			asm volatile("xvxor.v $xr3, $xr9, $xr7");
387		}
388		/*
389		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
390		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
391		 */
392		asm volatile(
393			"xvld $xr10, %0\n\t"
394			"xvld $xr11, %1\n\t"
395			"xvld $xr12, %2\n\t"
396			"xvld $xr13, %3\n\t"
397			"xvxor.v $xr10, $xr10, $xr0\n\t"
398			"xvxor.v $xr11, $xr11, $xr1\n\t"
399			"xvxor.v $xr12, $xr12, $xr2\n\t"
400			"xvxor.v $xr13, $xr13, $xr3\n\t"
401			"xvst $xr10, %0\n\t"
402			"xvst $xr11, %1\n\t"
403			"xvst $xr12, %2\n\t"
404			"xvst $xr13, %3\n\t"
405			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
406			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
407		);
408	}
409
410	kernel_fpu_end();
411}
412
/*
 * Routine table for the LASX implementation.  The members are initialised
 * positionally; any member of struct raid6_calls not listed here is
 * zero-initialised (presumably including the priority that the comment at
 * the top of this file refers to - confirm against the struct declaration).
 */
const struct raid6_calls raid6_lasx = {
	raid6_lasx_gen_syndrome,	/* full P/Q computation */
	raid6_lasx_xor_syndrome,	/* partial update of existing P/Q */
	raid6_has_lasx,			/* availability check */
	"lasx",				/* name of this routine set */
};
419#undef NSIZE
420#endif /* CONFIG_CPU_HAS_LASX */
421