// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
 *
 * Copyright 2023 WANG Xuerui <git@xen0n.name>
 *
 * Based on the generic RAID-6 code (int.uc):
 *
 * Copyright 2002-2004 H. Peter Anvin
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * The vector algorithms are currently priority 0, which means the generic
 * scalar algorithms are not disabled when vector support is present.
 * This mirrors the similar LoongArch RAID5 XOR code, and the main reason
 * is repeated here: it cannot be ruled out at this point in time that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, perhaps for errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */

#ifdef CONFIG_CPU_HAS_LSX
#define NSIZE 16
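/*
 * NSIZE is the byte width of one 128-bit LSX vector register. The loops
 * below are unrolled four vectors deep, so each iteration consumes
 * 4 * NSIZE = 64 bytes from every data disk.
 */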

static int raid6_has_lsx(void)
{
	return cpu_has_lsx;
}

static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $vr0, $vr1, $vr2, $vr3: wp
	 * $vr4, $vr5, $vr6, $vr7: wq
	 * $vr8, $vr9, $vr10, $vr11: wd
	 * $vr12, $vr13, $vr14, $vr15: w2
	 * $vr16, $vr17, $vr18, $vr19: w1
	 */
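	/*
	 * Each inner-loop pass folds one data disk into the running P (wp)
	 * and Q (wq) accumulators. P is a plain XOR sum; Q is evaluated by
	 * Horner's rule over GF(2^8) with polynomial 0x11d, so every step
	 * multiplies wq by the generator {02} before XORing in the new data.
	 * The multiply-by-2 is done byte-wise: vslti.b turns each byte's
	 * sign bit into an all-ones/all-zeroes mask (MASK), vslli.b shifts
	 * each byte left by one (SHLBYTE), vandi.b reduces the mask to the
	 * 0x1d reduction constant, and the final XORs combine the pieces.
	 * vori.b with immediate 0 is simply a register-to-register copy.
	 *
	 * Rough per-byte scalar equivalent, as a sketch of the generic
	 * int.uc logic (not extra code used by this driver):
	 *
	 *	w2 = (wq & 0x80) ? 0x1d : 0x00;	// MASK, then AND with 0x1d
	 *	w1 = (wq << 1) & 0xfe;		// SHLBYTE
	 *	wq = w1 ^ w2 ^ wd;		// wq = 2*wq + next data byte
	 *	wp ^= wd;			// plain parity
	 */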
	for (d = 0; d < bytes; d += NSIZE*4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		asm volatile("vori.b $vr4, $vr0, 0");
		asm volatile("vori.b $vr5, $vr1, 0");
		asm volatile("vori.b $vr6, $vr2, 0");
		asm volatile("vori.b $vr7, $vr3, 0");
		for (z = z0-1; z >= 0; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("vxor.v $vr0, $vr0, $vr8");
			asm volatile("vxor.v $vr1, $vr1, $vr9");
			asm volatile("vxor.v $vr2, $vr2, $vr10");
			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("vxor.v $vr16, $vr16, $vr12");
			asm volatile("vxor.v $vr17, $vr17, $vr13");
			asm volatile("vxor.v $vr18, $vr18, $vr14");
			asm volatile("vxor.v $vr19, $vr19, $vr15");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr8");
			asm volatile("vxor.v $vr5, $vr17, $vr9");
			asm volatile("vxor.v $vr6, $vr18, $vr10");
			asm volatile("vxor.v $vr7, $vr19, $vr11");
		}
		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
	}

	kernel_fpu_end();
}

static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
				   size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */
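
	/*
	 * This routine accumulates the P/Q contribution of the data disks
	 * in [start, stop] and XORs it into the existing P and Q pages,
	 * which is what the RAID456 read-modify-write path needs. Disks
	 * outside that range are treated as unchanged and are never loaded.
	 */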

	kernel_fpu_begin();

	/*
	 * $vr0, $vr1, $vr2, $vr3: wp
	 * $vr4, $vr5, $vr6, $vr7: wq
	 * $vr8, $vr9, $vr10, $vr11: wd
	 * $vr12, $vr13, $vr14, $vr15: w2
	 * $vr16, $vr17, $vr18, $vr19: w1
	 */
	for (d = 0; d < bytes; d += NSIZE*4) {
		/* P/Q data pages */
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		asm volatile("vori.b $vr4, $vr0, 0");
		asm volatile("vori.b $vr5, $vr1, 0");
		asm volatile("vori.b $vr6, $vr2, 0");
		asm volatile("vori.b $vr7, $vr3, 0");
		for (z = z0-1; z >= start; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("vxor.v $vr0, $vr0, $vr8");
			asm volatile("vxor.v $vr1, $vr1, $vr9");
			asm volatile("vxor.v $vr2, $vr2, $vr10");
			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("vxor.v $vr16, $vr16, $vr12");
			asm volatile("vxor.v $vr17, $vr17, $vr13");
			asm volatile("vxor.v $vr18, $vr18, $vr14");
			asm volatile("vxor.v $vr19, $vr19, $vr15");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr8");
			asm volatile("vxor.v $vr5, $vr17, $vr9");
			asm volatile("vxor.v $vr6, $vr18, $vr10");
			asm volatile("vxor.v $vr7, $vr19, $vr11");
		}

		/* P/Q left side optimization */
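		/*
		 * For disks below start the data is treated as unchanged
		 * (zero delta), so nothing is loaded here; wq only keeps
		 * being multiplied by the generator once per remaining
		 * disk, while wp stays untouched.
		 */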
		for (z = start-1; z >= 0; z--) {
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* wq$$ = w1$$ ^ w2$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr12");
			asm volatile("vxor.v $vr5, $vr17, $vr13");
			asm volatile("vxor.v $vr6, $vr18, $vr14");
			asm volatile("vxor.v $vr7, $vr19, $vr15");
		}
		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 */
		asm volatile(
			"vld $vr20, %0\n\t"
			"vld $vr21, %1\n\t"
			"vld $vr22, %2\n\t"
			"vld $vr23, %3\n\t"
			"vld $vr24, %4\n\t"
			"vld $vr25, %5\n\t"
			"vld $vr26, %6\n\t"
			"vld $vr27, %7\n\t"
			"vxor.v $vr20, $vr20, $vr0\n\t"
			"vxor.v $vr21, $vr21, $vr1\n\t"
			"vxor.v $vr22, $vr22, $vr2\n\t"
			"vxor.v $vr23, $vr23, $vr3\n\t"
			"vxor.v $vr24, $vr24, $vr4\n\t"
			"vxor.v $vr25, $vr25, $vr5\n\t"
			"vxor.v $vr26, $vr26, $vr6\n\t"
			"vxor.v $vr27, $vr27, $vr7\n\t"
			"vst $vr20, %0\n\t"
			"vst $vr21, %1\n\t"
			"vst $vr22, %2\n\t"
			"vst $vr23, %3\n\t"
			"vst $vr24, %4\n\t"
			"vst $vr25, %5\n\t"
			"vst $vr26, %6\n\t"
			"vst $vr27, %7\n\t"
			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
		);
	}

	kernel_fpu_end();
}
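/*
 * Positional initializer for struct raid6_calls: gen_syndrome,
 * xor_syndrome, the validity check, and the algorithm name. The trailing
 * member is left zero, matching the priority-0 note at the top of this
 * file.
 */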
const struct raid6_calls raid6_lsx = {
	raid6_lsx_gen_syndrome,
	raid6_lsx_xor_syndrome,
	raid6_has_lsx,
	"lsx",
};

#undef NSIZE
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
#define NSIZE 32
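/*
 * For LASX, NSIZE is the byte width of one 256-bit vector register; the
 * loops below are unrolled two vectors deep, so each iteration still
 * covers 2 * NSIZE = 64 bytes per data disk.
 */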

static int raid6_has_lasx(void)
{
	return cpu_has_lasx;
}

static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $xr0, $xr1: wp
	 * $xr2, $xr3: wq
	 * $xr4, $xr5: wd
	 * $xr6, $xr7: w2
	 * $xr8, $xr9: w1
	 */
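	/*
	 * Same per-byte GF(2^8) update as the LSX routine above, using the
	 * 256-bit LASX forms of the same instructions and only two vectors
	 * per working variable.
	 */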
	for (d = 0; d < bytes; d += NSIZE*2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("xvori.b $xr2, $xr0, 0");
		asm volatile("xvori.b $xr3, $xr1, 0");
		for (z = z0-1; z >= 0; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("xvxor.v $xr0, $xr0, $xr4");
			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("xvxor.v $xr8, $xr8, $xr6");
			asm volatile("xvxor.v $xr9, $xr9, $xr7");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr4");
			asm volatile("xvxor.v $xr3, $xr9, $xr5");
		}
		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
	}

	kernel_fpu_end();
}

static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
				    size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $xr0, $xr1: wp
	 * $xr2, $xr3: wq
	 * $xr4, $xr5: wd
	 * $xr6, $xr7: w2
	 * $xr8, $xr9: w1
	 */
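	/*
	 * LASX variant of raid6_lsx_xor_syndrome(): the same right-side /
	 * left-side split over disks start..stop, processed two 256-bit
	 * vectors at a time.
	 */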
	for (d = 0; d < bytes; d += NSIZE*2) {
		/* P/Q data pages */
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("xvori.b $xr2, $xr0, 0");
		asm volatile("xvori.b $xr3, $xr1, 0");
		for (z = z0-1; z >= start; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("xvxor.v $xr0, $xr0, $xr4");
			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("xvxor.v $xr8, $xr8, $xr6");
			asm volatile("xvxor.v $xr9, $xr9, $xr7");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr4");
			asm volatile("xvxor.v $xr3, $xr9, $xr5");
		}

		/* P/Q left side optimization */
		for (z = start-1; z >= 0; z--) {
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* wq$$ = w1$$ ^ w2$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr6");
			asm volatile("xvxor.v $xr3, $xr9, $xr7");
		}
		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 */
		asm volatile(
			"xvld $xr10, %0\n\t"
			"xvld $xr11, %1\n\t"
			"xvld $xr12, %2\n\t"
			"xvld $xr13, %3\n\t"
			"xvxor.v $xr10, $xr10, $xr0\n\t"
			"xvxor.v $xr11, $xr11, $xr1\n\t"
			"xvxor.v $xr12, $xr12, $xr2\n\t"
			"xvxor.v $xr13, $xr13, $xr3\n\t"
			"xvst $xr10, %0\n\t"
			"xvst $xr11, %1\n\t"
			"xvst $xr12, %2\n\t"
			"xvst $xr13, %3\n\t"
			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
		);
	}

	kernel_fpu_end();
}

const struct raid6_calls raid6_lasx = {
	raid6_lasx_gen_syndrome,
	raid6_lasx_xor_syndrome,
	raid6_has_lasx,
	"lasx",
};
#undef NSIZE
#endif /* CONFIG_CPU_HAS_LASX */