/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

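/*
 * Building blocks for the unrolled loops below: LD(x, y) loads quadword
 * x of the destination buffer (operand %1) into MMX register y, ST
 * stores it back, and XOn XORs in quadword x of the n'th source buffer
 * (operands %2..%5).  For example, the sequence LD(4, 0) XO1(4, 0)
 * ST(4, 0) assembles to
 *
 *	movq   32(%1), %mm0
 *	pxor   32(%2), %mm0
 *	movq   %mm0, 32(%1)
 *
 * i.e. quadword 4 of p1 is XORed with quadword 4 of p2 in place.
 */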
#define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"

#include <asm/fpu/api.h>

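/*
 * XOR p2 into p1, one 128-byte chunk per loop iteration (hence
 * lines = bytes >> 7; callers are expected to pass a multiple of 128
 * bytes).  The MMX registers alias the x87 FPU state, so all MMX use
 * must be bracketed by kernel_fpu_begin()/kernel_fpu_end().
 */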
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
	ST(i, 0)				\
		XO1(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO1(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO1(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
	ST(i, 0)				\
		XO2(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO2(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO2(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
	ST(i, 0)				\
		XO3(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO3(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO3(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 7;

	kernel_fpu_begin();

	/*
	 * Make sure GCC forgets anything it knows about p4 or p5, such
	 * that it won't pass to the asm volatile below a register that
	 * is shared with any other variable.  That's because we modify
	 * p4 and p5 there, but we can't mark them as read/write,
	 * otherwise we'd overflow the 10-asm-operands limit of GCC < 3.1.
	 */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
	LD(i, 0)				\
		LD(i + 1, 1)			\
			LD(i + 2, 2)		\
				LD(i + 3, 3)	\
	XO1(i, 0)				\
		XO1(i + 1, 1)			\
			XO1(i + 2, 2)		\
				XO1(i + 3, 3)	\
	XO2(i, 0)				\
		XO2(i + 1, 1)			\
			XO2(i + 2, 2)		\
				XO2(i + 3, 3)	\
	XO3(i, 0)				\
		XO3(i + 1, 1)			\
			XO3(i + 2, 2)		\
				XO3(i + 3, 3)	\
	XO4(i, 0)				\
	ST(i, 0)				\
		XO4(i + 1, 1)			\
		ST(i + 1, 1)			\
			XO4(i + 2, 2)		\
			ST(i + 2, 2)		\
				XO4(i + 3, 3)	\
				ST(i + 3, 3)

	" .align 32			;\n"
	" 1:                            ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"       addl $128, %1         ;\n"
	"       addl $128, %2         ;\n"
	"       addl $128, %3         ;\n"
	"       addl $128, %4         ;\n"
	"       addl $128, %5         ;\n"
	"       decl %0               ;\n"
	"       jnz 1b                ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/*
	 * p4 and p5 were modified, and now the variables are dead.
	 * Clobber them just to be sure nobody does something stupid
	 * like assuming they have some legal value.
	 */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

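/*
 * The p5_mmx_* variants below do the same job in 64-byte chunks
 * (lines = bytes >> 6), with the loads, pxors and stores interleaved
 * by hand so that the original Pentium (P5) can pair them in its two
 * pipelines.
 */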
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
	:
	: "memory");

	kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 6;

	kernel_fpu_begin();

	/*
	 * Make sure GCC forgets anything it knows about p4 or p5, such
	 * that it won't pass to the asm volatile below a register that
	 * is shared with any other variable.  That's because we modify
	 * p4 and p5 there, but we can't mark them as read/write,
	 * otherwise we'd overflow the 10-asm-operands limit of GCC < 3.1.
	 */
	asm("" : "+r" (p4), "+r" (p5));

	asm volatile(
	" .align 32,0x90             ;\n"
	" 1:                         ;\n"
	"       movq   (%1), %%mm0   ;\n"
	"       movq  8(%1), %%mm1   ;\n"
	"       pxor   (%2), %%mm0   ;\n"
	"       pxor  8(%2), %%mm1   ;\n"
	"       movq 16(%1), %%mm2   ;\n"
	"       pxor   (%3), %%mm0   ;\n"
	"       pxor  8(%3), %%mm1   ;\n"
	"       pxor 16(%2), %%mm2   ;\n"
	"       pxor   (%4), %%mm0   ;\n"
	"       pxor  8(%4), %%mm1   ;\n"
	"       pxor 16(%3), %%mm2   ;\n"
	"       movq 24(%1), %%mm3   ;\n"
	"       pxor   (%5), %%mm0   ;\n"
	"       pxor  8(%5), %%mm1   ;\n"
	"       movq %%mm0,   (%1)   ;\n"
	"       pxor 16(%4), %%mm2   ;\n"
	"       pxor 24(%2), %%mm3   ;\n"
	"       movq %%mm1,  8(%1)   ;\n"
	"       pxor 16(%5), %%mm2   ;\n"
	"       pxor 24(%3), %%mm3   ;\n"
	"       movq 32(%1), %%mm4   ;\n"
	"       movq %%mm2, 16(%1)   ;\n"
	"       pxor 24(%4), %%mm3   ;\n"
	"       pxor 32(%2), %%mm4   ;\n"
	"       movq 40(%1), %%mm5   ;\n"
	"       pxor 24(%5), %%mm3   ;\n"
	"       pxor 32(%3), %%mm4   ;\n"
	"       pxor 40(%2), %%mm5   ;\n"
	"       movq %%mm3, 24(%1)   ;\n"
	"       pxor 32(%4), %%mm4   ;\n"
	"       pxor 40(%3), %%mm5   ;\n"
	"       movq 48(%1), %%mm6   ;\n"
	"       movq 56(%1), %%mm7   ;\n"
	"       pxor 32(%5), %%mm4   ;\n"
	"       pxor 40(%4), %%mm5   ;\n"
	"       pxor 48(%2), %%mm6   ;\n"
	"       pxor 56(%2), %%mm7   ;\n"
	"       movq %%mm4, 32(%1)   ;\n"
	"       pxor 48(%3), %%mm6   ;\n"
	"       pxor 56(%3), %%mm7   ;\n"
	"       pxor 40(%5), %%mm5   ;\n"
	"       pxor 48(%4), %%mm6   ;\n"
	"       pxor 56(%4), %%mm7   ;\n"
	"       movq %%mm5, 40(%1)   ;\n"
	"       pxor 48(%5), %%mm6   ;\n"
	"       pxor 56(%5), %%mm7   ;\n"
	"       movq %%mm6, 48(%1)   ;\n"
	"       movq %%mm7, 56(%1)   ;\n"

	"       addl $64, %1         ;\n"
	"       addl $64, %2         ;\n"
	"       addl $64, %3         ;\n"
	"       addl $64, %4         ;\n"
	"       addl $64, %5         ;\n"
	"       decl %0              ;\n"
	"       jnz 1b               ;\n"
	: "+r" (lines),
	  "+r" (p1), "+r" (p2), "+r" (p3)
	: "r" (p4), "r" (p5)
	: "memory");

	/*
	 * p4 and p5 were modified, and now the variables are dead.
	 * Clobber them just to be sure nobody does something stupid
	 * like assuming they have some legal value.
	 */
	asm("" : "=r" (p4), "=r" (p5));

	kernel_fpu_end();
}

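/*
 * Each template bundles one family of 2..5-source XOR routines under a
 * name; xor_speed() benchmarks the registered templates at boot and the
 * fastest one is used for RAID-5 checksumming.
 */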
static struct xor_block_template xor_block_pII_mmx = {
	.name = "pII_mmx",
	.do_2 = xor_pII_mmx_2,
	.do_3 = xor_pII_mmx_3,
	.do_4 = xor_pII_mmx_4,
	.do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
	.name = "p5_mmx",
	.do_2 = xor_p5_mmx_2,
	.do_3 = xor_p5_mmx_3,
	.do_4 = xor_p5_mmx_4,
	.do_5 = xor_p5_mmx_5,
};

static struct xor_block_template xor_block_pIII_sse = {
	.name = "pIII_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

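/*
 * Candidate set for the boot-time benchmark: always try the AVX
 * routines; on SSE-capable CPUs try the SSE templates, on MMX-only
 * CPUs the two MMX templates above, and fall back to the generic
 * integer routines otherwise.
 */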
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	AVX_XOR_SPEED;					\
	if (boot_cpu_has(X86_FEATURE_XMM)) {		\
		xor_speed(&xor_block_pIII_sse);		\
		xor_speed(&xor_block_sse_pf64);		\
	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	} else {					\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_8regs_p);		\
		xor_speed(&xor_block_32regs);		\
		xor_speed(&xor_block_32regs_p);		\
	}						\
} while (0)
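
/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 */
#define XOR_SELECT_TEMPLATE(FASTEST)			\
	(boot_cpu_has(X86_FEATURE_XMM) ? &xor_block_pIII_sse : FASTEST)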

#endif /* _ASM_X86_XOR_32_H */