// SPDX-License-Identifier: GPL-2.0-only
/*
 * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 *
 * Originally based on recov_avx2.c and recov_ssse3.c:
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * Unlike with the syndrome calculation algorithms, there's no boot-time
 * selection of recovery algorithms by benchmarking, so we have to specify
 * the priorities and hope the future cores will all have decent vector
 * support (i.e. no LASX slower than LSX, or even scalar code).
 */

#ifdef CONFIG_CPU_HAS_LSX
248c2ecf20Sopenharmony_cistatic int raid6_has_lsx(void)
258c2ecf20Sopenharmony_ci{
268c2ecf20Sopenharmony_ci	return cpu_has_lsx;
278c2ecf20Sopenharmony_ci}
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_cistatic void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
308c2ecf20Sopenharmony_ci				  int failb, void **ptrs)
318c2ecf20Sopenharmony_ci{
328c2ecf20Sopenharmony_ci	u8 *p, *q, *dp, *dq;
338c2ecf20Sopenharmony_ci	const u8 *pbmul;	/* P multiplier table for B data */
348c2ecf20Sopenharmony_ci	const u8 *qmul;		/* Q multiplier table (for both) */
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci	p = (u8 *)ptrs[disks - 2];
378c2ecf20Sopenharmony_ci	q = (u8 *)ptrs[disks - 1];
388c2ecf20Sopenharmony_ci
398c2ecf20Sopenharmony_ci	/*
408c2ecf20Sopenharmony_ci	 * Compute syndrome with zero for the missing data pages
418c2ecf20Sopenharmony_ci	 * Use the dead data pages as temporary storage for
428c2ecf20Sopenharmony_ci	 * delta p and delta q
438c2ecf20Sopenharmony_ci	 */
448c2ecf20Sopenharmony_ci	dp = (u8 *)ptrs[faila];
458c2ecf20Sopenharmony_ci	ptrs[faila] = (void *)raid6_empty_zero_page;
468c2ecf20Sopenharmony_ci	ptrs[disks - 2] = dp;
478c2ecf20Sopenharmony_ci	dq = (u8 *)ptrs[failb];
488c2ecf20Sopenharmony_ci	ptrs[failb] = (void *)raid6_empty_zero_page;
498c2ecf20Sopenharmony_ci	ptrs[disks - 1] = dq;
508c2ecf20Sopenharmony_ci
518c2ecf20Sopenharmony_ci	raid6_call.gen_syndrome(disks, bytes, ptrs);
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_ci	/* Restore pointer table */
548c2ecf20Sopenharmony_ci	ptrs[faila] = dp;
558c2ecf20Sopenharmony_ci	ptrs[failb] = dq;
568c2ecf20Sopenharmony_ci	ptrs[disks - 2] = p;
578c2ecf20Sopenharmony_ci	ptrs[disks - 1] = q;
588c2ecf20Sopenharmony_ci
598c2ecf20Sopenharmony_ci	/* Now, pick the proper data tables */
608c2ecf20Sopenharmony_ci	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
618c2ecf20Sopenharmony_ci	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci	kernel_fpu_begin();
648c2ecf20Sopenharmony_ci
658c2ecf20Sopenharmony_ci	/*
668c2ecf20Sopenharmony_ci	 * vr20, vr21: qmul
678c2ecf20Sopenharmony_ci	 * vr22, vr23: pbmul
688c2ecf20Sopenharmony_ci	 */
698c2ecf20Sopenharmony_ci	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
708c2ecf20Sopenharmony_ci	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
718c2ecf20Sopenharmony_ci	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
728c2ecf20Sopenharmony_ci	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ci	while (bytes) {
758c2ecf20Sopenharmony_ci		/* vr4 - vr7: Q */
768c2ecf20Sopenharmony_ci		asm volatile("vld $vr4, %0" : : "m" (q[0]));
778c2ecf20Sopenharmony_ci		asm volatile("vld $vr5, %0" : : "m" (q[16]));
788c2ecf20Sopenharmony_ci		asm volatile("vld $vr6, %0" : : "m" (q[32]));
798c2ecf20Sopenharmony_ci		asm volatile("vld $vr7, %0" : : "m" (q[48]));
808c2ecf20Sopenharmony_ci		/*  vr4 - vr7: Q + Qxy */
818c2ecf20Sopenharmony_ci		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
828c2ecf20Sopenharmony_ci		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
838c2ecf20Sopenharmony_ci		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
848c2ecf20Sopenharmony_ci		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
858c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr4, $vr4, $vr8");
868c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr5, $vr5, $vr9");
878c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr6, $vr6, $vr10");
888c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr7, $vr7, $vr11");
898c2ecf20Sopenharmony_ci		/* vr0 - vr3: P */
908c2ecf20Sopenharmony_ci		asm volatile("vld $vr0, %0" : : "m" (p[0]));
918c2ecf20Sopenharmony_ci		asm volatile("vld $vr1, %0" : : "m" (p[16]));
928c2ecf20Sopenharmony_ci		asm volatile("vld $vr2, %0" : : "m" (p[32]));
938c2ecf20Sopenharmony_ci		asm volatile("vld $vr3, %0" : : "m" (p[48]));
948c2ecf20Sopenharmony_ci		/* vr0 - vr3: P + Pxy */
958c2ecf20Sopenharmony_ci		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
968c2ecf20Sopenharmony_ci		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
978c2ecf20Sopenharmony_ci		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
988c2ecf20Sopenharmony_ci		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
998c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr0, $vr0, $vr8");
1008c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr1, $vr1, $vr9");
1018c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr2, $vr2, $vr10");
1028c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr3, $vr3, $vr11");
1038c2ecf20Sopenharmony_ci
1048c2ecf20Sopenharmony_ci		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
1058c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr8, $vr4, 4");
1068c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr9, $vr5, 4");
1078c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr10, $vr6, 4");
1088c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr11, $vr7, 4");
1098c2ecf20Sopenharmony_ci		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
1108c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr4, $vr4, 0x0f");
1118c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr5, $vr5, 0x0f");
1128c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr6, $vr6, 0x0f");
1138c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr7, $vr7, 0x0f");
1148c2ecf20Sopenharmony_ci		/* lookup from qmul[0] */
1158c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
1168c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
1178c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
1188c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
1198c2ecf20Sopenharmony_ci		/* lookup from qmul[16] */
1208c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
1218c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
1228c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
1238c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
1248c2ecf20Sopenharmony_ci		/* vr16 - vr19: B(Q + Qxy) */
1258c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr16, $vr8, $vr4");
1268c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr17, $vr9, $vr5");
1278c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr18, $vr10, $vr6");
1288c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr19, $vr11, $vr7");
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
1318c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr4, $vr0, 4");
1328c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr5, $vr1, 4");
1338c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr6, $vr2, 4");
1348c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr7, $vr3, 4");
1358c2ecf20Sopenharmony_ci		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
1368c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr12, $vr0, 0x0f");
1378c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr13, $vr1, 0x0f");
1388c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr14, $vr2, 0x0f");
1398c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr15, $vr3, 0x0f");
1408c2ecf20Sopenharmony_ci		/* lookup from pbmul[0] */
1418c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
1428c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
1438c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
1448c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
1458c2ecf20Sopenharmony_ci		/* lookup from pbmul[16] */
1468c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
1478c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
1488c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
1498c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
1508c2ecf20Sopenharmony_ci		/* vr4 - vr7: A(P + Pxy) */
1518c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr4, $vr4, $vr12");
1528c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr5, $vr5, $vr13");
1538c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr6, $vr6, $vr14");
1548c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr7, $vr7, $vr15");
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
1578c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr4, $vr4, $vr16");
1588c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr5, $vr5, $vr17");
1598c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr6, $vr6, $vr18");
1608c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr7, $vr7, $vr19");
1618c2ecf20Sopenharmony_ci		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
1628c2ecf20Sopenharmony_ci		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
1638c2ecf20Sopenharmony_ci		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
1648c2ecf20Sopenharmony_ci		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci		/* vr0 - vr3: P + Pxy + Dx = Dy */
1678c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr0, $vr0, $vr4");
1688c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr1, $vr1, $vr5");
1698c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr2, $vr2, $vr6");
1708c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr3, $vr3, $vr7");
1718c2ecf20Sopenharmony_ci		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
1728c2ecf20Sopenharmony_ci		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
1738c2ecf20Sopenharmony_ci		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
1748c2ecf20Sopenharmony_ci		asm volatile("vst $vr3, %0" : "=m" (dp[48]));
1758c2ecf20Sopenharmony_ci
1768c2ecf20Sopenharmony_ci		bytes -= 64;
1778c2ecf20Sopenharmony_ci		p += 64;
1788c2ecf20Sopenharmony_ci		q += 64;
1798c2ecf20Sopenharmony_ci		dp += 64;
1808c2ecf20Sopenharmony_ci		dq += 64;
1818c2ecf20Sopenharmony_ci	}
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci	kernel_fpu_end();
1848c2ecf20Sopenharmony_ci}
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_cistatic void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
1878c2ecf20Sopenharmony_ci				  void **ptrs)
1888c2ecf20Sopenharmony_ci{
1898c2ecf20Sopenharmony_ci	u8 *p, *q, *dq;
1908c2ecf20Sopenharmony_ci	const u8 *qmul;		/* Q multiplier table */
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci	p = (u8 *)ptrs[disks - 2];
1938c2ecf20Sopenharmony_ci	q = (u8 *)ptrs[disks - 1];
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_ci	/*
1968c2ecf20Sopenharmony_ci	 * Compute syndrome with zero for the missing data page
1978c2ecf20Sopenharmony_ci	 * Use the dead data page as temporary storage for delta q
1988c2ecf20Sopenharmony_ci	 */
1998c2ecf20Sopenharmony_ci	dq = (u8 *)ptrs[faila];
2008c2ecf20Sopenharmony_ci	ptrs[faila] = (void *)raid6_empty_zero_page;
2018c2ecf20Sopenharmony_ci	ptrs[disks - 1] = dq;
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	raid6_call.gen_syndrome(disks, bytes, ptrs);
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	/* Restore pointer table */
2068c2ecf20Sopenharmony_ci	ptrs[faila] = dq;
2078c2ecf20Sopenharmony_ci	ptrs[disks - 1] = q;
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_ci	/* Now, pick the proper data tables */
2108c2ecf20Sopenharmony_ci	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci	/* vr22, vr23: qmul */
2158c2ecf20Sopenharmony_ci	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
2168c2ecf20Sopenharmony_ci	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci	while (bytes) {
2198c2ecf20Sopenharmony_ci		/* vr0 - vr3: P + Dx */
2208c2ecf20Sopenharmony_ci		asm volatile("vld $vr0, %0" : : "m" (p[0]));
2218c2ecf20Sopenharmony_ci		asm volatile("vld $vr1, %0" : : "m" (p[16]));
2228c2ecf20Sopenharmony_ci		asm volatile("vld $vr2, %0" : : "m" (p[32]));
2238c2ecf20Sopenharmony_ci		asm volatile("vld $vr3, %0" : : "m" (p[48]));
2248c2ecf20Sopenharmony_ci		/* vr4 - vr7: Qx */
2258c2ecf20Sopenharmony_ci		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
2268c2ecf20Sopenharmony_ci		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
2278c2ecf20Sopenharmony_ci		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
2288c2ecf20Sopenharmony_ci		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
2298c2ecf20Sopenharmony_ci		/* vr4 - vr7: Q + Qx */
2308c2ecf20Sopenharmony_ci		asm volatile("vld $vr8, %0" : : "m" (q[0]));
2318c2ecf20Sopenharmony_ci		asm volatile("vld $vr9, %0" : : "m" (q[16]));
2328c2ecf20Sopenharmony_ci		asm volatile("vld $vr10, %0" : : "m" (q[32]));
2338c2ecf20Sopenharmony_ci		asm volatile("vld $vr11, %0" : : "m" (q[48]));
2348c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr4, $vr4, $vr8");
2358c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr5, $vr5, $vr9");
2368c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr6, $vr6, $vr10");
2378c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr7, $vr7, $vr11");
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_ci		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
2408c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr8, $vr4, 4");
2418c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr9, $vr5, 4");
2428c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr10, $vr6, 4");
2438c2ecf20Sopenharmony_ci		asm volatile("vsrli.b $vr11, $vr7, 4");
2448c2ecf20Sopenharmony_ci		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
2458c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr4, $vr4, 0x0f");
2468c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr5, $vr5, 0x0f");
2478c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr6, $vr6, 0x0f");
2488c2ecf20Sopenharmony_ci		asm volatile("vandi.b $vr7, $vr7, 0x0f");
2498c2ecf20Sopenharmony_ci		/* lookup from qmul[0] */
2508c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
2518c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
2528c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
2538c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
2548c2ecf20Sopenharmony_ci		/* lookup from qmul[16] */
2558c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
2568c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
2578c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
2588c2ecf20Sopenharmony_ci		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
2598c2ecf20Sopenharmony_ci		/* vr4 - vr7: qmul(Q + Qx) = Dx */
2608c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr4, $vr4, $vr8");
2618c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr5, $vr5, $vr9");
2628c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr6, $vr6, $vr10");
2638c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr7, $vr7, $vr11");
2648c2ecf20Sopenharmony_ci		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
2658c2ecf20Sopenharmony_ci		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
2668c2ecf20Sopenharmony_ci		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
2678c2ecf20Sopenharmony_ci		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
2688c2ecf20Sopenharmony_ci
2698c2ecf20Sopenharmony_ci		/* vr0 - vr3: P + Dx + Dx = P */
2708c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr0, $vr0, $vr4");
2718c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr1, $vr1, $vr5");
2728c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr2, $vr2, $vr6");
2738c2ecf20Sopenharmony_ci		asm volatile("vxor.v $vr3, $vr3, $vr7");
2748c2ecf20Sopenharmony_ci		asm volatile("vst $vr0, %0" : "=m" (p[0]));
2758c2ecf20Sopenharmony_ci		asm volatile("vst $vr1, %0" : "=m" (p[16]));
2768c2ecf20Sopenharmony_ci		asm volatile("vst $vr2, %0" : "=m" (p[32]));
2778c2ecf20Sopenharmony_ci		asm volatile("vst $vr3, %0" : "=m" (p[48]));
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_ci		bytes -= 64;
2808c2ecf20Sopenharmony_ci		p += 64;
2818c2ecf20Sopenharmony_ci		q += 64;
2828c2ecf20Sopenharmony_ci		dq += 64;
2838c2ecf20Sopenharmony_ci	}
2848c2ecf20Sopenharmony_ci
2858c2ecf20Sopenharmony_ci	kernel_fpu_end();
2868c2ecf20Sopenharmony_ci}
2878c2ecf20Sopenharmony_ci
2888c2ecf20Sopenharmony_ciconst struct raid6_recov_calls raid6_recov_lsx = {
2898c2ecf20Sopenharmony_ci	.data2 = raid6_2data_recov_lsx,
2908c2ecf20Sopenharmony_ci	.datap = raid6_datap_recov_lsx,
2918c2ecf20Sopenharmony_ci	.valid = raid6_has_lsx,
2928c2ecf20Sopenharmony_ci	.name = "lsx",
2938c2ecf20Sopenharmony_ci	.priority = 1,
2948c2ecf20Sopenharmony_ci};
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
2988c2ecf20Sopenharmony_cistatic int raid6_has_lasx(void)
2998c2ecf20Sopenharmony_ci{
3008c2ecf20Sopenharmony_ci	return cpu_has_lasx;
3018c2ecf20Sopenharmony_ci}
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_cistatic void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
3048c2ecf20Sopenharmony_ci				   int failb, void **ptrs)
3058c2ecf20Sopenharmony_ci{
3068c2ecf20Sopenharmony_ci	u8 *p, *q, *dp, *dq;
3078c2ecf20Sopenharmony_ci	const u8 *pbmul;	/* P multiplier table for B data */
3088c2ecf20Sopenharmony_ci	const u8 *qmul;		/* Q multiplier table (for both) */
3098c2ecf20Sopenharmony_ci
3108c2ecf20Sopenharmony_ci	p = (u8 *)ptrs[disks - 2];
3118c2ecf20Sopenharmony_ci	q = (u8 *)ptrs[disks - 1];
3128c2ecf20Sopenharmony_ci
3138c2ecf20Sopenharmony_ci	/*
3148c2ecf20Sopenharmony_ci	 * Compute syndrome with zero for the missing data pages
3158c2ecf20Sopenharmony_ci	 * Use the dead data pages as temporary storage for
3168c2ecf20Sopenharmony_ci	 * delta p and delta q
3178c2ecf20Sopenharmony_ci	 */
3188c2ecf20Sopenharmony_ci	dp = (u8 *)ptrs[faila];
3198c2ecf20Sopenharmony_ci	ptrs[faila] = (void *)raid6_empty_zero_page;
3208c2ecf20Sopenharmony_ci	ptrs[disks - 2] = dp;
3218c2ecf20Sopenharmony_ci	dq = (u8 *)ptrs[failb];
3228c2ecf20Sopenharmony_ci	ptrs[failb] = (void *)raid6_empty_zero_page;
3238c2ecf20Sopenharmony_ci	ptrs[disks - 1] = dq;
3248c2ecf20Sopenharmony_ci
3258c2ecf20Sopenharmony_ci	raid6_call.gen_syndrome(disks, bytes, ptrs);
3268c2ecf20Sopenharmony_ci
3278c2ecf20Sopenharmony_ci	/* Restore pointer table */
3288c2ecf20Sopenharmony_ci	ptrs[faila] = dp;
3298c2ecf20Sopenharmony_ci	ptrs[failb] = dq;
3308c2ecf20Sopenharmony_ci	ptrs[disks - 2] = p;
3318c2ecf20Sopenharmony_ci	ptrs[disks - 1] = q;
3328c2ecf20Sopenharmony_ci
3338c2ecf20Sopenharmony_ci	/* Now, pick the proper data tables */
3348c2ecf20Sopenharmony_ci	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
3358c2ecf20Sopenharmony_ci	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
3368c2ecf20Sopenharmony_ci
3378c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3388c2ecf20Sopenharmony_ci
3398c2ecf20Sopenharmony_ci	/*
3408c2ecf20Sopenharmony_ci	 * xr20, xr21: qmul
3418c2ecf20Sopenharmony_ci	 * xr22, xr23: pbmul
3428c2ecf20Sopenharmony_ci	 */
3438c2ecf20Sopenharmony_ci	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
3448c2ecf20Sopenharmony_ci	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
3458c2ecf20Sopenharmony_ci	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
3468c2ecf20Sopenharmony_ci	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
3478c2ecf20Sopenharmony_ci	asm volatile("xvreplve0.q $xr20, $xr20");
3488c2ecf20Sopenharmony_ci	asm volatile("xvreplve0.q $xr21, $xr21");
3498c2ecf20Sopenharmony_ci	asm volatile("xvreplve0.q $xr22, $xr22");
3508c2ecf20Sopenharmony_ci	asm volatile("xvreplve0.q $xr23, $xr23");
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci	while (bytes) {
3538c2ecf20Sopenharmony_ci		/* xr0, xr1: Q */
3548c2ecf20Sopenharmony_ci		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
3558c2ecf20Sopenharmony_ci		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
3568c2ecf20Sopenharmony_ci		/* xr0, xr1: Q + Qxy */
3578c2ecf20Sopenharmony_ci		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
3588c2ecf20Sopenharmony_ci		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
3598c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr0, $xr0, $xr4");
3608c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr1, $xr1, $xr5");
3618c2ecf20Sopenharmony_ci		/* xr2, xr3: P */
3628c2ecf20Sopenharmony_ci		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
3638c2ecf20Sopenharmony_ci		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
3648c2ecf20Sopenharmony_ci		/* xr2, xr3: P + Pxy */
3658c2ecf20Sopenharmony_ci		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
3668c2ecf20Sopenharmony_ci		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
3678c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr2, $xr2, $xr4");
3688c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr3, $xr3, $xr5");
3698c2ecf20Sopenharmony_ci
3708c2ecf20Sopenharmony_ci		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
3718c2ecf20Sopenharmony_ci		asm volatile("xvsrli.b $xr4, $xr0, 4");
3728c2ecf20Sopenharmony_ci		asm volatile("xvsrli.b $xr5, $xr1, 4");
3738c2ecf20Sopenharmony_ci		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
3748c2ecf20Sopenharmony_ci		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
3758c2ecf20Sopenharmony_ci		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
3768c2ecf20Sopenharmony_ci		/* lookup from qmul[0] */
3778c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
3788c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
3798c2ecf20Sopenharmony_ci		/* lookup from qmul[16] */
3808c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
3818c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
3828c2ecf20Sopenharmony_ci		/* xr6, xr7: B(Q + Qxy) */
3838c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr6, $xr4, $xr0");
3848c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr7, $xr5, $xr1");
3858c2ecf20Sopenharmony_ci
3868c2ecf20Sopenharmony_ci		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
3878c2ecf20Sopenharmony_ci		asm volatile("xvsrli.b $xr4, $xr2, 4");
3888c2ecf20Sopenharmony_ci		asm volatile("xvsrli.b $xr5, $xr3, 4");
3898c2ecf20Sopenharmony_ci		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
3908c2ecf20Sopenharmony_ci		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
3918c2ecf20Sopenharmony_ci		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
3928c2ecf20Sopenharmony_ci		/* lookup from pbmul[0] */
3938c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
3948c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
3958c2ecf20Sopenharmony_ci		/* lookup from pbmul[16] */
3968c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
3978c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
3988c2ecf20Sopenharmony_ci		/* xr0, xr1: A(P + Pxy) */
3998c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr0, $xr0, $xr4");
4008c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr1, $xr1, $xr5");
4018c2ecf20Sopenharmony_ci
4028c2ecf20Sopenharmony_ci		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
4038c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr0, $xr0, $xr6");
4048c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr1, $xr1, $xr7");
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci		/* xr2, xr3: P + Pxy + Dx = Dy */
4078c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr2, $xr2, $xr0");
4088c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr3, $xr3, $xr1");
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
4118c2ecf20Sopenharmony_ci		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
4128c2ecf20Sopenharmony_ci		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
4138c2ecf20Sopenharmony_ci		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
4148c2ecf20Sopenharmony_ci
4158c2ecf20Sopenharmony_ci		bytes -= 64;
4168c2ecf20Sopenharmony_ci		p += 64;
4178c2ecf20Sopenharmony_ci		q += 64;
4188c2ecf20Sopenharmony_ci		dp += 64;
4198c2ecf20Sopenharmony_ci		dq += 64;
4208c2ecf20Sopenharmony_ci	}
4218c2ecf20Sopenharmony_ci
4228c2ecf20Sopenharmony_ci	kernel_fpu_end();
4238c2ecf20Sopenharmony_ci}
4248c2ecf20Sopenharmony_ci
4258c2ecf20Sopenharmony_cistatic void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
4268c2ecf20Sopenharmony_ci				   void **ptrs)
4278c2ecf20Sopenharmony_ci{
4288c2ecf20Sopenharmony_ci	u8 *p, *q, *dq;
4298c2ecf20Sopenharmony_ci	const u8 *qmul;		/* Q multiplier table */
4308c2ecf20Sopenharmony_ci
4318c2ecf20Sopenharmony_ci	p = (u8 *)ptrs[disks - 2];
4328c2ecf20Sopenharmony_ci	q = (u8 *)ptrs[disks - 1];
4338c2ecf20Sopenharmony_ci
4348c2ecf20Sopenharmony_ci	/*
4358c2ecf20Sopenharmony_ci	 * Compute syndrome with zero for the missing data page
4368c2ecf20Sopenharmony_ci	 * Use the dead data page as temporary storage for delta q
4378c2ecf20Sopenharmony_ci	 */
4388c2ecf20Sopenharmony_ci	dq = (u8 *)ptrs[faila];
4398c2ecf20Sopenharmony_ci	ptrs[faila] = (void *)raid6_empty_zero_page;
4408c2ecf20Sopenharmony_ci	ptrs[disks - 1] = dq;
4418c2ecf20Sopenharmony_ci
4428c2ecf20Sopenharmony_ci	raid6_call.gen_syndrome(disks, bytes, ptrs);
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci	/* Restore pointer table */
4458c2ecf20Sopenharmony_ci	ptrs[faila] = dq;
4468c2ecf20Sopenharmony_ci	ptrs[disks - 1] = q;
4478c2ecf20Sopenharmony_ci
4488c2ecf20Sopenharmony_ci	/* Now, pick the proper data tables */
4498c2ecf20Sopenharmony_ci	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
4508c2ecf20Sopenharmony_ci
4518c2ecf20Sopenharmony_ci	kernel_fpu_begin();
4528c2ecf20Sopenharmony_ci
4538c2ecf20Sopenharmony_ci	/* xr22, xr23: qmul */
4548c2ecf20Sopenharmony_ci	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
4558c2ecf20Sopenharmony_ci	asm volatile("xvreplve0.q $xr22, $xr22");
4568c2ecf20Sopenharmony_ci	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
4578c2ecf20Sopenharmony_ci	asm volatile("xvreplve0.q $xr23, $xr23");
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	while (bytes) {
4608c2ecf20Sopenharmony_ci		/* xr0, xr1: P + Dx */
4618c2ecf20Sopenharmony_ci		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
4628c2ecf20Sopenharmony_ci		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
4638c2ecf20Sopenharmony_ci		/* xr2, xr3: Qx */
4648c2ecf20Sopenharmony_ci		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
4658c2ecf20Sopenharmony_ci		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
4668c2ecf20Sopenharmony_ci		/* xr2, xr3: Q + Qx */
4678c2ecf20Sopenharmony_ci		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
4688c2ecf20Sopenharmony_ci		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
4698c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr2, $xr2, $xr4");
4708c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr3, $xr3, $xr5");
4718c2ecf20Sopenharmony_ci
4728c2ecf20Sopenharmony_ci		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
4738c2ecf20Sopenharmony_ci		asm volatile("xvsrli.b $xr4, $xr2, 4");
4748c2ecf20Sopenharmony_ci		asm volatile("xvsrli.b $xr5, $xr3, 4");
4758c2ecf20Sopenharmony_ci		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
4768c2ecf20Sopenharmony_ci		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
4778c2ecf20Sopenharmony_ci		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
4788c2ecf20Sopenharmony_ci		/* lookup from qmul[0] */
4798c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
4808c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
4818c2ecf20Sopenharmony_ci		/* lookup from qmul[16] */
4828c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
4838c2ecf20Sopenharmony_ci		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
4848c2ecf20Sopenharmony_ci		/* xr2, xr3: qmul(Q + Qx) = Dx */
4858c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr2, $xr2, $xr4");
4868c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr3, $xr3, $xr5");
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_ci		/* xr0, xr1: P + Dx + Dx = P */
4898c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr0, $xr0, $xr2");
4908c2ecf20Sopenharmony_ci		asm volatile("xvxor.v $xr1, $xr1, $xr3");
4918c2ecf20Sopenharmony_ci
4928c2ecf20Sopenharmony_ci		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
4938c2ecf20Sopenharmony_ci		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
4948c2ecf20Sopenharmony_ci		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
4958c2ecf20Sopenharmony_ci		asm volatile("xvst $xr1, %0" : "=m" (p[32]));
4968c2ecf20Sopenharmony_ci
4978c2ecf20Sopenharmony_ci		bytes -= 64;
4988c2ecf20Sopenharmony_ci		p += 64;
4998c2ecf20Sopenharmony_ci		q += 64;
5008c2ecf20Sopenharmony_ci		dq += 64;
5018c2ecf20Sopenharmony_ci	}
5028c2ecf20Sopenharmony_ci
5038c2ecf20Sopenharmony_ci	kernel_fpu_end();
5048c2ecf20Sopenharmony_ci}
5058c2ecf20Sopenharmony_ci
5068c2ecf20Sopenharmony_ciconst struct raid6_recov_calls raid6_recov_lasx = {
5078c2ecf20Sopenharmony_ci	.data2 = raid6_2data_recov_lasx,
5088c2ecf20Sopenharmony_ci	.datap = raid6_datap_recov_lasx,
5098c2ecf20Sopenharmony_ci	.valid = raid6_has_lasx,
5108c2ecf20Sopenharmony_ci	.name = "lasx",
5118c2ecf20Sopenharmony_ci	.priority = 2,
5128c2ecf20Sopenharmony_ci};
#endif /* CONFIG_CPU_HAS_LASX */