// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */
108c2ecf20Sopenharmony_ci
/*
 * AVX2 implementation of the RAID-6 syndrome functions (P/Q generation and
 * partial P/Q update), provided in three variants: plain, unrolled by 2 and
 * unrolled by 4.
 */
158c2ecf20Sopenharmony_ci
168c2ecf20Sopenharmony_ci#include <linux/raid/pq.h>
178c2ecf20Sopenharmony_ci#include "x86.h"
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_cistatic const struct raid6_avx2_constants {
208c2ecf20Sopenharmony_ci	u64 x1d[4];
218c2ecf20Sopenharmony_ci} raid6_avx2_constants __aligned(32) = {
228c2ecf20Sopenharmony_ci	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
238c2ecf20Sopenharmony_ci	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
248c2ecf20Sopenharmony_ci};
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_cistatic int raid6_have_avx2(void)
278c2ecf20Sopenharmony_ci{
288c2ecf20Sopenharmony_ci	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
298c2ecf20Sopenharmony_ci}
308c2ecf20Sopenharmony_ci
/*
 * Plain AVX2 implementation: one 32-byte YMM vector per loop iteration
 */
348c2ecf20Sopenharmony_cistatic void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
358c2ecf20Sopenharmony_ci{
368c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
378c2ecf20Sopenharmony_ci	u8 *p, *q;
388c2ecf20Sopenharmony_ci	int d, z, z0;
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_ci	z0 = disks - 3;		/* Highest data disk */
418c2ecf20Sopenharmony_ci	p = dptr[z0+1];		/* XOR parity */
428c2ecf20Sopenharmony_ci	q = dptr[z0+2];		/* RS syndrome */
438c2ecf20Sopenharmony_ci
448c2ecf20Sopenharmony_ci	kernel_fpu_begin();
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
478c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */
488c2ecf20Sopenharmony_ci
498c2ecf20Sopenharmony_ci	for (d = 0; d < bytes; d += 32) {
508c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
518c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
528c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
538c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
548c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
558c2ecf20Sopenharmony_ci		for (z = z0-2; z >= 0; z--) {
568c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
578c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
588c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
598c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
608c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
618c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
628c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
638c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
648c2ecf20Sopenharmony_ci		}
658c2ecf20Sopenharmony_ci		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
668c2ecf20Sopenharmony_ci		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
678c2ecf20Sopenharmony_ci		asm volatile("vpand %ymm0,%ymm5,%ymm5");
688c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
698c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
708c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm6,%ymm4,%ymm4");
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
738c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
748c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
758c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
768c2ecf20Sopenharmony_ci	}
778c2ecf20Sopenharmony_ci
788c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
798c2ecf20Sopenharmony_ci	kernel_fpu_end();
808c2ecf20Sopenharmony_ci}
818c2ecf20Sopenharmony_ci
828c2ecf20Sopenharmony_cistatic void raid6_avx21_xor_syndrome(int disks, int start, int stop,
838c2ecf20Sopenharmony_ci				     size_t bytes, void **ptrs)
848c2ecf20Sopenharmony_ci{
858c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
868c2ecf20Sopenharmony_ci	u8 *p, *q;
878c2ecf20Sopenharmony_ci	int d, z, z0;
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ci	z0 = stop;		/* P/Q right side optimization */
908c2ecf20Sopenharmony_ci	p = dptr[disks-2];	/* XOR parity */
918c2ecf20Sopenharmony_ci	q = dptr[disks-1];	/* RS syndrome */
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci	kernel_fpu_begin();
948c2ecf20Sopenharmony_ci
958c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
968c2ecf20Sopenharmony_ci
978c2ecf20Sopenharmony_ci	for (d = 0 ; d < bytes ; d += 32) {
988c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
998c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
1008c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
1018c2ecf20Sopenharmony_ci		/* P/Q data pages */
1028c2ecf20Sopenharmony_ci		for (z = z0-1 ; z >= start ; z--) {
1038c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
1048c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
1058c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
1068c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
1078c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
1088c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
1098c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
1108c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
1118c2ecf20Sopenharmony_ci		}
1128c2ecf20Sopenharmony_ci		/* P/Q left side optimization */
1138c2ecf20Sopenharmony_ci		for (z = start-1 ; z >= 0 ; z--) {
1148c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
1158c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
1168c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
1178c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
1188c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
1198c2ecf20Sopenharmony_ci		}
1208c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
1218c2ecf20Sopenharmony_ci		/* Don't use movntdq for r/w memory area < cache line */
1228c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
1238c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
1248c2ecf20Sopenharmony_ci	}
1258c2ecf20Sopenharmony_ci
1268c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
1278c2ecf20Sopenharmony_ci	kernel_fpu_end();
1288c2ecf20Sopenharmony_ci}
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_avx2x1 = {
1318c2ecf20Sopenharmony_ci	raid6_avx21_gen_syndrome,
1328c2ecf20Sopenharmony_ci	raid6_avx21_xor_syndrome,
1338c2ecf20Sopenharmony_ci	raid6_have_avx2,
1348c2ecf20Sopenharmony_ci	"avx2x1",
1358c2ecf20Sopenharmony_ci	1			/* Has cache hints */
1368c2ecf20Sopenharmony_ci};
1378c2ecf20Sopenharmony_ci
/*
 * Unrolled-by-2 AVX2 implementation: two 32-byte YMM vectors (64 bytes) per
 * loop iteration
 */
1418c2ecf20Sopenharmony_cistatic void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
1428c2ecf20Sopenharmony_ci{
1438c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
1448c2ecf20Sopenharmony_ci	u8 *p, *q;
1458c2ecf20Sopenharmony_ci	int d, z, z0;
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci	z0 = disks - 3;		/* Highest data disk */
1488c2ecf20Sopenharmony_ci	p = dptr[z0+1];		/* XOR parity */
1498c2ecf20Sopenharmony_ci	q = dptr[z0+2];		/* RS syndrome */
1508c2ecf20Sopenharmony_ci
1518c2ecf20Sopenharmony_ci	kernel_fpu_begin();
1528c2ecf20Sopenharmony_ci
1538c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
1548c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci	/* We uniformly assume a single prefetch covers at least 32 bytes */
1578c2ecf20Sopenharmony_ci	for (d = 0; d < bytes; d += 64) {
1588c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
1598c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
1608c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
1618c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
1628c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
1638c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
1648c2ecf20Sopenharmony_ci		for (z = z0-1; z >= 0; z--) {
1658c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
1668c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
1678c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
1688c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
1698c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
1708c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
1718c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
1728c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm7,%ymm7");
1738c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
1748c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
1758c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
1768c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
1778c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
1788c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
1798c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
1808c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
1818c2ecf20Sopenharmony_ci		}
1828c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
1838c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
1848c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
1858c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
1868c2ecf20Sopenharmony_ci	}
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
1898c2ecf20Sopenharmony_ci	kernel_fpu_end();
1908c2ecf20Sopenharmony_ci}
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_cistatic void raid6_avx22_xor_syndrome(int disks, int start, int stop,
1938c2ecf20Sopenharmony_ci				     size_t bytes, void **ptrs)
1948c2ecf20Sopenharmony_ci{
1958c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
1968c2ecf20Sopenharmony_ci	u8 *p, *q;
1978c2ecf20Sopenharmony_ci	int d, z, z0;
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_ci	z0 = stop;		/* P/Q right side optimization */
2008c2ecf20Sopenharmony_ci	p = dptr[disks-2];	/* XOR parity */
2018c2ecf20Sopenharmony_ci	q = dptr[disks-1];	/* RS syndrome */
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2048c2ecf20Sopenharmony_ci
2058c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_ci	for (d = 0 ; d < bytes ; d += 64) {
2088c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
2098c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
2108c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
2118c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
2128c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
2138c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
2148c2ecf20Sopenharmony_ci		/* P/Q data pages */
2158c2ecf20Sopenharmony_ci		for (z = z0-1 ; z >= start ; z--) {
2168c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
2178c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
2188c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
2198c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
2208c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
2218c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
2228c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
2238c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm7,%ymm7");
2248c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
2258c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
2268c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
2278c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm7"
2288c2ecf20Sopenharmony_ci				     :: "m" (dptr[z][d+32]));
2298c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
2308c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
2318c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
2328c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
2338c2ecf20Sopenharmony_ci		}
2348c2ecf20Sopenharmony_ci		/* P/Q left side optimization */
2358c2ecf20Sopenharmony_ci		for (z = start-1 ; z >= 0 ; z--) {
2368c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
2378c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
2388c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
2398c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
2408c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
2418c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
2428c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
2438c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm7,%ymm7");
2448c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
2458c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
2468c2ecf20Sopenharmony_ci		}
2478c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
2488c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
2498c2ecf20Sopenharmony_ci		/* Don't use movntdq for r/w memory area < cache line */
2508c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
2518c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
2528c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
2538c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
2548c2ecf20Sopenharmony_ci	}
2558c2ecf20Sopenharmony_ci
2568c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
2578c2ecf20Sopenharmony_ci	kernel_fpu_end();
2588c2ecf20Sopenharmony_ci}
2598c2ecf20Sopenharmony_ci
2608c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_avx2x2 = {
2618c2ecf20Sopenharmony_ci	raid6_avx22_gen_syndrome,
2628c2ecf20Sopenharmony_ci	raid6_avx22_xor_syndrome,
2638c2ecf20Sopenharmony_ci	raid6_have_avx2,
2648c2ecf20Sopenharmony_ci	"avx2x2",
2658c2ecf20Sopenharmony_ci	1			/* Has cache hints */
2668c2ecf20Sopenharmony_ci};
2678c2ecf20Sopenharmony_ci
2688c2ecf20Sopenharmony_ci#ifdef CONFIG_X86_64
2698c2ecf20Sopenharmony_ci
/*
 * Unrolled-by-4 AVX2 implementation: four 32-byte YMM vectors (128 bytes)
 * per loop iteration.  Uses ymm10-ymm15, which exist only in 64-bit mode.
 */
2738c2ecf20Sopenharmony_cistatic void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
2748c2ecf20Sopenharmony_ci{
2758c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
2768c2ecf20Sopenharmony_ci	u8 *p, *q;
2778c2ecf20Sopenharmony_ci	int d, z, z0;
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_ci	z0 = disks - 3;		/* Highest data disk */
2808c2ecf20Sopenharmony_ci	p = dptr[z0+1];		/* XOR parity */
2818c2ecf20Sopenharmony_ci	q = dptr[z0+2];		/* RS syndrome */
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_ci	kernel_fpu_begin();
2848c2ecf20Sopenharmony_ci
2858c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
2868c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
2878c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
2888c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
2898c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
2908c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
2918c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
2928c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
2938c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
2948c2ecf20Sopenharmony_ci	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */
2958c2ecf20Sopenharmony_ci
2968c2ecf20Sopenharmony_ci	for (d = 0; d < bytes; d += 128) {
2978c2ecf20Sopenharmony_ci		for (z = z0; z >= 0; z--) {
2988c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
2998c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
3008c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
3018c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
3028c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
3038c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
3048c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
3058c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
3068c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
3078c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
3088c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
3098c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
3108c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
3118c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm7,%ymm7");
3128c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm13,%ymm13");
3138c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm15,%ymm15");
3148c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
3158c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
3168c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
3178c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
3188c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
3198c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
3208c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
3218c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
3228c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
3238c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
3248c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
3258c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
3268c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
3278c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
3288c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
3298c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
3308c2ecf20Sopenharmony_ci		}
3318c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
3328c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
3338c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
3348c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
3358c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
3368c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
3378c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
3388c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
3398c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
3408c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
3418c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
3428c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
3438c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
3448c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
3458c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
3468c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
3478c2ecf20Sopenharmony_ci	}
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
3508c2ecf20Sopenharmony_ci	kernel_fpu_end();
3518c2ecf20Sopenharmony_ci}
3528c2ecf20Sopenharmony_ci
3538c2ecf20Sopenharmony_cistatic void raid6_avx24_xor_syndrome(int disks, int start, int stop,
3548c2ecf20Sopenharmony_ci				     size_t bytes, void **ptrs)
3558c2ecf20Sopenharmony_ci{
3568c2ecf20Sopenharmony_ci	u8 **dptr = (u8 **)ptrs;
3578c2ecf20Sopenharmony_ci	u8 *p, *q;
3588c2ecf20Sopenharmony_ci	int d, z, z0;
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	z0 = stop;		/* P/Q right side optimization */
3618c2ecf20Sopenharmony_ci	p = dptr[disks-2];	/* XOR parity */
3628c2ecf20Sopenharmony_ci	q = dptr[disks-1];	/* RS syndrome */
3638c2ecf20Sopenharmony_ci
3648c2ecf20Sopenharmony_ci	kernel_fpu_begin();
3658c2ecf20Sopenharmony_ci
3668c2ecf20Sopenharmony_ci	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
3678c2ecf20Sopenharmony_ci
3688c2ecf20Sopenharmony_ci	for (d = 0 ; d < bytes ; d += 128) {
3698c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
3708c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
3718c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
3728c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
3738c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
3748c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
3758c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
3768c2ecf20Sopenharmony_ci		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
3778c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
3788c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
3798c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
3808c2ecf20Sopenharmony_ci		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
3818c2ecf20Sopenharmony_ci		/* P/Q data pages */
3828c2ecf20Sopenharmony_ci		for (z = z0-1 ; z >= start ; z--) {
3838c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
3848c2ecf20Sopenharmony_ci			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
3858c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
3868c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
3878c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
3888c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
3898c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
3908c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
3918c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
3928c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
3938c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
3948c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
3958c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
3968c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
3978c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
3988c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm7,%ymm7");
3998c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm13,%ymm13");
4008c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm15,%ymm15");
4018c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
4028c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
4038c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
4048c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
4058c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
4068c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm7"
4078c2ecf20Sopenharmony_ci				     :: "m" (dptr[z][d+32]));
4088c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm13"
4098c2ecf20Sopenharmony_ci				     :: "m" (dptr[z][d+64]));
4108c2ecf20Sopenharmony_ci			asm volatile("vmovdqa %0,%%ymm15"
4118c2ecf20Sopenharmony_ci				     :: "m" (dptr[z][d+96]));
4128c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
4138c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
4148c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
4158c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
4168c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
4178c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
4188c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
4198c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
4208c2ecf20Sopenharmony_ci		}
4218c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" :: "m" (q[d]));
4228c2ecf20Sopenharmony_ci		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
4238c2ecf20Sopenharmony_ci		/* P/Q left side optimization */
4248c2ecf20Sopenharmony_ci		for (z = start-1 ; z >= 0 ; z--) {
4258c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
4268c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
4278c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
4288c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
4298c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
4308c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
4318c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
4328c2ecf20Sopenharmony_ci			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
4338c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
4348c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
4358c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
4368c2ecf20Sopenharmony_ci			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
4378c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm5,%ymm5");
4388c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm7,%ymm7");
4398c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm13,%ymm13");
4408c2ecf20Sopenharmony_ci			asm volatile("vpand %ymm0,%ymm15,%ymm15");
4418c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
4428c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
4438c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
4448c2ecf20Sopenharmony_ci			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
4458c2ecf20Sopenharmony_ci		}
4468c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
4478c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
4488c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
4498c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
4508c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
4518c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
4528c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
4538c2ecf20Sopenharmony_ci		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
4548c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
4558c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
4568c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
4578c2ecf20Sopenharmony_ci		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
4588c2ecf20Sopenharmony_ci	}
4598c2ecf20Sopenharmony_ci	asm volatile("sfence" : : : "memory");
4608c2ecf20Sopenharmony_ci	kernel_fpu_end();
4618c2ecf20Sopenharmony_ci}
4628c2ecf20Sopenharmony_ci
4638c2ecf20Sopenharmony_ciconst struct raid6_calls raid6_avx2x4 = {
4648c2ecf20Sopenharmony_ci	raid6_avx24_gen_syndrome,
4658c2ecf20Sopenharmony_ci	raid6_avx24_xor_syndrome,
4668c2ecf20Sopenharmony_ci	raid6_have_avx2,
4678c2ecf20Sopenharmony_ci	"avx2x4",
4688c2ecf20Sopenharmony_ci	1			/* Has cache hints */
4698c2ecf20Sopenharmony_ci};
4708c2ecf20Sopenharmony_ci#endif
471