// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */
88c2ecf20Sopenharmony_ci
98c2ecf20Sopenharmony_ci#include <linux/raid/xor.h>
108c2ecf20Sopenharmony_ci#include <linux/module.h>
118c2ecf20Sopenharmony_ci#include <asm/neon-intrinsics.h>
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_civoid xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
148c2ecf20Sopenharmony_ci	unsigned long *p2)
158c2ecf20Sopenharmony_ci{
168c2ecf20Sopenharmony_ci	uint64_t *dp1 = (uint64_t *)p1;
178c2ecf20Sopenharmony_ci	uint64_t *dp2 = (uint64_t *)p2;
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci	register uint64x2_t v0, v1, v2, v3;
208c2ecf20Sopenharmony_ci	long lines = bytes / (sizeof(uint64x2_t) * 4);
218c2ecf20Sopenharmony_ci
228c2ecf20Sopenharmony_ci	do {
238c2ecf20Sopenharmony_ci		/* p1 ^= p2 */
248c2ecf20Sopenharmony_ci		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
258c2ecf20Sopenharmony_ci		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
268c2ecf20Sopenharmony_ci		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
278c2ecf20Sopenharmony_ci		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_ci		/* store */
308c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  0, v0);
318c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  2, v1);
328c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  4, v2);
338c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  6, v3);
348c2ecf20Sopenharmony_ci
358c2ecf20Sopenharmony_ci		dp1 += 8;
368c2ecf20Sopenharmony_ci		dp2 += 8;
378c2ecf20Sopenharmony_ci	} while (--lines > 0);
388c2ecf20Sopenharmony_ci}
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_civoid xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
418c2ecf20Sopenharmony_ci	unsigned long *p2, unsigned long *p3)
428c2ecf20Sopenharmony_ci{
438c2ecf20Sopenharmony_ci	uint64_t *dp1 = (uint64_t *)p1;
448c2ecf20Sopenharmony_ci	uint64_t *dp2 = (uint64_t *)p2;
458c2ecf20Sopenharmony_ci	uint64_t *dp3 = (uint64_t *)p3;
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_ci	register uint64x2_t v0, v1, v2, v3;
488c2ecf20Sopenharmony_ci	long lines = bytes / (sizeof(uint64x2_t) * 4);
498c2ecf20Sopenharmony_ci
508c2ecf20Sopenharmony_ci	do {
518c2ecf20Sopenharmony_ci		/* p1 ^= p2 */
528c2ecf20Sopenharmony_ci		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
538c2ecf20Sopenharmony_ci		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
548c2ecf20Sopenharmony_ci		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
558c2ecf20Sopenharmony_ci		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ci		/* p1 ^= p3 */
588c2ecf20Sopenharmony_ci		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
598c2ecf20Sopenharmony_ci		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
608c2ecf20Sopenharmony_ci		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
618c2ecf20Sopenharmony_ci		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
628c2ecf20Sopenharmony_ci
638c2ecf20Sopenharmony_ci		/* store */
648c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  0, v0);
658c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  2, v1);
668c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  4, v2);
678c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  6, v3);
688c2ecf20Sopenharmony_ci
698c2ecf20Sopenharmony_ci		dp1 += 8;
708c2ecf20Sopenharmony_ci		dp2 += 8;
718c2ecf20Sopenharmony_ci		dp3 += 8;
728c2ecf20Sopenharmony_ci	} while (--lines > 0);
738c2ecf20Sopenharmony_ci}
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_civoid xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
768c2ecf20Sopenharmony_ci	unsigned long *p2, unsigned long *p3, unsigned long *p4)
778c2ecf20Sopenharmony_ci{
788c2ecf20Sopenharmony_ci	uint64_t *dp1 = (uint64_t *)p1;
798c2ecf20Sopenharmony_ci	uint64_t *dp2 = (uint64_t *)p2;
808c2ecf20Sopenharmony_ci	uint64_t *dp3 = (uint64_t *)p3;
818c2ecf20Sopenharmony_ci	uint64_t *dp4 = (uint64_t *)p4;
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_ci	register uint64x2_t v0, v1, v2, v3;
848c2ecf20Sopenharmony_ci	long lines = bytes / (sizeof(uint64x2_t) * 4);
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci	do {
878c2ecf20Sopenharmony_ci		/* p1 ^= p2 */
888c2ecf20Sopenharmony_ci		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
898c2ecf20Sopenharmony_ci		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
908c2ecf20Sopenharmony_ci		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
918c2ecf20Sopenharmony_ci		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci		/* p1 ^= p3 */
948c2ecf20Sopenharmony_ci		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
958c2ecf20Sopenharmony_ci		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
968c2ecf20Sopenharmony_ci		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
978c2ecf20Sopenharmony_ci		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
988c2ecf20Sopenharmony_ci
998c2ecf20Sopenharmony_ci		/* p1 ^= p4 */
1008c2ecf20Sopenharmony_ci		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
1018c2ecf20Sopenharmony_ci		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
1028c2ecf20Sopenharmony_ci		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
1038c2ecf20Sopenharmony_ci		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci		/* store */
1068c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  0, v0);
1078c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  2, v1);
1088c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  4, v2);
1098c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  6, v3);
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_ci		dp1 += 8;
1128c2ecf20Sopenharmony_ci		dp2 += 8;
1138c2ecf20Sopenharmony_ci		dp3 += 8;
1148c2ecf20Sopenharmony_ci		dp4 += 8;
1158c2ecf20Sopenharmony_ci	} while (--lines > 0);
1168c2ecf20Sopenharmony_ci}
1178c2ecf20Sopenharmony_ci
1188c2ecf20Sopenharmony_civoid xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
1198c2ecf20Sopenharmony_ci	unsigned long *p2, unsigned long *p3,
1208c2ecf20Sopenharmony_ci	unsigned long *p4, unsigned long *p5)
1218c2ecf20Sopenharmony_ci{
1228c2ecf20Sopenharmony_ci	uint64_t *dp1 = (uint64_t *)p1;
1238c2ecf20Sopenharmony_ci	uint64_t *dp2 = (uint64_t *)p2;
1248c2ecf20Sopenharmony_ci	uint64_t *dp3 = (uint64_t *)p3;
1258c2ecf20Sopenharmony_ci	uint64_t *dp4 = (uint64_t *)p4;
1268c2ecf20Sopenharmony_ci	uint64_t *dp5 = (uint64_t *)p5;
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_ci	register uint64x2_t v0, v1, v2, v3;
1298c2ecf20Sopenharmony_ci	long lines = bytes / (sizeof(uint64x2_t) * 4);
1308c2ecf20Sopenharmony_ci
1318c2ecf20Sopenharmony_ci	do {
1328c2ecf20Sopenharmony_ci		/* p1 ^= p2 */
1338c2ecf20Sopenharmony_ci		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
1348c2ecf20Sopenharmony_ci		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
1358c2ecf20Sopenharmony_ci		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
1368c2ecf20Sopenharmony_ci		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_ci		/* p1 ^= p3 */
1398c2ecf20Sopenharmony_ci		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
1408c2ecf20Sopenharmony_ci		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
1418c2ecf20Sopenharmony_ci		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
1428c2ecf20Sopenharmony_ci		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci		/* p1 ^= p4 */
1458c2ecf20Sopenharmony_ci		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
1468c2ecf20Sopenharmony_ci		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
1478c2ecf20Sopenharmony_ci		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
1488c2ecf20Sopenharmony_ci		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci		/* p1 ^= p5 */
1518c2ecf20Sopenharmony_ci		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
1528c2ecf20Sopenharmony_ci		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
1538c2ecf20Sopenharmony_ci		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
1548c2ecf20Sopenharmony_ci		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci		/* store */
1578c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  0, v0);
1588c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  2, v1);
1598c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  4, v2);
1608c2ecf20Sopenharmony_ci		vst1q_u64(dp1 +  6, v3);
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci		dp1 += 8;
1638c2ecf20Sopenharmony_ci		dp2 += 8;
1648c2ecf20Sopenharmony_ci		dp3 += 8;
1658c2ecf20Sopenharmony_ci		dp4 += 8;
1668c2ecf20Sopenharmony_ci		dp5 += 8;
1678c2ecf20Sopenharmony_ci	} while (--lines > 0);
1688c2ecf20Sopenharmony_ci}
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_cistruct xor_block_template const xor_block_inner_neon = {
1718c2ecf20Sopenharmony_ci	.name	= "__inner_neon__",
1728c2ecf20Sopenharmony_ci	.do_2	= xor_arm64_neon_2,
1738c2ecf20Sopenharmony_ci	.do_3	= xor_arm64_neon_3,
1748c2ecf20Sopenharmony_ci	.do_4	= xor_arm64_neon_4,
1758c2ecf20Sopenharmony_ci	.do_5	= xor_arm64_neon_5,
1768c2ecf20Sopenharmony_ci};
1778c2ecf20Sopenharmony_ciEXPORT_SYMBOL(xor_block_inner_neon);
1788c2ecf20Sopenharmony_ci
/* Module metadata: author, description and license strings. */
MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
182