// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Author: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

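/*
 * XOR helpers, one per number of source buffers. Each loop iteration
 * XORs 64 bytes (four 128-bit NEON vectors) into p1; the do/while
 * assumes @bytes is a non-zero multiple of 64. NEON registers may only
 * be used with the FPSIMD state claimed, so callers are expected to
 * bracket these with kernel_neon_begin()/kernel_neon_end().
 */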
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

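/* p1 ^= p2 ^ p3: same 64-bytes-per-iteration contract as above. */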
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

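/* p1 ^= p2 ^ p3 ^ p4 */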
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

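/* p1 ^= p2 ^ p3 ^ p4 ^ p5 */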
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

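/*
 * Template picked up by the generic xor_blocks() benchmarking code;
 * on arm64 it is normally reached via the wrappers in
 * arch/arm64/include/asm/xor.h, which take care of claiming the NEON
 * unit around each call.
 */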
struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");