/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

#define SM4_PREPARE()                                           \
	adr_l		x5, crypto_sm4_sbox;                    \
	ld1		{v16.16b-v19.16b}, [x5], #64;           \
	ld1		{v20.16b-v23.16b}, [x5], #64;           \
	ld1		{v24.16b-v27.16b}, [x5], #64;           \
	ld1		{v28.16b-v31.16b}, [x5];
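
/*
 * SM4_PREPARE() loads the 256-byte SM4 S-box (crypto_sm4_sbox) into
 * v16-v31, 16 bytes per register, so the ROUND macros below can perform
 * the byte substitution with a tbl/tbx lookup chain.  These registers must
 * stay live across all of the SM4_CRYPT_* macros; x5 is clobbered.
 */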

#define transpose_4x4(s0, s1, s2, s3)                           \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;
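
/*
 * transpose_4x4() treats s0-s3 as the rows of a 4x4 matrix of 32-bit words
 * and transposes it in place (zip1/zip2 on .4s lanes, then on .2d lanes),
 * clobbering RTMP0-RTMP3.  It converts four blocks held in block order
 * (one block per register) into the word-sliced order the ROUND macros
 * work on: register n then holds word n of all four blocks.
 */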

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		RTMP4.4s, s4.4s, s5.4s;                 \
	zip1		RTMP5.4s, s6.4s, s7.4s;                 \
	zip2		RTMP6.4s, s4.4s, s5.4s;                 \
	zip2		RTMP7.4s, s6.4s, s7.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;              \
	zip1		s4.2d, RTMP4.2d, RTMP5.2d;              \
	zip2		s5.2d, RTMP4.2d, RTMP5.2d;              \
	zip1		s6.2d, RTMP6.2d, RTMP7.2d;              \
	zip2		s7.2d, RTMP6.2d, RTMP7.2d;

#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;
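
/*
 * rotate_clockwise_4x4() is the transpose above combined with a reversal of
 * the four words within each output block.  After the last round it turns
 * the word-sliced state back into per-block order while applying the SM4
 * output word reversal (the ciphertext is the last four round states in
 * reverse order).  RTMP0-RTMP3 are clobbered; the _2x variant also
 * clobbers RTMP4-RTMP7.
 */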

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		RTMP4.4s, s5.4s, s4.4s;                 \
	zip1		RTMP6.4s, s7.4s, s6.4s;                 \
	zip2		RTMP5.4s, s5.4s, s4.4s;                 \
	zip2		RTMP7.4s, s7.4s, s6.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;              \
	zip1		s4.2d, RTMP6.2d, RTMP4.2d;              \
	zip2		s5.2d, RTMP6.2d, RTMP4.2d;              \
	zip1		s6.2d, RTMP7.2d, RTMP5.2d;              \
	zip2		s7.2d, RTMP7.2d, RTMP5.2d;

#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, #2;                 \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;
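
/*
 * For reference, ROUND4() performs one SM4 round on four blocks in
 * parallel, one block per 32-bit lane.  Per lane it computes roughly:
 *
 *	x  = rk[round] ^ s1 ^ s2 ^ s3;
 *	t  = tau(x);	(tau = S-box on each byte, via the tbl/tbx chain;
 *			 the index is reduced by 64 between lookups since
 *			 each tbl/tbx step only covers 64 table bytes)
 *	s0 ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24);
 *
 * The linear transform is assembled as RTMP1 = t ^ rol32(t, 8) ^
 * rol32(t, 16) and RTMP3 = t ^ rol32(t, 24) ^ rol32(RTMP1, 2), which
 * expands to the same expression.
 */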

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
	mov		x6, 8;                                  \
4:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND4(0, b0, b1, b2, b3);                              \
	ROUND4(1, b1, b2, b3, b0);                              \
	ROUND4(2, b2, b3, b0, b1);                              \
	ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
	bne		4b;                                     \
                                                                \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	rotate_clockwise_4x4(b0, b1, b2, b3);                   \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;
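
/*
 * SM4_CRYPT_BLK4_BE() runs the full 32 rounds (8 loop iterations, 4 rounds
 * each) on four blocks whose words are already in big-endian order,
 * consuming the 32 round keys at [x0] and rewinding x0 by 8 * 16 = 128
 * bytes afterwards so the key pointer can be reused for the next batch.
 * The result is rev32'd back and rotated into per-block order, ready for
 * st1.  x6, RX0, RKEY and RTMP0-RTMP3 are clobbered.
 */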

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;
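
/*
 * ROUND8() is ROUND4() applied to two independent groups of four blocks
 * (s0-s3 and t0-t3), with the two dependency chains interleaved so the
 * S-box lookups and shift/eor work of one group can overlap the other's
 * latency.  RX1 serves as the second group's working register.
 */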

#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	mov		x6, 8;                                  \
8:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
	bne		8b;                                     \
                                                                \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;
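
/*
 * SM4_CRYPT_BLK8_norotate() leaves the eight blocks in word-sliced order.
 * Note that RTMP4-RTMP7 alias RX0/RX1/RKEY/RIV (v12-v15), so
 * rotate_clockwise_4x4_2x() would clobber RIV; CBC decryption below
 * therefore uses this variant and rotates the two halves separately with
 * rotate_clockwise_4x4().
 */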

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)			\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);	\


.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
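	/*
	 * Roughly equivalent to a C prototype along the lines of
	 * (illustrative only, the glue code is authoritative):
	 *
	 *	void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src,
	 *			    unsigned int nblocks);
	 *
	 * Blocks are handled 8 at a time, then 4 at a time, then a
	 * 1-3 block tail.
	 */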
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_4x

	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w3, .Lcrypt_end
	b		.Lcrypt_loop_8x

.Lcrypt_4x:
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail

	sub		w3, w3, #4

	ld4		{v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail:
	cmp		w3, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcrypt_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcrypt_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w3, #2
	st1		{v0.16b}, [x1], #16
	blt		.Lcrypt_end
	st1		{v1.16b}, [x1], #16
	beq		.Lcrypt_end
	st1		{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
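	/*
	 * CBC decryption: decrypt a batch, then XOR each block with the
	 * previous ciphertext block (RIV for the first one).  The ciphertext
	 * is reloaded with ld1 after decryption so plain block-order copies
	 * are available for the XOR, and the last ciphertext block of the
	 * batch becomes the new IV.
	 */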
	SM4_PREPARE()

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld4		{v0.4s-v3.4s}, [x2], #64
	ld4		{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Avoid overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub		x2, x2, #64

	eor		v0.16b, v0.16b, RIV.16b

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	eor		v4.16b, v4.16b, RTMP3.16b
	eor		v5.16b, v5.16b, RTMP4.16b
	eor		v6.16b, v6.16b, RTMP5.16b
	eor		v7.16b, v7.16b, RTMP6.16b

	mov		RIV.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_tail

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b
	rev32		v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor		v4.16b, v4.16b, RIV.16b
	eor		v5.16b, v5.16b, v0.16b
	eor		v6.16b, v6.16b, v1.16b
	eor		v7.16b, v7.16b, v2.16b

	mov		RIV.16b, v3.16b

	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp		w4, #2
	ld1		{v0.16b}, [x2], #16
	blt		.Lcbc_dec_tail_load_done
	ld1		{v1.16b}, [x2], #16
	beq		.Lcbc_dec_tail_load_done
	ld1		{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32		v4.16b, v0.16b
	rev32		v5.16b, v1.16b
	rev32		v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp		w4, #2
	eor		v4.16b, v4.16b, RIV.16b
	mov		RIV.16b, v0.16b
	st1		{v4.16b}, [x1], #16
	blt		.Lcbc_dec_end

	eor		v5.16b, v5.16b, v0.16b
	mov		RIV.16b, v1.16b
	st1		{v5.16b}, [x1], #16
	beq		.Lcbc_dec_end

	eor		v6.16b, v6.16b, v1.16b
	mov		RIV.16b, v2.16b
	st1		{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)

.align 3
SYM_FUNC_START(sm4_neon_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
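	/*
	 * CFB decryption: encrypt the IV and the first nblocks - 1
	 * ciphertext blocks, XOR the resulting keystream with the
	 * ciphertext, and carry the last ciphertext block forward in v0
	 * as the new IV.
	 */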
	SM4_PREPARE()

	ld1		{v0.16b}, [x3]

.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v1.16b-v3.16b}, [x2], #48
	ld4		{v4.4s-v7.4s}, [x2]

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	mov		v0.16b, RTMP7.16b

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_tail

	sub		w4, w4, #4

	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b
	rev32		v3.16b, v6.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	mov		v0.16b, v7.16b

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_tail:
	cmp		w4, #2
	ld1		{v4.16b}, [x2], #16
	blt		.Lcfb_dec_tail_load_done
	ld1		{v5.16b}, [x2], #16
	beq		.Lcfb_dec_tail_load_done
	ld1		{v6.16b}, [x2], #16

.Lcfb_dec_tail_load_done:
	rev32		v0.16b, v0.16b		/* v0 is IV register */
	rev32		v1.16b, v4.16b
	rev32		v2.16b, v5.16b

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)

	cmp		w4, #2
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	mov		v0.16b, v4.16b
	blt		.Lcfb_dec_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	mov		v0.16b, v5.16b
	beq		.Lcfb_dec_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16
	mov		v0.16b, v6.16b

.Lcfb_dec_end:
	/* store new IV */
	st1		{v0.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cfb_dec)

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_crypt_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)                             \
		mov		vctr.d[1], x8;      \
		mov		vctr.d[0], x7;      \
		adds		x8, x8, #1;         \
		rev64		vctr.16b, vctr.16b; \
		adc		x7, x7, xzr;
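
	/*
	 * The 128-bit counter is kept in x7 (high 64 bits) and x8 (low 64
	 * bits) as native-endian integers, hence the rev on load and store.
	 * inc_le128() materialises the current value as a big-endian block
	 * in vctr (mov to both doublewords, then rev64) and increments the
	 * x7:x8 pair with adds/adc, interleaving the scalar and vector work.
	 */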

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 will change the sign bit */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)