/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

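/*
 * The SM4 instructions are emitted as raw opcodes via .inst below, so the
 * file assembles even where the assembler lacks the SM4 mnemonics.  The
 * .irp block defines a .Lv<n>.4s symbol for every vector register that is
 * ever passed to the sm4e/sm4ekey macros, mapping the register name used
 * in the source to its encoding number.
 */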
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
		20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];

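	/*
	 * The decryption round keys are the encryption round keys in reverse
	 * order.  .Lbswap128_mask (in v24) reverses the four 32-bit words
	 * inside each vector, so storing the permuted v7..v0 yields
	 * rkey_dec[i] = rkey_enc[31 - i].
	 */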
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)

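	/*
	 * Process eight blocks per iteration while at least eight remain;
	 * tbnz on the sign bit catches the underflow of the block count.
	 * The tail is handled in a group of four and then one by one.
	 */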
.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

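	/*
	 * CBC encryption is inherently serial: each block is XORed with the
	 * previous ciphertext before it is encrypted, so the 4x loop only
	 * batches the loads and stores, not the SM4 rounds themselves.
	 */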
.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

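	/*
	 * CBC decryption can run eight blocks in parallel.  The ciphertext is
	 * kept in v0-v7 so it can serve as the chaining value for the XOR
	 * after decryption; rev32 both copies it into v8-v15 and performs the
	 * byte-order conversion that the non-_BE macros would do on entry.
	 */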
.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

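	/* x5 = length of the final partial block (nbytes - 16) */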
	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

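	/*
	 * Like CBC, CFB encryption is serial: the keystream for each block is
	 * the encryption of the previous ciphertext block, so the 4x loop
	 * only amortises the loads and stores.
	 */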
.Lcfb_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcfb_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v0.16b, v0.16b, v8.16b

	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v1.16b, v1.16b, v8.16b

	rev32		v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v2.16b, v2.16b, v8.16b

	rev32		v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v3.16b, v3.16b, v8.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcfb_enc_end
	b		.Lcfb_enc_loop_4x

.Lcfb_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

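	/*
	 * CFB decryption parallelises: once the ciphertext is loaded, the
	 * keystream blocks E(IV), E(C0), ..., E(C6) are all independent, so
	 * up to eight of them are computed per iteration.
	 */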
.Lcfb_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b
	rev32		v12.16b, v3.16b
	rev32		v13.16b, v4.16b
	rev32		v14.16b, v5.16b
	rev32		v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	mov		RIV.16b, v7.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end
	b		.Lcfb_dec_loop_8x

.Lcfb_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	mov		RIV.16b, v3.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)

	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

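	/*
	 * Keep the 128-bit big-endian counter in x7 (most significant half)
	 * and x8 (least significant half) so it can be incremented with
	 * plain adds/adc arithmetic.
	 */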
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

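/*
 * inc_le128 materialises the current counter value into vctr (rev64 converts
 * it back to the big-endian byte order the cipher expects in memory) and
 * then advances the 128-bit counter held in x7:x8 by one, with carry.
 */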
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


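/*
 * tweak_next(vt, vin, RTMP): vt = vin * x in GF(2^128), i.e. the next XTS
 * tweak.  The 128-bit value is shifted left by one bit (the .2d add plus a
 * carry from the low into the high half) and, if the top bit fell off, it is
 * reduced by the XTS polynomial 0x87.  RMASK holds { 0x1, 0x87 } for exactly
 * these two fix-ups.
 */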
#define tweak_next(vt, vin, RTMP)					\
		sshr		RTMP.2d, vin.2d, #63;			\
		and		RTMP.16b, RTMP.16b, RMASK.16b;		\
		add		vt.2d, vin.2d, vin.2d;			\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor		vt.16b, vt.16b, RTMP.16b;

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
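	/*
	 * x5 may be NULL, in which case the tweak at [x3] is used as-is,
	 * typically because a previous call has already encrypted it.
	 */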
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

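	/*
	 * w4 = number of whole 16-byte blocks, x5 = length of the partial
	 * tail.  If the length is not block aligned, the last whole block is
	 * held back from the bulk loops for ciphertext stealing.
	 */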
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

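	/* RMASK.2d = { 0x1, 0x87 }: carry and reduction constants for tweak_next() */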
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

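	/*
	 * If the caller does not want the final block encrypted
	 * (enc_after == 0), hold it back from the loops; it is only XORed
	 * into the digest at .Lmac_end.
	 */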
	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x


.Lmac_end:
	cbnz		w5, .Lmac_ret

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


	.section	".rodata", "a"
	.align 4
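/* tbl mask that reverses the order of the four 32-bit words in a vector */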
.Lbswap128_mask:
	.byte		0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte		0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

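/*
 * Permute table for ciphertext stealing: 0xff entries make tbl produce zero
 * (and leave tbx untouched), while 0x0-0xf select bytes of the source, so
 * indexing the table at an offset derived from the tail length yields the
 * masks used to split and merge the final partial block.
 */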
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff