/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
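
/*
 * ST4()/ST5() emit their arguments only for the matching interleave factor,
 * so the bulk loops below can carry the 4-way and the 5-way code paths in a
 * single body.
 */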

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
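
	/*
	 * For reference, a minimal C model of the ECB entry points above;
	 * aes_encrypt_one()/aes_decrypt_one() are hypothetical single-block
	 * helpers, not part of this file:
	 *
	 *	void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *			     int rounds, int blocks)
	 *	{
	 *		while (blocks--) {
	 *			aes_encrypt_one(out, in, rk, rounds);
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 */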

AES_FUNC_START(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
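
	/*
	 * A rough C model of plain CBC encryption as implemented below,
	 * assuming the same hypothetical aes_encrypt_one() helper as above:
	 *
	 *	void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *			     int rounds, int blocks, u8 iv[])
	 *	{
	 *		while (blocks--) {
	 *			for (int i = 0; i < 16; i++)
	 *				iv[i] ^= in[i];		// iv ^= pt
	 *			aes_encrypt_one(iv, iv, rk, rounds);
	 *			memcpy(out, iv, 16);		// ct is next iv
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 *
	 * The ESSIV variants differ only in that the IV is first encrypted
	 * with the second key schedule (rk2, always AES-256) before use.
	 */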

AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
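
	/*
	 * These two handle only the final, possibly overlapping, pair of
	 * blocks of a CBC-CTS walk (16 < bytes <= 32).  With x4 = bytes - 16,
	 * the first block is loaded from in[0..15] and the tail from
	 * in[x4..x4+15]; .Lcts_permute_table (below) supplies the tbl/tbx
	 * masks that move the tail bytes into place and pad the short block
	 * with ciphertext stolen from the penultimate one.  E.g. for
	 * bytes = 20, x4 = 4: the tail load covers in[4..19], and the mask at
	 * offset 32 - x4 keeps only its last 4 bytes, i.e. in[16..19].
	 */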

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
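
/*
 * A 16-byte load from .Lcts_permute_table + x (0 < x <= 16) yields a mask
 * with which tbl places the first x source bytes at the end of the result,
 * while a load from offset 32 - x yields one that moves the last x source
 * bytes to the front; 0xff entries select zero with tbl and leave the
 * destination byte untouched with tbx.
 */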


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
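
	/*
	 * A minimal C sketch of the CTR entry point, assuming hypothetical
	 * aes_encrypt_one() and 128-bit big-endian increment be128_inc()
	 * helpers:
	 *
	 *	void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *			     int rounds, int blocks, u8 ctr[])
	 *	{
	 *		u8 ks[16];
	 *
	 *		while (blocks--) {
	 *			aes_encrypt_one(ks, ctr, rk, rounds);
	 *			be128_inc(ctr);
	 *			for (int i = 0; i < 16; i++)
	 *				out[i] = in[i] ^ ks[i];
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 *
	 * The code below keeps the byte-swapped low counter half in x6 so the
	 * fast path only rewrites vctr.d[1]; .Lctrcarry handles the rare
	 * carry into the upper 64 bits.
	 */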

AES_FUNC_START(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x6
	ld1		{vctr.16b}, [x5]

	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x6, x6
	cmn		w6, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lctr1x
	add		w7, w6, #1
	mov		v0.16b, vctr.16b
	add		w8, w6, #2
	mov		v1.16b, vctr.16b
	add		w9, w6, #3
	mov		v2.16b, vctr.16b
	rev		w7, w7
	mov		v3.16b, vctr.16b
	rev		w8, w8
ST5(	mov		v4.16b, vctr.16b		)
	mov		v1.s[3], w7
	rev		w9, w9
ST5(	add		w10, w6, #4			)
	mov		v2.s[3], w8
ST5(	rev		w10, w10			)
	mov		v3.s[3], w9
ST5(	mov		v4.s[3], w10			)
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	add		x6, x6, #MAX_STRIDE
	rev		x7, x6
	ins		vctr.d[1], x7
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lctrout
.Lctrloop:
	mov		v0.16b, vctr.16b
	encrypt_block	v0, w3, x2, x8, w7

	adds		x6, x6, #1		/* increment BE ctr */
	rev		x7, x6
	ins		vctr.d[1], x7
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtailblock:
	st1		{v0.16b}, [x0]
	b		.Lctrout

.Lctrcarry:
	umov		x7, vctr.d[0]		/* load upper word of ctr  */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		vctr.d[0], x7
	b		.Lctrcarrydone
AES_FUNC_END(aes_ctr_encrypt)


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
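
	/*
	 * In outline (a sketch, not a definitive model), with E/D denoting
	 * single-block AES and gf128mul_x() the GF(2^128) doubling done by
	 * the next_tweak macro below:
	 *
	 *	T = E_rk2(iv)			// only when 'first' is set
	 *	for each 16-byte block:
	 *		out = E_rk1(in ^ T) ^ T	// D_rk1 when decrypting
	 *		T = gf128mul_x(T)
	 *
	 * Tails that are not a multiple of 16 bytes are handled by stealing
	 * ciphertext from the last full block, reusing .Lcts_permute_table.
	 */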

	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
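
	/*
	 * next_tweak computes 2*T in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1:
	 * both 64-bit lanes are doubled (add), a carry out of the low lane
	 * becomes bit 0 of the high lane, and a carry out of the high lane is
	 * reduced by xoring 0x87 into the low byte.  The sshr/and/ext sequence
	 * derives exactly that cross-lane correction from the { 0x1, 0x87 }
	 * lanes of xtsmask.
	 */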

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
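
	/*
	 * This leaves xtsmask.2d = { 0x1, 0x87 }: the two 64-bit movi writes
	 * zero the upper half of each register, and uzp1 interleaves their
	 * even 32-bit lanes.
	 */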

AES_FUNC_START(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
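
	/*
	 * Roughly, in C (a sketch using the same hypothetical
	 * aes_encrypt_one() helper as above):
	 *
	 *	int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *			   int blocks, u8 dg[], int enc_before,
	 *			   int enc_after)
	 *	{
	 *		if (enc_before)
	 *			aes_encrypt_one(dg, dg, rk, rounds);
	 *
	 *		while (blocks) {
	 *			for (int i = 0; i < 16; i++)
	 *				dg[i] ^= in[i];
	 *			blocks--;
	 *			if (blocks || enc_after)
	 *				aes_encrypt_one(dg, dg, rk, rounds);
	 *			in += 16;
	 *		}
	 *		return blocks;	// blocks still to process; 0 here
	 *	}
	 *
	 * The asm version may also return early, with blocks still left to
	 * process, when cond_yield fires; the remaining block count is the
	 * return value so the caller can resume.
	 */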
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next 4 pt blocks */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)