/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */

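	/*
	 * For reference, a minimal C sketch of the structure shared by the two
	 * ECB routines below (aes_encrypt_one() is a hypothetical single-block
	 * helper, not part of this file):
	 *
	 *	void ecb_crypt(u8 out[], u8 const in[], u8 const rk[],
	 *		       int rounds, int blocks)
	 *	{
	 *		while (blocks--) {
	 *			aes_encrypt_one(out, in, rk, rounds);
	 *			in += 16;
	 *			out += 16;
	 *		}
	 *	}
	 *
	 * The assembly additionally interleaves MAX_STRIDE independent blocks
	 * per iteration to hide the latency of the AES instructions before
	 * falling back to single blocks for the remainder.
	 */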
AES_FUNC_START(aes_ecb_encrypt)
	frame_push	0

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	frame_push	0

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */

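	/*
	 * For reference, CBC encryption chains each ciphertext block into the
	 * next plaintext block, so the encrypt routines below process blocks
	 * strictly in order (C sketch; xor_block() and aes_encrypt_one() are
	 * hypothetical helpers):
	 *
	 *	memcpy(cv, iv, 16);
	 *	for (i = 0; i < blocks; i++) {
	 *		xor_block(cv, cv, in + 16 * i);		// CV ^= P[i]
	 *		aes_encrypt_one(cv, cv, rk, rounds);	// C[i] = E(CV)
	 *		memcpy(out + 16 * i, cv, 16);
	 *	}
	 *	memcpy(iv, cv, 16);				// return next IV
	 *
	 * The ESSIV variants first encrypt the caller-provided IV with the
	 * second key schedule (rk2) and use the result as the CBC IV.
	 */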
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	frame_push	0
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	frame_pop
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */

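	/*
	 * For reference, the encrypt routine below implements CBC with
	 * ciphertext stealing for a final 16 + r bytes (0 < r <= 16):
	 *
	 *	X      = E(P[n-1] ^ CV)			// CV = IV or previous C
	 *	C[n-1] = E((P[n] || 0^(16-r)) ^ X)	// full block, stored first
	 *	C[n]   = first r bytes of X		// short block, stored last
	 *
	 * The permute table defined after these routines is used with tbl/tbx
	 * to build the shifted and zero-padded vectors, and the final blocks
	 * are written with overlapping stores so no partial-width stores are
	 * needed.
	 */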
AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
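	/*
	 * Layout of .Lcts_permute_table: entries 16-31 hold the byte indices
	 * 0-15 and are surrounded by 0xff on both sides.  Taking a 16-entry
	 * window at a variable offset and using it with tbl/tbx shifts a
	 * vector by a variable number of bytes: 0xff indices yield zero bytes
	 * with tbl and leave the destination byte unchanged with tbx.
	 */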

	/*
	 * This macro generates the code for CTR and XCTR mode.
	 */
.macro ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W 	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	frame_push	0

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1		{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
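	/*
	 * For reference, keystream block n is generated as follows (a sketch,
	 * with counter arithmetic limited to 64 bits as in the code below):
	 *
	 *   CTR:  E(IV + n), treating the IV as a 128-bit big-endian integer;
	 *	   the low 64 bits live in IV_PART and a carry into the upper
	 *	   half is handled out of line.
	 *   XCTR: E(IV ^ n), with the 1-based block number n placed
	 *	   little-endian in the low 64 bits only.
	 */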
	.if \xctr
		umov		IV_PART, vctr.d[0]
		lsr		CTR_W, BYTE_CTR_W, #4
	.else
		umov		IV_PART, vctr.d[1]
		rev		IV_PART, IV_PART
	.endif

.LctrloopNx\xctr:
	add		BLOCKS_W, BYTES_W, #15
	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr		BLOCKS_W, BLOCKS_W, #4
	mov		w8, #MAX_STRIDE
	cmp		BLOCKS_W, w8
	csel		BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
	 */
	.if \xctr
		add		CTR, CTR, BLOCKS
	.else
		adds		IV_PART, IV_PART, BLOCKS
	.endif
	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	.if \xctr
		sub		x6, CTR, #MAX_STRIDE - 1
		sub		x7, CTR, #MAX_STRIDE - 2
		sub		x8, CTR, #MAX_STRIDE - 3
		sub		x9, CTR, #MAX_STRIDE - 4
ST5(		sub		x10, CTR, #MAX_STRIDE - 5	)
		eor		x6, x6, IV_PART
		eor		x7, x7, IV_PART
		eor		x8, x8, IV_PART
		eor		x9, x9, IV_PART
ST5(		eor		x10, x10, IV_PART		)
		mov		v0.d[0], x6
		mov		v1.d[0], x7
		mov		v2.d[0], x8
		mov		v3.d[0], x9
ST5(		mov		v4.d[0], x10			)
	.else
		bcs		0f
		.subsection	1
		/*
		 * This subsection handles carries.
		 *
		 * Conditional branching here is allowed with respect to time
		 * invariance since the branches are dependent on the IV instead
		 * of the plaintext or key.  This code is rarely executed in
		 * practice anyway.
		 */

		/* Apply carry to outgoing counter. */
0:		umov		x8, vctr.d[0]
		rev		x8, x8
		add		x8, x8, #1
		rev		x8, x8
		ins		vctr.d[0], x8

		/*
		 * Apply carry to counter blocks if needed.
		 *
		 * Since the carry flag was set, we know 0 <= IV_PART <
		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
		 * many counter blocks need to be updated.
		 */
		cbz		IV_PART, 2f
		adr		x16, 1f
		sub		x16, x16, IV_PART, lsl #3
		br		x16
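		/*
		 * Each entry below is 8 bytes (bti c + mov), so branching to
		 * 1f - 8 * IV_PART executes only the final IV_PART entries,
		 * i.e. only the counter blocks whose low 64 bits wrapped
		 * receive the incremented upper half.
		 */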
		bti		c
		mov		v0.d[0], vctr.d[0]
		bti		c
		mov		v1.d[0], vctr.d[0]
		bti		c
		mov		v2.d[0], vctr.d[0]
		bti		c
		mov		v3.d[0], vctr.d[0]
ST5(		bti		c				)
ST5(		mov		v4.d[0], vctr.d[0]		)
1:		b		2f
		.previous

2:		rev		x7, IV_PART
		ins		vctr.d[1], x7
		sub		x7, IV_PART, #MAX_STRIDE - 1
		sub		x8, IV_PART, #MAX_STRIDE - 2
		sub		x9, IV_PART, #MAX_STRIDE - 3
		rev		x7, x7
		rev		x8, x8
		mov		v1.d[1], x7
		rev		x9, x9
ST5(		sub		x10, IV_PART, #MAX_STRIDE - 4	)
		mov		v2.d[1], x8
ST5(		rev		x10, x10			)
		mov		v3.d[1], x9
ST5(		mov		v4.d[1], x10			)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store.  Otherwise jump to tail handling.
	 */
	tbnz		BYTES_W, #31, .Lctrtail\xctr
	ld1		{v5.16b-v7.16b}, [IN], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [IN], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [OUT], #64
ST5(	st1		{v4.16b}, [OUT], #16		)
	cbz		BYTES_W, .Lctrout\xctr
	b		.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
		st1		{vctr.16b}, [IV] /* return next CTR value */
	.endif
	frame_pop
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks.  This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
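	/*
	 * Worked example (MAX_STRIDE=4, 20 bytes remaining): the full block is
	 * written at out + 0 .. out + 15, and the final 4 bytes are covered by
	 * a 16-byte vector written at out + 4 .. out + 19; the 12 overlapping
	 * bytes are written more than once, with the last store supplying the
	 * correct ciphertext.
	 */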
	mov		x16, #16
	ands		w7, BYTES_W, #0xf
	csel		x13, x7, x16, ne

ST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel		x14, x16, xzr, gt		)
	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l		x9, .Lcts_permute_table
	add		x9, x9, x13
	ble		.Lctrtail1x\xctr

ST5(	ld1		{v5.16b}, [IN], x14		)
	ld1		{v6.16b}, [IN], x15
	ld1		{v7.16b}, [IN], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	ld1		{v8.16b}, [IN], x13
	ld1		{v9.16b}, [IN]
	ld1		{v10.16b}, [x9]

ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

ST5(	st1		{v5.16b}, [OUT], x14		)
	st1		{v6.16b}, [OUT], x15
	st1		{v7.16b}, [OUT], x16
	add		x13, x13, OUT
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [OUT]
	b		.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes.  To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the output
	 * relative to the end of the buffers rather than relative to the start.
	 * Consequently, when encrypting/decrypting less than 16 bytes, the data
	 * is expected to sit at the end of the 16-byte temporary buffer rather
	 * than at its start.
	 */
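	/*
	 * Two windows of the permute table are loaded below: v10 shifts the
	 * keystream block so that it lines up with the data at the end of the
	 * 16-byte window, and v11 (after sshr) becomes a byte mask that is
	 * all-ones over the message bytes, so that bif preserves the existing
	 * output bytes in front of the message before the full 16-byte store.
	 */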
	sub		x8, x7, #16
	csel		x7, x7, x8, eq
	add		IN, IN, x7
	add		OUT, OUT, x7
	ld1		{v5.16b}, [IN]
	ld1		{v6.16b}, [OUT]
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1		{v10.16b-v11.16b}, [x9]
	tbl		v3.16b, {v3.16b}, v10.16b
	sshr		v11.16b, v11.16b, #7
	eor		v5.16b, v5.16b, v3.16b
	bif		v5.16b, v6.16b, v11.16b
	st1		{v5.16b}, [OUT]
	b		.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
.endm

	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[])
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */

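	/*
	 * For reference, a caller handling a final partial block is expected
	 * to bounce it through a 16-byte buffer with the data placed at the
	 * end, along these lines (a sketch, not the actual C glue code):
	 *
	 *	u8 buf[16];
	 *	u8 *tail = buf + 16 - bytes;
	 *
	 *	memcpy(tail, src, bytes);
	 *	aes_ctr_encrypt(tail, tail, rk, rounds, bytes, ctr);
	 *	memcpy(dst, tail, bytes);
	 */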
AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt 0
AES_FUNC_END(aes_ctr_encrypt)

	/*
	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 const iv[], int byte_ctr)
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */

AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt 1
AES_FUNC_END(aes_xctr_encrypt)


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
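	/*
	 * For reference, XTS processes block i of the message as
	 *
	 *	T[0]   = E_rk2(iv)
	 *	T[i+1] = T[i] * x	(in GF(2^128), modulo x^128 + x^7 + x^2 + x + 1)
	 *	C[i]   = E_rk1(P[i] ^ T[i]) ^ T[i]
	 *
	 * with decryption identical except that D_rk1 replaces E_rk1.  The
	 * next_tweak macro below implements the multiplication by x: the
	 * 128-bit tweak is shifted left by one bit and, when a carry comes out
	 * of the top bit, 0x87 is xored into the lowest byte.
	 */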

	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm

AES_FUNC_START(aes_xts_encrypt)
	frame_push	0

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	frame_pop
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	frame_push	0

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	frame_pop
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
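	/*
	 * For reference, this performs a CBC-MAC style update over full blocks
	 * (sketch):
	 *
	 *	if (enc_before)
	 *		dg = E(dg);
	 *	while (blocks--) {
	 *		dg ^= *in++;			// 16 bytes at a time
	 *		if (blocks || enc_after)
	 *			dg = E(dg);
	 *	}
	 *
	 * Leaving the last xor unencrypted (enc_after == 0) allows the caller
	 * to complete the final block later, e.g. after xoring in a CMAC/XCBC
	 * subkey.  The routine returns the number of blocks left unprocessed
	 * when cond_yield requests an early exit (zero otherwise).
	 */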
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)