162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci/* included by aes-ce.S and aes-neon.S */
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci	.text
1162306a36Sopenharmony_ci	.align		4
1262306a36Sopenharmony_ci
/*
 * MAX_STRIDE is the number of AES blocks handled per iteration by the ECB,
 * CBC-decrypt and CTR/XCTR loops in this file.  A file that includes this
 * one may define it beforehand; otherwise it defaults to 4.
 */
1362306a36Sopenharmony_ci#ifndef MAX_STRIDE
1462306a36Sopenharmony_ci#define MAX_STRIDE	4
1562306a36Sopenharmony_ci#endif
1662306a36Sopenharmony_ci
/*
 * ST4(...) keeps its argument only when MAX_STRIDE == 4.  ST5(...) keeps its
 * argument for any other value, which the code below treats as a stride of
 * 5.  The variant that is not selected expands to nothing, so 4-way and 5-way
 * instructions can be interleaved in a single instruction stream.
 */
1762306a36Sopenharmony_ci#if MAX_STRIDE == 4
1862306a36Sopenharmony_ci#define ST4(x...) x
1962306a36Sopenharmony_ci#define ST5(x...)
2062306a36Sopenharmony_ci#else
2162306a36Sopenharmony_ci#define ST4(x...)
2262306a36Sopenharmony_ci#define ST5(x...) x
2362306a36Sopenharmony_ci#endif
2462306a36Sopenharmony_ci
/*
 * File-local helper: expands the encrypt_block4x macro on v0-v3 and returns.
 *
 * The callers visible in this file reach it with the round count in w3 and
 * the round-key pointer in x2.  x8 and w7 are also handed to the macro,
 * presumably as scratch registers (the macro is supplied by the including
 * file -- confirm there).  It is entered with bl, so x30 is overwritten and
 * the caller has to preserve its own return address.
 */
2562306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(aes_encrypt_block4x)
2662306a36Sopenharmony_ci	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
2762306a36Sopenharmony_ci	ret
2862306a36Sopenharmony_ciSYM_FUNC_END(aes_encrypt_block4x)
2962306a36Sopenharmony_ci
/*
 * File-local helper: expands the decrypt_block4x macro on v0-v3 and returns.
 *
 * Same register convention as the 4-way encrypt helper: w3 = rounds,
 * x2 = round keys, with x8 and w7 passed to the macro (presumably scratch;
 * the macro is supplied by the including file -- confirm there).  Entered
 * with bl, so the caller must preserve x30.
 */
3062306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(aes_decrypt_block4x)
3162306a36Sopenharmony_ci	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
3262306a36Sopenharmony_ci	ret
3362306a36Sopenharmony_ciSYM_FUNC_END(aes_decrypt_block4x)
3462306a36Sopenharmony_ci
3562306a36Sopenharmony_ci#if MAX_STRIDE == 5
/*
 * File-local helper, only assembled when MAX_STRIDE == 5: expands the
 * encrypt_block5x macro on v0-v4 and returns.  w3 = rounds, x2 = round keys;
 * x8 and w7 are passed to the macro (presumably scratch -- the macro is
 * supplied by the including file).  Entered with bl, so the caller must
 * preserve x30.
 */
3662306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(aes_encrypt_block5x)
3762306a36Sopenharmony_ci	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
3862306a36Sopenharmony_ci	ret
3962306a36Sopenharmony_ciSYM_FUNC_END(aes_encrypt_block5x)
4062306a36Sopenharmony_ci
/*
 * File-local helper, only assembled when MAX_STRIDE == 5: expands the
 * decrypt_block5x macro on v0-v4 and returns.  w3 = rounds, x2 = round keys;
 * x8 and w7 are passed to the macro (presumably scratch -- the macro is
 * supplied by the including file).  Entered with bl, so the caller must
 * preserve x30.
 */
4162306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(aes_decrypt_block5x)
4262306a36Sopenharmony_ci	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
4362306a36Sopenharmony_ci	ret
4462306a36Sopenharmony_ciSYM_FUNC_END(aes_decrypt_block5x)
4562306a36Sopenharmony_ci#endif
4662306a36Sopenharmony_ci
4762306a36Sopenharmony_ci	/*
4862306a36Sopenharmony_ci	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
4962306a36Sopenharmony_ci	 *		   int blocks)
5062306a36Sopenharmony_ci	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
5162306a36Sopenharmony_ci	 *		   int blocks)
5262306a36Sopenharmony_ci	 */
5362306a36Sopenharmony_ci
/*
 * ECB encryption of whole 16-byte blocks.
 *
 * Entry (the C prototype above mapped onto AAPCS64 argument registers):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 *
 * Every block is encrypted independently: MAX_STRIDE blocks per pass through
 * the bl helper while that many remain, then one block at a time.  x5 and w6
 * are handed to the cipher macros, presumably as scratch registers (the
 * macros come from the including file -- confirm there).  x0 and x1 are
 * advanced past the data that was processed.
 */
5462306a36Sopenharmony_ciAES_FUNC_START(aes_ecb_encrypt)
	/* bl is used below and overwrites x30; frame_push/frame_pop (defined
	 * outside this file) presumably save and restore x29/x30 */
5562306a36Sopenharmony_ci	frame_push	0
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_ci	enc_prepare	w3, x2, x5
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci.LecbencloopNx:
	/* go to the single-block path once fewer than MAX_STRIDE blocks remain */
6062306a36Sopenharmony_ci	subs		w4, w4, #MAX_STRIDE
6162306a36Sopenharmony_ci	bmi		.Lecbenc1x
6262306a36Sopenharmony_ci	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
6362306a36Sopenharmony_ciST4(	bl		aes_encrypt_block4x		)
6462306a36Sopenharmony_ciST5(	ld1		{v4.16b}, [x1], #16		)
6562306a36Sopenharmony_ciST5(	bl		aes_encrypt_block5x		)
6662306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x0], #64
6762306a36Sopenharmony_ciST5(	st1		{v4.16b}, [x0], #16		)
6862306a36Sopenharmony_ci	b		.LecbencloopNx
6962306a36Sopenharmony_ci.Lecbenc1x:
	/* undo the last subtraction: w4 = blocks still to do; done if zero */
7062306a36Sopenharmony_ci	adds		w4, w4, #MAX_STRIDE
7162306a36Sopenharmony_ci	beq		.Lecbencout
7262306a36Sopenharmony_ci.Lecbencloop:
7362306a36Sopenharmony_ci	ld1		{v0.16b}, [x1], #16		/* get next pt block */
7462306a36Sopenharmony_ci	encrypt_block	v0, w3, x2, x5, w6
7562306a36Sopenharmony_ci	st1		{v0.16b}, [x0], #16
7662306a36Sopenharmony_ci	subs		w4, w4, #1
7762306a36Sopenharmony_ci	bne		.Lecbencloop
7862306a36Sopenharmony_ci.Lecbencout:
7962306a36Sopenharmony_ci	frame_pop
8062306a36Sopenharmony_ci	ret
8162306a36Sopenharmony_ciAES_FUNC_END(aes_ecb_encrypt)
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci
/*
 * ECB decryption of whole 16-byte blocks.
 *
 * Entry (the C prototype above mapped onto AAPCS64 argument registers):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 *
 * Mirror image of the ECB encrypt routine: MAX_STRIDE blocks per pass through
 * the bl helper while that many remain, then one block at a time.  x5 and w6
 * are handed to the cipher macros, presumably as scratch registers (the
 * macros come from the including file -- confirm there).
 */
8462306a36Sopenharmony_ciAES_FUNC_START(aes_ecb_decrypt)
	/* bl is used below and overwrites x30; frame_push/frame_pop (defined
	 * outside this file) presumably save and restore x29/x30 */
8562306a36Sopenharmony_ci	frame_push	0
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci	dec_prepare	w3, x2, x5
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci.LecbdecloopNx:
	/* go to the single-block path once fewer than MAX_STRIDE blocks remain */
9062306a36Sopenharmony_ci	subs		w4, w4, #MAX_STRIDE
9162306a36Sopenharmony_ci	bmi		.Lecbdec1x
9262306a36Sopenharmony_ci	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
9362306a36Sopenharmony_ciST4(	bl		aes_decrypt_block4x		)
9462306a36Sopenharmony_ciST5(	ld1		{v4.16b}, [x1], #16		)
9562306a36Sopenharmony_ciST5(	bl		aes_decrypt_block5x		)
9662306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x0], #64
9762306a36Sopenharmony_ciST5(	st1		{v4.16b}, [x0], #16		)
9862306a36Sopenharmony_ci	b		.LecbdecloopNx
9962306a36Sopenharmony_ci.Lecbdec1x:
	/* undo the last subtraction: w4 = blocks still to do; done if zero */
10062306a36Sopenharmony_ci	adds		w4, w4, #MAX_STRIDE
10162306a36Sopenharmony_ci	beq		.Lecbdecout
10262306a36Sopenharmony_ci.Lecbdecloop:
10362306a36Sopenharmony_ci	ld1		{v0.16b}, [x1], #16		/* get next ct block */
10462306a36Sopenharmony_ci	decrypt_block	v0, w3, x2, x5, w6
10562306a36Sopenharmony_ci	st1		{v0.16b}, [x0], #16
10662306a36Sopenharmony_ci	subs		w4, w4, #1
10762306a36Sopenharmony_ci	bne		.Lecbdecloop
10862306a36Sopenharmony_ci.Lecbdecout:
10962306a36Sopenharmony_ci	frame_pop
11062306a36Sopenharmony_ci	ret
11162306a36Sopenharmony_ciAES_FUNC_END(aes_ecb_decrypt)
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ci
11462306a36Sopenharmony_ci	/*
11562306a36Sopenharmony_ci	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
11662306a36Sopenharmony_ci	 *		   int blocks, u8 iv[])
11762306a36Sopenharmony_ci	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
11862306a36Sopenharmony_ci	 *		   int blocks, u8 iv[])
11962306a36Sopenharmony_ci	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
12062306a36Sopenharmony_ci	 *			 int rounds, int blocks, u8 iv[],
12162306a36Sopenharmony_ci	 *			 u32 const rk2[]);
12262306a36Sopenharmony_ci	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
12362306a36Sopenharmony_ci	 *			 int rounds, int blocks, u8 iv[],
12462306a36Sopenharmony_ci	 *			 u32 const rk2[]);
12562306a36Sopenharmony_ci	 */
12662306a36Sopenharmony_ci
/*
 * CBC encryption, with two entry points that share one loop body.
 *
 * Entry (the C prototypes above mapped onto AAPCS64 argument registers):
 *   x0 = out, x1 = in, x2 = rk / rk1, w3 = rounds, w4 = blocks, x5 = iv,
 *   x6 = rk2 (ESSIV entry only)
 *
 * aes_essiv_cbc_encrypt first encrypts the IV with the key schedule at x6
 * using a fixed 14 rounds, then runs enc_switch_key with w3/x2 (presumably
 * selecting the data key -- the macro is defined by the including file) and
 * branches into the common loop.  aes_cbc_encrypt loads the IV and prepares
 * the key directly.
 *
 * v4 carries the chaining value.  Each block is XORed with the previous
 * ciphertext before it is encrypted, so the "4x" loop still encrypts the four
 * blocks one after another; it only batches the loads and stores.  No bl is
 * issued on this path, so no stack frame is set up.  On exit the last
 * ciphertext block is written back to [x5] as the next IV.
 */
12762306a36Sopenharmony_ciAES_FUNC_START(aes_essiv_cbc_encrypt)
12862306a36Sopenharmony_ci	ld1		{v4.16b}, [x5]			/* get iv */
12962306a36Sopenharmony_ci
13062306a36Sopenharmony_ci	mov		w8, #14				/* AES-256: 14 rounds */
13162306a36Sopenharmony_ci	enc_prepare	w8, x6, x7
13262306a36Sopenharmony_ci	encrypt_block	v4, w8, x6, x7, w9
13362306a36Sopenharmony_ci	enc_switch_key	w3, x2, x6
13462306a36Sopenharmony_ci	b		.Lcbcencloop4x
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ciAES_FUNC_START(aes_cbc_encrypt)
13762306a36Sopenharmony_ci	ld1		{v4.16b}, [x5]			/* get iv */
13862306a36Sopenharmony_ci	enc_prepare	w3, x2, x6
13962306a36Sopenharmony_ci
14062306a36Sopenharmony_ci.Lcbcencloop4x:
	/* go to the single-block path once fewer than 4 blocks remain */
14162306a36Sopenharmony_ci	subs		w4, w4, #4
14262306a36Sopenharmony_ci	bmi		.Lcbcenc1x
14362306a36Sopenharmony_ci	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
14462306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
14562306a36Sopenharmony_ci	encrypt_block	v0, w3, x2, x6, w7
	/* each following block is chained on the ciphertext just produced */
14662306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v0.16b
14762306a36Sopenharmony_ci	encrypt_block	v1, w3, x2, x6, w7
14862306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v1.16b
14962306a36Sopenharmony_ci	encrypt_block	v2, w3, x2, x6, w7
15062306a36Sopenharmony_ci	eor		v3.16b, v3.16b, v2.16b
15162306a36Sopenharmony_ci	encrypt_block	v3, w3, x2, x6, w7
15262306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x0], #64
	/* last ciphertext block becomes the chaining value */
15362306a36Sopenharmony_ci	mov		v4.16b, v3.16b
15462306a36Sopenharmony_ci	b		.Lcbcencloop4x
15562306a36Sopenharmony_ci.Lcbcenc1x:
	/* undo the last subtraction: w4 = blocks still to do; done if zero */
15662306a36Sopenharmony_ci	adds		w4, w4, #4
15762306a36Sopenharmony_ci	beq		.Lcbcencout
15862306a36Sopenharmony_ci.Lcbcencloop:
15962306a36Sopenharmony_ci	ld1		{v0.16b}, [x1], #16		/* get next pt block */
16062306a36Sopenharmony_ci	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	/* v4 is encrypted in place: it is both the output and the next iv */
16162306a36Sopenharmony_ci	encrypt_block	v4, w3, x2, x6, w7
16262306a36Sopenharmony_ci	st1		{v4.16b}, [x0], #16
16362306a36Sopenharmony_ci	subs		w4, w4, #1
16462306a36Sopenharmony_ci	bne		.Lcbcencloop
16562306a36Sopenharmony_ci.Lcbcencout:
16662306a36Sopenharmony_ci	st1		{v4.16b}, [x5]			/* return iv */
16762306a36Sopenharmony_ci	ret
16862306a36Sopenharmony_ciAES_FUNC_END(aes_cbc_encrypt)
16962306a36Sopenharmony_ciAES_FUNC_END(aes_essiv_cbc_encrypt)
17062306a36Sopenharmony_ci
/*
 * CBC decryption, with two entry points that share one loop body.
 *
 * Entry (the C prototypes above mapped onto AAPCS64 argument registers):
 *   x0 = out, x1 = in, x2 = rk / rk1, w3 = rounds, w4 = blocks, x5 = iv,
 *   x6 = rk2 (ESSIV entry only)
 *
 * cbciv is a vector register alias defined outside this file; it carries the
 * chaining value.  aes_essiv_cbc_decrypt first ENCRYPTS the IV with the key
 * schedule at x6 using a fixed 14 rounds and then joins the common path at
 * .Lessivcbcdecstart.
 *
 * Unlike CBC encryption, the block decryptions are independent, so
 * MAX_STRIDE blocks go through the bl helper together.  The helper
 * overwrites v0.., so copies of the ciphertext are needed afterwards for the
 * XOR step: the leading blocks are copied to spare registers beforehand and
 * the trailing ones are re-read from memory by rewinding x1.  On exit the
 * last ciphertext block is written back to [x5] as the next IV.
 */
17162306a36Sopenharmony_ciAES_FUNC_START(aes_essiv_cbc_decrypt)
17262306a36Sopenharmony_ci	ld1		{cbciv.16b}, [x5]		/* get iv */
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci	mov		w8, #14				/* AES-256: 14 rounds */
17562306a36Sopenharmony_ci	enc_prepare	w8, x6, x7
17662306a36Sopenharmony_ci	encrypt_block	cbciv, w8, x6, x7, w9
17762306a36Sopenharmony_ci	b		.Lessivcbcdecstart
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_ciAES_FUNC_START(aes_cbc_decrypt)
18062306a36Sopenharmony_ci	ld1		{cbciv.16b}, [x5]		/* get iv */
18162306a36Sopenharmony_ci.Lessivcbcdecstart:
	/* bl is used below and overwrites x30; frame_push/frame_pop (defined
	 * outside this file) presumably save and restore x29/x30 */
18262306a36Sopenharmony_ci	frame_push	0
18362306a36Sopenharmony_ci	dec_prepare	w3, x2, x6
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_ci.LcbcdecloopNx:
	/* go to the single-block path once fewer than MAX_STRIDE blocks remain */
18662306a36Sopenharmony_ci	subs		w4, w4, #MAX_STRIDE
18762306a36Sopenharmony_ci	bmi		.Lcbcdec1x
18862306a36Sopenharmony_ci	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
18962306a36Sopenharmony_ci#if MAX_STRIDE == 5
19062306a36Sopenharmony_ci	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	/* keep ct blocks 0-2; blocks 3 and 4 are re-read from memory below */
19162306a36Sopenharmony_ci	mov		v5.16b, v0.16b
19262306a36Sopenharmony_ci	mov		v6.16b, v1.16b
19362306a36Sopenharmony_ci	mov		v7.16b, v2.16b
19462306a36Sopenharmony_ci	bl		aes_decrypt_block5x
	/* rewind x1 to ct block 3 */
19562306a36Sopenharmony_ci	sub		x1, x1, #32
19662306a36Sopenharmony_ci	eor		v0.16b, v0.16b, cbciv.16b
19762306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v5.16b
	/* v5 = ct block 3, cbciv = ct block 4 (iv for the next pass) */
19862306a36Sopenharmony_ci	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
19962306a36Sopenharmony_ci	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
20062306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v6.16b
20162306a36Sopenharmony_ci	eor		v3.16b, v3.16b, v7.16b
20262306a36Sopenharmony_ci	eor		v4.16b, v4.16b, v5.16b
20362306a36Sopenharmony_ci#else
	/* keep ct blocks 0-2; block 3 is re-read from memory below */
20462306a36Sopenharmony_ci	mov		v4.16b, v0.16b
20562306a36Sopenharmony_ci	mov		v5.16b, v1.16b
20662306a36Sopenharmony_ci	mov		v6.16b, v2.16b
20762306a36Sopenharmony_ci	bl		aes_decrypt_block4x
	/* rewind x1 to ct block 3 */
20862306a36Sopenharmony_ci	sub		x1, x1, #16
20962306a36Sopenharmony_ci	eor		v0.16b, v0.16b, cbciv.16b
21062306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v4.16b
	/* cbciv = ct block 3 (iv for the next pass) */
21162306a36Sopenharmony_ci	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
21262306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v5.16b
21362306a36Sopenharmony_ci	eor		v3.16b, v3.16b, v6.16b
21462306a36Sopenharmony_ci#endif
21562306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x0], #64
21662306a36Sopenharmony_ciST5(	st1		{v4.16b}, [x0], #16		)
21762306a36Sopenharmony_ci	b		.LcbcdecloopNx
21862306a36Sopenharmony_ci.Lcbcdec1x:
	/* undo the last subtraction: w4 = blocks still to do; done if zero */
21962306a36Sopenharmony_ci	adds		w4, w4, #MAX_STRIDE
22062306a36Sopenharmony_ci	beq		.Lcbcdecout
22162306a36Sopenharmony_ci.Lcbcdecloop:
22262306a36Sopenharmony_ci	ld1		{v1.16b}, [x1], #16		/* get next ct block */
22362306a36Sopenharmony_ci	mov		v0.16b, v1.16b			/* ...and copy to v0 */
22462306a36Sopenharmony_ci	decrypt_block	v0, w3, x2, x6, w7
22562306a36Sopenharmony_ci	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
22662306a36Sopenharmony_ci	mov		cbciv.16b, v1.16b		/* ct is next iv */
22762306a36Sopenharmony_ci	st1		{v0.16b}, [x0], #16
22862306a36Sopenharmony_ci	subs		w4, w4, #1
22962306a36Sopenharmony_ci	bne		.Lcbcdecloop
23062306a36Sopenharmony_ci.Lcbcdecout:
23162306a36Sopenharmony_ci	st1		{cbciv.16b}, [x5]		/* return iv */
23262306a36Sopenharmony_ci	frame_pop
23362306a36Sopenharmony_ci	ret
23462306a36Sopenharmony_ciAES_FUNC_END(aes_cbc_decrypt)
23562306a36Sopenharmony_ciAES_FUNC_END(aes_essiv_cbc_decrypt)
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_ci	/*
23962306a36Sopenharmony_ci	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
24062306a36Sopenharmony_ci	 *		       int rounds, int bytes, u8 const iv[])
24162306a36Sopenharmony_ci	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
24262306a36Sopenharmony_ci	 *		       int rounds, int bytes, u8 const iv[])
24362306a36Sopenharmony_ci	 */
24462306a36Sopenharmony_ci
/*
 * CBC ciphertext-stealing encryption of the final two blocks (one full block
 * followed by a block of 1..16 bytes).
 *
 * Entry (the C prototype above mapped onto AAPCS64 argument registers):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv
 * The code is written for 16 < bytes <= 32 (presumably guaranteed by the
 * caller -- nothing here checks it).
 *
 * With d = bytes - 16:
 *   v3 = 16 bytes of .Lcts_permute_table starting at offset d.  Used as a tbl
 *        index it moves the first d bytes of a vector to its last d bytes and
 *        zeroes the rest.
 *   v4 = 16 bytes of the table starting at offset 32 - d.  Used as a tbl
 *        index it moves the last d bytes of a vector to its start and zeroes
 *        the rest.
 * Input and output are accessed as two overlapping 16-byte vectors at
 * offsets 0 and d, so nothing outside the 'bytes'-long buffers is touched.
 * The IV is read but not written back.
 */
24562306a36Sopenharmony_ciAES_FUNC_START(aes_cbc_cts_encrypt)
24662306a36Sopenharmony_ci	adr_l		x8, .Lcts_permute_table
	/*
	 * 'bytes' is a 32-bit int: AAPCS64 leaves bits [63:32] of x4
	 * unspecified on entry.  Do the subtraction in w4, which zero-extends
	 * the result into x4, before x4 is used as a 64-bit offset below.
	 */
24762306a36Sopenharmony_ci	sub		w4, w4, #16
24862306a36Sopenharmony_ci	add		x9, x8, #32
24962306a36Sopenharmony_ci	add		x8, x8, x4
25062306a36Sopenharmony_ci	sub		x9, x9, x4
25162306a36Sopenharmony_ci	ld1		{v3.16b}, [x8]
25262306a36Sopenharmony_ci	ld1		{v4.16b}, [x9]
25362306a36Sopenharmony_ci
	/* v0 = in[0..15], v1 = in[d..d+15] (ends at the end of the input) */
25462306a36Sopenharmony_ci	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
25562306a36Sopenharmony_ci	ld1		{v1.16b}, [x1]
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	ld1		{v5.16b}, [x5]			/* get iv */
25862306a36Sopenharmony_ci	enc_prepare	w3, x2, x6
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	/* v1 = final partial plaintext block, moved to the front, zero padded */
26162306a36Sopenharmony_ci	tbl		v1.16b, {v1.16b}, v4.16b
26262306a36Sopenharmony_ci	encrypt_block	v0, w3, x2, x6, w7
26362306a36Sopenharmony_ci
	/* chain the padded block on the first ciphertext block */
26462306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v0.16b
	/* move the first d bytes of the first ciphertext block to the tail */
26562306a36Sopenharmony_ci	tbl		v0.16b, {v0.16b}, v3.16b
26662306a36Sopenharmony_ci	encrypt_block	v1, w3, x2, x6, w7
26762306a36Sopenharmony_ci
	/*
	 * The truncated first ciphertext block lands in out[16..bytes-1]; the
	 * zero bytes stored in front of it are then overwritten by the full
	 * second ciphertext block at out[0..15].
	 */
26862306a36Sopenharmony_ci	add		x4, x0, x4
26962306a36Sopenharmony_ci	st1		{v0.16b}, [x4]			/* overlapping stores */
27062306a36Sopenharmony_ci	st1		{v1.16b}, [x0]
27162306a36Sopenharmony_ci	ret
27262306a36Sopenharmony_ciAES_FUNC_END(aes_cbc_cts_encrypt)
27362306a36Sopenharmony_ci
/*
 * CBC ciphertext-stealing decryption of the final two blocks (one full block
 * followed by a block of 1..16 bytes).
 *
 * Entry (the C prototype above mapped onto AAPCS64 argument registers):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv
 * The code is written for 16 < bytes <= 32 (presumably guaranteed by the
 * caller -- nothing here checks it).
 *
 * With d = bytes - 16:
 *   v3 = 16 bytes of .Lcts_permute_table starting at offset d.  Used as a tbl
 *        index it moves the first d bytes of a vector to its last d bytes and
 *        zeroes the rest.
 *   v4 = 16 bytes of the table starting at offset 32 - d.  Its first d
 *        entries select the last d bytes of a vector; the remaining entries
 *        are out of range, which tbx treats as "leave the destination byte
 *        unchanged".
 * Input and output are accessed as two overlapping 16-byte vectors at
 * offsets 0 and d, so nothing outside the 'bytes'-long buffers is touched.
 * The IV is read but not written back.
 */
27462306a36Sopenharmony_ciAES_FUNC_START(aes_cbc_cts_decrypt)
27562306a36Sopenharmony_ci	adr_l		x8, .Lcts_permute_table
	/*
	 * 'bytes' is a 32-bit int: AAPCS64 leaves bits [63:32] of x4
	 * unspecified on entry.  Do the subtraction in w4, which zero-extends
	 * the result into x4, before x4 is used as a 64-bit offset below.
	 */
27662306a36Sopenharmony_ci	sub		w4, w4, #16
27762306a36Sopenharmony_ci	add		x9, x8, #32
27862306a36Sopenharmony_ci	add		x8, x8, x4
27962306a36Sopenharmony_ci	sub		x9, x9, x4
28062306a36Sopenharmony_ci	ld1		{v3.16b}, [x8]
28162306a36Sopenharmony_ci	ld1		{v4.16b}, [x9]
28262306a36Sopenharmony_ci
	/* v0 = in[0..15], v1 = in[d..d+15] (ends at the end of the input) */
28362306a36Sopenharmony_ci	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
28462306a36Sopenharmony_ci	ld1		{v1.16b}, [x1]
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	ld1		{v5.16b}, [x5]			/* get iv */
28762306a36Sopenharmony_ci	dec_prepare	w3, x2, x6
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	decrypt_block	v0, w3, x2, x6, w7
	/*
	 * Last d bytes of v2 = first d bytes of the decrypted block XOR the
	 * partial ciphertext block, i.e. the final partial plaintext.  The
	 * leading 16 - d bytes of v2 are overwritten by the second store below.
	 */
29062306a36Sopenharmony_ci	tbl		v2.16b, {v0.16b}, v3.16b
29162306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v1.16b
29262306a36Sopenharmony_ci
	/*
	 * Rebuild the full ciphertext block that precedes it: the d partial
	 * ciphertext bytes in front, the remaining bytes kept from v0.
	 */
29362306a36Sopenharmony_ci	tbx		v0.16b, {v1.16b}, v4.16b
29462306a36Sopenharmony_ci	decrypt_block	v0, w3, x2, x6, w7
29562306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
29662306a36Sopenharmony_ci
	/* partial plaintext at out[d..bytes-1], then the full block at out[0..15] */
29762306a36Sopenharmony_ci	add		x4, x0, x4
29862306a36Sopenharmony_ci	st1		{v2.16b}, [x4]			/* overlapping stores */
29962306a36Sopenharmony_ci	st1		{v0.16b}, [x0]
30062306a36Sopenharmony_ci	ret
30162306a36Sopenharmony_ciAES_FUNC_END(aes_cbc_cts_decrypt)
30262306a36Sopenharmony_ci
/*
 * 48-byte permutation table: 16 bytes of 0xff, the identity indices 0..15,
 * then 16 more bytes of 0xff.  A 16-byte window read at offset k is used as
 * a tbl/tbx index vector that shifts a vector by a byte count derived from
 * k; the 0xff entries are out-of-range indices (tbl writes zero there, tbx
 * leaves the destination byte unchanged).  The CBC-CTS routines and the
 * CTR/XCTR tail code in this file load windows from it.  The table is placed
 * in .rodata with 64-byte alignment.
 */
30362306a36Sopenharmony_ci	.section	".rodata", "a"
30462306a36Sopenharmony_ci	.align		6
30562306a36Sopenharmony_ci.Lcts_permute_table:
30662306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
30762306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
30862306a36Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
30962306a36Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
31062306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
31162306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
31262306a36Sopenharmony_ci	.previous
31362306a36Sopenharmony_ci
31462306a36Sopenharmony_ci	/*
31562306a36Sopenharmony_ci	 * This macro generates the code for CTR and XCTR mode.
31662306a36Sopenharmony_ci	 */
31762306a36Sopenharmony_ci.macro ctr_encrypt xctr
	/*
	 * xctr selects the variant: 0 emits the CTR code paths, 1 the XCTR
	 * ones.  The macro body is a complete function body (it sets up a
	 * frame, returns, and uses bl for the multi-block helpers).
	 *
	 * vctr is a vector register alias defined outside this file; it holds
	 * the 16-byte IV / counter block loaded from [IV].
	 *
	 * CTR:  IV_PART holds vctr.d[1] byte-reversed, i.e. the low 64 bits of
	 *       the counter as a native integer.  It is advanced by BLOCKS on
	 *       every pass; a carry out of it is propagated into vctr.d[0] by
	 *       the out-of-line code in subsection 1.
	 * XCTR: IV_PART holds vctr.d[0] unchanged and CTR holds the running
	 *       block count (byte_ctr / 16 on entry).  Each counter block is
	 *       the IV with (block count XOR IV_PART) in d[0].
	 */
31862306a36Sopenharmony_ci	// Arguments
31962306a36Sopenharmony_ci	OUT		.req x0
32062306a36Sopenharmony_ci	IN		.req x1
32162306a36Sopenharmony_ci	KEY		.req x2
32262306a36Sopenharmony_ci	ROUNDS_W	.req w3
32362306a36Sopenharmony_ci	BYTES_W		.req w4
32462306a36Sopenharmony_ci	IV		.req x5
32562306a36Sopenharmony_ci	BYTE_CTR_W 	.req w6		// XCTR only
32662306a36Sopenharmony_ci	// Intermediate values
32762306a36Sopenharmony_ci	CTR_W		.req w11	// XCTR only
32862306a36Sopenharmony_ci	CTR		.req x11	// XCTR only
32962306a36Sopenharmony_ci	IV_PART		.req x12
33062306a36Sopenharmony_ci	BLOCKS		.req x13
33162306a36Sopenharmony_ci	BLOCKS_W	.req w13
33262306a36Sopenharmony_ci
33362306a36Sopenharmony_ci	frame_push	0
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci	enc_prepare	ROUNDS_W, KEY, IV_PART
33662306a36Sopenharmony_ci	ld1		{vctr.16b}, [IV]
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ci	/*
33962306a36Sopenharmony_ci	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
34062306a36Sopenharmony_ci	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
34162306a36Sopenharmony_ci	 * the 64-bit counter with the IV.
34262306a36Sopenharmony_ci	 */
34362306a36Sopenharmony_ci	.if \xctr
34462306a36Sopenharmony_ci		umov		IV_PART, vctr.d[0]
		/* bytes already processed -> blocks; the w-register write
		 * also clears the upper half of CTR */
34562306a36Sopenharmony_ci		lsr		CTR_W, BYTE_CTR_W, #4
34662306a36Sopenharmony_ci	.else
34762306a36Sopenharmony_ci		umov		IV_PART, vctr.d[1]
34862306a36Sopenharmony_ci		rev		IV_PART, IV_PART
34962306a36Sopenharmony_ci	.endif
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ci.LctrloopNx\xctr:
	/*
	 * BLOCKS = min(ceil(bytes / 16), MAX_STRIDE).  BYTES_W is reduced by
	 * a full stride up front, so it is negative from here on whenever
	 * less than a full stride of input was left.
	 */
35262306a36Sopenharmony_ci	add		BLOCKS_W, BYTES_W, #15
35362306a36Sopenharmony_ci	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
35462306a36Sopenharmony_ci	lsr		BLOCKS_W, BLOCKS_W, #4
35562306a36Sopenharmony_ci	mov		w8, #MAX_STRIDE
35662306a36Sopenharmony_ci	cmp		BLOCKS_W, w8
35762306a36Sopenharmony_ci	csel		BLOCKS_W, BLOCKS_W, w8, lt
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci	/*
36062306a36Sopenharmony_ci	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
36162306a36Sopenharmony_ci	 *
36262306a36Sopenharmony_ci	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
36362306a36Sopenharmony_ci	 * handling code expects the last keystream block to be in
36462306a36Sopenharmony_ci	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
36562306a36Sopenharmony_ci	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
36662306a36Sopenharmony_ci	 */
36762306a36Sopenharmony_ci	.if \xctr
36862306a36Sopenharmony_ci		add		CTR, CTR, BLOCKS
36962306a36Sopenharmony_ci	.else
		/* the carry flag set here is tested by the bcs below */
37062306a36Sopenharmony_ci		adds		IV_PART, IV_PART, BLOCKS
37162306a36Sopenharmony_ci	.endif
37262306a36Sopenharmony_ci	mov		v0.16b, vctr.16b
37362306a36Sopenharmony_ci	mov		v1.16b, vctr.16b
37462306a36Sopenharmony_ci	mov		v2.16b, vctr.16b
37562306a36Sopenharmony_ci	mov		v3.16b, vctr.16b
37662306a36Sopenharmony_ciST5(	mov		v4.16b, vctr.16b		)
37762306a36Sopenharmony_ci	.if \xctr
		/*
		 * Block counts are derived backwards from the updated CTR, so
		 * the highest count always ends up in the last vector.
		 */
37862306a36Sopenharmony_ci		sub		x6, CTR, #MAX_STRIDE - 1
37962306a36Sopenharmony_ci		sub		x7, CTR, #MAX_STRIDE - 2
38062306a36Sopenharmony_ci		sub		x8, CTR, #MAX_STRIDE - 3
38162306a36Sopenharmony_ci		sub		x9, CTR, #MAX_STRIDE - 4
38262306a36Sopenharmony_ciST5(		sub		x10, CTR, #MAX_STRIDE - 5	)
38362306a36Sopenharmony_ci		eor		x6, x6, IV_PART
38462306a36Sopenharmony_ci		eor		x7, x7, IV_PART
38562306a36Sopenharmony_ci		eor		x8, x8, IV_PART
38662306a36Sopenharmony_ci		eor		x9, x9, IV_PART
38762306a36Sopenharmony_ciST5(		eor		x10, x10, IV_PART		)
38862306a36Sopenharmony_ci		mov		v0.d[0], x6
38962306a36Sopenharmony_ci		mov		v1.d[0], x7
39062306a36Sopenharmony_ci		mov		v2.d[0], x8
39162306a36Sopenharmony_ci		mov		v3.d[0], x9
39262306a36Sopenharmony_ciST5(		mov		v4.d[0], x10			)
39362306a36Sopenharmony_ci	.else
		/* no carry: execution continues at 2: after the subsection */
39462306a36Sopenharmony_ci		bcs		0f
39562306a36Sopenharmony_ci		.subsection	1
39662306a36Sopenharmony_ci		/*
39762306a36Sopenharmony_ci		 * This subsection handles carries.
39862306a36Sopenharmony_ci		 *
39962306a36Sopenharmony_ci		 * Conditional branching here is allowed with respect to time
40062306a36Sopenharmony_ci		 * invariance since the branches are dependent on the IV instead
40162306a36Sopenharmony_ci		 * of the plaintext or key.  This code is rarely executed in
40262306a36Sopenharmony_ci		 * practice anyway.
40362306a36Sopenharmony_ci		 */
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci		/* Apply carry to outgoing counter. */
40662306a36Sopenharmony_ci0:		umov		x8, vctr.d[0]
40762306a36Sopenharmony_ci		rev		x8, x8
40862306a36Sopenharmony_ci		add		x8, x8, #1
40962306a36Sopenharmony_ci		rev		x8, x8
41062306a36Sopenharmony_ci		ins		vctr.d[0], x8
41162306a36Sopenharmony_ci
41262306a36Sopenharmony_ci		/*
41362306a36Sopenharmony_ci		 * Apply carry to counter blocks if needed.
41462306a36Sopenharmony_ci		 *
41562306a36Sopenharmony_ci		 * Since the carry flag was set, we know 0 <= IV_PART <
41662306a36Sopenharmony_ci		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
41762306a36Sopenharmony_ci		 * many counter blocks need to be updated.
41862306a36Sopenharmony_ci		 */
41962306a36Sopenharmony_ci		cbz		IV_PART, 2f
		/*
		 * Computed jump into the list below.  Every entry is two
		 * instructions (8 bytes, hence lsl #3), so branching to
		 * 1f - 8 * IV_PART executes exactly the last IV_PART entries,
		 * copying the incremented upper half into the counter blocks
		 * that lie past the wrap.  Each entry begins with bti c as the
		 * landing pad for the indirect branch.
		 */
42062306a36Sopenharmony_ci		adr		x16, 1f
42162306a36Sopenharmony_ci		sub		x16, x16, IV_PART, lsl #3
42262306a36Sopenharmony_ci		br		x16
42362306a36Sopenharmony_ci		bti		c
42462306a36Sopenharmony_ci		mov		v0.d[0], vctr.d[0]
42562306a36Sopenharmony_ci		bti		c
42662306a36Sopenharmony_ci		mov		v1.d[0], vctr.d[0]
42762306a36Sopenharmony_ci		bti		c
42862306a36Sopenharmony_ci		mov		v2.d[0], vctr.d[0]
42962306a36Sopenharmony_ci		bti		c
43062306a36Sopenharmony_ci		mov		v3.d[0], vctr.d[0]
43162306a36Sopenharmony_ciST5(		bti		c				)
43262306a36Sopenharmony_ciST5(		mov		v4.d[0], vctr.d[0]		)
43362306a36Sopenharmony_ci1:		b		2f
43462306a36Sopenharmony_ci		.previous
43562306a36Sopenharmony_ci
		/*
		 * Write the advanced counter back into vctr and fill in the low
		 * halves of v1..v{MAX_STRIDE-1}, counting backwards from the new
		 * value.  v0 keeps the counter it copied from vctr above.
		 */
43662306a36Sopenharmony_ci2:		rev		x7, IV_PART
43762306a36Sopenharmony_ci		ins		vctr.d[1], x7
43862306a36Sopenharmony_ci		sub		x7, IV_PART, #MAX_STRIDE - 1
43962306a36Sopenharmony_ci		sub		x8, IV_PART, #MAX_STRIDE - 2
44062306a36Sopenharmony_ci		sub		x9, IV_PART, #MAX_STRIDE - 3
44162306a36Sopenharmony_ci		rev		x7, x7
44262306a36Sopenharmony_ci		rev		x8, x8
44362306a36Sopenharmony_ci		mov		v1.d[1], x7
44462306a36Sopenharmony_ci		rev		x9, x9
44562306a36Sopenharmony_ciST5(		sub		x10, IV_PART, #MAX_STRIDE - 4	)
44662306a36Sopenharmony_ci		mov		v2.d[1], x8
44762306a36Sopenharmony_ciST5(		rev		x10, x10			)
44862306a36Sopenharmony_ci		mov		v3.d[1], x9
44962306a36Sopenharmony_ciST5(		mov		v4.d[1], x10			)
45062306a36Sopenharmony_ci	.endif
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci	/*
45362306a36Sopenharmony_ci	 * If there are at least MAX_STRIDE blocks left, XOR the data with
45462306a36Sopenharmony_ci	 * keystream and store.  Otherwise jump to tail handling.
45562306a36Sopenharmony_ci	 */
	/* bit 31 set = BYTES_W went negative in the subtraction above */
45662306a36Sopenharmony_ci	tbnz		BYTES_W, #31, .Lctrtail\xctr
45762306a36Sopenharmony_ci	ld1		{v5.16b-v7.16b}, [IN], #48
45862306a36Sopenharmony_ciST4(	bl		aes_encrypt_block4x		)
45962306a36Sopenharmony_ciST5(	bl		aes_encrypt_block5x		)
46062306a36Sopenharmony_ci	eor		v0.16b, v5.16b, v0.16b
46162306a36Sopenharmony_ciST4(	ld1		{v5.16b}, [IN], #16		)
46262306a36Sopenharmony_ci	eor		v1.16b, v6.16b, v1.16b
46362306a36Sopenharmony_ciST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
46462306a36Sopenharmony_ci	eor		v2.16b, v7.16b, v2.16b
46562306a36Sopenharmony_ci	eor		v3.16b, v5.16b, v3.16b
46662306a36Sopenharmony_ciST5(	eor		v4.16b, v6.16b, v4.16b		)
46762306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [OUT], #64
46862306a36Sopenharmony_ciST5(	st1		{v4.16b}, [OUT], #16		)
46962306a36Sopenharmony_ci	cbz		BYTES_W, .Lctrout\xctr
47062306a36Sopenharmony_ci	b		.LctrloopNx\xctr
47162306a36Sopenharmony_ci
47262306a36Sopenharmony_ci.Lctrout\xctr:
	/* only the CTR variant writes the counter back to [IV] */
47362306a36Sopenharmony_ci	.if !\xctr
47462306a36Sopenharmony_ci		st1		{vctr.16b}, [IV] /* return next CTR value */
47562306a36Sopenharmony_ci	.endif
47662306a36Sopenharmony_ci	frame_pop
47762306a36Sopenharmony_ci	ret
47862306a36Sopenharmony_ci
47962306a36Sopenharmony_ci.Lctrtail\xctr:
48062306a36Sopenharmony_ci	/*
48162306a36Sopenharmony_ci	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
48262306a36Sopenharmony_ci	 *
48362306a36Sopenharmony_ci	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
48462306a36Sopenharmony_ci	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
48562306a36Sopenharmony_ci	 * v4 should have the next two counter blocks.
48662306a36Sopenharmony_ci	 *
48762306a36Sopenharmony_ci	 * This allows us to store the ciphertext by writing to overlapping
48862306a36Sopenharmony_ci	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
48962306a36Sopenharmony_ci	 * correctly computed blocks.  This approach greatly simplifies the
49062306a36Sopenharmony_ci	 * logic for storing the ciphertext.
49162306a36Sopenharmony_ci	 */
	/* x13 = length of the final block: bytes & 15, or 16 if that is zero */
49262306a36Sopenharmony_ci	mov		x16, #16
49362306a36Sopenharmony_ci	ands		w7, BYTES_W, #0xf
49462306a36Sopenharmony_ci	csel		x13, x7, x16, ne
49562306a36Sopenharmony_ci
	/*
	 * BYTES_W is (bytes left - MAX_STRIDE * 16) here.  x14/x15/x16 become
	 * the post-increment for the corresponding load/store below: 16 when
	 * that block is really present, 0 when it is not (so the pointer does
	 * not move and the access is repeated at the same address).
	 */
49662306a36Sopenharmony_ciST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
49762306a36Sopenharmony_ciST5(	csel		x14, x16, xzr, gt		)
49862306a36Sopenharmony_ci	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
49962306a36Sopenharmony_ci	csel		x15, x16, xzr, gt
50062306a36Sopenharmony_ci	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
50162306a36Sopenharmony_ci	csel		x16, x16, xzr, gt
	/* flags from this compare feed the ble below and, on that path, the
	 * csel at the top of the 1x tail (adr_l is assumed not to alter
	 * flags -- it is defined outside this file) */
50262306a36Sopenharmony_ci	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)
50362306a36Sopenharmony_ci
50462306a36Sopenharmony_ci	adr_l		x9, .Lcts_permute_table
50562306a36Sopenharmony_ci	add		x9, x9, x13
50662306a36Sopenharmony_ci	ble		.Lctrtail1x\xctr
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ciST5(	ld1		{v5.16b}, [IN], x14		)
50962306a36Sopenharmony_ci	ld1		{v6.16b}, [IN], x15
51062306a36Sopenharmony_ci	ld1		{v7.16b}, [IN], x16
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ciST4(	bl		aes_encrypt_block4x		)
51362306a36Sopenharmony_ciST5(	bl		aes_encrypt_block5x		)
51462306a36Sopenharmony_ci
	/* v8 = last full-width load, v9 = 16 bytes ending at the end of the
	 * input, v10 = tbl index that shifts the final keystream block so it
	 * lines up with v9 */
51562306a36Sopenharmony_ci	ld1		{v8.16b}, [IN], x13
51662306a36Sopenharmony_ci	ld1		{v9.16b}, [IN]
51762306a36Sopenharmony_ci	ld1		{v10.16b}, [x9]
51862306a36Sopenharmony_ci
51962306a36Sopenharmony_ciST4(	eor		v6.16b, v6.16b, v0.16b		)
52062306a36Sopenharmony_ciST4(	eor		v7.16b, v7.16b, v1.16b		)
52162306a36Sopenharmony_ciST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
52262306a36Sopenharmony_ciST4(	eor		v8.16b, v8.16b, v2.16b		)
52362306a36Sopenharmony_ciST4(	eor		v9.16b, v9.16b, v3.16b		)
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ciST5(	eor		v5.16b, v5.16b, v0.16b		)
52662306a36Sopenharmony_ciST5(	eor		v6.16b, v6.16b, v1.16b		)
52762306a36Sopenharmony_ciST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
52862306a36Sopenharmony_ciST5(	eor		v7.16b, v7.16b, v2.16b		)
52962306a36Sopenharmony_ciST5(	eor		v8.16b, v8.16b, v3.16b		)
53062306a36Sopenharmony_ciST5(	eor		v9.16b, v9.16b, v4.16b		)
53162306a36Sopenharmony_ci
53262306a36Sopenharmony_ciST5(	st1		{v5.16b}, [OUT], x14		)
53362306a36Sopenharmony_ci	st1		{v6.16b}, [OUT], x15
53462306a36Sopenharmony_ci	st1		{v7.16b}, [OUT], x16
	/* v9 is stored first so that v8 then overwrites the overlapping bytes */
53562306a36Sopenharmony_ci	add		x13, x13, OUT
53662306a36Sopenharmony_ci	st1		{v9.16b}, [x13]		// overlapping stores
53762306a36Sopenharmony_ci	st1		{v8.16b}, [OUT]
53862306a36Sopenharmony_ci	b		.Lctrout\xctr
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ci.Lctrtail1x\xctr:
54162306a36Sopenharmony_ci	/*
54262306a36Sopenharmony_ci	 * Handle <= 16 bytes of plaintext
54362306a36Sopenharmony_ci	 *
54462306a36Sopenharmony_ci	 * This code always reads and writes 16 bytes.  To avoid out of bounds
54562306a36Sopenharmony_ci	 * accesses, XCTR and CTR modes must use a temporary buffer when
54662306a36Sopenharmony_ci	 * encrypting/decrypting less than 16 bytes.
54762306a36Sopenharmony_ci	 *
54862306a36Sopenharmony_ci	 * This code is unusual in that it loads the input and stores the output
54962306a36Sopenharmony_ci	 * relative to the end of the buffers rather than relative to the start.
55062306a36Sopenharmony_ci	 * This causes unusual behaviour when encrypting/decrypting less than 16
55162306a36Sopenharmony_ci	 * bytes; the end of the data is expected to be at the end of the
55262306a36Sopenharmony_ci	 * temporary buffer rather than the start of the data being at the start
55362306a36Sopenharmony_ci	 * of the temporary buffer.
55462306a36Sopenharmony_ci	 */
	/*
	 * x7 = 0 when exactly 16 bytes remain (eq from the compare before the
	 * branch here), otherwise (bytes & 15) - 16, a negative offset that
	 * makes the 16-byte access end where the data ends.
	 */
55562306a36Sopenharmony_ci	sub		x8, x7, #16
55662306a36Sopenharmony_ci	csel		x7, x7, x8, eq
55762306a36Sopenharmony_ci	add		IN, IN, x7
55862306a36Sopenharmony_ci	add		OUT, OUT, x7
55962306a36Sopenharmony_ci	ld1		{v5.16b}, [IN]
	/* current output contents, kept for the bytes in front of the data */
56062306a36Sopenharmony_ci	ld1		{v6.16b}, [OUT]
56162306a36Sopenharmony_ciST5(	mov		v3.16b, v4.16b			)
56262306a36Sopenharmony_ci	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	/* v10 = shift index for the keystream; v11 = the following 16 table
	 * bytes, turned into a byte mask by the arithmetic shift below (0xff
	 * entries stay all-ones, index entries 0..15 become zero) */
56362306a36Sopenharmony_ci	ld1		{v10.16b-v11.16b}, [x9]
56462306a36Sopenharmony_ci	tbl		v3.16b, {v3.16b}, v10.16b
56562306a36Sopenharmony_ci	sshr		v11.16b, v11.16b, #7
56662306a36Sopenharmony_ci	eor		v5.16b, v5.16b, v3.16b
	/* where the mask is zero, take the original output byte from v6 */
56762306a36Sopenharmony_ci	bif		v5.16b, v6.16b, v11.16b
56862306a36Sopenharmony_ci	st1		{v5.16b}, [OUT]
56962306a36Sopenharmony_ci	b		.Lctrout\xctr
57062306a36Sopenharmony_ci
57162306a36Sopenharmony_ci	// Arguments
57262306a36Sopenharmony_ci	.unreq OUT
57362306a36Sopenharmony_ci	.unreq IN
57462306a36Sopenharmony_ci	.unreq KEY
57562306a36Sopenharmony_ci	.unreq ROUNDS_W
57662306a36Sopenharmony_ci	.unreq BYTES_W
57762306a36Sopenharmony_ci	.unreq IV
57862306a36Sopenharmony_ci	.unreq BYTE_CTR_W	// XCTR only
57962306a36Sopenharmony_ci	// Intermediate values
58062306a36Sopenharmony_ci	.unreq CTR_W		// XCTR only
58162306a36Sopenharmony_ci	.unreq CTR		// XCTR only
58262306a36Sopenharmony_ci	.unreq IV_PART
58362306a36Sopenharmony_ci	.unreq BLOCKS
58462306a36Sopenharmony_ci	.unreq BLOCKS_W
58562306a36Sopenharmony_ci.endm
58662306a36Sopenharmony_ci
58762306a36Sopenharmony_ci	/*
58862306a36Sopenharmony_ci	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
58962306a36Sopenharmony_ci	 *		   int bytes, u8 ctr[])
59062306a36Sopenharmony_ci	 *
59162306a36Sopenharmony_ci	 * The input and output buffers must always be at least 16 bytes even if
59262306a36Sopenharmony_ci	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
59362306a36Sopenharmony_ci	 * accesses will occur.  The data to be encrypted/decrypted is expected
59462306a36Sopenharmony_ci	 * to be at the end of this 16-byte temporary buffer rather than the
59562306a36Sopenharmony_ci	 * start.
59662306a36Sopenharmony_ci	 */
59762306a36Sopenharmony_ci
/*
 * Instantiates the ctr_encrypt macro with xctr = 0, which emits the CTR code
 * paths: the low 64 bits of the counter are incremented per block and the
 * updated counter block is stored back to ctr[] before returning.  The macro
 * body contains its own frame handling and ret.
 */
59862306a36Sopenharmony_ciAES_FUNC_START(aes_ctr_encrypt)
59962306a36Sopenharmony_ci	ctr_encrypt 0
60062306a36Sopenharmony_ciAES_FUNC_END(aes_ctr_encrypt)
60162306a36Sopenharmony_ci
60262306a36Sopenharmony_ci	/*
60362306a36Sopenharmony_ci	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
60462306a36Sopenharmony_ci	 *		   int bytes, u8 const iv[], int byte_ctr)
60562306a36Sopenharmony_ci	 *
60662306a36Sopenharmony_ci	 * The input and output buffers must always be at least 16 bytes even if
60762306a36Sopenharmony_ci	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
60862306a36Sopenharmony_ci	 * accesses will occur.  The data to be encrypted/decrypted is expected
60962306a36Sopenharmony_ci	 * to be at the end of this 16-byte temporary buffer rather than the
61062306a36Sopenharmony_ci	 * start.
61162306a36Sopenharmony_ci	 */
61262306a36Sopenharmony_ci
/*
 * Instantiates the ctr_encrypt macro with xctr = 1, which emits the XCTR code
 * paths: byte_ctr (w6) is divided by 16 to obtain the starting block count,
 * which is XORed into the first 64 bits of the IV for each block.  The IV is
 * not written back.  The macro body contains its own frame handling and ret.
 */
61362306a36Sopenharmony_ciAES_FUNC_START(aes_xctr_encrypt)
61462306a36Sopenharmony_ci	ctr_encrypt 1
61562306a36Sopenharmony_ciAES_FUNC_END(aes_xctr_encrypt)
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ci
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
62462306a36Sopenharmony_ci
62562306a36Sopenharmony_ci	.macro		next_tweak, out, in, tmp
62662306a36Sopenharmony_ci	sshr		\tmp\().2d,  \in\().2d,   #63
62762306a36Sopenharmony_ci	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
62862306a36Sopenharmony_ci	add		\out\().2d,  \in\().2d,   \in\().2d
62962306a36Sopenharmony_ci	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
63062306a36Sopenharmony_ci	eor		\out\().16b, \out\().16b, \tmp\().16b
63162306a36Sopenharmony_ci	.endm
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci	.macro		xts_load_mask, tmp
63462306a36Sopenharmony_ci	movi		xtsmask.2s, #0x1
63562306a36Sopenharmony_ci	movi		\tmp\().2s, #0x87
63662306a36Sopenharmony_ci	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
63762306a36Sopenharmony_ci	.endm
63862306a36Sopenharmony_ci
AES_FUNC_START(aes_xts_encrypt)
	/*
	 * Arguments, in the order of the C prototype
	 * aes_xts_encrypt(out, in, rk1, rounds, bytes, rk2, iv, first):
	 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes,
	 *   x5 = rk2, x6 = iv,  w7 = first
	 *
	 * Vector registers:
	 *   v0-v3	data blocks
	 *   v4		current tweak
	 *   v5-v7	tweaks for blocks 2-4 of a 4-block group
	 *   v8		scratch for next_tweak and the mask macros
	 *
	 * iv[] supplies the tweak on entry and receives the tweak of the last
	 * block processed on exit.  When first == 0 the tweak read from iv[]
	 * is advanced once before it is used.
	 *
	 * A stack frame is set up because the 4-way helper is reached with bl.
	 */
	frame_push	0

	ld1		{v4.16b}, [x6]			/* v4 = tweak from iv[] */
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	/* first != 0: turn iv[] into the first tweak by encrypting with rk2 */
	enc_prepare	w3, x5, x8
	/*
	 * xts_cts_skip_tw is defined outside this excerpt.  It is handed w7
	 * and a label, so presumably it can branch there to skip the tweak
	 * encryption -- confirm against its definition.
	 */
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8			/* data uses rk1 (x2) */
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x			/* fewer than 64 bytes left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	/* XOR each block with its own tweak before ... */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	/* ... and after the block cipher */
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsencret			/* input exhausted */
	/*
	 * xts_reload_mask is defined outside this excerpt; presumably it
	 * restores xtsmask where the 4-way helper may have clobbered it.
	 */
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64			/* undo the subs above */
	/*
	 * w4 can only be zero here if bytes was zero on entry, because the
	 * 4-way loop leaves through cbz when it consumes everything.  No
	 * block has been produced in v0 in that case, so only write the tweak
	 * back instead of storing v0 to out[], as aes_xts_decrypt does.
	 */
	beq		.Lxtsencret
	subs		w4, w4, #16
	bmi		.LxtsencctsNx			/* only a partial block left */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout			/* that was the last block */
	subs		w4, w4, #16
	/* next_tweak is SIMD only, so bmi still tests the subs above */
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts			/* partial block follows */
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]			/* write tweak back to iv[] */
	frame_pop
	ret

	/*
	 * Ciphertext stealing.  On entry to .Lxtsenccts:
	 *   v0 = ciphertext of the last full block (not yet stored at x0)
	 *   v4 = tweak for the final, stolen block
	 *   w4 = (bytes in the partial block) - 16, i.e. negative
	 *
	 * .LxtsencctsNx handles the case where the last full block was the
	 * fourth block of a 4-way group: its ciphertext is still in v3 and
	 * has already been stored, so x0 is moved back onto it.
	 */
.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	/* .Lcts_permute_table is defined outside this excerpt */
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4		/* x8 = table + #bytes */
	sub		x9, x9, x4		/* x9 = table + 32 - #bytes */
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	/* v2 = bytes of v0 permuted for the partial output block */
	tbl		v2.16b, {v0.16b}, v2.16b
	/* v0 = input tail merged with the bytes of v0 that tbx leaves alone */
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* final block: exit after it */
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
72462306a36Sopenharmony_ci
AES_FUNC_START(aes_xts_decrypt)
	/*
	 * Arguments, in the order of the C prototype
	 * aes_xts_decrypt(out, in, rk1, rounds, bytes, rk2, iv, first):
	 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes,
	 *   x5 = rk2, x6 = iv,  w7 = first
	 *
	 * Vector registers:
	 *   v0-v3	data blocks
	 *   v4		current tweak
	 *   v5-v7	tweaks for blocks 2-4 of a 4-block group (v5 is also
	 *		the look-ahead tweak in the CTS path)
	 *   v8		scratch for next_tweak and the mask macros
	 *
	 * iv[] supplies the tweak on entry and receives v4 on exit.  When
	 * first == 0 the tweak read from iv[] is advanced once before use.
	 *
	 * A stack frame is set up because the 4-way helper is reached with bl.
	 */
	frame_push	0

	/*
	 * subtract 16 bytes if we are doing CTS: when bytes is not a multiple
	 * of 16, one full block is held back from the loops below and is
	 * handled together with the partial block in .Lxtsdeccts
	 */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]			/* v4 = tweak from iv[] */
	xts_load_mask	v8
	/*
	 * xts_cts_skip_tw is defined outside this excerpt.  It is handed w7
	 * and a label, so presumably it can branch there to skip the tweak
	 * encryption -- confirm against its definition.
	 */
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	/* first != 0: the tweak is always produced by *encrypting* with rk2 */
	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8			/* data uses rk1 (x2) */
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x			/* fewer than 64 bytes left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	/* XOR each block with its own tweak before ... */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	/* ... and after the block cipher */
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsdecout			/* input exhausted */
	/*
	 * xts_reload_mask is defined outside this excerpt; presumably it
	 * restores xtsmask where the 4-way helper may have clobbered it.
	 */
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64			/* undo the subs above */
	beq		.Lxtsdecout			/* nothing left to do */
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	/*
	 * ld1 leaves the flags alone, so this still tests the most recent
	 * subs: a negative count means v0 is the held-back full block and
	 * only the partial block follows it.
	 */
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout			/* that was the last block */
	subs		w4, w4, #16
	/* next_tweak is SIMD only, so the flags reach the bmi above intact */
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]			/* write tweak back to iv[] */
	frame_pop
	ret

	/*
	 * Ciphertext stealing.  On entry:
	 *   v0 = last full ciphertext block, x1 already advanced past it
	 *   v4 = current tweak
	 *   w4 = (bytes in the partial block) - 16, i.e. negative
	 *
	 * The tweak order is swapped relative to the block order: v0 is
	 * decrypted here with the *following* tweak (v5), and the block that
	 * is rebuilt from the input tail is decrypted with v4 afterwards at
	 * .Lxtsdecctsout.
	 */
.Lxtsdeccts:
	/* .Lcts_permute_table is defined outside this excerpt */
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4		/* x8 = table + #bytes */
	sub		x9, x9, x4		/* x9 = table + 32 - #bytes */
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	/* v2 = bytes of v0 permuted for the partial output block */
	tbl		v2.16b, {v0.16b}, v2.16b
	/* v0 = input tail merged with the bytes of v0 that tbx leaves alone */
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* final block: exit after it */
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
81762306a36Sopenharmony_ci
81862306a36Sopenharmony_ci	/*
81962306a36Sopenharmony_ci	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
82062306a36Sopenharmony_ci	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
82162306a36Sopenharmony_ci	 */
82262306a36Sopenharmony_ciAES_FUNC_START(aes_mac_update)
82362306a36Sopenharmony_ci	ld1		{v0.16b}, [x4]			/* get dg */
82462306a36Sopenharmony_ci	enc_prepare	w2, x1, x7
82562306a36Sopenharmony_ci	cbz		w5, .Lmacloop4x
82662306a36Sopenharmony_ci
82762306a36Sopenharmony_ci	encrypt_block	v0, w2, x1, x7, w8
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci.Lmacloop4x:
83062306a36Sopenharmony_ci	subs		w3, w3, #4
83162306a36Sopenharmony_ci	bmi		.Lmac1x
83262306a36Sopenharmony_ci	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
83362306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
83462306a36Sopenharmony_ci	encrypt_block	v0, w2, x1, x7, w8
83562306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v2.16b
83662306a36Sopenharmony_ci	encrypt_block	v0, w2, x1, x7, w8
83762306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v3.16b
83862306a36Sopenharmony_ci	encrypt_block	v0, w2, x1, x7, w8
83962306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v4.16b
84062306a36Sopenharmony_ci	cmp		w3, wzr
84162306a36Sopenharmony_ci	csinv		x5, x6, xzr, eq
84262306a36Sopenharmony_ci	cbz		w5, .Lmacout
84362306a36Sopenharmony_ci	encrypt_block	v0, w2, x1, x7, w8
84462306a36Sopenharmony_ci	st1		{v0.16b}, [x4]			/* return dg */
84562306a36Sopenharmony_ci	cond_yield	.Lmacout, x7, x8
84662306a36Sopenharmony_ci	b		.Lmacloop4x
84762306a36Sopenharmony_ci.Lmac1x:
84862306a36Sopenharmony_ci	add		w3, w3, #4
84962306a36Sopenharmony_ci.Lmacloop:
85062306a36Sopenharmony_ci	cbz		w3, .Lmacout
85162306a36Sopenharmony_ci	ld1		{v1.16b}, [x0], #16		/* get next pt block */
85262306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci	subs		w3, w3, #1
85562306a36Sopenharmony_ci	csinv		x5, x6, xzr, eq
85662306a36Sopenharmony_ci	cbz		w5, .Lmacout
85762306a36Sopenharmony_ci
85862306a36Sopenharmony_ci.Lmacenc:
85962306a36Sopenharmony_ci	encrypt_block	v0, w2, x1, x7, w8
86062306a36Sopenharmony_ci	b		.Lmacloop
86162306a36Sopenharmony_ci
86262306a36Sopenharmony_ci.Lmacout:
86362306a36Sopenharmony_ci	st1		{v0.16b}, [x4]			/* return dg */
86462306a36Sopenharmony_ci	mov		w0, w3
86562306a36Sopenharmony_ci	ret
86662306a36Sopenharmony_ciAES_FUNC_END(aes_mac_update)
867