162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * AES CTR mode by8 optimization with AVX instructions. (x86_64)
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright(c) 2014 Intel Corporation.
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Contact Information:
862306a36Sopenharmony_ci * James Guilford <james.guilford@intel.com>
962306a36Sopenharmony_ci * Sean Gulley <sean.m.gulley@intel.com>
1062306a36Sopenharmony_ci * Chandramouli Narayanan <mouli@linux.intel.com>
1162306a36Sopenharmony_ci */
1262306a36Sopenharmony_ci/*
1362306a36Sopenharmony_ci * This is AES128/192/256 CTR mode optimization implementation. It requires
1462306a36Sopenharmony_ci * the support of Intel(R) AESNI and AVX instructions.
1562306a36Sopenharmony_ci *
1662306a36Sopenharmony_ci * This work was inspired by the AES CTR mode optimization published
1762306a36Sopenharmony_ci * in Intel Optimized IPSEC Cryptographic library.
1862306a36Sopenharmony_ci * Additional information on it can be found at:
1962306a36Sopenharmony_ci *    https://github.com/intel/intel-ipsec-mb
2062306a36Sopenharmony_ci */
2162306a36Sopenharmony_ci
2262306a36Sopenharmony_ci#include <linux/linkage.h>
2362306a36Sopenharmony_ci
/* Unaligned 128-bit move; used for every access to the in/out buffers. */
#define VMOVDQ		vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
/* xdata0-7: the (up to eight) blocks that do_aes processes in parallel. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* %xmm8 and %xmm9 have a different role in each mode. */
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xtmp		%xmm9	// XCTR mode only
/*
 * Round keys kept resident between do_aes invocations.  For KEY_128 they
 * hold round keys 0, 3, 6 and 9; for KEY_192/KEY_256 they hold round keys
 * 0, 4, 8 and 12 (see the loads in do_aes_ctrmain).
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/*
 * Scratch registers: reloaded with the remaining round keys inside do_aes,
 * then reused to hold input blocks for the final XOR.
 */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Integer registers, in the argument order of the C prototypes below. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only
/* Scratch register holding the leftover-block byte count in do_aes_ctrmain. */
#define tmp		%r10
/* Selector values for the club macro; club only acts on XDATA. */
#define	DDQ_DATA	0
#define	XDATA		1
/*
 * Key-size selectors passed as the key_len macro argument.  Their numeric
 * values are also pasted into the local label names of do_aes_ctrmain.
 */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
6262306a36Sopenharmony_ci
.section .rodata
.align 16

/* vpshufb mask that reverses the byte order of a 128-bit value. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Low 64 bits set; used with vptest to detect a low qword that became 0. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* 1 in the high qword: the carry added when the low qword wraps to 0. */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * ddq_add_1 .. ddq_add_8 form a contiguous table of 16-byte entries.
 * do_aes addresses it as (ddq_add_1 + 16 * k), so the entries must stay
 * adjacent and in ascending order.
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text
9062306a36Sopenharmony_ci
/*
 * setxdata n
 * Bind the assembler symbol var_xdata to the register %xmm<n>, so that
 * following instructions can name a block register chosen by a loop index.
 * Invoked through club, which passes n as an already-evaluated number.
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm
9762306a36Sopenharmony_ci
/*
 * club name, id
 * Club the numeric 'id' to the symbol 'name'.
 *
 * When name is XDATA, var_xdata is pointed at %xmm<id>.  Any other name is
 * silently ignored.  .altmacro is enabled only around the body so that
 * "%\id" hands setxdata the numeric value of the expression (e.g. of the
 * loop symbol i) rather than its text; .noaltmacro restores normal macro
 * parsing afterwards.
 */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
10762306a36Sopenharmony_ci
/*
 * aes_blocks_op first, insn, src
 *
 * Emit "\insn \src, blk, blk" once for each block register
 * xdata<first> .. xdata<by - 1>, in ascending order.  Uses the assembler
 * symbol "by" set by do_aes and leaves the symbol "i" equal to by.
 */
.macro aes_blocks_op first, insn, src
	.set i, \first
	.rept (by - \first)
		club XDATA, i
		\insn	\src, var_xdata, var_xdata
		.set i, (i + 1)
	.endr
.endm

/*
 * do_aes b, k, key_len, xctr
 *   b        number of blocks processed in parallel (the callers use 1..8)
 *   k        non-zero: (re)load the resident round keys xkey0/4/8/12
 *   key_len  KEY_128, KEY_192 or KEY_256
 *   xctr     0 for CTR mode, non-zero for XCTR mode
 *
 * Builds b counter blocks in xdata0..xdata<b-1>, encrypts them, XORs them
 * with b input blocks and stores the result at p_out.
 * This increments p_in, but not p_out.
 *
 * Round-key loads are placed between the AES rounds exactly where the
 * original straight-line code had them; only the per-round loops are
 * expressed through aes_blocks_op.
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
		/* XCTR: block n = iv ^ (counter + n), n = 1..by, no byte swap */
		movq counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i + 1)
		.endr
		aes_blocks_op 0, vpxor, xiv
	.else
		/*
		 * CTR: xcounter is held byte-swapped so vpaddq on the low
		 * qword increments it.  Block 0 is the current counter; block
		 * i is counter + i, with a carry into the high qword whenever
		 * the low qword became 0.  The carry is also applied to
		 * xcounter itself so later blocks inherit it.
		 */
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			vptest	ddq_low_msk(%rip), var_xdata
			jnz 1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
			1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.endif

	vmovdqa	1*16(p_keys), xkeyA

	/* key 0: block 0 first, the remaining blocks after the counter bump */
	vpxor	xkey0, xdata0, xdata0
	.if \xctr
		add $by, counter
	.else
		/* advance the counter by 'by' blocks, propagating the carry */
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
	.endif
	aes_blocks_op 1, vpxor, xkey0

	vmovdqa	2*16(p_keys), xkeyB

	/* key 1 */
	aes_blocks_op 0, vaesenc, xkeyA

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	/* key 2 */
	aes_blocks_op 0, vaesenc, xkeyB

	add	$(16*by), p_in

	/* key 3 (preceded by the load of key 4) */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
		aes_blocks_op 0, vaesenc, xkey4
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
		aes_blocks_op 0, vaesenc, xkeyA
	.endif

	vmovdqa	5*16(p_keys), xkeyA

	/* key 4 (followed by the load of key 6) */
	.if (klen == KEY_128)
		aes_blocks_op 0, vaesenc, xkeyB
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		aes_blocks_op 0, vaesenc, xkey4
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	/* key 5 */
	aes_blocks_op 0, vaesenc, xkeyA

	vmovdqa	7*16(p_keys), xkeyA

	/* key 6 (followed by the load of key 8) */
	.if (klen == KEY_128)
		aes_blocks_op 0, vaesenc, xkey8
		vmovdqa	8*16(p_keys), xkeyB
	.else
		aes_blocks_op 0, vaesenc, xkeyB
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	/* key 7 */
	aes_blocks_op 0, vaesenc, xkeyA

	/* key 8 (preceded by the load of key 9) */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
		aes_blocks_op 0, vaesenc, xkeyB
	.else
		vmovdqa	9*16(p_keys), xkeyA
		aes_blocks_op 0, vaesenc, xkey8
	.endif

	vmovdqa	10*16(p_keys), xkeyB

	.if (klen == KEY_128)
		/* key 9 */
		aes_blocks_op 0, vaesenc, xkey12
		/* key 10: last round for AES-128 */
		aes_blocks_op 0, vaesenclast, xkeyB
	.else
		/* key 9 */
		aes_blocks_op 0, vaesenc, xkeyA

		vmovdqa	11*16(p_keys), xkeyA

		/* key 10 */
		aes_blocks_op 0, vaesenc, xkeyB

		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		/* key 11 */
		aes_blocks_op 0, vaesenc, xkeyA

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA

			/* key 12 */
			aes_blocks_op 0, vaesenc, xkey12

			vmovdqa	14*16(p_keys), xkeyB

			/* key 13 */
			aes_blocks_op 0, vaesenc, xkeyA

			/* key 14: last round for AES-256 */
			aes_blocks_op 0, vaesenclast, xkeyB
		.else
			/* key 12: last round for AES-192 */
			aes_blocks_op 0, vaesenclast, xkey12
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks per step.  p_in was
	 * already advanced past these blocks, hence the negative offsets.
	 * xkeyA/xkeyB are free again and serve as temporaries.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	/* odd block count: one block is still pending */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* write all result blocks; p_out itself is advanced by the caller */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm
39962306a36Sopenharmony_ci
/*
 * do_aes_load val, key_len, xctr
 * do_aes for val blocks with load_keys = 1, i.e. the resident round keys
 * (xkey0/4/8/12) are loaded from p_keys as part of the expansion.
 */
.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm
40362306a36Sopenharmony_ci
/*
 * do_aes_noload val, key_len, xctr
 * do_aes for val blocks with load_keys = 0: xkey0/4/8/12 must already hold
 * the resident round keys; only the scratch keys are loaded.
 */
.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm
40762306a36Sopenharmony_ci
/*
 * do_aes_ctrmain key_len, xctr
 *
 * Main body of the AES CTR/XCTR routines, expanded once per exported entry
 * point.  key_len and xctr are also pasted into every local label so that
 * the expansions do not collide.
 *
 * Flow: the (num_bytes / 16) % 8 leftover blocks are handled first by a
 * single do_aes_load of that width (which also loads the resident round
 * keys), then the remainder is consumed eight blocks at a time by the main
 * loop.
 *
 * num_bytes is expected to be a multiple of 16: the main loop subtracts
 * 128 per pass and only stops when the count reaches exactly zero.
 * NOTE(review): the prototypes in this file document num_bytes (and
 * byte_ctr) as unsigned int, yet the full 64-bit registers are used here;
 * presumably callers pass them zero-extended - confirm.
 *
 * A call with num_bytes < 16 returns without touching *p_iv.  In CTR mode
 * xcounter and xbyteswap have not been loaded at that point, so there is
 * no valid counter to write back.
 */
.macro do_aes_ctrmain key_len, xctr
	cmp	$16, num_bytes
	jb	.Lno_iv_update\xctr\key_len

	.if \xctr
		/* byte counter -> block counter */
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		/* keep the counter byte-swapped so vpaddq can increment it */
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* tmp = 16 * n, where 1 <= n <= 7 is the number of leftover blocks */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

	/*
	 * Each .LeqN stanza processes N blocks, advances p_out, drops the
	 * leftover from num_bytes and either returns or enters the main loop.
	 */
.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len


.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	/*
	 * No leftover stanza ran, so the resident round keys have not been
	 * loaded yet; do it here with the same assignment do_aes uses.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (8*16 bytes) here */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return updated IV */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
.Lno_iv_update\xctr\key_len:
	/* also the target of the num_bytes < 16 early exit: IV left untouched */
	RET
.endm
52062306a36Sopenharmony_ci
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments are read from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out)
 * and %r8 (num_bytes); %r10 is used as scratch.  keys is read with
 * vmovdqa and therefore must be 16-byte aligned; in, out and iv are
 * accessed with unaligned moves.  Once at least one block has been
 * processed, the advanced counter block is written back to *iv.
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* expand the shared main loop for KEY_128, CTR mode (xctr = 0) */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
53362306a36Sopenharmony_ci
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments are read from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out)
 * and %r8 (num_bytes); %r10 is used as scratch.  keys is read with
 * vmovdqa and therefore must be 16-byte aligned; in, out and iv are
 * accessed with unaligned moves.  Once at least one block has been
 * processed, the advanced counter block is written back to *iv.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* expand the shared main loop for KEY_192, CTR mode (xctr = 0) */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
54662306a36Sopenharmony_ci
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Arguments are read from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out)
 * and %r8 (num_bytes); %r10 is used as scratch.  keys is read with
 * vmovdqa and therefore must be 16-byte aligned; in, out and iv are
 * accessed with unaligned moves.  Once at least one block has been
 * processed, the advanced counter block is written back to *iv.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* expand the shared main loop for KEY_256, CTR mode (xctr = 0) */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
55962306a36Sopenharmony_ci
/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * Arguments are read from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out),
 * %r8 (num_bytes) and %r9 (byte_ctr); %r10 is used as scratch.  byte_ctr
 * is shifted right by 4 to obtain the starting block counter.  keys is
 * read with vmovdqa and therefore must be 16-byte aligned.  *iv is only
 * read in XCTR mode; nothing is written back to it.
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* expand the shared main loop for KEY_128, XCTR mode (xctr = 1) */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
57262306a36Sopenharmony_ci
/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * Arguments are read from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out),
 * %r8 (num_bytes) and %r9 (byte_ctr); %r10 is used as scratch.  byte_ctr
 * is shifted right by 4 to obtain the starting block counter.  keys is
 * read with vmovdqa and therefore must be 16-byte aligned.  *iv is only
 * read in XCTR mode; nothing is written back to it.
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* expand the shared main loop for KEY_192, XCTR mode (xctr = 1) */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
58562306a36Sopenharmony_ci
/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 * 	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * Arguments are read from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out),
 * %r8 (num_bytes) and %r9 (byte_ctr); %r10 is used as scratch.  byte_ctr
 * is shifted right by 4 to obtain the starting block counter.  keys is
 * read with vmovdqa and therefore must be 16-byte aligned.  *iv is only
 * read in XCTR mode; nothing is written back to it.
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* expand the shared main loop for KEY_256, XCTR mode (xctr = 1) */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
598