/*
 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>

/* Unaligned 128-bit move; used for every access to the in/out buffers. */
#define VMOVDQ		vmovdqu

/* Up to eight blocks are processed in parallel, one per xdataN register. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* Running counter: the IV after a byte reversal with xbyteswap. */
#define xcounter	%xmm8
/* Holds byteswap_const for the whole call. */
#define xbyteswap	%xmm9
/*
 * Round keys kept resident across main-loop iterations.  Which round key
 * each one holds depends on the key length (see do_aes).
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/*
 * Scratch registers: hold the remaining round keys as they are streamed
 * from memory, and later the input data that is XORed with the keystream.
 */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Arguments, in the order given by the prototypes at the end of the file. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

/* Scratch general-purpose register. */
#define tmp		%r10
/*
 * Selector values for the "club" macro.  Only XDATA is acted upon by the
 * macros visible in this file; DDQ_DATA is not referenced by them.
 */
#define	DDQ_DATA	0
#define	XDATA		1
/* Key-length selectors passed as the key_len macro argument. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb control mask that reverses the 16 bytes of an XMM register. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low 64 bits; used with vptest to detect a low-qword wrap to 0. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds 1 to the high 64 bits (carry out of the low qword). */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * Counter increments 1..8.  do_aes indexes this table as
 * ddq_add_1 + 16 * (n - 1), so the entries must stay contiguous and in
 * ascending order.
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/*
 * setxdata n
 * Bind the assembler symbol var_xdata to register %xmm<n>, so that code
 * inside a .rept loop can address "the n-th data register".
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club name, id
 * Club the numeric 'id' to the symbol 'name'.
 *
 * When name is XDATA, var_xdata is bound to %xmm<id>.  'id' may be an
 * assembler symbol such as a loop index: .altmacro mode is enabled so that
 * the %-prefix evaluates it to its current numeric value before it is
 * handed to setxdata.  Any other 'name' expands to nothing.
 */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out
 *
 * Encrypts 'b' consecutive counter values (1 <= b <= 8) and XORs the
 * result with 16*b bytes of input, storing 16*b bytes at p_out.
 *
 *   b        number of blocks processed in parallel (xdata0..xdata<b-1>)
 *   k        non-zero: (re)load the resident round keys xkey0, xkey4,
 *            xkey8 and xkey12 from p_keys; zero: assume they are loaded
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * On exit xcounter has advanced by b and p_in by 16*b.  xkeyA and xkeyB
 * are clobbered.  Round keys are read with vmovdqa, so p_keys must be
 * 16-byte aligned.
 *
 * Resident keys: for KEY_128, xkey4/xkey8/xkey12 hold round keys 3/6/9;
 * for KEY_192 and KEY_256 they hold round keys 4/8/12.  All other round
 * keys are streamed through xkeyA/xkeyB, each load being issued one round
 * ahead of its use.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter value, swapped back to IV order */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1..by-1 use counter + i.  vpaddq only adds within each
	 * 64-bit half, so the carry is propagated by hand: a low half of 0
	 * after the add means it just wrapped, in which case the high half
	 * of both this block and xcounter is bumped.  Because i grows by one
	 * per step the low half hits 0 at most once per invocation.
	 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0 (whitening) for block 0, then advance the counter by 'by' */
	vpxor	xkey0, xdata0, xdata0
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 (whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/*
	 * Advance the input pointer now; the input itself is read at the
	 * end of the macro through negative offsets from p_in.
	 */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for AES-128 only */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks at a time.  xkeyA and
	 * xkeyB are free now and serve as temporaries for the input data.
	 * p_in was already advanced, hence the "- 16*by" in the offsets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: handle the last block on its own */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the results; the caller is responsible for advancing p_out */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/*
 * do_aes_load val, key_len
 * Process 'val' blocks with do_aes, loading the resident round keys
 * (xkey0, xkey4, xkey8, xkey12) from p_keys along the way.
 */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/*
 * do_aes_noload val, key_len
 * Process 'val' blocks with do_aes, relying on the resident round keys
 * (xkey0, xkey4, xkey8, xkey12) having been loaded beforehand.
 */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/*
 * do_aes_ctrmain key_len
 * Main body of the AES-CTR routines; expands to a complete function body
 * including the final RET.
 *
 * Only whole 16-byte blocks are processed; any trailing num_bytes % 16
 * bytes are ignored.  The (num_bytes / 16) % 8 leading blocks are handled
 * first by a single do_aes_load of matching width, which also loads the
 * resident round keys; the rest is then processed eight blocks at a time
 * by the main loop.  On return the IV at p_iv has been advanced by the
 * number of blocks processed.
 *
 * If num_bytes < 16 nothing is processed and the IV is left untouched:
 * this path must not go through the IV write-back, because xbyteswap and
 * xcounter have not been loaded at that point.
 *
 * All labels are suffixed with key_len so that the macro can be expanded
 * once per key size in the same file.
 */

.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_exit2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = 16 * (number of whole blocks modulo 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

	/*
	 * Each .LeqN case below processes N blocks, then rounds num_bytes
	 * down to a multiple of 8 blocks (128 bytes) and either returns or
	 * enters the main loop with the resident keys already loaded.
	 */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No leading partial group was processed, so the resident round
	 * keys have to be loaded here, using the same layout as do_aes.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and >0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_exit2\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * in/out:    source and destination buffers (accessed unaligned)
 * iv:        16-byte counter block, read on entry and updated on return
 * keys:      expanded AES-128 round keys, read with aligned loads
 * num_bytes: only whole 16-byte blocks are processed
 *
 * NOTE(review): the prototype above declares num_bytes as unsigned int,
 * but the code compares and masks the full 64-bit %r8 - confirm that
 * callers always pass a zero-extended value.
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * in/out:    source and destination buffers (accessed unaligned)
 * iv:        16-byte counter block, read on entry and updated on return
 * keys:      expanded AES-192 round keys, read with aligned loads
 * num_bytes: only whole 16-byte blocks are processed
 *
 * NOTE(review): the prototype above declares num_bytes as unsigned int,
 * but the code compares and masks the full 64-bit %r8 - confirm that
 * callers always pass a zero-extended value.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * in/out:    source and destination buffers (accessed unaligned)
 * iv:        16-byte counter block, read on entry and updated on return
 * keys:      expanded AES-256 round keys, read with aligned loads
 * num_bytes: only whole 16-byte blocks are processed
 *
 * NOTE(review): the prototype above declares num_bytes as unsigned int,
 * but the code compares and masks the full 64-bit %r8 - confirm that
 * callers always pass a zero-extended value.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)