/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move double
 * quad aligned).  It doesn't make a performance difference which instruction
 * is used since Nehalem (original Core i7) was released.  However, the movaps
 * is a byte shorter, so that is the one we'll use for now. (same for
 * unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
/*
 * NOTE(review): the F_MIN_MASK literal below has 31 hex digits, so its most
 * significant nibble is 0.  Left untouched here - confirm this is intended.
 */
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
/*
 * Reason: GCM_ENC_DEC loads 16 bytes from SHIFT_MASK+16-n and from
 * ALL_F+16-n with 0 < n < 16, so each load runs over into the constant that
 * follows.  That is also why these three live in a plain (non-mergeable)
 * .rodata section.
 */
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000

.text


/* Bytes pushed by FUNC_SAVE (three 8-byte registers). */
#define	STACK_OFFSET    8*3

/*
 * Byte offsets of the fields in the context block that the GCM macros
 * address through %arg2.
 */
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

/*
 * Argument locations.  arg1..arg6 are register names used as %argN.
 * arg7 and up are complete memory operands relative to %rsp; they are only
 * valid after FUNC_SAVE has run, because STACK_OFFSET is folded in.
 */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
/* Memory operand at byte offset 2*15*16 (= 480) from the pointer in %arg1. */
#define keysize 2*15*16(%arg1)
#endif


/*
 * XMM register roles.  STATE and IN are aliases for the first of the
 * four-wide STATE1..4 / IN1..4 sets.
 */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* Same register as BSWAP_MASK: the two names cannot be live together. */
#define GF128MUL_MASK %xmm10

/*
 * General purpose register roles.  UKEYP aliases OUTP and TKEYP aliases T1
 * in both variants.  T2 / TCTR_LOW exist only in the 64-bit variant.
 */
#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

/*
 * FUNC_SAVE: push r12, r13 and r14 (in that order) so the GCM macros may use
 * them as scratch.  The three 8-byte pushes match STACK_OFFSET (8*3), which
 * the stack-argument operands add to their %rsp displacement.
 * Must be paired with FUNC_RESTORE.
 */
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


/*
 * FUNC_RESTORE: undo FUNC_SAVE by popping r14, r13 and r12, the reverse of
 * the push order.
 */
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey (SUBKEY holds a pointer to 16 bytes).
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# Stores HashKey, HashKey_2, HashKey_3, HashKey_4 and the matching
# HashKey_*_k Karatsuba values through %arg2.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	# Karatsuba helper: high qword XOR low qword (pshufd $78 swaps halves)
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Iv and SUBKEY hold pointers, AAD holds a pointer to the additional
# authenticated data and AADLEN its length in bytes.
# The context block is addressed through %arg2.
# Clobbers rax, r10-r13, xmm0-xmm7, xmm13 and xmm14
# (xmm7 is handed to PRECOMPUTE as a temporary, xmm14 is written by
# CALC_AAD_HASH).
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11d, %r11d
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	# the partial_block_enc_key slot is 16 bytes wide: clear both halves
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov %r11, PBlockEncKey+8(%arg2)
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	pshufb %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-swapped iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
#
# The operation argument is compared against the words enc and dec below.
# Registers: %arg2 = context block, %arg3 = output buffer, %arg4 = input
# buffer, %arg5 = byte count.  %r11 is the running byte offset into both
# buffers, %r13 the number of bytes still to be handled in the current phase.
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)

	xor %r11d, %r11d # initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks

	# r12 = (number of whole blocks mod 4) * 16; those blocks are done up
	# front so that the main loop always works on groups of four
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	test	%r13, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	# byte counts are unsigned, hence jae rather than jge
	cmp	$16, %arg5
	jae _large_enough_update_\@

	# fewer than 16 bytes in this call: read them one piece at a time so
	# that nothing outside the input buffer is touched
	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	# point r11 at the final 16 bytes of the input for one unaligned load
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	# restore r11 to the offset of the partial block
	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb  %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	# keep a copy of the ciphertext: it is what gets hashed when decrypting
	movdqa  %xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10,%xmm0

	pxor	%xmm0, %xmm8
.endif

	# the GHASH multiply for this partial block is deferred: only the XOR
	# is stored here (GCM_COMPLETE multiplies when PBlockLen is non-zero)
	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb %xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	movq %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# AUTHTAG holds the destination pointer, AUTHTAGLEN the number of tag bytes
# to store.  Exactly AUTHTAGLEN bytes are written for lengths 0..16.
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	test %r12, %r12
	je _partial_done\@

	# a partial block was XORed into the hash but not multiplied yet
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	# movq, not movd: len(A) is a 64-bit field and must not be truncated
	movq	%r12, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits
	movq    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	# lengths are unsigned, so the unsigned branch family is used below
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jb	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_4_\@:
	# fewer than 4 bytes left: a 4-byte store would overrun the tag buffer
	cmp	$4, %r11
	jb	_T_123_\@
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_123_\@:
	# 0..3 bytes remain here; store nothing for 0
	test	%r11, %r11
	je	_return_T_done_\@
	movd	%xmm0, %eax
	cmp	$2, %r11
	jb	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	shr	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* GH is updated in place and holds the result.  HK is only read.
* TMP1..TMP5 are clobbered.
*
* The 128x128 carry-less product is built Karatsuba style from three
* pclmulqdq instructions (a1*b1, a0*b0 and (a1+a0)*(b1+b0)) and then reduced
* in two shift-and-xor phases.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
#
# Only bytes inside DPTR .. DPTR+DLEN-1 are read.  The bytes land in XMMDst
# in memory order (byte 0 in the lowest lane); lanes at and above DLEN
# are zero.
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN		# lengths are unsigned: jb, not jl
	jb	_read_lt8_\@
	mov	(\DPTR), %rax		# first 8 bytes in one load
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	# walk the remaining bytes from the last one down to DPTR+8, shifting
	# as we go so that the byte at DPTR+8 ends up in the low byte of rax
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1		# move into the high qword
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	# same backwards walk, from DPTR+DLEN-1 down to DPTR
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
_done_read_partial_block_\@:
.endm

5968c2ecf20Sopenharmony_ci# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
5978c2ecf20Sopenharmony_ci# clobbers r10-11, xmm14, rax (via READ_PARTIAL_BLOCK), TMP1-TMP7 and flags
#
# HASHKEY : xmm register holding the hash key passed to GHASH_MUL
# AAD     : pointer to the additional authenticated data
# AADLEN  : length of the AAD in bytes
# TMP1-5  : xmm scratch registers handed to GHASH_MUL
# TMP6    : running hash; its final value is stored to AadHash(%arg2)
# TMP7    : holds the current (byte-reflected) AAD block
#
# Full 16-byte blocks are hashed first; a trailing block of 1-15 bytes is
# read with READ_PARTIAL_BLOCK (zero padded, no over-read) and hashed last.
5988c2ecf20Sopenharmony_ci.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
5998c2ecf20Sopenharmony_ci	TMP6 TMP7
6008c2ecf20Sopenharmony_ci	MOVADQ	   SHUF_MASK(%rip), %xmm14
6018c2ecf20Sopenharmony_ci	mov	   \AAD, %r10		# %r10 = AAD
6028c2ecf20Sopenharmony_ci	mov	   \AADLEN, %r11		# %r11 = aadLen
6038c2ecf20Sopenharmony_ci	pxor	   \TMP7, \TMP7
6048c2ecf20Sopenharmony_ci	pxor	   \TMP6, \TMP6		# running hash starts at zero
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_ci	cmp	   $16, %r11
6078c2ecf20Sopenharmony_ci	jl	   _get_AAD_rest\@		# no full block available
6088c2ecf20Sopenharmony_ci_get_AAD_blocks\@:
6098c2ecf20Sopenharmony_ci	movdqu	   (%r10), \TMP7
6108c2ecf20Sopenharmony_ci	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
6118c2ecf20Sopenharmony_ci	pxor	   \TMP7, \TMP6		# hash ^= block
6128c2ecf20Sopenharmony_ci	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
6138c2ecf20Sopenharmony_ci	add	   $16, %r10
6148c2ecf20Sopenharmony_ci	sub	   $16, %r11
6158c2ecf20Sopenharmony_ci	cmp	   $16, %r11
6168c2ecf20Sopenharmony_ci	jge	   _get_AAD_blocks\@
6178c2ecf20Sopenharmony_ci
	# NOTE(review): TMP7 is overwritten below whenever a tail exists and is
	# not read on the no-tail path, so this copy only matters if a caller
	# reads TMP7 after the macro - confirm against the call sites.
6188c2ecf20Sopenharmony_ci	movdqu	   \TMP6, \TMP7
6198c2ecf20Sopenharmony_ci
6208c2ecf20Sopenharmony_ci	/* read the last <16B of AAD */
6218c2ecf20Sopenharmony_ci_get_AAD_rest\@:
6228c2ecf20Sopenharmony_ci	test	   %r11, %r11
6238c2ecf20Sopenharmony_ci	je	   _get_AAD_done\@		# AAD length was a multiple of 16
6248c2ecf20Sopenharmony_ci
	# TMP1 doubles as the XMM1 scratch of READ_PARTIAL_BLOCK, %r11 ends at 0
6258c2ecf20Sopenharmony_ci	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
6268c2ecf20Sopenharmony_ci	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
6278c2ecf20Sopenharmony_ci	pxor	   \TMP6, \TMP7		# fold the running hash into the tail block
6288c2ecf20Sopenharmony_ci	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
6298c2ecf20Sopenharmony_ci	movdqu \TMP7, \TMP6
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci_get_AAD_done\@:
6328c2ecf20Sopenharmony_ci	movdqu \TMP6, AadHash(%arg2)	# publish the AAD hash in the context
6338c2ecf20Sopenharmony_ci.endm
6348c2ecf20Sopenharmony_ci
6358c2ecf20Sopenharmony_ci# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
6368c2ecf20Sopenharmony_ci# between update calls.
6378c2ecf20Sopenharmony_ci# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
6388c2ecf20Sopenharmony_ci# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
6398c2ecf20Sopenharmony_ci# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
#
# CYPH_PLAIN_OUT : output buffer pointer
# PLAIN_CYPH_IN  : input buffer pointer
# PLAIN_CYPH_LEN : number of input bytes available
# DATA_OFFSET    : register holding the current offset; advanced by the
#                  number of bytes written
# AAD_HASH       : xmm register holding the running hash; also written back
#                  to AadHash(%arg2)
# operation      : "dec" selects the decrypt path, anything else encrypts
#
# Does nothing when PBlockLen(%arg2) is zero.  Otherwise the saved keystream
# block PBlockEncKey(%arg2) is applied to the new input.  When the 16-byte
# block becomes complete the hash is multiplied by HashKey and PBlockLen is
# reset to 0, else PBlockLen grows by PLAIN_CYPH_LEN.
6408c2ecf20Sopenharmony_ci.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
6418c2ecf20Sopenharmony_ci	AAD_HASH operation
6428c2ecf20Sopenharmony_ci	mov 	PBlockLen(%arg2), %r13
6438c2ecf20Sopenharmony_ci	test	%r13, %r13
6448c2ecf20Sopenharmony_ci	je	_partial_block_done_\@	# Leave Macro if no partial blocks
6458c2ecf20Sopenharmony_ci	# Read in input data without over reading
6468c2ecf20Sopenharmony_ci	cmp	$16, \PLAIN_CYPH_LEN
6478c2ecf20Sopenharmony_ci	jl	_fewer_than_16_bytes_\@
	# NOTE(review): this load does not add DATA_OFFSET, unlike the short
	# path below - presumably DATA_OFFSET is 0 on entry, confirm at callers
6488c2ecf20Sopenharmony_ci	movups	(\PLAIN_CYPH_IN), %xmm1	# If 16 bytes or more, just fill xmm
6498c2ecf20Sopenharmony_ci	jmp	_data_read_\@
6508c2ecf20Sopenharmony_ci
6518c2ecf20Sopenharmony_ci_fewer_than_16_bytes_\@:
6528c2ecf20Sopenharmony_ci	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
6538c2ecf20Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r12
6548c2ecf20Sopenharmony_ci	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
6558c2ecf20Sopenharmony_ci
	# NOTE(review): READ_PARTIAL_BLOCK with these arguments touches only
	# rax, r12, xmm0 and xmm1, so this reload of r13 looks redundant
6568c2ecf20Sopenharmony_ci	mov PBlockLen(%arg2), %r13
6578c2ecf20Sopenharmony_ci
6588c2ecf20Sopenharmony_ci_data_read_\@:				# Finished reading in data
6598c2ecf20Sopenharmony_ci
6608c2ecf20Sopenharmony_ci	movdqu	PBlockEncKey(%arg2), %xmm9
6618c2ecf20Sopenharmony_ci	movdqu	HashKey(%arg2), %xmm13
6628c2ecf20Sopenharmony_ci
6638c2ecf20Sopenharmony_ci	lea	SHIFT_MASK(%rip), %r12
6648c2ecf20Sopenharmony_ci
6658c2ecf20Sopenharmony_ci	# adjust the shuffle mask pointer to be able to shift r13 bytes
6668c2ecf20Sopenharmony_ci	# (16-r13 is the number of bytes in plaintext mod 16)
6678c2ecf20Sopenharmony_ci	add	%r13, %r12
6688c2ecf20Sopenharmony_ci	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
6698c2ecf20Sopenharmony_ci	pshufb	%xmm2, %xmm9		# shift right r13 bytes
6708c2ecf20Sopenharmony_ci
6718c2ecf20Sopenharmony_ci.ifc \operation, dec
6728c2ecf20Sopenharmony_ci	movdqa	%xmm1, %xmm3		# keep the ciphertext for the hash
6738c2ecf20Sopenharmony_ci	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
6748c2ecf20Sopenharmony_ci
6758c2ecf20Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r10
6768c2ecf20Sopenharmony_ci	add	%r13, %r10
6778c2ecf20Sopenharmony_ci	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
6788c2ecf20Sopenharmony_ci	sub	$16, %r10
6798c2ecf20Sopenharmony_ci	# Determine if partial block is not being filled and
6808c2ecf20Sopenharmony_ci	# shift mask accordingly
6818c2ecf20Sopenharmony_ci	jge	_no_extra_mask_1_\@
	# r10 < 0 here: move the mask pointer up by the number of missing bytes
6828c2ecf20Sopenharmony_ci	sub	%r10, %r12
6838c2ecf20Sopenharmony_ci_no_extra_mask_1_\@:
6848c2ecf20Sopenharmony_ci
6858c2ecf20Sopenharmony_ci	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
6868c2ecf20Sopenharmony_ci	# get the appropriate mask to mask out bottom r13 bytes of xmm9
6878c2ecf20Sopenharmony_ci	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
6888c2ecf20Sopenharmony_ci
	# Hash the (masked, byte-reflected, shifted) ciphertext, not the output
6898c2ecf20Sopenharmony_ci	pand	%xmm1, %xmm3
6908c2ecf20Sopenharmony_ci	movdqa	SHUF_MASK(%rip), %xmm10
6918c2ecf20Sopenharmony_ci	pshufb	%xmm10, %xmm3
6928c2ecf20Sopenharmony_ci	pshufb	%xmm2, %xmm3
6938c2ecf20Sopenharmony_ci	pxor	%xmm3, \AAD_HASH
6948c2ecf20Sopenharmony_ci
6958c2ecf20Sopenharmony_ci	test	%r10, %r10
6968c2ecf20Sopenharmony_ci	jl	_partial_incomplete_1_\@
6978c2ecf20Sopenharmony_ci
6988c2ecf20Sopenharmony_ci	# Block is now complete: GHASH computation for the filled block
6998c2ecf20Sopenharmony_ci	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
7008c2ecf20Sopenharmony_ci	xor	%eax, %eax
7018c2ecf20Sopenharmony_ci
7028c2ecf20Sopenharmony_ci	mov	%rax, PBlockLen(%arg2)	# no partial block pending any more
7038c2ecf20Sopenharmony_ci	jmp	_dec_done_\@
7048c2ecf20Sopenharmony_ci_partial_incomplete_1_\@:
	# Block still incomplete: remember how many more bytes it now holds
7058c2ecf20Sopenharmony_ci	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
7068c2ecf20Sopenharmony_ci_dec_done_\@:
7078c2ecf20Sopenharmony_ci	movdqu	\AAD_HASH, AadHash(%arg2)
7088c2ecf20Sopenharmony_ci.else
7098c2ecf20Sopenharmony_ci	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
7108c2ecf20Sopenharmony_ci
7118c2ecf20Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r10
7128c2ecf20Sopenharmony_ci	add	%r13, %r10
7138c2ecf20Sopenharmony_ci	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
7148c2ecf20Sopenharmony_ci	sub	$16, %r10
7158c2ecf20Sopenharmony_ci	# Determine if partial block is not being filled and
7168c2ecf20Sopenharmony_ci	# shift mask accordingly
7178c2ecf20Sopenharmony_ci	jge	_no_extra_mask_2_\@
	# r10 < 0 here: move the mask pointer up by the number of missing bytes
7188c2ecf20Sopenharmony_ci	sub	%r10, %r12
7198c2ecf20Sopenharmony_ci_no_extra_mask_2_\@:
7208c2ecf20Sopenharmony_ci
7218c2ecf20Sopenharmony_ci	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
7228c2ecf20Sopenharmony_ci	# get the appropriate mask to mask out bottom r13 bytes of xmm9
7238c2ecf20Sopenharmony_ci	pand	%xmm1, %xmm9
7248c2ecf20Sopenharmony_ci
	# Byte-reflect and shift the new ciphertext, then fold it into the hash
7258c2ecf20Sopenharmony_ci	movdqa	SHUF_MASK(%rip), %xmm1
7268c2ecf20Sopenharmony_ci	pshufb	%xmm1, %xmm9
7278c2ecf20Sopenharmony_ci	pshufb	%xmm2, %xmm9
7288c2ecf20Sopenharmony_ci	pxor	%xmm9, \AAD_HASH
7298c2ecf20Sopenharmony_ci
7308c2ecf20Sopenharmony_ci	test	%r10, %r10
7318c2ecf20Sopenharmony_ci	jl	_partial_incomplete_2_\@
7328c2ecf20Sopenharmony_ci
7338c2ecf20Sopenharmony_ci	# Block is now complete: GHASH computation for the filled block
7348c2ecf20Sopenharmony_ci	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
7358c2ecf20Sopenharmony_ci	xor	%eax, %eax
7368c2ecf20Sopenharmony_ci
7378c2ecf20Sopenharmony_ci	mov	%rax, PBlockLen(%arg2)	# no partial block pending any more
7388c2ecf20Sopenharmony_ci	jmp	_encode_done_\@
7398c2ecf20Sopenharmony_ci_partial_incomplete_2_\@:
	# Block still incomplete: remember how many more bytes it now holds
7408c2ecf20Sopenharmony_ci	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
7418c2ecf20Sopenharmony_ci_encode_done_\@:
7428c2ecf20Sopenharmony_ci	movdqu	\AAD_HASH, AadHash(%arg2)
7438c2ecf20Sopenharmony_ci
7448c2ecf20Sopenharmony_ci	movdqa	SHUF_MASK(%rip), %xmm10
7458c2ecf20Sopenharmony_ci	# shuffle xmm9 back to output as ciphertext
7468c2ecf20Sopenharmony_ci	pshufb	%xmm10, %xmm9
7478c2ecf20Sopenharmony_ci	pshufb	%xmm2, %xmm9
7488c2ecf20Sopenharmony_ci.endif
7498c2ecf20Sopenharmony_ci	# output the processed bytes held in xmm9 (both operations)
7508c2ecf20Sopenharmony_ci	test	%r10, %r10
7518c2ecf20Sopenharmony_ci	jl	_partial_fill_\@
	# Block was completed: write 16 - (old PBlockLen) bytes
7528c2ecf20Sopenharmony_ci	mov	%r13, %r12
7538c2ecf20Sopenharmony_ci	mov	$16, %r13
7548c2ecf20Sopenharmony_ci	# Set r13 to be the number of bytes to write out
7558c2ecf20Sopenharmony_ci	sub	%r12, %r13
7568c2ecf20Sopenharmony_ci	jmp	_count_set_\@
7578c2ecf20Sopenharmony_ci_partial_fill_\@:
	# Block still incomplete: write exactly the bytes that were supplied
7588c2ecf20Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r13
7598c2ecf20Sopenharmony_ci_count_set_\@:
7608c2ecf20Sopenharmony_ci	movdqa	%xmm9, %xmm0
7618c2ecf20Sopenharmony_ci	movq	%xmm0, %rax
7628c2ecf20Sopenharmony_ci	cmp	$8, %r13
7638c2ecf20Sopenharmony_ci	jle	_less_than_8_bytes_left_\@
7648c2ecf20Sopenharmony_ci
	# More than 8 bytes: store the low qword at once, continue with the high
7658c2ecf20Sopenharmony_ci	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
7668c2ecf20Sopenharmony_ci	add	$8, \DATA_OFFSET
7678c2ecf20Sopenharmony_ci	psrldq	$8, %xmm0
7688c2ecf20Sopenharmony_ci	movq	%xmm0, %rax
7698c2ecf20Sopenharmony_ci	sub	$8, %r13
	# Store the remaining r13 (1..8) bytes one at a time, lowest byte first
7708c2ecf20Sopenharmony_ci_less_than_8_bytes_left_\@:
7718c2ecf20Sopenharmony_ci	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
7728c2ecf20Sopenharmony_ci	add	$1, \DATA_OFFSET
7738c2ecf20Sopenharmony_ci	shr	$8, %rax
7748c2ecf20Sopenharmony_ci	sub	$1, %r13
7758c2ecf20Sopenharmony_ci	jne	_less_than_8_bytes_left_\@
7768c2ecf20Sopenharmony_ci_partial_block_done_\@:
7778c2ecf20Sopenharmony_ci.endm # PARTIAL_BLOCK
7788c2ecf20Sopenharmony_ci
7798c2ecf20Sopenharmony_ci/*
7808c2ecf20Sopenharmony_ci* if a = number of total plaintext bytes
7818c2ecf20Sopenharmony_ci* b = floor(a/16)
7828c2ecf20Sopenharmony_ci* num_initial_blocks = b mod 4
7838c2ecf20Sopenharmony_ci* encrypt the initial num_initial_blocks blocks and apply ghash on
7848c2ecf20Sopenharmony_ci* the ciphertext
7858c2ecf20Sopenharmony_ci* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
7868c2ecf20Sopenharmony_ci* are clobbered
7878c2ecf20Sopenharmony_ci* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
*
* Register use visible in this macro:
*   %arg1 = AES round keys, %arg2 = context (AadHash, CurCount),
*   %arg3 = output buffer, %arg4 = input buffer, %r11 = data offset
*   (advanced by 16 per initial block and by 64 for the four-block group),
*   %r13 is compared against 64 (presumably the remaining byte count -
*   confirm at the call sites), %xmm14 is loaded with SHUF_MASK.
*
*   i     : number of the xmm register that receives the current hash;
*           5, 6 and 7 select 3, 2 and 1 GHASH steps respectively, any other
*           value skips the initial-block code
*   i_seq : digits naming the xmm registers used for the initial blocks
*           (presumably i+1 .. 8, e.g. "678" for i == 5 - confirm at callers)
*   operation : "dec" hashes the input (ciphertext) instead of the output
7888c2ecf20Sopenharmony_ci*/
7898c2ecf20Sopenharmony_ci
7908c2ecf20Sopenharmony_ci
7918c2ecf20Sopenharmony_ci.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
7928c2ecf20Sopenharmony_ci	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
7938c2ecf20Sopenharmony_ci	MOVADQ		SHUF_MASK(%rip), %xmm14
7948c2ecf20Sopenharmony_ci
7958c2ecf20Sopenharmony_ci	movdqu AadHash(%arg2), %xmm\i		    # xmm<i> = current hash value
7968c2ecf20Sopenharmony_ci
7978c2ecf20Sopenharmony_ci	# start AES for num_initial_blocks blocks
7988c2ecf20Sopenharmony_ci
7998c2ecf20Sopenharmony_ci	movdqu CurCount(%arg2), \XMM0                # XMM0 = current counter block
8008c2ecf20Sopenharmony_ci
8018c2ecf20Sopenharmony_ci.if (\i == 5) || (\i == 6) || (\i == 7)
8028c2ecf20Sopenharmony_ci
8038c2ecf20Sopenharmony_ci	MOVADQ		ONE(%RIP),\TMP1
8048c2ecf20Sopenharmony_ci	MOVADQ		0(%arg1),\TMP2
	# One counter block per digit of i_seq: increment, byte swap and apply
	# round key 0
8058c2ecf20Sopenharmony_ci.irpc index, \i_seq
8068c2ecf20Sopenharmony_ci	paddd		\TMP1, \XMM0                 # INCR Y0
8078c2ecf20Sopenharmony_ci.ifc \operation, dec
8088c2ecf20Sopenharmony_ci        movdqa     \XMM0, %xmm\index
8098c2ecf20Sopenharmony_ci.else
8108c2ecf20Sopenharmony_ci	MOVADQ		\XMM0, %xmm\index
8118c2ecf20Sopenharmony_ci.endif
8128c2ecf20Sopenharmony_ci	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
8138c2ecf20Sopenharmony_ci	pxor		\TMP2, %xmm\index
8148c2ecf20Sopenharmony_ci.endr
8158c2ecf20Sopenharmony_ci	lea	0x10(%arg1),%r10
8168c2ecf20Sopenharmony_ci	mov	keysize,%eax
8178c2ecf20Sopenharmony_ci	shr	$2,%eax				# 128->4, 192->6, 256->8
8188c2ecf20Sopenharmony_ci	add	$5,%eax			      # 128->9, 192->11, 256->13
8198c2ecf20Sopenharmony_ci
	# %eax full rounds, walking the round keys with %r10
8208c2ecf20Sopenharmony_ciaes_loop_initial_\@:
8218c2ecf20Sopenharmony_ci	MOVADQ	(%r10),\TMP1
8228c2ecf20Sopenharmony_ci.irpc	index, \i_seq
8238c2ecf20Sopenharmony_ci	aesenc	\TMP1, %xmm\index
8248c2ecf20Sopenharmony_ci.endr
8258c2ecf20Sopenharmony_ci	add	$16,%r10
8268c2ecf20Sopenharmony_ci	sub	$1,%eax
8278c2ecf20Sopenharmony_ci	jnz	aes_loop_initial_\@
8288c2ecf20Sopenharmony_ci
8298c2ecf20Sopenharmony_ci	MOVADQ	(%r10), \TMP1
8308c2ecf20Sopenharmony_ci.irpc index, \i_seq
8318c2ecf20Sopenharmony_ci	aesenclast \TMP1, %xmm\index         # Last Round
8328c2ecf20Sopenharmony_ci.endr
	# XOR the keystream with the input, store the result and leave the
	# byte-swapped ciphertext in xmm<index> for the GHASH steps below
8338c2ecf20Sopenharmony_ci.irpc index, \i_seq
8348c2ecf20Sopenharmony_ci	movdqu	   (%arg4 , %r11, 1), \TMP1
8358c2ecf20Sopenharmony_ci	pxor	   \TMP1, %xmm\index
8368c2ecf20Sopenharmony_ci	movdqu	   %xmm\index, (%arg3 , %r11, 1)
8378c2ecf20Sopenharmony_ci	# write back plaintext/ciphertext for num_initial_blocks
8388c2ecf20Sopenharmony_ci	add	   $16, %r11
8398c2ecf20Sopenharmony_ci
8408c2ecf20Sopenharmony_ci.ifc \operation, dec
	# when decrypting, the input is the ciphertext, so hash the input
8418c2ecf20Sopenharmony_ci	movdqa     \TMP1, %xmm\index
8428c2ecf20Sopenharmony_ci.endif
8438c2ecf20Sopenharmony_ci	pshufb	   %xmm14, %xmm\index
8448c2ecf20Sopenharmony_ci
8458c2ecf20Sopenharmony_ci		# prepare plaintext/ciphertext for GHASH computation
8468c2ecf20Sopenharmony_ci.endr
8478c2ecf20Sopenharmony_ci.endif
8488c2ecf20Sopenharmony_ci
8498c2ecf20Sopenharmony_ci        # apply GHASH on num_initial_blocks blocks
	# The hash is chained xmm<i> -> ... -> xmm8, so the result ends in %xmm8.
	# TMP3 is the hash key operand; it is not loaded in this macro, so the
	# caller must have set it (presumably to HashKey - confirm at callers).
	# XMM1 is used here as GHASH_MUL scratch.
8508c2ecf20Sopenharmony_ci
8518c2ecf20Sopenharmony_ci.if \i == 5
8528c2ecf20Sopenharmony_ci        pxor       %xmm5, %xmm6
8538c2ecf20Sopenharmony_ci	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
8548c2ecf20Sopenharmony_ci        pxor       %xmm6, %xmm7
8558c2ecf20Sopenharmony_ci	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
8568c2ecf20Sopenharmony_ci        pxor       %xmm7, %xmm8
8578c2ecf20Sopenharmony_ci	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
8588c2ecf20Sopenharmony_ci.elseif \i == 6
8598c2ecf20Sopenharmony_ci        pxor       %xmm6, %xmm7
8608c2ecf20Sopenharmony_ci	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
8618c2ecf20Sopenharmony_ci        pxor       %xmm7, %xmm8
8628c2ecf20Sopenharmony_ci	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
8638c2ecf20Sopenharmony_ci.elseif \i == 7
8648c2ecf20Sopenharmony_ci        pxor       %xmm7, %xmm8
8658c2ecf20Sopenharmony_ci	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
8668c2ecf20Sopenharmony_ci.endif
8678c2ecf20Sopenharmony_ci	cmp	   $64, %r13
8688c2ecf20Sopenharmony_ci	jl	_initial_blocks_done\@
8698c2ecf20Sopenharmony_ci	# fewer than 64 (per %r13): skip the four-block group below
8708c2ecf20Sopenharmony_ci/*
8718c2ecf20Sopenharmony_ci*
8728c2ecf20Sopenharmony_ci* Encrypt (or decrypt) the first group of 4 blocks.
8738c2ecf20Sopenharmony_ci* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
* NOTE(review): the original wording spoke of HashKey precomputations done in
* parallel with this encryption; no such precomputation is visible in this
* macro, so that part of the comment looked stale.
8748c2ecf20Sopenharmony_ci*/
8758c2ecf20Sopenharmony_ci	MOVADQ	   ONE(%RIP),\TMP1
8768c2ecf20Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
8778c2ecf20Sopenharmony_ci	MOVADQ	   \XMM0, \XMM1
8788c2ecf20Sopenharmony_ci	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
8798c2ecf20Sopenharmony_ci
8808c2ecf20Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
8818c2ecf20Sopenharmony_ci	MOVADQ	   \XMM0, \XMM2
8828c2ecf20Sopenharmony_ci	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
8838c2ecf20Sopenharmony_ci
8848c2ecf20Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
8858c2ecf20Sopenharmony_ci	MOVADQ	   \XMM0, \XMM3
8868c2ecf20Sopenharmony_ci	pshufb %xmm14, \XMM3        # perform a 16 byte swap
8878c2ecf20Sopenharmony_ci
8888c2ecf20Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
8898c2ecf20Sopenharmony_ci	MOVADQ	   \XMM0, \XMM4
8908c2ecf20Sopenharmony_ci	pshufb %xmm14, \XMM4        # perform a 16 byte swap
8918c2ecf20Sopenharmony_ci
8928c2ecf20Sopenharmony_ci	MOVADQ	   0(%arg1),\TMP1	# round key 0
8938c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM1
8948c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM2
8958c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM3
8968c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM4
8978c2ecf20Sopenharmony_ci.irpc index, 1234 # do 4 rounds
8988c2ecf20Sopenharmony_ci	movaps 0x10*\index(%arg1), \TMP1
8998c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM1
9008c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM2
9018c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM3
9028c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM4
9038c2ecf20Sopenharmony_ci.endr
9048c2ecf20Sopenharmony_ci.irpc index, 56789 # do next 5 rounds
9058c2ecf20Sopenharmony_ci	movaps 0x10*\index(%arg1), \TMP1
9068c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM1
9078c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM2
9088c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM3
9098c2ecf20Sopenharmony_ci	aesenc	   \TMP1, \XMM4
9108c2ecf20Sopenharmony_ci.endr
	# Rounds 1-9 are done; run the extra rounds needed for longer keys
9118c2ecf20Sopenharmony_ci	lea	   0xa0(%arg1),%r10
9128c2ecf20Sopenharmony_ci	mov	   keysize,%eax
9138c2ecf20Sopenharmony_ci	shr	   $2,%eax			# 128->4, 192->6, 256->8
9148c2ecf20Sopenharmony_ci	sub	   $4,%eax			# 128->0, 192->2, 256->4
9158c2ecf20Sopenharmony_ci	jz	   aes_loop_pre_done\@
9168c2ecf20Sopenharmony_ci
9178c2ecf20Sopenharmony_ciaes_loop_pre_\@:
9188c2ecf20Sopenharmony_ci	MOVADQ	   (%r10),\TMP2
	# NOTE(review): operates on the literal %xmm1-%xmm4, which is only
	# correct if XMM1-XMM4 are bound to exactly those registers
9198c2ecf20Sopenharmony_ci.irpc	index, 1234
9208c2ecf20Sopenharmony_ci	aesenc	   \TMP2, %xmm\index
9218c2ecf20Sopenharmony_ci.endr
9228c2ecf20Sopenharmony_ci	add	   $16,%r10
9238c2ecf20Sopenharmony_ci	sub	   $1,%eax
9248c2ecf20Sopenharmony_ci	jnz	   aes_loop_pre_\@
9258c2ecf20Sopenharmony_ci
9268c2ecf20Sopenharmony_ciaes_loop_pre_done\@:
9278c2ecf20Sopenharmony_ci	MOVADQ	   (%r10), \TMP2
9288c2ecf20Sopenharmony_ci	aesenclast \TMP2, \XMM1
9298c2ecf20Sopenharmony_ci	aesenclast \TMP2, \XMM2
9308c2ecf20Sopenharmony_ci	aesenclast \TMP2, \XMM3
9318c2ecf20Sopenharmony_ci	aesenclast \TMP2, \XMM4
	# XOR keystream with input.  For "dec" the plaintext is stored right
	# away and the register is reloaded with the ciphertext (the hash input);
	# for encryption the ciphertext stays in the register and is stored below.
9328c2ecf20Sopenharmony_ci	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
9338c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM1
9348c2ecf20Sopenharmony_ci.ifc \operation, dec
9358c2ecf20Sopenharmony_ci	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
9368c2ecf20Sopenharmony_ci	movdqa     \TMP1, \XMM1
9378c2ecf20Sopenharmony_ci.endif
9388c2ecf20Sopenharmony_ci	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
9398c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM2
9408c2ecf20Sopenharmony_ci.ifc \operation, dec
9418c2ecf20Sopenharmony_ci	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
9428c2ecf20Sopenharmony_ci	movdqa     \TMP1, \XMM2
9438c2ecf20Sopenharmony_ci.endif
9448c2ecf20Sopenharmony_ci	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
9458c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM3
9468c2ecf20Sopenharmony_ci.ifc \operation, dec
9478c2ecf20Sopenharmony_ci	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
9488c2ecf20Sopenharmony_ci	movdqa     \TMP1, \XMM3
9498c2ecf20Sopenharmony_ci.endif
9508c2ecf20Sopenharmony_ci	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
9518c2ecf20Sopenharmony_ci	pxor	   \TMP1, \XMM4
9528c2ecf20Sopenharmony_ci.ifc \operation, dec
9538c2ecf20Sopenharmony_ci	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
9548c2ecf20Sopenharmony_ci	movdqa     \TMP1, \XMM4
9558c2ecf20Sopenharmony_ci.else
9568c2ecf20Sopenharmony_ci	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
9578c2ecf20Sopenharmony_ci	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
9588c2ecf20Sopenharmony_ci	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
9598c2ecf20Sopenharmony_ci	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
9608c2ecf20Sopenharmony_ci.endif
9618c2ecf20Sopenharmony_ci
9628c2ecf20Sopenharmony_ci	add	   $64, %r11
9638c2ecf20Sopenharmony_ci	pshufb %xmm14, \XMM1 # perform a 16 byte swap
	# XMMDst is presumably the register holding the hash so far (%xmm8 after
	# the GHASH chain above) - confirm at the call sites
9648c2ecf20Sopenharmony_ci	pxor	   \XMMDst, \XMM1
9658c2ecf20Sopenharmony_ci# combine GHASHed value with the corresponding ciphertext
9668c2ecf20Sopenharmony_ci	pshufb %xmm14, \XMM2 # perform a 16 byte swap
9678c2ecf20Sopenharmony_ci	pshufb %xmm14, \XMM3 # perform a 16 byte swap
9688c2ecf20Sopenharmony_ci	pshufb %xmm14, \XMM4 # perform a 16 byte swap
9698c2ecf20Sopenharmony_ci
9708c2ecf20Sopenharmony_ci_initial_blocks_done\@:
9718c2ecf20Sopenharmony_ci
9728c2ecf20Sopenharmony_ci.endm
9738c2ecf20Sopenharmony_ci
9748c2ecf20Sopenharmony_ci/*
9758c2ecf20Sopenharmony_ci* encrypt 4 blocks at a time
9768c2ecf20Sopenharmony_ci* ghash the 4 previously encrypted ciphertext blocks
9778c2ecf20Sopenharmony_ci* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
9788c2ecf20Sopenharmony_ci* %r11 is the data offset value
*
* On entry XMM1-XMM4 hold the four previous (byte-swapped) ciphertext blocks
* and XMM0 the counter.  They are copied to XMM5-XMM8 and multiplied by
* HashKey_4, HashKey_3, HashKey_2 and HashKey (Karatsuba, using the matching
* HashKey_n_k values) while XMM1-XMM4 are reloaded with the next four counter
* blocks and run through AES.  On exit XMM1-XMM4 hold the new byte-swapped
* ciphertext blocks, with the reduced GHASH value folded into XMM1.
*
* Clobbers %rax, %r10, %xmm15, TMP1-TMP6 and XMM5-XMM8.  %r11 is not advanced
* here.  The "operation" parameter is not referenced in this macro body.
* The round keys at %arg1 are accessed with movaps and as pxor memory
* operands, so %arg1 must be 16-byte aligned.
9798c2ecf20Sopenharmony_ci*/
9808c2ecf20Sopenharmony_ci.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
9818c2ecf20Sopenharmony_ciTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
9828c2ecf20Sopenharmony_ci
9838c2ecf20Sopenharmony_ci	movdqa	  \XMM1, \XMM5
9848c2ecf20Sopenharmony_ci	movdqa	  \XMM2, \XMM6
9858c2ecf20Sopenharmony_ci	movdqa	  \XMM3, \XMM7
9868c2ecf20Sopenharmony_ci	movdqa	  \XMM4, \XMM8
9878c2ecf20Sopenharmony_ci
9888c2ecf20Sopenharmony_ci        movdqa    SHUF_MASK(%rip), %xmm15
9898c2ecf20Sopenharmony_ci        # multiply XMM5 * HashKey_4 using karatsuba
9908c2ecf20Sopenharmony_ci
9918c2ecf20Sopenharmony_ci	movdqa	  \XMM5, \TMP4
9928c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM5, \TMP6		# swap the two qwords
9938c2ecf20Sopenharmony_ci	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0
9948c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
9958c2ecf20Sopenharmony_ci	movdqu	  HashKey_4(%arg2), \TMP5
9968c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
9978c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM1
9988c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
9998c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM2
10008c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
10018c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM3
10028c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
10038c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM4
10048c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM1	# perform a 16 byte swap
10058c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
10068c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
10078c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
10088c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
10098c2ecf20Sopenharmony_ci
	# Round key 0 (whitening) for the four new counter blocks
10108c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM1
10118c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM2
10128c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM3
10138c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM4
10148c2ecf20Sopenharmony_ci	movdqu	  HashKey_4_k(%arg2), \TMP5
10158c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
10168c2ecf20Sopenharmony_ci	movaps 0x10(%arg1), \TMP1
10178c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 1
10188c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM2
10198c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM3
10208c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM4
10218c2ecf20Sopenharmony_ci	movaps 0x20(%arg1), \TMP1
10228c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 2
10238c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM2
10248c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM3
10258c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM4
	# multiply XMM6 * HashKey_3 using karatsuba
10268c2ecf20Sopenharmony_ci	movdqa	  \XMM6, \TMP1
10278c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM6, \TMP2
10288c2ecf20Sopenharmony_ci	pxor	  \XMM6, \TMP2
10298c2ecf20Sopenharmony_ci	movdqu	  HashKey_3(%arg2), \TMP5
10308c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
10318c2ecf20Sopenharmony_ci	movaps 0x30(%arg1), \TMP3
10328c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM1              # Round 3
10338c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM2
10348c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM3
10358c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM4
10368c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
10378c2ecf20Sopenharmony_ci	movaps 0x40(%arg1), \TMP3
10388c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 4
10398c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
10408c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
10418c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
10428c2ecf20Sopenharmony_ci	movdqu	  HashKey_3_k(%arg2), \TMP5
10438c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
10448c2ecf20Sopenharmony_ci	movaps 0x50(%arg1), \TMP3
10458c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 5
10468c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
10478c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
10488c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
10498c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP4
10508c2ecf20Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
10518c2ecf20Sopenharmony_ci	pxor	  \XMM6, \XMM5
10528c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP6
10538c2ecf20Sopenharmony_ci	movdqa	  \XMM7, \TMP1
10548c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM7, \TMP2
10558c2ecf20Sopenharmony_ci	pxor	  \XMM7, \TMP2
10568c2ecf20Sopenharmony_ci	movdqu	  HashKey_2(%arg2), \TMP5
10578c2ecf20Sopenharmony_ci
10588c2ecf20Sopenharmony_ci        # Multiply XMM7 * HashKey_2 using karatsuba
10598c2ecf20Sopenharmony_ci
10608c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
10618c2ecf20Sopenharmony_ci	movaps 0x60(%arg1), \TMP3
10628c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 6
10638c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
10648c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
10658c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
10668c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
10678c2ecf20Sopenharmony_ci	movaps 0x70(%arg1), \TMP3
10688c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 7
10698c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
10708c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
10718c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
10728c2ecf20Sopenharmony_ci	movdqu	  HashKey_2_k(%arg2), \TMP5
10738c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
10748c2ecf20Sopenharmony_ci	movaps 0x80(%arg1), \TMP3
10758c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 8
10768c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
10778c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
10788c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
10798c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP4
10808c2ecf20Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
10818c2ecf20Sopenharmony_ci	pxor	  \XMM7, \XMM5
10828c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP6
10838c2ecf20Sopenharmony_ci
10848c2ecf20Sopenharmony_ci        # Multiply XMM8 * HashKey
10858c2ecf20Sopenharmony_ci        # XMM8 and TMP5 hold the values for the two operands
10868c2ecf20Sopenharmony_ci
10878c2ecf20Sopenharmony_ci	movdqa	  \XMM8, \TMP1
10888c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM8, \TMP2
10898c2ecf20Sopenharmony_ci	pxor	  \XMM8, \TMP2
10908c2ecf20Sopenharmony_ci	movdqu	  HashKey(%arg2), \TMP5
10918c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
10928c2ecf20Sopenharmony_ci	movaps 0x90(%arg1), \TMP3
10938c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1             # Round 9
10948c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
10958c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
10968c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
10978c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
	# Rounds 1-9 are done; run the extra rounds needed for longer keys
10988c2ecf20Sopenharmony_ci	lea	  0xa0(%arg1),%r10
10998c2ecf20Sopenharmony_ci	mov	  keysize,%eax
11008c2ecf20Sopenharmony_ci	shr	  $2,%eax			# 128->4, 192->6, 256->8
11018c2ecf20Sopenharmony_ci	sub	  $4,%eax			# 128->0, 192->2, 256->4
11028c2ecf20Sopenharmony_ci	jz	  aes_loop_par_enc_done\@
11038c2ecf20Sopenharmony_ci
11048c2ecf20Sopenharmony_ciaes_loop_par_enc\@:
11058c2ecf20Sopenharmony_ci	MOVADQ	  (%r10),\TMP3
	# NOTE(review): operates on the literal %xmm1-%xmm4, which is only
	# correct if XMM1-XMM4 are bound to exactly those registers
11068c2ecf20Sopenharmony_ci.irpc	index, 1234
11078c2ecf20Sopenharmony_ci	aesenc	  \TMP3, %xmm\index
11088c2ecf20Sopenharmony_ci.endr
11098c2ecf20Sopenharmony_ci	add	  $16,%r10
11108c2ecf20Sopenharmony_ci	sub	  $1,%eax
11118c2ecf20Sopenharmony_ci	jnz	  aes_loop_par_enc\@
11128c2ecf20Sopenharmony_ci
11138c2ecf20Sopenharmony_ciaes_loop_par_enc_done\@:
11148c2ecf20Sopenharmony_ci	MOVADQ	  (%r10), \TMP3
11158c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM1           # Last round (10, 12 or 14)
11168c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM2
11178c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM3
11188c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM4
11198c2ecf20Sopenharmony_ci	movdqu    HashKey_k(%arg2), \TMP5
11208c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
11218c2ecf20Sopenharmony_ci	movdqu	  (%arg4,%r11,1), \TMP3
11228c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
11238c2ecf20Sopenharmony_ci	movdqu	  16(%arg4,%r11,1), \TMP3
11248c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
11258c2ecf20Sopenharmony_ci	movdqu	  32(%arg4,%r11,1), \TMP3
11268c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
11278c2ecf20Sopenharmony_ci	movdqu	  48(%arg4,%r11,1), \TMP3
11288c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
11298c2ecf20Sopenharmony_ci        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
11308c2ecf20Sopenharmony_ci        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
11318c2ecf20Sopenharmony_ci        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
11328c2ecf20Sopenharmony_ci        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
11338c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM1        # perform a 16 byte swap
11348c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
11358c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
11368c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
11378c2ecf20Sopenharmony_ci
	# Combine the four products: TMP1 = sum of high parts, XMM5 = sum of
	# low parts, TMP2 = middle term (sum of (a1+a0)*(b1+b0) ^ high ^ low)
11388c2ecf20Sopenharmony_ci	pxor	  \TMP4, \TMP1
11398c2ecf20Sopenharmony_ci	pxor	  \XMM8, \XMM5
11408c2ecf20Sopenharmony_ci	pxor	  \TMP6, \TMP2
11418c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP2
11428c2ecf20Sopenharmony_ci	pxor	  \XMM5, \TMP2
11438c2ecf20Sopenharmony_ci	movdqa	  \TMP2, \TMP3
11448c2ecf20Sopenharmony_ci	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
11458c2ecf20Sopenharmony_ci	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
11468c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM5
11478c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
11488c2ecf20Sopenharmony_ci
11498c2ecf20Sopenharmony_ci        # first phase of reduction
11508c2ecf20Sopenharmony_ci
11518c2ecf20Sopenharmony_ci	movdqa    \XMM5, \TMP2
11528c2ecf20Sopenharmony_ci	movdqa    \XMM5, \TMP3
11538c2ecf20Sopenharmony_ci	movdqa    \XMM5, \TMP4
11548c2ecf20Sopenharmony_ci# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
11558c2ecf20Sopenharmony_ci	pslld     $31, \TMP2                   # packed left shift << 31
11568c2ecf20Sopenharmony_ci	pslld     $30, \TMP3                   # packed left shift << 30
11578c2ecf20Sopenharmony_ci	pslld     $25, \TMP4                   # packed left shift << 25
11588c2ecf20Sopenharmony_ci	pxor      \TMP3, \TMP2	               # xor the shifted versions
11598c2ecf20Sopenharmony_ci	pxor      \TMP4, \TMP2
11608c2ecf20Sopenharmony_ci	movdqa    \TMP2, \TMP5
11618c2ecf20Sopenharmony_ci	psrldq    $4, \TMP5                    # right shift T5 1 DW
11628c2ecf20Sopenharmony_ci	pslldq    $12, \TMP2                   # left shift T2 3 DWs
11638c2ecf20Sopenharmony_ci	pxor      \TMP2, \XMM5
11648c2ecf20Sopenharmony_ci
11658c2ecf20Sopenharmony_ci        # second phase of reduction
11668c2ecf20Sopenharmony_ci
11678c2ecf20Sopenharmony_ci	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
11688c2ecf20Sopenharmony_ci	movdqa    \XMM5,\TMP3
11698c2ecf20Sopenharmony_ci	movdqa    \XMM5,\TMP4
11708c2ecf20Sopenharmony_ci	psrld     $1, \TMP2                    # packed right shift >>1
11718c2ecf20Sopenharmony_ci	psrld     $2, \TMP3                    # packed right shift >>2
11728c2ecf20Sopenharmony_ci	psrld     $7, \TMP4                    # packed right shift >>7
11738c2ecf20Sopenharmony_ci	pxor      \TMP3,\TMP2		       # xor the shifted versions
11748c2ecf20Sopenharmony_ci	pxor      \TMP4,\TMP2
11758c2ecf20Sopenharmony_ci	pxor      \TMP5, \TMP2
11768c2ecf20Sopenharmony_ci	pxor      \TMP2, \XMM5
11778c2ecf20Sopenharmony_ci	pxor      \TMP1, \XMM5                 # result is in XMM5
11788c2ecf20Sopenharmony_ci
	# fold the reduced hash into the first new ciphertext block so the next
	# GHASH step continues the chain
11798c2ecf20Sopenharmony_ci	pxor	  \XMM5, \XMM1
11808c2ecf20Sopenharmony_ci.endm
11818c2ecf20Sopenharmony_ci
11828c2ecf20Sopenharmony_ci/*
11838c2ecf20Sopenharmony_ci* decrypt 4 blocks at a time
11848c2ecf20Sopenharmony_ci* ghash the 4 previously decrypted ciphertext blocks
11858c2ecf20Sopenharmony_ci* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
11868c2ecf20Sopenharmony_ci* %r11 is the data offset value
*
* The AES rounds for the 4 new counter blocks are interleaved with the
* carry-less multiplies of the GHASH update.
*
* On entry:
*   XMM0        counter block; ONE is added to it four times
*   XMM1-XMM4   the 4 blocks to be folded into the hash
*   %arg1       round keys (round 0 at offset 0, 16 bytes per round)
*   %arg2       base for the HashKey, HashKey_2..HashKey_4 and _k loads
*   %arg4+%r11  64 bytes of input; %arg3+%r11 receives 64 bytes of output
* On exit:
*   XMM1-XMM4   the 4 input blocks just read, byte-swapped with SHUF_MASK;
*               XMM1 additionally has the reduced hash XORed into it
* Clobbers: TMP1-TMP6, XMM5-XMM8, %xmm15, %eax, %r10, flags
* The operation argument is not referenced in the macro body.
11878c2ecf20Sopenharmony_ci*/
11888c2ecf20Sopenharmony_ci.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
11898c2ecf20Sopenharmony_ciTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
11908c2ecf20Sopenharmony_ci
	# keep the 4 blocks to hash in XMM5-XMM8; XMM1-XMM4 are reused below
11918c2ecf20Sopenharmony_ci	movdqa	  \XMM1, \XMM5
11928c2ecf20Sopenharmony_ci	movdqa	  \XMM2, \XMM6
11938c2ecf20Sopenharmony_ci	movdqa	  \XMM3, \XMM7
11948c2ecf20Sopenharmony_ci	movdqa	  \XMM4, \XMM8
11958c2ecf20Sopenharmony_ci
11968c2ecf20Sopenharmony_ci        movdqa    SHUF_MASK(%rip), %xmm15      # byte-swap mask for the pshufb below
11978c2ecf20Sopenharmony_ci        # multiply XMM5 * HashKey_4 using karatsuba
11988c2ecf20Sopenharmony_ci
11998c2ecf20Sopenharmony_ci	movdqa	  \XMM5, \TMP4
12008c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM5, \TMP6		# swap the two qwords of XMM5
12018c2ecf20Sopenharmony_ci	pxor	  \XMM5, \TMP6			# TMP6 = a1+a0 in both qwords
12028c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
12038c2ecf20Sopenharmony_ci	movdqu	  HashKey_4(%arg2), \TMP5
12048c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
12058c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM1
12068c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
12078c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM2
12088c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
12098c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM3
12108c2ecf20Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
12118c2ecf20Sopenharmony_ci	movdqa    \XMM0, \XMM4
12128c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM1	# perform a 16 byte swap
12138c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
12148c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
12158c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
12168c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
12178c2ecf20Sopenharmony_ci
	# round 0: XOR the 4 counter blocks with the first round key
12188c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM1
12198c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM2
12208c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM3
12218c2ecf20Sopenharmony_ci	pxor	  (%arg1), \XMM4
12228c2ecf20Sopenharmony_ci	movdqu	  HashKey_4_k(%arg2), \TMP5
12238c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
12248c2ecf20Sopenharmony_ci	movaps 0x10(%arg1), \TMP1
12258c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 1
12268c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM2
12278c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM3
12288c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM4
12298c2ecf20Sopenharmony_ci	movaps 0x20(%arg1), \TMP1
12308c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 2
12318c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM2
12328c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM3
12338c2ecf20Sopenharmony_ci	aesenc	  \TMP1, \XMM4
	# multiply XMM6 * HashKey_3 using karatsuba
12348c2ecf20Sopenharmony_ci	movdqa	  \XMM6, \TMP1
12358c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM6, \TMP2
12368c2ecf20Sopenharmony_ci	pxor	  \XMM6, \TMP2
12378c2ecf20Sopenharmony_ci	movdqu	  HashKey_3(%arg2), \TMP5
12388c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
12398c2ecf20Sopenharmony_ci	movaps 0x30(%arg1), \TMP3
12408c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM1              # Round 3
12418c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM2
12428c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM3
12438c2ecf20Sopenharmony_ci	aesenc    \TMP3, \XMM4
12448c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
12458c2ecf20Sopenharmony_ci	movaps 0x40(%arg1), \TMP3
12468c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 4
12478c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
12488c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
12498c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
12508c2ecf20Sopenharmony_ci	movdqu	  HashKey_3_k(%arg2), \TMP5
12518c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
12528c2ecf20Sopenharmony_ci	movaps 0x50(%arg1), \TMP3
12538c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 5
12548c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
12558c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
12568c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
12578c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP4
12588c2ecf20Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
12598c2ecf20Sopenharmony_ci	pxor	  \XMM6, \XMM5
12608c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP6
12618c2ecf20Sopenharmony_ci	movdqa	  \XMM7, \TMP1
12628c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM7, \TMP2
12638c2ecf20Sopenharmony_ci	pxor	  \XMM7, \TMP2
12648c2ecf20Sopenharmony_ci	movdqu	  HashKey_2(%arg2), \TMP5
12658c2ecf20Sopenharmony_ci
12668c2ecf20Sopenharmony_ci        # Multiply XMM7 * HashKey_2 using karatsuba
12678c2ecf20Sopenharmony_ci
12688c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
12698c2ecf20Sopenharmony_ci	movaps 0x60(%arg1), \TMP3
12708c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 6
12718c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
12728c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
12738c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
12748c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
12758c2ecf20Sopenharmony_ci	movaps 0x70(%arg1), \TMP3
12768c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 7
12778c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
12788c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
12798c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
12808c2ecf20Sopenharmony_ci	movdqu	  HashKey_2_k(%arg2), \TMP5
12818c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
12828c2ecf20Sopenharmony_ci	movaps 0x80(%arg1), \TMP3
12838c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 8
12848c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
12858c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
12868c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
12878c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP4
12888c2ecf20Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
12898c2ecf20Sopenharmony_ci	pxor	  \XMM7, \XMM5
12908c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP6
12918c2ecf20Sopenharmony_ci
12928c2ecf20Sopenharmony_ci        # Multiply XMM8 * HashKey
12938c2ecf20Sopenharmony_ci        # XMM8 and TMP5 hold the values for the two operands
12948c2ecf20Sopenharmony_ci
12958c2ecf20Sopenharmony_ci	movdqa	  \XMM8, \TMP1
12968c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM8, \TMP2
12978c2ecf20Sopenharmony_ci	pxor	  \XMM8, \TMP2
12988c2ecf20Sopenharmony_ci	movdqu	  HashKey(%arg2), \TMP5
12998c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
13008c2ecf20Sopenharmony_ci	movaps 0x90(%arg1), \TMP3
13018c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM1             # Round 9
13028c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM2
13038c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM3
13048c2ecf20Sopenharmony_ci	aesenc	  \TMP3, \XMM4
13058c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
	# %eax = number of additional aesenc rounds beyond round 9;
	# zero for a 128-bit key, in which case the loop is skipped
13068c2ecf20Sopenharmony_ci	lea	  0xa0(%arg1),%r10		# %r10 = address of round key 10
13078c2ecf20Sopenharmony_ci	mov	  keysize,%eax
13088c2ecf20Sopenharmony_ci	shr	  $2,%eax		        # 128->4, 192->6, 256->8
13098c2ecf20Sopenharmony_ci	sub	  $4,%eax			# 128->0, 192->2, 256->4
13108c2ecf20Sopenharmony_ci	jz	  aes_loop_par_dec_done\@
13118c2ecf20Sopenharmony_ci
13128c2ecf20Sopenharmony_ciaes_loop_par_dec\@:
13138c2ecf20Sopenharmony_ci	MOVADQ	  (%r10),\TMP3
	# NOTE(review): the .irpc below names %xmm1-%xmm4 literally instead of
	# using the XMM1-XMM4 macro arguments, so it only matches the rest of
	# the macro when the caller passes exactly those registers
13148c2ecf20Sopenharmony_ci.irpc	index, 1234
13158c2ecf20Sopenharmony_ci	aesenc	  \TMP3, %xmm\index
13168c2ecf20Sopenharmony_ci.endr
13178c2ecf20Sopenharmony_ci	add	  $16,%r10
13188c2ecf20Sopenharmony_ci	sub	  $1,%eax
13198c2ecf20Sopenharmony_ci	jnz	  aes_loop_par_dec\@
13208c2ecf20Sopenharmony_ci
13218c2ecf20Sopenharmony_ciaes_loop_par_dec_done\@:
13228c2ecf20Sopenharmony_ci	MOVADQ	  (%r10), \TMP3			# (%r10) = final round key
13238c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM1           # last round
13248c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM2
13258c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM3
13268c2ecf20Sopenharmony_ci	aesenclast \TMP3, \XMM4
13278c2ecf20Sopenharmony_ci	movdqu    HashKey_k(%arg2), \TMP5
13288c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
13298c2ecf20Sopenharmony_ci	movdqu	  (%arg4,%r11,1), \TMP3
13308c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
13318c2ecf20Sopenharmony_ci	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
13328c2ecf20Sopenharmony_ci	movdqa    \TMP3, \XMM1                 # XMM1 = input block just read
13338c2ecf20Sopenharmony_ci	movdqu	  16(%arg4,%r11,1), \TMP3
13348c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
13358c2ecf20Sopenharmony_ci	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
13368c2ecf20Sopenharmony_ci	movdqa    \TMP3, \XMM2                 # XMM2 = input block just read
13378c2ecf20Sopenharmony_ci	movdqu	  32(%arg4,%r11,1), \TMP3
13388c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
13398c2ecf20Sopenharmony_ci	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
13408c2ecf20Sopenharmony_ci	movdqa    \TMP3, \XMM3                 # XMM3 = input block just read
13418c2ecf20Sopenharmony_ci	movdqu	  48(%arg4,%r11,1), \TMP3
13428c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
13438c2ecf20Sopenharmony_ci	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
13448c2ecf20Sopenharmony_ci	movdqa    \TMP3, \XMM4                 # XMM4 = input block just read
13458c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM1        # perform a 16 byte swap
13468c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
13478c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
13488c2ecf20Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
13498c2ecf20Sopenharmony_ci
13508c2ecf20Sopenharmony_ci	pxor	  \TMP4, \TMP1                 # TMP1 = sum of the a1*b1 products
13518c2ecf20Sopenharmony_ci	pxor	  \XMM8, \XMM5                 # XMM5 = sum of the a0*b0 products
13528c2ecf20Sopenharmony_ci	pxor	  \TMP6, \TMP2                 # TMP2 = sum of the (a1+a0)*(b1+b0) products
13538c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP2
13548c2ecf20Sopenharmony_ci	pxor	  \XMM5, \TMP2                 # TMP2 = karatsuba middle term
13558c2ecf20Sopenharmony_ci	movdqa	  \TMP2, \TMP3
13568c2ecf20Sopenharmony_ci	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
13578c2ecf20Sopenharmony_ci	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
13588c2ecf20Sopenharmony_ci	pxor	  \TMP3, \XMM5
13598c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
13608c2ecf20Sopenharmony_ci
13618c2ecf20Sopenharmony_ci        # first phase of reduction
13628c2ecf20Sopenharmony_ci
13638c2ecf20Sopenharmony_ci	movdqa    \XMM5, \TMP2
13648c2ecf20Sopenharmony_ci	movdqa    \XMM5, \TMP3
13658c2ecf20Sopenharmony_ci	movdqa    \XMM5, \TMP4
13668c2ecf20Sopenharmony_ci# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
13678c2ecf20Sopenharmony_ci	pslld     $31, \TMP2                   # packed left shift << 31
13688c2ecf20Sopenharmony_ci	pslld     $30, \TMP3                   # packed left shift << 30
13698c2ecf20Sopenharmony_ci	pslld     $25, \TMP4                   # packed left shift << 25
13708c2ecf20Sopenharmony_ci	pxor      \TMP3, \TMP2	               # xor the shifted versions
13718c2ecf20Sopenharmony_ci	pxor      \TMP4, \TMP2
13728c2ecf20Sopenharmony_ci	movdqa    \TMP2, \TMP5
13738c2ecf20Sopenharmony_ci	psrldq    $4, \TMP5                    # right shift T5 1 DW
13748c2ecf20Sopenharmony_ci	pslldq    $12, \TMP2                   # left shift T2 3 DWs
13758c2ecf20Sopenharmony_ci	pxor      \TMP2, \XMM5
13768c2ecf20Sopenharmony_ci
13778c2ecf20Sopenharmony_ci        # second phase of reduction
13788c2ecf20Sopenharmony_ci
13798c2ecf20Sopenharmony_ci	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
13808c2ecf20Sopenharmony_ci	movdqa    \XMM5,\TMP3
13818c2ecf20Sopenharmony_ci	movdqa    \XMM5,\TMP4
13828c2ecf20Sopenharmony_ci	psrld     $1, \TMP2                    # packed right shift >> 1
13838c2ecf20Sopenharmony_ci	psrld     $2, \TMP3                    # packed right shift >> 2
13848c2ecf20Sopenharmony_ci	psrld     $7, \TMP4                    # packed right shift >> 7
13858c2ecf20Sopenharmony_ci	pxor      \TMP3,\TMP2		       # xor the shifted versions
13868c2ecf20Sopenharmony_ci	pxor      \TMP4,\TMP2
13878c2ecf20Sopenharmony_ci	pxor      \TMP5, \TMP2
13888c2ecf20Sopenharmony_ci	pxor      \TMP2, \XMM5
13898c2ecf20Sopenharmony_ci	pxor      \TMP1, \XMM5                 # result is in XMM5
13908c2ecf20Sopenharmony_ci
13918c2ecf20Sopenharmony_ci	pxor	  \XMM5, \XMM1                 # XOR the reduced hash into XMM1
13928c2ecf20Sopenharmony_ci.endm
13938c2ecf20Sopenharmony_ci
13948c2ecf20Sopenharmony_ci/* GHASH the last 4 ciphertext blocks.
*
* Multiplies XMM1, XMM2, XMM3 and XMM4 by HashKey_4, HashKey_3, HashKey_2
* and HashKey respectively (all loaded relative to %arg2), sums the
* karatsuba partial products and reduces the sum held in TMP6:XMMDst.
* The reduced value is left in XMMDst.
*
* XMM1-XMM4 and TMP1-TMP7 are overwritten (TMP3 only as scratch during
* the reduction). No general purpose register is written.
*/
13958c2ecf20Sopenharmony_ci.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
13968c2ecf20Sopenharmony_ciTMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
13978c2ecf20Sopenharmony_ci
13988c2ecf20Sopenharmony_ci        # Multiply XMM1 * HashKey_4 (using Karatsuba)
13998c2ecf20Sopenharmony_ci
14008c2ecf20Sopenharmony_ci	movdqa	  \XMM1, \TMP6
14018c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM1, \TMP2         # swap the two qwords of XMM1
14028c2ecf20Sopenharmony_ci	pxor	  \XMM1, \TMP2              # TMP2 = a1+a0 in both qwords
14038c2ecf20Sopenharmony_ci	movdqu	  HashKey_4(%arg2), \TMP5
14048c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
14058c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
14068c2ecf20Sopenharmony_ci	movdqu	  HashKey_4_k(%arg2), \TMP4
14078c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
14088c2ecf20Sopenharmony_ci	movdqa	  \XMM1, \XMMDst
14098c2ecf20Sopenharmony_ci	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
14108c2ecf20Sopenharmony_ci
14118c2ecf20Sopenharmony_ci        # Multiply XMM2 * HashKey_3 (using Karatsuba)
14128c2ecf20Sopenharmony_ci
14138c2ecf20Sopenharmony_ci	movdqa	  \XMM2, \TMP1
14148c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM2, \TMP2
14158c2ecf20Sopenharmony_ci	pxor	  \XMM2, \TMP2
14168c2ecf20Sopenharmony_ci	movdqu	  HashKey_3(%arg2), \TMP5
14178c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
14188c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
14198c2ecf20Sopenharmony_ci	movdqu	  HashKey_3_k(%arg2), \TMP4
14208c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
14218c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP6
14228c2ecf20Sopenharmony_ci	pxor	  \XMM2, \XMMDst
14238c2ecf20Sopenharmony_ci	pxor	  \TMP2, \XMM1
14248c2ecf20Sopenharmony_ci# results accumulated in TMP6, XMMDst, XMM1
14258c2ecf20Sopenharmony_ci
14268c2ecf20Sopenharmony_ci        # Multiply XMM3 * HashKey_2 (using Karatsuba)
14278c2ecf20Sopenharmony_ci
14288c2ecf20Sopenharmony_ci	movdqa	  \XMM3, \TMP1
14298c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM3, \TMP2
14308c2ecf20Sopenharmony_ci	pxor	  \XMM3, \TMP2
14318c2ecf20Sopenharmony_ci	movdqu	  HashKey_2(%arg2), \TMP5
14328c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
14338c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
14348c2ecf20Sopenharmony_ci	movdqu	  HashKey_2_k(%arg2), \TMP4
14358c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
14368c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP6
14378c2ecf20Sopenharmony_ci	pxor	  \XMM3, \XMMDst
14388c2ecf20Sopenharmony_ci	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
14398c2ecf20Sopenharmony_ci
14408c2ecf20Sopenharmony_ci        # Multiply XMM4 * HashKey (using Karatsuba)
14418c2ecf20Sopenharmony_ci	movdqa	  \XMM4, \TMP1
14428c2ecf20Sopenharmony_ci	pshufd	  $78, \XMM4, \TMP2
14438c2ecf20Sopenharmony_ci	pxor	  \XMM4, \TMP2
14448c2ecf20Sopenharmony_ci	movdqu	  HashKey(%arg2), \TMP5
14458c2ecf20Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
14468c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
14478c2ecf20Sopenharmony_ci	movdqu	  HashKey_k(%arg2), \TMP4
14488c2ecf20Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
14498c2ecf20Sopenharmony_ci	pxor	  \TMP1, \TMP6              # TMP6 = sum of the a1*b1 products
14508c2ecf20Sopenharmony_ci	pxor	  \XMM4, \XMMDst            # XMMDst = sum of the a0*b0 products
14518c2ecf20Sopenharmony_ci	pxor	  \XMM1, \TMP2              # TMP2 = sum of the (a1+a0)*(b1+b0) products
14528c2ecf20Sopenharmony_ci	pxor	  \TMP6, \TMP2
14538c2ecf20Sopenharmony_ci	pxor	  \XMMDst, \TMP2
14548c2ecf20Sopenharmony_ci	# middle section of the temp results combined as in karatsuba algorithm
14558c2ecf20Sopenharmony_ci	movdqa	  \TMP2, \TMP4
14568c2ecf20Sopenharmony_ci	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
14578c2ecf20Sopenharmony_ci	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
14588c2ecf20Sopenharmony_ci	pxor	  \TMP4, \XMMDst
14598c2ecf20Sopenharmony_ci	pxor	  \TMP2, \TMP6
14608c2ecf20Sopenharmony_ci# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
14618c2ecf20Sopenharmony_ci	# first phase of the reduction
14628c2ecf20Sopenharmony_ci	movdqa    \XMMDst, \TMP2
14638c2ecf20Sopenharmony_ci	movdqa    \XMMDst, \TMP3
14648c2ecf20Sopenharmony_ci	movdqa    \XMMDst, \TMP4
14658c2ecf20Sopenharmony_ci# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
14668c2ecf20Sopenharmony_ci	pslld     $31, \TMP2                # packed left shift << 31
14678c2ecf20Sopenharmony_ci	pslld     $30, \TMP3                # packed left shift << 30
14688c2ecf20Sopenharmony_ci	pslld     $25, \TMP4                # packed left shift << 25
14698c2ecf20Sopenharmony_ci	pxor      \TMP3, \TMP2              # xor the shifted versions
14708c2ecf20Sopenharmony_ci	pxor      \TMP4, \TMP2
14718c2ecf20Sopenharmony_ci	movdqa    \TMP2, \TMP7
14728c2ecf20Sopenharmony_ci	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
14738c2ecf20Sopenharmony_ci	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
14748c2ecf20Sopenharmony_ci	pxor      \TMP2, \XMMDst
14758c2ecf20Sopenharmony_ci
14768c2ecf20Sopenharmony_ci        # second phase of the reduction
14778c2ecf20Sopenharmony_ci	movdqa    \XMMDst, \TMP2
14788c2ecf20Sopenharmony_ci	# make 3 copies of XMMDst for doing 3 shift operations
14798c2ecf20Sopenharmony_ci	movdqa    \XMMDst, \TMP3
14808c2ecf20Sopenharmony_ci	movdqa    \XMMDst, \TMP4
14818c2ecf20Sopenharmony_ci	psrld     $1, \TMP2                 # packed right shift >> 1
14828c2ecf20Sopenharmony_ci	psrld     $2, \TMP3                 # packed right shift >> 2
14838c2ecf20Sopenharmony_ci	psrld     $7, \TMP4                 # packed right shift >> 7
14848c2ecf20Sopenharmony_ci	pxor      \TMP3, \TMP2              # xor the shifted versions
14858c2ecf20Sopenharmony_ci	pxor      \TMP4, \TMP2
14868c2ecf20Sopenharmony_ci	pxor      \TMP7, \TMP2
14878c2ecf20Sopenharmony_ci	pxor      \TMP2, \XMMDst
14888c2ecf20Sopenharmony_ci	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
14898c2ecf20Sopenharmony_ci.endm
14908c2ecf20Sopenharmony_ci
14918c2ecf20Sopenharmony_ci
14928c2ecf20Sopenharmony_ci/* Encryption of a single block
14938c2ecf20Sopenharmony_ci* uses eax & r10
*
* XMM0: block to encrypt on entry, encrypted block on exit
* TMP1: scratch register that receives each round key in turn
* Round keys are read from %arg1 (round 0 at offset 0, 16 bytes apart);
* the number of aesenc rounds is derived from keysize, followed by one
* aesenclast. Flags are modified by the shr/add/sub instructions.
14948c2ecf20Sopenharmony_ci*/
14958c2ecf20Sopenharmony_ci
14968c2ecf20Sopenharmony_ci.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
14978c2ecf20Sopenharmony_ci
14988c2ecf20Sopenharmony_ci	pxor		(%arg1), \XMM0		# round 0: XOR with the first round key
14998c2ecf20Sopenharmony_ci	mov		keysize,%eax
15008c2ecf20Sopenharmony_ci	shr		$2,%eax			# 128->4, 192->6, 256->8
15018c2ecf20Sopenharmony_ci	add		$5,%eax			# 128->9, 192->11, 256->13
15028c2ecf20Sopenharmony_ci	lea		16(%arg1), %r10	  # get first expanded key address
15038c2ecf20Sopenharmony_ci
	# %eax = aesenc rounds left, %r10 = address of the next round key
15048c2ecf20Sopenharmony_ci_esb_loop_\@:
15058c2ecf20Sopenharmony_ci	MOVADQ		(%r10),\TMP1
15068c2ecf20Sopenharmony_ci	aesenc		\TMP1,\XMM0
15078c2ecf20Sopenharmony_ci	add		$16,%r10
15088c2ecf20Sopenharmony_ci	sub		$1,%eax
15098c2ecf20Sopenharmony_ci	jnz		_esb_loop_\@
15108c2ecf20Sopenharmony_ci
15118c2ecf20Sopenharmony_ci	MOVADQ		(%r10),\TMP1		# (%r10) = final round key
15128c2ecf20Sopenharmony_ci	aesenclast	\TMP1,\XMM0
15138c2ecf20Sopenharmony_ci.endm
15148c2ecf20Sopenharmony_ci/*****************************************************************************
15158c2ecf20Sopenharmony_ci* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
15168c2ecf20Sopenharmony_ci*                   struct gcm_context_data *data
15178c2ecf20Sopenharmony_ci*                                      // Context data
15188c2ecf20Sopenharmony_ci*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
15198c2ecf20Sopenharmony_ci*                   const u8 *in,      // Ciphertext input
15208c2ecf20Sopenharmony_ci*                   u64 plaintext_len, // Length of data in bytes for decryption.
15218c2ecf20Sopenharmony_ci*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
15228c2ecf20Sopenharmony_ci*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
15238c2ecf20Sopenharmony_ci*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
15248c2ecf20Sopenharmony_ci*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
15258c2ecf20Sopenharmony_ci*                   const u8 *aad,     // Additional Authentication Data (AAD)
15268c2ecf20Sopenharmony_ci*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
15278c2ecf20Sopenharmony_ci*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
15288c2ecf20Sopenharmony_ci*                                      // given authentication tag and only return the plaintext if they match.
15298c2ecf20Sopenharmony_ci*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
15308c2ecf20Sopenharmony_ci*                                      // (most likely), 12 or 8.
15318c2ecf20Sopenharmony_ci*
15328c2ecf20Sopenharmony_ci* Assumptions:
15338c2ecf20Sopenharmony_ci*
15348c2ecf20Sopenharmony_ci* keys:
15358c2ecf20Sopenharmony_ci*       keys are pre-expanded and aligned to 16 bytes. We use the first set of
15368c2ecf20Sopenharmony_ci*       round keys in void *aes_ctx: 11 for AES-128, 13 for AES-192, 15 for AES-256
15378c2ecf20Sopenharmony_ci*
15388c2ecf20Sopenharmony_ci* iv:
15398c2ecf20Sopenharmony_ci*       0                   1                   2                   3
15408c2ecf20Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
15418c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15428c2ecf20Sopenharmony_ci*       |                             Salt  (From the SA)               |
15438c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15448c2ecf20Sopenharmony_ci*       |                     Initialization Vector                     |
15458c2ecf20Sopenharmony_ci*       |         (This is the sequence number from IPSec header)       |
15468c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15478c2ecf20Sopenharmony_ci*       |                              0x1                              |
15488c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15498c2ecf20Sopenharmony_ci*
15508c2ecf20Sopenharmony_ci*
15518c2ecf20Sopenharmony_ci*
15528c2ecf20Sopenharmony_ci* AAD:
15538c2ecf20Sopenharmony_ci*       AAD padded to 128 bits with 0
15548c2ecf20Sopenharmony_ci*       for example, assume AAD is a u32 vector
15558c2ecf20Sopenharmony_ci*
15568c2ecf20Sopenharmony_ci*       if AAD is 8 bytes:
15578c2ecf20Sopenharmony_ci*       AAD[3] = {A0, A1};
15588c2ecf20Sopenharmony_ci*       padded AAD in xmm register = {A1 A0 0 0}
15598c2ecf20Sopenharmony_ci*
15608c2ecf20Sopenharmony_ci*       0                   1                   2                   3
15618c2ecf20Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
15628c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15638c2ecf20Sopenharmony_ci*       |                               SPI (A1)                        |
15648c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15658c2ecf20Sopenharmony_ci*       |                     32-bit Sequence Number (A0)               |
15668c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15678c2ecf20Sopenharmony_ci*       |                              0x0                              |
15688c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15698c2ecf20Sopenharmony_ci*
15708c2ecf20Sopenharmony_ci*                                       AAD Format with 32-bit Sequence Number
15718c2ecf20Sopenharmony_ci*
15728c2ecf20Sopenharmony_ci*       if AAD is 12 bytes:
15738c2ecf20Sopenharmony_ci*       AAD[3] = {A0, A1, A2};
15748c2ecf20Sopenharmony_ci*       padded AAD in xmm register = {A2 A1 A0 0}
15758c2ecf20Sopenharmony_ci*
15768c2ecf20Sopenharmony_ci*       0                   1                   2                   3
15778c2ecf20Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
15788c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15818c2ecf20Sopenharmony_ci*       |                               SPI (A2)                        |
15828c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15838c2ecf20Sopenharmony_ci*       |                 64-bit Extended Sequence Number {A1,A0}       |
15848c2ecf20Sopenharmony_ci*       |                                                               |
15858c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15868c2ecf20Sopenharmony_ci*       |                              0x0                              |
15878c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
15888c2ecf20Sopenharmony_ci*
15898c2ecf20Sopenharmony_ci*                        AAD Format with 64-bit Extended Sequence Number
15908c2ecf20Sopenharmony_ci*
15918c2ecf20Sopenharmony_ci* poly = x^128 + x^127 + x^126 + x^121 + 1
15928c2ecf20Sopenharmony_ci*
15938c2ecf20Sopenharmony_ci*****************************************************************************/
/*
 * Entry point for GCM decryption. The body is a fixed sequence of macro
 * invocations; FUNC_SAVE, GCM_INIT, GCM_ENC_DEC, GCM_COMPLETE and
 * FUNC_RESTORE are defined outside this section, so their exact effects
 * are not visible here.
 */
15948c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec)
15958c2ecf20Sopenharmony_ci	FUNC_SAVE
15968c2ecf20Sopenharmony_ci
	/*
	 * Going by parameter order in the prototype comment above:
	 * arg6 = iv, arg7 = hash_subkey, arg8 = aad, arg9 = aad_len.
	 * NOTE(review): only the first operand carries a % prefix - confirm
	 * against the GCM_INIT definition how the others are expanded.
	 */
15978c2ecf20Sopenharmony_ci	GCM_INIT %arg6, arg7, arg8, arg9
	/* dec selects the decrypt variant (presumably the *_dec macros) */
15988c2ecf20Sopenharmony_ci	GCM_ENC_DEC dec
	/* by the same ordering: arg10 = auth_tag, arg11 = auth_tag_len */
15998c2ecf20Sopenharmony_ci	GCM_COMPLETE arg10, arg11
16008c2ecf20Sopenharmony_ci	FUNC_RESTORE
16018c2ecf20Sopenharmony_ci	RET
16028c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec)
16038c2ecf20Sopenharmony_ci
16048c2ecf20Sopenharmony_ci
16058c2ecf20Sopenharmony_ci/*****************************************************************************
16068c2ecf20Sopenharmony_ci* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
16078c2ecf20Sopenharmony_ci*                    struct gcm_context_data *data
16088c2ecf20Sopenharmony_ci*                                        // Context data
16098c2ecf20Sopenharmony_ci*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
16108c2ecf20Sopenharmony_ci*                    const u8 *in,       // Plaintext input
16118c2ecf20Sopenharmony_ci*                    u64 plaintext_len,  // Length of data in bytes for encryption.
16128c2ecf20Sopenharmony_ci*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
16138c2ecf20Sopenharmony_ci*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
16148c2ecf20Sopenharmony_ci*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
16158c2ecf20Sopenharmony_ci*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
16168c2ecf20Sopenharmony_ci*                    const u8 *aad,      // Additional Authentication Data (AAD)
16178c2ecf20Sopenharmony_ci*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
16188c2ecf20Sopenharmony_ci*                    u8 *auth_tag,       // Authenticated Tag output.
16198c2ecf20Sopenharmony_ci*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
16208c2ecf20Sopenharmony_ci*                                        // 12 or 8.
16218c2ecf20Sopenharmony_ci*
16228c2ecf20Sopenharmony_ci* Assumptions:
16238c2ecf20Sopenharmony_ci*
16248c2ecf20Sopenharmony_ci* keys:
16258c2ecf20Sopenharmony_ci*       keys are pre-expanded and aligned to 16 bytes. we are using the
16268c2ecf20Sopenharmony_ci*       first set of 11 keys in the data structure void *aes_ctx
16278c2ecf20Sopenharmony_ci*
16288c2ecf20Sopenharmony_ci*
16298c2ecf20Sopenharmony_ci* iv:
16308c2ecf20Sopenharmony_ci*       0                   1                   2                   3
16318c2ecf20Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
16328c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16338c2ecf20Sopenharmony_ci*       |                             Salt  (From the SA)               |
16348c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16358c2ecf20Sopenharmony_ci*       |                     Initialization Vector                     |
16368c2ecf20Sopenharmony_ci*       |         (This is the sequence number from IPSec header)       |
16378c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16388c2ecf20Sopenharmony_ci*       |                              0x1                              |
16398c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16408c2ecf20Sopenharmony_ci*
16418c2ecf20Sopenharmony_ci*
16428c2ecf20Sopenharmony_ci*
16438c2ecf20Sopenharmony_ci* AAD:
16448c2ecf20Sopenharmony_ci*       AAD padded to 128 bits with 0
16458c2ecf20Sopenharmony_ci*       for example, assume AAD is a u32 vector
16468c2ecf20Sopenharmony_ci*
16478c2ecf20Sopenharmony_ci*       if AAD is 8 bytes:
16488c2ecf20Sopenharmony_ci*       AAD[3] = {A0, A1};
16498c2ecf20Sopenharmony_ci*       padded AAD in xmm register = {A1 A0 0 0}
16508c2ecf20Sopenharmony_ci*
16518c2ecf20Sopenharmony_ci*       0                   1                   2                   3
16528c2ecf20Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
16538c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16548c2ecf20Sopenharmony_ci*       |                               SPI (A1)                        |
16558c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16568c2ecf20Sopenharmony_ci*       |                     32-bit Sequence Number (A0)               |
16578c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16588c2ecf20Sopenharmony_ci*       |                              0x0                              |
16598c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16608c2ecf20Sopenharmony_ci*
16618c2ecf20Sopenharmony_ci*                                 AAD Format with 32-bit Sequence Number
16628c2ecf20Sopenharmony_ci*
16638c2ecf20Sopenharmony_ci*       if AAD is 12 bytes:
16648c2ecf20Sopenharmony_ci*       AAD[3] = {A0, A1, A2};
16658c2ecf20Sopenharmony_ci*       padded AAD in xmm register = {A2 A1 A0 0}
16668c2ecf20Sopenharmony_ci*
16678c2ecf20Sopenharmony_ci*       0                   1                   2                   3
16688c2ecf20Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
16698c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16708c2ecf20Sopenharmony_ci*       |                               SPI (A2)                        |
16718c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16728c2ecf20Sopenharmony_ci*       |                 64-bit Extended Sequence Number {A1,A0}       |
16738c2ecf20Sopenharmony_ci*       |                                                               |
16748c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16758c2ecf20Sopenharmony_ci*       |                              0x0                              |
16768c2ecf20Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
16778c2ecf20Sopenharmony_ci*
16788c2ecf20Sopenharmony_ci*                         AAD Format with 64-bit Extended Sequence Number
16798c2ecf20Sopenharmony_ci*
16808c2ecf20Sopenharmony_ci* poly = x^128 + x^127 + x^126 + x^121 + 1
16818c2ecf20Sopenharmony_ci***************************************************************************/
16828c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc)
	/*
	 * One-shot GCM encryption: the three GCM macro stages run back to
	 * back. The same stages are exported one at a time as
	 * aesni_gcm_init (GCM_INIT), aesni_gcm_enc_update (GCM_ENC_DEC enc)
	 * and aesni_gcm_finalize (GCM_COMPLETE).
	 *
	 * FUNC_SAVE, FUNC_RESTORE and the GCM_* macros are defined outside
	 * this function.
	 * NOTE(review): FUNC_SAVE/FUNC_RESTORE presumably save and restore
	 * the callee-saved state the macros use -- confirm at their
	 * definitions.
	 */
16838c2ecf20Sopenharmony_ci	FUNC_SAVE
16848c2ecf20Sopenharmony_ci
	/*
	 * arg7..arg11 are written without the % sigil, unlike %arg6.
	 * NOTE(review): presumably they are preprocessor names for
	 * stack-resident arguments rather than registers -- confirm.
	 */
16858c2ecf20Sopenharmony_ci	GCM_INIT %arg6, arg7, arg8, arg9
16868c2ecf20Sopenharmony_ci	GCM_ENC_DEC enc
16878c2ecf20Sopenharmony_ci
16888c2ecf20Sopenharmony_ci	GCM_COMPLETE arg10, arg11
16898c2ecf20Sopenharmony_ci	FUNC_RESTORE
16908c2ecf20Sopenharmony_ci	RET
16918c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc)
16928c2ecf20Sopenharmony_ci
16938c2ecf20Sopenharmony_ci/*****************************************************************************
16948c2ecf20Sopenharmony_ci* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
16958c2ecf20Sopenharmony_ci*                     struct gcm_context_data *data,
16968c2ecf20Sopenharmony_ci*                                         // context data
16978c2ecf20Sopenharmony_ci*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
16988c2ecf20Sopenharmony_ci*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
16998c2ecf20Sopenharmony_ci*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
17008c2ecf20Sopenharmony_ci*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
17018c2ecf20Sopenharmony_ci*                     const u8 *aad,      // Additional Authentication Data (AAD)
17028c2ecf20Sopenharmony_ci*                     u64 aad_len)        // Length of AAD in bytes.
*
* First stage of the split init / update / finalize GCM interface. The whole
* body is a single GCM_INIT invocation bracketed by FUNC_SAVE/FUNC_RESTORE.
17038c2ecf20Sopenharmony_ci*/
17048c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_init)
17058c2ecf20Sopenharmony_ci	FUNC_SAVE
	/*
	 * Only arguments 3..6 are named here; by the prototype above they
	 * are iv, hash_subkey, aad and aad_len, in that order.
	 * NOTE(review): aes_ctx and data are presumably read by GCM_INIT
	 * straight from the first two argument registers -- confirm.
	 */
17068c2ecf20Sopenharmony_ci	GCM_INIT %arg3, %arg4,%arg5, %arg6
17078c2ecf20Sopenharmony_ci	FUNC_RESTORE
17088c2ecf20Sopenharmony_ci	RET
17098c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_init)
17108c2ecf20Sopenharmony_ci
17118c2ecf20Sopenharmony_ci/*****************************************************************************
17128c2ecf20Sopenharmony_ci* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
17138c2ecf20Sopenharmony_ci*                    struct gcm_context_data *data,
17148c2ecf20Sopenharmony_ci*                                        // context data
17158c2ecf20Sopenharmony_ci*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
17168c2ecf20Sopenharmony_ci*                    const u8 *in,       // Plaintext input
17178c2ecf20Sopenharmony_ci*                    u64 plaintext_len); // Length of data in bytes for encryption.
*
* Middle stage of the split GCM interface: the body is GCM_ENC_DEC in its
* "enc" direction, bracketed by FUNC_SAVE/FUNC_RESTORE.
17188c2ecf20Sopenharmony_ci*/
17198c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc_update)
17208c2ecf20Sopenharmony_ci	FUNC_SAVE
	/*
	 * The macro is given only the direction keyword.
	 * NOTE(review): it presumably reads aes_ctx, data, out, in and the
	 * length from the argument registers itself -- confirm.
	 */
17218c2ecf20Sopenharmony_ci	GCM_ENC_DEC enc
17228c2ecf20Sopenharmony_ci	FUNC_RESTORE
17238c2ecf20Sopenharmony_ci	RET
17248c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc_update)
17258c2ecf20Sopenharmony_ci
17268c2ecf20Sopenharmony_ci/*****************************************************************************
17278c2ecf20Sopenharmony_ci* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
17288c2ecf20Sopenharmony_ci*                    struct gcm_context_data *data,
17298c2ecf20Sopenharmony_ci*                                        // context data
17308c2ecf20Sopenharmony_ci*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
17318c2ecf20Sopenharmony_ci*                    const u8 *in,       // Ciphertext input
17328c2ecf20Sopenharmony_ci*                    u64 plaintext_len); // Length of data in bytes for decryption.
*
* Middle stage of the split GCM interface: the body is GCM_ENC_DEC in its
* "dec" direction, bracketed by FUNC_SAVE/FUNC_RESTORE.
* NOTE(review): the length parameter keeps the name used by the encrypt
* variant; confirm its name against the C prototype.
17338c2ecf20Sopenharmony_ci*/
17348c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec_update)
17358c2ecf20Sopenharmony_ci	FUNC_SAVE
	/*
	 * The macro is given only the direction keyword.
	 * NOTE(review): it presumably reads aes_ctx, data, out, in and the
	 * length from the argument registers itself -- confirm.
	 */
17368c2ecf20Sopenharmony_ci	GCM_ENC_DEC dec
17378c2ecf20Sopenharmony_ci	FUNC_RESTORE
17388c2ecf20Sopenharmony_ci	RET
17398c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec_update)
17408c2ecf20Sopenharmony_ci
17418c2ecf20Sopenharmony_ci/*****************************************************************************
17428c2ecf20Sopenharmony_ci* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
17438c2ecf20Sopenharmony_ci*                    struct gcm_context_data *data,
17448c2ecf20Sopenharmony_ci*                                        // context data
17458c2ecf20Sopenharmony_ci*                    u8 *auth_tag,       // Authenticated Tag output.
17468c2ecf20Sopenharmony_ci*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
17478c2ecf20Sopenharmony_ci*                                        // 12 or 8.
*
* Last stage of the split GCM interface: the body is a single GCM_COMPLETE
* invocation bracketed by FUNC_SAVE/FUNC_RESTORE.
17488c2ecf20Sopenharmony_ci*/
17498c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_finalize)
17508c2ecf20Sopenharmony_ci	FUNC_SAVE
	/*
	 * By the prototype above, %arg3 is auth_tag and %arg4 is
	 * auth_tag_len. The two macro operands are separated by a blank
	 * rather than a comma here.
	 */
17518c2ecf20Sopenharmony_ci	GCM_COMPLETE %arg3 %arg4
17528c2ecf20Sopenharmony_ci	FUNC_RESTORE
17538c2ecf20Sopenharmony_ci	RET
17548c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_finalize)
17558c2ecf20Sopenharmony_ci
17568c2ecf20Sopenharmony_ci#endif
17578c2ecf20Sopenharmony_ci
17588c2ecf20Sopenharmony_ci
/*
 * _key_expansion_128 / _key_expansion_256a:	internal ABI
 * (one body, two entry-point names)
 * input:
 *	%xmm0:		round key to derive the next one from
 *	%xmm1:		aeskeygenassist output supplied by the caller
 *	%xmm4:		dword 0 must be zero (aesni_set_key clears %xmm4;
 *			both shufps below copy dword 0 onto itself)
 *	TKEYP:		store address for the new round key
 * output:
 *	%xmm0:		new round key, also written to (TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
17598c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
17608c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_256a)
	/* broadcast dword 3 of the aeskeygenassist output to all four dwords */
17618c2ecf20Sopenharmony_ci	pshufd $0b11111111, %xmm1, %xmm1
	/* xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]} (dwords, low to high) */
17628c2ecf20Sopenharmony_ci	shufps $0b00010000, %xmm0, %xmm4
17638c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm0
	/* xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]} */
17648c2ecf20Sopenharmony_ci	shufps $0b10001100, %xmm0, %xmm4
	/*
	 * With xmm4[0] == 0, after this xor xmm0 holds the running XOR of
	 * its original dwords: {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}.
	 */
17658c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm0
	/* fold the broadcast keygenassist dword into every lane */
17668c2ecf20Sopenharmony_ci	pxor %xmm1, %xmm0
	/* movaps: the destination must be 16-byte aligned */
17678c2ecf20Sopenharmony_ci	movaps %xmm0, (TKEYP)
17688c2ecf20Sopenharmony_ci	add $0x10, TKEYP
17698c2ecf20Sopenharmony_ci	RET
17708c2ecf20Sopenharmony_ciSYM_FUNC_END(_key_expansion_256a)
17718c2ecf20Sopenharmony_ciSYM_FUNC_END_ALIAS(_key_expansion_128)
17728c2ecf20Sopenharmony_ci
/*
 * _key_expansion_192a:	internal ABI
 * Derives six new key dwords and stores TWO 16-byte round keys.
 * input:
 *	%xmm0:		previous key dwords 0-3
 *	%xmm2:		previous key dwords 4-5 in its low 64 bits
 *	%xmm1:		aeskeygenassist output supplied by the caller
 *	%xmm4:		dword 0 must be zero (both shufps on %xmm4 below
 *			copy dword 0 onto itself)
 *	TKEYP:		store address
 * output:
 *	%xmm0:		new key dwords 0-3
 *	%xmm2:		new key dwords 4-5 in its low 64 bits
 *	(TKEYP):	{old xmm2[0], old xmm2[1], new xmm0[0], new xmm0[1]}
 *	0x10(TKEYP):	{new xmm0[2], new xmm0[3], new xmm2[0], new xmm2[1]}
 *	TKEYP:		advanced by 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
17738c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_192a)
	/* broadcast dword 1 of the aeskeygenassist output */
17748c2ecf20Sopenharmony_ci	pshufd $0b01010101, %xmm1, %xmm1
	/* next two shufps/pxor pairs: running XOR of the dwords of xmm0 */
17758c2ecf20Sopenharmony_ci	shufps $0b00010000, %xmm0, %xmm4
17768c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm0
17778c2ecf20Sopenharmony_ci	shufps $0b10001100, %xmm0, %xmm4
17788c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm0
17798c2ecf20Sopenharmony_ci	pxor %xmm1, %xmm0
17808c2ecf20Sopenharmony_ci
	/* xmm5 and xmm6 keep copies of the old dwords 4-5 */
17818c2ecf20Sopenharmony_ci	movaps %xmm2, %xmm5
17828c2ecf20Sopenharmony_ci	movaps %xmm2, %xmm6
	/* byte shift: xmm5 = {0, old xmm2[0], old xmm2[1], old xmm2[2]} */
17838c2ecf20Sopenharmony_ci	pslldq $4, %xmm5
	/* broadcast the new dword 3 of xmm0 */
17848c2ecf20Sopenharmony_ci	pshufd $0b11111111, %xmm0, %xmm3
	/*
	 * new xmm2[0] = old xmm2[0] ^ new xmm0[3]
	 * new xmm2[1] = old xmm2[1] ^ old xmm2[0] ^ new xmm0[3]
	 */
17858c2ecf20Sopenharmony_ci	pxor %xmm3, %xmm2
17868c2ecf20Sopenharmony_ci	pxor %xmm5, %xmm2
17878c2ecf20Sopenharmony_ci
17888c2ecf20Sopenharmony_ci	movaps %xmm0, %xmm1
	/* xmm6 = {xmm6[0], xmm6[1], xmm0[0], xmm0[1]} */
17898c2ecf20Sopenharmony_ci	shufps $0b01000100, %xmm0, %xmm6
17908c2ecf20Sopenharmony_ci	movaps %xmm6, (TKEYP)
	/* xmm1 = {xmm1[2], xmm1[3], xmm2[0], xmm2[1]} */
17918c2ecf20Sopenharmony_ci	shufps $0b01001110, %xmm2, %xmm1
17928c2ecf20Sopenharmony_ci	movaps %xmm1, 0x10(TKEYP)
17938c2ecf20Sopenharmony_ci	add $0x20, TKEYP
17948c2ecf20Sopenharmony_ci	RET
17958c2ecf20Sopenharmony_ciSYM_FUNC_END(_key_expansion_192a)
17968c2ecf20Sopenharmony_ci
/*
 * _key_expansion_192b:	internal ABI
 * Derives six new key dwords but stores only ONE 16-byte round key; the
 * remaining two dwords stay in %xmm2 for the next call.
 * input:
 *	%xmm0:		previous key dwords 0-3
 *	%xmm2:		previous key dwords 4-5 in its low 64 bits
 *	%xmm1:		aeskeygenassist output supplied by the caller
 *	%xmm4:		dword 0 must be zero (both shufps below copy
 *			dword 0 onto itself)
 *	TKEYP:		store address
 * output:
 *	%xmm0:		new key dwords 0-3, also written to (TKEYP)
 *	%xmm2:		new key dwords 4-5 in its low 64 bits
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
17978c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_192b)
	/* broadcast dword 1 of the aeskeygenassist output */
17988c2ecf20Sopenharmony_ci	pshufd $0b01010101, %xmm1, %xmm1
	/* next two shufps/pxor pairs: running XOR of the dwords of xmm0 */
17998c2ecf20Sopenharmony_ci	shufps $0b00010000, %xmm0, %xmm4
18008c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm0
18018c2ecf20Sopenharmony_ci	shufps $0b10001100, %xmm0, %xmm4
18028c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm0
18038c2ecf20Sopenharmony_ci	pxor %xmm1, %xmm0
18048c2ecf20Sopenharmony_ci
18058c2ecf20Sopenharmony_ci	movaps %xmm2, %xmm5
	/* byte shift: xmm5 = {0, old xmm2[0], old xmm2[1], old xmm2[2]} */
18068c2ecf20Sopenharmony_ci	pslldq $4, %xmm5
	/* broadcast the new dword 3 of xmm0 */
18078c2ecf20Sopenharmony_ci	pshufd $0b11111111, %xmm0, %xmm3
	/*
	 * new xmm2[0] = old xmm2[0] ^ new xmm0[3]
	 * new xmm2[1] = old xmm2[1] ^ old xmm2[0] ^ new xmm0[3]
	 */
18088c2ecf20Sopenharmony_ci	pxor %xmm3, %xmm2
18098c2ecf20Sopenharmony_ci	pxor %xmm5, %xmm2
18108c2ecf20Sopenharmony_ci
18118c2ecf20Sopenharmony_ci	movaps %xmm0, (TKEYP)
18128c2ecf20Sopenharmony_ci	add $0x10, TKEYP
18138c2ecf20Sopenharmony_ci	RET
18148c2ecf20Sopenharmony_ciSYM_FUNC_END(_key_expansion_192b)
18158c2ecf20Sopenharmony_ci
/*
 * _key_expansion_256b:	internal ABI
 * Same steps as _key_expansion_256a, but working on %xmm2 and using
 * dword 2 of the aeskeygenassist output instead of dword 3.
 * input:
 *	%xmm2:		round key to derive the next one from
 *	%xmm1:		aeskeygenassist output supplied by the caller
 *	%xmm4:		dword 0 must be zero (both shufps below copy
 *			dword 0 onto itself)
 *	TKEYP:		store address for the new round key
 * output:
 *	%xmm2:		new round key, also written to (TKEYP)
 *	TKEYP:		advanced by 0x10
 * changed:
 *	%xmm1, %xmm4
 */
18168c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_256b)
	/* broadcast dword 2 of the aeskeygenassist output */
18178c2ecf20Sopenharmony_ci	pshufd $0b10101010, %xmm1, %xmm1
	/* xmm4 = {xmm4[0], xmm4[0], xmm2[1], xmm2[0]} (dwords, low to high) */
18188c2ecf20Sopenharmony_ci	shufps $0b00010000, %xmm2, %xmm4
18198c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm2
	/* xmm4 = {xmm4[0], xmm4[3], xmm2[0], xmm2[2]} */
18208c2ecf20Sopenharmony_ci	shufps $0b10001100, %xmm2, %xmm4
	/* with xmm4[0] == 0, xmm2 becomes the running XOR of its dwords */
18218c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm2
18228c2ecf20Sopenharmony_ci	pxor %xmm1, %xmm2
18238c2ecf20Sopenharmony_ci	movaps %xmm2, (TKEYP)
18248c2ecf20Sopenharmony_ci	add $0x10, TKEYP
18258c2ecf20Sopenharmony_ci	RET
18268c2ecf20Sopenharmony_ciSYM_FUNC_END(_key_expansion_256b)
18278c2ecf20Sopenharmony_ci
18288c2ecf20Sopenharmony_ci/*
18298c2ecf20Sopenharmony_ci * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
18308c2ecf20Sopenharmony_ci *                   unsigned int key_len)
 *
 * Expands in_key into encryption round keys starting at ctx + 0, builds the
 * matching decryption round keys in the area starting at ctx + 240, and
 * records key_len at ctx + 480.
 * NOTE(review): 0 / 240 / 480 presumably correspond to the key_enc, key_dec
 * and key_length members of struct crypto_aes_ctx -- confirm.
 *
 * The key size is chosen from the low byte of key_len only: below 24 takes
 * the 128-bit path, exactly 24 the 192-bit path, anything else the 256-bit
 * path. key_len is not validated here.
 *
 * ctx must be 16-byte aligned (round keys are stored with movaps); in_key
 * may be unaligned (it is read with movups/movq).
 *
 * Always returns 0.
18318c2ecf20Sopenharmony_ci */
18328c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_set_key)
18338c2ecf20Sopenharmony_ci	FRAME_BEGIN
18348c2ecf20Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit build: preserve KEYP and fetch the arguments from the stack */
18358c2ecf20Sopenharmony_ci	pushl KEYP
18368c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
18378c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
18388c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
18398c2ecf20Sopenharmony_ci#endif
	/* round key 0 is the first 16 bytes of the user key, unchanged */
18408c2ecf20Sopenharmony_ci	movups (UKEYP), %xmm0		# user key (first 16 bytes)
18418c2ecf20Sopenharmony_ci	movaps %xmm0, (KEYP)
	/* TKEYP is the write cursor for the round keys that follow */
18428c2ecf20Sopenharmony_ci	lea 0x10(KEYP), TKEYP		# key addr
	/* remember key_len in the context; aesni_enc/aesni_dec read it back */
18438c2ecf20Sopenharmony_ci	movl %edx, 480(KEYP)
18448c2ecf20Sopenharmony_ci	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	/* one compare drives both branches: jb = below 24, je = exactly 24 */
18458c2ecf20Sopenharmony_ci	cmp $24, %dl
18468c2ecf20Sopenharmony_ci	jb .Lenc_key128
18478c2ecf20Sopenharmony_ci	je .Lenc_key192
	/*
	 * 256-bit key: the second 16 user-key bytes are round key 1. The
	 * 256a/256b helpers then alternately refresh xmm0 and xmm2 and store
	 * one round key each: 7 + 6 calls, 15 round keys in total.
	 */
18488c2ecf20Sopenharmony_ci	movups 0x10(UKEYP), %xmm2	# other user key
18498c2ecf20Sopenharmony_ci	movaps %xmm2, (TKEYP)
18508c2ecf20Sopenharmony_ci	add $0x10, TKEYP
18518c2ecf20Sopenharmony_ci	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
18528c2ecf20Sopenharmony_ci	call _key_expansion_256a
18538c2ecf20Sopenharmony_ci	aeskeygenassist $0x1, %xmm0, %xmm1
18548c2ecf20Sopenharmony_ci	call _key_expansion_256b
18558c2ecf20Sopenharmony_ci	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
18568c2ecf20Sopenharmony_ci	call _key_expansion_256a
18578c2ecf20Sopenharmony_ci	aeskeygenassist $0x2, %xmm0, %xmm1
18588c2ecf20Sopenharmony_ci	call _key_expansion_256b
18598c2ecf20Sopenharmony_ci	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
18608c2ecf20Sopenharmony_ci	call _key_expansion_256a
18618c2ecf20Sopenharmony_ci	aeskeygenassist $0x4, %xmm0, %xmm1
18628c2ecf20Sopenharmony_ci	call _key_expansion_256b
18638c2ecf20Sopenharmony_ci	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
18648c2ecf20Sopenharmony_ci	call _key_expansion_256a
18658c2ecf20Sopenharmony_ci	aeskeygenassist $0x8, %xmm0, %xmm1
18668c2ecf20Sopenharmony_ci	call _key_expansion_256b
18678c2ecf20Sopenharmony_ci	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
18688c2ecf20Sopenharmony_ci	call _key_expansion_256a
18698c2ecf20Sopenharmony_ci	aeskeygenassist $0x10, %xmm0, %xmm1
18708c2ecf20Sopenharmony_ci	call _key_expansion_256b
18718c2ecf20Sopenharmony_ci	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
18728c2ecf20Sopenharmony_ci	call _key_expansion_256a
18738c2ecf20Sopenharmony_ci	aeskeygenassist $0x20, %xmm0, %xmm1
18748c2ecf20Sopenharmony_ci	call _key_expansion_256b
18758c2ecf20Sopenharmony_ci	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
18768c2ecf20Sopenharmony_ci	call _key_expansion_256a
18778c2ecf20Sopenharmony_ci	jmp .Ldec_key
18788c2ecf20Sopenharmony_ci.Lenc_key192:
	/*
	 * 192-bit key: movq loads the remaining 8 key bytes into the low
	 * half of xmm2 and zeroes the high half. _key_expansion_192a stores
	 * two round keys per call and _key_expansion_192b one; four calls
	 * of each give 13 round keys in total.
	 */
18798c2ecf20Sopenharmony_ci	movq 0x10(UKEYP), %xmm2		# other user key
18808c2ecf20Sopenharmony_ci	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
18818c2ecf20Sopenharmony_ci	call _key_expansion_192a
18828c2ecf20Sopenharmony_ci	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
18838c2ecf20Sopenharmony_ci	call _key_expansion_192b
18848c2ecf20Sopenharmony_ci	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
18858c2ecf20Sopenharmony_ci	call _key_expansion_192a
18868c2ecf20Sopenharmony_ci	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
18878c2ecf20Sopenharmony_ci	call _key_expansion_192b
18888c2ecf20Sopenharmony_ci	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
18898c2ecf20Sopenharmony_ci	call _key_expansion_192a
18908c2ecf20Sopenharmony_ci	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
18918c2ecf20Sopenharmony_ci	call _key_expansion_192b
18928c2ecf20Sopenharmony_ci	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
18938c2ecf20Sopenharmony_ci	call _key_expansion_192a
18948c2ecf20Sopenharmony_ci	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
18958c2ecf20Sopenharmony_ci	call _key_expansion_192b
18968c2ecf20Sopenharmony_ci	jmp .Ldec_key
18978c2ecf20Sopenharmony_ci.Lenc_key128:
	/*
	 * 128-bit key: ten expansions, each deriving the next round key
	 * from xmm0; 11 round keys in total. Falls through to .Ldec_key.
	 */
18988c2ecf20Sopenharmony_ci	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
18998c2ecf20Sopenharmony_ci	call _key_expansion_128
19008c2ecf20Sopenharmony_ci	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
19018c2ecf20Sopenharmony_ci	call _key_expansion_128
19028c2ecf20Sopenharmony_ci	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
19038c2ecf20Sopenharmony_ci	call _key_expansion_128
19048c2ecf20Sopenharmony_ci	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
19058c2ecf20Sopenharmony_ci	call _key_expansion_128
19068c2ecf20Sopenharmony_ci	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
19078c2ecf20Sopenharmony_ci	call _key_expansion_128
19088c2ecf20Sopenharmony_ci	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
19098c2ecf20Sopenharmony_ci	call _key_expansion_128
19108c2ecf20Sopenharmony_ci	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
19118c2ecf20Sopenharmony_ci	call _key_expansion_128
19128c2ecf20Sopenharmony_ci	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
19138c2ecf20Sopenharmony_ci	call _key_expansion_128
19148c2ecf20Sopenharmony_ci	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
19158c2ecf20Sopenharmony_ci	call _key_expansion_128
19168c2ecf20Sopenharmony_ci	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
19178c2ecf20Sopenharmony_ci	call _key_expansion_128
19188c2ecf20Sopenharmony_ci.Ldec_key:
	/*
	 * Build the decryption schedule, 240 bytes above the encryption
	 * schedule, in reverse order. TKEYP arrives one slot past the last
	 * encryption round key; step it back onto that key.
	 */
19198c2ecf20Sopenharmony_ci	sub $0x10, TKEYP
	/* first and last round keys swap places and are copied unmodified */
19208c2ecf20Sopenharmony_ci	movaps (KEYP), %xmm0
19218c2ecf20Sopenharmony_ci	movaps (TKEYP), %xmm1
19228c2ecf20Sopenharmony_ci	movaps %xmm0, 240(TKEYP)
19238c2ecf20Sopenharmony_ci	movaps %xmm1, 240(KEYP)
	/*
	 * KEYP walks forward over the remaining encryption keys while UKEYP
	 * (the user key pointer is no longer needed) walks backward over
	 * the decryption slots.
	 */
19248c2ecf20Sopenharmony_ci	add $0x10, KEYP
19258c2ecf20Sopenharmony_ci	lea 240-16(TKEYP), UKEYP
19268c2ecf20Sopenharmony_ci.align 4
19278c2ecf20Sopenharmony_ci.Ldec_key_loop:
	/* every middle round key is passed through aesimc before storing */
19288c2ecf20Sopenharmony_ci	movaps (KEYP), %xmm0
19298c2ecf20Sopenharmony_ci	aesimc %xmm0, %xmm1
19308c2ecf20Sopenharmony_ci	movaps %xmm1, (UKEYP)
19318c2ecf20Sopenharmony_ci	add $0x10, KEYP
19328c2ecf20Sopenharmony_ci	sub $0x10, UKEYP
	/* unsigned compare: stop once KEYP reaches the last encryption key */
19338c2ecf20Sopenharmony_ci	cmp TKEYP, KEYP
19348c2ecf20Sopenharmony_ci	jb .Ldec_key_loop
	/* return value 0 */
19358c2ecf20Sopenharmony_ci	xor AREG, AREG
19368c2ecf20Sopenharmony_ci#ifndef __x86_64__
19378c2ecf20Sopenharmony_ci	popl KEYP
19388c2ecf20Sopenharmony_ci#endif
19398c2ecf20Sopenharmony_ci	FRAME_END
19408c2ecf20Sopenharmony_ci	RET
19418c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_set_key)
19428c2ecf20Sopenharmony_ci
19438c2ecf20Sopenharmony_ci/*
19448c2ecf20Sopenharmony_ci * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src into dst using the encryption
 * round keys at the start of ctx. The key length is read from ctx + 480,
 * where aesni_set_key stores it. src and dst may be unaligned (movups).
19458c2ecf20Sopenharmony_ci */
19468c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_enc)
19478c2ecf20Sopenharmony_ci	FRAME_BEGIN
19488c2ecf20Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit build: preserve KEYP/KLEN and fetch the arguments from the stack */
19498c2ecf20Sopenharmony_ci	pushl KEYP
19508c2ecf20Sopenharmony_ci	pushl KLEN
19518c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
19528c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
19538c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+20)(%esp), INP	# src
19548c2ecf20Sopenharmony_ci#endif
19558c2ecf20Sopenharmony_ci	movl 480(KEYP), KLEN		# key length
19568c2ecf20Sopenharmony_ci	movups (INP), STATE		# input
	/* _aesni_enc1 transforms STATE in place; see its internal ABI below */
19578c2ecf20Sopenharmony_ci	call _aesni_enc1
19588c2ecf20Sopenharmony_ci	movups STATE, (OUTP)		# output
19598c2ecf20Sopenharmony_ci#ifndef __x86_64__
	/* restore in the reverse order of the pushes above */
19608c2ecf20Sopenharmony_ci	popl KLEN
19618c2ecf20Sopenharmony_ci	popl KEYP
19628c2ecf20Sopenharmony_ci#endif
19638c2ecf20Sopenharmony_ci	FRAME_END
19648c2ecf20Sopenharmony_ci	RET
19658c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_enc)
19668c2ecf20Sopenharmony_ci
19678c2ecf20Sopenharmony_ci/*
19688c2ecf20Sopenharmony_ci * _aesni_enc1:		internal ABI
19698c2ecf20Sopenharmony_ci * input:
19708c2ecf20Sopenharmony_ci *	KEYP:		key struct pointer
19718c2ecf20Sopenharmony_ci *	KLEN:		key length (compared against 24 to pick the round count)
19728c2ecf20Sopenharmony_ci *	STATE:		initial state (input)
19738c2ecf20Sopenharmony_ci * output:
19748c2ecf20Sopenharmony_ci *	STATE:		final state (output)
19758c2ecf20Sopenharmony_ci * changed:
19768c2ecf20Sopenharmony_ci *	KEY
19778c2ecf20Sopenharmony_ci *	TKEYP (T1)
 *
 * Runs 10 rounds when KLEN is below 24, 12 when it equals 24 and 14
 * otherwise. The round keys are read with movaps, so KEYP must be 16-byte
 * aligned.
19788c2ecf20Sopenharmony_ci */
19798c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_aesni_enc1)
19808c2ecf20Sopenharmony_ci	movaps (KEYP), KEY		# key
19818c2ecf20Sopenharmony_ci	mov KEYP, TKEYP
19828c2ecf20Sopenharmony_ci	pxor KEY, STATE		# round 0
	/*
	 * TKEYP is biased so that every key size shares the tail starting at
	 * .Lenc128 with the same displacements: KEYP + 0x30 for the 10-round
	 * case, KEYP + 0x50 for 12 rounds and KEYP + 0x70 for 14 rounds.
	 */
19838c2ecf20Sopenharmony_ci	add $0x30, TKEYP
19848c2ecf20Sopenharmony_ci	cmp $24, KLEN
19858c2ecf20Sopenharmony_ci	jb .Lenc128
	/* lea, not add: the flags from the cmp above must survive for je */
19868c2ecf20Sopenharmony_ci	lea 0x20(TKEYP), TKEYP
19878c2ecf20Sopenharmony_ci	je .Lenc192
	/* 14-round case only: two extra rounds, then fall through */
19888c2ecf20Sopenharmony_ci	add $0x20, TKEYP
19898c2ecf20Sopenharmony_ci	movaps -0x60(TKEYP), KEY
19908c2ecf20Sopenharmony_ci	aesenc KEY, STATE
19918c2ecf20Sopenharmony_ci	movaps -0x50(TKEYP), KEY
19928c2ecf20Sopenharmony_ci	aesenc KEY, STATE
19938c2ecf20Sopenharmony_ci.align 4
19948c2ecf20Sopenharmony_ci.Lenc192:
	/* 12- and 14-round cases: two more rounds, then fall through */
19958c2ecf20Sopenharmony_ci	movaps -0x40(TKEYP), KEY
19968c2ecf20Sopenharmony_ci	aesenc KEY, STATE
19978c2ecf20Sopenharmony_ci	movaps -0x30(TKEYP), KEY
19988c2ecf20Sopenharmony_ci	aesenc KEY, STATE
19998c2ecf20Sopenharmony_ci.align 4
20008c2ecf20Sopenharmony_ci.Lenc128:
	/* common tail: nine aesenc rounds followed by aesenclast */
20018c2ecf20Sopenharmony_ci	movaps -0x20(TKEYP), KEY
20028c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20038c2ecf20Sopenharmony_ci	movaps -0x10(TKEYP), KEY
20048c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20058c2ecf20Sopenharmony_ci	movaps (TKEYP), KEY
20068c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20078c2ecf20Sopenharmony_ci	movaps 0x10(TKEYP), KEY
20088c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20098c2ecf20Sopenharmony_ci	movaps 0x20(TKEYP), KEY
20108c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20118c2ecf20Sopenharmony_ci	movaps 0x30(TKEYP), KEY
20128c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20138c2ecf20Sopenharmony_ci	movaps 0x40(TKEYP), KEY
20148c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20158c2ecf20Sopenharmony_ci	movaps 0x50(TKEYP), KEY
20168c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20178c2ecf20Sopenharmony_ci	movaps 0x60(TKEYP), KEY
20188c2ecf20Sopenharmony_ci	aesenc KEY, STATE
20198c2ecf20Sopenharmony_ci	movaps 0x70(TKEYP), KEY
20208c2ecf20Sopenharmony_ci	aesenclast KEY, STATE
20218c2ecf20Sopenharmony_ci	RET
20228c2ecf20Sopenharmony_ciSYM_FUNC_END(_aesni_enc1)
20238c2ecf20Sopenharmony_ci
20248c2ecf20Sopenharmony_ci/*
20258c2ecf20Sopenharmony_ci * _aesni_enc4:	internal ABI
20268c2ecf20Sopenharmony_ci * input:
20278c2ecf20Sopenharmony_ci *	KEYP:		key struct pointer
20288c2ecf20Sopenharmony_ci *	KLEN:		key length (compared against 24 to pick the round count)
20298c2ecf20Sopenharmony_ci *	STATE1:		initial state (input)
20308c2ecf20Sopenharmony_ci *	STATE2
20318c2ecf20Sopenharmony_ci *	STATE3
20328c2ecf20Sopenharmony_ci *	STATE4
20338c2ecf20Sopenharmony_ci * output:
20348c2ecf20Sopenharmony_ci *	STATE1:		final state (output)
20358c2ecf20Sopenharmony_ci *	STATE2
20368c2ecf20Sopenharmony_ci *	STATE3
20378c2ecf20Sopenharmony_ci *	STATE4
20388c2ecf20Sopenharmony_ci * changed:
20398c2ecf20Sopenharmony_ci *	KEY
20408c2ecf20Sopenharmony_ci *	TKEYP (T1)
 *
 * Four-block variant of _aesni_enc1: each round key is loaded once and
 * applied to STATE1..STATE4 in turn. Runs 10 rounds when KLEN is below 24,
 * 12 when it equals 24 and 14 otherwise.
20418c2ecf20Sopenharmony_ci */
20428c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_aesni_enc4)
20438c2ecf20Sopenharmony_ci	movaps (KEYP), KEY		# key
20448c2ecf20Sopenharmony_ci	mov KEYP, TKEYP
20458c2ecf20Sopenharmony_ci	pxor KEY, STATE1		# round 0
20468c2ecf20Sopenharmony_ci	pxor KEY, STATE2
20478c2ecf20Sopenharmony_ci	pxor KEY, STATE3
20488c2ecf20Sopenharmony_ci	pxor KEY, STATE4
	/*
	 * TKEYP is biased so that every key size shares the tail starting at
	 * .L4enc128 with the same displacements: KEYP + 0x30 for the
	 * 10-round case, KEYP + 0x50 for 12 rounds and KEYP + 0x70 for 14.
	 */
20498c2ecf20Sopenharmony_ci	add $0x30, TKEYP
20508c2ecf20Sopenharmony_ci	cmp $24, KLEN
20518c2ecf20Sopenharmony_ci	jb .L4enc128
	/* lea, not add: the flags from the cmp above must survive for je */
20528c2ecf20Sopenharmony_ci	lea 0x20(TKEYP), TKEYP
20538c2ecf20Sopenharmony_ci	je .L4enc192
	/* 14-round case only: two extra rounds, then fall through */
20548c2ecf20Sopenharmony_ci	add $0x20, TKEYP
20558c2ecf20Sopenharmony_ci	movaps -0x60(TKEYP), KEY
20568c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20578c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20588c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20598c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20608c2ecf20Sopenharmony_ci	movaps -0x50(TKEYP), KEY
20618c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20628c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20638c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20648c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20658c2ecf20Sopenharmony_ci#.align 4
20668c2ecf20Sopenharmony_ci.L4enc192:
	/* 12- and 14-round cases: two more rounds, then fall through */
20678c2ecf20Sopenharmony_ci	movaps -0x40(TKEYP), KEY
20688c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20698c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20708c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20718c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20728c2ecf20Sopenharmony_ci	movaps -0x30(TKEYP), KEY
20738c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20748c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20758c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20768c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20778c2ecf20Sopenharmony_ci#.align 4
20788c2ecf20Sopenharmony_ci.L4enc128:
	/* common tail: nine aesenc rounds followed by aesenclast */
20798c2ecf20Sopenharmony_ci	movaps -0x20(TKEYP), KEY
20808c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20818c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20828c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20838c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20848c2ecf20Sopenharmony_ci	movaps -0x10(TKEYP), KEY
20858c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20868c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20878c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20888c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20898c2ecf20Sopenharmony_ci	movaps (TKEYP), KEY
20908c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20918c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20928c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20938c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20948c2ecf20Sopenharmony_ci	movaps 0x10(TKEYP), KEY
20958c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
20968c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
20978c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
20988c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
20998c2ecf20Sopenharmony_ci	movaps 0x20(TKEYP), KEY
21008c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
21018c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
21028c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
21038c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
21048c2ecf20Sopenharmony_ci	movaps 0x30(TKEYP), KEY
21058c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
21068c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
21078c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
21088c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
21098c2ecf20Sopenharmony_ci	movaps 0x40(TKEYP), KEY
21108c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
21118c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
21128c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
21138c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
21148c2ecf20Sopenharmony_ci	movaps 0x50(TKEYP), KEY
21158c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
21168c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
21178c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
21188c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
21198c2ecf20Sopenharmony_ci	movaps 0x60(TKEYP), KEY
21208c2ecf20Sopenharmony_ci	aesenc KEY, STATE1
21218c2ecf20Sopenharmony_ci	aesenc KEY, STATE2
21228c2ecf20Sopenharmony_ci	aesenc KEY, STATE3
21238c2ecf20Sopenharmony_ci	aesenc KEY, STATE4
21248c2ecf20Sopenharmony_ci	movaps 0x70(TKEYP), KEY
21258c2ecf20Sopenharmony_ci	aesenclast KEY, STATE1		# last round
21268c2ecf20Sopenharmony_ci	aesenclast KEY, STATE2
21278c2ecf20Sopenharmony_ci	aesenclast KEY, STATE3
21288c2ecf20Sopenharmony_ci	aesenclast KEY, STATE4
21298c2ecf20Sopenharmony_ci	RET
21308c2ecf20Sopenharmony_ciSYM_FUNC_END(_aesni_enc4)
21318c2ecf20Sopenharmony_ci
21328c2ecf20Sopenharmony_ci/*
21338c2ecf20Sopenharmony_ci * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src into dst. The key length is read
 * from ctx + 480 and the round keys from ctx + 240, which is where
 * aesni_set_key writes the decryption schedule. src and dst may be
 * unaligned (movups).
21348c2ecf20Sopenharmony_ci */
21358c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_dec)
21368c2ecf20Sopenharmony_ci	FRAME_BEGIN
21378c2ecf20Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit build: preserve KEYP/KLEN and fetch the arguments from the stack */
21388c2ecf20Sopenharmony_ci	pushl KEYP
21398c2ecf20Sopenharmony_ci	pushl KLEN
21408c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
21418c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
21428c2ecf20Sopenharmony_ci	movl (FRAME_OFFSET+20)(%esp), INP	# src
21438c2ecf20Sopenharmony_ci#endif
	/* read the key length before KEYP is moved off the start of ctx */
21448c2ecf20Sopenharmony_ci	mov 480(KEYP), KLEN		# key length
	/* point KEYP at the first decryption round key */
21458c2ecf20Sopenharmony_ci	add $240, KEYP
21468c2ecf20Sopenharmony_ci	movups (INP), STATE		# input
	/* _aesni_dec1 transforms STATE in place; see its internal ABI below */
21478c2ecf20Sopenharmony_ci	call _aesni_dec1
21488c2ecf20Sopenharmony_ci	movups STATE, (OUTP)		#output
21498c2ecf20Sopenharmony_ci#ifndef __x86_64__
	/* restore in the reverse order of the pushes above */
21508c2ecf20Sopenharmony_ci	popl KLEN
21518c2ecf20Sopenharmony_ci	popl KEYP
21528c2ecf20Sopenharmony_ci#endif
21538c2ecf20Sopenharmony_ci	FRAME_END
21548c2ecf20Sopenharmony_ci	RET
21558c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_dec)
21568c2ecf20Sopenharmony_ci
21578c2ecf20Sopenharmony_ci/*
21588c2ecf20Sopenharmony_ci * _aesni_dec1:		internal ABI
21598c2ecf20Sopenharmony_ci * input:
21608c2ecf20Sopenharmony_ci *	KEYP:		pointer to the first decryption round key
 *			(aesni_dec passes ctx + 240)
21618c2ecf20Sopenharmony_ci *	KLEN:		key length
21628c2ecf20Sopenharmony_ci *	STATE:		initial state (input)
21638c2ecf20Sopenharmony_ci * output:
21648c2ecf20Sopenharmony_ci *	STATE:		final state (output)
21658c2ecf20Sopenharmony_ci * changed:
21668c2ecf20Sopenharmony_ci *	KEY
21678c2ecf20Sopenharmony_ci *	TKEYP (T1)
 *
 * Runs 10 rounds when KLEN is below 24, 12 when it equals 24 and 14
 * otherwise. The round keys are read with movaps, so KEYP must be 16-byte
 * aligned.
21688c2ecf20Sopenharmony_ci */
21698c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(_aesni_dec1)
21708c2ecf20Sopenharmony_ci	movaps (KEYP), KEY		# key
21718c2ecf20Sopenharmony_ci	mov KEYP, TKEYP
21728c2ecf20Sopenharmony_ci	pxor KEY, STATE		# round 0
	/*
	 * TKEYP is biased so that every key size shares the tail starting at
	 * .Ldec128 with the same displacements: KEYP + 0x30 for the 10-round
	 * case, KEYP + 0x50 for 12 rounds and KEYP + 0x70 for 14 rounds.
	 */
21738c2ecf20Sopenharmony_ci	add $0x30, TKEYP
21748c2ecf20Sopenharmony_ci	cmp $24, KLEN
21758c2ecf20Sopenharmony_ci	jb .Ldec128
	/* lea, not add: the flags from the cmp above must survive for je */
21768c2ecf20Sopenharmony_ci	lea 0x20(TKEYP), TKEYP
21778c2ecf20Sopenharmony_ci	je .Ldec192
	/* 14-round case only: two extra rounds, then fall through */
21788c2ecf20Sopenharmony_ci	add $0x20, TKEYP
21798c2ecf20Sopenharmony_ci	movaps -0x60(TKEYP), KEY
21808c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21818c2ecf20Sopenharmony_ci	movaps -0x50(TKEYP), KEY
21828c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21838c2ecf20Sopenharmony_ci.align 4
21848c2ecf20Sopenharmony_ci.Ldec192:
	/* 12- and 14-round cases: two more rounds, then fall through */
21858c2ecf20Sopenharmony_ci	movaps -0x40(TKEYP), KEY
21868c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21878c2ecf20Sopenharmony_ci	movaps -0x30(TKEYP), KEY
21888c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21898c2ecf20Sopenharmony_ci.align 4
21908c2ecf20Sopenharmony_ci.Ldec128:
	/* common tail: nine aesdec rounds followed by aesdeclast */
21918c2ecf20Sopenharmony_ci	movaps -0x20(TKEYP), KEY
21928c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21938c2ecf20Sopenharmony_ci	movaps -0x10(TKEYP), KEY
21948c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21958c2ecf20Sopenharmony_ci	movaps (TKEYP), KEY
21968c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21978c2ecf20Sopenharmony_ci	movaps 0x10(TKEYP), KEY
21988c2ecf20Sopenharmony_ci	aesdec KEY, STATE
21998c2ecf20Sopenharmony_ci	movaps 0x20(TKEYP), KEY
22008c2ecf20Sopenharmony_ci	aesdec KEY, STATE
22018c2ecf20Sopenharmony_ci	movaps 0x30(TKEYP), KEY
22028c2ecf20Sopenharmony_ci	aesdec KEY, STATE
22038c2ecf20Sopenharmony_ci	movaps 0x40(TKEYP), KEY
22048c2ecf20Sopenharmony_ci	aesdec KEY, STATE
22058c2ecf20Sopenharmony_ci	movaps 0x50(TKEYP), KEY
22068c2ecf20Sopenharmony_ci	aesdec KEY, STATE
22078c2ecf20Sopenharmony_ci	movaps 0x60(TKEYP), KEY
22088c2ecf20Sopenharmony_ci	aesdec KEY, STATE
22098c2ecf20Sopenharmony_ci	movaps 0x70(TKEYP), KEY
22108c2ecf20Sopenharmony_ci	aesdeclast KEY, STATE
22118c2ecf20Sopenharmony_ci	RET
22128c2ecf20Sopenharmony_ciSYM_FUNC_END(_aesni_dec1)
22138c2ecf20Sopenharmony_ci
22148c2ecf20Sopenharmony_ci/*
22158c2ecf20Sopenharmony_ci * _aesni_dec4:	internal ABI
22168c2ecf20Sopenharmony_ci * input:
22178c2ecf20Sopenharmony_ci *	KEYP:		key struct pointer
22188c2ecf20Sopenharmony_ci *	KLEN:		key length
22198c2ecf20Sopenharmony_ci *	STATE1:		initial state (input)
22208c2ecf20Sopenharmony_ci *	STATE2
22218c2ecf20Sopenharmony_ci *	STATE3
22228c2ecf20Sopenharmony_ci *	STATE4
22238c2ecf20Sopenharmony_ci * output:
22248c2ecf20Sopenharmony_ci *	STATE1:		finial state (output)
22258c2ecf20Sopenharmony_ci *	STATE2
22268c2ecf20Sopenharmony_ci *	STATE3
22278c2ecf20Sopenharmony_ci *	STATE4
22288c2ecf20Sopenharmony_ci * changed:
22298c2ecf20Sopenharmony_ci *	KEY
22308c2ecf20Sopenharmony_ci *	TKEYP (T1)
22318c2ecf20Sopenharmony_ci */
/*
 * 4-way decryption core.  The four states are pushed through the same
 * rounds in lock step, with a single round-key load into KEY per round.
 *
 * TKEYP is biased by key size (KEYP + 0x30 / 0x50 / 0x70) so that the
 * unrolled tail starting at .L4dec128 can address its round keys with the
 * fixed displacements -0x20 .. 0x70 for every key size.  Longer keys only
 * run extra rounds in front of that tail:
 *	KLEN <  24: 9 x aesdec + aesdeclast
 *	KLEN == 24: 2 additional aesdec rounds first
 *	KLEN >  24: 4 additional aesdec rounds first
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the flags of the cmp for je
	je .L4dec192
	add $0x20, TKEYP
	/* two rounds that only the largest key size executes */
	movaps -0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec192:
	/* two rounds shared by the KLEN >= 24 paths */
	movaps -0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
.align 4
.L4dec128:
	/* common tail: nine aesdec rounds followed by aesdeclast */
	movaps -0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps -0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps (TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x10(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x20(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x30(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x40(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x50(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x60(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
23218c2ecf20Sopenharmony_ci
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypt whole 16-byte blocks from src to dst.  While at least 64
 * bytes remain, four blocks are processed per iteration via _aesni_enc4;
 * the rest goes one block at a time through _aesni_enc1.  A trailing
 * partial block (fewer than 16 bytes) is left untouched, and len < 16 does
 * nothing at all.
 *
 * KLEN is read from offset 480 of ctx (presumably the key length field of
 * struct crypto_aes_ctx -- confirm against the struct definition).
 *
 * On 32-bit, LEN, KEYP and KLEN are saved on the stack and the arguments
 * are loaded from the caller frame; the three pushes plus the return
 * address account for the 16 in the first argument offset.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# length is unsigned, same as the entry checks
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned compare on the remaining length
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
23818c2ecf20Sopenharmony_ci
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt whole 16-byte blocks from src to dst.  While at least 64
 * bytes remain, four blocks are processed per iteration via _aesni_dec4;
 * the rest goes one block at a time through _aesni_dec1.  A trailing
 * partial block (fewer than 16 bytes) is left untouched, and len < 16 does
 * nothing at all.
 *
 * KLEN is read from offset 480 of ctx, then KEYP is advanced by 240 before
 * any round key is used (presumably to the decryption key schedule inside
 * struct crypto_aes_ctx -- confirm against the struct definition).
 *
 * On 32-bit, LEN, KEYP and KLEN are saved on the stack and the arguments
 * are loaded from the caller frame.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# length is unsigned, same as the entry checks
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned compare on the remaining length
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
24428c2ecf20Sopenharmony_ci
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt whole 16-byte blocks from src to dst.  STATE carries the
 * chaining value: it starts as *iv, is XORed with each input block,
 * encrypted with _aesni_enc1 and stored as the output block.  After the
 * last full block the final chaining value is written back to *iv.
 *
 * With len < 16 nothing is processed and *iv is not written.  A trailing
 * partial block is ignored.
 *
 * On 32-bit, IVP, LEN, KEYP and KLEN are saved on the stack and the
 * arguments are loaded from the caller frame; the four pushes plus the
 * return address account for the 20 in the first argument offset.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# length is unsigned, same as the entry check
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
24868c2ecf20Sopenharmony_ci
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt whole 16-byte blocks from src to dst.  While at least 64
 * bytes remain, four blocks are decrypted per iteration via _aesni_dec4;
 * the rest goes one block at a time through _aesni_dec1.  Each decrypted
 * block is XORed with the previous ciphertext block (IV for the first
 * one), and the last ciphertext block consumed becomes the new IV, which
 * is written back to *iv on exit.
 *
 * With len < 16 nothing is processed and *iv is not written.  A trailing
 * partial block is ignored.
 *
 * KLEN is read from offset 480 of ctx, then KEYP is advanced by 240 before
 * any round key is used (presumably to the decryption key schedule inside
 * struct crypto_aes_ctx -- confirm against the struct definition).
 *
 * The 32-bit variant of the 4-block loop only uses IN1 and IN2: they are
 * reused for blocks 3 and 4, so ciphertext blocks 1 and 2 are loaded from
 * INP a second time after decryption.  That reload happens before anything
 * is stored to OUTP.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1		# IN1 now holds ciphertext block 3
	movaps IN1, STATE3
	movups 0x30(INP), IN2		# IN2 now holds ciphertext block 4
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4		# block 3 chains into block 4
	movaps IN2, IV			# block 4 is the next IV
	movups (INP), IN1		# reload ciphertext block 1
	pxor IN1, STATE2
	movups 0x10(INP), IN2		# reload ciphertext block 2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# length is unsigned, same as the entry checks
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned compare on the remaining length
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
25798c2ecf20Sopenharmony_ci
25808c2ecf20Sopenharmony_ci#ifdef __x86_64__
/*
 * pshufb control mask that reverses the order of the 16 bytes of an XMM
 * register.  Kept 16-byte aligned because it is loaded with movaps.
 */
.pushsection .rodata
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
25868c2ecf20Sopenharmony_ci
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 *
 * TCTR_LOW doubles as a scratch register to build the constant 1 for INC
 * before it receives its documented value.  This code is only assembled
 * for x86_64, so the mask is addressed RIP-relative.
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# byte-reverse the IV into CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# movq into an XMM register clears the upper qword
	movq CTR, TCTR_LOW
	RET
SYM_FUNC_END(_aesni_inc_init)
26078c2ecf20Sopenharmony_ci
/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	CTR:	== current IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increase by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * The previous contents of IV are not read: the new IV is rebuilt from
 * CTR.  paddq adds the two qwords independently, so a carry out of the
 * low qword is detected with the integer copy in TCTR_LOW and propagated
 * by hand.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC			# move the 1 into the upper qword
	paddq INC, CTR			# apply the carry
	psrldq $8, INC			# restore INC == 1
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian
	RET
SYM_FUNC_END(_aesni_inc)
26358c2ecf20Sopenharmony_ci
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode over whole 16-byte blocks.  Each counter value is copied from
 * IV into a state register and IV is then advanced with _aesni_inc; the
 * encrypted counters are XORed with the input blocks and stored to dst.
 * While at least 64 bytes remain, four blocks are handled per iteration
 * via _aesni_enc4; the rest goes one block at a time via _aesni_enc1.  The
 * counter following the last one used is written back to *iv.
 *
 * With len < 16 nothing is processed and *iv is not written.  A trailing
 * partial block is ignored.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# length is unsigned, same as the entry checks
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned compare on the remaining length
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
26988c2ecf20Sopenharmony_ci
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * paddq doubles each qword of IV separately, which drops the top bit of
 * both halves.  Before that, pshufd copies the dwords holding those top
 * bits into CTR and psrad $31 turns each into an all-ones or all-zeroes
 * dword.  Masking with GF128MUL_MASK and XORing into IV puts the lost bits
 * back (presumably 0x01 as the carry into the upper qword and 0x87 as the
 * reduction into the lower one -- confirm against the mask definition).
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
27168c2ecf20Sopenharmony_ci
/*
 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS-encrypt 64 bytes (four blocks) per iteration.  The loop body runs
 * before the length is tested, so at least one full 64-byte chunk is
 * always processed; the caller is presumably expected to pass a non-zero
 * multiple of 64 -- confirm against the glue code.
 *
 * For each block the tweak in IV is XORed into the input and also parked
 * in the matching 16-byte slot of dst.  The input block is read before its
 * slot is overwritten.  After _aesni_enc4 the tweaks are read back from
 * dst, XORed into the cipher output and replaced by the final result.  IV
 * is advanced with _aesni_gf128mul_x_ble() after every block, and the
 * tweak following the last block is written back to *iv.
 *
 * INC is used purely as a scratch register here.  This code is only
 * assembled for x86_64, so the mask is addressed RIP-relative.
 */
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN

.Lxts_enc_loop4:
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)		# park tweak 0 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)		# park tweak 1 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)		# park tweak 2 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)		# park tweak 3 in dst

	call _aesni_enc4

	movdqu 0x00(OUTP), INC		# fetch tweak 0 back
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()

	add $64, INP
	add $64, OUTP
	sub $64, LEN
	ja .Lxts_enc_loop4		# loop while bytes remain (unsigned)

	movups IV, (IVP)

	FRAME_END
	RET
SYM_FUNC_END(aesni_xts_encrypt)
27838c2ecf20Sopenharmony_ci
/*
 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS-decrypt 64 bytes (four blocks) per iteration.  The loop body runs
 * before the length is tested, so at least one full 64-byte chunk is
 * always processed; the caller is presumably expected to pass a non-zero
 * multiple of 64 -- confirm against the glue code.
 *
 * KLEN is read from offset 480 of ctx, then KEYP is advanced by 240 before
 * any round key is used (presumably to the decryption key schedule inside
 * struct crypto_aes_ctx -- confirm against the struct definition).
 *
 * For each block the tweak in IV is XORed into the input and also parked
 * in the matching 16-byte slot of dst.  The input block is read before its
 * slot is overwritten.  After _aesni_dec4 the tweaks are read back from
 * dst, XORed into the cipher output and replaced by the final result.  IV
 * is advanced with _aesni_gf128mul_x_ble() after every block, and the
 * tweak following the last block is written back to *iv.
 *
 * INC is used purely as a scratch register here.  This code is only
 * assembled for x86_64, so the mask is addressed RIP-relative.
 */
SYM_FUNC_START(aesni_xts_decrypt)
	FRAME_BEGIN

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN
	add $240, KEYP

.Lxts_dec_loop4:
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)		# park tweak 0 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)		# park tweak 1 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)		# park tweak 2 in dst

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)		# park tweak 3 in dst

	call _aesni_dec4

	movdqu 0x00(OUTP), INC		# fetch tweak 0 back
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()

	add $64, INP
	add $64, OUTP
	sub $64, LEN
	ja .Lxts_dec_loop4		# loop while bytes remain (unsigned)

	movups IV, (IVP)

	FRAME_END
	RET
SYM_FUNC_END(aesni_xts_decrypt)
28518c2ecf20Sopenharmony_ci
28528c2ecf20Sopenharmony_ci#endif
2853