162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Implement AES algorithm in Intel AES-NI instructions.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * The white paper of AES-NI instructions can be downloaded from:
662306a36Sopenharmony_ci *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * Copyright (C) 2008, Intel Corp.
962306a36Sopenharmony_ci *    Author: Huang Ying <ying.huang@intel.com>
1062306a36Sopenharmony_ci *            Vinodh Gopal <vinodh.gopal@intel.com>
1162306a36Sopenharmony_ci *            Kahraman Akdemir
1262306a36Sopenharmony_ci *
1362306a36Sopenharmony_ci * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
1462306a36Sopenharmony_ci * interface for 64-bit kernels.
1562306a36Sopenharmony_ci *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
1662306a36Sopenharmony_ci *             Aidan O'Mahony (aidan.o.mahony@intel.com)
1762306a36Sopenharmony_ci *             Adrian Hoban <adrian.hoban@intel.com>
1862306a36Sopenharmony_ci *             James Guilford (james.guilford@intel.com)
1962306a36Sopenharmony_ci *             Gabriele Paoloni <gabriele.paoloni@intel.com>
2062306a36Sopenharmony_ci *             Tadeusz Struk (tadeusz.struk@intel.com)
2162306a36Sopenharmony_ci *             Wajdi Feghali (wajdi.k.feghali@intel.com)
2262306a36Sopenharmony_ci *    Copyright (c) 2010, Intel Corporation.
2362306a36Sopenharmony_ci *
2462306a36Sopenharmony_ci * Ported x86_64 version to x86:
2562306a36Sopenharmony_ci *    Author: Mathias Krause <minipli@googlemail.com>
2662306a36Sopenharmony_ci */
2762306a36Sopenharmony_ci
2862306a36Sopenharmony_ci#include <linux/linkage.h>
2962306a36Sopenharmony_ci#include <asm/frame.h>
3062306a36Sopenharmony_ci#include <asm/nospec-branch.h>
3162306a36Sopenharmony_ci
3262306a36Sopenharmony_ci/*
3362306a36Sopenharmony_ci * The following macros are used to move an (un)aligned 16 byte value to/from
3462306a36Sopenharmony_ci * an XMM register.  This can be done for either FP or integer values, for FP use
3562306a36Sopenharmony_ci * movaps (move aligned packed single) or integer use movdqa (move double quad
3662306a36Sopenharmony_ci * aligned).  It doesn't make a performance difference which instruction is used
3762306a36Sopenharmony_ci * since Nehalem (original Core i7) was released.  However, the movaps is a byte
3862306a36Sopenharmony_ci * shorter, so that is the one we'll use for now. (same for unaligned).
3962306a36Sopenharmony_ci */
4062306a36Sopenharmony_ci#define MOVADQ	movaps
4162306a36Sopenharmony_ci#define MOVUDQ	movups
4262306a36Sopenharmony_ci
4362306a36Sopenharmony_ci#ifdef __x86_64__
4462306a36Sopenharmony_ci
4562306a36Sopenharmony_ci# constants in mergeable sections, linker can reorder and merge
# (one 16-byte constant per section, so nothing below may depend on layout)
# POLY and TWOONE are consumed by PRECOMPUTE when reducing HashKey<<1.
4662306a36Sopenharmony_ci.section	.rodata.cst16.POLY, "aM", @progbits, 16
4762306a36Sopenharmony_ci.align 16
4862306a36Sopenharmony_ciPOLY:   .octa 0xC2000000000000000000000000000001
4962306a36Sopenharmony_ci.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
5062306a36Sopenharmony_ci.align 16
5162306a36Sopenharmony_ciTWOONE: .octa 0x00000001000000000000000000000001
5262306a36Sopenharmony_ci
# SHUF_MASK: pshufb control that reverses the 16 bytes of a register.
5362306a36Sopenharmony_ci.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
5462306a36Sopenharmony_ci.align 16
5562306a36Sopenharmony_ciSHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
5662306a36Sopenharmony_ci.section	.rodata.cst16.MASK1, "aM", @progbits, 16
5762306a36Sopenharmony_ci.align 16
5862306a36Sopenharmony_ciMASK1:      .octa 0x0000000000000000ffffffffffffffff
5962306a36Sopenharmony_ci.section	.rodata.cst16.MASK2, "aM", @progbits, 16
6062306a36Sopenharmony_ci.align 16
6162306a36Sopenharmony_ciMASK2:      .octa 0xffffffffffffffff0000000000000000
# ONE: added to the counter block with paddd in GCM_ENC_DEC.
6262306a36Sopenharmony_ci.section	.rodata.cst16.ONE, "aM", @progbits, 16
6362306a36Sopenharmony_ci.align 16
6462306a36Sopenharmony_ciONE:        .octa 0x00000000000000000000000000000001
6562306a36Sopenharmony_ci.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
6662306a36Sopenharmony_ci.align 16
6762306a36Sopenharmony_ciF_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
6862306a36Sopenharmony_ci.section	.rodata.cst16.dec, "aM", @progbits, 16
6962306a36Sopenharmony_ci.align 16
7062306a36Sopenharmony_cidec:        .octa 0x1
7162306a36Sopenharmony_ci.section	.rodata.cst16.enc, "aM", @progbits, 16
7262306a36Sopenharmony_ci.align 16
7362306a36Sopenharmony_cienc:        .octa 0x2
7462306a36Sopenharmony_ci
7562306a36Sopenharmony_ci# order of these constants should not change.
7662306a36Sopenharmony_ci# more specifically, ALL_F should follow SHIFT_MASK,
7762306a36Sopenharmony_ci# and zero should follow ALL_F
# Reason: GCM_ENC_DEC performs unaligned 16-byte loads from SHIFT_MASK+16-n
# and from ALL_F+16-n (n = byte count of the trailing partial block), so
# each of those loads straddles two neighbouring constants.  This is also
# why these three values live in plain .rodata and not in mergeable sections.
7862306a36Sopenharmony_ci.section	.rodata, "a", @progbits
7962306a36Sopenharmony_ci.align 16
8062306a36Sopenharmony_ciSHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
8162306a36Sopenharmony_ciALL_F:      .octa 0xffffffffffffffffffffffffffffffff
8262306a36Sopenharmony_ci            .octa 0x00000000000000000000000000000000
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci.text
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci
// Bytes pushed by FUNC_SAVE (three 8-byte registers); added to every
// stack-argument displacement below.
8762306a36Sopenharmony_ci#define	STACK_OFFSET    8*3
8862306a36Sopenharmony_ci
// Byte offsets of the fields of the GCM context that the macros below
// address through %arg2.
8962306a36Sopenharmony_ci#define AadHash 16*0
9062306a36Sopenharmony_ci#define AadLen 16*1
9162306a36Sopenharmony_ci#define InLen (16*1)+8
9262306a36Sopenharmony_ci#define PBlockEncKey 16*2
9362306a36Sopenharmony_ci#define OrigIV 16*3
9462306a36Sopenharmony_ci#define CurCount 16*4
9562306a36Sopenharmony_ci#define PBlockLen 16*5
9662306a36Sopenharmony_ci#define	HashKey		16*6	// store HashKey <<1 mod poly here
9762306a36Sopenharmony_ci#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
9862306a36Sopenharmony_ci#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
9962306a36Sopenharmony_ci#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
10062306a36Sopenharmony_ci#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
10162306a36Sopenharmony_ci				// bits of  HashKey <<1 mod poly here
10262306a36Sopenharmony_ci				//(for Karatsuba purposes)
10362306a36Sopenharmony_ci#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
10462306a36Sopenharmony_ci				// bits of  HashKey^2 <<1 mod poly here
10562306a36Sopenharmony_ci				// (for Karatsuba purposes)
10662306a36Sopenharmony_ci#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
10762306a36Sopenharmony_ci				// bits of  HashKey^3 <<1 mod poly here
10862306a36Sopenharmony_ci				// (for Karatsuba purposes)
10962306a36Sopenharmony_ci#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
11062306a36Sopenharmony_ci				// bits of  HashKey^4 <<1 mod poly here
11162306a36Sopenharmony_ci				// (for Karatsuba purposes)
11262306a36Sopenharmony_ci
// arg1-arg6 are bare register names (used as %arg1 ...).  arg7-arg11 are
// complete memory operands relative to %rsp and are only valid after
// FUNC_SAVE has run.
// NOTE(review): the extra +8 presumably skips the return address - confirm.
11362306a36Sopenharmony_ci#define arg1 rdi
11462306a36Sopenharmony_ci#define arg2 rsi
11562306a36Sopenharmony_ci#define arg3 rdx
11662306a36Sopenharmony_ci#define arg4 rcx
11762306a36Sopenharmony_ci#define arg5 r8
11862306a36Sopenharmony_ci#define arg6 r9
11962306a36Sopenharmony_ci#define arg7 STACK_OFFSET+8(%rsp)
12062306a36Sopenharmony_ci#define arg8 STACK_OFFSET+16(%rsp)
12162306a36Sopenharmony_ci#define arg9 STACK_OFFSET+24(%rsp)
12262306a36Sopenharmony_ci#define arg10 STACK_OFFSET+32(%rsp)
12362306a36Sopenharmony_ci#define arg11 STACK_OFFSET+40(%rsp)
// NOTE(review): presumably the key-length field of the AES key context
// passed in arg1 (byte offset 480) - confirm against the C structure.
12462306a36Sopenharmony_ci#define keysize 2*15*16(%arg1)
12562306a36Sopenharmony_ci#endif
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci#define STATE1	%xmm0
12962306a36Sopenharmony_ci#define STATE2	%xmm4
13062306a36Sopenharmony_ci#define STATE3	%xmm5
13162306a36Sopenharmony_ci#define STATE4	%xmm6
13262306a36Sopenharmony_ci#define STATE	STATE1
13362306a36Sopenharmony_ci#define IN1	%xmm1
13462306a36Sopenharmony_ci#define IN2	%xmm7
13562306a36Sopenharmony_ci#define IN3	%xmm8
13662306a36Sopenharmony_ci#define IN4	%xmm9
13762306a36Sopenharmony_ci#define IN	IN1
13862306a36Sopenharmony_ci#define KEY	%xmm2
13962306a36Sopenharmony_ci#define IV	%xmm3
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_ci#define BSWAP_MASK %xmm10
14262306a36Sopenharmony_ci#define CTR	%xmm11
14362306a36Sopenharmony_ci#define INC	%xmm12
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_ci#define GF128MUL_MASK %xmm7
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci#ifdef __x86_64__
14862306a36Sopenharmony_ci#define AREG	%rax
14962306a36Sopenharmony_ci#define KEYP	%rdi
15062306a36Sopenharmony_ci#define OUTP	%rsi
15162306a36Sopenharmony_ci#define UKEYP	OUTP
15262306a36Sopenharmony_ci#define INP	%rdx
15362306a36Sopenharmony_ci#define LEN	%rcx
15462306a36Sopenharmony_ci#define IVP	%r8
15562306a36Sopenharmony_ci#define KLEN	%r9d
15662306a36Sopenharmony_ci#define T1	%r10
15762306a36Sopenharmony_ci#define TKEYP	T1
15862306a36Sopenharmony_ci#define T2	%r11
15962306a36Sopenharmony_ci#define TCTR_LOW T2
16062306a36Sopenharmony_ci#else
16162306a36Sopenharmony_ci#define AREG	%eax
16262306a36Sopenharmony_ci#define KEYP	%edi
16362306a36Sopenharmony_ci#define OUTP	AREG
16462306a36Sopenharmony_ci#define UKEYP	OUTP
16562306a36Sopenharmony_ci#define INP	%edx
16662306a36Sopenharmony_ci#define LEN	%esi
16762306a36Sopenharmony_ci#define IVP	%ebp
16862306a36Sopenharmony_ci#define KLEN	%ebx
16962306a36Sopenharmony_ci#define T1	%ecx
17062306a36Sopenharmony_ci#define TKEYP	T1
17162306a36Sopenharmony_ci#endif
17262306a36Sopenharmony_ci
# FUNC_SAVE: pushes r12, r13 and r14 (24 bytes, which is what STACK_OFFSET
# accounts for).  Must be paired with FUNC_RESTORE.
17362306a36Sopenharmony_ci.macro FUNC_SAVE
17462306a36Sopenharmony_ci	push	%r12
17562306a36Sopenharmony_ci	push	%r13
17662306a36Sopenharmony_ci	push	%r14
17762306a36Sopenharmony_ci#
17862306a36Sopenharmony_ci# states of %xmm registers %xmm6:%xmm15 not saved
17962306a36Sopenharmony_ci# all %xmm registers are clobbered
18062306a36Sopenharmony_ci#
18162306a36Sopenharmony_ci.endm
18262306a36Sopenharmony_ci
18362306a36Sopenharmony_ci
# FUNC_RESTORE: pops r14, r13 and r12, the reverse of the push order used
# by FUNC_SAVE.
18462306a36Sopenharmony_ci.macro FUNC_RESTORE
18562306a36Sopenharmony_ci	pop	%r14
18662306a36Sopenharmony_ci	pop	%r13
18762306a36Sopenharmony_ci	pop	%r12
18862306a36Sopenharmony_ci.endm
18962306a36Sopenharmony_ci
19062306a36Sopenharmony_ci# Precompute hashkeys.
19162306a36Sopenharmony_ci# Input: Hash subkey.
19262306a36Sopenharmony_ci# Output: HashKeys stored in gcm_context_data.  Only needs to be called
19362306a36Sopenharmony_ci# once per key.
19462306a36Sopenharmony_ci# clobbers r12, and tmp xmm registers.
# SUBKEY is an operand holding the address of the 16-byte hash subkey.
# TMP1-TMP7 are scratch XMM registers.  The context is addressed via %arg2.
# Stores HashKey..HashKey_4 and the matching HashKey_k..HashKey_4_k values.
19562306a36Sopenharmony_ci.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
19662306a36Sopenharmony_ci	mov	\SUBKEY, %r12
19762306a36Sopenharmony_ci	movdqu	(%r12), \TMP3
19862306a36Sopenharmony_ci	movdqa	SHUF_MASK(%rip), \TMP2
19962306a36Sopenharmony_ci	pshufb	\TMP2, \TMP3
20062306a36Sopenharmony_ci
20162306a36Sopenharmony_ci	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
20262306a36Sopenharmony_ci
20362306a36Sopenharmony_ci	movdqa	\TMP3, \TMP2
20462306a36Sopenharmony_ci	psllq	$1, \TMP3		# shift both qwords left by one
20562306a36Sopenharmony_ci	psrlq	$63, \TMP2		# bit shifted out of each qword
20662306a36Sopenharmony_ci	movdqa	\TMP2, \TMP1
20762306a36Sopenharmony_ci	pslldq	$8, \TMP2		# low-qword carry moves into high qword
20862306a36Sopenharmony_ci	psrldq	$8, \TMP1		# carry out of bit 127, kept for reduction
20962306a36Sopenharmony_ci	por	\TMP2, \TMP3		# TMP3 = HashKey<<1 as a 128-bit shift
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_ci	# reduce HashKey<<1
	# The carry dword is copied to dwords 0 and 3 and compared with TWOONE,
	# giving an all-ones mask when a bit was shifted out.  TMP2 then
	# becomes POLY in that case and zero otherwise.
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci	pshufd	$0x24, \TMP1, \TMP2
21462306a36Sopenharmony_ci	pcmpeqd TWOONE(%rip), \TMP2
21562306a36Sopenharmony_ci	pand	POLY(%rip), \TMP2
21662306a36Sopenharmony_ci	pxor	\TMP2, \TMP3
21762306a36Sopenharmony_ci	movdqu	\TMP3, HashKey(%arg2)
21862306a36Sopenharmony_ci
	# HashKey_k = high qword XOR low qword (pshufd $78 swaps the qwords)
21962306a36Sopenharmony_ci	movdqa	   \TMP3, \TMP5
22062306a36Sopenharmony_ci	pshufd	   $78, \TMP3, \TMP1
22162306a36Sopenharmony_ci	pxor	   \TMP3, \TMP1
22262306a36Sopenharmony_ci	movdqu	   \TMP1, HashKey_k(%arg2)
22362306a36Sopenharmony_ci
22462306a36Sopenharmony_ci	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
22562306a36Sopenharmony_ci# TMP5 = HashKey^2<<1 (mod poly)
22662306a36Sopenharmony_ci	movdqu	   \TMP5, HashKey_2(%arg2)
22762306a36Sopenharmony_ci# HashKey_2 = HashKey^2<<1 (mod poly)
22862306a36Sopenharmony_ci	pshufd	   $78, \TMP5, \TMP1
22962306a36Sopenharmony_ci	pxor	   \TMP5, \TMP1
23062306a36Sopenharmony_ci	movdqu	   \TMP1, HashKey_2_k(%arg2)
23162306a36Sopenharmony_ci
23262306a36Sopenharmony_ci	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
23362306a36Sopenharmony_ci# TMP5 = HashKey^3<<1 (mod poly)
23462306a36Sopenharmony_ci	movdqu	   \TMP5, HashKey_3(%arg2)
23562306a36Sopenharmony_ci	pshufd	   $78, \TMP5, \TMP1
23662306a36Sopenharmony_ci	pxor	   \TMP5, \TMP1
23762306a36Sopenharmony_ci	movdqu	   \TMP1, HashKey_3_k(%arg2)
23862306a36Sopenharmony_ci
23962306a36Sopenharmony_ci	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
24062306a36Sopenharmony_ci# TMP5 = HashKey^4<<1 (mod poly)
24162306a36Sopenharmony_ci	movdqu	   \TMP5, HashKey_4(%arg2)
24262306a36Sopenharmony_ci	pshufd	   $78, \TMP5, \TMP1
24362306a36Sopenharmony_ci	pxor	   \TMP5, \TMP1
24462306a36Sopenharmony_ci	movdqu	   \TMP1, HashKey_4_k(%arg2)
24562306a36Sopenharmony_ci.endm
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
24862306a36Sopenharmony_ci# Clobbers rax, r10-r13 and xmm0-xmm7, %xmm13, %xmm14
# (xmm7 is scratch for PRECOMPUTE, xmm14 is written by CALC_AAD_HASH)
# Iv, SUBKEY, AAD: operands holding the addresses of the IV, the hash subkey
# and the additional authenticated data.  AADLEN: operand holding the AAD
# length in bytes.  The context is addressed through %arg2.
24962306a36Sopenharmony_ci.macro GCM_INIT Iv SUBKEY AAD AADLEN
25062306a36Sopenharmony_ci	mov \AADLEN, %r11
25162306a36Sopenharmony_ci	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
25262306a36Sopenharmony_ci	xor %r11d, %r11d
25362306a36Sopenharmony_ci	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
25462306a36Sopenharmony_ci	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
25562306a36Sopenharmony_ci	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
25662306a36Sopenharmony_ci	mov \Iv, %rax
25762306a36Sopenharmony_ci	movdqu (%rax), %xmm0
25862306a36Sopenharmony_ci	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	movdqa  SHUF_MASK(%rip), %xmm2
26162306a36Sopenharmony_ci	pshufb %xmm2, %xmm0
26262306a36Sopenharmony_ci	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-reflected iv
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
26562306a36Sopenharmony_ci	movdqu HashKey(%arg2), %xmm13
26662306a36Sopenharmony_ci
	# hashes the AAD and stores the result in AadHash(%arg2)
26762306a36Sopenharmony_ci	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
26862306a36Sopenharmony_ci	%xmm4, %xmm5, %xmm6
26962306a36Sopenharmony_ci.endm
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
27262306a36Sopenharmony_ci# struct has been initialized by GCM_INIT.
27362306a36Sopenharmony_ci# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
27462306a36Sopenharmony_ci# Clobbers rax, r10-r13, and xmm0-xmm15
# operation: enc or dec.
# Register roles as used below:
#   %arg2 = context, %arg3 = output pointer, %arg4 = input pointer,
#   %arg5 = byte count, %r11 = running offset into input and output,
#   %r13 = bytes left in whole blocks (later: length of the trailing block),
#   %xmm8 = running hash, %xmm13 = HashKey.
27562306a36Sopenharmony_ci.macro GCM_ENC_DEC operation
27662306a36Sopenharmony_ci	movdqu AadHash(%arg2), %xmm8
27762306a36Sopenharmony_ci	movdqu HashKey(%arg2), %xmm13
27862306a36Sopenharmony_ci	add %arg5, InLen(%arg2)
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	xor %r11d, %r11d # initialise the data pointer offset as zero
28162306a36Sopenharmony_ci	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	sub %r11, %arg5		# sub partial block data used
28462306a36Sopenharmony_ci	mov %arg5, %r13		# save the number of bytes
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
28762306a36Sopenharmony_ci	mov %r13, %r12
28862306a36Sopenharmony_ci	# Encrypt/Decrypt first few blocks
	# %r12 = 16 * (whole-block count mod 4); those 0-3 blocks are handled
	# first so that the remaining whole-block length is a multiple of 64.
28962306a36Sopenharmony_ci
29062306a36Sopenharmony_ci	and	$(3<<4), %r12
29162306a36Sopenharmony_ci	jz	.L_initial_num_blocks_is_0_\@
29262306a36Sopenharmony_ci	cmp	$(2<<4), %r12
29362306a36Sopenharmony_ci	jb	.L_initial_num_blocks_is_1_\@
29462306a36Sopenharmony_ci	je	.L_initial_num_blocks_is_2_\@
29562306a36Sopenharmony_ci.L_initial_num_blocks_is_3_\@:
29662306a36Sopenharmony_ci	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
29762306a36Sopenharmony_ci%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
29862306a36Sopenharmony_ci	sub	$48, %r13
29962306a36Sopenharmony_ci	jmp	.L_initial_blocks_\@
30062306a36Sopenharmony_ci.L_initial_num_blocks_is_2_\@:
30162306a36Sopenharmony_ci	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
30262306a36Sopenharmony_ci%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
30362306a36Sopenharmony_ci	sub	$32, %r13
30462306a36Sopenharmony_ci	jmp	.L_initial_blocks_\@
30562306a36Sopenharmony_ci.L_initial_num_blocks_is_1_\@:
30662306a36Sopenharmony_ci	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
30762306a36Sopenharmony_ci%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
30862306a36Sopenharmony_ci	sub	$16, %r13
30962306a36Sopenharmony_ci	jmp	.L_initial_blocks_\@
31062306a36Sopenharmony_ci.L_initial_num_blocks_is_0_\@:
31162306a36Sopenharmony_ci	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
31262306a36Sopenharmony_ci%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
31362306a36Sopenharmony_ci.L_initial_blocks_\@:
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	# Main loop - Encrypt/Decrypt remaining blocks
	# %r13 is a multiple of 64 here.  It is pre-decremented so that the
	# loop exits with four blocks still to be folded in by GHASH_LAST_4.
31662306a36Sopenharmony_ci
31762306a36Sopenharmony_ci	test	%r13, %r13
31862306a36Sopenharmony_ci	je	.L_zero_cipher_left_\@
31962306a36Sopenharmony_ci	sub	$64, %r13
32062306a36Sopenharmony_ci	je	.L_four_cipher_left_\@
32162306a36Sopenharmony_ci.L_crypt_by_4_\@:
32262306a36Sopenharmony_ci	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
32362306a36Sopenharmony_ci	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
32462306a36Sopenharmony_ci	%xmm7, %xmm8, enc
32562306a36Sopenharmony_ci	add	$64, %r11
32662306a36Sopenharmony_ci	sub	$64, %r13
32762306a36Sopenharmony_ci	jne	.L_crypt_by_4_\@
32862306a36Sopenharmony_ci.L_four_cipher_left_\@:
32962306a36Sopenharmony_ci	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
33062306a36Sopenharmony_ci%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
33162306a36Sopenharmony_ci.L_zero_cipher_left_\@:
33262306a36Sopenharmony_ci	movdqu %xmm8, AadHash(%arg2)
33362306a36Sopenharmony_ci	movdqu %xmm0, CurCount(%arg2)
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci	mov	%arg5, %r13
33662306a36Sopenharmony_ci	and	$15, %r13			# %r13 = arg5 (mod 16)
33762306a36Sopenharmony_ci	je	.L_multiple_of_16_bytes_\@
33862306a36Sopenharmony_ci
33962306a36Sopenharmony_ci	mov %r13, PBlockLen(%arg2)
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	# Handle the last <16 Byte block separately
34262306a36Sopenharmony_ci	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
34362306a36Sopenharmony_ci	movdqu %xmm0, CurCount(%arg2)
34462306a36Sopenharmony_ci	movdqa SHUF_MASK(%rip), %xmm10
34562306a36Sopenharmony_ci	pshufb %xmm10, %xmm0
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
34862306a36Sopenharmony_ci	movdqu %xmm0, PBlockEncKey(%arg2)
34962306a36Sopenharmony_ci
	# With at least 16 bytes of data, the tail is fetched with one 16-byte
	# load that ends exactly at the end of the input and is then shifted
	# into place.  Shorter inputs are read byte-wise instead.
35062306a36Sopenharmony_ci	cmp	$16, %arg5
35162306a36Sopenharmony_ci	jge	.L_large_enough_update_\@
35262306a36Sopenharmony_ci
35362306a36Sopenharmony_ci	lea (%arg4,%r11,1), %r10
35462306a36Sopenharmony_ci	mov %r13, %r12
35562306a36Sopenharmony_ci	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
35662306a36Sopenharmony_ci	jmp	.L_data_read_\@
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci.L_large_enough_update_\@:
35962306a36Sopenharmony_ci	sub	$16, %r11
36062306a36Sopenharmony_ci	add	%r13, %r11
36162306a36Sopenharmony_ci
36262306a36Sopenharmony_ci	# receive the last <16 Byte block
36362306a36Sopenharmony_ci	movdqu	(%arg4, %r11, 1), %xmm1
36462306a36Sopenharmony_ci
	# put the offset back to the start of the partial block
36562306a36Sopenharmony_ci	sub	%r13, %r11
36662306a36Sopenharmony_ci	add	$16, %r11
36762306a36Sopenharmony_ci
36862306a36Sopenharmony_ci	lea	SHIFT_MASK+16(%rip), %r12
36962306a36Sopenharmony_ci	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
37062306a36Sopenharmony_ci	# (r13 is the number of bytes in plaintext mod 16)
37162306a36Sopenharmony_ci	sub	%r13, %r12
37262306a36Sopenharmony_ci	# get the appropriate shuffle mask
37362306a36Sopenharmony_ci	movdqu	(%r12), %xmm2
37462306a36Sopenharmony_ci	# shift right 16-r13 bytes
37562306a36Sopenharmony_ci	pshufb  %xmm2, %xmm1
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci.L_data_read_\@:
	# %r12 = ALL_F+16-r13: the 16 bytes there are r13 bytes of 0xff
	# followed by zero bytes, i.e. a mask that keeps the low r13 bytes.
37862306a36Sopenharmony_ci	lea ALL_F+16(%rip), %r12
37962306a36Sopenharmony_ci	sub %r13, %r12
38062306a36Sopenharmony_ci
38162306a36Sopenharmony_ci.ifc \operation, dec
	# keep a copy of the input (ciphertext) for the hash
38262306a36Sopenharmony_ci	movdqa  %xmm1, %xmm2
38362306a36Sopenharmony_ci.endif
38462306a36Sopenharmony_ci	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
38562306a36Sopenharmony_ci	movdqu	(%r12), %xmm1
38662306a36Sopenharmony_ci	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
38762306a36Sopenharmony_ci	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
38862306a36Sopenharmony_ci.ifc \operation, dec
38962306a36Sopenharmony_ci	pand    %xmm1, %xmm2
39062306a36Sopenharmony_ci	movdqa SHUF_MASK(%rip), %xmm10
39162306a36Sopenharmony_ci	pshufb %xmm10 ,%xmm2
39262306a36Sopenharmony_ci
39362306a36Sopenharmony_ci	pxor %xmm2, %xmm8
39462306a36Sopenharmony_ci.else
39562306a36Sopenharmony_ci	movdqa SHUF_MASK(%rip), %xmm10
39662306a36Sopenharmony_ci	pshufb %xmm10,%xmm0
39762306a36Sopenharmony_ci
39862306a36Sopenharmony_ci	pxor	%xmm0, %xmm8
39962306a36Sopenharmony_ci.endif
40062306a36Sopenharmony_ci
	# The partial block is only XORed into the hash here.  GCM_COMPLETE
	# performs the pending GHASH_MUL when PBlockLen is non-zero.
40162306a36Sopenharmony_ci	movdqu %xmm8, AadHash(%arg2)
40262306a36Sopenharmony_ci.ifc \operation, enc
40362306a36Sopenharmony_ci	# GHASH computation for the last <16 byte block
40462306a36Sopenharmony_ci	movdqa SHUF_MASK(%rip), %xmm10
40562306a36Sopenharmony_ci	# shuffle xmm0 back to output as ciphertext
40662306a36Sopenharmony_ci	pshufb %xmm10, %xmm0
40762306a36Sopenharmony_ci.endif
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	# Output %r13 bytes
	# (one 8-byte store when more than 8 remain, then byte by byte)
41062306a36Sopenharmony_ci	movq %xmm0, %rax
41162306a36Sopenharmony_ci	cmp $8, %r13
41262306a36Sopenharmony_ci	jle .L_less_than_8_bytes_left_\@
41362306a36Sopenharmony_ci	mov %rax, (%arg3 , %r11, 1)
41462306a36Sopenharmony_ci	add $8, %r11
41562306a36Sopenharmony_ci	psrldq $8, %xmm0
41662306a36Sopenharmony_ci	movq %xmm0, %rax
41762306a36Sopenharmony_ci	sub $8, %r13
41862306a36Sopenharmony_ci.L_less_than_8_bytes_left_\@:
41962306a36Sopenharmony_ci	mov %al,  (%arg3, %r11, 1)
42062306a36Sopenharmony_ci	add $1, %r11
42162306a36Sopenharmony_ci	shr $8, %rax
42262306a36Sopenharmony_ci	sub $1, %r13
42362306a36Sopenharmony_ci	jne .L_less_than_8_bytes_left_\@
42462306a36Sopenharmony_ci.L_multiple_of_16_bytes_\@:
42562306a36Sopenharmony_ci.endm
42662306a36Sopenharmony_ci
42762306a36Sopenharmony_ci# GCM_COMPLETE Finishes update of tag of last partial block
42862306a36Sopenharmony_ci# Output: Authentication Tag (AUTH_TAG)
42962306a36Sopenharmony_ci# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# AUTHTAG: operand holding the address the tag is written to.
# AUTHTAGLEN: operand holding the number of tag bytes to write.
# The context is addressed through %arg2.
43062306a36Sopenharmony_ci.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
43162306a36Sopenharmony_ci	movdqu AadHash(%arg2), %xmm8
43262306a36Sopenharmony_ci	movdqu HashKey(%arg2), %xmm13
43362306a36Sopenharmony_ci
43462306a36Sopenharmony_ci	mov PBlockLen(%arg2), %r12
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci	test %r12, %r12
43762306a36Sopenharmony_ci	je .L_partial_done\@
43862306a36Sopenharmony_ci
	# a partial block was XORed into the hash but not yet multiplied
43962306a36Sopenharmony_ci	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci.L_partial_done\@:
44262306a36Sopenharmony_ci	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
44362306a36Sopenharmony_ci	shl	$3, %r12		  # convert into number of bits
	# NOTE(review): only the low 32 bits of the AAD bit length are moved
	# into %xmm15 - presumably the AAD length is bounded by callers; confirm.
44462306a36Sopenharmony_ci	movd	%r12d, %xmm15		  # len(A) in %xmm15
44562306a36Sopenharmony_ci	mov InLen(%arg2), %r12
44662306a36Sopenharmony_ci	shl     $3, %r12                  # len(C) in bits
44762306a36Sopenharmony_ci	movq    %r12, %xmm1
44862306a36Sopenharmony_ci
44962306a36Sopenharmony_ci	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
45062306a36Sopenharmony_ci	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
45162306a36Sopenharmony_ci	pxor	%xmm15, %xmm8
45262306a36Sopenharmony_ci	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
45362306a36Sopenharmony_ci	# final GHASH computation
45462306a36Sopenharmony_ci	movdqa SHUF_MASK(%rip), %xmm10
45562306a36Sopenharmony_ci	pshufb %xmm10, %xmm8
45662306a36Sopenharmony_ci
45762306a36Sopenharmony_ci	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
45862306a36Sopenharmony_ci	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
45962306a36Sopenharmony_ci	pxor	%xmm8, %xmm0
46062306a36Sopenharmony_ci.L_return_T_\@:
	# Write the low auth_tag_len bytes of %xmm0: a single 16-byte store
	# for length 16, otherwise 8-, 4-, 2- and 1-byte pieces as needed.
46162306a36Sopenharmony_ci	mov	\AUTHTAG, %r10                     # %r10 = authTag
46262306a36Sopenharmony_ci	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
46362306a36Sopenharmony_ci	cmp	$16, %r11
46462306a36Sopenharmony_ci	je	.L_T_16_\@
46562306a36Sopenharmony_ci	cmp	$8, %r11
46662306a36Sopenharmony_ci	jl	.L_T_4_\@
46762306a36Sopenharmony_ci.L_T_8_\@:
46862306a36Sopenharmony_ci	movq	%xmm0, %rax
46962306a36Sopenharmony_ci	mov	%rax, (%r10)
47062306a36Sopenharmony_ci	add	$8, %r10
47162306a36Sopenharmony_ci	sub	$8, %r11
47262306a36Sopenharmony_ci	psrldq	$8, %xmm0
47362306a36Sopenharmony_ci	test	%r11, %r11
47462306a36Sopenharmony_ci	je	.L_return_T_done_\@
47562306a36Sopenharmony_ci.L_T_4_\@:
	# NOTE(review): this path always stores 4 bytes, so it presumably
	# relies on callers never asking for fewer than 4 here; confirm.
47662306a36Sopenharmony_ci	movd	%xmm0, %eax
47762306a36Sopenharmony_ci	mov	%eax, (%r10)
47862306a36Sopenharmony_ci	add	$4, %r10
47962306a36Sopenharmony_ci	sub	$4, %r11
48062306a36Sopenharmony_ci	psrldq	$4, %xmm0
48162306a36Sopenharmony_ci	test	%r11, %r11
48262306a36Sopenharmony_ci	je	.L_return_T_done_\@
48362306a36Sopenharmony_ci.L_T_123_\@:
48462306a36Sopenharmony_ci	movd	%xmm0, %eax
48562306a36Sopenharmony_ci	cmp	$2, %r11
48662306a36Sopenharmony_ci	jl	.L_T_1_\@
48762306a36Sopenharmony_ci	mov	%ax, (%r10)
48862306a36Sopenharmony_ci	cmp	$2, %r11
48962306a36Sopenharmony_ci	je	.L_return_T_done_\@
49062306a36Sopenharmony_ci	add	$2, %r10
49162306a36Sopenharmony_ci	sar	$16, %eax
49262306a36Sopenharmony_ci.L_T_1_\@:
49362306a36Sopenharmony_ci	mov	%al, (%r10)
49462306a36Sopenharmony_ci	jmp	.L_return_T_done_\@
49562306a36Sopenharmony_ci.L_T_16_\@:
49662306a36Sopenharmony_ci	movdqu	%xmm0, (%r10)
49762306a36Sopenharmony_ci.L_return_T_done_\@:
49862306a36Sopenharmony_ci.endm
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci#ifdef __x86_64__
50162306a36Sopenharmony_ci/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
50262306a36Sopenharmony_ci*
50362306a36Sopenharmony_ci*
50462306a36Sopenharmony_ci* Input: A and B (128-bits each, bit-reflected)
50562306a36Sopenharmony_ci* Output: C = A*B*x mod poly, (i.e. >>1 )
50662306a36Sopenharmony_ci* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
50762306a36Sopenharmony_ci* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
50862306a36Sopenharmony_ci*
* Operands: GH is read and then overwritten with the result; HK is only
* read; TMP1-TMP5 are scratch XMM registers and are all overwritten.
* The 256-bit product is built from three carry-less multiplies
* (Karatsuba) and then reduced in two phases.
50962306a36Sopenharmony_ci*/
51062306a36Sopenharmony_ci.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
51162306a36Sopenharmony_ci	movdqa	  \GH, \TMP1
51262306a36Sopenharmony_ci	pshufd	  $78, \GH, \TMP2
51362306a36Sopenharmony_ci	pshufd	  $78, \HK, \TMP3
51462306a36Sopenharmony_ci	pxor	  \GH, \TMP2            # TMP2 = a1+a0
51562306a36Sopenharmony_ci	pxor	  \HK, \TMP3            # TMP3 = b1+b0
51662306a36Sopenharmony_ci	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
51762306a36Sopenharmony_ci	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
51862306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
51962306a36Sopenharmony_ci	pxor	  \GH, \TMP2
52062306a36Sopenharmony_ci	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
52162306a36Sopenharmony_ci	movdqa	  \TMP2, \TMP3
52262306a36Sopenharmony_ci	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
52362306a36Sopenharmony_ci	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
52462306a36Sopenharmony_ci	pxor	  \TMP3, \GH
52562306a36Sopenharmony_ci	pxor	  \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci        # first phase of the reduction
52862306a36Sopenharmony_ci
52962306a36Sopenharmony_ci	movdqa    \GH, \TMP2
53062306a36Sopenharmony_ci	movdqa    \GH, \TMP3
53162306a36Sopenharmony_ci	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
53262306a36Sopenharmony_ci					# in order to perform
53362306a36Sopenharmony_ci					# independent shifts
53462306a36Sopenharmony_ci	pslld     $31, \TMP2            # packed left shift <<31
53562306a36Sopenharmony_ci	pslld     $30, \TMP3            # packed left shift <<30
53662306a36Sopenharmony_ci	pslld     $25, \TMP4            # packed left shift <<25
53762306a36Sopenharmony_ci	pxor      \TMP3, \TMP2          # xor the shifted versions
53862306a36Sopenharmony_ci	pxor      \TMP4, \TMP2
53962306a36Sopenharmony_ci	movdqa    \TMP2, \TMP5
54062306a36Sopenharmony_ci	psrldq    $4, \TMP5             # right shift TMP5 1 DW
54162306a36Sopenharmony_ci	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
54262306a36Sopenharmony_ci	pxor      \TMP2, \GH
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci        # second phase of the reduction
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
54762306a36Sopenharmony_ci					# in order to perform
54862306a36Sopenharmony_ci					# independent shifts
54962306a36Sopenharmony_ci	movdqa    \GH,\TMP3
55062306a36Sopenharmony_ci	movdqa    \GH,\TMP4
55162306a36Sopenharmony_ci	psrld     $1,\TMP2              # packed right shift >>1
55262306a36Sopenharmony_ci	psrld     $2,\TMP3              # packed right shift >>2
55362306a36Sopenharmony_ci	psrld     $7,\TMP4              # packed right shift >>7
55462306a36Sopenharmony_ci	pxor      \TMP3,\TMP2		# xor the shifted versions
55562306a36Sopenharmony_ci	pxor      \TMP4,\TMP2
55662306a36Sopenharmony_ci	pxor      \TMP5, \TMP2
55762306a36Sopenharmony_ci	pxor      \TMP2, \GH
55862306a36Sopenharmony_ci	pxor      \TMP1, \GH            # result is in GH
55962306a36Sopenharmony_ci.endm
56062306a36Sopenharmony_ci
56162306a36Sopenharmony_ci# Reads DLEN bytes starting at DPTR and stores in XMMDst
56262306a36Sopenharmony_ci# where 0 < DLEN < 16
56362306a36Sopenharmony_ci# Clobbers %rax, DLEN and XMM1
# DPTR and DLEN are general purpose registers; DLEN is counted down to zero.
# Only the bytes DPTR[0 .. DLEN-1] are read: the first 8 with one load when
# DLEN >= 8, every other byte individually from the highest address down.
# Bytes of XMMDst above DLEN end up zero.
56462306a36Sopenharmony_ci.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
56562306a36Sopenharmony_ci        cmp $8, \DLEN
56662306a36Sopenharmony_ci        jl .L_read_lt8_\@
56762306a36Sopenharmony_ci        mov (\DPTR), %rax
56862306a36Sopenharmony_ci        movq %rax, \XMMDst
56962306a36Sopenharmony_ci        sub $8, \DLEN
57062306a36Sopenharmony_ci        jz .L_done_read_partial_block_\@
57162306a36Sopenharmony_ci	xor %eax, %eax
57262306a36Sopenharmony_ci.L_read_next_byte_\@:
	# DLEN = bytes still to fetch above offset 8; byte 7+DLEN is next
57362306a36Sopenharmony_ci        shl $8, %rax
57462306a36Sopenharmony_ci        mov 7(\DPTR, \DLEN, 1), %al
57562306a36Sopenharmony_ci        dec \DLEN
57662306a36Sopenharmony_ci        jnz .L_read_next_byte_\@
57762306a36Sopenharmony_ci        movq %rax, \XMM1
57862306a36Sopenharmony_ci	pslldq $8, \XMM1		# move the tail bytes into the high qword
57962306a36Sopenharmony_ci        por \XMM1, \XMMDst
58062306a36Sopenharmony_ci	jmp .L_done_read_partial_block_\@
58162306a36Sopenharmony_ci.L_read_lt8_\@:
58262306a36Sopenharmony_ci	xor %eax, %eax
58362306a36Sopenharmony_ci.L_read_next_byte_lt8_\@:
	# fewer than 8 bytes: gather them in %rax, last byte first
58462306a36Sopenharmony_ci        shl $8, %rax
58562306a36Sopenharmony_ci        mov -1(\DPTR, \DLEN, 1), %al
58662306a36Sopenharmony_ci        dec \DLEN
58762306a36Sopenharmony_ci        jnz .L_read_next_byte_lt8_\@
58862306a36Sopenharmony_ci        movq %rax, \XMMDst
58962306a36Sopenharmony_ci.L_done_read_partial_block_\@:
59062306a36Sopenharmony_ci.endm
59162306a36Sopenharmony_ci
59262306a36Sopenharmony_ci# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
59362306a36Sopenharmony_ci# clobbers r10-11, xmm14
# Also clobbers %rax (through READ_PARTIAL_BLOCK) and TMP1-TMP7.
# HASHKEY: XMM register holding HashKey<<1 mod poly (only read).
# AAD / AADLEN: operands holding the AAD address and its length in bytes.
# The result is stored to AadHash(%arg2).  TMP6 is the running hash.
59462306a36Sopenharmony_ci.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
59562306a36Sopenharmony_ci	TMP6 TMP7
59662306a36Sopenharmony_ci	MOVADQ	   SHUF_MASK(%rip), %xmm14
59762306a36Sopenharmony_ci	mov	   \AAD, %r10		# %r10 = AAD
59862306a36Sopenharmony_ci	mov	   \AADLEN, %r11		# %r11 = aadLen
59962306a36Sopenharmony_ci	pxor	   \TMP7, \TMP7
60062306a36Sopenharmony_ci	pxor	   \TMP6, \TMP6
60162306a36Sopenharmony_ci
60262306a36Sopenharmony_ci	cmp	   $16, %r11
60362306a36Sopenharmony_ci	jl	   .L_get_AAD_rest\@
60462306a36Sopenharmony_ci.L_get_AAD_blocks\@:
	# whole 16-byte blocks: hash = (hash ^ block) * HashKey
60562306a36Sopenharmony_ci	movdqu	   (%r10), \TMP7
60662306a36Sopenharmony_ci	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
60762306a36Sopenharmony_ci	pxor	   \TMP7, \TMP6
60862306a36Sopenharmony_ci	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
60962306a36Sopenharmony_ci	add	   $16, %r10
61062306a36Sopenharmony_ci	sub	   $16, %r11
61162306a36Sopenharmony_ci	cmp	   $16, %r11
61262306a36Sopenharmony_ci	jge	   .L_get_AAD_blocks\@
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci	movdqu	   \TMP6, \TMP7
61562306a36Sopenharmony_ci
61662306a36Sopenharmony_ci	/* read the last <16B of AAD */
61762306a36Sopenharmony_ci.L_get_AAD_rest\@:
61862306a36Sopenharmony_ci	test	   %r11, %r11
61962306a36Sopenharmony_ci	je	   .L_get_AAD_done\@
62062306a36Sopenharmony_ci
	# 1-15 trailing bytes: read without touching memory past the AAD,
	# then fold them in like a full block (missing bytes read as zero)
62162306a36Sopenharmony_ci	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
62262306a36Sopenharmony_ci	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
62362306a36Sopenharmony_ci	pxor	   \TMP6, \TMP7
62462306a36Sopenharmony_ci	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
62562306a36Sopenharmony_ci	movdqu \TMP7, \TMP6
62662306a36Sopenharmony_ci
62762306a36Sopenharmony_ci.L_get_AAD_done\@:
62862306a36Sopenharmony_ci	movdqu \TMP6, AadHash(%arg2)
62962306a36Sopenharmony_ci.endm
63062306a36Sopenharmony_ci
63162306a36Sopenharmony_ci# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
63262306a36Sopenharmony_ci# between update calls. Does nothing when PBlockLen(%arg2) is zero.
63362306a36Sopenharmony_ci# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
63462306a36Sopenharmony_ci# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context; advances DATA_OFFSET by the number of bytes written and updates AAD_HASH
63562306a36Sopenharmony_ci# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
63662306a36Sopenharmony_ci.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
63762306a36Sopenharmony_ci	AAD_HASH operation
63862306a36Sopenharmony_ci	mov 	PBlockLen(%arg2), %r13
63962306a36Sopenharmony_ci	test	%r13, %r13
64062306a36Sopenharmony_ci	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
64162306a36Sopenharmony_ci	# Read in input data without over reading
64262306a36Sopenharmony_ci	cmp	$16, \PLAIN_CYPH_LEN
64362306a36Sopenharmony_ci	jl	.L_fewer_than_16_bytes_\@
64462306a36Sopenharmony_ci	movups	(\PLAIN_CYPH_IN), %xmm1	# At least 16 bytes available: just fill xmm1 (NOTE(review): DATA_OFFSET is not applied here, unlike the short path - confirm it is always 0 at this point)
64562306a36Sopenharmony_ci	jmp	.L_data_read_\@
64662306a36Sopenharmony_ci
64762306a36Sopenharmony_ci.L_fewer_than_16_bytes_\@:
64862306a36Sopenharmony_ci	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
64962306a36Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r12
65062306a36Sopenharmony_ci	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci	mov PBlockLen(%arg2), %r13
65362306a36Sopenharmony_ci
65462306a36Sopenharmony_ci.L_data_read_\@:				# Finished reading in data
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci	movdqu	PBlockEncKey(%arg2), %xmm9
65762306a36Sopenharmony_ci	movdqu	HashKey(%arg2), %xmm13
65862306a36Sopenharmony_ci
65962306a36Sopenharmony_ci	lea	SHIFT_MASK(%rip), %r12
66062306a36Sopenharmony_ci
66162306a36Sopenharmony_ci	# adjust the shuffle mask pointer to be able to shift r13 bytes
66262306a36Sopenharmony_ci	# (16-r13 is the number of bytes in plaintext mod 16)
66362306a36Sopenharmony_ci	add	%r13, %r12
66462306a36Sopenharmony_ci	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
66562306a36Sopenharmony_ci	pshufb	%xmm2, %xmm9		# shift right r13 bytes
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_ci.ifc \operation, dec
66862306a36Sopenharmony_ci	movdqa	%xmm1, %xmm3		# keep a copy of the input ciphertext, it is what gets hashed
66962306a36Sopenharmony_ci	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r10
67262306a36Sopenharmony_ci	add	%r13, %r10
67362306a36Sopenharmony_ci	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
67462306a36Sopenharmony_ci	sub	$16, %r10
67562306a36Sopenharmony_ci	# Determine if the partial block is not being filled and
67662306a36Sopenharmony_ci	# shift mask accordingly
67762306a36Sopenharmony_ci	jge	.L_no_extra_mask_1_\@
67862306a36Sopenharmony_ci	sub	%r10, %r12		# r10 is negative here, so this moves the mask pointer forward by the shortfall
67962306a36Sopenharmony_ci.L_no_extra_mask_1_\@:
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ci	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
68262306a36Sopenharmony_ci	# get the appropriate mask to mask out bottom r13 bytes of xmm9
68362306a36Sopenharmony_ci	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
68462306a36Sopenharmony_ci
68562306a36Sopenharmony_ci	pand	%xmm1, %xmm3
68662306a36Sopenharmony_ci	movdqa	SHUF_MASK(%rip), %xmm10
68762306a36Sopenharmony_ci	pshufb	%xmm10, %xmm3
68862306a36Sopenharmony_ci	pshufb	%xmm2, %xmm3
68962306a36Sopenharmony_ci	pxor	%xmm3, \AAD_HASH
69062306a36Sopenharmony_ci
69162306a36Sopenharmony_ci	test	%r10, %r10
69262306a36Sopenharmony_ci	jl	.L_partial_incomplete_1_\@
69362306a36Sopenharmony_ci
69462306a36Sopenharmony_ci	# GHASH computation for the last <16 Byte block
69562306a36Sopenharmony_ci	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
69662306a36Sopenharmony_ci	xor	%eax, %eax
69762306a36Sopenharmony_ci
69862306a36Sopenharmony_ci	mov	%rax, PBlockLen(%arg2)	# block is now complete: no partial bytes pending
69962306a36Sopenharmony_ci	jmp	.L_dec_done_\@
70062306a36Sopenharmony_ci.L_partial_incomplete_1_\@:
70162306a36Sopenharmony_ci	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still incomplete: remember how many bytes are buffered now
70262306a36Sopenharmony_ci.L_dec_done_\@:
70362306a36Sopenharmony_ci	movdqu	\AAD_HASH, AadHash(%arg2)
70462306a36Sopenharmony_ci.else
70562306a36Sopenharmony_ci	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
70662306a36Sopenharmony_ci
70762306a36Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r10
70862306a36Sopenharmony_ci	add	%r13, %r10
70962306a36Sopenharmony_ci	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
71062306a36Sopenharmony_ci	sub	$16, %r10
71162306a36Sopenharmony_ci	# Determine if the partial block is not being filled and
71262306a36Sopenharmony_ci	# shift mask accordingly
71362306a36Sopenharmony_ci	jge	.L_no_extra_mask_2_\@
71462306a36Sopenharmony_ci	sub	%r10, %r12		# r10 is negative here, so this moves the mask pointer forward by the shortfall
71562306a36Sopenharmony_ci.L_no_extra_mask_2_\@:
71662306a36Sopenharmony_ci
71762306a36Sopenharmony_ci	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
71862306a36Sopenharmony_ci	# get the appropriate mask to mask out bottom r13 bytes of xmm9
71962306a36Sopenharmony_ci	pand	%xmm1, %xmm9
72062306a36Sopenharmony_ci
72162306a36Sopenharmony_ci	movdqa	SHUF_MASK(%rip), %xmm1
72262306a36Sopenharmony_ci	pshufb	%xmm1, %xmm9
72362306a36Sopenharmony_ci	pshufb	%xmm2, %xmm9
72462306a36Sopenharmony_ci	pxor	%xmm9, \AAD_HASH
72562306a36Sopenharmony_ci
72662306a36Sopenharmony_ci	test	%r10, %r10
72762306a36Sopenharmony_ci	jl	.L_partial_incomplete_2_\@
72862306a36Sopenharmony_ci
72962306a36Sopenharmony_ci	# GHASH computation for the last <16 Byte block
73062306a36Sopenharmony_ci	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
73162306a36Sopenharmony_ci	xor	%eax, %eax
73262306a36Sopenharmony_ci
73362306a36Sopenharmony_ci	mov	%rax, PBlockLen(%arg2)	# block is now complete: no partial bytes pending
73462306a36Sopenharmony_ci	jmp	.L_encode_done_\@
73562306a36Sopenharmony_ci.L_partial_incomplete_2_\@:
73662306a36Sopenharmony_ci	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)	# still incomplete: remember how many bytes are buffered now
73762306a36Sopenharmony_ci.L_encode_done_\@:
73862306a36Sopenharmony_ci	movdqu	\AAD_HASH, AadHash(%arg2)
73962306a36Sopenharmony_ci
74062306a36Sopenharmony_ci	movdqa	SHUF_MASK(%rip), %xmm10
74162306a36Sopenharmony_ci	# shuffle xmm9 back to output as ciphertext
74262306a36Sopenharmony_ci	pshufb	%xmm10, %xmm9
74362306a36Sopenharmony_ci	pshufb	%xmm2, %xmm9
74462306a36Sopenharmony_ci.endif
74562306a36Sopenharmony_ci	# output encrypted Bytes
74662306a36Sopenharmony_ci	test	%r10, %r10
74762306a36Sopenharmony_ci	jl	.L_partial_fill_\@
74862306a36Sopenharmony_ci	mov	%r13, %r12
74962306a36Sopenharmony_ci	mov	$16, %r13
75062306a36Sopenharmony_ci	# Set r13 to be the number of bytes to write out
75162306a36Sopenharmony_ci	sub	%r12, %r13
75262306a36Sopenharmony_ci	jmp	.L_count_set_\@
75362306a36Sopenharmony_ci.L_partial_fill_\@:
75462306a36Sopenharmony_ci	mov	\PLAIN_CYPH_LEN, %r13	# block not filled: every input byte is written out
75562306a36Sopenharmony_ci.L_count_set_\@:
75662306a36Sopenharmony_ci	movdqa	%xmm9, %xmm0
75762306a36Sopenharmony_ci	movq	%xmm0, %rax
75862306a36Sopenharmony_ci	cmp	$8, %r13
75962306a36Sopenharmony_ci	jle	.L_less_than_8_bytes_left_\@	# 8 or fewer bytes: emit them one at a time
76062306a36Sopenharmony_ci
76162306a36Sopenharmony_ci	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
76262306a36Sopenharmony_ci	add	$8, \DATA_OFFSET
76362306a36Sopenharmony_ci	psrldq	$8, %xmm0
76462306a36Sopenharmony_ci	movq	%xmm0, %rax
76562306a36Sopenharmony_ci	sub	$8, %r13
76662306a36Sopenharmony_ci.L_less_than_8_bytes_left_\@:
76762306a36Sopenharmony_ci	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
76862306a36Sopenharmony_ci	add	$1, \DATA_OFFSET
76962306a36Sopenharmony_ci	shr	$8, %rax
77062306a36Sopenharmony_ci	sub	$1, %r13
77162306a36Sopenharmony_ci	jne	.L_less_than_8_bytes_left_\@
77262306a36Sopenharmony_ci.L_partial_block_done_\@:
77362306a36Sopenharmony_ci.endm # PARTIAL_BLOCK
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci/*
77662306a36Sopenharmony_ci* if a = number of total plaintext bytes
77762306a36Sopenharmony_ci* b = floor(a/16)
77862306a36Sopenharmony_ci* num_initial_blocks = b mod 4
77962306a36Sopenharmony_ci* encrypt the initial num_initial_blocks blocks and apply ghash on
78062306a36Sopenharmony_ci* the ciphertext; then, if %r13 >= 64, encrypt/decrypt 4 more blocks
78162306a36Sopenharmony_ci* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
78262306a36Sopenharmony_ci* are clobbered; %xmm14 is overwritten with SHUF_MASK and %r11 (the data offset) is advanced past every block processed
78362306a36Sopenharmony_ci* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
78462306a36Sopenharmony_ci* i is the index of the xmm register that receives AadHash(%arg2); i_seq lists the xmm indices used for the
78562306a36Sopenharmony_ci* initial blocks (presumably i = 8 - num_initial_blocks and i_seq ends at 8 - verify against the callers)
78662306a36Sopenharmony_ci*/
78562306a36Sopenharmony_ci
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
78862306a36Sopenharmony_ci	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
78962306a36Sopenharmony_ci	MOVADQ		SHUF_MASK(%rip), %xmm14
79062306a36Sopenharmony_ci
79162306a36Sopenharmony_ci	movdqu AadHash(%arg2), %xmm\i		    # xmm<i> = current hash value (AadHash)
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_ci	# start AES for num_initial_blocks blocks
79462306a36Sopenharmony_ci
79562306a36Sopenharmony_ci	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
79662306a36Sopenharmony_ci
79762306a36Sopenharmony_ci.if (\i == 5) || (\i == 6) || (\i == 7)
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_ci	MOVADQ		ONE(%RIP),\TMP1
80062306a36Sopenharmony_ci	MOVADQ		0(%arg1),\TMP2
80162306a36Sopenharmony_ci.irpc index, \i_seq
80262306a36Sopenharmony_ci	paddd		\TMP1, \XMM0                 # INCR Y0
80362306a36Sopenharmony_ci.ifc \operation, dec
80462306a36Sopenharmony_ci        movdqa     \XMM0, %xmm\index
80562306a36Sopenharmony_ci.else
80662306a36Sopenharmony_ci	MOVADQ		\XMM0, %xmm\index
80762306a36Sopenharmony_ci.endif
80862306a36Sopenharmony_ci	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
80962306a36Sopenharmony_ci	pxor		\TMP2, %xmm\index	# round 0: xor with the first round key at 0(%arg1)
81062306a36Sopenharmony_ci.endr
81162306a36Sopenharmony_ci	lea	0x10(%arg1),%r10
81262306a36Sopenharmony_ci	mov	keysize,%eax
81362306a36Sopenharmony_ci	shr	$2,%eax				# 128->4, 192->6, 256->8
81462306a36Sopenharmony_ci	add	$5,%eax			      # 128->9, 192->11, 256->13
81562306a36Sopenharmony_ci
81662306a36Sopenharmony_ci.Laes_loop_initial_\@:
81762306a36Sopenharmony_ci	MOVADQ	(%r10),\TMP1
81862306a36Sopenharmony_ci.irpc	index, \i_seq
81962306a36Sopenharmony_ci	aesenc	\TMP1, %xmm\index
82062306a36Sopenharmony_ci.endr
82162306a36Sopenharmony_ci	add	$16,%r10
82262306a36Sopenharmony_ci	sub	$1,%eax
82362306a36Sopenharmony_ci	jnz	.Laes_loop_initial_\@
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_ci	MOVADQ	(%r10), \TMP1
82662306a36Sopenharmony_ci.irpc index, \i_seq
82762306a36Sopenharmony_ci	aesenclast \TMP1, %xmm\index         # Last Round
82862306a36Sopenharmony_ci.endr
82962306a36Sopenharmony_ci.irpc index, \i_seq
83062306a36Sopenharmony_ci	movdqu	   (%arg4 , %r11, 1), \TMP1
83162306a36Sopenharmony_ci	pxor	   \TMP1, %xmm\index
83262306a36Sopenharmony_ci	movdqu	   %xmm\index, (%arg3 , %r11, 1)
83362306a36Sopenharmony_ci	# write back plaintext/ciphertext for num_initial_blocks
83462306a36Sopenharmony_ci	add	   $16, %r11
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci.ifc \operation, dec
83762306a36Sopenharmony_ci	movdqa     \TMP1, %xmm\index	# when decrypting, the input block (still in TMP1) is what gets hashed
83862306a36Sopenharmony_ci.endif
83962306a36Sopenharmony_ci	pshufb	   %xmm14, %xmm\index
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci		# prepare plaintext/ciphertext for GHASH computation
84262306a36Sopenharmony_ci.endr
84362306a36Sopenharmony_ci.endif
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci        # apply GHASH on num_initial_blocks blocks
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci.if \i == 5
84862306a36Sopenharmony_ci        pxor       %xmm5, %xmm6
84962306a36Sopenharmony_ci	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
85062306a36Sopenharmony_ci        pxor       %xmm6, %xmm7
85162306a36Sopenharmony_ci	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
85262306a36Sopenharmony_ci        pxor       %xmm7, %xmm8
85362306a36Sopenharmony_ci	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
85462306a36Sopenharmony_ci.elseif \i == 6
85562306a36Sopenharmony_ci        pxor       %xmm6, %xmm7
85662306a36Sopenharmony_ci	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
85762306a36Sopenharmony_ci        pxor       %xmm7, %xmm8
85862306a36Sopenharmony_ci	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
85962306a36Sopenharmony_ci.elseif \i == 7
86062306a36Sopenharmony_ci        pxor       %xmm7, %xmm8
86162306a36Sopenharmony_ci	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
86262306a36Sopenharmony_ci.endif
86362306a36Sopenharmony_ci	cmp	   $64, %r13
86462306a36Sopenharmony_ci	jl	.L_initial_blocks_done\@
86562306a36Sopenharmony_ci	# fewer than 64 bytes in r13: the 4-block pass below is skipped
86662306a36Sopenharmony_ci/*
86762306a36Sopenharmony_ci* Encrypt/decrypt the next 4 blocks, using the next 4 counter values (kept in XMM1-XMM4).
86862306a36Sopenharmony_ci* NOTE(review): the comment that used to sit here described HashKey precomputations (HashKey_i_k holding
86962306a36Sopenharmony_ci* the XOR of the low and high parts of HashKey_i), but no such computation is visible in this macro - confirm.
87062306a36Sopenharmony_ci*/
87162306a36Sopenharmony_ci	MOVADQ	   ONE(%RIP),\TMP1
87262306a36Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
87362306a36Sopenharmony_ci	MOVADQ	   \XMM0, \XMM1
87462306a36Sopenharmony_ci	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
87762306a36Sopenharmony_ci	MOVADQ	   \XMM0, \XMM2
87862306a36Sopenharmony_ci	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
88162306a36Sopenharmony_ci	MOVADQ	   \XMM0, \XMM3
88262306a36Sopenharmony_ci	pshufb %xmm14, \XMM3        # perform a 16 byte swap
88362306a36Sopenharmony_ci
88462306a36Sopenharmony_ci	paddd	   \TMP1, \XMM0              # INCR Y0
88562306a36Sopenharmony_ci	MOVADQ	   \XMM0, \XMM4
88662306a36Sopenharmony_ci	pshufb %xmm14, \XMM4        # perform a 16 byte swap
88762306a36Sopenharmony_ci
88862306a36Sopenharmony_ci	MOVADQ	   0(%arg1),\TMP1
88962306a36Sopenharmony_ci	pxor	   \TMP1, \XMM1
89062306a36Sopenharmony_ci	pxor	   \TMP1, \XMM2
89162306a36Sopenharmony_ci	pxor	   \TMP1, \XMM3
89262306a36Sopenharmony_ci	pxor	   \TMP1, \XMM4
89362306a36Sopenharmony_ci.irpc index, 1234 # do 4 rounds
89462306a36Sopenharmony_ci	movaps 0x10*\index(%arg1), \TMP1
89562306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM1
89662306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM2
89762306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM3
89862306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM4
89962306a36Sopenharmony_ci.endr
90062306a36Sopenharmony_ci.irpc index, 56789 # do next 5 rounds
90162306a36Sopenharmony_ci	movaps 0x10*\index(%arg1), \TMP1
90262306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM1
90362306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM2
90462306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM3
90562306a36Sopenharmony_ci	aesenc	   \TMP1, \XMM4
90662306a36Sopenharmony_ci.endr
90762306a36Sopenharmony_ci	lea	   0xa0(%arg1),%r10
90862306a36Sopenharmony_ci	mov	   keysize,%eax
90962306a36Sopenharmony_ci	shr	   $2,%eax			# 128->4, 192->6, 256->8
91062306a36Sopenharmony_ci	sub	   $4,%eax			# 128->0, 192->2, 256->4
91162306a36Sopenharmony_ci	jz	   .Laes_loop_pre_done\@
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_ci.Laes_loop_pre_\@:
91462306a36Sopenharmony_ci	MOVADQ	   (%r10),\TMP2
91562306a36Sopenharmony_ci.irpc	index, 1234
91662306a36Sopenharmony_ci	aesenc	   \TMP2, %xmm\index	# NOTE(review): hard-codes xmm1-xmm4, so XMM1-XMM4 are assumed to be exactly those registers - confirm
91762306a36Sopenharmony_ci.endr
91862306a36Sopenharmony_ci	add	   $16,%r10
91962306a36Sopenharmony_ci	sub	   $1,%eax
92062306a36Sopenharmony_ci	jnz	   .Laes_loop_pre_\@
92162306a36Sopenharmony_ci
92262306a36Sopenharmony_ci.Laes_loop_pre_done\@:
92362306a36Sopenharmony_ci	MOVADQ	   (%r10), \TMP2
92462306a36Sopenharmony_ci	aesenclast \TMP2, \XMM1
92562306a36Sopenharmony_ci	aesenclast \TMP2, \XMM2
92662306a36Sopenharmony_ci	aesenclast \TMP2, \XMM3
92762306a36Sopenharmony_ci	aesenclast \TMP2, \XMM4
92862306a36Sopenharmony_ci	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
92962306a36Sopenharmony_ci	pxor	   \TMP1, \XMM1
93062306a36Sopenharmony_ci.ifc \operation, dec
93162306a36Sopenharmony_ci	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
93262306a36Sopenharmony_ci	movdqa     \TMP1, \XMM1		# keep the input ciphertext in XMM1 for hashing
93362306a36Sopenharmony_ci.endif
93462306a36Sopenharmony_ci	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
93562306a36Sopenharmony_ci	pxor	   \TMP1, \XMM2
93662306a36Sopenharmony_ci.ifc \operation, dec
93762306a36Sopenharmony_ci	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
93862306a36Sopenharmony_ci	movdqa     \TMP1, \XMM2
93962306a36Sopenharmony_ci.endif
94062306a36Sopenharmony_ci	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
94162306a36Sopenharmony_ci	pxor	   \TMP1, \XMM3
94262306a36Sopenharmony_ci.ifc \operation, dec
94362306a36Sopenharmony_ci	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
94462306a36Sopenharmony_ci	movdqa     \TMP1, \XMM3
94562306a36Sopenharmony_ci.endif
94662306a36Sopenharmony_ci	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
94762306a36Sopenharmony_ci	pxor	   \TMP1, \XMM4
94862306a36Sopenharmony_ci.ifc \operation, dec
94962306a36Sopenharmony_ci	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
95062306a36Sopenharmony_ci	movdqa     \TMP1, \XMM4
95162306a36Sopenharmony_ci.else
95262306a36Sopenharmony_ci	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
95362306a36Sopenharmony_ci	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
95462306a36Sopenharmony_ci	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
95562306a36Sopenharmony_ci	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
95662306a36Sopenharmony_ci.endif
95762306a36Sopenharmony_ci
95862306a36Sopenharmony_ci	add	   $64, %r11
95962306a36Sopenharmony_ci	pshufb %xmm14, \XMM1 # perform a 16 byte swap
96062306a36Sopenharmony_ci	pxor	   \XMMDst, \XMM1
96162306a36Sopenharmony_ci# combine GHASHed value with the corresponding ciphertext
96262306a36Sopenharmony_ci	pshufb %xmm14, \XMM2 # perform a 16 byte swap
96362306a36Sopenharmony_ci	pshufb %xmm14, \XMM3 # perform a 16 byte swap
96462306a36Sopenharmony_ci	pshufb %xmm14, \XMM4 # perform a 16 byte swap
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci.L_initial_blocks_done\@:
96762306a36Sopenharmony_ci
96862306a36Sopenharmony_ci.endm
96962306a36Sopenharmony_ci
97062306a36Sopenharmony_ci/*
97162306a36Sopenharmony_ci* encrypt 4 blocks at a time
97262306a36Sopenharmony_ci* ghash the 4 previously encrypted ciphertext blocks (passed in XMM1-XMM4, byte-reflected)
97362306a36Sopenharmony_ci* %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
97462306a36Sopenharmony_ci* %r11 is the data offset value (read only here); %r10, %rax, %xmm15, TMP1-TMP6 and XMM5-XMM8 are clobbered
97562306a36Sopenharmony_ci* on exit XMM1-XMM4 hold the 4 new byte-reflected ciphertext blocks, with the updated hash already xored into XMM1
97662306a36Sopenharmony_ci* the operation argument is not referenced anywhere in this macro body
97762306a36Sopenharmony_ci*/
97662306a36Sopenharmony_ci.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
97762306a36Sopenharmony_ciTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci	movdqa	  \XMM1, \XMM5
98062306a36Sopenharmony_ci	movdqa	  \XMM2, \XMM6
98162306a36Sopenharmony_ci	movdqa	  \XMM3, \XMM7
98262306a36Sopenharmony_ci	movdqa	  \XMM4, \XMM8
98362306a36Sopenharmony_ci
98462306a36Sopenharmony_ci        movdqa    SHUF_MASK(%rip), %xmm15
98562306a36Sopenharmony_ci        # multiply XMM5 * HashKey_4 using karatsuba
98662306a36Sopenharmony_ci
98762306a36Sopenharmony_ci	movdqa	  \XMM5, \TMP4
98862306a36Sopenharmony_ci	pshufd	  $78, \XMM5, \TMP6
98962306a36Sopenharmony_ci	pxor	  \XMM5, \TMP6
99062306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
99162306a36Sopenharmony_ci	movdqu	  HashKey_4(%arg2), \TMP5
99262306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
99362306a36Sopenharmony_ci	movdqa    \XMM0, \XMM1
99462306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
99562306a36Sopenharmony_ci	movdqa    \XMM0, \XMM2
99662306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
99762306a36Sopenharmony_ci	movdqa    \XMM0, \XMM3
99862306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
99962306a36Sopenharmony_ci	movdqa    \XMM0, \XMM4
100062306a36Sopenharmony_ci	pshufb %xmm15, \XMM1	# perform a 16 byte swap
100162306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
100262306a36Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
100362306a36Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
100462306a36Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	pxor	  (%arg1), \XMM1
100762306a36Sopenharmony_ci	pxor	  (%arg1), \XMM2
100862306a36Sopenharmony_ci	pxor	  (%arg1), \XMM3
100962306a36Sopenharmony_ci	pxor	  (%arg1), \XMM4
101062306a36Sopenharmony_ci	movdqu	  HashKey_4_k(%arg2), \TMP5
101162306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
101262306a36Sopenharmony_ci	movaps 0x10(%arg1), \TMP1
101362306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 1
101462306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM2
101562306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM3
101662306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM4
101762306a36Sopenharmony_ci	movaps 0x20(%arg1), \TMP1
101862306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 2
101962306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM2
102062306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM3
102162306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM4
102262306a36Sopenharmony_ci	movdqa	  \XMM6, \TMP1		    # next karatsuba multiply: XMM6 * HashKey_3
102362306a36Sopenharmony_ci	pshufd	  $78, \XMM6, \TMP2
102462306a36Sopenharmony_ci	pxor	  \XMM6, \TMP2
102562306a36Sopenharmony_ci	movdqu	  HashKey_3(%arg2), \TMP5
102662306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
102762306a36Sopenharmony_ci	movaps 0x30(%arg1), \TMP3
102862306a36Sopenharmony_ci	aesenc    \TMP3, \XMM1              # Round 3
102962306a36Sopenharmony_ci	aesenc    \TMP3, \XMM2
103062306a36Sopenharmony_ci	aesenc    \TMP3, \XMM3
103162306a36Sopenharmony_ci	aesenc    \TMP3, \XMM4
103262306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
103362306a36Sopenharmony_ci	movaps 0x40(%arg1), \TMP3
103462306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 4
103562306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
103662306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
103762306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
103862306a36Sopenharmony_ci	movdqu	  HashKey_3_k(%arg2), \TMP5
103962306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
104062306a36Sopenharmony_ci	movaps 0x50(%arg1), \TMP3
104162306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 5
104262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
104362306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
104462306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
104562306a36Sopenharmony_ci	pxor	  \TMP1, \TMP4
104662306a36Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
104762306a36Sopenharmony_ci	pxor	  \XMM6, \XMM5
104862306a36Sopenharmony_ci	pxor	  \TMP2, \TMP6
104962306a36Sopenharmony_ci	movdqa	  \XMM7, \TMP1
105062306a36Sopenharmony_ci	pshufd	  $78, \XMM7, \TMP2
105162306a36Sopenharmony_ci	pxor	  \XMM7, \TMP2
105262306a36Sopenharmony_ci	movdqu	  HashKey_2(%arg2), \TMP5
105362306a36Sopenharmony_ci
105462306a36Sopenharmony_ci        # Multiply XMM7 * HashKey_2 using karatsuba
105562306a36Sopenharmony_ci
105662306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
105762306a36Sopenharmony_ci	movaps 0x60(%arg1), \TMP3
105862306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 6
105962306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
106062306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
106162306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
106262306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
106362306a36Sopenharmony_ci	movaps 0x70(%arg1), \TMP3
106462306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 7
106562306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
106662306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
106762306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
106862306a36Sopenharmony_ci	movdqu	  HashKey_2_k(%arg2), \TMP5
106962306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
107062306a36Sopenharmony_ci	movaps 0x80(%arg1), \TMP3
107162306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 8
107262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
107362306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
107462306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
107562306a36Sopenharmony_ci	pxor	  \TMP1, \TMP4
107662306a36Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
107762306a36Sopenharmony_ci	pxor	  \XMM7, \XMM5
107862306a36Sopenharmony_ci	pxor	  \TMP2, \TMP6
107962306a36Sopenharmony_ci
108062306a36Sopenharmony_ci        # Multiply XMM8 * HashKey
108162306a36Sopenharmony_ci        # XMM8 and TMP5 hold the values for the two operands
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci	movdqa	  \XMM8, \TMP1
108462306a36Sopenharmony_ci	pshufd	  $78, \XMM8, \TMP2
108562306a36Sopenharmony_ci	pxor	  \XMM8, \TMP2
108662306a36Sopenharmony_ci	movdqu	  HashKey(%arg2), \TMP5
108762306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
108862306a36Sopenharmony_ci	movaps 0x90(%arg1), \TMP3
108962306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1             # Round 9
109062306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
109162306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
109262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
109362306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
109462306a36Sopenharmony_ci	lea	  0xa0(%arg1),%r10
109562306a36Sopenharmony_ci	mov	  keysize,%eax
109662306a36Sopenharmony_ci	shr	  $2,%eax			# 128->4, 192->6, 256->8
109762306a36Sopenharmony_ci	sub	  $4,%eax			# 128->0, 192->2, 256->4
109862306a36Sopenharmony_ci	jz	  .Laes_loop_par_enc_done\@
109962306a36Sopenharmony_ci
110062306a36Sopenharmony_ci.Laes_loop_par_enc\@:
110162306a36Sopenharmony_ci	MOVADQ	  (%r10),\TMP3
110262306a36Sopenharmony_ci.irpc	index, 1234
110362306a36Sopenharmony_ci	aesenc	  \TMP3, %xmm\index	# NOTE(review): hard-codes xmm1-xmm4, so XMM1-XMM4 are assumed to be exactly those registers - confirm
110462306a36Sopenharmony_ci.endr
110562306a36Sopenharmony_ci	add	  $16,%r10
110662306a36Sopenharmony_ci	sub	  $1,%eax
110762306a36Sopenharmony_ci	jnz	  .Laes_loop_par_enc\@
110862306a36Sopenharmony_ci
110962306a36Sopenharmony_ci.Laes_loop_par_enc_done\@:
111062306a36Sopenharmony_ci	MOVADQ	  (%r10), \TMP3
111162306a36Sopenharmony_ci	aesenclast \TMP3, \XMM1           # Last round (round 10 only for 128-bit keys)
111262306a36Sopenharmony_ci	aesenclast \TMP3, \XMM2
111362306a36Sopenharmony_ci	aesenclast \TMP3, \XMM3
111462306a36Sopenharmony_ci	aesenclast \TMP3, \XMM4
111562306a36Sopenharmony_ci	movdqu    HashKey_k(%arg2), \TMP5
111662306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
111762306a36Sopenharmony_ci	movdqu	  (%arg4,%r11,1), \TMP3
111862306a36Sopenharmony_ci	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
111962306a36Sopenharmony_ci	movdqu	  16(%arg4,%r11,1), \TMP3
112062306a36Sopenharmony_ci	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
112162306a36Sopenharmony_ci	movdqu	  32(%arg4,%r11,1), \TMP3
112262306a36Sopenharmony_ci	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
112362306a36Sopenharmony_ci	movdqu	  48(%arg4,%r11,1), \TMP3
112462306a36Sopenharmony_ci	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
112562306a36Sopenharmony_ci        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
112662306a36Sopenharmony_ci        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
112762306a36Sopenharmony_ci        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
112862306a36Sopenharmony_ci        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
112962306a36Sopenharmony_ci	pshufb %xmm15, \XMM1        # perform a 16 byte swap
113062306a36Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
113162306a36Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
113262306a36Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
113362306a36Sopenharmony_ci
113462306a36Sopenharmony_ci	pxor	  \TMP4, \TMP1
113562306a36Sopenharmony_ci	pxor	  \XMM8, \XMM5
113662306a36Sopenharmony_ci	pxor	  \TMP6, \TMP2
113762306a36Sopenharmony_ci	pxor	  \TMP1, \TMP2
113862306a36Sopenharmony_ci	pxor	  \XMM5, \TMP2
113962306a36Sopenharmony_ci	movdqa	  \TMP2, \TMP3
114062306a36Sopenharmony_ci	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
114162306a36Sopenharmony_ci	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
114262306a36Sopenharmony_ci	pxor	  \TMP3, \XMM5
114362306a36Sopenharmony_ci	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
114462306a36Sopenharmony_ci
114562306a36Sopenharmony_ci        # first phase of reduction
114662306a36Sopenharmony_ci
114762306a36Sopenharmony_ci	movdqa    \XMM5, \TMP2
114862306a36Sopenharmony_ci	movdqa    \XMM5, \TMP3
114962306a36Sopenharmony_ci	movdqa    \XMM5, \TMP4
115062306a36Sopenharmony_ci# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
115162306a36Sopenharmony_ci	pslld     $31, \TMP2                   # packed left shift << 31
115262306a36Sopenharmony_ci	pslld     $30, \TMP3                   # packed left shift << 30
115362306a36Sopenharmony_ci	pslld     $25, \TMP4                   # packed left shift << 25
115462306a36Sopenharmony_ci	pxor      \TMP3, \TMP2	               # xor the shifted versions
115562306a36Sopenharmony_ci	pxor      \TMP4, \TMP2
115662306a36Sopenharmony_ci	movdqa    \TMP2, \TMP5
115762306a36Sopenharmony_ci	psrldq    $4, \TMP5                    # right shift T5 1 DW
115862306a36Sopenharmony_ci	pslldq    $12, \TMP2                   # left shift T2 3 DWs
115962306a36Sopenharmony_ci	pxor      \TMP2, \XMM5
116062306a36Sopenharmony_ci
116162306a36Sopenharmony_ci        # second phase of reduction
116262306a36Sopenharmony_ci
116362306a36Sopenharmony_ci	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
116462306a36Sopenharmony_ci	movdqa    \XMM5,\TMP3
116562306a36Sopenharmony_ci	movdqa    \XMM5,\TMP4
116662306a36Sopenharmony_ci	psrld     $1, \TMP2                    # packed right shift >> 1
116762306a36Sopenharmony_ci	psrld     $2, \TMP3                    # packed right shift >> 2
116862306a36Sopenharmony_ci	psrld     $7, \TMP4                    # packed right shift >> 7
116962306a36Sopenharmony_ci	pxor      \TMP3,\TMP2		       # xor the shifted versions
117062306a36Sopenharmony_ci	pxor      \TMP4,\TMP2
117162306a36Sopenharmony_ci	pxor      \TMP5, \TMP2
117262306a36Sopenharmony_ci	pxor      \TMP2, \XMM5
117362306a36Sopenharmony_ci	pxor      \TMP1, \XMM5                 # result is in XMM5
117462306a36Sopenharmony_ci
117562306a36Sopenharmony_ci	pxor	  \XMM5, \XMM1		# fold the reduced hash into the first new ciphertext block
117662306a36Sopenharmony_ci.endm
117762306a36Sopenharmony_ci
117862306a36Sopenharmony_ci/*
117962306a36Sopenharmony_ci* decrypt 4 blocks at a time
118062306a36Sopenharmony_ci* ghash the 4 previously decrypted ciphertext blocks
118162306a36Sopenharmony_ci* %arg1, %arg3, %arg4 are used as pointers only, not modified
118262306a36Sopenharmony_ci* %r11 is the data offset value
*
* Roles of the arguments and registers, as used by the body below:
*   %arg1       - expanded AES key schedule: the key at offset 0 is XORed in
*                 first, keys at 0x10..0x90 feed rounds 1..9, the remaining
*                 keys are read sequentially starting at 0xa0
*   %arg2       - base address for the HashKey_4/3/2, HashKey and *_k values
*   %arg3/%arg4 - output / input buffers, both indexed by %r11
*   XMM0        - counter block; incremented four times (paddd ONE)
*   XMM1..XMM4  - in:  the four blocks to be hashed
*                 out: the four blocks just read from the input, byte-swapped,
*                      with the reduced hash value XORed into XMM1
*   XMM5..XMM8, TMP1..TMP6 - scratch
*   operation   - not referenced inside the macro body
*
* Also clobbers %eax, %r10, %xmm15 (byte-swap mask) and the flags.  The number
* of AES rounds is derived from `keysize`.
*
* The AES rounds and the carry-less multiplications are deliberately
* interleaved; do not reorder instructions without re-checking register
* lifetimes.
118362306a36Sopenharmony_ci*/
118462306a36Sopenharmony_ci.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
118562306a36Sopenharmony_ciTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
118662306a36Sopenharmony_ci
	/* keep the blocks to be hashed; XMM1..XMM4 are reused for the counters */
118762306a36Sopenharmony_ci	movdqa	  \XMM1, \XMM5
118862306a36Sopenharmony_ci	movdqa	  \XMM2, \XMM6
118962306a36Sopenharmony_ci	movdqa	  \XMM3, \XMM7
119062306a36Sopenharmony_ci	movdqa	  \XMM4, \XMM8
119162306a36Sopenharmony_ci
119262306a36Sopenharmony_ci        movdqa    SHUF_MASK(%rip), %xmm15
119362306a36Sopenharmony_ci        # multiply XMM5 * HashKey_4 using karatsuba
119462306a36Sopenharmony_ci
119562306a36Sopenharmony_ci	movdqa	  \XMM5, \TMP4
119662306a36Sopenharmony_ci	pshufd	  $78, \XMM5, \TMP6
119762306a36Sopenharmony_ci	pxor	  \XMM5, \TMP6
119862306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
119962306a36Sopenharmony_ci	movdqu	  HashKey_4(%arg2), \TMP5
120062306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
120162306a36Sopenharmony_ci	movdqa    \XMM0, \XMM1
120262306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
120362306a36Sopenharmony_ci	movdqa    \XMM0, \XMM2
120462306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
120562306a36Sopenharmony_ci	movdqa    \XMM0, \XMM3
120662306a36Sopenharmony_ci	paddd     ONE(%rip), \XMM0		# INCR CNT
120762306a36Sopenharmony_ci	movdqa    \XMM0, \XMM4
120862306a36Sopenharmony_ci	pshufb %xmm15, \XMM1	# perform a 16 byte swap
120962306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
121062306a36Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
121162306a36Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
121262306a36Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
121362306a36Sopenharmony_ci
	/* XOR the key at offset 0 of the schedule into each counter block */
121462306a36Sopenharmony_ci	pxor	  (%arg1), \XMM1
121562306a36Sopenharmony_ci	pxor	  (%arg1), \XMM2
121662306a36Sopenharmony_ci	pxor	  (%arg1), \XMM3
121762306a36Sopenharmony_ci	pxor	  (%arg1), \XMM4
121862306a36Sopenharmony_ci	movdqu	  HashKey_4_k(%arg2), \TMP5
121962306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
122062306a36Sopenharmony_ci	movaps 0x10(%arg1), \TMP1
122162306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 1
122262306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM2
122362306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM3
122462306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM4
122562306a36Sopenharmony_ci	movaps 0x20(%arg1), \TMP1
122662306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM1              # Round 2
122762306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM2
122862306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM3
122962306a36Sopenharmony_ci	aesenc	  \TMP1, \XMM4
	/* multiply XMM6 * HashKey_3, same Karatsuba pattern as above */
123062306a36Sopenharmony_ci	movdqa	  \XMM6, \TMP1
123162306a36Sopenharmony_ci	pshufd	  $78, \XMM6, \TMP2
123262306a36Sopenharmony_ci	pxor	  \XMM6, \TMP2
123362306a36Sopenharmony_ci	movdqu	  HashKey_3(%arg2), \TMP5
123462306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
123562306a36Sopenharmony_ci	movaps 0x30(%arg1), \TMP3
123662306a36Sopenharmony_ci	aesenc    \TMP3, \XMM1              # Round 3
123762306a36Sopenharmony_ci	aesenc    \TMP3, \XMM2
123862306a36Sopenharmony_ci	aesenc    \TMP3, \XMM3
123962306a36Sopenharmony_ci	aesenc    \TMP3, \XMM4
124062306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
124162306a36Sopenharmony_ci	movaps 0x40(%arg1), \TMP3
124262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 4
124362306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
124462306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
124562306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
124662306a36Sopenharmony_ci	movdqu	  HashKey_3_k(%arg2), \TMP5
124762306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
124862306a36Sopenharmony_ci	movaps 0x50(%arg1), \TMP3
124962306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 5
125062306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
125162306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
125262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
125362306a36Sopenharmony_ci	pxor	  \TMP1, \TMP4
125462306a36Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
125562306a36Sopenharmony_ci	pxor	  \XMM6, \XMM5
125662306a36Sopenharmony_ci	pxor	  \TMP2, \TMP6
125762306a36Sopenharmony_ci	movdqa	  \XMM7, \TMP1
125862306a36Sopenharmony_ci	pshufd	  $78, \XMM7, \TMP2
125962306a36Sopenharmony_ci	pxor	  \XMM7, \TMP2
126062306a36Sopenharmony_ci	movdqu	  HashKey_2(%arg2), \TMP5
126162306a36Sopenharmony_ci
126262306a36Sopenharmony_ci        # Multiply XMM7 * HashKey_2 using karatsuba
126362306a36Sopenharmony_ci
126462306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
126562306a36Sopenharmony_ci	movaps 0x60(%arg1), \TMP3
126662306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 6
126762306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
126862306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
126962306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
127062306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
127162306a36Sopenharmony_ci	movaps 0x70(%arg1), \TMP3
127262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 7
127362306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
127462306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
127562306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
127662306a36Sopenharmony_ci	movdqu	  HashKey_2_k(%arg2), \TMP5
127762306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
127862306a36Sopenharmony_ci	movaps 0x80(%arg1), \TMP3
127962306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1              # Round 8
128062306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
128162306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
128262306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
128362306a36Sopenharmony_ci	pxor	  \TMP1, \TMP4
128462306a36Sopenharmony_ci# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
128562306a36Sopenharmony_ci	pxor	  \XMM7, \XMM5
128662306a36Sopenharmony_ci	pxor	  \TMP2, \TMP6
128762306a36Sopenharmony_ci
128862306a36Sopenharmony_ci        # Multiply XMM8 * HashKey
128962306a36Sopenharmony_ci        # XMM8 and TMP5 hold the values for the two operands
129062306a36Sopenharmony_ci
129162306a36Sopenharmony_ci	movdqa	  \XMM8, \TMP1
129262306a36Sopenharmony_ci	pshufd	  $78, \XMM8, \TMP2
129362306a36Sopenharmony_ci	pxor	  \XMM8, \TMP2
129462306a36Sopenharmony_ci	movdqu	  HashKey(%arg2), \TMP5
129562306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
129662306a36Sopenharmony_ci	movaps 0x90(%arg1), \TMP3
129762306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM1             # Round 9
129862306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM2
129962306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM3
130062306a36Sopenharmony_ci	aesenc	  \TMP3, \XMM4
130162306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
	/*
	 * Rounds 1..9 are unrolled above.  %eax = keysize/4 - 4 is the number
	 * of additional aesenc rounds (0, 2 or 4, see below); each of them
	 * consumes the next 16-byte round key at (%r10).
	 */
130262306a36Sopenharmony_ci	lea	  0xa0(%arg1),%r10
130362306a36Sopenharmony_ci	mov	  keysize,%eax
130462306a36Sopenharmony_ci	shr	  $2,%eax		        # 128->4, 192->6, 256->8
130562306a36Sopenharmony_ci	sub	  $4,%eax			# 128->0, 192->2, 256->4
130662306a36Sopenharmony_ci	jz	  .Laes_loop_par_dec_done\@
130762306a36Sopenharmony_ci
130862306a36Sopenharmony_ci.Laes_loop_par_dec\@:
130962306a36Sopenharmony_ci	MOVADQ	  (%r10),\TMP3
	/*
	 * Iterate over the macro arguments rather than hard-coded %xmm1..%xmm4,
	 * so these rounds hit the same registers as every other round.
	 */
131062306a36Sopenharmony_ci.irp	reg, \XMM1, \XMM2, \XMM3, \XMM4
131162306a36Sopenharmony_ci	aesenc	  \TMP3, \reg
131262306a36Sopenharmony_ci.endr
131362306a36Sopenharmony_ci	add	  $16,%r10
131462306a36Sopenharmony_ci	sub	  $1,%eax
131562306a36Sopenharmony_ci	jnz	  .Laes_loop_par_dec\@
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci.Laes_loop_par_dec_done\@:
131862306a36Sopenharmony_ci	MOVADQ	  (%r10), \TMP3
131962306a36Sopenharmony_ci	aesenclast \TMP3, \XMM1           # last round
132062306a36Sopenharmony_ci	aesenclast \TMP3, \XMM2
132162306a36Sopenharmony_ci	aesenclast \TMP3, \XMM3
132262306a36Sopenharmony_ci	aesenclast \TMP3, \XMM4
132362306a36Sopenharmony_ci	movdqu    HashKey_k(%arg2), \TMP5
132462306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
	/*
	 * For each of the four blocks: load the input block, XOR it with the
	 * encrypted counter, store the result, then keep the *input* block in
	 * XMMn so that it is what gets hashed on the next invocation.
	 */
132562306a36Sopenharmony_ci	movdqu	  (%arg4,%r11,1), \TMP3
132662306a36Sopenharmony_ci	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
132762306a36Sopenharmony_ci	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
132862306a36Sopenharmony_ci	movdqa    \TMP3, \XMM1
132962306a36Sopenharmony_ci	movdqu	  16(%arg4,%r11,1), \TMP3
133062306a36Sopenharmony_ci	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
133162306a36Sopenharmony_ci	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
133262306a36Sopenharmony_ci	movdqa    \TMP3, \XMM2
133362306a36Sopenharmony_ci	movdqu	  32(%arg4,%r11,1), \TMP3
133462306a36Sopenharmony_ci	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
133562306a36Sopenharmony_ci	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
133662306a36Sopenharmony_ci	movdqa    \TMP3, \XMM3
133762306a36Sopenharmony_ci	movdqu	  48(%arg4,%r11,1), \TMP3
133862306a36Sopenharmony_ci	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
133962306a36Sopenharmony_ci	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
134062306a36Sopenharmony_ci	movdqa    \TMP3, \XMM4
134162306a36Sopenharmony_ci	pshufb %xmm15, \XMM1        # perform a 16 byte swap
134262306a36Sopenharmony_ci	pshufb %xmm15, \XMM2	# perform a 16 byte swap
134362306a36Sopenharmony_ci	pshufb %xmm15, \XMM3	# perform a 16 byte swap
134462306a36Sopenharmony_ci	pshufb %xmm15, \XMM4	# perform a 16 byte swap
134562306a36Sopenharmony_ci
	/* combine the high, low and middle Karatsuba terms of all four products */
134662306a36Sopenharmony_ci	pxor	  \TMP4, \TMP1
134762306a36Sopenharmony_ci	pxor	  \XMM8, \XMM5
134862306a36Sopenharmony_ci	pxor	  \TMP6, \TMP2
134962306a36Sopenharmony_ci	pxor	  \TMP1, \TMP2
135062306a36Sopenharmony_ci	pxor	  \XMM5, \TMP2
135162306a36Sopenharmony_ci	movdqa	  \TMP2, \TMP3
135262306a36Sopenharmony_ci	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
135362306a36Sopenharmony_ci	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
135462306a36Sopenharmony_ci	pxor	  \TMP3, \XMM5
135562306a36Sopenharmony_ci	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
135662306a36Sopenharmony_ci
135762306a36Sopenharmony_ci        # first phase of reduction
135862306a36Sopenharmony_ci
135962306a36Sopenharmony_ci	movdqa    \XMM5, \TMP2
136062306a36Sopenharmony_ci	movdqa    \XMM5, \TMP3
136162306a36Sopenharmony_ci	movdqa    \XMM5, \TMP4
136262306a36Sopenharmony_ci# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
136362306a36Sopenharmony_ci	pslld     $31, \TMP2                   # packed left shift << 31
136462306a36Sopenharmony_ci	pslld     $30, \TMP3                   # packed left shift << 30
136562306a36Sopenharmony_ci	pslld     $25, \TMP4                   # packed left shift << 25
136662306a36Sopenharmony_ci	pxor      \TMP3, \TMP2	               # xor the shifted versions
136762306a36Sopenharmony_ci	pxor      \TMP4, \TMP2
136862306a36Sopenharmony_ci	movdqa    \TMP2, \TMP5
136962306a36Sopenharmony_ci	psrldq    $4, \TMP5                    # right shift T5 1 DW
137062306a36Sopenharmony_ci	pslldq    $12, \TMP2                   # left shift T2 3 DWs
137162306a36Sopenharmony_ci	pxor      \TMP2, \XMM5
137262306a36Sopenharmony_ci
137362306a36Sopenharmony_ci        # second phase of reduction
137462306a36Sopenharmony_ci
137562306a36Sopenharmony_ci	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
137662306a36Sopenharmony_ci	movdqa    \XMM5,\TMP3
137762306a36Sopenharmony_ci	movdqa    \XMM5,\TMP4
137862306a36Sopenharmony_ci	psrld     $1, \TMP2                    # packed right shift >> 1
137962306a36Sopenharmony_ci	psrld     $2, \TMP3                    # packed right shift >> 2
138062306a36Sopenharmony_ci	psrld     $7, \TMP4                    # packed right shift >> 7
138162306a36Sopenharmony_ci	pxor      \TMP3,\TMP2		       # xor the shifted versions
138262306a36Sopenharmony_ci	pxor      \TMP4,\TMP2
138362306a36Sopenharmony_ci	pxor      \TMP5, \TMP2
138462306a36Sopenharmony_ci	pxor      \TMP2, \XMM5
138562306a36Sopenharmony_ci	pxor      \TMP1, \XMM5                 # reduced result is in XMM5
138662306a36Sopenharmony_ci
138762306a36Sopenharmony_ci	pxor	  \XMM5, \XMM1                 # fold the hash into the first new block
138862306a36Sopenharmony_ci.endm
138962306a36Sopenharmony_ci
139062306a36Sopenharmony_ci/* GHASH the last 4 ciphertext blocks.
*
*   XMM1..XMM4 - in: the four blocks to hash; all four are overwritten
*   XMMDst     - out: the reduced 128-bit result
*   TMP1..TMP7 - scratch
*   %arg2      - base address for the HashKey_4/3/2, HashKey and *_k values
*
* XMM1 is multiplied by the value at HashKey_4, XMM2 by HashKey_3, XMM3 by
* HashKey_2 and XMM4 by HashKey.  The high, low and middle Karatsuba terms
* are accumulated in TMP6, XMMDst and XMM1 respectively, and a single
* two-phase reduction is performed at the end.
*/
139162306a36Sopenharmony_ci.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
139262306a36Sopenharmony_ciTMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
139362306a36Sopenharmony_ci
139462306a36Sopenharmony_ci        # Multiply XMM1 * HashKey_4 (using Karatsuba)
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci	movdqa	  \XMM1, \TMP6
139762306a36Sopenharmony_ci	pshufd	  $78, \XMM1, \TMP2
139862306a36Sopenharmony_ci	pxor	  \XMM1, \TMP2
139962306a36Sopenharmony_ci	movdqu	  HashKey_4(%arg2), \TMP5
140062306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
140162306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
140262306a36Sopenharmony_ci	movdqu	  HashKey_4_k(%arg2), \TMP4
140362306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
140462306a36Sopenharmony_ci	movdqa	  \XMM1, \XMMDst
140562306a36Sopenharmony_ci	movdqa	  \TMP2, \XMM1              # high in TMP6, low in XMMDst, middle in XMM1
140662306a36Sopenharmony_ci
140762306a36Sopenharmony_ci        # Multiply XMM2 * HashKey_3 (using Karatsuba)
140862306a36Sopenharmony_ci
140962306a36Sopenharmony_ci	movdqa	  \XMM2, \TMP1
141062306a36Sopenharmony_ci	pshufd	  $78, \XMM2, \TMP2
141162306a36Sopenharmony_ci	pxor	  \XMM2, \TMP2
141262306a36Sopenharmony_ci	movdqu	  HashKey_3(%arg2), \TMP5
141362306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
141462306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
141562306a36Sopenharmony_ci	movdqu	  HashKey_3_k(%arg2), \TMP4
141662306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
141762306a36Sopenharmony_ci	pxor	  \TMP1, \TMP6
141862306a36Sopenharmony_ci	pxor	  \XMM2, \XMMDst
141962306a36Sopenharmony_ci	pxor	  \TMP2, \XMM1
142062306a36Sopenharmony_ci# results accumulated in TMP6, XMMDst, XMM1
142162306a36Sopenharmony_ci
142262306a36Sopenharmony_ci        # Multiply XMM3 * HashKey_2 (using Karatsuba)
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci	movdqa	  \XMM3, \TMP1
142562306a36Sopenharmony_ci	pshufd	  $78, \XMM3, \TMP2
142662306a36Sopenharmony_ci	pxor	  \XMM3, \TMP2
142762306a36Sopenharmony_ci	movdqu	  HashKey_2(%arg2), \TMP5
142862306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
142962306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
143062306a36Sopenharmony_ci	movdqu	  HashKey_2_k(%arg2), \TMP4
143162306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
143262306a36Sopenharmony_ci	pxor	  \TMP1, \TMP6
143362306a36Sopenharmony_ci	pxor	  \XMM3, \XMMDst
143462306a36Sopenharmony_ci	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
143562306a36Sopenharmony_ci
143662306a36Sopenharmony_ci        # Multiply XMM4 * HashKey (using Karatsuba)
143762306a36Sopenharmony_ci	movdqa	  \XMM4, \TMP1
143862306a36Sopenharmony_ci	pshufd	  $78, \XMM4, \TMP2
143962306a36Sopenharmony_ci	pxor	  \XMM4, \TMP2
144062306a36Sopenharmony_ci	movdqu	  HashKey(%arg2), \TMP5
144162306a36Sopenharmony_ci	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
144262306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
144362306a36Sopenharmony_ci	movdqu	  HashKey_k(%arg2), \TMP4
144462306a36Sopenharmony_ci	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
144562306a36Sopenharmony_ci	pxor	  \TMP1, \TMP6
144662306a36Sopenharmony_ci	pxor	  \XMM4, \XMMDst
144762306a36Sopenharmony_ci	pxor	  \XMM1, \TMP2
144862306a36Sopenharmony_ci	pxor	  \TMP6, \TMP2
144962306a36Sopenharmony_ci	pxor	  \XMMDst, \TMP2
145062306a36Sopenharmony_ci	# middle section of the temp results combined as in karatsuba algorithm
145162306a36Sopenharmony_ci	movdqa	  \TMP2, \TMP4
145262306a36Sopenharmony_ci	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
145362306a36Sopenharmony_ci	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
145462306a36Sopenharmony_ci	pxor	  \TMP4, \XMMDst
145562306a36Sopenharmony_ci	pxor	  \TMP2, \TMP6
145662306a36Sopenharmony_ci# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
145762306a36Sopenharmony_ci	# first phase of the reduction
145862306a36Sopenharmony_ci	movdqa    \XMMDst, \TMP2
145962306a36Sopenharmony_ci	movdqa    \XMMDst, \TMP3
146062306a36Sopenharmony_ci	movdqa    \XMMDst, \TMP4
146162306a36Sopenharmony_ci# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
146262306a36Sopenharmony_ci	pslld     $31, \TMP2                # packed left shift << 31
146362306a36Sopenharmony_ci	pslld     $30, \TMP3                # packed left shift << 30
146462306a36Sopenharmony_ci	pslld     $25, \TMP4                # packed left shift << 25
146562306a36Sopenharmony_ci	pxor      \TMP3, \TMP2              # xor the shifted versions
146662306a36Sopenharmony_ci	pxor      \TMP4, \TMP2
146762306a36Sopenharmony_ci	movdqa    \TMP2, \TMP7
146862306a36Sopenharmony_ci	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
146962306a36Sopenharmony_ci	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
147062306a36Sopenharmony_ci	pxor      \TMP2, \XMMDst
147162306a36Sopenharmony_ci
147262306a36Sopenharmony_ci        # second phase of the reduction
147362306a36Sopenharmony_ci	movdqa    \XMMDst, \TMP2
147462306a36Sopenharmony_ci	# make 3 copies of XMMDst for doing 3 shift operations
147562306a36Sopenharmony_ci	movdqa    \XMMDst, \TMP3
147662306a36Sopenharmony_ci	movdqa    \XMMDst, \TMP4
147762306a36Sopenharmony_ci	psrld     $1, \TMP2                 # packed right shift >> 1
147862306a36Sopenharmony_ci	psrld     $2, \TMP3                 # packed right shift >> 2
147962306a36Sopenharmony_ci	psrld     $7, \TMP4                 # packed right shift >> 7
148062306a36Sopenharmony_ci	pxor      \TMP3, \TMP2              # xor the shifted versions
148162306a36Sopenharmony_ci	pxor      \TMP4, \TMP2
148262306a36Sopenharmony_ci	pxor      \TMP7, \TMP2
148362306a36Sopenharmony_ci	pxor      \TMP2, \XMMDst
148462306a36Sopenharmony_ci	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
148562306a36Sopenharmony_ci.endm
148662306a36Sopenharmony_ci
148762306a36Sopenharmony_ci
148862306a36Sopenharmony_ci/* Encryption of a single block
148962306a36Sopenharmony_ci* uses eax & r10
*
*   XMM0 - in: the block to encrypt; out: the encrypted block
*   TMP1 - scratch, holds the current round key
*
* The key schedule is read through %arg1: the key at offset 0 is XORed in
* first, then keysize/4 + 5 aesenc rounds are run with consecutive 16-byte
* keys, followed by one aesenclast with the next key.  The flags are
* clobbered as well.
*
* The loop label is assembler-local (.L prefix), so expanding the macro does
* not add a symbol to the object file.
149062306a36Sopenharmony_ci*/
149162306a36Sopenharmony_ci
149262306a36Sopenharmony_ci.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
149362306a36Sopenharmony_ci
149462306a36Sopenharmony_ci	pxor		(%arg1), \XMM0
149562306a36Sopenharmony_ci	mov		keysize,%eax
149662306a36Sopenharmony_ci	shr		$2,%eax			# 128->4, 192->6, 256->8
149762306a36Sopenharmony_ci	add		$5,%eax			# 128->9, 192->11, 256->13
149862306a36Sopenharmony_ci	lea		16(%arg1), %r10	  # get first expanded key address
149962306a36Sopenharmony_ci
150062306a36Sopenharmony_ci.Lesb_loop_\@:
150162306a36Sopenharmony_ci	MOVADQ		(%r10),\TMP1
150262306a36Sopenharmony_ci	aesenc		\TMP1,\XMM0
150362306a36Sopenharmony_ci	add		$16,%r10
150462306a36Sopenharmony_ci	sub		$1,%eax
150562306a36Sopenharmony_ci	jnz		.Lesb_loop_\@
150662306a36Sopenharmony_ci
150762306a36Sopenharmony_ci	MOVADQ		(%r10),\TMP1
150862306a36Sopenharmony_ci	aesenclast	\TMP1,\XMM0
150962306a36Sopenharmony_ci.endm
151062306a36Sopenharmony_ci/*****************************************************************************
151162306a36Sopenharmony_ci* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
151262306a36Sopenharmony_ci*                   struct gcm_context_data *data
151362306a36Sopenharmony_ci*                                      // Context data
151462306a36Sopenharmony_ci*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
151562306a36Sopenharmony_ci*                   const u8 *in,      // Ciphertext input
151662306a36Sopenharmony_ci*                   u64 plaintext_len, // Length of data in bytes for decryption.
151762306a36Sopenharmony_ci*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
151862306a36Sopenharmony_ci*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
151962306a36Sopenharmony_ci*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
152062306a36Sopenharmony_ci*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
152162306a36Sopenharmony_ci*                   const u8 *aad,     // Additional Authentication Data (AAD)
152262306a36Sopenharmony_ci*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
152362306a36Sopenharmony_ci*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
152462306a36Sopenharmony_ci*                                      // given authentication tag and only return the plaintext if they match.
152562306a36Sopenharmony_ci*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
152662306a36Sopenharmony_ci*                                      // (most likely), 12 or 8.
152762306a36Sopenharmony_ci*
152862306a36Sopenharmony_ci* Assumptions:
152962306a36Sopenharmony_ci*
153062306a36Sopenharmony_ci* keys:
153162306a36Sopenharmony_ci*       keys are pre-expanded and aligned to 16 bytes. we are using the first
153262306a36Sopenharmony_ci*       set of 11 keys in the data structure void *aes_ctx
153362306a36Sopenharmony_ci*
153462306a36Sopenharmony_ci* iv:
153562306a36Sopenharmony_ci*       0                   1                   2                   3
153662306a36Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
153762306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
153862306a36Sopenharmony_ci*       |                             Salt  (From the SA)               |
153962306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
154062306a36Sopenharmony_ci*       |                     Initialization Vector                     |
154162306a36Sopenharmony_ci*       |         (This is the sequence number from IPSec header)       |
154262306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
154362306a36Sopenharmony_ci*       |                              0x1                              |
154462306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
154562306a36Sopenharmony_ci*
154662306a36Sopenharmony_ci*
154762306a36Sopenharmony_ci*
154862306a36Sopenharmony_ci* AAD:
154962306a36Sopenharmony_ci*       AAD padded to 128 bits with 0
155062306a36Sopenharmony_ci*       for example, assume AAD is a u32 vector
155162306a36Sopenharmony_ci*
155262306a36Sopenharmony_ci*       if AAD is 8 bytes:
155362306a36Sopenharmony_ci*       AAD[3] = {A0, A1};
155462306a36Sopenharmony_ci*       padded AAD in xmm register = {A1 A0 0 0}
155562306a36Sopenharmony_ci*
155662306a36Sopenharmony_ci*       0                   1                   2                   3
155762306a36Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
155862306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
155962306a36Sopenharmony_ci*       |                               SPI (A1)                        |
156062306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
156162306a36Sopenharmony_ci*       |                     32-bit Sequence Number (A0)               |
156262306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
156362306a36Sopenharmony_ci*       |                              0x0                              |
156462306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
156562306a36Sopenharmony_ci*
156662306a36Sopenharmony_ci*                                       AAD Format with 32-bit Sequence Number
156762306a36Sopenharmony_ci*
156862306a36Sopenharmony_ci*       if AAD is 12 bytes:
156962306a36Sopenharmony_ci*       AAD[3] = {A0, A1, A2};
157062306a36Sopenharmony_ci*       padded AAD in xmm register = {A2 A1 A0 0}
157162306a36Sopenharmony_ci*
157262306a36Sopenharmony_ci*       0                   1                   2                   3
157362306a36Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
157462306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
157762306a36Sopenharmony_ci*       |                               SPI (A2)                        |
157862306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
157962306a36Sopenharmony_ci*       |                 64-bit Extended Sequence Number {A1,A0}       |
158062306a36Sopenharmony_ci*       |                                                               |
158162306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
158262306a36Sopenharmony_ci*       |                              0x0                              |
158362306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
158462306a36Sopenharmony_ci*
158562306a36Sopenharmony_ci*                        AAD Format with 64-bit Extended Sequence Number
158662306a36Sopenharmony_ci*
158762306a36Sopenharmony_ci* poly = x^128 + x^127 + x^126 + x^121 + 1
158862306a36Sopenharmony_ci*
158962306a36Sopenharmony_ci*****************************************************************************/
159062306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec)
	/*
	 * The body is built entirely from macros defined outside this chunk;
	 * FUNC_SAVE and FUNC_RESTORE bracket it.
	 */
159162306a36Sopenharmony_ci	FUNC_SAVE
159262306a36Sopenharmony_ci
	/*
	 * Assuming argN names the N-th C argument of the prototype above:
	 * arg6 = iv, arg7 = hash_subkey, arg8 = aad, arg9 = aad_len.
	 */
159362306a36Sopenharmony_ci	GCM_INIT %arg6, arg7, arg8, arg9
	/* NOTE(review): "dec" presumably selects the *_dec macro variants - confirm in GCM_ENC_DEC */
159462306a36Sopenharmony_ci	GCM_ENC_DEC dec
	/* under the same assumption: arg10 = auth_tag, arg11 = auth_tag_len */
159562306a36Sopenharmony_ci	GCM_COMPLETE arg10, arg11
159662306a36Sopenharmony_ci	FUNC_RESTORE
159762306a36Sopenharmony_ci	RET
159862306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec)
159962306a36Sopenharmony_ci
160062306a36Sopenharmony_ci
160162306a36Sopenharmony_ci/*****************************************************************************
160262306a36Sopenharmony_ci* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
160362306a36Sopenharmony_ci*                    struct gcm_context_data *data
160462306a36Sopenharmony_ci*                                        // Context data
160562306a36Sopenharmony_ci*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
160662306a36Sopenharmony_ci*                    const u8 *in,       // Plaintext input
160762306a36Sopenharmony_ci*                    u64 plaintext_len,  // Length of data in bytes for encryption.
160862306a36Sopenharmony_ci*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
160962306a36Sopenharmony_ci*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
161062306a36Sopenharmony_ci*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
161162306a36Sopenharmony_ci*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
161262306a36Sopenharmony_ci*                    const u8 *aad,      // Additional Authentication Data (AAD)
161362306a36Sopenharmony_ci*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
161462306a36Sopenharmony_ci*                    u8 *auth_tag,       // Authenticated Tag output.
161562306a36Sopenharmony_ci*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
161662306a36Sopenharmony_ci*                                        // 12 or 8.
161762306a36Sopenharmony_ci*
161862306a36Sopenharmony_ci* Assumptions:
161962306a36Sopenharmony_ci*
162062306a36Sopenharmony_ci* keys:
162162306a36Sopenharmony_ci*       keys are pre-expanded and aligned to 16 bytes. we are using the
162262306a36Sopenharmony_ci*       first set of 11 keys in the data structure void *aes_ctx
162362306a36Sopenharmony_ci*
162462306a36Sopenharmony_ci*
162562306a36Sopenharmony_ci* iv:
162662306a36Sopenharmony_ci*       0                   1                   2                   3
162762306a36Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
162862306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
162962306a36Sopenharmony_ci*       |                             Salt  (From the SA)               |
163062306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
163162306a36Sopenharmony_ci*       |                     Initialization Vector                     |
163262306a36Sopenharmony_ci*       |         (This is the sequence number from IPSec header)       |
163362306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
163462306a36Sopenharmony_ci*       |                              0x1                              |
163562306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
163662306a36Sopenharmony_ci*
163762306a36Sopenharmony_ci*
163862306a36Sopenharmony_ci*
163962306a36Sopenharmony_ci* AAD:
164062306a36Sopenharmony_ci*       AAD padded to 128 bits with 0
164162306a36Sopenharmony_ci*       for example, assume AAD is a u32 vector
164262306a36Sopenharmony_ci*
164362306a36Sopenharmony_ci*       if AAD is 8 bytes:
164462306a36Sopenharmony_ci*       AAD[3] = {A0, A1};
164562306a36Sopenharmony_ci*       padded AAD in xmm register = {A1 A0 0 0}
164662306a36Sopenharmony_ci*
164762306a36Sopenharmony_ci*       0                   1                   2                   3
164862306a36Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
164962306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
165062306a36Sopenharmony_ci*       |                               SPI (A1)                        |
165162306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
165262306a36Sopenharmony_ci*       |                     32-bit Sequence Number (A0)               |
165362306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
165462306a36Sopenharmony_ci*       |                              0x0                              |
165562306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
165662306a36Sopenharmony_ci*
165762306a36Sopenharmony_ci*                                 AAD Format with 32-bit Sequence Number
165862306a36Sopenharmony_ci*
165962306a36Sopenharmony_ci*       if AAD is 12 bytes:
166062306a36Sopenharmony_ci*       AAD[3] = {A0, A1, A2};
166162306a36Sopenharmony_ci*       padded AAD in xmm register = {A2 A1 A0 0}
166262306a36Sopenharmony_ci*
166362306a36Sopenharmony_ci*       0                   1                   2                   3
166462306a36Sopenharmony_ci*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
166562306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
166662306a36Sopenharmony_ci*       |                               SPI (A2)                        |
166762306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
166862306a36Sopenharmony_ci*       |                 64-bit Extended Sequence Number {A1,A0}       |
166962306a36Sopenharmony_ci*       |                                                               |
167062306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
167162306a36Sopenharmony_ci*       |                              0x0                              |
167262306a36Sopenharmony_ci*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
167362306a36Sopenharmony_ci*
167462306a36Sopenharmony_ci*                         AAD Format with 64-bit Extended Sequence Number
167562306a36Sopenharmony_ci*
167662306a36Sopenharmony_ci* poly = x^128 + x^127 + x^126 + x^121 + 1
167762306a36Sopenharmony_ci***************************************************************************/
167862306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc)
	/*
	 * One-shot entry point: invokes GCM_INIT, GCM_ENC_DEC (enc variant)
	 * and GCM_COMPLETE back to back, bracketed by FUNC_SAVE/FUNC_RESTORE.
	 * All five macros are defined elsewhere.
	 */
167962306a36Sopenharmony_ci	FUNC_SAVE
168062306a36Sopenharmony_ci
	/*
	 * NOTE(review): the first operand is spelled %arg6 while the others are
	 * bare arg7..arg11 - presumably the bare names are preprocessor defines
	 * for stack-passed arguments; confirm against the argN definitions.
	 */
168162306a36Sopenharmony_ci	GCM_INIT %arg6, arg7, arg8, arg9
168262306a36Sopenharmony_ci	GCM_ENC_DEC enc
168362306a36Sopenharmony_ci
168462306a36Sopenharmony_ci	GCM_COMPLETE arg10, arg11
168562306a36Sopenharmony_ci	FUNC_RESTORE
168662306a36Sopenharmony_ci	RET
168762306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc)
168862306a36Sopenharmony_ci
168962306a36Sopenharmony_ci/*****************************************************************************
169062306a36Sopenharmony_ci* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
169162306a36Sopenharmony_ci*                     struct gcm_context_data *data,
169262306a36Sopenharmony_ci*                                         // context data
169362306a36Sopenharmony_ci*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
169462306a36Sopenharmony_ci*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
169562306a36Sopenharmony_ci*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
169662306a36Sopenharmony_ci*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
169762306a36Sopenharmony_ci*                     const u8 *aad,      // Additional Authentication Data (AAD)
169862306a36Sopenharmony_ci*                     u64 aad_len)        // Length of AAD in bytes.
*
* Init-only entry point: runs GCM_INIT and returns, without processing any
* payload and without producing a tag.
169962306a36Sopenharmony_ci*/
170062306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_init)
170162306a36Sopenharmony_ci	FUNC_SAVE
	/*
	 * Operands are arguments 3..6; assuming %argN names the N-th C argument,
	 * that is iv, hash_subkey, aad and aad_len from the prototype above.
	 */
170262306a36Sopenharmony_ci	GCM_INIT %arg3, %arg4,%arg5, %arg6
170362306a36Sopenharmony_ci	FUNC_RESTORE
170462306a36Sopenharmony_ci	RET
170562306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_init)
170662306a36Sopenharmony_ci
170762306a36Sopenharmony_ci/*****************************************************************************
170862306a36Sopenharmony_ci* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
170962306a36Sopenharmony_ci*                    struct gcm_context_data *data,
171062306a36Sopenharmony_ci*                                        // context data
171162306a36Sopenharmony_ci*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
171262306a36Sopenharmony_ci*                    const u8 *in,       // Plaintext input
171362306a36Sopenharmony_ci*                    u64 plaintext_len); // Length of data in bytes for encryption.
*
* Update-only entry point: runs the enc variant of GCM_ENC_DEC and returns.
* GCM_INIT and GCM_COMPLETE are not invoked here.
171462306a36Sopenharmony_ci*/
171562306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc_update)
171662306a36Sopenharmony_ci	FUNC_SAVE
171762306a36Sopenharmony_ci	GCM_ENC_DEC enc
171862306a36Sopenharmony_ci	FUNC_RESTORE
171962306a36Sopenharmony_ci	RET
172062306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc_update)
172162306a36Sopenharmony_ci
172262306a36Sopenharmony_ci/*****************************************************************************
172362306a36Sopenharmony_ci* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
172462306a36Sopenharmony_ci*                    struct gcm_context_data *data,
172562306a36Sopenharmony_ci*                                        // context data
172662306a36Sopenharmony_ci*                    u8 *out,            // Plaintext output (this is the decrypt entry point).
172762306a36Sopenharmony_ci*                    const u8 *in,       // Ciphertext input
172862306a36Sopenharmony_ci*                    u64 plaintext_len); // Length of data in bytes for decryption.
*
* Update-only entry point: runs the dec variant of GCM_ENC_DEC and returns.
* GCM_INIT and GCM_COMPLETE are not invoked here.
*
* NOTE(review): the previous parameter comments were copied verbatim from
* aesni_gcm_enc_update ("Ciphertext output", "Plaintext input", "Encrypt
* in-place is allowed"). Presumably in-place operation is allowed for
* decryption as well - confirm against GCM_ENC_DEC.
172962306a36Sopenharmony_ci*/
173062306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec_update)
173162306a36Sopenharmony_ci	FUNC_SAVE
173262306a36Sopenharmony_ci	GCM_ENC_DEC dec
173362306a36Sopenharmony_ci	FUNC_RESTORE
173462306a36Sopenharmony_ci	RET
173562306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec_update)
173662306a36Sopenharmony_ci
173762306a36Sopenharmony_ci/*****************************************************************************
173862306a36Sopenharmony_ci* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
173962306a36Sopenharmony_ci*                    struct gcm_context_data *data,
174062306a36Sopenharmony_ci*                                        // context data
174162306a36Sopenharmony_ci*                    u8 *auth_tag,       // Authenticated Tag output.
174262306a36Sopenharmony_ci*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
174362306a36Sopenharmony_ci*                                        // 12 or 8.
*
* Finalize-only entry point: runs GCM_COMPLETE and returns.
174462306a36Sopenharmony_ci*/
174562306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_finalize)
174662306a36Sopenharmony_ci	FUNC_SAVE
	/*
	 * The two macro operands are separated by whitespace only; gas accepts
	 * either commas or blanks between macro arguments, so this is
	 * equivalent to "GCM_COMPLETE %arg3, %arg4".
	 */
174762306a36Sopenharmony_ci	GCM_COMPLETE %arg3 %arg4
174862306a36Sopenharmony_ci	FUNC_RESTORE
174962306a36Sopenharmony_ci	RET
175062306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_finalize)
175162306a36Sopenharmony_ci
175262306a36Sopenharmony_ci#endif
175362306a36Sopenharmony_ci
/*
 * _key_expansion_256a / _key_expansion_128:	helper for aesni_set_key
 * input:
 *	%xmm0:		previous round key words (k0..k3, k0 in the low dword)
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		low dword must be zero (aesni_set_key clears %xmm4 once;
 *			the shuffles below keep its low dword zero)
 *	TKEYP:		destination of the new round key
 * output:
 *	%xmm0:		new round key, also stored at (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
175462306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_256a)
175562306a36Sopenharmony_ci	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of the assist result
	/*
	 * The two shufps/pxor pairs turn (k0, k1, k2, k3) into the running XOR
	 * (k0, k0^k1, k0^k1^k2, k0^k1^k2^k3).
	 */
175662306a36Sopenharmony_ci	shufps $0b00010000, %xmm0, %xmm4
175762306a36Sopenharmony_ci	pxor %xmm4, %xmm0
175862306a36Sopenharmony_ci	shufps $0b10001100, %xmm0, %xmm4
175962306a36Sopenharmony_ci	pxor %xmm4, %xmm0
176062306a36Sopenharmony_ci	pxor %xmm1, %xmm0			# fold the assist word into every lane
176162306a36Sopenharmony_ci	movaps %xmm0, (TKEYP)
176262306a36Sopenharmony_ci	add $0x10, TKEYP
176362306a36Sopenharmony_ci	RET
176462306a36Sopenharmony_ciSYM_FUNC_END(_key_expansion_256a)
176562306a36Sopenharmony_ciSYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
176662306a36Sopenharmony_ci
/*
 * _key_expansion_192a:	helper for aesni_set_key (192-bit keys)
 * input:
 *	%xmm0:		words 0..3 of the previous 6-word key group
 *	%xmm2:		words 4..5 of the previous group in its two low dwords
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		low dword must be zero
 *	TKEYP:		destination pointer
 * output:
 *	(TKEYP):	previous words 4,5 followed by new words 0,1
 *	0x10(TKEYP):	new words 2,3,4,5
 *	%xmm0:		new words 0..3
 *	%xmm2:		new words 4..5 in its two low dwords
 *	TKEYP:		advanced by 32 bytes
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
176762306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_192a)
176862306a36Sopenharmony_ci	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist result
	/* running XOR over the four dwords of %xmm0, then fold in the assist word */
176962306a36Sopenharmony_ci	shufps $0b00010000, %xmm0, %xmm4
177062306a36Sopenharmony_ci	pxor %xmm4, %xmm0
177162306a36Sopenharmony_ci	shufps $0b10001100, %xmm0, %xmm4
177262306a36Sopenharmony_ci	pxor %xmm4, %xmm0
177362306a36Sopenharmony_ci	pxor %xmm1, %xmm0
177462306a36Sopenharmony_ci
	/* new w4 = old w4 ^ new w3, new w5 = old w5 ^ old w4 ^ new w3 */
177562306a36Sopenharmony_ci	movaps %xmm2, %xmm5
177662306a36Sopenharmony_ci	movaps %xmm2, %xmm6			# keep the previous words 4,5 for the store below
177762306a36Sopenharmony_ci	pslldq $4, %xmm5
177862306a36Sopenharmony_ci	pshufd $0b11111111, %xmm0, %xmm3	# broadcast new word 3
177962306a36Sopenharmony_ci	pxor %xmm3, %xmm2
178062306a36Sopenharmony_ci	pxor %xmm5, %xmm2
178162306a36Sopenharmony_ci
178262306a36Sopenharmony_ci	movaps %xmm0, %xmm1
178362306a36Sopenharmony_ci	shufps $0b01000100, %xmm0, %xmm6	# (old w4, old w5, new w0, new w1)
178462306a36Sopenharmony_ci	movaps %xmm6, (TKEYP)
178562306a36Sopenharmony_ci	shufps $0b01001110, %xmm2, %xmm1	# (new w2, new w3, new w4, new w5)
178662306a36Sopenharmony_ci	movaps %xmm1, 0x10(TKEYP)
178762306a36Sopenharmony_ci	add $0x20, TKEYP
178862306a36Sopenharmony_ci	RET
178962306a36Sopenharmony_ciSYM_FUNC_END(_key_expansion_192a)
179062306a36Sopenharmony_ci
/*
 * _key_expansion_192b:	helper for aesni_set_key (192-bit keys)
 *
 * Same word computation as _key_expansion_192a, but only the new words 0..3
 * are stored; the new words 4..5 stay in the two low dwords of %xmm2 so that
 * the following _key_expansion_192a call can store them.
 * input:
 *	%xmm0, %xmm2:	previous 6-word key group (see _key_expansion_192a)
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		low dword must be zero
 * output:
 *	(TKEYP):	new words 0..3 (also left in %xmm0)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 */
179162306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_192b)
179262306a36Sopenharmony_ci	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist result
179362306a36Sopenharmony_ci	shufps $0b00010000, %xmm0, %xmm4
179462306a36Sopenharmony_ci	pxor %xmm4, %xmm0
179562306a36Sopenharmony_ci	shufps $0b10001100, %xmm0, %xmm4
179662306a36Sopenharmony_ci	pxor %xmm4, %xmm0
179762306a36Sopenharmony_ci	pxor %xmm1, %xmm0
179862306a36Sopenharmony_ci
179962306a36Sopenharmony_ci	movaps %xmm2, %xmm5
180062306a36Sopenharmony_ci	pslldq $4, %xmm5
180162306a36Sopenharmony_ci	pshufd $0b11111111, %xmm0, %xmm3	# broadcast new word 3
180262306a36Sopenharmony_ci	pxor %xmm3, %xmm2
180362306a36Sopenharmony_ci	pxor %xmm5, %xmm2
180462306a36Sopenharmony_ci
180562306a36Sopenharmony_ci	movaps %xmm0, (TKEYP)
180662306a36Sopenharmony_ci	add $0x10, TKEYP
180762306a36Sopenharmony_ci	RET
180862306a36Sopenharmony_ciSYM_FUNC_END(_key_expansion_192b)
180962306a36Sopenharmony_ci
/*
 * _key_expansion_256b:	helper for aesni_set_key (256-bit keys)
 *
 * Counterpart of _key_expansion_256a that updates %xmm2 instead of %xmm0 and
 * uses dword 2 of the aeskeygenassist result instead of dword 3.
 * input:
 *	%xmm2:		round key to be updated
 *	%xmm1:		aeskeygenassist result produced by the caller
 *	%xmm4:		low dword must be zero
 * output:
 *	%xmm2:		new round key, also stored at (TKEYP)
 *	TKEYP:		advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
181062306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_key_expansion_256b)
181162306a36Sopenharmony_ci	pshufd $0b10101010, %xmm1, %xmm1	# broadcast dword 2 of the assist result
181262306a36Sopenharmony_ci	shufps $0b00010000, %xmm2, %xmm4
181362306a36Sopenharmony_ci	pxor %xmm4, %xmm2
181462306a36Sopenharmony_ci	shufps $0b10001100, %xmm2, %xmm4
181562306a36Sopenharmony_ci	pxor %xmm4, %xmm2
181662306a36Sopenharmony_ci	pxor %xmm1, %xmm2
181762306a36Sopenharmony_ci	movaps %xmm2, (TKEYP)
181862306a36Sopenharmony_ci	add $0x10, TKEYP
181962306a36Sopenharmony_ci	RET
182062306a36Sopenharmony_ciSYM_FUNC_END(_key_expansion_256b)
182162306a36Sopenharmony_ci
182262306a36Sopenharmony_ci/*
182362306a36Sopenharmony_ci * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
182462306a36Sopenharmony_ci *                   unsigned int key_len)
 *
 * Expands the user key into the encryption round keys starting at ctx+0,
 * derives the decryption round keys starting at ctx+240 and records key_len
 * at ctx+480.  AREG is zeroed before returning.
 *
 * Only the low byte of key_len is examined: below 24 selects the 128-bit
 * path, exactly 24 the 192-bit path, anything else the 256-bit path.  No
 * validation of key_len is performed here.
182562306a36Sopenharmony_ci */
182662306a36Sopenharmony_ciSYM_FUNC_START(aesni_set_key)
182762306a36Sopenharmony_ci	FRAME_BEGIN
182862306a36Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit build: save KEYP, then fetch the three arguments from the stack */
182962306a36Sopenharmony_ci	pushl KEYP
183062306a36Sopenharmony_ci	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
183162306a36Sopenharmony_ci	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
183262306a36Sopenharmony_ci	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
183362306a36Sopenharmony_ci#endif
183462306a36Sopenharmony_ci	movups (UKEYP), %xmm0		# user key (first 16 bytes)
183562306a36Sopenharmony_ci	movaps %xmm0, (KEYP)
183662306a36Sopenharmony_ci	lea 0x10(KEYP), TKEYP		# key addr
183762306a36Sopenharmony_ci	movl %edx, 480(KEYP)		# record key_len at ctx+480
183862306a36Sopenharmony_ci	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
183962306a36Sopenharmony_ci	cmp $24, %dl			# dispatch on the low byte of key_len
184062306a36Sopenharmony_ci	jb .Lenc_key128
184162306a36Sopenharmony_ci	je .Lenc_key192
	/*
	 * 256-bit key: the second 16 key bytes are stored as the next round
	 * key, then 13 more round keys are generated by alternating
	 * _key_expansion_256a (updates %xmm0) and _key_expansion_256b
	 * (updates %xmm2).
	 */
184262306a36Sopenharmony_ci	movups 0x10(UKEYP), %xmm2	# other user key
184362306a36Sopenharmony_ci	movaps %xmm2, (TKEYP)
184462306a36Sopenharmony_ci	add $0x10, TKEYP
184562306a36Sopenharmony_ci	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
184662306a36Sopenharmony_ci	call _key_expansion_256a
184762306a36Sopenharmony_ci	aeskeygenassist $0x1, %xmm0, %xmm1
184862306a36Sopenharmony_ci	call _key_expansion_256b
184962306a36Sopenharmony_ci	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
185062306a36Sopenharmony_ci	call _key_expansion_256a
185162306a36Sopenharmony_ci	aeskeygenassist $0x2, %xmm0, %xmm1
185262306a36Sopenharmony_ci	call _key_expansion_256b
185362306a36Sopenharmony_ci	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
185462306a36Sopenharmony_ci	call _key_expansion_256a
185562306a36Sopenharmony_ci	aeskeygenassist $0x4, %xmm0, %xmm1
185662306a36Sopenharmony_ci	call _key_expansion_256b
185762306a36Sopenharmony_ci	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
185862306a36Sopenharmony_ci	call _key_expansion_256a
185962306a36Sopenharmony_ci	aeskeygenassist $0x8, %xmm0, %xmm1
186062306a36Sopenharmony_ci	call _key_expansion_256b
186162306a36Sopenharmony_ci	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
186262306a36Sopenharmony_ci	call _key_expansion_256a
186362306a36Sopenharmony_ci	aeskeygenassist $0x10, %xmm0, %xmm1
186462306a36Sopenharmony_ci	call _key_expansion_256b
186562306a36Sopenharmony_ci	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
186662306a36Sopenharmony_ci	call _key_expansion_256a
186762306a36Sopenharmony_ci	aeskeygenassist $0x20, %xmm0, %xmm1
186862306a36Sopenharmony_ci	call _key_expansion_256b
186962306a36Sopenharmony_ci	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
187062306a36Sopenharmony_ci	call _key_expansion_256a
187162306a36Sopenharmony_ci	jmp .Ldec_key
187262306a36Sopenharmony_ci.Lenc_key192:
	/*
	 * 192-bit key: the remaining 8 key bytes go into the low half of
	 * %xmm2.  _key_expansion_192a stores 32 bytes per call and
	 * _key_expansion_192b stores 16, so the eight calls below emit
	 * 192 bytes of round keys after the first one.
	 */
187362306a36Sopenharmony_ci	movq 0x10(UKEYP), %xmm2		# other user key
187462306a36Sopenharmony_ci	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
187562306a36Sopenharmony_ci	call _key_expansion_192a
187662306a36Sopenharmony_ci	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
187762306a36Sopenharmony_ci	call _key_expansion_192b
187862306a36Sopenharmony_ci	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
187962306a36Sopenharmony_ci	call _key_expansion_192a
188062306a36Sopenharmony_ci	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
188162306a36Sopenharmony_ci	call _key_expansion_192b
188262306a36Sopenharmony_ci	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
188362306a36Sopenharmony_ci	call _key_expansion_192a
188462306a36Sopenharmony_ci	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
188562306a36Sopenharmony_ci	call _key_expansion_192b
188662306a36Sopenharmony_ci	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
188762306a36Sopenharmony_ci	call _key_expansion_192a
188862306a36Sopenharmony_ci	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
188962306a36Sopenharmony_ci	call _key_expansion_192b
189062306a36Sopenharmony_ci	jmp .Ldec_key
189162306a36Sopenharmony_ci.Lenc_key128:
	/* 128-bit key: ten calls, each storing one 16-byte round key */
189262306a36Sopenharmony_ci	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
189362306a36Sopenharmony_ci	call _key_expansion_128
189462306a36Sopenharmony_ci	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
189562306a36Sopenharmony_ci	call _key_expansion_128
189662306a36Sopenharmony_ci	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
189762306a36Sopenharmony_ci	call _key_expansion_128
189862306a36Sopenharmony_ci	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
189962306a36Sopenharmony_ci	call _key_expansion_128
190062306a36Sopenharmony_ci	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
190162306a36Sopenharmony_ci	call _key_expansion_128
190262306a36Sopenharmony_ci	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
190362306a36Sopenharmony_ci	call _key_expansion_128
190462306a36Sopenharmony_ci	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
190562306a36Sopenharmony_ci	call _key_expansion_128
190662306a36Sopenharmony_ci	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
190762306a36Sopenharmony_ci	call _key_expansion_128
190862306a36Sopenharmony_ci	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
190962306a36Sopenharmony_ci	call _key_expansion_128
191062306a36Sopenharmony_ci	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
191162306a36Sopenharmony_ci	call _key_expansion_128
191262306a36Sopenharmony_ci.Ldec_key:
	/*
	 * Build the decryption schedule 240 bytes above the encryption one, in
	 * reverse order: the first and last encryption round keys are copied
	 * unchanged into the last and first decryption slots, and every round
	 * key in between is passed through aesimc.
	 */
191362306a36Sopenharmony_ci	sub $0x10, TKEYP		# TKEYP -> last encryption round key
191462306a36Sopenharmony_ci	movaps (KEYP), %xmm0
191562306a36Sopenharmony_ci	movaps (TKEYP), %xmm1
191662306a36Sopenharmony_ci	movaps %xmm0, 240(TKEYP)
191762306a36Sopenharmony_ci	movaps %xmm1, 240(KEYP)
191862306a36Sopenharmony_ci	add $0x10, KEYP
191962306a36Sopenharmony_ci	lea 240-16(TKEYP), UKEYP	# UKEYP is reused as the descending store pointer
192062306a36Sopenharmony_ci.align 4
192162306a36Sopenharmony_ci.Ldec_key_loop:
192262306a36Sopenharmony_ci	movaps (KEYP), %xmm0
192362306a36Sopenharmony_ci	aesimc %xmm0, %xmm1
192462306a36Sopenharmony_ci	movaps %xmm1, (UKEYP)
192562306a36Sopenharmony_ci	add $0x10, KEYP
192662306a36Sopenharmony_ci	sub $0x10, UKEYP
192762306a36Sopenharmony_ci	cmp TKEYP, KEYP
192862306a36Sopenharmony_ci	jb .Ldec_key_loop		# unsigned compare: loop while KEYP is below the last round key
192962306a36Sopenharmony_ci	xor AREG, AREG			# AREG = 0 (presumably the return value - confirm AREG definition)
193062306a36Sopenharmony_ci#ifndef __x86_64__
193162306a36Sopenharmony_ci	popl KEYP
193262306a36Sopenharmony_ci#endif
193362306a36Sopenharmony_ci	FRAME_END
193462306a36Sopenharmony_ci	RET
193562306a36Sopenharmony_ciSYM_FUNC_END(aesni_set_key)
193662306a36Sopenharmony_ci
193762306a36Sopenharmony_ci/*
193862306a36Sopenharmony_ci * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts one 16-byte block: loads it from src with an unaligned load, runs
 * _aesni_enc1 on it and writes the result to dst with an unaligned store.
193962306a36Sopenharmony_ci */
194062306a36Sopenharmony_ciSYM_FUNC_START(aesni_enc)
194162306a36Sopenharmony_ci	FRAME_BEGIN
194262306a36Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit build: save KEYP and KLEN, then fetch the arguments from the stack */
194362306a36Sopenharmony_ci	pushl KEYP
194462306a36Sopenharmony_ci	pushl KLEN
194562306a36Sopenharmony_ci	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
194662306a36Sopenharmony_ci	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
194762306a36Sopenharmony_ci	movl (FRAME_OFFSET+20)(%esp), INP	# src
194862306a36Sopenharmony_ci#endif
194962306a36Sopenharmony_ci	movl 480(KEYP), KLEN		# key length
195062306a36Sopenharmony_ci	movups (INP), STATE		# input
195162306a36Sopenharmony_ci	call _aesni_enc1
195262306a36Sopenharmony_ci	movups STATE, (OUTP)		# output
195362306a36Sopenharmony_ci#ifndef __x86_64__
195462306a36Sopenharmony_ci	popl KLEN
195562306a36Sopenharmony_ci	popl KEYP
195662306a36Sopenharmony_ci#endif
195762306a36Sopenharmony_ci	FRAME_END
195862306a36Sopenharmony_ci	RET
195962306a36Sopenharmony_ciSYM_FUNC_END(aesni_enc)
196062306a36Sopenharmony_ci
196162306a36Sopenharmony_ci/*
196262306a36Sopenharmony_ci * _aesni_enc1:		internal ABI
196362306a36Sopenharmony_ci * input:
196462306a36Sopenharmony_ci *	KEYP:		key struct pointer
196562306a36Sopenharmony_ci *	KLEN:		key length in bytes (compared against 24 below)
196662306a36Sopenharmony_ci *	STATE:		initial state (input)
196762306a36Sopenharmony_ci * output:
196862306a36Sopenharmony_ci *	STATE:		final state (output)
196962306a36Sopenharmony_ci * changed:
197062306a36Sopenharmony_ci *	KEY
197162306a36Sopenharmony_ci *	TKEYP (T1)
 *
 * TKEYP is biased so that all three key sizes share the same tail: the last
 * round key is always read from 0x70(TKEYP).  KLEN below 24 enters at
 * .Lenc128 (10 rounds), KLEN equal to 24 at .Lenc192 (12 rounds), any other
 * value falls through for 14 rounds.
197262306a36Sopenharmony_ci */
197362306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_aesni_enc1)
197462306a36Sopenharmony_ci	movaps (KEYP), KEY		# key
197562306a36Sopenharmony_ci	mov KEYP, TKEYP
197662306a36Sopenharmony_ci	pxor KEY, STATE		# round 0
197762306a36Sopenharmony_ci	add $0x30, TKEYP
197862306a36Sopenharmony_ci	cmp $24, KLEN
197962306a36Sopenharmony_ci	jb .Lenc128
198062306a36Sopenharmony_ci	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp intact for je
198162306a36Sopenharmony_ci	je .Lenc192
198262306a36Sopenharmony_ci	add $0x20, TKEYP
198362306a36Sopenharmony_ci	movaps -0x60(TKEYP), KEY
198462306a36Sopenharmony_ci	aesenc KEY, STATE
198562306a36Sopenharmony_ci	movaps -0x50(TKEYP), KEY
198662306a36Sopenharmony_ci	aesenc KEY, STATE
198762306a36Sopenharmony_ci.align 4
198862306a36Sopenharmony_ci.Lenc192:
198962306a36Sopenharmony_ci	movaps -0x40(TKEYP), KEY
199062306a36Sopenharmony_ci	aesenc KEY, STATE
199162306a36Sopenharmony_ci	movaps -0x30(TKEYP), KEY
199262306a36Sopenharmony_ci	aesenc KEY, STATE
199362306a36Sopenharmony_ci.align 4
199462306a36Sopenharmony_ci.Lenc128:
199562306a36Sopenharmony_ci	movaps -0x20(TKEYP), KEY
199662306a36Sopenharmony_ci	aesenc KEY, STATE
199762306a36Sopenharmony_ci	movaps -0x10(TKEYP), KEY
199862306a36Sopenharmony_ci	aesenc KEY, STATE
199962306a36Sopenharmony_ci	movaps (TKEYP), KEY
200062306a36Sopenharmony_ci	aesenc KEY, STATE
200162306a36Sopenharmony_ci	movaps 0x10(TKEYP), KEY
200262306a36Sopenharmony_ci	aesenc KEY, STATE
200362306a36Sopenharmony_ci	movaps 0x20(TKEYP), KEY
200462306a36Sopenharmony_ci	aesenc KEY, STATE
200562306a36Sopenharmony_ci	movaps 0x30(TKEYP), KEY
200662306a36Sopenharmony_ci	aesenc KEY, STATE
200762306a36Sopenharmony_ci	movaps 0x40(TKEYP), KEY
200862306a36Sopenharmony_ci	aesenc KEY, STATE
200962306a36Sopenharmony_ci	movaps 0x50(TKEYP), KEY
201062306a36Sopenharmony_ci	aesenc KEY, STATE
201162306a36Sopenharmony_ci	movaps 0x60(TKEYP), KEY
201262306a36Sopenharmony_ci	aesenc KEY, STATE
201362306a36Sopenharmony_ci	movaps 0x70(TKEYP), KEY
201462306a36Sopenharmony_ci	aesenclast KEY, STATE		# last round
201562306a36Sopenharmony_ci	RET
201662306a36Sopenharmony_ciSYM_FUNC_END(_aesni_enc1)
201762306a36Sopenharmony_ci
201862306a36Sopenharmony_ci/*
201962306a36Sopenharmony_ci * _aesni_enc4:	internal ABI
202062306a36Sopenharmony_ci * input:
202162306a36Sopenharmony_ci *	KEYP:		key struct pointer
202262306a36Sopenharmony_ci *	KLEN:		key length in bytes (compared against 24 below)
202362306a36Sopenharmony_ci *	STATE1:		initial state (input)
202462306a36Sopenharmony_ci *	STATE2
202562306a36Sopenharmony_ci *	STATE3
202662306a36Sopenharmony_ci *	STATE4
202762306a36Sopenharmony_ci * output:
202862306a36Sopenharmony_ci *	STATE1:		final state (output)
202962306a36Sopenharmony_ci *	STATE2
203062306a36Sopenharmony_ci *	STATE3
203162306a36Sopenharmony_ci *	STATE4
203262306a36Sopenharmony_ci * changed:
203362306a36Sopenharmony_ci *	KEY
203462306a36Sopenharmony_ci *	TKEYP (T1)
 *
 * Four-block variant of _aesni_enc1: each round key is loaded once into KEY
 * and applied to STATE1..STATE4 in turn.  The TKEYP biasing and the
 * key-length dispatch are the same as in _aesni_enc1.
203562306a36Sopenharmony_ci */
203662306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_aesni_enc4)
203762306a36Sopenharmony_ci	movaps (KEYP), KEY		# key
203862306a36Sopenharmony_ci	mov KEYP, TKEYP
203962306a36Sopenharmony_ci	pxor KEY, STATE1		# round 0
204062306a36Sopenharmony_ci	pxor KEY, STATE2
204162306a36Sopenharmony_ci	pxor KEY, STATE3
204262306a36Sopenharmony_ci	pxor KEY, STATE4
204362306a36Sopenharmony_ci	add $0x30, TKEYP
204462306a36Sopenharmony_ci	cmp $24, KLEN
204562306a36Sopenharmony_ci	jb .L4enc128
204662306a36Sopenharmony_ci	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp intact for je
204762306a36Sopenharmony_ci	je .L4enc192
204862306a36Sopenharmony_ci	add $0x20, TKEYP
204962306a36Sopenharmony_ci	movaps -0x60(TKEYP), KEY
205062306a36Sopenharmony_ci	aesenc KEY, STATE1
205162306a36Sopenharmony_ci	aesenc KEY, STATE2
205262306a36Sopenharmony_ci	aesenc KEY, STATE3
205362306a36Sopenharmony_ci	aesenc KEY, STATE4
205462306a36Sopenharmony_ci	movaps -0x50(TKEYP), KEY
205562306a36Sopenharmony_ci	aesenc KEY, STATE1
205662306a36Sopenharmony_ci	aesenc KEY, STATE2
205762306a36Sopenharmony_ci	aesenc KEY, STATE3
205862306a36Sopenharmony_ci	aesenc KEY, STATE4
205962306a36Sopenharmony_ci#.align 4
206062306a36Sopenharmony_ci.L4enc192:
206162306a36Sopenharmony_ci	movaps -0x40(TKEYP), KEY
206262306a36Sopenharmony_ci	aesenc KEY, STATE1
206362306a36Sopenharmony_ci	aesenc KEY, STATE2
206462306a36Sopenharmony_ci	aesenc KEY, STATE3
206562306a36Sopenharmony_ci	aesenc KEY, STATE4
206662306a36Sopenharmony_ci	movaps -0x30(TKEYP), KEY
206762306a36Sopenharmony_ci	aesenc KEY, STATE1
206862306a36Sopenharmony_ci	aesenc KEY, STATE2
206962306a36Sopenharmony_ci	aesenc KEY, STATE3
207062306a36Sopenharmony_ci	aesenc KEY, STATE4
207162306a36Sopenharmony_ci#.align 4
207262306a36Sopenharmony_ci.L4enc128:
207362306a36Sopenharmony_ci	movaps -0x20(TKEYP), KEY
207462306a36Sopenharmony_ci	aesenc KEY, STATE1
207562306a36Sopenharmony_ci	aesenc KEY, STATE2
207662306a36Sopenharmony_ci	aesenc KEY, STATE3
207762306a36Sopenharmony_ci	aesenc KEY, STATE4
207862306a36Sopenharmony_ci	movaps -0x10(TKEYP), KEY
207962306a36Sopenharmony_ci	aesenc KEY, STATE1
208062306a36Sopenharmony_ci	aesenc KEY, STATE2
208162306a36Sopenharmony_ci	aesenc KEY, STATE3
208262306a36Sopenharmony_ci	aesenc KEY, STATE4
208362306a36Sopenharmony_ci	movaps (TKEYP), KEY
208462306a36Sopenharmony_ci	aesenc KEY, STATE1
208562306a36Sopenharmony_ci	aesenc KEY, STATE2
208662306a36Sopenharmony_ci	aesenc KEY, STATE3
208762306a36Sopenharmony_ci	aesenc KEY, STATE4
208862306a36Sopenharmony_ci	movaps 0x10(TKEYP), KEY
208962306a36Sopenharmony_ci	aesenc KEY, STATE1
209062306a36Sopenharmony_ci	aesenc KEY, STATE2
209162306a36Sopenharmony_ci	aesenc KEY, STATE3
209262306a36Sopenharmony_ci	aesenc KEY, STATE4
209362306a36Sopenharmony_ci	movaps 0x20(TKEYP), KEY
209462306a36Sopenharmony_ci	aesenc KEY, STATE1
209562306a36Sopenharmony_ci	aesenc KEY, STATE2
209662306a36Sopenharmony_ci	aesenc KEY, STATE3
209762306a36Sopenharmony_ci	aesenc KEY, STATE4
209862306a36Sopenharmony_ci	movaps 0x30(TKEYP), KEY
209962306a36Sopenharmony_ci	aesenc KEY, STATE1
210062306a36Sopenharmony_ci	aesenc KEY, STATE2
210162306a36Sopenharmony_ci	aesenc KEY, STATE3
210262306a36Sopenharmony_ci	aesenc KEY, STATE4
210362306a36Sopenharmony_ci	movaps 0x40(TKEYP), KEY
210462306a36Sopenharmony_ci	aesenc KEY, STATE1
210562306a36Sopenharmony_ci	aesenc KEY, STATE2
210662306a36Sopenharmony_ci	aesenc KEY, STATE3
210762306a36Sopenharmony_ci	aesenc KEY, STATE4
210862306a36Sopenharmony_ci	movaps 0x50(TKEYP), KEY
210962306a36Sopenharmony_ci	aesenc KEY, STATE1
211062306a36Sopenharmony_ci	aesenc KEY, STATE2
211162306a36Sopenharmony_ci	aesenc KEY, STATE3
211262306a36Sopenharmony_ci	aesenc KEY, STATE4
211362306a36Sopenharmony_ci	movaps 0x60(TKEYP), KEY
211462306a36Sopenharmony_ci	aesenc KEY, STATE1
211562306a36Sopenharmony_ci	aesenc KEY, STATE2
211662306a36Sopenharmony_ci	aesenc KEY, STATE3
211762306a36Sopenharmony_ci	aesenc KEY, STATE4
211862306a36Sopenharmony_ci	movaps 0x70(TKEYP), KEY
211962306a36Sopenharmony_ci	aesenclast KEY, STATE1		# last round
212062306a36Sopenharmony_ci	aesenclast KEY, STATE2
212162306a36Sopenharmony_ci	aesenclast KEY, STATE3
212262306a36Sopenharmony_ci	aesenclast KEY, STATE4
212362306a36Sopenharmony_ci	RET
212462306a36Sopenharmony_ciSYM_FUNC_END(_aesni_enc4)
212562306a36Sopenharmony_ci
212662306a36Sopenharmony_ci/*
212762306a36Sopenharmony_ci * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts one 16-byte block: loads it from src with an unaligned load, runs
 * _aesni_dec1 on it using the round keys at ctx+240 and writes the result to
 * dst with an unaligned store.
212862306a36Sopenharmony_ci */
212962306a36Sopenharmony_ciSYM_FUNC_START(aesni_dec)
213062306a36Sopenharmony_ci	FRAME_BEGIN
213162306a36Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit build: save KEYP and KLEN, then fetch the arguments from the stack */
213262306a36Sopenharmony_ci	pushl KEYP
213362306a36Sopenharmony_ci	pushl KLEN
213462306a36Sopenharmony_ci	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
213562306a36Sopenharmony_ci	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
213662306a36Sopenharmony_ci	movl (FRAME_OFFSET+20)(%esp), INP	# src
213762306a36Sopenharmony_ci#endif
213862306a36Sopenharmony_ci	mov 480(KEYP), KLEN		# key length
213962306a36Sopenharmony_ci	add $240, KEYP			# KEYP -> decryption round keys written by aesni_set_key
214062306a36Sopenharmony_ci	movups (INP), STATE		# input
214162306a36Sopenharmony_ci	call _aesni_dec1
214262306a36Sopenharmony_ci	movups STATE, (OUTP)		#output
214362306a36Sopenharmony_ci#ifndef __x86_64__
214462306a36Sopenharmony_ci	popl KLEN
214562306a36Sopenharmony_ci	popl KEYP
214662306a36Sopenharmony_ci#endif
214762306a36Sopenharmony_ci	FRAME_END
214862306a36Sopenharmony_ci	RET
214962306a36Sopenharmony_ciSYM_FUNC_END(aesni_dec)
215062306a36Sopenharmony_ci
215162306a36Sopenharmony_ci/*
215262306a36Sopenharmony_ci * _aesni_dec1:		internal ABI
215362306a36Sopenharmony_ci * input:
215462306a36Sopenharmony_ci *	KEYP:		key struct pointer
215562306a36Sopenharmony_ci *	KLEN:		key length
215662306a36Sopenharmony_ci *	STATE:		initial state (input)
215762306a36Sopenharmony_ci * output:
215862306a36Sopenharmony_ci *	STATE:		final state (output)
215962306a36Sopenharmony_ci * changed:
216062306a36Sopenharmony_ci *	KEY
216162306a36Sopenharmony_ci *	TKEYP (T1)
 *
 * The round keys are read starting at (KEYP) itself, so the caller must pass
 * a KEYP that already points at the decryption round keys (aesni_dec adds 240
 * before calling).  TKEYP is biased so that all three key sizes share the
 * same tail: KLEN below 24 enters at .Ldec128 (10 rounds), KLEN equal to 24
 * at .Ldec192 (12 rounds), any other value falls through for 14 rounds.
216262306a36Sopenharmony_ci */
216362306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(_aesni_dec1)
216462306a36Sopenharmony_ci	movaps (KEYP), KEY		# key
216562306a36Sopenharmony_ci	mov KEYP, TKEYP
216662306a36Sopenharmony_ci	pxor KEY, STATE		# round 0
216762306a36Sopenharmony_ci	add $0x30, TKEYP
216862306a36Sopenharmony_ci	cmp $24, KLEN
216962306a36Sopenharmony_ci	jb .Ldec128
217062306a36Sopenharmony_ci	lea 0x20(TKEYP), TKEYP		# lea leaves the flags from cmp intact for je
217162306a36Sopenharmony_ci	je .Ldec192
217262306a36Sopenharmony_ci	add $0x20, TKEYP
217362306a36Sopenharmony_ci	movaps -0x60(TKEYP), KEY
217462306a36Sopenharmony_ci	aesdec KEY, STATE
217562306a36Sopenharmony_ci	movaps -0x50(TKEYP), KEY
217662306a36Sopenharmony_ci	aesdec KEY, STATE
217762306a36Sopenharmony_ci.align 4
217862306a36Sopenharmony_ci.Ldec192:
217962306a36Sopenharmony_ci	movaps -0x40(TKEYP), KEY
218062306a36Sopenharmony_ci	aesdec KEY, STATE
218162306a36Sopenharmony_ci	movaps -0x30(TKEYP), KEY
218262306a36Sopenharmony_ci	aesdec KEY, STATE
218362306a36Sopenharmony_ci.align 4
218462306a36Sopenharmony_ci.Ldec128:
218562306a36Sopenharmony_ci	movaps -0x20(TKEYP), KEY
218662306a36Sopenharmony_ci	aesdec KEY, STATE
218762306a36Sopenharmony_ci	movaps -0x10(TKEYP), KEY
218862306a36Sopenharmony_ci	aesdec KEY, STATE
218962306a36Sopenharmony_ci	movaps (TKEYP), KEY
219062306a36Sopenharmony_ci	aesdec KEY, STATE
219162306a36Sopenharmony_ci	movaps 0x10(TKEYP), KEY
219262306a36Sopenharmony_ci	aesdec KEY, STATE
219362306a36Sopenharmony_ci	movaps 0x20(TKEYP), KEY
219462306a36Sopenharmony_ci	aesdec KEY, STATE
219562306a36Sopenharmony_ci	movaps 0x30(TKEYP), KEY
219662306a36Sopenharmony_ci	aesdec KEY, STATE
219762306a36Sopenharmony_ci	movaps 0x40(TKEYP), KEY
219862306a36Sopenharmony_ci	aesdec KEY, STATE
219962306a36Sopenharmony_ci	movaps 0x50(TKEYP), KEY
220062306a36Sopenharmony_ci	aesdec KEY, STATE
220162306a36Sopenharmony_ci	movaps 0x60(TKEYP), KEY
220262306a36Sopenharmony_ci	aesdec KEY, STATE
220362306a36Sopenharmony_ci	movaps 0x70(TKEYP), KEY
220462306a36Sopenharmony_ci	aesdeclast KEY, STATE		# last round
220562306a36Sopenharmony_ci	RET
220662306a36Sopenharmony_ciSYM_FUNC_END(_aesni_dec1)
220762306a36Sopenharmony_ci
220862306a36Sopenharmony_ci/*
220962306a36Sopenharmony_ci * _aesni_dec4:	internal ABI
221062306a36Sopenharmony_ci * input:
221162306a36Sopenharmony_ci *	KEYP:		key struct pointer
221262306a36Sopenharmony_ci *	KLEN:		key length
221362306a36Sopenharmony_ci *	STATE1:		initial state (input)
221462306a36Sopenharmony_ci *	STATE2
221562306a36Sopenharmony_ci *	STATE3
221662306a36Sopenharmony_ci *	STATE4
221762306a36Sopenharmony_ci * output:
221862306a36Sopenharmony_ci *	STATE1:		finial state (output)
221962306a36Sopenharmony_ci *	STATE2
222062306a36Sopenharmony_ci *	STATE3
222162306a36Sopenharmony_ci *	STATE4
222262306a36Sopenharmony_ci * changed:
222362306a36Sopenharmony_ci *	KEY
222462306a36Sopenharmony_ci *	TKEYP (T1)
222562306a36Sopenharmony_ci */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	mov KEYP, TKEYP
	movaps (KEYP), KEY		# round key 0
	pxor KEY, STATE1		# round 0 is a plain xor with the key
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	/*
	 * One compare, three outcomes: KLEN below 24 goes straight to the
	 * common rounds, KLEN equal to 24 runs two extra rounds first, and
	 * anything larger runs four extra.  TKEYP is advanced by 0x20 for
	 * each pair of extra rounds so that the common part can use one
	 * fixed set of displacements.
	 */
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags alive for je
	je .L4dec192
	add $0x20, TKEYP
	/* two extra rounds, only reached when KLEN is above 24 */
	.irp koff, -0x60, -0x50
	movaps \koff(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
.align 4
.L4dec192:
	/* two extra rounds, reached when KLEN is 24 or above */
	.irp koff, -0x40, -0x30
	movaps \koff(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
.align 4
.L4dec128:
	/* nine rounds shared by every key length */
	.irp koff, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \koff(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
231562306a36Sopenharmony_ci
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB encryption: every 16-byte block of src is encrypted on its own and
 * written to the same offset in dst.  Four blocks are handled per
 * iteration (_aesni_enc4) while at least 64 bytes remain, the rest one
 * block at a time (_aesni_enc1).  Only whole blocks are processed; a
 * trailing remainder shorter than 16 bytes is left untouched.
 *
 * On 32-bit the arguments are fetched from the stack and LEN, KEYP and
 * KLEN are saved on entry and restored on exit.
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN		# no full block (this also covers len == 0)
	jb .Lecb_enc_ret
	mov 480(KEYP), KLEN	/* value at ctx + 480; presumably the key length the round helpers branch on */
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# unsigned: len is a size_t
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned: len is a size_t
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
237562306a36Sopenharmony_ci
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB decryption: every 16-byte block of src is decrypted on its own and
 * written to the same offset in dst.  Four blocks are handled per
 * iteration (_aesni_dec4) while at least 64 bytes remain, the rest one
 * block at a time (_aesni_dec1).  Only whole blocks are processed; a
 * trailing remainder shorter than 16 bytes is left untouched.
 *
 * On 32-bit the arguments are fetched from the stack and LEN, KEYP and
 * KLEN are saved on entry and restored on exit.
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	cmp $16, LEN		# no full block (this also covers len == 0)
	jb .Lecb_dec_ret
	mov 480(KEYP), KLEN	/* value at ctx + 480; _aesni_dec4 branches on it */
	add $240, KEYP		/* presumably the decryption round keys inside ctx - TODO confirm */
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# unsigned: len is a size_t
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned: len is a size_t
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
243662306a36Sopenharmony_ci
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC encryption.  STATE starts out as *iv; each input block is xored
 * into STATE, STATE is encrypted with _aesni_enc1 and stored, and the
 * result chains into the next block.  Only whole 16-byte blocks are
 * processed.  After at least one block the last ciphertext block is
 * written back to *iv; with len < 16 nothing is read or written.
 *
 * On 32-bit the arguments are fetched from the stack and IVP, LEN, KEYP
 * and KLEN are saved on entry and restored on exit.
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN	/* value at ctx + 480; presumably the key length the round helpers branch on */
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# unsigned: len is a size_t
	movups STATE, (IVP)	# last ciphertext block is the next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
248062306a36Sopenharmony_ci
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC decryption.  Each ciphertext block is decrypted and xored with the
 * previous ciphertext block (*iv for the first one).  Four blocks are
 * handled per iteration (_aesni_dec4) while at least 64 bytes remain, the
 * rest one block at a time (_aesni_dec1).  Only whole 16-byte blocks are
 * processed.  After at least one block the last ciphertext block is
 * written back to *iv; with len < 16 nothing is read or written.
 *
 * All ciphertext needed for the xor step is read from src before the
 * matching plaintext is stored to dst.
 *
 * On 32-bit the arguments are fetched from the stack and IVP, LEN, KEYP
 * and KLEN are saved on entry and restored on exit.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN	/* value at ctx + 480; _aesni_dec4 branches on it */
	add $240, KEYP		/* presumably the decryption round keys inside ctx - TODO confirm */
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* the 32-bit build reuses IN1/IN2 for blocks three and four */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4	/* IN1 still holds ciphertext block three */
	movaps IN2, IV		/* IN2 still holds ciphertext block four */
	movups (INP), IN1	/* re-read blocks one and two before dst is written */
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# unsigned: len is a size_t
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		/* this ciphertext block chains into the next one */
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned: len is a size_t
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
257362306a36Sopenharmony_ci
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * CBC encryption of one full block followed by a trailing block of
 * len - 16 bytes, with the two outputs swapped (ciphertext stealing):
 * the second encryption result goes to dst[0..16) and the leading
 * len - 16 bytes of the first result go to dst[16..len).
 *
 * Two 16-byte windows of .Lcts_permute_table are used as pshufb masks:
 * the one at table + (len - 16) moves the leading bytes of a register to
 * its end, the one at table + 32 - (len - 16) moves the trailing bytes to
 * its start; in both cases the remaining bytes are zeroed.
 *
 * *iv is read but not updated (IVP is reused as a scratch pointer).
 *
 * NOTE(review): there is no length check; 16 bytes are read at src and at
 * src + len - 16, so callers presumably pass 16 < len <= 32 - confirm.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	movups (IVP), STATE		/* fetch the iv before IVP is reused */
	mov 480(KEYP), KLEN
	sub $16, LEN			/* LEN = size of the trailing block */
	mov T1, IVP
	add $32, IVP
	sub LEN, IVP			/* IVP = table + 32 - LEN */
	add LEN, T1			/* T1  = table + LEN */

	movups (INP), IN1		/* first 16 input bytes */
	add LEN, INP
	movups (INP), IN2		/* last 16 input bytes */
	movups (T1), %xmm4
	movups (IVP), %xmm5

	pxor IN1, STATE
	call _aesni_enc1		/* STATE = encryption of (iv xor first block) */

	add OUTP, LEN			/* LEN = dst + size of the trailing block */
	pshufb %xmm5, IN2		/* trailing input bytes to the front, zero padded */
	pxor STATE, IN2
	pshufb %xmm4, STATE		/* leading result bytes to the end of the register */
	movups STATE, (LEN)		/* they land at dst + 16; dst[0..16) is rewritten below */

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
263062306a36Sopenharmony_ci
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Inverse of aesni_cts_cbc_enc: src holds one full ciphertext block
 * followed by a trailing block of len - 16 bytes, stored in swapped order.
 * The first block is decrypted; its leading len - 16 bytes xored with the
 * trailing ciphertext give dst[16..len).  The trailing ciphertext is then
 * completed to 16 bytes with the remaining bytes of that first decryption,
 * decrypted, and xored with *iv to give dst[0..16).
 *
 * Two 16-byte windows of .Lcts_permute_table are used as pshufb masks, the
 * second one doubling as the pblendvb selector in %xmm0.
 *
 * *iv is read but not updated (IVP is reused as a scratch pointer).
 *
 * NOTE(review): there is no length check; 16 bytes are read at src and at
 * src + len - 16, so callers presumably pass 16 < len <= 32 - confirm.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	movups (IVP), IV		/* fetch the iv before IVP is reused */
	mov 480(KEYP), KLEN
	add $240, KEYP			/* presumably the decryption round keys inside ctx - TODO confirm */
	sub $16, LEN			/* LEN = size of the trailing block */
	mov T1, IVP
	add $32, IVP
	sub LEN, IVP			/* IVP = table + 32 - LEN */
	add LEN, T1			/* T1  = table + LEN */

	movups (INP), STATE		/* first 16 input bytes */
	add LEN, INP
	movups (INP), IN1		/* last 16 input bytes */
	movups (T1), %xmm4

	call _aesni_dec1
	add OUTP, LEN			/* LEN = dst + size of the trailing block */
	movaps STATE, IN2		/* keep the unshuffled decryption for the blend */
	pshufb %xmm4, STATE		/* leading bytes to the end of the register */
	pxor IN1, STATE

	movups STATE, (LEN)		/* tail lands at dst + 16; dst[0..16) is rewritten below */

	movups (IVP), %xmm0		/* loaded only after the store above */
	pshufb %xmm0, IN1		/* trailing ciphertext bytes to the front */
	pblendvb IN2, IN1		/* implicit %xmm0 mask: other bytes come from IN2 */
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)
269162306a36Sopenharmony_ci
.pushsection .rodata
.align 16
/*
 * pshufb masks for ciphertext stealing: the identity permutation 0..15
 * with sixteen 0x80 bytes (pshufb writes zero for those) on either side.
 * Users load an unaligned 16-byte window from somewhere inside the table.
 */
.Lcts_permute_table:
	.fill		16, 1, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.fill		16, 1, 0x80
#ifdef __x86_64__
/*
 * Byte-reversal mask (bytes 15, 14, ..., 0 in memory).  It is read with
 * movaps, and is 16-byte aligned only because the 48-byte table above
 * starts on a 16-byte boundary.
 */
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f
#endif
.popsection
270662306a36Sopenharmony_ci
270762306a36Sopenharmony_ci#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	Prepare the register state that _aesni_inc works on.
 * input:
 *	IV:		counter block, big endian
 * output:
 *	CTR:		IV byte-swapped into little endian
 *	TCTR_LOW:	low qword of CTR
 *	INC:		1 in the low qword, 0 in the high qword
 *	BSWAP_MASK:	byte-reversal mask for pshufb
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	mov $1, TCTR_LOW		/* TCTR_LOW is borrowed to build INC ... */
	movq TCTR_LOW, INC
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR
	movq CTR, TCTR_LOW		/* ... and gets its real value here */
	RET
SYM_FUNC_END(_aesni_inc_init)
272862306a36Sopenharmony_ci
/*
 * _aesni_inc:		internal ABI
 *	Add 1 to the 128-bit big-endian counter in IV.
 * input:
 *	CTR:		current counter, little endian
 *	TCTR_LOW:	low qword of CTR
 *	INC:		1 in the low qword
 *	BSWAP_MASK:	byte-reversal mask
 * output:
 *	IV:		incremented counter, big endian
 * changed:
 *	CTR:		incremented counter, little endian
 *	TCTR_LOW:	low qword of the new CTR
 *
 * INC is shifted during carry handling but is restored before returning.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	add $1, TCTR_LOW		/* scalar copy, used only to detect the carry */
	paddq INC, CTR			/* SSE add does not touch the carry flag */
	jnc .Linc_no_carry
	pslldq $8, INC			/* low qword wrapped: move the 1 to the high qword */
	paddq INC, CTR
	psrldq $8, INC			/* and move it back */
.Linc_no_carry:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV
	RET
SYM_FUNC_END(_aesni_inc)
275662306a36Sopenharmony_ci
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode: each 16-byte block of src is xored with the encryption of the
 * current counter block, and the counter is stepped with _aesni_inc after
 * every block.  Four blocks are handled per iteration (_aesni_enc4) while
 * at least 64 bytes remain, the rest one block at a time (_aesni_enc1).
 * Only whole blocks are processed.  After at least one block the next
 * counter value is written back to *iv; with len < 16 nothing is read or
 * written.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN	/* value at ctx + 480; presumably the key length the round helpers branch on */
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* four consecutive counter values, input loads interleaved */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# unsigned: len is a size_t
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned: len is a size_t
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
281962306a36Sopenharmony_ci
282062306a36Sopenharmony_ci#endif
282162306a36Sopenharmony_ci
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
/*
 * Constant the XTS routines load into GF128MUL_MASK:
 * 0x87 in the low qword, 0x01 in the high qword.
 */
.Lgf128mul_x_ble_mask:
	.quad 0x0000000000000087, 0x0000000000000001
.previous
282762306a36Sopenharmony_ci
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 *
 *	Both 64-bit halves of IV are doubled with paddq.  The bits shifted
 *	out are recovered from a sign-extended copy of IV's dwords: bit 63
 *	is carried into bit 64 (the 0x01 in the mask) and bit 127 is folded
 *	back into the low byte as 0x87.
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, KEY; \
	paddq IV, IV; \
	psrad $31, KEY; \
	pand GF128MUL_MASK, KEY; \
	pxor KEY, IV;
284562306a36Sopenharmony_ci
/*
 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption.  Every block is xored with the current tweak (IV) before
 * and after the block cipher, and the tweak is advanced with
 * _aesni_gf128mul_x_ble() after each block.  Input is consumed 64 bytes at
 * a time while possible, then 16 bytes at a time; a final partial block is
 * handled with ciphertext stealing.
 *
 * The advanced tweak is written back to *iv, except on the ciphertext
 * stealing path, which reuses IVP as a scratch pointer and returns without
 * writing it back.
 *
 * NOTE(review): when fewer than 16 bytes remain on entry to the tail code,
 * STATE4 is taken to be the previous ciphertext block, so a call with
 * len < 16 is presumably never made - confirm against the callers.
 */
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN

.Lxts_enc_loop4:
	sub $64, LEN
	jl .Lxts_enc_1x			/* fewer than 64 bytes left */

	/*
	 * For each of the four blocks: xor the input with its tweak and
	 * park the tweak in the matching dst slot until after encryption.
	 * The input block is always read before that slot is written.
	 */
	movdqu 0x00(INP), IN
	movdqa IV, STATE1
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu 0x10(INP), IN
	movdqa IV, STATE2
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu 0x20(INP), IN
	movdqa IV, STATE3
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqu 0x30(INP), IN
	movdqa IV, STATE4
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_enc4

	/* fetch each parked tweak back, apply it, store the ciphertext */
	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		/* tweak for the block after this batch */

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_enc_loop4

.Lxts_enc_ret_iv:
	movups IV, (IVP)

.Lxts_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_enc_1x:
	add $64, LEN			/* undo the failed subtraction */
	jz .Lxts_enc_ret_iv
	sub $16, LEN
	jl .Lxts_enc_cts4		/* only a partial block is left */

.Lxts_enc_loop1:
	movdqu (INP), STATE
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
	_aesni_gf128mul_x_ble()

	test LEN, LEN
	jz .Lxts_enc_out		/* that was the last block */

	add $16, INP
	sub $16, LEN
	jl .Lxts_enc_cts1		/* partial block follows; STATE is not stored yet */

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_enc_loop1

.Lxts_enc_out:
	movdqu STATE, (OUTP)
	jmp .Lxts_enc_ret_iv

.Lxts_enc_cts4:
	/* steal from the last block of the 4-way batch */
	movdqa STATE4, STATE
	sub $16, OUTP

.Lxts_enc_cts1:
	/*
	 * Here STATE is the last full ciphertext block, OUTP is where it
	 * belongs, and LEN is (bytes in the final block) - 16.
	 */
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1	/* last 16 input bytes, partial block at the end */

	mov T1, IVP
	add $32, IVP
	add LEN, T1		/* T1  = table + LEN */
	sub LEN, IVP		/* IVP = table + 32 - LEN */
	add OUTP, LEN		/* LEN = OUTP + # bytes in final block */

	movups (T1), %xmm4
	movaps STATE, IN2	/* keep the unshuffled ciphertext for the blend */
	pshufb %xmm4, STATE	/* leading bytes to the end of the register */
	movups STATE, (LEN)	/* they land at OUTP + 16 */

	movups (IVP), %xmm0
	pshufb %xmm0, IN1	/* partial input block to the front */
	pblendvb IN2, IN1	/* implicit %xmm0 mask: other bytes come from IN2 */
	movaps IN1, STATE

	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE

	movups STATE, (OUTP)
	jmp .Lxts_enc_ret
SYM_FUNC_END(aesni_xts_encrypt)
300062306a36Sopenharmony_ci
300162306a36Sopenharmony_ci/*
300262306a36Sopenharmony_ci * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
300362306a36Sopenharmony_ci *			  const u8 *src, unsigned int len, le128 *iv)
300462306a36Sopenharmony_ci */
300562306a36Sopenharmony_ciSYM_FUNC_START(aesni_xts_decrypt)
	/*
	 * XTS decryption of len bytes from src to dst, starting from the
	 * tweak stored in *iv.
	 *
	 * Flow, as implemented below:
	 *  - If len is not a multiple of 16, 16 bytes are held back from the
	 *    byte count so that the last full block is not consumed by the
	 *    bulk loops but reaches .Lxts_dec_cts1 together with the partial
	 *    tail.
	 *  - .Lxts_dec_loop4 handles 64 bytes (four blocks) per iteration
	 *    through _aesni_dec4.
	 *  - .Lxts_dec_loop1 handles the remaining full blocks one at a time
	 *    through _aesni_dec1.
	 *  - Paths that end on a block boundary store the updated tweak back
	 *    through IVP; the .Lxts_dec_cts1 path does not, because it reuses
	 *    IVP as a scratch pointer.
	 *
	 * IV holds the current tweak. _aesni_gf128mul_x_ble() is a macro
	 * defined elsewhere in this file and is invoked once per block to
	 * step IV -- presumably a multiplication by x in GF(2^128), going by
	 * its name; confirm against the macro definition.
	 *
	 * NOTE(review): len < 16 does not appear to be handled here (the
	 * single-block path would still load 16 bytes); presumably callers
	 * guarantee len >= 16 -- confirm against the glue code.
	 */
300662306a36Sopenharmony_ci	FRAME_BEGIN
300762306a36Sopenharmony_ci#ifndef __x86_64__
	/*
	 * 32-bit: save the registers used below, then fetch the five
	 * arguments from the stack. The four pushes plus the return address
	 * account for the +20 of the first argument; FRAME_OFFSET comes from
	 * <asm/frame.h> (presumably the space taken by FRAME_BEGIN).
	 */
300862306a36Sopenharmony_ci	pushl IVP
300962306a36Sopenharmony_ci	pushl LEN
301062306a36Sopenharmony_ci	pushl KEYP
301162306a36Sopenharmony_ci	pushl KLEN
301262306a36Sopenharmony_ci	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
301362306a36Sopenharmony_ci	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
301462306a36Sopenharmony_ci	movl (FRAME_OFFSET+28)(%esp), INP	# src
301562306a36Sopenharmony_ci	movl (FRAME_OFFSET+32)(%esp), LEN	# len
301662306a36Sopenharmony_ci	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
301762306a36Sopenharmony_ci	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
301862306a36Sopenharmony_ci#else
	/*
	 * 64-bit: no argument loads are done here; the register aliases
	 * presumably map straight onto the argument registers -- confirm
	 * against the register defines. The mask is addressed RIP-relative.
	 */
301962306a36Sopenharmony_ci	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
302062306a36Sopenharmony_ci#endif
	/* IV = *iv, the tweak for the first block (unaligned load) */
302162306a36Sopenharmony_ci	movups (IVP), IV
302262306a36Sopenharmony_ci
	/*
	 * Read the value at ctx+480 into KLEN and advance KEYP by 240 bytes.
	 * NOTE(review): presumably these are the key_length field and the
	 * start of the decryption key schedule in struct crypto_aes_ctx --
	 * confirm against the struct layout.
	 */
302362306a36Sopenharmony_ci	mov 480(KEYP), KLEN
302462306a36Sopenharmony_ci	add $240, KEYP
302562306a36Sopenharmony_ci
	/*
	 * A length with any of its low four bits set ends in a partial
	 * block. Take 16 off the count in that case so the loops below stop
	 * one full block early (see .Lxts_dec_cts1).
	 */
302662306a36Sopenharmony_ci	test $15, LEN
302762306a36Sopenharmony_ci	jz .Lxts_dec_loop4
302862306a36Sopenharmony_ci	sub $16, LEN
302962306a36Sopenharmony_ci
303062306a36Sopenharmony_ci.Lxts_dec_loop4:
	/* LEN -= 64; fewer than 64 bytes left -> single-block path */
303162306a36Sopenharmony_ci	sub $64, LEN
303262306a36Sopenharmony_ci	jl .Lxts_dec_1x
303362306a36Sopenharmony_ci
	/*
	 * For each of the four blocks: STATEn = input block ^ tweak, and the
	 * tweak itself is parked at the matching offset of the output buffer
	 * for the xor after decryption. Each input block is loaded before
	 * the tweak is stored at the same offset of the output.
	 */
303462306a36Sopenharmony_ci	movdqa IV, STATE1
303562306a36Sopenharmony_ci	movdqu 0x00(INP), IN
303662306a36Sopenharmony_ci	pxor IN, STATE1
303762306a36Sopenharmony_ci	movdqu IV, 0x00(OUTP)
303862306a36Sopenharmony_ci
303962306a36Sopenharmony_ci	_aesni_gf128mul_x_ble()
304062306a36Sopenharmony_ci	movdqa IV, STATE2
304162306a36Sopenharmony_ci	movdqu 0x10(INP), IN
304262306a36Sopenharmony_ci	pxor IN, STATE2
304362306a36Sopenharmony_ci	movdqu IV, 0x10(OUTP)
304462306a36Sopenharmony_ci
304562306a36Sopenharmony_ci	_aesni_gf128mul_x_ble()
304662306a36Sopenharmony_ci	movdqa IV, STATE3
304762306a36Sopenharmony_ci	movdqu 0x20(INP), IN
304862306a36Sopenharmony_ci	pxor IN, STATE3
304962306a36Sopenharmony_ci	movdqu IV, 0x20(OUTP)
305062306a36Sopenharmony_ci
305162306a36Sopenharmony_ci	_aesni_gf128mul_x_ble()
305262306a36Sopenharmony_ci	movdqa IV, STATE4
305362306a36Sopenharmony_ci	movdqu 0x30(INP), IN
305462306a36Sopenharmony_ci	pxor IN, STATE4
305562306a36Sopenharmony_ci	movdqu IV, 0x30(OUTP)
305662306a36Sopenharmony_ci
	/* decrypt STATE1..STATE4 (helper defined elsewhere in this file) */
305762306a36Sopenharmony_ci	call _aesni_dec4
305862306a36Sopenharmony_ci
	/*
	 * Reload each parked tweak from the output buffer, xor it into the
	 * decrypted block and overwrite the slot with the result.
	 */
305962306a36Sopenharmony_ci	movdqu 0x00(OUTP), IN
306062306a36Sopenharmony_ci	pxor IN, STATE1
306162306a36Sopenharmony_ci	movdqu STATE1, 0x00(OUTP)
306262306a36Sopenharmony_ci
306362306a36Sopenharmony_ci	movdqu 0x10(OUTP), IN
306462306a36Sopenharmony_ci	pxor IN, STATE2
306562306a36Sopenharmony_ci	movdqu STATE2, 0x10(OUTP)
306662306a36Sopenharmony_ci
306762306a36Sopenharmony_ci	movdqu 0x20(OUTP), IN
306862306a36Sopenharmony_ci	pxor IN, STATE3
306962306a36Sopenharmony_ci	movdqu STATE3, 0x20(OUTP)
307062306a36Sopenharmony_ci
307162306a36Sopenharmony_ci	movdqu 0x30(OUTP), IN
307262306a36Sopenharmony_ci	pxor IN, STATE4
307362306a36Sopenharmony_ci	movdqu STATE4, 0x30(OUTP)
307462306a36Sopenharmony_ci
	/* step IV once more so it is the tweak for the block after these four */
307562306a36Sopenharmony_ci	_aesni_gf128mul_x_ble()
307662306a36Sopenharmony_ci
	/* advance both pointers; LEN == 0 means everything has been processed */
307762306a36Sopenharmony_ci	add $64, INP
307862306a36Sopenharmony_ci	add $64, OUTP
307962306a36Sopenharmony_ci	test LEN, LEN
308062306a36Sopenharmony_ci	jnz .Lxts_dec_loop4
308162306a36Sopenharmony_ci
308262306a36Sopenharmony_ci.Lxts_dec_ret_iv:
	/* store the current tweak back through IVP */
308362306a36Sopenharmony_ci	movups IV, (IVP)
308462306a36Sopenharmony_ci
308562306a36Sopenharmony_ci.Lxts_dec_ret:
308662306a36Sopenharmony_ci#ifndef __x86_64__
	/* 32-bit: restore the registers pushed at entry, in reverse order */
308762306a36Sopenharmony_ci	popl KLEN
308862306a36Sopenharmony_ci	popl KEYP
308962306a36Sopenharmony_ci	popl LEN
309062306a36Sopenharmony_ci	popl IVP
309162306a36Sopenharmony_ci#endif
309262306a36Sopenharmony_ci	FRAME_END
309362306a36Sopenharmony_ci	RET
309462306a36Sopenharmony_ci
309562306a36Sopenharmony_ci.Lxts_dec_1x:
	/*
	 * Undo the subtraction done at .Lxts_dec_loop4: LEN is again the
	 * number of bytes still to process (minus the 16 held back at entry,
	 * if any). Zero means nothing is left.
	 */
309662306a36Sopenharmony_ci	add $64, LEN
309762306a36Sopenharmony_ci	jz .Lxts_dec_ret_iv
309862306a36Sopenharmony_ci
309962306a36Sopenharmony_ci.Lxts_dec_loop1:
	/* load the next block before deciding how to treat it */
310062306a36Sopenharmony_ci	movdqu (INP), STATE
310162306a36Sopenharmony_ci
	/*
	 * LEN going negative here means fewer than 16 bytes were left in the
	 * count, which for len >= 16 only happens when 16 bytes were held
	 * back at entry: STATE is then the last full block and a partial
	 * tail follows it.
	 */
310262306a36Sopenharmony_ci	add $16, INP
310362306a36Sopenharmony_ci	sub $16, LEN
310462306a36Sopenharmony_ci	jl .Lxts_dec_cts1
310562306a36Sopenharmony_ci
	/* ordinary block: xor tweak, decrypt, xor tweak, then step the tweak */
310662306a36Sopenharmony_ci	pxor IV, STATE
310762306a36Sopenharmony_ci	call _aesni_dec1
310862306a36Sopenharmony_ci	pxor IV, STATE
310962306a36Sopenharmony_ci	_aesni_gf128mul_x_ble()
311062306a36Sopenharmony_ci
	/* LEN == 0: this was the final block, store it and write back the IV */
311162306a36Sopenharmony_ci	test LEN, LEN
311262306a36Sopenharmony_ci	jz .Lxts_dec_out
311362306a36Sopenharmony_ci
311462306a36Sopenharmony_ci	movdqu STATE, (OUTP)
311562306a36Sopenharmony_ci	add $16, OUTP
311662306a36Sopenharmony_ci	jmp .Lxts_dec_loop1
311762306a36Sopenharmony_ci
311862306a36Sopenharmony_ci.Lxts_dec_out:
311962306a36Sopenharmony_ci	movdqu STATE, (OUTP)
312062306a36Sopenharmony_ci	jmp .Lxts_dec_ret_iv
312162306a36Sopenharmony_ci
312262306a36Sopenharmony_ci.Lxts_dec_cts1:
	/*
	 * Last full block followed by a partial tail. Keep the current tweak
	 * in STATE4 and step IV, so that this block is decrypted with the
	 * later tweak and the merged final block (below) with the earlier
	 * one.
	 */
312362306a36Sopenharmony_ci	movdqa IV, STATE4
312462306a36Sopenharmony_ci	_aesni_gf128mul_x_ble()
312562306a36Sopenharmony_ci
312662306a36Sopenharmony_ci	pxor IV, STATE
312762306a36Sopenharmony_ci	call _aesni_dec1
312862306a36Sopenharmony_ci	pxor IV, STATE
312962306a36Sopenharmony_ci
	/* T1 = address of .Lcts_permute_table (contents defined elsewhere) */
313062306a36Sopenharmony_ci#ifndef __x86_64__
313162306a36Sopenharmony_ci	lea .Lcts_permute_table, T1
313262306a36Sopenharmony_ci#else
313362306a36Sopenharmony_ci	lea .Lcts_permute_table(%rip), T1
313462306a36Sopenharmony_ci#endif
	/*
	 * LEN is negative here (tail length minus 16) and INP already points
	 * at the tail, so INP + LEN is 16 bytes before the end of the input.
	 * IN1 receives those final 16 input bytes.
	 */
313562306a36Sopenharmony_ci	add LEN, INP		/* rewind input pointer */
313662306a36Sopenharmony_ci	add $16, LEN		/* # bytes in final block */
313762306a36Sopenharmony_ci	movups (INP), IN1
313862306a36Sopenharmony_ci
	/*
	 * With n = number of tail bytes (now in LEN):
	 *   T1  = table + n
	 *   IVP = table + 32 - n   (IVP no longer points at *iv from here on)
	 *   LEN = OUTP + n         (LEN becomes an output address)
	 */
313962306a36Sopenharmony_ci	mov T1, IVP
314062306a36Sopenharmony_ci	add $32, IVP
314162306a36Sopenharmony_ci	add LEN, T1
314262306a36Sopenharmony_ci	sub LEN, IVP
314362306a36Sopenharmony_ci	add OUTP, LEN
314462306a36Sopenharmony_ci
	/*
	 * Keep an unpermuted copy of the decrypted block in IN2, permute
	 * STATE with the mask at table + n and store 16 bytes at OUTP + n.
	 * The part of this store that overlaps [OUTP, OUTP + 16) is
	 * overwritten by the final store below; the rest is the output tail.
	 * NOTE(review): the exact byte movement depends on the table
	 * contents -- confirm against .Lcts_permute_table.
	 */
314562306a36Sopenharmony_ci	movups (T1), %xmm4
314662306a36Sopenharmony_ci	movaps STATE, IN2
314762306a36Sopenharmony_ci	pshufb %xmm4, STATE
314862306a36Sopenharmony_ci	movups STATE, (LEN)
314962306a36Sopenharmony_ci
	/*
	 * Permute the last 16 input bytes with the mask at table + 32 - n,
	 * then merge them with the saved decrypted block. The non-VEX
	 * pblendvb takes its byte-select mask implicitly from %xmm0, which
	 * is why the mask is loaded into that register.
	 */
315062306a36Sopenharmony_ci	movups (IVP), %xmm0
315162306a36Sopenharmony_ci	pshufb %xmm0, IN1
315262306a36Sopenharmony_ci	pblendvb IN2, IN1
315362306a36Sopenharmony_ci	movaps IN1, STATE
315462306a36Sopenharmony_ci
	/* decrypt the merged block with the tweak saved in STATE4 */
315562306a36Sopenharmony_ci	pxor STATE4, STATE
315662306a36Sopenharmony_ci	call _aesni_dec1
315762306a36Sopenharmony_ci	pxor STATE4, STATE
315862306a36Sopenharmony_ci
	/*
	 * Store the last full output block, then return without the IV
	 * write-back at .Lxts_dec_ret_iv (IVP was repurposed above).
	 */
315962306a36Sopenharmony_ci	movups STATE, (OUTP)
316062306a36Sopenharmony_ci	jmp .Lxts_dec_ret
316162306a36Sopenharmony_ciSYM_FUNC_END(aesni_xts_decrypt)
3162