162306a36Sopenharmony_ci########################################################################
262306a36Sopenharmony_ci# Copyright (c) 2013, Intel Corporation
362306a36Sopenharmony_ci#
462306a36Sopenharmony_ci# This software is available to you under a choice of one of two
562306a36Sopenharmony_ci# licenses.  You may choose to be licensed under the terms of the GNU
662306a36Sopenharmony_ci# General Public License (GPL) Version 2, available from the file
762306a36Sopenharmony_ci# COPYING in the main directory of this source tree, or the
862306a36Sopenharmony_ci# OpenIB.org BSD license below:
962306a36Sopenharmony_ci#
1062306a36Sopenharmony_ci# Redistribution and use in source and binary forms, with or without
1162306a36Sopenharmony_ci# modification, are permitted provided that the following conditions are
1262306a36Sopenharmony_ci# met:
1362306a36Sopenharmony_ci#
1462306a36Sopenharmony_ci# * Redistributions of source code must retain the above copyright
1562306a36Sopenharmony_ci#   notice, this list of conditions and the following disclaimer.
1662306a36Sopenharmony_ci#
1762306a36Sopenharmony_ci# * Redistributions in binary form must reproduce the above copyright
1862306a36Sopenharmony_ci#   notice, this list of conditions and the following disclaimer in the
1962306a36Sopenharmony_ci#   documentation and/or other materials provided with the
2062306a36Sopenharmony_ci#   distribution.
2162306a36Sopenharmony_ci#
2262306a36Sopenharmony_ci# * Neither the name of the Intel Corporation nor the names of its
2362306a36Sopenharmony_ci#   contributors may be used to endorse or promote products derived from
2462306a36Sopenharmony_ci#   this software without specific prior written permission.
2562306a36Sopenharmony_ci#
2662306a36Sopenharmony_ci#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
2862306a36Sopenharmony_ci# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2962306a36Sopenharmony_ci# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
3062306a36Sopenharmony_ci# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
3162306a36Sopenharmony_ci# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3262306a36Sopenharmony_ci# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
3562306a36Sopenharmony_ci# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
3662306a36Sopenharmony_ci# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3762306a36Sopenharmony_ci# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3862306a36Sopenharmony_ci########################################################################
3962306a36Sopenharmony_ci##
4062306a36Sopenharmony_ci## Authors:
4162306a36Sopenharmony_ci##	Erdinc Ozturk <erdinc.ozturk@intel.com>
4262306a36Sopenharmony_ci##	Vinodh Gopal <vinodh.gopal@intel.com>
4362306a36Sopenharmony_ci##	James Guilford <james.guilford@intel.com>
4462306a36Sopenharmony_ci##	Tim Chen <tim.c.chen@linux.intel.com>
4562306a36Sopenharmony_ci##
4662306a36Sopenharmony_ci## References:
##       This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##			on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##			on Intel Architecture Processors. October, 2012.
5362306a36Sopenharmony_ci##
5462306a36Sopenharmony_ci## Assumptions:
5562306a36Sopenharmony_ci##
5662306a36Sopenharmony_ci##
5762306a36Sopenharmony_ci##
5862306a36Sopenharmony_ci## iv:
5962306a36Sopenharmony_ci##       0                   1                   2                   3
6062306a36Sopenharmony_ci##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
6162306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
6262306a36Sopenharmony_ci##       |                             Salt  (From the SA)               |
6362306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
6462306a36Sopenharmony_ci##       |                     Initialization Vector                     |
6562306a36Sopenharmony_ci##       |         (This is the sequence number from IPSec header)       |
6662306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
6762306a36Sopenharmony_ci##       |                              0x1                              |
6862306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
6962306a36Sopenharmony_ci##
7062306a36Sopenharmony_ci##
7162306a36Sopenharmony_ci##
7262306a36Sopenharmony_ci## AAD:
7362306a36Sopenharmony_ci##       AAD padded to 128 bits with 0
7462306a36Sopenharmony_ci##       for example, assume AAD is a u32 vector
7562306a36Sopenharmony_ci##
7662306a36Sopenharmony_ci##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
7862306a36Sopenharmony_ci##       padded AAD in xmm register = {A1 A0 0 0}
7962306a36Sopenharmony_ci##
8062306a36Sopenharmony_ci##       0                   1                   2                   3
8162306a36Sopenharmony_ci##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
8262306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
8362306a36Sopenharmony_ci##       |                               SPI (A1)                        |
8462306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
8562306a36Sopenharmony_ci##       |                     32-bit Sequence Number (A0)               |
8662306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
8762306a36Sopenharmony_ci##       |                              0x0                              |
8862306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
8962306a36Sopenharmony_ci##
9062306a36Sopenharmony_ci##                                       AAD Format with 32-bit Sequence Number
9162306a36Sopenharmony_ci##
9262306a36Sopenharmony_ci##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
9462306a36Sopenharmony_ci##       padded AAD in xmm register = {A2 A1 A0 0}
9562306a36Sopenharmony_ci##
9662306a36Sopenharmony_ci##       0                   1                   2                   3
9762306a36Sopenharmony_ci##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
9862306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
9962306a36Sopenharmony_ci##       |                               SPI (A2)                        |
10062306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
10162306a36Sopenharmony_ci##       |                 64-bit Extended Sequence Number {A1,A0}       |
10262306a36Sopenharmony_ci##       |                                                               |
10362306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
10462306a36Sopenharmony_ci##       |                              0x0                              |
10562306a36Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
10662306a36Sopenharmony_ci##
10762306a36Sopenharmony_ci##        AAD Format with 64-bit Extended Sequence Number
10862306a36Sopenharmony_ci##
10962306a36Sopenharmony_ci##
11062306a36Sopenharmony_ci## aadLen:
11162306a36Sopenharmony_ci##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
11262306a36Sopenharmony_ci##	 The code additionally supports aadLen of length 16 bytes.
11362306a36Sopenharmony_ci##
11462306a36Sopenharmony_ci## TLen:
11562306a36Sopenharmony_ci##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
11662306a36Sopenharmony_ci##
11762306a36Sopenharmony_ci## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentation are used: one tab
## for the GHASH part, two tabs for the AES part.
12062306a36Sopenharmony_ci##
12162306a36Sopenharmony_ci
12262306a36Sopenharmony_ci#include <linux/linkage.h>
12362306a36Sopenharmony_ci
# Constants in mergeable sections: every 16-byte value below gets its own
# .rodata.cst16.<name> section (flags "aM", entity size 16), so the linker
# may reorder the sections and merge identical constants.

# POLY: bits 127, 126, 121 and 0 set, i.e. the polynomial from the file
# header (x^128 + x^127 + x^126 + x^121 + 1) with the x^128 term implicit.
.section	.rodata.cst16.POLY, "aM", @progbits, 16
	.balign	16
POLY:
	.octa	0xC2000000000000000000000000000001

# POLY2: second reduction constant (its use is outside this section).
.section	.rodata.cst16.POLY2, "aM", @progbits, 16
	.balign	16
POLY2:
	.octa	0xC20000000000000000000001C2000000

# TWOONE: the value 1 in the highest and in the lowest 32-bit lane.
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
	.balign	16
TWOONE:
	.octa	0x00000001000000000000000000000001

# SHUF_MASK: vpshufb control that reverses the 16 bytes of a register.
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
	.balign	16
SHUF_MASK:
	.octa	0x000102030405060708090A0B0C0D0E0F

# ONE: the value 1 in the lowest 32-bit lane (added to the counter block).
.section	.rodata.cst16.ONE, "aM", @progbits, 16
	.balign	16
ONE:
	.octa	0x00000000000000000000000000000001

# ONEf: the value 1 in the most significant byte.
.section	.rodata.cst16.ONEf, "aM", @progbits, 16
	.balign	16
ONEf:
	.octa	0x01000000000000000000000000000000

# The three values below form one 48-byte table and must stay contiguous
# and in this order: SHIFT_MASK, then ALL_F, then 16 zero bytes.  Code
# loads 16 bytes at a variable offset inside the table (relative to
# SHIFT_MASK+16, and ALL_F-SHIFT_MASK bytes past that), so the table is
# kept in plain, non-mergeable .rodata.
.section	.rodata, "a", @progbits
	.balign	16
SHIFT_MASK:
	.octa	0x0f0e0d0c0b0a09080706050403020100
ALL_F:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0x00000000000000000000000000000000
15662306a36Sopenharmony_ci
.text

# Byte offsets of the fields in the GCM context block addressed through
# the second argument register:
#   AadHash       hash value carried between calls
#   AadLen        AAD length in bytes
#   InLen         running count of bytes processed
#   PBlockEncKey  E(K, Yn) saved for a trailing partial block
#   OrigIV        block that is encrypted as E(K, Y0) for the tag
#   CurCount      current counter block
#   PBlockLen     byte count of the trailing partial block
#define AadHash		16*0
#define AadLen		16*1
#define InLen		(16*1)+8
#define PBlockEncKey	16*2
#define OrigIV		16*3
#define CurCount	16*4
#define PBlockLen	16*5

# Hash key powers, each stored as (HashKey^n << 1) mod poly.
HashKey		= 16*6		# n = 1
HashKey_2	= 16*7		# n = 2
HashKey_3	= 16*8		# n = 3
HashKey_4	= 16*9		# n = 4
HashKey_5	= 16*10		# n = 5
HashKey_6	= 16*11		# n = 6
HashKey_7	= 16*12		# n = 7
HashKey_8	= 16*13		# n = 8

# Karatsuba helpers: XOR of the halves of (HashKey^n << 1) mod poly.
HashKey_k	= 16*14		# n = 1
HashKey_2_k	= 16*15		# n = 2
HashKey_3_k	= 16*16		# n = 3
HashKey_4_k	= 16*17		# n = 4
HashKey_5_k	= 16*18		# n = 5
HashKey_6_k	= 16*19		# n = 6
HashKey_7_k	= 16*20		# n = 7
HashKey_8_k	= 16*21		# n = 8

# Aliases for the six integer argument registers.
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9

# Memory operand 2*15*16 bytes past the pointer held in the first argument.
# NOTE(review): presumably the key-length field that follows the round
# keys; confirm against the C-side key structure.
#define keysize 2*15*16(arg1)

# Assembler variables that setreg turns into reg_i and reg_j.
i = 0
j = 0

# Selector values handed to the eight-block macro as its ordering argument.
out_order	= 0
in_order	= 1

# Direction selector values tested by the ENC_DEC macro argument.
DEC		= 0
ENC		= 1
20062306a36Sopenharmony_ci
# define_reg r n
# Binds the assembler symbol reg_<r> to the register xmm<n>.
.macro define_reg r n
	reg_\r = %xmm\n
.endm
20462306a36Sopenharmony_ci
# setreg
# Refreshes reg_i and reg_j from the current values of the assembler
# variables i and j.  The altmacro percent operator evaluates i and j to
# their numeric values before define_reg sees them, so reg_i names the xmm
# register whose number equals i, and likewise for reg_j.
.macro setreg
	.altmacro
	define_reg i %i
	define_reg j %j
	.noaltmacro
.endm
21162306a36Sopenharmony_ci
# Offsets of the 16-byte temporaries in the scratch area.
# NOTE(review): presumably relative to the aligned stack pointer set up by
# FUNC_SAVE; the uses are outside this section.
TMP1		= 16*0		# AAD
TMP2		= 16*1		# AES state 2 (state 1 lives in an xmm register)
TMP3		= 16*2		# AES state 3
TMP4		= 16*3		# AES state 4
TMP5		= 16*4		# AES state 5
TMP6		= 16*5		# AES state 6
TMP7		= 16*6		# AES state 7
TMP8		= 16*7		# AES state 8

# Size in bytes of the scratch area that FUNC_SAVE reserves.
VARIABLE_OFFSET	= 16*8
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_ci################################
22462306a36Sopenharmony_ci# Utility Macros
22562306a36Sopenharmony_ci################################
22662306a36Sopenharmony_ci
# FUNC_SAVE
# Prologue: saves r12, r13, r15 and rbp, keeps the incoming stack pointer
# in rbp, then reserves VARIABLE_OFFSET bytes of scratch space and rounds
# the stack pointer down to a 64-byte boundary.  FUNC_RESTORE undoes this.
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r15

	push	%rbp
	mov	%rsp, %rbp		# rbp keeps the unaligned stack pointer

	sub	$VARIABLE_OFFSET, %rsp	# room for the temporaries
	and	$-64, %rsp		# round down to a multiple of 64
.endm
23862306a36Sopenharmony_ci
# FUNC_RESTORE
# Epilogue matching FUNC_SAVE: drops the aligned scratch area by reloading
# the stack pointer from rbp, then restores rbp, r15, r13 and r12 in the
# reverse of the order in which they were pushed.
.macro FUNC_RESTORE
	leave				# rsp = rbp, then pop rbp

	pop	%r15
	pop	%r13
	pop	%r12
.endm
24762306a36Sopenharmony_ci
# ENCRYPT_SINGLE_BLOCK REP XMM0
# AES-encrypts the block held in XMM0 in place, using the round keys
# stored 16 bytes apart at the pointer in the first argument register:
# one vpxor with the key at offset 0, REP vaesenc rounds with the keys at
# offsets 16 to 16*REP, and a vaesenclast with the key at 16*(REP+1).
# Side effect: leaves the assembler variable i at REP+1 and refreshes
# reg_i and reg_j through setreg.
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
		vpxor	(arg1), \XMM0, \XMM0
	i = 1
	setreg
	.rep \REP
		vaesenc	16*i(arg1), \XMM0, \XMM0
	i = i + 1
	setreg
	.endr
		vaesenclast	16*i(arg1), \XMM0, \XMM0
.endm
26062306a36Sopenharmony_ci
# GCM_ENC_DEC: encrypts or decrypts the bytes of one call and folds them
# into the hash state; shared by the GCM encrypt and decrypt functions.
#
# Macro parameters:
#   INITIAL_BLOCKS              macro invoked once with the count of
#                               leading blocks (whole blocks mod 8)
#   GHASH_8_ENCRYPT_8_PARALLEL  macro invoked once per 128 bytes in the
#                               main loop
#   GHASH_LAST_8                macro invoked once after the main loop
#   GHASH_MUL                   macro handed on to PARTIAL_BLOCK
#   ENC_DEC                     ENC or DEC
#   REP                         count of vaesenc rounds, handed on to the
#                               block macros and ENCRYPT_SINGLE_BLOCK
#
# Register arguments as used here:
#   first   AES round keys (read by ENCRYPT_SINGLE_BLOCK)
#   second  GCM context (AadHash, InLen, CurCount, PBlockLen, ...)
#   third   output buffer
#   fourth  input buffer
#   fifth   byte count for this call (modified)
#
# Clobbers all xmm registers, r10, r11, r12, r13, r15 and rax.
#
# Byte counts and the counter byte are unsigned, so the three
# compare-and-branch sites on them use unsigned conditions (jae, jbe, ja).
.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8        # xmm8 = hash value kept in the context
        vmovdqu HashKey(arg2), %xmm13       # xmm13 = HashKey
        add     arg5, InLen(arg2)           # add this call to the running length

        # r11 is the byte offset into the input and output buffers
        xor     %r11d, %r11d

        # NOTE(review): PARTIAL_BLOCK is defined outside this section.  It is
        # handed r11 and xmm8 and presumably finishes a partial block left
        # over from an earlier call; confirm against its definition.
        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub     %r11, arg5                  # bytes still to be processed

        mov     arg5, %r13
        and     $-16, %r13                  # r13 = byte count rounded down to whole blocks

        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12                    # r12 = (count of whole blocks) mod 8
        jz      .L_initial_num_blocks_is_0\@

        # dispatch on r12; each target handles r12 leading blocks and takes
        # them off r13, leaving r13 a multiple of 128
        cmp     $7, %r12
        je      .L_initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      .L_initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      .L_initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      .L_initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      .L_initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      .L_initial_num_blocks_is_2\@

        jmp     .L_initial_num_blocks_is_1\@

.L_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     .L_initial_blocks_encrypted\@

.L_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


.L_initial_blocks_encrypted\@:
        test    %r13, %r13
        je      .L_zero_cipher_left\@       # r13 was zero: skip loop and final hash

        sub     $128, %r13
        je      .L_eight_cipher_left\@      # r13 was exactly 128: skip the loop

        # r15d shadows the low byte of xmm9 so the loop can tell when adding
        # eight to that byte would wrap; xmm9 is byte-swapped for the loop
        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


.L_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        ja      .L_encrypt_by_8\@           # r15b + 8 would wrap past 255

        # no wrap: run the eight-block macro with xmm9 left in swapped form
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     .L_encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     .L_eight_cipher_left\@

.L_encrypt_by_8\@:
        # wrap ahead: swap xmm9 back around the eight-block macro call
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     .L_encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


.L_eight_cipher_left\@:
        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


.L_zero_cipher_left\@:
        vmovdqu %xmm14, AadHash(arg2)       # save the hash value
        vmovdqu %xmm9, CurCount(arg2)       # save the counter block

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                   # r13 = byte count mod 16

        je      .L_multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov     %r13, PBlockLen(arg2)

        vpaddd  ONE(%rip), %xmm9, %xmm9     # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9 # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)

        cmp     $16, arg5
        jae     .L_large_enough_update\@    # byte count is unsigned

        # fewer than 16 bytes in total: READ_PARTIAL_BLOCK (defined outside
        # this section) is handed the source address, the byte count and xmm1
        lea     (arg4,%r11,1), %r10
        mov     %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        # point r12 at SHIFT_MASK+16-r13 so that the mask load after the
        # final_ghash_mul label, ALL_F-SHIFT_MASK bytes past r12, yields r13
        # bytes of 0xff followed by zero bytes (r13 = byte count mod 16)
        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12

        jmp     .L_final_ghash_mul\@

.L_large_enough_update\@:
        sub     $16, %r11
        add     %r13, %r11

        # receive the last <16 Byte block: a 16-byte load from r13-16 bytes
        # past the old r11, which leaves the r13 tail bytes in the top of xmm1
        vmovdqu (arg4, %r11, 1), %xmm1

        sub     %r13, %r11                  # put r11 back
        add     $16, %r11

        lea     SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub     %r13, %r12
        # get the appropriate shuffle mask
        vmovdqu (%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb %xmm2, %xmm1, %xmm1

.L_final_ghash_mul\@:
        .if  \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm2                # keep the input block for hashing
        vpxor   %xmm1, %xmm9, %xmm9         # input XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # mask with the low r13 bytes set
        vpand   %xmm1, %xmm9, %xmm9         # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2         # same masking for the input copy
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14       # XOR the input block into the hash

        vmovdqu %xmm14, AadHash(arg2)
        .else
        vpxor   %xmm1, %xmm9, %xmm9         # input XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # mask with the low r13 bytes set
        vpand   %xmm1, %xmm9, %xmm9         # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14       # XOR the output block into the hash

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9   # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes: one 8-byte store when more than 8 remain, then
        # single bytes taken from rax
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jbe     .L_less_than_8_bytes_left\@ # byte count is unsigned

        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

.L_less_than_8_bytes_left\@:
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     .L_less_than_8_bytes_left\@
        #############################

.L_multiple_of_16_bytes\@:
.endm
48762306a36Sopenharmony_ci
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
49162306a36Sopenharmony_ci# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
49262306a36Sopenharmony_ci.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
49362306a36Sopenharmony_ci        vmovdqu AadHash(arg2), %xmm14
49462306a36Sopenharmony_ci        vmovdqu HashKey(arg2), %xmm13
49562306a36Sopenharmony_ci
49662306a36Sopenharmony_ci        mov PBlockLen(arg2), %r12
49762306a36Sopenharmony_ci        test %r12, %r12
49862306a36Sopenharmony_ci        je .L_partial_done\@
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	#GHASH computation for the last <16 Byte block
50162306a36Sopenharmony_ci        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
50262306a36Sopenharmony_ci
50362306a36Sopenharmony_ci.L_partial_done\@:
50462306a36Sopenharmony_ci        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
50562306a36Sopenharmony_ci        shl     $3, %r12                             # convert into number of bits
50662306a36Sopenharmony_ci        vmovd   %r12d, %xmm15                        # len(A) in xmm15
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci        mov InLen(arg2), %r12
50962306a36Sopenharmony_ci        shl     $3, %r12                        # len(C) in bits  (*128)
51062306a36Sopenharmony_ci        vmovq   %r12, %xmm1
51162306a36Sopenharmony_ci        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
51262306a36Sopenharmony_ci        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
51362306a36Sopenharmony_ci
51462306a36Sopenharmony_ci        vpxor   %xmm15, %xmm14, %xmm14
51562306a36Sopenharmony_ci        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
51662306a36Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
51762306a36Sopenharmony_ci
51862306a36Sopenharmony_ci        vmovdqu OrigIV(arg2), %xmm9
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
52162306a36Sopenharmony_ci
52262306a36Sopenharmony_ci        vpxor   %xmm14, %xmm9, %xmm9
52362306a36Sopenharmony_ci
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci
52662306a36Sopenharmony_ci.L_return_T\@:
52762306a36Sopenharmony_ci        mov     \AUTH_TAG, %r10              # r10 = authTag
52862306a36Sopenharmony_ci        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci        cmp     $16, %r11
53162306a36Sopenharmony_ci        je      .L_T_16\@
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ci        cmp     $8, %r11
53462306a36Sopenharmony_ci        jl      .L_T_4\@
53562306a36Sopenharmony_ci
53662306a36Sopenharmony_ci.L_T_8\@:
53762306a36Sopenharmony_ci        vmovq   %xmm9, %rax
53862306a36Sopenharmony_ci        mov     %rax, (%r10)
53962306a36Sopenharmony_ci        add     $8, %r10
54062306a36Sopenharmony_ci        sub     $8, %r11
54162306a36Sopenharmony_ci        vpsrldq $8, %xmm9, %xmm9
54262306a36Sopenharmony_ci        test    %r11, %r11
54362306a36Sopenharmony_ci        je     .L_return_T_done\@
54462306a36Sopenharmony_ci.L_T_4\@:
54562306a36Sopenharmony_ci        vmovd   %xmm9, %eax
54662306a36Sopenharmony_ci        mov     %eax, (%r10)
54762306a36Sopenharmony_ci        add     $4, %r10
54862306a36Sopenharmony_ci        sub     $4, %r11
54962306a36Sopenharmony_ci        vpsrldq     $4, %xmm9, %xmm9
55062306a36Sopenharmony_ci        test    %r11, %r11
55162306a36Sopenharmony_ci        je     .L_return_T_done\@
55262306a36Sopenharmony_ci.L_T_123\@:
55362306a36Sopenharmony_ci        vmovd     %xmm9, %eax
55462306a36Sopenharmony_ci        cmp     $2, %r11
55562306a36Sopenharmony_ci        jl     .L_T_1\@
55662306a36Sopenharmony_ci        mov     %ax, (%r10)
55762306a36Sopenharmony_ci        cmp     $2, %r11
55862306a36Sopenharmony_ci        je     .L_return_T_done\@
55962306a36Sopenharmony_ci        add     $2, %r10
56062306a36Sopenharmony_ci        sar     $16, %eax
56162306a36Sopenharmony_ci.L_T_1\@:
56262306a36Sopenharmony_ci        mov     %al, (%r10)
56362306a36Sopenharmony_ci        jmp     .L_return_T_done\@
56462306a36Sopenharmony_ci
56562306a36Sopenharmony_ci.L_T_16\@:
56662306a36Sopenharmony_ci        vmovdqu %xmm9, (%r10)
56762306a36Sopenharmony_ci
56862306a36Sopenharmony_ci.L_return_T_done\@:
56962306a36Sopenharmony_ci.endm
57062306a36Sopenharmony_ci
# CALC_AAD_HASH: hashes the additional authenticated data and stores the
# result in AadHash(arg2).
#   GHASH_MUL - macro invoked for each multiplication by the hash key
#   AAD       - operand holding the address of the AAD
#   AADLEN    - operand holding the AAD length in bytes
#   T2        - hash key operand handed to GHASH_MUL; not written by this macro
#   T1        - staging register for partial loads and the tail mask
#   T3-T6     - handed to GHASH_MUL as temporaries
#   T7        - staging register for the current block; holds the final hash
#   T8        - running hash across the full 16-byte blocks
# Writes r10, r11, r12 and, on the 4-byte tail path, rax.
57162306a36Sopenharmony_ci.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci	mov     \AAD, %r10                      # r10 = AAD
57462306a36Sopenharmony_ci	mov     \AADLEN, %r12                      # r12 = aadLen
57562306a36Sopenharmony_ci
57662306a36Sopenharmony_ci
57762306a36Sopenharmony_ci	mov     %r12, %r11                      # r11 = working copy of the byte count
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci	vpxor   \T8, \T8, \T8                   # running hash = 0
58062306a36Sopenharmony_ci	vpxor   \T7, \T7, \T7
58162306a36Sopenharmony_ci	cmp     $16, %r11
58262306a36Sopenharmony_ci	jl      .L_get_AAD_rest8\@              # signed compare: no full block available
58362306a36Sopenharmony_ci.L_get_AAD_blocks\@:
	# One full 16-byte block per iteration: load, byte-swap, XOR into the
	# running hash and multiply by the hash key.
58462306a36Sopenharmony_ci	vmovdqu (%r10), \T7
58562306a36Sopenharmony_ci	vpshufb SHUF_MASK(%rip), \T7, \T7
58662306a36Sopenharmony_ci	vpxor   \T7, \T8, \T8
58762306a36Sopenharmony_ci	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
58862306a36Sopenharmony_ci	add     $16, %r10
58962306a36Sopenharmony_ci	sub     $16, %r12                       # r12 tracks the remaining bytes for the tail step
59062306a36Sopenharmony_ci	sub     $16, %r11
59162306a36Sopenharmony_ci	cmp     $16, %r11
59262306a36Sopenharmony_ci	jge     .L_get_AAD_blocks\@
59362306a36Sopenharmony_ci	vmovdqu \T8, \T7                        # result so far, in case nothing is left
59462306a36Sopenharmony_ci	test    %r11, %r11
59562306a36Sopenharmony_ci	je      .L_get_AAD_done\@
59662306a36Sopenharmony_ci
59762306a36Sopenharmony_ci	vpxor   \T7, \T7, \T7                   # start the tail block from zero
59862306a36Sopenharmony_ci
59962306a36Sopenharmony_ci	/* read the last <16B of AAD. since we have at least 4B of
60062306a36Sopenharmony_ci	data right after the AAD (the ICV, and maybe some CT), we can
60162306a36Sopenharmony_ci	read 4B/8B blocks safely, and then get rid of the extra stuff */
60262306a36Sopenharmony_ci.L_get_AAD_rest8\@:
	# While more than 4 bytes remain: read 8 bytes, place them in the upper
	# half and move what was gathered earlier into the lower half.
60362306a36Sopenharmony_ci	cmp     $4, %r11
60462306a36Sopenharmony_ci	jle     .L_get_AAD_rest4\@
60562306a36Sopenharmony_ci	movq    (%r10), \T1
60662306a36Sopenharmony_ci	add     $8, %r10
60762306a36Sopenharmony_ci	sub     $8, %r11
60862306a36Sopenharmony_ci	vpslldq $8, \T1, \T1
60962306a36Sopenharmony_ci	vpsrldq $8, \T7, \T7
61062306a36Sopenharmony_ci	vpxor   \T1, \T7, \T7
61162306a36Sopenharmony_ci	jmp     .L_get_AAD_rest8\@
61262306a36Sopenharmony_ci.L_get_AAD_rest4\@:
	# With 1..4 bytes remaining: read 4 bytes into the top dword and shift
	# the earlier data down by 4 bytes.
61362306a36Sopenharmony_ci	test    %r11, %r11
61462306a36Sopenharmony_ci	jle     .L_get_AAD_rest0\@
61562306a36Sopenharmony_ci	mov     (%r10), %eax
61662306a36Sopenharmony_ci	movq    %rax, \T1
61762306a36Sopenharmony_ci	add     $4, %r10
61862306a36Sopenharmony_ci	sub     $4, %r11
61962306a36Sopenharmony_ci	vpslldq $12, \T1, \T1
62062306a36Sopenharmony_ci	vpsrldq $4, \T7, \T7
62162306a36Sopenharmony_ci	vpxor   \T1, \T7, \T7
62262306a36Sopenharmony_ci.L_get_AAD_rest0\@:
62362306a36Sopenharmony_ci	/* finalize: shift out the extra bytes we read, and align
62462306a36Sopenharmony_ci	left. since pslldq can only shift by an immediate, we use
62562306a36Sopenharmony_ci	vpshufb and a pair of shuffle masks */
	# r12 still holds the number of tail bytes, so r11 = ALL_F - r12. The
	# AND mask is read 16 bytes above that address and the shuffle control
	# from that address rounded down to a multiple of 4.
	# NOTE(review): this relies on the layout of the tables around ALL_F,
	# which is defined outside this macro - confirm before moving them.
62662306a36Sopenharmony_ci	leaq	ALL_F(%rip), %r11
62762306a36Sopenharmony_ci	subq	%r12, %r11
62862306a36Sopenharmony_ci	vmovdqu	16(%r11), \T1
62962306a36Sopenharmony_ci	andq	$~3, %r11
63062306a36Sopenharmony_ci	vpshufb (%r11), \T7, \T7
63162306a36Sopenharmony_ci	vpand	\T1, \T7, \T7
63262306a36Sopenharmony_ci.L_get_AAD_rest_final\@:
63362306a36Sopenharmony_ci	vpshufb SHUF_MASK(%rip), \T7, \T7
63462306a36Sopenharmony_ci	vpxor   \T8, \T7, \T7                   # combine the tail block with the running hash
63562306a36Sopenharmony_ci	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci.L_get_AAD_done\@:
63862306a36Sopenharmony_ci        vmovdqu \T7, AadHash(arg2)              # publish the AAD hash in the context
63962306a36Sopenharmony_ci.endm
64062306a36Sopenharmony_ci
# INIT: resets the context addressed by arg2 for a new message, derives and
# stores HashKey<<1 mod poly, hashes the AAD and runs the key precomputation.
#   GHASH_MUL  - multiplication macro forwarded to CALC_AAD_HASH
#   PRECOMPUTE - macro invoked last with xmm6 as the hash key operand
# Uses arg3 (address of the IV block), arg4 (address of the hash key),
# arg5 (AAD operand) and arg6 (AAD length operand).
# Writes rax, r11 and xmm0-xmm7 directly, plus whatever CALC_AAD_HASH and
# PRECOMPUTE write.
64162306a36Sopenharmony_ci.macro INIT GHASH_MUL PRECOMPUTE
64262306a36Sopenharmony_ci        mov arg6, %r11
64362306a36Sopenharmony_ci        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
64462306a36Sopenharmony_ci        xor %r11d, %r11d       # r11 = 0 (a 32-bit write clears the whole register)
64562306a36Sopenharmony_ci        mov %r11, InLen(arg2) # ctx_data.in_length = 0
64662306a36Sopenharmony_ci
64762306a36Sopenharmony_ci        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
64862306a36Sopenharmony_ci        mov %r11, PBlockEncKey(arg2) # zero the first 8 bytes of ctx_data.partial_block_enc_key
64962306a36Sopenharmony_ci        mov arg3, %rax
65062306a36Sopenharmony_ci        movdqu (%rax), %xmm0       # legacy SSE encoding, unlike the VEX forms around it
65162306a36Sopenharmony_ci        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
65262306a36Sopenharmony_ci
65362306a36Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
65462306a36Sopenharmony_ci        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv after the SHUF_MASK shuffle
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
65962306a36Sopenharmony_ci        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        # A 128-bit left shift by one is assembled from two 64-bit shifts:
        # the top bit of each qword is extracted, the low qword carry is
        # moved into the high qword, and the bit shifted out of the high
        # qword is kept in xmm1 for the reduction.
66062306a36Sopenharmony_ci        vmovdqa  %xmm6, %xmm2
66162306a36Sopenharmony_ci        vpsllq   $1, %xmm6, %xmm6           # each qword << 1
66262306a36Sopenharmony_ci        vpsrlq   $63, %xmm2, %xmm2          # top bit of each qword
66362306a36Sopenharmony_ci        vmovdqa  %xmm2, %xmm1
66462306a36Sopenharmony_ci        vpslldq  $8, %xmm2, %xmm2           # low qword carry moves to the high qword
66562306a36Sopenharmony_ci        vpsrldq  $8, %xmm1, %xmm1           # bit shifted out of the high qword
66662306a36Sopenharmony_ci        vpor     %xmm2, %xmm6, %xmm6        # xmm6 = HashKey << 1 (128-bit)
66762306a36Sopenharmony_ci        #reduction
        # Build a mask from the shifted-out bit by comparing against TWOONE
        # and use it to conditionally XOR in POLY.
66862306a36Sopenharmony_ci        vpshufd  $0b00100100, %xmm1, %xmm2
66962306a36Sopenharmony_ci        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
67062306a36Sopenharmony_ci        vpand    POLY(%rip), %xmm2, %xmm2
67162306a36Sopenharmony_ci        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
67262306a36Sopenharmony_ci        #######################################################################
67362306a36Sopenharmony_ci        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
67462306a36Sopenharmony_ci
        # Register roles below: T1=xmm2, T2=xmm6 (hash key), T3=xmm3, T4=xmm4,
        # T5=xmm5, T6=xmm7, T7=xmm1, T8=xmm0.
67562306a36Sopenharmony_ci        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
67662306a36Sopenharmony_ci
        # xmm6 is expected to still hold HashKey<<1 mod poly at this point;
        # that depends on GHASH_MUL leaving its key operand untouched.
67762306a36Sopenharmony_ci        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
67862306a36Sopenharmony_ci.endm
67962306a36Sopenharmony_ci
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_ci# Reads DLEN bytes starting at DPTR and stores in XMMDst
68262306a36Sopenharmony_ci# where 0 < DLEN < 16
68362306a36Sopenharmony_ci# Clobbers %rax, DLEN
#   DPTR   - register holding the source address (not modified)
#   DLEN   - register holding the byte count; counted down to zero here
#   XMMDst - destination; bytes beyond DLEN are left as zero
# Exactly DLEN bytes of memory are read: at most one 8-byte load followed by
# single-byte loads, so nothing past DPTR+DLEN is touched.
68462306a36Sopenharmony_ci.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
68562306a36Sopenharmony_ci        vpxor \XMMDst, \XMMDst, \XMMDst
68662306a36Sopenharmony_ci
68762306a36Sopenharmony_ci        cmp $8, \DLEN
68862306a36Sopenharmony_ci        jl .L_read_lt8_\@                       # signed compare: fewer than 8 bytes in total
68962306a36Sopenharmony_ci        mov (\DPTR), %rax
69062306a36Sopenharmony_ci        vpinsrq $0, %rax, \XMMDst, \XMMDst      # bytes 0..7 go into the low qword
69162306a36Sopenharmony_ci        sub $8, \DLEN
69262306a36Sopenharmony_ci        jz .L_done_read_partial_block_\@        # exactly 8 bytes: finished
69362306a36Sopenharmony_ci        xor %eax, %eax
69462306a36Sopenharmony_ci.L_read_next_byte_\@:
        # Gather bytes 8..(8+DLEN-1) from the highest address downwards so
        # that rax ends up with the lowest-addressed byte in its low byte.
69562306a36Sopenharmony_ci        shl $8, %rax
69662306a36Sopenharmony_ci        mov 7(\DPTR, \DLEN, 1), %al
69762306a36Sopenharmony_ci        dec \DLEN
69862306a36Sopenharmony_ci        jnz .L_read_next_byte_\@
69962306a36Sopenharmony_ci        vpinsrq $1, %rax, \XMMDst, \XMMDst      # gathered bytes go into the high qword
70062306a36Sopenharmony_ci        jmp .L_done_read_partial_block_\@
70162306a36Sopenharmony_ci.L_read_lt8_\@:
70262306a36Sopenharmony_ci        xor %eax, %eax
70362306a36Sopenharmony_ci.L_read_next_byte_lt8_\@:
        # Same gathering scheme for bytes 0..(DLEN-1).
70462306a36Sopenharmony_ci        shl $8, %rax
70562306a36Sopenharmony_ci        mov -1(\DPTR, \DLEN, 1), %al
70662306a36Sopenharmony_ci        dec \DLEN
70762306a36Sopenharmony_ci        jnz .L_read_next_byte_lt8_\@
70862306a36Sopenharmony_ci        vpinsrq $0, %rax, \XMMDst, \XMMDst
70962306a36Sopenharmony_ci.L_done_read_partial_block_\@:
71062306a36Sopenharmony_ci.endm
71162306a36Sopenharmony_ci
71262306a36Sopenharmony_ci# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
71362306a36Sopenharmony_ci# between update calls.
71462306a36Sopenharmony_ci# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
71562306a36Sopenharmony_ci# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
71662306a36Sopenharmony_ci# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
#   GHASH_MUL      - macro invoked when the buffered block becomes complete
#   CYPH_PLAIN_OUT - output base address register
#   PLAIN_CYPH_IN  - input base address register
#   PLAIN_CYPH_LEN - number of input bytes available
#   DATA_OFFSET    - register holding the running offset; advanced by the
#                    number of bytes written
#   AAD_HASH       - xmm register holding the running hash; also stored to
#                    AadHash(arg2) before the output bytes are written
#   ENC_DEC        - selects the encrypt or decrypt variant at assembly time
# Does nothing when PBlockLen(arg2) is zero.
71762306a36Sopenharmony_ci.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
71862306a36Sopenharmony_ci        AAD_HASH ENC_DEC
71962306a36Sopenharmony_ci        mov 	PBlockLen(arg2), %r13
72062306a36Sopenharmony_ci        test	%r13, %r13
72162306a36Sopenharmony_ci        je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
72262306a36Sopenharmony_ci        # Read in input data without over reading
72362306a36Sopenharmony_ci        cmp	$16, \PLAIN_CYPH_LEN
72462306a36Sopenharmony_ci        jl	.L_fewer_than_16_bytes_\@
        # NOTE(review): this load does not add DATA_OFFSET, unlike the short
        # path below; presumably DATA_OFFSET is 0 on entry - confirm with callers.
72562306a36Sopenharmony_ci        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
72662306a36Sopenharmony_ci        jmp	.L_data_read_\@
72762306a36Sopenharmony_ci
72862306a36Sopenharmony_ci.L_fewer_than_16_bytes_\@:
72962306a36Sopenharmony_ci        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
73062306a36Sopenharmony_ci        mov	\PLAIN_CYPH_LEN, %r12
73162306a36Sopenharmony_ci        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
73262306a36Sopenharmony_ci
73362306a36Sopenharmony_ci        mov PBlockLen(arg2), %r13		# reload the partial-block length
73462306a36Sopenharmony_ci
73562306a36Sopenharmony_ci.L_data_read_\@:				# Finished reading in data
73662306a36Sopenharmony_ci
73762306a36Sopenharmony_ci        vmovdqu	PBlockEncKey(arg2), %xmm9
73862306a36Sopenharmony_ci        vmovdqu	HashKey(arg2), %xmm13
73962306a36Sopenharmony_ci
74062306a36Sopenharmony_ci        lea	SHIFT_MASK(%rip), %r12
74162306a36Sopenharmony_ci
74262306a36Sopenharmony_ci        # adjust the shuffle mask pointer to be able to shift r13 bytes
74362306a36Sopenharmony_ci        # (r13 = PBlockLen, the partial-block length stored in the context)
74462306a36Sopenharmony_ci        add	%r13, %r12
74562306a36Sopenharmony_ci        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
74662306a36Sopenharmony_ci        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci.if  \ENC_DEC ==  DEC
74962306a36Sopenharmony_ci        vmovdqa	%xmm1, %xmm3		# keep the input ciphertext for hashing
75062306a36Sopenharmony_ci        pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
75162306a36Sopenharmony_ci
75262306a36Sopenharmony_ci        mov	\PLAIN_CYPH_LEN, %r10
75362306a36Sopenharmony_ci        add	%r13, %r10
75462306a36Sopenharmony_ci        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
75562306a36Sopenharmony_ci        sub	$16, %r10
75662306a36Sopenharmony_ci        # Determine if partial block is not being filled and
75762306a36Sopenharmony_ci        # shift mask accordingly
75862306a36Sopenharmony_ci        jge	.L_no_extra_mask_1_\@	# uses the flags from the sub above
75962306a36Sopenharmony_ci        sub	%r10, %r12		# r10 < 0 here, so this moves r12 forward by the shortfall
76062306a36Sopenharmony_ci.L_no_extra_mask_1_\@:
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_ci        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
76362306a36Sopenharmony_ci        # get the appropriate mask to mask out bottom r13 bytes of xmm9
76462306a36Sopenharmony_ci        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci        vpand	%xmm1, %xmm3, %xmm3		# same mask applied to the saved ciphertext
76762306a36Sopenharmony_ci        vmovdqa	SHUF_MASK(%rip), %xmm10
76862306a36Sopenharmony_ci        vpshufb	%xmm10, %xmm3, %xmm3
76962306a36Sopenharmony_ci        vpshufb	%xmm2, %xmm3, %xmm3
77062306a36Sopenharmony_ci        vpxor	%xmm3, \AAD_HASH, \AAD_HASH	# fold the ciphertext bytes into the hash
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci        test	%r10, %r10
77362306a36Sopenharmony_ci        jl	.L_partial_incomplete_1_\@	# block still not full: defer the multiply
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci        # GHASH computation for the last <16 Byte block
77662306a36Sopenharmony_ci        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
77762306a36Sopenharmony_ci        xor	%eax,%eax
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci        mov	%rax, PBlockLen(arg2)		# block completed: partial length = 0
78062306a36Sopenharmony_ci        jmp	.L_dec_done_\@
78162306a36Sopenharmony_ci.L_partial_incomplete_1_\@:
78262306a36Sopenharmony_ci        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)	# partial length grows by the bytes consumed
78362306a36Sopenharmony_ci.L_dec_done_\@:
78462306a36Sopenharmony_ci        vmovdqu	\AAD_HASH, AadHash(arg2)
78562306a36Sopenharmony_ci.else
78662306a36Sopenharmony_ci        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
78762306a36Sopenharmony_ci
78862306a36Sopenharmony_ci        mov	\PLAIN_CYPH_LEN, %r10
78962306a36Sopenharmony_ci        add	%r13, %r10
79062306a36Sopenharmony_ci        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
79162306a36Sopenharmony_ci        sub	$16, %r10
79262306a36Sopenharmony_ci        # Determine if partial block is not being filled and
79362306a36Sopenharmony_ci        # shift mask accordingly
79462306a36Sopenharmony_ci        jge	.L_no_extra_mask_2_\@	# uses the flags from the sub above
79562306a36Sopenharmony_ci        sub	%r10, %r12		# r10 < 0 here, so this moves r12 forward by the shortfall
79662306a36Sopenharmony_ci.L_no_extra_mask_2_\@:
79762306a36Sopenharmony_ci
79862306a36Sopenharmony_ci        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
79962306a36Sopenharmony_ci        # get the appropriate mask to mask out bottom r13 bytes of xmm9
80062306a36Sopenharmony_ci        vpand	%xmm1, %xmm9, %xmm9
80162306a36Sopenharmony_ci
80262306a36Sopenharmony_ci        vmovdqa	SHUF_MASK(%rip), %xmm1
80362306a36Sopenharmony_ci        vpshufb %xmm1, %xmm9, %xmm9
80462306a36Sopenharmony_ci        vpshufb %xmm2, %xmm9, %xmm9
80562306a36Sopenharmony_ci        vpxor	%xmm9, \AAD_HASH, \AAD_HASH	# fold the new ciphertext bytes into the hash
80662306a36Sopenharmony_ci
80762306a36Sopenharmony_ci        test	%r10, %r10
80862306a36Sopenharmony_ci        jl	.L_partial_incomplete_2_\@	# block still not full: defer the multiply
80962306a36Sopenharmony_ci
81062306a36Sopenharmony_ci        # GHASH computation for the last <16 Byte block
81162306a36Sopenharmony_ci        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
81262306a36Sopenharmony_ci        xor	%eax,%eax
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_ci        mov	%rax, PBlockLen(arg2)		# block completed: partial length = 0
81562306a36Sopenharmony_ci        jmp	.L_encode_done_\@
81662306a36Sopenharmony_ci.L_partial_incomplete_2_\@:
81762306a36Sopenharmony_ci        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)	# partial length grows by the bytes consumed
81862306a36Sopenharmony_ci.L_encode_done_\@:
81962306a36Sopenharmony_ci        vmovdqu	\AAD_HASH, AadHash(arg2)
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci        vmovdqa	SHUF_MASK(%rip), %xmm10
82262306a36Sopenharmony_ci        # shuffle xmm9 back to output as ciphertext
82362306a36Sopenharmony_ci        vpshufb	%xmm10, %xmm9, %xmm9
82462306a36Sopenharmony_ci        vpshufb	%xmm2, %xmm9, %xmm9
82562306a36Sopenharmony_ci.endif
82662306a36Sopenharmony_ci        # output encrypted Bytes
82762306a36Sopenharmony_ci        test	%r10, %r10
82862306a36Sopenharmony_ci        jl	.L_partial_fill_\@
82962306a36Sopenharmony_ci        mov	%r13, %r12
83062306a36Sopenharmony_ci        mov	$16, %r13
83162306a36Sopenharmony_ci        # Set r13 to be the number of bytes to write out
83262306a36Sopenharmony_ci        sub	%r12, %r13		# block was completed: 16 - old partial length
83362306a36Sopenharmony_ci        jmp	.L_count_set_\@
83462306a36Sopenharmony_ci.L_partial_fill_\@:
83562306a36Sopenharmony_ci        mov	\PLAIN_CYPH_LEN, %r13	# block not completed: every input byte is written
83662306a36Sopenharmony_ci.L_count_set_\@:
83762306a36Sopenharmony_ci        vmovdqa	%xmm9, %xmm0
83862306a36Sopenharmony_ci        vmovq	%xmm0, %rax
83962306a36Sopenharmony_ci        cmp	$8, %r13
84062306a36Sopenharmony_ci        jle	.L_less_than_8_bytes_left_\@	# 8 or fewer bytes go out one at a time
84162306a36Sopenharmony_ci
84262306a36Sopenharmony_ci        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
84362306a36Sopenharmony_ci        add	$8, \DATA_OFFSET
84462306a36Sopenharmony_ci        psrldq	$8, %xmm0
84562306a36Sopenharmony_ci        vmovq	%xmm0, %rax
84662306a36Sopenharmony_ci        sub	$8, %r13
84762306a36Sopenharmony_ci.L_less_than_8_bytes_left_\@:
        # Byte-wise store of the remaining r13 bytes held in rax.
84862306a36Sopenharmony_ci        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
84962306a36Sopenharmony_ci        add	$1, \DATA_OFFSET
85062306a36Sopenharmony_ci        shr	$8, %rax
85162306a36Sopenharmony_ci        sub	$1, %r13
85262306a36Sopenharmony_ci        jne	.L_less_than_8_bytes_left_\@
85362306a36Sopenharmony_ci.L_partial_block_done_\@:
85462306a36Sopenharmony_ci.endm # PARTIAL_BLOCK
85562306a36Sopenharmony_ci
85662306a36Sopenharmony_ci###############################################################################
85762306a36Sopenharmony_ci# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
85862306a36Sopenharmony_ci# Input: A and B (128-bits each, bit-reflected)
85962306a36Sopenharmony_ci# Output: C = A*B*x mod poly, (i.e. >>1 )
86062306a36Sopenharmony_ci# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
86162306a36Sopenharmony_ci# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
#   GH    - first operand; overwritten with the reduced product
#   HK    - second operand; only read
#   T1-T5 - temporaries, all overwritten
# The 256-bit product is formed from three carry-less multiplies (high
# halves, low halves, and the sums of the halves) and then reduced in two
# shift-and-XOR phases.
86262306a36Sopenharmony_ci###############################################################################
86362306a36Sopenharmony_ci.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
86462306a36Sopenharmony_ci
86562306a36Sopenharmony_ci        vpshufd         $0b01001110, \GH, \T2   # T2 = GH with its qwords swapped
86662306a36Sopenharmony_ci        vpshufd         $0b01001110, \HK, \T3   # T3 = HK with its qwords swapped
86762306a36Sopenharmony_ci        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
86862306a36Sopenharmony_ci        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
86962306a36Sopenharmony_ci
87062306a36Sopenharmony_ci        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
87162306a36Sopenharmony_ci        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
87262306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
87362306a36Sopenharmony_ci        vpxor           \GH, \T2,\T2
87462306a36Sopenharmony_ci        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
87762306a36Sopenharmony_ci        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
87862306a36Sopenharmony_ci        vpxor           \T3, \GH, \GH
87962306a36Sopenharmony_ci        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ci        #first phase of the reduction
88262306a36Sopenharmony_ci        vpslld  $31, \GH, \T2                   # packed dword left shift << 31
88362306a36Sopenharmony_ci        vpslld  $30, \GH, \T3                   # packed dword left shift << 30
88462306a36Sopenharmony_ci        vpslld  $25, \GH, \T4                   # packed dword left shift << 25
88562306a36Sopenharmony_ci
88662306a36Sopenharmony_ci        vpxor   \T3, \T2, \T2                   # xor the shifted versions
88762306a36Sopenharmony_ci        vpxor   \T4, \T2, \T2
88862306a36Sopenharmony_ci
88962306a36Sopenharmony_ci        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
89062306a36Sopenharmony_ci
89162306a36Sopenharmony_ci        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
89262306a36Sopenharmony_ci        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
89362306a36Sopenharmony_ci
89462306a36Sopenharmony_ci        #second phase of the reduction
89562306a36Sopenharmony_ci
89662306a36Sopenharmony_ci        vpsrld  $1,\GH, \T2                     # packed dword right shift >> 1
89762306a36Sopenharmony_ci        vpsrld  $2,\GH, \T3                     # packed dword right shift >> 2
89862306a36Sopenharmony_ci        vpsrld  $7,\GH, \T4                     # packed dword right shift >> 7
89962306a36Sopenharmony_ci        vpxor   \T3, \T2, \T2                   # xor the shifted versions
90062306a36Sopenharmony_ci        vpxor   \T4, \T2, \T2
90162306a36Sopenharmony_ci
90262306a36Sopenharmony_ci        vpxor   \T5, \T2, \T2                   # add the bits saved during the first phase
90362306a36Sopenharmony_ci        vpxor   \T2, \GH, \GH
90462306a36Sopenharmony_ci        vpxor   \T1, \GH, \GH                   # the result is in GH
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci
90762306a36Sopenharmony_ci.endm
90862306a36Sopenharmony_ci
# PRECOMPUTE_AVX: fills the key table in the context addressed by arg2.
#   HK    - hash key operand (HashKey<<1 mod poly per the comments below);
#           only read
#   T1-T6 - temporaries, all overwritten
# Stores HashKey_k, then for i = 2..8 stores HashKey_i (the previous power
# multiplied by HK with GHASH_MUL_AVX) and HashKey_i_k.
90962306a36Sopenharmony_ci.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
91062306a36Sopenharmony_ci
91162306a36Sopenharmony_ci        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
91262306a36Sopenharmony_ci        vmovdqa  \HK, \T5                                # T5 = running power, starts at HK
91362306a36Sopenharmony_ci
91462306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1                   # swap the two qwords
91562306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1                           # each half = high XOR low
91662306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_k(arg2)
91762306a36Sopenharmony_ci
91862306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
91962306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
92062306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
92162306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
92262306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_2_k(arg2)
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
92562306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_3(arg2)
92662306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
92762306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
92862306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_3_k(arg2)
92962306a36Sopenharmony_ci
93062306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
93162306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_4(arg2)
93262306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
93362306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
93462306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_4_k(arg2)
93562306a36Sopenharmony_ci
93662306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
93762306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_5(arg2)
93862306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
93962306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
94062306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_5_k(arg2)
94162306a36Sopenharmony_ci
94262306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
94362306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_6(arg2)
94462306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
94562306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
94662306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_6_k(arg2)
94762306a36Sopenharmony_ci
94862306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
94962306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_7(arg2)
95062306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
95162306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
95262306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_7_k(arg2)
95362306a36Sopenharmony_ci
95462306a36Sopenharmony_ci        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
95562306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_8(arg2)
95662306a36Sopenharmony_ci        vpshufd  $0b01001110, \T5, \T1
95762306a36Sopenharmony_ci        vpxor    \T5, \T1, \T1
95862306a36Sopenharmony_ci        vmovdqu  \T1, HashKey_8_k(arg2)
95962306a36Sopenharmony_ci
96062306a36Sopenharmony_ci.endm
96162306a36Sopenharmony_ci
96262306a36Sopenharmony_ci## if a = number of total plaintext bytes
96362306a36Sopenharmony_ci## b = floor(a/16)
96462306a36Sopenharmony_ci## num_initial_blocks = b mod 4#
96562306a36Sopenharmony_ci## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
96662306a36Sopenharmony_ci## r10, r11, r12, rax are clobbered
96762306a36Sopenharmony_ci## arg1, arg2, arg3, arg4 are used as pointers only, not modified
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
97062306a36Sopenharmony_ci	i = (8-\num_initial_blocks)
97162306a36Sopenharmony_ci	setreg
97262306a36Sopenharmony_ci        vmovdqu AadHash(arg2), reg_i
97362306a36Sopenharmony_ci
97462306a36Sopenharmony_ci	# start AES for num_initial_blocks blocks
97562306a36Sopenharmony_ci	vmovdqu CurCount(arg2), \CTR
97662306a36Sopenharmony_ci
97762306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
97862306a36Sopenharmony_ci	setreg
97962306a36Sopenharmony_ci.rep \num_initial_blocks
98062306a36Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
98162306a36Sopenharmony_ci                vmovdqa \CTR, reg_i
98262306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
98362306a36Sopenharmony_ci	i = (i+1)
98462306a36Sopenharmony_ci	setreg
98562306a36Sopenharmony_ci.endr
98662306a36Sopenharmony_ci
98762306a36Sopenharmony_ci	vmovdqa  (arg1), \T_key
98862306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
98962306a36Sopenharmony_ci	setreg
99062306a36Sopenharmony_ci.rep \num_initial_blocks
99162306a36Sopenharmony_ci                vpxor   \T_key, reg_i, reg_i
99262306a36Sopenharmony_ci	i = (i+1)
99362306a36Sopenharmony_ci	setreg
99462306a36Sopenharmony_ci.endr
99562306a36Sopenharmony_ci
99662306a36Sopenharmony_ci       j = 1
99762306a36Sopenharmony_ci       setreg
99862306a36Sopenharmony_ci.rep \REP
99962306a36Sopenharmony_ci       vmovdqa  16*j(arg1), \T_key
100062306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
100162306a36Sopenharmony_ci	setreg
100262306a36Sopenharmony_ci.rep \num_initial_blocks
100362306a36Sopenharmony_ci        vaesenc \T_key, reg_i, reg_i
100462306a36Sopenharmony_ci	i = (i+1)
100562306a36Sopenharmony_ci	setreg
100662306a36Sopenharmony_ci.endr
100762306a36Sopenharmony_ci
100862306a36Sopenharmony_ci       j = (j+1)
100962306a36Sopenharmony_ci       setreg
101062306a36Sopenharmony_ci.endr
101162306a36Sopenharmony_ci
101262306a36Sopenharmony_ci	vmovdqa  16*j(arg1), \T_key
101362306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
101462306a36Sopenharmony_ci	setreg
101562306a36Sopenharmony_ci.rep \num_initial_blocks
101662306a36Sopenharmony_ci        vaesenclast      \T_key, reg_i, reg_i
101762306a36Sopenharmony_ci	i = (i+1)
101862306a36Sopenharmony_ci	setreg
101962306a36Sopenharmony_ci.endr
102062306a36Sopenharmony_ci
102162306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
102262306a36Sopenharmony_ci	setreg
102362306a36Sopenharmony_ci.rep \num_initial_blocks
102462306a36Sopenharmony_ci                vmovdqu (arg4, %r11), \T1
102562306a36Sopenharmony_ci                vpxor   \T1, reg_i, reg_i
102662306a36Sopenharmony_ci                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
102762306a36Sopenharmony_ci                add     $16, %r11
102862306a36Sopenharmony_ci.if  \ENC_DEC == DEC
102962306a36Sopenharmony_ci                vmovdqa \T1, reg_i
103062306a36Sopenharmony_ci.endif
103162306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
103262306a36Sopenharmony_ci	i = (i+1)
103362306a36Sopenharmony_ci	setreg
103462306a36Sopenharmony_ci.endr
103562306a36Sopenharmony_ci
103662306a36Sopenharmony_ci
103762306a36Sopenharmony_ci	i = (8-\num_initial_blocks)
103862306a36Sopenharmony_ci	j = (9-\num_initial_blocks)
103962306a36Sopenharmony_ci	setreg
104062306a36Sopenharmony_ci
104162306a36Sopenharmony_ci.rep \num_initial_blocks
104262306a36Sopenharmony_ci        vpxor    reg_i, reg_j, reg_j
104362306a36Sopenharmony_ci        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
104462306a36Sopenharmony_ci	i = (i+1)
104562306a36Sopenharmony_ci	j = (j+1)
104662306a36Sopenharmony_ci	setreg
104762306a36Sopenharmony_ci.endr
104862306a36Sopenharmony_ci        # XMM8 has the combined result here
104962306a36Sopenharmony_ci
105062306a36Sopenharmony_ci        vmovdqa  \XMM8, TMP1(%rsp)
105162306a36Sopenharmony_ci        vmovdqa  \XMM8, \T3
105262306a36Sopenharmony_ci
105362306a36Sopenharmony_ci        cmp     $128, %r13
105462306a36Sopenharmony_ci        jl      .L_initial_blocks_done\@                  # no need for precomputed constants
105562306a36Sopenharmony_ci
105662306a36Sopenharmony_ci###############################################################################
105762306a36Sopenharmony_ci# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
105862306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
105962306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM1
106062306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
106162306a36Sopenharmony_ci
106262306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
106362306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM2
106462306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
106562306a36Sopenharmony_ci
106662306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
106762306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM3
106862306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
106962306a36Sopenharmony_ci
107062306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
107162306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM4
107262306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
107362306a36Sopenharmony_ci
107462306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
107562306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM5
107662306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
107762306a36Sopenharmony_ci
107862306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
107962306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM6
108062306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
108162306a36Sopenharmony_ci
108262306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
108362306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM7
108462306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
108562306a36Sopenharmony_ci
108662306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
108762306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM8
108862306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
108962306a36Sopenharmony_ci
109062306a36Sopenharmony_ci                vmovdqa  (arg1), \T_key
109162306a36Sopenharmony_ci                vpxor    \T_key, \XMM1, \XMM1
109262306a36Sopenharmony_ci                vpxor    \T_key, \XMM2, \XMM2
109362306a36Sopenharmony_ci                vpxor    \T_key, \XMM3, \XMM3
109462306a36Sopenharmony_ci                vpxor    \T_key, \XMM4, \XMM4
109562306a36Sopenharmony_ci                vpxor    \T_key, \XMM5, \XMM5
109662306a36Sopenharmony_ci                vpxor    \T_key, \XMM6, \XMM6
109762306a36Sopenharmony_ci                vpxor    \T_key, \XMM7, \XMM7
109862306a36Sopenharmony_ci                vpxor    \T_key, \XMM8, \XMM8
109962306a36Sopenharmony_ci
110062306a36Sopenharmony_ci               i = 1
110162306a36Sopenharmony_ci               setreg
110262306a36Sopenharmony_ci.rep    \REP       # do REP rounds
110362306a36Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
110462306a36Sopenharmony_ci                vaesenc  \T_key, \XMM1, \XMM1
110562306a36Sopenharmony_ci                vaesenc  \T_key, \XMM2, \XMM2
110662306a36Sopenharmony_ci                vaesenc  \T_key, \XMM3, \XMM3
110762306a36Sopenharmony_ci                vaesenc  \T_key, \XMM4, \XMM4
110862306a36Sopenharmony_ci                vaesenc  \T_key, \XMM5, \XMM5
110962306a36Sopenharmony_ci                vaesenc  \T_key, \XMM6, \XMM6
111062306a36Sopenharmony_ci                vaesenc  \T_key, \XMM7, \XMM7
111162306a36Sopenharmony_ci                vaesenc  \T_key, \XMM8, \XMM8
111262306a36Sopenharmony_ci               i = (i+1)
111362306a36Sopenharmony_ci               setreg
111462306a36Sopenharmony_ci.endr
111562306a36Sopenharmony_ci
111662306a36Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
111762306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM1, \XMM1
111862306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM2, \XMM2
111962306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM3, \XMM3
112062306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM4, \XMM4
112162306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM5, \XMM5
112262306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM6, \XMM6
112362306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM7, \XMM7
112462306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM8, \XMM8
112562306a36Sopenharmony_ci
112662306a36Sopenharmony_ci                vmovdqu  (arg4, %r11), \T1
112762306a36Sopenharmony_ci                vpxor    \T1, \XMM1, \XMM1
112862306a36Sopenharmony_ci                vmovdqu  \XMM1, (arg3 , %r11)
112962306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
113062306a36Sopenharmony_ci                vmovdqa  \T1, \XMM1
113162306a36Sopenharmony_ci                .endif
113262306a36Sopenharmony_ci
113362306a36Sopenharmony_ci                vmovdqu  16*1(arg4, %r11), \T1
113462306a36Sopenharmony_ci                vpxor    \T1, \XMM2, \XMM2
113562306a36Sopenharmony_ci                vmovdqu  \XMM2, 16*1(arg3 , %r11)
113662306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
113762306a36Sopenharmony_ci                vmovdqa  \T1, \XMM2
113862306a36Sopenharmony_ci                .endif
113962306a36Sopenharmony_ci
114062306a36Sopenharmony_ci                vmovdqu  16*2(arg4, %r11), \T1
114162306a36Sopenharmony_ci                vpxor    \T1, \XMM3, \XMM3
114262306a36Sopenharmony_ci                vmovdqu  \XMM3, 16*2(arg3 , %r11)
114362306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
114462306a36Sopenharmony_ci                vmovdqa  \T1, \XMM3
114562306a36Sopenharmony_ci                .endif
114662306a36Sopenharmony_ci
114762306a36Sopenharmony_ci                vmovdqu  16*3(arg4, %r11), \T1
114862306a36Sopenharmony_ci                vpxor    \T1, \XMM4, \XMM4
114962306a36Sopenharmony_ci                vmovdqu  \XMM4, 16*3(arg3 , %r11)
115062306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
115162306a36Sopenharmony_ci                vmovdqa  \T1, \XMM4
115262306a36Sopenharmony_ci                .endif
115362306a36Sopenharmony_ci
115462306a36Sopenharmony_ci                vmovdqu  16*4(arg4, %r11), \T1
115562306a36Sopenharmony_ci                vpxor    \T1, \XMM5, \XMM5
115662306a36Sopenharmony_ci                vmovdqu  \XMM5, 16*4(arg3 , %r11)
115762306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
115862306a36Sopenharmony_ci                vmovdqa  \T1, \XMM5
115962306a36Sopenharmony_ci                .endif
116062306a36Sopenharmony_ci
116162306a36Sopenharmony_ci                vmovdqu  16*5(arg4, %r11), \T1
116262306a36Sopenharmony_ci                vpxor    \T1, \XMM6, \XMM6
116362306a36Sopenharmony_ci                vmovdqu  \XMM6, 16*5(arg3 , %r11)
116462306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
116562306a36Sopenharmony_ci                vmovdqa  \T1, \XMM6
116662306a36Sopenharmony_ci                .endif
116762306a36Sopenharmony_ci
116862306a36Sopenharmony_ci                vmovdqu  16*6(arg4, %r11), \T1
116962306a36Sopenharmony_ci                vpxor    \T1, \XMM7, \XMM7
117062306a36Sopenharmony_ci                vmovdqu  \XMM7, 16*6(arg3 , %r11)
117162306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
117262306a36Sopenharmony_ci                vmovdqa  \T1, \XMM7
117362306a36Sopenharmony_ci                .endif
117462306a36Sopenharmony_ci
117562306a36Sopenharmony_ci                vmovdqu  16*7(arg4, %r11), \T1
117662306a36Sopenharmony_ci                vpxor    \T1, \XMM8, \XMM8
117762306a36Sopenharmony_ci                vmovdqu  \XMM8, 16*7(arg3 , %r11)
117862306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
117962306a36Sopenharmony_ci                vmovdqa  \T1, \XMM8
118062306a36Sopenharmony_ci                .endif
118162306a36Sopenharmony_ci
118262306a36Sopenharmony_ci                add     $128, %r11
118362306a36Sopenharmony_ci
118462306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
118562306a36Sopenharmony_ci                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
118662306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
118762306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
118862306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
118962306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
119062306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
119162306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
119262306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
119362306a36Sopenharmony_ci
119462306a36Sopenharmony_ci###############################################################################
119562306a36Sopenharmony_ci
119662306a36Sopenharmony_ci.L_initial_blocks_done\@:
119762306a36Sopenharmony_ci
119862306a36Sopenharmony_ci.endm
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci# encrypt 8 blocks at a time
120162306a36Sopenharmony_ci# ghash the 8 previously encrypted ciphertext blocks
120262306a36Sopenharmony_ci# arg1, arg2, arg3, arg4 are used as pointers only, not modified
120362306a36Sopenharmony_ci# r11 is the data offset value
#
# Parameters, as used by the body below:
#   REP       - number of vaesenc rounds before the final vaesenclast;
#               rounds 1..9 are unrolled, the remaining REP-9 run in a .rep
#   T1..T7    - scratch XMM registers (every one of them is overwritten)
#   CTR       - counter block; left holding the eighth new counter value
#   XMM1..8   - in:  the 8 blocks to be GHASHed
#               out: the 8 blocks for the next GHASH pass, byte-swapped with
#                    SHUF_MASK, with the reduced GHASH result XORed into XMM1
#   loop_idx  - in_order: step the counter with ONE and byte-swap it with
#               SHUF_MASK before encrypting; otherwise step with ONEf and
#               skip the swap
#   ENC_DEC   - ENC or DEC; selects how the final round result is stored
#
# Memory operands: (arg1) is read as the round-key array in 16-byte steps,
# arg2 is the base for the HashKey* tables, arg4+r11 is the input and
# arg3+r11 the output. r11 itself is not advanced by this macro.
# TMP2..TMP8 on the stack are accessed with vmovdqa and so must be 16-byte
# aligned.
# The final round writes reg_j (set up by setreg, defined elsewhere) and the
# code afterwards reads XMM1..XMM8, so it relies on reg_1..reg_8 naming the
# same registers that the caller passes as XMM1..XMM8.
120462306a36Sopenharmony_ci.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
120562306a36Sopenharmony_ci
        # Save the 8 blocks to be hashed: the first stays in T2, the other
        # seven go to stack slots, freeing XMM1..XMM8 for the new counters.
120662306a36Sopenharmony_ci        vmovdqa \XMM1, \T2
120762306a36Sopenharmony_ci        vmovdqa \XMM2, TMP2(%rsp)
120862306a36Sopenharmony_ci        vmovdqa \XMM3, TMP3(%rsp)
120962306a36Sopenharmony_ci        vmovdqa \XMM4, TMP4(%rsp)
121062306a36Sopenharmony_ci        vmovdqa \XMM5, TMP5(%rsp)
121162306a36Sopenharmony_ci        vmovdqa \XMM6, TMP6(%rsp)
121262306a36Sopenharmony_ci        vmovdqa \XMM7, TMP7(%rsp)
121362306a36Sopenharmony_ci        vmovdqa \XMM8, TMP8(%rsp)
121462306a36Sopenharmony_ci
        # Build 8 consecutive counter blocks in XMM1..XMM8 and update CTR.
121562306a36Sopenharmony_ci.if \loop_idx == in_order
121662306a36Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
121762306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM1, \XMM2
121862306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM2, \XMM3
121962306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM3, \XMM4
122062306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM4, \XMM5
122162306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM5, \XMM6
122262306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM6, \XMM7
122362306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM7, \XMM8
122462306a36Sopenharmony_ci                vmovdqa \XMM8, \CTR                      # CTR = last counter issued (before the swap)
122562306a36Sopenharmony_ci
122662306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
122762306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
122862306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
122962306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
123062306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
123162306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
123262306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
123362306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
123462306a36Sopenharmony_ci.else
                # On this path the counter is stepped with ONEf and no
                # SHUF_MASK swap is applied before encryption.
123562306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
123662306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM1, \XMM2
123762306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM2, \XMM3
123862306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM3, \XMM4
123962306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM4, \XMM5
124062306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM5, \XMM6
124162306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM6, \XMM7
124262306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM7, \XMM8
124362306a36Sopenharmony_ci                vmovdqa \XMM8, \CTR                       # CTR = last counter issued
124462306a36Sopenharmony_ci.endif
124562306a36Sopenharmony_ci
124662306a36Sopenharmony_ci
124762306a36Sopenharmony_ci        #######################################################################
124862306a36Sopenharmony_ci
                # The key at offset 0 is XORed into every counter block
                # before the vaesenc rounds start.
124962306a36Sopenharmony_ci                vmovdqu (arg1), \T1
125062306a36Sopenharmony_ci                vpxor   \T1, \XMM1, \XMM1
125162306a36Sopenharmony_ci                vpxor   \T1, \XMM2, \XMM2
125262306a36Sopenharmony_ci                vpxor   \T1, \XMM3, \XMM3
125362306a36Sopenharmony_ci                vpxor   \T1, \XMM4, \XMM4
125462306a36Sopenharmony_ci                vpxor   \T1, \XMM5, \XMM5
125562306a36Sopenharmony_ci                vpxor   \T1, \XMM6, \XMM6
125662306a36Sopenharmony_ci                vpxor   \T1, \XMM7, \XMM7
125762306a36Sopenharmony_ci                vpxor   \T1, \XMM8, \XMM8
125862306a36Sopenharmony_ci
125962306a36Sopenharmony_ci        #######################################################################
126062306a36Sopenharmony_ci
126162306a36Sopenharmony_ci
126262306a36Sopenharmony_ci
126362306a36Sopenharmony_ci
126462306a36Sopenharmony_ci
        # From here on the deeper-indented groups are AES rounds on the 8
        # counter blocks; the shallower groups are the GHASH multiplies of
        # the saved blocks. The two streams are interleaved.
126562306a36Sopenharmony_ci                vmovdqu 16*1(arg1), \T1                  # round 1 key
126662306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
126762306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
126862306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
126962306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
127062306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
127162306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
127262306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
127362306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
127462306a36Sopenharmony_ci
127562306a36Sopenharmony_ci                vmovdqu 16*2(arg1), \T1                  # round 2 key
127662306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
127762306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
127862306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
127962306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
128062306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
128162306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
128262306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
128362306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
128462306a36Sopenharmony_ci
128562306a36Sopenharmony_ci
128662306a36Sopenharmony_ci        #######################################################################
128762306a36Sopenharmony_ci
        # Saved block 1 (in T2) times HashKey_8. This starts the three
        # accumulators: T4 = high halves product, T7 = low halves product,
        # T6 = product of the folded halves (hi^lo) with HashKey_8_k.
128862306a36Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
128962306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
129062306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
129162306a36Sopenharmony_ci
129262306a36Sopenharmony_ci        vpshufd         $0b01001110, \T2, \T6            # swap the two 64-bit halves
129362306a36Sopenharmony_ci        vpxor           \T2, \T6, \T6                    # both halves of T6 = a1^a0
129462306a36Sopenharmony_ci
129562306a36Sopenharmony_ci        vmovdqu         HashKey_8_k(arg2), \T5
129662306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T6, \T6
129762306a36Sopenharmony_ci
129862306a36Sopenharmony_ci                vmovdqu 16*3(arg1), \T1                  # round 3 key
129962306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
130062306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
130162306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
130262306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
130362306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
130462306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
130562306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
130662306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
130762306a36Sopenharmony_ci
        # Saved block 2 (TMP2) times HashKey_7; the three partial products
        # are XORed into T4, T7 and T6. The same pattern repeats below for
        # TMP3..TMP8 with HashKey_6 down to HashKey.
130862306a36Sopenharmony_ci        vmovdqa         TMP2(%rsp), \T1
130962306a36Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
131062306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
131162306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
131262306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
131362306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
131462306a36Sopenharmony_ci
131562306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
131662306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
131762306a36Sopenharmony_ci        vmovdqu         HashKey_7_k(arg2), \T5
        # NOTE(review): selector 0x10 takes the high qword of the HashKey_i_k
        # value, while the first block above uses 0x00 (low qword). The two
        # agree only when both qwords of HashKey_i_k hold the same value;
        # confirm against the code that precomputes these tables.
131862306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
131962306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
132062306a36Sopenharmony_ci
132162306a36Sopenharmony_ci                vmovdqu 16*4(arg1), \T1                  # round 4 key
132262306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
132362306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
132462306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
132562306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
132662306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
132762306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
132862306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
132962306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
133062306a36Sopenharmony_ci
133162306a36Sopenharmony_ci        #######################################################################
133262306a36Sopenharmony_ci
        # Saved block 3 (TMP3) times HashKey_6.
133362306a36Sopenharmony_ci        vmovdqa         TMP3(%rsp), \T1
133462306a36Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
133562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
133662306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
133762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
133862306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
133962306a36Sopenharmony_ci
134062306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
134162306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
134262306a36Sopenharmony_ci        vmovdqu         HashKey_6_k(arg2), \T5
134362306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
134462306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
134562306a36Sopenharmony_ci
134662306a36Sopenharmony_ci                vmovdqu 16*5(arg1), \T1                  # round 5 key
134762306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
134862306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
134962306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
135062306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
135162306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
135262306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
135362306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
135462306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
135562306a36Sopenharmony_ci
        # Saved block 4 (TMP4) times HashKey_5.
135662306a36Sopenharmony_ci        vmovdqa         TMP4(%rsp), \T1
135762306a36Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
135862306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
135962306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
136062306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
136162306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
136262306a36Sopenharmony_ci
136362306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
136462306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
136562306a36Sopenharmony_ci        vmovdqu         HashKey_5_k(arg2), \T5
136662306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
136762306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_ci                vmovdqu 16*6(arg1), \T1                  # round 6 key
137062306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
137162306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
137262306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
137362306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
137462306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
137562306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
137662306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
137762306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci
        # Saved block 5 (TMP5) times HashKey_4.
138062306a36Sopenharmony_ci        vmovdqa         TMP5(%rsp), \T1
138162306a36Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
138262306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
138362306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
138462306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
138562306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
138662306a36Sopenharmony_ci
138762306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
138862306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
138962306a36Sopenharmony_ci        vmovdqu         HashKey_4_k(arg2), \T5
139062306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
139162306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
139262306a36Sopenharmony_ci
139362306a36Sopenharmony_ci                vmovdqu 16*7(arg1), \T1                  # round 7 key
139462306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
139562306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
139662306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
139762306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
139862306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
139962306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
140062306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
140162306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
140262306a36Sopenharmony_ci
        # Saved block 6 (TMP6) times HashKey_3.
140362306a36Sopenharmony_ci        vmovdqa         TMP6(%rsp), \T1
140462306a36Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
140562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
140662306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
140762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
140862306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
140962306a36Sopenharmony_ci
141062306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
141162306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
141262306a36Sopenharmony_ci        vmovdqu         HashKey_3_k(arg2), \T5
141362306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
141462306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci
141762306a36Sopenharmony_ci                vmovdqu 16*8(arg1), \T1                  # round 8 key
141862306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
141962306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
142062306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
142162306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
142262306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
142362306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
142462306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
142562306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
142662306a36Sopenharmony_ci
        # Saved block 7 (TMP7) times HashKey_2.
142762306a36Sopenharmony_ci        vmovdqa         TMP7(%rsp), \T1
142862306a36Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
142962306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
143062306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
143162306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
143262306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
143362306a36Sopenharmony_ci
143462306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
143562306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
143662306a36Sopenharmony_ci        vmovdqu         HashKey_2_k(arg2), \T5
143762306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
143862306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
143962306a36Sopenharmony_ci
144062306a36Sopenharmony_ci        #######################################################################
144162306a36Sopenharmony_ci
144262306a36Sopenharmony_ci                vmovdqu 16*9(arg1), \T5                  # round 9 key (held in T5, not T1)
144362306a36Sopenharmony_ci                vaesenc \T5, \XMM1, \XMM1
144462306a36Sopenharmony_ci                vaesenc \T5, \XMM2, \XMM2
144562306a36Sopenharmony_ci                vaesenc \T5, \XMM3, \XMM3
144662306a36Sopenharmony_ci                vaesenc \T5, \XMM4, \XMM4
144762306a36Sopenharmony_ci                vaesenc \T5, \XMM5, \XMM5
144862306a36Sopenharmony_ci                vaesenc \T5, \XMM6, \XMM6
144962306a36Sopenharmony_ci                vaesenc \T5, \XMM7, \XMM7
145062306a36Sopenharmony_ci                vaesenc \T5, \XMM8, \XMM8
145162306a36Sopenharmony_ci
        # Saved block 8 (TMP8) times HashKey.
145262306a36Sopenharmony_ci        vmovdqa         TMP8(%rsp), \T1
145362306a36Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
145462306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
145562306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
145662306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
145762306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
145862306a36Sopenharmony_ci
145962306a36Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
146062306a36Sopenharmony_ci        vpxor           \T1, \T3, \T3
146162306a36Sopenharmony_ci        vmovdqu         HashKey_k(arg2), \T5
146262306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
146362306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
146462306a36Sopenharmony_ci
        # Karatsuba middle term: T6 ^= T4 ^ T7 (remove the high and low
        # products from the folded product).
146562306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
146662306a36Sopenharmony_ci        vpxor           \T7, \T6, \T6
146762306a36Sopenharmony_ci
146862306a36Sopenharmony_ci                vmovdqu 16*10(arg1), \T5                 # key 10: final key when REP == 9
146962306a36Sopenharmony_ci
        # Extra middle rounds when REP > 9. Each pass applies the key that
        # is already in T5 and then loads the next one, so on exit T5 holds
        # the key at index REP+1, which is used for the final round.
147062306a36Sopenharmony_ci        i = 11
147162306a36Sopenharmony_ci        setreg
147262306a36Sopenharmony_ci.rep (\REP-9)
147362306a36Sopenharmony_ci
147462306a36Sopenharmony_ci        vaesenc \T5, \XMM1, \XMM1
147562306a36Sopenharmony_ci        vaesenc \T5, \XMM2, \XMM2
147662306a36Sopenharmony_ci        vaesenc \T5, \XMM3, \XMM3
147762306a36Sopenharmony_ci        vaesenc \T5, \XMM4, \XMM4
147862306a36Sopenharmony_ci        vaesenc \T5, \XMM5, \XMM5
147962306a36Sopenharmony_ci        vaesenc \T5, \XMM6, \XMM6
148062306a36Sopenharmony_ci        vaesenc \T5, \XMM7, \XMM7
148162306a36Sopenharmony_ci        vaesenc \T5, \XMM8, \XMM8
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci        vmovdqu 16*i(arg1), \T5
148462306a36Sopenharmony_ci        i = i + 1
148562306a36Sopenharmony_ci        setreg
148662306a36Sopenharmony_ci.endr
148762306a36Sopenharmony_ci
        # Final round for the 8 blocks. T2 = final key XOR input block, so
        # vaesenclast yields the output block directly.
        #   ENC: the output stays in reg_j and is stored further down.
        #   DEC: the output goes through T3 to arg3+r11, and reg_j is
        #        reloaded with the input block read from arg4+r11. The
        #        reload comes before the store, so the input is read
        #        before the same offset of the output is written.
148862306a36Sopenharmony_ci	i = 0
148962306a36Sopenharmony_ci	j = 1
149062306a36Sopenharmony_ci	setreg
149162306a36Sopenharmony_ci.rep 8
149262306a36Sopenharmony_ci		vpxor	16*i(arg4, %r11), \T5, \T2
149362306a36Sopenharmony_ci                .if \ENC_DEC == ENC
149462306a36Sopenharmony_ci                vaesenclast     \T2, reg_j, reg_j
149562306a36Sopenharmony_ci                .else
149662306a36Sopenharmony_ci                vaesenclast     \T2, reg_j, \T3
149762306a36Sopenharmony_ci                vmovdqu 16*i(arg4, %r11), reg_j
149862306a36Sopenharmony_ci                vmovdqu \T3, 16*i(arg3, %r11)
149962306a36Sopenharmony_ci                .endif
150062306a36Sopenharmony_ci	i = (i+1)
150162306a36Sopenharmony_ci	j = (j+1)
150262306a36Sopenharmony_ci	setreg
150362306a36Sopenharmony_ci.endr
150462306a36Sopenharmony_ci	#######################################################################
150562306a36Sopenharmony_ci
150662306a36Sopenharmony_ci
        # Split the middle term across the two halves of the 256-bit
        # product: its low 64 bits go to the top of T7, its high 64 bits to
        # the bottom of T6, which also takes the high product from T4.
150762306a36Sopenharmony_ci	vpslldq	$8, \T6, \T3				# T3 = T6 shifted left by 2 DWs
150862306a36Sopenharmony_ci	vpsrldq	$8, \T6, \T6				# T6 = T6 shifted right by 2 DWs
150962306a36Sopenharmony_ci	vpxor	\T3, \T7, \T7
151062306a36Sopenharmony_ci	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
151162306a36Sopenharmony_ci
151262306a36Sopenharmony_ci
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_ci	#######################################################################
151562306a36Sopenharmony_ci	#first phase of the reduction
151662306a36Sopenharmony_ci	#######################################################################
151762306a36Sopenharmony_ci        vpslld  $31, \T7, \T2                           # packed left shift of each DW: << 31
151862306a36Sopenharmony_ci        vpslld  $30, \T7, \T3                           # packed left shift of each DW: << 30
151962306a36Sopenharmony_ci        vpslld  $25, \T7, \T4                           # packed left shift of each DW: << 25
152062306a36Sopenharmony_ci
152162306a36Sopenharmony_ci        vpxor   \T3, \T2, \T2                           # xor the shifted versions
152262306a36Sopenharmony_ci        vpxor   \T4, \T2, \T2
152362306a36Sopenharmony_ci
152462306a36Sopenharmony_ci        vpsrldq $4, \T2, \T1                            # T1 = T2 shifted right by 1 DW (used in phase two)
152562306a36Sopenharmony_ci
152662306a36Sopenharmony_ci        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
152762306a36Sopenharmony_ci        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
152862306a36Sopenharmony_ci	#######################################################################
        # ENC only: the output stores are placed between the two reduction
        # phases. The DEC path already stored its output in the final round.
152962306a36Sopenharmony_ci                .if \ENC_DEC == ENC
153062306a36Sopenharmony_ci		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
153162306a36Sopenharmony_ci		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
153262306a36Sopenharmony_ci		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
153362306a36Sopenharmony_ci		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
153462306a36Sopenharmony_ci		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
153562306a36Sopenharmony_ci		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
153662306a36Sopenharmony_ci		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
153762306a36Sopenharmony_ci		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
153862306a36Sopenharmony_ci                .endif
153962306a36Sopenharmony_ci
154062306a36Sopenharmony_ci	#######################################################################
154162306a36Sopenharmony_ci	#second phase of the reduction
154262306a36Sopenharmony_ci        vpsrld  $1, \T7, \T2                            # packed right shift of each DW: >> 1
154362306a36Sopenharmony_ci        vpsrld  $2, \T7, \T3                            # packed right shift of each DW: >> 2
154462306a36Sopenharmony_ci        vpsrld  $7, \T7, \T4                            # packed right shift of each DW: >> 7
154562306a36Sopenharmony_ci        vpxor   \T3, \T2, \T2                           # xor the shifted versions
154662306a36Sopenharmony_ci        vpxor   \T4, \T2, \T2
154762306a36Sopenharmony_ci
154862306a36Sopenharmony_ci        vpxor   \T1, \T2, \T2                           # add the bits carried over from phase one
154962306a36Sopenharmony_ci        vpxor   \T2, \T7, \T7
155062306a36Sopenharmony_ci        vpxor   \T7, \T6, \T6                           # the result is in T6
155162306a36Sopenharmony_ci	#######################################################################
155262306a36Sopenharmony_ci
        # Byte-swap the 8 blocks now held in XMM1..XMM8 so they are in the
        # form the next GHASH pass consumes.
155362306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
155462306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
155562306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
155662306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
155762306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
155862306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
155962306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
156062306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
156162306a36Sopenharmony_ci
156262306a36Sopenharmony_ci
156362306a36Sopenharmony_ci	vpxor	\T6, \XMM1, \XMM1			# fold the reduced GHASH value into the first block
156462306a36Sopenharmony_ci
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci
156762306a36Sopenharmony_ci.endm
156862306a36Sopenharmony_ci
156962306a36Sopenharmony_ci
157062306a36Sopenharmony_ci# GHASH the last 8 ciphertext blocks.
# Multiplies XMM1..XMM8 by HashKey_8..HashKey (loaded relative to arg2) using
# the Karatsuba method, XORs the partial products together and reduces the sum.
# Output:   T6 holds the reduced result.
# Clobbers: T1-T5, T7 and XMM1 (XMM1 is reused as the middle-product accumulator).
157162306a36Sopenharmony_ci.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
157262306a36Sopenharmony_ci
157362306a36Sopenharmony_ci        ## Karatsuba Method
157462306a36Sopenharmony_ci
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM1, \T2         # T2 = XMM1 with its two 64-bit halves swapped
157762306a36Sopenharmony_ci        vpxor           \XMM1, \T2, \T2                 # T2 = (high ^ low) half of XMM1, in both halves
157862306a36Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
157962306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM1, \T6          # T6 = product of the high halves (high accumulator)
158062306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM1, \T7          # T7 = product of the low halves (low accumulator)
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci        vmovdqu         HashKey_8_k(arg2), \T3
158362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \XMM1          # XMM1 = middle product (middle accumulator)
158462306a36Sopenharmony_ci
158562306a36Sopenharmony_ci        ######################
158662306a36Sopenharmony_ci
158762306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM2, \T2         # block 2: same steps with HashKey_7
158862306a36Sopenharmony_ci        vpxor           \XMM2, \T2, \T2
158962306a36Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
159062306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM2, \T4
159162306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6                   # accumulate high product into T6
159262306a36Sopenharmony_ci
159362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM2, \T4
159462306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7                   # accumulate low product into T7
159562306a36Sopenharmony_ci
159662306a36Sopenharmony_ci        vmovdqu         HashKey_7_k(arg2), \T3
159762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
159862306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1               # accumulate middle product into XMM1
159962306a36Sopenharmony_ci
160062306a36Sopenharmony_ci        ######################
160162306a36Sopenharmony_ci
160262306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM3, \T2         # block 3: same steps with HashKey_6
160362306a36Sopenharmony_ci        vpxor           \XMM3, \T2, \T2
160462306a36Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
160562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM3, \T4
160662306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
160762306a36Sopenharmony_ci
160862306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM3, \T4
160962306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
161062306a36Sopenharmony_ci
161162306a36Sopenharmony_ci        vmovdqu         HashKey_6_k(arg2), \T3
161262306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
161362306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
161462306a36Sopenharmony_ci
161562306a36Sopenharmony_ci        ######################
161662306a36Sopenharmony_ci
161762306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM4, \T2         # block 4: same steps with HashKey_5
161862306a36Sopenharmony_ci        vpxor           \XMM4, \T2, \T2
161962306a36Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
162062306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM4, \T4
162162306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
162262306a36Sopenharmony_ci
162362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM4, \T4
162462306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
162562306a36Sopenharmony_ci
162662306a36Sopenharmony_ci        vmovdqu         HashKey_5_k(arg2), \T3
162762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
162862306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
162962306a36Sopenharmony_ci
163062306a36Sopenharmony_ci        ######################
163162306a36Sopenharmony_ci
163262306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM5, \T2         # block 5: same steps with HashKey_4
163362306a36Sopenharmony_ci        vpxor           \XMM5, \T2, \T2
163462306a36Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
163562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM5, \T4
163662306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
163762306a36Sopenharmony_ci
163862306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM5, \T4
163962306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
164062306a36Sopenharmony_ci
164162306a36Sopenharmony_ci        vmovdqu         HashKey_4_k(arg2), \T3
164262306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
164362306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
164462306a36Sopenharmony_ci
164562306a36Sopenharmony_ci        ######################
164662306a36Sopenharmony_ci
164762306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM6, \T2         # block 6: same steps with HashKey_3
164862306a36Sopenharmony_ci        vpxor           \XMM6, \T2, \T2
164962306a36Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
165062306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM6, \T4
165162306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
165262306a36Sopenharmony_ci
165362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM6, \T4
165462306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
165562306a36Sopenharmony_ci
165662306a36Sopenharmony_ci        vmovdqu         HashKey_3_k(arg2), \T3
165762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
165862306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
165962306a36Sopenharmony_ci
166062306a36Sopenharmony_ci        ######################
166162306a36Sopenharmony_ci
166262306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM7, \T2         # block 7: same steps with HashKey_2
166362306a36Sopenharmony_ci        vpxor           \XMM7, \T2, \T2
166462306a36Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
166562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM7, \T4
166662306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
166762306a36Sopenharmony_ci
166862306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM7, \T4
166962306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci        vmovdqu         HashKey_2_k(arg2), \T3
167262306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
167362306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
167462306a36Sopenharmony_ci
167562306a36Sopenharmony_ci        ######################
167662306a36Sopenharmony_ci
167762306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM8, \T2         # block 8: same steps with HashKey
167862306a36Sopenharmony_ci        vpxor           \XMM8, \T2, \T2
167962306a36Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
168062306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM8, \T4
168162306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
168262306a36Sopenharmony_ci
168362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM8, \T4
168462306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
168562306a36Sopenharmony_ci
168662306a36Sopenharmony_ci        vmovdqu         HashKey_k(arg2), \T3
168762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
168862306a36Sopenharmony_ci
168962306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1               # XMM1 = sum of all eight middle products
169062306a36Sopenharmony_ci        vpxor           \T6, \XMM1, \XMM1
169162306a36Sopenharmony_ci        vpxor           \T7, \XMM1, \T2                 # T2 = middle ^ high ^ low (Karatsuba cross term)
169262306a36Sopenharmony_ci
169362306a36Sopenharmony_ci
169462306a36Sopenharmony_ci
169562306a36Sopenharmony_ci
169662306a36Sopenharmony_ci        vpslldq $8, \T2, \T4    # T4 = cross term shifted left by 8 bytes
169762306a36Sopenharmony_ci        vpsrldq $8, \T2, \T2    # T2 = cross term shifted right by 8 bytes
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_ci        vpxor   \T4, \T7, \T7
170062306a36Sopenharmony_ci        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
170162306a36Sopenharmony_ci				# the accumulated carry-less multiplications
170262306a36Sopenharmony_ci
170362306a36Sopenharmony_ci        #######################################################################
170462306a36Sopenharmony_ci        #first phase of the reduction
170562306a36Sopenharmony_ci        vpslld  $31, \T7, \T2   # packed left shift of each dword, << 31
170662306a36Sopenharmony_ci        vpslld  $30, \T7, \T3   # packed left shift of each dword, << 30
170762306a36Sopenharmony_ci        vpslld  $25, \T7, \T4   # packed left shift of each dword, << 25
170862306a36Sopenharmony_ci
170962306a36Sopenharmony_ci        vpxor   \T3, \T2, \T2   # xor the shifted versions
171062306a36Sopenharmony_ci        vpxor   \T4, \T2, \T2
171162306a36Sopenharmony_ci
171262306a36Sopenharmony_ci        vpsrldq $4, \T2, \T1    # T1 = T2 shifted right by 1 DW (kept for the second phase)
171362306a36Sopenharmony_ci
171462306a36Sopenharmony_ci        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
171562306a36Sopenharmony_ci        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
171662306a36Sopenharmony_ci        #######################################################################
171762306a36Sopenharmony_ci
171862306a36Sopenharmony_ci
171962306a36Sopenharmony_ci        #second phase of the reduction
172062306a36Sopenharmony_ci        vpsrld  $1, \T7, \T2    # packed right shift of each dword, >> 1
172162306a36Sopenharmony_ci        vpsrld  $2, \T7, \T3    # packed right shift of each dword, >> 2
172262306a36Sopenharmony_ci        vpsrld  $7, \T7, \T4    # packed right shift of each dword, >> 7
172362306a36Sopenharmony_ci        vpxor   \T3, \T2, \T2   # xor the shifted versions
172462306a36Sopenharmony_ci        vpxor   \T4, \T2, \T2
172562306a36Sopenharmony_ci
172662306a36Sopenharmony_ci        vpxor   \T1, \T2, \T2   # fold in the bits saved in T1 during the first phase
172762306a36Sopenharmony_ci        vpxor   \T2, \T7, \T7
172862306a36Sopenharmony_ci        vpxor   \T7, \T6, \T6   # the result is in T6
172962306a36Sopenharmony_ci
173062306a36Sopenharmony_ci.endm
173162306a36Sopenharmony_ci
173262306a36Sopenharmony_ci#############################################################
173362306a36Sopenharmony_ci#void   aesni_gcm_init_avx_gen2
173462306a36Sopenharmony_ci#        (gcm_data     *my_ctx_data,
173562306a36Sopenharmony_ci#         gcm_context_data *data,
173662306a36Sopenharmony_ci#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
173762306a36Sopenharmony_ci#        u8      *iv, /* Pre-counter block j0: 4 byte salt
173862306a36Sopenharmony_ci#			(from Security Association) concatenated with 8 byte
173962306a36Sopenharmony_ci#			Initialisation Vector (from IPSec ESP Payload)
174062306a36Sopenharmony_ci#			concatenated with 0x00000001. 16-byte aligned pointer. */
174162306a36Sopenharmony_ci#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
174262306a36Sopenharmony_ci#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
# NOTE(review): the parameter order listed above cannot be verified from this
# part of the file; confirm it against the C prototype used by the caller.
174362306a36Sopenharmony_ci#############################################################
174462306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_init_avx_gen2)
174562306a36Sopenharmony_ci        FUNC_SAVE                               # macro defined elsewhere; paired with FUNC_RESTORE below
174662306a36Sopenharmony_ci        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX      # INIT is instantiated with the gen2 multiply and precompute macros
174762306a36Sopenharmony_ci        FUNC_RESTORE
174862306a36Sopenharmony_ci        RET
174962306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_init_avx_gen2)
175062306a36Sopenharmony_ci
175162306a36Sopenharmony_ci###############################################################################
175262306a36Sopenharmony_ci#void   aesni_gcm_enc_update_avx_gen2(
175362306a36Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
175462306a36Sopenharmony_ci#        gcm_context_data *data,
175562306a36Sopenharmony_ci#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
175662306a36Sopenharmony_ci#        const   u8 *in, /* Plaintext input */
175762306a36Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
#
# Dispatches on keysize (16, 32, otherwise treated as 24 bytes) to one of three
# GCM_ENC_DEC expansions that differ only in their last argument (9, 13, 11).
# Each path ends with its own FUNC_RESTORE and RET. eax is clobbered.
# The branch targets are .L local labels so that they do not show up as extra
# symbols inside this function.
175862306a36Sopenharmony_ci###############################################################################
175962306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
176062306a36Sopenharmony_ci        FUNC_SAVE
176162306a36Sopenharmony_ci        mov     keysize, %eax                   # eax = key size used for the dispatch below
176262306a36Sopenharmony_ci        cmp     $32, %eax
176362306a36Sopenharmony_ci        je      .Lkey_256_enc_update
176462306a36Sopenharmony_ci        cmp     $16, %eax
176562306a36Sopenharmony_ci        je      .Lkey_128_enc_update
176662306a36Sopenharmony_ci        # must be 192
176762306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
176862306a36Sopenharmony_ci        FUNC_RESTORE
176962306a36Sopenharmony_ci        RET
177062306a36Sopenharmony_ci.Lkey_128_enc_update:
177162306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
177262306a36Sopenharmony_ci        FUNC_RESTORE
177362306a36Sopenharmony_ci        RET
177462306a36Sopenharmony_ci.Lkey_256_enc_update:
177562306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
177662306a36Sopenharmony_ci        FUNC_RESTORE
177762306a36Sopenharmony_ci        RET
177862306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
177962306a36Sopenharmony_ci
178062306a36Sopenharmony_ci###############################################################################
178162306a36Sopenharmony_ci#void   aesni_gcm_dec_update_avx_gen2(
178262306a36Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
178362306a36Sopenharmony_ci#        gcm_context_data *data,
178462306a36Sopenharmony_ci#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
178562306a36Sopenharmony_ci#        const   u8 *in, /* Ciphertext input */
178662306a36Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
#
# Dispatches on keysize (16, 32, otherwise treated as 24 bytes) to one of three
# GCM_ENC_DEC expansions that differ only in their last argument (9, 13, 11).
# Each path ends with its own FUNC_RESTORE and RET. eax is clobbered.
# The branch targets are .L local labels so that they do not show up as extra
# symbols inside this function.
178762306a36Sopenharmony_ci###############################################################################
178862306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
178962306a36Sopenharmony_ci        FUNC_SAVE
179062306a36Sopenharmony_ci        mov     keysize,%eax                    # eax = key size used for the dispatch below
179162306a36Sopenharmony_ci        cmp     $32, %eax
179262306a36Sopenharmony_ci        je      .Lkey_256_dec_update
179362306a36Sopenharmony_ci        cmp     $16, %eax
179462306a36Sopenharmony_ci        je      .Lkey_128_dec_update
179562306a36Sopenharmony_ci        # must be 192
179662306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
179762306a36Sopenharmony_ci        FUNC_RESTORE
179862306a36Sopenharmony_ci        RET
179962306a36Sopenharmony_ci.Lkey_128_dec_update:
180062306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
180162306a36Sopenharmony_ci        FUNC_RESTORE
180262306a36Sopenharmony_ci        RET
180362306a36Sopenharmony_ci.Lkey_256_dec_update:
180462306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
180562306a36Sopenharmony_ci        FUNC_RESTORE
180662306a36Sopenharmony_ci        RET
180762306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
180862306a36Sopenharmony_ci
180962306a36Sopenharmony_ci###############################################################################
181062306a36Sopenharmony_ci#void   aesni_gcm_finalize_avx_gen2(
181162306a36Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
181262306a36Sopenharmony_ci#        gcm_context_data *data,
181362306a36Sopenharmony_ci#        u8      *auth_tag, /* Authenticated Tag output. */
181462306a36Sopenharmony_ci#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
181562306a36Sopenharmony_ci#				Valid values are 16 (most likely), 12 or 8. */
#
# Dispatches on keysize (16, 32, otherwise treated as 24 bytes) to one of three
# GCM_COMPLETE expansions that differ only in their second argument (9, 13, 11);
# arg3 and arg4 are forwarded to GCM_COMPLETE unchanged. eax is clobbered.
# The branch targets are .L local labels so that they do not show up as extra
# symbols inside this function.
181662306a36Sopenharmony_ci###############################################################################
181762306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
181862306a36Sopenharmony_ci        FUNC_SAVE
181962306a36Sopenharmony_ci        mov	keysize,%eax                    # eax = key size used for the dispatch below
182062306a36Sopenharmony_ci        cmp     $32, %eax
182162306a36Sopenharmony_ci        je      .Lkey_256_finalize
182262306a36Sopenharmony_ci        cmp     $16, %eax
182362306a36Sopenharmony_ci        je      .Lkey_128_finalize
182462306a36Sopenharmony_ci        # must be 192
182562306a36Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
182662306a36Sopenharmony_ci        FUNC_RESTORE
182762306a36Sopenharmony_ci        RET
182862306a36Sopenharmony_ci.Lkey_128_finalize:
182962306a36Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
183062306a36Sopenharmony_ci        FUNC_RESTORE
183162306a36Sopenharmony_ci        RET
183262306a36Sopenharmony_ci.Lkey_256_finalize:
183362306a36Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
183462306a36Sopenharmony_ci        FUNC_RESTORE
183562306a36Sopenharmony_ci        RET
183662306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
183762306a36Sopenharmony_ci
183862306a36Sopenharmony_ci###############################################################################
183962306a36Sopenharmony_ci# GHASH_MUL_AVX2 MACRO to implement: Data*HashKey mod (128,127,126,121,0)
184062306a36Sopenharmony_ci# Input: A and B (128-bits each, bit-reflected)
184162306a36Sopenharmony_ci# Output: C = A*B*x mod poly, (i.e. >>1 )
184262306a36Sopenharmony_ci# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
184362306a36Sopenharmony_ci# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
# GH is overwritten with the result; HK is only read.
# T1, T2 and T3 are clobbered. T4 and T5 are accepted but never referenced
# in the macro body.
# POLY2 is loaded with vmovdqa, so it has to be 16-byte aligned.
184462306a36Sopenharmony_ci###############################################################################
184562306a36Sopenharmony_ci.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
184662306a36Sopenharmony_ci
184762306a36Sopenharmony_ci        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
184862306a36Sopenharmony_ci        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
184962306a36Sopenharmony_ci        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
185062306a36Sopenharmony_ci        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
185162306a36Sopenharmony_ci        vpxor           \T3, \GH, \GH          # GH = a1*b0 ^ a0*b1 (the two cross products)
185262306a36Sopenharmony_ci
185362306a36Sopenharmony_ci
185462306a36Sopenharmony_ci        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
185562306a36Sopenharmony_ci        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
185662306a36Sopenharmony_ci
185762306a36Sopenharmony_ci        vpxor           \T3, \T1, \T1          # T1 = high 128 bits of the 256-bit product
185862306a36Sopenharmony_ci        vpxor           \T2, \GH, \GH          # GH = low 128 bits of the 256-bit product
185962306a36Sopenharmony_ci
186062306a36Sopenharmony_ci        #######################################################################
186162306a36Sopenharmony_ci        #first phase of the reduction
186262306a36Sopenharmony_ci        vmovdqa         POLY2(%rip), \T3       # T3 = POLY2 constant, reused by both reduction phases
186362306a36Sopenharmony_ci
186462306a36Sopenharmony_ci        vpclmulqdq      $0x01, \GH, \T3, \T2
186562306a36Sopenharmony_ci        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_ci        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
186862306a36Sopenharmony_ci        #######################################################################
186962306a36Sopenharmony_ci        #second phase of the reduction
187062306a36Sopenharmony_ci        vpclmulqdq      $0x00, \GH, \T3, \T2
187162306a36Sopenharmony_ci        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
187262306a36Sopenharmony_ci
187362306a36Sopenharmony_ci        vpclmulqdq      $0x10, \GH, \T3, \GH
187462306a36Sopenharmony_ci        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
187562306a36Sopenharmony_ci
187662306a36Sopenharmony_ci        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
187762306a36Sopenharmony_ci        #######################################################################
187862306a36Sopenharmony_ci        vpxor           \T1, \GH, \GH          # the result is in GH
187962306a36Sopenharmony_ci
188062306a36Sopenharmony_ci
188162306a36Sopenharmony_ci.endm
188262306a36Sopenharmony_ci
188362306a36Sopenharmony_ci.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
188462306a36Sopenharmony_ci
        # Computes successive powers of HK with GHASH_MUL_AVX2 and stores them
        # in the HashKey_2 .. HashKey_8 slots addressed through arg2.
        # HK is only read. T5 is the running power; T1, T3 and T4 are scratch.
        # T6 and T2 are passed to GHASH_MUL_AVX2 in its unused T4/T5 positions.
188562306a36Sopenharmony_ci        # Only HashKey_2..HashKey_8 are written here; no HashKey_i_k (XOR of the low and high parts of HashKey_i) value is stored by this macro
188662306a36Sopenharmony_ci        vmovdqa  \HK, \T5
188762306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
188862306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
188962306a36Sopenharmony_ci
189062306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
189162306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_3(arg2)                       #  [HashKey_3] = HashKey^3<<1 mod poly
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
189462306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_4(arg2)                       #  [HashKey_4] = HashKey^4<<1 mod poly
189562306a36Sopenharmony_ci
189662306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
189762306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_5(arg2)                       #  [HashKey_5] = HashKey^5<<1 mod poly
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
190062306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_6(arg2)                       #  [HashKey_6] = HashKey^6<<1 mod poly
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
190362306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_7(arg2)                       #  [HashKey_7] = HashKey^7<<1 mod poly
190462306a36Sopenharmony_ci
190562306a36Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
190662306a36Sopenharmony_ci        vmovdqu  \T5, HashKey_8(arg2)                       #  [HashKey_8] = HashKey^8<<1 mod poly
190762306a36Sopenharmony_ci
190862306a36Sopenharmony_ci.endm
190962306a36Sopenharmony_ci
191062306a36Sopenharmony_ci## if a = number of total plaintext bytes
191162306a36Sopenharmony_ci## b = floor(a/16)
191262306a36Sopenharmony_ci## num_initial_blocks = b mod 4#
191362306a36Sopenharmony_ci## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
191462306a36Sopenharmony_ci## r10, r11, r12, rax are clobbered
191562306a36Sopenharmony_ci## arg1, arg2, arg3, arg4 are used as pointers only, not modified
191662306a36Sopenharmony_ci
191762306a36Sopenharmony_ci.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
191862306a36Sopenharmony_ci	i = (8-\num_initial_blocks)
191962306a36Sopenharmony_ci	setreg
192062306a36Sopenharmony_ci	vmovdqu AadHash(arg2), reg_i
192162306a36Sopenharmony_ci
192262306a36Sopenharmony_ci	# start AES for num_initial_blocks blocks
192362306a36Sopenharmony_ci	vmovdqu CurCount(arg2), \CTR
192462306a36Sopenharmony_ci
192562306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
192662306a36Sopenharmony_ci	setreg
192762306a36Sopenharmony_ci.rep \num_initial_blocks
192862306a36Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
192962306a36Sopenharmony_ci                vmovdqa \CTR, reg_i
193062306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
193162306a36Sopenharmony_ci	i = (i+1)
193262306a36Sopenharmony_ci	setreg
193362306a36Sopenharmony_ci.endr
193462306a36Sopenharmony_ci
193562306a36Sopenharmony_ci	vmovdqa  (arg1), \T_key
193662306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
193762306a36Sopenharmony_ci	setreg
193862306a36Sopenharmony_ci.rep \num_initial_blocks
193962306a36Sopenharmony_ci                vpxor   \T_key, reg_i, reg_i
194062306a36Sopenharmony_ci	i = (i+1)
194162306a36Sopenharmony_ci	setreg
194262306a36Sopenharmony_ci.endr
194362306a36Sopenharmony_ci
194462306a36Sopenharmony_ci	j = 1
194562306a36Sopenharmony_ci	setreg
194662306a36Sopenharmony_ci.rep \REP
194762306a36Sopenharmony_ci	vmovdqa  16*j(arg1), \T_key
194862306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
194962306a36Sopenharmony_ci	setreg
195062306a36Sopenharmony_ci.rep \num_initial_blocks
195162306a36Sopenharmony_ci        vaesenc \T_key, reg_i, reg_i
195262306a36Sopenharmony_ci	i = (i+1)
195362306a36Sopenharmony_ci	setreg
195462306a36Sopenharmony_ci.endr
195562306a36Sopenharmony_ci
195662306a36Sopenharmony_ci	j = (j+1)
195762306a36Sopenharmony_ci	setreg
195862306a36Sopenharmony_ci.endr
195962306a36Sopenharmony_ci
196062306a36Sopenharmony_ci
196162306a36Sopenharmony_ci	vmovdqa  16*j(arg1), \T_key
196262306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
196362306a36Sopenharmony_ci	setreg
196462306a36Sopenharmony_ci.rep \num_initial_blocks
196562306a36Sopenharmony_ci        vaesenclast      \T_key, reg_i, reg_i
196662306a36Sopenharmony_ci	i = (i+1)
196762306a36Sopenharmony_ci	setreg
196862306a36Sopenharmony_ci.endr
196962306a36Sopenharmony_ci
197062306a36Sopenharmony_ci	i = (9-\num_initial_blocks)
197162306a36Sopenharmony_ci	setreg
197262306a36Sopenharmony_ci.rep \num_initial_blocks
197362306a36Sopenharmony_ci                vmovdqu (arg4, %r11), \T1
197462306a36Sopenharmony_ci                vpxor   \T1, reg_i, reg_i
197562306a36Sopenharmony_ci                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
197662306a36Sopenharmony_ci						       # num_initial_blocks blocks
197762306a36Sopenharmony_ci                add     $16, %r11
197862306a36Sopenharmony_ci.if  \ENC_DEC == DEC
197962306a36Sopenharmony_ci                vmovdqa \T1, reg_i
198062306a36Sopenharmony_ci.endif
198162306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
198262306a36Sopenharmony_ci	i = (i+1)
198362306a36Sopenharmony_ci	setreg
198462306a36Sopenharmony_ci.endr
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_ci
198762306a36Sopenharmony_ci	i = (8-\num_initial_blocks)
198862306a36Sopenharmony_ci	j = (9-\num_initial_blocks)
198962306a36Sopenharmony_ci	setreg
199062306a36Sopenharmony_ci
199162306a36Sopenharmony_ci.rep \num_initial_blocks
199262306a36Sopenharmony_ci        vpxor    reg_i, reg_j, reg_j
199362306a36Sopenharmony_ci        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
199462306a36Sopenharmony_ci	i = (i+1)
199562306a36Sopenharmony_ci	j = (j+1)
199662306a36Sopenharmony_ci	setreg
199762306a36Sopenharmony_ci.endr
199862306a36Sopenharmony_ci        # XMM8 has the combined result here
199962306a36Sopenharmony_ci
200062306a36Sopenharmony_ci        vmovdqa  \XMM8, TMP1(%rsp)
200162306a36Sopenharmony_ci        vmovdqa  \XMM8, \T3
200262306a36Sopenharmony_ci
200362306a36Sopenharmony_ci        cmp     $128, %r13
200462306a36Sopenharmony_ci        jl      .L_initial_blocks_done\@                  # no need for precomputed constants
200562306a36Sopenharmony_ci
200662306a36Sopenharmony_ci###############################################################################
200762306a36Sopenharmony_ci# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
200862306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
200962306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM1
201062306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
201162306a36Sopenharmony_ci
201262306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
201362306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM2
201462306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
201562306a36Sopenharmony_ci
201662306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
201762306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM3
201862306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
201962306a36Sopenharmony_ci
202062306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
202162306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM4
202262306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
202362306a36Sopenharmony_ci
202462306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
202562306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM5
202662306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
202762306a36Sopenharmony_ci
202862306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
202962306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM6
203062306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
203162306a36Sopenharmony_ci
203262306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
203362306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM7
203462306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
203562306a36Sopenharmony_ci
203662306a36Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
203762306a36Sopenharmony_ci                vmovdqa  \CTR, \XMM8
203862306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci                vmovdqa  (arg1), \T_key
204162306a36Sopenharmony_ci                vpxor    \T_key, \XMM1, \XMM1
204262306a36Sopenharmony_ci                vpxor    \T_key, \XMM2, \XMM2
204362306a36Sopenharmony_ci                vpxor    \T_key, \XMM3, \XMM3
204462306a36Sopenharmony_ci                vpxor    \T_key, \XMM4, \XMM4
204562306a36Sopenharmony_ci                vpxor    \T_key, \XMM5, \XMM5
204662306a36Sopenharmony_ci                vpxor    \T_key, \XMM6, \XMM6
204762306a36Sopenharmony_ci                vpxor    \T_key, \XMM7, \XMM7
204862306a36Sopenharmony_ci                vpxor    \T_key, \XMM8, \XMM8
204962306a36Sopenharmony_ci
205062306a36Sopenharmony_ci		i = 1
205162306a36Sopenharmony_ci		setreg
205262306a36Sopenharmony_ci.rep    \REP       # do REP rounds
205362306a36Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
205462306a36Sopenharmony_ci                vaesenc  \T_key, \XMM1, \XMM1
205562306a36Sopenharmony_ci                vaesenc  \T_key, \XMM2, \XMM2
205662306a36Sopenharmony_ci                vaesenc  \T_key, \XMM3, \XMM3
205762306a36Sopenharmony_ci                vaesenc  \T_key, \XMM4, \XMM4
205862306a36Sopenharmony_ci                vaesenc  \T_key, \XMM5, \XMM5
205962306a36Sopenharmony_ci                vaesenc  \T_key, \XMM6, \XMM6
206062306a36Sopenharmony_ci                vaesenc  \T_key, \XMM7, \XMM7
206162306a36Sopenharmony_ci                vaesenc  \T_key, \XMM8, \XMM8
206262306a36Sopenharmony_ci		i = (i+1)
206362306a36Sopenharmony_ci		setreg
206462306a36Sopenharmony_ci.endr
206562306a36Sopenharmony_ci
206662306a36Sopenharmony_ci
206762306a36Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
206862306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM1, \XMM1
206962306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM2, \XMM2
207062306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM3, \XMM3
207162306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM4, \XMM4
207262306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM5, \XMM5
207362306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM6, \XMM6
207462306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM7, \XMM7
207562306a36Sopenharmony_ci                vaesenclast  \T_key, \XMM8, \XMM8
207662306a36Sopenharmony_ci
207762306a36Sopenharmony_ci                vmovdqu  (arg4, %r11), \T1
207862306a36Sopenharmony_ci                vpxor    \T1, \XMM1, \XMM1
207962306a36Sopenharmony_ci                vmovdqu  \XMM1, (arg3 , %r11)
208062306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
208162306a36Sopenharmony_ci                vmovdqa  \T1, \XMM1
208262306a36Sopenharmony_ci                .endif
208362306a36Sopenharmony_ci
208462306a36Sopenharmony_ci                vmovdqu  16*1(arg4, %r11), \T1
208562306a36Sopenharmony_ci                vpxor    \T1, \XMM2, \XMM2
208662306a36Sopenharmony_ci                vmovdqu  \XMM2, 16*1(arg3 , %r11)
208762306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
208862306a36Sopenharmony_ci                vmovdqa  \T1, \XMM2
208962306a36Sopenharmony_ci                .endif
209062306a36Sopenharmony_ci
209162306a36Sopenharmony_ci                vmovdqu  16*2(arg4, %r11), \T1
209262306a36Sopenharmony_ci                vpxor    \T1, \XMM3, \XMM3
209362306a36Sopenharmony_ci                vmovdqu  \XMM3, 16*2(arg3 , %r11)
209462306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
209562306a36Sopenharmony_ci                vmovdqa  \T1, \XMM3
209662306a36Sopenharmony_ci                .endif
209762306a36Sopenharmony_ci
209862306a36Sopenharmony_ci                vmovdqu  16*3(arg4, %r11), \T1
209962306a36Sopenharmony_ci                vpxor    \T1, \XMM4, \XMM4
210062306a36Sopenharmony_ci                vmovdqu  \XMM4, 16*3(arg3 , %r11)
210162306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
210262306a36Sopenharmony_ci                vmovdqa  \T1, \XMM4
210362306a36Sopenharmony_ci                .endif
210462306a36Sopenharmony_ci
210562306a36Sopenharmony_ci                vmovdqu  16*4(arg4, %r11), \T1
210662306a36Sopenharmony_ci                vpxor    \T1, \XMM5, \XMM5
210762306a36Sopenharmony_ci                vmovdqu  \XMM5, 16*4(arg3 , %r11)
210862306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
210962306a36Sopenharmony_ci                vmovdqa  \T1, \XMM5
211062306a36Sopenharmony_ci                .endif
211162306a36Sopenharmony_ci
211262306a36Sopenharmony_ci                vmovdqu  16*5(arg4, %r11), \T1
211362306a36Sopenharmony_ci                vpxor    \T1, \XMM6, \XMM6
211462306a36Sopenharmony_ci                vmovdqu  \XMM6, 16*5(arg3 , %r11)
211562306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
211662306a36Sopenharmony_ci                vmovdqa  \T1, \XMM6
211762306a36Sopenharmony_ci                .endif
211862306a36Sopenharmony_ci
211962306a36Sopenharmony_ci                vmovdqu  16*6(arg4, %r11), \T1
212062306a36Sopenharmony_ci                vpxor    \T1, \XMM7, \XMM7
212162306a36Sopenharmony_ci                vmovdqu  \XMM7, 16*6(arg3 , %r11)
212262306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
212362306a36Sopenharmony_ci                vmovdqa  \T1, \XMM7
212462306a36Sopenharmony_ci                .endif
212562306a36Sopenharmony_ci
212662306a36Sopenharmony_ci                vmovdqu  16*7(arg4, %r11), \T1
212762306a36Sopenharmony_ci                vpxor    \T1, \XMM8, \XMM8
212862306a36Sopenharmony_ci                vmovdqu  \XMM8, 16*7(arg3 , %r11)
212962306a36Sopenharmony_ci                .if   \ENC_DEC == DEC
213062306a36Sopenharmony_ci                vmovdqa  \T1, \XMM8
213162306a36Sopenharmony_ci                .endif
213262306a36Sopenharmony_ci
213362306a36Sopenharmony_ci                add     $128, %r11
213462306a36Sopenharmony_ci
213562306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
213662306a36Sopenharmony_ci                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
213762306a36Sopenharmony_ci							   # the corresponding ciphertext
213862306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
213962306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
214062306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
214162306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
214262306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
214362306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
214462306a36Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
214562306a36Sopenharmony_ci
214662306a36Sopenharmony_ci###############################################################################
214762306a36Sopenharmony_ci
214862306a36Sopenharmony_ci.L_initial_blocks_done\@:
214962306a36Sopenharmony_ci
215062306a36Sopenharmony_ci
215162306a36Sopenharmony_ci.endm
215262306a36Sopenharmony_ci
215362306a36Sopenharmony_ci
215462306a36Sopenharmony_ci
215562306a36Sopenharmony_ci# encrypt 8 blocks at a time
215662306a36Sopenharmony_ci# ghash the 8 previously encrypted ciphertext blocks
215762306a36Sopenharmony_ci# arg1, arg2, arg3, arg4 are used as pointers only, not modified
215862306a36Sopenharmony_ci# r11 is the data offset value
#
# The AES rounds for the 8 new counter blocks are interleaved with the
# carry-less multiplies that hash the 8 blocks passed in XMM1..XMM8.
#
# Pointer usage as seen in this macro:
#   arg1  round keys, read at 16*k(arg1) for k = 0 .. REP+1
#   arg2  base for the HashKey, HashKey_2 .. HashKey_8 offsets
#   arg3  output buffer, written at 16*k(arg3, %r11)
#   arg4  input buffer, read at 16*k(arg4, %r11)
# r11 is only read here, never written.
#
# Macro arguments:
#   REP       count of vaesenc rounds.  Rounds 1-9 are unrolled unconditionally,
#             so REP must be at least 9.  The vaesenclast key is 16*(REP+1)(arg1).
#   T1..T7    scratch registers, all clobbered
#   CTR       counter block; left holding the 8th new counter value
#   XMM1..8   in:  the 8 blocks to hash
#             out: ENC - the 8 blocks just written to arg3,
#                  DEC - the 8 blocks just read from arg4,
#                  in both cases shuffled with SHUF_MASK, and with the reduced
#                  hash (T1) XORed into XMM1
#   loop_idx  compared against in_order to select how the counters are stepped
#   ENC_DEC   compared against ENC to select the store/reload behaviour
#
# Stack slots TMP2..TMP8(%rsp) are overwritten; they are accessed with
# vmovdqa, so they must be 16-byte aligned.
# NOTE(review): the final-round loop addresses the blocks as reg_j, so it
# presumably relies on XMM1..XMM8 being passed as xmm1..xmm8 - confirm
# against setreg and the call sites.
215962306a36Sopenharmony_ci.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
216062306a36Sopenharmony_ci
        # Park the blocks to be hashed: block 1 in T2, blocks 2..8 on the stack,
        # so XMM1..XMM8 can be reused for the new counter blocks.
216162306a36Sopenharmony_ci        vmovdqa \XMM1, \T2
216262306a36Sopenharmony_ci        vmovdqa \XMM2, TMP2(%rsp)
216362306a36Sopenharmony_ci        vmovdqa \XMM3, TMP3(%rsp)
216462306a36Sopenharmony_ci        vmovdqa \XMM4, TMP4(%rsp)
216562306a36Sopenharmony_ci        vmovdqa \XMM5, TMP5(%rsp)
216662306a36Sopenharmony_ci        vmovdqa \XMM6, TMP6(%rsp)
216762306a36Sopenharmony_ci        vmovdqa \XMM7, TMP7(%rsp)
216862306a36Sopenharmony_ci        vmovdqa \XMM8, TMP8(%rsp)
216962306a36Sopenharmony_ci
        # Derive 8 consecutive counter blocks from CTR and write the last one back.
        # in_order:  step with ONE, then shuffle each block with SHUF_MASK.
        # otherwise: step with ONEf and use the blocks unshuffled
        #            (presumably CTR is then kept in the shuffled layout - confirm
        #            against the ONEf definition and the call sites).
217062306a36Sopenharmony_ci.if \loop_idx == in_order
217162306a36Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
217262306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM1, \XMM2
217362306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM2, \XMM3
217462306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM3, \XMM4
217562306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM4, \XMM5
217662306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM5, \XMM6
217762306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM6, \XMM7
217862306a36Sopenharmony_ci                vpaddd  ONE(%rip), \XMM7, \XMM8
217962306a36Sopenharmony_ci                vmovdqa \XMM8, \CTR
218062306a36Sopenharmony_ci
218162306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
218262306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
218362306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
218462306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
218562306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
218662306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
218762306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
218862306a36Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
218962306a36Sopenharmony_ci.else
219062306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
219162306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM1, \XMM2
219262306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM2, \XMM3
219362306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM3, \XMM4
219462306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM4, \XMM5
219562306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM5, \XMM6
219662306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM6, \XMM7
219762306a36Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM7, \XMM8
219862306a36Sopenharmony_ci                vmovdqa \XMM8, \CTR
219962306a36Sopenharmony_ci.endif
220062306a36Sopenharmony_ci
220162306a36Sopenharmony_ci
220262306a36Sopenharmony_ci        #######################################################################
220362306a36Sopenharmony_ci
                # AES initial step: XOR key 0 at (arg1) into all 8 counter blocks.
220462306a36Sopenharmony_ci                vmovdqu (arg1), \T1
220562306a36Sopenharmony_ci                vpxor   \T1, \XMM1, \XMM1
220662306a36Sopenharmony_ci                vpxor   \T1, \XMM2, \XMM2
220762306a36Sopenharmony_ci                vpxor   \T1, \XMM3, \XMM3
220862306a36Sopenharmony_ci                vpxor   \T1, \XMM4, \XMM4
220962306a36Sopenharmony_ci                vpxor   \T1, \XMM5, \XMM5
221062306a36Sopenharmony_ci                vpxor   \T1, \XMM6, \XMM6
221162306a36Sopenharmony_ci                vpxor   \T1, \XMM7, \XMM7
221262306a36Sopenharmony_ci                vpxor   \T1, \XMM8, \XMM8
221362306a36Sopenharmony_ci
221462306a36Sopenharmony_ci        #######################################################################
221562306a36Sopenharmony_ci
221662306a36Sopenharmony_ci
221762306a36Sopenharmony_ci
221862306a36Sopenharmony_ci
221962306a36Sopenharmony_ci
                # AES rounds 1 and 2 (T1 carries the round key).
222062306a36Sopenharmony_ci                vmovdqu 16*1(arg1), \T1
222162306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
222262306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
222362306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
222462306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
222562306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
222662306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
222762306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
222862306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
222962306a36Sopenharmony_ci
223062306a36Sopenharmony_ci                vmovdqu 16*2(arg1), \T1
223162306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
223262306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
223362306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
223462306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
223562306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
223662306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
223762306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
223862306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
223962306a36Sopenharmony_ci
224062306a36Sopenharmony_ci
224162306a36Sopenharmony_ci        #######################################################################
224262306a36Sopenharmony_ci
        # Hash block 1 (parked in T2) with HashKey_8 and start the accumulators:
        #   T4 = high products, T7 = low products, T6 = both middle products.
        # T3 is the per-product temporary from here on.
224362306a36Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
224462306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
224562306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
224662306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
224762306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
224862306a36Sopenharmony_ci        vpxor           \T5, \T6, \T6
224962306a36Sopenharmony_ci
                # AES round 3
225062306a36Sopenharmony_ci                vmovdqu 16*3(arg1), \T1
225162306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
225262306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
225362306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
225462306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
225562306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
225662306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
225762306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
225862306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
225962306a36Sopenharmony_ci
        # Block 2 (TMP2) times HashKey_7, folded into T4 / T7 / T6.
226062306a36Sopenharmony_ci        vmovdqa         TMP2(%rsp), \T1
226162306a36Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
226262306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
226362306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
226462306a36Sopenharmony_ci
226562306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
226662306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
226762306a36Sopenharmony_ci
226862306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
226962306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
227062306a36Sopenharmony_ci
227162306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
227262306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
227362306a36Sopenharmony_ci
                # AES round 4
227462306a36Sopenharmony_ci                vmovdqu 16*4(arg1), \T1
227562306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
227662306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
227762306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
227862306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
227962306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
228062306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
228162306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
228262306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
228362306a36Sopenharmony_ci
228462306a36Sopenharmony_ci        #######################################################################
228562306a36Sopenharmony_ci
        # Block 3 (TMP3) times HashKey_6, folded into T4 / T7 / T6.
228662306a36Sopenharmony_ci        vmovdqa         TMP3(%rsp), \T1
228762306a36Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
228862306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
228962306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
229062306a36Sopenharmony_ci
229162306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
229262306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
229362306a36Sopenharmony_ci
229462306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
229562306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
229662306a36Sopenharmony_ci
229762306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
229862306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
229962306a36Sopenharmony_ci
                # AES round 5
230062306a36Sopenharmony_ci                vmovdqu 16*5(arg1), \T1
230162306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
230262306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
230362306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
230462306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
230562306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
230662306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
230762306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
230862306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
230962306a36Sopenharmony_ci
        # Block 4 (TMP4) times HashKey_5, folded into T4 / T7 / T6.
231062306a36Sopenharmony_ci        vmovdqa         TMP4(%rsp), \T1
231162306a36Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
231262306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
231362306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
231462306a36Sopenharmony_ci
231562306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
231662306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
231762306a36Sopenharmony_ci
231862306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
231962306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
232062306a36Sopenharmony_ci
232162306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
232262306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
232362306a36Sopenharmony_ci
                # AES round 6
232462306a36Sopenharmony_ci                vmovdqu 16*6(arg1), \T1
232562306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
232662306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
232762306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
232862306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
232962306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
233062306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
233162306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
233262306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
233362306a36Sopenharmony_ci
233462306a36Sopenharmony_ci
        # Block 5 (TMP5) times HashKey_4, folded into T4 / T7 / T6.
233562306a36Sopenharmony_ci        vmovdqa         TMP5(%rsp), \T1
233662306a36Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
233762306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
233862306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
233962306a36Sopenharmony_ci
234062306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
234162306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
234262306a36Sopenharmony_ci
234362306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
234462306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
234562306a36Sopenharmony_ci
234662306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
234762306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
234862306a36Sopenharmony_ci
                # AES round 7
234962306a36Sopenharmony_ci                vmovdqu 16*7(arg1), \T1
235062306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
235162306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
235262306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
235362306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
235462306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
235562306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
235662306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
235762306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
235862306a36Sopenharmony_ci
        # Block 6 (TMP6) times HashKey_3, folded into T4 / T7 / T6.
235962306a36Sopenharmony_ci        vmovdqa         TMP6(%rsp), \T1
236062306a36Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
236162306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
236262306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
236362306a36Sopenharmony_ci
236462306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
236562306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
236662306a36Sopenharmony_ci
236762306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
236862306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
236962306a36Sopenharmony_ci
237062306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
237162306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
237262306a36Sopenharmony_ci
                # AES round 8
237362306a36Sopenharmony_ci                vmovdqu 16*8(arg1), \T1
237462306a36Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
237562306a36Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
237662306a36Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
237762306a36Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
237862306a36Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
237962306a36Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
238062306a36Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
238162306a36Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
238262306a36Sopenharmony_ci
        # Block 7 (TMP7) times HashKey_2, folded into T4 / T7 / T6.
238362306a36Sopenharmony_ci        vmovdqa         TMP7(%rsp), \T1
238462306a36Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
238562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
238662306a36Sopenharmony_ci        vpxor           \T3, \T4, \T4
238762306a36Sopenharmony_ci
238862306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
238962306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
239062306a36Sopenharmony_ci
239162306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
239262306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
239362306a36Sopenharmony_ci
239462306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
239562306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
239662306a36Sopenharmony_ci
239762306a36Sopenharmony_ci
239862306a36Sopenharmony_ci        #######################################################################
239962306a36Sopenharmony_ci
                # AES round 9 (this round key is loaded into T5 instead of T1).
240062306a36Sopenharmony_ci                vmovdqu 16*9(arg1), \T5
240162306a36Sopenharmony_ci                vaesenc \T5, \XMM1, \XMM1
240262306a36Sopenharmony_ci                vaesenc \T5, \XMM2, \XMM2
240362306a36Sopenharmony_ci                vaesenc \T5, \XMM3, \XMM3
240462306a36Sopenharmony_ci                vaesenc \T5, \XMM4, \XMM4
240562306a36Sopenharmony_ci                vaesenc \T5, \XMM5, \XMM5
240662306a36Sopenharmony_ci                vaesenc \T5, \XMM6, \XMM6
240762306a36Sopenharmony_ci                vaesenc \T5, \XMM7, \XMM7
240862306a36Sopenharmony_ci                vaesenc \T5, \XMM8, \XMM8
240962306a36Sopenharmony_ci
        # Block 8 (TMP8) times HashKey.  The high product is summed last and
        # written to T1 rather than T4, so from here on the high half is in T1.
241062306a36Sopenharmony_ci        vmovdqa         TMP8(%rsp), \T1
241162306a36Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
241262306a36Sopenharmony_ci
241362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
241462306a36Sopenharmony_ci        vpxor           \T3, \T7, \T7
241562306a36Sopenharmony_ci
241662306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
241762306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
241862306a36Sopenharmony_ci
241962306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
242062306a36Sopenharmony_ci        vpxor           \T3, \T6, \T6
242162306a36Sopenharmony_ci
242262306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
242362306a36Sopenharmony_ci        vpxor           \T3, \T4, \T1
242462306a36Sopenharmony_ci
242562306a36Sopenharmony_ci
                # T5 = round key 10.  Each iteration below applies the key in T5
                # and fetches the following one, so after REP-9 iterations T5
                # holds key REP+1, which is consumed by vaesenclast further down.
242662306a36Sopenharmony_ci                vmovdqu 16*10(arg1), \T5
242762306a36Sopenharmony_ci
242862306a36Sopenharmony_ci        i = 11
242962306a36Sopenharmony_ci        setreg
243062306a36Sopenharmony_ci.rep (\REP-9)
243162306a36Sopenharmony_ci        vaesenc \T5, \XMM1, \XMM1
243262306a36Sopenharmony_ci        vaesenc \T5, \XMM2, \XMM2
243362306a36Sopenharmony_ci        vaesenc \T5, \XMM3, \XMM3
243462306a36Sopenharmony_ci        vaesenc \T5, \XMM4, \XMM4
243562306a36Sopenharmony_ci        vaesenc \T5, \XMM5, \XMM5
243662306a36Sopenharmony_ci        vaesenc \T5, \XMM6, \XMM6
243762306a36Sopenharmony_ci        vaesenc \T5, \XMM7, \XMM7
243862306a36Sopenharmony_ci        vaesenc \T5, \XMM8, \XMM8
243962306a36Sopenharmony_ci
244062306a36Sopenharmony_ci        vmovdqu 16*i(arg1), \T5
244162306a36Sopenharmony_ci        i = i + 1
244262306a36Sopenharmony_ci        setreg
244362306a36Sopenharmony_ci.endr
244462306a36Sopenharmony_ci
	# Last AES round fused with the data XOR: T2 = last key ^ input block i,
	# so vaesenclast yields (encrypted counter ^ input block) directly.
	#   ENC: the result stays in reg_j and is stored after the first
	#        reduction phase.
	#   DEC: the result is stored to arg3 via T3 and reg_j is reloaded with
	#        the input block.
	# i walks the block offsets 0..7, j the matching reg_j selected by setreg.
244562306a36Sopenharmony_ci	i = 0
244662306a36Sopenharmony_ci	j = 1
244762306a36Sopenharmony_ci	setreg
244862306a36Sopenharmony_ci.rep 8
244962306a36Sopenharmony_ci		vpxor	16*i(arg4, %r11), \T5, \T2
245062306a36Sopenharmony_ci                .if \ENC_DEC == ENC
245162306a36Sopenharmony_ci                vaesenclast     \T2, reg_j, reg_j
245262306a36Sopenharmony_ci                .else
245362306a36Sopenharmony_ci                vaesenclast     \T2, reg_j, \T3
245462306a36Sopenharmony_ci                vmovdqu 16*i(arg4, %r11), reg_j
245562306a36Sopenharmony_ci                vmovdqu \T3, 16*i(arg3, %r11)
245662306a36Sopenharmony_ci                .endif
245762306a36Sopenharmony_ci	i = (i+1)
245862306a36Sopenharmony_ci	j = (j+1)
245962306a36Sopenharmony_ci	setreg
246062306a36Sopenharmony_ci.endr
246162306a36Sopenharmony_ci	#######################################################################
246262306a36Sopenharmony_ci
246362306a36Sopenharmony_ci
	# Split the middle term T6 across the two halves of the product:
	# its low 64 bits go to the top of T7, its high 64 bits to the bottom of T1.
246462306a36Sopenharmony_ci	vpslldq	$8, \T6, \T3				# T3 = T6 shifted left 2 DWs
246562306a36Sopenharmony_ci	vpsrldq	$8, \T6, \T6				# T6 = T6 shifted right 2 DWs
246662306a36Sopenharmony_ci	vpxor	\T3, \T7, \T7
246762306a36Sopenharmony_ci	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
246862306a36Sopenharmony_ci
246962306a36Sopenharmony_ci
247062306a36Sopenharmony_ci
247162306a36Sopenharmony_ci	#######################################################################
247262306a36Sopenharmony_ci	#first phase of the reduction
	# T3 keeps POLY2 for both reduction phases.
247362306a36Sopenharmony_ci	vmovdqa         POLY2(%rip), \T3
247462306a36Sopenharmony_ci
247562306a36Sopenharmony_ci	vpclmulqdq	$0x01, \T7, \T3, \T2
247662306a36Sopenharmony_ci	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_ci	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
247962306a36Sopenharmony_ci	#######################################################################
		# ENC only: store the 8 result blocks; DEC already stored its
		# output inside the final-round loop above.
248062306a36Sopenharmony_ci                .if \ENC_DEC == ENC
248162306a36Sopenharmony_ci		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
248262306a36Sopenharmony_ci		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
248362306a36Sopenharmony_ci		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
248462306a36Sopenharmony_ci		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
248562306a36Sopenharmony_ci		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
248662306a36Sopenharmony_ci		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
248762306a36Sopenharmony_ci		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
248862306a36Sopenharmony_ci		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
248962306a36Sopenharmony_ci                .endif
249062306a36Sopenharmony_ci
249162306a36Sopenharmony_ci	#######################################################################
249262306a36Sopenharmony_ci	#second phase of the reduction
249362306a36Sopenharmony_ci	vpclmulqdq	$0x00, \T7, \T3, \T2
249462306a36Sopenharmony_ci	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
249562306a36Sopenharmony_ci
249662306a36Sopenharmony_ci	vpclmulqdq	$0x10, \T7, \T3, \T4
249762306a36Sopenharmony_ci	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
249862306a36Sopenharmony_ci
249962306a36Sopenharmony_ci	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
250062306a36Sopenharmony_ci	#######################################################################
250162306a36Sopenharmony_ci	vpxor		\T4, \T1, \T1			# the result is in T1
250262306a36Sopenharmony_ci
250362306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
250462306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
250562306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
250662306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
250762306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
250862306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
250962306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
251062306a36Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
251162306a36Sopenharmony_ci
251262306a36Sopenharmony_ci
	# Carry the reduced hash value (T1) forward inside XMM1.
251362306a36Sopenharmony_ci	vpxor	\T1, \XMM1, \XMM1
251462306a36Sopenharmony_ci
251562306a36Sopenharmony_ci
251662306a36Sopenharmony_ci
251762306a36Sopenharmony_ci.endm
251862306a36Sopenharmony_ci
251962306a36Sopenharmony_ci
252062306a36Sopenharmony_ci# GHASH the last 8 ciphertext blocks.
252162306a36Sopenharmony_ci.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
252262306a36Sopenharmony_ci
252362306a36Sopenharmony_ci        ## Karatsuba Method
252462306a36Sopenharmony_ci
252562306a36Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
252662306a36Sopenharmony_ci
252762306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM1, \T2
252862306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
252962306a36Sopenharmony_ci        vpxor           \XMM1, \T2, \T2
253062306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
253162306a36Sopenharmony_ci
253262306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM1, \T6
253362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM1, \T7
253462306a36Sopenharmony_ci
253562306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \XMM1
253662306a36Sopenharmony_ci
253762306a36Sopenharmony_ci        ######################
253862306a36Sopenharmony_ci
253962306a36Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
254062306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM2, \T2
254162306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
254262306a36Sopenharmony_ci        vpxor           \XMM2, \T2, \T2
254362306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
254462306a36Sopenharmony_ci
254562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM2, \T4
254662306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
254762306a36Sopenharmony_ci
254862306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM2, \T4
254962306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
255062306a36Sopenharmony_ci
255162306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
255262306a36Sopenharmony_ci
255362306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
255462306a36Sopenharmony_ci
255562306a36Sopenharmony_ci        ######################
255662306a36Sopenharmony_ci
255762306a36Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
255862306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM3, \T2
255962306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
256062306a36Sopenharmony_ci        vpxor           \XMM3, \T2, \T2
256162306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
256262306a36Sopenharmony_ci
256362306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM3, \T4
256462306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
256562306a36Sopenharmony_ci
256662306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM3, \T4
256762306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
256862306a36Sopenharmony_ci
256962306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
257062306a36Sopenharmony_ci
257162306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
257262306a36Sopenharmony_ci
257362306a36Sopenharmony_ci        ######################
257462306a36Sopenharmony_ci
257562306a36Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
257662306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM4, \T2
257762306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
257862306a36Sopenharmony_ci        vpxor           \XMM4, \T2, \T2
257962306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
258062306a36Sopenharmony_ci
258162306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM4, \T4
258262306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
258362306a36Sopenharmony_ci
258462306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM4, \T4
258562306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
258662306a36Sopenharmony_ci
258762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
258862306a36Sopenharmony_ci
258962306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
259062306a36Sopenharmony_ci
259162306a36Sopenharmony_ci        ######################
259262306a36Sopenharmony_ci
259362306a36Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
259462306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM5, \T2
259562306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
259662306a36Sopenharmony_ci        vpxor           \XMM5, \T2, \T2
259762306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
259862306a36Sopenharmony_ci
259962306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM5, \T4
260062306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
260162306a36Sopenharmony_ci
260262306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM5, \T4
260362306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
260462306a36Sopenharmony_ci
260562306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
260662306a36Sopenharmony_ci
260762306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
260862306a36Sopenharmony_ci
260962306a36Sopenharmony_ci        ######################
261062306a36Sopenharmony_ci
261162306a36Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
261262306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM6, \T2
261362306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
261462306a36Sopenharmony_ci        vpxor           \XMM6, \T2, \T2
261562306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
261662306a36Sopenharmony_ci
261762306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM6, \T4
261862306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
261962306a36Sopenharmony_ci
262062306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM6, \T4
262162306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
262262306a36Sopenharmony_ci
262362306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
262462306a36Sopenharmony_ci
262562306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
262662306a36Sopenharmony_ci
262762306a36Sopenharmony_ci        ######################
262862306a36Sopenharmony_ci
262962306a36Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
263062306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM7, \T2
263162306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
263262306a36Sopenharmony_ci        vpxor           \XMM7, \T2, \T2
263362306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
263462306a36Sopenharmony_ci
263562306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM7, \T4
263662306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
263762306a36Sopenharmony_ci
263862306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM7, \T4
263962306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
264062306a36Sopenharmony_ci
264162306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
264262306a36Sopenharmony_ci
264362306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
264462306a36Sopenharmony_ci
264562306a36Sopenharmony_ci        ######################
264662306a36Sopenharmony_ci
264762306a36Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
264862306a36Sopenharmony_ci        vpshufd         $0b01001110, \XMM8, \T2
264962306a36Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
265062306a36Sopenharmony_ci        vpxor           \XMM8, \T2, \T2
265162306a36Sopenharmony_ci        vpxor           \T5, \T3, \T3
265262306a36Sopenharmony_ci
265362306a36Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM8, \T4
265462306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6
265562306a36Sopenharmony_ci
265662306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM8, \T4
265762306a36Sopenharmony_ci        vpxor           \T4, \T7, \T7
265862306a36Sopenharmony_ci
265962306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
266062306a36Sopenharmony_ci
266162306a36Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
266262306a36Sopenharmony_ci        vpxor           \T6, \XMM1, \XMM1
266362306a36Sopenharmony_ci        vpxor           \T7, \XMM1, \T2
266462306a36Sopenharmony_ci
266562306a36Sopenharmony_ci
266662306a36Sopenharmony_ci
266762306a36Sopenharmony_ci
266862306a36Sopenharmony_ci        vpslldq $8, \T2, \T4
266962306a36Sopenharmony_ci        vpsrldq $8, \T2, \T2
267062306a36Sopenharmony_ci
267162306a36Sopenharmony_ci        vpxor   \T4, \T7, \T7
267262306a36Sopenharmony_ci        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
267362306a36Sopenharmony_ci						   # accumulated carry-less multiplications
267462306a36Sopenharmony_ci
267562306a36Sopenharmony_ci        #######################################################################
267662306a36Sopenharmony_ci        #first phase of the reduction
267762306a36Sopenharmony_ci        vmovdqa         POLY2(%rip), \T3
267862306a36Sopenharmony_ci
267962306a36Sopenharmony_ci        vpclmulqdq      $0x01, \T7, \T3, \T2
268062306a36Sopenharmony_ci        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
268162306a36Sopenharmony_ci
268262306a36Sopenharmony_ci        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
268362306a36Sopenharmony_ci        #######################################################################
268462306a36Sopenharmony_ci
268562306a36Sopenharmony_ci
268662306a36Sopenharmony_ci        #second phase of the reduction
268762306a36Sopenharmony_ci        vpclmulqdq      $0x00, \T7, \T3, \T2
268862306a36Sopenharmony_ci        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
268962306a36Sopenharmony_ci
269062306a36Sopenharmony_ci        vpclmulqdq      $0x10, \T7, \T3, \T4
269162306a36Sopenharmony_ci        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
269262306a36Sopenharmony_ci
269362306a36Sopenharmony_ci        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
269462306a36Sopenharmony_ci        #######################################################################
269562306a36Sopenharmony_ci        vpxor           \T4, \T6, \T6              # the result is in T6
269662306a36Sopenharmony_ci.endm
269762306a36Sopenharmony_ci
269862306a36Sopenharmony_ci
269962306a36Sopenharmony_ci
270062306a36Sopenharmony_ci#############################################################
270162306a36Sopenharmony_ci#void   aesni_gcm_init_avx_gen4
270262306a36Sopenharmony_ci#        (gcm_data     *my_ctx_data,
270362306a36Sopenharmony_ci#         gcm_context_data *data,
270462306a36Sopenharmony_ci#        u8      *iv, /* Pre-counter block j0: 4 byte salt
270562306a36Sopenharmony_ci#			(from Security Association) concatenated with 8 byte
270662306a36Sopenharmony_ci#			Initialisation Vector (from IPSec ESP Payload)
270762306a36Sopenharmony_ci#			concatenated with 0x00000001. 16-byte aligned pointer. */
270862306a36Sopenharmony_ci#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
270962306a36Sopenharmony_ci#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
271062306a36Sopenharmony_ci#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
271162306a36Sopenharmony_ci#############################################################
271262306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_init_avx_gen4)
        # GCM init entry point for the gen4 (AVX2) code path: the whole body
        # is the INIT macro bracketed by FUNC_SAVE / FUNC_RESTORE.
        # NOTE(review): FUNC_SAVE, FUNC_RESTORE and INIT are defined outside
        # this section; presumably FUNC_SAVE / FUNC_RESTORE preserve and
        # restore callee-saved state around the body - confirm at their
        # definitions.
271362306a36Sopenharmony_ci        FUNC_SAVE
        # INIT receives the names of the GHASH_MUL_AVX2 and PRECOMPUTE_AVX2
        # macros, so its expansion uses the AVX2 variants of both.
271462306a36Sopenharmony_ci        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
271562306a36Sopenharmony_ci        FUNC_RESTORE
271662306a36Sopenharmony_ci        RET
271762306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_init_avx_gen4)
271862306a36Sopenharmony_ci
271962306a36Sopenharmony_ci###############################################################################
272062306a36Sopenharmony_ci#void   aesni_gcm_enc_update_avx_gen4(
272162306a36Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
272262306a36Sopenharmony_ci#        gcm_context_data *data,
272362306a36Sopenharmony_ci#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
272462306a36Sopenharmony_ci#        const   u8 *in, /* Plaintext input */
272562306a36Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
272662306a36Sopenharmony_ci###############################################################################
272762306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        # GCM encrypt-update entry point for the gen4 (AVX2) code path.
        # Reads keysize and branches to one of three GCM_ENC_DEC expansions
        # that differ only in the final macro argument (9, 11 or 13).
        # Every path ends with its own FUNC_RESTORE / RET pair.
        # NOTE(review): keysize, FUNC_SAVE, FUNC_RESTORE and GCM_ENC_DEC are
        # defined outside this section. keysize is presumably the AES key
        # length in bytes (16/24/32) and the final argument presumably a
        # per-key-size round count - confirm at their definitions.
272862306a36Sopenharmony_ci        FUNC_SAVE
272962306a36Sopenharmony_ci        mov     keysize,%eax
273062306a36Sopenharmony_ci        cmp     $32, %eax
273162306a36Sopenharmony_ci        je      key_256_enc_update4
273262306a36Sopenharmony_ci        cmp     $16, %eax
273362306a36Sopenharmony_ci        je      key_128_enc_update4
        # Fall-through: any keysize other than 16 or 32 lands here without
        # further validation and is treated as the 192-bit case.
273462306a36Sopenharmony_ci        # must be 192
273562306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
273662306a36Sopenharmony_ci        FUNC_RESTORE
273762306a36Sopenharmony_ci	RET
273862306a36Sopenharmony_cikey_128_enc_update4:                    # reached when keysize == 16
273962306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
274062306a36Sopenharmony_ci        FUNC_RESTORE
274162306a36Sopenharmony_ci	RET
274262306a36Sopenharmony_cikey_256_enc_update4:                    # reached when keysize == 32
274362306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
274462306a36Sopenharmony_ci        FUNC_RESTORE
274562306a36Sopenharmony_ci	RET
274662306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
274762306a36Sopenharmony_ci
274862306a36Sopenharmony_ci###############################################################################
274962306a36Sopenharmony_ci#void   aesni_gcm_dec_update_avx_gen4(
275062306a36Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
275162306a36Sopenharmony_ci#        gcm_context_data *data,
275262306a36Sopenharmony_ci#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
275362306a36Sopenharmony_ci#        const   u8 *in, /* Ciphertext input */
275462306a36Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
275562306a36Sopenharmony_ci###############################################################################
275662306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        # GCM decrypt-update entry point for the gen4 (AVX2) code path.
        # Same keysize dispatch as the encrypt-update routine, but each
        # GCM_ENC_DEC expansion is given DEC instead of ENC.
        # Every path ends with its own FUNC_RESTORE / RET pair.
        # NOTE(review): keysize, FUNC_SAVE, FUNC_RESTORE and GCM_ENC_DEC are
        # defined outside this section. keysize is presumably the AES key
        # length in bytes (16/24/32) and the final argument (9, 11 or 13)
        # presumably a per-key-size round count - confirm at their
        # definitions.
275762306a36Sopenharmony_ci        FUNC_SAVE
275862306a36Sopenharmony_ci        mov     keysize,%eax
275962306a36Sopenharmony_ci        cmp     $32, %eax
276062306a36Sopenharmony_ci        je      key_256_dec_update4
276162306a36Sopenharmony_ci        cmp     $16, %eax
276262306a36Sopenharmony_ci        je      key_128_dec_update4
        # Fall-through: any keysize other than 16 or 32 lands here without
        # further validation and is treated as the 192-bit case.
276362306a36Sopenharmony_ci        # must be 192
276462306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
276562306a36Sopenharmony_ci        FUNC_RESTORE
276662306a36Sopenharmony_ci        RET
276762306a36Sopenharmony_cikey_128_dec_update4:                    # reached when keysize == 16
276862306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
276962306a36Sopenharmony_ci        FUNC_RESTORE
277062306a36Sopenharmony_ci        RET
277162306a36Sopenharmony_cikey_256_dec_update4:                    # reached when keysize == 32
277262306a36Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
277362306a36Sopenharmony_ci        FUNC_RESTORE
277462306a36Sopenharmony_ci        RET
277562306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
277662306a36Sopenharmony_ci
277762306a36Sopenharmony_ci###############################################################################
277862306a36Sopenharmony_ci#void   aesni_gcm_finalize_avx_gen4(
277962306a36Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
278062306a36Sopenharmony_ci#        gcm_context_data *data,
278162306a36Sopenharmony_ci#        u8      *auth_tag, /* Authenticated Tag output. */
278262306a36Sopenharmony_ci#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
278362306a36Sopenharmony_ci#                              Valid values are 16 (most likely), 12 or 8. */
278462306a36Sopenharmony_ci###############################################################################
278562306a36Sopenharmony_ciSYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        # GCM finalize entry point for the gen4 (AVX2) code path.
        # Reads keysize and branches to one of three GCM_COMPLETE expansions
        # that differ only in the second macro argument (9, 11 or 13).
        # Every path ends with its own FUNC_RESTORE / RET pair.
        # NOTE(review): keysize, arg3, arg4, FUNC_SAVE, FUNC_RESTORE and
        # GCM_COMPLETE are defined outside this section. arg3 / arg4 are
        # presumably the third and fourth call arguments (auth_tag and
        # auth_tag_len in the header comment) and the numeric argument
        # presumably a per-key-size round count - confirm at their
        # definitions.
278662306a36Sopenharmony_ci        FUNC_SAVE
278762306a36Sopenharmony_ci        mov	keysize,%eax
278862306a36Sopenharmony_ci        cmp     $32, %eax
278962306a36Sopenharmony_ci        je      key_256_finalize4
279062306a36Sopenharmony_ci        cmp     $16, %eax
279162306a36Sopenharmony_ci        je      key_128_finalize4
        # Fall-through: any keysize other than 16 or 32 lands here without
        # further validation and is treated as the 192-bit case.
279262306a36Sopenharmony_ci        # must be 192
279362306a36Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
279462306a36Sopenharmony_ci        FUNC_RESTORE
279562306a36Sopenharmony_ci        RET
279662306a36Sopenharmony_cikey_128_finalize4:                      # reached when keysize == 16
279762306a36Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
279862306a36Sopenharmony_ci        FUNC_RESTORE
279962306a36Sopenharmony_ci        RET
280062306a36Sopenharmony_cikey_256_finalize4:                      # reached when keysize == 32
280162306a36Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
280262306a36Sopenharmony_ci        FUNC_RESTORE
280362306a36Sopenharmony_ci        RET
280462306a36Sopenharmony_ciSYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2805