18c2ecf20Sopenharmony_ci########################################################################
28c2ecf20Sopenharmony_ci# Copyright (c) 2013, Intel Corporation
38c2ecf20Sopenharmony_ci#
48c2ecf20Sopenharmony_ci# This software is available to you under a choice of one of two
58c2ecf20Sopenharmony_ci# licenses.  You may choose to be licensed under the terms of the GNU
68c2ecf20Sopenharmony_ci# General Public License (GPL) Version 2, available from the file
78c2ecf20Sopenharmony_ci# COPYING in the main directory of this source tree, or the
88c2ecf20Sopenharmony_ci# OpenIB.org BSD license below:
98c2ecf20Sopenharmony_ci#
108c2ecf20Sopenharmony_ci# Redistribution and use in source and binary forms, with or without
118c2ecf20Sopenharmony_ci# modification, are permitted provided that the following conditions are
128c2ecf20Sopenharmony_ci# met:
138c2ecf20Sopenharmony_ci#
148c2ecf20Sopenharmony_ci# * Redistributions of source code must retain the above copyright
158c2ecf20Sopenharmony_ci#   notice, this list of conditions and the following disclaimer.
168c2ecf20Sopenharmony_ci#
178c2ecf20Sopenharmony_ci# * Redistributions in binary form must reproduce the above copyright
188c2ecf20Sopenharmony_ci#   notice, this list of conditions and the following disclaimer in the
198c2ecf20Sopenharmony_ci#   documentation and/or other materials provided with the
208c2ecf20Sopenharmony_ci#   distribution.
218c2ecf20Sopenharmony_ci#
228c2ecf20Sopenharmony_ci# * Neither the name of the Intel Corporation nor the names of its
238c2ecf20Sopenharmony_ci#   contributors may be used to endorse or promote products derived from
248c2ecf20Sopenharmony_ci#   this software without specific prior written permission.
258c2ecf20Sopenharmony_ci#
268c2ecf20Sopenharmony_ci#
278c2ecf20Sopenharmony_ci# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
288c2ecf20Sopenharmony_ci# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
298c2ecf20Sopenharmony_ci# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
308c2ecf20Sopenharmony_ci# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
318c2ecf20Sopenharmony_ci# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
328c2ecf20Sopenharmony_ci# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
338c2ecf20Sopenharmony_ci# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
348c2ecf20Sopenharmony_ci# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
358c2ecf20Sopenharmony_ci# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
368c2ecf20Sopenharmony_ci# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
378c2ecf20Sopenharmony_ci# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
388c2ecf20Sopenharmony_ci########################################################################
398c2ecf20Sopenharmony_ci##
408c2ecf20Sopenharmony_ci## Authors:
418c2ecf20Sopenharmony_ci##	Erdinc Ozturk <erdinc.ozturk@intel.com>
428c2ecf20Sopenharmony_ci##	Vinodh Gopal <vinodh.gopal@intel.com>
438c2ecf20Sopenharmony_ci##	James Guilford <james.guilford@intel.com>
448c2ecf20Sopenharmony_ci##	Tim Chen <tim.c.chen@linux.intel.com>
458c2ecf20Sopenharmony_ci##
468c2ecf20Sopenharmony_ci## References:
478c2ecf20Sopenharmony_ci##       This code was derived and highly optimized from the code described in the paper:
488c2ecf20Sopenharmony_ci##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
498c2ecf20Sopenharmony_ci##			on Intel Architecture Processors. August, 2010
508c2ecf20Sopenharmony_ci##       The details of the implementation are explained in:
518c2ecf20Sopenharmony_ci##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
528c2ecf20Sopenharmony_ci##			on Intel Architecture Processors. October, 2012.
538c2ecf20Sopenharmony_ci##
548c2ecf20Sopenharmony_ci## Assumptions:
558c2ecf20Sopenharmony_ci##
568c2ecf20Sopenharmony_ci##
578c2ecf20Sopenharmony_ci##
588c2ecf20Sopenharmony_ci## iv:
598c2ecf20Sopenharmony_ci##       0                   1                   2                   3
608c2ecf20Sopenharmony_ci##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
618c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
628c2ecf20Sopenharmony_ci##       |                             Salt  (From the SA)               |
638c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
648c2ecf20Sopenharmony_ci##       |                     Initialization Vector                     |
658c2ecf20Sopenharmony_ci##       |         (This is the sequence number from IPSec header)       |
668c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
678c2ecf20Sopenharmony_ci##       |                              0x1                              |
688c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
698c2ecf20Sopenharmony_ci##
708c2ecf20Sopenharmony_ci##
718c2ecf20Sopenharmony_ci##
728c2ecf20Sopenharmony_ci## AAD:
738c2ecf20Sopenharmony_ci##       AAD padded to 128 bits with 0
748c2ecf20Sopenharmony_ci##       for example, assume AAD is a u32 vector
758c2ecf20Sopenharmony_ci##
768c2ecf20Sopenharmony_ci##       if AAD is 8 bytes:
778c2ecf20Sopenharmony_ci##       AAD[3] = {A0, A1};
788c2ecf20Sopenharmony_ci##       padded AAD in xmm register = {A1 A0 0 0}
798c2ecf20Sopenharmony_ci##
808c2ecf20Sopenharmony_ci##       0                   1                   2                   3
818c2ecf20Sopenharmony_ci##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
828c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
838c2ecf20Sopenharmony_ci##       |                               SPI (A1)                        |
848c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
858c2ecf20Sopenharmony_ci##       |                     32-bit Sequence Number (A0)               |
868c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
878c2ecf20Sopenharmony_ci##       |                              0x0                              |
888c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
898c2ecf20Sopenharmony_ci##
908c2ecf20Sopenharmony_ci##                                       AAD Format with 32-bit Sequence Number
918c2ecf20Sopenharmony_ci##
928c2ecf20Sopenharmony_ci##       if AAD is 12 bytes:
938c2ecf20Sopenharmony_ci##       AAD[3] = {A0, A1, A2};
948c2ecf20Sopenharmony_ci##       padded AAD in xmm register = {A2 A1 A0 0}
958c2ecf20Sopenharmony_ci##
968c2ecf20Sopenharmony_ci##       0                   1                   2                   3
978c2ecf20Sopenharmony_ci##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
988c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
998c2ecf20Sopenharmony_ci##       |                               SPI (A2)                        |
1008c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1018c2ecf20Sopenharmony_ci##       |                 64-bit Extended Sequence Number {A1,A0}       |
1028c2ecf20Sopenharmony_ci##       |                                                               |
1038c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1048c2ecf20Sopenharmony_ci##       |                              0x0                              |
1058c2ecf20Sopenharmony_ci##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1068c2ecf20Sopenharmony_ci##
1078c2ecf20Sopenharmony_ci##        AAD Format with 64-bit Extended Sequence Number
1088c2ecf20Sopenharmony_ci##
1098c2ecf20Sopenharmony_ci##
1108c2ecf20Sopenharmony_ci## aadLen:
1118c2ecf20Sopenharmony_ci##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
1128c2ecf20Sopenharmony_ci##	 The code additionally supports aadLen of length 16 bytes.
1138c2ecf20Sopenharmony_ci##
1148c2ecf20Sopenharmony_ci## TLen:
1158c2ecf20Sopenharmony_ci##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1168c2ecf20Sopenharmony_ci##
1178c2ecf20Sopenharmony_ci## poly = x^128 + x^127 + x^126 + x^121 + 1
1188c2ecf20Sopenharmony_ci## Throughout the code, one-tab and two-tab indentations are used: one tab is
1198c2ecf20Sopenharmony_ci## for the GHASH part, two tabs are for the AES part.
1208c2ecf20Sopenharmony_ci##
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci#include <linux/linkage.h>
1238c2ecf20Sopenharmony_ci
1248c2ecf20Sopenharmony_ci# constants in mergeable sections, linker can reorder and merge
# Every constant from POLY to ONEf sits in its own section with flags "aM"
# and entity size 16, so nothing may rely on their relative placement.
1258c2ecf20Sopenharmony_ci.section	.rodata.cst16.POLY, "aM", @progbits, 16
1268c2ecf20Sopenharmony_ci.align 16
# Low 128 bits of the GHASH polynomial given in the file header:
# top byte 0xC2 = x^127 + x^126 + x^121, bottom bit = 1.
1278c2ecf20Sopenharmony_ciPOLY:            .octa     0xC2000000000000000000000000000001
1288c2ecf20Sopenharmony_ci
1298c2ecf20Sopenharmony_ci.section	.rodata.cst16.POLY2, "aM", @progbits, 16
1308c2ecf20Sopenharmony_ci.align 16
# NOTE(review): POLY2 is not referenced in this part of the file; presumably
# a reduction constant for the GHASH code - confirm at its use site.
1318c2ecf20Sopenharmony_ciPOLY2:           .octa     0xC20000000000000000000001C2000000
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
1348c2ecf20Sopenharmony_ci.align 16
# Holds a 1 in the lowest and in the highest 32-bit lane.
# NOTE(review): not referenced in this part of the file - confirm its use.
1358c2ecf20Sopenharmony_ciTWOONE:          .octa     0x00000001000000000000000000000001
1368c2ecf20Sopenharmony_ci
1378c2ecf20Sopenharmony_ci.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
1388c2ecf20Sopenharmony_ci.align 16
# vpshufb selector that reverses the 16 bytes of a register
# (destination byte 0 takes source byte 15, ..., byte 15 takes byte 0).
1398c2ecf20Sopenharmony_ciSHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
1408c2ecf20Sopenharmony_ci
1418c2ecf20Sopenharmony_ci.section	.rodata.cst16.ONE, "aM", @progbits, 16
1428c2ecf20Sopenharmony_ci.align 16
# Added with vpaddd in GCM_ENC_DEC to bump the lowest 32-bit lane of the
# counter block by one.
1438c2ecf20Sopenharmony_ciONE:             .octa     0x00000000000000000000000000000001
1448c2ecf20Sopenharmony_ci
1458c2ecf20Sopenharmony_ci.section	.rodata.cst16.ONEf, "aM", @progbits, 16
1468c2ecf20Sopenharmony_ci.align 16
# Holds a 1 in the most significant byte.
# NOTE(review): presumably the increment for a counter block that is kept
# byte-reversed - confirm at its use site (outside this part of the file).
1478c2ecf20Sopenharmony_ciONEf:            .octa     0x01000000000000000000000000000000
1488c2ecf20Sopenharmony_ci
1498c2ecf20Sopenharmony_ci# order of these constants should not change.
1508c2ecf20Sopenharmony_ci# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
# GCM_ENC_DEC performs unaligned 16-byte loads at SHIFT_MASK+16-n and at
# ALL_F+16-n (n = tail length, 1..15).  The first load runs over into ALL_F
# and yields a shuffle selector that moves the top n bytes to the bottom and
# zeroes the rest; the second runs over into the zero block and yields a mask
# that keeps only the low n bytes.  Hence these three values stay together in
# plain .rodata instead of separate mergeable sections.
1518c2ecf20Sopenharmony_ci.section	.rodata, "a", @progbits
1528c2ecf20Sopenharmony_ci.align 16
1538c2ecf20Sopenharmony_ciSHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
1548c2ecf20Sopenharmony_ciALL_F:           .octa     0xffffffffffffffffffffffffffffffff
1558c2ecf20Sopenharmony_ci                 .octa     0x00000000000000000000000000000000
1568c2ecf20Sopenharmony_ci
# aad_shift_arr: table of seventeen 16-byte byte-selector masks
# (17 * 16 = 272 bytes, which is what the .size directive declares).
# Selector bytes of 0xff have bit 7 set, which makes (v)pshufb write a zero
# byte; the remaining selectors pick source bytes in ascending order.
# NOTE(review): the consumer of this table is outside this part of the file;
# presumably it is indexed by a byte count times 16 - confirm at the use site.
1578c2ecf20Sopenharmony_ci.section .rodata
1588c2ecf20Sopenharmony_ci.align 16
1598c2ecf20Sopenharmony_ci.type aad_shift_arr, @object
1608c2ecf20Sopenharmony_ci.size aad_shift_arr, 272
1618c2ecf20Sopenharmony_ciaad_shift_arr:
1628c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffffffffffffffff
1638c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffffffffffffff0C
1648c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffffffffffff0D0C
1658c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffffffffff0E0D0C
1668c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffffffff0F0E0D0C
1678c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffffff0C0B0A0908
1688c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffffff0D0C0B0A0908
1698c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffffff0E0D0C0B0A0908
1708c2ecf20Sopenharmony_ci        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
1718c2ecf20Sopenharmony_ci        .octa     0xffffffffffffff0C0B0A090807060504
1728c2ecf20Sopenharmony_ci        .octa     0xffffffffffff0D0C0B0A090807060504
1738c2ecf20Sopenharmony_ci        .octa     0xffffffffff0E0D0C0B0A090807060504
1748c2ecf20Sopenharmony_ci        .octa     0xffffffff0F0E0D0C0B0A090807060504
1758c2ecf20Sopenharmony_ci        .octa     0xffffff0C0B0A09080706050403020100
1768c2ecf20Sopenharmony_ci        .octa     0xffff0D0C0B0A09080706050403020100
1778c2ecf20Sopenharmony_ci        .octa     0xff0E0D0C0B0A09080706050403020100
1788c2ecf20Sopenharmony_ci        .octa     0x0F0E0D0C0B0A09080706050403020100
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_ci
1818c2ecf20Sopenharmony_ci.text
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci
# Byte offsets into the per-request context block that the macros below
# address through arg2.  The uses visible in GCM_ENC_DEC / GCM_COMPLETE are:
#   AadHash       16 bytes, running GHASH value (loaded on entry, stored back)
#   AadLen        8 bytes, AAD length in bytes (read by GCM_COMPLETE)
#   InLen         8 bytes, total input length; GCM_ENC_DEC adds arg5 to it
#   PBlockEncKey  16 bytes, E(K, Yn) saved for a trailing partial block
#   OrigIV        NOTE(review): not referenced in this part of the file
#   CurCount      16 bytes, current counter block
#   PBlockLen     8 bytes, length of the trailing partial block
1848c2ecf20Sopenharmony_ci#define AadHash 16*0
1858c2ecf20Sopenharmony_ci#define AadLen 16*1
1868c2ecf20Sopenharmony_ci#define InLen (16*1)+8
1878c2ecf20Sopenharmony_ci#define PBlockEncKey 16*2
1888c2ecf20Sopenharmony_ci#define OrigIV 16*3
1898c2ecf20Sopenharmony_ci#define CurCount 16*4
1908c2ecf20Sopenharmony_ci#define PBlockLen 16*5
1918c2ecf20Sopenharmony_ci
# Offsets (same context block) of the precomputed hash-key table:
# eight powers of the hash key followed by eight Karatsuba helper values.
1928c2ecf20Sopenharmony_ciHashKey        = 16*6   # store HashKey <<1 mod poly here
1938c2ecf20Sopenharmony_ciHashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
1948c2ecf20Sopenharmony_ciHashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
1958c2ecf20Sopenharmony_ciHashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
1968c2ecf20Sopenharmony_ciHashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
1978c2ecf20Sopenharmony_ciHashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
1988c2ecf20Sopenharmony_ciHashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
1998c2ecf20Sopenharmony_ciHashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
2008c2ecf20Sopenharmony_ciHashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
2018c2ecf20Sopenharmony_ciHashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
2028c2ecf20Sopenharmony_ciHashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
2038c2ecf20Sopenharmony_ciHashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
2048c2ecf20Sopenharmony_ciHashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
2058c2ecf20Sopenharmony_ciHashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
2068c2ecf20Sopenharmony_ciHashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
2078c2ecf20Sopenharmony_ciHashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
2088c2ecf20Sopenharmony_ci
# Function arguments.  arg1-arg6 are the six System V AMD64 integer argument
# registers.  arg7-arg10 live on the stack and are addressed through r14,
# which FUNC_SAVE sets to rsp right after its four pushes: STACK_OFFSET skips
# the saved registers, the next 8 bytes are the return address, and the
# stack arguments follow.  They are therefore valid only between FUNC_SAVE
# and FUNC_RESTORE and only while r14 is left untouched.
2098c2ecf20Sopenharmony_ci#define arg1 %rdi
2108c2ecf20Sopenharmony_ci#define arg2 %rsi
2118c2ecf20Sopenharmony_ci#define arg3 %rdx
2128c2ecf20Sopenharmony_ci#define arg4 %rcx
2138c2ecf20Sopenharmony_ci#define arg5 %r8
2148c2ecf20Sopenharmony_ci#define arg6 %r9
2158c2ecf20Sopenharmony_ci#define arg7 STACK_OFFSET+8*1(%r14)
2168c2ecf20Sopenharmony_ci#define arg8 STACK_OFFSET+8*2(%r14)
2178c2ecf20Sopenharmony_ci#define arg9 STACK_OFFSET+8*3(%r14)
2188c2ecf20Sopenharmony_ci#define arg10 STACK_OFFSET+8*4(%r14)
# Memory operand 480 bytes past arg1.
# NOTE(review): presumably the key-length field that follows two 15-entry
# round-key arrays in the AES key structure - confirm against the C side.
2198c2ecf20Sopenharmony_ci#define keysize 2*15*16(arg1)
2208c2ecf20Sopenharmony_ci
# Assembler-time counters; setreg turns their current values into the
# register aliases reg_i and reg_j.
2218c2ecf20Sopenharmony_cii = 0
2228c2ecf20Sopenharmony_cij = 0
2238c2ecf20Sopenharmony_ci
# Mode flags handed to GHASH_8_ENCRYPT_8_PARALLEL by GCM_ENC_DEC.
2248c2ecf20Sopenharmony_ciout_order = 0
2258c2ecf20Sopenharmony_ciin_order = 1
# Direction flags; GCM_ENC_DEC tests its ENC_DEC parameter against DEC.
2268c2ecf20Sopenharmony_ciDEC = 0
2278c2ecf20Sopenharmony_ciENC = 1
2288c2ecf20Sopenharmony_ci
# define_reg r n: create the assembler alias reg_<r> for register %xmm<n>.
# Both parameters are pasted textually, so n must already be a plain number.
2298c2ecf20Sopenharmony_ci.macro define_reg r n
2308c2ecf20Sopenharmony_cireg_\r = %xmm\n
2318c2ecf20Sopenharmony_ci.endm
2328c2ecf20Sopenharmony_ci
# setreg: refresh reg_i and reg_j from the current values of the assembler
# symbols i and j.  Alternate macro mode is switched on only for the two
# calls so that the % operator evaluates i and j to numbers before they are
# pasted into the register name; it is switched off again afterwards.
2338c2ecf20Sopenharmony_ci.macro setreg
2348c2ecf20Sopenharmony_ci.altmacro
2358c2ecf20Sopenharmony_cidefine_reg i %i
2368c2ecf20Sopenharmony_cidefine_reg j %j
2378c2ecf20Sopenharmony_ci.noaltmacro
2388c2ecf20Sopenharmony_ci.endm
2398c2ecf20Sopenharmony_ci
2408c2ecf20Sopenharmony_ci# FUNC_SAVE pushes 4 registers of 8 bytes each; STACK_OFFSET is the size of
# that save area and is what arg7-arg10 skip over to reach the stack arguments.
2418c2ecf20Sopenharmony_ciSTACK_OFFSET = 8*4
2428c2ecf20Sopenharmony_ci
# Offsets of eight 16-byte scratch slots.
# NOTE(review): the TMPn offsets are not used in this part of the file;
# presumably they index the area FUNC_SAVE reserves below rsp - confirm.
2438c2ecf20Sopenharmony_ciTMP1 =   16*0    # Temporary storage for AAD
2448c2ecf20Sopenharmony_ciTMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
2458c2ecf20Sopenharmony_ciTMP3 =   16*2    # Temporary storage for AES State 3
2468c2ecf20Sopenharmony_ciTMP4 =   16*3    # Temporary storage for AES State 4
2478c2ecf20Sopenharmony_ciTMP5 =   16*4    # Temporary storage for AES State 5
2488c2ecf20Sopenharmony_ciTMP6 =   16*5    # Temporary storage for AES State 6
2498c2ecf20Sopenharmony_ciTMP7 =   16*6    # Temporary storage for AES State 7
2508c2ecf20Sopenharmony_ciTMP8 =   16*7    # Temporary storage for AES State 8
2518c2ecf20Sopenharmony_ci
# Bytes of scratch space FUNC_SAVE subtracts from rsp (room for TMP1..TMP8).
2528c2ecf20Sopenharmony_ciVARIABLE_OFFSET = 16*8
2538c2ecf20Sopenharmony_ci
2548c2ecf20Sopenharmony_ci################################
2558c2ecf20Sopenharmony_ci# Utility Macros
2568c2ecf20Sopenharmony_ci################################
2578c2ecf20Sopenharmony_ci
# FUNC_SAVE: function prologue.
#   - saves the callee-saved registers r12-r15
#   - records the post-push stack pointer in r14, which arg7-arg10 and
#     FUNC_RESTORE rely on
#   - reserves VARIABLE_OFFSET bytes of scratch space and rounds rsp down to
#     a 64-byte boundary
# Clobbers: r14, rsp, flags.
2588c2ecf20Sopenharmony_ci.macro FUNC_SAVE
2598c2ecf20Sopenharmony_ci        # the bytes pushed here (4 registers * 8) must equal STACK_OFFSET
2608c2ecf20Sopenharmony_ci        push    %r12
2618c2ecf20Sopenharmony_ci        push    %r13
2628c2ecf20Sopenharmony_ci        push    %r14
2638c2ecf20Sopenharmony_ci        push    %r15
2648c2ecf20Sopenharmony_ci
2658c2ecf20Sopenharmony_ci        mov     %rsp, %r14                    # r14 = frame anchor (top of the save area)
2668c2ecf20Sopenharmony_ci
2678c2ecf20Sopenharmony_ci
2688c2ecf20Sopenharmony_ci
2698c2ecf20Sopenharmony_ci        sub     $VARIABLE_OFFSET, %rsp
2708c2ecf20Sopenharmony_ci        and     $~63, %rsp                    # align rsp to 64 bytes
2718c2ecf20Sopenharmony_ci.endm
2728c2ecf20Sopenharmony_ci
# FUNC_RESTORE: function epilogue matching FUNC_SAVE.
# Reloading rsp from r14 drops the scratch area together with whatever
# padding the 64-byte alignment added, so r14 must still hold the value that
# FUNC_SAVE stored in it.  The registers are then popped in reverse push order.
2738c2ecf20Sopenharmony_ci.macro FUNC_RESTORE
2748c2ecf20Sopenharmony_ci        mov     %r14, %rsp
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci        pop     %r15
2778c2ecf20Sopenharmony_ci        pop     %r14
2788c2ecf20Sopenharmony_ci        pop     %r13
2798c2ecf20Sopenharmony_ci        pop     %r12
2808c2ecf20Sopenharmony_ci.endm
2818c2ecf20Sopenharmony_ci
2828c2ecf20Sopenharmony_ci# Encryption of a single block
# ENCRYPT_SINGLE_BLOCK REP XMM0: AES-encrypt the block held in XMM0, in place.
#   REP   number of vaesenc rounds to emit
#   XMM0  register holding the block on entry and the result on exit
# The round keys are read from arg1: key 0 is XORed in first, keys 1..REP
# feed the vaesenc rounds and key REP+1 feeds vaesenclast.
# Side effects at assembly time: the symbol i is left at REP+1 and setreg
# redefines reg_i / reg_j (neither alias is referenced inside this macro).
2838c2ecf20Sopenharmony_ci.macro ENCRYPT_SINGLE_BLOCK REP XMM0
2848c2ecf20Sopenharmony_ci                vpxor    (arg1), \XMM0, \XMM0
2858c2ecf20Sopenharmony_ci               i = 1
2868c2ecf20Sopenharmony_ci               setreg
2878c2ecf20Sopenharmony_ci.rep \REP
2888c2ecf20Sopenharmony_ci                vaesenc  16*i(arg1), \XMM0, \XMM0
2898c2ecf20Sopenharmony_ci               i = (i+1)
2908c2ecf20Sopenharmony_ci               setreg
2918c2ecf20Sopenharmony_ci.endr
2928c2ecf20Sopenharmony_ci                vaesenclast 16*i(arg1), \XMM0, \XMM0
2938c2ecf20Sopenharmony_ci.endm
2948c2ecf20Sopenharmony_ci
2958c2ecf20Sopenharmony_ci# combined for GCM encrypt and decrypt functions
#
# GCM_ENC_DEC: bulk update step; processes arg5 bytes of input.
#
# Macro parameters:
#   INITIAL_BLOCKS, GHASH_8_ENCRYPT_8_PARALLEL, GHASH_LAST_8, GHASH_MUL
#           names of the stage macros to invoke (defined elsewhere in the file)
#   ENC_DEC ENC or DEC; selects the tail handling at _final_ghash_mul and is
#           forwarded to the stage macros
#   REP     forwarded to the stage macros and to ENCRYPT_SINGLE_BLOCK, where
#           it is the number of vaesenc rounds
#
# Register arguments as used below:
#   arg1 = AES round keys          arg2 = context block (AadHash, InLen, ...)
#   arg3 = output buffer           arg4 = input buffer
#   arg5 = input length in bytes (modified)
#
# Context fields written: InLen, AadHash, CurCount and, when a tail of
# fewer than 16 bytes exists, PBlockLen and PBlockEncKey.
#
2968c2ecf20Sopenharmony_ci# clobbering all xmm registers
2978c2ecf20Sopenharmony_ci# clobbering r10, r11, r12, r13, r14, r15
# also writes rax (tail output), arg5 and the flags
2988c2ecf20Sopenharmony_ci.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        # xmm8 = running hash from the context
2998c2ecf20Sopenharmony_ci        vmovdqu AadHash(arg2), %xmm8
3008c2ecf20Sopenharmony_ci        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
        # accumulate the total input length in the context
3018c2ecf20Sopenharmony_ci        add arg5, InLen(arg2)
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci        # initialize the data pointer offset as zero
3048c2ecf20Sopenharmony_ci        xor     %r11d, %r11d
3058c2ecf20Sopenharmony_ci
        # PARTIAL_BLOCK is defined elsewhere.
        # NOTE(review): presumably it completes a partial block left over by
        # a previous call and leaves the bytes it consumed in r11 - confirm.
3068c2ecf20Sopenharmony_ci        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        # arg5 = bytes still to be processed
3078c2ecf20Sopenharmony_ci        sub %r11, arg5
3088c2ecf20Sopenharmony_ci
3098c2ecf20Sopenharmony_ci        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
3108c2ecf20Sopenharmony_ci        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
3118c2ecf20Sopenharmony_ci
        # r12 = (number of whole blocks) mod 8: these blocks are handled
        # first so that the remaining whole blocks are a multiple of eight
3128c2ecf20Sopenharmony_ci        mov     %r13, %r12
3138c2ecf20Sopenharmony_ci        shr     $4, %r12
3148c2ecf20Sopenharmony_ci        and     $7, %r12
3158c2ecf20Sopenharmony_ci        jz      _initial_num_blocks_is_0\@
3168c2ecf20Sopenharmony_ci
3178c2ecf20Sopenharmony_ci        cmp     $7, %r12
3188c2ecf20Sopenharmony_ci        je      _initial_num_blocks_is_7\@
3198c2ecf20Sopenharmony_ci        cmp     $6, %r12
3208c2ecf20Sopenharmony_ci        je      _initial_num_blocks_is_6\@
3218c2ecf20Sopenharmony_ci        cmp     $5, %r12
3228c2ecf20Sopenharmony_ci        je      _initial_num_blocks_is_5\@
3238c2ecf20Sopenharmony_ci        cmp     $4, %r12
3248c2ecf20Sopenharmony_ci        je      _initial_num_blocks_is_4\@
3258c2ecf20Sopenharmony_ci        cmp     $3, %r12
3268c2ecf20Sopenharmony_ci        je      _initial_num_blocks_is_3\@
3278c2ecf20Sopenharmony_ci        cmp     $2, %r12
3288c2ecf20Sopenharmony_ci        je      _initial_num_blocks_is_2\@
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci        jmp     _initial_num_blocks_is_1\@
3318c2ecf20Sopenharmony_ci
        # One expansion of INITIAL_BLOCKS per possible count (its second
        # parameter is an assembly-time constant); afterwards the handled
        # blocks are removed from the whole-block byte count in r13.
3328c2ecf20Sopenharmony_ci_initial_num_blocks_is_7\@:
3338c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3348c2ecf20Sopenharmony_ci        sub     $16*7, %r13
3358c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3368c2ecf20Sopenharmony_ci
3378c2ecf20Sopenharmony_ci_initial_num_blocks_is_6\@:
3388c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3398c2ecf20Sopenharmony_ci        sub     $16*6, %r13
3408c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3418c2ecf20Sopenharmony_ci
3428c2ecf20Sopenharmony_ci_initial_num_blocks_is_5\@:
3438c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3448c2ecf20Sopenharmony_ci        sub     $16*5, %r13
3458c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3468c2ecf20Sopenharmony_ci
3478c2ecf20Sopenharmony_ci_initial_num_blocks_is_4\@:
3488c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3498c2ecf20Sopenharmony_ci        sub     $16*4, %r13
3508c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci_initial_num_blocks_is_3\@:
3538c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3548c2ecf20Sopenharmony_ci        sub     $16*3, %r13
3558c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3568c2ecf20Sopenharmony_ci
3578c2ecf20Sopenharmony_ci_initial_num_blocks_is_2\@:
3588c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3598c2ecf20Sopenharmony_ci        sub     $16*2, %r13
3608c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3618c2ecf20Sopenharmony_ci
3628c2ecf20Sopenharmony_ci_initial_num_blocks_is_1\@:
3638c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3648c2ecf20Sopenharmony_ci        sub     $16*1, %r13
3658c2ecf20Sopenharmony_ci        jmp     _initial_blocks_encrypted\@
3668c2ecf20Sopenharmony_ci
3678c2ecf20Sopenharmony_ci_initial_num_blocks_is_0\@:
3688c2ecf20Sopenharmony_ci        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
3698c2ecf20Sopenharmony_ci
3708c2ecf20Sopenharmony_ci
        # r13 = remaining whole-block bytes, a multiple of 128 at this point
3718c2ecf20Sopenharmony_ci_initial_blocks_encrypted\@:
3728c2ecf20Sopenharmony_ci        test    %r13, %r13
3738c2ecf20Sopenharmony_ci        je      _zero_cipher_left\@
3748c2ecf20Sopenharmony_ci
        # NOTE(review): exactly 128 bytes left skips the loop and goes
        # straight to GHASH_LAST_8; presumably INITIAL_BLOCKS has already
        # encrypted those eight blocks and only their hash is pending - confirm.
3758c2ecf20Sopenharmony_ci        sub     $128, %r13
3768c2ecf20Sopenharmony_ci        je      _eight_cipher_left\@
3778c2ecf20Sopenharmony_ci
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci
3808c2ecf20Sopenharmony_ci
        # r15d = lowest byte of the counter block; xmm9 is then byte-reversed
        # and stays in that form while the loop runs
3818c2ecf20Sopenharmony_ci        vmovd   %xmm9, %r15d
3828c2ecf20Sopenharmony_ci        and     $255, %r15d
3838c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
3848c2ecf20Sopenharmony_ci
3858c2ecf20Sopenharmony_ci
        # Main loop, 128 bytes per pass.  While the low counter byte can take
        # eight more increments without wrapping (r15d <= 247) the out_order
        # variant is used on the byte-reversed counter; otherwise control goes
        # to _encrypt_by_8, which restores the counter byte order around the
        # in_order variant.
3868c2ecf20Sopenharmony_ci_encrypt_by_8_new\@:
3878c2ecf20Sopenharmony_ci        cmp     $(255-8), %r15d
3888c2ecf20Sopenharmony_ci        jg      _encrypt_by_8\@
3898c2ecf20Sopenharmony_ci
3908c2ecf20Sopenharmony_ci
3918c2ecf20Sopenharmony_ci
        # 8-bit add: r15b mirrors the low counter byte and wraps mod 256
3928c2ecf20Sopenharmony_ci        add     $8, %r15b
3938c2ecf20Sopenharmony_ci        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
3948c2ecf20Sopenharmony_ci        add     $128, %r11
3958c2ecf20Sopenharmony_ci        sub     $128, %r13
3968c2ecf20Sopenharmony_ci        jne     _encrypt_by_8_new\@
3978c2ecf20Sopenharmony_ci
        # loop finished: undo the byte reversal applied before the loop
3988c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
3998c2ecf20Sopenharmony_ci        jmp     _eight_cipher_left\@
4008c2ecf20Sopenharmony_ci
4018c2ecf20Sopenharmony_ci_encrypt_by_8\@:
4028c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
4038c2ecf20Sopenharmony_ci        add     $8, %r15b
4048c2ecf20Sopenharmony_ci        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
4058c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
4068c2ecf20Sopenharmony_ci        add     $128, %r11
4078c2ecf20Sopenharmony_ci        sub     $128, %r13
4088c2ecf20Sopenharmony_ci        jne     _encrypt_by_8_new\@
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
4118c2ecf20Sopenharmony_ci
4128c2ecf20Sopenharmony_ci
4138c2ecf20Sopenharmony_ci
4148c2ecf20Sopenharmony_ci
4158c2ecf20Sopenharmony_ci_eight_cipher_left\@:
4168c2ecf20Sopenharmony_ci        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
4178c2ecf20Sopenharmony_ci
4188c2ecf20Sopenharmony_ci
        # All whole blocks are done: persist the hash (xmm14) and the counter
        # block (xmm9) in the context.
4198c2ecf20Sopenharmony_ci_zero_cipher_left\@:
4208c2ecf20Sopenharmony_ci        vmovdqu %xmm14, AadHash(arg2)
4218c2ecf20Sopenharmony_ci        vmovdqu %xmm9, CurCount(arg2)
4228c2ecf20Sopenharmony_ci
4238c2ecf20Sopenharmony_ci        # check for 0 length
4248c2ecf20Sopenharmony_ci        mov     arg5, %r13
4258c2ecf20Sopenharmony_ci        and     $15, %r13                            # r13 = (arg5 mod 16)
4268c2ecf20Sopenharmony_ci
4278c2ecf20Sopenharmony_ci        je      _multiple_of_16_bytes\@
4288c2ecf20Sopenharmony_ci
4298c2ecf20Sopenharmony_ci        # handle the last <16 Byte block separately
4308c2ecf20Sopenharmony_ci
        # remember the tail length; GCM_COMPLETE tests PBlockLen later
4318c2ecf20Sopenharmony_ci        mov %r13, PBlockLen(arg2)
4328c2ecf20Sopenharmony_ci
4338c2ecf20Sopenharmony_ci        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
4348c2ecf20Sopenharmony_ci        vmovdqu %xmm9, CurCount(arg2)
4358c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
4368c2ecf20Sopenharmony_ci
4378c2ecf20Sopenharmony_ci        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
        # keep the key-stream block for the partial block in the context
4388c2ecf20Sopenharmony_ci        vmovdqu %xmm9, PBlockEncKey(arg2)
4398c2ecf20Sopenharmony_ci
        # With at least 16 bytes processed in this call the tail can be
        # fetched by one 16-byte load that ends at the last input byte;
        # shorter inputs go through READ_PARTIAL_BLOCK (defined elsewhere).
4408c2ecf20Sopenharmony_ci        cmp $16, arg5
4418c2ecf20Sopenharmony_ci        jge _large_enough_update\@
4428c2ecf20Sopenharmony_ci
4438c2ecf20Sopenharmony_ci        lea (arg4,%r11,1), %r10
4448c2ecf20Sopenharmony_ci        mov %r13, %r12
4458c2ecf20Sopenharmony_ci
4468c2ecf20Sopenharmony_ci        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
4478c2ecf20Sopenharmony_ci
4488c2ecf20Sopenharmony_ci        lea     SHIFT_MASK+16(%rip), %r12
4498c2ecf20Sopenharmony_ci        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
4508c2ecf20Sopenharmony_ci                                                     # able to shift 16-r13 bytes (r13 is the
4518c2ecf20Sopenharmony_ci                                                     # number of bytes in plaintext mod 16)
4528c2ecf20Sopenharmony_ci
4538c2ecf20Sopenharmony_ci        jmp _final_ghash_mul\@
4548c2ecf20Sopenharmony_ci
4558c2ecf20Sopenharmony_ci_large_enough_update\@:
        # back r11 up so that the 16-byte load ends exactly at the end of
        # the input
4568c2ecf20Sopenharmony_ci        sub $16, %r11
4578c2ecf20Sopenharmony_ci        add %r13, %r11
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci        # receive the last <16 Byte block
4608c2ecf20Sopenharmony_ci        vmovdqu	(arg4, %r11, 1), %xmm1
4618c2ecf20Sopenharmony_ci
        # restore r11 to the offset of the first tail byte
4628c2ecf20Sopenharmony_ci        sub	%r13, %r11
4638c2ecf20Sopenharmony_ci        add	$16, %r11
4648c2ecf20Sopenharmony_ci
4658c2ecf20Sopenharmony_ci        lea	SHIFT_MASK+16(%rip), %r12
4668c2ecf20Sopenharmony_ci        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
4678c2ecf20Sopenharmony_ci        # (r13 is the number of bytes in plaintext mod 16)
4688c2ecf20Sopenharmony_ci        sub	%r13, %r12
4698c2ecf20Sopenharmony_ci        # get the appropriate shuffle mask
4708c2ecf20Sopenharmony_ci        vmovdqu	(%r12), %xmm2
4718c2ecf20Sopenharmony_ci        # shift right 16-r13 bytes
4728c2ecf20Sopenharmony_ci        vpshufb  %xmm2, %xmm1, %xmm1
4738c2ecf20Sopenharmony_ci
        # Here xmm1 = tail bytes in the low r13 byte positions, xmm9 = E(K, Yn)
        # and r12 = SHIFT_MASK+16-r13, so ALL_F-SHIFT_MASK(%r12) addresses
        # ALL_F+16-r13.  The ciphertext bytes (input when decrypting, output
        # when encrypting) are masked, byte-reversed and XORed into the hash.
        # Only the XOR happens here: the multiplication by the hash key for
        # this partial block is left to GCM_COMPLETE, which invokes its
        # GHASH_MUL parameter when PBlockLen is non-zero.
4748c2ecf20Sopenharmony_ci_final_ghash_mul\@:
4758c2ecf20Sopenharmony_ci        .if  \ENC_DEC ==  DEC
4768c2ecf20Sopenharmony_ci        vmovdqa %xmm1, %xmm2
4778c2ecf20Sopenharmony_ci        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
4788c2ecf20Sopenharmony_ci        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
4798c2ecf20Sopenharmony_ci                                                     # mask out top 16-r13 bytes of xmm9
4808c2ecf20Sopenharmony_ci        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
4818c2ecf20Sopenharmony_ci        vpand   %xmm1, %xmm2, %xmm2
4828c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
4838c2ecf20Sopenharmony_ci        vpxor   %xmm2, %xmm14, %xmm14
4848c2ecf20Sopenharmony_ci
4858c2ecf20Sopenharmony_ci        vmovdqu %xmm14, AadHash(arg2)
4868c2ecf20Sopenharmony_ci        .else
4878c2ecf20Sopenharmony_ci        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
4888c2ecf20Sopenharmony_ci        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
4898c2ecf20Sopenharmony_ci                                                     # mask out top 16-r13 bytes of xmm9
4908c2ecf20Sopenharmony_ci        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
4918c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
4928c2ecf20Sopenharmony_ci        vpxor   %xmm9, %xmm14, %xmm14
4938c2ecf20Sopenharmony_ci
4948c2ecf20Sopenharmony_ci        vmovdqu %xmm14, AadHash(arg2)
4958c2ecf20Sopenharmony_ci        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
4968c2ecf20Sopenharmony_ci        .endif
4978c2ecf20Sopenharmony_ci
4988c2ecf20Sopenharmony_ci
4998c2ecf20Sopenharmony_ci        #############################
5008c2ecf20Sopenharmony_ci        # output r13 Bytes
        # more than 8 bytes: store the low quadword at once, then continue
        # with the high quadword; the rest goes out one byte at a time
5018c2ecf20Sopenharmony_ci        vmovq   %xmm9, %rax
5028c2ecf20Sopenharmony_ci        cmp     $8, %r13
5038c2ecf20Sopenharmony_ci        jle     _less_than_8_bytes_left\@
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ci        mov     %rax, (arg3 , %r11)
5068c2ecf20Sopenharmony_ci        add     $8, %r11
5078c2ecf20Sopenharmony_ci        vpsrldq $8, %xmm9, %xmm9
5088c2ecf20Sopenharmony_ci        vmovq   %xmm9, %rax
5098c2ecf20Sopenharmony_ci        sub     $8, %r13
5108c2ecf20Sopenharmony_ci
5118c2ecf20Sopenharmony_ci_less_than_8_bytes_left\@:
5128c2ecf20Sopenharmony_ci        movb    %al, (arg3 , %r11)
5138c2ecf20Sopenharmony_ci        add     $1, %r11
5148c2ecf20Sopenharmony_ci        shr     $8, %rax
5158c2ecf20Sopenharmony_ci        sub     $1, %r13
5168c2ecf20Sopenharmony_ci        jne     _less_than_8_bytes_left\@
5178c2ecf20Sopenharmony_ci        #############################
5188c2ecf20Sopenharmony_ci
5198c2ecf20Sopenharmony_ci_multiple_of_16_bytes\@:
5208c2ecf20Sopenharmony_ci.endm
5218c2ecf20Sopenharmony_ci
5228c2ecf20Sopenharmony_ci
# GCM_COMPLETE Finishes update of tag of last partial block
#   GHASH_MUL:    GHASH multiply macro to expand
#   REP:          forwarded unchanged to ENCRYPT_SINGLE_BLOCK
#   AUTH_TAG:     operand yielding the destination pointer for the tag
#   AUTH_TAG_LEN: operand yielding the number of tag bytes to store
# Reads AadHash, HashKey, PBlockLen, AadLen, InLen and OrigIV from the
# context at arg2.
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# NOTE(review): whenever AUTH_TAG_LEN != 16 the store sequence writes at
# least 4 bytes, so tag lengths below 4 look unsupported - confirm that
# callers never request them.
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
        vmovdqu AadHash(arg2), %xmm14                # xmm14 = running GHASH value
        vmovdqu HashKey(arg2), %xmm13                # xmm13 = hash key operand for GHASH_MUL

        mov PBlockLen(arg2), %r12
        test %r12, %r12
        je _partial_done\@                           # no buffered partial block

	#GHASH computation for the last <16 Byte block
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        mov AadLen(arg2), %r12                       # r12 = aadLen (number of bytes)
        shl     $3, %r12                             # convert into number of bits
        # Move the whole 64-bit bit count: AadLen is kept as a 64-bit value
        # and a 32-bit move would silently drop its upper half.
        vmovq   %r12, %xmm15                         # len(A) in xmm15

        mov InLen(arg2), %r12
        shl     $3, %r12                             # len(C) in bits
        vmovq   %r12, %xmm1
        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap

        vmovdqu OrigIV(arg2), %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9          # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9                 # xmm9 = tag = E(K, Y0) XOR GHASH



_return_T\@:
        mov     \AUTH_TAG, %r10                      # r10 = authTag
        mov     \AUTH_TAG_LEN, %r11                  # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16_\@

        cmp     $8, %r11
        jl      _T_4_\@

        # Labels below use a "_" separator before \@ so that a label ending
        # in a digit can never alias another label from a different expansion.
_T_8_\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)                         # store tag bytes 0-7
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9                     # bring the remaining tag bytes down
        test    %r11, %r11
        je     _return_T_done\@
_T_4_\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)                         # store the next 4 tag bytes
        add     $4, %r10
        sub     $4, %r11
        vpsrldq     $4, %xmm9, %xmm9
        test    %r11, %r11
        je     _return_T_done\@
_T_123_\@:
        vmovd     %xmm9, %eax                        # 1 to 3 tag bytes still to store
        cmp     $2, %r11
        jl     _T_1_\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je     _return_T_done\@
        add     $2, %r10
        sar     $16, %eax                            # third byte now in %al
_T_1_\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16_\@:
        vmovdqu %xmm9, (%r10)                        # full 16-byte tag in one store

_return_T_done\@:
.endm
6048c2ecf20Sopenharmony_ci
# CALC_AAD_HASH: GHASH the additional authenticated data and store the
# result in AadHash(arg2).
#   GHASH_MUL: GHASH multiply macro to expand
#   AAD:       operand yielding the AAD pointer
#   AADLEN:    operand yielding the AAD length in bytes
#   T2:        handed to \GHASH_MUL as the hash key operand; this macro
#              itself never writes it
#   T1,T3-T6:  scratch xmm registers (T3-T6 are handed to \GHASH_MUL)
#   T7,T8:     xmm registers holding the current block / running hash
# Clobbers r10, r11, r12, flags, and rax on the 4-byte tail path.
# The tail path reads whole 8-byte/4-byte words and can therefore touch up
# to 3 bytes past the end of the AAD (see the comment at _get_AAD_rest8).
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

	mov     \AAD, %r10                      # r10 = AAD
	mov     \AADLEN, %r12                   # r12 = aadLen


	mov     %r12, %r11                      # r11 = bytes still to consume

	vpxor   \T8, \T8, \T8                   # T8 = running hash, starts at 0
	vpxor   \T7, \T7, \T7
	cmp     $16, %r11
	jl      _get_AAD_rest8\@
_get_AAD_blocks\@:
	vmovdqu (%r10), \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T7, \T8, \T8
	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
	add     $16, %r10
	sub     $16, %r12                       # r12 ends up as aadLen mod 16
	sub     $16, %r11
	cmp     $16, %r11
	jge     _get_AAD_blocks\@
	vmovdqu \T8, \T7                        # result so far, in case we are done
	test    %r11, %r11
	je      _get_AAD_done\@

	vpxor   \T7, \T7, \T7

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
	cmp     $4, %r11
	jle     _get_AAD_rest4\@
	vmovq   (%r10), \T1                     # VEX form: do not mix legacy SSE with AVX
	add     $8, %r10
	sub     $8, %r11
	vpslldq $8, \T1, \T1
	vpsrldq $8, \T7, \T7
	vpxor   \T1, \T7, \T7
	jmp     _get_AAD_rest8\@
_get_AAD_rest4\@:
	test    %r11, %r11
	jle      _get_AAD_rest0\@
	mov     (%r10), %eax
	vmovq   %rax, \T1
	add     $4, %r10
	sub     $4, %r11
	vpslldq $12, \T1, \T1
	vpsrldq $4, \T7, \T7
	vpxor   \T1, \T7, \T7
_get_AAD_rest0\@:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq    %r12, %r11
	salq    $4, %r11                        # r11 = 16 * (aadLen mod 16)
	# The AAD pointer in r10 is no longer needed, so reuse r10 as the
	# table base. RIP-relative addressing cannot take an index register,
	# hence the separate lea.
	lea     aad_shift_arr(%rip), %r10
	vmovdqu  (%r10, %r11), \T1
	vpshufb \T1, \T7, \T7
_get_AAD_rest_final\@:
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T8, \T7, \T7
	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm
6728c2ecf20Sopenharmony_ci
# INIT: reset the per-request GCM state in the context at arg2, derive
# HashKey<<1 mod poly, hash the AAD and expand \PRECOMPUTE.
#   GHASH_MUL:  GHASH multiply macro, forwarded to CALC_AAD_HASH
#   PRECOMPUTE: macro expanded last with xmm6 (HashKey<<1 mod poly) and
#               xmm0-xmm5 as its arguments
# Uses arg3 as the IV pointer, arg4 as the hash subkey pointer, arg5 as the
# AAD pointer and arg6 as the AAD length.
# Clobbers rax, r11, xmm0-xmm7, plus whatever CALC_AAD_HASH and \PRECOMPUTE
# clobber.
.macro INIT GHASH_MUL PRECOMPUTE
        mov arg6, %r11
        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
        xor %r11d, %r11d
        mov %r11, InLen(arg2) # ctx_data.in_length = 0

        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
        # 64-bit store: only the low 8 bytes of partial_block_enc_key are cleared
        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
        mov arg3, %rax
        # VEX-encoded moves: avoid mixing legacy SSE with the AVX code around them
        vmovdqu (%rax), %xmm0
        vmovdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        vmovdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = byte-swapped iv

        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6           # shift each qword left by one bit
        vpsrlq   $63, %xmm2, %xmm2          # xmm2 = bit shifted out of each qword
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2           # carry from the low qword into the high one
        vpsrldq  $8, %xmm1, %xmm1           # xmm1 = bit shifted out of the high qword
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
7118c2ecf20Sopenharmony_ci
7128c2ecf20Sopenharmony_ci
# READ_PARTIAL_BLOCK: load DLEN bytes starting at DPTR into the low bytes of
# XMMDst and zero the remaining bytes, using only an 8-byte load (when
# DLEN >= 8) and single-byte loads that stay inside the DLEN-byte range.
# Requires 0 < DLEN < 16.
# Clobbers %rax, DLEN (DLEN is counted down to zero)
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
        vpxor   \XMMDst, \XMMDst, \XMMDst       # start from an all-zero block

        cmp     $8, \DLEN
        jl      _rpb_short_\@                   # fewer than 8 bytes: bytes only
        movq    (\DPTR), %rax                   # bytes 0-7 in one load
        vpinsrq $0, %rax, \XMMDst, \XMMDst
        sub     $8, \DLEN                       # DLEN = bytes left beyond the first 8
        jz      _rpb_done_\@
        xorl    %eax, %eax
        # Gather bytes 8.. from the highest address downwards so that the
        # byte at the lowest address ends up in the low byte of rax.
_rpb_upper_byte_\@:
        shlq    $8, %rax
        movb    7(\DPTR, \DLEN, 1), %al
        dec     \DLEN
        jnz     _rpb_upper_byte_\@
        vpinsrq $1, %rax, \XMMDst, \XMMDst
        jmp     _rpb_done_\@
_rpb_short_\@:
        xorl    %eax, %eax
        # Same gathering scheme for a block shorter than 8 bytes.
_rpb_lower_byte_\@:
        shlq    $8, %rax
        movb    -1(\DPTR, \DLEN, 1), %al
        dec     \DLEN
        jnz     _rpb_lower_byte_\@
        vpinsrq $0, %rax, \XMMDst, \XMMDst
_rpb_done_\@:
.endm
7438c2ecf20Sopenharmony_ci
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
#   GHASH_MUL:      GHASH multiply macro to expand
#   CYPH_PLAIN_OUT: output buffer base register
#   PLAIN_CYPH_IN:  input buffer base register
#   PLAIN_CYPH_LEN: register holding the number of input bytes
#   DATA_OFFSET:    register holding the current offset into both buffers;
#                   advanced by the number of bytes written
#   AAD_HASH:       xmm register holding the running GHASH value
#   ENC_DEC:        ENC or DEC
# Does nothing when PBlockLen(arg2) is zero.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
        mov 	PBlockLen(arg2), %r13
        test	%r13, %r13
        je	_partial_block_done_\@	# Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp	$16, \PLAIN_CYPH_LEN
        jl	_fewer_than_16_bytes_\@
        # At least 16 bytes available: just fill xmm. Read at DATA_OFFSET,
        # matching the short-input path and the stores below.
        vmovdqu	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %xmm1
        jmp	_data_read_\@

_fewer_than_16_bytes_\@:
        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov	\PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        mov PBlockLen(arg2), %r13

_data_read_\@:				# Finished reading in data

        vmovdqu	PBlockEncKey(arg2), %xmm9
        vmovdqu	HashKey(arg2), %xmm13

        lea	SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
        add	%r13, %r12
        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes

.if  \ENC_DEC ==  DEC
        vmovdqa	%xmm1, %xmm3		# keep the ciphertext for the hash
        vpxor	%xmm1, %xmm9, %xmm9	# Cyphertext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine if the partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_1_\@
        sub	%r10, %r12
_no_extra_mask_1_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9

        vpand	%xmm1, %xmm3, %xmm3
        vmovdqa	SHUF_MASK(%rip), %xmm10
        vpshufb	%xmm10, %xmm3, %xmm3
        vpshufb	%xmm2, %xmm3, %xmm3
        vpxor	%xmm3, \AAD_HASH, \AAD_HASH

        test	%r10, %r10
        jl	_partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)	# block completed: nothing buffered any more
        jmp	_dec_done_\@
_partial_incomplete_1_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)	# still partial: grow the buffered length
_dec_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)
.else
        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine if the partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_2_\@
        sub	%r10, %r12
_no_extra_mask_2_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand	%xmm1, %xmm9, %xmm9

        vmovdqa	SHUF_MASK(%rip), %xmm1
        vpshufb %xmm1, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
        vpxor	%xmm9, \AAD_HASH, \AAD_HASH

        test	%r10, %r10
        jl	_partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)	# block completed: nothing buffered any more
        jmp	_encode_done_\@
_partial_incomplete_2_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)	# still partial: grow the buffered length
_encode_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)

        vmovdqa	SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        vpshufb	%xmm10, %xmm9, %xmm9
        vpshufb	%xmm2, %xmm9, %xmm9
.endif
        # output encrypted Bytes
        test	%r10, %r10
        jl	_partial_fill_\@
        mov	%r13, %r12
        mov	$16, %r13
        # Set r13 to be the number of bytes to write out
        sub	%r12, %r13
        jmp	_count_set_\@
_partial_fill_\@:
        mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
        vmovdqa	%xmm9, %xmm0
        vmovq	%xmm0, %rax
        cmp	$8, %r13
        jle	_less_than_8_bytes_left_\@

        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$8, \DATA_OFFSET
        vpsrldq	$8, %xmm0, %xmm0	# VEX form: do not mix legacy SSE with AVX
        vmovq	%xmm0, %rax
        sub	$8, %r13
_less_than_8_bytes_left_\@:
        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$1, \DATA_OFFSET
        shr	$8, %rax
        sub	$1, %r13
        jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK
8878c2ecf20Sopenharmony_ci
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
#
# GH:    xmm register, first operand on entry and result on exit
# HK:    xmm register, second operand; only read here
# T1-T5: xmm scratch registers, overwritten
###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        vpshufd         $0b01001110, \GH, \T2   # T2 = GH with its qwords swapped
        vpshufd         $0b01001110, \HK, \T3   # T3 = HK with its qwords swapped
        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2,\T2
        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0

        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
        vpxor           \T3, \GH, \GH
        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK

        #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed dword left shift << 31
        vpslld  $30, \GH, \T3                   # packed dword left shift << 30
        vpslld  $25, \GH, \T4                   # packed dword left shift << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW

        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete

        #second phase of the reduction

        vpsrld  $1,\GH, \T2                     # packed dword right shift >> 1
        vpsrld  $2,\GH, \T3                     # packed dword right shift >> 2
        vpsrld  $7,\GH, \T4                     # packed dword right shift >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T5, \T2, \T2
        vpxor   \T2, \GH, \GH
        vpxor   \T1, \GH, \GH                   # the result is in GH


.endm
9408c2ecf20Sopenharmony_ci
# PRECOMPUTE_AVX_STEP: private helper for PRECOMPUTE_AVX.
# Multiplies PWR by HK with GHASH_MUL_AVX, stores the product at
# OFF_PWR(arg2), then stores the XOR of the product with its qword-swapped
# copy at OFF_KSUM(arg2). KSUM and S1-S4 are xmm scratch registers.
.macro PRECOMPUTE_AVX_STEP PWR HK KSUM S1 S2 S3 S4 OFF_PWR OFF_KSUM
        GHASH_MUL_AVX \PWR, \HK, \KSUM, \S1, \S2, \S3, \S4
        vmovdqu  \PWR, \OFF_PWR(arg2)
        vpshufd  $0b01001110, \PWR, \KSUM
        vpxor    \PWR, \KSUM, \KSUM
        vmovdqu  \KSUM, \OFF_KSUM(arg2)
.endm

# PRECOMPUTE_AVX: starting from HK, fill the HashKey_2..HashKey_8 slots of
# the context at arg2 with successive GHASH_MUL_AVX products by HK
# (HashKey^i<<1 mod poly), and the HashKey_k..HashKey_8_k slots with the XOR
# of the low and high qwords of each power.
#   HK:    xmm register holding HashKey<<1 mod poly; only read
#   T1-T6: xmm scratch registers, overwritten
.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # T5 walks through the powers; it starts out as HashKey itself
        vmovdqa  \HK, \T5

        # HashKey_k = low qword XOR high qword of HashKey
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_k(arg2)

        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_2, HashKey_2_k  # HashKey^2<<1 mod poly
        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_3, HashKey_3_k  # HashKey^3<<1 mod poly
        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_4, HashKey_4_k  # HashKey^4<<1 mod poly
        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_5, HashKey_5_k  # HashKey^5<<1 mod poly
        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_6, HashKey_6_k  # HashKey^6<<1 mod poly
        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_7, HashKey_7_k  # HashKey^7<<1 mod poly
        PRECOMPUTE_AVX_STEP \T5, \HK, \T1, \T3, \T4, \T6, \T2, HashKey_8, HashKey_8_k  # HashKey^8<<1 mod poly

.endm
9938c2ecf20Sopenharmony_ci
9948c2ecf20Sopenharmony_ci## if a = number of total plaintext bytes
9958c2ecf20Sopenharmony_ci## b = floor(a/16)
9968c2ecf20Sopenharmony_ci## num_initial_blocks = b mod 4#
9978c2ecf20Sopenharmony_ci## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
9988c2ecf20Sopenharmony_ci## r10, r11, r12, rax are clobbered
9998c2ecf20Sopenharmony_ci## arg1, arg3, arg4, r14 are used as a pointer only, not modified
10008c2ecf20Sopenharmony_ci
10018c2ecf20Sopenharmony_ci.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
10028c2ecf20Sopenharmony_ci	i = (8-\num_initial_blocks)
10038c2ecf20Sopenharmony_ci	setreg
10048c2ecf20Sopenharmony_ci        vmovdqu AadHash(arg2), reg_i
10058c2ecf20Sopenharmony_ci
10068c2ecf20Sopenharmony_ci	# start AES for num_initial_blocks blocks
10078c2ecf20Sopenharmony_ci	vmovdqu CurCount(arg2), \CTR
10088c2ecf20Sopenharmony_ci
10098c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
10108c2ecf20Sopenharmony_ci	setreg
10118c2ecf20Sopenharmony_ci.rep \num_initial_blocks
10128c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
10138c2ecf20Sopenharmony_ci                vmovdqa \CTR, reg_i
10148c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
10158c2ecf20Sopenharmony_ci	i = (i+1)
10168c2ecf20Sopenharmony_ci	setreg
10178c2ecf20Sopenharmony_ci.endr
10188c2ecf20Sopenharmony_ci
10198c2ecf20Sopenharmony_ci	vmovdqa  (arg1), \T_key
10208c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
10218c2ecf20Sopenharmony_ci	setreg
10228c2ecf20Sopenharmony_ci.rep \num_initial_blocks
10238c2ecf20Sopenharmony_ci                vpxor   \T_key, reg_i, reg_i
10248c2ecf20Sopenharmony_ci	i = (i+1)
10258c2ecf20Sopenharmony_ci	setreg
10268c2ecf20Sopenharmony_ci.endr
10278c2ecf20Sopenharmony_ci
10288c2ecf20Sopenharmony_ci       j = 1
10298c2ecf20Sopenharmony_ci       setreg
10308c2ecf20Sopenharmony_ci.rep \REP
10318c2ecf20Sopenharmony_ci       vmovdqa  16*j(arg1), \T_key
10328c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
10338c2ecf20Sopenharmony_ci	setreg
10348c2ecf20Sopenharmony_ci.rep \num_initial_blocks
10358c2ecf20Sopenharmony_ci        vaesenc \T_key, reg_i, reg_i
10368c2ecf20Sopenharmony_ci	i = (i+1)
10378c2ecf20Sopenharmony_ci	setreg
10388c2ecf20Sopenharmony_ci.endr
10398c2ecf20Sopenharmony_ci
10408c2ecf20Sopenharmony_ci       j = (j+1)
10418c2ecf20Sopenharmony_ci       setreg
10428c2ecf20Sopenharmony_ci.endr
10438c2ecf20Sopenharmony_ci
10448c2ecf20Sopenharmony_ci	vmovdqa  16*j(arg1), \T_key
10458c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
10468c2ecf20Sopenharmony_ci	setreg
10478c2ecf20Sopenharmony_ci.rep \num_initial_blocks
10488c2ecf20Sopenharmony_ci        vaesenclast      \T_key, reg_i, reg_i
10498c2ecf20Sopenharmony_ci	i = (i+1)
10508c2ecf20Sopenharmony_ci	setreg
10518c2ecf20Sopenharmony_ci.endr
10528c2ecf20Sopenharmony_ci
10538c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
10548c2ecf20Sopenharmony_ci	setreg
10558c2ecf20Sopenharmony_ci.rep \num_initial_blocks
10568c2ecf20Sopenharmony_ci                vmovdqu (arg4, %r11), \T1
10578c2ecf20Sopenharmony_ci                vpxor   \T1, reg_i, reg_i
10588c2ecf20Sopenharmony_ci                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
10598c2ecf20Sopenharmony_ci                add     $16, %r11
10608c2ecf20Sopenharmony_ci.if  \ENC_DEC == DEC
10618c2ecf20Sopenharmony_ci                vmovdqa \T1, reg_i
10628c2ecf20Sopenharmony_ci.endif
10638c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
10648c2ecf20Sopenharmony_ci	i = (i+1)
10658c2ecf20Sopenharmony_ci	setreg
10668c2ecf20Sopenharmony_ci.endr
10678c2ecf20Sopenharmony_ci
10688c2ecf20Sopenharmony_ci
10698c2ecf20Sopenharmony_ci	i = (8-\num_initial_blocks)
10708c2ecf20Sopenharmony_ci	j = (9-\num_initial_blocks)
10718c2ecf20Sopenharmony_ci	setreg
10728c2ecf20Sopenharmony_ci
10738c2ecf20Sopenharmony_ci.rep \num_initial_blocks
10748c2ecf20Sopenharmony_ci        vpxor    reg_i, reg_j, reg_j
10758c2ecf20Sopenharmony_ci        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
10768c2ecf20Sopenharmony_ci	i = (i+1)
10778c2ecf20Sopenharmony_ci	j = (j+1)
10788c2ecf20Sopenharmony_ci	setreg
10798c2ecf20Sopenharmony_ci.endr
10808c2ecf20Sopenharmony_ci        # XMM8 has the combined result here
10818c2ecf20Sopenharmony_ci
10828c2ecf20Sopenharmony_ci        vmovdqa  \XMM8, TMP1(%rsp)
10838c2ecf20Sopenharmony_ci        vmovdqa  \XMM8, \T3
10848c2ecf20Sopenharmony_ci
10858c2ecf20Sopenharmony_ci        cmp     $128, %r13
10868c2ecf20Sopenharmony_ci        jl      _initial_blocks_done\@                  # no need for precomputed constants
10878c2ecf20Sopenharmony_ci
10888c2ecf20Sopenharmony_ci###############################################################################
10898c2ecf20Sopenharmony_ci# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
10908c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
10918c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM1
10928c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
10938c2ecf20Sopenharmony_ci
10948c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
10958c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM2
10968c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
10978c2ecf20Sopenharmony_ci
10988c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
10998c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM3
11008c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
11018c2ecf20Sopenharmony_ci
11028c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
11038c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM4
11048c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
11058c2ecf20Sopenharmony_ci
11068c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
11078c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM5
11088c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
11098c2ecf20Sopenharmony_ci
11108c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
11118c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM6
11128c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
11138c2ecf20Sopenharmony_ci
11148c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
11158c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM7
11168c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
11178c2ecf20Sopenharmony_ci
11188c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
11198c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM8
11208c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
11218c2ecf20Sopenharmony_ci
11228c2ecf20Sopenharmony_ci                vmovdqa  (arg1), \T_key
11238c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM1, \XMM1
11248c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM2, \XMM2
11258c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM3, \XMM3
11268c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM4, \XMM4
11278c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM5, \XMM5
11288c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM6, \XMM6
11298c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM7, \XMM7
11308c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM8, \XMM8
11318c2ecf20Sopenharmony_ci
11328c2ecf20Sopenharmony_ci               i = 1
11338c2ecf20Sopenharmony_ci               setreg
11348c2ecf20Sopenharmony_ci.rep    \REP       # do REP rounds
11358c2ecf20Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
11368c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM1, \XMM1
11378c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM2, \XMM2
11388c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM3, \XMM3
11398c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM4, \XMM4
11408c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM5, \XMM5
11418c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM6, \XMM6
11428c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM7, \XMM7
11438c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM8, \XMM8
11448c2ecf20Sopenharmony_ci               i = (i+1)
11458c2ecf20Sopenharmony_ci               setreg
11468c2ecf20Sopenharmony_ci.endr
11478c2ecf20Sopenharmony_ci
11488c2ecf20Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
11498c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM1, \XMM1
11508c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM2, \XMM2
11518c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM3, \XMM3
11528c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM4, \XMM4
11538c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM5, \XMM5
11548c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM6, \XMM6
11558c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM7, \XMM7
11568c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM8, \XMM8
11578c2ecf20Sopenharmony_ci
11588c2ecf20Sopenharmony_ci                vmovdqu  (arg4, %r11), \T1
11598c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM1, \XMM1
11608c2ecf20Sopenharmony_ci                vmovdqu  \XMM1, (arg3 , %r11)
11618c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
11628c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM1
11638c2ecf20Sopenharmony_ci                .endif
11648c2ecf20Sopenharmony_ci
11658c2ecf20Sopenharmony_ci                vmovdqu  16*1(arg4, %r11), \T1
11668c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM2, \XMM2
11678c2ecf20Sopenharmony_ci                vmovdqu  \XMM2, 16*1(arg3 , %r11)
11688c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
11698c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM2
11708c2ecf20Sopenharmony_ci                .endif
11718c2ecf20Sopenharmony_ci
11728c2ecf20Sopenharmony_ci                vmovdqu  16*2(arg4, %r11), \T1
11738c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM3, \XMM3
11748c2ecf20Sopenharmony_ci                vmovdqu  \XMM3, 16*2(arg3 , %r11)
11758c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
11768c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM3
11778c2ecf20Sopenharmony_ci                .endif
11788c2ecf20Sopenharmony_ci
11798c2ecf20Sopenharmony_ci                vmovdqu  16*3(arg4, %r11), \T1
11808c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM4, \XMM4
11818c2ecf20Sopenharmony_ci                vmovdqu  \XMM4, 16*3(arg3 , %r11)
11828c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
11838c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM4
11848c2ecf20Sopenharmony_ci                .endif
11858c2ecf20Sopenharmony_ci
11868c2ecf20Sopenharmony_ci                vmovdqu  16*4(arg4, %r11), \T1
11878c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM5, \XMM5
11888c2ecf20Sopenharmony_ci                vmovdqu  \XMM5, 16*4(arg3 , %r11)
11898c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
11908c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM5
11918c2ecf20Sopenharmony_ci                .endif
11928c2ecf20Sopenharmony_ci
11938c2ecf20Sopenharmony_ci                vmovdqu  16*5(arg4, %r11), \T1
11948c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM6, \XMM6
11958c2ecf20Sopenharmony_ci                vmovdqu  \XMM6, 16*5(arg3 , %r11)
11968c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
11978c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM6
11988c2ecf20Sopenharmony_ci                .endif
11998c2ecf20Sopenharmony_ci
12008c2ecf20Sopenharmony_ci                vmovdqu  16*6(arg4, %r11), \T1
12018c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM7, \XMM7
12028c2ecf20Sopenharmony_ci                vmovdqu  \XMM7, 16*6(arg3 , %r11)
12038c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
12048c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM7
12058c2ecf20Sopenharmony_ci                .endif
12068c2ecf20Sopenharmony_ci
12078c2ecf20Sopenharmony_ci                vmovdqu  16*7(arg4, %r11), \T1
12088c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM8, \XMM8
12098c2ecf20Sopenharmony_ci                vmovdqu  \XMM8, 16*7(arg3 , %r11)
12108c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
12118c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM8
12128c2ecf20Sopenharmony_ci                .endif
12138c2ecf20Sopenharmony_ci
12148c2ecf20Sopenharmony_ci                add     $128, %r11
12158c2ecf20Sopenharmony_ci
12168c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
12178c2ecf20Sopenharmony_ci                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
12188c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
12198c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
12208c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
12218c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
12228c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
12238c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
12248c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
12258c2ecf20Sopenharmony_ci
12268c2ecf20Sopenharmony_ci###############################################################################
12278c2ecf20Sopenharmony_ci
12288c2ecf20Sopenharmony_ci_initial_blocks_done\@:
12298c2ecf20Sopenharmony_ci
12308c2ecf20Sopenharmony_ci.endm
12318c2ecf20Sopenharmony_ci
12328c2ecf20Sopenharmony_ci# encrypt 8 blocks at a time
12338c2ecf20Sopenharmony_ci# ghash the 8 previously encrypted ciphertext blocks
12348c2ecf20Sopenharmony_ci# arg1, arg2, arg3, arg4 are used as pointers only, not modified
12358c2ecf20Sopenharmony_ci# r11 is the data offset value
12368c2ecf20Sopenharmony_ci.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
12378c2ecf20Sopenharmony_ci
12388c2ecf20Sopenharmony_ci        vmovdqa \XMM1, \T2
12398c2ecf20Sopenharmony_ci        vmovdqa \XMM2, TMP2(%rsp)
12408c2ecf20Sopenharmony_ci        vmovdqa \XMM3, TMP3(%rsp)
12418c2ecf20Sopenharmony_ci        vmovdqa \XMM4, TMP4(%rsp)
12428c2ecf20Sopenharmony_ci        vmovdqa \XMM5, TMP5(%rsp)
12438c2ecf20Sopenharmony_ci        vmovdqa \XMM6, TMP6(%rsp)
12448c2ecf20Sopenharmony_ci        vmovdqa \XMM7, TMP7(%rsp)
12458c2ecf20Sopenharmony_ci        vmovdqa \XMM8, TMP8(%rsp)
12468c2ecf20Sopenharmony_ci
12478c2ecf20Sopenharmony_ci.if \loop_idx == in_order
12488c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
12498c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM1, \XMM2
12508c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM2, \XMM3
12518c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM3, \XMM4
12528c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM4, \XMM5
12538c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM5, \XMM6
12548c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM6, \XMM7
12558c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM7, \XMM8
12568c2ecf20Sopenharmony_ci                vmovdqa \XMM8, \CTR
12578c2ecf20Sopenharmony_ci
12588c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
12598c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
12608c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
12618c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
12628c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
12638c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
12648c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
12658c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
12668c2ecf20Sopenharmony_ci.else
12678c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
12688c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM1, \XMM2
12698c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM2, \XMM3
12708c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM3, \XMM4
12718c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM4, \XMM5
12728c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM5, \XMM6
12738c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM6, \XMM7
12748c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM7, \XMM8
12758c2ecf20Sopenharmony_ci                vmovdqa \XMM8, \CTR
12768c2ecf20Sopenharmony_ci.endif
12778c2ecf20Sopenharmony_ci
12788c2ecf20Sopenharmony_ci
12798c2ecf20Sopenharmony_ci        #######################################################################
12808c2ecf20Sopenharmony_ci
12818c2ecf20Sopenharmony_ci                vmovdqu (arg1), \T1
12828c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM1, \XMM1
12838c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM2, \XMM2
12848c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM3, \XMM3
12858c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM4, \XMM4
12868c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM5, \XMM5
12878c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM6, \XMM6
12888c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM7, \XMM7
12898c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM8, \XMM8
12908c2ecf20Sopenharmony_ci
12918c2ecf20Sopenharmony_ci        #######################################################################
12928c2ecf20Sopenharmony_ci
12938c2ecf20Sopenharmony_ci
12948c2ecf20Sopenharmony_ci
12958c2ecf20Sopenharmony_ci
12968c2ecf20Sopenharmony_ci
12978c2ecf20Sopenharmony_ci                vmovdqu 16*1(arg1), \T1
12988c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
12998c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
13008c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
13018c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
13028c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
13038c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
13048c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
13058c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
13068c2ecf20Sopenharmony_ci
13078c2ecf20Sopenharmony_ci                vmovdqu 16*2(arg1), \T1
13088c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
13098c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
13108c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
13118c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
13128c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
13138c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
13148c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
13158c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
13168c2ecf20Sopenharmony_ci
13178c2ecf20Sopenharmony_ci
13188c2ecf20Sopenharmony_ci        #######################################################################
13198c2ecf20Sopenharmony_ci
13208c2ecf20Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
13218c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
13228c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
13238c2ecf20Sopenharmony_ci
13248c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T2, \T6
13258c2ecf20Sopenharmony_ci        vpxor           \T2, \T6, \T6
13268c2ecf20Sopenharmony_ci
13278c2ecf20Sopenharmony_ci        vmovdqu         HashKey_8_k(arg2), \T5
13288c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T6, \T6
13298c2ecf20Sopenharmony_ci
13308c2ecf20Sopenharmony_ci                vmovdqu 16*3(arg1), \T1
13318c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
13328c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
13338c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
13348c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
13358c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
13368c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
13378c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
13388c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci        vmovdqa         TMP2(%rsp), \T1
13418c2ecf20Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
13428c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
13438c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
13448c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
13458c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
13468c2ecf20Sopenharmony_ci
13478c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
13488c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
13498c2ecf20Sopenharmony_ci        vmovdqu         HashKey_7_k(arg2), \T5
13508c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
13518c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
13528c2ecf20Sopenharmony_ci
13538c2ecf20Sopenharmony_ci                vmovdqu 16*4(arg1), \T1
13548c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
13558c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
13568c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
13578c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
13588c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
13598c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
13608c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
13618c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
13628c2ecf20Sopenharmony_ci
13638c2ecf20Sopenharmony_ci        #######################################################################
13648c2ecf20Sopenharmony_ci
13658c2ecf20Sopenharmony_ci        vmovdqa         TMP3(%rsp), \T1
13668c2ecf20Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
13678c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
13688c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
13698c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
13708c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
13718c2ecf20Sopenharmony_ci
13728c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
13738c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
13748c2ecf20Sopenharmony_ci        vmovdqu         HashKey_6_k(arg2), \T5
13758c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
13768c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
13778c2ecf20Sopenharmony_ci
13788c2ecf20Sopenharmony_ci                vmovdqu 16*5(arg1), \T1
13798c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
13808c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
13818c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
13828c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
13838c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
13848c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
13858c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
13868c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
13878c2ecf20Sopenharmony_ci
13888c2ecf20Sopenharmony_ci        vmovdqa         TMP4(%rsp), \T1
13898c2ecf20Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
13908c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
13918c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
13928c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
13938c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
13948c2ecf20Sopenharmony_ci
13958c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
13968c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
13978c2ecf20Sopenharmony_ci        vmovdqu         HashKey_5_k(arg2), \T5
13988c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
13998c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
14008c2ecf20Sopenharmony_ci
14018c2ecf20Sopenharmony_ci                vmovdqu 16*6(arg1), \T1
14028c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
14038c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
14048c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
14058c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
14068c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
14078c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
14088c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
14098c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
14108c2ecf20Sopenharmony_ci
14118c2ecf20Sopenharmony_ci
14128c2ecf20Sopenharmony_ci        vmovdqa         TMP5(%rsp), \T1
14138c2ecf20Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
14148c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
14158c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
14168c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
14178c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
14188c2ecf20Sopenharmony_ci
14198c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
14208c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
14218c2ecf20Sopenharmony_ci        vmovdqu         HashKey_4_k(arg2), \T5
14228c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
14238c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
14248c2ecf20Sopenharmony_ci
14258c2ecf20Sopenharmony_ci                vmovdqu 16*7(arg1), \T1
14268c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
14278c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
14288c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
14298c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
14308c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
14318c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
14328c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
14338c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
14348c2ecf20Sopenharmony_ci
14358c2ecf20Sopenharmony_ci        vmovdqa         TMP6(%rsp), \T1
14368c2ecf20Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
14378c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
14388c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
14398c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
14408c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
14418c2ecf20Sopenharmony_ci
14428c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
14438c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
14448c2ecf20Sopenharmony_ci        vmovdqu         HashKey_3_k(arg2), \T5
14458c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
14468c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
14478c2ecf20Sopenharmony_ci
14488c2ecf20Sopenharmony_ci
14498c2ecf20Sopenharmony_ci                vmovdqu 16*8(arg1), \T1
14508c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
14518c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
14528c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
14538c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
14548c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
14558c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
14568c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
14578c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
14588c2ecf20Sopenharmony_ci
14598c2ecf20Sopenharmony_ci        vmovdqa         TMP7(%rsp), \T1
14608c2ecf20Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
14618c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
14628c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
14638c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
14648c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
14658c2ecf20Sopenharmony_ci
14668c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
14678c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
14688c2ecf20Sopenharmony_ci        vmovdqu         HashKey_2_k(arg2), \T5
14698c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
14708c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
14718c2ecf20Sopenharmony_ci
14728c2ecf20Sopenharmony_ci        #######################################################################
14738c2ecf20Sopenharmony_ci
14748c2ecf20Sopenharmony_ci                vmovdqu 16*9(arg1), \T5
14758c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM1, \XMM1
14768c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM2, \XMM2
14778c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM3, \XMM3
14788c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM4, \XMM4
14798c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM5, \XMM5
14808c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM6, \XMM6
14818c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM7, \XMM7
14828c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM8, \XMM8
14838c2ecf20Sopenharmony_ci
14848c2ecf20Sopenharmony_ci        vmovdqa         TMP8(%rsp), \T1
14858c2ecf20Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
14868c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
14878c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
14888c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
14898c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
14908c2ecf20Sopenharmony_ci
14918c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T1, \T3
14928c2ecf20Sopenharmony_ci        vpxor           \T1, \T3, \T3
14938c2ecf20Sopenharmony_ci        vmovdqu         HashKey_k(arg2), \T5
14948c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T3, \T3
14958c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
14968c2ecf20Sopenharmony_ci
14978c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
14988c2ecf20Sopenharmony_ci        vpxor           \T7, \T6, \T6
14998c2ecf20Sopenharmony_ci
15008c2ecf20Sopenharmony_ci                vmovdqu 16*10(arg1), \T5
15018c2ecf20Sopenharmony_ci
15028c2ecf20Sopenharmony_ci        i = 11
15038c2ecf20Sopenharmony_ci        setreg
15048c2ecf20Sopenharmony_ci.rep (\REP-9)
15058c2ecf20Sopenharmony_ci
15068c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM1, \XMM1
15078c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM2, \XMM2
15088c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM3, \XMM3
15098c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM4, \XMM4
15108c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM5, \XMM5
15118c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM6, \XMM6
15128c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM7, \XMM7
15138c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM8, \XMM8
15148c2ecf20Sopenharmony_ci
15158c2ecf20Sopenharmony_ci        vmovdqu 16*i(arg1), \T5
15168c2ecf20Sopenharmony_ci        i = i + 1
15178c2ecf20Sopenharmony_ci        setreg
15188c2ecf20Sopenharmony_ci.endr
15198c2ecf20Sopenharmony_ci
15208c2ecf20Sopenharmony_ci	i = 0
15218c2ecf20Sopenharmony_ci	j = 1
15228c2ecf20Sopenharmony_ci	setreg
15238c2ecf20Sopenharmony_ci.rep 8
15248c2ecf20Sopenharmony_ci		vpxor	16*i(arg4, %r11), \T5, \T2
15258c2ecf20Sopenharmony_ci                .if \ENC_DEC == ENC
15268c2ecf20Sopenharmony_ci                vaesenclast     \T2, reg_j, reg_j
15278c2ecf20Sopenharmony_ci                .else
15288c2ecf20Sopenharmony_ci                vaesenclast     \T2, reg_j, \T3
15298c2ecf20Sopenharmony_ci                vmovdqu 16*i(arg4, %r11), reg_j
15308c2ecf20Sopenharmony_ci                vmovdqu \T3, 16*i(arg3, %r11)
15318c2ecf20Sopenharmony_ci                .endif
15328c2ecf20Sopenharmony_ci	i = (i+1)
15338c2ecf20Sopenharmony_ci	j = (j+1)
15348c2ecf20Sopenharmony_ci	setreg
15358c2ecf20Sopenharmony_ci.endr
15368c2ecf20Sopenharmony_ci	#######################################################################
15378c2ecf20Sopenharmony_ci
15388c2ecf20Sopenharmony_ci
15398c2ecf20Sopenharmony_ci	vpslldq	$8, \T6, \T3				# T3 = T6 shifted left by 2 DWs
15408c2ecf20Sopenharmony_ci	vpsrldq	$8, \T6, \T6				# T6 = T6 shifted right by 2 DWs
15418c2ecf20Sopenharmony_ci	vpxor	\T3, \T7, \T7
15428c2ecf20Sopenharmony_ci	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
15438c2ecf20Sopenharmony_ci
15448c2ecf20Sopenharmony_ci
15458c2ecf20Sopenharmony_ci
15468c2ecf20Sopenharmony_ci	#######################################################################
15478c2ecf20Sopenharmony_ci	#first phase of the reduction
15488c2ecf20Sopenharmony_ci	#######################################################################
15498c2ecf20Sopenharmony_ci        vpslld  $31, \T7, \T2                           # packed left shifting << 31
15508c2ecf20Sopenharmony_ci        vpslld  $30, \T7, \T3                           # packed left shifting << 30
15518c2ecf20Sopenharmony_ci        vpslld  $25, \T7, \T4                           # packed left shifting << 25
15528c2ecf20Sopenharmony_ci
15538c2ecf20Sopenharmony_ci        vpxor   \T3, \T2, \T2                           # xor the shifted versions
15548c2ecf20Sopenharmony_ci        vpxor   \T4, \T2, \T2
15558c2ecf20Sopenharmony_ci
15568c2ecf20Sopenharmony_ci        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
15578c2ecf20Sopenharmony_ci
15588c2ecf20Sopenharmony_ci        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
15598c2ecf20Sopenharmony_ci        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
15608c2ecf20Sopenharmony_ci	#######################################################################
15618c2ecf20Sopenharmony_ci                .if \ENC_DEC == ENC
15628c2ecf20Sopenharmony_ci		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
15638c2ecf20Sopenharmony_ci		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
15648c2ecf20Sopenharmony_ci		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
15658c2ecf20Sopenharmony_ci		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
15668c2ecf20Sopenharmony_ci		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
15678c2ecf20Sopenharmony_ci		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
15688c2ecf20Sopenharmony_ci		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
15698c2ecf20Sopenharmony_ci		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
15708c2ecf20Sopenharmony_ci                .endif
15718c2ecf20Sopenharmony_ci
15728c2ecf20Sopenharmony_ci	#######################################################################
15738c2ecf20Sopenharmony_ci	#second phase of the reduction
15748c2ecf20Sopenharmony_ci        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
15758c2ecf20Sopenharmony_ci        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
15768c2ecf20Sopenharmony_ci        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
15778c2ecf20Sopenharmony_ci        vpxor   \T3, \T2, \T2                           # xor the shifted versions
15788c2ecf20Sopenharmony_ci        vpxor   \T4, \T2, \T2
15798c2ecf20Sopenharmony_ci
15808c2ecf20Sopenharmony_ci        vpxor   \T1, \T2, \T2
15818c2ecf20Sopenharmony_ci        vpxor   \T2, \T7, \T7
15828c2ecf20Sopenharmony_ci        vpxor   \T7, \T6, \T6                           # the result is in T6
15838c2ecf20Sopenharmony_ci	#######################################################################
15848c2ecf20Sopenharmony_ci
15858c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
15868c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
15878c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
15888c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
15898c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
15908c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
15918c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
15928c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
15938c2ecf20Sopenharmony_ci
15948c2ecf20Sopenharmony_ci
15958c2ecf20Sopenharmony_ci	vpxor	\T6, \XMM1, \XMM1
15968c2ecf20Sopenharmony_ci
15978c2ecf20Sopenharmony_ci
15988c2ecf20Sopenharmony_ci
15998c2ecf20Sopenharmony_ci.endm
16008c2ecf20Sopenharmony_ci
16018c2ecf20Sopenharmony_ci
16028c2ecf20Sopenharmony_ci# GHASH the last 8 ciphertext blocks (XMM1..XMM8); the reduced result is left in T6.
16038c2ecf20Sopenharmony_ci.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
16048c2ecf20Sopenharmony_ci        # XMMi is multiplied by HashKey_(9-i) read from arg2; T1-T5, T7 and XMM1 are clobbered.
16058c2ecf20Sopenharmony_ci        ## Karatsuba Method: per block, hi*hi accumulates in T6, lo*lo in T7 and
16068c2ecf20Sopenharmony_ci        ## (hi^lo)*HashKey_i_k in XMM1 (HashKey_i_k presumably holds hi^lo of HashKey_i - confirm)
16078c2ecf20Sopenharmony_ci
16088c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM1, \T2         # T2 = XMM1 with its 64-bit halves swapped
16098c2ecf20Sopenharmony_ci        vpxor           \XMM1, \T2, \T2                 # T2 = hi^lo of XMM1 (in both halves)
16108c2ecf20Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
16118c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM1, \T6          # T6 = XMM1.hi * HashKey_8.hi
16128c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM1, \T7          # T7 = XMM1.lo * HashKey_8.lo
16138c2ecf20Sopenharmony_ci
16148c2ecf20Sopenharmony_ci        vmovdqu         HashKey_8_k(arg2), \T3
16158c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \XMM1          # XMM1 = middle-term accumulator (input value is dead from here)
16168c2ecf20Sopenharmony_ci
16178c2ecf20Sopenharmony_ci        ###################### XMM2 x HashKey_7
16188c2ecf20Sopenharmony_ci
16198c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM2, \T2
16208c2ecf20Sopenharmony_ci        vpxor           \XMM2, \T2, \T2
16218c2ecf20Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
16228c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM2, \T4
16238c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6                   # T6 ^= hi*hi
16248c2ecf20Sopenharmony_ci
16258c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM2, \T4
16268c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7                   # T7 ^= lo*lo
16278c2ecf20Sopenharmony_ci
16288c2ecf20Sopenharmony_ci        vmovdqu         HashKey_7_k(arg2), \T3
16298c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
16308c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1               # XMM1 ^= middle term
16318c2ecf20Sopenharmony_ci
16328c2ecf20Sopenharmony_ci        ###################### XMM3 x HashKey_6
16338c2ecf20Sopenharmony_ci
16348c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM3, \T2
16358c2ecf20Sopenharmony_ci        vpxor           \XMM3, \T2, \T2
16368c2ecf20Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
16378c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM3, \T4
16388c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
16398c2ecf20Sopenharmony_ci
16408c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM3, \T4
16418c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
16428c2ecf20Sopenharmony_ci
16438c2ecf20Sopenharmony_ci        vmovdqu         HashKey_6_k(arg2), \T3
16448c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
16458c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
16468c2ecf20Sopenharmony_ci
16478c2ecf20Sopenharmony_ci        ###################### XMM4 x HashKey_5
16488c2ecf20Sopenharmony_ci
16498c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM4, \T2
16508c2ecf20Sopenharmony_ci        vpxor           \XMM4, \T2, \T2
16518c2ecf20Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
16528c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM4, \T4
16538c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
16548c2ecf20Sopenharmony_ci
16558c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM4, \T4
16568c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
16578c2ecf20Sopenharmony_ci
16588c2ecf20Sopenharmony_ci        vmovdqu         HashKey_5_k(arg2), \T3
16598c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
16608c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
16618c2ecf20Sopenharmony_ci
16628c2ecf20Sopenharmony_ci        ###################### XMM5 x HashKey_4
16638c2ecf20Sopenharmony_ci
16648c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM5, \T2
16658c2ecf20Sopenharmony_ci        vpxor           \XMM5, \T2, \T2
16668c2ecf20Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
16678c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM5, \T4
16688c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
16698c2ecf20Sopenharmony_ci
16708c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM5, \T4
16718c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
16728c2ecf20Sopenharmony_ci
16738c2ecf20Sopenharmony_ci        vmovdqu         HashKey_4_k(arg2), \T3
16748c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
16758c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
16768c2ecf20Sopenharmony_ci
16778c2ecf20Sopenharmony_ci        ###################### XMM6 x HashKey_3
16788c2ecf20Sopenharmony_ci
16798c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM6, \T2
16808c2ecf20Sopenharmony_ci        vpxor           \XMM6, \T2, \T2
16818c2ecf20Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
16828c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM6, \T4
16838c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
16848c2ecf20Sopenharmony_ci
16858c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM6, \T4
16868c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
16878c2ecf20Sopenharmony_ci
16888c2ecf20Sopenharmony_ci        vmovdqu         HashKey_3_k(arg2), \T3
16898c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
16908c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
16918c2ecf20Sopenharmony_ci
16928c2ecf20Sopenharmony_ci        ###################### XMM7 x HashKey_2
16938c2ecf20Sopenharmony_ci
16948c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM7, \T2
16958c2ecf20Sopenharmony_ci        vpxor           \XMM7, \T2, \T2
16968c2ecf20Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
16978c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM7, \T4
16988c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
16998c2ecf20Sopenharmony_ci
17008c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM7, \T4
17018c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
17028c2ecf20Sopenharmony_ci
17038c2ecf20Sopenharmony_ci        vmovdqu         HashKey_2_k(arg2), \T3
17048c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
17058c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
17068c2ecf20Sopenharmony_ci
17078c2ecf20Sopenharmony_ci        ###################### XMM8 x HashKey
17088c2ecf20Sopenharmony_ci
17098c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM8, \T2
17108c2ecf20Sopenharmony_ci        vpxor           \XMM8, \T2, \T2
17118c2ecf20Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
17128c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM8, \T4
17138c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
17148c2ecf20Sopenharmony_ci
17158c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM8, \T4
17168c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
17178c2ecf20Sopenharmony_ci
17188c2ecf20Sopenharmony_ci        vmovdqu         HashKey_k(arg2), \T3
17198c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
17208c2ecf20Sopenharmony_ci
17218c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1               # XMM1 = sum of all eight middle products
17228c2ecf20Sopenharmony_ci        vpxor           \T6, \XMM1, \XMM1               # Karatsuba: remove the hi*hi sum ...
17238c2ecf20Sopenharmony_ci        vpxor           \T7, \XMM1, \T2                 # ... and the lo*lo sum; T2 = cross term
17248c2ecf20Sopenharmony_ci
17258c2ecf20Sopenharmony_ci
17268c2ecf20Sopenharmony_ci
17278c2ecf20Sopenharmony_ci        # fold the 128-bit cross term into the middle of the 256-bit value <T6:T7>
17288c2ecf20Sopenharmony_ci        vpslldq $8, \T2, \T4    # T4 = low 64 bits of the cross term, moved to the upper half
17298c2ecf20Sopenharmony_ci        vpsrldq $8, \T2, \T2    # T2 = high 64 bits of the cross term, moved to the lower half
17308c2ecf20Sopenharmony_ci
17318c2ecf20Sopenharmony_ci        vpxor   \T4, \T7, \T7
17328c2ecf20Sopenharmony_ci        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
17338c2ecf20Sopenharmony_ci				# the accumulated carry-less multiplications
17348c2ecf20Sopenharmony_ci
17358c2ecf20Sopenharmony_ci        #######################################################################
17368c2ecf20Sopenharmony_ci        #first phase of the reduction
17378c2ecf20Sopenharmony_ci        vpslld  $31, \T7, \T2   # packed (per-dword) left shift << 31
17388c2ecf20Sopenharmony_ci        vpslld  $30, \T7, \T3   # packed (per-dword) left shift << 30
17398c2ecf20Sopenharmony_ci        vpslld  $25, \T7, \T4   # packed (per-dword) left shift << 25
17408c2ecf20Sopenharmony_ci
17418c2ecf20Sopenharmony_ci        vpxor   \T3, \T2, \T2   # xor the shifted versions
17428c2ecf20Sopenharmony_ci        vpxor   \T4, \T2, \T2
17438c2ecf20Sopenharmony_ci
17448c2ecf20Sopenharmony_ci        vpsrldq $4, \T2, \T1    # T1 = T2 shifted right 1 DW (consumed in the second phase)
17458c2ecf20Sopenharmony_ci
17468c2ecf20Sopenharmony_ci        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
17478c2ecf20Sopenharmony_ci        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
17488c2ecf20Sopenharmony_ci        #######################################################################
17498c2ecf20Sopenharmony_ci
17508c2ecf20Sopenharmony_ci
17518c2ecf20Sopenharmony_ci        #second phase of the reduction
17528c2ecf20Sopenharmony_ci        vpsrld  $1, \T7, \T2    # packed (per-dword) right shift >> 1
17538c2ecf20Sopenharmony_ci        vpsrld  $2, \T7, \T3    # packed (per-dword) right shift >> 2
17548c2ecf20Sopenharmony_ci        vpsrld  $7, \T7, \T4    # packed (per-dword) right shift >> 7
17558c2ecf20Sopenharmony_ci        vpxor   \T3, \T2, \T2   # xor the shifted versions
17568c2ecf20Sopenharmony_ci        vpxor   \T4, \T2, \T2
17578c2ecf20Sopenharmony_ci
17588c2ecf20Sopenharmony_ci        vpxor   \T1, \T2, \T2   # bring in the bits saved in T1 during the first phase
17598c2ecf20Sopenharmony_ci        vpxor   \T2, \T7, \T7
17608c2ecf20Sopenharmony_ci        vpxor   \T7, \T6, \T6   # the result is in T6
17618c2ecf20Sopenharmony_ci
17628c2ecf20Sopenharmony_ci.endm
17638c2ecf20Sopenharmony_ci
17648c2ecf20Sopenharmony_ci#############################################################
17658c2ecf20Sopenharmony_ci#void   aesni_gcm_init_avx_gen2
17668c2ecf20Sopenharmony_ci#        (gcm_data     *my_ctx_data,
17678c2ecf20Sopenharmony_ci#         gcm_context_data *data,
17688c2ecf20Sopenharmony_ci#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
17698c2ecf20Sopenharmony_ci#        u8      *iv, /* Pre-counter block j0: 4 byte salt
17708c2ecf20Sopenharmony_ci#			(from Security Association) concatenated with 8 byte
17718c2ecf20Sopenharmony_ci#			Initialisation Vector (from IPSec ESP Payload)
17728c2ecf20Sopenharmony_ci#			concatenated with 0x00000001. 16-byte aligned pointer. */
17738c2ecf20Sopenharmony_ci#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
17748c2ecf20Sopenharmony_ci#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
17758c2ecf20Sopenharmony_ci#############################################################
17768c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_init_avx_gen2)
17778c2ecf20Sopenharmony_ci        FUNC_SAVE                               # prologue macro (defined elsewhere in this file)
17788c2ecf20Sopenharmony_ci        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX      # INIT is given the gen2 GHASH-multiply and hash-key precompute macros
17798c2ecf20Sopenharmony_ci        FUNC_RESTORE                            # epilogue macro matching FUNC_SAVE
17808c2ecf20Sopenharmony_ci        RET
17818c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_init_avx_gen2)
17828c2ecf20Sopenharmony_ci
17838c2ecf20Sopenharmony_ci###############################################################################
17848c2ecf20Sopenharmony_ci#void   aesni_gcm_enc_update_avx_gen2(
17858c2ecf20Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
17868c2ecf20Sopenharmony_ci#        gcm_context_data *data,
17878c2ecf20Sopenharmony_ci#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
17888c2ecf20Sopenharmony_ci#        const   u8 *in, /* Plaintext input */
17898c2ecf20Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
17908c2ecf20Sopenharmony_ci###############################################################################
17918c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
17928c2ecf20Sopenharmony_ci        FUNC_SAVE
17938c2ecf20Sopenharmony_ci        mov     keysize, %eax                   # eax = AES key length in bytes (keysize is defined elsewhere in this file)
17948c2ecf20Sopenharmony_ci        cmp     $32, %eax
17958c2ecf20Sopenharmony_ci        je      key_256_enc_update              # 32-byte key -> AES-256 path
17968c2ecf20Sopenharmony_ci        cmp     $16, %eax
17978c2ecf20Sopenharmony_ci        je      key_128_enc_update              # 16-byte key -> AES-128 path
17988c2ecf20Sopenharmony_ci        # neither 256 nor 128: must be a 192-bit key; last macro argument (11 here, 9/13 below) presumably is the AES round count - confirm in GCM_ENC_DEC
17998c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
18008c2ecf20Sopenharmony_ci        FUNC_RESTORE
18018c2ecf20Sopenharmony_ci        RET
18028c2ecf20Sopenharmony_cikey_128_enc_update:                             # AES-128 expansion (last macro argument 9)
18038c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
18048c2ecf20Sopenharmony_ci        FUNC_RESTORE
18058c2ecf20Sopenharmony_ci        RET
18068c2ecf20Sopenharmony_cikey_256_enc_update:                             # AES-256 expansion (last macro argument 13)
18078c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
18088c2ecf20Sopenharmony_ci        FUNC_RESTORE
18098c2ecf20Sopenharmony_ci        RET
18108c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
18118c2ecf20Sopenharmony_ci
18128c2ecf20Sopenharmony_ci###############################################################################
18138c2ecf20Sopenharmony_ci#void   aesni_gcm_dec_update_avx_gen2(
18148c2ecf20Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
18158c2ecf20Sopenharmony_ci#        gcm_context_data *data,
18168c2ecf20Sopenharmony_ci#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
18178c2ecf20Sopenharmony_ci#        const   u8 *in, /* Ciphertext input */
18188c2ecf20Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
18198c2ecf20Sopenharmony_ci###############################################################################
18208c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
18218c2ecf20Sopenharmony_ci        FUNC_SAVE
18228c2ecf20Sopenharmony_ci        mov     keysize,%eax                    # eax = AES key length in bytes (keysize is defined elsewhere in this file)
18238c2ecf20Sopenharmony_ci        cmp     $32, %eax
18248c2ecf20Sopenharmony_ci        je      key_256_dec_update              # 32-byte key -> AES-256 path
18258c2ecf20Sopenharmony_ci        cmp     $16, %eax
18268c2ecf20Sopenharmony_ci        je      key_128_dec_update              # 16-byte key -> AES-128 path
18278c2ecf20Sopenharmony_ci        # neither 256 nor 128: must be a 192-bit key; same macro set as the encrypt path, but with DEC
18288c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
18298c2ecf20Sopenharmony_ci        FUNC_RESTORE
18308c2ecf20Sopenharmony_ci        RET
18318c2ecf20Sopenharmony_cikey_128_dec_update:                             # AES-128 expansion (last macro argument 9)
18328c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
18338c2ecf20Sopenharmony_ci        FUNC_RESTORE
18348c2ecf20Sopenharmony_ci        RET
18358c2ecf20Sopenharmony_cikey_256_dec_update:                             # AES-256 expansion (last macro argument 13)
18368c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
18378c2ecf20Sopenharmony_ci        FUNC_RESTORE
18388c2ecf20Sopenharmony_ci        RET
18398c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
18408c2ecf20Sopenharmony_ci
18418c2ecf20Sopenharmony_ci###############################################################################
18428c2ecf20Sopenharmony_ci#void   aesni_gcm_finalize_avx_gen2(
18438c2ecf20Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
18448c2ecf20Sopenharmony_ci#        gcm_context_data *data,
18458c2ecf20Sopenharmony_ci#        u8      *auth_tag, /* Authenticated Tag output. */
18468c2ecf20Sopenharmony_ci#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
18478c2ecf20Sopenharmony_ci#				Valid values are 16 (most likely), 12 or 8. */
18488c2ecf20Sopenharmony_ci###############################################################################
18498c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
18508c2ecf20Sopenharmony_ci        FUNC_SAVE
18518c2ecf20Sopenharmony_ci        mov	keysize,%eax                    # eax = AES key length in bytes (keysize is defined elsewhere in this file)
18528c2ecf20Sopenharmony_ci        cmp     $32, %eax
18538c2ecf20Sopenharmony_ci        je      key_256_finalize                # 32-byte key -> AES-256 path
18548c2ecf20Sopenharmony_ci        cmp     $16, %eax
18558c2ecf20Sopenharmony_ci        je      key_128_finalize                # 16-byte key -> AES-128 path
18568c2ecf20Sopenharmony_ci        # neither 256 nor 128: must be a 192-bit key; arg3/arg4 are forwarded (per the header: auth_tag, auth_tag_len)
18578c2ecf20Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
18588c2ecf20Sopenharmony_ci        FUNC_RESTORE
18598c2ecf20Sopenharmony_ci        RET
18608c2ecf20Sopenharmony_cikey_128_finalize:                               # AES-128 expansion (second macro argument 9)
18618c2ecf20Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
18628c2ecf20Sopenharmony_ci        FUNC_RESTORE
18638c2ecf20Sopenharmony_ci        RET
18648c2ecf20Sopenharmony_cikey_256_finalize:                               # AES-256 expansion (second macro argument 13)
18658c2ecf20Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
18668c2ecf20Sopenharmony_ci        FUNC_RESTORE
18678c2ecf20Sopenharmony_ci        RET
18688c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
18698c2ecf20Sopenharmony_ci
18708c2ecf20Sopenharmony_ci###############################################################################
18718c2ecf20Sopenharmony_ci# GHASH_MUL_AVX2 MACRO to implement: Data*HashKey mod (128,127,126,121,0)
18728c2ecf20Sopenharmony_ci# Input: A and B (128-bits each, bit-reflected)
18738c2ecf20Sopenharmony_ci# Output: C = A*B*x mod poly, (i.e. >>1 )
18748c2ecf20Sopenharmony_ci# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
18758c2ecf20Sopenharmony_ci# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
18768c2ecf20Sopenharmony_ci###############################################################################
18778c2ecf20Sopenharmony_ci.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
18788c2ecf20Sopenharmony_ci        # GH is both input and output; HK is read only; T1, T2, T3 are scratch; T4 and T5 are accepted but never referenced.
18798c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
18808c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
18818c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
18828c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
18838c2ecf20Sopenharmony_ci        vpxor           \T3, \GH, \GH          # GH = a1*b0 ^ a0*b1 (middle term)
18848c2ecf20Sopenharmony_ci
18858c2ecf20Sopenharmony_ci
18868c2ecf20Sopenharmony_ci        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs (upper half of the middle term)
18878c2ecf20Sopenharmony_ci        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs (lower half of the middle term)
18888c2ecf20Sopenharmony_ci
18898c2ecf20Sopenharmony_ci        vpxor           \T3, \T1, \T1
18908c2ecf20Sopenharmony_ci        vpxor           \T2, \GH, \GH          # <T1:GH> now holds the unreduced 256-bit product
18918c2ecf20Sopenharmony_ci
18928c2ecf20Sopenharmony_ci        #######################################################################
18938c2ecf20Sopenharmony_ci        #first phase of the reduction
18948c2ecf20Sopenharmony_ci        vmovdqa         POLY2(%rip), \T3       # T3 = reduction constant POLY2 (defined elsewhere in this file)
18958c2ecf20Sopenharmony_ci
18968c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \GH, \T3, \T2
18978c2ecf20Sopenharmony_ci        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
18988c2ecf20Sopenharmony_ci
18998c2ecf20Sopenharmony_ci        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
19008c2ecf20Sopenharmony_ci        #######################################################################
19018c2ecf20Sopenharmony_ci        #second phase of the reduction
19028c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \GH, \T3, \T2
19038c2ecf20Sopenharmony_ci        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
19048c2ecf20Sopenharmony_ci
19058c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \GH, \T3, \GH
19068c2ecf20Sopenharmony_ci        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
19078c2ecf20Sopenharmony_ci
19088c2ecf20Sopenharmony_ci        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
19098c2ecf20Sopenharmony_ci        #######################################################################
19108c2ecf20Sopenharmony_ci        vpxor           \T1, \GH, \GH          # the result is in GH
19118c2ecf20Sopenharmony_ci
19128c2ecf20Sopenharmony_ci
19138c2ecf20Sopenharmony_ci.endm
19148c2ecf20Sopenharmony_ci
19158c2ecf20Sopenharmony_ci.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
19168c2ecf20Sopenharmony_ci        # Stores HashKey^2 .. HashKey^8 (each <<1 mod poly) at HashKey_2 .. HashKey_8 relative to arg2.
19178c2ecf20Sopenharmony_ci        # Only the powers are stored; this macro writes no HashKey_i_k (hi^lo) values.
19188c2ecf20Sopenharmony_ci        vmovdqa  \HK, \T5                                   #  T5 = HashKey, the running power (HK itself is left unchanged)
19198c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
19208c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
19218c2ecf20Sopenharmony_ci        # each GHASH_MUL_AVX2 call uses T1, T3 and T4 as scratch; T6 and T2 land in its unreferenced parameter slots
19228c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
19238c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_3(arg2)
19248c2ecf20Sopenharmony_ci
19258c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
19268c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_4(arg2)
19278c2ecf20Sopenharmony_ci
19288c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
19298c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_5(arg2)
19308c2ecf20Sopenharmony_ci
19318c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
19328c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_6(arg2)
19338c2ecf20Sopenharmony_ci
19348c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
19358c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_7(arg2)
19368c2ecf20Sopenharmony_ci
19378c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
19388c2ecf20Sopenharmony_ci        vmovdqu  \T5, HashKey_8(arg2)
19398c2ecf20Sopenharmony_ci
19408c2ecf20Sopenharmony_ci.endm
19418c2ecf20Sopenharmony_ci
19428c2ecf20Sopenharmony_ci## if a = number of total plaintext bytes
19438c2ecf20Sopenharmony_ci## b = floor(a/16)
19448c2ecf20Sopenharmony_ci## num_initial_blocks = b mod 4
19458c2ecf20Sopenharmony_ci## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
19468c2ecf20Sopenharmony_ci## r10, r11, r12, rax are clobbered
19478c2ecf20Sopenharmony_ci## arg1, arg3, arg4, r14 are used as a pointer only, not modified
19488c2ecf20Sopenharmony_ci
19498c2ecf20Sopenharmony_ci.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
19508c2ecf20Sopenharmony_ci	i = (8-\num_initial_blocks)
19518c2ecf20Sopenharmony_ci	setreg
19528c2ecf20Sopenharmony_ci	vmovdqu AadHash(arg2), reg_i
19538c2ecf20Sopenharmony_ci
19548c2ecf20Sopenharmony_ci	# start AES for num_initial_blocks blocks
19558c2ecf20Sopenharmony_ci	vmovdqu CurCount(arg2), \CTR
19568c2ecf20Sopenharmony_ci
19578c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
19588c2ecf20Sopenharmony_ci	setreg
19598c2ecf20Sopenharmony_ci.rep \num_initial_blocks
19608c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
19618c2ecf20Sopenharmony_ci                vmovdqa \CTR, reg_i
19628c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
19638c2ecf20Sopenharmony_ci	i = (i+1)
19648c2ecf20Sopenharmony_ci	setreg
19658c2ecf20Sopenharmony_ci.endr
19668c2ecf20Sopenharmony_ci
19678c2ecf20Sopenharmony_ci	vmovdqa  (arg1), \T_key
19688c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
19698c2ecf20Sopenharmony_ci	setreg
19708c2ecf20Sopenharmony_ci.rep \num_initial_blocks
19718c2ecf20Sopenharmony_ci                vpxor   \T_key, reg_i, reg_i
19728c2ecf20Sopenharmony_ci	i = (i+1)
19738c2ecf20Sopenharmony_ci	setreg
19748c2ecf20Sopenharmony_ci.endr
19758c2ecf20Sopenharmony_ci
19768c2ecf20Sopenharmony_ci	j = 1
19778c2ecf20Sopenharmony_ci	setreg
19788c2ecf20Sopenharmony_ci.rep \REP
19798c2ecf20Sopenharmony_ci	vmovdqa  16*j(arg1), \T_key
19808c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
19818c2ecf20Sopenharmony_ci	setreg
19828c2ecf20Sopenharmony_ci.rep \num_initial_blocks
19838c2ecf20Sopenharmony_ci        vaesenc \T_key, reg_i, reg_i
19848c2ecf20Sopenharmony_ci	i = (i+1)
19858c2ecf20Sopenharmony_ci	setreg
19868c2ecf20Sopenharmony_ci.endr
19878c2ecf20Sopenharmony_ci
19888c2ecf20Sopenharmony_ci	j = (j+1)
19898c2ecf20Sopenharmony_ci	setreg
19908c2ecf20Sopenharmony_ci.endr
19918c2ecf20Sopenharmony_ci
19928c2ecf20Sopenharmony_ci
19938c2ecf20Sopenharmony_ci	vmovdqa  16*j(arg1), \T_key
19948c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
19958c2ecf20Sopenharmony_ci	setreg
19968c2ecf20Sopenharmony_ci.rep \num_initial_blocks
19978c2ecf20Sopenharmony_ci        vaesenclast      \T_key, reg_i, reg_i
19988c2ecf20Sopenharmony_ci	i = (i+1)
19998c2ecf20Sopenharmony_ci	setreg
20008c2ecf20Sopenharmony_ci.endr
20018c2ecf20Sopenharmony_ci
20028c2ecf20Sopenharmony_ci	i = (9-\num_initial_blocks)
20038c2ecf20Sopenharmony_ci	setreg
20048c2ecf20Sopenharmony_ci.rep \num_initial_blocks
20058c2ecf20Sopenharmony_ci                vmovdqu (arg4, %r11), \T1
20068c2ecf20Sopenharmony_ci                vpxor   \T1, reg_i, reg_i
20078c2ecf20Sopenharmony_ci                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
20088c2ecf20Sopenharmony_ci						       # num_initial_blocks blocks
20098c2ecf20Sopenharmony_ci                add     $16, %r11
20108c2ecf20Sopenharmony_ci.if  \ENC_DEC == DEC
20118c2ecf20Sopenharmony_ci                vmovdqa \T1, reg_i
20128c2ecf20Sopenharmony_ci.endif
20138c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
20148c2ecf20Sopenharmony_ci	i = (i+1)
20158c2ecf20Sopenharmony_ci	setreg
20168c2ecf20Sopenharmony_ci.endr
20178c2ecf20Sopenharmony_ci
20188c2ecf20Sopenharmony_ci
20198c2ecf20Sopenharmony_ci	i = (8-\num_initial_blocks)
20208c2ecf20Sopenharmony_ci	j = (9-\num_initial_blocks)
20218c2ecf20Sopenharmony_ci	setreg
20228c2ecf20Sopenharmony_ci
20238c2ecf20Sopenharmony_ci.rep \num_initial_blocks
20248c2ecf20Sopenharmony_ci        vpxor    reg_i, reg_j, reg_j
20258c2ecf20Sopenharmony_ci        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
20268c2ecf20Sopenharmony_ci	i = (i+1)
20278c2ecf20Sopenharmony_ci	j = (j+1)
20288c2ecf20Sopenharmony_ci	setreg
20298c2ecf20Sopenharmony_ci.endr
20308c2ecf20Sopenharmony_ci        # XMM8 has the combined result here
20318c2ecf20Sopenharmony_ci
20328c2ecf20Sopenharmony_ci        vmovdqa  \XMM8, TMP1(%rsp)
20338c2ecf20Sopenharmony_ci        vmovdqa  \XMM8, \T3
20348c2ecf20Sopenharmony_ci
20358c2ecf20Sopenharmony_ci        cmp     $128, %r13
20368c2ecf20Sopenharmony_ci        jl      _initial_blocks_done\@                  # no need for precomputed constants
20378c2ecf20Sopenharmony_ci
20388c2ecf20Sopenharmony_ci###############################################################################
20398c2ecf20Sopenharmony_ci# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
20408c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20418c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM1
20428c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
20438c2ecf20Sopenharmony_ci
20448c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20458c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM2
20468c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
20478c2ecf20Sopenharmony_ci
20488c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20498c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM3
20508c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
20518c2ecf20Sopenharmony_ci
20528c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20538c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM4
20548c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
20558c2ecf20Sopenharmony_ci
20568c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20578c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM5
20588c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
20598c2ecf20Sopenharmony_ci
20608c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20618c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM6
20628c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
20638c2ecf20Sopenharmony_ci
20648c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20658c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM7
20668c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
20678c2ecf20Sopenharmony_ci
20688c2ecf20Sopenharmony_ci                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
20698c2ecf20Sopenharmony_ci                vmovdqa  \CTR, \XMM8
20708c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
20718c2ecf20Sopenharmony_ci
20728c2ecf20Sopenharmony_ci                vmovdqa  (arg1), \T_key
20738c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM1, \XMM1
20748c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM2, \XMM2
20758c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM3, \XMM3
20768c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM4, \XMM4
20778c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM5, \XMM5
20788c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM6, \XMM6
20798c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM7, \XMM7
20808c2ecf20Sopenharmony_ci                vpxor    \T_key, \XMM8, \XMM8
20818c2ecf20Sopenharmony_ci
20828c2ecf20Sopenharmony_ci		i = 1
20838c2ecf20Sopenharmony_ci		setreg
20848c2ecf20Sopenharmony_ci.rep    \REP       # do REP rounds
20858c2ecf20Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
20868c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM1, \XMM1
20878c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM2, \XMM2
20888c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM3, \XMM3
20898c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM4, \XMM4
20908c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM5, \XMM5
20918c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM6, \XMM6
20928c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM7, \XMM7
20938c2ecf20Sopenharmony_ci                vaesenc  \T_key, \XMM8, \XMM8
20948c2ecf20Sopenharmony_ci		i = (i+1)
20958c2ecf20Sopenharmony_ci		setreg
20968c2ecf20Sopenharmony_ci.endr
20978c2ecf20Sopenharmony_ci
20988c2ecf20Sopenharmony_ci
20998c2ecf20Sopenharmony_ci                vmovdqa  16*i(arg1), \T_key
21008c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM1, \XMM1
21018c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM2, \XMM2
21028c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM3, \XMM3
21038c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM4, \XMM4
21048c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM5, \XMM5
21058c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM6, \XMM6
21068c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM7, \XMM7
21078c2ecf20Sopenharmony_ci                vaesenclast  \T_key, \XMM8, \XMM8
21088c2ecf20Sopenharmony_ci
21098c2ecf20Sopenharmony_ci                vmovdqu  (arg4, %r11), \T1
21108c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM1, \XMM1
21118c2ecf20Sopenharmony_ci                vmovdqu  \XMM1, (arg3 , %r11)
21128c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21138c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM1
21148c2ecf20Sopenharmony_ci                .endif
21158c2ecf20Sopenharmony_ci
21168c2ecf20Sopenharmony_ci                vmovdqu  16*1(arg4, %r11), \T1
21178c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM2, \XMM2
21188c2ecf20Sopenharmony_ci                vmovdqu  \XMM2, 16*1(arg3 , %r11)
21198c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21208c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM2
21218c2ecf20Sopenharmony_ci                .endif
21228c2ecf20Sopenharmony_ci
21238c2ecf20Sopenharmony_ci                vmovdqu  16*2(arg4, %r11), \T1
21248c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM3, \XMM3
21258c2ecf20Sopenharmony_ci                vmovdqu  \XMM3, 16*2(arg3 , %r11)
21268c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21278c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM3
21288c2ecf20Sopenharmony_ci                .endif
21298c2ecf20Sopenharmony_ci
21308c2ecf20Sopenharmony_ci                vmovdqu  16*3(arg4, %r11), \T1
21318c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM4, \XMM4
21328c2ecf20Sopenharmony_ci                vmovdqu  \XMM4, 16*3(arg3 , %r11)
21338c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21348c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM4
21358c2ecf20Sopenharmony_ci                .endif
21368c2ecf20Sopenharmony_ci
21378c2ecf20Sopenharmony_ci                vmovdqu  16*4(arg4, %r11), \T1
21388c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM5, \XMM5
21398c2ecf20Sopenharmony_ci                vmovdqu  \XMM5, 16*4(arg3 , %r11)
21408c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21418c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM5
21428c2ecf20Sopenharmony_ci                .endif
21438c2ecf20Sopenharmony_ci
21448c2ecf20Sopenharmony_ci                vmovdqu  16*5(arg4, %r11), \T1
21458c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM6, \XMM6
21468c2ecf20Sopenharmony_ci                vmovdqu  \XMM6, 16*5(arg3 , %r11)
21478c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21488c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM6
21498c2ecf20Sopenharmony_ci                .endif
21508c2ecf20Sopenharmony_ci
21518c2ecf20Sopenharmony_ci                vmovdqu  16*6(arg4, %r11), \T1
21528c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM7, \XMM7
21538c2ecf20Sopenharmony_ci                vmovdqu  \XMM7, 16*6(arg3 , %r11)
21548c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21558c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM7
21568c2ecf20Sopenharmony_ci                .endif
21578c2ecf20Sopenharmony_ci
21588c2ecf20Sopenharmony_ci                vmovdqu  16*7(arg4, %r11), \T1
21598c2ecf20Sopenharmony_ci                vpxor    \T1, \XMM8, \XMM8
21608c2ecf20Sopenharmony_ci                vmovdqu  \XMM8, 16*7(arg3 , %r11)
21618c2ecf20Sopenharmony_ci                .if   \ENC_DEC == DEC
21628c2ecf20Sopenharmony_ci                vmovdqa  \T1, \XMM8
21638c2ecf20Sopenharmony_ci                .endif
21648c2ecf20Sopenharmony_ci
21658c2ecf20Sopenharmony_ci                add     $128, %r11
21668c2ecf20Sopenharmony_ci
21678c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
21688c2ecf20Sopenharmony_ci                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
21698c2ecf20Sopenharmony_ci							   # the corresponding ciphertext
21708c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
21718c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
21728c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
21738c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
21748c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
21758c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
21768c2ecf20Sopenharmony_ci                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
21778c2ecf20Sopenharmony_ci
21788c2ecf20Sopenharmony_ci###############################################################################
21798c2ecf20Sopenharmony_ci
21808c2ecf20Sopenharmony_ci_initial_blocks_done\@:
21818c2ecf20Sopenharmony_ci
21828c2ecf20Sopenharmony_ci
21838c2ecf20Sopenharmony_ci.endm
21848c2ecf20Sopenharmony_ci
21858c2ecf20Sopenharmony_ci
21868c2ecf20Sopenharmony_ci
21878c2ecf20Sopenharmony_ci# encrypt 8 blocks at a time
21888c2ecf20Sopenharmony_ci# ghash the 8 previously encrypted ciphertext blocks
21898c2ecf20Sopenharmony_ci# arg1, arg3, arg4 are used as pointers only, not modified
21908c2ecf20Sopenharmony_ci# r11 is the data offset value
#
# The AES rounds for 8 freshly generated counter blocks are interleaved with
# the GHASH multiplications of the 8 blocks passed in through XMM1..XMM8, so
# that vaesenc and vpclmulqdq work can overlap.
#
# Memory operands, as used in the body:
#   16*k(arg1)         AES round key k, for k = 0 .. REP+1
#   HashKey_n(arg2)    GHASH multiplier for one block: HashKey_8 pairs with
#                      XMM1, HashKey_7 with XMM2, ... HashKey with XMM8
#                      NOTE(review): presumably the n-th power of the hash
#                      key - confirm against the code that fills the table
#   16*k(arg4, %r11)   input blocks, k = 0 .. 7 (read only)
#   16*k(arg3, %r11)   output blocks, k = 0 .. 7 (written)
#   TMP2..TMP8(%rsp)   spill slots for XMM2..XMM8, accessed with vmovdqa
#                      NOTE(review): vmovdqa needs these slots (and POLY2)
#                      to be 16-byte aligned - confirm in the caller frame
#   arg2 and r11 are only read here; r11 is not advanced by this macro.
#
# Parameters:
#   REP         number of vaesenc rounds. Rounds 1..9 are written out, the
#               remaining REP-9 come from a .rep loop, so REP must be >= 9.
#               The vaesenclast key is round key REP+1.
#   T1..T7      xmm scratch registers; every one of them is overwritten.
#               T1 carries the reduced GHASH value at the end.
#   CTR         counter block; on exit it has been advanced by 8 increments.
#   XMM1..XMM8  on entry: the 8 blocks to be hashed.
#               on exit: the 8 blocks processed in this call, byte-swapped
#               with SHUF_MASK, and XMM1 additionally XORed with the new
#               GHASH value (T1).
#   loop_idx    compared with in_order to pick how the counter is stepped.
#   ENC_DEC     compared with ENC to pick the encrypt or decrypt data path.
#
# NOTE(review): the last-round loop addresses the data registers as reg_j,
# which setreg (defined outside this macro) derives from j. The ENC path
# later stores XMM1..XMM8, so reg_j has to name the same register as XMMj
# for j = 1 .. 8 - confirm at the call sites.
21918c2ecf20Sopenharmony_ci.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
21928c2ecf20Sopenharmony_ci
        # Park the 8 blocks to be hashed (block 1 in T2, blocks 2..8 on the
        # stack) so that XMM1..XMM8 can be reused for the new counter blocks.
21938c2ecf20Sopenharmony_ci        vmovdqa \XMM1, \T2
21948c2ecf20Sopenharmony_ci        vmovdqa \XMM2, TMP2(%rsp)
21958c2ecf20Sopenharmony_ci        vmovdqa \XMM3, TMP3(%rsp)
21968c2ecf20Sopenharmony_ci        vmovdqa \XMM4, TMP4(%rsp)
21978c2ecf20Sopenharmony_ci        vmovdqa \XMM5, TMP5(%rsp)
21988c2ecf20Sopenharmony_ci        vmovdqa \XMM6, TMP6(%rsp)
21998c2ecf20Sopenharmony_ci        vmovdqa \XMM7, TMP7(%rsp)
22008c2ecf20Sopenharmony_ci        vmovdqa \XMM8, TMP8(%rsp)
22018c2ecf20Sopenharmony_ci
        # Build 8 consecutive counter blocks in XMM1..XMM8.
        # in_order path: step with ONE, save the last value in CTR, then
        # byte-swap every block with SHUF_MASK.
        # Other path: step with ONEf and skip the byte swap.
        # NOTE(review): presumably CTR is already held in swapped byte order
        # on the ONEf path - confirm against the caller and the ONEf constant.
22028c2ecf20Sopenharmony_ci.if \loop_idx == in_order
22038c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
22048c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM1, \XMM2
22058c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM2, \XMM3
22068c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM3, \XMM4
22078c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM4, \XMM5
22088c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM5, \XMM6
22098c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM6, \XMM7
22108c2ecf20Sopenharmony_ci                vpaddd  ONE(%rip), \XMM7, \XMM8
22118c2ecf20Sopenharmony_ci                vmovdqa \XMM8, \CTR                       # CTR = last counter, taken before the byte swap
22128c2ecf20Sopenharmony_ci
22138c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
22148c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
22158c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
22168c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
22178c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
22188c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
22198c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
22208c2ecf20Sopenharmony_ci                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
22218c2ecf20Sopenharmony_ci.else
22228c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
22238c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM1, \XMM2
22248c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM2, \XMM3
22258c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM3, \XMM4
22268c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM4, \XMM5
22278c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM5, \XMM6
22288c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM6, \XMM7
22298c2ecf20Sopenharmony_ci                vpaddd  ONEf(%rip), \XMM7, \XMM8
22308c2ecf20Sopenharmony_ci                vmovdqa \XMM8, \CTR
22318c2ecf20Sopenharmony_ci.endif
22328c2ecf20Sopenharmony_ci
22338c2ecf20Sopenharmony_ci
22348c2ecf20Sopenharmony_ci        #######################################################################
22358c2ecf20Sopenharmony_ci
                # AES round 0: XOR round key 0 into all 8 counter blocks.
22368c2ecf20Sopenharmony_ci                vmovdqu (arg1), \T1
22378c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM1, \XMM1
22388c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM2, \XMM2
22398c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM3, \XMM3
22408c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM4, \XMM4
22418c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM5, \XMM5
22428c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM6, \XMM6
22438c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM7, \XMM7
22448c2ecf20Sopenharmony_ci                vpxor   \T1, \XMM8, \XMM8
22458c2ecf20Sopenharmony_ci
22468c2ecf20Sopenharmony_ci        #######################################################################
22478c2ecf20Sopenharmony_ci
22488c2ecf20Sopenharmony_ci
22498c2ecf20Sopenharmony_ci
22508c2ecf20Sopenharmony_ci
22518c2ecf20Sopenharmony_ci
                # AES rounds 1 and 2. From here on the deeper-indented code is
                # the AES stream and the shallower-indented code is GHASH.
22528c2ecf20Sopenharmony_ci                vmovdqu 16*1(arg1), \T1
22538c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
22548c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
22558c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
22568c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
22578c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
22588c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
22598c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
22608c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
22618c2ecf20Sopenharmony_ci
22628c2ecf20Sopenharmony_ci                vmovdqu 16*2(arg1), \T1
22638c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
22648c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
22658c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
22668c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
22678c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
22688c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
22698c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
22708c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
22718c2ecf20Sopenharmony_ci
22728c2ecf20Sopenharmony_ci
22738c2ecf20Sopenharmony_ci        #######################################################################
22748c2ecf20Sopenharmony_ci
        # GHASH block 1 (saved in T2) times HashKey_8, four 64x64 products.
        # This starts the three accumulators used for all 8 blocks:
        #   T4 = sum of high products, T7 = sum of low products,
        #   T6 = sum of both cross products (the middle term).
22758c2ecf20Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
22768c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
22778c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
22788c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
22798c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
22808c2ecf20Sopenharmony_ci        vpxor           \T5, \T6, \T6
22818c2ecf20Sopenharmony_ci
                # AES round 3
22828c2ecf20Sopenharmony_ci                vmovdqu 16*3(arg1), \T1
22838c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
22848c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
22858c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
22868c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
22878c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
22888c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
22898c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
22908c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
22918c2ecf20Sopenharmony_ci
        # GHASH block 2 (TMP2) times HashKey_7, folded into T4, T7 and T6.
        # T1 is free for this because the round key it held is already used.
22928c2ecf20Sopenharmony_ci        vmovdqa         TMP2(%rsp), \T1
22938c2ecf20Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
22948c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
22958c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
22968c2ecf20Sopenharmony_ci
22978c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
22988c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
22998c2ecf20Sopenharmony_ci
23008c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
23018c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23028c2ecf20Sopenharmony_ci
23038c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
23048c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23058c2ecf20Sopenharmony_ci
                # AES round 4
23068c2ecf20Sopenharmony_ci                vmovdqu 16*4(arg1), \T1
23078c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
23088c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
23098c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
23108c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
23118c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
23128c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
23138c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
23148c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
23158c2ecf20Sopenharmony_ci
23168c2ecf20Sopenharmony_ci        #######################################################################
23178c2ecf20Sopenharmony_ci
        # GHASH block 3 (TMP3) times HashKey_6
23188c2ecf20Sopenharmony_ci        vmovdqa         TMP3(%rsp), \T1
23198c2ecf20Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
23208c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
23218c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
23228c2ecf20Sopenharmony_ci
23238c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
23248c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
23258c2ecf20Sopenharmony_ci
23268c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
23278c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23288c2ecf20Sopenharmony_ci
23298c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
23308c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23318c2ecf20Sopenharmony_ci
                # AES round 5
23328c2ecf20Sopenharmony_ci                vmovdqu 16*5(arg1), \T1
23338c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
23348c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
23358c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
23368c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
23378c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
23388c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
23398c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
23408c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
23418c2ecf20Sopenharmony_ci
        # GHASH block 4 (TMP4) times HashKey_5
23428c2ecf20Sopenharmony_ci        vmovdqa         TMP4(%rsp), \T1
23438c2ecf20Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
23448c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
23458c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
23468c2ecf20Sopenharmony_ci
23478c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
23488c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
23498c2ecf20Sopenharmony_ci
23508c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
23518c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23528c2ecf20Sopenharmony_ci
23538c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
23548c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23558c2ecf20Sopenharmony_ci
                # AES round 6
23568c2ecf20Sopenharmony_ci                vmovdqu 16*6(arg1), \T1
23578c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
23588c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
23598c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
23608c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
23618c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
23628c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
23638c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
23648c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
23658c2ecf20Sopenharmony_ci
23668c2ecf20Sopenharmony_ci
        # GHASH block 5 (TMP5) times HashKey_4
23678c2ecf20Sopenharmony_ci        vmovdqa         TMP5(%rsp), \T1
23688c2ecf20Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
23698c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
23708c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
23718c2ecf20Sopenharmony_ci
23728c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
23738c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
23748c2ecf20Sopenharmony_ci
23758c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
23768c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23778c2ecf20Sopenharmony_ci
23788c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
23798c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
23808c2ecf20Sopenharmony_ci
                # AES round 7
23818c2ecf20Sopenharmony_ci                vmovdqu 16*7(arg1), \T1
23828c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
23838c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
23848c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
23858c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
23868c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
23878c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
23888c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
23898c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
23908c2ecf20Sopenharmony_ci
        # GHASH block 6 (TMP6) times HashKey_3
23918c2ecf20Sopenharmony_ci        vmovdqa         TMP6(%rsp), \T1
23928c2ecf20Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
23938c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
23948c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
23958c2ecf20Sopenharmony_ci
23968c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
23978c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
23988c2ecf20Sopenharmony_ci
23998c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
24008c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
24018c2ecf20Sopenharmony_ci
24028c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
24038c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
24048c2ecf20Sopenharmony_ci
                # AES round 8
24058c2ecf20Sopenharmony_ci                vmovdqu 16*8(arg1), \T1
24068c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM1, \XMM1
24078c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM2, \XMM2
24088c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM3, \XMM3
24098c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM4, \XMM4
24108c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM5, \XMM5
24118c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM6, \XMM6
24128c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM7, \XMM7
24138c2ecf20Sopenharmony_ci                vaesenc \T1, \XMM8, \XMM8
24148c2ecf20Sopenharmony_ci
        # GHASH block 7 (TMP7) times HashKey_2
24158c2ecf20Sopenharmony_ci        vmovdqa         TMP7(%rsp), \T1
24168c2ecf20Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
24178c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
24188c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T4
24198c2ecf20Sopenharmony_ci
24208c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
24218c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
24228c2ecf20Sopenharmony_ci
24238c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
24248c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
24258c2ecf20Sopenharmony_ci
24268c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
24278c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
24288c2ecf20Sopenharmony_ci
24298c2ecf20Sopenharmony_ci
24308c2ecf20Sopenharmony_ci        #######################################################################
24318c2ecf20Sopenharmony_ci
                # AES round 9 (round key held in T5 this time, not T1)
24328c2ecf20Sopenharmony_ci                vmovdqu 16*9(arg1), \T5
24338c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM1, \XMM1
24348c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM2, \XMM2
24358c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM3, \XMM3
24368c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM4, \XMM4
24378c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM5, \XMM5
24388c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM6, \XMM6
24398c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM7, \XMM7
24408c2ecf20Sopenharmony_ci                vaesenc \T5, \XMM8, \XMM8
24418c2ecf20Sopenharmony_ci
        # GHASH block 8 (TMP8) times HashKey. The high product is taken last
        # and the finished high sum is written to T1 rather than T4.
24428c2ecf20Sopenharmony_ci        vmovdqa         TMP8(%rsp), \T1
24438c2ecf20Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
24448c2ecf20Sopenharmony_ci
24458c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \T1, \T3
24468c2ecf20Sopenharmony_ci        vpxor           \T3, \T7, \T7
24478c2ecf20Sopenharmony_ci
24488c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T5, \T1, \T3
24498c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
24508c2ecf20Sopenharmony_ci
24518c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T5, \T1, \T3
24528c2ecf20Sopenharmony_ci        vpxor           \T3, \T6, \T6
24538c2ecf20Sopenharmony_ci
24548c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \T1, \T3
24558c2ecf20Sopenharmony_ci        vpxor           \T3, \T4, \T1
24568c2ecf20Sopenharmony_ci
24578c2ecf20Sopenharmony_ci
                # Round key 10; this is already the final key when REP == 9.
24588c2ecf20Sopenharmony_ci                vmovdqu 16*10(arg1), \T5
24598c2ecf20Sopenharmony_ci
        # Extra rounds for REP > 9. Each pass applies the key already in T5
        # and then preloads the next one, so once the loop is done T5 holds
        # round key REP+1, which is consumed by vaesenclast below.
24608c2ecf20Sopenharmony_ci        i = 11
24618c2ecf20Sopenharmony_ci        setreg
24628c2ecf20Sopenharmony_ci.rep (\REP-9)
24638c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM1, \XMM1
24648c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM2, \XMM2
24658c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM3, \XMM3
24668c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM4, \XMM4
24678c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM5, \XMM5
24688c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM6, \XMM6
24698c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM7, \XMM7
24708c2ecf20Sopenharmony_ci        vaesenc \T5, \XMM8, \XMM8
24718c2ecf20Sopenharmony_ci
24728c2ecf20Sopenharmony_ci        vmovdqu 16*i(arg1), \T5
24738c2ecf20Sopenharmony_ci        i = i + 1
24748c2ecf20Sopenharmony_ci        setreg
24758c2ecf20Sopenharmony_ci.endr
24768c2ecf20Sopenharmony_ci
	# Last AES round fused with the data XOR: T2 = final key ^ input block,
	# so vaesenclast directly produces keystream ^ input for block i.
	# ENC path: the result stays in reg_j and is stored further down.
	# DEC path: the result goes through T3 straight to the output buffer and
	# reg_j is reloaded with the input block, so reg_j holds the ciphertext
	# block in both modes.
24778c2ecf20Sopenharmony_ci	i = 0
24788c2ecf20Sopenharmony_ci	j = 1
24798c2ecf20Sopenharmony_ci	setreg
24808c2ecf20Sopenharmony_ci.rep 8
24818c2ecf20Sopenharmony_ci		vpxor	16*i(arg4, %r11), \T5, \T2
24828c2ecf20Sopenharmony_ci                .if \ENC_DEC == ENC
24838c2ecf20Sopenharmony_ci                vaesenclast     \T2, reg_j, reg_j
24848c2ecf20Sopenharmony_ci                .else
24858c2ecf20Sopenharmony_ci                vaesenclast     \T2, reg_j, \T3
24868c2ecf20Sopenharmony_ci                vmovdqu 16*i(arg4, %r11), reg_j
24878c2ecf20Sopenharmony_ci                vmovdqu \T3, 16*i(arg3, %r11)
24888c2ecf20Sopenharmony_ci                .endif
24898c2ecf20Sopenharmony_ci	i = (i+1)
24908c2ecf20Sopenharmony_ci	j = (j+1)
24918c2ecf20Sopenharmony_ci	setreg
24928c2ecf20Sopenharmony_ci.endr
24938c2ecf20Sopenharmony_ci	#######################################################################
24948c2ecf20Sopenharmony_ci
24958c2ecf20Sopenharmony_ci
	# Fold the middle term T6 into the 256-bit product: its low half goes
	# into the top of T7 and its high half into the bottom of T1.
24968c2ecf20Sopenharmony_ci	vpslldq	$8, \T6, \T3				# T3 = T6 shifted left 2 DWs
24978c2ecf20Sopenharmony_ci	vpsrldq	$8, \T6, \T6				# T6 = T6 shifted right 2 DWs
24988c2ecf20Sopenharmony_ci	vpxor	\T3, \T7, \T7
24998c2ecf20Sopenharmony_ci	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
25008c2ecf20Sopenharmony_ci
25018c2ecf20Sopenharmony_ci
25028c2ecf20Sopenharmony_ci
25038c2ecf20Sopenharmony_ci	#######################################################################
25048c2ecf20Sopenharmony_ci	#first phase of the reduction
25058c2ecf20Sopenharmony_ci	vmovdqa         POLY2(%rip), \T3
25068c2ecf20Sopenharmony_ci
25078c2ecf20Sopenharmony_ci	vpclmulqdq	$0x01, \T7, \T3, \T2
25088c2ecf20Sopenharmony_ci	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs
25098c2ecf20Sopenharmony_ci
25108c2ecf20Sopenharmony_ci	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
25118c2ecf20Sopenharmony_ci	#######################################################################
	# ENC path only: store the 8 ciphertext blocks now, between the two
	# reduction phases. The DEC path already stored its output above.
25128c2ecf20Sopenharmony_ci                .if \ENC_DEC == ENC
25138c2ecf20Sopenharmony_ci		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
25148c2ecf20Sopenharmony_ci		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
25158c2ecf20Sopenharmony_ci		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
25168c2ecf20Sopenharmony_ci		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
25178c2ecf20Sopenharmony_ci		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
25188c2ecf20Sopenharmony_ci		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
25198c2ecf20Sopenharmony_ci		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
25208c2ecf20Sopenharmony_ci		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
25218c2ecf20Sopenharmony_ci                .endif
25228c2ecf20Sopenharmony_ci
25238c2ecf20Sopenharmony_ci	#######################################################################
25248c2ecf20Sopenharmony_ci	#second phase of the reduction
25258c2ecf20Sopenharmony_ci	vpclmulqdq	$0x00, \T7, \T3, \T2
25268c2ecf20Sopenharmony_ci	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
25278c2ecf20Sopenharmony_ci
25288c2ecf20Sopenharmony_ci	vpclmulqdq	$0x10, \T7, \T3, \T4
25298c2ecf20Sopenharmony_ci	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
25308c2ecf20Sopenharmony_ci
25318c2ecf20Sopenharmony_ci	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
25328c2ecf20Sopenharmony_ci	#######################################################################
25338c2ecf20Sopenharmony_ci	vpxor		\T4, \T1, \T1			# the result is in T1
25348c2ecf20Sopenharmony_ci
		# Byte-swap the 8 new blocks so they are in the form this macro
		# expects in XMM1..XMM8 when it hashes them.
25358c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
25368c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
25378c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
25388c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
25398c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
25408c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
25418c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
25428c2ecf20Sopenharmony_ci		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
25438c2ecf20Sopenharmony_ci
25448c2ecf20Sopenharmony_ci
	# Fold the reduced GHASH value into the first new block, so the running
	# hash travels in XMM1 to whatever hashes these 8 blocks next.
25458c2ecf20Sopenharmony_ci	vpxor	\T1, \XMM1, \XMM1
25468c2ecf20Sopenharmony_ci
25478c2ecf20Sopenharmony_ci
25488c2ecf20Sopenharmony_ci
25498c2ecf20Sopenharmony_ci.endm
25508c2ecf20Sopenharmony_ci
25518c2ecf20Sopenharmony_ci
25528c2ecf20Sopenharmony_ci# GHASH the last 8 ciphertext blocks.
25538c2ecf20Sopenharmony_ci.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
25548c2ecf20Sopenharmony_ci
25558c2ecf20Sopenharmony_ci        ## Karatsuba Method
25568c2ecf20Sopenharmony_ci
25578c2ecf20Sopenharmony_ci        vmovdqu         HashKey_8(arg2), \T5
25588c2ecf20Sopenharmony_ci
25598c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM1, \T2
25608c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
25618c2ecf20Sopenharmony_ci        vpxor           \XMM1, \T2, \T2
25628c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
25638c2ecf20Sopenharmony_ci
25648c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM1, \T6
25658c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM1, \T7
25668c2ecf20Sopenharmony_ci
25678c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \XMM1
25688c2ecf20Sopenharmony_ci
25698c2ecf20Sopenharmony_ci        ######################
25708c2ecf20Sopenharmony_ci
25718c2ecf20Sopenharmony_ci        vmovdqu         HashKey_7(arg2), \T5
25728c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM2, \T2
25738c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
25748c2ecf20Sopenharmony_ci        vpxor           \XMM2, \T2, \T2
25758c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
25768c2ecf20Sopenharmony_ci
25778c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM2, \T4
25788c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
25798c2ecf20Sopenharmony_ci
25808c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM2, \T4
25818c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
25828c2ecf20Sopenharmony_ci
25838c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
25848c2ecf20Sopenharmony_ci
25858c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
25868c2ecf20Sopenharmony_ci
25878c2ecf20Sopenharmony_ci        ######################
25888c2ecf20Sopenharmony_ci
25898c2ecf20Sopenharmony_ci        vmovdqu         HashKey_6(arg2), \T5
25908c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM3, \T2
25918c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
25928c2ecf20Sopenharmony_ci        vpxor           \XMM3, \T2, \T2
25938c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
25948c2ecf20Sopenharmony_ci
25958c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM3, \T4
25968c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
25978c2ecf20Sopenharmony_ci
25988c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM3, \T4
25998c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
26008c2ecf20Sopenharmony_ci
26018c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
26028c2ecf20Sopenharmony_ci
26038c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
26048c2ecf20Sopenharmony_ci
26058c2ecf20Sopenharmony_ci        ######################
26068c2ecf20Sopenharmony_ci
26078c2ecf20Sopenharmony_ci        vmovdqu         HashKey_5(arg2), \T5
26088c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM4, \T2
26098c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
26108c2ecf20Sopenharmony_ci        vpxor           \XMM4, \T2, \T2
26118c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
26128c2ecf20Sopenharmony_ci
26138c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM4, \T4
26148c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
26158c2ecf20Sopenharmony_ci
26168c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM4, \T4
26178c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
26188c2ecf20Sopenharmony_ci
26198c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
26208c2ecf20Sopenharmony_ci
26218c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
26228c2ecf20Sopenharmony_ci
26238c2ecf20Sopenharmony_ci        ######################
26248c2ecf20Sopenharmony_ci
26258c2ecf20Sopenharmony_ci        vmovdqu         HashKey_4(arg2), \T5
26268c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM5, \T2
26278c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
26288c2ecf20Sopenharmony_ci        vpxor           \XMM5, \T2, \T2
26298c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
26308c2ecf20Sopenharmony_ci
26318c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM5, \T4
26328c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
26338c2ecf20Sopenharmony_ci
26348c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM5, \T4
26358c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
26368c2ecf20Sopenharmony_ci
26378c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
26388c2ecf20Sopenharmony_ci
26398c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
26408c2ecf20Sopenharmony_ci
26418c2ecf20Sopenharmony_ci        ######################
26428c2ecf20Sopenharmony_ci
26438c2ecf20Sopenharmony_ci        vmovdqu         HashKey_3(arg2), \T5
26448c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM6, \T2
26458c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
26468c2ecf20Sopenharmony_ci        vpxor           \XMM6, \T2, \T2
26478c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
26488c2ecf20Sopenharmony_ci
26498c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM6, \T4
26508c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
26518c2ecf20Sopenharmony_ci
26528c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM6, \T4
26538c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
26548c2ecf20Sopenharmony_ci
26558c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
26568c2ecf20Sopenharmony_ci
26578c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
26588c2ecf20Sopenharmony_ci
26598c2ecf20Sopenharmony_ci        ######################
26608c2ecf20Sopenharmony_ci
26618c2ecf20Sopenharmony_ci        vmovdqu         HashKey_2(arg2), \T5
26628c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM7, \T2
26638c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
26648c2ecf20Sopenharmony_ci        vpxor           \XMM7, \T2, \T2
26658c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
26668c2ecf20Sopenharmony_ci
26678c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM7, \T4
26688c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
26698c2ecf20Sopenharmony_ci
26708c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM7, \T4
26718c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
26728c2ecf20Sopenharmony_ci
26738c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
26748c2ecf20Sopenharmony_ci
26758c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
26768c2ecf20Sopenharmony_ci
26778c2ecf20Sopenharmony_ci        ######################
26788c2ecf20Sopenharmony_ci
26798c2ecf20Sopenharmony_ci        vmovdqu         HashKey(arg2), \T5
26808c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \XMM8, \T2
26818c2ecf20Sopenharmony_ci        vpshufd         $0b01001110, \T5, \T3
26828c2ecf20Sopenharmony_ci        vpxor           \XMM8, \T2, \T2
26838c2ecf20Sopenharmony_ci        vpxor           \T5, \T3, \T3
26848c2ecf20Sopenharmony_ci
26858c2ecf20Sopenharmony_ci        vpclmulqdq      $0x11, \T5, \XMM8, \T4
26868c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6
26878c2ecf20Sopenharmony_ci
26888c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T5, \XMM8, \T4
26898c2ecf20Sopenharmony_ci        vpxor           \T4, \T7, \T7
26908c2ecf20Sopenharmony_ci
26918c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T3, \T2, \T2
26928c2ecf20Sopenharmony_ci
26938c2ecf20Sopenharmony_ci        vpxor           \T2, \XMM1, \XMM1
26948c2ecf20Sopenharmony_ci        vpxor           \T6, \XMM1, \XMM1
26958c2ecf20Sopenharmony_ci        vpxor           \T7, \XMM1, \T2
26968c2ecf20Sopenharmony_ci
26978c2ecf20Sopenharmony_ci
26988c2ecf20Sopenharmony_ci
26998c2ecf20Sopenharmony_ci
27008c2ecf20Sopenharmony_ci        vpslldq $8, \T2, \T4
27018c2ecf20Sopenharmony_ci        vpsrldq $8, \T2, \T2
27028c2ecf20Sopenharmony_ci
27038c2ecf20Sopenharmony_ci        vpxor   \T4, \T7, \T7
27048c2ecf20Sopenharmony_ci        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
27058c2ecf20Sopenharmony_ci						   # accumulated carry-less multiplications
27068c2ecf20Sopenharmony_ci
27078c2ecf20Sopenharmony_ci        #######################################################################
27088c2ecf20Sopenharmony_ci        #first phase of the reduction
27098c2ecf20Sopenharmony_ci        vmovdqa         POLY2(%rip), \T3
27108c2ecf20Sopenharmony_ci
27118c2ecf20Sopenharmony_ci        vpclmulqdq      $0x01, \T7, \T3, \T2
27128c2ecf20Sopenharmony_ci        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
27138c2ecf20Sopenharmony_ci
27148c2ecf20Sopenharmony_ci        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
27158c2ecf20Sopenharmony_ci        #######################################################################
27168c2ecf20Sopenharmony_ci
27178c2ecf20Sopenharmony_ci
27188c2ecf20Sopenharmony_ci        #second phase of the reduction
27198c2ecf20Sopenharmony_ci        vpclmulqdq      $0x00, \T7, \T3, \T2
27208c2ecf20Sopenharmony_ci        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
27218c2ecf20Sopenharmony_ci
27228c2ecf20Sopenharmony_ci        vpclmulqdq      $0x10, \T7, \T3, \T4
27238c2ecf20Sopenharmony_ci        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
27248c2ecf20Sopenharmony_ci
27258c2ecf20Sopenharmony_ci        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
27268c2ecf20Sopenharmony_ci        #######################################################################
27278c2ecf20Sopenharmony_ci        vpxor           \T4, \T6, \T6              # the result is in T6
27288c2ecf20Sopenharmony_ci.endm
27298c2ecf20Sopenharmony_ci
27308c2ecf20Sopenharmony_ci
27318c2ecf20Sopenharmony_ci
27328c2ecf20Sopenharmony_ci#############################################################
27338c2ecf20Sopenharmony_ci#void   aesni_gcm_init_avx_gen4
27348c2ecf20Sopenharmony_ci#        (gcm_data     *my_ctx_data,
27358c2ecf20Sopenharmony_ci#         gcm_context_data *data,
27368c2ecf20Sopenharmony_ci#        u8      *iv, /* Pre-counter block j0: 4 byte salt
27378c2ecf20Sopenharmony_ci#			(from Security Association) concatenated with 8 byte
27388c2ecf20Sopenharmony_ci#			Initialisation Vector (from IPSec ESP Payload)
27398c2ecf20Sopenharmony_ci#			concatenated with 0x00000001. 16-byte aligned pointer. */
27408c2ecf20Sopenharmony_ci#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
27418c2ecf20Sopenharmony_ci#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
27428c2ecf20Sopenharmony_ci#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
27438c2ecf20Sopenharmony_ci#############################################################
27448c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_init_avx_gen4)
27458c2ecf20Sopenharmony_ci        FUNC_SAVE                                  # macro defined outside this block; presumably saves caller state (confirm)
27468c2ecf20Sopenharmony_ci        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2       # INIT is instantiated with the AVX2 GHASH-multiply and precompute macros
27478c2ecf20Sopenharmony_ci        FUNC_RESTORE                               # counterpart of FUNC_SAVE above
27488c2ecf20Sopenharmony_ci        RET
27498c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_init_avx_gen4)
27508c2ecf20Sopenharmony_ci
27518c2ecf20Sopenharmony_ci###############################################################################
27528c2ecf20Sopenharmony_ci#void   aesni_gcm_enc_update_avx_gen4(
27538c2ecf20Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
27548c2ecf20Sopenharmony_ci#        gcm_context_data *data,
27558c2ecf20Sopenharmony_ci#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
27568c2ecf20Sopenharmony_ci#        const   u8 *in, /* Plaintext input */
27578c2ecf20Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
27588c2ecf20Sopenharmony_ci###############################################################################
27598c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
27608c2ecf20Sopenharmony_ci        FUNC_SAVE
27618c2ecf20Sopenharmony_ci        mov     keysize,%eax                       # AES key length in bytes (keysize is defined outside this block)
27628c2ecf20Sopenharmony_ci        cmp     $32, %eax
27638c2ecf20Sopenharmony_ci        je      .Lkey_256_enc_update4              # 32-byte key
27648c2ecf20Sopenharmony_ci        cmp     $16, %eax
27658c2ecf20Sopenharmony_ci        je      .Lkey_128_enc_update4              # 16-byte key
27668c2ecf20Sopenharmony_ci        # must be 192; the last GCM_ENC_DEC argument is 11, 9 or 13 per key size (presumably the AES round count minus one - confirm against GCM_ENC_DEC)
27678c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
27688c2ecf20Sopenharmony_ci        FUNC_RESTORE
27698c2ecf20Sopenharmony_ci        RET
27708c2ecf20Sopenharmony_ci.Lkey_128_enc_update4:                             # .L prefix: assembler-local label, no symbol emitted inside the function
27718c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
27728c2ecf20Sopenharmony_ci        FUNC_RESTORE
27738c2ecf20Sopenharmony_ci        RET
27748c2ecf20Sopenharmony_ci.Lkey_256_enc_update4:
27758c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
27768c2ecf20Sopenharmony_ci        FUNC_RESTORE
27778c2ecf20Sopenharmony_ci        RET
27788c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
27798c2ecf20Sopenharmony_ci
27808c2ecf20Sopenharmony_ci###############################################################################
27818c2ecf20Sopenharmony_ci#void   aesni_gcm_dec_update_avx_gen4(
27828c2ecf20Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
27838c2ecf20Sopenharmony_ci#        gcm_context_data *data,
27848c2ecf20Sopenharmony_ci#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
27858c2ecf20Sopenharmony_ci#        const   u8 *in, /* Ciphertext input */
27868c2ecf20Sopenharmony_ci#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
27878c2ecf20Sopenharmony_ci###############################################################################
27888c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
27898c2ecf20Sopenharmony_ci        FUNC_SAVE
27908c2ecf20Sopenharmony_ci        mov     keysize,%eax                       # AES key length in bytes (keysize is defined outside this block)
27918c2ecf20Sopenharmony_ci        cmp     $32, %eax
27928c2ecf20Sopenharmony_ci        je      .Lkey_256_dec_update4              # 32-byte key
27938c2ecf20Sopenharmony_ci        cmp     $16, %eax
27948c2ecf20Sopenharmony_ci        je      .Lkey_128_dec_update4              # 16-byte key
27958c2ecf20Sopenharmony_ci        # must be 192; the last GCM_ENC_DEC argument is 11, 9 or 13 per key size (presumably the AES round count minus one - confirm against GCM_ENC_DEC)
27968c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
27978c2ecf20Sopenharmony_ci        FUNC_RESTORE
27988c2ecf20Sopenharmony_ci        RET
27998c2ecf20Sopenharmony_ci.Lkey_128_dec_update4:                             # .L prefix: assembler-local label, no symbol emitted inside the function
28008c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
28018c2ecf20Sopenharmony_ci        FUNC_RESTORE
28028c2ecf20Sopenharmony_ci        RET
28038c2ecf20Sopenharmony_ci.Lkey_256_dec_update4:
28048c2ecf20Sopenharmony_ci        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
28058c2ecf20Sopenharmony_ci        FUNC_RESTORE
28068c2ecf20Sopenharmony_ci        RET
28078c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
28088c2ecf20Sopenharmony_ci
28098c2ecf20Sopenharmony_ci###############################################################################
28108c2ecf20Sopenharmony_ci#void   aesni_gcm_finalize_avx_gen4(
28118c2ecf20Sopenharmony_ci#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
28128c2ecf20Sopenharmony_ci#        gcm_context_data *data,
28138c2ecf20Sopenharmony_ci#        u8      *auth_tag, /* Authenticated Tag output. */
28148c2ecf20Sopenharmony_ci#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
28158c2ecf20Sopenharmony_ci#                              Valid values are 16 (most likely), 12 or 8. */
28168c2ecf20Sopenharmony_ci###############################################################################
28178c2ecf20Sopenharmony_ciSYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
28188c2ecf20Sopenharmony_ci        FUNC_SAVE
28198c2ecf20Sopenharmony_ci        mov     keysize,%eax                       # AES key length in bytes (keysize is defined outside this block)
28208c2ecf20Sopenharmony_ci        cmp     $32, %eax
28218c2ecf20Sopenharmony_ci        je      .Lkey_256_finalize4                # 32-byte key
28228c2ecf20Sopenharmony_ci        cmp     $16, %eax
28238c2ecf20Sopenharmony_ci        je      .Lkey_128_finalize4                # 16-byte key
28248c2ecf20Sopenharmony_ci        # must be 192; GCM_COMPLETE gets 11, 9 or 13 per key size, then arg3 and arg4 (presumably auth_tag and auth_tag_len - confirm against the arg definitions)
28258c2ecf20Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
28268c2ecf20Sopenharmony_ci        FUNC_RESTORE
28278c2ecf20Sopenharmony_ci        RET
28288c2ecf20Sopenharmony_ci.Lkey_128_finalize4:                               # .L prefix: assembler-local label, no symbol emitted inside the function
28298c2ecf20Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
28308c2ecf20Sopenharmony_ci        FUNC_RESTORE
28318c2ecf20Sopenharmony_ci        RET
28328c2ecf20Sopenharmony_ci.Lkey_256_finalize4:
28338c2ecf20Sopenharmony_ci        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
28348c2ecf20Sopenharmony_ci        FUNC_RESTORE
28358c2ecf20Sopenharmony_ci        RET
28368c2ecf20Sopenharmony_ciSYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2837