#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context, parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffle
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 mode is selected by a masm/nasm/mingw64 flavour or by an output
# file named *.asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's own directory.
# $dir falls back to "" (i.e. the current directory) when $0 has no path
# part, instead of reading $1 after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Returns 1 if the dotted "major.minor" version $have is at least $want,
# else 0.  The components are compared as integers, so "2.9" sorts below
# "2.20" and "2.100" above "2.22"; comparing the strings as floating-point
# numbers gets both of those wrong.
my $version_ge = sub {
	my ($have, $want) = @_;
	my ($have_major, $have_minor) = split /\./, $have;
	my ($want_major, $want_minor) = split /\./, $want;
	return ($have_major > $want_major
		|| ($have_major == $want_major && $have_minor >= $want_minor))
	    ? 1 : 0;
};

# Probe the toolchain.  $avx ends up as 0, 1 or 2 depending on how many of
# the two version thresholds the assembler meets; the stitched code further
# down is only emitted when $avx>1.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	my $gas_version = $1;
	$avx = $version_ge->($gas_version,"2.20")
	     + $version_ge->($gas_version,"2.22");
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	my $nasm_version = $1;
	$avx = $version_ge->($nasm_version,"2.09")
	     + $version_ge->($nasm_version,"2.10");
}

# ml64 reports a single integer major version, so a numeric compare is fine.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM: 3.0 meets the first threshold, anything newer meets both.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped into the translator,
# which is started with $flavour and $output as its arguments.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

78e1051a39Sopenharmony_ciif ($avx>1) {{{
79e1051a39Sopenharmony_ci
# Symbolic register names used by the assembly templates below.

# The six arguments of aesni_gcm_[en|de]crypt, in prototype order.
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# %xmm0-%xmm8: GHASH working set (input block, temporaries, hash key power,
# partial products Z0-Z3 and the running hash value Xi).
($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

# %xmm9-%xmm14: the six AES blocks in flight; %xmm15: current round key.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

# General-purpose scratch: 32-bit counter word, round count, return value,
# constants pointer, and the input pointer/limit pair.
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

# _aesni_ctr32_ghash_6x: internal helper, declared "abi-omnipotent", so it
# works on whatever registers its caller prepared rather than on C arguments.
# Each .Loop6x iteration runs AES-CTR over six counter blocks while the GHASH
# multiply and reduction steps are interleaved between the vaesenc rounds.
#
# State prepared by the caller, as seen in aesni_gcm_decrypt below:
#   $T1           current counter block; $counter holds its last 32-bit word
#   $key, $Xip    biased pointers (key schedule +0x80, $Xip +0x40)
#   $rounds       value read from offset 0xf0 of the key structure
#   $const        address of .Lbswap_mask; other constants are read at
#                 fixed offsets from it
#   $Xi, $Z3      byte-swapped hash value and byte-swapped I[5]
#   caller's 0x30..0x7f(%rsp)
#                 byte-swapped I[4]..I[0]; addressed here with an extra +8
#                 because `call` pushed the return address
#   $in0, $end0   pointer and bound used for the movbe loads of hash input
#   $len          length in 16-byte blocks
#
# When `add 6<<24,$counter` carries, the byte that vpaddb increments has
# wrapped, so .Lhandle_ctr32 recomputes the counters with vpaddd on the
# byte-swapped value.  The round count selects the tail: fewer than 11
# rounds is the 128-bit key path, exactly 11 the 192-bit one, otherwise
# 256-bit.  On return the last six output blocks are still held in
# $inout0-$inout5 (the caller stores them) and $ret has advanced by 0x60
# for every iteration.
$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3, $inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,$ret
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.cfi_endproc
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# aesni_gcm_decrypt: returns 0 without touching anything when len is below
# 0x60 bytes; otherwise returns $ret, which _aesni_ctr32_ghash_6x advances
# by 0x60 for every six-block iteration it completes.
#
# Prologue: %rax keeps the entry stack pointer and is used as the frame
# base for all saves and restores; %rbx, %rbp and %r12-%r15 are pushed.
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xor	$ret,$ret
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax			# save stack pointer
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: %xmm6-%xmm15 are callee-saved in that ABI, so they are spilled
# below the pushed general-purpose registers (addressed relative to %rax).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
# Body: load the counter block and Xi, carve out a 128-byte-aligned scratch
# frame (moved further down if it would alias the key schedule), byte-swap
# the first six input blocks (I[5] stays in $Z3, I[4]..I[0] go to the stack),
# run the stitched loop, then store the six blocks it left in
# $inout0-$inout5 and write the byte-swapped Xi back.  $key and $Xip are
# biased (+0x80 and +0x40), which is why Xi is stored at -0x40($Xip).
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		($Xip),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20+0x20($Xip),$Xip	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	lea		($inp),$in0
	vmovdqu		0x40($inp),$Z0
	lea		-0xc0($inp,$len),$end0
	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		$ret,$ret
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 only: reload the %xmm6-%xmm15 values saved in the prologue.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Epilogue: reload the pushed registers through %rax, restore %rsp and
# return $ret in %rax.  .Lgcm_dec_abort is also the target of the
# short-input exit taken before anything was saved.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

546e1051a39Sopenharmony_ci$code.=<<___;
# _aesni_ctr32_6x -- encrypt six consecutive 16-byte blocks in CTR mode.
#
# Names below are the Perl variable names used in this file, written without
# their sigil because this text lives inside an interpolating heredoc.
#
# Reads:    T1 (current counter block), counter (integer used here only to
#           detect wrap-around of the last counter byte), key (addressed with
#           a -0x80 bias, i.e. the caller has advanced it by 0x80), rounds,
#           const (address of .Lbswap_mask), inp, out.  Ii must hold
#           .Lbswap_mask if the slow path is taken.
# Writes:   0x60 bytes at out; the same six results remain in
#           inout0..inout5.  T1 becomes the next counter block; inp and out
#           are both advanced by 0x60.
# Clobbers: %r12, %r13, Z0, Z1, Z2, Xi, T2, Hkey, rndkey and the flags.
547e1051a39Sopenharmony_ci.type	_aesni_ctr32_6x,\@abi-omnipotent
548e1051a39Sopenharmony_ci.align	32
549e1051a39Sopenharmony_ci_aesni_ctr32_6x:
550e1051a39Sopenharmony_ci.cfi_startproc
551e1051a39Sopenharmony_ci	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
552e1051a39Sopenharmony_ci	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
553e1051a39Sopenharmony_ci	lea		-1($rounds),%r13	# iterations of .Loop_ctr32: rounds-1
554e1051a39Sopenharmony_ci	vmovups		0x10-0x80($key),$rndkey	# round key 1
555e1051a39Sopenharmony_ci	lea		0x20-0x80($key),%r12	# %r12 -> round key 2
556e1051a39Sopenharmony_ci	vpxor		$Z0,$T1,$inout0	# block 0 = counter ^ round key 0
# Adding 6 to bits 31..24 of counter: a carry means the byte-wise increments
# below would wrap.  NOTE(review): this assumes counter is the 32-bit word
# loaded from offset 12 of the IV -- confirm against its declaration.
557e1051a39Sopenharmony_ci	add		\$`6<<24`,$counter
558e1051a39Sopenharmony_ci	jc		.Lhandle_ctr32_2	# wrap: take the 32-bit slow path
# Fast path: .Lone_msb has 1 in its last byte only, so each vpaddb bumps just
# the final byte of the counter block; every block is then xored with round
# key 0.
559e1051a39Sopenharmony_ci	vpaddb		$T2,$T1,$inout1
560e1051a39Sopenharmony_ci	vpaddb		$T2,$inout1,$inout2
561e1051a39Sopenharmony_ci	vpxor		$Z0,$inout1,$inout1
562e1051a39Sopenharmony_ci	vpaddb		$T2,$inout2,$inout3
563e1051a39Sopenharmony_ci	vpxor		$Z0,$inout2,$inout2
564e1051a39Sopenharmony_ci	vpaddb		$T2,$inout3,$inout4
565e1051a39Sopenharmony_ci	vpxor		$Z0,$inout3,$inout3
566e1051a39Sopenharmony_ci	vpaddb		$T2,$inout4,$inout5
567e1051a39Sopenharmony_ci	vpxor		$Z0,$inout4,$inout4
568e1051a39Sopenharmony_ci	vpaddb		$T2,$inout5,$T1	# next counter value (start + 6)
569e1051a39Sopenharmony_ci	vpxor		$Z0,$inout5,$inout5
570e1051a39Sopenharmony_ci	jmp		.Loop_ctr32
571e1051a39Sopenharmony_ci
572e1051a39Sopenharmony_ci.align	16
# One AES round on all six blocks per iteration; the next round key is
# fetched through %r12 while %r13 counts down.
573e1051a39Sopenharmony_ci.Loop_ctr32:
574e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout0,$inout0
575e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout1,$inout1
576e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout2,$inout2
577e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout3,$inout3
578e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout4,$inout4
579e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout5,$inout5
580e1051a39Sopenharmony_ci	vmovups		(%r12),$rndkey
581e1051a39Sopenharmony_ci	lea		0x10(%r12),%r12
582e1051a39Sopenharmony_ci	dec		%r13d
583e1051a39Sopenharmony_ci	jnz		.Loop_ctr32
584e1051a39Sopenharmony_ci
# Final two rounds.  The last round key is xored with the six input blocks
# first, so vaesenclast yields keystream ^ input directly.
585e1051a39Sopenharmony_ci	vmovdqu		(%r12),$Hkey		# last round key
586e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout0,$inout0
587e1051a39Sopenharmony_ci	vpxor		0x00($inp),$Hkey,$Z0
588e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout1,$inout1
589e1051a39Sopenharmony_ci	vpxor		0x10($inp),$Hkey,$Z1
590e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout2,$inout2
591e1051a39Sopenharmony_ci	vpxor		0x20($inp),$Hkey,$Z2
592e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout3,$inout3
593e1051a39Sopenharmony_ci	vpxor		0x30($inp),$Hkey,$Xi
594e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout4,$inout4
595e1051a39Sopenharmony_ci	vpxor		0x40($inp),$Hkey,$T2
596e1051a39Sopenharmony_ci	vaesenc		$rndkey,$inout5,$inout5
597e1051a39Sopenharmony_ci	vpxor		0x50($inp),$Hkey,$Hkey
598e1051a39Sopenharmony_ci	lea		0x60($inp),$inp	# input consumed: six blocks
599e1051a39Sopenharmony_ci
600e1051a39Sopenharmony_ci	vaesenclast	$Z0,$inout0,$inout0
601e1051a39Sopenharmony_ci	vaesenclast	$Z1,$inout1,$inout1
602e1051a39Sopenharmony_ci	vaesenclast	$Z2,$inout2,$inout2
603e1051a39Sopenharmony_ci	vaesenclast	$Xi,$inout3,$inout3
604e1051a39Sopenharmony_ci	vaesenclast	$T2,$inout4,$inout4
605e1051a39Sopenharmony_ci	vaesenclast	$Hkey,$inout5,$inout5
606e1051a39Sopenharmony_ci	vmovups		$inout0,0x00($out)
607e1051a39Sopenharmony_ci	vmovups		$inout1,0x10($out)
608e1051a39Sopenharmony_ci	vmovups		$inout2,0x20($out)
609e1051a39Sopenharmony_ci	vmovups		$inout3,0x30($out)
610e1051a39Sopenharmony_ci	vmovups		$inout4,0x40($out)
611e1051a39Sopenharmony_ci	vmovups		$inout5,0x50($out)
612e1051a39Sopenharmony_ci	lea		0x60($out),$out	# output produced: six blocks
613e1051a39Sopenharmony_ci
614e1051a39Sopenharmony_ci	ret
615e1051a39Sopenharmony_ci.align	32
# Slow path: the last counter byte would overflow, so the counter is
# byte-swapped, incremented with 32-bit lane adds (+1, then +2 steps giving
# +2..+6), and byte-swapped back.  Z0 still holds round key 0 here.
616e1051a39Sopenharmony_ci.Lhandle_ctr32_2:
617e1051a39Sopenharmony_ci	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
618e1051a39Sopenharmony_ci	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
619e1051a39Sopenharmony_ci	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
620e1051a39Sopenharmony_ci	vpaddd		$Z1,$Z2,$inout2
621e1051a39Sopenharmony_ci	vpaddd		$Z1,$inout1,$inout3
622e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout1,$inout1
623e1051a39Sopenharmony_ci	vpaddd		$Z1,$inout2,$inout4
624e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout2,$inout2
625e1051a39Sopenharmony_ci	vpxor		$Z0,$inout1,$inout1
626e1051a39Sopenharmony_ci	vpaddd		$Z1,$inout3,$inout5
627e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout3,$inout3
628e1051a39Sopenharmony_ci	vpxor		$Z0,$inout2,$inout2
629e1051a39Sopenharmony_ci	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
630e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout4,$inout4
631e1051a39Sopenharmony_ci	vpxor		$Z0,$inout3,$inout3
632e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout5,$inout5
633e1051a39Sopenharmony_ci	vpxor		$Z0,$inout4,$inout4
634e1051a39Sopenharmony_ci	vpshufb		$Ii,$T1,$T1		# next counter value
635e1051a39Sopenharmony_ci	vpxor		$Z0,$inout5,$inout5
636e1051a39Sopenharmony_ci	jmp	.Loop_ctr32
637e1051a39Sopenharmony_ci.cfi_endproc
638e1051a39Sopenharmony_ci.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
639e1051a39Sopenharmony_ci
# aesni_gcm_encrypt -- bulk AES-GCM encryption entry point (six arguments,
# per the .type directive below).  Names used for them in this file include
# inp, out, len, key, ivp and Xip (sigils omitted: this is heredoc text).
#
# Returns in %rax the value of ret: 0 when len is below 0x60*3 bytes, in
# which case the function returns immediately; otherwise ret is seeded with
# 0x60*2 before _aesni_ctr32_ghash_6x is called.
# NOTE(review): ret is presumably advanced inside _aesni_ctr32_ghash_6x so
# that the result is the number of bytes processed -- confirm in that routine.
#
# Flow:
#   1. save callee-saved GPRs (plus xmm6-xmm15 on Win64) and carve a 128-byte
#      aligned scratch frame that is moved away from the key schedule;
#   2. encrypt the first 2x6 blocks with _aesni_ctr32_6x, parking byte-swapped
#      copies of the first six outputs for later hashing;
#   3. run _aesni_ctr32_ghash_6x;
#   4. GHASH the blocks still pending (stack copies, then inout0..inout5) and
#      store the byte-swapped Xi back.
640e1051a39Sopenharmony_ci.globl	aesni_gcm_encrypt
641e1051a39Sopenharmony_ci.type	aesni_gcm_encrypt,\@function,6
642e1051a39Sopenharmony_ci.align	32
643e1051a39Sopenharmony_ciaesni_gcm_encrypt:
644e1051a39Sopenharmony_ci.cfi_startproc
645e1051a39Sopenharmony_ci	xor	$ret,$ret			# return value 0 on the abort path
646e1051a39Sopenharmony_ci	cmp	\$0x60*3,$len			# minimal accepted length
647e1051a39Sopenharmony_ci	jb	.Lgcm_enc_abort
648e1051a39Sopenharmony_ci
649e1051a39Sopenharmony_ci	lea	(%rsp),%rax			# save stack pointer
650e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
651e1051a39Sopenharmony_ci	push	%rbx
652e1051a39Sopenharmony_ci.cfi_push	%rbx
653e1051a39Sopenharmony_ci	push	%rbp
654e1051a39Sopenharmony_ci.cfi_push	%rbp
655e1051a39Sopenharmony_ci	push	%r12
656e1051a39Sopenharmony_ci.cfi_push	%r12
657e1051a39Sopenharmony_ci	push	%r13
658e1051a39Sopenharmony_ci.cfi_push	%r13
659e1051a39Sopenharmony_ci	push	%r14
660e1051a39Sopenharmony_ci.cfi_push	%r14
661e1051a39Sopenharmony_ci	push	%r15
662e1051a39Sopenharmony_ci.cfi_push	%r15
663e1051a39Sopenharmony_ci___
# Win64 only: preserve xmm6-xmm15.  The six pushes above took 0x30 bytes and
# the lea reserves 0xa8 more, so the save area starts 0xd8 below the %rax
# anchor.  .Lgcm_enc_body marks the end of the prologue for the SEH data.
664e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
665e1051a39Sopenharmony_ci	lea	-0xa8(%rsp),%rsp
666e1051a39Sopenharmony_ci	movaps	%xmm6,-0xd8(%rax)
667e1051a39Sopenharmony_ci	movaps	%xmm7,-0xc8(%rax)
668e1051a39Sopenharmony_ci	movaps	%xmm8,-0xb8(%rax)
669e1051a39Sopenharmony_ci	movaps	%xmm9,-0xa8(%rax)
670e1051a39Sopenharmony_ci	movaps	%xmm10,-0x98(%rax)
671e1051a39Sopenharmony_ci	movaps	%xmm11,-0x88(%rax)
672e1051a39Sopenharmony_ci	movaps	%xmm12,-0x78(%rax)
673e1051a39Sopenharmony_ci	movaps	%xmm13,-0x68(%rax)
674e1051a39Sopenharmony_ci	movaps	%xmm14,-0x58(%rax)
675e1051a39Sopenharmony_ci	movaps	%xmm15,-0x48(%rax)
676e1051a39Sopenharmony_ci.Lgcm_enc_body:
677e1051a39Sopenharmony_ci___
# Common body: frame setup, first two 6-block encryptions, stitched call.
678e1051a39Sopenharmony_ci$code.=<<___;
679e1051a39Sopenharmony_ci	vzeroupper
680e1051a39Sopenharmony_ci
681e1051a39Sopenharmony_ci	vmovdqu		($ivp),$T1		# input counter value
682e1051a39Sopenharmony_ci	add		\$-128,%rsp		# reserve 128 bytes of scratch
683e1051a39Sopenharmony_ci	mov		12($ivp),$counter	# last four bytes of the counter block
684e1051a39Sopenharmony_ci	lea		.Lbswap_mask(%rip),$const
685e1051a39Sopenharmony_ci	lea		-0x80($key),$in0	# borrow $in0
686e1051a39Sopenharmony_ci	mov		\$0xf80,$end0		# borrow $end0
687e1051a39Sopenharmony_ci	lea		0x80($key),$key		# size optimization
688e1051a39Sopenharmony_ci	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
689e1051a39Sopenharmony_ci	and		\$-128,%rsp		# ensure stack alignment
690e1051a39Sopenharmony_ci	mov		0xf0-0x80($key),$rounds
691e1051a39Sopenharmony_ci
# If (rsp & 0xf80) - ((original key - 0x80) & 0xf80) lies in [0,768), lower
# %rsp by that difference; otherwise leave the frame where it is.
692e1051a39Sopenharmony_ci	and		$end0,$in0
693e1051a39Sopenharmony_ci	and		%rsp,$end0
694e1051a39Sopenharmony_ci	sub		$in0,$end0
695e1051a39Sopenharmony_ci	jc		.Lenc_no_key_aliasing
696e1051a39Sopenharmony_ci	cmp		\$768,$end0
697e1051a39Sopenharmony_ci	jnc		.Lenc_no_key_aliasing
698e1051a39Sopenharmony_ci	sub		$end0,%rsp		# avoid aliasing with key
699e1051a39Sopenharmony_ci.Lenc_no_key_aliasing:
700e1051a39Sopenharmony_ci
# in0 = out and end0 = out + len - 0xc0 are not used again in this function;
# NOTE(review): presumably consumed by _aesni_ctr32_ghash_6x -- confirm.
701e1051a39Sopenharmony_ci	lea		($out),$in0
702e1051a39Sopenharmony_ci	lea		-0xc0($out,$len),$end0
703e1051a39Sopenharmony_ci	shr		\$4,$len		# bytes -> 16-byte blocks
704e1051a39Sopenharmony_ci
705e1051a39Sopenharmony_ci	call		_aesni_ctr32_6x
706e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
707e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout1,$T2
708e1051a39Sopenharmony_ci	vmovdqu		$Xi,0x70(%rsp)
709e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout2,$Z0
710e1051a39Sopenharmony_ci	vmovdqu		$T2,0x60(%rsp)
711e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout3,$Z1
712e1051a39Sopenharmony_ci	vmovdqu		$Z0,0x50(%rsp)
713e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout4,$Z2
714e1051a39Sopenharmony_ci	vmovdqu		$Z1,0x40(%rsp)
715e1051a39Sopenharmony_ci	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
716e1051a39Sopenharmony_ci	vmovdqu		$Z2,0x30(%rsp)
717e1051a39Sopenharmony_ci
718e1051a39Sopenharmony_ci	call		_aesni_ctr32_6x
719e1051a39Sopenharmony_ci
# Xi is loaded only now because _aesni_ctr32_6x overwrites that register.
720e1051a39Sopenharmony_ci	vmovdqu		($Xip),$Xi		# load Xi
721e1051a39Sopenharmony_ci	lea		0x20+0x20($Xip),$Xip	# size optimization
722e1051a39Sopenharmony_ci	sub		\$12,$len		# twelve blocks were encrypted above
723e1051a39Sopenharmony_ci	mov		\$0x60*2,$ret		# seed return value: two 6-block batches
724e1051a39Sopenharmony_ci	vpshufb		$Ii,$Xi,$Xi
725e1051a39Sopenharmony_ci
726e1051a39Sopenharmony_ci	call		_aesni_ctr32_ghash_6x
# Tail: store the last six outputs and byte-swap them for hashing, while
# starting the GHASH of the six blocks still parked on the stack.
# NOTE(review): the 0x20..0x70(%rsp) slots read below are presumably
# maintained by _aesni_ctr32_ghash_6x -- confirm in that routine.
727e1051a39Sopenharmony_ci	vmovdqu		0x20(%rsp),$Z3		# I[5]
728e1051a39Sopenharmony_ci	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
729e1051a39Sopenharmony_ci	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
730e1051a39Sopenharmony_ci	vpunpckhqdq	$Z3,$Z3,$T1
731e1051a39Sopenharmony_ci	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
732e1051a39Sopenharmony_ci	 vmovups	$inout0,-0x60($out)	# save output
733e1051a39Sopenharmony_ci	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
734e1051a39Sopenharmony_ci	vpxor		$Z3,$T1,$T1
735e1051a39Sopenharmony_ci	 vmovups	$inout1,-0x50($out)
736e1051a39Sopenharmony_ci	 vpshufb	$Ii,$inout1,$inout1
737e1051a39Sopenharmony_ci	 vmovups	$inout2,-0x40($out)
738e1051a39Sopenharmony_ci	 vpshufb	$Ii,$inout2,$inout2
739e1051a39Sopenharmony_ci	 vmovups	$inout3,-0x30($out)
740e1051a39Sopenharmony_ci	 vpshufb	$Ii,$inout3,$inout3
741e1051a39Sopenharmony_ci	 vmovups	$inout4,-0x20($out)
742e1051a39Sopenharmony_ci	 vpshufb	$Ii,$inout4,$inout4
743e1051a39Sopenharmony_ci	 vmovups	$inout5,-0x10($out)
744e1051a39Sopenharmony_ci	 vpshufb	$Ii,$inout5,$inout5
745e1051a39Sopenharmony_ci	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
746e1051a39Sopenharmony_ci___
# Scoped aliases for the tail GHASH: $HK reuses $rndkey's register and $T3
# reuses $inout0's (its byte-swapped value was just parked at 0x10(%rsp)).
747e1051a39Sopenharmony_ci{ my ($HK,$T3)=($rndkey,$inout0);
748e1051a39Sopenharmony_ci
749e1051a39Sopenharmony_ci$code.=<<___;
# First pass: multiply I[5]..I[1] and (I[0] ^ Xi) by the key powers loaded
# from the table at Xip (Hkey^1..Hkey^6, per the comments on the loads),
# accumulating low, high and middle partial products.
750e1051a39Sopenharmony_ci	 vmovdqu	0x30(%rsp),$Z2		# I[4]
751e1051a39Sopenharmony_ci	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
752e1051a39Sopenharmony_ci	 vpunpckhqdq	$Z2,$Z2,$T2
753e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
754e1051a39Sopenharmony_ci	 vpxor		$Z2,$T2,$T2
755e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
756e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$HK,$T1,$T1
757e1051a39Sopenharmony_ci
758e1051a39Sopenharmony_ci	 vmovdqu	0x40(%rsp),$T3		# I[3]
759e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
760e1051a39Sopenharmony_ci	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
761e1051a39Sopenharmony_ci	vpxor		$Z1,$Z0,$Z0
762e1051a39Sopenharmony_ci	 vpunpckhqdq	$T3,$T3,$Z1
763e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
764e1051a39Sopenharmony_ci	 vpxor		$T3,$Z1,$Z1
765e1051a39Sopenharmony_ci	vpxor		$Z3,$Z2,$Z2
766e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$HK,$T2,$T2
767e1051a39Sopenharmony_ci	 vmovdqu	0x50-0x20($Xip),$HK
768e1051a39Sopenharmony_ci	vpxor		$T1,$T2,$T2
769e1051a39Sopenharmony_ci
770e1051a39Sopenharmony_ci	 vmovdqu	0x50(%rsp),$T1		# I[2]
771e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
772e1051a39Sopenharmony_ci	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
773e1051a39Sopenharmony_ci	vpxor		$Z0,$Z3,$Z3
774e1051a39Sopenharmony_ci	 vpunpckhqdq	$T1,$T1,$Z0
775e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
776e1051a39Sopenharmony_ci	 vpxor		$T1,$Z0,$Z0
777e1051a39Sopenharmony_ci	vpxor		$Z2,$T3,$T3
778e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
779e1051a39Sopenharmony_ci	vpxor		$T2,$Z1,$Z1
780e1051a39Sopenharmony_ci
781e1051a39Sopenharmony_ci	 vmovdqu	0x60(%rsp),$T2		# I[1]
782e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
783e1051a39Sopenharmony_ci	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
784e1051a39Sopenharmony_ci	vpxor		$Z3,$Z2,$Z2
785e1051a39Sopenharmony_ci	 vpunpckhqdq	$T2,$T2,$Z3
786e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Ii,$T1,$T1
787e1051a39Sopenharmony_ci	 vpxor		$T2,$Z3,$Z3
788e1051a39Sopenharmony_ci	vpxor		$T3,$T1,$T1
789e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
790e1051a39Sopenharmony_ci	 vmovdqu	0x80-0x20($Xip),$HK
791e1051a39Sopenharmony_ci	vpxor		$Z1,$Z0,$Z0
792e1051a39Sopenharmony_ci
793e1051a39Sopenharmony_ci	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
794e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
795e1051a39Sopenharmony_ci	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
796e1051a39Sopenharmony_ci	 vpunpckhqdq	$Xi,$Xi,$T3
797e1051a39Sopenharmony_ci	vpxor		$Z2,$Z1,$Z1
798e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
799e1051a39Sopenharmony_ci	 vpxor		$Xi,$T3,$T3
800e1051a39Sopenharmony_ci	vpxor		$T1,$T2,$T2
801e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
802e1051a39Sopenharmony_ci	vpxor		$Z0,$Z3,$Z0
803e1051a39Sopenharmony_ci
# Second pass starts here on inout5..inout1 (byte-swapped last outputs); the
# post-processing and two-phase reduction of the first pass are interleaved
# with it (the doubly indented lines).
804e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
805e1051a39Sopenharmony_ci	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
806e1051a39Sopenharmony_ci	 vpunpckhqdq	$inout5,$inout5,$T1
807e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
808e1051a39Sopenharmony_ci	 vpxor		$inout5,$T1,$T1
809e1051a39Sopenharmony_ci	vpxor		$Z1,$Z2,$Z1
810e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$HK,$T3,$T3
811e1051a39Sopenharmony_ci	 vmovdqu	0x20-0x20($Xip),$HK
812e1051a39Sopenharmony_ci	vpxor		$T2,$Xi,$Z3
813e1051a39Sopenharmony_ci	vpxor		$Z0,$T3,$Z2
814e1051a39Sopenharmony_ci
815e1051a39Sopenharmony_ci	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
816e1051a39Sopenharmony_ci	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
817e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
818e1051a39Sopenharmony_ci	  vpxor		$T3,$Z2,$Z2
819e1051a39Sopenharmony_ci	 vpunpckhqdq	$inout4,$inout4,$T2
820e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
821e1051a39Sopenharmony_ci	 vpxor		$inout4,$T2,$T2
822e1051a39Sopenharmony_ci	  vpslldq	\$8,$Z2,$T3
823e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$HK,$T1,$T1
824e1051a39Sopenharmony_ci	  vpxor		$T3,$Z1,$Xi
825e1051a39Sopenharmony_ci	  vpsrldq	\$8,$Z2,$Z2
826e1051a39Sopenharmony_ci	  vpxor		$Z2,$Z3,$Z3
827e1051a39Sopenharmony_ci
828e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
829e1051a39Sopenharmony_ci	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
830e1051a39Sopenharmony_ci	vpxor		$Z0,$Z1,$Z1
831e1051a39Sopenharmony_ci	 vpunpckhqdq	$inout3,$inout3,$T3
832e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
833e1051a39Sopenharmony_ci	 vpxor		$inout3,$T3,$T3
834e1051a39Sopenharmony_ci	vpxor		$inout5,$inout4,$inout4
835e1051a39Sopenharmony_ci	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
836e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$HK,$T2,$T2
837e1051a39Sopenharmony_ci	 vmovdqu	0x50-0x20($Xip),$HK
838e1051a39Sopenharmony_ci	vpxor		$T1,$T2,$T2
839e1051a39Sopenharmony_ci
840e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
841e1051a39Sopenharmony_ci	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
842e1051a39Sopenharmony_ci	vpxor		$Z1,$Z0,$Z0
843e1051a39Sopenharmony_ci	 vpunpckhqdq	$inout2,$inout2,$T1
844e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
845e1051a39Sopenharmony_ci	 vpxor		$inout2,$T1,$T1
846e1051a39Sopenharmony_ci	vpxor		$inout4,$inout3,$inout3
847e1051a39Sopenharmony_ci	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
848e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$HK,$T3,$T3
849e1051a39Sopenharmony_ci	vpxor		$T2,$T3,$T3
850e1051a39Sopenharmony_ci
851e1051a39Sopenharmony_ci	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi	# multiply by .Lpoly
852e1051a39Sopenharmony_ci	  vxorps	$inout5,$Xi,$Xi
853e1051a39Sopenharmony_ci
854e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
855e1051a39Sopenharmony_ci	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
856e1051a39Sopenharmony_ci	vpxor		$Z0,$Z1,$Z1
857e1051a39Sopenharmony_ci	 vpunpckhqdq	$inout1,$inout1,$T2
858e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
859e1051a39Sopenharmony_ci	 vpxor		$inout1,$T2,$T2
860e1051a39Sopenharmony_ci	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
861e1051a39Sopenharmony_ci	vpxor		$inout3,$inout2,$inout2
862e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$HK,$T1,$T1
863e1051a39Sopenharmony_ci	 vmovdqu	0x80-0x20($Xip),$HK
864e1051a39Sopenharmony_ci	vpxor		$T3,$T1,$T1
865e1051a39Sopenharmony_ci
866e1051a39Sopenharmony_ci	  vxorps	$Z3,$inout5,$inout5
867e1051a39Sopenharmony_ci	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi	# multiply by .Lpoly
868e1051a39Sopenharmony_ci	  vxorps	$inout5,$Xi,$Xi
869e1051a39Sopenharmony_ci
870e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
871e1051a39Sopenharmony_ci	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
872e1051a39Sopenharmony_ci	vpxor		$Z1,$Z0,$Z0
873e1051a39Sopenharmony_ci	 vpunpckhqdq	$Xi,$Xi,$T3
874e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
875e1051a39Sopenharmony_ci	 vpxor		$Xi,$T3,$T3
876e1051a39Sopenharmony_ci	vpxor		$inout2,$inout1,$inout1
877e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$HK,$T2,$T2
878e1051a39Sopenharmony_ci	vpxor		$T1,$T2,$T2
879e1051a39Sopenharmony_ci
880e1051a39Sopenharmony_ci	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
881e1051a39Sopenharmony_ci	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
882e1051a39Sopenharmony_ci	vpxor		$Z0,$Z1,$Z1
883e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$HK,$T3,$Z2
884e1051a39Sopenharmony_ci	vpxor		$inout1,$Z3,$Z3
885e1051a39Sopenharmony_ci	vpxor		$T2,$Z2,$Z2
886e1051a39Sopenharmony_ci
887e1051a39Sopenharmony_ci	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
888e1051a39Sopenharmony_ci	vpxor		$Z0,$Z2,$Z2
889e1051a39Sopenharmony_ci	vpslldq		\$8,$Z2,$T1
890e1051a39Sopenharmony_ci	vmovdqu		0x10($const),$Hkey	# .Lpoly
891e1051a39Sopenharmony_ci	vpsrldq		\$8,$Z2,$Z2
892e1051a39Sopenharmony_ci	vpxor		$T1,$Z1,$Xi
893e1051a39Sopenharmony_ci	vpxor		$Z2,$Z3,$Z3
894e1051a39Sopenharmony_ci
895e1051a39Sopenharmony_ci	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
896e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
897e1051a39Sopenharmony_ci	vpxor		$T2,$Xi,$Xi
898e1051a39Sopenharmony_ci
899e1051a39Sopenharmony_ci	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
900e1051a39Sopenharmony_ci	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
901e1051a39Sopenharmony_ci	vpxor		$Z3,$T2,$T2
902e1051a39Sopenharmony_ci	vpxor		$T2,$Xi,$Xi
903e1051a39Sopenharmony_ci___
904e1051a39Sopenharmony_ci}
# Write the hash back.  $Xip was advanced by 0x40 above, so -0x40($Xip) is the
# location Xi was loaded from (assuming _aesni_ctr32_ghash_6x leaves $Xip
# untouched -- TODO confirm).
905e1051a39Sopenharmony_ci$code.=<<___;
906e1051a39Sopenharmony_ci	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
907e1051a39Sopenharmony_ci	vmovdqu		$Xi,-0x40($Xip)		# output Xi
908e1051a39Sopenharmony_ci
909e1051a39Sopenharmony_ci	vzeroupper
910e1051a39Sopenharmony_ci___
# Win64 only: reload xmm6-xmm15 from the slots filled in the prologue.
911e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
912e1051a39Sopenharmony_ci	movaps	-0xd8(%rax),%xmm6
913e1051a39Sopenharmony_ci	movaps	-0xc8(%rax),%xmm7
914e1051a39Sopenharmony_ci	movaps	-0xb8(%rax),%xmm8
915e1051a39Sopenharmony_ci	movaps	-0xa8(%rax),%xmm9
916e1051a39Sopenharmony_ci	movaps	-0x98(%rax),%xmm10
917e1051a39Sopenharmony_ci	movaps	-0x88(%rax),%xmm11
918e1051a39Sopenharmony_ci	movaps	-0x78(%rax),%xmm12
919e1051a39Sopenharmony_ci	movaps	-0x68(%rax),%xmm13
920e1051a39Sopenharmony_ci	movaps	-0x58(%rax),%xmm14
921e1051a39Sopenharmony_ci	movaps	-0x48(%rax),%xmm15
922e1051a39Sopenharmony_ci___
# Epilogue: reload the pushed GPRs relative to the %rax anchor (the frame was
# realigned, so they cannot simply be popped), restore %rsp, return $ret.
923e1051a39Sopenharmony_ci$code.=<<___;
924e1051a39Sopenharmony_ci	mov	-48(%rax),%r15
925e1051a39Sopenharmony_ci.cfi_restore	%r15
926e1051a39Sopenharmony_ci	mov	-40(%rax),%r14
927e1051a39Sopenharmony_ci.cfi_restore	%r14
928e1051a39Sopenharmony_ci	mov	-32(%rax),%r13
929e1051a39Sopenharmony_ci.cfi_restore	%r13
930e1051a39Sopenharmony_ci	mov	-24(%rax),%r12
931e1051a39Sopenharmony_ci.cfi_restore	%r12
932e1051a39Sopenharmony_ci	mov	-16(%rax),%rbp
933e1051a39Sopenharmony_ci.cfi_restore	%rbp
934e1051a39Sopenharmony_ci	mov	-8(%rax),%rbx
935e1051a39Sopenharmony_ci.cfi_restore	%rbx
936e1051a39Sopenharmony_ci	lea	(%rax),%rsp		# restore %rsp
937e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
938e1051a39Sopenharmony_ci.Lgcm_enc_abort:
939e1051a39Sopenharmony_ci	mov	$ret,%rax		# return value
940e1051a39Sopenharmony_ci	ret
941e1051a39Sopenharmony_ci.cfi_endproc
942e1051a39Sopenharmony_ci.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
943e1051a39Sopenharmony_ci___
944e1051a39Sopenharmony_ci
# Constant pool.  The code above addresses every entry relative to
# .Lbswap_mask (held in $const), so the order and the 16-byte size of each
# entry are part of the contract:
#   +0x00 .Lbswap_mask  vpshufb mask that reverses the 16 bytes of a block
#   +0x10 .Lpoly        multiplier used by the reduction phases
#   +0x20 .Lone_msb     1 in the last byte  (byte-wise counter increment)
#   +0x30 .Ltwo_lsb     2 in the first byte (increment of a swapped counter)
#   +0x40 .Lone_lsb     1 in the first byte (increment of a swapped counter)
945e1051a39Sopenharmony_ci$code.=<<___;
946e1051a39Sopenharmony_ci.align	64
947e1051a39Sopenharmony_ci.Lbswap_mask:
948e1051a39Sopenharmony_ci	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
949e1051a39Sopenharmony_ci.Lpoly:
950e1051a39Sopenharmony_ci	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
951e1051a39Sopenharmony_ci.Lone_msb:
952e1051a39Sopenharmony_ci	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
953e1051a39Sopenharmony_ci.Ltwo_lsb:
954e1051a39Sopenharmony_ci	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
955e1051a39Sopenharmony_ci.Lone_lsb:
956e1051a39Sopenharmony_ci	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
957e1051a39Sopenharmony_ci.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
958e1051a39Sopenharmony_ci.align	64
959e1051a39Sopenharmony_ci___
# Win64 structured-exception support: emits gcm_se_handler plus the .pdata and
# .xdata records that attach it to aesni_gcm_decrypt and aesni_gcm_encrypt.
# The handler rebuilds the caller's register state in the CONTEXT record from
# the frame laid out by those functions, then defers to RtlVirtualUnwind and
# returns ExceptionContinueSearch.
960e1051a39Sopenharmony_ciif ($win64) {
# Registers in which the four handler arguments arrive.
961e1051a39Sopenharmony_ci$rec="%rcx";
962e1051a39Sopenharmony_ci$frame="%rdx";
963e1051a39Sopenharmony_ci$context="%r8";
964e1051a39Sopenharmony_ci$disp="%r9";
965e1051a39Sopenharmony_ci
966e1051a39Sopenharmony_ci$code.=<<___
967e1051a39Sopenharmony_ci.extern	__imp_RtlVirtualUnwind
968e1051a39Sopenharmony_ci.type	gcm_se_handler,\@abi-omnipotent
969e1051a39Sopenharmony_ci.align	16
970e1051a39Sopenharmony_cigcm_se_handler:
971e1051a39Sopenharmony_ci	push	%rsi
972e1051a39Sopenharmony_ci	push	%rdi
973e1051a39Sopenharmony_ci	push	%rbx
974e1051a39Sopenharmony_ci	push	%rbp
975e1051a39Sopenharmony_ci	push	%r12
976e1051a39Sopenharmony_ci	push	%r13
977e1051a39Sopenharmony_ci	push	%r14
978e1051a39Sopenharmony_ci	push	%r15
979e1051a39Sopenharmony_ci	pushfq
980e1051a39Sopenharmony_ci	sub	\$64,%rsp
981e1051a39Sopenharmony_ci
982e1051a39Sopenharmony_ci	mov	120($context),%rax	# pull context->Rax
983e1051a39Sopenharmony_ci	mov	248($context),%rbx	# pull context->Rip
984e1051a39Sopenharmony_ci
985e1051a39Sopenharmony_ci	mov	8($disp),%rsi		# disp->ImageBase
986e1051a39Sopenharmony_ci	mov	56($disp),%r11		# disp->HandlerData
987e1051a39Sopenharmony_ci
# Case 1: fault before the body label -- go to the tail with the frame
# anchor taken from context->Rax, without restoring any registers.
988e1051a39Sopenharmony_ci	mov	0(%r11),%r10d		# HandlerData[0]
989e1051a39Sopenharmony_ci	lea	(%rsi,%r10),%r10	# prologue label
990e1051a39Sopenharmony_ci	cmp	%r10,%rbx		# context->Rip<prologue label
991e1051a39Sopenharmony_ci	jb	.Lcommon_seh_tail
992e1051a39Sopenharmony_ci
# Case 2: fault at or after the abort label -- %rsp has already been
# restored there, so context->Rsp is used as the anchor.
993e1051a39Sopenharmony_ci	mov	152($context),%rax	# pull context->Rsp
994e1051a39Sopenharmony_ci
995e1051a39Sopenharmony_ci	mov	4(%r11),%r10d		# HandlerData[1]
996e1051a39Sopenharmony_ci	lea	(%rsi,%r10),%r10	# epilogue label
997e1051a39Sopenharmony_ci	cmp	%r10,%rbx		# context->Rip>=epilogue label
998e1051a39Sopenharmony_ci	jae	.Lcommon_seh_tail
999e1051a39Sopenharmony_ci
# Case 3: fault inside the body -- %rax is the saved stack pointer; copy the
# six pushed GPRs and the ten saved xmm registers back into the CONTEXT.
1000e1051a39Sopenharmony_ci	mov	120($context),%rax	# pull context->Rax
1001e1051a39Sopenharmony_ci
1002e1051a39Sopenharmony_ci	mov	-48(%rax),%r15
1003e1051a39Sopenharmony_ci	mov	-40(%rax),%r14
1004e1051a39Sopenharmony_ci	mov	-32(%rax),%r13
1005e1051a39Sopenharmony_ci	mov	-24(%rax),%r12
1006e1051a39Sopenharmony_ci	mov	-16(%rax),%rbp
1007e1051a39Sopenharmony_ci	mov	-8(%rax),%rbx
1008e1051a39Sopenharmony_ci	mov	%r15,240($context)
1009e1051a39Sopenharmony_ci	mov	%r14,232($context)
1010e1051a39Sopenharmony_ci	mov	%r13,224($context)
1011e1051a39Sopenharmony_ci	mov	%r12,216($context)
1012e1051a39Sopenharmony_ci	mov	%rbp,160($context)
1013e1051a39Sopenharmony_ci	mov	%rbx,144($context)
1014e1051a39Sopenharmony_ci
1015e1051a39Sopenharmony_ci	lea	-0xd8(%rax),%rsi	# %xmm save area
1016e1051a39Sopenharmony_ci	lea	512($context),%rdi	# & context.Xmm6
1017e1051a39Sopenharmony_ci	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
1018e1051a39Sopenharmony_ci	.long	0xa548f3fc		# cld; rep movsq
1019e1051a39Sopenharmony_ci
# Common tail: %rax is the anchor chosen above.  The values at 8(%rax) and
# 16(%rax) are written back as the saved rdi and rsi; NOTE(review): those
# slots are presumably filled by the generated Win64 function prologue --
# confirm in the translator.
1020e1051a39Sopenharmony_ci.Lcommon_seh_tail:
1021e1051a39Sopenharmony_ci	mov	8(%rax),%rdi
1022e1051a39Sopenharmony_ci	mov	16(%rax),%rsi
1023e1051a39Sopenharmony_ci	mov	%rax,152($context)	# restore context->Rsp
1024e1051a39Sopenharmony_ci	mov	%rsi,168($context)	# restore context->Rsi
1025e1051a39Sopenharmony_ci	mov	%rdi,176($context)	# restore context->Rdi
1026e1051a39Sopenharmony_ci
1027e1051a39Sopenharmony_ci	mov	40($disp),%rdi		# disp->ContextRecord
1028e1051a39Sopenharmony_ci	mov	$context,%rsi		# context
1029e1051a39Sopenharmony_ci	mov	\$154,%ecx		# sizeof(CONTEXT)
1030e1051a39Sopenharmony_ci	.long	0xa548f3fc		# cld; rep movsq
1031e1051a39Sopenharmony_ci
# Marshal the eight RtlVirtualUnwind arguments: four in registers, four in
# the stack slots at 32..56(%rsp) reserved by the sub at the top.
1032e1051a39Sopenharmony_ci	mov	$disp,%rsi
1033e1051a39Sopenharmony_ci	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1034e1051a39Sopenharmony_ci	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1035e1051a39Sopenharmony_ci	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1036e1051a39Sopenharmony_ci	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1037e1051a39Sopenharmony_ci	mov	40(%rsi),%r10		# disp->ContextRecord
1038e1051a39Sopenharmony_ci	lea	56(%rsi),%r11		# &disp->HandlerData
1039e1051a39Sopenharmony_ci	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1040e1051a39Sopenharmony_ci	mov	%r10,32(%rsp)		# arg5
1041e1051a39Sopenharmony_ci	mov	%r11,40(%rsp)		# arg6
1042e1051a39Sopenharmony_ci	mov	%r12,48(%rsp)		# arg7
1043e1051a39Sopenharmony_ci	mov	%rcx,56(%rsp)		# arg8, (NULL)
1044e1051a39Sopenharmony_ci	call	*__imp_RtlVirtualUnwind(%rip)
1045e1051a39Sopenharmony_ci
1046e1051a39Sopenharmony_ci	mov	\$1,%eax		# ExceptionContinueSearch
1047e1051a39Sopenharmony_ci	add	\$64,%rsp
1048e1051a39Sopenharmony_ci	popfq
1049e1051a39Sopenharmony_ci	pop	%r15
1050e1051a39Sopenharmony_ci	pop	%r14
1051e1051a39Sopenharmony_ci	pop	%r13
1052e1051a39Sopenharmony_ci	pop	%r12
1053e1051a39Sopenharmony_ci	pop	%rbp
1054e1051a39Sopenharmony_ci	pop	%rbx
1055e1051a39Sopenharmony_ci	pop	%rdi
1056e1051a39Sopenharmony_ci	pop	%rsi
1057e1051a39Sopenharmony_ci	ret
1058e1051a39Sopenharmony_ci.size	gcm_se_handler,.-gcm_se_handler
1059e1051a39Sopenharmony_ci
# Function table: begin, end and unwind-info address for each entry point.
1060e1051a39Sopenharmony_ci.section	.pdata
1061e1051a39Sopenharmony_ci.align	4
1062e1051a39Sopenharmony_ci	.rva	.LSEH_begin_aesni_gcm_decrypt
1063e1051a39Sopenharmony_ci	.rva	.LSEH_end_aesni_gcm_decrypt
1064e1051a39Sopenharmony_ci	.rva	.LSEH_gcm_dec_info
1065e1051a39Sopenharmony_ci
1066e1051a39Sopenharmony_ci	.rva	.LSEH_begin_aesni_gcm_encrypt
1067e1051a39Sopenharmony_ci	.rva	.LSEH_end_aesni_gcm_encrypt
1068e1051a39Sopenharmony_ci	.rva	.LSEH_gcm_enc_info
# Unwind info: handler address followed by the two labels that the handler
# reads as HandlerData[0] (body) and HandlerData[1] (abort).
1069e1051a39Sopenharmony_ci.section	.xdata
1070e1051a39Sopenharmony_ci.align	8
1071e1051a39Sopenharmony_ci.LSEH_gcm_dec_info:
1072e1051a39Sopenharmony_ci	.byte	9,0,0,0
1073e1051a39Sopenharmony_ci	.rva	gcm_se_handler
1074e1051a39Sopenharmony_ci	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
1075e1051a39Sopenharmony_ci.LSEH_gcm_enc_info:
1076e1051a39Sopenharmony_ci	.byte	9,0,0,0
1077e1051a39Sopenharmony_ci	.rva	gcm_se_handler
1078e1051a39Sopenharmony_ci	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
1079e1051a39Sopenharmony_ci___
1080e1051a39Sopenharmony_ci}
1081e1051a39Sopenharmony_ci}}} else {{{
# Fallback output: replaces (note the plain assignment, not .=) everything
# with two stub entry points that just return 0 in %eax.  Since the full
# aesni_gcm_encrypt also returns 0 when it declines its input, a caller sees
# the same "nothing processed" result from these stubs.
1082e1051a39Sopenharmony_ci$code=<<___;	# assembler is too old
1083e1051a39Sopenharmony_ci.text
1084e1051a39Sopenharmony_ci
1085e1051a39Sopenharmony_ci.globl	aesni_gcm_encrypt
1086e1051a39Sopenharmony_ci.type	aesni_gcm_encrypt,\@abi-omnipotent
1087e1051a39Sopenharmony_ciaesni_gcm_encrypt:
1088e1051a39Sopenharmony_ci.cfi_startproc
1089e1051a39Sopenharmony_ci	xor	%eax,%eax
1090e1051a39Sopenharmony_ci	ret
1091e1051a39Sopenharmony_ci.cfi_endproc
1092e1051a39Sopenharmony_ci.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
1093e1051a39Sopenharmony_ci
1094e1051a39Sopenharmony_ci.globl	aesni_gcm_decrypt
1095e1051a39Sopenharmony_ci.type	aesni_gcm_decrypt,\@abi-omnipotent
1096e1051a39Sopenharmony_ciaesni_gcm_decrypt:
1097e1051a39Sopenharmony_ci.cfi_startproc
1098e1051a39Sopenharmony_ci	xor	%eax,%eax
1099e1051a39Sopenharmony_ci	ret
1100e1051a39Sopenharmony_ci.cfi_endproc
1101e1051a39Sopenharmony_ci.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
1102e1051a39Sopenharmony_ci___
1103e1051a39Sopenharmony_ci}}}
1104e1051a39Sopenharmony_ci
# Post-process and emit the generated assembly.
# Every backtick-quoted span in $code (for example the 6<<24 constant in
# _aesni_ctr32_6x) is replaced by the result of eval'ing its contents as Perl.
1105e1051a39Sopenharmony_ci$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1106e1051a39Sopenharmony_ci
# Write the finished text to standard output.
1107e1051a39Sopenharmony_ciprint $code;
1108e1051a39Sopenharmony_ci
# Closing explicitly surfaces buffered write errors (full disk, closed pipe)
# as a fatal error instead of silently truncating the output.
1109e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
1110