1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# January 2013
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20e1051a39Sopenharmony_ci# in http://download.intel.com/design/intarch/papers/323686.pdf, is
21e1051a39Sopenharmony_ci# that since AESNI-CBC encrypt exhibits *very* low instruction-level
22e1051a39Sopenharmony_ci# parallelism, interleaving it with another algorithm allows better
23e1051a39Sopenharmony_ci# utilization of processor resources and hence better performance.
24e1051a39Sopenharmony_ci# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25e1051a39Sopenharmony_ci# AESNI code is woven into them. As SHA256 dominates execution time,
26e1051a39Sopenharmony_ci# stitch performance does not depend on AES key length. Below are
27e1051a39Sopenharmony_ci# performance numbers in cycles per processed byte, less is better,
28e1051a39Sopenharmony_ci# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
29e1051a39Sopenharmony_ci# subroutine:
30e1051a39Sopenharmony_ci#
31e1051a39Sopenharmony_ci#		 AES-128/-192/-256+SHA256   this(**)	gain
32e1051a39Sopenharmony_ci# Sandy Bridge	    5.05/6.05/7.05+11.6	    13.0	+28%/36%/43%
33e1051a39Sopenharmony_ci# Ivy Bridge	    5.05/6.05/7.05+10.3	    11.6	+32%/41%/50%
34e1051a39Sopenharmony_ci# Haswell	    4.43/5.29/6.19+7.80	    8.79	+39%/49%/59%
35e1051a39Sopenharmony_ci# Skylake	    2.62/3.14/3.62+7.70	    8.10	+27%/34%/40%
36e1051a39Sopenharmony_ci# Bulldozer	    5.77/6.89/8.00+13.7	    13.7	+42%/50%/58%
37e1051a39Sopenharmony_ci# Ryzen(***)	    2.71/-/3.71+2.05	    2.74/-/3.73	+74%/-/54%
38e1051a39Sopenharmony_ci# Goldmont(***)	    3.82/-/5.35+4.16	    4.73/-/5.94	+69%/-/60%
39e1051a39Sopenharmony_ci#
40e1051a39Sopenharmony_ci# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
41e1051a39Sopenharmony_ci#	Westmere is omitted from loop, this is because gain was not
42e1051a39Sopenharmony_ci#	estimated high enough to justify the effort;
43e1051a39Sopenharmony_ci# (**)	these are EVP-free results, results obtained with 'speed
44e1051a39Sopenharmony_ci#	-evp aes-256-cbc-hmac-sha256' will vary by percent or two;
45e1051a39Sopenharmony_ci# (***)	these are SHAEXT results;
46e1051a39Sopenharmony_ci
47e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension)
48e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file
49e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
51e1051a39Sopenharmony_ci
# $win64 is set for the nasm/masm/mingw64 flavours or when the output file
# ends in .asm. The templates below use it to pick %rcx instead of %rdi as
# the first-argument register and to add an XMM6-15 save area to the frame.
52e1051a39Sopenharmony_ci$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53e1051a39Sopenharmony_ci
# Locate the x86_64-xlate.pl translator: first in this script's own
# directory, then in ../../perlasm relative to it.
54e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55e1051a39Sopenharmony_ci( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57e1051a39Sopenharmony_cidie "can't locate x86_64-xlate.pl";
58e1051a39Sopenharmony_ci
# Probe the toolchain and derive $avx (0, 1 or 2; it stays undefined when no
# probe matches). Further down, any true value enables the XOP/AVX code paths
# and $avx>1 additionally enables the AVX2 dispatch. Every probe after the
# first is tried only while $avx is still false.
#
# GNU assembler reached through $ENV{CC}: 2.19 and up gives 1, 2.22 and up 2.
59e1051a39Sopenharmony_ciif (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60e1051a39Sopenharmony_ci		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61e1051a39Sopenharmony_ci	$avx = ($1>=2.19) + ($1>=2.22);
62e1051a39Sopenharmony_ci}
63e1051a39Sopenharmony_ci
# NASM (Win64 builds only): 2.09 and up gives 1, 2.10 and up gives 2.
64e1051a39Sopenharmony_ciif (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65e1051a39Sopenharmony_ci	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66e1051a39Sopenharmony_ci	$avx = ($1>=2.09) + ($1>=2.10);
67e1051a39Sopenharmony_ci}
68e1051a39Sopenharmony_ci
# ml64 (Win64 builds only): major version 10 and up gives 1, 12 and up 2.
69e1051a39Sopenharmony_ciif (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70e1051a39Sopenharmony_ci	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71e1051a39Sopenharmony_ci	$avx = ($1>=10) + ($1>=12);
72e1051a39Sopenharmony_ci}
73e1051a39Sopenharmony_ci
# clang/LLVM reported by "$ENV{CC} -v": exactly 3.0 gives 1, anything newer 2.
74e1051a39Sopenharmony_ciif (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
75e1051a39Sopenharmony_ci	$avx = ($2>=3.0) + ($2>3.0);
76e1051a39Sopenharmony_ci}
77e1051a39Sopenharmony_ci
# $shaext gates the SHA-extension dispatch in the entry point. As written it
# simply follows $avx, so the cap on the next line only takes effect when
# $shaext has been zeroed by hand as the trailing comment describes.
78e1051a39Sopenharmony_ci$shaext=$avx;	### set to zero if compiling for 1.0.1
79e1051a39Sopenharmony_ci$avx=1		if (!$shaext && $avx);
80e1051a39Sopenharmony_ci
# Start the translator as a child process, passing it $flavour and $output,
# and alias STDOUT to the pipe so that whatever this script prints is fed to it.
81e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
82e1051a39Sopenharmony_ci    or die "can't call $xlate: $!";
83e1051a39Sopenharmony_ci*STDOUT=*OUT;
84e1051a39Sopenharmony_ci
# Name of the emitted entry point, name of the constant table, and the SHA
# word size in bytes (4 selects the SHA256 code below).
85e1051a39Sopenharmony_ci$func="aesni_cbc_sha256_enc";
86e1051a39Sopenharmony_ci$TABLE="K256";
87e1051a39Sopenharmony_ci$SZ=4;
# The eight SHA256 working variables live in 32-bit registers. @ROT is the
# same list in rotating form: the round body rotates it by one position per
# round instead of moving values between registers.
88e1051a39Sopenharmony_ci@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
89e1051a39Sopenharmony_ci				"%r8d","%r9d","%r10d","%r11d");
# Scalar temporaries used by the round body.
90e1051a39Sopenharmony_ci($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
# Rotate/shift amounts. @Sigma0/@Sigma1 are applied with ror in the round
# body; @sigma0/@sigma1 are applied in the message schedule, where the third
# element is used as a right-shift count rather than a rotate count.
91e1051a39Sopenharmony_ci@Sigma0=( 2,13,22);
92e1051a39Sopenharmony_ci@Sigma1=( 6,11,25);
93e1051a39Sopenharmony_ci@sigma0=( 7,18, 3);
94e1051a39Sopenharmony_ci@sigma1=(17,19,10);
# Number of rounds; also used to compute offsets past the end of the round
# constants inside $TABLE ($SZ*2*$rounds).
95e1051a39Sopenharmony_ci$rounds=64;
96e1051a39Sopenharmony_ci
97e1051a39Sopenharmony_ci########################################################################
98e1051a39Sopenharmony_ci# void aesni_cbc_sha256_enc(const void *inp,
99e1051a39Sopenharmony_ci#			void *out,
100e1051a39Sopenharmony_ci#			size_t length,
101e1051a39Sopenharmony_ci#			const AES_KEY *key,
102e1051a39Sopenharmony_ci#			unsigned char *iv,
103e1051a39Sopenharmony_ci#			SHA256_CTX *ctx,
104e1051a39Sopenharmony_ci#			const void *in0);
# Registers the arguments are referred to by in the templates. $in0 (the
# seventh argument) is loaded from the caller's stack into %r10 by each
# code path's prologue.
105e1051a39Sopenharmony_ci($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
106e1051a39Sopenharmony_ci("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
107e1051a39Sopenharmony_ci
# Pointer into $TABLE while the rounds are running.
108e1051a39Sopenharmony_ci$Tbl="%rbp";
109e1051a39Sopenharmony_ci
# Stack frame: the first 16*$SZ bytes at (%rsp) hold X[i]+K[i] for sixteen
# rounds, followed by eight 8-byte slots for the saved arguments and the
# caller's %rsp. The slot operands are kept as unevaluated expressions in the
# operand text; only $_rsp is wrapped in backticks, presumably so that a
# later post-processing pass (not visible in this part of the file) reduces
# it to a plain number. $framesz is the total size of the frame in bytes.
110e1051a39Sopenharmony_ci$_inp="16*$SZ+0*8(%rsp)";
111e1051a39Sopenharmony_ci$_out="16*$SZ+1*8(%rsp)";
112e1051a39Sopenharmony_ci$_end="16*$SZ+2*8(%rsp)";
113e1051a39Sopenharmony_ci$_key="16*$SZ+3*8(%rsp)";
114e1051a39Sopenharmony_ci$_ivp="16*$SZ+4*8(%rsp)";
115e1051a39Sopenharmony_ci$_ctx="16*$SZ+5*8(%rsp)";
116e1051a39Sopenharmony_ci$_in0="16*$SZ+6*8(%rsp)";
117e1051a39Sopenharmony_ci$_rsp="`16*$SZ+7*8`(%rsp)";
118e1051a39Sopenharmony_ci$framesz=16*$SZ+8*8;
# Public entry point. It contains no cipher code of its own: it inspects
# OPENSSL_ia32cap_P and jumps to one of the ${func}_* implementations.
120e1051a39Sopenharmony_ci$code=<<___;
121e1051a39Sopenharmony_ci.text
122e1051a39Sopenharmony_ci
123e1051a39Sopenharmony_ci.extern	OPENSSL_ia32cap_P
124e1051a39Sopenharmony_ci.globl	$func
125e1051a39Sopenharmony_ci.type	$func,\@abi-omnipotent
126e1051a39Sopenharmony_ci.align	16
127e1051a39Sopenharmony_ci$func:
128e1051a39Sopenharmony_ci.cfi_startproc
129e1051a39Sopenharmony_ci___
# Dispatch code is emitted only when SIMD code paths were compiled in.
# A zero first argument (%rcx on Win64, %rdi otherwise) is a capability
# probe: it branches to .Lprobe with %eax=1. Otherwise %eax receives
# capability word 0 and %r10 the eight bytes starting at offset 4.
130e1051a39Sopenharmony_ci						if ($avx) {
131e1051a39Sopenharmony_ci$code.=<<___;
132e1051a39Sopenharmony_ci	lea	OPENSSL_ia32cap_P(%rip),%r11
133e1051a39Sopenharmony_ci	mov	\$1,%eax
134e1051a39Sopenharmony_ci	cmp	\$0,`$win64?"%rcx":"%rdi"`
135e1051a39Sopenharmony_ci	je	.Lprobe
136e1051a39Sopenharmony_ci	mov	0(%r11),%eax
137e1051a39Sopenharmony_ci	mov	4(%r11),%r10
138e1051a39Sopenharmony_ci___
# The SHA-extension path, when compiled in, is tested first.
139e1051a39Sopenharmony_ci$code.=<<___ if ($shaext);
140e1051a39Sopenharmony_ci	bt	\$61,%r10			# check for SHA
141e1051a39Sopenharmony_ci	jc	${func}_shaext
142e1051a39Sopenharmony_ci___
# %r11 receives the upper half of %r10 for the AVX2 test; XOP is tested on
# the lower half.
143e1051a39Sopenharmony_ci$code.=<<___;
144e1051a39Sopenharmony_ci	mov	%r10,%r11
145e1051a39Sopenharmony_ci	shr	\$32,%r11
146e1051a39Sopenharmony_ci
147e1051a39Sopenharmony_ci	test	\$`1<<11`,%r10d			# check for XOP
148e1051a39Sopenharmony_ci	jnz	${func}_xop
149e1051a39Sopenharmony_ci___
# The AVX2 dispatch is emitted only when the assembler can build that path
# ($avx>1); it requires all three feature bits to be set at once.
150e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1);
151e1051a39Sopenharmony_ci	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
152e1051a39Sopenharmony_ci	cmp	\$`1<<8|1<<5|1<<3`,%r11d
153e1051a39Sopenharmony_ci	je	${func}_avx2
154e1051a39Sopenharmony_ci___
# Plain AVX is the last candidate; if it is absent too, ud2 traps.
155e1051a39Sopenharmony_ci$code.=<<___;
156e1051a39Sopenharmony_ci	and	\$`1<<28`,%r10d			# check for AVX
157e1051a39Sopenharmony_ci	jnz	${func}_avx
158e1051a39Sopenharmony_ci	ud2
159e1051a39Sopenharmony_ci___
160e1051a39Sopenharmony_ci						}
# Tail of the entry point followed by the constant table.
#
# The tail is what a build without SIMD code paths executes: a probe call
# (zero first argument) returns 0 in %eax, anything else traps with ud2.
# .Lprobe is also the target of the probe branch in the dispatch code.
#
# $TABLE layout, in order:
#   - the round constants, each row of four emitted twice in a row, so that
#     consecutive distinct rows are 32 bytes apart;
#   - the vpshufb mask applied to freshly loaded input, also emitted twice;
#   - rows of all-zero / all-ones words from which $mask10, $mask12 and
#     $mask14 are loaded at an offset derived from the key's round count;
#   - an identification string.
161e1051a39Sopenharmony_ci$code.=<<___;
162e1051a39Sopenharmony_ci	xor	%eax,%eax
163e1051a39Sopenharmony_ci	cmp	\$0,`$win64?"%rcx":"%rdi"`
164e1051a39Sopenharmony_ci	je	.Lprobe
165e1051a39Sopenharmony_ci	ud2
166e1051a39Sopenharmony_ci.Lprobe:
167e1051a39Sopenharmony_ci	ret
168e1051a39Sopenharmony_ci.cfi_endproc
169e1051a39Sopenharmony_ci.size	$func,.-$func
170e1051a39Sopenharmony_ci
171e1051a39Sopenharmony_ci.align	64
172e1051a39Sopenharmony_ci.type	$TABLE,\@object
173e1051a39Sopenharmony_ci$TABLE:
174e1051a39Sopenharmony_ci	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
175e1051a39Sopenharmony_ci	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
176e1051a39Sopenharmony_ci	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
177e1051a39Sopenharmony_ci	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
178e1051a39Sopenharmony_ci	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
179e1051a39Sopenharmony_ci	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
180e1051a39Sopenharmony_ci	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
181e1051a39Sopenharmony_ci	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182e1051a39Sopenharmony_ci	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183e1051a39Sopenharmony_ci	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
184e1051a39Sopenharmony_ci	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
185e1051a39Sopenharmony_ci	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
186e1051a39Sopenharmony_ci	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
187e1051a39Sopenharmony_ci	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188e1051a39Sopenharmony_ci	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189e1051a39Sopenharmony_ci	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
190e1051a39Sopenharmony_ci	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
191e1051a39Sopenharmony_ci	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
192e1051a39Sopenharmony_ci	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
193e1051a39Sopenharmony_ci	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
194e1051a39Sopenharmony_ci	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
195e1051a39Sopenharmony_ci	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
196e1051a39Sopenharmony_ci	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
197e1051a39Sopenharmony_ci	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
198e1051a39Sopenharmony_ci	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
199e1051a39Sopenharmony_ci	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200e1051a39Sopenharmony_ci	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
201e1051a39Sopenharmony_ci	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202e1051a39Sopenharmony_ci	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
203e1051a39Sopenharmony_ci	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
204e1051a39Sopenharmony_ci	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
205e1051a39Sopenharmony_ci	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
206e1051a39Sopenharmony_ci
207e1051a39Sopenharmony_ci	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
208e1051a39Sopenharmony_ci	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
209e1051a39Sopenharmony_ci	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
210e1051a39Sopenharmony_ci	.long	0,0,0,0,   0,0,0,0
211e1051a39Sopenharmony_ci	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
212e1051a39Sopenharmony_ci.align	64
213e1051a39Sopenharmony_ci___
214e1051a39Sopenharmony_ci
215e1051a39Sopenharmony_ci######################################################################
216e1051a39Sopenharmony_ci# SIMD code paths
217e1051a39Sopenharmony_ci#
218e1051a39Sopenharmony_ci{{{
# %xmm8..%xmm15 are reserved for the AES-CBC side of the stitch; the SHA256
# message schedule of the XOP path uses %xmm0..%xmm7.
219e1051a39Sopenharmony_ci($iv,$inout,$roundkey,$temp,
220e1051a39Sopenharmony_ci $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
221e1051a39Sopenharmony_ci
# Index of the next @aesni_cbc_block element that body_00_15 will consume;
# the drivers reset it to 0 before every group of sixteen rounds.
222e1051a39Sopenharmony_ci$aesni_cbc_idx=0;
# One AES-CBC block encryption cut into sixteen steps, one per SHA256 round.
# Every element is Perl source text: body_00_15 glues it in front of one of
# its own snippets and the result is eval'ed later, so the single-quoted
# register variables are resolved at eval time.
#
# Round keys are addressed relative to $inp, which the prologue repoints to
# key+0x80. vaesenclast is issued after the round keys at 0xa0, 0xc0 and 0xe0
# and the three candidate results are combined through $mask10, $mask12 and
# $mask14, so the same instruction sequence serves all key lengths
# (presumably exactly one of the masks is all-ones for a given key; confirm
# against the mask rows in $TABLE).
#
# The '##' lines are not code: they show the instructions that the
# surrounding driver issues before and after these sixteen steps.
223e1051a39Sopenharmony_ci@aesni_cbc_block = (
224e1051a39Sopenharmony_ci##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
225e1051a39Sopenharmony_ci##	&vmovdqu	($inout,($inp));
226e1051a39Sopenharmony_ci##	&mov		($_inp,$inp);
227e1051a39Sopenharmony_ci
228e1051a39Sopenharmony_ci	'&vpxor		($inout,$inout,$roundkey);'.
229e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
230e1051a39Sopenharmony_ci
231e1051a39Sopenharmony_ci	'&vpxor		($inout,$inout,$iv);',
232e1051a39Sopenharmony_ci
233e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
234e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
235e1051a39Sopenharmony_ci
236e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
237e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
238e1051a39Sopenharmony_ci
239e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
240e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
241e1051a39Sopenharmony_ci
242e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
243e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
244e1051a39Sopenharmony_ci
245e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
246e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
247e1051a39Sopenharmony_ci
248e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
249e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
250e1051a39Sopenharmony_ci
251e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
252e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
253e1051a39Sopenharmony_ci
254e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
255e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
256e1051a39Sopenharmony_ci
257e1051a39Sopenharmony_ci	'&vaesenc	($inout,$inout,$roundkey);'.
258e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
259e1051a39Sopenharmony_ci
	# first candidate result (round key at 0xa0), later kept via $mask10
260e1051a39Sopenharmony_ci	'&vaesenclast	($temp,$inout,$roundkey);'.
261e1051a39Sopenharmony_ci	' &vaesenc	($inout,$inout,$roundkey);'.
262e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
263e1051a39Sopenharmony_ci
264e1051a39Sopenharmony_ci	'&vpand		($iv,$temp,$mask10);'.
265e1051a39Sopenharmony_ci	' &vaesenc	($inout,$inout,$roundkey);'.
266e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
267e1051a39Sopenharmony_ci
	# second candidate result (round key at 0xc0), later kept via $mask12
268e1051a39Sopenharmony_ci	'&vaesenclast	($temp,$inout,$roundkey);'.
269e1051a39Sopenharmony_ci	' &vaesenc	($inout,$inout,$roundkey);'.
270e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
271e1051a39Sopenharmony_ci
272e1051a39Sopenharmony_ci	'&vpand		($temp,$temp,$mask12);'.
273e1051a39Sopenharmony_ci	' &vaesenc	($inout,$inout,$roundkey);'.
274e1051a39Sopenharmony_ci	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
275e1051a39Sopenharmony_ci
	# third candidate result (round key at 0xe0); it is masked with $mask14
	# by the driver, and round key 0 is reloaded for the next block
276e1051a39Sopenharmony_ci	'&vpor		($iv,$iv,$temp);'.
277e1051a39Sopenharmony_ci	' &vaesenclast	($temp,$inout,$roundkey);'.
278e1051a39Sopenharmony_ci	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
279e1051a39Sopenharmony_ci
280e1051a39Sopenharmony_ci##	&mov		($inp,$_inp);
281e1051a39Sopenharmony_ci##	&mov		($out,$_out);
282e1051a39Sopenharmony_ci##	&vpand		($temp,$temp,$mask14);
283e1051a39Sopenharmony_ci##	&vpor		($iv,$iv,$temp);
284e1051a39Sopenharmony_ci##	&vmovdqu	("($out,$inp)",$iv);
285e1051a39Sopenharmony_ci##	&lea		($inp,"16($inp)");
286e1051a39Sopenharmony_ci);
287e1051a39Sopenharmony_ci
# $a4 is a second name for $T1 (%r12d) used inside the round body.
288e1051a39Sopenharmony_cimy $a4=$T1;
# Per-round names for the working variables; the first snippet of every
# round body rebinds them from @ROT.
289e1051a39Sopenharmony_cimy ($a,$b,$c,$d,$e,$f,$g,$h);
290e1051a39Sopenharmony_ci
# Catch-all for calls such as &mov(...) or &vpxor(...): the name of the
# undefined sub becomes the mnemonic and one line of assembly is appended to
# $code. Arguments are given destination-first (32-bit perlasm style) and are
# written out in reverse order; a last argument that is a plain number is
# emitted as an immediate with a '$' prefix.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier

    my $last = pop;
    $last = '$' . $last if ($last*1 eq $last);	# numeric => immediate

    $code .= "\t" . $mnemonic . "\t" . join(',', $last, reverse @_) . "\n";
}
297e1051a39Sopenharmony_ci
# Returns the code for one SHA256 round as a list of 26 Perl snippets.
#
# Nothing is emitted by the call itself. The caller eval()s the snippets one
# at a time, which lets it place SIMD instructions between them; each snippet
# goes through the AUTOLOAD thunk (destination operand first) and appends one
# instruction to $code, except for the snippet carrying the AES step, which
# appends several.
#
# Effect at call time: one element of @aesni_cbc_block is taken
# ($aesni_cbc_idx is post-incremented) and prepended to the '&xor ($a0,$e)'
# snippet, so every generated round carries one AES step.
# Effects at eval time: the first snippet rebinds $a..$h from @ROT; the last
# one swaps $a2/$a3, rotates @ROT by one register and increments $i.
#
# Register protocol between rounds: on entry $a0 holds e, $a1 holds the
# pending value for a, and $a3 holds b^c. The final "h+=Sigma0(a)" of a round
# is left in $a1 and copied into the new a by the next round's
# '&mov ($a,$a1)', so code following the last round has to copy $a1 itself.
298e1051a39Sopenharmony_cisub body_00_15 () {
299e1051a39Sopenharmony_ci	(
300e1051a39Sopenharmony_ci	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
301e1051a39Sopenharmony_ci
302e1051a39Sopenharmony_ci	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
303e1051a39Sopenharmony_ci	'&mov	($a,$a1)',
304e1051a39Sopenharmony_ci	'&mov	($a4,$f)',
305e1051a39Sopenharmony_ci
306e1051a39Sopenharmony_ci	'&xor	($a0,$e)',
307e1051a39Sopenharmony_ci	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
308e1051a39Sopenharmony_ci	'&xor	($a4,$g)',			# f^g
309e1051a39Sopenharmony_ci
310e1051a39Sopenharmony_ci	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
311e1051a39Sopenharmony_ci	'&xor	($a1,$a)',
312e1051a39Sopenharmony_ci	'&and	($a4,$e)',			# (f^g)&e
313e1051a39Sopenharmony_ci
	# one AES-CBC step, selected when this list is built
314e1051a39Sopenharmony_ci	@aesni_cbc_block[$aesni_cbc_idx++].
315e1051a39Sopenharmony_ci	'&xor	($a0,$e)',
316e1051a39Sopenharmony_ci	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
317e1051a39Sopenharmony_ci	'&mov	($a2,$a)',
318e1051a39Sopenharmony_ci
319e1051a39Sopenharmony_ci	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
320e1051a39Sopenharmony_ci	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
321e1051a39Sopenharmony_ci	'&xor	($a2,$b)',			# a^b, b^c in next round
322e1051a39Sopenharmony_ci
323e1051a39Sopenharmony_ci	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
324e1051a39Sopenharmony_ci	'&add	($h,$a4)',			# h+=Ch(e,f,g)
325e1051a39Sopenharmony_ci	'&and	($a3,$a2)',			# (b^c)&(a^b)
326e1051a39Sopenharmony_ci
327e1051a39Sopenharmony_ci	'&xor	($a1,$a)',
328e1051a39Sopenharmony_ci	'&add	($h,$a0)',			# h+=Sigma1(e)
329e1051a39Sopenharmony_ci	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
330e1051a39Sopenharmony_ci
331e1051a39Sopenharmony_ci	'&add	($d,$h)',			# d+=h
332e1051a39Sopenharmony_ci	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
333e1051a39Sopenharmony_ci	'&add	($h,$a3)',			# h+=Maj(a,b,c)
334e1051a39Sopenharmony_ci
335e1051a39Sopenharmony_ci	'&mov	($a0,$d)',
336e1051a39Sopenharmony_ci	'&add	($a1,$h);'.			# h+=Sigma0(a)
337e1051a39Sopenharmony_ci	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
338e1051a39Sopenharmony_ci	);
339e1051a39Sopenharmony_ci}
340e1051a39Sopenharmony_ci
341e1051a39Sopenharmony_ciif ($avx) {{
342e1051a39Sopenharmony_ci######################################################################
343e1051a39Sopenharmony_ci# XOP code path
344e1051a39Sopenharmony_ci#
345e1051a39Sopenharmony_ci$code.=<<___;
346e1051a39Sopenharmony_ci.type	${func}_xop,\@function,6
347e1051a39Sopenharmony_ci.align	64
348e1051a39Sopenharmony_ci${func}_xop:
349e1051a39Sopenharmony_ci.cfi_startproc
350e1051a39Sopenharmony_ci.Lxop_shortcut:
351e1051a39Sopenharmony_ci	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
352e1051a39Sopenharmony_ci	mov	%rsp,%rax		# copy %rsp
353e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
354e1051a39Sopenharmony_ci	push	%rbx
355e1051a39Sopenharmony_ci.cfi_push	%rbx
356e1051a39Sopenharmony_ci	push	%rbp
357e1051a39Sopenharmony_ci.cfi_push	%rbp
358e1051a39Sopenharmony_ci	push	%r12
359e1051a39Sopenharmony_ci.cfi_push	%r12
360e1051a39Sopenharmony_ci	push	%r13
361e1051a39Sopenharmony_ci.cfi_push	%r13
362e1051a39Sopenharmony_ci	push	%r14
363e1051a39Sopenharmony_ci.cfi_push	%r14
364e1051a39Sopenharmony_ci	push	%r15
365e1051a39Sopenharmony_ci.cfi_push	%r15
366e1051a39Sopenharmony_ci	sub	\$`$framesz+$win64*16*10`,%rsp
367e1051a39Sopenharmony_ci	and	\$-64,%rsp		# align stack frame
368e1051a39Sopenharmony_ci
369e1051a39Sopenharmony_ci	shl	\$6,$len
370e1051a39Sopenharmony_ci	sub	$inp,$out		# re-bias
371e1051a39Sopenharmony_ci	sub	$inp,$in0
372e1051a39Sopenharmony_ci	add	$inp,$len		# end of input
373e1051a39Sopenharmony_ci
374e1051a39Sopenharmony_ci	#mov	$inp,$_inp		# saved later
375e1051a39Sopenharmony_ci	mov	$out,$_out
376e1051a39Sopenharmony_ci	mov	$len,$_end
377e1051a39Sopenharmony_ci	#mov	$key,$_key		# remains resident in $inp register
378e1051a39Sopenharmony_ci	mov	$ivp,$_ivp
379e1051a39Sopenharmony_ci	mov	$ctx,$_ctx
380e1051a39Sopenharmony_ci	mov	$in0,$_in0
381e1051a39Sopenharmony_ci	mov	%rax,$_rsp
382e1051a39Sopenharmony_ci.cfi_cfa_expression	$_rsp,deref,+8
383e1051a39Sopenharmony_ci___
384e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
385e1051a39Sopenharmony_ci	movaps	%xmm6,`$framesz+16*0`(%rsp)
386e1051a39Sopenharmony_ci	movaps	%xmm7,`$framesz+16*1`(%rsp)
387e1051a39Sopenharmony_ci	movaps	%xmm8,`$framesz+16*2`(%rsp)
388e1051a39Sopenharmony_ci	movaps	%xmm9,`$framesz+16*3`(%rsp)
389e1051a39Sopenharmony_ci	movaps	%xmm10,`$framesz+16*4`(%rsp)
390e1051a39Sopenharmony_ci	movaps	%xmm11,`$framesz+16*5`(%rsp)
391e1051a39Sopenharmony_ci	movaps	%xmm12,`$framesz+16*6`(%rsp)
392e1051a39Sopenharmony_ci	movaps	%xmm13,`$framesz+16*7`(%rsp)
393e1051a39Sopenharmony_ci	movaps	%xmm14,`$framesz+16*8`(%rsp)
394e1051a39Sopenharmony_ci	movaps	%xmm15,`$framesz+16*9`(%rsp)
395e1051a39Sopenharmony_ci___
396e1051a39Sopenharmony_ci$code.=<<___;
397e1051a39Sopenharmony_ci.Lprologue_xop:
398e1051a39Sopenharmony_ci	vzeroall
399e1051a39Sopenharmony_ci
400e1051a39Sopenharmony_ci	mov	$inp,%r12		# borrow $a4
401e1051a39Sopenharmony_ci	lea	0x80($key),$inp		# size optimization, reassign
402e1051a39Sopenharmony_ci	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
403e1051a39Sopenharmony_ci	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
404e1051a39Sopenharmony_ci	mov	$ctx,%r15		# borrow $a2
405e1051a39Sopenharmony_ci	mov	$in0,%rsi		# borrow $a3
406e1051a39Sopenharmony_ci	vmovdqu	($ivp),$iv		# load IV
407e1051a39Sopenharmony_ci	sub	\$9,%r14
408e1051a39Sopenharmony_ci
409e1051a39Sopenharmony_ci	mov	$SZ*0(%r15),$A
410e1051a39Sopenharmony_ci	mov	$SZ*1(%r15),$B
411e1051a39Sopenharmony_ci	mov	$SZ*2(%r15),$C
412e1051a39Sopenharmony_ci	mov	$SZ*3(%r15),$D
413e1051a39Sopenharmony_ci	mov	$SZ*4(%r15),$E
414e1051a39Sopenharmony_ci	mov	$SZ*5(%r15),$F
415e1051a39Sopenharmony_ci	mov	$SZ*6(%r15),$G
416e1051a39Sopenharmony_ci	mov	$SZ*7(%r15),$H
417e1051a39Sopenharmony_ci
418e1051a39Sopenharmony_ci	vmovdqa	0x00(%r13,%r14,8),$mask14
419e1051a39Sopenharmony_ci	vmovdqa	0x10(%r13,%r14,8),$mask12
420e1051a39Sopenharmony_ci	vmovdqa	0x20(%r13,%r14,8),$mask10
421e1051a39Sopenharmony_ci	vmovdqu	0x00-0x80($inp),$roundkey
422e1051a39Sopenharmony_ci	jmp	.Lloop_xop
423e1051a39Sopenharmony_ci___
424e1051a39Sopenharmony_ci					if ($SZ==4) {	# SHA256
425e1051a39Sopenharmony_ci    my @X = map("%xmm$_",(0..3));
426e1051a39Sopenharmony_ci    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
427e1051a39Sopenharmony_ci
428e1051a39Sopenharmony_ci$code.=<<___;
429e1051a39Sopenharmony_ci.align	16
430e1051a39Sopenharmony_ci.Lloop_xop:
431e1051a39Sopenharmony_ci	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
432e1051a39Sopenharmony_ci	vmovdqu	0x00(%rsi,%r12),@X[0]
433e1051a39Sopenharmony_ci	vmovdqu	0x10(%rsi,%r12),@X[1]
434e1051a39Sopenharmony_ci	vmovdqu	0x20(%rsi,%r12),@X[2]
435e1051a39Sopenharmony_ci	vmovdqu	0x30(%rsi,%r12),@X[3]
436e1051a39Sopenharmony_ci	vpshufb	$t3,@X[0],@X[0]
437e1051a39Sopenharmony_ci	lea	$TABLE(%rip),$Tbl
438e1051a39Sopenharmony_ci	vpshufb	$t3,@X[1],@X[1]
439e1051a39Sopenharmony_ci	vpshufb	$t3,@X[2],@X[2]
440e1051a39Sopenharmony_ci	vpaddd	0x00($Tbl),@X[0],$t0
441e1051a39Sopenharmony_ci	vpshufb	$t3,@X[3],@X[3]
442e1051a39Sopenharmony_ci	vpaddd	0x20($Tbl),@X[1],$t1
443e1051a39Sopenharmony_ci	vpaddd	0x40($Tbl),@X[2],$t2
444e1051a39Sopenharmony_ci	vpaddd	0x60($Tbl),@X[3],$t3
445e1051a39Sopenharmony_ci	vmovdqa	$t0,0x00(%rsp)
446e1051a39Sopenharmony_ci	mov	$A,$a1
447e1051a39Sopenharmony_ci	vmovdqa	$t1,0x10(%rsp)
448e1051a39Sopenharmony_ci	mov	$B,$a3
449e1051a39Sopenharmony_ci	vmovdqa	$t2,0x20(%rsp)
450e1051a39Sopenharmony_ci	xor	$C,$a3			# magic
451e1051a39Sopenharmony_ci	vmovdqa	$t3,0x30(%rsp)
452e1051a39Sopenharmony_ci	mov	$E,$a0
453e1051a39Sopenharmony_ci	jmp	.Lxop_00_47
454e1051a39Sopenharmony_ci
455e1051a39Sopenharmony_ci.align	16
456e1051a39Sopenharmony_ci.Lxop_00_47:
457e1051a39Sopenharmony_ci	sub	\$-16*2*$SZ,$Tbl	# size optimization
458e1051a39Sopenharmony_ci	vmovdqu	(%r12),$inout		# $a4
459e1051a39Sopenharmony_ci	mov	%r12,$_inp		# $a4
460e1051a39Sopenharmony_ci___
# Emits four SHA256 rounds interleaved with the XOP message-schedule update
# of the four schedule words held in $X[0].
#
#   $j    - index (0..3) of this four-round group within a sixteen-round
#           pass; selects the constant row 16*2*$j($Tbl) and the stack slot
#           16*$j(%rsp) that receives the updated X+K values
#   $body - code reference returning the snippet list for one round
#           (body_00_15); it is called four times up front
#   @X    - the four XMM registers holding the sixteen-word schedule window;
#           the caller rotates the list between calls
#
# $t0..$t3 are the scratch XMM registers declared in the enclosing scope.
# The SIMD instructions are emitted directly through the AUTOLOAD thunk,
# while the scalar round instructions are emitted by eval'ing snippets taken
# from the front of @insns; the number of evals between SIMD instructions is
# the instruction schedule and must not be changed casually.
461e1051a39Sopenharmony_cisub XOP_256_00_47 () {
462e1051a39Sopenharmony_cimy $j = shift;
463e1051a39Sopenharmony_cimy $body = shift;
464e1051a39Sopenharmony_cimy @X = @_;
465e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
466e1051a39Sopenharmony_ci
467e1051a39Sopenharmony_ci	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
468e1051a39Sopenharmony_ci	  eval(shift(@insns));
469e1051a39Sopenharmony_ci	  eval(shift(@insns));
470e1051a39Sopenharmony_ci	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
471e1051a39Sopenharmony_ci	  eval(shift(@insns));
472e1051a39Sopenharmony_ci	  eval(shift(@insns));
473e1051a39Sopenharmony_ci	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
474e1051a39Sopenharmony_ci	  eval(shift(@insns));
475e1051a39Sopenharmony_ci	  eval(shift(@insns));
476e1051a39Sopenharmony_ci	&vpsrld		($t0,$t0,$sigma0[2]);
477e1051a39Sopenharmony_ci	  eval(shift(@insns));
478e1051a39Sopenharmony_ci	  eval(shift(@insns));
479e1051a39Sopenharmony_ci	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
480e1051a39Sopenharmony_ci	  eval(shift(@insns));
481e1051a39Sopenharmony_ci	  eval(shift(@insns));
482e1051a39Sopenharmony_ci	  eval(shift(@insns));
483e1051a39Sopenharmony_ci	  eval(shift(@insns));
484e1051a39Sopenharmony_ci	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
485e1051a39Sopenharmony_ci	  eval(shift(@insns));
486e1051a39Sopenharmony_ci	  eval(shift(@insns));
487e1051a39Sopenharmony_ci	&vpxor		($t0,$t0,$t1);
488e1051a39Sopenharmony_ci	  eval(shift(@insns));
489e1051a39Sopenharmony_ci	  eval(shift(@insns));
490e1051a39Sopenharmony_ci	  eval(shift(@insns));
491e1051a39Sopenharmony_ci	  eval(shift(@insns));
492e1051a39Sopenharmony_ci	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
493e1051a39Sopenharmony_ci	  eval(shift(@insns));
494e1051a39Sopenharmony_ci	  eval(shift(@insns));
495e1051a39Sopenharmony_ci	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
496e1051a39Sopenharmony_ci	  eval(shift(@insns));
497e1051a39Sopenharmony_ci	  eval(shift(@insns));
498e1051a39Sopenharmony_ci	 &vpsrld	($t2,@X[3],$sigma1[2]);
499e1051a39Sopenharmony_ci	  eval(shift(@insns));
500e1051a39Sopenharmony_ci	  eval(shift(@insns));
501e1051a39Sopenharmony_ci	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
502e1051a39Sopenharmony_ci	  eval(shift(@insns));
503e1051a39Sopenharmony_ci	  eval(shift(@insns));
504e1051a39Sopenharmony_ci	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
505e1051a39Sopenharmony_ci	  eval(shift(@insns));
506e1051a39Sopenharmony_ci	  eval(shift(@insns));
507e1051a39Sopenharmony_ci	 &vpxor		($t3,$t3,$t2);
508e1051a39Sopenharmony_ci	  eval(shift(@insns));
509e1051a39Sopenharmony_ci	  eval(shift(@insns));
510e1051a39Sopenharmony_ci	  eval(shift(@insns));
511e1051a39Sopenharmony_ci	  eval(shift(@insns));
512e1051a39Sopenharmony_ci	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
513e1051a39Sopenharmony_ci	  eval(shift(@insns));
514e1051a39Sopenharmony_ci	  eval(shift(@insns));
515e1051a39Sopenharmony_ci	  eval(shift(@insns));
516e1051a39Sopenharmony_ci	  eval(shift(@insns));
517e1051a39Sopenharmony_ci	&vpsrldq	($t3,$t3,8);
518e1051a39Sopenharmony_ci	  eval(shift(@insns));
519e1051a39Sopenharmony_ci	  eval(shift(@insns));
520e1051a39Sopenharmony_ci	  eval(shift(@insns));
521e1051a39Sopenharmony_ci	  eval(shift(@insns));
522e1051a39Sopenharmony_ci	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
523e1051a39Sopenharmony_ci	  eval(shift(@insns));
524e1051a39Sopenharmony_ci	  eval(shift(@insns));
525e1051a39Sopenharmony_ci	  eval(shift(@insns));
526e1051a39Sopenharmony_ci	  eval(shift(@insns));
527e1051a39Sopenharmony_ci	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
528e1051a39Sopenharmony_ci	  eval(shift(@insns));
529e1051a39Sopenharmony_ci	  eval(shift(@insns));
530e1051a39Sopenharmony_ci	 &vpsrld	($t2,@X[0],$sigma1[2]);
531e1051a39Sopenharmony_ci	  eval(shift(@insns));
532e1051a39Sopenharmony_ci	  eval(shift(@insns));
533e1051a39Sopenharmony_ci	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
534e1051a39Sopenharmony_ci	  eval(shift(@insns));
535e1051a39Sopenharmony_ci	  eval(shift(@insns));
536e1051a39Sopenharmony_ci	 &vpxor		($t3,$t3,$t2);
537e1051a39Sopenharmony_ci	  eval(shift(@insns));
538e1051a39Sopenharmony_ci	  eval(shift(@insns));
539e1051a39Sopenharmony_ci	  eval(shift(@insns));
540e1051a39Sopenharmony_ci	  eval(shift(@insns));
541e1051a39Sopenharmony_ci	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
542e1051a39Sopenharmony_ci	  eval(shift(@insns));
543e1051a39Sopenharmony_ci	  eval(shift(@insns));
544e1051a39Sopenharmony_ci	  eval(shift(@insns));
545e1051a39Sopenharmony_ci	  eval(shift(@insns));
546e1051a39Sopenharmony_ci	&vpslldq	($t3,$t3,8);		# 22 instructions
547e1051a39Sopenharmony_ci	  eval(shift(@insns));
548e1051a39Sopenharmony_ci	  eval(shift(@insns));
549e1051a39Sopenharmony_ci	  eval(shift(@insns));
550e1051a39Sopenharmony_ci	  eval(shift(@insns));
551e1051a39Sopenharmony_ci	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
552e1051a39Sopenharmony_ci	  eval(shift(@insns));
553e1051a39Sopenharmony_ci	  eval(shift(@insns));
554e1051a39Sopenharmony_ci	  eval(shift(@insns));
555e1051a39Sopenharmony_ci	  eval(shift(@insns));
556e1051a39Sopenharmony_ci	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
557e1051a39Sopenharmony_ci	  foreach (@insns) { eval; }		# remaining instructions
558e1051a39Sopenharmony_ci	&vmovdqa	(16*$j."(%rsp)",$t2);
559e1051a39Sopenharmony_ci}
560e1051a39Sopenharmony_ci
# Generate the body of the .Lxop_00_47 loop: four calls of XOP_256_00_47
# give sixteen rounds with schedule update, and consume all sixteen AES
# steps, i.e. one complete AES-CBC block.
561e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
562e1051a39Sopenharmony_ci    for ($i=0,$j=0; $j<4; $j++) {
563e1051a39Sopenharmony_ci	&XOP_256_00_47($j,\&body_00_15,@X);
564e1051a39Sopenharmony_ci	push(@X,shift(@X));			# rotate(@X)
565e1051a39Sopenharmony_ci    }
    # Finish the AES block: merge the last candidate result through $mask14
    # and store the ciphertext (which is also the next IV) at out+inp, then
    # advance the input pointer by one AES block.
566e1051a39Sopenharmony_ci    	&mov		("%r12",$_inp);		# borrow $a4
567e1051a39Sopenharmony_ci	&vpand		($temp,$temp,$mask14);
568e1051a39Sopenharmony_ci	&mov		("%r15",$_out);		# borrow $a2
569e1051a39Sopenharmony_ci	&vpor		($iv,$iv,$temp);
570e1051a39Sopenharmony_ci	&vmovdqu	("(%r15,%r12)",$iv);	# write output
571e1051a39Sopenharmony_ci	&lea		("%r12","16(%r12)");	# inp++
572e1051a39Sopenharmony_ci
    # Loop until the byte tested below is zero. $Tbl is advanced at the top
    # of .Lxop_00_47, so this presumably detects the first row following the
    # round constants (its leading word 0x00010203 has a zero top byte).
573e1051a39Sopenharmony_ci	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
574e1051a39Sopenharmony_ci	&jne	(".Lxop_00_47");
575e1051a39Sopenharmony_ci
    # Load the next plaintext block and remember the input pointer for the
    # AES steps stitched into the final sixteen rounds.
576e1051a39Sopenharmony_ci	&vmovdqu	($inout,"(%r12)");
577e1051a39Sopenharmony_ci	&mov		($_inp,"%r12");
578e1051a39Sopenharmony_ci
    # Final sixteen rounds: plain round bodies without schedule update, again
    # carrying one full AES block.
579e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
580e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
581e1051a39Sopenharmony_ci	foreach(body_00_15()) { eval; }
582e1051a39Sopenharmony_ci    }
583e1051a39Sopenharmony_ci					}
# XOP path, end of one pass through .Lloop_xop: reload the saved pointers,
# write the last CBC output block, add the hash state stored at $_ctx into
# the working registers $A..$H and store the sums back, then loop while the
# input pointer is still below $_end.  On exit the IV is written to $_ivp
# and all vector registers are cleared (vzeroall).
584e1051a39Sopenharmony_ci$code.=<<___;
585e1051a39Sopenharmony_ci	mov	$_inp,%r12		# borrow $a4
586e1051a39Sopenharmony_ci	mov	$_out,%r13		# borrow $a0
587e1051a39Sopenharmony_ci	mov	$_ctx,%r15		# borrow $a2
588e1051a39Sopenharmony_ci	mov	$_in0,%rsi		# borrow $a3
589e1051a39Sopenharmony_ci
590e1051a39Sopenharmony_ci	vpand	$mask14,$temp,$temp
591e1051a39Sopenharmony_ci	mov	$a1,$A
592e1051a39Sopenharmony_ci	vpor	$temp,$iv,$iv
593e1051a39Sopenharmony_ci	vmovdqu	$iv,(%r13,%r12)		# write output
594e1051a39Sopenharmony_ci	lea	16(%r12),%r12		# inp++
595e1051a39Sopenharmony_ci
596e1051a39Sopenharmony_ci	add	$SZ*0(%r15),$A
597e1051a39Sopenharmony_ci	add	$SZ*1(%r15),$B
598e1051a39Sopenharmony_ci	add	$SZ*2(%r15),$C
599e1051a39Sopenharmony_ci	add	$SZ*3(%r15),$D
600e1051a39Sopenharmony_ci	add	$SZ*4(%r15),$E
601e1051a39Sopenharmony_ci	add	$SZ*5(%r15),$F
602e1051a39Sopenharmony_ci	add	$SZ*6(%r15),$G
603e1051a39Sopenharmony_ci	add	$SZ*7(%r15),$H
604e1051a39Sopenharmony_ci
605e1051a39Sopenharmony_ci	cmp	$_end,%r12
606e1051a39Sopenharmony_ci
607e1051a39Sopenharmony_ci	mov	$A,$SZ*0(%r15)
608e1051a39Sopenharmony_ci	mov	$B,$SZ*1(%r15)
609e1051a39Sopenharmony_ci	mov	$C,$SZ*2(%r15)
610e1051a39Sopenharmony_ci	mov	$D,$SZ*3(%r15)
611e1051a39Sopenharmony_ci	mov	$E,$SZ*4(%r15)
612e1051a39Sopenharmony_ci	mov	$F,$SZ*5(%r15)
613e1051a39Sopenharmony_ci	mov	$G,$SZ*6(%r15)
614e1051a39Sopenharmony_ci	mov	$H,$SZ*7(%r15)
615e1051a39Sopenharmony_ci
616e1051a39Sopenharmony_ci	jb	.Lloop_xop
617e1051a39Sopenharmony_ci
618e1051a39Sopenharmony_ci	mov	$_ivp,$ivp
619e1051a39Sopenharmony_ci	mov	$_rsp,%rsi
620e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi,8
621e1051a39Sopenharmony_ci	vmovdqu	$iv,($ivp)		# output IV
622e1051a39Sopenharmony_ci	vzeroall
623e1051a39Sopenharmony_ci___
# Win64 only: reload %xmm6-%xmm15 from the ten 16-byte slots that start at
# $framesz(%rsp).
624e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
625e1051a39Sopenharmony_ci	movaps	`$framesz+16*0`(%rsp),%xmm6
626e1051a39Sopenharmony_ci	movaps	`$framesz+16*1`(%rsp),%xmm7
627e1051a39Sopenharmony_ci	movaps	`$framesz+16*2`(%rsp),%xmm8
628e1051a39Sopenharmony_ci	movaps	`$framesz+16*3`(%rsp),%xmm9
629e1051a39Sopenharmony_ci	movaps	`$framesz+16*4`(%rsp),%xmm10
630e1051a39Sopenharmony_ci	movaps	`$framesz+16*5`(%rsp),%xmm11
631e1051a39Sopenharmony_ci	movaps	`$framesz+16*6`(%rsp),%xmm12
632e1051a39Sopenharmony_ci	movaps	`$framesz+16*7`(%rsp),%xmm13
633e1051a39Sopenharmony_ci	movaps	`$framesz+16*8`(%rsp),%xmm14
634e1051a39Sopenharmony_ci	movaps	`$framesz+16*9`(%rsp),%xmm15
635e1051a39Sopenharmony_ci___
# Epilogue: reload %r15, %r14, %r13, %r12, %rbp and %rbx from the six
# 8-byte slots just below the address held in %rsi (loaded from $_rsp
# above), make that address the stack pointer again and return.
636e1051a39Sopenharmony_ci$code.=<<___;
637e1051a39Sopenharmony_ci	mov	-48(%rsi),%r15
638e1051a39Sopenharmony_ci.cfi_restore	%r15
639e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
640e1051a39Sopenharmony_ci.cfi_restore	%r14
641e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
642e1051a39Sopenharmony_ci.cfi_restore	%r13
643e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
644e1051a39Sopenharmony_ci.cfi_restore	%r12
645e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
646e1051a39Sopenharmony_ci.cfi_restore	%rbp
647e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
648e1051a39Sopenharmony_ci.cfi_restore	%rbx
649e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
650e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
651e1051a39Sopenharmony_ci.Lepilogue_xop:
652e1051a39Sopenharmony_ci	ret
653e1051a39Sopenharmony_ci.cfi_endproc
654e1051a39Sopenharmony_ci.size	${func}_xop,.-${func}_xop
655e1051a39Sopenharmony_ci___
656e1051a39Sopenharmony_ci######################################################################
657e1051a39Sopenharmony_ci# AVX+shrd code path
658e1051a39Sopenharmony_ci#
# Override ror() for this code path: every call is forwarded to shrd() with
# the first argument repeated, i.e. ror(reg,n) becomes shrd(reg,reg,n).
# The glob assignment is local()ised, so the override is undone when the
# enclosing dynamic scope is left.  Presumably the round-body generator
# (outside this excerpt) emits its rotates through &ror - confirm.
659e1051a39Sopenharmony_cilocal *ror = sub { &shrd(@_[0],@_) };
660e1051a39Sopenharmony_ci
# ${func}_avx entry: load the 7th argument from the stack into $in0, push
# %rbx, %rbp and %r12-%r15, reserve $framesz (+160 on Win64) bytes and align
# %rsp down to 64.  $len is shifted left by 6 and added to $inp to give the
# end-of-input pointer; $out and $in0 are re-biased relative to $inp.  The
# arguments and the original %rsp (in %rax) are then spilled to frame slots.
661e1051a39Sopenharmony_ci$code.=<<___;
662e1051a39Sopenharmony_ci.type	${func}_avx,\@function,6
663e1051a39Sopenharmony_ci.align	64
664e1051a39Sopenharmony_ci${func}_avx:
665e1051a39Sopenharmony_ci.cfi_startproc
666e1051a39Sopenharmony_ci.Lavx_shortcut:
667e1051a39Sopenharmony_ci	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
668e1051a39Sopenharmony_ci	mov	%rsp,%rax		# copy %rsp
669e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
670e1051a39Sopenharmony_ci	push	%rbx
671e1051a39Sopenharmony_ci.cfi_push	%rbx
672e1051a39Sopenharmony_ci	push	%rbp
673e1051a39Sopenharmony_ci.cfi_push	%rbp
674e1051a39Sopenharmony_ci	push	%r12
675e1051a39Sopenharmony_ci.cfi_push	%r12
676e1051a39Sopenharmony_ci	push	%r13
677e1051a39Sopenharmony_ci.cfi_push	%r13
678e1051a39Sopenharmony_ci	push	%r14
679e1051a39Sopenharmony_ci.cfi_push	%r14
680e1051a39Sopenharmony_ci	push	%r15
681e1051a39Sopenharmony_ci.cfi_push	%r15
682e1051a39Sopenharmony_ci	sub	\$`$framesz+$win64*16*10`,%rsp
683e1051a39Sopenharmony_ci	and	\$-64,%rsp		# align stack frame
684e1051a39Sopenharmony_ci
685e1051a39Sopenharmony_ci	shl	\$6,$len
686e1051a39Sopenharmony_ci	sub	$inp,$out		# re-bias
687e1051a39Sopenharmony_ci	sub	$inp,$in0
688e1051a39Sopenharmony_ci	add	$inp,$len		# end of input
689e1051a39Sopenharmony_ci
690e1051a39Sopenharmony_ci	#mov	$inp,$_inp		# saved later
691e1051a39Sopenharmony_ci	mov	$out,$_out
692e1051a39Sopenharmony_ci	mov	$len,$_end
693e1051a39Sopenharmony_ci	#mov	$key,$_key		# remains resident in $inp register
694e1051a39Sopenharmony_ci	mov	$ivp,$_ivp
695e1051a39Sopenharmony_ci	mov	$ctx,$_ctx
696e1051a39Sopenharmony_ci	mov	$in0,$_in0
697e1051a39Sopenharmony_ci	mov	%rax,$_rsp
698e1051a39Sopenharmony_ci.cfi_cfa_expression	$_rsp,deref,+8
699e1051a39Sopenharmony_ci___
# Win64 only: save %xmm6-%xmm15 in the ten 16-byte slots that start at
# $framesz(%rsp).
700e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
701e1051a39Sopenharmony_ci	movaps	%xmm6,`$framesz+16*0`(%rsp)
702e1051a39Sopenharmony_ci	movaps	%xmm7,`$framesz+16*1`(%rsp)
703e1051a39Sopenharmony_ci	movaps	%xmm8,`$framesz+16*2`(%rsp)
704e1051a39Sopenharmony_ci	movaps	%xmm9,`$framesz+16*3`(%rsp)
705e1051a39Sopenharmony_ci	movaps	%xmm10,`$framesz+16*4`(%rsp)
706e1051a39Sopenharmony_ci	movaps	%xmm11,`$framesz+16*5`(%rsp)
707e1051a39Sopenharmony_ci	movaps	%xmm12,`$framesz+16*6`(%rsp)
708e1051a39Sopenharmony_ci	movaps	%xmm13,`$framesz+16*7`(%rsp)
709e1051a39Sopenharmony_ci	movaps	%xmm14,`$framesz+16*8`(%rsp)
710e1051a39Sopenharmony_ci	movaps	%xmm15,`$framesz+16*9`(%rsp)
711e1051a39Sopenharmony_ci___
# One-time set-up after the prologue: clear the vector registers, point $inp
# at key+0x80, read the round count from the key schedule, load the IV and
# the hash state $A..$H from $ctx, load $mask14/$mask12/$mask10 from the
# constant area at an offset of 8*(rounds-9), and preload the first round
# key.
712e1051a39Sopenharmony_ci$code.=<<___;
713e1051a39Sopenharmony_ci.Lprologue_avx:
714e1051a39Sopenharmony_ci	vzeroall
715e1051a39Sopenharmony_ci
716e1051a39Sopenharmony_ci	mov	$inp,%r12		# borrow $a4
717e1051a39Sopenharmony_ci	lea	0x80($key),$inp		# size optimization, reassign
718e1051a39Sopenharmony_ci	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
719e1051a39Sopenharmony_ci	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
720e1051a39Sopenharmony_ci	mov	$ctx,%r15		# borrow $a2
721e1051a39Sopenharmony_ci	mov	$in0,%rsi		# borrow $a3
722e1051a39Sopenharmony_ci	vmovdqu	($ivp),$iv		# load IV
723e1051a39Sopenharmony_ci	sub	\$9,%r14
724e1051a39Sopenharmony_ci
725e1051a39Sopenharmony_ci	mov	$SZ*0(%r15),$A
726e1051a39Sopenharmony_ci	mov	$SZ*1(%r15),$B
727e1051a39Sopenharmony_ci	mov	$SZ*2(%r15),$C
728e1051a39Sopenharmony_ci	mov	$SZ*3(%r15),$D
729e1051a39Sopenharmony_ci	mov	$SZ*4(%r15),$E
730e1051a39Sopenharmony_ci	mov	$SZ*5(%r15),$F
731e1051a39Sopenharmony_ci	mov	$SZ*6(%r15),$G
732e1051a39Sopenharmony_ci	mov	$SZ*7(%r15),$H
733e1051a39Sopenharmony_ci
734e1051a39Sopenharmony_ci	vmovdqa	0x00(%r13,%r14,8),$mask14
735e1051a39Sopenharmony_ci	vmovdqa	0x10(%r13,%r14,8),$mask12
736e1051a39Sopenharmony_ci	vmovdqa	0x20(%r13,%r14,8),$mask10
737e1051a39Sopenharmony_ci	vmovdqu	0x00-0x80($inp),$roundkey
738e1051a39Sopenharmony_ci___
# SHA256 only.  @X (%xmm0-%xmm3) holds the 16 message words, $t0..$t3
# (%xmm4-%xmm7) are scratch.  These lexicals are referenced by name from the
# instruction strings eval'ed further down, so they must stay in scope.
739e1051a39Sopenharmony_ci					if ($SZ==4) {	# SHA256
740e1051a39Sopenharmony_ci    my @X = map("%xmm$_",(0..3));
741e1051a39Sopenharmony_ci    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
742e1051a39Sopenharmony_ci
# .Lloop_avx head: load 64 bytes from (%rsi,%r12) into @X, shuffle each
# vector with the mask at $TABLE+$SZ*2*$rounds (presumably a byte swap -
# the mask value is outside this excerpt), add the table rows at
# 0x00/0x20/0x40/0x60($Tbl) and park the four sums at 0x00..0x30(%rsp).
# .Lavx_00_47 then advances $Tbl by 16*2*$SZ and loads the next 16 bytes at
# (%r12) into $inout.
743e1051a39Sopenharmony_ci$code.=<<___;
744e1051a39Sopenharmony_ci	jmp	.Lloop_avx
745e1051a39Sopenharmony_ci.align	16
746e1051a39Sopenharmony_ci.Lloop_avx:
747e1051a39Sopenharmony_ci	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
748e1051a39Sopenharmony_ci	vmovdqu	0x00(%rsi,%r12),@X[0]
749e1051a39Sopenharmony_ci	vmovdqu	0x10(%rsi,%r12),@X[1]
750e1051a39Sopenharmony_ci	vmovdqu	0x20(%rsi,%r12),@X[2]
751e1051a39Sopenharmony_ci	vmovdqu	0x30(%rsi,%r12),@X[3]
752e1051a39Sopenharmony_ci	vpshufb	$t3,@X[0],@X[0]
753e1051a39Sopenharmony_ci	lea	$TABLE(%rip),$Tbl
754e1051a39Sopenharmony_ci	vpshufb	$t3,@X[1],@X[1]
755e1051a39Sopenharmony_ci	vpshufb	$t3,@X[2],@X[2]
756e1051a39Sopenharmony_ci	vpaddd	0x00($Tbl),@X[0],$t0
757e1051a39Sopenharmony_ci	vpshufb	$t3,@X[3],@X[3]
758e1051a39Sopenharmony_ci	vpaddd	0x20($Tbl),@X[1],$t1
759e1051a39Sopenharmony_ci	vpaddd	0x40($Tbl),@X[2],$t2
760e1051a39Sopenharmony_ci	vpaddd	0x60($Tbl),@X[3],$t3
761e1051a39Sopenharmony_ci	vmovdqa	$t0,0x00(%rsp)
762e1051a39Sopenharmony_ci	mov	$A,$a1
763e1051a39Sopenharmony_ci	vmovdqa	$t1,0x10(%rsp)
764e1051a39Sopenharmony_ci	mov	$B,$a3
765e1051a39Sopenharmony_ci	vmovdqa	$t2,0x20(%rsp)
766e1051a39Sopenharmony_ci	xor	$C,$a3			# magic
767e1051a39Sopenharmony_ci	vmovdqa	$t3,0x30(%rsp)
768e1051a39Sopenharmony_ci	mov	$E,$a0
769e1051a39Sopenharmony_ci	jmp	.Lavx_00_47
770e1051a39Sopenharmony_ci
771e1051a39Sopenharmony_ci.align	16
772e1051a39Sopenharmony_ci.Lavx_00_47:
773e1051a39Sopenharmony_ci	sub	\$-16*2*$SZ,$Tbl	# size optimization
774e1051a39Sopenharmony_ci	vmovdqu	(%r12),$inout		# $a4
775e1051a39Sopenharmony_ci	mov	%r12,$_inp		# $a4
776e1051a39Sopenharmony_ci___
# Xupdate_256_AVX()
#
# Returns a list of 31 instruction strings which, when eval'ed in order,
# update @X[0] with the next four message-schedule words:
#   X[0..3] += X[9..12] + sigma0(X[1..4]) + sigma1(X[14..17])
# sigma1 is applied in two steps: first to X[14..15], which completes
# X[0..1] (read back below as X[16..17]), then to those two words to
# complete X[2..3].
#
# The strings are single-quoted, so @X, $t0..$t3, $SZ, @sigma0 and @sigma1
# are resolved only at eval time, in the scope of whoever evals them.  The
# alternating one-space indentation of the entries is purely visual.
777e1051a39Sopenharmony_cisub Xupdate_256_AVX () {
778e1051a39Sopenharmony_ci	(
779e1051a39Sopenharmony_ci	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
780e1051a39Sopenharmony_ci	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
781e1051a39Sopenharmony_ci	'&vpsrld	($t2,$t0,$sigma0[0]);',
782e1051a39Sopenharmony_ci	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
783e1051a39Sopenharmony_ci	'&vpsrld	($t3,$t0,$sigma0[2])',
784e1051a39Sopenharmony_ci	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
785e1051a39Sopenharmony_ci	'&vpxor		($t0,$t3,$t2)',
786e1051a39Sopenharmony_ci	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
787e1051a39Sopenharmony_ci	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
788e1051a39Sopenharmony_ci	'&vpxor		($t0,$t0,$t1)',
789e1051a39Sopenharmony_ci	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
790e1051a39Sopenharmony_ci	'&vpxor		($t0,$t0,$t2)',
791e1051a39Sopenharmony_ci	 '&vpsrld	($t2,$t3,$sigma1[2]);',
792e1051a39Sopenharmony_ci	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
793e1051a39Sopenharmony_ci	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
794e1051a39Sopenharmony_ci	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
795e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t3);',
796e1051a39Sopenharmony_ci	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
797e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
798e1051a39Sopenharmony_ci	 '&vpshufd	($t2,$t2,0b10000100)',
799e1051a39Sopenharmony_ci	 '&vpsrldq	($t2,$t2,8)',
800e1051a39Sopenharmony_ci	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
801e1051a39Sopenharmony_ci	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
802e1051a39Sopenharmony_ci	 '&vpsrld	($t2,$t3,$sigma1[2])',
803e1051a39Sopenharmony_ci	 '&vpsrlq	($t3,$t3,$sigma1[0])',
804e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t3);',
805e1051a39Sopenharmony_ci	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
806e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t3)',
807e1051a39Sopenharmony_ci	 '&vpshufd	($t2,$t2,0b11101000)',
808e1051a39Sopenharmony_ci	 '&vpslldq	($t2,$t2,8)',
809e1051a39Sopenharmony_ci	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
810e1051a39Sopenharmony_ci	);
811e1051a39Sopenharmony_ci}
812e1051a39Sopenharmony_ci
# AVX_256_00_47($j, $body, @X)
#
# Emits the schedule update for @X[0] interleaved with four rounds.
#   $j    - index 0..3 of the call within one loop pass; selects the table
#           row 16*2*$j($Tbl) and the stack slot 16*$j(%rsp).
#   $body - code ref returning the instruction strings of one round.
#   @X    - the four schedule registers, current one first.
# Each Xupdate_256_AVX string is eval'ed followed by up to three round
# strings; leftover round strings are flushed after the vpaddd that forms
# X+K in $t2, which is finally stored to the stack.
#
# The sub carries an empty prototype but is invoked in &-form, which
# bypasses prototype checking.  "&$body" without parentheses calls the
# generator with the current @_.  The eval'ed strings see this sub's
# lexical @X, so that name must not change.
813e1051a39Sopenharmony_cisub AVX_256_00_47 () {
814e1051a39Sopenharmony_cimy $j = shift;
815e1051a39Sopenharmony_cimy $body = shift;
816e1051a39Sopenharmony_cimy @X = @_;
817e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
818e1051a39Sopenharmony_ci
819e1051a39Sopenharmony_ci	foreach (Xupdate_256_AVX()) {		# 31 instructions
820e1051a39Sopenharmony_ci	    eval;
821e1051a39Sopenharmony_ci	    eval(shift(@insns));
822e1051a39Sopenharmony_ci	    eval(shift(@insns));
823e1051a39Sopenharmony_ci	    eval(shift(@insns));
824e1051a39Sopenharmony_ci	}
825e1051a39Sopenharmony_ci	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
826e1051a39Sopenharmony_ci	  foreach (@insns) { eval; }		# remaining instructions
827e1051a39Sopenharmony_ci	&vmovdqa	(16*$j."(%rsp)",$t2);
828e1051a39Sopenharmony_ci}
829e1051a39Sopenharmony_ci
# AVX path, body of the .Lavx_00_47 loop.
# AVX_256_00_47 is invoked four times ($j = 0..3); after each call @X is
# rotated so that the next call finds the next vector in @X[0].
# $aesni_cbc_idx is reset before each group of rounds.
830e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
831e1051a39Sopenharmony_ci    for ($i=0,$j=0; $j<4; $j++) {
832e1051a39Sopenharmony_ci	&AVX_256_00_47($j,\&body_00_15,@X);
833e1051a39Sopenharmony_ci	push(@X,shift(@X));			# rotate(@X)
834e1051a39Sopenharmony_ci    }
	# Mask $temp with $mask14, OR it into $iv, store $iv at $_out+$_inp
	# ($out was re-biased against $inp in the prologue), then step the
	# input pointer by 16.
835e1051a39Sopenharmony_ci    	&mov		("%r12",$_inp);		# borrow $a4
836e1051a39Sopenharmony_ci	&vpand		($temp,$temp,$mask14);
837e1051a39Sopenharmony_ci	&mov		("%r15",$_out);		# borrow $a2
838e1051a39Sopenharmony_ci	&vpor		($iv,$iv,$temp);
839e1051a39Sopenharmony_ci	&vmovdqu	("(%r15,%r12)",$iv);	# write output
840e1051a39Sopenharmony_ci	&lea		("%r12","16(%r12)");	# inp++
841e1051a39Sopenharmony_ci
	# Repeat .Lavx_00_47 while the byte tested at this $Tbl offset is
	# non-zero (presumably a zero marker follows the last table row -
	# the table itself is outside this excerpt, confirm).
842e1051a39Sopenharmony_ci	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
843e1051a39Sopenharmony_ci	&jne	(".Lavx_00_47");
844e1051a39Sopenharmony_ci
845e1051a39Sopenharmony_ci	&vmovdqu	($inout,"(%r12)");
846e1051a39Sopenharmony_ci	&mov		($_inp,"%r12");
847e1051a39Sopenharmony_ci
	# Last 16 rounds: round bodies only, no schedule update.  The for-loop
	# has no increment clause; $i is expected to be advanced by the
	# evaluated strings (body_00_15 is defined outside this excerpt -
	# confirm).
848e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
849e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
850e1051a39Sopenharmony_ci	foreach(body_00_15()) { eval; }
851e1051a39Sopenharmony_ci    }
852e1051a39Sopenharmony_ci
853e1051a39Sopenharmony_ci					}
# AVX path, end of one pass through .Lloop_avx: reload the saved pointers,
# write the last CBC output block, add the hash state stored at $_ctx into
# the working registers $A..$H and store the sums back, then loop while the
# input pointer is still below $_end.  On exit the IV is written to $_ivp
# and all vector registers are cleared (vzeroall).
854e1051a39Sopenharmony_ci$code.=<<___;
855e1051a39Sopenharmony_ci	mov	$_inp,%r12		# borrow $a4
856e1051a39Sopenharmony_ci	mov	$_out,%r13		# borrow $a0
857e1051a39Sopenharmony_ci	mov	$_ctx,%r15		# borrow $a2
858e1051a39Sopenharmony_ci	mov	$_in0,%rsi		# borrow $a3
859e1051a39Sopenharmony_ci
860e1051a39Sopenharmony_ci	vpand	$mask14,$temp,$temp
861e1051a39Sopenharmony_ci	mov	$a1,$A
862e1051a39Sopenharmony_ci	vpor	$temp,$iv,$iv
863e1051a39Sopenharmony_ci	vmovdqu	$iv,(%r13,%r12)		# write output
864e1051a39Sopenharmony_ci	lea	16(%r12),%r12		# inp++
865e1051a39Sopenharmony_ci
866e1051a39Sopenharmony_ci	add	$SZ*0(%r15),$A
867e1051a39Sopenharmony_ci	add	$SZ*1(%r15),$B
868e1051a39Sopenharmony_ci	add	$SZ*2(%r15),$C
869e1051a39Sopenharmony_ci	add	$SZ*3(%r15),$D
870e1051a39Sopenharmony_ci	add	$SZ*4(%r15),$E
871e1051a39Sopenharmony_ci	add	$SZ*5(%r15),$F
872e1051a39Sopenharmony_ci	add	$SZ*6(%r15),$G
873e1051a39Sopenharmony_ci	add	$SZ*7(%r15),$H
874e1051a39Sopenharmony_ci
875e1051a39Sopenharmony_ci	cmp	$_end,%r12
876e1051a39Sopenharmony_ci
877e1051a39Sopenharmony_ci	mov	$A,$SZ*0(%r15)
878e1051a39Sopenharmony_ci	mov	$B,$SZ*1(%r15)
879e1051a39Sopenharmony_ci	mov	$C,$SZ*2(%r15)
880e1051a39Sopenharmony_ci	mov	$D,$SZ*3(%r15)
881e1051a39Sopenharmony_ci	mov	$E,$SZ*4(%r15)
882e1051a39Sopenharmony_ci	mov	$F,$SZ*5(%r15)
883e1051a39Sopenharmony_ci	mov	$G,$SZ*6(%r15)
884e1051a39Sopenharmony_ci	mov	$H,$SZ*7(%r15)
885e1051a39Sopenharmony_ci	jb	.Lloop_avx
886e1051a39Sopenharmony_ci
887e1051a39Sopenharmony_ci	mov	$_ivp,$ivp
888e1051a39Sopenharmony_ci	mov	$_rsp,%rsi
889e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi,8
890e1051a39Sopenharmony_ci	vmovdqu	$iv,($ivp)		# output IV
891e1051a39Sopenharmony_ci	vzeroall
892e1051a39Sopenharmony_ci___
# Win64 only: reload %xmm6-%xmm15 from the slots filled in the prologue.
893e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
894e1051a39Sopenharmony_ci	movaps	`$framesz+16*0`(%rsp),%xmm6
895e1051a39Sopenharmony_ci	movaps	`$framesz+16*1`(%rsp),%xmm7
896e1051a39Sopenharmony_ci	movaps	`$framesz+16*2`(%rsp),%xmm8
897e1051a39Sopenharmony_ci	movaps	`$framesz+16*3`(%rsp),%xmm9
898e1051a39Sopenharmony_ci	movaps	`$framesz+16*4`(%rsp),%xmm10
899e1051a39Sopenharmony_ci	movaps	`$framesz+16*5`(%rsp),%xmm11
900e1051a39Sopenharmony_ci	movaps	`$framesz+16*6`(%rsp),%xmm12
901e1051a39Sopenharmony_ci	movaps	`$framesz+16*7`(%rsp),%xmm13
902e1051a39Sopenharmony_ci	movaps	`$framesz+16*8`(%rsp),%xmm14
903e1051a39Sopenharmony_ci	movaps	`$framesz+16*9`(%rsp),%xmm15
904e1051a39Sopenharmony_ci___
# Epilogue: reload the six registers pushed in the prologue from the slots
# just below the original stack pointer (in %rsi), restore %rsp and return.
905e1051a39Sopenharmony_ci$code.=<<___;
906e1051a39Sopenharmony_ci	mov	-48(%rsi),%r15
907e1051a39Sopenharmony_ci.cfi_restore	%r15
908e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
909e1051a39Sopenharmony_ci.cfi_restore	%r14
910e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
911e1051a39Sopenharmony_ci.cfi_restore	%r13
912e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
913e1051a39Sopenharmony_ci.cfi_restore	%r12
914e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
915e1051a39Sopenharmony_ci.cfi_restore	%rbp
916e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
917e1051a39Sopenharmony_ci.cfi_restore	%rbx
918e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
919e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
920e1051a39Sopenharmony_ci.Lepilogue_avx:
921e1051a39Sopenharmony_ci	ret
922e1051a39Sopenharmony_ci.cfi_endproc
923e1051a39Sopenharmony_ci.size	${func}_avx,.-${func}_avx
924e1051a39Sopenharmony_ci___
925e1051a39Sopenharmony_ci
# Everything below is generated only when $avx is greater than 1.
926e1051a39Sopenharmony_ciif ($avx>1) {{
927e1051a39Sopenharmony_ci######################################################################
928e1051a39Sopenharmony_ci# AVX2+BMI code path
929e1051a39Sopenharmony_ci#
# $a5    - extra scratch register name: %esi when $SZ is 4, else %rsi.
# $PUSH8 - 8*2*$SZ bytes; the amount by which the AVX2 code below moves
#          %rsp / $Tbl at a time, and the modulus for stack offsets.
# "use integer" makes the "/" in the offset arithmetic of the instruction
# strings below an integer division.
930e1051a39Sopenharmony_cimy $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
931e1051a39Sopenharmony_cimy $PUSH8=8*2*$SZ;
932e1051a39Sopenharmony_ciuse integer;
933e1051a39Sopenharmony_ci
# bodyx_00_15()
#
# Returns the 24 instruction strings of one round for the AVX2+BMI path
# (rorx/andn based).  The strings are eval'ed by the caller and refer by
# name to $i, @ROT, $a0..$a4, $base, $PUSH8, @Sigma0 and @Sigma1, which must
# therefore be visible at the point of eval.
#
# The X[i]+K[i] operand is read from offset
#   (32*($i/(16/$SZ)) + $SZ*($i%(16/$SZ))) % $PUSH8
# relative to $base, i.e. 16/$SZ consecutive words from each 32-byte row.
#
# Sigma0(a) is not added within the round itself: it is left in $a1 and
# folded into the new 'a' at the start of the following round, so the
# caller has to perform one final "$a += $a1" after the last round.
#
# The last string carries no instruction; it swaps $a2/$a3, rotates @ROT
# and increments $i.
934e1051a39Sopenharmony_cisub bodyx_00_15 () {
935e1051a39Sopenharmony_ci	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
936e1051a39Sopenharmony_ci	(
937e1051a39Sopenharmony_ci	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
938e1051a39Sopenharmony_ci
939e1051a39Sopenharmony_ci	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
940e1051a39Sopenharmony_ci	'&and	($a4,$e)',		# f&e
941e1051a39Sopenharmony_ci	'&rorx	($a0,$e,$Sigma1[2])',
942e1051a39Sopenharmony_ci	'&rorx	($a2,$e,$Sigma1[1])',
943e1051a39Sopenharmony_ci
944e1051a39Sopenharmony_ci	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
945e1051a39Sopenharmony_ci	'&lea	($h,"($h,$a4)")',
946e1051a39Sopenharmony_ci	'&andn	($a4,$e,$g)',		# ~e&g
947e1051a39Sopenharmony_ci	'&xor	($a0,$a2)',
948e1051a39Sopenharmony_ci
949e1051a39Sopenharmony_ci	'&rorx	($a1,$e,$Sigma1[0])',
950e1051a39Sopenharmony_ci	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
951e1051a39Sopenharmony_ci	'&xor	($a0,$a1)',		# Sigma1(e)
952e1051a39Sopenharmony_ci	'&mov	($a2,$a)',
953e1051a39Sopenharmony_ci
954e1051a39Sopenharmony_ci	'&rorx	($a4,$a,$Sigma0[2])',
955e1051a39Sopenharmony_ci	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
956e1051a39Sopenharmony_ci	'&xor	($a2,$b)',		# a^b, b^c in next round
957e1051a39Sopenharmony_ci	'&rorx	($a1,$a,$Sigma0[1])',
958e1051a39Sopenharmony_ci
959e1051a39Sopenharmony_ci	'&rorx	($a0,$a,$Sigma0[0])',
960e1051a39Sopenharmony_ci	'&lea	($d,"($d,$h)")',	# d+=h
961e1051a39Sopenharmony_ci	'&and	($a3,$a2)',		# (b^c)&(a^b)
	# The next element is not quoted: it is evaluated while the list is
	# being built, so every call of bodyx_00_15 takes the next entry of
	# @aesni_cbc_block and prepends it to the '&xor ($a1,$a4)' string.
962e1051a39Sopenharmony_ci	@aesni_cbc_block[$aesni_cbc_idx++].
963e1051a39Sopenharmony_ci	'&xor	($a1,$a4)',
964e1051a39Sopenharmony_ci
965e1051a39Sopenharmony_ci	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
966e1051a39Sopenharmony_ci	'&xor	($a1,$a0)',		# Sigma0(a)
967e1051a39Sopenharmony_ci	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
968e1051a39Sopenharmony_ci	'&mov	($a4,$e)',		# copy of f in future
969e1051a39Sopenharmony_ci
970e1051a39Sopenharmony_ci	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
971e1051a39Sopenharmony_ci	);
972e1051a39Sopenharmony_ci	# and at the finish one has to $a+=$a1
973e1051a39Sopenharmony_ci}
974e1051a39Sopenharmony_ci
# ${func}_avx2 entry: load the 7th argument into $in0, push %rbx, %rbp and
# %r12-%r15, reserve 2*$SZ*$rounds+8*8 (+160 on Win64) bytes, align %rsp
# down to 256*$SZ and add 2*$SZ*($rounds-8) back.  $len is shifted left by
# 6 and added to $inp to give the end-of-input pointer; $out and $in0 are
# re-biased relative to $inp.  Unlike the AVX path, $out is not spilled to
# the frame here.
975e1051a39Sopenharmony_ci$code.=<<___;
976e1051a39Sopenharmony_ci.type	${func}_avx2,\@function,6
977e1051a39Sopenharmony_ci.align	64
978e1051a39Sopenharmony_ci${func}_avx2:
979e1051a39Sopenharmony_ci.cfi_startproc
980e1051a39Sopenharmony_ci.Lavx2_shortcut:
981e1051a39Sopenharmony_ci	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
982e1051a39Sopenharmony_ci	mov	%rsp,%rax		# copy %rsp
983e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
984e1051a39Sopenharmony_ci	push	%rbx
985e1051a39Sopenharmony_ci.cfi_push	%rbx
986e1051a39Sopenharmony_ci	push	%rbp
987e1051a39Sopenharmony_ci.cfi_push	%rbp
988e1051a39Sopenharmony_ci	push	%r12
989e1051a39Sopenharmony_ci.cfi_push	%r12
990e1051a39Sopenharmony_ci	push	%r13
991e1051a39Sopenharmony_ci.cfi_push	%r13
992e1051a39Sopenharmony_ci	push	%r14
993e1051a39Sopenharmony_ci.cfi_push	%r14
994e1051a39Sopenharmony_ci	push	%r15
995e1051a39Sopenharmony_ci.cfi_push	%r15
996e1051a39Sopenharmony_ci	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
997e1051a39Sopenharmony_ci	and	\$-256*$SZ,%rsp		# align stack frame
998e1051a39Sopenharmony_ci	add	\$`2*$SZ*($rounds-8)`,%rsp
999e1051a39Sopenharmony_ci
1000e1051a39Sopenharmony_ci	shl	\$6,$len
1001e1051a39Sopenharmony_ci	sub	$inp,$out		# re-bias
1002e1051a39Sopenharmony_ci	sub	$inp,$in0
1003e1051a39Sopenharmony_ci	add	$inp,$len		# end of input
1004e1051a39Sopenharmony_ci
1005e1051a39Sopenharmony_ci	#mov	$inp,$_inp		# saved later
1006e1051a39Sopenharmony_ci	#mov	$out,$_out		# kept in $offload
1007e1051a39Sopenharmony_ci	mov	$len,$_end
1008e1051a39Sopenharmony_ci	#mov	$key,$_key		# remains resident in $inp register
1009e1051a39Sopenharmony_ci	mov	$ivp,$_ivp
1010e1051a39Sopenharmony_ci	mov	$ctx,$_ctx
1011e1051a39Sopenharmony_ci	mov	$in0,$_in0
1012e1051a39Sopenharmony_ci	mov	%rax,$_rsp
1013e1051a39Sopenharmony_ci.cfi_cfa_expression	$_rsp,deref,+8
1014e1051a39Sopenharmony_ci___
# Win64 only: save %xmm6-%xmm15 in the ten 16-byte slots that start at
# $framesz(%rsp).
1015e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1016e1051a39Sopenharmony_ci	movaps	%xmm6,`$framesz+16*0`(%rsp)
1017e1051a39Sopenharmony_ci	movaps	%xmm7,`$framesz+16*1`(%rsp)
1018e1051a39Sopenharmony_ci	movaps	%xmm8,`$framesz+16*2`(%rsp)
1019e1051a39Sopenharmony_ci	movaps	%xmm9,`$framesz+16*3`(%rsp)
1020e1051a39Sopenharmony_ci	movaps	%xmm10,`$framesz+16*4`(%rsp)
1021e1051a39Sopenharmony_ci	movaps	%xmm11,`$framesz+16*5`(%rsp)
1022e1051a39Sopenharmony_ci	movaps	%xmm12,`$framesz+16*6`(%rsp)
1023e1051a39Sopenharmony_ci	movaps	%xmm13,`$framesz+16*7`(%rsp)
1024e1051a39Sopenharmony_ci	movaps	%xmm14,`$framesz+16*8`(%rsp)
1025e1051a39Sopenharmony_ci	movaps	%xmm15,`$framesz+16*9`(%rsp)
1026e1051a39Sopenharmony_ci___
# One-time set-up: clear the vector registers, stash $out in the upper
# quadword of $offload, point $inp at key+0x80, read the round count, load
# the IV, the three masks (offset 8*(rounds-9) into the constant area) and
# the hash state $A..$H.  %r13 is advanced by 16*$SZ and %r12 is set to
# (%rsi,%r13), or to %rsp when %r13 equals the end-of-input pointer, so the
# loads through %r12 below always have a readable source.
1027e1051a39Sopenharmony_ci$code.=<<___;
1028e1051a39Sopenharmony_ci.Lprologue_avx2:
1029e1051a39Sopenharmony_ci	vzeroall
1030e1051a39Sopenharmony_ci
1031e1051a39Sopenharmony_ci	mov	$inp,%r13		# borrow $a0
1032e1051a39Sopenharmony_ci	vpinsrq	\$1,$out,$offload,$offload
1033e1051a39Sopenharmony_ci	lea	0x80($key),$inp		# size optimization, reassign
1034e1051a39Sopenharmony_ci	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
1035e1051a39Sopenharmony_ci	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
1036e1051a39Sopenharmony_ci	mov	$ctx,%r15		# borrow $a2
1037e1051a39Sopenharmony_ci	mov	$in0,%rsi		# borrow $a3
1038e1051a39Sopenharmony_ci	vmovdqu	($ivp),$iv		# load IV
1039e1051a39Sopenharmony_ci	lea	-9(%r14),%r14
1040e1051a39Sopenharmony_ci
1041e1051a39Sopenharmony_ci	vmovdqa	0x00(%r12,%r14,8),$mask14
1042e1051a39Sopenharmony_ci	vmovdqa	0x10(%r12,%r14,8),$mask12
1043e1051a39Sopenharmony_ci	vmovdqa	0x20(%r12,%r14,8),$mask10
1044e1051a39Sopenharmony_ci
1045e1051a39Sopenharmony_ci	sub	\$-16*$SZ,%r13		# inp++, size optimization
1046e1051a39Sopenharmony_ci	mov	$SZ*0(%r15),$A
1047e1051a39Sopenharmony_ci	lea	(%rsi,%r13),%r12	# borrow $a0
1048e1051a39Sopenharmony_ci	mov	$SZ*1(%r15),$B
1049e1051a39Sopenharmony_ci	cmp	$len,%r13		# $_end
1050e1051a39Sopenharmony_ci	mov	$SZ*2(%r15),$C
1051e1051a39Sopenharmony_ci	cmove	%rsp,%r12		# next block or random data
1052e1051a39Sopenharmony_ci	mov	$SZ*3(%r15),$D
1053e1051a39Sopenharmony_ci	mov	$SZ*4(%r15),$E
1054e1051a39Sopenharmony_ci	mov	$SZ*5(%r15),$F
1055e1051a39Sopenharmony_ci	mov	$SZ*6(%r15),$G
1056e1051a39Sopenharmony_ci	mov	$SZ*7(%r15),$H
1057e1051a39Sopenharmony_ci	vmovdqu	0x00-0x80($inp),$roundkey
1058e1051a39Sopenharmony_ci___
# SHA256 only.  @X (%ymm0-%ymm3) and $t0..$t3 (%ymm4-%ymm7) are referenced
# by name from the instruction strings eval'ed further down, so these
# lexicals must stay in scope.
1059e1051a39Sopenharmony_ci					if ($SZ==4) {	# SHA256
1060e1051a39Sopenharmony_ci    my @X = map("%ymm$_",(0..3));
1061e1051a39Sopenharmony_ci    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062e1051a39Sopenharmony_ci
# .Loop_avx2 head: load 64 bytes at -16*$SZ(%rsi,%r13) into %xmm0-%xmm3
# (the low halves of @X), insert 64 bytes from (%r12) into the high halves,
# shuffle every vector with the mask at $TABLE+$SZ*2*$rounds, add table rows
# 0x00..0x60($Tbl) and store the first two sums on the stack.
1063e1051a39Sopenharmony_ci$code.=<<___;
1064e1051a39Sopenharmony_ci	jmp	.Loop_avx2
1065e1051a39Sopenharmony_ci.align	16
1066e1051a39Sopenharmony_ci.Loop_avx2:
1067e1051a39Sopenharmony_ci	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1068e1051a39Sopenharmony_ci	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1069e1051a39Sopenharmony_ci	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1070e1051a39Sopenharmony_ci	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1071e1051a39Sopenharmony_ci	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1072e1051a39Sopenharmony_ci
1073e1051a39Sopenharmony_ci	vinserti128	\$1,(%r12),@X[0],@X[0]
1074e1051a39Sopenharmony_ci	vinserti128	\$1,16(%r12),@X[1],@X[1]
1075e1051a39Sopenharmony_ci	 vpshufb	$t3,@X[0],@X[0]
1076e1051a39Sopenharmony_ci	vinserti128	\$1,32(%r12),@X[2],@X[2]
1077e1051a39Sopenharmony_ci	 vpshufb	$t3,@X[1],@X[1]
1078e1051a39Sopenharmony_ci	vinserti128	\$1,48(%r12),@X[3],@X[3]
1079e1051a39Sopenharmony_ci
1080e1051a39Sopenharmony_ci	lea	$TABLE(%rip),$Tbl
1081e1051a39Sopenharmony_ci	vpshufb	$t3,@X[2],@X[2]
1082e1051a39Sopenharmony_ci	lea	-16*$SZ(%r13),%r13
1083e1051a39Sopenharmony_ci	vpaddd	0x00($Tbl),@X[0],$t0
1084e1051a39Sopenharmony_ci	vpshufb	$t3,@X[3],@X[3]
1085e1051a39Sopenharmony_ci	vpaddd	0x20($Tbl),@X[1],$t1
1086e1051a39Sopenharmony_ci	vpaddd	0x40($Tbl),@X[2],$t2
1087e1051a39Sopenharmony_ci	vpaddd	0x60($Tbl),@X[3],$t3
1088e1051a39Sopenharmony_ci	vmovdqa	$t0,0x00(%rsp)
1089e1051a39Sopenharmony_ci	xor	$a1,$a1
1090e1051a39Sopenharmony_ci	vmovdqa	$t1,0x20(%rsp)
1091e1051a39Sopenharmony_ci___
# Non-Win64 only: keep the CFA describable while %rsp is moved below.
1092e1051a39Sopenharmony_ci$code.=<<___ if (!$win64);
1093e1051a39Sopenharmony_ci# temporarily use %rsi as frame pointer
1094e1051a39Sopenharmony_ci        mov     $_rsp,%rsi
1095e1051a39Sopenharmony_ci.cfi_def_cfa    %rsi,8
1096e1051a39Sopenharmony_ci___
# Move %rsp down by $PUSH8 bytes.
1097e1051a39Sopenharmony_ci$code.=<<___;
1098e1051a39Sopenharmony_ci	lea	-$PUSH8(%rsp),%rsp
1099e1051a39Sopenharmony_ci___
# Non-Win64 only: store the copy of the original stack pointer at -8(%rsp)
# and describe the CFA through that slot.
1100e1051a39Sopenharmony_ci$code.=<<___ if (!$win64);
1101e1051a39Sopenharmony_ci# the frame info is at $_rsp, but the stack is moving...
1102e1051a39Sopenharmony_ci# so a second frame pointer is saved at -8(%rsp)
1103e1051a39Sopenharmony_ci# that is in the red zone
1104e1051a39Sopenharmony_ci        mov     %rsi,-8(%rsp)
1105e1051a39Sopenharmony_ci.cfi_cfa_expression     %rsp-8,deref,+8
1106e1051a39Sopenharmony_ci___
# Store the remaining two sums, seed $a3 (b^c) and $a4 (copy of f) as the
# round strings expect, advance $Tbl by 16*2*$SZ and open the .Lavx2_00_47
# loop, which loads 16 bytes at (%r13) into $inout and records %r13 in the
# low quadword of $offload.
1107e1051a39Sopenharmony_ci$code.=<<___;
1108e1051a39Sopenharmony_ci	mov	$B,$a3
1109e1051a39Sopenharmony_ci	vmovdqa	$t2,0x00(%rsp)
1110e1051a39Sopenharmony_ci	xor	$C,$a3			# magic
1111e1051a39Sopenharmony_ci	vmovdqa	$t3,0x20(%rsp)
1112e1051a39Sopenharmony_ci	mov	$F,$a4
1113e1051a39Sopenharmony_ci	sub	\$-16*2*$SZ,$Tbl	# size optimization
1114e1051a39Sopenharmony_ci	jmp	.Lavx2_00_47
1115e1051a39Sopenharmony_ci
1116e1051a39Sopenharmony_ci.align	16
1117e1051a39Sopenharmony_ci.Lavx2_00_47:
1118e1051a39Sopenharmony_ci	vmovdqu	(%r13),$inout
1119e1051a39Sopenharmony_ci	vpinsrq	\$0,%r13,$offload,$offload
1120e1051a39Sopenharmony_ci___
1121e1051a39Sopenharmony_ci
# AVX2_256_00_47($j, $body, @X)
#
# AVX2 counterpart of the schedule-update step: emits the Xupdate_256_AVX
# sequence for @X[0] interleaved with four rounds produced by $body.
#   $j    - index 0..3 of the call within one loop pass; selects the table
#           row 16*2*$j($Tbl) and the stack slot (32*$j)%$PUSH8(%rsp).
#   $body - code ref returning the instruction strings of one round.
#   @X    - the four schedule registers, current one first.
# For even $j, %rsp is first moved down by $PUSH8; on non-Win64 targets the
# secondary frame pointer kept at -8(%rsp) is copied to the new location and
# the CFA expression is updated around that copy.
#
# $base is read by the eval'ed round strings as the base of their X[i]+K[i]
# operand, and @X by the Xupdate strings, so neither lexical may be renamed.
# The sub has an empty prototype but is invoked in &-form, which bypasses
# prototype checking.
1122e1051a39Sopenharmony_cisub AVX2_256_00_47 () {
1123e1051a39Sopenharmony_cimy $j = shift;
1124e1051a39Sopenharmony_cimy $body = shift;
1125e1051a39Sopenharmony_cimy @X = @_;
1126e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1127e1051a39Sopenharmony_cimy $base = "+2*$PUSH8(%rsp)";
1128e1051a39Sopenharmony_ci
1129e1051a39Sopenharmony_ci	if (($j%2)==0) {
1130e1051a39Sopenharmony_ci	&lea	("%rsp","-$PUSH8(%rsp)");
1131e1051a39Sopenharmony_ci$code.=<<___ if (!$win64);
1132e1051a39Sopenharmony_ci.cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
1133e1051a39Sopenharmony_ci# copy secondary frame pointer to new location again at -8(%rsp)
1134e1051a39Sopenharmony_ci        pushq   $PUSH8-8(%rsp)
1135e1051a39Sopenharmony_ci.cfi_cfa_expression     %rsp,deref,+8
1136e1051a39Sopenharmony_ci        lea     8(%rsp),%rsp
1137e1051a39Sopenharmony_ci.cfi_cfa_expression     %rsp-8,deref,+8
1138e1051a39Sopenharmony_ci___
1139e1051a39Sopenharmony_ci	}
1140e1051a39Sopenharmony_ci	foreach (Xupdate_256_AVX()) {		# 31 instructions
1141e1051a39Sopenharmony_ci	    eval;
1142e1051a39Sopenharmony_ci	    eval(shift(@insns));
1143e1051a39Sopenharmony_ci	    eval(shift(@insns));
1144e1051a39Sopenharmony_ci	    eval(shift(@insns));
1145e1051a39Sopenharmony_ci	}
1146e1051a39Sopenharmony_ci	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1147e1051a39Sopenharmony_ci	  foreach (@insns) { eval; }		# remaining instructions
1148e1051a39Sopenharmony_ci	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1149e1051a39Sopenharmony_ci}
# AVX2 path, body of the .Lavx2_00_47 loop.
# AVX2_256_00_47 is invoked four times ($j = 0..3) with bodyx_00_15 as the
# round generator; @X is rotated after each call.  $aesni_cbc_idx is reset
# so bodyx_00_15 starts again at the first entry of @aesni_cbc_block.
1150e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
1151e1051a39Sopenharmony_ci    for ($i=0,$j=0; $j<4; $j++) {
1152e1051a39Sopenharmony_ci	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1153e1051a39Sopenharmony_ci	push(@X,shift(@X));			# rotate(@X)
1154e1051a39Sopenharmony_ci    }
	# Recover the input pointer (low quadword) and the re-biased output
	# pointer (high quadword) from $offload, mask $temp with $mask14, OR
	# it into $iv, store the result and step the input pointer by 16.
1155e1051a39Sopenharmony_ci	&vmovq		("%r13",$offload);	# borrow $a0
1156e1051a39Sopenharmony_ci	&vpextrq	("%r15",$offload,1);	# borrow $a2
1157e1051a39Sopenharmony_ci	&vpand		($temp,$temp,$mask14);
1158e1051a39Sopenharmony_ci	&vpor		($iv,$iv,$temp);
1159e1051a39Sopenharmony_ci	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1160e1051a39Sopenharmony_ci	&lea		("%r13","16(%r13)");	# inp++
1161e1051a39Sopenharmony_ci
	# Advance $Tbl by 16*2*$SZ and repeat .Lavx2_00_47 while the byte
	# tested at ($SZ-1)($Tbl) is non-zero (presumably a zero marker
	# follows the last table row - the table is outside this excerpt).
1162e1051a39Sopenharmony_ci	&lea	($Tbl,16*2*$SZ."($Tbl)");
1163e1051a39Sopenharmony_ci	&cmpb	(($SZ-1)."($Tbl)",0);
1164e1051a39Sopenharmony_ci	&jne	(".Lavx2_00_47");
1165e1051a39Sopenharmony_ci
1166e1051a39Sopenharmony_ci	&vmovdqu	($inout,"(%r13)");
1167e1051a39Sopenharmony_ci	&vpinsrq	($offload,$offload,"%r13",0);
1168e1051a39Sopenharmony_ci
	# Last 16 rounds, round bodies only.  $i is incremented by the final
	# string of every bodyx_00_15 list, hence the empty increment clause.
	# $base is re-evaluated each iteration: rounds 0-7 read from
	# +$PUSH8(%rsp), rounds 8-15 from (%rsp).
1169e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
1170e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
1171e1051a39Sopenharmony_ci	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1172e1051a39Sopenharmony_ci	foreach(bodyx_00_15()) { eval; }
1173e1051a39Sopenharmony_ci    }
1174e1051a39Sopenharmony_ci					}
# AVX2 path, after the 64 rounds generated above: recover the output and
# input pointers from $offload, apply the deferred "$A += $a1", write the
# CBC output block, add the stored hash state into $A..$H and store it
# back.  If the input pointer now equals $_end, jump to .Ldone_avx2;
# otherwise re-seed $a1/$a3/$a4 and enter .Lower_avx2, which loads the next
# 16 bytes at (%r13) into $inout.
1175e1051a39Sopenharmony_ci$code.=<<___;
1176e1051a39Sopenharmony_ci	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1177e1051a39Sopenharmony_ci	vmovq	$offload,%r13			# $_inp, borrow $a0
1178e1051a39Sopenharmony_ci	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1179e1051a39Sopenharmony_ci	add	$a1,$A
1180e1051a39Sopenharmony_ci	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1181e1051a39Sopenharmony_ci
1182e1051a39Sopenharmony_ci	vpand	$mask14,$temp,$temp
1183e1051a39Sopenharmony_ci	vpor	$temp,$iv,$iv
1184e1051a39Sopenharmony_ci	vmovdqu	$iv,(%r12,%r13)			# write output
1185e1051a39Sopenharmony_ci	lea	16(%r13),%r13
1186e1051a39Sopenharmony_ci
1187e1051a39Sopenharmony_ci	add	$SZ*0(%r15),$A
1188e1051a39Sopenharmony_ci	add	$SZ*1(%r15),$B
1189e1051a39Sopenharmony_ci	add	$SZ*2(%r15),$C
1190e1051a39Sopenharmony_ci	add	$SZ*3(%r15),$D
1191e1051a39Sopenharmony_ci	add	$SZ*4(%r15),$E
1192e1051a39Sopenharmony_ci	add	$SZ*5(%r15),$F
1193e1051a39Sopenharmony_ci	add	$SZ*6(%r15),$G
1194e1051a39Sopenharmony_ci	add	$SZ*7(%r15),$H
1195e1051a39Sopenharmony_ci
1196e1051a39Sopenharmony_ci	mov	$A,$SZ*0(%r15)
1197e1051a39Sopenharmony_ci	mov	$B,$SZ*1(%r15)
1198e1051a39Sopenharmony_ci	mov	$C,$SZ*2(%r15)
1199e1051a39Sopenharmony_ci	mov	$D,$SZ*3(%r15)
1200e1051a39Sopenharmony_ci	mov	$E,$SZ*4(%r15)
1201e1051a39Sopenharmony_ci	mov	$F,$SZ*5(%r15)
1202e1051a39Sopenharmony_ci	mov	$G,$SZ*6(%r15)
1203e1051a39Sopenharmony_ci	mov	$H,$SZ*7(%r15)
1204e1051a39Sopenharmony_ci
1205e1051a39Sopenharmony_ci	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1206e1051a39Sopenharmony_ci	je	.Ldone_avx2
1207e1051a39Sopenharmony_ci
1208e1051a39Sopenharmony_ci	xor	$a1,$a1
1209e1051a39Sopenharmony_ci	mov	$B,$a3
1210e1051a39Sopenharmony_ci	mov	$F,$a4
1211e1051a39Sopenharmony_ci	xor	$C,$a3			# magic
1212e1051a39Sopenharmony_ci	jmp	.Lower_avx2
1213e1051a39Sopenharmony_ci.align	16
1214e1051a39Sopenharmony_ci.Lower_avx2:
1215e1051a39Sopenharmony_ci	vmovdqu	(%r13),$inout
1216e1051a39Sopenharmony_ci	vpinsrq	\$0,%r13,$offload,$offload
1217e1051a39Sopenharmony_ci___
# Body of .Lower_avx2: 16 rounds per pass, reading their X[i]+K[i] operands
# at +16 from $Tbl (i.e. from the second 16 bytes of each 32-byte row).
# $i is incremented by the final string of each bodyx_00_15 list; once it
# reaches 8, $Tbl is stepped down by $PUSH8 for the remaining eight rounds.
1218e1051a39Sopenharmony_ci    $aesni_cbc_idx=0;
1219e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
1220e1051a39Sopenharmony_ci	my $base="+16($Tbl)";
1221e1051a39Sopenharmony_ci	foreach(bodyx_00_15()) { eval; }
1222e1051a39Sopenharmony_ci	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1223e1051a39Sopenharmony_ci    }
1224e1051a39Sopenharmony_ci$code.=<<___;
1225e1051a39Sopenharmony_ci	vmovq	$offload,%r13			# borrow $a0
1226e1051a39Sopenharmony_ci	vpextrq	\$1,$offload,%r15		# borrow $a2
1227e1051a39Sopenharmony_ci	vpand	$mask14,$temp,$temp
1228e1051a39Sopenharmony_ci	vpor	$temp,$iv,$iv
1229e1051a39Sopenharmony_ci	lea	-$PUSH8($Tbl),$Tbl
1230e1051a39Sopenharmony_ci	vmovdqu	$iv,(%r15,%r13)			# write output
1231e1051a39Sopenharmony_ci	lea	16(%r13),%r13			# inp++
1232e1051a39Sopenharmony_ci	cmp	%rsp,$Tbl
1233e1051a39Sopenharmony_ci	jae	.Lower_avx2
1234e1051a39Sopenharmony_ci
1235e1051a39Sopenharmony_ci	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1236e1051a39Sopenharmony_ci	lea	16*$SZ(%r13),%r13
1237e1051a39Sopenharmony_ci	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1238e1051a39Sopenharmony_ci	add	$a1,$A
1239e1051a39Sopenharmony_ci	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1240e1051a39Sopenharmony_ci
1241e1051a39Sopenharmony_ci	add	$SZ*0(%r15),$A
1242e1051a39Sopenharmony_ci	add	$SZ*1(%r15),$B
1243e1051a39Sopenharmony_ci	add	$SZ*2(%r15),$C
1244e1051a39Sopenharmony_ci	add	$SZ*3(%r15),$D
1245e1051a39Sopenharmony_ci	add	$SZ*4(%r15),$E
1246e1051a39Sopenharmony_ci	add	$SZ*5(%r15),$F
1247e1051a39Sopenharmony_ci	add	$SZ*6(%r15),$G
1248e1051a39Sopenharmony_ci	lea	(%rsi,%r13),%r12
1249e1051a39Sopenharmony_ci	add	$SZ*7(%r15),$H
1250e1051a39Sopenharmony_ci
1251e1051a39Sopenharmony_ci	cmp	$_end,%r13
1252e1051a39Sopenharmony_ci
1253e1051a39Sopenharmony_ci	mov	$A,$SZ*0(%r15)
1254e1051a39Sopenharmony_ci	cmove	%rsp,%r12		# next block or stale data
1255e1051a39Sopenharmony_ci	mov	$B,$SZ*1(%r15)
1256e1051a39Sopenharmony_ci	mov	$C,$SZ*2(%r15)
1257e1051a39Sopenharmony_ci	mov	$D,$SZ*3(%r15)
1258e1051a39Sopenharmony_ci	mov	$E,$SZ*4(%r15)
1259e1051a39Sopenharmony_ci	mov	$F,$SZ*5(%r15)
1260e1051a39Sopenharmony_ci	mov	$G,$SZ*6(%r15)
1261e1051a39Sopenharmony_ci	mov	$H,$SZ*7(%r15)
1262e1051a39Sopenharmony_ci
1263e1051a39Sopenharmony_ci	jbe	.Loop_avx2
1264e1051a39Sopenharmony_ci	lea	(%rsp),$Tbl
1265e1051a39Sopenharmony_ci# temporarily use $Tbl as index to $_rsp
1266e1051a39Sopenharmony_ci# this avoids the need to save a secondary frame pointer at -8(%rsp)
1267e1051a39Sopenharmony_ci.cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
1268e1051a39Sopenharmony_ci
1269e1051a39Sopenharmony_ci.Ldone_avx2:
1270e1051a39Sopenharmony_ci	mov	16*$SZ+4*8($Tbl),$ivp
1271e1051a39Sopenharmony_ci	mov	16*$SZ+7*8($Tbl),%rsi
1272e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi,8
1273e1051a39Sopenharmony_ci	vmovdqu	$iv,($ivp)		# output IV
1274e1051a39Sopenharmony_ci	vzeroall
1275e1051a39Sopenharmony_ci___
1276e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1277e1051a39Sopenharmony_ci	movaps	`$framesz+16*0`($Tbl),%xmm6
1278e1051a39Sopenharmony_ci	movaps	`$framesz+16*1`($Tbl),%xmm7
1279e1051a39Sopenharmony_ci	movaps	`$framesz+16*2`($Tbl),%xmm8
1280e1051a39Sopenharmony_ci	movaps	`$framesz+16*3`($Tbl),%xmm9
1281e1051a39Sopenharmony_ci	movaps	`$framesz+16*4`($Tbl),%xmm10
1282e1051a39Sopenharmony_ci	movaps	`$framesz+16*5`($Tbl),%xmm11
1283e1051a39Sopenharmony_ci	movaps	`$framesz+16*6`($Tbl),%xmm12
1284e1051a39Sopenharmony_ci	movaps	`$framesz+16*7`($Tbl),%xmm13
1285e1051a39Sopenharmony_ci	movaps	`$framesz+16*8`($Tbl),%xmm14
1286e1051a39Sopenharmony_ci	movaps	`$framesz+16*9`($Tbl),%xmm15
1287e1051a39Sopenharmony_ci___
1288e1051a39Sopenharmony_ci$code.=<<___;
1289e1051a39Sopenharmony_ci	mov	-48(%rsi),%r15
1290e1051a39Sopenharmony_ci.cfi_restore	%r15
1291e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
1292e1051a39Sopenharmony_ci.cfi_restore	%r14
1293e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
1294e1051a39Sopenharmony_ci.cfi_restore	%r13
1295e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
1296e1051a39Sopenharmony_ci.cfi_restore	%r12
1297e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
1298e1051a39Sopenharmony_ci.cfi_restore	%rbp
1299e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
1300e1051a39Sopenharmony_ci.cfi_restore	%rbx
1301e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
1302e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1303e1051a39Sopenharmony_ci.Lepilogue_avx2:
1304e1051a39Sopenharmony_ci	ret
1305e1051a39Sopenharmony_ci.cfi_endproc
1306e1051a39Sopenharmony_ci.size	${func}_avx2,.-${func}_avx2
1307e1051a39Sopenharmony_ci___
1308e1051a39Sopenharmony_ci}}
1309e1051a39Sopenharmony_ci}}
1310e1051a39Sopenharmony_ci{{
# Register allocation for the SHA-extension ("shaext") code path emitted
# further down in this scope.
#
# General-purpose registers used for the function arguments. $inp is not
# passed in a register: the shaext prologue loads it from the stack
# ("load 7th argument") into %r10.
1311e1051a39Sopenharmony_cimy ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1312e1051a39Sopenharmony_ci
# $rounds receives the 32-bit value read from 240($key); $Tbl is the table
# pointer (the shaext branch re-declares its own $Tbl as %rax).
1313e1051a39Sopenharmony_cimy ($rounds,$Tbl)=("%r11d","%rbx");
1314e1051a39Sopenharmony_ci
# AES state: running IV/ciphertext block, current input block, round key 0,
# and the pair of registers that alternately hold the next round key.
1315e1051a39Sopenharmony_cimy ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1316e1051a39Sopenharmony_cimy @rndkey=("%xmm4","%xmm5");
# Generator-time counters used by $aesenc: $r counts the AES steps emitted
# so far, $sn numbers the .Laesenclast<N> labels.
1317e1051a39Sopenharmony_cimy $r=0;
1318e1051a39Sopenharmony_cimy $sn=0;
1319e1051a39Sopenharmony_ci
# SHA-256 state: %xmm0..3 and %xmm7..9 for the working values and their
# saved copies, %xmm10..13 for the four message-schedule registers.
1320e1051a39Sopenharmony_cimy ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1321e1051a39Sopenharmony_cimy @MSG=map("%xmm$_",(10..13));
1322e1051a39Sopenharmony_ci
# Emit one step of the AES-CBC encryption stream that is interleaved with
# the SHA-256 rounds.  Steps are counted in $r, ten per 16-byte block:
#
#   step 0     - load the next input block and xor it with $rndkey0, store
#                the previously finished ciphertext block (skipped for the
#                first block), fold the input into $iv, then fetch the
#                next round key and issue the first aesenc;
#   steps 1..8 - fetch the next round key and issue one aesenc;
#   step 9     - compare $rounds with 11 at run time to issue zero, two or
#                four further aesenc instructions, finish with aesenclast
#                and pre-load the key at 16-112($key) for the next block.
#
# Key offsets carry a -112 bias because the shaext prologue advances $key
# by 112.  Every call increments $r and swaps the two entries of @rndkey,
# so the key fetched by one step is consumed by the following one.
my $aesenc=sub {
    my $blk  = int($r/10);	# index of the 16-byte block being encrypted
    my $step = $r%10;		# step number within that block

    if ($step==9) {
	# Final step: key-length dependent tail plus aesenclast.
	$sn++;
	$code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($step+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($step+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($step+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($step+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
	if ($step==0) {
	    # Block start: bring in the plaintext and chain it into $iv.
	    $code.=<<___;
	movups		`16*$blk`($in0),$in		# load input
	xorps		$rndkey0,$in
___
	    # The previous block's ciphertext is written only once there is one.
	    $code.=<<___ if ($blk);
	movups		$iv,`16*($blk-1)`($out,$in0)	# write output
___
	    $code.=<<___;
	xorps		$in,$iv
___
	}
	# Steps 0..8 all end the same way: fetch the next key, run one round.
	$code.=<<___;
	movups		`32+16*$step-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }

    $r++;
    unshift(@rndkey,pop(@rndkey));
};
1366e1051a39Sopenharmony_ci
# Emit ${func}_shaext: AES-CBC encryption stitched with SHA-256 computed by
# the sha256rnds2/sha256msg1/sha256msg2 instructions.  The generated loop
# consumes 64 bytes per iteration (four 16-byte AES blocks from $in0 and one
# SHA-256 block from $inp) and decrements $len once per iteration.
# AES steps are woven in by calling $aesenc between the heredoc fragments;
# the order of those calls relative to the fragments defines the schedule.
1367e1051a39Sopenharmony_ciif ($shaext) {
# Shadows the outer $Tbl for this branch only.
1368e1051a39Sopenharmony_cimy $Tbl="%rax";
1369e1051a39Sopenharmony_ci
1370e1051a39Sopenharmony_ci$code.=<<___;
1371e1051a39Sopenharmony_ci.type	${func}_shaext,\@function,6
1372e1051a39Sopenharmony_ci.align	32
1373e1051a39Sopenharmony_ci${func}_shaext:
1374e1051a39Sopenharmony_ci.cfi_startproc
1375e1051a39Sopenharmony_ci	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1376e1051a39Sopenharmony_ci___
# Win64 only: reserve 8+10*16 bytes and save %xmm6-%xmm15 there.
# NOTE(review): the stores are addressed through %rax, which this block
# never sets; presumably the perlasm translator's Win64 prologue leaves the
# entry %rsp in %rax - confirm against x86_64-xlate.pl.
1377e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1378e1051a39Sopenharmony_ci	lea	`-8-10*16`(%rsp),%rsp
1379e1051a39Sopenharmony_ci	movaps	%xmm6,-8-10*16(%rax)
1380e1051a39Sopenharmony_ci	movaps	%xmm7,-8-9*16(%rax)
1381e1051a39Sopenharmony_ci	movaps	%xmm8,-8-8*16(%rax)
1382e1051a39Sopenharmony_ci	movaps	%xmm9,-8-7*16(%rax)
1383e1051a39Sopenharmony_ci	movaps	%xmm10,-8-6*16(%rax)
1384e1051a39Sopenharmony_ci	movaps	%xmm11,-8-5*16(%rax)
1385e1051a39Sopenharmony_ci	movaps	%xmm12,-8-4*16(%rax)
1386e1051a39Sopenharmony_ci	movaps	%xmm13,-8-3*16(%rax)
1387e1051a39Sopenharmony_ci	movaps	%xmm14,-8-2*16(%rax)
1388e1051a39Sopenharmony_ci	movaps	%xmm15,-8-1*16(%rax)
1389e1051a39Sopenharmony_ci.Lprologue_shaext:
1390e1051a39Sopenharmony_ci___
# Set-up (load hash state, byte-swap mask, IV and first round keys, bias
# $key by 112, turn $out into an offset from $in0) followed by the top of
# the main loop up to the first AES step.
1391e1051a39Sopenharmony_ci$code.=<<___;
1392e1051a39Sopenharmony_ci	lea		K256+0x80(%rip),$Tbl
1393e1051a39Sopenharmony_ci	movdqu		($ctx),$ABEF		# DCBA
1394e1051a39Sopenharmony_ci	movdqu		16($ctx),$CDGH		# HGFE
1395e1051a39Sopenharmony_ci	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1396e1051a39Sopenharmony_ci
1397e1051a39Sopenharmony_ci	mov		240($key),$rounds
1398e1051a39Sopenharmony_ci	sub		$in0,$out
1399e1051a39Sopenharmony_ci	movups		($key),$rndkey0		# $key[0]
1400e1051a39Sopenharmony_ci	movups		($ivp),$iv		# load IV
1401e1051a39Sopenharmony_ci	movups		16($key),$rndkey[0]	# forward reference
1402e1051a39Sopenharmony_ci	lea		112($key),$key		# size optimization
1403e1051a39Sopenharmony_ci
1404e1051a39Sopenharmony_ci	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1405e1051a39Sopenharmony_ci	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1406e1051a39Sopenharmony_ci	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1407e1051a39Sopenharmony_ci	movdqa		$TMP,$BSWAP		# offload
1408e1051a39Sopenharmony_ci	palignr		\$8,$CDGH,$ABEF		# ABEF
1409e1051a39Sopenharmony_ci	punpcklqdq	$Wi,$CDGH		# CDGH
1410e1051a39Sopenharmony_ci
1411e1051a39Sopenharmony_ci	jmp	.Loop_shaext
1412e1051a39Sopenharmony_ci
1413e1051a39Sopenharmony_ci.align	16
1414e1051a39Sopenharmony_ci.Loop_shaext:
1415e1051a39Sopenharmony_ci	movdqu		($inp),@MSG[0]
1416e1051a39Sopenharmony_ci	movdqu		0x10($inp),@MSG[1]
1417e1051a39Sopenharmony_ci	movdqu		0x20($inp),@MSG[2]
1418e1051a39Sopenharmony_ci	pshufb		$TMP,@MSG[0]
1419e1051a39Sopenharmony_ci	movdqu		0x30($inp),@MSG[3]
1420e1051a39Sopenharmony_ci
1421e1051a39Sopenharmony_ci	movdqa		0*32-0x80($Tbl),$Wi
1422e1051a39Sopenharmony_ci	paddd		@MSG[0],$Wi
1423e1051a39Sopenharmony_ci	pshufb		$TMP,@MSG[1]
1424e1051a39Sopenharmony_ci	movdqa		$CDGH,$CDGH_SAVE	# offload
1425e1051a39Sopenharmony_ci	movdqa		$ABEF,$ABEF_SAVE	# offload
1426e1051a39Sopenharmony_ci___
# SHA-256 rounds 0..15 are written out explicitly; one AES step is emitted
# ahead of every sha256rnds2.
1427e1051a39Sopenharmony_ci	&$aesenc();
1428e1051a39Sopenharmony_ci$code.=<<___;
1429e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 0-3
1430e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1431e1051a39Sopenharmony_ci___
1432e1051a39Sopenharmony_ci	&$aesenc();
1433e1051a39Sopenharmony_ci$code.=<<___;
1434e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1435e1051a39Sopenharmony_ci
1436e1051a39Sopenharmony_ci	movdqa		1*32-0x80($Tbl),$Wi
1437e1051a39Sopenharmony_ci	paddd		@MSG[1],$Wi
1438e1051a39Sopenharmony_ci	pshufb		$TMP,@MSG[2]
1439e1051a39Sopenharmony_ci	lea		0x40($inp),$inp
1440e1051a39Sopenharmony_ci___
1441e1051a39Sopenharmony_ci	&$aesenc();
1442e1051a39Sopenharmony_ci$code.=<<___;
1443e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 4-7
1444e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1445e1051a39Sopenharmony_ci___
1446e1051a39Sopenharmony_ci	&$aesenc();
1447e1051a39Sopenharmony_ci$code.=<<___;
1448e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1449e1051a39Sopenharmony_ci
1450e1051a39Sopenharmony_ci	movdqa		2*32-0x80($Tbl),$Wi
1451e1051a39Sopenharmony_ci	paddd		@MSG[2],$Wi
1452e1051a39Sopenharmony_ci	pshufb		$TMP,@MSG[3]
1453e1051a39Sopenharmony_ci	sha256msg1	@MSG[1],@MSG[0]
1454e1051a39Sopenharmony_ci___
1455e1051a39Sopenharmony_ci	&$aesenc();
1456e1051a39Sopenharmony_ci$code.=<<___;
1457e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 8-11
1458e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1459e1051a39Sopenharmony_ci	movdqa		@MSG[3],$TMP
1460e1051a39Sopenharmony_ci	palignr		\$4,@MSG[2],$TMP
1461e1051a39Sopenharmony_ci	paddd		$TMP,@MSG[0]
1462e1051a39Sopenharmony_ci___
1463e1051a39Sopenharmony_ci	&$aesenc();
1464e1051a39Sopenharmony_ci$code.=<<___;
1465e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1466e1051a39Sopenharmony_ci
1467e1051a39Sopenharmony_ci	movdqa		3*32-0x80($Tbl),$Wi
1468e1051a39Sopenharmony_ci	paddd		@MSG[3],$Wi
1469e1051a39Sopenharmony_ci	sha256msg2	@MSG[3],@MSG[0]
1470e1051a39Sopenharmony_ci	sha256msg1	@MSG[2],@MSG[1]
1471e1051a39Sopenharmony_ci___
1472e1051a39Sopenharmony_ci	&$aesenc();
1473e1051a39Sopenharmony_ci$code.=<<___;
1474e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 12-15
1475e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1476e1051a39Sopenharmony_ci___
1477e1051a39Sopenharmony_ci	&$aesenc();
1478e1051a39Sopenharmony_ci$code.=<<___;
1479e1051a39Sopenharmony_ci	movdqa		@MSG[0],$TMP
1480e1051a39Sopenharmony_ci	palignr		\$4,@MSG[3],$TMP
1481e1051a39Sopenharmony_ci	paddd		$TMP,@MSG[1]
1482e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1483e1051a39Sopenharmony_ci___
# Rounds 16..51: nine generator iterations ($i = 4..12), four SHA-256 rounds
# each.  @MSG is rotated at the end of every iteration so the same register
# roles apply to the next one.
1484e1051a39Sopenharmony_cifor($i=4;$i<16-3;$i++) {
# When the step counter sits on a block boundary, emit the block-opening
# AES step before this group of SHA rounds.
1485e1051a39Sopenharmony_ci	&$aesenc()	if (($r%10)==0);
1486e1051a39Sopenharmony_ci$code.=<<___;
1487e1051a39Sopenharmony_ci	movdqa		$i*32-0x80($Tbl),$Wi
1488e1051a39Sopenharmony_ci	paddd		@MSG[0],$Wi
1489e1051a39Sopenharmony_ci	sha256msg2	@MSG[0],@MSG[1]
1490e1051a39Sopenharmony_ci	sha256msg1	@MSG[3],@MSG[2]
1491e1051a39Sopenharmony_ci___
1492e1051a39Sopenharmony_ci	&$aesenc();
1493e1051a39Sopenharmony_ci$code.=<<___;
1494e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 16-19...
1495e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1496e1051a39Sopenharmony_ci	movdqa		@MSG[1],$TMP
1497e1051a39Sopenharmony_ci	palignr		\$4,@MSG[0],$TMP
1498e1051a39Sopenharmony_ci	paddd		$TMP,@MSG[2]
1499e1051a39Sopenharmony_ci___
1500e1051a39Sopenharmony_ci	&$aesenc();
# One extra AES step is emitted when the counter has just reached 19, i.e.
# the closing step of the second AES block follows immediately.
1501e1051a39Sopenharmony_ci	&$aesenc()	if ($r==19);
1502e1051a39Sopenharmony_ci$code.=<<___;
1503e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1504e1051a39Sopenharmony_ci___
1505e1051a39Sopenharmony_ci	push(@MSG,shift(@MSG));
1506e1051a39Sopenharmony_ci}
# Rounds 52..63 are again written out explicitly.
1507e1051a39Sopenharmony_ci$code.=<<___;
1508e1051a39Sopenharmony_ci	movdqa		13*32-0x80($Tbl),$Wi
1509e1051a39Sopenharmony_ci	paddd		@MSG[0],$Wi
1510e1051a39Sopenharmony_ci	sha256msg2	@MSG[0],@MSG[1]
1511e1051a39Sopenharmony_ci	sha256msg1	@MSG[3],@MSG[2]
1512e1051a39Sopenharmony_ci___
1513e1051a39Sopenharmony_ci	&$aesenc();
1514e1051a39Sopenharmony_ci$code.=<<___;
1515e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 52-55
1516e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1517e1051a39Sopenharmony_ci	movdqa		@MSG[1],$TMP
1518e1051a39Sopenharmony_ci	palignr		\$4,@MSG[0],$TMP
1519e1051a39Sopenharmony_ci	paddd		$TMP,@MSG[2]
1520e1051a39Sopenharmony_ci___
1521e1051a39Sopenharmony_ci	&$aesenc();
1522e1051a39Sopenharmony_ci	&$aesenc();
1523e1051a39Sopenharmony_ci$code.=<<___;
1524e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1525e1051a39Sopenharmony_ci
1526e1051a39Sopenharmony_ci	movdqa		14*32-0x80($Tbl),$Wi
1527e1051a39Sopenharmony_ci	paddd		@MSG[1],$Wi
1528e1051a39Sopenharmony_ci	sha256msg2	@MSG[1],@MSG[2]
1529e1051a39Sopenharmony_ci	movdqa		$BSWAP,$TMP
1530e1051a39Sopenharmony_ci___
1531e1051a39Sopenharmony_ci	&$aesenc();
1532e1051a39Sopenharmony_ci$code.=<<___;
1533e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 56-59
1534e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1535e1051a39Sopenharmony_ci___
1536e1051a39Sopenharmony_ci	&$aesenc();
1537e1051a39Sopenharmony_ci$code.=<<___;
1538e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1539e1051a39Sopenharmony_ci
1540e1051a39Sopenharmony_ci	movdqa		15*32-0x80($Tbl),$Wi
1541e1051a39Sopenharmony_ci	paddd		@MSG[2],$Wi
1542e1051a39Sopenharmony_ci___
1543e1051a39Sopenharmony_ci	&$aesenc();
1544e1051a39Sopenharmony_ci	&$aesenc();
1545e1051a39Sopenharmony_ci$code.=<<___;
1546e1051a39Sopenharmony_ci	sha256rnds2	$ABEF,$CDGH		# 60-63
1547e1051a39Sopenharmony_ci	pshufd		\$0x0e,$Wi,$Wi
1548e1051a39Sopenharmony_ci___
1549e1051a39Sopenharmony_ci	&$aesenc();
1550e1051a39Sopenharmony_ci$code.=<<___;
1551e1051a39Sopenharmony_ci	sha256rnds2	$CDGH,$ABEF
1552e1051a39Sopenharmony_ci	#pxor		$CDGH,$rndkey0		# black magic
1553e1051a39Sopenharmony_ci___
# Emit whatever AES steps are still outstanding so that the loop body
# contains 40 of them in total (4 blocks x 10 steps).
1554e1051a39Sopenharmony_ci	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
# Loop tail (add the saved hash state, store the fourth ciphertext block,
# advance $in0, loop on $len) and function exit (shuffle the state back to
# its memory layout, store IV and hash).
1555e1051a39Sopenharmony_ci$code.=<<___;
1556e1051a39Sopenharmony_ci	#xorps		$CDGH,$rndkey0		# black magic
1557e1051a39Sopenharmony_ci	paddd		$CDGH_SAVE,$CDGH
1558e1051a39Sopenharmony_ci	paddd		$ABEF_SAVE,$ABEF
1559e1051a39Sopenharmony_ci
1560e1051a39Sopenharmony_ci	dec		$len
1561e1051a39Sopenharmony_ci	movups		$iv,48($out,$in0)	# write output
1562e1051a39Sopenharmony_ci	lea		64($in0),$in0
1563e1051a39Sopenharmony_ci	jnz		.Loop_shaext
1564e1051a39Sopenharmony_ci
1565e1051a39Sopenharmony_ci	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1566e1051a39Sopenharmony_ci	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1567e1051a39Sopenharmony_ci	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1568e1051a39Sopenharmony_ci	punpckhqdq	$CDGH,$ABEF		# DCBA
1569e1051a39Sopenharmony_ci	palignr		\$8,$TMP,$CDGH		# HGFE
1570e1051a39Sopenharmony_ci
1571e1051a39Sopenharmony_ci	movups		$iv,($ivp)		# write IV
1572e1051a39Sopenharmony_ci	movdqu		$ABEF,($ctx)
1573e1051a39Sopenharmony_ci	movdqu		$CDGH,16($ctx)
1574e1051a39Sopenharmony_ci___
# Win64 only: reload %xmm6-%xmm15 from the save area and release it.
1575e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1576e1051a39Sopenharmony_ci	movaps	0*16(%rsp),%xmm6
1577e1051a39Sopenharmony_ci	movaps	1*16(%rsp),%xmm7
1578e1051a39Sopenharmony_ci	movaps	2*16(%rsp),%xmm8
1579e1051a39Sopenharmony_ci	movaps	3*16(%rsp),%xmm9
1580e1051a39Sopenharmony_ci	movaps	4*16(%rsp),%xmm10
1581e1051a39Sopenharmony_ci	movaps	5*16(%rsp),%xmm11
1582e1051a39Sopenharmony_ci	movaps	6*16(%rsp),%xmm12
1583e1051a39Sopenharmony_ci	movaps	7*16(%rsp),%xmm13
1584e1051a39Sopenharmony_ci	movaps	8*16(%rsp),%xmm14
1585e1051a39Sopenharmony_ci	movaps	9*16(%rsp),%xmm15
1586e1051a39Sopenharmony_ci	lea	8+10*16(%rsp),%rsp
1587e1051a39Sopenharmony_ci.Lepilogue_shaext:
1588e1051a39Sopenharmony_ci___
1589e1051a39Sopenharmony_ci$code.=<<___;
1590e1051a39Sopenharmony_ci	ret
1591e1051a39Sopenharmony_ci.cfi_endproc
1592e1051a39Sopenharmony_ci.size	${func}_shaext,.-${func}_shaext
1593e1051a39Sopenharmony_ci___
1594e1051a39Sopenharmony_ci}
1595e1051a39Sopenharmony_ci}}}}}
1596e1051a39Sopenharmony_ci
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception support.  Emits se_handler, a language-specific
# handler shared by all code paths, plus the .pdata/.xdata records that bind
# it to each function.  HandlerData[] of every record holds that function's
# prologue and epilogue labels.  The handler:
#   - leaves the frame alone when context->Rip lies before the prologue
#     label or at/after the epilogue label;
#   - for the shaext function, copies the ten saved XMM registers from the
#     stack into the context and steps over the 168-byte save area;
#   - otherwise (for the AVX2 function after re-deriving the frame base)
#     reloads the callee-saved GPRs from below the saved stack pointer and
#     the XMM registers from the frame's save area;
#   - finally restores Rsp/Rsi/Rdi in the context, copies the context to
#     disp->ContextRecord, calls RtlVirtualUnwind and returns
#     ExceptionContinueSearch.
if ($win64 && $avx) {
# Registers carrying the four handler arguments; only $context and $disp
# are referenced by the generated code.
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";

# Handler entry: save registers, then test Rip against the prologue and
# epilogue labels supplied through HandlerData[].
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
# shaext frame: only the XMM save area sits on the stack.  The function label
# is spelled with ${func}, like every other label in this block, so that it
# always matches the symbol actually emitted for the shaext function.
$code.=<<___ if ($shaext);
	lea	${func}_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
# AVX2 frame: recompute the frame base from the current stack pointer.
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
# Common frame restore, unwind call and handler exit, followed by the
# .pdata entries for the XOP and AVX functions.
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
# .xdata: one unwind-info record per function, each naming se_handler and
# that function's prologue/epilogue labels.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}
1768e1051a39Sopenharmony_ci
1769e1051a39Sopenharmony_ci####################################################################
# rex(\@opcode, $dst, $src)
#
# Prepend a REX prefix byte to the opcode byte list when either register
# number is 8 or above.
#   \@opcode - reference to the array of opcode bytes, modified in place
#   $dst     - number of the register encoded in the ModR/M reg field;
#              8 or above sets bit 0x04 (REX.R)
#   $src     - number of the register encoded in the ModR/M r/m field;
#              8 or above sets bit 0x01 (REX.B)
# The array is left untouched when neither bit is needed.
sub rex {
  my ($opcode,$dst,$src)=@_;	# lexical array ref instead of a glob alias
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @$opcode,$rex|0x40	if($rex);
}
1779e1051a39Sopenharmony_ci
{
  # Third opcode byte (following the 0x0f,0x38 escape) of each SHA-256
  # extension instruction that sha256op38() can hand-encode.
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  # sha256op38($instr, $args)
  #
  # Turn a register-to-register SHA-256 extension instruction into a
  # ".byte" directive listing its encoding, so the output does not depend
  # on the assembler knowing these mnemonics.
  #   $instr - mnemonic as matched in the generated code
  #   $args  - rest of the line; expected form "%xmmS,%xmmD"
  # Returns the ".byte\t..." string (byte values in decimal).  Anything
  # that is not a known mnemonic with two %xmm operands is returned
  # unchanged as "$instr\t$args".
  sub sha256op38 {
    my ($instr,$args) = @_;

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      # Copy the captures right away so later code cannot clobber them.
      my ($src,$dst)=($1,$2);
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$args;
    }
  }
}
1800e1051a39Sopenharmony_ci
# Post-process the accumulated assembly text and write it out.
# First replace every `...` span with the result of evaluating its contents
# as a Perl expression (this resolves the arithmetic left in the heredocs).
1801e1051a39Sopenharmony_ci$code =~ s/\`([^\`]*)\`/eval $1/gem;
# Then pass each line starting with a sha256* mnemonic through sha256op38(),
# which returns either a .byte encoding or the line unchanged.
1802e1051a39Sopenharmony_ci$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1803e1051a39Sopenharmony_ciprint $code;
# Closing STDOUT explicitly surfaces buffered write errors.
1804e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
1805