#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA1 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
# Skylake	(8.70	+5.00=13.7)/n	3.64	4.20		+145%
# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 8.00+4.44=12.4;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 30% to 100% (on Haswell);

# ---------------------------------------------------------------------
# Script set-up: parse the command line, locate the perlasm translator,
# probe the assembler for AVX support and redirect STDOUT into the
# translator.
# ---------------------------------------------------------------------

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 is selected by flavour (nasm/masm/mingw64) or by an ".asm" output
# name; $win64 gates the %xmm6-%xmm15 save/restore sequences emitted below.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of $0 (with trailing separator); the translator is looked
# for next to this script first, then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# x86_64-support.pl supplies pointer_size() and pointer_register().
push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

# $avx becomes 0, 1 or 2 (sum of two version comparisons) depending on
# the first tool-chain probe that matches.  A non-zero value enables the
# _avx_shortcut dispatch in sha1_multi_block.
$avx=0;

# GNU assembler, queried through the compiler driver named by $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# NASM (Win64 only).
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64 (Win64 only).
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang / LLVM-based compiler drivers.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the
# translator, which writes the final assembly to $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha1_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
# Register map for the SSE code path.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";
@ptr=map("%r$_",(8..11));	# one input pointer per lane
$Tbl="%rbp";			# points at K_XX_XX
# Stride of one inp[] element: the block count is read at offset
# $ptr_size inside each element (see the lane set-up loop).
$inp_elm_size=2*$ptr_size;

@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
@Xi=map("%xmm$_",(10..14));
$K="%xmm15";

# The block below is unconditionally active and overrides the
# allocation made just above.
if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]
    @Xi=map("%xmm$_",(0..4));
    ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
    @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
}

$REG_SZ=16;	# bytes per SIMD register in this code path

# Xi_off(INDEX)
#
# Returns the memory operand (as a string) of the stack slot that holds
# message-schedule word INDEX.  The index is reduced modulo 16 and scaled
# by the global $REG_SZ; byte offsets below 256 are expressed relative to
# %rax (biased by -128), the remaining ones relative to %rbx.  The
# arithmetic in the returned string is left for the assembler-side
# expression evaluation.
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

# BODY_00_19($i, $a, $b, $c, $d, $e)
#
# Appends to $code the SSE instructions for SHA-1 round $i (0..19, the
# Ch(b,c,d) rounds) with $a..$e naming the registers that currently hold
# the working variables.  For $i<15 the round is interleaved with loading
# and byte-swapping message words from the four input streams in @ptr;
# from $i==15 on it is interleaved with the message-schedule update
# ("Xupdate") instead.  On return @Xi has been rotated left by one, so
# @Xi[0] names the word for the next round.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;

# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
# of 4 words you would expect to be loaded per given iteration one is
# spilled to next iteration. In other words indices in four input
# streams are distributed as follows:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
	movd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	 movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
$code.=<<___ if ($i<14);			# just load input
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	 movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	 movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
# Last load round: no further look-ahead loads, prefetch the next
# block of every stream instead.
$code.=<<___ if ($i==14);			# just load input
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	 prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	 prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	 prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
$code.=<<___ if ($i>=15);			# apply Xupdate
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	 pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	 movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	 psrld	\$31,$tx
	 paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));	# rotate the schedule register window
}

# BODY_20_39($i, $a, $b, $c, $d, $e)
#
# Appends to $code the SSE instructions for one Parity(b,c,d) round of
# SHA-1; the driver uses it for rounds 20..39 and again for 60..79.
# For $i<79 the round is interleaved with the message-schedule update of
# @Xi[1]; the store of @Xi[0] back to its stack slot is emitted only for
# $i<72.  Round 79 emits the bare round function.  On return @Xi has been
# rotated left by one.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___ if ($i<79);
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
$code.=<<___ if ($i<72);
	movdqa	@Xi[0],`&Xi_off($i)`
___
$code.=<<___ if ($i<79);
	paddd	@Xi[0],$e			# e+=X[i]
	 pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	 movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	 paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
# Final round: nothing left to schedule.
$code.=<<___ if ($i==79);
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));	# rotate the schedule register window
}

# BODY_40_59($i, $a, $b, $c, $d, $e)
#
# Appends to $code the SSE instructions for SHA-1 round $i of the
# Maj(b,c,d) group (the driver calls it for rounds 40..59), interleaved
# with the message-schedule update of @Xi[1] and the store of @Xi[0] to
# its stack slot.  On return @Xi has been rotated left by one.
#
# The trailing comment on the "por $tx,@Xi[1]" line names @Xi[1]; it is
# interpolated into the generated text, so it must reference an array
# that actually exists.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	 movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	 paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));	# rotate the schedule register window
}

# ---------------------------------------------------------------------
# sha1_multi_block: entry point and prologue of the SSE code path.
# Dispatches to _shaext_shortcut when bit 61 of the capability word read
# from OPENSSL_ia32cap_P+4 is set and, if the assembler supports it
# ($avx non-zero), to _avx_shortcut when bit 28 is set.
# ---------------------------------------------------------------------
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Each push is followed by the CFI annotation for the register that was
# actually pushed: %rbx lands at -8(%rax) and %rbp at -16(%rax), which
# is where the epilogue reloads them from.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64: %xmm6-%xmm15 are saved below the two pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame: 18 register-sized slots aligned to 256 bytes.  Slot 16 holds
# the per-lane block counters (addressed through %rbx); slot 17 holds
# the original %rsp and, at +8, the saved $num.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per-lane set-up, emitted once for each of the four lanes: load the
# input pointer and block count from inp[$i], track the maximum count in
# $num, record the count in the counter slot at (%rbx), and point lanes
# with a non-positive count at $Tbl so that they read harmless data.
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Nothing to do if every lane is empty; otherwise load the five state
# vectors, the byte-swap mask and the first round constant.
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___
# 80 fully unrolled rounds.  @V is rotated right after every round so
# that the roles a,b,c,d,e move across the five state registers; $K is
# reloaded at each 20-round boundary.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# End of one block: decrement the per-lane counters of the lanes that
# were active (counter > 0), redirect exhausted lanes to $Tbl, mask the
# freshly computed state with the "lane was active" mask before adding
# it to the saved context, and loop while $num is non-zero.  The outer
# .Loop_grande iteration then advances $ctx and $inp and repeats for the
# original $num.
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
# Win64: reload %xmm6-%xmm15 from the slots written by the prologue.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Restore the two pushed registers relative to the original stack
# pointer held in %rax, then switch back to it.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_multi_block,.-sha1_multi_block
___
532e1051a39Sopenharmony_ci						{{{
533e1051a39Sopenharmony_cimy ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
534e1051a39Sopenharmony_cimy @MSG0=map("%xmm$_",(4..7));
535e1051a39Sopenharmony_cimy @MSG1=map("%xmm$_",(11..14));
536e1051a39Sopenharmony_ci
537e1051a39Sopenharmony_ci$code.=<<___;
538e1051a39Sopenharmony_ci.type	sha1_multi_block_shaext,\@function,3
539e1051a39Sopenharmony_ci.align	32
540e1051a39Sopenharmony_cisha1_multi_block_shaext:
541e1051a39Sopenharmony_ci.cfi_startproc
542e1051a39Sopenharmony_ci_shaext_shortcut:
543e1051a39Sopenharmony_ci	mov	%rsp,%rax
544e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
545e1051a39Sopenharmony_ci	push	%rbx
546e1051a39Sopenharmony_ci.cfi_push	%rbx
547e1051a39Sopenharmony_ci	push	%rbp
548e1051a39Sopenharmony_ci.cfi_push	%rbp
549e1051a39Sopenharmony_ci___
550e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
551e1051a39Sopenharmony_ci	lea	-0xa8(%rsp),%rsp
552e1051a39Sopenharmony_ci	movaps	%xmm6,(%rsp)
553e1051a39Sopenharmony_ci	movaps	%xmm7,0x10(%rsp)
554e1051a39Sopenharmony_ci	movaps	%xmm8,0x20(%rsp)
555e1051a39Sopenharmony_ci	movaps	%xmm9,0x30(%rsp)
556e1051a39Sopenharmony_ci	movaps	%xmm10,-0x78(%rax)
557e1051a39Sopenharmony_ci	movaps	%xmm11,-0x68(%rax)
558e1051a39Sopenharmony_ci	movaps	%xmm12,-0x58(%rax)
559e1051a39Sopenharmony_ci	movaps	%xmm13,-0x48(%rax)
560e1051a39Sopenharmony_ci	movaps	%xmm14,-0x38(%rax)
561e1051a39Sopenharmony_ci	movaps	%xmm15,-0x28(%rax)
562e1051a39Sopenharmony_ci___
563e1051a39Sopenharmony_ci$code.=<<___;
564e1051a39Sopenharmony_ci	sub	\$`$REG_SZ*18`,%rsp
565e1051a39Sopenharmony_ci	shl	\$1,$num			# we process pair at a time
566e1051a39Sopenharmony_ci	and	\$-256,%rsp
567e1051a39Sopenharmony_ci	lea	0x40($ctx),$ctx			# size optimization
568e1051a39Sopenharmony_ci	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
569e1051a39Sopenharmony_ci.Lbody_shaext:
570e1051a39Sopenharmony_ci	lea	`$REG_SZ*16`(%rsp),%rbx
571e1051a39Sopenharmony_ci	movdqa	K_XX_XX+0x80(%rip),$BSWAP	# byte-n-word swap
572e1051a39Sopenharmony_ci
573e1051a39Sopenharmony_ci.Loop_grande_shaext:
574e1051a39Sopenharmony_ci	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
575e1051a39Sopenharmony_ci	xor	$num,$num
576e1051a39Sopenharmony_ci___
577e1051a39Sopenharmony_cifor($i=0;$i<2;$i++) {
578e1051a39Sopenharmony_ci    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
579e1051a39Sopenharmony_ci    $code.=<<___;
580e1051a39Sopenharmony_ci	# input pointer
581e1051a39Sopenharmony_ci	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
582e1051a39Sopenharmony_ci	# number of blocks
583e1051a39Sopenharmony_ci	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
584e1051a39Sopenharmony_ci	cmp	$num,%ecx
585e1051a39Sopenharmony_ci	cmovg	%ecx,$num			# find maximum
586e1051a39Sopenharmony_ci	test	%ecx,%ecx
587e1051a39Sopenharmony_ci	mov	%ecx,`4*$i`(%rbx)		# initialize counters
588e1051a39Sopenharmony_ci	cmovle	%rsp,@ptr[$i]			# cancel input
589e1051a39Sopenharmony_ci___
590e1051a39Sopenharmony_ci}
591e1051a39Sopenharmony_ci$code.=<<___;
592e1051a39Sopenharmony_ci	test	$num,$num
593e1051a39Sopenharmony_ci	jz	.Ldone_shaext
594e1051a39Sopenharmony_ci
595e1051a39Sopenharmony_ci	movq		0x00-0x40($ctx),$ABCD0	# a1.a0
596e1051a39Sopenharmony_ci	movq		0x20-0x40($ctx),@MSG0[0]# b1.b0
597e1051a39Sopenharmony_ci	movq		0x40-0x40($ctx),@MSG0[1]# c1.c0
598e1051a39Sopenharmony_ci	movq		0x60-0x40($ctx),@MSG0[2]# d1.d0
599e1051a39Sopenharmony_ci	movq		0x80-0x40($ctx),@MSG0[3]# e1.e0
600e1051a39Sopenharmony_ci
601e1051a39Sopenharmony_ci	punpckldq	@MSG0[0],$ABCD0		# b1.a1.b0.a0
602e1051a39Sopenharmony_ci	punpckldq	@MSG0[2],@MSG0[1]	# d1.c1.d0.c0
603e1051a39Sopenharmony_ci
604e1051a39Sopenharmony_ci	movdqa		$ABCD0,$ABCD1
605e1051a39Sopenharmony_ci	punpcklqdq	@MSG0[1],$ABCD0		# d0.c0.b0.a0
606e1051a39Sopenharmony_ci	punpckhqdq	@MSG0[1],$ABCD1		# d1.c1.b1.a1
607e1051a39Sopenharmony_ci
608e1051a39Sopenharmony_ci	pshufd		\$0b00111111,@MSG0[3],$E0
609e1051a39Sopenharmony_ci	pshufd		\$0b01111111,@MSG0[3],$E1
610e1051a39Sopenharmony_ci	pshufd		\$0b00011011,$ABCD0,$ABCD0
611e1051a39Sopenharmony_ci	pshufd		\$0b00011011,$ABCD1,$ABCD1
612e1051a39Sopenharmony_ci	jmp		.Loop_shaext
613e1051a39Sopenharmony_ci
614e1051a39Sopenharmony_ci.align	32
615e1051a39Sopenharmony_ci.Loop_shaext:
616e1051a39Sopenharmony_ci	movdqu		0x00(@ptr[0]),@MSG0[0]
617e1051a39Sopenharmony_ci	 movdqu		0x00(@ptr[1]),@MSG1[0]
618e1051a39Sopenharmony_ci	movdqu		0x10(@ptr[0]),@MSG0[1]
619e1051a39Sopenharmony_ci	 movdqu		0x10(@ptr[1]),@MSG1[1]
620e1051a39Sopenharmony_ci	movdqu		0x20(@ptr[0]),@MSG0[2]
621e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG0[0]
622e1051a39Sopenharmony_ci	 movdqu		0x20(@ptr[1]),@MSG1[2]
623e1051a39Sopenharmony_ci	 pshufb		$BSWAP,@MSG1[0]
624e1051a39Sopenharmony_ci	movdqu		0x30(@ptr[0]),@MSG0[3]
625e1051a39Sopenharmony_ci	lea		0x40(@ptr[0]),@ptr[0]
626e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG0[1]
627e1051a39Sopenharmony_ci	 movdqu		0x30(@ptr[1]),@MSG1[3]
628e1051a39Sopenharmony_ci	 lea		0x40(@ptr[1]),@ptr[1]
629e1051a39Sopenharmony_ci	 pshufb		$BSWAP,@MSG1[1]
630e1051a39Sopenharmony_ci
631e1051a39Sopenharmony_ci	movdqa		$E0,0x50(%rsp)		# offload
632e1051a39Sopenharmony_ci	paddd		@MSG0[0],$E0
633e1051a39Sopenharmony_ci	 movdqa		$E1,0x70(%rsp)
634e1051a39Sopenharmony_ci	 paddd		@MSG1[0],$E1
635e1051a39Sopenharmony_ci	movdqa		$ABCD0,0x40(%rsp)	# offload
636e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0_
637e1051a39Sopenharmony_ci	 movdqa		$ABCD1,0x60(%rsp)
638e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1_
639e1051a39Sopenharmony_ci	sha1rnds4	\$0,$E0,$ABCD0		# 0-3
640e1051a39Sopenharmony_ci	sha1nexte	@MSG0[1],$E0_
641e1051a39Sopenharmony_ci	 sha1rnds4	\$0,$E1,$ABCD1		# 0-3
642e1051a39Sopenharmony_ci	 sha1nexte	@MSG1[1],$E1_
643e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG0[2]
644e1051a39Sopenharmony_ci	prefetcht0	127(@ptr[0])
645e1051a39Sopenharmony_ci	sha1msg1	@MSG0[1],@MSG0[0]
646e1051a39Sopenharmony_ci	 pshufb		$BSWAP,@MSG1[2]
647e1051a39Sopenharmony_ci	 prefetcht0	127(@ptr[1])
648e1051a39Sopenharmony_ci	 sha1msg1	@MSG1[1],@MSG1[0]
649e1051a39Sopenharmony_ci
650e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG0[3]
651e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0
652e1051a39Sopenharmony_ci	 pshufb		$BSWAP,@MSG1[3]
653e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1
654e1051a39Sopenharmony_ci	sha1rnds4	\$0,$E0_,$ABCD0		# 4-7
655e1051a39Sopenharmony_ci	sha1nexte	@MSG0[2],$E0
656e1051a39Sopenharmony_ci	 sha1rnds4	\$0,$E1_,$ABCD1		# 4-7
657e1051a39Sopenharmony_ci	 sha1nexte	@MSG1[2],$E1
658e1051a39Sopenharmony_ci	pxor		@MSG0[2],@MSG0[0]
659e1051a39Sopenharmony_ci	sha1msg1	@MSG0[2],@MSG0[1]
660e1051a39Sopenharmony_ci	 pxor		@MSG1[2],@MSG1[0]
661e1051a39Sopenharmony_ci	 sha1msg1	@MSG1[2],@MSG1[1]
662e1051a39Sopenharmony_ci___
663e1051a39Sopenharmony_cifor($i=2;$i<20-4;$i++) {
664e1051a39Sopenharmony_ci$code.=<<___;
665e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0_
666e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1_
667e1051a39Sopenharmony_ci	sha1rnds4	\$`int($i/5)`,$E0,$ABCD0	# 8-11
668e1051a39Sopenharmony_ci	sha1nexte	@MSG0[3],$E0_
669e1051a39Sopenharmony_ci	 sha1rnds4	\$`int($i/5)`,$E1,$ABCD1	# 8-11
670e1051a39Sopenharmony_ci	 sha1nexte	@MSG1[3],$E1_
671e1051a39Sopenharmony_ci	sha1msg2	@MSG0[3],@MSG0[0]
672e1051a39Sopenharmony_ci	 sha1msg2	@MSG1[3],@MSG1[0]
673e1051a39Sopenharmony_ci	pxor		@MSG0[3],@MSG0[1]
674e1051a39Sopenharmony_ci	sha1msg1	@MSG0[3],@MSG0[2]
675e1051a39Sopenharmony_ci	 pxor		@MSG1[3],@MSG1[1]
676e1051a39Sopenharmony_ci	 sha1msg1	@MSG1[3],@MSG1[2]
677e1051a39Sopenharmony_ci___
678e1051a39Sopenharmony_ci	($E0,$E0_)=($E0_,$E0);		($E1,$E1_)=($E1_,$E1);
679e1051a39Sopenharmony_ci	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
680e1051a39Sopenharmony_ci}
681e1051a39Sopenharmony_ci$code.=<<___;
682e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0_
683e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1_
684e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E0,$ABCD0		# 64-67
685e1051a39Sopenharmony_ci	sha1nexte	@MSG0[3],$E0_
686e1051a39Sopenharmony_ci	 sha1rnds4	\$3,$E1,$ABCD1		# 64-67
687e1051a39Sopenharmony_ci	 sha1nexte	@MSG1[3],$E1_
688e1051a39Sopenharmony_ci	sha1msg2	@MSG0[3],@MSG0[0]
689e1051a39Sopenharmony_ci	 sha1msg2	@MSG1[3],@MSG1[0]
690e1051a39Sopenharmony_ci	pxor		@MSG0[3],@MSG0[1]
691e1051a39Sopenharmony_ci	 pxor		@MSG1[3],@MSG1[1]
692e1051a39Sopenharmony_ci
693e1051a39Sopenharmony_ci	mov		\$1,%ecx
694e1051a39Sopenharmony_ci	pxor		@MSG0[2],@MSG0[2]	# zero
695e1051a39Sopenharmony_ci	cmp		4*0(%rbx),%ecx		# examine counters
696e1051a39Sopenharmony_ci	cmovge		%rsp,@ptr[0]		# cancel input
697e1051a39Sopenharmony_ci
698e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0
699e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1
700e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E0_,$ABCD0		# 68-71
701e1051a39Sopenharmony_ci	sha1nexte	@MSG0[0],$E0
702e1051a39Sopenharmony_ci	 sha1rnds4	\$3,$E1_,$ABCD1		# 68-71
703e1051a39Sopenharmony_ci	 sha1nexte	@MSG1[0],$E1
704e1051a39Sopenharmony_ci	sha1msg2	@MSG0[0],@MSG0[1]
705e1051a39Sopenharmony_ci	 sha1msg2	@MSG1[0],@MSG1[1]
706e1051a39Sopenharmony_ci
707e1051a39Sopenharmony_ci	cmp		4*1(%rbx),%ecx
708e1051a39Sopenharmony_ci	cmovge		%rsp,@ptr[1]
709e1051a39Sopenharmony_ci	movq		(%rbx),@MSG0[0]		# pull counters
710e1051a39Sopenharmony_ci
711e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0_
712e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1_
713e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E0,$ABCD0		# 72-75
714e1051a39Sopenharmony_ci	sha1nexte	@MSG0[1],$E0_
715e1051a39Sopenharmony_ci	 sha1rnds4	\$3,$E1,$ABCD1		# 72-75
716e1051a39Sopenharmony_ci	 sha1nexte	@MSG1[1],$E1_
717e1051a39Sopenharmony_ci
718e1051a39Sopenharmony_ci	pshufd		\$0x00,@MSG0[0],@MSG1[2]
719e1051a39Sopenharmony_ci	pshufd		\$0x55,@MSG0[0],@MSG1[3]
720e1051a39Sopenharmony_ci	movdqa		@MSG0[0],@MSG0[1]
721e1051a39Sopenharmony_ci	pcmpgtd		@MSG0[2],@MSG1[2]
722e1051a39Sopenharmony_ci	pcmpgtd		@MSG0[2],@MSG1[3]
723e1051a39Sopenharmony_ci
724e1051a39Sopenharmony_ci	movdqa		$ABCD0,$E0
725e1051a39Sopenharmony_ci	 movdqa		$ABCD1,$E1
726e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E0_,$ABCD0		# 76-79
727e1051a39Sopenharmony_ci	sha1nexte	$MSG0[2],$E0
728e1051a39Sopenharmony_ci	 sha1rnds4	\$3,$E1_,$ABCD1		# 76-79
729e1051a39Sopenharmony_ci	 sha1nexte	$MSG0[2],$E1
730e1051a39Sopenharmony_ci
731e1051a39Sopenharmony_ci	pcmpgtd		@MSG0[2],@MSG0[1]	# counter mask
732e1051a39Sopenharmony_ci	pand		@MSG1[2],$ABCD0
733e1051a39Sopenharmony_ci	pand		@MSG1[2],$E0
734e1051a39Sopenharmony_ci	 pand		@MSG1[3],$ABCD1
735e1051a39Sopenharmony_ci	 pand		@MSG1[3],$E1
736e1051a39Sopenharmony_ci	paddd		@MSG0[1],@MSG0[0]	# counters--
737e1051a39Sopenharmony_ci
738e1051a39Sopenharmony_ci	paddd		0x40(%rsp),$ABCD0
739e1051a39Sopenharmony_ci	paddd		0x50(%rsp),$E0
740e1051a39Sopenharmony_ci	 paddd		0x60(%rsp),$ABCD1
741e1051a39Sopenharmony_ci	 paddd		0x70(%rsp),$E1
742e1051a39Sopenharmony_ci
743e1051a39Sopenharmony_ci	movq		@MSG0[0],(%rbx)		# save counters
744e1051a39Sopenharmony_ci	dec		$num
745e1051a39Sopenharmony_ci	jnz		.Loop_shaext
746e1051a39Sopenharmony_ci
747e1051a39Sopenharmony_ci	mov		`$REG_SZ*17+8`(%rsp),$num
748e1051a39Sopenharmony_ci
749e1051a39Sopenharmony_ci	pshufd		\$0b00011011,$ABCD0,$ABCD0
750e1051a39Sopenharmony_ci	pshufd		\$0b00011011,$ABCD1,$ABCD1
751e1051a39Sopenharmony_ci
752e1051a39Sopenharmony_ci	movdqa		$ABCD0,@MSG0[0]
753e1051a39Sopenharmony_ci	punpckldq	$ABCD1,$ABCD0		# b1.b0.a1.a0
754e1051a39Sopenharmony_ci	punpckhdq	$ABCD1,@MSG0[0]		# d1.d0.c1.c0
755e1051a39Sopenharmony_ci	punpckhdq	$E1,$E0			# e1.e0.xx.xx
756e1051a39Sopenharmony_ci	movq		$ABCD0,0x00-0x40($ctx)	# a1.a0
757e1051a39Sopenharmony_ci	psrldq		\$8,$ABCD0
758e1051a39Sopenharmony_ci	movq		@MSG0[0],0x40-0x40($ctx)# c1.c0
759e1051a39Sopenharmony_ci	psrldq		\$8,@MSG0[0]
760e1051a39Sopenharmony_ci	movq		$ABCD0,0x20-0x40($ctx)	# b1.b0
761e1051a39Sopenharmony_ci	psrldq		\$8,$E0
762e1051a39Sopenharmony_ci	movq		@MSG0[0],0x60-0x40($ctx)# d1.d0
763e1051a39Sopenharmony_ci	movq		$E0,0x80-0x40($ctx)	# e1.e0
764e1051a39Sopenharmony_ci
765e1051a39Sopenharmony_ci	lea	`$REG_SZ/2`($ctx),$ctx
766e1051a39Sopenharmony_ci	lea	`$inp_elm_size*2`($inp),$inp
767e1051a39Sopenharmony_ci	dec	$num
768e1051a39Sopenharmony_ci	jnz	.Loop_grande_shaext
769e1051a39Sopenharmony_ci
770e1051a39Sopenharmony_ci.Ldone_shaext:
771e1051a39Sopenharmony_ci	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
772e1051a39Sopenharmony_ci___
773e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
774e1051a39Sopenharmony_ci	movaps	-0xb8(%rax),%xmm6
775e1051a39Sopenharmony_ci	movaps	-0xa8(%rax),%xmm7
776e1051a39Sopenharmony_ci	movaps	-0x98(%rax),%xmm8
777e1051a39Sopenharmony_ci	movaps	-0x88(%rax),%xmm9
778e1051a39Sopenharmony_ci	movaps	-0x78(%rax),%xmm10
779e1051a39Sopenharmony_ci	movaps	-0x68(%rax),%xmm11
780e1051a39Sopenharmony_ci	movaps	-0x58(%rax),%xmm12
781e1051a39Sopenharmony_ci	movaps	-0x48(%rax),%xmm13
782e1051a39Sopenharmony_ci	movaps	-0x38(%rax),%xmm14
783e1051a39Sopenharmony_ci	movaps	-0x28(%rax),%xmm15
784e1051a39Sopenharmony_ci___
785e1051a39Sopenharmony_ci$code.=<<___;
786e1051a39Sopenharmony_ci	mov	-16(%rax),%rbp
787e1051a39Sopenharmony_ci.cfi_restore	%rbp
788e1051a39Sopenharmony_ci	mov	-8(%rax),%rbx
789e1051a39Sopenharmony_ci.cfi_restore	%rbx
790e1051a39Sopenharmony_ci	lea	(%rax),%rsp
791e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
792e1051a39Sopenharmony_ci.Lepilogue_shaext:
793e1051a39Sopenharmony_ci	ret
794e1051a39Sopenharmony_ci.cfi_endproc
795e1051a39Sopenharmony_ci.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
796e1051a39Sopenharmony_ci___
797e1051a39Sopenharmony_ci						}}}
798e1051a39Sopenharmony_ci
799e1051a39Sopenharmony_ci						if ($avx) {{{
# BODY_00_19_avx($i,$a,$b,$c,$d,$e)
#
# Appends to $code the AVX ($REG_SZ==16) or AVX2 ($REG_SZ==32) instructions
# for one SHA-1 round that uses Ch(b,c,d) and the constant held in $K.
#   $i       - round number
#   $a..$e   - register names currently playing the five working variables
# Besides the round itself, the emitted code gathers upcoming message words
# through the @ptr input pointers ($i<15) or computes the message schedule
# update ($i>=15). Before returning, @Xi is rotated by one position.
800e1051a39Sopenharmony_cisub BODY_00_19_avx {
801e1051a39Sopenharmony_cimy ($i,$a,$b,$c,$d,$e)=@_;
# $j and $k are the indices of the message words whose loads are issued
# during this round (one and two positions ahead of $i).
802e1051a39Sopenharmony_cimy $j=$i+1;
803e1051a39Sopenharmony_cimy $k=$i+2;
# Instruction used to merge $t3 into @Xi[1]: dword interleave for 128-bit
# registers, 128-bit lane insert for 256-bit registers.
804e1051a39Sopenharmony_cimy $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
# Pointer whose word is loaded into $t3 with vmovd.
805e1051a39Sopenharmony_cimy $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
806e1051a39Sopenharmony_ci
# Round 0, 128-bit flavour: load word 0 through @ptr[0..3], advance each
# pointer by 16*4 bytes (later loads therefore use offsets relative to the
# advanced pointer), start loading word $j and shuffle word 0 through $tx.
807e1051a39Sopenharmony_ci$code.=<<___ if ($i==0 && $REG_SZ==16);
808e1051a39Sopenharmony_ci	vmovd		(@ptr[0]),@Xi[0]
809e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[0]),@ptr[0]
810e1051a39Sopenharmony_ci	vmovd		(@ptr[1]),@Xi[2]	# borrow Xi[2]
811e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[1]),@ptr[1]
812e1051a39Sopenharmony_ci	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
813e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[2]),@ptr[2]
814e1051a39Sopenharmony_ci	vpinsrd		\$1,(@ptr[3]),@Xi[2],@Xi[2]
815e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[3]),@ptr[3]
816e1051a39Sopenharmony_ci	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
817e1051a39Sopenharmony_ci	vpunpckldq	@Xi[2],@Xi[0],@Xi[0]
818e1051a39Sopenharmony_ci	 vmovd		`4*$j-16*4`($ptr_n),$t3
819e1051a39Sopenharmony_ci	vpshufb		$tx,@Xi[0],@Xi[0]
820e1051a39Sopenharmony_ci___
# 128-bit flavour: finish gathering word $j from @ptr[2] and @ptr[3].
821e1051a39Sopenharmony_ci$code.=<<___ if ($i<15 && $REG_SZ==16);		# just load input
822e1051a39Sopenharmony_ci	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
823e1051a39Sopenharmony_ci	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
824e1051a39Sopenharmony_ci___
# Round 0, 256-bit flavour: same idea for @ptr[0..7]; the two 128-bit
# halves are assembled in @Xi[0] and @Xi[2] and joined with vinserti128.
825e1051a39Sopenharmony_ci$code.=<<___ if ($i==0 && $REG_SZ==32);
826e1051a39Sopenharmony_ci	vmovd		(@ptr[0]),@Xi[0]
827e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[0]),@ptr[0]
828e1051a39Sopenharmony_ci	vmovd		(@ptr[4]),@Xi[2]	# borrow Xi[2]
829e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[4]),@ptr[4]
830e1051a39Sopenharmony_ci	vmovd		(@ptr[1]),$t2
831e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[1]),@ptr[1]
832e1051a39Sopenharmony_ci	vmovd		(@ptr[5]),$t1
833e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[5]),@ptr[5]
834e1051a39Sopenharmony_ci	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
835e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[2]),@ptr[2]
836e1051a39Sopenharmony_ci	vpinsrd		\$1,(@ptr[6]),@Xi[2],@Xi[2]
837e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[6]),@ptr[6]
838e1051a39Sopenharmony_ci	vpinsrd		\$1,(@ptr[3]),$t2,$t2
839e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[3]),@ptr[3]
840e1051a39Sopenharmony_ci	vpunpckldq	$t2,@Xi[0],@Xi[0]
841e1051a39Sopenharmony_ci	vpinsrd		\$1,(@ptr[7]),$t1,$t1
842e1051a39Sopenharmony_ci	 lea		`16*4`(@ptr[7]),@ptr[7]
843e1051a39Sopenharmony_ci	vpunpckldq	$t1,@Xi[2],@Xi[2]
844e1051a39Sopenharmony_ci	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
845e1051a39Sopenharmony_ci	vinserti128	@Xi[2],@Xi[0],@Xi[0]
846e1051a39Sopenharmony_ci	 vmovd		`4*$j-16*4`($ptr_n),$t3
847e1051a39Sopenharmony_ci	vpshufb		$tx,@Xi[0],@Xi[0]
848e1051a39Sopenharmony_ci___
# 256-bit flavour: gather word $j from the remaining six pointers and
# interleave the pieces into @Xi[1] (low half) and $t3 (high half).
849e1051a39Sopenharmony_ci$code.=<<___ if ($i<15 && $REG_SZ==32);		# just load input
850e1051a39Sopenharmony_ci	 vmovd		`4*$j-16*4`(@ptr[1]),$t2
851e1051a39Sopenharmony_ci	 vmovd		`4*$j-16*4`(@ptr[5]),$t1
852e1051a39Sopenharmony_ci	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
853e1051a39Sopenharmony_ci	 vpinsrd	\$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
854e1051a39Sopenharmony_ci	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
855e1051a39Sopenharmony_ci	 vpunpckldq	$t2,@Xi[1],@Xi[1]
856e1051a39Sopenharmony_ci	 vpinsrd	\$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
857e1051a39Sopenharmony_ci	 vpunpckldq	$t1,$t3,$t3
858e1051a39Sopenharmony_ci___
# Rounds 0..13: the round proper. The extra-indented lines belong to the
# input pipeline: pack and shuffle word $j, start loading word $k.
859e1051a39Sopenharmony_ci$code.=<<___ if ($i<14);
860e1051a39Sopenharmony_ci	vpaddd	$K,$e,$e			# e+=K_00_19
861e1051a39Sopenharmony_ci	vpslld	\$5,$a,$t2
862e1051a39Sopenharmony_ci	vpandn	$d,$b,$t1
863e1051a39Sopenharmony_ci	vpand	$c,$b,$t0
864e1051a39Sopenharmony_ci
865e1051a39Sopenharmony_ci	vmovdqa	@Xi[0],`&Xi_off($i)`
866e1051a39Sopenharmony_ci	vpaddd	@Xi[0],$e,$e			# e+=X[i]
867e1051a39Sopenharmony_ci	 $vpack		$t3,@Xi[1],@Xi[1]
868e1051a39Sopenharmony_ci	vpsrld	\$27,$a,$t3
869e1051a39Sopenharmony_ci	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
870e1051a39Sopenharmony_ci	 vmovd		`4*$k-16*4`(@ptr[0]),@Xi[2]
871e1051a39Sopenharmony_ci
872e1051a39Sopenharmony_ci	vpslld	\$30,$b,$t1
873e1051a39Sopenharmony_ci	vpor	$t3,$t2,$t2			# rol(a,5)
874e1051a39Sopenharmony_ci	 vmovd		`4*$k-16*4`($ptr_n),$t3
875e1051a39Sopenharmony_ci	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
876e1051a39Sopenharmony_ci
877e1051a39Sopenharmony_ci	vpsrld	\$2,$b,$b
878e1051a39Sopenharmony_ci	vpaddd	$t2,$e,$e			# e+=rol(a,5)
879e1051a39Sopenharmony_ci	 vpshufb	$tx,@Xi[1],@Xi[1]
880e1051a39Sopenharmony_ci	vpor	$t1,$b,$b			# b=rol(b,30)
881e1051a39Sopenharmony_ci___
# Round 14: same round computation, but the loads of word $k are replaced
# by prefetches through @ptr[0..3].
882e1051a39Sopenharmony_ci$code.=<<___ if ($i==14);
883e1051a39Sopenharmony_ci	vpaddd	$K,$e,$e			# e+=K_00_19
884e1051a39Sopenharmony_ci	 prefetcht0	63(@ptr[0])
885e1051a39Sopenharmony_ci	vpslld	\$5,$a,$t2
886e1051a39Sopenharmony_ci	vpandn	$d,$b,$t1
887e1051a39Sopenharmony_ci	vpand	$c,$b,$t0
888e1051a39Sopenharmony_ci
889e1051a39Sopenharmony_ci	vmovdqa	@Xi[0],`&Xi_off($i)`
890e1051a39Sopenharmony_ci	vpaddd	@Xi[0],$e,$e			# e+=X[i]
891e1051a39Sopenharmony_ci	 $vpack		$t3,@Xi[1],@Xi[1]
892e1051a39Sopenharmony_ci	vpsrld	\$27,$a,$t3
893e1051a39Sopenharmony_ci	 prefetcht0	63(@ptr[1])
894e1051a39Sopenharmony_ci	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
895e1051a39Sopenharmony_ci
896e1051a39Sopenharmony_ci	vpslld	\$30,$b,$t1
897e1051a39Sopenharmony_ci	vpor	$t3,$t2,$t2			# rol(a,5)
898e1051a39Sopenharmony_ci	 prefetcht0	63(@ptr[2])
899e1051a39Sopenharmony_ci	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
900e1051a39Sopenharmony_ci
901e1051a39Sopenharmony_ci	vpsrld	\$2,$b,$b
902e1051a39Sopenharmony_ci	vpaddd	$t2,$e,$e			# e+=rol(a,5)
903e1051a39Sopenharmony_ci	 prefetcht0	63(@ptr[3])
904e1051a39Sopenharmony_ci	 vpshufb	$tx,@Xi[1],@Xi[1]
905e1051a39Sopenharmony_ci	vpor	$t1,$b,$b			# b=rol(b,30)
906e1051a39Sopenharmony_ci___
# Rounds 13 and 14: fetch the saved word at Xi_off($j+2) into @Xi[3] ahead
# of the schedule update that begins at round 15.
907e1051a39Sopenharmony_ci$code.=<<___ if ($i>=13 && $i<15);
908e1051a39Sopenharmony_ci	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
909e1051a39Sopenharmony_ci___
# Rounds 15 and up: no input is loaded any more. The extra-indented lines
# compute the next schedule word in @Xi[1] (xor of three earlier words,
# rotated left by one). $tx is used as scratch for that rotate, so whatever
# it held before is overwritten. In the 256-bit flavour round 15 also
# issues prefetches through @ptr[4..7].
910e1051a39Sopenharmony_ci$code.=<<___ if ($i>=15);			# apply Xupdate
911e1051a39Sopenharmony_ci	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
912e1051a39Sopenharmony_ci	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
913e1051a39Sopenharmony_ci
914e1051a39Sopenharmony_ci	vpaddd	$K,$e,$e			# e+=K_00_19
915e1051a39Sopenharmony_ci	vpslld	\$5,$a,$t2
916e1051a39Sopenharmony_ci	vpandn	$d,$b,$t1
917e1051a39Sopenharmony_ci	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
918e1051a39Sopenharmony_ci	vpand	$c,$b,$t0
919e1051a39Sopenharmony_ci
920e1051a39Sopenharmony_ci	vmovdqa	@Xi[0],`&Xi_off($i)`
921e1051a39Sopenharmony_ci	vpaddd	@Xi[0],$e,$e			# e+=X[i]
922e1051a39Sopenharmony_ci	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
923e1051a39Sopenharmony_ci	vpsrld	\$27,$a,$t3
924e1051a39Sopenharmony_ci	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
925e1051a39Sopenharmony_ci	 vpxor	@Xi[3],@Xi[1],@Xi[1]
926e1051a39Sopenharmony_ci	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
927e1051a39Sopenharmony_ci
928e1051a39Sopenharmony_ci	vpslld	\$30,$b,$t1
929e1051a39Sopenharmony_ci	vpor	$t3,$t2,$t2			# rol(a,5)
930e1051a39Sopenharmony_ci	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
931e1051a39Sopenharmony_ci	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
932e1051a39Sopenharmony_ci	 vpsrld	\$31,@Xi[1],$tx
933e1051a39Sopenharmony_ci	 vpaddd	@Xi[1],@Xi[1],@Xi[1]
934e1051a39Sopenharmony_ci
935e1051a39Sopenharmony_ci	vpsrld	\$2,$b,$b
936e1051a39Sopenharmony_ci	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
937e1051a39Sopenharmony_ci	vpaddd	$t2,$e,$e			# e+=rol(a,5)
938e1051a39Sopenharmony_ci	 vpor	$tx,@Xi[1],@Xi[1]		# rol	\$1,@Xi[1]
939e1051a39Sopenharmony_ci	vpor	$t1,$b,$b			# b=rol(b,30)
940e1051a39Sopenharmony_ci___
# Rotate the register names so that the next call sees this round's
# @Xi[1] as its @Xi[0].
941e1051a39Sopenharmony_cipush(@Xi,shift(@Xi));
942e1051a39Sopenharmony_ci}
943e1051a39Sopenharmony_ci
# BODY_20_39_avx($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for one SHA-1 round that uses
# Parity(b,c,d) = b^c^d and the constant held in $K.
#   $i       - round number (the last round, 79, gets a shortened form)
#   $a..$e   - register names currently playing the five working variables
# For $i<79 the round is interleaved with the computation of the next
# schedule word in @Xi[1]. Before returning, @Xi is rotated by one position.
944e1051a39Sopenharmony_cisub BODY_20_39_avx {
945e1051a39Sopenharmony_cimy ($i,$a,$b,$c,$d,$e)=@_;
# $j only feeds the Xi_off() offsets of the earlier schedule words.
946e1051a39Sopenharmony_cimy $j=$i+1;
947e1051a39Sopenharmony_ci
# Every round but the last: start the schedule update and the round.
948e1051a39Sopenharmony_ci$code.=<<___ if ($i<79);
949e1051a39Sopenharmony_ci	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
950e1051a39Sopenharmony_ci	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
951e1051a39Sopenharmony_ci
952e1051a39Sopenharmony_ci	vpslld	\$5,$a,$t2
953e1051a39Sopenharmony_ci	vpaddd	$K,$e,$e			# e+=K_20_39
954e1051a39Sopenharmony_ci	vpxor	$b,$d,$t0
955e1051a39Sopenharmony_ci___
# The current word is written back to its Xi_off() slot only while $i<72;
# from round 72 on the store is omitted.
956e1051a39Sopenharmony_ci$code.=<<___ if ($i<72);
957e1051a39Sopenharmony_ci	vmovdqa	@Xi[0],`&Xi_off($i)`
958e1051a39Sopenharmony_ci___
# Remainder of the round plus the rest of the schedule update; $tx serves
# as scratch for the rotate-left-by-one of @Xi[1].
959e1051a39Sopenharmony_ci$code.=<<___ if ($i<79);
960e1051a39Sopenharmony_ci	vpaddd	@Xi[0],$e,$e			# e+=X[i]
961e1051a39Sopenharmony_ci	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
962e1051a39Sopenharmony_ci	vpsrld	\$27,$a,$t3
963e1051a39Sopenharmony_ci	vpxor	$c,$t0,$t0			# Parity(b,c,d)
964e1051a39Sopenharmony_ci	 vpxor	@Xi[3],@Xi[1],@Xi[1]
965e1051a39Sopenharmony_ci
966e1051a39Sopenharmony_ci	vpslld	\$30,$b,$t1
967e1051a39Sopenharmony_ci	vpor	$t3,$t2,$t2			# rol(a,5)
968e1051a39Sopenharmony_ci	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
969e1051a39Sopenharmony_ci	 vpsrld	\$31,@Xi[1],$tx
970e1051a39Sopenharmony_ci	 vpaddd	@Xi[1],@Xi[1],@Xi[1]
971e1051a39Sopenharmony_ci
972e1051a39Sopenharmony_ci	vpsrld	\$2,$b,$b
973e1051a39Sopenharmony_ci	vpaddd	$t2,$e,$e			# e+=rol(a,5)
974e1051a39Sopenharmony_ci	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
975e1051a39Sopenharmony_ci	vpor	$t1,$b,$b			# b=rol(b,30)
976e1051a39Sopenharmony_ci___
# Round 79: the round computation alone, with no schedule update and no
# store of the current word.
977e1051a39Sopenharmony_ci$code.=<<___ if ($i==79);
978e1051a39Sopenharmony_ci	vpslld	\$5,$a,$t2
979e1051a39Sopenharmony_ci	vpaddd	$K,$e,$e			# e+=K_20_39
980e1051a39Sopenharmony_ci	vpxor	$b,$d,$t0
981e1051a39Sopenharmony_ci
982e1051a39Sopenharmony_ci	vpsrld	\$27,$a,$t3
983e1051a39Sopenharmony_ci	vpaddd	@Xi[0],$e,$e			# e+=X[i]
984e1051a39Sopenharmony_ci	vpxor	$c,$t0,$t0			# Parity(b,c,d)
985e1051a39Sopenharmony_ci
986e1051a39Sopenharmony_ci	vpslld	\$30,$b,$t1
987e1051a39Sopenharmony_ci	vpor	$t3,$t2,$t2			# rol(a,5)
988e1051a39Sopenharmony_ci	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
989e1051a39Sopenharmony_ci
990e1051a39Sopenharmony_ci	vpsrld	\$2,$b,$b
991e1051a39Sopenharmony_ci	vpaddd	$t2,$e,$e			# e+=rol(a,5)
992e1051a39Sopenharmony_ci	vpor	$t1,$b,$b			# b=rol(b,30)
993e1051a39Sopenharmony_ci___
# Rotate the register names so that the next call sees this round's
# @Xi[1] as its @Xi[0].
994e1051a39Sopenharmony_cipush(@Xi,shift(@Xi));
995e1051a39Sopenharmony_ci}
996e1051a39Sopenharmony_ci
# BODY_40_59_avx($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for one SHA-1 round that uses the
# majority function and the constant held in $K.
#   $i       - round number
#   $a..$e   - register names currently playing the five working variables
# Maj is accumulated into $e in two steps: first (c AND d), then
# ((c XOR d) AND b). The round is interleaved with the computation of the
# next schedule word in @Xi[1]; $tx serves as scratch for its rotate.
# Before returning, @Xi is rotated by one position.
997e1051a39Sopenharmony_cisub BODY_40_59_avx {
998e1051a39Sopenharmony_cimy ($i,$a,$b,$c,$d,$e)=@_;
# $j only feeds the Xi_off() offsets of the earlier schedule words.
999e1051a39Sopenharmony_cimy $j=$i+1;
1000e1051a39Sopenharmony_ci
# The trailing rol(...) comment in the emitted text names @Xi[1], the
# register that is actually rotated (it used to interpolate @X[1], an
# array this sub does not otherwise use).
1001e1051a39Sopenharmony_ci$code.=<<___;
1002e1051a39Sopenharmony_ci	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
1003e1051a39Sopenharmony_ci	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"
1004e1051a39Sopenharmony_ci
1005e1051a39Sopenharmony_ci	vpaddd	$K,$e,$e			# e+=K_40_59
1006e1051a39Sopenharmony_ci	vpslld	\$5,$a,$t2
1007e1051a39Sopenharmony_ci	vpand	$c,$d,$t1
1008e1051a39Sopenharmony_ci	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
1009e1051a39Sopenharmony_ci
1010e1051a39Sopenharmony_ci	vpaddd	$t1,$e,$e
1011e1051a39Sopenharmony_ci	vpsrld	\$27,$a,$t3
1012e1051a39Sopenharmony_ci	vpxor	$c,$d,$t0
1013e1051a39Sopenharmony_ci	 vpxor	@Xi[3],@Xi[1],@Xi[1]
1014e1051a39Sopenharmony_ci
1015e1051a39Sopenharmony_ci	vmovdqu	@Xi[0],`&Xi_off($i)`
1016e1051a39Sopenharmony_ci	vpaddd	@Xi[0],$e,$e			# e+=X[i]
1017e1051a39Sopenharmony_ci	vpor	$t3,$t2,$t2			# rol(a,5)
1018e1051a39Sopenharmony_ci	 vpsrld	\$31,@Xi[1],$tx
1019e1051a39Sopenharmony_ci	vpand	$b,$t0,$t0
1020e1051a39Sopenharmony_ci	 vpaddd	@Xi[1],@Xi[1],@Xi[1]
1021e1051a39Sopenharmony_ci
1022e1051a39Sopenharmony_ci	vpslld	\$30,$b,$t1
1023e1051a39Sopenharmony_ci	vpaddd	$t0,$e,$e			# e+=Maj(b,d,c)
1024e1051a39Sopenharmony_ci
1025e1051a39Sopenharmony_ci	vpsrld	\$2,$b,$b
1026e1051a39Sopenharmony_ci	vpaddd	$t2,$e,$e			# e+=rol(a,5)
1027e1051a39Sopenharmony_ci	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
1028e1051a39Sopenharmony_ci	vpor	$t1,$b,$b			# b=rol(b,30)
1029e1051a39Sopenharmony_ci___
# Rotate the register names so that the next call sees this round's
# @Xi[1] as its @Xi[0].
1030e1051a39Sopenharmony_cipush(@Xi,shift(@Xi));
1031e1051a39Sopenharmony_ci}
1032e1051a39Sopenharmony_ci
# Emit sha1_multi_block_avx: the 128-bit AVX flavour, which processes the
# inputs four at a time (four pointers/counters per pass of the outer
# .Loop_grande_avx loop).
1033e1051a39Sopenharmony_ci$code.=<<___;
1034e1051a39Sopenharmony_ci.type	sha1_multi_block_avx,\@function,3
1035e1051a39Sopenharmony_ci.align	32
1036e1051a39Sopenharmony_cisha1_multi_block_avx:
1037e1051a39Sopenharmony_ci.cfi_startproc
1038e1051a39Sopenharmony_ci_avx_shortcut:
1039e1051a39Sopenharmony_ci___
# When the AVX2 flavour is generated as well, branch to it if $num is at
# least 2 and bit 5 of the upper half of %rcx is set.
# NOTE(review): %rcx is presumably loaded with CPU capability bits by the
# entry point outside this chunk - confirm.
1040e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1);
1041e1051a39Sopenharmony_ci	shr	\$32,%rcx
1042e1051a39Sopenharmony_ci	cmp	\$2,$num
1043e1051a39Sopenharmony_ci	jb	.Lavx
1044e1051a39Sopenharmony_ci	test	\$`1<<5`,%ecx
1045e1051a39Sopenharmony_ci	jnz	_avx2_shortcut
1046e1051a39Sopenharmony_ci	jmp	.Lavx
1047e1051a39Sopenharmony_ci.align	32
1048e1051a39Sopenharmony_ci.Lavx:
1049e1051a39Sopenharmony_ci___
# Prologue: keep the incoming %rsp in %rax and save %rbx/%rbp.
1050e1051a39Sopenharmony_ci$code.=<<___;
1051e1051a39Sopenharmony_ci	mov	%rsp,%rax
1052e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
1053e1051a39Sopenharmony_ci	push	%rbx
1054e1051a39Sopenharmony_ci.cfi_push	%rbx
1055e1051a39Sopenharmony_ci	push	%rbp
1056e1051a39Sopenharmony_ci.cfi_push	%rbp
1057e1051a39Sopenharmony_ci___
# Win64 only: reserve 0xa8 bytes below the two pushes and save xmm6-xmm15
# there (the first four relative to %rsp, the rest relative to %rax).
1058e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1059e1051a39Sopenharmony_ci	lea	-0xa8(%rsp),%rsp
1060e1051a39Sopenharmony_ci	movaps	%xmm6,(%rsp)
1061e1051a39Sopenharmony_ci	movaps	%xmm7,0x10(%rsp)
1062e1051a39Sopenharmony_ci	movaps	%xmm8,0x20(%rsp)
1063e1051a39Sopenharmony_ci	movaps	%xmm9,0x30(%rsp)
1064e1051a39Sopenharmony_ci	movaps	%xmm10,-0x78(%rax)
1065e1051a39Sopenharmony_ci	movaps	%xmm11,-0x68(%rax)
1066e1051a39Sopenharmony_ci	movaps	%xmm12,-0x58(%rax)
1067e1051a39Sopenharmony_ci	movaps	%xmm13,-0x48(%rax)
1068e1051a39Sopenharmony_ci	movaps	%xmm14,-0x38(%rax)
1069e1051a39Sopenharmony_ci	movaps	%xmm15,-0x28(%rax)
1070e1051a39Sopenharmony_ci___
# Allocate $REG_SZ*18 bytes of 256-byte-aligned stack. The slot at
# $REG_SZ*17 keeps the original %rsp, the one 8 bytes above it keeps $num,
# and %rbx points at $REG_SZ*16 where the per-lane block counters live.
1071e1051a39Sopenharmony_ci$code.=<<___;
1072e1051a39Sopenharmony_ci	sub	\$`$REG_SZ*18`, %rsp
1073e1051a39Sopenharmony_ci	and	\$-256,%rsp
1074e1051a39Sopenharmony_ci	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
1075e1051a39Sopenharmony_ci.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
1076e1051a39Sopenharmony_ci.Lbody_avx:
1077e1051a39Sopenharmony_ci	lea	K_XX_XX(%rip),$Tbl
1078e1051a39Sopenharmony_ci	lea	`$REG_SZ*16`(%rsp),%rbx
1079e1051a39Sopenharmony_ci
1080e1051a39Sopenharmony_ci	vzeroupper
1081e1051a39Sopenharmony_ci.Loop_grande_avx:
1082e1051a39Sopenharmony_ci	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
1083e1051a39Sopenharmony_ci	xor	$num,$num
1084e1051a39Sopenharmony_ci___
# For each of the four lanes: load the input pointer and the block count,
# keep the largest count in $num, store the count as that lane's counter
# and, if the count is <= 0, point the lane at $Tbl instead of its input.
# NOTE(review): pointer_register() is defined outside this chunk; it
# presumably selects the register name matching the pointer size for
# $flavour - confirm.
1085e1051a39Sopenharmony_cifor($i=0;$i<4;$i++) {
1086e1051a39Sopenharmony_ci    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1087e1051a39Sopenharmony_ci    $code.=<<___;
1088e1051a39Sopenharmony_ci	# input pointer
1089e1051a39Sopenharmony_ci	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
1090e1051a39Sopenharmony_ci	# number of blocks
1091e1051a39Sopenharmony_ci	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
1092e1051a39Sopenharmony_ci	cmp	$num,%ecx
1093e1051a39Sopenharmony_ci	cmovg	%ecx,$num			# find maximum
1094e1051a39Sopenharmony_ci	test	%ecx,%ecx
1095e1051a39Sopenharmony_ci	mov	%ecx,`4*$i`(%rbx)		# initialize counters
1096e1051a39Sopenharmony_ci	cmovle	$Tbl,@ptr[$i]			# cancel input
1097e1051a39Sopenharmony_ci___
1098e1051a39Sopenharmony_ci}
# Nothing to do when every lane is empty; otherwise load the five state
# vectors from $ctx and the byte-swap mask into $tx, then enter the
# per-block loop.
1099e1051a39Sopenharmony_ci$code.=<<___;
1100e1051a39Sopenharmony_ci	test	$num,$num
1101e1051a39Sopenharmony_ci	jz	.Ldone_avx
1102e1051a39Sopenharmony_ci
1103e1051a39Sopenharmony_ci	vmovdqu	0x00($ctx),$A			# load context
1104e1051a39Sopenharmony_ci	 lea	128(%rsp),%rax
1105e1051a39Sopenharmony_ci	vmovdqu	0x20($ctx),$B
1106e1051a39Sopenharmony_ci	vmovdqu	0x40($ctx),$C
1107e1051a39Sopenharmony_ci	vmovdqu	0x60($ctx),$D
1108e1051a39Sopenharmony_ci	vmovdqu	0x80($ctx),$E
1109e1051a39Sopenharmony_ci	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1110e1051a39Sopenharmony_ci	jmp	.Loop_avx
1111e1051a39Sopenharmony_ci
1112e1051a39Sopenharmony_ci.align	32
1113e1051a39Sopenharmony_ci.Loop_avx:
1114e1051a39Sopenharmony_ci___
# Emit the 80 rounds. $K is reloaded before each group of 20, and @V is
# rotated after every round so the roles a..e move on by one register.
# Rounds 60..79 reuse BODY_20_39_avx with the K_60_79 constant.
1115e1051a39Sopenharmony_ci$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
1116e1051a39Sopenharmony_cifor($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1117e1051a39Sopenharmony_ci$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
1118e1051a39Sopenharmony_cifor(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1119e1051a39Sopenharmony_ci$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
1120e1051a39Sopenharmony_cifor(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1121e1051a39Sopenharmony_ci$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
1122e1051a39Sopenharmony_cifor(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1123e1051a39Sopenharmony_ci$code.=<<___;
1124e1051a39Sopenharmony_ci	mov	\$1,%ecx
1125e1051a39Sopenharmony_ci___
# Lanes whose counter is 1 or less have no further block after this one:
# redirect their pointers to $Tbl.
1126e1051a39Sopenharmony_cifor($i=0;$i<4;$i++) {
1127e1051a39Sopenharmony_ci    $code.=<<___;
1128e1051a39Sopenharmony_ci	cmp	`4*$i`(%rbx),%ecx		# examine counters
1129e1051a39Sopenharmony_ci	cmovge	$Tbl,@ptr[$i]			# cancel input
1130e1051a39Sopenharmony_ci___
1131e1051a39Sopenharmony_ci}
# Build a per-lane mask (all ones where the counter is > 0). Adding the
# mask decrements the live counters; and-ing with it zeroes the new state
# of finished lanes so that the addition below leaves their context
# unchanged. $tx is reloaded because the round bodies used it as scratch.
1132e1051a39Sopenharmony_ci$code.=<<___;
1133e1051a39Sopenharmony_ci	vmovdqu	(%rbx),$t0			# pull counters
1134e1051a39Sopenharmony_ci	vpxor	$t2,$t2,$t2
1135e1051a39Sopenharmony_ci	vmovdqa	$t0,$t1
1136e1051a39Sopenharmony_ci	vpcmpgtd $t2,$t1,$t1			# mask value
1137e1051a39Sopenharmony_ci	vpaddd	$t1,$t0,$t0			# counters--
1138e1051a39Sopenharmony_ci
1139e1051a39Sopenharmony_ci	vpand	$t1,$A,$A
1140e1051a39Sopenharmony_ci	vpand	$t1,$B,$B
1141e1051a39Sopenharmony_ci	vpaddd	0x00($ctx),$A,$A
1142e1051a39Sopenharmony_ci	vpand	$t1,$C,$C
1143e1051a39Sopenharmony_ci	vpaddd	0x20($ctx),$B,$B
1144e1051a39Sopenharmony_ci	vpand	$t1,$D,$D
1145e1051a39Sopenharmony_ci	vpaddd	0x40($ctx),$C,$C
1146e1051a39Sopenharmony_ci	vpand	$t1,$E,$E
1147e1051a39Sopenharmony_ci	vpaddd	0x60($ctx),$D,$D
1148e1051a39Sopenharmony_ci	vpaddd	0x80($ctx),$E,$E
1149e1051a39Sopenharmony_ci	vmovdqu	$A,0x00($ctx)
1150e1051a39Sopenharmony_ci	vmovdqu	$B,0x20($ctx)
1151e1051a39Sopenharmony_ci	vmovdqu	$C,0x40($ctx)
1152e1051a39Sopenharmony_ci	vmovdqu	$D,0x60($ctx)
1153e1051a39Sopenharmony_ci	vmovdqu	$E,0x80($ctx)
1154e1051a39Sopenharmony_ci
1155e1051a39Sopenharmony_ci	vmovdqu	$t0,(%rbx)			# save counters
1156e1051a39Sopenharmony_ci	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1157e1051a39Sopenharmony_ci	dec	$num
1158e1051a39Sopenharmony_ci	jnz	.Loop_avx
1159e1051a39Sopenharmony_ci
1160e1051a39Sopenharmony_ci	mov	`$REG_SZ*17+8`(%rsp),$num
1161e1051a39Sopenharmony_ci	lea	$REG_SZ($ctx),$ctx
1162e1051a39Sopenharmony_ci	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
1163e1051a39Sopenharmony_ci	dec	$num
1164e1051a39Sopenharmony_ci	jnz	.Loop_grande_avx
1165e1051a39Sopenharmony_ci
1166e1051a39Sopenharmony_ci.Ldone_avx:
1167e1051a39Sopenharmony_ci	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1168e1051a39Sopenharmony_ci.cfi_def_cfa	%rax,8
1169e1051a39Sopenharmony_ci	vzeroupper
1170e1051a39Sopenharmony_ci___
# Win64 only: restore xmm6-xmm15 from the area filled in the prologue.
1171e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1172e1051a39Sopenharmony_ci	movaps	-0xb8(%rax),%xmm6
1173e1051a39Sopenharmony_ci	movaps	-0xa8(%rax),%xmm7
1174e1051a39Sopenharmony_ci	movaps	-0x98(%rax),%xmm8
1175e1051a39Sopenharmony_ci	movaps	-0x88(%rax),%xmm9
1176e1051a39Sopenharmony_ci	movaps	-0x78(%rax),%xmm10
1177e1051a39Sopenharmony_ci	movaps	-0x68(%rax),%xmm11
1178e1051a39Sopenharmony_ci	movaps	-0x58(%rax),%xmm12
1179e1051a39Sopenharmony_ci	movaps	-0x48(%rax),%xmm13
1180e1051a39Sopenharmony_ci	movaps	-0x38(%rax),%xmm14
1181e1051a39Sopenharmony_ci	movaps	-0x28(%rax),%xmm15
1182e1051a39Sopenharmony_ci___
# Epilogue: restore %rbp/%rbx from below the original stack pointer held
# in %rax, switch %rsp back to it and return.
1183e1051a39Sopenharmony_ci$code.=<<___;
1184e1051a39Sopenharmony_ci	mov	-16(%rax),%rbp
1185e1051a39Sopenharmony_ci.cfi_restore	%rbp
1186e1051a39Sopenharmony_ci	mov	-8(%rax),%rbx
1187e1051a39Sopenharmony_ci.cfi_restore	%rbx
1188e1051a39Sopenharmony_ci	lea	(%rax),%rsp
1189e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1190e1051a39Sopenharmony_ci.Lepilogue_avx:
1191e1051a39Sopenharmony_ci	ret
1192e1051a39Sopenharmony_ci.cfi_endproc
1193e1051a39Sopenharmony_ci.size	sha1_multi_block_avx,.-sha1_multi_block_avx
1194e1051a39Sopenharmony_ci___
1195e1051a39Sopenharmony_ci
1196e1051a39Sopenharmony_ci						if ($avx>1) {
1197e1051a39Sopenharmony_ci$code =~ s/\`([^\`]*)\`/eval $1/gem;
1198e1051a39Sopenharmony_ci
1199e1051a39Sopenharmony_ci$REG_SZ=32;
1200e1051a39Sopenharmony_ci
1201e1051a39Sopenharmony_ci@ptr=map("%r$_",(12..15,8..11));
1202e1051a39Sopenharmony_ci
1203e1051a39Sopenharmony_ci@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
1204e1051a39Sopenharmony_ci($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
1205e1051a39Sopenharmony_ci@Xi=map("%ymm$_",(10..14));
1206e1051a39Sopenharmony_ci$K="%ymm15";
1207e1051a39Sopenharmony_ci
1208e1051a39Sopenharmony_ci$code.=<<___;
1209e1051a39Sopenharmony_ci.type	sha1_multi_block_avx2,\@function,3
1210e1051a39Sopenharmony_ci.align	32
1211e1051a39Sopenharmony_cisha1_multi_block_avx2:
1212e1051a39Sopenharmony_ci.cfi_startproc
1213e1051a39Sopenharmony_ci_avx2_shortcut:
1214e1051a39Sopenharmony_ci	mov	%rsp,%rax
1215e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
1216e1051a39Sopenharmony_ci	push	%rbx
1217e1051a39Sopenharmony_ci.cfi_push	%rbx
1218e1051a39Sopenharmony_ci	push	%rbp
1219e1051a39Sopenharmony_ci.cfi_push	%rbp
1220e1051a39Sopenharmony_ci	push	%r12
1221e1051a39Sopenharmony_ci.cfi_push	%r12
1222e1051a39Sopenharmony_ci	push	%r13
1223e1051a39Sopenharmony_ci.cfi_push	%r13
1224e1051a39Sopenharmony_ci	push	%r14
1225e1051a39Sopenharmony_ci.cfi_push	%r14
1226e1051a39Sopenharmony_ci	push	%r15
1227e1051a39Sopenharmony_ci.cfi_push	%r15
1228e1051a39Sopenharmony_ci___
1229e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1230e1051a39Sopenharmony_ci	lea	-0xa8(%rsp),%rsp
1231e1051a39Sopenharmony_ci	movaps	%xmm6,(%rsp)
1232e1051a39Sopenharmony_ci	movaps	%xmm7,0x10(%rsp)
1233e1051a39Sopenharmony_ci	movaps	%xmm8,0x20(%rsp)
1234e1051a39Sopenharmony_ci	movaps	%xmm9,0x30(%rsp)
1235e1051a39Sopenharmony_ci	movaps	%xmm10,0x40(%rsp)
1236e1051a39Sopenharmony_ci	movaps	%xmm11,0x50(%rsp)
1237e1051a39Sopenharmony_ci	movaps	%xmm12,-0x78(%rax)
1238e1051a39Sopenharmony_ci	movaps	%xmm13,-0x68(%rax)
1239e1051a39Sopenharmony_ci	movaps	%xmm14,-0x58(%rax)
1240e1051a39Sopenharmony_ci	movaps	%xmm15,-0x48(%rax)
1241e1051a39Sopenharmony_ci___
1242e1051a39Sopenharmony_ci$code.=<<___;
1243e1051a39Sopenharmony_ci	sub	\$`$REG_SZ*18`, %rsp
1244e1051a39Sopenharmony_ci	and	\$-256,%rsp
1245e1051a39Sopenharmony_ci	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
1246e1051a39Sopenharmony_ci.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
1247e1051a39Sopenharmony_ci.Lbody_avx2:
1248e1051a39Sopenharmony_ci	lea	K_XX_XX(%rip),$Tbl
1249e1051a39Sopenharmony_ci	shr	\$1,$num
1250e1051a39Sopenharmony_ci
1251e1051a39Sopenharmony_ci	vzeroupper
1252e1051a39Sopenharmony_ci.Loop_grande_avx2:
1253e1051a39Sopenharmony_ci	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
1254e1051a39Sopenharmony_ci	xor	$num,$num
1255e1051a39Sopenharmony_ci	lea	`$REG_SZ*16`(%rsp),%rbx
1256e1051a39Sopenharmony_ci___
1257e1051a39Sopenharmony_cifor($i=0;$i<8;$i++) {
1258e1051a39Sopenharmony_ci    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1259e1051a39Sopenharmony_ci    $code.=<<___;
1260e1051a39Sopenharmony_ci	# input pointer
1261e1051a39Sopenharmony_ci	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
1262e1051a39Sopenharmony_ci	# number of blocks
1263e1051a39Sopenharmony_ci	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
1264e1051a39Sopenharmony_ci	cmp	$num,%ecx
1265e1051a39Sopenharmony_ci	cmovg	%ecx,$num			# find maximum
1266e1051a39Sopenharmony_ci	test	%ecx,%ecx
1267e1051a39Sopenharmony_ci	mov	%ecx,`4*$i`(%rbx)		# initialize counters
1268e1051a39Sopenharmony_ci	cmovle	$Tbl,@ptr[$i]			# cancel input
1269e1051a39Sopenharmony_ci___
1270e1051a39Sopenharmony_ci}
1271e1051a39Sopenharmony_ci$code.=<<___;
1272e1051a39Sopenharmony_ci	vmovdqu	0x00($ctx),$A			# load context
1273e1051a39Sopenharmony_ci	 lea	128(%rsp),%rax
1274e1051a39Sopenharmony_ci	vmovdqu	0x20($ctx),$B
1275e1051a39Sopenharmony_ci	 lea	256+128(%rsp),%rbx
1276e1051a39Sopenharmony_ci	vmovdqu	0x40($ctx),$C
1277e1051a39Sopenharmony_ci	vmovdqu	0x60($ctx),$D
1278e1051a39Sopenharmony_ci	vmovdqu	0x80($ctx),$E
1279e1051a39Sopenharmony_ci	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1280e1051a39Sopenharmony_ci	jmp	.Loop_avx2
1281e1051a39Sopenharmony_ci
1282e1051a39Sopenharmony_ci.align	32
1283e1051a39Sopenharmony_ci.Loop_avx2:
1284e1051a39Sopenharmony_ci___
1285e1051a39Sopenharmony_ci$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
1286e1051a39Sopenharmony_cifor($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1287e1051a39Sopenharmony_ci$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
1288e1051a39Sopenharmony_cifor(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1289e1051a39Sopenharmony_ci$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
1290e1051a39Sopenharmony_cifor(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1291e1051a39Sopenharmony_ci$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
1292e1051a39Sopenharmony_cifor(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1293e1051a39Sopenharmony_ci$code.=<<___;
1294e1051a39Sopenharmony_ci	mov	\$1,%ecx
1295e1051a39Sopenharmony_ci	lea	`$REG_SZ*16`(%rsp),%rbx
1296e1051a39Sopenharmony_ci___
1297e1051a39Sopenharmony_cifor($i=0;$i<8;$i++) {
1298e1051a39Sopenharmony_ci    $code.=<<___;
1299e1051a39Sopenharmony_ci	cmp	`4*$i`(%rbx),%ecx		# examine counters
1300e1051a39Sopenharmony_ci	cmovge	$Tbl,@ptr[$i]			# cancel input
1301e1051a39Sopenharmony_ci___
1302e1051a39Sopenharmony_ci}
1303e1051a39Sopenharmony_ci$code.=<<___;
1304e1051a39Sopenharmony_ci	vmovdqu	(%rbx),$t0		# pull counters
1305e1051a39Sopenharmony_ci	vpxor	$t2,$t2,$t2
1306e1051a39Sopenharmony_ci	vmovdqa	$t0,$t1
1307e1051a39Sopenharmony_ci	vpcmpgtd $t2,$t1,$t1			# mask value
1308e1051a39Sopenharmony_ci	vpaddd	$t1,$t0,$t0			# counters--
1309e1051a39Sopenharmony_ci
1310e1051a39Sopenharmony_ci	vpand	$t1,$A,$A
1311e1051a39Sopenharmony_ci	vpand	$t1,$B,$B
1312e1051a39Sopenharmony_ci	vpaddd	0x00($ctx),$A,$A
1313e1051a39Sopenharmony_ci	vpand	$t1,$C,$C
1314e1051a39Sopenharmony_ci	vpaddd	0x20($ctx),$B,$B
1315e1051a39Sopenharmony_ci	vpand	$t1,$D,$D
1316e1051a39Sopenharmony_ci	vpaddd	0x40($ctx),$C,$C
1317e1051a39Sopenharmony_ci	vpand	$t1,$E,$E
1318e1051a39Sopenharmony_ci	vpaddd	0x60($ctx),$D,$D
1319e1051a39Sopenharmony_ci	vpaddd	0x80($ctx),$E,$E
1320e1051a39Sopenharmony_ci	vmovdqu	$A,0x00($ctx)
1321e1051a39Sopenharmony_ci	vmovdqu	$B,0x20($ctx)
1322e1051a39Sopenharmony_ci	vmovdqu	$C,0x40($ctx)
1323e1051a39Sopenharmony_ci	vmovdqu	$D,0x60($ctx)
1324e1051a39Sopenharmony_ci	vmovdqu	$E,0x80($ctx)
1325e1051a39Sopenharmony_ci
1326e1051a39Sopenharmony_ci	vmovdqu	$t0,(%rbx)			# save counters
1327e1051a39Sopenharmony_ci	lea	256+128(%rsp),%rbx
1328e1051a39Sopenharmony_ci	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1329e1051a39Sopenharmony_ci	dec	$num
1330e1051a39Sopenharmony_ci	jnz	.Loop_avx2
1331e1051a39Sopenharmony_ci
1332e1051a39Sopenharmony_ci	#mov	`$REG_SZ*17+8`(%rsp),$num
1333e1051a39Sopenharmony_ci	#lea	$REG_SZ($ctx),$ctx
1334e1051a39Sopenharmony_ci	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
1335e1051a39Sopenharmony_ci	#dec	$num
1336e1051a39Sopenharmony_ci	#jnz	.Loop_grande_avx2
1337e1051a39Sopenharmony_ci
1338e1051a39Sopenharmony_ci.Ldone_avx2:
1339e1051a39Sopenharmony_ci	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1340e1051a39Sopenharmony_ci.cfi_def_cfa	%rax,8
1341e1051a39Sopenharmony_ci	vzeroupper
1342e1051a39Sopenharmony_ci___
1343e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1344e1051a39Sopenharmony_ci	movaps	-0xd8(%rax),%xmm6
1345e1051a39Sopenharmony_ci	movaps	-0xc8(%rax),%xmm7
1346e1051a39Sopenharmony_ci	movaps	-0xb8(%rax),%xmm8
1347e1051a39Sopenharmony_ci	movaps	-0xa8(%rax),%xmm9
1348e1051a39Sopenharmony_ci	movaps	-0x98(%rax),%xmm10
1349e1051a39Sopenharmony_ci	movaps	-0x88(%rax),%xmm11
1350e1051a39Sopenharmony_ci	movaps	-0x78(%rax),%xmm12
1351e1051a39Sopenharmony_ci	movaps	-0x68(%rax),%xmm13
1352e1051a39Sopenharmony_ci	movaps	-0x58(%rax),%xmm14
1353e1051a39Sopenharmony_ci	movaps	-0x48(%rax),%xmm15
1354e1051a39Sopenharmony_ci___
1355e1051a39Sopenharmony_ci$code.=<<___;
1356e1051a39Sopenharmony_ci	mov	-48(%rax),%r15
1357e1051a39Sopenharmony_ci.cfi_restore	%r15
1358e1051a39Sopenharmony_ci	mov	-40(%rax),%r14
1359e1051a39Sopenharmony_ci.cfi_restore	%r14
1360e1051a39Sopenharmony_ci	mov	-32(%rax),%r13
1361e1051a39Sopenharmony_ci.cfi_restore	%r13
1362e1051a39Sopenharmony_ci	mov	-24(%rax),%r12
1363e1051a39Sopenharmony_ci.cfi_restore	%r12
1364e1051a39Sopenharmony_ci	mov	-16(%rax),%rbp
1365e1051a39Sopenharmony_ci.cfi_restore	%rbp
1366e1051a39Sopenharmony_ci	mov	-8(%rax),%rbx
1367e1051a39Sopenharmony_ci.cfi_restore	%rbx
1368e1051a39Sopenharmony_ci	lea	(%rax),%rsp
1369e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1370e1051a39Sopenharmony_ci.Lepilogue_avx2:
1371e1051a39Sopenharmony_ci	ret
1372e1051a39Sopenharmony_ci.cfi_endproc
1373e1051a39Sopenharmony_ci.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
1374e1051a39Sopenharmony_ci___
1375e1051a39Sopenharmony_ci						}	}}}
# Constant table shared by the generated code paths.  The label K_XX_XX
# deliberately sits 32 bytes into the table, so the round constants and
# the byte-swap mask are reached with these displacements from the label:
#   -0x20  K_00_19      0x00  K_20_39      0x20  K_40_59
#    0x40  K_60_79      0x60  pbswap mask
# Every 32-bit constant is replicated across two .long lines (8 copies),
# i.e. one full 256-bit vector per constant.
$code.=<<___;

.align	256
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
K_XX_XX:
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
1393e1051a39Sopenharmony_ci
# Win64 only: emit the structured-exception-handling support, i.e. the
# language-specific handlers plus the .pdata/.xdata records that attach
# them to each generated function.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler serves the functions whose frame saves only %rbx/%rbp and
# xmm6-15.  HandlerData[] carries the end-of-prologue and epilogue labels;
# outside that window the frame is not (or no longer) set up, so only
# Rsp/Rsi/Rdi are refreshed before unwinding continues.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler serves sha1_multi_block_avx2, whose frame additionally
# saves %r12-%r15 and keeps the caller's %rsp at offset 32*17 from the
# aligned stack pointer.  It shares se_handler's tail via .Lin_prologue.
#
# The saved stack pointer lives in the faulting function's frame, so it
# has to be fetched relative to context->Rsp (held in %rax at that point),
# exactly as se_handler does with its 16*17 slot.  Indexing the CONTEXT
# record itself would pick up unrelated register-save bytes and send the
# unwinder to a bogus frame.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) RVA triplet per generated function.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
# .xdata: unwind-info records naming the handler and the body/epilogue
# label pair that the handler reads back as HandlerData[0..1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1589e1051a39Sopenharmony_ci####################################################################
1590e1051a39Sopenharmony_ci
# Prepend a REX prefix to an opcode byte list when either register
# operand needs one.
#   $opcode - reference to the array of opcode bytes, modified in place
#   $dst    - number of the register encoded in ModR/M.reg (REX.R if >=8)
#   $src    - number of the register encoded in ModR/M.r/m (REX.B if >=8)
# Nothing is added when both register numbers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @$opcode,$rex|0x40	if ($rex);
}
1600e1051a39Sopenharmony_ci
# Translate the operand string of a "sha1rnds4" instruction.
# For the "\$imm,%xmmS,%xmmD" form the instruction is hand-encoded and
# returned as a ".byte" directive (0F 3A CC /r ib, with a REX prefix when
# either register number is 8 or above).  Any other operand form is
# returned as the plain mnemonic followed by the untouched operands.
sub sha1rnds4 {
    my $args = $_[0];

    if ($args =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures right away; later matches would reset them.
	my ($imm,$src,$dst) = ($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);

	rex(\@opcode,$dst,$src);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# A leading 0 marks a hex/octal literal that needs converting.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }

    return "sha1rnds4\t".$args;
}
1613e1051a39Sopenharmony_ci
# Translate a two-operand SHA1 instruction from the 0F 38 opcode map.
#   $instr - mnemonic: sha1nexte, sha1msg1 or sha1msg2
#   $args  - operand string
# For a known mnemonic with "%xmmS,%xmmD" operands the instruction is
# hand-encoded and returned as a ".byte" directive (REX prefix added when
# either register number is 8 or above).  Anything else is returned as
# "mnemonic<TAB>operands" unchanged.
sub sha1op38 {
    my ($instr,$args) = @_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst) = ($1,$2);
	my @opcode=(0x0f,0x38);

	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }

    return $instr."\t".$args;
}
1631e1051a39Sopenharmony_ci
# Final pass: post-process the accumulated assembly line by line and
# write it to STDOUT.
foreach (split("\n",$code)) {
	# Evaluate `...` arithmetic embedded in the templates.
	s/\`([^\`]*)\`/eval($1)/ge;

	# At most one of the following rewrites applies per line: the chain
	# stops at the first substitution that succeeds.
	# SHA extension mnemonics are hand-encoded by the helpers above.
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or

	# The remaining rules rename %ymmN to %xmmN for instructions that
	# only accept a 128-bit register in that operand position.
	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
1649