1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# sha1_block procedure for x86_64.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# It was brought to my attention that on EM64T compiler-generated code
20e1051a39Sopenharmony_ci# was far behind 32-bit assembler implementation. This is unlike on
21e1051a39Sopenharmony_ci# Opteron where compiler-generated code was only 15% behind 32-bit
22e1051a39Sopenharmony_ci# assembler, which originally made it hard to motivate the effort.
23e1051a39Sopenharmony_ci# There was a suggestion to mechanically translate 32-bit code, but I
24e1051a39Sopenharmony_ci# dismissed it, reasoning that x86_64 offers enough register bank
25e1051a39Sopenharmony_ci# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
26e1051a39Sopenharmony_ci# implementation:-) However! While 64-bit code does perform better
27e1051a39Sopenharmony_ci# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
28e1051a39Sopenharmony_ci# x86_64 does offer larger *addressable* bank, but out-of-order core
29e1051a39Sopenharmony_ci# reaches for even more registers through dynamic aliasing, and EM64T
30e1051a39Sopenharmony_ci# core must have managed to run-time optimize even 32-bit code just as
31e1051a39Sopenharmony_ci# well as the 64-bit one. The performance improvement is summarized in the
32e1051a39Sopenharmony_ci# following table:
33e1051a39Sopenharmony_ci#
34e1051a39Sopenharmony_ci#		gcc 3.4		32-bit asm	cycles/byte
35e1051a39Sopenharmony_ci# Opteron	+45%		+20%		6.8
36e1051a39Sopenharmony_ci# Xeon P4	+65%		+0%		9.9
37e1051a39Sopenharmony_ci# Core2		+60%		+10%		7.0
38e1051a39Sopenharmony_ci
39e1051a39Sopenharmony_ci# August 2009.
40e1051a39Sopenharmony_ci#
41e1051a39Sopenharmony_ci# The code was revised to minimize code size and to maximize
42e1051a39Sopenharmony_ci# "distance" between instructions producing input to 'lea'
43e1051a39Sopenharmony_ci# instruction and the 'lea' instruction itself, which is essential
44e1051a39Sopenharmony_ci# for Intel Atom core.
45e1051a39Sopenharmony_ci
46e1051a39Sopenharmony_ci# October 2010.
47e1051a39Sopenharmony_ci#
48e1051a39Sopenharmony_ci# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
49e1051a39Sopenharmony_ci# is to offload message schedule denoted by Wt in NIST specification,
50e1051a39Sopenharmony_ci# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
51e1051a39Sopenharmony_ci# for background and implementation details. The only difference from
52e1051a39Sopenharmony_ci# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
53e1051a39Sopenharmony_ci# to free temporary registers.
54e1051a39Sopenharmony_ci
55e1051a39Sopenharmony_ci# April 2011.
56e1051a39Sopenharmony_ci#
57e1051a39Sopenharmony_ci# Add AVX code path. See sha1-586.pl for further information.
58e1051a39Sopenharmony_ci
59e1051a39Sopenharmony_ci# May 2013.
60e1051a39Sopenharmony_ci#
61e1051a39Sopenharmony_ci# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
62e1051a39Sopenharmony_ci# and loading pair of consecutive blocks to 256-bit %ymm registers)
63e1051a39Sopenharmony_ci# did not provide impressive performance improvement till a crucial
64e1051a39Sopenharmony_ci# hint regarding the number of Xupdate iterations to pre-compute in
65e1051a39Sopenharmony_ci# advance was provided by Ilya Albrekht of Intel Corp.
66e1051a39Sopenharmony_ci
67e1051a39Sopenharmony_ci# March 2014.
68e1051a39Sopenharmony_ci#
69e1051a39Sopenharmony_ci# Add support for Intel SHA Extensions.
70e1051a39Sopenharmony_ci
71e1051a39Sopenharmony_ci######################################################################
72e1051a39Sopenharmony_ci# Current performance is summarized in the following table. Numbers are
73e1051a39Sopenharmony_ci# CPU clock cycles spent to process a single byte (lower is better).
74e1051a39Sopenharmony_ci#
75e1051a39Sopenharmony_ci#		x86_64		SSSE3		AVX[2]
76e1051a39Sopenharmony_ci# P4		9.05		-
77e1051a39Sopenharmony_ci# Opteron	6.26		-
78e1051a39Sopenharmony_ci# Core2		6.55		6.05/+8%	-
79e1051a39Sopenharmony_ci# Westmere	6.73		5.30/+27%	-
80e1051a39Sopenharmony_ci# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
81e1051a39Sopenharmony_ci# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
82e1051a39Sopenharmony_ci# Haswell	5.45		4.15/+31%	3.57/+53%
83e1051a39Sopenharmony_ci# Skylake	5.18		4.06/+28%	3.54/+46%
84e1051a39Sopenharmony_ci# Bulldozer	9.11		5.95/+53%
85e1051a39Sopenharmony_ci# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
86e1051a39Sopenharmony_ci# VIA Nano	9.32		7.15/+30%
87e1051a39Sopenharmony_ci# Atom		10.3		9.17/+12%
88e1051a39Sopenharmony_ci# Silvermont	13.1(*)		9.37/+40%
89e1051a39Sopenharmony_ci# Knights L	13.2(*)		9.68/+36%	8.30/+59%
90e1051a39Sopenharmony_ci# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
91e1051a39Sopenharmony_ci#
92e1051a39Sopenharmony_ci# (*)	obviously suboptimal result, nothing was done about it,
93e1051a39Sopenharmony_ci#	because SSSE3 code is compiled unconditionally;
94e1051a39Sopenharmony_ci# (**)	SHAEXT result
95e1051a39Sopenharmony_ci
96e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension)
97e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file
# Build-time setup: parse the command line, locate the perlasm translator,
# probe the toolchain to decide which SIMD code paths may be emitted, and
# route everything printed to STDOUT through the translator.
98e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
99e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
100e1051a39Sopenharmony_ci
# Win64 conventions are selected when the flavour names nasm/masm/mingw64
# or when the output file name ends in .asm.
101e1051a39Sopenharmony_ci$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102e1051a39Sopenharmony_ci
# Look for x86_64-xlate.pl in this script's own directory first, then in
# ../../perlasm relative to it; abort if neither file exists.
103e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104e1051a39Sopenharmony_ci( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106e1051a39Sopenharmony_cidie "can't locate x86_64-xlate.pl";
107e1051a39Sopenharmony_ci
# Toolchain probes.  Each probe sets $avx to 0, 1 or 2 (the sum of two
# version comparisons); the later probes only run while $avx is still false.
# Further down in this file a non-zero $avx enables the AVX dispatch test
# and $avx>1 additionally enables the AVX2+BMI dispatch test.
# NOTE(review): versions are compared as decimal numbers, so a string such
# as "2.9" would rank above "2.19" -- confirm this cannot occur in practice.
#
# Probe 1: GNU assembler, queried through $ENV{CC}.
108e1051a39Sopenharmony_ciif (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
109e1051a39Sopenharmony_ci		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
110e1051a39Sopenharmony_ci	$avx = ($1>=2.19) + ($1>=2.22);
111e1051a39Sopenharmony_ci}
112e1051a39Sopenharmony_ci
# Probe 2: NASM, only for Win64 builds whose flavour or $ENV{ASM} names nasm.
113e1051a39Sopenharmony_ciif (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
114e1051a39Sopenharmony_ci	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
115e1051a39Sopenharmony_ci	$avx = ($1>=2.09) + ($1>=2.10);
116e1051a39Sopenharmony_ci}
117e1051a39Sopenharmony_ci
# Probe 3: ml64 (MASM), only for Win64 builds; compares the major version.
118e1051a39Sopenharmony_ciif (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
119e1051a39Sopenharmony_ci	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
120e1051a39Sopenharmony_ci	$avx = ($1>=10) + ($1>=11);
121e1051a39Sopenharmony_ci}
122e1051a39Sopenharmony_ci
# Probe 4: clang/LLVM-based compiler, queried through "$ENV{CC} -v".
123e1051a39Sopenharmony_ciif (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
124e1051a39Sopenharmony_ci	$avx = ($2>=3.0) + ($2>3.0);
125e1051a39Sopenharmony_ci}
126e1051a39Sopenharmony_ci
# When the SHA-extensions path is disabled, $avx is capped at 1, which
# suppresses the AVX2 dispatch test that is guarded by $avx>1.
127e1051a39Sopenharmony_ci$shaext=1;	### set to zero if compiling for 1.0.1
128e1051a39Sopenharmony_ci$avx=1		if (!$shaext && $avx);
129e1051a39Sopenharmony_ci
# Start the translator as a child process reading from a pipe and alias
# STDOUT to that pipe, so anything printed afterwards goes through it.
130e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
131e1051a39Sopenharmony_ci    or die "can't call $xlate: $!";
132e1051a39Sopenharmony_ci*STDOUT=*OUT;
133e1051a39Sopenharmony_ci
# Register names used by the integer-only code path.
#
# The three arguments are first named after the registers they arrive in and
# are then immediately re-pointed at %r8/%r9/%r10; the generated prologue
# copies %rdi/%rsi/%rdx into those registers.
134e1051a39Sopenharmony_ci$ctx="%rdi";	# 1st arg
135e1051a39Sopenharmony_ci$inp="%rsi";	# 2nd arg
136e1051a39Sopenharmony_ci$num="%rdx";	# 3rd arg
137e1051a39Sopenharmony_ci
138e1051a39Sopenharmony_ci# reassign arguments in order to produce more compact code
139e1051a39Sopenharmony_ci$ctx="%r8";
140e1051a39Sopenharmony_ci$inp="%r9";
141e1051a39Sopenharmony_ci$num="%r10";
142e1051a39Sopenharmony_ci
# Scratch registers used inside every round body.
143e1051a39Sopenharmony_ci$t0="%eax";
144e1051a39Sopenharmony_ci$t1="%ebx";
145e1051a39Sopenharmony_ci$t2="%ecx";
# Three-register window for message-schedule words; each round body rotates
# it by one position (push/shift) after emitting its instructions.
146e1051a39Sopenharmony_ci@xi=("%edx","%ebp","%r14d");
# The five 32-bit working variables of the hash state.
147e1051a39Sopenharmony_ci$A="%esi";
148e1051a39Sopenharmony_ci$B="%edi";
149e1051a39Sopenharmony_ci$C="%r11d";
150e1051a39Sopenharmony_ci$D="%r12d";
151e1051a39Sopenharmony_ci$E="%r13d";
152e1051a39Sopenharmony_ci
# @V is rotated by the round loops so that each round body receives the
# registers in their current a,b,c,d,e roles.
153e1051a39Sopenharmony_ci@V=($A,$B,$C,$D,$E);
154e1051a39Sopenharmony_ci
# BODY_00_19($i, $a, $b, $c, $d, $e)
#
# Appends to $code the instructions for one round of the first group
# (the caller invokes it for $i = 0..19).
#   $i          round index
#   $a..$e      names of the registers currently playing the roles a..e
# The round adds rol(a,5) + (((c^d)&b)^d) + 0x5a827999 + X[i] to e and
# rotates b left by 30.  $xi[0] holds the current message word and $xi[1]
# is prepared with the next one.  As a side effect @xi is rotated by one.
155e1051a39Sopenharmony_cisub BODY_00_19 {
156e1051a39Sopenharmony_cimy ($i,$a,$b,$c,$d,$e)=@_;
157e1051a39Sopenharmony_cimy $j=$i+1;
# Very first round only: fetch input word 0 and byte-swap it.
158e1051a39Sopenharmony_ci$code.=<<___ if ($i==0);
159e1051a39Sopenharmony_ci	mov	`4*$i`($inp),$xi[0]
160e1051a39Sopenharmony_ci	bswap	$xi[0]
161e1051a39Sopenharmony_ci___
# Rounds 0..14: the next word ($j) still comes straight from the input
# block; the current word is saved to its stack slot for later schedule use.
162e1051a39Sopenharmony_ci$code.=<<___ if ($i<15);
163e1051a39Sopenharmony_ci	mov	`4*$j`($inp),$xi[1]
164e1051a39Sopenharmony_ci	mov	$d,$t0
165e1051a39Sopenharmony_ci	mov	$xi[0],`4*$i`(%rsp)
166e1051a39Sopenharmony_ci	mov	$a,$t2
167e1051a39Sopenharmony_ci	bswap	$xi[1]
168e1051a39Sopenharmony_ci	xor	$c,$t0
169e1051a39Sopenharmony_ci	rol	\$5,$t2
170e1051a39Sopenharmony_ci	and	$b,$t0
171e1051a39Sopenharmony_ci	lea	0x5a827999($xi[0],$e),$e
172e1051a39Sopenharmony_ci	add	$t2,$e
173e1051a39Sopenharmony_ci	xor	$d,$t0
174e1051a39Sopenharmony_ci	rol	\$30,$b
175e1051a39Sopenharmony_ci	add	$t0,$e
176e1051a39Sopenharmony_ci___
# Rounds 15..19: the next word is derived from the 16-entry stack ring
# buffer: $xi[1] is xor-ed with slots j, j+2 and j+8 (mod 16) and then
# rotated left by one.
177e1051a39Sopenharmony_ci$code.=<<___ if ($i>=15);
178e1051a39Sopenharmony_ci	xor	`4*($j%16)`(%rsp),$xi[1]
179e1051a39Sopenharmony_ci	mov	$d,$t0
180e1051a39Sopenharmony_ci	mov	$xi[0],`4*($i%16)`(%rsp)
181e1051a39Sopenharmony_ci	mov	$a,$t2
182e1051a39Sopenharmony_ci	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
183e1051a39Sopenharmony_ci	xor	$c,$t0
184e1051a39Sopenharmony_ci	rol	\$5,$t2
185e1051a39Sopenharmony_ci	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
186e1051a39Sopenharmony_ci	and	$b,$t0
187e1051a39Sopenharmony_ci	lea	0x5a827999($xi[0],$e),$e
188e1051a39Sopenharmony_ci	rol	\$30,$b
189e1051a39Sopenharmony_ci	xor	$d,$t0
190e1051a39Sopenharmony_ci	add	$t2,$e
191e1051a39Sopenharmony_ci	rol	\$1,$xi[1]
192e1051a39Sopenharmony_ci	add	$t0,$e
193e1051a39Sopenharmony_ci___
# Rotate the window so that next round's $xi[0] is the word just prepared.
194e1051a39Sopenharmony_cipush(@xi,shift(@xi));
195e1051a39Sopenharmony_ci}
196e1051a39Sopenharmony_ci
# BODY_20_39($i, $a, $b, $c, $d, $e)
#
# Appends to $code the instructions for one round whose boolean function is
# b^c^d.  The caller uses it for $i = 20..39 and again for $i = 60..79; the
# additive constant is chosen from $i accordingly.
#   $i          round index
#   $a..$e      names of the registers currently playing the roles a..e
# As a side effect @xi is rotated by one.
197e1051a39Sopenharmony_cisub BODY_20_39 {
198e1051a39Sopenharmony_cimy ($i,$a,$b,$c,$d,$e)=@_;
199e1051a39Sopenharmony_cimy $j=$i+1;
# Constant for rounds below 40, otherwise the constant for the last group.
200e1051a39Sopenharmony_cimy $K=($i<40)?0x6ed9eba1:0xca62c1d6;
# All rounds but the last: also compute the next schedule word in $xi[1].
# The store of the current word to its stack slot is emitted only while
# $i<72 (see the conditional back-tick expression inside the template).
201e1051a39Sopenharmony_ci$code.=<<___ if ($i<79);
202e1051a39Sopenharmony_ci	xor	`4*($j%16)`(%rsp),$xi[1]
203e1051a39Sopenharmony_ci	mov	$b,$t0
204e1051a39Sopenharmony_ci	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
205e1051a39Sopenharmony_ci	mov	$a,$t2
206e1051a39Sopenharmony_ci	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
207e1051a39Sopenharmony_ci	xor	$d,$t0
208e1051a39Sopenharmony_ci	rol	\$5,$t2
209e1051a39Sopenharmony_ci	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
210e1051a39Sopenharmony_ci	lea	$K($xi[0],$e),$e
211e1051a39Sopenharmony_ci	xor	$c,$t0
212e1051a39Sopenharmony_ci	add	$t2,$e
213e1051a39Sopenharmony_ci	rol	\$30,$b
214e1051a39Sopenharmony_ci	add	$t0,$e
215e1051a39Sopenharmony_ci	rol	\$1,$xi[1]
216e1051a39Sopenharmony_ci___
# Round 79: same arithmetic, but no further schedule word is produced.
217e1051a39Sopenharmony_ci$code.=<<___ if ($i==79);
218e1051a39Sopenharmony_ci	mov	$b,$t0
219e1051a39Sopenharmony_ci	mov	$a,$t2
220e1051a39Sopenharmony_ci	xor	$d,$t0
221e1051a39Sopenharmony_ci	lea	$K($xi[0],$e),$e
222e1051a39Sopenharmony_ci	rol	\$5,$t2
223e1051a39Sopenharmony_ci	xor	$c,$t0
224e1051a39Sopenharmony_ci	add	$t2,$e
225e1051a39Sopenharmony_ci	rol	\$30,$b
226e1051a39Sopenharmony_ci	add	$t0,$e
227e1051a39Sopenharmony_ci___
# Rotate the window so that next round's $xi[0] is the word just prepared.
228e1051a39Sopenharmony_cipush(@xi,shift(@xi));
229e1051a39Sopenharmony_ci}
230e1051a39Sopenharmony_ci
# BODY_40_59($i, $a, $b, $c, $d, $e)
#
# Appends to $code the instructions for one round of the third group (the
# caller invokes it for $i = 40..59).
#   $i          round index
#   $a..$e      names of the registers currently playing the roles a..e
# The boolean function is accumulated into e as two separate terms,
# (c&d) via $t0 and ((c^d)&b) via $t1, together with rol(a,5), the constant
# 0x8f1bbcdc and the current message word.  The next schedule word is
# computed in $xi[1] from the stack ring buffer.  As a side effect @xi is
# rotated by one.
231e1051a39Sopenharmony_cisub BODY_40_59 {
232e1051a39Sopenharmony_cimy ($i,$a,$b,$c,$d,$e)=@_;
233e1051a39Sopenharmony_cimy $j=$i+1;
234e1051a39Sopenharmony_ci$code.=<<___;
235e1051a39Sopenharmony_ci	xor	`4*($j%16)`(%rsp),$xi[1]
236e1051a39Sopenharmony_ci	mov	$d,$t0
237e1051a39Sopenharmony_ci	mov	$xi[0],`4*($i%16)`(%rsp)
238e1051a39Sopenharmony_ci	mov	$d,$t1
239e1051a39Sopenharmony_ci	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
240e1051a39Sopenharmony_ci	and	$c,$t0
241e1051a39Sopenharmony_ci	mov	$a,$t2
242e1051a39Sopenharmony_ci	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
243e1051a39Sopenharmony_ci	lea	0x8f1bbcdc($xi[0],$e),$e
244e1051a39Sopenharmony_ci	xor	$c,$t1
245e1051a39Sopenharmony_ci	rol	\$5,$t2
246e1051a39Sopenharmony_ci	add	$t0,$e
247e1051a39Sopenharmony_ci	rol	\$1,$xi[1]
248e1051a39Sopenharmony_ci	and	$b,$t1
249e1051a39Sopenharmony_ci	add	$t2,$e
250e1051a39Sopenharmony_ci	rol	\$30,$b
251e1051a39Sopenharmony_ci	add	$t1,$e
252e1051a39Sopenharmony_ci___
# Rotate the window so that next round's $xi[0] is the word just prepared.
253e1051a39Sopenharmony_cipush(@xi,shift(@xi));
254e1051a39Sopenharmony_ci}
255e1051a39Sopenharmony_ci
# Emit the public entry point sha1_block_data_order.
#
# The function starts with a run-time dispatch on three 32-bit words of
# OPENSSL_ia32cap_P (offsets 0, 4 and 8).  If bit 9 of the second word is
# clear it falls through to the integer-only implementation at .Lialu;
# otherwise it jumps to one of the SIMD/SHA-extension variants.
256e1051a39Sopenharmony_ci$code.=<<___;
257e1051a39Sopenharmony_ci.text
258e1051a39Sopenharmony_ci.extern	OPENSSL_ia32cap_P
259e1051a39Sopenharmony_ci
260e1051a39Sopenharmony_ci.globl	sha1_block_data_order
261e1051a39Sopenharmony_ci.type	sha1_block_data_order,\@function,3
262e1051a39Sopenharmony_ci.align	16
263e1051a39Sopenharmony_cisha1_block_data_order:
264e1051a39Sopenharmony_ci.cfi_startproc
265e1051a39Sopenharmony_ci	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
266e1051a39Sopenharmony_ci	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
267e1051a39Sopenharmony_ci	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
268e1051a39Sopenharmony_ci	test	\$`1<<9`,%r8d		# check SSSE3 bit
269e1051a39Sopenharmony_ci	jz	.Lialu
270e1051a39Sopenharmony_ci___
# Dispatch to the SHA-extensions variant (only emitted when $shaext is set).
271e1051a39Sopenharmony_ci$code.=<<___ if ($shaext);
272e1051a39Sopenharmony_ci	test	\$`1<<29`,%r10d		# check SHA bit
273e1051a39Sopenharmony_ci	jnz	_shaext_shortcut
274e1051a39Sopenharmony_ci___
# Dispatch to the AVX2 variant (only emitted when the assembler probe
# yielded $avx>1); all three feature bits must be set.
275e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1);
276e1051a39Sopenharmony_ci	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
277e1051a39Sopenharmony_ci	cmp	\$`1<<3|1<<5|1<<8`,%r10d
278e1051a39Sopenharmony_ci	je	_avx2_shortcut
279e1051a39Sopenharmony_ci___
# Dispatch to the AVX variant (only emitted when $avx is non-zero); taken
# only when both the AVX bit and the "Intel CPU" bit are set.
280e1051a39Sopenharmony_ci$code.=<<___ if ($avx);
281e1051a39Sopenharmony_ci	and	\$`1<<28`,%r8d		# mask AVX bit
282e1051a39Sopenharmony_ci	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
283e1051a39Sopenharmony_ci	or	%r9d,%r8d
284e1051a39Sopenharmony_ci	cmp	\$`1<<28|1<<30`,%r8d
285e1051a39Sopenharmony_ci	je	_avx_shortcut
286e1051a39Sopenharmony_ci___
# Default SIMD choice is SSSE3.  Then the integer-only path: save %rbx,
# %rbp and %r12-%r14, reserve 8+64 bytes, align the stack to 64 bytes, keep
# the original stack pointer at 64(%rsp), copy the arguments into the
# reassigned registers and load the five state words.
287e1051a39Sopenharmony_ci$code.=<<___;
288e1051a39Sopenharmony_ci	jmp	_ssse3_shortcut
289e1051a39Sopenharmony_ci
290e1051a39Sopenharmony_ci.align	16
291e1051a39Sopenharmony_ci.Lialu:
292e1051a39Sopenharmony_ci	mov	%rsp,%rax
293e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rax
294e1051a39Sopenharmony_ci	push	%rbx
295e1051a39Sopenharmony_ci.cfi_push	%rbx
296e1051a39Sopenharmony_ci	push	%rbp
297e1051a39Sopenharmony_ci.cfi_push	%rbp
298e1051a39Sopenharmony_ci	push	%r12
299e1051a39Sopenharmony_ci.cfi_push	%r12
300e1051a39Sopenharmony_ci	push	%r13
301e1051a39Sopenharmony_ci.cfi_push	%r13
302e1051a39Sopenharmony_ci	push	%r14
303e1051a39Sopenharmony_ci.cfi_push	%r14
304e1051a39Sopenharmony_ci	mov	%rdi,$ctx	# reassigned argument
305e1051a39Sopenharmony_ci	sub	\$`8+16*4`,%rsp
306e1051a39Sopenharmony_ci	mov	%rsi,$inp	# reassigned argument
307e1051a39Sopenharmony_ci	and	\$-64,%rsp
308e1051a39Sopenharmony_ci	mov	%rdx,$num	# reassigned argument
309e1051a39Sopenharmony_ci	mov	%rax,`16*4`(%rsp)
310e1051a39Sopenharmony_ci.cfi_cfa_expression	%rsp+64,deref,+8
311e1051a39Sopenharmony_ci.Lprologue:
312e1051a39Sopenharmony_ci
313e1051a39Sopenharmony_ci	mov	0($ctx),$A
314e1051a39Sopenharmony_ci	mov	4($ctx),$B
315e1051a39Sopenharmony_ci	mov	8($ctx),$C
316e1051a39Sopenharmony_ci	mov	12($ctx),$D
317e1051a39Sopenharmony_ci	mov	16($ctx),$E
318e1051a39Sopenharmony_ci	jmp	.Lloop
319e1051a39Sopenharmony_ci
320e1051a39Sopenharmony_ci.align	16
321e1051a39Sopenharmony_ci.Lloop:
322e1051a39Sopenharmony_ci___
# Fully unrolled 80 rounds.  After every round @V is rotated right by one so
# that the register that just received the new value takes the "a" role.
323e1051a39Sopenharmony_cifor($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
324e1051a39Sopenharmony_cifor(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
325e1051a39Sopenharmony_cifor(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
326e1051a39Sopenharmony_cifor(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Loop tail and epilogue: add the previous state into the working
# variables, store the result, advance the input pointer by 64 bytes and
# repeat until the block counter reaches zero; then restore the saved
# registers and the original stack pointer.
327e1051a39Sopenharmony_ci$code.=<<___;
328e1051a39Sopenharmony_ci	add	0($ctx),$A
329e1051a39Sopenharmony_ci	add	4($ctx),$B
330e1051a39Sopenharmony_ci	add	8($ctx),$C
331e1051a39Sopenharmony_ci	add	12($ctx),$D
332e1051a39Sopenharmony_ci	add	16($ctx),$E
333e1051a39Sopenharmony_ci	mov	$A,0($ctx)
334e1051a39Sopenharmony_ci	mov	$B,4($ctx)
335e1051a39Sopenharmony_ci	mov	$C,8($ctx)
336e1051a39Sopenharmony_ci	mov	$D,12($ctx)
337e1051a39Sopenharmony_ci	mov	$E,16($ctx)
338e1051a39Sopenharmony_ci
339e1051a39Sopenharmony_ci	sub	\$1,$num
340e1051a39Sopenharmony_ci	lea	`16*4`($inp),$inp
341e1051a39Sopenharmony_ci	jnz	.Lloop
342e1051a39Sopenharmony_ci
343e1051a39Sopenharmony_ci	mov	`16*4`(%rsp),%rsi
344e1051a39Sopenharmony_ci.cfi_def_cfa	%rsi,8
345e1051a39Sopenharmony_ci	mov	-40(%rsi),%r14
346e1051a39Sopenharmony_ci.cfi_restore	%r14
347e1051a39Sopenharmony_ci	mov	-32(%rsi),%r13
348e1051a39Sopenharmony_ci.cfi_restore	%r13
349e1051a39Sopenharmony_ci	mov	-24(%rsi),%r12
350e1051a39Sopenharmony_ci.cfi_restore	%r12
351e1051a39Sopenharmony_ci	mov	-16(%rsi),%rbp
352e1051a39Sopenharmony_ci.cfi_restore	%rbp
353e1051a39Sopenharmony_ci	mov	-8(%rsi),%rbx
354e1051a39Sopenharmony_ci.cfi_restore	%rbx
355e1051a39Sopenharmony_ci	lea	(%rsi),%rsp
356e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
357e1051a39Sopenharmony_ci.Lepilogue:
358e1051a39Sopenharmony_ci	ret
359e1051a39Sopenharmony_ci.cfi_endproc
360e1051a39Sopenharmony_ci.size	sha1_block_data_order,.-sha1_block_data_order
361e1051a39Sopenharmony_ci___
# Everything inside this block is generated only when $shaext is true.
362e1051a39Sopenharmony_ciif ($shaext) {{{
363e1051a39Sopenharmony_ci######################################################################
364e1051a39Sopenharmony_ci# Intel SHA Extensions implementation of SHA1 update function.
365e1051a39Sopenharmony_ci#
# Block-local names: the arguments stay in the registers they arrive in
# (no reassignment on this path); the state, its saved copies, the byte-swap
# mask and the four message vectors are mapped onto %xmm0-%xmm9.
366e1051a39Sopenharmony_cimy ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
367e1051a39Sopenharmony_cimy ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
368e1051a39Sopenharmony_cimy @MSG=map("%xmm$_",(4..7));
369e1051a39Sopenharmony_ci
# Function header; _shaext_shortcut is the label targeted by the dispatcher
# in sha1_block_data_order.
370e1051a39Sopenharmony_ci$code.=<<___;
371e1051a39Sopenharmony_ci.type	sha1_block_data_order_shaext,\@function,3
372e1051a39Sopenharmony_ci.align	32
373e1051a39Sopenharmony_cisha1_block_data_order_shaext:
374e1051a39Sopenharmony_ci_shaext_shortcut:
375e1051a39Sopenharmony_ci.cfi_startproc
376e1051a39Sopenharmony_ci___
# Win64 only: reserve stack space and save %xmm6-%xmm9.
# NOTE(review): the saves are addressed via %rax, which is not set in this
# template -- presumably the translator's Win64 prologue leaves the entry
# %rsp in %rax; confirm against x86_64-xlate.pl.
377e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
378e1051a39Sopenharmony_ci	lea	`-8-4*16`(%rsp),%rsp
379e1051a39Sopenharmony_ci	movaps	%xmm6,-8-4*16(%rax)
380e1051a39Sopenharmony_ci	movaps	%xmm7,-8-3*16(%rax)
381e1051a39Sopenharmony_ci	movaps	%xmm8,-8-2*16(%rax)
382e1051a39Sopenharmony_ci	movaps	%xmm9,-8-1*16(%rax)
383e1051a39Sopenharmony_ci.Lprologue_shaext:
384e1051a39Sopenharmony_ci___
# Load the state and the first 64-byte block, flip the word order of the
# state vectors, byte-swap the message vectors, then enter the loop.  At the
# loop head the block counter is decremented and the input pointer advances
# only if the counter did not reach zero (cmovne).
385e1051a39Sopenharmony_ci$code.=<<___;
386e1051a39Sopenharmony_ci	movdqu	($ctx),$ABCD
387e1051a39Sopenharmony_ci	movd	16($ctx),$E
388e1051a39Sopenharmony_ci	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
389e1051a39Sopenharmony_ci
390e1051a39Sopenharmony_ci	movdqu	($inp),@MSG[0]
391e1051a39Sopenharmony_ci	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
392e1051a39Sopenharmony_ci	movdqu	0x10($inp),@MSG[1]
393e1051a39Sopenharmony_ci	pshufd	\$0b00011011,$E,$E		# flip word order
394e1051a39Sopenharmony_ci	movdqu	0x20($inp),@MSG[2]
395e1051a39Sopenharmony_ci	pshufb	$BSWAP,@MSG[0]
396e1051a39Sopenharmony_ci	movdqu	0x30($inp),@MSG[3]
397e1051a39Sopenharmony_ci	pshufb	$BSWAP,@MSG[1]
398e1051a39Sopenharmony_ci	pshufb	$BSWAP,@MSG[2]
399e1051a39Sopenharmony_ci	movdqa	$E,$E_SAVE			# offload $E
400e1051a39Sopenharmony_ci	pshufb	$BSWAP,@MSG[3]
401e1051a39Sopenharmony_ci	jmp	.Loop_shaext
402e1051a39Sopenharmony_ci
403e1051a39Sopenharmony_ci.align	16
404e1051a39Sopenharmony_ci.Loop_shaext:
405e1051a39Sopenharmony_ci	dec		$num
406e1051a39Sopenharmony_ci	lea		0x40($inp),%r8		# next input block
407e1051a39Sopenharmony_ci	paddd		@MSG[0],$E
408e1051a39Sopenharmony_ci	cmovne		%r8,$inp
409e1051a39Sopenharmony_ci	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
410e1051a39Sopenharmony_ci___
# Eight iterations, each emitting two sha1rnds4 groups (16 groups in total)
# interleaved with the message-schedule instructions.  The immediate of
# each sha1rnds4 is int(group/5), and @MSG is rotated by two per iteration.
411e1051a39Sopenharmony_cifor($i=0;$i<20-4;$i+=2) {
412e1051a39Sopenharmony_ci$code.=<<___;
413e1051a39Sopenharmony_ci	sha1msg1	@MSG[1],@MSG[0]
414e1051a39Sopenharmony_ci	movdqa		$ABCD,$E_
415e1051a39Sopenharmony_ci	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
416e1051a39Sopenharmony_ci	sha1nexte	@MSG[1],$E_
417e1051a39Sopenharmony_ci	pxor		@MSG[2],@MSG[0]
418e1051a39Sopenharmony_ci	sha1msg1	@MSG[2],@MSG[1]
419e1051a39Sopenharmony_ci	sha1msg2	@MSG[3],@MSG[0]
420e1051a39Sopenharmony_ci
421e1051a39Sopenharmony_ci	movdqa		$ABCD,$E
422e1051a39Sopenharmony_ci	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
423e1051a39Sopenharmony_ci	sha1nexte	@MSG[2],$E
424e1051a39Sopenharmony_ci	pxor		@MSG[3],@MSG[1]
425e1051a39Sopenharmony_ci	sha1msg2	@MSG[0],@MSG[1]
426e1051a39Sopenharmony_ci___
427e1051a39Sopenharmony_ci	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
428e1051a39Sopenharmony_ci}
# Last four groups (rounds 64-79), interleaved with loading and
# byte-swapping the next block's message vectors; then the saved state is
# added back and the loop repeats while the counter is non-zero.  On exit
# the word order is flipped back and the state is stored to the context.
429e1051a39Sopenharmony_ci$code.=<<___;
430e1051a39Sopenharmony_ci	movdqu		($inp),@MSG[0]
431e1051a39Sopenharmony_ci	movdqa		$ABCD,$E_
432e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E,$ABCD		# 64-67
433e1051a39Sopenharmony_ci	sha1nexte	@MSG[1],$E_
434e1051a39Sopenharmony_ci	movdqu		0x10($inp),@MSG[1]
435e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG[0]
436e1051a39Sopenharmony_ci
437e1051a39Sopenharmony_ci	movdqa		$ABCD,$E
438e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E_,$ABCD		# 68-71
439e1051a39Sopenharmony_ci	sha1nexte	@MSG[2],$E
440e1051a39Sopenharmony_ci	movdqu		0x20($inp),@MSG[2]
441e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG[1]
442e1051a39Sopenharmony_ci
443e1051a39Sopenharmony_ci	movdqa		$ABCD,$E_
444e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E,$ABCD		# 72-75
445e1051a39Sopenharmony_ci	sha1nexte	@MSG[3],$E_
446e1051a39Sopenharmony_ci	movdqu		0x30($inp),@MSG[3]
447e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG[2]
448e1051a39Sopenharmony_ci
449e1051a39Sopenharmony_ci	movdqa		$ABCD,$E
450e1051a39Sopenharmony_ci	sha1rnds4	\$3,$E_,$ABCD		# 76-79
451e1051a39Sopenharmony_ci	sha1nexte	$E_SAVE,$E
452e1051a39Sopenharmony_ci	pshufb		$BSWAP,@MSG[3]
453e1051a39Sopenharmony_ci
454e1051a39Sopenharmony_ci	paddd		$ABCD_SAVE,$ABCD
455e1051a39Sopenharmony_ci	movdqa		$E,$E_SAVE		# offload $E
456e1051a39Sopenharmony_ci
457e1051a39Sopenharmony_ci	jnz		.Loop_shaext
458e1051a39Sopenharmony_ci
459e1051a39Sopenharmony_ci	pshufd	\$0b00011011,$ABCD,$ABCD
460e1051a39Sopenharmony_ci	pshufd	\$0b00011011,$E,$E
461e1051a39Sopenharmony_ci	movdqu	$ABCD,($ctx)
462e1051a39Sopenharmony_ci	movd	$E,16($ctx)
463e1051a39Sopenharmony_ci___
# Win64 only: restore %xmm6-%xmm9 and the stack pointer (mirror of the
# Win64 prologue above).
464e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
465e1051a39Sopenharmony_ci	movaps	-8-4*16(%rax),%xmm6
466e1051a39Sopenharmony_ci	movaps	-8-3*16(%rax),%xmm7
467e1051a39Sopenharmony_ci	movaps	-8-2*16(%rax),%xmm8
468e1051a39Sopenharmony_ci	movaps	-8-1*16(%rax),%xmm9
469e1051a39Sopenharmony_ci	mov	%rax,%rsp
470e1051a39Sopenharmony_ci.Lepilogue_shaext:
471e1051a39Sopenharmony_ci___
# Common return and function trailer.
472e1051a39Sopenharmony_ci$code.=<<___;
473e1051a39Sopenharmony_ci	ret
474e1051a39Sopenharmony_ci.cfi_endproc
475e1051a39Sopenharmony_ci.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
476e1051a39Sopenharmony_ci___
477e1051a39Sopenharmony_ci}}}
# Start of the SIMD section: a new lexical scope with its own register
# naming.
478e1051a39Sopenharmony_ci{{{
# $Xi counts Xupdate steps and starts at 4.
479e1051a39Sopenharmony_cimy $Xi=4;
# @X: eight vector registers in the order %xmm4-7, %xmm0-3; the Xupdate
# routines rotate this list.  @Tx: three vector temporaries.  $Kx: %xmm11.
480e1051a39Sopenharmony_cimy @X=map("%xmm$_",(4..7,0..3));
481e1051a39Sopenharmony_cimy @Tx=map("%xmm$_",(8..10));
482e1051a39Sopenharmony_cimy $Kx="%xmm11";
# Note that only @V is declared lexically here; the list assignment also
# overwrites the file-level $A..$E with the new register names.
483e1051a39Sopenharmony_cimy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
# Two integer scratch registers.
484e1051a39Sopenharmony_cimy @T=("%esi","%edi");
485e1051a39Sopenharmony_cimy $j=0;
486e1051a39Sopenharmony_cimy $rx=0;
# Register holding the address of the constant table, and the register used
# as frame pointer by the SSSE3 prologue.
487e1051a39Sopenharmony_cimy $K_XX_XX="%r14";
488e1051a39Sopenharmony_cimy $fp="%r11";
489e1051a39Sopenharmony_ci
# Indirections for emitting rotates.
# NOTE(review): no rol/ror sub is visible in this part of the file, so these
# calls presumably resolve through AUTOLOAD -- confirm.
490e1051a39Sopenharmony_cimy $_rol=sub { &rol(@_) };
491e1051a39Sopenharmony_cimy $_ror=sub { &ror(@_) };
# align32()
#
# Appends to $code a jump to a freshly numbered local label that is aligned
# to 32 bytes, so execution skips the alignment padding.  The counter $sn is
# private to this enclosing block and is incremented on every call, giving
# each emitted label a unique name.  Takes no arguments.
493e1051a39Sopenharmony_ci{ my $sn;
494e1051a39Sopenharmony_cisub align32() {
495e1051a39Sopenharmony_ci  ++$sn;
496e1051a39Sopenharmony_ci$code.=<<___;
497e1051a39Sopenharmony_ci	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
498e1051a39Sopenharmony_ci.align	32
499e1051a39Sopenharmony_ci.Lalign32_$sn:
500e1051a39Sopenharmony_ci___
501e1051a39Sopenharmony_ci}
502e1051a39Sopenharmony_ci}
503e1051a39Sopenharmony_ci
# Emit the head of sha1_block_data_order_ssse3 (_ssse3_shortcut is the label
# targeted by the dispatcher): keep the entry stack pointer in $fp, save
# %rbx, %rbp and %r12-%r14, and reserve 64 bytes of stack, plus 6*16 bytes
# on Win64 for the vector register save area.
504e1051a39Sopenharmony_ci$code.=<<___;
505e1051a39Sopenharmony_ci.type	sha1_block_data_order_ssse3,\@function,3
506e1051a39Sopenharmony_ci.align	16
507e1051a39Sopenharmony_cisha1_block_data_order_ssse3:
508e1051a39Sopenharmony_ci_ssse3_shortcut:
509e1051a39Sopenharmony_ci.cfi_startproc
510e1051a39Sopenharmony_ci	mov	%rsp,$fp	# frame pointer
511e1051a39Sopenharmony_ci.cfi_def_cfa_register	$fp
512e1051a39Sopenharmony_ci	push	%rbx
513e1051a39Sopenharmony_ci.cfi_push	%rbx
514e1051a39Sopenharmony_ci	push	%rbp
515e1051a39Sopenharmony_ci.cfi_push	%rbp
516e1051a39Sopenharmony_ci	push	%r12
517e1051a39Sopenharmony_ci.cfi_push	%r12
518e1051a39Sopenharmony_ci	push	%r13		# redundant, done to share Win64 SE handler
519e1051a39Sopenharmony_ci.cfi_push	%r13
520e1051a39Sopenharmony_ci	push	%r14
521e1051a39Sopenharmony_ci.cfi_push	%r14
522e1051a39Sopenharmony_ci	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
523e1051a39Sopenharmony_ci___
# Win64 only: save %xmm6-%xmm11 just below the five pushed registers,
# addressed relative to the frame pointer.
524e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
525e1051a39Sopenharmony_ci	movaps	%xmm6,-40-6*16($fp)
526e1051a39Sopenharmony_ci	movaps	%xmm7,-40-5*16($fp)
527e1051a39Sopenharmony_ci	movaps	%xmm8,-40-4*16($fp)
528e1051a39Sopenharmony_ci	movaps	%xmm9,-40-3*16($fp)
529e1051a39Sopenharmony_ci	movaps	%xmm10,-40-2*16($fp)
530e1051a39Sopenharmony_ci	movaps	%xmm11,-40-1*16($fp)
531e1051a39Sopenharmony_ci.Lprologue_ssse3:
532e1051a39Sopenharmony_ci___
# Align the stack to 64 bytes and copy the arguments into the reassigned
# registers.  $num is converted from a block count into an end-of-input
# pointer ($inp + 64*count).  The state is loaded, @T[0] is seeded with
# B & (C ^ D), the first 64-byte block is loaded and byte-swapped, and the
# first three vectors are written to the stack with the round constant added
# (the constant is subtracted again from the register copies).
533e1051a39Sopenharmony_ci$code.=<<___;
534e1051a39Sopenharmony_ci	and	\$-64,%rsp
535e1051a39Sopenharmony_ci	mov	%rdi,$ctx	# reassigned argument
536e1051a39Sopenharmony_ci	mov	%rsi,$inp	# reassigned argument
537e1051a39Sopenharmony_ci	mov	%rdx,$num	# reassigned argument
538e1051a39Sopenharmony_ci
539e1051a39Sopenharmony_ci	shl	\$6,$num
540e1051a39Sopenharmony_ci	add	$inp,$num
541e1051a39Sopenharmony_ci	lea	K_XX_XX+64(%rip),$K_XX_XX
542e1051a39Sopenharmony_ci
543e1051a39Sopenharmony_ci	mov	0($ctx),$A		# load context
544e1051a39Sopenharmony_ci	mov	4($ctx),$B
545e1051a39Sopenharmony_ci	mov	8($ctx),$C
546e1051a39Sopenharmony_ci	mov	12($ctx),$D
547e1051a39Sopenharmony_ci	mov	$B,@T[0]		# magic seed
548e1051a39Sopenharmony_ci	mov	16($ctx),$E
549e1051a39Sopenharmony_ci	mov	$C,@T[1]
550e1051a39Sopenharmony_ci	xor	$D,@T[1]
551e1051a39Sopenharmony_ci	and	@T[1],@T[0]
552e1051a39Sopenharmony_ci
553e1051a39Sopenharmony_ci	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
554e1051a39Sopenharmony_ci	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
555e1051a39Sopenharmony_ci	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
556e1051a39Sopenharmony_ci	movdqu	16($inp),@X[-3&7]
557e1051a39Sopenharmony_ci	movdqu	32($inp),@X[-2&7]
558e1051a39Sopenharmony_ci	movdqu	48($inp),@X[-1&7]
559e1051a39Sopenharmony_ci	pshufb	@X[2],@X[-4&7]		# byte swap
560e1051a39Sopenharmony_ci	pshufb	@X[2],@X[-3&7]
561e1051a39Sopenharmony_ci	pshufb	@X[2],@X[-2&7]
562e1051a39Sopenharmony_ci	add	\$64,$inp
563e1051a39Sopenharmony_ci	paddd	@Tx[1],@X[-4&7]		# add K_00_19
564e1051a39Sopenharmony_ci	pshufb	@X[2],@X[-1&7]
565e1051a39Sopenharmony_ci	paddd	@Tx[1],@X[-3&7]
566e1051a39Sopenharmony_ci	paddd	@Tx[1],@X[-2&7]
567e1051a39Sopenharmony_ci	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
568e1051a39Sopenharmony_ci	psubd	@Tx[1],@X[-4&7]		# restore X[]
569e1051a39Sopenharmony_ci	movdqa	@X[-3&7],16(%rsp)
570e1051a39Sopenharmony_ci	psubd	@Tx[1],@X[-3&7]
571e1051a39Sopenharmony_ci	movdqa	@X[-2&7],32(%rsp)
572e1051a39Sopenharmony_ci	psubd	@Tx[1],@X[-2&7]
573e1051a39Sopenharmony_ci	jmp	.Loop_ssse3
574e1051a39Sopenharmony_ci___
575e1051a39Sopenharmony_ci
# AUTOLOAD
#
# Catch-all for calls to undefined subs: the sub's name (with any package
# qualifier stripped) is used as the instruction mnemonic and the call's
# arguments become its operands.  The last argument is emitted first,
# followed by the remaining arguments in reverse order, i.e. a
# destination-first call is written out source-first.  If the last argument
# is numeric (it compares equal to itself after numeric conversion) it is
# prefixed with '$' to form an immediate.  One line is appended to $code.
576e1051a39Sopenharmony_cisub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
577e1051a39Sopenharmony_ci{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
578e1051a39Sopenharmony_ci  my $arg = pop;
579e1051a39Sopenharmony_ci    $arg = "\$$arg" if ($arg*1 eq $arg);
580e1051a39Sopenharmony_ci    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
581e1051a39Sopenharmony_ci}
582e1051a39Sopenharmony_ci
# Xupdate_ssse3_16_31($body)
#
# Emits one vector message-schedule update step, interleaved with scalar
# round code.
#   $body       code reference; it is called four times and the strings it
#               returns are collected in @insns, then eval-ed one or a few
#               at a time between the vector instructions emitted here.
# The vector instructions build the new @X[0] from the older vectors named
# in the inline comments, rotate it left by one, and store the previous
# vector plus round constant to the stack for the scalar rounds.  Any
# strings still left in @insns at the end are eval-ed in order.
# Side effects: $Xi is incremented and both @X and @Tx are rotated by one.
# NOTE(review): $a..$e are declared but not referenced directly here;
# presumably the eval-ed strings assign to them -- confirm against the body
# generators defined later in the file.
583e1051a39Sopenharmony_cisub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
584e1051a39Sopenharmony_ci{ use integer;
585e1051a39Sopenharmony_ci  my $body = shift;
586e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
587e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
588e1051a39Sopenharmony_ci
589e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
590e1051a39Sopenharmony_ci	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
591e1051a39Sopenharmony_ci	 eval(shift(@insns));
592e1051a39Sopenharmony_ci	&movdqa	(@Tx[0],@X[-1&7]);
593e1051a39Sopenharmony_ci	  &paddd	(@Tx[1],@X[-1&7]);
594e1051a39Sopenharmony_ci	 eval(shift(@insns));
595e1051a39Sopenharmony_ci	 eval(shift(@insns));
596e1051a39Sopenharmony_ci
597e1051a39Sopenharmony_ci	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
598e1051a39Sopenharmony_ci	 eval(shift(@insns));
599e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
600e1051a39Sopenharmony_ci	 eval(shift(@insns));
601e1051a39Sopenharmony_ci	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
602e1051a39Sopenharmony_ci	 eval(shift(@insns));
603e1051a39Sopenharmony_ci	 eval(shift(@insns));
604e1051a39Sopenharmony_ci
605e1051a39Sopenharmony_ci	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
606e1051a39Sopenharmony_ci	 eval(shift(@insns));
607e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
608e1051a39Sopenharmony_ci	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
609e1051a39Sopenharmony_ci	 eval(shift(@insns));
610e1051a39Sopenharmony_ci	 eval(shift(@insns));
611e1051a39Sopenharmony_ci	 eval(shift(@insns));
612e1051a39Sopenharmony_ci
613e1051a39Sopenharmony_ci	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
614e1051a39Sopenharmony_ci	 eval(shift(@insns));
615e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
616e1051a39Sopenharmony_ci	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
617e1051a39Sopenharmony_ci	 eval(shift(@insns));
618e1051a39Sopenharmony_ci	 eval(shift(@insns));
619e1051a39Sopenharmony_ci
620e1051a39Sopenharmony_ci	&movdqa	(@Tx[2],@X[0]);
621e1051a39Sopenharmony_ci	 eval(shift(@insns));
622e1051a39Sopenharmony_ci	 eval(shift(@insns));
623e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
624e1051a39Sopenharmony_ci	&movdqa	(@Tx[0],@X[0]);
625e1051a39Sopenharmony_ci	 eval(shift(@insns));
626e1051a39Sopenharmony_ci
627e1051a39Sopenharmony_ci	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
628e1051a39Sopenharmony_ci	&paddd	(@X[0],@X[0]);
629e1051a39Sopenharmony_ci	 eval(shift(@insns));
630e1051a39Sopenharmony_ci	 eval(shift(@insns));
631e1051a39Sopenharmony_ci
632e1051a39Sopenharmony_ci	&psrld	(@Tx[0],31);
633e1051a39Sopenharmony_ci	 eval(shift(@insns));
634e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
635e1051a39Sopenharmony_ci	 eval(shift(@insns));
636e1051a39Sopenharmony_ci	&movdqa	(@Tx[1],@Tx[2]);
637e1051a39Sopenharmony_ci	 eval(shift(@insns));
638e1051a39Sopenharmony_ci	 eval(shift(@insns));
639e1051a39Sopenharmony_ci
640e1051a39Sopenharmony_ci	&psrld	(@Tx[2],30);
641e1051a39Sopenharmony_ci	 eval(shift(@insns));
642e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
643e1051a39Sopenharmony_ci	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
644e1051a39Sopenharmony_ci	 eval(shift(@insns));
645e1051a39Sopenharmony_ci	 eval(shift(@insns));
646e1051a39Sopenharmony_ci	 eval(shift(@insns));
647e1051a39Sopenharmony_ci
648e1051a39Sopenharmony_ci	&pslld	(@Tx[1],2);
649e1051a39Sopenharmony_ci	&pxor	(@X[0],@Tx[2]);
650e1051a39Sopenharmony_ci	 eval(shift(@insns));
651e1051a39Sopenharmony_ci	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
652e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
653e1051a39Sopenharmony_ci	 eval(shift(@insns));
654e1051a39Sopenharmony_ci	 eval(shift(@insns));
655e1051a39Sopenharmony_ci
656e1051a39Sopenharmony_ci	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
657e1051a39Sopenharmony_ci	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
658e1051a39Sopenharmony_ci
659e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions [if any]
660e1051a39Sopenharmony_ci
661e1051a39Sopenharmony_ci  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
662e1051a39Sopenharmony_ci		push(@Tx,shift(@Tx));
663e1051a39Sopenharmony_ci}
664e1051a39Sopenharmony_ci
# Xupdate_ssse3_32_79($body)
#
# One 4-round step of the SSSE3 path.  $body is a code ref (one of the
# body_* subs) that returns the instruction strings of a single scalar
# round; it is called four times and the resulting strings are eval'ed one
# by one, interleaved with the SIMD message-schedule update that the inline
# comments annotate as
#	"X[0]" = ("X[-32]" ^ "X[-16]" ^ "X[-28]" ^ "X[-6]") <<< 2
# The position of every eval(shift(@insns)) relative to the SIMD calls is
# the instruction schedule, so statement order must not be changed.
# Helpers such as &pxor/&movdqa are defined outside this chunk; presumably
# they append the instruction to the generated assembly -- TODO confirm.
#
# Side effects: increments $Xi and rotates @X and @Tx by one position;
# the eval'ed strings additionally update $j, @V and @T.
665e1051a39Sopenharmony_cisub Xupdate_ssse3_32_79()
666e1051a39Sopenharmony_ci{ use integer;
667e1051a39Sopenharmony_ci  my $body = shift;
668e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
	# $a..$e are assigned from @V by the eval'ed strings themselves, so
	# they have to be declared here under exactly these names.
669e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
670e1051a39Sopenharmony_ci
	# Two extra leading round instructions are issued only for the step
	# that runs with $Xi==8.
671e1051a39Sopenharmony_ci	 eval(shift(@insns))		if ($Xi==8);
672e1051a39Sopenharmony_ci	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
673e1051a39Sopenharmony_ci	 eval(shift(@insns))		if ($Xi==8);
674e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
675e1051a39Sopenharmony_ci	 eval(shift(@insns));
	# The pattern-guarded evals below issue an additional instruction only
	# when a rotate string ('_ror'/'_rol') is at the tested position, which
	# depends on which body_* flavour produced @insns.
676e1051a39Sopenharmony_ci	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
677e1051a39Sopenharmony_ci	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
678e1051a39Sopenharmony_ci	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
679e1051a39Sopenharmony_ci	 eval(shift(@insns));
680e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
681e1051a39Sopenharmony_ci
682e1051a39Sopenharmony_ci	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
683e1051a39Sopenharmony_ci	 eval(shift(@insns));
684e1051a39Sopenharmony_ci	 eval(shift(@insns));
	# @Tx[2] receives the round constant: copied from @Tx[1] unless $Xi is
	# a multiple of 5, in which case it is loaded from the table at
	# $K_XX_XX with offset 2*16*($Xi/5)-64.
685e1051a39Sopenharmony_ci	if ($Xi%5) {
686e1051a39Sopenharmony_ci	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
687e1051a39Sopenharmony_ci	} else {			# ... or load next one
688e1051a39Sopenharmony_ci	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
689e1051a39Sopenharmony_ci	}
690e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
	# @Tx[1] (the constant) += previous schedule vector; the sum is stored
	# to the stack further down for the scalar rounds to consume.
691e1051a39Sopenharmony_ci	  &paddd	(@Tx[1],@X[-1&7]);
692e1051a39Sopenharmony_ci	 eval(shift(@insns));
693e1051a39Sopenharmony_ci
694e1051a39Sopenharmony_ci	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
695e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
696e1051a39Sopenharmony_ci	 eval(shift(@insns));
697e1051a39Sopenharmony_ci	 eval(shift(@insns));
698e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
699e1051a39Sopenharmony_ci	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
700e1051a39Sopenharmony_ci
	# Rotate left by 2, built from a copy: (x<<2) | (x>>30).
701e1051a39Sopenharmony_ci	&movdqa	(@Tx[0],@X[0]);
702e1051a39Sopenharmony_ci	 eval(shift(@insns));
703e1051a39Sopenharmony_ci	 eval(shift(@insns));
704e1051a39Sopenharmony_ci	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
705e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
706e1051a39Sopenharmony_ci	 eval(shift(@insns));
707e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
708e1051a39Sopenharmony_ci
709e1051a39Sopenharmony_ci	&pslld	(@X[0],2);
710e1051a39Sopenharmony_ci	 eval(shift(@insns));
711e1051a39Sopenharmony_ci	 eval(shift(@insns));
712e1051a39Sopenharmony_ci	&psrld	(@Tx[0],30);
713e1051a39Sopenharmony_ci	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
714e1051a39Sopenharmony_ci	 eval(shift(@insns));
715e1051a39Sopenharmony_ci	 eval(shift(@insns));
716e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
717e1051a39Sopenharmony_ci
718e1051a39Sopenharmony_ci	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
719e1051a39Sopenharmony_ci	 eval(shift(@insns));
720e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
721e1051a39Sopenharmony_ci	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
722e1051a39Sopenharmony_ci	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	# Prepared for the following step's punpcklqdq; skipped once $Xi
	# reaches 19.
723e1051a39Sopenharmony_ci	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
724e1051a39Sopenharmony_ci	 eval(shift(@insns));
725e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
726e1051a39Sopenharmony_ci	 eval(shift(@insns));
727e1051a39Sopenharmony_ci	 eval(shift(@insns));
728e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
729e1051a39Sopenharmony_ci	 eval(shift(@insns));
730e1051a39Sopenharmony_ci
731e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions
732e1051a39Sopenharmony_ci
733e1051a39Sopenharmony_ci  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
734e1051a39Sopenharmony_ci		push(@Tx,shift(@Tx));
735e1051a39Sopenharmony_ci}
736e1051a39Sopenharmony_ci
# Xuplast_ssse3_80($body)
#
# Last schedule step of a block on the SSSE3 path.  Runs four rounds from
# $body, stores the final X[]+K vector to the stack, then emits the
# end-of-input test (cmp $inp,$num / je .Ldone_ssse3).  On the fall-through
# path it reloads the byte-swap mask and K_00_19, loads the next 64 input
# bytes, byte-swaps the first vector and advances $inp by 64.
#
# Side effects: rotates @Tx back by one position and resets $Xi to 0; the
# eval'ed round strings update $j, @V and @T.
737e1051a39Sopenharmony_cisub Xuplast_ssse3_80()
738e1051a39Sopenharmony_ci{ use integer;
739e1051a39Sopenharmony_ci  my $body = shift;
740e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
741e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
742e1051a39Sopenharmony_ci
743e1051a39Sopenharmony_ci	 eval(shift(@insns));
744e1051a39Sopenharmony_ci	 eval(shift(@insns));
745e1051a39Sopenharmony_ci	 eval(shift(@insns));
746e1051a39Sopenharmony_ci	 eval(shift(@insns));
747e1051a39Sopenharmony_ci	  &paddd	(@Tx[1],@X[-1&7]);
748e1051a39Sopenharmony_ci	 eval(shift(@insns));
749e1051a39Sopenharmony_ci	 eval(shift(@insns));
750e1051a39Sopenharmony_ci
751e1051a39Sopenharmony_ci	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
752e1051a39Sopenharmony_ci
753e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }		# remaining instructions
754e1051a39Sopenharmony_ci
	# $inp == $num: leave the loop for the .Ldone_ssse3 tail.
755e1051a39Sopenharmony_ci	&cmp	($inp,$num);
756e1051a39Sopenharmony_ci	&je	(".Ldone_ssse3");
757e1051a39Sopenharmony_ci
	# Rotate @Tx in the opposite direction to the Xupdate_* steps before
	# @Tx[1] is loaded with K_00_19 below.
758e1051a39Sopenharmony_ci	unshift(@Tx,pop(@Tx));
759e1051a39Sopenharmony_ci
760e1051a39Sopenharmony_ci	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
761e1051a39Sopenharmony_ci	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
762e1051a39Sopenharmony_ci	&movdqu	(@X[-4&7],"0($inp)");		# load input
763e1051a39Sopenharmony_ci	&movdqu	(@X[-3&7],"16($inp)");
764e1051a39Sopenharmony_ci	&movdqu	(@X[-2&7],"32($inp)");
765e1051a39Sopenharmony_ci	&movdqu	(@X[-1&7],"48($inp)");
766e1051a39Sopenharmony_ci	&pshufb	(@X[-4&7],@X[2]);		# byte swap
767e1051a39Sopenharmony_ci	&add	($inp,64);
768e1051a39Sopenharmony_ci
769e1051a39Sopenharmony_ci  $Xi=0;
770e1051a39Sopenharmony_ci}
771e1051a39Sopenharmony_ci
# Xloop_ssse3($body)
#
# Loop-back step of the SSSE3 path: four rounds from $body interleaved with
# the preparation of the next block's input.  Per call it byte-swaps vector
# X[($Xi-3)&7] with the mask in @X[2], adds @Tx[1] to X[($Xi-4)&7], stores
# the sum at 16*$Xi(%rsp) and then subtracts @Tx[1] again so the register
# holds the plain input words afterwards.
#
# Side effects: increments $Xi (no rotation of @X/@Tx here); the eval'ed
# round strings update $j, @V and @T.
772e1051a39Sopenharmony_cisub Xloop_ssse3()
773e1051a39Sopenharmony_ci{ use integer;
774e1051a39Sopenharmony_ci  my $body = shift;
775e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
776e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
777e1051a39Sopenharmony_ci
778e1051a39Sopenharmony_ci	 eval(shift(@insns));
779e1051a39Sopenharmony_ci	 eval(shift(@insns));
780e1051a39Sopenharmony_ci	 eval(shift(@insns));
781e1051a39Sopenharmony_ci	&pshufb	(@X[($Xi-3)&7],@X[2]);
782e1051a39Sopenharmony_ci	 eval(shift(@insns));
783e1051a39Sopenharmony_ci	 eval(shift(@insns));
784e1051a39Sopenharmony_ci	 eval(shift(@insns));
785e1051a39Sopenharmony_ci	 eval(shift(@insns));
786e1051a39Sopenharmony_ci	&paddd	(@X[($Xi-4)&7],@Tx[1]);
787e1051a39Sopenharmony_ci	 eval(shift(@insns));
788e1051a39Sopenharmony_ci	 eval(shift(@insns));
789e1051a39Sopenharmony_ci	 eval(shift(@insns));
790e1051a39Sopenharmony_ci	 eval(shift(@insns));
791e1051a39Sopenharmony_ci	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
792e1051a39Sopenharmony_ci	 eval(shift(@insns));
793e1051a39Sopenharmony_ci	 eval(shift(@insns));
794e1051a39Sopenharmony_ci	 eval(shift(@insns));
795e1051a39Sopenharmony_ci	 eval(shift(@insns));
	# Undo the paddd above: the stored copy keeps X[]+K, the register
	# goes back to X[].
796e1051a39Sopenharmony_ci	&psubd	(@X[($Xi-4)&7],@Tx[1]);
797e1051a39Sopenharmony_ci
798e1051a39Sopenharmony_ci	foreach (@insns) { eval; }
799e1051a39Sopenharmony_ci  $Xi++;
800e1051a39Sopenharmony_ci}
801e1051a39Sopenharmony_ci
# Xtail_ssse3($body)
#
# Tail step of the SSSE3 path: four rounds from $body with no SIMD work
# interleaved.  The instruction strings are eval'ed strictly in order.
# Only the eval'ed strings have side effects ($j, @V, @T).
802e1051a39Sopenharmony_cisub Xtail_ssse3()
803e1051a39Sopenharmony_ci{ use integer;
804e1051a39Sopenharmony_ci  my $body = shift;
805e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
806e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
807e1051a39Sopenharmony_ci
808e1051a39Sopenharmony_ci	foreach (@insns) { eval; }
809e1051a39Sopenharmony_ci}
810e1051a39Sopenharmony_ci
# body_00_19()
#
# Returns the instruction strings (10 of them) for one scalar round with
# the function noted on the sub line.  The strings are not executed here:
# the X* scheduler subs eval them one at a time, in a scope that declares
# $a..$e.  The first string loads ($a..$e) from @V; the last one increments
# $j and rotates @V and @T for the next round.
#
# $rx counts the rounds handed out so far; once it equals 19 the call is
# forwarded to body_20_39 instead (and $rx is incremented there).
811e1051a39Sopenharmony_cisub body_00_19 () {	# ((c^d)&b)^d
812e1051a39Sopenharmony_ci	# on start @T[0]=(c^d)&b
813e1051a39Sopenharmony_ci	return &body_20_39() if ($rx==19); $rx++;
814e1051a39Sopenharmony_ci	(
815e1051a39Sopenharmony_ci	'($a,$b,$c,$d,$e)=@V;'.
816e1051a39Sopenharmony_ci	'&$_ror	($b,$j?7:2)',	# $b>>>2
817e1051a39Sopenharmony_ci	'&xor	(@T[0],$d)',
818e1051a39Sopenharmony_ci	'&mov	(@T[1],$a)',	# $b for next round
819e1051a39Sopenharmony_ci
820e1051a39Sopenharmony_ci	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
821e1051a39Sopenharmony_ci	'&xor	($b,$c)',	# $c^$d for next round
822e1051a39Sopenharmony_ci
823e1051a39Sopenharmony_ci	'&$_rol	($a,5)',
824e1051a39Sopenharmony_ci	'&add	($e,@T[0])',
825e1051a39Sopenharmony_ci	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
826e1051a39Sopenharmony_ci
827e1051a39Sopenharmony_ci	'&xor	($b,$c)',	# restore $b
828e1051a39Sopenharmony_ci	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
829e1051a39Sopenharmony_ci	);
830e1051a39Sopenharmony_ci}
831e1051a39Sopenharmony_ci
# body_20_39()
#
# Returns the instruction strings (8 of them) for one scalar round with the
# parity function noted on the sub line.  As with the other body_* subs the
# strings are eval'ed later by the X* scheduler subs; the first loads
# ($a..$e) from @V and the last increments $j and rotates @V and @T.
# Several strings carry their own "if ($j ...)" guard, evaluated at eval
# time, to handle the hand-over from/to the neighbouring round types.
#
# $rx counts the rounds handed out so far; once it equals 39 the call is
# forwarded to body_40_59 instead (and $rx is incremented there).
832e1051a39Sopenharmony_cisub body_20_39 () {	# b^d^c
833e1051a39Sopenharmony_ci	# on entry @T[0]=b^d
834e1051a39Sopenharmony_ci	return &body_40_59() if ($rx==39); $rx++;
835e1051a39Sopenharmony_ci	(
836e1051a39Sopenharmony_ci	'($a,$b,$c,$d,$e)=@V;'.
837e1051a39Sopenharmony_ci	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
838e1051a39Sopenharmony_ci	'&xor	(@T[0],$d)	if($j==19);'.
839e1051a39Sopenharmony_ci	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
840e1051a39Sopenharmony_ci	'&mov	(@T[1],$a)',	# $b for next round
841e1051a39Sopenharmony_ci
842e1051a39Sopenharmony_ci	'&$_rol	($a,5)',
843e1051a39Sopenharmony_ci	'&add	($e,@T[0])',
844e1051a39Sopenharmony_ci	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
845e1051a39Sopenharmony_ci
846e1051a39Sopenharmony_ci	'&$_ror	($b,7)',	# $b>>>2
847e1051a39Sopenharmony_ci	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
848e1051a39Sopenharmony_ci	);
849e1051a39Sopenharmony_ci}
850e1051a39Sopenharmony_ci
# body_40_59()
#
# Returns the instruction strings (11 of them) for one scalar round with
# the function noted on the sub line.  The strings are eval'ed later by the
# X* scheduler subs; the first loads ($a..$e) from @V and the last
# increments $j and rotates @V and @T.  The "if ($j ...)" guards inside the
# strings are evaluated at eval time and special-case the first ($j<40)
# and last ($j==59) round handled by these strings.
#
# Unlike body_00_19/body_20_39 this sub never forwards to another body;
# it only increments $rx.
851e1051a39Sopenharmony_cisub body_40_59 () {	# ((b^c)&(c^d))^c
852e1051a39Sopenharmony_ci	# on entry @T[0]=(b^c), (c^=d)
853e1051a39Sopenharmony_ci	$rx++;
854e1051a39Sopenharmony_ci	(
855e1051a39Sopenharmony_ci	'($a,$b,$c,$d,$e)=@V;'.
856e1051a39Sopenharmony_ci	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
857e1051a39Sopenharmony_ci	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
858e1051a39Sopenharmony_ci	'&xor	($c,$d)		if ($j>=40)',	# restore $c
859e1051a39Sopenharmony_ci
860e1051a39Sopenharmony_ci	'&$_ror	($b,7)',	# $b>>>2
861e1051a39Sopenharmony_ci	'&mov	(@T[1],$a)',	# $b for next round
862e1051a39Sopenharmony_ci	'&xor	(@T[0],$c)',
863e1051a39Sopenharmony_ci
864e1051a39Sopenharmony_ci	'&$_rol	($a,5)',
865e1051a39Sopenharmony_ci	'&add	($e,@T[0])',
866e1051a39Sopenharmony_ci	'&xor	(@T[1],$c)	if ($j==59);'.
867e1051a39Sopenharmony_ci	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
868e1051a39Sopenharmony_ci
869e1051a39Sopenharmony_ci	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
870e1051a39Sopenharmony_ci	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
871e1051a39Sopenharmony_ci	);
872e1051a39Sopenharmony_ci}
# ---- SSSE3 main loop ------------------------------------------------------
# Everything from here to the epilogue generates the body of
# sha1_block_data_order_ssse3.  Each X* call below emits four rounds.
873e1051a39Sopenharmony_ci$code.=<<___;
874e1051a39Sopenharmony_ci.align	16
875e1051a39Sopenharmony_ci.Loop_ssse3:
876e1051a39Sopenharmony_ci___
# 17 schedule steps = 68 rounds.  The body_* sub passed in selects the round
# type; body_00_19 and body_20_39 themselves forward to the next type once
# $rx reaches 19 resp. 39, which is why the fifth and tenth call still pass
# the "previous" body.
877e1051a39Sopenharmony_ci	&Xupdate_ssse3_16_31(\&body_00_19);
878e1051a39Sopenharmony_ci	&Xupdate_ssse3_16_31(\&body_00_19);
879e1051a39Sopenharmony_ci	&Xupdate_ssse3_16_31(\&body_00_19);
880e1051a39Sopenharmony_ci	&Xupdate_ssse3_16_31(\&body_00_19);
881e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_00_19);
882e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_20_39);
883e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_20_39);
884e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_20_39);
885e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_20_39);
886e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_20_39);
887e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_40_59);
888e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_40_59);
889e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_40_59);
890e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_40_59);
891e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_40_59);
892e1051a39Sopenharmony_ci	&Xupdate_ssse3_32_79(\&body_20_39);
893e1051a39Sopenharmony_ci	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
894e1051a39Sopenharmony_ci
# The last 12 rounds are generated twice (loop-back path and "done" path).
# Snapshot the generator state so the second copy starts from the same
# round number and register assignment.
895e1051a39Sopenharmony_ci				$saved_j=$j; @saved_V=@V;
896e1051a39Sopenharmony_ci
# Loop-back path: final 12 rounds interleaved with next-block input prep.
897e1051a39Sopenharmony_ci	&Xloop_ssse3(\&body_20_39);
898e1051a39Sopenharmony_ci	&Xloop_ssse3(\&body_20_39);
899e1051a39Sopenharmony_ci	&Xloop_ssse3(\&body_20_39);
900e1051a39Sopenharmony_ci
# Fold the working variables into the context, re-derive the values the
# first round expects in @T[0]/@T[1] ("magic seed") and branch back to
# .Loop_ssse3.  The .Ldone_ssse3 label opens the "done" path.
901e1051a39Sopenharmony_ci$code.=<<___;
902e1051a39Sopenharmony_ci	add	0($ctx),$A			# update context
903e1051a39Sopenharmony_ci	add	4($ctx),@T[0]
904e1051a39Sopenharmony_ci	add	8($ctx),$C
905e1051a39Sopenharmony_ci	add	12($ctx),$D
906e1051a39Sopenharmony_ci	mov	$A,0($ctx)
907e1051a39Sopenharmony_ci	add	16($ctx),$E
908e1051a39Sopenharmony_ci	mov	@T[0],4($ctx)
909e1051a39Sopenharmony_ci	mov	@T[0],$B			# magic seed
910e1051a39Sopenharmony_ci	mov	$C,8($ctx)
911e1051a39Sopenharmony_ci	mov	$C,@T[1]
912e1051a39Sopenharmony_ci	mov	$D,12($ctx)
913e1051a39Sopenharmony_ci	xor	$D,@T[1]
914e1051a39Sopenharmony_ci	mov	$E,16($ctx)
915e1051a39Sopenharmony_ci	and	@T[1],@T[0]
916e1051a39Sopenharmony_ci	jmp	.Loop_ssse3
917e1051a39Sopenharmony_ci
918e1051a39Sopenharmony_ci.align	16
919e1051a39Sopenharmony_ci.Ldone_ssse3:
920e1051a39Sopenharmony_ci___
# "Done" path: rewind the generator to the snapshot taken after
# Xuplast_ssse3_80 and emit the same 12 rounds without any SIMD work.
921e1051a39Sopenharmony_ci				$j=$saved_j; @V=@saved_V;
922e1051a39Sopenharmony_ci
923e1051a39Sopenharmony_ci	&Xtail_ssse3(\&body_20_39);
924e1051a39Sopenharmony_ci	&Xtail_ssse3(\&body_20_39);
925e1051a39Sopenharmony_ci	&Xtail_ssse3(\&body_20_39);
926e1051a39Sopenharmony_ci
# Final context update (no seed recomputation, no branch back).
927e1051a39Sopenharmony_ci$code.=<<___;
928e1051a39Sopenharmony_ci	add	0($ctx),$A			# update context
929e1051a39Sopenharmony_ci	add	4($ctx),@T[0]
930e1051a39Sopenharmony_ci	add	8($ctx),$C
931e1051a39Sopenharmony_ci	mov	$A,0($ctx)
932e1051a39Sopenharmony_ci	add	12($ctx),$D
933e1051a39Sopenharmony_ci	mov	@T[0],4($ctx)
934e1051a39Sopenharmony_ci	add	16($ctx),$E
935e1051a39Sopenharmony_ci	mov	$C,8($ctx)
936e1051a39Sopenharmony_ci	mov	$D,12($ctx)
937e1051a39Sopenharmony_ci	mov	$E,16($ctx)
938e1051a39Sopenharmony_ci___
# Win64 only: reload %xmm6-%xmm11 from their slots below $fp.
939e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
940e1051a39Sopenharmony_ci	movaps	-40-6*16($fp),%xmm6
941e1051a39Sopenharmony_ci	movaps	-40-5*16($fp),%xmm7
942e1051a39Sopenharmony_ci	movaps	-40-4*16($fp),%xmm8
943e1051a39Sopenharmony_ci	movaps	-40-3*16($fp),%xmm9
944e1051a39Sopenharmony_ci	movaps	-40-2*16($fp),%xmm10
945e1051a39Sopenharmony_ci	movaps	-40-1*16($fp),%xmm11
946e1051a39Sopenharmony_ci___
# Common epilogue: reload %r14,%r13,%r12,%rbp,%rbx from $fp-relative slots,
# restore %rsp from $fp and return.
947e1051a39Sopenharmony_ci$code.=<<___;
948e1051a39Sopenharmony_ci	mov	-40($fp),%r14
949e1051a39Sopenharmony_ci.cfi_restore	%r14
950e1051a39Sopenharmony_ci	mov	-32($fp),%r13
951e1051a39Sopenharmony_ci.cfi_restore	%r13
952e1051a39Sopenharmony_ci	mov	-24($fp),%r12
953e1051a39Sopenharmony_ci.cfi_restore	%r12
954e1051a39Sopenharmony_ci	mov	-16($fp),%rbp
955e1051a39Sopenharmony_ci.cfi_restore	%rbp
956e1051a39Sopenharmony_ci	mov	-8($fp),%rbx
957e1051a39Sopenharmony_ci.cfi_restore	%rbx
958e1051a39Sopenharmony_ci	lea	($fp),%rsp
959e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
960e1051a39Sopenharmony_ci.Lepilogue_ssse3:
961e1051a39Sopenharmony_ci	ret
962e1051a39Sopenharmony_ci.cfi_endproc
963e1051a39Sopenharmony_ci.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
964e1051a39Sopenharmony_ci___
965e1051a39Sopenharmony_ci
# ---- AVX code path ----------------------------------------------------------
# Generated only when $avx is true.  The generator state shared with the
# body_* subs ($Xi, @X, @Tx, $j, $rx) is reset first so that round and
# register bookkeeping starts over for this function.
966e1051a39Sopenharmony_ciif ($avx) {
967e1051a39Sopenharmony_ci$Xi=4;				# reset variables
968e1051a39Sopenharmony_ci@X=map("%xmm$_",(4..7,0..3));
969e1051a39Sopenharmony_ci@Tx=map("%xmm$_",(8..10));
970e1051a39Sopenharmony_ci$j=0;
971e1051a39Sopenharmony_ci$rx=0;
972e1051a39Sopenharmony_ci
973e1051a39Sopenharmony_cimy $done_avx_label=".Ldone_avx";
974e1051a39Sopenharmony_ci
# Rotate helpers referenced as &$_rol/&$_ror by the body_* instruction
# strings.  Each forwards to shld/shrd with its first argument repeated in
# front of the original argument list, i.e. the same register is used for
# both register operands of the double shift.
975e1051a39Sopenharmony_cimy $_rol=sub { &shld(@_[0],@_) };
976e1051a39Sopenharmony_cimy $_ror=sub { &shrd(@_[0],@_) };
977e1051a39Sopenharmony_ci
# Function entry: set up $fp, push the callee-saved GPRs, reserve the frame.
978e1051a39Sopenharmony_ci$code.=<<___;
979e1051a39Sopenharmony_ci.type	sha1_block_data_order_avx,\@function,3
980e1051a39Sopenharmony_ci.align	16
981e1051a39Sopenharmony_cisha1_block_data_order_avx:
982e1051a39Sopenharmony_ci_avx_shortcut:
983e1051a39Sopenharmony_ci.cfi_startproc
984e1051a39Sopenharmony_ci	mov	%rsp,$fp
985e1051a39Sopenharmony_ci.cfi_def_cfa_register	$fp
986e1051a39Sopenharmony_ci	push	%rbx
987e1051a39Sopenharmony_ci.cfi_push	%rbx
988e1051a39Sopenharmony_ci	push	%rbp
989e1051a39Sopenharmony_ci.cfi_push	%rbp
990e1051a39Sopenharmony_ci	push	%r12
991e1051a39Sopenharmony_ci.cfi_push	%r12
992e1051a39Sopenharmony_ci	push	%r13		# redundant, done to share Win64 SE handler
993e1051a39Sopenharmony_ci.cfi_push	%r13
994e1051a39Sopenharmony_ci	push	%r14
995e1051a39Sopenharmony_ci.cfi_push	%r14
996e1051a39Sopenharmony_ci	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
997e1051a39Sopenharmony_ci	vzeroupper
998e1051a39Sopenharmony_ci___
# Win64 only: save %xmm6-%xmm11 in the slots below $fp.
999e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1000e1051a39Sopenharmony_ci	vmovaps	%xmm6,-40-6*16($fp)
1001e1051a39Sopenharmony_ci	vmovaps	%xmm7,-40-5*16($fp)
1002e1051a39Sopenharmony_ci	vmovaps	%xmm8,-40-4*16($fp)
1003e1051a39Sopenharmony_ci	vmovaps	%xmm9,-40-3*16($fp)
1004e1051a39Sopenharmony_ci	vmovaps	%xmm10,-40-2*16($fp)
1005e1051a39Sopenharmony_ci	vmovaps	%xmm11,-40-1*16($fp)
1006e1051a39Sopenharmony_ci.Lprologue_avx:
1007e1051a39Sopenharmony_ci___
# Align the stack, move the arguments to their working registers, turn $num
# into an end pointer ($inp + $num*64), load the context and the first
# input block, and pre-compute X[]+K for the first 12 words before
# entering .Loop_avx.
1008e1051a39Sopenharmony_ci$code.=<<___;
1009e1051a39Sopenharmony_ci	and	\$-64,%rsp
1010e1051a39Sopenharmony_ci	mov	%rdi,$ctx	# reassigned argument
1011e1051a39Sopenharmony_ci	mov	%rsi,$inp	# reassigned argument
1012e1051a39Sopenharmony_ci	mov	%rdx,$num	# reassigned argument
1013e1051a39Sopenharmony_ci
1014e1051a39Sopenharmony_ci	shl	\$6,$num
1015e1051a39Sopenharmony_ci	add	$inp,$num
1016e1051a39Sopenharmony_ci	lea	K_XX_XX+64(%rip),$K_XX_XX
1017e1051a39Sopenharmony_ci
1018e1051a39Sopenharmony_ci	mov	0($ctx),$A		# load context
1019e1051a39Sopenharmony_ci	mov	4($ctx),$B
1020e1051a39Sopenharmony_ci	mov	8($ctx),$C
1021e1051a39Sopenharmony_ci	mov	12($ctx),$D
1022e1051a39Sopenharmony_ci	mov	$B,@T[0]		# magic seed
1023e1051a39Sopenharmony_ci	mov	16($ctx),$E
1024e1051a39Sopenharmony_ci	mov	$C,@T[1]
1025e1051a39Sopenharmony_ci	xor	$D,@T[1]
1026e1051a39Sopenharmony_ci	and	@T[1],@T[0]
1027e1051a39Sopenharmony_ci
1028e1051a39Sopenharmony_ci	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
1029e1051a39Sopenharmony_ci	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
1030e1051a39Sopenharmony_ci	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
1031e1051a39Sopenharmony_ci	vmovdqu	16($inp),@X[-3&7]
1032e1051a39Sopenharmony_ci	vmovdqu	32($inp),@X[-2&7]
1033e1051a39Sopenharmony_ci	vmovdqu	48($inp),@X[-1&7]
1034e1051a39Sopenharmony_ci	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
1035e1051a39Sopenharmony_ci	add	\$64,$inp
1036e1051a39Sopenharmony_ci	vpshufb	@X[2],@X[-3&7],@X[-3&7]
1037e1051a39Sopenharmony_ci	vpshufb	@X[2],@X[-2&7],@X[-2&7]
1038e1051a39Sopenharmony_ci	vpshufb	@X[2],@X[-1&7],@X[-1&7]
1039e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
1040e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-3&7],@X[1]
1041e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-2&7],@X[2]
1042e1051a39Sopenharmony_ci	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
1043e1051a39Sopenharmony_ci	vmovdqa	@X[1],16(%rsp)
1044e1051a39Sopenharmony_ci	vmovdqa	@X[2],32(%rsp)
1045e1051a39Sopenharmony_ci	jmp	.Loop_avx
1046e1051a39Sopenharmony_ci___
1047e1051a39Sopenharmony_ci
# Xupdate_avx_16_31($body)
#
# One 4-round step of the AVX path for the early part of the message
# schedule.  $body is a code ref returning the instruction strings of one
# scalar round; it is called four times and the strings are eval'ed one by
# one, interleaved with the SIMD computation annotated inline:
#	"X[0]" = ("X[-14]" ^ "X[-16]" ^ "X[-3]" ^ "X[-8]") <<< 1,
# followed by a correction term built from "X[0]"<<96 rotated by 2 (see the
# last vpxor comment).  The relative order of the eval calls and the SIMD
# calls is the instruction schedule and must not be changed.
# Helpers such as &vpxor/&vmovdqa are defined outside this chunk; presumably
# they append the instruction to the generated assembly -- TODO confirm.
#
# Side effects: increments $Xi and rotates @X by one position (@Tx is not
# rotated on the AVX path); may reload $Kx; the eval'ed strings update $j,
# @V and @T.
1048e1051a39Sopenharmony_cisub Xupdate_avx_16_31()		# recall that $Xi starts with 4
1049e1051a39Sopenharmony_ci{ use integer;
1050e1051a39Sopenharmony_ci  my $body = shift;
1051e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
1052e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1053e1051a39Sopenharmony_ci
1054e1051a39Sopenharmony_ci	 eval(shift(@insns));
1055e1051a39Sopenharmony_ci	 eval(shift(@insns));
1056e1051a39Sopenharmony_ci	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1057e1051a39Sopenharmony_ci	 eval(shift(@insns));
1058e1051a39Sopenharmony_ci	 eval(shift(@insns));
1059e1051a39Sopenharmony_ci
	# @Tx[1] = $Kx + previous schedule vector; stored to the stack below
	# for the scalar rounds to consume.
1060e1051a39Sopenharmony_ci	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1061e1051a39Sopenharmony_ci	 eval(shift(@insns));
1062e1051a39Sopenharmony_ci	 eval(shift(@insns));
1063e1051a39Sopenharmony_ci	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1064e1051a39Sopenharmony_ci	 eval(shift(@insns));
1065e1051a39Sopenharmony_ci	 eval(shift(@insns));
1066e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1067e1051a39Sopenharmony_ci	 eval(shift(@insns));
1068e1051a39Sopenharmony_ci	 eval(shift(@insns));
1069e1051a39Sopenharmony_ci
1070e1051a39Sopenharmony_ci	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1071e1051a39Sopenharmony_ci	 eval(shift(@insns));
1072e1051a39Sopenharmony_ci	 eval(shift(@insns));
1073e1051a39Sopenharmony_ci	 eval(shift(@insns));
1074e1051a39Sopenharmony_ci	 eval(shift(@insns));
1075e1051a39Sopenharmony_ci
1076e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1077e1051a39Sopenharmony_ci	 eval(shift(@insns));
1078e1051a39Sopenharmony_ci	 eval(shift(@insns));
1079e1051a39Sopenharmony_ci	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1080e1051a39Sopenharmony_ci	 eval(shift(@insns));
1081e1051a39Sopenharmony_ci	 eval(shift(@insns));
1082e1051a39Sopenharmony_ci
	# Rotate left by 1: the carried-out bits (x>>31) are OR'ed into x+x.
1083e1051a39Sopenharmony_ci	&vpsrld	(@Tx[0],@X[0],31);
1084e1051a39Sopenharmony_ci	 eval(shift(@insns));
1085e1051a39Sopenharmony_ci	 eval(shift(@insns));
1086e1051a39Sopenharmony_ci	 eval(shift(@insns));
1087e1051a39Sopenharmony_ci	 eval(shift(@insns));
1088e1051a39Sopenharmony_ci
1089e1051a39Sopenharmony_ci	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1090e1051a39Sopenharmony_ci	&vpaddd	(@X[0],@X[0],@X[0]);
1091e1051a39Sopenharmony_ci	 eval(shift(@insns));
1092e1051a39Sopenharmony_ci	 eval(shift(@insns));
1093e1051a39Sopenharmony_ci	 eval(shift(@insns));
1094e1051a39Sopenharmony_ci	 eval(shift(@insns));
1095e1051a39Sopenharmony_ci
1096e1051a39Sopenharmony_ci	&vpsrld	(@Tx[1],@Tx[2],30);
1097e1051a39Sopenharmony_ci	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1098e1051a39Sopenharmony_ci	 eval(shift(@insns));
1099e1051a39Sopenharmony_ci	 eval(shift(@insns));
1100e1051a39Sopenharmony_ci	 eval(shift(@insns));
1101e1051a39Sopenharmony_ci	 eval(shift(@insns));
1102e1051a39Sopenharmony_ci
1103e1051a39Sopenharmony_ci	&vpslld	(@Tx[2],@Tx[2],2);
1104e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[1]);
1105e1051a39Sopenharmony_ci	 eval(shift(@insns));
1106e1051a39Sopenharmony_ci	 eval(shift(@insns));
1107e1051a39Sopenharmony_ci	 eval(shift(@insns));
1108e1051a39Sopenharmony_ci	 eval(shift(@insns));
1109e1051a39Sopenharmony_ci
1110e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1111e1051a39Sopenharmony_ci	 eval(shift(@insns));
1112e1051a39Sopenharmony_ci	 eval(shift(@insns));
	# Every fifth step ($Xi a multiple of 5) switches $Kx to the next
	# constant in the table at $K_XX_XX.
1113e1051a39Sopenharmony_ci	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1114e1051a39Sopenharmony_ci	 eval(shift(@insns));
1115e1051a39Sopenharmony_ci	 eval(shift(@insns));
1116e1051a39Sopenharmony_ci
1117e1051a39Sopenharmony_ci
1118e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions [if any]
1119e1051a39Sopenharmony_ci
1120e1051a39Sopenharmony_ci  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1121e1051a39Sopenharmony_ci}
1122e1051a39Sopenharmony_ci
# Xupdate_avx_32_79($body)
#
# One 4-round step of the AVX path for the later part of the message
# schedule.  $body is a code ref returning the instruction strings of one
# scalar round; it is called four times and the strings are eval'ed one by
# one, interleaved with the SIMD computation annotated inline as
#	"X[0]" = ("X[-32]" ^ "X[-16]" ^ "X[-28]" ^ "X[-6]") <<< 2
# The relative order of the eval calls and the SIMD calls is the
# instruction schedule and must not be changed.
#
# Side effects: increments $Xi and rotates @X by one position; may reload
# $Kx; the eval'ed strings update $j, @V and @T.
1123e1051a39Sopenharmony_cisub Xupdate_avx_32_79()
1124e1051a39Sopenharmony_ci{ use integer;
1125e1051a39Sopenharmony_ci  my $body = shift;
1126e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
1127e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1128e1051a39Sopenharmony_ci
1129e1051a39Sopenharmony_ci	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1130e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1131e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
1132e1051a39Sopenharmony_ci	 eval(shift(@insns));
1133e1051a39Sopenharmony_ci	 eval(shift(@insns));
1134e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
1135e1051a39Sopenharmony_ci
1136e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1137e1051a39Sopenharmony_ci	 eval(shift(@insns));
	# NOTE(review): the body_* subs visible in this file spell rotates as
	# '&$_rol'/'&$_ror', which /&ro[rl]/ cannot match, so this guard looks
	# always true -- confirm whether that is intended before touching it;
	# changing it would alter the emitted instruction order.
1138e1051a39Sopenharmony_ci	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	# @Tx[1] = $Kx + previous schedule vector (stored to the stack below);
	# every fifth step $Kx is then switched to the next table constant.
1139e1051a39Sopenharmony_ci	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1140e1051a39Sopenharmony_ci	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1141e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
1142e1051a39Sopenharmony_ci	 eval(shift(@insns));
1143e1051a39Sopenharmony_ci
1144e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1145e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
1146e1051a39Sopenharmony_ci	 eval(shift(@insns));
1147e1051a39Sopenharmony_ci	 eval(shift(@insns));
1148e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
1149e1051a39Sopenharmony_ci
	# Rotate left by 2: (x>>30) is kept in @Tx[0] and OR'ed into (x<<2).
1150e1051a39Sopenharmony_ci	&vpsrld	(@Tx[0],@X[0],30);
1151e1051a39Sopenharmony_ci	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1152e1051a39Sopenharmony_ci	 eval(shift(@insns));
1153e1051a39Sopenharmony_ci	 eval(shift(@insns));
1154e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
1155e1051a39Sopenharmony_ci	 eval(shift(@insns));
1156e1051a39Sopenharmony_ci
1157e1051a39Sopenharmony_ci	&vpslld	(@X[0],@X[0],2);
1158e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
1159e1051a39Sopenharmony_ci	 eval(shift(@insns));
1160e1051a39Sopenharmony_ci	 eval(shift(@insns));
1161e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
1162e1051a39Sopenharmony_ci	 eval(shift(@insns));
1163e1051a39Sopenharmony_ci	 eval(shift(@insns));
1164e1051a39Sopenharmony_ci	 eval(shift(@insns));		# ror
1165e1051a39Sopenharmony_ci	 eval(shift(@insns));
1166e1051a39Sopenharmony_ci
1167e1051a39Sopenharmony_ci	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1168e1051a39Sopenharmony_ci	 eval(shift(@insns));		# body_20_39
1169e1051a39Sopenharmony_ci	 eval(shift(@insns));
1170e1051a39Sopenharmony_ci	 eval(shift(@insns));
1171e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
1172e1051a39Sopenharmony_ci	 eval(shift(@insns));
1173e1051a39Sopenharmony_ci	 eval(shift(@insns));
1174e1051a39Sopenharmony_ci	 eval(shift(@insns));		# rol
1175e1051a39Sopenharmony_ci	 eval(shift(@insns));
1176e1051a39Sopenharmony_ci
1177e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions
1178e1051a39Sopenharmony_ci
1179e1051a39Sopenharmony_ci  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1180e1051a39Sopenharmony_ci}
1181e1051a39Sopenharmony_ci
# Xuplast_avx_80($body)
#
# Last schedule step of a block on the AVX path.  Runs four rounds from
# $body, stores the final X[]+K vector to the stack, then emits the
# end-of-input test (cmp $inp,$num / je $done_avx_label).  On the
# fall-through path it reloads the byte-swap mask and K_00_19, loads the
# next 64 input bytes, byte-swaps the first vector and advances $inp by 64.
#
# Side effects: resets $Xi to 0; the eval'ed round strings update $j, @V
# and @T.
1182e1051a39Sopenharmony_cisub Xuplast_avx_80()
1183e1051a39Sopenharmony_ci{ use integer;
1184e1051a39Sopenharmony_ci  my $body = shift;
1185e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
1186e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1187e1051a39Sopenharmony_ci
1188e1051a39Sopenharmony_ci	 eval(shift(@insns));
1189e1051a39Sopenharmony_ci	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1190e1051a39Sopenharmony_ci	 eval(shift(@insns));
1191e1051a39Sopenharmony_ci	 eval(shift(@insns));
1192e1051a39Sopenharmony_ci	 eval(shift(@insns));
1193e1051a39Sopenharmony_ci	 eval(shift(@insns));
1194e1051a39Sopenharmony_ci
1195e1051a39Sopenharmony_ci	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
1196e1051a39Sopenharmony_ci
1197e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }		# remaining instructions
1198e1051a39Sopenharmony_ci
	# $inp == $num: leave the loop for the "done" tail.  The label comes
	# from $done_avx_label rather than being hard-coded.
1199e1051a39Sopenharmony_ci	&cmp	($inp,$num);
1200e1051a39Sopenharmony_ci	&je	($done_avx_label);
1201e1051a39Sopenharmony_ci
1202e1051a39Sopenharmony_ci	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
1203e1051a39Sopenharmony_ci	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
1204e1051a39Sopenharmony_ci	&vmovdqu(@X[-4&7],"0($inp)");		# load input
1205e1051a39Sopenharmony_ci	&vmovdqu(@X[-3&7],"16($inp)");
1206e1051a39Sopenharmony_ci	&vmovdqu(@X[-2&7],"32($inp)");
1207e1051a39Sopenharmony_ci	&vmovdqu(@X[-1&7],"48($inp)");
1208e1051a39Sopenharmony_ci	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1209e1051a39Sopenharmony_ci	&add	($inp,64);
1210e1051a39Sopenharmony_ci
1211e1051a39Sopenharmony_ci  $Xi=0;
1212e1051a39Sopenharmony_ci}
1213e1051a39Sopenharmony_ci
# Xloop_avx($body)
#
# Loop-back step of the AVX path: four rounds from $body interleaved with
# the preparation of the next block's input.  Per call it byte-swaps vector
# X[($Xi-3)&7] with the mask in @X[2], writes X[($Xi-4)&7]+$Kx into the
# separate register X[$Xi&7] and stores that at 16*$Xi(%rsp).  Because the
# sum goes to a different register, the input vector itself is left
# untouched.
#
# Side effects: increments $Xi (no rotation of @X here); the eval'ed round
# strings update $j, @V and @T.
1214e1051a39Sopenharmony_cisub Xloop_avx()
1215e1051a39Sopenharmony_ci{ use integer;
1216e1051a39Sopenharmony_ci  my $body = shift;
1217e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
1218e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1219e1051a39Sopenharmony_ci
1220e1051a39Sopenharmony_ci	 eval(shift(@insns));
1221e1051a39Sopenharmony_ci	 eval(shift(@insns));
1222e1051a39Sopenharmony_ci	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1223e1051a39Sopenharmony_ci	 eval(shift(@insns));
1224e1051a39Sopenharmony_ci	 eval(shift(@insns));
1225e1051a39Sopenharmony_ci	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
1226e1051a39Sopenharmony_ci	 eval(shift(@insns));
1227e1051a39Sopenharmony_ci	 eval(shift(@insns));
1228e1051a39Sopenharmony_ci	 eval(shift(@insns));
1229e1051a39Sopenharmony_ci	 eval(shift(@insns));
1230e1051a39Sopenharmony_ci	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
1231e1051a39Sopenharmony_ci	 eval(shift(@insns));
1232e1051a39Sopenharmony_ci	 eval(shift(@insns));
1233e1051a39Sopenharmony_ci
1234e1051a39Sopenharmony_ci	foreach (@insns) { eval; }
1235e1051a39Sopenharmony_ci  $Xi++;
1236e1051a39Sopenharmony_ci}
1237e1051a39Sopenharmony_ci
# Xtail_avx($body)
#
# Tail step of the AVX path: four rounds from $body with no SIMD work
# interleaved.  The instruction strings are eval'ed strictly in order.
# Only the eval'ed strings have side effects ($j, @V, @T).
1238e1051a39Sopenharmony_cisub Xtail_avx()
1239e1051a39Sopenharmony_ci{ use integer;
1240e1051a39Sopenharmony_ci  my $body = shift;
1241e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
	# Declared for the eval'ed strings, which assign ($a..$e)=@V by name.
1242e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1243e1051a39Sopenharmony_ci
1244e1051a39Sopenharmony_ci	foreach (@insns) { eval; }
1245e1051a39Sopenharmony_ci}
1246e1051a39Sopenharmony_ci
# ---- AVX main loop ----------------------------------------------------------
# Generates the body of sha1_block_data_order_avx.  Each X* call below emits
# four rounds.
1247e1051a39Sopenharmony_ci$code.=<<___;
1248e1051a39Sopenharmony_ci.align	16
1249e1051a39Sopenharmony_ci.Loop_avx:
1250e1051a39Sopenharmony_ci___
# 17 schedule steps = 68 rounds.  The body_* sub passed in selects the round
# type; body_00_19 and body_20_39 themselves forward to the next type once
# $rx reaches 19 resp. 39, which is why the fifth and tenth call still pass
# the "previous" body.
1251e1051a39Sopenharmony_ci	&Xupdate_avx_16_31(\&body_00_19);
1252e1051a39Sopenharmony_ci	&Xupdate_avx_16_31(\&body_00_19);
1253e1051a39Sopenharmony_ci	&Xupdate_avx_16_31(\&body_00_19);
1254e1051a39Sopenharmony_ci	&Xupdate_avx_16_31(\&body_00_19);
1255e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_00_19);
1256e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_20_39);
1257e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_20_39);
1258e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_20_39);
1259e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_20_39);
1260e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_20_39);
1261e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_40_59);
1262e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_40_59);
1263e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_40_59);
1264e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_40_59);
1265e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_40_59);
1266e1051a39Sopenharmony_ci	&Xupdate_avx_32_79(\&body_20_39);
1267e1051a39Sopenharmony_ci	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1268e1051a39Sopenharmony_ci
# The last 12 rounds are generated twice (loop-back path and "done" path).
# Snapshot the generator state so the second copy starts from the same
# round number and register assignment.
1269e1051a39Sopenharmony_ci				$saved_j=$j; @saved_V=@V;
1270e1051a39Sopenharmony_ci
# Loop-back path: final 12 rounds interleaved with next-block input prep.
1271e1051a39Sopenharmony_ci	&Xloop_avx(\&body_20_39);
1272e1051a39Sopenharmony_ci	&Xloop_avx(\&body_20_39);
1273e1051a39Sopenharmony_ci	&Xloop_avx(\&body_20_39);
1274e1051a39Sopenharmony_ci
# Fold the working variables into the context, re-derive the values the
# first round expects in @T[0]/@T[1] ("magic seed") and branch back to
# .Loop_avx.  The label interpolated from $done_avx_label opens the "done"
# path.
1275e1051a39Sopenharmony_ci$code.=<<___;
1276e1051a39Sopenharmony_ci	add	0($ctx),$A			# update context
1277e1051a39Sopenharmony_ci	add	4($ctx),@T[0]
1278e1051a39Sopenharmony_ci	add	8($ctx),$C
1279e1051a39Sopenharmony_ci	add	12($ctx),$D
1280e1051a39Sopenharmony_ci	mov	$A,0($ctx)
1281e1051a39Sopenharmony_ci	add	16($ctx),$E
1282e1051a39Sopenharmony_ci	mov	@T[0],4($ctx)
1283e1051a39Sopenharmony_ci	mov	@T[0],$B			# magic seed
1284e1051a39Sopenharmony_ci	mov	$C,8($ctx)
1285e1051a39Sopenharmony_ci	mov	$C,@T[1]
1286e1051a39Sopenharmony_ci	mov	$D,12($ctx)
1287e1051a39Sopenharmony_ci	xor	$D,@T[1]
1288e1051a39Sopenharmony_ci	mov	$E,16($ctx)
1289e1051a39Sopenharmony_ci	and	@T[1],@T[0]
1290e1051a39Sopenharmony_ci	jmp	.Loop_avx
1291e1051a39Sopenharmony_ci
1292e1051a39Sopenharmony_ci.align	16
1293e1051a39Sopenharmony_ci$done_avx_label:
1294e1051a39Sopenharmony_ci___
# "Done" path: rewind the generator to the snapshot taken after
# Xuplast_avx_80 and emit the same 12 rounds without any SIMD work.
1295e1051a39Sopenharmony_ci				$j=$saved_j; @V=@saved_V;
1296e1051a39Sopenharmony_ci
1297e1051a39Sopenharmony_ci	&Xtail_avx(\&body_20_39);
1298e1051a39Sopenharmony_ci	&Xtail_avx(\&body_20_39);
1299e1051a39Sopenharmony_ci	&Xtail_avx(\&body_20_39);
1300e1051a39Sopenharmony_ci
1301e1051a39Sopenharmony_ci$code.=<<___;
1302e1051a39Sopenharmony_ci	vzeroupper
1303e1051a39Sopenharmony_ci
1304e1051a39Sopenharmony_ci	add	0($ctx),$A			# update context
1305e1051a39Sopenharmony_ci	add	4($ctx),@T[0]
1306e1051a39Sopenharmony_ci	add	8($ctx),$C
1307e1051a39Sopenharmony_ci	mov	$A,0($ctx)
1308e1051a39Sopenharmony_ci	add	12($ctx),$D
1309e1051a39Sopenharmony_ci	mov	@T[0],4($ctx)
1310e1051a39Sopenharmony_ci	add	16($ctx),$E
1311e1051a39Sopenharmony_ci	mov	$C,8($ctx)
1312e1051a39Sopenharmony_ci	mov	$D,12($ctx)
1313e1051a39Sopenharmony_ci	mov	$E,16($ctx)
1314e1051a39Sopenharmony_ci___
1315e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1316e1051a39Sopenharmony_ci	movaps	-40-6*16($fp),%xmm6
1317e1051a39Sopenharmony_ci	movaps	-40-5*16($fp),%xmm7
1318e1051a39Sopenharmony_ci	movaps	-40-4*16($fp),%xmm8
1319e1051a39Sopenharmony_ci	movaps	-40-3*16($fp),%xmm9
1320e1051a39Sopenharmony_ci	movaps	-40-2*16($fp),%xmm10
1321e1051a39Sopenharmony_ci	movaps	-40-1*16($fp),%xmm11
1322e1051a39Sopenharmony_ci___
1323e1051a39Sopenharmony_ci$code.=<<___;
1324e1051a39Sopenharmony_ci	mov	-40($fp),%r14
1325e1051a39Sopenharmony_ci.cfi_restore	%r14
1326e1051a39Sopenharmony_ci	mov	-32($fp),%r13
1327e1051a39Sopenharmony_ci.cfi_restore	%r13
1328e1051a39Sopenharmony_ci	mov	-24($fp),%r12
1329e1051a39Sopenharmony_ci.cfi_restore	%r12
1330e1051a39Sopenharmony_ci	mov	-16($fp),%rbp
1331e1051a39Sopenharmony_ci.cfi_restore	%rbp
1332e1051a39Sopenharmony_ci	mov	-8($fp),%rbx
1333e1051a39Sopenharmony_ci.cfi_restore	%rbx
1334e1051a39Sopenharmony_ci	lea	($fp),%rsp
1335e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1336e1051a39Sopenharmony_ci.Lepilogue_avx:
1337e1051a39Sopenharmony_ci	ret
1338e1051a39Sopenharmony_ci.cfi_endproc
1339e1051a39Sopenharmony_ci.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1340e1051a39Sopenharmony_ci___
1341e1051a39Sopenharmony_ci
# AVX2 code path; everything up to the matching closing brace is generated
# only when $avx is greater than 1.
1342e1051a39Sopenharmony_ciif ($avx>1) {
1343e1051a39Sopenharmony_ciuse integer;
# Generator state is re-initialised for this path: @X, @Tx and $Kx now
# name %ymm registers, $Xi restarts at 4 and the round counter $j at 0.
1344e1051a39Sopenharmony_ci$Xi=4;					# reset variables
1345e1051a39Sopenharmony_ci@X=map("%ymm$_",(4..7,0..3));
1346e1051a39Sopenharmony_ci@Tx=map("%ymm$_",(8..10));
1347e1051a39Sopenharmony_ci$Kx="%ymm11";
1348e1051a39Sopenharmony_ci$j=0;
1349e1051a39Sopenharmony_ci
# @ROTX is the set of six scalar registers that the bodyx_* snippets unpack
# as ($a,$f,$b,$c,$d,$e) and rotate by one position per round.
# $a5 and $t0 are scratch registers used inside a round.
1350e1051a39Sopenharmony_cimy @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
1351e1051a39Sopenharmony_cimy ($a5,$t0)=("%r12d","%edi");
1352e1051a39Sopenharmony_ci
# Fixed names for the initial register assignment; the hand-written
# assembly in the heredocs uses these rather than @ROTX positions.
1353e1051a39Sopenharmony_cimy ($A,$F,$B,$C,$D,$E)=@ROTX;
# $rx counts bodyx_* calls and decides when one generator hands over to
# the next.  $frame first points at the block feeding the high lanes and
# later serves as the base register for the X[]+K loads.
1354e1051a39Sopenharmony_cimy $rx=0;
1355e1051a39Sopenharmony_cimy $frame="%r13";
1356e1051a39Sopenharmony_ci
# Prologue: keep the incoming %rsp in $fp and push the callee-saved
# registers that the function uses.
1357e1051a39Sopenharmony_ci$code.=<<___;
1358e1051a39Sopenharmony_ci.type	sha1_block_data_order_avx2,\@function,3
1359e1051a39Sopenharmony_ci.align	16
1360e1051a39Sopenharmony_cisha1_block_data_order_avx2:
1361e1051a39Sopenharmony_ci_avx2_shortcut:
1362e1051a39Sopenharmony_ci.cfi_startproc
1363e1051a39Sopenharmony_ci	mov	%rsp,$fp
1364e1051a39Sopenharmony_ci.cfi_def_cfa_register	$fp
1365e1051a39Sopenharmony_ci	push	%rbx
1366e1051a39Sopenharmony_ci.cfi_push	%rbx
1367e1051a39Sopenharmony_ci	push	%rbp
1368e1051a39Sopenharmony_ci.cfi_push	%rbp
1369e1051a39Sopenharmony_ci	push	%r12
1370e1051a39Sopenharmony_ci.cfi_push	%r12
1371e1051a39Sopenharmony_ci	push	%r13
1372e1051a39Sopenharmony_ci.cfi_push	%r13
1373e1051a39Sopenharmony_ci	push	%r14
1374e1051a39Sopenharmony_ci.cfi_push	%r14
1375e1051a39Sopenharmony_ci	vzeroupper
1376e1051a39Sopenharmony_ci___
# Win64 builds only: reserve six 16-byte slots and save %xmm6..%xmm11 in
# them, below the five pushed registers.
1377e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1378e1051a39Sopenharmony_ci	lea	-6*16(%rsp),%rsp
1379e1051a39Sopenharmony_ci	vmovaps	%xmm6,-40-6*16($fp)
1380e1051a39Sopenharmony_ci	vmovaps	%xmm7,-40-5*16($fp)
1381e1051a39Sopenharmony_ci	vmovaps	%xmm8,-40-4*16($fp)
1382e1051a39Sopenharmony_ci	vmovaps	%xmm9,-40-3*16($fp)
1383e1051a39Sopenharmony_ci	vmovaps	%xmm10,-40-2*16($fp)
1384e1051a39Sopenharmony_ci	vmovaps	%xmm11,-40-1*16($fp)
1385e1051a39Sopenharmony_ci.Lprologue_avx2:
1386e1051a39Sopenharmony_ci___
# Function setup:
#  - copy the three arguments, turn $num (block count) into an end pointer
#    ($inp + 64*$num) and carve out a 640-byte frame aligned to 128 bytes;
#  - $frame = $inp+64, or $inp itself when that would reach the end, so
#    the high lanes always have a readable block behind them;
#  - load the hash state into $A,$F,$C,$D,$E;
#  - load four X[] vectors: low 128-bit lane from the block at $inp, high
#    lane from the block at $frame, byte-swapped with the mask in @X[2];
#  - add K_00_19 and store the sums at 0, 32, 64 and 96(%rsp).
1387e1051a39Sopenharmony_ci$code.=<<___;
1388e1051a39Sopenharmony_ci	mov	%rdi,$ctx		# reassigned argument
1389e1051a39Sopenharmony_ci	mov	%rsi,$inp		# reassigned argument
1390e1051a39Sopenharmony_ci	mov	%rdx,$num		# reassigned argument
1391e1051a39Sopenharmony_ci
1392e1051a39Sopenharmony_ci	lea	-640(%rsp),%rsp
1393e1051a39Sopenharmony_ci	shl	\$6,$num
1394e1051a39Sopenharmony_ci	 lea	64($inp),$frame
1395e1051a39Sopenharmony_ci	and	\$-128,%rsp
1396e1051a39Sopenharmony_ci	add	$inp,$num
1397e1051a39Sopenharmony_ci	lea	K_XX_XX+64(%rip),$K_XX_XX
1398e1051a39Sopenharmony_ci
1399e1051a39Sopenharmony_ci	mov	0($ctx),$A		# load context
1400e1051a39Sopenharmony_ci	 cmp	$num,$frame
1401e1051a39Sopenharmony_ci	 cmovae	$inp,$frame		# next or same block
1402e1051a39Sopenharmony_ci	mov	4($ctx),$F
1403e1051a39Sopenharmony_ci	mov	8($ctx),$C
1404e1051a39Sopenharmony_ci	mov	12($ctx),$D
1405e1051a39Sopenharmony_ci	mov	16($ctx),$E
1406e1051a39Sopenharmony_ci	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
1407e1051a39Sopenharmony_ci
1408e1051a39Sopenharmony_ci	vmovdqu		($inp),%xmm0
1409e1051a39Sopenharmony_ci	vmovdqu		16($inp),%xmm1
1410e1051a39Sopenharmony_ci	vmovdqu		32($inp),%xmm2
1411e1051a39Sopenharmony_ci	vmovdqu		48($inp),%xmm3
1412e1051a39Sopenharmony_ci	lea		64($inp),$inp
1413e1051a39Sopenharmony_ci	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
1414e1051a39Sopenharmony_ci	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1415e1051a39Sopenharmony_ci	vpshufb		@X[2],@X[-4&7],@X[-4&7]
1416e1051a39Sopenharmony_ci	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1417e1051a39Sopenharmony_ci	vpshufb		@X[2],@X[-3&7],@X[-3&7]
1418e1051a39Sopenharmony_ci	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1419e1051a39Sopenharmony_ci	vpshufb		@X[2],@X[-2&7],@X[-2&7]
1420e1051a39Sopenharmony_ci	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
1421e1051a39Sopenharmony_ci	vpshufb		@X[2],@X[-1&7],@X[-1&7]
1422e1051a39Sopenharmony_ci
1423e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
1424e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-3&7],@X[1]
1425e1051a39Sopenharmony_ci	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
1426e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-2&7],@X[2]
1427e1051a39Sopenharmony_ci	vmovdqu	@X[1],32(%rsp)
1428e1051a39Sopenharmony_ci	vpaddd	$Kx,@X[-1&7],@X[3]
1429e1051a39Sopenharmony_ci	vmovdqu	@X[2],64(%rsp)
1430e1051a39Sopenharmony_ci	vmovdqu	@X[3],96(%rsp)
1431e1051a39Sopenharmony_ci___
# Same vector sequence as sub Xupdate_avx2_16_31, emitted here for
# $Xi = 4..7 as straight-line code without scalar rounds in between.
# $Kx is reloaded whenever $Xi is a multiple of 5 (here: at $Xi == 5), and
# @X is rotated after every vector so that @X[0] is always the target.
1432e1051a39Sopenharmony_cifor (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
1433e1051a39Sopenharmony_ci    use integer;
1434e1051a39Sopenharmony_ci
1435e1051a39Sopenharmony_ci	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1436e1051a39Sopenharmony_ci	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1437e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1438e1051a39Sopenharmony_ci	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1439e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1440e1051a39Sopenharmony_ci	&vpsrld	(@Tx[0],@X[0],31);
1441e1051a39Sopenharmony_ci	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1442e1051a39Sopenharmony_ci	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1443e1051a39Sopenharmony_ci	&vpaddd	(@X[0],@X[0],@X[0]);
1444e1051a39Sopenharmony_ci	&vpsrld	(@Tx[1],@Tx[2],30);
1445e1051a39Sopenharmony_ci	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1446e1051a39Sopenharmony_ci	&vpslld	(@Tx[2],@Tx[2],2);
1447e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[1]);
1448e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1449e1051a39Sopenharmony_ci	&vpaddd	(@Tx[1],@X[0],$Kx);
1450e1051a39Sopenharmony_ci	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1451e1051a39Sopenharmony_ci
1452e1051a39Sopenharmony_ci	push(@X,shift(@X));	# "rotate" X[]
1453e1051a39Sopenharmony_ci}
# Loop head.  $frame is set 128 bytes into the X[]+K area because the round
# snippets address it with offsets biased by -128.  The four instructions
# after the label build what bodyx_00_19 expects on entry:
# $B = b>>>2 and $F = (b&c)^(~b&d), with b taken from $F.
1454e1051a39Sopenharmony_ci$code.=<<___;
1455e1051a39Sopenharmony_ci	lea	128(%rsp),$frame
1456e1051a39Sopenharmony_ci	jmp	.Loop_avx2
1457e1051a39Sopenharmony_ci.align	32
1458e1051a39Sopenharmony_ci.Loop_avx2:
1459e1051a39Sopenharmony_ci	rorx	\$2,$F,$B
1460e1051a39Sopenharmony_ci	andn	$D,$F,$t0
1461e1051a39Sopenharmony_ci	and	$C,$F
1462e1051a39Sopenharmony_ci	xor	$t0,$F
1463e1051a39Sopenharmony_ci___
# bodyx_00_19: generator for one scalar round of the 00..19 group (AVX2
# path).
#
# Returns a list of eight Perl snippets (strings).  Callers eval them one
# at a time, typically with vector instructions emitted in between, so the
# split into list elements (comma) versus joined text (dot) decides what
# can be interleaved.
#
# Calling the sub increments $rx.  Once $rx has reached 19 the call is
# forwarded to bodyx_20_39: the snippets below prepare (b&c)^(~b&d) for the
# following round, whereas the round after that one needs b^c^d.
#
# First snippet: unpack @ROTX and add the X[i]+K dword of round $j.  Its
# offset selects dword $j%4 of the 32-byte slot $j/4, wrapped to a 256-byte
# window and biased by -128 relative to $frame; $frame itself is advanced
# by 256 after every 32nd round.  The division relies on "use integer"
# being in effect where the snippet is eval'ed.
# Last snippet: rotate @ROTX by one position and increment $j.
1464e1051a39Sopenharmony_cisub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
1465e1051a39Sopenharmony_ci	# at start $f=(b&c)^(~b&d), $b>>>=2
1466e1051a39Sopenharmony_ci	return &bodyx_20_39() if ($rx==19); $rx++;
1467e1051a39Sopenharmony_ci	(
1468e1051a39Sopenharmony_ci	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1469e1051a39Sopenharmony_ci
1470e1051a39Sopenharmony_ci	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1471e1051a39Sopenharmony_ci	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1472e1051a39Sopenharmony_ci	'&andn	($t0,$a,$c)',			# ~b&d for next round
1473e1051a39Sopenharmony_ci
1474e1051a39Sopenharmony_ci	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
1475e1051a39Sopenharmony_ci	'&rorx	($a5,$a,27)',			# a<<<5
1476e1051a39Sopenharmony_ci	'&rorx	($f,$a,2)',			# b>>>2 for next round
1477e1051a39Sopenharmony_ci	'&and	($a,$b)',			# b&c for next round
1478e1051a39Sopenharmony_ci
1479e1051a39Sopenharmony_ci	'&add	($e,$a5)',			# e+=a<<<5
1480e1051a39Sopenharmony_ci	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
1481e1051a39Sopenharmony_ci
1482e1051a39Sopenharmony_ci	'unshift(@ROTX,pop(@ROTX)); $j++;'
1483e1051a39Sopenharmony_ci	)
1484e1051a39Sopenharmony_ci}
1485e1051a39Sopenharmony_ci
# bodyx_20_39: generator for one scalar round that uses f=b^c^d (AVX2
# path).  Besides rounds 20..39 it is also passed for the last rounds of a
# block, and bodyx_00_19 forwards its call here once $rx is 19.
#
# Returns a list of seven Perl snippets (strings) that callers eval one at
# a time.  Calling the sub increments $rx; once $rx has reached 39 the
# call is forwarded to bodyx_40_59 instead.
#
# The first snippet unpacks @ROTX and adds the X[i]+K dword of round $j
# (offset relative to $frame, see the expression below); the last one
# rotates @ROTX and increments $j.  The three snippets guarded by $j<79
# only prepare values for the following round, so they emit nothing in
# round 79.
1486e1051a39Sopenharmony_cisub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
1487e1051a39Sopenharmony_ci	# on entry $f=b^c^d, $b>>>=2
1488e1051a39Sopenharmony_ci	return &bodyx_40_59() if ($rx==39); $rx++;
1489e1051a39Sopenharmony_ci	(
1490e1051a39Sopenharmony_ci	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1491e1051a39Sopenharmony_ci
1492e1051a39Sopenharmony_ci	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1493e1051a39Sopenharmony_ci	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1494e1051a39Sopenharmony_ci
1495e1051a39Sopenharmony_ci	'&lea	($e,"($e,$f)")',		# e+=b^c^d
1496e1051a39Sopenharmony_ci	'&rorx	($a5,$a,27)',			# a<<<5
1497e1051a39Sopenharmony_ci	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
1498e1051a39Sopenharmony_ci	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
1499e1051a39Sopenharmony_ci
1500e1051a39Sopenharmony_ci	'&add	($e,$a5)',			# e+=a<<<5
1501e1051a39Sopenharmony_ci	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
1502e1051a39Sopenharmony_ci
1503e1051a39Sopenharmony_ci	'unshift(@ROTX,pop(@ROTX)); $j++;'
1504e1051a39Sopenharmony_ci	)
1505e1051a39Sopenharmony_ci}
1506e1051a39Sopenharmony_ci
# bodyx_40_59: generator for one scalar round of the 40..59 group (AVX2
# path).  bodyx_20_39 forwards its call here once $rx is 39, so the
# snippets also have to cope with $j == 39.
#
# Returns a list of ten Perl snippets (strings) that callers eval one at a
# time; calling the sub increments $rx unconditionally.
#
# Conditions on $j, all evaluated when the snippet is eval'ed:
#   $j > 39   xor $c into $f to finish (b^c)&(c^d)^c; skipped at $j == 39,
#             where $f still holds the b^c^d value of the previous group
#   $j < 59   compute c^d in $t0 and and it in, i.e. prepare (b^c)&(c^d)
#             for the following round
#   $j == 59  prepare b^c^d instead, as the following round needs that
# The first snippet unpacks @ROTX and adds the X[i]+K dword of round $j;
# the last one rotates @ROTX and increments $j.
1507e1051a39Sopenharmony_cisub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
1508e1051a39Sopenharmony_ci	# on entry $f=((b^c)&(c^d)), $b>>>=2
1509e1051a39Sopenharmony_ci	$rx++;
1510e1051a39Sopenharmony_ci	(
1511e1051a39Sopenharmony_ci	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1512e1051a39Sopenharmony_ci
1513e1051a39Sopenharmony_ci	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1514e1051a39Sopenharmony_ci	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1515e1051a39Sopenharmony_ci	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
1516e1051a39Sopenharmony_ci	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
1517e1051a39Sopenharmony_ci	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
1518e1051a39Sopenharmony_ci
1519e1051a39Sopenharmony_ci	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
1520e1051a39Sopenharmony_ci	'&rorx	($a5,$a,27)',			# a<<<5
1521e1051a39Sopenharmony_ci	'&rorx	($f,$a,2)',			# b>>>2 in next round
1522e1051a39Sopenharmony_ci	'&xor	($a,$b)',			# b^c for next round
1523e1051a39Sopenharmony_ci
1524e1051a39Sopenharmony_ci	'&add	($e,$a5)',			# e+=a<<<5
1525e1051a39Sopenharmony_ci	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
1526e1051a39Sopenharmony_ci	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
1527e1051a39Sopenharmony_ci
1528e1051a39Sopenharmony_ci	'unshift(@ROTX,pop(@ROTX)); $j++;'
1529e1051a39Sopenharmony_ci	)
1530e1051a39Sopenharmony_ci}
1531e1051a39Sopenharmony_ci
# Xupdate_avx2_16_31(\&body): compute one more X[] vector with the
# rotate-by-one form of the message schedule and interleave that vector
# code with five rounds of scalar code.
#
# $body is a round generator (one of the bodyx_* subs); it is called five
# times up front and the snippets it returns are eval'ed a few at a time
# between the vector instructions.  Whatever is left over is eval'ed at
# the end.  The position of every eval is the hand-made instruction
# schedule, so the order of statements here must not be changed.
#
# Vector side: @X[0] is built from the four preceding vectors, including
# the fix-up for the dword that depends on a value produced in the same
# vector; then X[]+K is stored at 32*$Xi(%rsp).  $Kx is reloaded
# from $K_XX_XX whenever $Xi is a multiple of 5.
# On exit $Xi is incremented and @X rotated so the next call targets the
# oldest register.
1532e1051a39Sopenharmony_cisub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
1533e1051a39Sopenharmony_ci{ use integer;
1534e1051a39Sopenharmony_ci  my $body = shift;
1535e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 snippets
  # lexicals for the eval'ed snippets, which assign to them by name
1536e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1537e1051a39Sopenharmony_ci
1538e1051a39Sopenharmony_ci	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1539e1051a39Sopenharmony_ci	 eval(shift(@insns));
1540e1051a39Sopenharmony_ci	 eval(shift(@insns));
1541e1051a39Sopenharmony_ci	 eval(shift(@insns));
1542e1051a39Sopenharmony_ci	 eval(shift(@insns));
1543e1051a39Sopenharmony_ci
1544e1051a39Sopenharmony_ci	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1545e1051a39Sopenharmony_ci	 eval(shift(@insns));
1546e1051a39Sopenharmony_ci	 eval(shift(@insns));
1547e1051a39Sopenharmony_ci	 eval(shift(@insns));
1548e1051a39Sopenharmony_ci
1549e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1550e1051a39Sopenharmony_ci	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1551e1051a39Sopenharmony_ci	 eval(shift(@insns));
1552e1051a39Sopenharmony_ci	 eval(shift(@insns));
1553e1051a39Sopenharmony_ci
1554e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1555e1051a39Sopenharmony_ci	 eval(shift(@insns));
1556e1051a39Sopenharmony_ci	 eval(shift(@insns));
1557e1051a39Sopenharmony_ci	 eval(shift(@insns));
1558e1051a39Sopenharmony_ci	 eval(shift(@insns));
1559e1051a39Sopenharmony_ci
1560e1051a39Sopenharmony_ci	&vpsrld	(@Tx[0],@X[0],31);
1561e1051a39Sopenharmony_ci	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1562e1051a39Sopenharmony_ci	 eval(shift(@insns));
1563e1051a39Sopenharmony_ci	 eval(shift(@insns));
1564e1051a39Sopenharmony_ci	 eval(shift(@insns));
1565e1051a39Sopenharmony_ci
1566e1051a39Sopenharmony_ci	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1567e1051a39Sopenharmony_ci	&vpaddd	(@X[0],@X[0],@X[0]);
1568e1051a39Sopenharmony_ci	 eval(shift(@insns));
1569e1051a39Sopenharmony_ci	 eval(shift(@insns));
1570e1051a39Sopenharmony_ci
1571e1051a39Sopenharmony_ci	&vpsrld	(@Tx[1],@Tx[2],30);
1572e1051a39Sopenharmony_ci	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1573e1051a39Sopenharmony_ci	 eval(shift(@insns));
1574e1051a39Sopenharmony_ci	 eval(shift(@insns));
1575e1051a39Sopenharmony_ci
1576e1051a39Sopenharmony_ci	&vpslld	(@Tx[2],@Tx[2],2);
1577e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[1]);
1578e1051a39Sopenharmony_ci	 eval(shift(@insns));
1579e1051a39Sopenharmony_ci	 eval(shift(@insns));
1580e1051a39Sopenharmony_ci
1581e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1582e1051a39Sopenharmony_ci	 eval(shift(@insns));
1583e1051a39Sopenharmony_ci	 eval(shift(@insns));
1584e1051a39Sopenharmony_ci	 eval(shift(@insns));
1585e1051a39Sopenharmony_ci
1586e1051a39Sopenharmony_ci	&vpaddd	(@Tx[1],@X[0],$Kx);
1587e1051a39Sopenharmony_ci	 eval(shift(@insns));
1588e1051a39Sopenharmony_ci	 eval(shift(@insns));
1589e1051a39Sopenharmony_ci	 eval(shift(@insns));
1590e1051a39Sopenharmony_ci	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1591e1051a39Sopenharmony_ci
1592e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions [if any]
1593e1051a39Sopenharmony_ci
1594e1051a39Sopenharmony_ci	$Xi++;
1595e1051a39Sopenharmony_ci	push(@X,shift(@X));	# "rotate" X[]
1596e1051a39Sopenharmony_ci}
1597e1051a39Sopenharmony_ci
# Xupdate_avx2_32_79(\&body): compute one more X[] vector with the
# rotate-by-two form of the message schedule and interleave that vector
# code with five rounds of scalar code.
#
# $body is a round generator (one of the bodyx_* subs); it is called five
# times up front and the returned snippets are eval'ed a few at a time
# between the vector instructions, the rest at the end.  The position of
# every eval is the hand-made instruction schedule, so the order of
# statements here must not be changed.
#
# Vector side: @X[0], which still holds the vector from eight steps back
# ("X[-32]"), is xor-ed with "X[-16]", "X[-28]" and "X[-6]" and rotated
# left by 2; X[]+K is then stored in the stack slot for $Xi.  The store
# address is built by plain string interpolation (for example
# 32*8(%rsp)), i.e. the product is not computed in Perl here.  $Kx is
# reloaded from $K_XX_XX whenever $Xi is a multiple of 5.
# On exit $Xi is incremented and @X rotated.
1598e1051a39Sopenharmony_cisub Xupdate_avx2_32_79()
1599e1051a39Sopenharmony_ci{ use integer;
1600e1051a39Sopenharmony_ci  my $body = shift;
1601e1051a39Sopenharmony_ci  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  # lexicals for the eval'ed snippets, which assign to them by name
1602e1051a39Sopenharmony_ci  my ($a,$b,$c,$d,$e);
1603e1051a39Sopenharmony_ci
1604e1051a39Sopenharmony_ci	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1605e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1606e1051a39Sopenharmony_ci	 eval(shift(@insns));
1607e1051a39Sopenharmony_ci	 eval(shift(@insns));
1608e1051a39Sopenharmony_ci
1609e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1610e1051a39Sopenharmony_ci	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1611e1051a39Sopenharmony_ci	 eval(shift(@insns));
1612e1051a39Sopenharmony_ci	 eval(shift(@insns));
1613e1051a39Sopenharmony_ci	 eval(shift(@insns));
1614e1051a39Sopenharmony_ci
1615e1051a39Sopenharmony_ci	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1616e1051a39Sopenharmony_ci	 eval(shift(@insns));
1617e1051a39Sopenharmony_ci	 eval(shift(@insns));
1618e1051a39Sopenharmony_ci	 eval(shift(@insns));
1619e1051a39Sopenharmony_ci
1620e1051a39Sopenharmony_ci	&vpsrld	(@Tx[0],@X[0],30);
1621e1051a39Sopenharmony_ci	&vpslld	(@X[0],@X[0],2);
1622e1051a39Sopenharmony_ci	 eval(shift(@insns));
1623e1051a39Sopenharmony_ci	 eval(shift(@insns));
1624e1051a39Sopenharmony_ci	 eval(shift(@insns));
1625e1051a39Sopenharmony_ci
1626e1051a39Sopenharmony_ci	#&vpslld	(@X[0],@X[0],2);
1627e1051a39Sopenharmony_ci	 eval(shift(@insns));
1628e1051a39Sopenharmony_ci	 eval(shift(@insns));
1629e1051a39Sopenharmony_ci	 eval(shift(@insns));
1630e1051a39Sopenharmony_ci
1631e1051a39Sopenharmony_ci	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1632e1051a39Sopenharmony_ci	 eval(shift(@insns));
1633e1051a39Sopenharmony_ci	 eval(shift(@insns));
1634e1051a39Sopenharmony_ci	 eval(shift(@insns));
1635e1051a39Sopenharmony_ci	 eval(shift(@insns));
1636e1051a39Sopenharmony_ci
1637e1051a39Sopenharmony_ci	&vpaddd	(@Tx[1],@X[0],$Kx);
1638e1051a39Sopenharmony_ci	 eval(shift(@insns));
1639e1051a39Sopenharmony_ci	 eval(shift(@insns));
1640e1051a39Sopenharmony_ci	 eval(shift(@insns));
1641e1051a39Sopenharmony_ci	 eval(shift(@insns));
1642e1051a39Sopenharmony_ci
1643e1051a39Sopenharmony_ci	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1644e1051a39Sopenharmony_ci
1645e1051a39Sopenharmony_ci	 foreach (@insns) { eval; }	# remaining instructions
1646e1051a39Sopenharmony_ci
1647e1051a39Sopenharmony_ci	$Xi++;
1648e1051a39Sopenharmony_ci	push(@X,shift(@X));	# "rotate" X[]
1649e1051a39Sopenharmony_ci}
1650e1051a39Sopenharmony_ci
# Xloop_avx2(\&body): emit five rounds of scalar code with no vector
# instructions of its own.
#
# $body is a reference to a round generator (one of the bodyx_* subs).  It
# is invoked five times in a row; with the generators in this file every
# call returns 7, 8 or 10 snippets, i.e. 35 to 50 in total.  All snippets
# are then eval'ed in the order in which they were returned.
sub Xloop_avx2()
{ use integer;
  my $body = shift;
  # The snippets are string-eval'ed in this scope, so these lexicals have
  # to be declared here under exactly these names.
  my ($a,$b,$c,$d,$e);
  my @insns;

	 # One generator call per round: the bodyx_* subs advance $rx on
	 # every call, so a single call cannot be reused.  The counter is
	 # scoped to the loop and is not visible to the eval'ed snippets.
	 for (my $round=0; $round<5; $round++) {
		push(@insns,&$body);
	 }

	 eval for (@insns);
}
1659e1051a39Sopenharmony_ci
	# Body of .Loop_avx2, first block of the pair (the dwords in the low
	# 128-bit lanes): 12 vector updates with five scalar rounds each,
	# followed by 4 round-only groups, i.e. 16 x 5 = 80 rounds.  The
	# generator named in each call is only the starting point; the
	# bodyx_* subs hand over to the next round type on their own via $rx.
1660e1051a39Sopenharmony_ci	&align32();
1661e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_00_19);
1662e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_00_19);
1663e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_00_19);
1664e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_00_19);
1665e1051a39Sopenharmony_ci
1666e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_20_39);
1667e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_20_39);
1668e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_20_39);
1669e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_20_39);
1670e1051a39Sopenharmony_ci
1671e1051a39Sopenharmony_ci	&align32();
1672e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_40_59);
1673e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_40_59);
1674e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_40_59);
1675e1051a39Sopenharmony_ci	&Xupdate_avx2_32_79(\&bodyx_40_59);
1676e1051a39Sopenharmony_ci
1677e1051a39Sopenharmony_ci	&Xloop_avx2(\&bodyx_20_39);
1678e1051a39Sopenharmony_ci	&Xloop_avx2(\&bodyx_20_39);
1679e1051a39Sopenharmony_ci	&Xloop_avx2(\&bodyx_20_39);
1680e1051a39Sopenharmony_ci	&Xloop_avx2(\&bodyx_20_39);
1681e1051a39Sopenharmony_ci
# After the first block: $frame becomes $inp+128, or $inp when that would
# reach $num, and %rdi keeps $inp+128 for the check further down.  The
# working variables are added into the state at $ctx and moved back into
# the fixed registers.  Processing stops here if $inp equals $num.
1682e1051a39Sopenharmony_ci$code.=<<___;
1683e1051a39Sopenharmony_ci	lea	128($inp),$frame
1684e1051a39Sopenharmony_ci	lea	128($inp),%rdi			# borrow $t0
1685e1051a39Sopenharmony_ci	cmp	$num,$frame
1686e1051a39Sopenharmony_ci	cmovae	$inp,$frame			# next or previous block
1687e1051a39Sopenharmony_ci
1688e1051a39Sopenharmony_ci	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1689e1051a39Sopenharmony_ci	add	0($ctx),@ROTX[0]		# update context
1690e1051a39Sopenharmony_ci	add	4($ctx),@ROTX[1]
1691e1051a39Sopenharmony_ci	add	8($ctx),@ROTX[3]
1692e1051a39Sopenharmony_ci	mov	@ROTX[0],0($ctx)
1693e1051a39Sopenharmony_ci	add	12($ctx),@ROTX[4]
1694e1051a39Sopenharmony_ci	mov	@ROTX[1],4($ctx)
1695e1051a39Sopenharmony_ci	 mov	@ROTX[0],$A			# A=d
1696e1051a39Sopenharmony_ci	add	16($ctx),@ROTX[5]
1697e1051a39Sopenharmony_ci	 mov	@ROTX[3],$a5
1698e1051a39Sopenharmony_ci	mov	@ROTX[3],8($ctx)
1699e1051a39Sopenharmony_ci	 mov	@ROTX[4],$D			# D=b
1700e1051a39Sopenharmony_ci	 #xchg	@ROTX[5],$F			# F=c, C=f
1701e1051a39Sopenharmony_ci	mov	@ROTX[4],12($ctx)
1702e1051a39Sopenharmony_ci	 mov	@ROTX[1],$F			# F=e
1703e1051a39Sopenharmony_ci	mov	@ROTX[5],16($ctx)
1704e1051a39Sopenharmony_ci	#mov	$F,16($ctx)
1705e1051a39Sopenharmony_ci	 mov	@ROTX[5],$E			# E=c
1706e1051a39Sopenharmony_ci	 mov	$a5,$C				# C=f
1707e1051a39Sopenharmony_ci	 #xchg	$F,$E				# E=c, F=e
1708e1051a39Sopenharmony_ci
1709e1051a39Sopenharmony_ci	cmp	$num,$inp
1710e1051a39Sopenharmony_ci	je	.Ldone_avx2
1711e1051a39Sopenharmony_ci___
1712e1051a39Sopenharmony_ci
# The vector generator state starts over: the registers are about to be
# loaded with the next pair of blocks.
1713e1051a39Sopenharmony_ci$Xi=4;				# reset variables
1714e1051a39Sopenharmony_ci@X=map("%ymm$_",(4..7,0..3));
1715e1051a39Sopenharmony_ci
# Reload the byte-swap mask.  Unless %rdi lies beyond $num, load the next
# pair of blocks: low lanes from the 64 bytes below %rdi, high lanes from
# the block at $frame.  At .Last_avx2, $frame is pointed 16 bytes further
# into the X[]+K area than in the loop head, so the round snippets now read
# the high-lane dwords; the round-0 entry state is rebuilt and $inp is
# advanced by 128.
1716e1051a39Sopenharmony_ci$code.=<<___;
1717e1051a39Sopenharmony_ci	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
1718e1051a39Sopenharmony_ci	cmp	$num,%rdi			# borrowed $t0
1719e1051a39Sopenharmony_ci	ja	.Last_avx2
1720e1051a39Sopenharmony_ci
1721e1051a39Sopenharmony_ci	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
1722e1051a39Sopenharmony_ci	vmovdqu		-48(%rdi),%xmm1
1723e1051a39Sopenharmony_ci	vmovdqu		-32(%rdi),%xmm2
1724e1051a39Sopenharmony_ci	vmovdqu		-16(%rdi),%xmm3
1725e1051a39Sopenharmony_ci	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
1726e1051a39Sopenharmony_ci	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1727e1051a39Sopenharmony_ci	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1728e1051a39Sopenharmony_ci	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1729e1051a39Sopenharmony_ci	jmp	.Last_avx2
1730e1051a39Sopenharmony_ci
1731e1051a39Sopenharmony_ci.align	32
1732e1051a39Sopenharmony_ci.Last_avx2:
1733e1051a39Sopenharmony_ci	lea	128+16(%rsp),$frame
1734e1051a39Sopenharmony_ci	rorx	\$2,$F,$B
1735e1051a39Sopenharmony_ci	andn	$D,$F,$t0
1736e1051a39Sopenharmony_ci	and	$C,$F
1737e1051a39Sopenharmony_ci	xor	$t0,$F
1738e1051a39Sopenharmony_ci	sub	\$-128,$inp
1739e1051a39Sopenharmony_ci___
	# Round generator state starts over for the second block of the pair.
1740e1051a39Sopenharmony_ci	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
1741e1051a39Sopenharmony_ci
	# Second block: 80 rounds again (11 + 1 round-only groups and 4
	# groups inside Xupdate_avx2_16_31, five rounds each).  The vector
	# work for the pair loaded above is slotted in between the groups:
	# byte swap, K_00_19 addition, stores to the first four X[]+K slots,
	# then the Xupdate_avx2_16_31 vectors.
1742e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_00_19);
1743e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_00_19);
1744e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_00_19);
1745e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_00_19);
1746e1051a39Sopenharmony_ci
1747e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_20_39);
1748e1051a39Sopenharmony_ci	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
1749e1051a39Sopenharmony_ci	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1750e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_20_39);
1751e1051a39Sopenharmony_ci	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
1752e1051a39Sopenharmony_ci	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
1753e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_20_39);
1754e1051a39Sopenharmony_ci	  &vmovdqu	("0(%rsp)",@Tx[0]);
1755e1051a39Sopenharmony_ci	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
1756e1051a39Sopenharmony_ci	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
1757e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_20_39);
1758e1051a39Sopenharmony_ci	  &vmovdqu	("32(%rsp)",@Tx[1]);
1759e1051a39Sopenharmony_ci	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
1760e1051a39Sopenharmony_ci	  &vpaddd	(@X[2],@X[-2&7],$Kx);
1761e1051a39Sopenharmony_ci
1762e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_40_59);
1763e1051a39Sopenharmony_ci	&align32	();
1764e1051a39Sopenharmony_ci	  &vmovdqu	("64(%rsp)",@X[2]);
1765e1051a39Sopenharmony_ci	  &vpaddd	(@X[3],@X[-1&7],$Kx);
1766e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_40_59);
1767e1051a39Sopenharmony_ci	  &vmovdqu	("96(%rsp)",@X[3]);
1768e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_40_59);
1769e1051a39Sopenharmony_ci	&Xupdate_avx2_16_31(\&bodyx_40_59);
1770e1051a39Sopenharmony_ci
1771e1051a39Sopenharmony_ci	&Xupdate_avx2_16_31(\&bodyx_20_39);
1772e1051a39Sopenharmony_ci	&Xupdate_avx2_16_31(\&bodyx_20_39);
1773e1051a39Sopenharmony_ci	&Xupdate_avx2_16_31(\&bodyx_20_39);
1774e1051a39Sopenharmony_ci	&Xloop_avx2	(\&bodyx_20_39);
1775e1051a39Sopenharmony_ci
# After the second block: $frame goes back to the loop-head position, the
# state at $ctx is updated and the fixed registers are reloaded as above.
# The loop repeats while $inp has not passed $num.  .Ldone_avx2 is the
# common exit and starts with vzeroupper.
1776e1051a39Sopenharmony_ci$code.=<<___;
1777e1051a39Sopenharmony_ci	lea	128(%rsp),$frame
1778e1051a39Sopenharmony_ci
1779e1051a39Sopenharmony_ci	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1780e1051a39Sopenharmony_ci	add	0($ctx),@ROTX[0]		# update context
1781e1051a39Sopenharmony_ci	add	4($ctx),@ROTX[1]
1782e1051a39Sopenharmony_ci	add	8($ctx),@ROTX[3]
1783e1051a39Sopenharmony_ci	mov	@ROTX[0],0($ctx)
1784e1051a39Sopenharmony_ci	add	12($ctx),@ROTX[4]
1785e1051a39Sopenharmony_ci	mov	@ROTX[1],4($ctx)
1786e1051a39Sopenharmony_ci	 mov	@ROTX[0],$A			# A=d
1787e1051a39Sopenharmony_ci	add	16($ctx),@ROTX[5]
1788e1051a39Sopenharmony_ci	 mov	@ROTX[3],$a5
1789e1051a39Sopenharmony_ci	mov	@ROTX[3],8($ctx)
1790e1051a39Sopenharmony_ci	 mov	@ROTX[4],$D			# D=b
1791e1051a39Sopenharmony_ci	 #xchg	@ROTX[5],$F			# F=c, C=f
1792e1051a39Sopenharmony_ci	mov	@ROTX[4],12($ctx)
1793e1051a39Sopenharmony_ci	 mov	@ROTX[1],$F			# F=e
1794e1051a39Sopenharmony_ci	mov	@ROTX[5],16($ctx)
1795e1051a39Sopenharmony_ci	#mov	$F,16($ctx)
1796e1051a39Sopenharmony_ci	 mov	@ROTX[5],$E			# E=c
1797e1051a39Sopenharmony_ci	 mov	$a5,$C				# C=f
1798e1051a39Sopenharmony_ci	 #xchg	$F,$E				# E=c, F=e
1799e1051a39Sopenharmony_ci
1800e1051a39Sopenharmony_ci	cmp	$num,$inp
1801e1051a39Sopenharmony_ci	jbe	.Loop_avx2
1802e1051a39Sopenharmony_ci
1803e1051a39Sopenharmony_ci.Ldone_avx2:
1804e1051a39Sopenharmony_ci	vzeroupper
1805e1051a39Sopenharmony_ci___
# Win64 builds only: reload %xmm6..%xmm11 from the slots the prologue
# stored them in.
1806e1051a39Sopenharmony_ci$code.=<<___ if ($win64);
1807e1051a39Sopenharmony_ci	movaps	-40-6*16($fp),%xmm6
1808e1051a39Sopenharmony_ci	movaps	-40-5*16($fp),%xmm7
1809e1051a39Sopenharmony_ci	movaps	-40-4*16($fp),%xmm8
1810e1051a39Sopenharmony_ci	movaps	-40-3*16($fp),%xmm9
1811e1051a39Sopenharmony_ci	movaps	-40-2*16($fp),%xmm10
1812e1051a39Sopenharmony_ci	movaps	-40-1*16($fp),%xmm11
1813e1051a39Sopenharmony_ci___
# Epilogue: reload the five pushed registers from $fp-40 .. $fp-8, point
# %rsp back at $fp and return.
1814e1051a39Sopenharmony_ci$code.=<<___;
1815e1051a39Sopenharmony_ci	mov	-40($fp),%r14
1816e1051a39Sopenharmony_ci.cfi_restore	%r14
1817e1051a39Sopenharmony_ci	mov	-32($fp),%r13
1818e1051a39Sopenharmony_ci.cfi_restore	%r13
1819e1051a39Sopenharmony_ci	mov	-24($fp),%r12
1820e1051a39Sopenharmony_ci.cfi_restore	%r12
1821e1051a39Sopenharmony_ci	mov	-16($fp),%rbp
1822e1051a39Sopenharmony_ci.cfi_restore	%rbp
1823e1051a39Sopenharmony_ci	mov	-8($fp),%rbx
1824e1051a39Sopenharmony_ci.cfi_restore	%rbx
1825e1051a39Sopenharmony_ci	lea	($fp),%rsp
1826e1051a39Sopenharmony_ci.cfi_def_cfa_register	%rsp
1827e1051a39Sopenharmony_ci.Lepilogue_avx2:
1828e1051a39Sopenharmony_ci	ret
1829e1051a39Sopenharmony_ci.cfi_endproc
1830e1051a39Sopenharmony_ci.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
1831e1051a39Sopenharmony_ci___
1832e1051a39Sopenharmony_ci}
1833e1051a39Sopenharmony_ci}
# Constant pool.  Every round constant takes 32 bytes (two rows of four
# dwords), so the AVX2 code can load one as a whole %ymm value; that code
# addresses the table relative to K_XX_XX+64.  The byte-swap mask, also
# stored twice, follows the four constants, and a 16-byte row with the
# byte indices in descending order comes last.
1834e1051a39Sopenharmony_ci$code.=<<___;
1835e1051a39Sopenharmony_ci.align	64
1836e1051a39Sopenharmony_ciK_XX_XX:
1837e1051a39Sopenharmony_ci.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1838e1051a39Sopenharmony_ci.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1839e1051a39Sopenharmony_ci.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1840e1051a39Sopenharmony_ci.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1841e1051a39Sopenharmony_ci.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1842e1051a39Sopenharmony_ci.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1843e1051a39Sopenharmony_ci.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1844e1051a39Sopenharmony_ci.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1845e1051a39Sopenharmony_ci.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1846e1051a39Sopenharmony_ci.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1847e1051a39Sopenharmony_ci.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1848e1051a39Sopenharmony_ci___
# The three braces below close blocks that were opened earlier in the
# file.
1849e1051a39Sopenharmony_ci}}}
# Identification string placed in the generated assembly, followed by
# alignment padding.
1850e1051a39Sopenharmony_ci$code.=<<___;
1851e1051a39Sopenharmony_ci.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1852e1051a39Sopenharmony_ci.align	64
1853e1051a39Sopenharmony_ci___
1854e1051a39Sopenharmony_ci
# Win64 structured exception handling support, emitted only when $win64
# is set.  Three language-specific handlers are generated:
#
#   se_handler      - for sha1_block_data_order
#   shaext_handler  - for sha1_block_data_order_shaext (only if $shaext)
#   ssse3_handler   - for the _ssse3, _avx and _avx2 variants; the
#                     prologue/epilogue labels are passed as HandlerData[]
#
# All three funnel into .Lcommon_seh_tail, which is physically located
# inside ssse3_handler; se_handler and shaext_handler jump to it.
#
# Each handler has the signature
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Registers carrying the four handler arguments.  Only $context and
# $disp are interpolated into the assembly below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: when context->Rip lies in [.Lprologue,.Lepilogue) fetch the
# saved stack pointer from 16*4(%rsp) and write the callee-saved
# rbx/rbp/r12-r14 found below it back into CONTEXT.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

# shaext_handler: inside [.Lprologue_shaext,.Lepilogue_shaext) copy
# 8 quadwords (4 XMM registers) from the frame into CONTEXT starting at
# Xmm6.  The .long is a hand-encoded "cld; rep movsq".
$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

# ssse3_handler: generic over the SIMD code paths.  The prologue and
# epilogue RVAs come from HandlerData[0..1] and are rebased on
# disp->ImageBase; the frame base is taken from context->R11.  Inside
# the body it restores 12 quadwords (6 XMM registers) plus
# rbx/rbp/r12-r14.
#
# .Lcommon_seh_tail then restores rsi/rdi/rsp in CONTEXT, copies the
# 154-quadword CONTEXT into disp->ContextRecord, calls RtlVirtualUnwind
# and returns ExceptionContinueSearch (1).
#
# The same heredoc opens the .pdata section with the entry for
# sha1_block_data_order; further entries are appended conditionally.
$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
# Remaining .pdata entries: one (begin, end, unwind-info) triple per
# code path; optional code paths are listed only when they are built.
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
# .xdata: one unwind-info record per .pdata entry.  Each starts with the
# header bytes 9,0,0,0 followed by the handler RVA; records that use
# ssse3_handler additionally carry the prologue/epilogue label pair the
# handler reads as HandlerData[].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
2090e1051a39Sopenharmony_ci
2091e1051a39Sopenharmony_ci####################################################################
2092e1051a39Sopenharmony_ci
# Encode a "sha1rnds4 \$imm,%xmmN,%xmmM" instruction as raw bytes, so the
# generated output does not rely on the assembler knowing the mnemonic.
#
# Argument: the operand string following the mnemonic (AT&T syntax).
# Returns:  a ".byte" directive (0x0f,0x3a,0xcc, ModR/M, immediate) when
#           both registers are %xmm0..%xmm7; otherwise the instruction
#           text unchanged, leaving it to the assembler.
sub sha1rnds4 {
    my $operands = $_[0];

    # The trailing look-ahead stops %xmm10..%xmm15 from being matched as
    # %xmm1 plus a stray digit, which would yield a wrong encoding.
    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])(?![0-9])/) {
	my ($imm,$src,$dst) = ($1,$2,$3);
	my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# oct() understands both "0x.." and leading-zero octal literals.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$operands;
    }
}
2104e1051a39Sopenharmony_ci
# Encode the two-operand SHA-1 instructions that live in the 0x0f,0x38
# opcode map (sha1nexte, sha1msg1, sha1msg2) as raw bytes.
#
# Arguments: mnemonic, operand string ("%xmmSRC,%xmmDST", AT&T syntax).
# Returns:   a ".byte" directive - optional REX prefix, 0x0f,0x38,
#            opcode, ModR/M - when the mnemonic is known and both
#            operands are XMM registers; otherwise "mnemonic\toperands"
#            for the assembler to handle.
sub sha1op38 {
    my ($instr,$operands) = @_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst) = ($1,$2);
	my @opcode=(0x0f,0x38);
	my $rex=0;
	$rex|=0x04			if ($dst>=8);	# REX.R extends ModR/M.reg
	$rex|=0x01			if ($src>=8);	# REX.B extends ModR/M.rm
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$operands;
    }
}
2125e1051a39Sopenharmony_ci
# Post-process the accumulated assembly one line at a time and print it:
#  - every `...` span is replaced by the result of evaluating its
#    contents as a Perl expression;
#  - a sha1rnds4 instruction is passed to sha1rnds4(); failing that, any
#    other sha1* token followed by operands is passed to sha1op38(),
#    which returns unknown mnemonics unchanged.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
# Fail loudly if closing (and thereby flushing) STDOUT reports an error.
close STDOUT or die "error closing STDOUT: $!";
2135