1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# SHA256 block transform for x86. September 2007.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# Performance improvement over compiler generated code varies from
20e1051a39Sopenharmony_ci# 10% to 40% [see below]. Not very impressive on some µ-archs, but
21e1051a39Sopenharmony_ci# it's 5 times smaller and optimizes amount of writes.
22e1051a39Sopenharmony_ci#
23e1051a39Sopenharmony_ci# May 2012.
24e1051a39Sopenharmony_ci#
25e1051a39Sopenharmony_ci# Optimization including two of Pavel Semjanov's ideas, alternative
26e1051a39Sopenharmony_ci# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
27e1051a39Sopenharmony_ci# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost
28e1051a39Sopenharmony_ci# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not
29e1051a39Sopenharmony_ci# on P4, where it kills performance, nor Sandy Bridge, where folded
30e1051a39Sopenharmony_ci# loop is approximately as fast...
31e1051a39Sopenharmony_ci#
32e1051a39Sopenharmony_ci# June 2012.
33e1051a39Sopenharmony_ci#
34e1051a39Sopenharmony_ci# Add AMD XOP-specific code path, >30% improvement on Bulldozer over
35e1051a39Sopenharmony_ci# May version, >60% over original. Add AVX+shrd code path, >25%
36e1051a39Sopenharmony_ci# improvement on Sandy Bridge over May version, 60% over original.
37e1051a39Sopenharmony_ci#
38e1051a39Sopenharmony_ci# May 2013.
39e1051a39Sopenharmony_ci#
40e1051a39Sopenharmony_ci# Replace AMD XOP code path with SSSE3 to cover more processors.
41e1051a39Sopenharmony_ci# (Biggest improvement coefficient is on upcoming Atom Silvermont,
42e1051a39Sopenharmony_ci# not shown.) Add AVX+BMI code path.
43e1051a39Sopenharmony_ci#
44e1051a39Sopenharmony_ci# March 2014.
45e1051a39Sopenharmony_ci#
46e1051a39Sopenharmony_ci# Add support for Intel SHA Extensions.
47e1051a39Sopenharmony_ci#
48e1051a39Sopenharmony_ci# Performance in clock cycles per processed byte (less is better):
49e1051a39Sopenharmony_ci#
50e1051a39Sopenharmony_ci#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
51e1051a39Sopenharmony_ci# Pentium	46	57	40/38		-	-
52e1051a39Sopenharmony_ci# PIII		36	33	27/24		-	-
53e1051a39Sopenharmony_ci# P4		41	38	28		-	17.3
54e1051a39Sopenharmony_ci# AMD K8	27	25	19/15.5		-	14.9
55e1051a39Sopenharmony_ci# Core2		26	23	18/15.6		14.3	13.8
56e1051a39Sopenharmony_ci# Westmere	27	-	19/15.7		13.4	12.3
57e1051a39Sopenharmony_ci# Sandy Bridge	25	-	15.9		12.4	11.6
58e1051a39Sopenharmony_ci# Ivy Bridge	24	-	15.0		11.4	10.3
59e1051a39Sopenharmony_ci# Haswell	22	-	13.9		9.46	7.80
60e1051a39Sopenharmony_ci# Skylake	20	-	14.9		9.50	7.70
61e1051a39Sopenharmony_ci# Bulldozer	36	-	27/22		17.0	13.6
62e1051a39Sopenharmony_ci# VIA Nano	36	-	25/22		16.8	16.5
63e1051a39Sopenharmony_ci# Atom		50	-	30/25		21.9	18.9
64e1051a39Sopenharmony_ci# Silvermont	40	-	34/31		22.9	20.6
65e1051a39Sopenharmony_ci# Goldmont	29	-	20		16.3(***)
66e1051a39Sopenharmony_ci#
67e1051a39Sopenharmony_ci# (*)	numbers after slash are for unrolled loop, where applicable;
68e1051a39Sopenharmony_ci# (**)	x86_64 assembly performance is presented for reference
69e1051a39Sopenharmony_ci#	purposes, results are best-available;
70e1051a39Sopenharmony_ci# (***)	SHAEXT result is 4.1, strangely enough better than 64-bit one;
71e1051a39Sopenharmony_ci
72e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# $dir is the directory part of the script path (up to and including the
# last slash or backslash); it is used to locate the shared perlasm helper.
73e1051a39Sopenharmony_cipush(@INC,"${dir}","${dir}../../perlasm");
74e1051a39Sopenharmony_cirequire "x86asm.pl";
75e1051a39Sopenharmony_ci
# The last command-line argument is taken as the output file name and STDOUT
# is redirected to it.
76e1051a39Sopenharmony_ci$output=pop and open STDOUT,">$output";
77e1051a39Sopenharmony_ci
# First argument is passed through unchanged; the second one is true when
# the last remaining argument is the literal string "386".
78e1051a39Sopenharmony_ci&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
79e1051a39Sopenharmony_ci
# $xmm is set when -DOPENSSL_IA32_SSE2 appears among the arguments.
# $avx ends up as 0, 1 or 2: each probe below adds one point per version
# threshold met, and every probe after the first runs only while $avx is
# still 0.
80e1051a39Sopenharmony_ci$xmm=$avx=0;
81e1051a39Sopenharmony_cifor (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
82e1051a39Sopenharmony_ci
# Probe 1: GNU assembler as driven by $ENV{CC}.
83e1051a39Sopenharmony_ciif ($xmm &&	`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
84e1051a39Sopenharmony_ci			=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
85e1051a39Sopenharmony_ci	$avx = ($1>=2.19) + ($1>=2.22);
86e1051a39Sopenharmony_ci}
87e1051a39Sopenharmony_ci
# Probe 2: NASM, only when the first argument is "win32n".
88e1051a39Sopenharmony_ciif ($xmm && !$avx && $ARGV[0] eq "win32n" &&
89e1051a39Sopenharmony_ci		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
90e1051a39Sopenharmony_ci	$avx = ($1>=2.03) + ($1>=2.10);
91e1051a39Sopenharmony_ci}
92e1051a39Sopenharmony_ci
# Probe 3: ml, only when the first argument is "win32".
93e1051a39Sopenharmony_ciif ($xmm && !$avx && $ARGV[0] eq "win32" &&
94e1051a39Sopenharmony_ci		`ml 2>&1` =~ /Version ([0-9]+)\./) {
95e1051a39Sopenharmony_ci	$avx = ($1>=10) + ($1>=11);
96e1051a39Sopenharmony_ci}
97e1051a39Sopenharmony_ci
# Probe 4: clang/LLVM version as reported by $ENV{CC} -v.
98e1051a39Sopenharmony_ciif ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
99e1051a39Sopenharmony_ci	$avx = ($2>=3.0) + ($2>3.0);
100e1051a39Sopenharmony_ci}
101e1051a39Sopenharmony_ci
102e1051a39Sopenharmony_ci$shaext=$xmm;	### set to zero if compiling for 1.0.1
103e1051a39Sopenharmony_ci
# Input length in bytes (four 64-byte blocks) from which the dispatch code
# prefers the fully unrolled loop; 0 disables the unrolled path.
104e1051a39Sopenharmony_ci$unroll_after = 64*4;	# If pre-evicted from L1P cache first spin of
105e1051a39Sopenharmony_ci			# fully unrolled loop was measured to run about
106e1051a39Sopenharmony_ci			# 3-4x slower. If slowdown coefficient is N and
107e1051a39Sopenharmony_ci			# unrolled loop is m times faster, then you break
108e1051a39Sopenharmony_ci			# even at (N-1)/(m-1) blocks. Then it needs to be
109e1051a39Sopenharmony_ci			# adjusted for probability of code being evicted,
110e1051a39Sopenharmony_ci			# code size/cache size=1/4. Typical m is 1.15...
111e1051a39Sopenharmony_ci
# Register assignments and stack operands used by the round bodies below:
# A, E and T live in registers, the working-state slots sit at esp+4..esp+32
# and $Xoff addresses esp+36. $K256 names the register holding the pointer
# into the constant table.
112e1051a39Sopenharmony_ci$A="eax";
113e1051a39Sopenharmony_ci$E="edx";
114e1051a39Sopenharmony_ci$T="ebx";
115e1051a39Sopenharmony_ci$Aoff=&DWP(4,"esp");
116e1051a39Sopenharmony_ci$Boff=&DWP(8,"esp");
117e1051a39Sopenharmony_ci$Coff=&DWP(12,"esp");
118e1051a39Sopenharmony_ci$Doff=&DWP(16,"esp");
119e1051a39Sopenharmony_ci$Eoff=&DWP(20,"esp");
120e1051a39Sopenharmony_ci$Foff=&DWP(24,"esp");
121e1051a39Sopenharmony_ci$Goff=&DWP(28,"esp");
122e1051a39Sopenharmony_ci$Hoff=&DWP(32,"esp");
123e1051a39Sopenharmony_ci$Xoff=&DWP(36,"esp");
124e1051a39Sopenharmony_ci$K256="ebp";
125e1051a39Sopenharmony_ci
126e1051a39Sopenharmony_cisub BODY_16_63() {
	# Emits one round of the compact loop for rounds 16..63.
	# This part starts the message-schedule update: it builds
	# T = sigma0(X[-15]) + X[-16] + X[-7] and leaves the two halves of
	# sigma1(X[-2]) in edi/esi. The final xor/add/store of X[0] is
	# interleaved into BODY_00_15(1), which is called at the end.
	# Expects X[-15] already loaded in ecx.
127e1051a39Sopenharmony_ci	&mov	($T,"ecx");			# "ecx" is preloaded
128e1051a39Sopenharmony_ci	 &mov	("esi",&DWP(4*(9+15+16-14),"esp"));
129e1051a39Sopenharmony_ci	&ror	("ecx",18-7);
130e1051a39Sopenharmony_ci	 &mov	("edi","esi");
131e1051a39Sopenharmony_ci	&ror	("esi",19-17);
132e1051a39Sopenharmony_ci	 &xor	("ecx",$T);
133e1051a39Sopenharmony_ci	 &shr	($T,3);
134e1051a39Sopenharmony_ci	&ror	("ecx",7);
135e1051a39Sopenharmony_ci	 &xor	("esi","edi");
136e1051a39Sopenharmony_ci	 &xor	($T,"ecx");			# T = sigma0(X[-15])
137e1051a39Sopenharmony_ci	&ror	("esi",17);
138e1051a39Sopenharmony_ci	 &add	($T,&DWP(4*(9+15+16),"esp"));	# T += X[-16]
139e1051a39Sopenharmony_ci	&shr	("edi",10);
140e1051a39Sopenharmony_ci	 &add	($T,&DWP(4*(9+15+16-9),"esp"));	# T += X[-7]
	# The three steps below are kept for reference only; they are
	# emitted from BODY_00_15 when it is called with a true argument.
141e1051a39Sopenharmony_ci	#&xor	("edi","esi")			# sigma1(X[-2])
142e1051a39Sopenharmony_ci	# &add	($T,"edi");			# T += sigma1(X[-2])
143e1051a39Sopenharmony_ci	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
144e1051a39Sopenharmony_ci
145e1051a39Sopenharmony_ci	&BODY_00_15(1);
146e1051a39Sopenharmony_ci}
147e1051a39Sopenharmony_cisub BODY_00_15() {
	# Emits one SHA-256 round of the compact loop.
	#
	# Argument: a true value when called from BODY_16_63. In that case
	# the round first completes the schedule word (xor edi,esi; add to T),
	# stores it as the new X[0] and preloads ecx for the next round.
	# Without the argument the message word is simply loaded from the
	# stack into T.
	#
	# Side effects visible to the caller: esp is moved down by 4 per round
	# so that the fixed $?off operands address the rotated state, $K256 is
	# advanced by 4, and esi is left holding the K[i] value that was just
	# consumed.
148e1051a39Sopenharmony_ci    my $in_16_63=shift;
149e1051a39Sopenharmony_ci
150e1051a39Sopenharmony_ci	&mov	("ecx",$E);
151e1051a39Sopenharmony_ci	 &xor	("edi","esi")			if ($in_16_63);	# sigma1(X[-2])
152e1051a39Sopenharmony_ci	 &mov	("esi",$Foff);
153e1051a39Sopenharmony_ci	&ror	("ecx",25-11);
154e1051a39Sopenharmony_ci	 &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
155e1051a39Sopenharmony_ci	 &mov	("edi",$Goff);
156e1051a39Sopenharmony_ci	&xor	("ecx",$E);
157e1051a39Sopenharmony_ci	 &xor	("esi","edi");
158e1051a39Sopenharmony_ci	 &mov	($T,&DWP(4*(9+15),"esp"))	if (!$in_16_63);
159e1051a39Sopenharmony_ci	 &mov	(&DWP(4*(9+15),"esp"),$T)	if ($in_16_63);	# save X[0]
160e1051a39Sopenharmony_ci	&ror	("ecx",11-6);
161e1051a39Sopenharmony_ci	 &and	("esi",$E);
162e1051a39Sopenharmony_ci	 &mov	($Eoff,$E);		# modulo-scheduled
163e1051a39Sopenharmony_ci	&xor	($E,"ecx");
164e1051a39Sopenharmony_ci	 &add	($T,$Hoff);		# T += h
165e1051a39Sopenharmony_ci	 &xor	("esi","edi");		# Ch(e,f,g)
166e1051a39Sopenharmony_ci	&ror	($E,6);			# Sigma1(e)
167e1051a39Sopenharmony_ci	 &mov	("ecx",$A);
168e1051a39Sopenharmony_ci	 &add	($T,"esi");		# T += Ch(e,f,g)
169e1051a39Sopenharmony_ci
170e1051a39Sopenharmony_ci	&ror	("ecx",22-13);
171e1051a39Sopenharmony_ci	 &add	($T,$E);		# T += Sigma1(e)
172e1051a39Sopenharmony_ci	 &mov	("edi",$Boff);
173e1051a39Sopenharmony_ci	&xor	("ecx",$A);
174e1051a39Sopenharmony_ci	 &mov	($Aoff,$A);		# modulo-scheduled
	# Slide the frame down one dword: from here on every stack operand
	# refers to the slot one position further along than before.
175e1051a39Sopenharmony_ci	 &lea	("esp",&DWP(-4,"esp"));
176e1051a39Sopenharmony_ci	&ror	("ecx",13-2);
177e1051a39Sopenharmony_ci	 &mov	("esi",&DWP(0,$K256));
178e1051a39Sopenharmony_ci	&xor	("ecx",$A);
179e1051a39Sopenharmony_ci	 &mov	($E,$Eoff);		# e in next iteration, d in this one
180e1051a39Sopenharmony_ci	 &xor	($A,"edi");		# a ^= b
181e1051a39Sopenharmony_ci	&ror	("ecx",2);		# Sigma0(a)
182e1051a39Sopenharmony_ci
183e1051a39Sopenharmony_ci	 &add	($T,"esi");		# T+= K[i]
184e1051a39Sopenharmony_ci	 &mov	(&DWP(0,"esp"),$A);	# (b^c) in next round
185e1051a39Sopenharmony_ci	&add	($E,$T);		# d += T
186e1051a39Sopenharmony_ci	 &and	($A,&DWP(4,"esp"));	# a &= (b^c)
187e1051a39Sopenharmony_ci	&add	($T,"ecx");		# T += Sigma0(a)
188e1051a39Sopenharmony_ci	 &xor	($A,"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
189e1051a39Sopenharmony_ci	 &mov	("ecx",&DWP(4*(9+15+16-1),"esp"))	if ($in_16_63);	# preload T
190e1051a39Sopenharmony_ci	&add	($K256,4);
191e1051a39Sopenharmony_ci	 &add	($A,$T);		# h += T
192e1051a39Sopenharmony_ci}
193e1051a39Sopenharmony_ci
194e1051a39Sopenharmony_ci&external_label("OPENSSL_ia32cap_P")		if (!$i386);
195e1051a39Sopenharmony_ci
# Entry point. Loads the three arguments, makes $K256 point at the K256
# table in a position-independent way, builds a 64-byte-aligned scratch area
# holding ctx, inp, end-of-input and the caller's esp, and then dispatches to
# one of the code paths.
196e1051a39Sopenharmony_ci&function_begin("sha256_block_data_order");
197e1051a39Sopenharmony_ci	&mov	("esi",wparam(0));	# ctx
198e1051a39Sopenharmony_ci	&mov	("edi",wparam(1));	# inp
199e1051a39Sopenharmony_ci	&mov	("eax",wparam(2));	# num
200e1051a39Sopenharmony_ci	&mov	("ebx","esp");		# saved sp
201e1051a39Sopenharmony_ci
202e1051a39Sopenharmony_ci	&call	(&label("pic_point"));	# make it PIC!
203e1051a39Sopenharmony_ci&set_label("pic_point");
204e1051a39Sopenharmony_ci	&blindpop($K256);
205e1051a39Sopenharmony_ci	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
206e1051a39Sopenharmony_ci
207e1051a39Sopenharmony_ci	&sub	("esp",16);
208e1051a39Sopenharmony_ci	&and	("esp",-64);
209e1051a39Sopenharmony_ci
210e1051a39Sopenharmony_ci	&shl	("eax",6);
211e1051a39Sopenharmony_ci	&add	("eax","edi");
212e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esp"),"esi");	# ctx
213e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),"edi");	# inp
214e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),"eax");	# inp+num*64
215e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
	# Run-time dispatch on OPENSSL_ia32cap_P; generated only for
	# non-386 builds with $xmm set. ecx, ebx and edx receive the first
	# three dwords of the capability vector.
216e1051a39Sopenharmony_ci						if (!$i386 && $xmm) {
217e1051a39Sopenharmony_ci	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
218e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(0,"edx"));
219e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(4,"edx"));
220e1051a39Sopenharmony_ci	&test	("ecx",1<<20);		# check for P4
221e1051a39Sopenharmony_ci	&jnz	(&label("loop"));
222e1051a39Sopenharmony_ci	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
223e1051a39Sopenharmony_ci	&test	("ecx",1<<24);		# check for FXSR
224e1051a39Sopenharmony_ci	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
225e1051a39Sopenharmony_ci	&and	("ecx",1<<30);		# mask "Intel CPU" bit
226e1051a39Sopenharmony_ci	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
227e1051a39Sopenharmony_ci	&test	("edx",1<<29)		if ($shaext);	# check for SHA
228e1051a39Sopenharmony_ci	&jnz	(&label("shaext"))	if ($shaext);
229e1051a39Sopenharmony_ci	&or	("ecx","ebx");
230e1051a39Sopenharmony_ci	&and	("ecx",1<<28|1<<30);
231e1051a39Sopenharmony_ci	&cmp	("ecx",1<<28|1<<30);
	# NOTE(review): this section is generated only when $xmm is set, so
	# the else arm below (jump to loop_shrd) can never be generated.
232e1051a39Sopenharmony_ci					if ($xmm) {
233e1051a39Sopenharmony_ci	&je	(&label("AVX"))		if ($avx);
234e1051a39Sopenharmony_ci	&test	("ebx",1<<9);		# check for SSSE3
235e1051a39Sopenharmony_ci	&jnz	(&label("SSSE3"));
236e1051a39Sopenharmony_ci					} else {
237e1051a39Sopenharmony_ci	&je	(&label("loop_shrd"));
238e1051a39Sopenharmony_ci					}
	# eax - edi is the input length in bytes; inputs of at least
	# $unroll_after bytes take the fully unrolled path.
239e1051a39Sopenharmony_ci						if ($unroll_after) {
240e1051a39Sopenharmony_ci&set_label("no_xmm");
241e1051a39Sopenharmony_ci	&sub	("eax","edi");
242e1051a39Sopenharmony_ci	&cmp	("eax",$unroll_after);
243e1051a39Sopenharmony_ci	&jae	(&label("unrolled"));
244e1051a39Sopenharmony_ci						} }
245e1051a39Sopenharmony_ci	&jmp	(&label("loop"));
246e1051a39Sopenharmony_ci
247e1051a39Sopenharmony_cisub COMPACT_LOOP() {
# Emits the compact (folded) block loop.
#
# Argument: optional label suffix. It is appended to every label emitted
# here; a non-empty suffix also aligns the outer loop label to 32 instead
# of 16.
#
# Per 64-byte block the generated code byte-swaps the 16 input words and
# pushes them on the stack, reserves 9 more dwords for the working state,
# loads ctx->h[0..7], runs rounds 0..15 and 16..63 as two small loops, adds
# the result back into ctx, drops the frame, rewinds $K256 and repeats until
# the input pointer reaches the stored end pointer.
248e1051a39Sopenharmony_cimy $suffix=shift;
249e1051a39Sopenharmony_ci
250e1051a39Sopenharmony_ci&set_label("loop$suffix",$suffix?32:16);
251e1051a39Sopenharmony_ci    # copy input block to stack reversing byte and dword order
252e1051a39Sopenharmony_ci    for($i=0;$i<4;$i++) {
253e1051a39Sopenharmony_ci	&mov	("eax",&DWP($i*16+0,"edi"));
254e1051a39Sopenharmony_ci	&mov	("ebx",&DWP($i*16+4,"edi"));
255e1051a39Sopenharmony_ci	&mov	("ecx",&DWP($i*16+8,"edi"));
256e1051a39Sopenharmony_ci	&bswap	("eax");
257e1051a39Sopenharmony_ci	&mov	("edx",&DWP($i*16+12,"edi"));
258e1051a39Sopenharmony_ci	&bswap	("ebx");
259e1051a39Sopenharmony_ci	&push	("eax");
260e1051a39Sopenharmony_ci	&bswap	("ecx");
261e1051a39Sopenharmony_ci	&push	("ebx");
262e1051a39Sopenharmony_ci	&bswap	("edx");
263e1051a39Sopenharmony_ci	&push	("ecx");
264e1051a39Sopenharmony_ci	&push	("edx");
265e1051a39Sopenharmony_ci    }
266e1051a39Sopenharmony_ci	&add	("edi",64);
267e1051a39Sopenharmony_ci	&lea	("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H
	# Store the advanced input pointer in the slot just above the
	# 9+16 dwords of this frame.
268e1051a39Sopenharmony_ci	&mov	(&DWP(4*(9+16)+4,"esp"),"edi");
269e1051a39Sopenharmony_ci
270e1051a39Sopenharmony_ci	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
271e1051a39Sopenharmony_ci	&mov	($A,&DWP(0,"esi"));
272e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(4,"esi"));
273e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(8,"esi"));
274e1051a39Sopenharmony_ci	&mov	("edi",&DWP(12,"esi"));
275e1051a39Sopenharmony_ci	# &mov	($Aoff,$A);
276e1051a39Sopenharmony_ci	&mov	($Boff,"ebx");
277e1051a39Sopenharmony_ci	&xor	("ebx","ecx");
278e1051a39Sopenharmony_ci	&mov	($Coff,"ecx");
279e1051a39Sopenharmony_ci	&mov	($Doff,"edi");
	# Slot 0 is seeded with b^c, which the first round reads back
	# as "(b^c)" when forming Maj.
280e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esp"),"ebx");	# magic
281e1051a39Sopenharmony_ci	&mov	($E,&DWP(16,"esi"));
282e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(20,"esi"));
283e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esi"));
284e1051a39Sopenharmony_ci	&mov	("edi",&DWP(28,"esi"));
285e1051a39Sopenharmony_ci	# &mov	($Eoff,$E);
286e1051a39Sopenharmony_ci	&mov	($Foff,"ebx");
287e1051a39Sopenharmony_ci	&mov	($Goff,"ecx");
288e1051a39Sopenharmony_ci	&mov	($Hoff,"edi");
289e1051a39Sopenharmony_ci
290e1051a39Sopenharmony_ci&set_label("00_15$suffix",16);
291e1051a39Sopenharmony_ci
292e1051a39Sopenharmony_ci	&BODY_00_15();
293e1051a39Sopenharmony_ci
	# esi holds the K[i] just used; 0xc19bf174 is the 16th entry of
	# @K256, so the loop ends after round 15.
294e1051a39Sopenharmony_ci	&cmp	("esi",0xc19bf174);
295e1051a39Sopenharmony_ci	&jne	(&label("00_15$suffix"));
296e1051a39Sopenharmony_ci
297e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"));	# preloaded in BODY_00_15(1)
298e1051a39Sopenharmony_ci	&jmp	(&label("16_63$suffix"));
299e1051a39Sopenharmony_ci
300e1051a39Sopenharmony_ci&set_label("16_63$suffix",16);
301e1051a39Sopenharmony_ci
302e1051a39Sopenharmony_ci	&BODY_16_63();
303e1051a39Sopenharmony_ci
	# 0xc67178f2 is the last entry of @K256, so the loop ends after
	# round 63.
304e1051a39Sopenharmony_ci	&cmp	("esi",0xc67178f2);
305e1051a39Sopenharmony_ci	&jne	(&label("16_63$suffix"));
306e1051a39Sopenharmony_ci
	# esp has moved down 4 bytes per round (64 dwords in total), hence
	# the 9+16+64 dword distance back to the ctx/inp slots.
307e1051a39Sopenharmony_ci	&mov	("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx
308e1051a39Sopenharmony_ci	# &mov	($A,$Aoff);
309e1051a39Sopenharmony_ci	&mov	("ebx",$Boff);
310e1051a39Sopenharmony_ci	# &mov	("edi",$Coff);
311e1051a39Sopenharmony_ci	&mov	("ecx",$Doff);
312e1051a39Sopenharmony_ci	&add	($A,&DWP(0,"esi"));
313e1051a39Sopenharmony_ci	&add	("ebx",&DWP(4,"esi"));
314e1051a39Sopenharmony_ci	&add	("edi",&DWP(8,"esi"));
315e1051a39Sopenharmony_ci	&add	("ecx",&DWP(12,"esi"));
316e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esi"),$A);
317e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esi"),"ebx");
318e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esi"),"edi");
319e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esi"),"ecx");
320e1051a39Sopenharmony_ci	# &mov	($E,$Eoff);
321e1051a39Sopenharmony_ci	&mov	("eax",$Foff);
322e1051a39Sopenharmony_ci	&mov	("ebx",$Goff);
323e1051a39Sopenharmony_ci	&mov	("ecx",$Hoff);
324e1051a39Sopenharmony_ci	&mov	("edi",&DWP(4*(9+16+64)+4,"esp"));#inp
325e1051a39Sopenharmony_ci	&add	($E,&DWP(16,"esi"));
326e1051a39Sopenharmony_ci	&add	("eax",&DWP(20,"esi"));
327e1051a39Sopenharmony_ci	&add	("ebx",&DWP(24,"esi"));
328e1051a39Sopenharmony_ci	&add	("ecx",&DWP(28,"esi"));
329e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esi"),$E);
330e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esi"),"eax");
331e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esi"),"ebx");
332e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esi"),"ecx");
333e1051a39Sopenharmony_ci
334e1051a39Sopenharmony_ci	&lea	("esp",&DWP(4*(9+16+64),"esp"));# destroy frame
335e1051a39Sopenharmony_ci	&sub	($K256,4*64);			# rewind K
336e1051a39Sopenharmony_ci
337e1051a39Sopenharmony_ci	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
338e1051a39Sopenharmony_ci	&jb	(&label("loop$suffix"));
339e1051a39Sopenharmony_ci}
340e1051a39Sopenharmony_ci	&COMPACT_LOOP();
	# Default code path: the compact loop with an empty label suffix,
	# followed by restoring the caller's esp from the scratch area.
341e1051a39Sopenharmony_ci	&mov	("esp",&DWP(12,"esp"));		# restore sp
342e1051a39Sopenharmony_ci&function_end_A();
	# Second copy of the compact loop with every ror(reg,n) emitted as
	# shrd(reg,reg,n) instead; generated only for non-386 builds without
	# $xmm.
	# NOTE(review): the only jump to loop_shrd in the dispatch code sits
	# in a branch that requires $xmm, so this copy appears to be
	# unreachable at run time - confirm before relying on it.
343e1051a39Sopenharmony_ci						if (!$i386 && !$xmm) {
344e1051a39Sopenharmony_ci	# ~20% improvement on Sandy Bridge
345e1051a39Sopenharmony_ci	local *ror = sub { &shrd(@_[0],@_) };
346e1051a39Sopenharmony_ci	&COMPACT_LOOP("_shrd");
347e1051a39Sopenharmony_ci	&mov	("esp",&DWP(12,"esp"));		# restore sp
348e1051a39Sopenharmony_ci&function_end_A();
349e1051a39Sopenharmony_ci						}
350e1051a39Sopenharmony_ci
351e1051a39Sopenharmony_ci&set_label("K256",64);	# Yes! I keep it in the code segment!
# The 64 SHA-256 round constants, emitted 64-byte aligned. The compact loop
# uses the 16th and the 64th entry (0xc19bf174, 0xc67178f2) as end-of-loop
# sentinels, and the unrolled loop embeds the entries as immediates, so the
# order and values must not change.
352e1051a39Sopenharmony_ci@K256=(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
353e1051a39Sopenharmony_ci	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
354e1051a39Sopenharmony_ci	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
355e1051a39Sopenharmony_ci	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
356e1051a39Sopenharmony_ci	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
357e1051a39Sopenharmony_ci	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
358e1051a39Sopenharmony_ci	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
359e1051a39Sopenharmony_ci	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
360e1051a39Sopenharmony_ci	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
361e1051a39Sopenharmony_ci	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
362e1051a39Sopenharmony_ci	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
363e1051a39Sopenharmony_ci	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
364e1051a39Sopenharmony_ci	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
365e1051a39Sopenharmony_ci	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
366e1051a39Sopenharmony_ci	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
367e1051a39Sopenharmony_ci	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	);
368e1051a39Sopenharmony_ci&data_word(@K256);
# The mask sits directly after the 256-byte table, i.e. at K256+0x100, which
# is where the SHA-extension path loads it from.
369e1051a39Sopenharmony_ci&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# byte swap mask
370e1051a39Sopenharmony_ci&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
371e1051a39Sopenharmony_ci
372e1051a39Sopenharmony_ci($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
# off(x) returns the stack operand for state word x in the unrolled loop:
# esp + 4*((x - $i) & 7), where $i is the global round counter. The slot of
# each word therefore rotates by one position per round instead of the data
# being moved.
373e1051a39Sopenharmony_cisub off { &DWP(4*(((shift)-$i)&7),"esp"); }
374e1051a39Sopenharmony_ci
375e1051a39Sopenharmony_ciif (!$i386 && $unroll_after) {
# Fully unrolled code path, generated for non-386 builds when $unroll_after
# is non-zero. All 64 rounds are emitted back to back with K[i] folded in as
# an immediate, so the $K256 register is free and is reused here: A and H
# alternate between the two registers in @AH, swapped after every round.
#
# Frame layout after the lea below: 8 state dwords at esp+0..31, the 16-word
# message schedule at esp+32..95, then ctx, inp, end-of-input and the saved
# esp at esp+96, +100, +104 and +108.
376e1051a39Sopenharmony_cimy @AH=($A,$K256);
377e1051a39Sopenharmony_ci
378e1051a39Sopenharmony_ci&set_label("unrolled",16);
379e1051a39Sopenharmony_ci	&lea	("esp",&DWP(-96,"esp"));
380e1051a39Sopenharmony_ci	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
381e1051a39Sopenharmony_ci	&mov	($AH[0],&DWP(0,"esi"));
382e1051a39Sopenharmony_ci	&mov	($AH[1],&DWP(4,"esi"));
383e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(8,"esi"));
384e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(12,"esi"));
385e1051a39Sopenharmony_ci	#&mov	(&DWP(0,"esp"),$AH[0]);
386e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),$AH[1]);
387e1051a39Sopenharmony_ci	&xor	($AH[1],"ecx");		# magic
388e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),"ecx");
389e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),"ebx");
390e1051a39Sopenharmony_ci	&mov	($E,&DWP(16,"esi"));
391e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(20,"esi"));
392e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esi"));
393e1051a39Sopenharmony_ci	&mov	("esi",&DWP(28,"esi"));
394e1051a39Sopenharmony_ci	#&mov	(&DWP(16,"esp"),$E);
395e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),"ebx");
396e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),"ecx");
397e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),"esi");
398e1051a39Sopenharmony_ci	&jmp	(&label("grand_loop"));
399e1051a39Sopenharmony_ci
400e1051a39Sopenharmony_ci&set_label("grand_loop",16);
401e1051a39Sopenharmony_ci    # copy input block to stack reversing byte order
    # (five groups of three words, then the 16th word after the loop)
402e1051a39Sopenharmony_ci    for($i=0;$i<5;$i++) {
403e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(12*$i+0,"edi"));
404e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(12*$i+4,"edi"));
405e1051a39Sopenharmony_ci	&bswap	("ebx");
406e1051a39Sopenharmony_ci	&mov	("esi",&DWP(12*$i+8,"edi"));
407e1051a39Sopenharmony_ci	&bswap	("ecx");
408e1051a39Sopenharmony_ci	&mov	(&DWP(32+12*$i+0,"esp"),"ebx");
409e1051a39Sopenharmony_ci	&bswap	("esi");
410e1051a39Sopenharmony_ci	&mov	(&DWP(32+12*$i+4,"esp"),"ecx");
411e1051a39Sopenharmony_ci	&mov	(&DWP(32+12*$i+8,"esp"),"esi");
412e1051a39Sopenharmony_ci    }
413e1051a39Sopenharmony_ci	&mov	("ebx",&DWP($i*12,"edi"));
414e1051a39Sopenharmony_ci	&add	("edi",64);
415e1051a39Sopenharmony_ci	&bswap	("ebx");
416e1051a39Sopenharmony_ci	&mov	(&DWP(96+4,"esp"),"edi");
417e1051a39Sopenharmony_ci	&mov	(&DWP(32+12*$i,"esp"),"ebx");
418e1051a39Sopenharmony_ci
    # $t1/$t2 name the two scratch registers; they swap roles after every
    # round, matching the preloads emitted at the end of the previous round.
419e1051a39Sopenharmony_ci    my ($t1,$t2) = ("ecx","esi");
420e1051a39Sopenharmony_ci
421e1051a39Sopenharmony_ci    for ($i=0;$i<64;$i++) {
422e1051a39Sopenharmony_ci
423e1051a39Sopenharmony_ci      if ($i>=16) {
424e1051a39Sopenharmony_ci	&mov	($T,$t1);			# $t1 is preloaded
425e1051a39Sopenharmony_ci	# &mov	($t2,&DWP(32+4*(($i+14)&15),"esp"));
426e1051a39Sopenharmony_ci	&ror	($t1,18-7);
427e1051a39Sopenharmony_ci	 &mov	("edi",$t2);
428e1051a39Sopenharmony_ci	&ror	($t2,19-17);
429e1051a39Sopenharmony_ci	 &xor	($t1,$T);
430e1051a39Sopenharmony_ci	 &shr	($T,3);
431e1051a39Sopenharmony_ci	&ror	($t1,7);
432e1051a39Sopenharmony_ci	 &xor	($t2,"edi");
433e1051a39Sopenharmony_ci	 &xor	($T,$t1);			# T = sigma0(X[-15])
434e1051a39Sopenharmony_ci	&ror	($t2,17);
435e1051a39Sopenharmony_ci	 &add	($T,&DWP(32+4*($i&15),"esp"));	# T += X[-16]
436e1051a39Sopenharmony_ci	&shr	("edi",10);
437e1051a39Sopenharmony_ci	 &add	($T,&DWP(32+4*(($i+9)&15),"esp"));	# T += X[-7]
438e1051a39Sopenharmony_ci	#&xor	("edi",$t2)			# sigma1(X[-2])
439e1051a39Sopenharmony_ci	# &add	($T,"edi");			# T += sigma1(X[-2])
440e1051a39Sopenharmony_ci	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
441e1051a39Sopenharmony_ci      }
442e1051a39Sopenharmony_ci	&mov	($t1,$E);
443e1051a39Sopenharmony_ci	 &xor	("edi",$t2)			if ($i>=16);	# sigma1(X[-2])
444e1051a39Sopenharmony_ci	 &mov	($t2,&off($f));
445e1051a39Sopenharmony_ci	&ror	($E,25-11);
446e1051a39Sopenharmony_ci	 &add	($T,"edi")			if ($i>=16);	# T += sigma1(X[-2])
447e1051a39Sopenharmony_ci	 &mov	("edi",&off($g));
448e1051a39Sopenharmony_ci	&xor	($E,$t1);
449e1051a39Sopenharmony_ci	 &mov	($T,&DWP(32+4*($i&15),"esp"))	if ($i<16);	# X[i]
	# The schedule words of rounds 62 and 63 are never read again,
	# so their stores are not emitted.
450e1051a39Sopenharmony_ci	 &mov	(&DWP(32+4*($i&15),"esp"),$T)	if ($i>=16 && $i<62);	# save X[0]
451e1051a39Sopenharmony_ci	 &xor	($t2,"edi");
452e1051a39Sopenharmony_ci	&ror	($E,11-6);
453e1051a39Sopenharmony_ci	 &and	($t2,$t1);
454e1051a39Sopenharmony_ci	 &mov	(&off($e),$t1);		# save $E, modulo-scheduled
455e1051a39Sopenharmony_ci	&xor	($E,$t1);
456e1051a39Sopenharmony_ci	 &add	($T,&off($h));		# T += h
457e1051a39Sopenharmony_ci	 &xor	("edi",$t2);		# Ch(e,f,g)
458e1051a39Sopenharmony_ci	&ror	($E,6);			# Sigma1(e)
459e1051a39Sopenharmony_ci	 &mov	($t1,$AH[0]);
460e1051a39Sopenharmony_ci	 &add	($T,"edi");		# T += Ch(e,f,g)
461e1051a39Sopenharmony_ci
462e1051a39Sopenharmony_ci	&ror	($t1,22-13);
463e1051a39Sopenharmony_ci	 &mov	($t2,$AH[0]);
464e1051a39Sopenharmony_ci	 &mov	("edi",&off($b));
465e1051a39Sopenharmony_ci	&xor	($t1,$AH[0]);
466e1051a39Sopenharmony_ci	 &mov	(&off($a),$AH[0]);	# save $A, modulo-scheduled
467e1051a39Sopenharmony_ci	 &xor	($AH[0],"edi");		# a ^= b, (b^c) in next round
468e1051a39Sopenharmony_ci	&ror	($t1,13-2);
469e1051a39Sopenharmony_ci	 &and	($AH[1],$AH[0]);	# (b^c) &= (a^b)
470e1051a39Sopenharmony_ci	 &lea	($E,&DWP(@K256[$i],$T,$E));	# T += Sigma1(e)+K[i]
471e1051a39Sopenharmony_ci	&xor	($t1,$t2);
472e1051a39Sopenharmony_ci	 &xor	($AH[1],"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
473e1051a39Sopenharmony_ci	 &mov	($t2,&DWP(32+4*(($i+2)&15),"esp"))	if ($i>=15 && $i<63);
474e1051a39Sopenharmony_ci	&ror	($t1,2);		# Sigma0(a)
475e1051a39Sopenharmony_ci
476e1051a39Sopenharmony_ci	 &add	($AH[1],$E);		# h += T
477e1051a39Sopenharmony_ci	 &add	($E,&off($d));		# d += T
478e1051a39Sopenharmony_ci	&add	($AH[1],$t1);		# h += Sigma0(a)
479e1051a39Sopenharmony_ci	 &mov	($t1,&DWP(32+4*(($i+15)&15),"esp"))	if ($i>=15 && $i<63);
480e1051a39Sopenharmony_ci
481e1051a39Sopenharmony_ci	@AH = reverse(@AH);		# rotate(a,h)
482e1051a39Sopenharmony_ci	($t1,$t2) = ($t2,$t1);		# rotate(t1,t2)
483e1051a39Sopenharmony_ci    }
    # Add the working state back into ctx and, in the same pass, refresh the
    # on-stack copies for the next block.
484e1051a39Sopenharmony_ci	&mov	("esi",&DWP(96,"esp"));	#ctx
485e1051a39Sopenharmony_ci					#&mov	($AH[0],&DWP(0,"esp"));
486e1051a39Sopenharmony_ci	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
487e1051a39Sopenharmony_ci					#&mov	("edi", &DWP(8,"esp"));
488e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(12,"esp"));
489e1051a39Sopenharmony_ci	&add	($AH[0],&DWP(0,"esi"));
490e1051a39Sopenharmony_ci	&add	($AH[1],&DWP(4,"esi"));
491e1051a39Sopenharmony_ci	&add	("edi",&DWP(8,"esi"));
492e1051a39Sopenharmony_ci	&add	("ecx",&DWP(12,"esi"));
493e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esi"),$AH[0]);
494e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esi"),$AH[1]);
495e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esi"),"edi");
496e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esi"),"ecx");
497e1051a39Sopenharmony_ci	 #&mov	(&DWP(0,"esp"),$AH[0]);
498e1051a39Sopenharmony_ci	 &mov	(&DWP(4,"esp"),$AH[1]);
499e1051a39Sopenharmony_ci	 &xor	($AH[1],"edi");		# magic
500e1051a39Sopenharmony_ci	 &mov	(&DWP(8,"esp"),"edi");
501e1051a39Sopenharmony_ci	 &mov	(&DWP(12,"esp"),"ecx");
502e1051a39Sopenharmony_ci	#&mov	($E,&DWP(16,"esp"));
503e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esp"));
504e1051a39Sopenharmony_ci	&mov	("ebx",&DWP(24,"esp"));
505e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(28,"esp"));
506e1051a39Sopenharmony_ci	&add	($E,&DWP(16,"esi"));
507e1051a39Sopenharmony_ci	&add	("edi",&DWP(20,"esi"));
508e1051a39Sopenharmony_ci	&add	("ebx",&DWP(24,"esi"));
509e1051a39Sopenharmony_ci	&add	("ecx",&DWP(28,"esi"));
510e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esi"),$E);
511e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esi"),"edi");
512e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esi"),"ebx");
513e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esi"),"ecx");
514e1051a39Sopenharmony_ci	 #&mov	(&DWP(16,"esp"),$E);
515e1051a39Sopenharmony_ci	 &mov	(&DWP(20,"esp"),"edi");
516e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
517e1051a39Sopenharmony_ci	 &mov	(&DWP(24,"esp"),"ebx");
518e1051a39Sopenharmony_ci	 &mov	(&DWP(28,"esp"),"ecx");
519e1051a39Sopenharmony_ci
520e1051a39Sopenharmony_ci	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
521e1051a39Sopenharmony_ci	&jb	(&label("grand_loop"));
522e1051a39Sopenharmony_ci
523e1051a39Sopenharmony_ci	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
524e1051a39Sopenharmony_ci&function_end_A();
525e1051a39Sopenharmony_ci}
526e1051a39Sopenharmony_ci						if (!$i386 && $xmm) {{{
527e1051a39Sopenharmony_ciif ($shaext) {
528e1051a39Sopenharmony_ci######################################################################
529e1051a39Sopenharmony_ci# Intel SHA Extensions implementation of SHA256 update function.
530e1051a39Sopenharmony_ci#
531e1051a39Sopenharmony_cimy ($ctx,$inp,$end)=("esi","edi","eax");
532e1051a39Sopenharmony_cimy ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
533e1051a39Sopenharmony_cimy @MSG=map("xmm$_",(3..6));
534e1051a39Sopenharmony_ci
535e1051a39Sopenharmony_cisub sha256op38 {
 # Emits a register-to-register instruction as four raw bytes:
 # 0x0f, 0x38, $opcodelet and a ModR/M byte of 0xc0 | (dst<<3) | src.
 # Arguments: opcode byte, destination register name, source register name.
 # Nothing is emitted unless both operands are xmm0..xmm7.
536e1051a39Sopenharmony_ci my ($opcodelet,$dst,$src)=@_;
537e1051a39Sopenharmony_ci    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
538e1051a39Sopenharmony_ci    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
539e1051a39Sopenharmony_ci}
# Thin wrappers binding the opcode byte for each of the three instructions;
# they take (dst, src) register names.
540e1051a39Sopenharmony_cisub sha256rnds2	{ sha256op38(0xcb,@_); }
541e1051a39Sopenharmony_cisub sha256msg1	{ sha256op38(0xcc,@_); }
542e1051a39Sopenharmony_cisub sha256msg2	{ sha256op38(0xcd,@_); }
543e1051a39Sopenharmony_ci
544e1051a39Sopenharmony_ci&set_label("shaext",32);
545e1051a39Sopenharmony_ci	&sub		("esp",32);
546e1051a39Sopenharmony_ci
547e1051a39Sopenharmony_ci	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
548e1051a39Sopenharmony_ci	&lea		($K256,&DWP(0x80,$K256));
549e1051a39Sopenharmony_ci	&movdqu		($CDGH,&QWP(16,$ctx));		# HGFE
550e1051a39Sopenharmony_ci	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
551e1051a39Sopenharmony_ci
552e1051a39Sopenharmony_ci	&pshufd		($Wi,$ABEF,0x1b);		# ABCD
553e1051a39Sopenharmony_ci	&pshufd		($ABEF,$ABEF,0xb1);		# CDAB
554e1051a39Sopenharmony_ci	&pshufd		($CDGH,$CDGH,0x1b);		# EFGH
555e1051a39Sopenharmony_ci	&palignr	($ABEF,$CDGH,8);		# ABEF
556e1051a39Sopenharmony_ci	&punpcklqdq	($CDGH,$Wi);			# CDGH
557e1051a39Sopenharmony_ci	&jmp		(&label("loop_shaext"));
558e1051a39Sopenharmony_ci
# Top of the per-block loop of the "shaext" path (rounds 0-51 are emitted
# here; the remaining rounds follow directly after this section).
# One 64-byte block is read from $inp as four 16-byte vectors @MSG[0..3],
# each byte-shuffled with the mask held in $TMP.  The state on entry is
# copied to the stack ("offload") so it can be added back after round 63.
# Every four-round group has the same shape: $Wi = table entry + message
# vector, one sha256rnds2, pshufd 0x0e to bring the upper two dwords of
# $Wi down, then a second sha256rnds2 with the state operands swapped.
# sha256msg1/sha256msg2 and the palignr+paddd pair build the next message
# vectors in between.  $TMP doubles as scratch from rounds 8-11 onwards,
# so the byte swap mask is no longer available in it after that point.
# NOTE(review): sha256rnds2 is called with two operands only; $Wi is
# presumably its implicit third operand -- confirm against sha256op38().
559e1051a39Sopenharmony_ci&set_label("loop_shaext",16);
560e1051a39Sopenharmony_ci	&movdqu		(@MSG[0],&QWP(0,$inp));
561e1051a39Sopenharmony_ci	&movdqu		(@MSG[1],&QWP(0x10,$inp));
562e1051a39Sopenharmony_ci	&movdqu		(@MSG[2],&QWP(0x20,$inp));
563e1051a39Sopenharmony_ci	&pshufb		(@MSG[0],$TMP);
564e1051a39Sopenharmony_ci	&movdqu		(@MSG[3],&QWP(0x30,$inp));
565e1051a39Sopenharmony_ci	&movdqa		(&QWP(16,"esp"),$CDGH);		# offload
566e1051a39Sopenharmony_ci
567e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(0*16-0x80,$K256));
568e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[0]);
569e1051a39Sopenharmony_ci	&pshufb		(@MSG[1],$TMP);
570e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 0-3
571e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
572e1051a39Sopenharmony_ci	&nop		();
573e1051a39Sopenharmony_ci	&movdqa		(&QWP(0,"esp"),$ABEF);		# offload
574e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
575e1051a39Sopenharmony_ci
576e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(1*16-0x80,$K256));
577e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[1]);
578e1051a39Sopenharmony_ci	&pshufb		(@MSG[2],$TMP);
579e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 4-7
580e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
581e1051a39Sopenharmony_ci	&lea		($inp,&DWP(0x40,$inp));
582e1051a39Sopenharmony_ci	&sha256msg1	(@MSG[0],@MSG[1]);
583e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
584e1051a39Sopenharmony_ci
585e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(2*16-0x80,$K256));
586e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[2]);
587e1051a39Sopenharmony_ci	&pshufb		(@MSG[3],$TMP);
588e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 8-11
589e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
590e1051a39Sopenharmony_ci	&movdqa		($TMP,@MSG[3]);
591e1051a39Sopenharmony_ci	&palignr	($TMP,@MSG[2],4);
592e1051a39Sopenharmony_ci	&nop		();
593e1051a39Sopenharmony_ci	&paddd		(@MSG[0],$TMP);
594e1051a39Sopenharmony_ci	&sha256msg1	(@MSG[1],@MSG[2]);
595e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
596e1051a39Sopenharmony_ci
597e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(3*16-0x80,$K256));
598e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[3]);
599e1051a39Sopenharmony_ci	&sha256msg2	(@MSG[0],@MSG[3]);
600e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 12-15
601e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
602e1051a39Sopenharmony_ci	&movdqa		($TMP,@MSG[0]);
603e1051a39Sopenharmony_ci	&palignr	($TMP,@MSG[3],4);
604e1051a39Sopenharmony_ci	&nop		();
605e1051a39Sopenharmony_ci	&paddd		(@MSG[1],$TMP);
606e1051a39Sopenharmony_ci	&sha256msg1	(@MSG[2],@MSG[3]);
607e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
608e1051a39Sopenharmony_ci
# Generation-time loop: emits nine more four-round groups ($i = 4..12,
# i.e. rounds 16-51).  @MSG is rotated after every group so that the same
# template addresses the next message register each time.
609e1051a39Sopenharmony_cifor($i=4;$i<16-3;$i++) {
610e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP($i*16-0x80,$K256));
611e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[0]);
612e1051a39Sopenharmony_ci	&sha256msg2	(@MSG[1],@MSG[0]);
613e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 16-19...
614e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
615e1051a39Sopenharmony_ci	&movdqa		($TMP,@MSG[1]);
616e1051a39Sopenharmony_ci	&palignr	($TMP,@MSG[0],4);
617e1051a39Sopenharmony_ci	&nop		();
618e1051a39Sopenharmony_ci	&paddd		(@MSG[2],$TMP);
619e1051a39Sopenharmony_ci	&sha256msg1	(@MSG[3],@MSG[0]);
620e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
621e1051a39Sopenharmony_ci
622e1051a39Sopenharmony_ci	push(@MSG,shift(@MSG));
623e1051a39Sopenharmony_ci}
# Final three four-round groups (rounds 52-63) of the "shaext" loop body,
# followed by the feed-forward and the loop branch.
# Message expansion winds down here: only the values still needed for the
# last rounds are produced, so there is no further sha256msg1.
# Every statement below is terminated explicitly, so that each line is an
# independent call and emits exactly one instruction in source order.
624e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(13*16-0x80,$K256));
625e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[0]);
626e1051a39Sopenharmony_ci	&sha256msg2	(@MSG[1],@MSG[0]);
627e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 52-55
628e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
629e1051a39Sopenharmony_ci	&movdqa		($TMP,@MSG[1]);
630e1051a39Sopenharmony_ci	&palignr	($TMP,@MSG[0],4);
631e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
632e1051a39Sopenharmony_ci	&paddd		(@MSG[2],$TMP);
633e1051a39Sopenharmony_ci
# $TMP was used as scratch above, so the byte swap mask is reloaded here
# ready for the next iteration's pshufb.
634e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(14*16-0x80,$K256));
635e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[1]);
636e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 56-59
637e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
638e1051a39Sopenharmony_ci	&sha256msg2	(@MSG[2],@MSG[1]);
639e1051a39Sopenharmony_ci	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
640e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
641e1051a39Sopenharmony_ci
# The cmp below sets the flags that the jnz at the end of this section
# consumes; only SIMD instructions and a nop sit in between.
642e1051a39Sopenharmony_ci	&movdqa		($Wi,&QWP(15*16-0x80,$K256));
643e1051a39Sopenharmony_ci	&paddd		($Wi,@MSG[2]);
644e1051a39Sopenharmony_ci	&nop		();
645e1051a39Sopenharmony_ci	&sha256rnds2	($CDGH,$ABEF);			# 60-63
646e1051a39Sopenharmony_ci	&pshufd		($Wi,$Wi,0x0e);
647e1051a39Sopenharmony_ci	&cmp		($end,$inp);
648e1051a39Sopenharmony_ci	&nop		();
649e1051a39Sopenharmony_ci	&sha256rnds2	($ABEF,$CDGH);
650e1051a39Sopenharmony_ci
# Feed-forward: add the state that was offloaded to the stack at the top
# of the iteration, then loop while $end != $inp.
651e1051a39Sopenharmony_ci	&paddd		($CDGH,&QWP(16,"esp"));
652e1051a39Sopenharmony_ci	&paddd		($ABEF,&QWP(0,"esp"));
653e1051a39Sopenharmony_ci	&jnz		(&label("loop_shaext"));
654e1051a39Sopenharmony_ci
# Exit of the "shaext" path: shuffle the ABEF / CDGH register layout back
# into the in-memory order (DCBA / HGFE, per the per-line notes) and store
# both halves to $ctx.  esp is reloaded from the dword at 32+12(esp).
# NOTE(review): that slot is presumably the original stack pointer saved
# by the function prologue outside this chunk -- confirm.
655e1051a39Sopenharmony_ci	&pshufd		($CDGH,$CDGH,0xb1);		# DCHG
656e1051a39Sopenharmony_ci	&pshufd		($TMP,$ABEF,0x1b);		# FEBA
657e1051a39Sopenharmony_ci	&pshufd		($ABEF,$ABEF,0xb1);		# BAFE
658e1051a39Sopenharmony_ci	&punpckhqdq	($ABEF,$CDGH);			# DCBA
659e1051a39Sopenharmony_ci	&palignr	($CDGH,$TMP,8);			# HGFE
660e1051a39Sopenharmony_ci
661e1051a39Sopenharmony_ci	&mov		("esp",&DWP(32+12,"esp"));
662e1051a39Sopenharmony_ci	&movdqu		(&QWP(0,$ctx),$ABEF);
663e1051a39Sopenharmony_ci	&movdqu		(&QWP(16,$ctx),$CDGH);
664e1051a39Sopenharmony_ci&function_end_A();
665e1051a39Sopenharmony_ci}
666e1051a39Sopenharmony_ci
# Register names shared by the SIMD-assisted paths that follow:
#   @X        - xmm0..xmm3, the four message-schedule vectors
#   $t0..$t3  - xmm4..xmm7, scratch vectors
#   @AH       - the pair ($A,$T); body_00_15/bodyx_00_15 swap its two
#               elements after every round
667e1051a39Sopenharmony_cimy @X = map("xmm$_",(0..3));
668e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
669e1051a39Sopenharmony_cimy @AH = ($A,$T);
670e1051a39Sopenharmony_ci
# Entry of the "SSSE3" path.  Reserves 96 bytes of stack and copies the
# hash state read through esi into it.  Slots 0 and 16 are deliberately
# not written (see the commented-out stores): those two words stay in
# $AH[0] and $E.  The "magic" xor leaves h[1]^h[2] in $AH[1], the value
# the round body expects there for its Maj computation.  Finally edi is
# loaded with the input pointer and $t3 with the 16 bytes at 256($K256),
# which grand_ssse3 uses as its pshufb mask.
671e1051a39Sopenharmony_ci&set_label("SSSE3",32);
672e1051a39Sopenharmony_ci	&lea	("esp",&DWP(-96,"esp"));
673e1051a39Sopenharmony_ci	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
674e1051a39Sopenharmony_ci	&mov	($AH[0],&DWP(0,"esi"));
675e1051a39Sopenharmony_ci	&mov	($AH[1],&DWP(4,"esi"));
676e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(8,"esi"));
677e1051a39Sopenharmony_ci	&mov	("edi",&DWP(12,"esi"));
678e1051a39Sopenharmony_ci	#&mov	(&DWP(0,"esp"),$AH[0]);
679e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),$AH[1]);
680e1051a39Sopenharmony_ci	&xor	($AH[1],"ecx");			# magic
681e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),"ecx");
682e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),"edi");
683e1051a39Sopenharmony_ci	&mov	($E,&DWP(16,"esi"));
684e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esi"));
685e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esi"));
686e1051a39Sopenharmony_ci	&mov	("esi",&DWP(28,"esi"));
687e1051a39Sopenharmony_ci	#&mov	(&DWP(16,"esp"),$E);
688e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),"edi");
689e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
690e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),"ecx");
691e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),"esi");
692e1051a39Sopenharmony_ci	&movdqa	($t3,&QWP(256,$K256));
693e1051a39Sopenharmony_ci	&jmp	(&label("grand_ssse3"));
694e1051a39Sopenharmony_ci
# Per-block loop head of the "SSSE3" path.
# Reads 64 input bytes through edi into @X[0..3], advances edi by 64 and
# writes it back to the 96+4(esp) slot, shuffles each vector with the mask
# in $t3, adds the first four 16-byte table entries and parks the sums at
# 32..95(esp), where the scalar rounds pick them up one dword at a time.
# $t3 is overwritten with the last sum here, so the mask has to be loaded
# again before the next pass through this label.
695e1051a39Sopenharmony_ci&set_label("grand_ssse3",16);
696e1051a39Sopenharmony_ci	# load input, reverse byte order, add K256[0..15], save to stack
697e1051a39Sopenharmony_ci	&movdqu	(@X[0],&QWP(0,"edi"));
698e1051a39Sopenharmony_ci	&movdqu	(@X[1],&QWP(16,"edi"));
699e1051a39Sopenharmony_ci	&movdqu	(@X[2],&QWP(32,"edi"));
700e1051a39Sopenharmony_ci	&movdqu	(@X[3],&QWP(48,"edi"));
701e1051a39Sopenharmony_ci	&add	("edi",64);
702e1051a39Sopenharmony_ci	&pshufb	(@X[0],$t3);
703e1051a39Sopenharmony_ci	&mov	(&DWP(96+4,"esp"),"edi");
704e1051a39Sopenharmony_ci	&pshufb	(@X[1],$t3);
705e1051a39Sopenharmony_ci	&movdqa	($t0,&QWP(0,$K256));
706e1051a39Sopenharmony_ci	&pshufb	(@X[2],$t3);
707e1051a39Sopenharmony_ci	&movdqa	($t1,&QWP(16,$K256));
708e1051a39Sopenharmony_ci	&paddd	($t0,@X[0]);
709e1051a39Sopenharmony_ci	&pshufb	(@X[3],$t3);
710e1051a39Sopenharmony_ci	&movdqa	($t2,&QWP(32,$K256));
711e1051a39Sopenharmony_ci	&paddd	($t1,@X[1]);
712e1051a39Sopenharmony_ci	&movdqa	($t3,&QWP(48,$K256));
713e1051a39Sopenharmony_ci	&movdqa	(&QWP(32+0,"esp"),$t0);
714e1051a39Sopenharmony_ci	&paddd	($t2,@X[2]);
715e1051a39Sopenharmony_ci	&movdqa	(&QWP(32+16,"esp"),$t1);
716e1051a39Sopenharmony_ci	&paddd	($t3,@X[3]);
717e1051a39Sopenharmony_ci	&movdqa	(&QWP(32+32,"esp"),$t2);
718e1051a39Sopenharmony_ci	&movdqa	(&QWP(32+48,"esp"),$t3);
719e1051a39Sopenharmony_ci	&jmp	(&label("ssse3_00_47"));
720e1051a39Sopenharmony_ci
# Head of the 16-round inner loop: step $K256 to the next 64 bytes of the
# constant table before the interleaved rounds emitted below run.
721e1051a39Sopenharmony_ci&set_label("ssse3_00_47",16);
722e1051a39Sopenharmony_ci	&add		($K256,64);
723e1051a39Sopenharmony_ci
# SSSE3_00_47($j, $body, @X)
#
# Emits four scalar rounds interleaved with the SIMD update of one
# message-schedule vector.
#   $j    - index (0..3) of the vector being produced; selects the
#           16*$j($K256) table entry and the 32+16*$j(esp) stack slot
#   $body - code ref returning the list of instruction strings for one
#           round; it is invoked four times to build @insns
#   @X    - the four schedule registers in their current rotation;
#           @X[0] is updated in place
# Every "eval(shift(@insns))" emits the next scalar instruction, so the
# position of each SIMD call between them fixes the instruction
# scheduling; the statement order here is part of the behaviour.
# Whatever scalar instructions remain are flushed at the end, after which
# $t2 (= table entry + new @X[0]) is stored for the rounds that consume it.
# The empty prototype is not enforced: the callers in this file invoke the
# sub with a leading "&" and pass arguments.
724e1051a39Sopenharmony_cisub SSSE3_00_47 () {
725e1051a39Sopenharmony_cimy $j = shift;
726e1051a39Sopenharmony_cimy $body = shift;
727e1051a39Sopenharmony_cimy @X = @_;
728e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
729e1051a39Sopenharmony_ci
730e1051a39Sopenharmony_ci	  eval(shift(@insns));
731e1051a39Sopenharmony_ci	&movdqa		($t0,@X[1]);
732e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
733e1051a39Sopenharmony_ci	  eval(shift(@insns));
734e1051a39Sopenharmony_ci	&movdqa		($t3,@X[3]);
735e1051a39Sopenharmony_ci	  eval(shift(@insns));
736e1051a39Sopenharmony_ci	  eval(shift(@insns));
737e1051a39Sopenharmony_ci	&palignr	($t0,@X[0],4);		# X[1..4]
738e1051a39Sopenharmony_ci	  eval(shift(@insns));
739e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
740e1051a39Sopenharmony_ci	  eval(shift(@insns));
741e1051a39Sopenharmony_ci	 &palignr	($t3,@X[2],4);		# X[9..12]
742e1051a39Sopenharmony_ci	  eval(shift(@insns));
743e1051a39Sopenharmony_ci	  eval(shift(@insns));
744e1051a39Sopenharmony_ci	  eval(shift(@insns));
745e1051a39Sopenharmony_ci	&movdqa		($t1,$t0);
746e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
747e1051a39Sopenharmony_ci	  eval(shift(@insns));
748e1051a39Sopenharmony_ci	&movdqa		($t2,$t0);
749e1051a39Sopenharmony_ci	  eval(shift(@insns));
750e1051a39Sopenharmony_ci	  eval(shift(@insns));
751e1051a39Sopenharmony_ci	&psrld		($t0,3);
752e1051a39Sopenharmony_ci	  eval(shift(@insns));
753e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
754e1051a39Sopenharmony_ci	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
755e1051a39Sopenharmony_ci	  eval(shift(@insns));
756e1051a39Sopenharmony_ci	  eval(shift(@insns));
757e1051a39Sopenharmony_ci	&psrld		($t2,7);
758e1051a39Sopenharmony_ci	  eval(shift(@insns));
759e1051a39Sopenharmony_ci	  eval(shift(@insns));
760e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
761e1051a39Sopenharmony_ci	  eval(shift(@insns));
762e1051a39Sopenharmony_ci	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
763e1051a39Sopenharmony_ci	  eval(shift(@insns));
764e1051a39Sopenharmony_ci	  eval(shift(@insns));
765e1051a39Sopenharmony_ci	&pslld		($t1,32-18);
766e1051a39Sopenharmony_ci	  eval(shift(@insns));
767e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
768e1051a39Sopenharmony_ci	&pxor		($t0,$t2);
769e1051a39Sopenharmony_ci	  eval(shift(@insns));
770e1051a39Sopenharmony_ci	  eval(shift(@insns));
771e1051a39Sopenharmony_ci	&psrld		($t2,18-7);
772e1051a39Sopenharmony_ci	  eval(shift(@insns));
773e1051a39Sopenharmony_ci	  eval(shift(@insns));
774e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
775e1051a39Sopenharmony_ci	&pxor		($t0,$t1);
776e1051a39Sopenharmony_ci	  eval(shift(@insns));
777e1051a39Sopenharmony_ci	  eval(shift(@insns));
778e1051a39Sopenharmony_ci	&pslld		($t1,18-7);
779e1051a39Sopenharmony_ci	  eval(shift(@insns));
780e1051a39Sopenharmony_ci	  eval(shift(@insns));
781e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
782e1051a39Sopenharmony_ci	&pxor		($t0,$t2);
783e1051a39Sopenharmony_ci	  eval(shift(@insns));
784e1051a39Sopenharmony_ci	  eval(shift(@insns));
785e1051a39Sopenharmony_ci	 &movdqa	($t2,$t3);
786e1051a39Sopenharmony_ci	  eval(shift(@insns));
787e1051a39Sopenharmony_ci	  eval(shift(@insns));
788e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
789e1051a39Sopenharmony_ci	&pxor		($t0,$t1);		# sigma0(X[1..4])
790e1051a39Sopenharmony_ci	  eval(shift(@insns));
791e1051a39Sopenharmony_ci	  eval(shift(@insns));
792e1051a39Sopenharmony_ci	 &psrld		($t3,10);
793e1051a39Sopenharmony_ci	  eval(shift(@insns));
794e1051a39Sopenharmony_ci	  eval(shift(@insns));
795e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
796e1051a39Sopenharmony_ci	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
797e1051a39Sopenharmony_ci	  eval(shift(@insns));
798e1051a39Sopenharmony_ci	  eval(shift(@insns));
799e1051a39Sopenharmony_ci	 &psrlq		($t2,17);
800e1051a39Sopenharmony_ci	  eval(shift(@insns));
801e1051a39Sopenharmony_ci	  eval(shift(@insns));
802e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
803e1051a39Sopenharmony_ci	 &pxor		($t3,$t2);
804e1051a39Sopenharmony_ci	  eval(shift(@insns));
805e1051a39Sopenharmony_ci	  eval(shift(@insns));
806e1051a39Sopenharmony_ci	 &psrlq		($t2,19-17);
807e1051a39Sopenharmony_ci	  eval(shift(@insns));
808e1051a39Sopenharmony_ci	  eval(shift(@insns));
809e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
810e1051a39Sopenharmony_ci	 &pxor		($t3,$t2);
811e1051a39Sopenharmony_ci	  eval(shift(@insns));
812e1051a39Sopenharmony_ci	  eval(shift(@insns));
813e1051a39Sopenharmony_ci	 &pshufd	($t3,$t3,0b10000000);
814e1051a39Sopenharmony_ci	  eval(shift(@insns));
815e1051a39Sopenharmony_ci	  eval(shift(@insns));
816e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
817e1051a39Sopenharmony_ci	  eval(shift(@insns));
818e1051a39Sopenharmony_ci	  eval(shift(@insns));
819e1051a39Sopenharmony_ci	  eval(shift(@insns));
820e1051a39Sopenharmony_ci	  eval(shift(@insns));
821e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
822e1051a39Sopenharmony_ci	  eval(shift(@insns));
823e1051a39Sopenharmony_ci	 &psrldq	($t3,8);
824e1051a39Sopenharmony_ci	  eval(shift(@insns));
825e1051a39Sopenharmony_ci	  eval(shift(@insns));
826e1051a39Sopenharmony_ci	  eval(shift(@insns));
827e1051a39Sopenharmony_ci	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
828e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
829e1051a39Sopenharmony_ci	  eval(shift(@insns));
830e1051a39Sopenharmony_ci	  eval(shift(@insns));
831e1051a39Sopenharmony_ci	  eval(shift(@insns));
832e1051a39Sopenharmony_ci	  eval(shift(@insns));
833e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
834e1051a39Sopenharmony_ci	  eval(shift(@insns));
835e1051a39Sopenharmony_ci	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
836e1051a39Sopenharmony_ci	  eval(shift(@insns));
837e1051a39Sopenharmony_ci	  eval(shift(@insns));
838e1051a39Sopenharmony_ci	  eval(shift(@insns));
839e1051a39Sopenharmony_ci	 &movdqa	($t2,$t3);
840e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
841e1051a39Sopenharmony_ci	 &psrld		($t3,10);
842e1051a39Sopenharmony_ci	  eval(shift(@insns));
843e1051a39Sopenharmony_ci	 &psrlq		($t2,17);
844e1051a39Sopenharmony_ci	  eval(shift(@insns));
845e1051a39Sopenharmony_ci	  eval(shift(@insns));
846e1051a39Sopenharmony_ci	  eval(shift(@insns));
847e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
848e1051a39Sopenharmony_ci	 &pxor		($t3,$t2);
849e1051a39Sopenharmony_ci	  eval(shift(@insns));
850e1051a39Sopenharmony_ci	  eval(shift(@insns));
851e1051a39Sopenharmony_ci	 &psrlq		($t2,19-17);
852e1051a39Sopenharmony_ci	  eval(shift(@insns));
853e1051a39Sopenharmony_ci	  eval(shift(@insns));
854e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
855e1051a39Sopenharmony_ci	 &pxor		($t3,$t2);
856e1051a39Sopenharmony_ci	  eval(shift(@insns));
857e1051a39Sopenharmony_ci	  eval(shift(@insns));
858e1051a39Sopenharmony_ci	  eval(shift(@insns));
859e1051a39Sopenharmony_ci	 &pshufd	($t3,$t3,0b00001000);
860e1051a39Sopenharmony_ci	  eval(shift(@insns));
861e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
862e1051a39Sopenharmony_ci	&movdqa		($t2,&QWP(16*$j,$K256));
863e1051a39Sopenharmony_ci	  eval(shift(@insns));
864e1051a39Sopenharmony_ci	  eval(shift(@insns));
865e1051a39Sopenharmony_ci	 &pslldq	($t3,8);
866e1051a39Sopenharmony_ci	  eval(shift(@insns));
867e1051a39Sopenharmony_ci	  eval(shift(@insns));
868e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
869e1051a39Sopenharmony_ci	  eval(shift(@insns));
870e1051a39Sopenharmony_ci	  eval(shift(@insns));
871e1051a39Sopenharmony_ci	  eval(shift(@insns));
872e1051a39Sopenharmony_ci	  eval(shift(@insns));
873e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
874e1051a39Sopenharmony_ci	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
875e1051a39Sopenharmony_ci	  eval(shift(@insns));
876e1051a39Sopenharmony_ci	  eval(shift(@insns));
877e1051a39Sopenharmony_ci	  eval(shift(@insns));
878e1051a39Sopenharmony_ci	  eval(shift(@insns));
879e1051a39Sopenharmony_ci	&paddd		($t2,@X[0]);
880e1051a39Sopenharmony_ci	  eval(shift(@insns));			# @
881e1051a39Sopenharmony_ci
882e1051a39Sopenharmony_ci	foreach (@insns) { eval; }		# remaining instructions
883e1051a39Sopenharmony_ci
884e1051a39Sopenharmony_ci	&movdqa		(&QWP(32+16*$j,"esp"),$t2);
885e1051a39Sopenharmony_ci}
886e1051a39Sopenharmony_ci
# body_00_15()
#
# Returns the scalar code for one round as a list of 30 strings.  Each
# string is Perl source that the caller evals to emit one instruction, so
# callers are free to interleave other instructions between them.
# The operands are interpolated at eval time, not here: $E, @AH, $i and
# the &off(...) stack slots are resolved when each string is evaluated.
# The final element is two statements joined with "." -- the last add plus
# the bookkeeping that swaps the two @AH registers and increments $i -- so
# evaluating the whole list advances the generator by exactly one round.
# ecx, esi and edi are used as scratch registers.
# NOTE(review): &off() and $a..$h are defined outside this chunk;
# presumably they map the rotating working variables onto stack slots.
887e1051a39Sopenharmony_cisub body_00_15 () {
888e1051a39Sopenharmony_ci	(
889e1051a39Sopenharmony_ci	'&mov	("ecx",$E);',
890e1051a39Sopenharmony_ci	'&ror	($E,25-11);',
891e1051a39Sopenharmony_ci	 '&mov	("esi",&off($f));',
892e1051a39Sopenharmony_ci	'&xor	($E,"ecx");',
893e1051a39Sopenharmony_ci	 '&mov	("edi",&off($g));',
894e1051a39Sopenharmony_ci	 '&xor	("esi","edi");',
895e1051a39Sopenharmony_ci	'&ror	($E,11-6);',
896e1051a39Sopenharmony_ci	 '&and	("esi","ecx");',
897e1051a39Sopenharmony_ci	 '&mov	(&off($e),"ecx");',	# save $E, modulo-scheduled
898e1051a39Sopenharmony_ci	'&xor	($E,"ecx");',
899e1051a39Sopenharmony_ci	 '&xor	("edi","esi");',	# Ch(e,f,g)
900e1051a39Sopenharmony_ci	'&ror	($E,6);',		# T = Sigma1(e)
901e1051a39Sopenharmony_ci	 '&mov	("ecx",$AH[0]);',
902e1051a39Sopenharmony_ci	 '&add	($E,"edi");',		# T += Ch(e,f,g)
903e1051a39Sopenharmony_ci	 '&mov	("edi",&off($b));',
904e1051a39Sopenharmony_ci	'&mov	("esi",$AH[0]);',
905e1051a39Sopenharmony_ci
906e1051a39Sopenharmony_ci	'&ror	("ecx",22-13);',
907e1051a39Sopenharmony_ci	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
908e1051a39Sopenharmony_ci	'&xor	("ecx",$AH[0]);',
909e1051a39Sopenharmony_ci	 '&xor	($AH[0],"edi");',	# a ^= b, (b^c) in next round
910e1051a39Sopenharmony_ci	 '&add	($E,&off($h));',	# T += h
911e1051a39Sopenharmony_ci	'&ror	("ecx",13-2);',
912e1051a39Sopenharmony_ci	 '&and	($AH[1],$AH[0]);',	# (b^c) &= (a^b)
913e1051a39Sopenharmony_ci	'&xor	("ecx","esi");',
914e1051a39Sopenharmony_ci	 '&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
915e1051a39Sopenharmony_ci	 '&xor	($AH[1],"edi");',	# h = Maj(a,b,c) = Ch(a^b,c,b)
916e1051a39Sopenharmony_ci	'&ror	("ecx",2);',		# Sigma0(a)
917e1051a39Sopenharmony_ci
918e1051a39Sopenharmony_ci	 '&add	($AH[1],$E);',		# h += T
919e1051a39Sopenharmony_ci	 '&add	($E,&off($d));',	# d += T
920e1051a39Sopenharmony_ci	'&add	($AH[1],"ecx");'.	# h += Sigma0(a)
921e1051a39Sopenharmony_ci
922e1051a39Sopenharmony_ci	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
923e1051a39Sopenharmony_ci	);
924e1051a39Sopenharmony_ci}
925e1051a39Sopenharmony_ci
# Body of the ssse3_00_47 loop: four SSSE3_00_47 expansions, i.e. sixteen
# scalar rounds interleaved with the update of all four schedule vectors.
# @X is rotated after each call so @X[0] is always the vector to update.
926e1051a39Sopenharmony_ci    for ($i=0,$j=0; $j<4; $j++) {
927e1051a39Sopenharmony_ci	&SSSE3_00_47($j,\&body_00_15,@X);
928e1051a39Sopenharmony_ci	push(@X,shift(@X));		# rotate(@X)
929e1051a39Sopenharmony_ci    }
# $j is 4 here, so the dword at 64($K256) is tested.  The loop repeats
# until that dword equals 0x00010203.
# NOTE(review): 0x00010203 is presumably the first dword of the byte-swap
# mask stored right after the constant table (the mask is loaded from
# 256($K256) on entry and from 64($K256) below) -- confirm against the
# table definition, which is outside this chunk.
930e1051a39Sopenharmony_ci	&cmp	(&DWP(16*$j,$K256),0x00010203);
931e1051a39Sopenharmony_ci	&jne	(&label("ssse3_00_47"));
932e1051a39Sopenharmony_ci
# Last sixteen rounds: plain scalar rounds, no schedule update needed.
# $i is advanced by the code strings themselves, hence the empty
# increment clause.
933e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
934e1051a39Sopenharmony_ci	foreach(body_00_15()) { eval; }
935e1051a39Sopenharmony_ci    }
936e1051a39Sopenharmony_ci
# Feed-forward: add the working variables to the context words, store the
# sums to the context and refresh the stack copies for the next block.
# After the last round edi still holds c and $AH[1] holds b^c, so a single
# xor recovers b; that is why the loads shown as commented-out code are
# not needed.  The second xor re-creates the b^c value ("magic") that the
# round body expects in $AH[1] on entry.
937e1051a39Sopenharmony_ci	&mov	("esi",&DWP(96,"esp"));	#ctx
938e1051a39Sopenharmony_ci					#&mov	($AH[0],&DWP(0,"esp"));
939e1051a39Sopenharmony_ci	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
940e1051a39Sopenharmony_ci					#&mov	("edi", &DWP(8,"esp"));
941e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(12,"esp"));
942e1051a39Sopenharmony_ci	&add	($AH[0],&DWP(0,"esi"));
943e1051a39Sopenharmony_ci	&add	($AH[1],&DWP(4,"esi"));
944e1051a39Sopenharmony_ci	&add	("edi",&DWP(8,"esi"));
945e1051a39Sopenharmony_ci	&add	("ecx",&DWP(12,"esi"));
946e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esi"),$AH[0]);
947e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esi"),$AH[1]);
948e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esi"),"edi");
949e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esi"),"ecx");
950e1051a39Sopenharmony_ci	 #&mov	(&DWP(0,"esp"),$AH[0]);
951e1051a39Sopenharmony_ci	 &mov	(&DWP(4,"esp"),$AH[1]);
952e1051a39Sopenharmony_ci	 &xor	($AH[1],"edi");			# magic
953e1051a39Sopenharmony_ci	 &mov	(&DWP(8,"esp"),"edi");
954e1051a39Sopenharmony_ci	 &mov	(&DWP(12,"esp"),"ecx");
955e1051a39Sopenharmony_ci	#&mov	($E,&DWP(16,"esp"));
956e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esp"));
957e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esp"));
958e1051a39Sopenharmony_ci	&add	($E,&DWP(16,"esi"));
959e1051a39Sopenharmony_ci	&add	("edi",&DWP(20,"esi"));
960e1051a39Sopenharmony_ci	&add	("ecx",&DWP(24,"esi"));
961e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esi"),$E);
962e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esi"),"edi");
963e1051a39Sopenharmony_ci	 &mov	(&DWP(20,"esp"),"edi");
964e1051a39Sopenharmony_ci	&mov	("edi",&DWP(28,"esp"));
965e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esi"),"ecx");
966e1051a39Sopenharmony_ci	 #&mov	(&DWP(16,"esp"),$E);
967e1051a39Sopenharmony_ci	&add	("edi",&DWP(28,"esi"));
968e1051a39Sopenharmony_ci	 &mov	(&DWP(24,"esp"),"ecx");
969e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esi"),"edi");
970e1051a39Sopenharmony_ci	 &mov	(&DWP(28,"esp"),"edi");
971e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
972e1051a39Sopenharmony_ci
# Reload the pshufb mask into $t3 (grand_ssse3 overwrote it) while $K256
# still points at the last 64-byte group, then rewind $K256 by the three
# 64-byte steps taken in the inner loop.
973e1051a39Sopenharmony_ci	&movdqa	($t3,&QWP(64,$K256));
974e1051a39Sopenharmony_ci	&sub	($K256,3*64);			# rewind K
975e1051a39Sopenharmony_ci	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
976e1051a39Sopenharmony_ci	&jb	(&label("grand_ssse3"));
977e1051a39Sopenharmony_ci
978e1051a39Sopenharmony_ci	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
979e1051a39Sopenharmony_ci&function_end_A();
980e1051a39Sopenharmony_ci						if ($avx) {
# Entry of the "AVX" path.  When the generator was run with $avx > 1 the
# emitted code first tests bits 3 and 8 of edx and diverts to "AVX_BMI" if
# both are set (BMI1 and BMI2 per the note on the and).
# NOTE(review): edx is loaded by dispatch code outside this chunk;
# presumably a CPUID capability word -- confirm.
# The rest mirrors the SSSE3 entry: reserve 96 bytes of stack, copy the
# state there (A and E stay in registers), pre-compute the "magic" b^c in
# $AH[1], and load the pshufb mask from 256($K256) into $t3.
981e1051a39Sopenharmony_ci&set_label("AVX",32);
982e1051a39Sopenharmony_ci						if ($avx>1) {
983e1051a39Sopenharmony_ci	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
984e1051a39Sopenharmony_ci	&cmp	("edx",1<<8|1<<3);
985e1051a39Sopenharmony_ci	&je	(&label("AVX_BMI"));
986e1051a39Sopenharmony_ci						}
987e1051a39Sopenharmony_ci	&lea	("esp",&DWP(-96,"esp"));
988e1051a39Sopenharmony_ci	&vzeroall	();
989e1051a39Sopenharmony_ci	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
990e1051a39Sopenharmony_ci	&mov	($AH[0],&DWP(0,"esi"));
991e1051a39Sopenharmony_ci	&mov	($AH[1],&DWP(4,"esi"));
992e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(8,"esi"));
993e1051a39Sopenharmony_ci	&mov	("edi",&DWP(12,"esi"));
994e1051a39Sopenharmony_ci	#&mov	(&DWP(0,"esp"),$AH[0]);
995e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),$AH[1]);
996e1051a39Sopenharmony_ci	&xor	($AH[1],"ecx");			# magic
997e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),"ecx");
998e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),"edi");
999e1051a39Sopenharmony_ci	&mov	($E,&DWP(16,"esi"));
1000e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esi"));
1001e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esi"));
1002e1051a39Sopenharmony_ci	&mov	("esi",&DWP(28,"esi"));
1003e1051a39Sopenharmony_ci	#&mov	(&DWP(16,"esp"),$E);
1004e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),"edi");
1005e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
1006e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),"ecx");
1007e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),"esi");
1008e1051a39Sopenharmony_ci	&vmovdqa	($t3,&QWP(256,$K256));
1009e1051a39Sopenharmony_ci	&jmp	(&label("grand_avx"));
1010e1051a39Sopenharmony_ci
# Per-block loop head of the "AVX" path.  Same job as grand_ssse3, but the
# table entries are added straight from memory by the three-operand
# vpaddd, so no separate loads into $t0..$t3 are needed.  $t3 ends up
# holding the last sum rather than the shuffle mask.
1011e1051a39Sopenharmony_ci&set_label("grand_avx",32);
1012e1051a39Sopenharmony_ci	# load input, reverse byte order, add K256[0..15], save to stack
1013e1051a39Sopenharmony_ci	&vmovdqu	(@X[0],&QWP(0,"edi"));
1014e1051a39Sopenharmony_ci	&vmovdqu	(@X[1],&QWP(16,"edi"));
1015e1051a39Sopenharmony_ci	&vmovdqu	(@X[2],&QWP(32,"edi"));
1016e1051a39Sopenharmony_ci	&vmovdqu	(@X[3],&QWP(48,"edi"));
1017e1051a39Sopenharmony_ci	&add		("edi",64);
1018e1051a39Sopenharmony_ci	&vpshufb	(@X[0],@X[0],$t3);
1019e1051a39Sopenharmony_ci	&mov		(&DWP(96+4,"esp"),"edi");
1020e1051a39Sopenharmony_ci	&vpshufb	(@X[1],@X[1],$t3);
1021e1051a39Sopenharmony_ci	&vpshufb	(@X[2],@X[2],$t3);
1022e1051a39Sopenharmony_ci	&vpaddd		($t0,@X[0],&QWP(0,$K256));
1023e1051a39Sopenharmony_ci	&vpshufb	(@X[3],@X[3],$t3);
1024e1051a39Sopenharmony_ci	&vpaddd		($t1,@X[1],&QWP(16,$K256));
1025e1051a39Sopenharmony_ci	&vpaddd		($t2,@X[2],&QWP(32,$K256));
1026e1051a39Sopenharmony_ci	&vpaddd		($t3,@X[3],&QWP(48,$K256));
1027e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+0,"esp"),$t0);
1028e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+16,"esp"),$t1);
1029e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+32,"esp"),$t2);
1030e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+48,"esp"),$t3);
1031e1051a39Sopenharmony_ci	&jmp		(&label("avx_00_47"));
1032e1051a39Sopenharmony_ci
# Head of the 16-round inner loop: step $K256 to the next 64 bytes of the
# constant table.
1033e1051a39Sopenharmony_ci&set_label("avx_00_47",16);
1034e1051a39Sopenharmony_ci	&add		($K256,64);
1035e1051a39Sopenharmony_ci
# Xupdate_AVX()
#
# Returns the AVX message-schedule update for one vector as a list of 31
# strings, each of which is Perl source emitting one instruction when
# evaluated.  @X and $t0..$t3 are interpolated at eval time, so the list
# acts on whatever @X is visible to the evaluating code.
# The sequence adds X[9..12] and sigma0(X[1..4]) to @X[0] in one go, then
# adds sigma1 in two halves: first for lanes 0-1 (from X[14..15]), then
# for lanes 2-3 (from the freshly computed X[16..17]).
# A few strings have no trailing ";", which makes no difference to eval.
1036e1051a39Sopenharmony_cisub Xupdate_AVX () {
1037e1051a39Sopenharmony_ci	(
1038e1051a39Sopenharmony_ci	'&vpalignr	($t0,@X[1],@X[0],4);',	# X[1..4]
1039e1051a39Sopenharmony_ci	 '&vpalignr	($t3,@X[3],@X[2],4);',	# X[9..12]
1040e1051a39Sopenharmony_ci	'&vpsrld	($t2,$t0,7);',
1041e1051a39Sopenharmony_ci	 '&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
1042e1051a39Sopenharmony_ci	'&vpsrld	($t3,$t0,3);',
1043e1051a39Sopenharmony_ci	'&vpslld	($t1,$t0,14);',
1044e1051a39Sopenharmony_ci	'&vpxor		($t0,$t3,$t2);',
1045e1051a39Sopenharmony_ci	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1046e1051a39Sopenharmony_ci	'&vpsrld	($t2,$t2,18-7);',
1047e1051a39Sopenharmony_ci	'&vpxor		($t0,$t0,$t1);',
1048e1051a39Sopenharmony_ci	'&vpslld	($t1,$t1,25-14);',
1049e1051a39Sopenharmony_ci	'&vpxor		($t0,$t0,$t2);',
1050e1051a39Sopenharmony_ci	 '&vpsrld	($t2,$t3,10);',
1051e1051a39Sopenharmony_ci	'&vpxor		($t0,$t0,$t1);',	# sigma0(X[1..4])
1052e1051a39Sopenharmony_ci	 '&vpsrlq	($t1,$t3,17);',
1053e1051a39Sopenharmony_ci	'&vpaddd	(@X[0],@X[0],$t0);',	# X[0..3] += sigma0(X[1..4])
1054e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t1);',
1055e1051a39Sopenharmony_ci	 '&vpsrlq	($t3,$t3,19);',
1056e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[14..15])
1057e1051a39Sopenharmony_ci	 '&vpshufd	($t3,$t2,0b10000100);',
1058e1051a39Sopenharmony_ci	'&vpsrldq	($t3,$t3,8);',
1059e1051a39Sopenharmony_ci	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..1] += sigma1(X[14..15])
1060e1051a39Sopenharmony_ci	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1061e1051a39Sopenharmony_ci	 '&vpsrld	($t2,$t3,10);',
1062e1051a39Sopenharmony_ci	 '&vpsrlq	($t1,$t3,17);',
1063e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t1);',
1064e1051a39Sopenharmony_ci	 '&vpsrlq	($t3,$t3,19);',
1065e1051a39Sopenharmony_ci	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[16..17])
1066e1051a39Sopenharmony_ci	 '&vpshufd	($t3,$t2,0b11101000);',
1067e1051a39Sopenharmony_ci	'&vpslldq	($t3,$t3,8);',
1068e1051a39Sopenharmony_ci	'&vpaddd	(@X[0],@X[0],$t3);'	# X[2..3] += sigma1(X[16..17])
1069e1051a39Sopenharmony_ci	);
1070e1051a39Sopenharmony_ci}
1071e1051a39Sopenharmony_ci
# From here on (for the dynamic extent of the enclosing block) every
# &ror(reg,n) is emitted as shrd(reg,reg,n): the override duplicates the
# first operand and passes the rest through.  The round bodies call &ror
# from strings evaluated at run time, so they pick this up as well.
# NOTE(review): presumably chosen because shrd performs better than ror
# on the CPUs that take the AVX path -- confirm.
1072e1051a39Sopenharmony_cilocal *ror = sub { &shrd(@_[0],@_) };
# AVX_00_47($j, $body, @X)
#
# AVX counterpart of SSSE3_00_47: emits four scalar rounds interleaved
# with one Xupdate_AVX sequence.
#   $j    - index (0..3) selecting the 16*$j($K256) table entry and the
#           32+16*$j(esp) stack slot
#   $body - code ref returning one round's instruction strings
#   @X    - schedule registers in their current rotation
# After each SIMD instruction three scalar instructions are emitted.  A
# fourth follows when the third and the next one both mention rorx, which
# can only happen with a body that uses rorx, such as bodyx_00_15.
# The leftover scalar instructions are flushed after the table add.
1073e1051a39Sopenharmony_cisub AVX_00_47 () {
1074e1051a39Sopenharmony_cimy $j = shift;
1075e1051a39Sopenharmony_cimy $body = shift;
1076e1051a39Sopenharmony_cimy @X = @_;
1077e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
1078e1051a39Sopenharmony_cimy $insn;
1079e1051a39Sopenharmony_ci
1080e1051a39Sopenharmony_ci	foreach (Xupdate_AVX()) {		# 31 instructions
1081e1051a39Sopenharmony_ci	    eval;
1082e1051a39Sopenharmony_ci	    eval(shift(@insns));
1083e1051a39Sopenharmony_ci	    eval(shift(@insns));
1084e1051a39Sopenharmony_ci	    eval($insn = shift(@insns));
1085e1051a39Sopenharmony_ci	    eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
1086e1051a39Sopenharmony_ci	}
1087e1051a39Sopenharmony_ci	&vpaddd		($t2,@X[0],&QWP(16*$j,$K256));
1088e1051a39Sopenharmony_ci	foreach (@insns) { eval; }		# remaining instructions
1089e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
1090e1051a39Sopenharmony_ci}
1091e1051a39Sopenharmony_ci
# Body of the avx_00_47 loop: four AVX_00_47 expansions, i.e. sixteen
# scalar rounds interleaved with the update of all four schedule vectors.
# @X is rotated after each call so @X[0] is always the vector to update.
1092e1051a39Sopenharmony_ci    for ($i=0,$j=0; $j<4; $j++) {
1093e1051a39Sopenharmony_ci	&AVX_00_47($j,\&body_00_15,@X);
1094e1051a39Sopenharmony_ci	push(@X,shift(@X));		# rotate(@X)
1095e1051a39Sopenharmony_ci    }
# $j is 4 here, so the dword at 64($K256) is tested; the loop repeats
# until it equals 0x00010203.
# NOTE(review): presumably the first dword of the byte-swap mask stored
# right after the constant table -- confirm against the table definition.
1096e1051a39Sopenharmony_ci	&cmp	(&DWP(16*$j,$K256),0x00010203);
1097e1051a39Sopenharmony_ci	&jne	(&label("avx_00_47"));
1098e1051a39Sopenharmony_ci
# Last sixteen rounds: scalar only.  $i is advanced by the evaluated code
# strings, hence the empty increment clause.
1099e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
1100e1051a39Sopenharmony_ci	foreach(body_00_15()) { eval; }
1101e1051a39Sopenharmony_ci    }
1102e1051a39Sopenharmony_ci
# Feed-forward, identical in structure to the SSSE3 path: add the working
# variables to the context words, store the sums and refresh the stack
# copies.  edi still holds c and $AH[1] holds b^c after the last round, so
# one xor recovers b; the second xor re-creates b^c for the next block.
1103e1051a39Sopenharmony_ci	&mov	("esi",&DWP(96,"esp"));	#ctx
1104e1051a39Sopenharmony_ci					#&mov	($AH[0],&DWP(0,"esp"));
1105e1051a39Sopenharmony_ci	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
1106e1051a39Sopenharmony_ci					#&mov	("edi", &DWP(8,"esp"));
1107e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(12,"esp"));
1108e1051a39Sopenharmony_ci	&add	($AH[0],&DWP(0,"esi"));
1109e1051a39Sopenharmony_ci	&add	($AH[1],&DWP(4,"esi"));
1110e1051a39Sopenharmony_ci	&add	("edi",&DWP(8,"esi"));
1111e1051a39Sopenharmony_ci	&add	("ecx",&DWP(12,"esi"));
1112e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esi"),$AH[0]);
1113e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esi"),$AH[1]);
1114e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esi"),"edi");
1115e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esi"),"ecx");
1116e1051a39Sopenharmony_ci	 #&mov	(&DWP(0,"esp"),$AH[0]);
1117e1051a39Sopenharmony_ci	 &mov	(&DWP(4,"esp"),$AH[1]);
1118e1051a39Sopenharmony_ci	 &xor	($AH[1],"edi");			# magic
1119e1051a39Sopenharmony_ci	 &mov	(&DWP(8,"esp"),"edi");
1120e1051a39Sopenharmony_ci	 &mov	(&DWP(12,"esp"),"ecx");
1121e1051a39Sopenharmony_ci	#&mov	($E,&DWP(16,"esp"));
1122e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esp"));
1123e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esp"));
1124e1051a39Sopenharmony_ci	&add	($E,&DWP(16,"esi"));
1125e1051a39Sopenharmony_ci	&add	("edi",&DWP(20,"esi"));
1126e1051a39Sopenharmony_ci	&add	("ecx",&DWP(24,"esi"));
1127e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esi"),$E);
1128e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esi"),"edi");
1129e1051a39Sopenharmony_ci	 &mov	(&DWP(20,"esp"),"edi");
1130e1051a39Sopenharmony_ci	&mov	("edi",&DWP(28,"esp"));
1131e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esi"),"ecx");
1132e1051a39Sopenharmony_ci	 #&mov	(&DWP(16,"esp"),$E);
1133e1051a39Sopenharmony_ci	&add	("edi",&DWP(28,"esi"));
1134e1051a39Sopenharmony_ci	 &mov	(&DWP(24,"esp"),"ecx");
1135e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esi"),"edi");
1136e1051a39Sopenharmony_ci	 &mov	(&DWP(28,"esp"),"edi");
1137e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
1138e1051a39Sopenharmony_ci
# Reload the shuffle mask into $t3 (grand_avx overwrote it), rewind $K256
# by the three 64-byte steps of the inner loop, and continue while the
# input pointer is below the end pointer kept at 96+8(esp).
1139e1051a39Sopenharmony_ci	&vmovdqa	($t3,&QWP(64,$K256));
1140e1051a39Sopenharmony_ci	&sub	($K256,3*64);			# rewind K
1141e1051a39Sopenharmony_ci	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
1142e1051a39Sopenharmony_ci	&jb	(&label("grand_avx"));
1143e1051a39Sopenharmony_ci
1144e1051a39Sopenharmony_ci	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
1145e1051a39Sopenharmony_ci	&vzeroall	();
1146e1051a39Sopenharmony_ci&function_end_A();
1147e1051a39Sopenharmony_ci						if ($avx>1) {
# bodyx_00_15()
#
# Variant of the scalar round body that uses rorx and andn.  Returns one
# round as a list of 25 strings, each Perl source that emits one
# instruction when evaluated; operands are resolved at eval time.
# rorx writes its result to a separate destination, so the three rotations
# of e (and of a) are taken directly from the source register and xor-ed
# together.  Ch(e,f,g) is formed as (e & f) | (~e & g) with andn/and/or.
# lea is used for two of the additions.
# As in body_00_15, the last element also carries the bookkeeping that
# swaps the two @AH registers and increments $i.
# NOTE(review): &off() and $a..$h are defined outside this chunk.
1148e1051a39Sopenharmony_cisub bodyx_00_15 () {			# +10%
1149e1051a39Sopenharmony_ci	(
1150e1051a39Sopenharmony_ci	'&rorx	("ecx",$E,6)',
1151e1051a39Sopenharmony_ci	'&rorx	("esi",$E,11)',
1152e1051a39Sopenharmony_ci	 '&mov	(&off($e),$E)',		# save $E, modulo-scheduled
1153e1051a39Sopenharmony_ci	'&rorx	("edi",$E,25)',
1154e1051a39Sopenharmony_ci	'&xor	("ecx","esi")',
1155e1051a39Sopenharmony_ci	 '&andn	("esi",$E,&off($g))',
1156e1051a39Sopenharmony_ci	'&xor	("ecx","edi")',		# Sigma1(e)
1157e1051a39Sopenharmony_ci	 '&and	($E,&off($f))',
1158e1051a39Sopenharmony_ci	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
1159e1051a39Sopenharmony_ci	 '&or	($E,"esi")',		# T = Ch(e,f,g)
1160e1051a39Sopenharmony_ci
1161e1051a39Sopenharmony_ci	'&rorx	("edi",$AH[0],2)',
1162e1051a39Sopenharmony_ci	'&rorx	("esi",$AH[0],13)',
1163e1051a39Sopenharmony_ci	 '&lea	($E,&DWP(0,$E,"ecx"))',	# T += Sigma1(e)
1164e1051a39Sopenharmony_ci	'&rorx	("ecx",$AH[0],22)',
1165e1051a39Sopenharmony_ci	'&xor	("esi","edi")',
1166e1051a39Sopenharmony_ci	 '&mov	("edi",&off($b))',
1167e1051a39Sopenharmony_ci	'&xor	("ecx","esi")',		# Sigma0(a)
1168e1051a39Sopenharmony_ci
1169e1051a39Sopenharmony_ci	 '&xor	($AH[0],"edi")',	# a ^= b, (b^c) in next round
1170e1051a39Sopenharmony_ci	 '&add	($E,&off($h))',		# T += h
1171e1051a39Sopenharmony_ci	 '&and	($AH[1],$AH[0])',	# (b^c) &= (a^b)
1172e1051a39Sopenharmony_ci	 '&add	($E,&DWP(32+4*($i&15),"esp"))',	# T += K[i]+X[i]
1173e1051a39Sopenharmony_ci	 '&xor	($AH[1],"edi")',	# h = Maj(a,b,c) = Ch(a^b,c,b)
1174e1051a39Sopenharmony_ci
1175e1051a39Sopenharmony_ci	 '&add	("ecx",$E)',		# h += T
1176e1051a39Sopenharmony_ci	 '&add	($E,&off($d))',		# d += T
1177e1051a39Sopenharmony_ci	'&lea	($AH[1],&DWP(0,$AH[1],"ecx"));'.	# h += Sigma0(a)
1178e1051a39Sopenharmony_ci
1179e1051a39Sopenharmony_ci	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
1180e1051a39Sopenharmony_ci	);
1181e1051a39Sopenharmony_ci}
1182e1051a39Sopenharmony_ci
# AVX_BMI entry: one-time setup before the per-block loop.
#
# Stack frame as used by the code from here on:
#    0..31(esp)   working copies of a..h; the a and e slots are written
#                 by the round code itself, hence the commented-out stores
#   32..95(esp)  X[i]+K[i] for the current 16 rounds
#   96(esp) ctx, 96+4(esp) inp, 96+8(esp) loop bound, 96+12(esp) saved esp
# NOTE(review): esi is expected to hold ctx on entry and the 96+N slots
# are expected to be filled by code before this excerpt -- confirm.
1183e1051a39Sopenharmony_ci&set_label("AVX_BMI",32);
1184e1051a39Sopenharmony_ci	&lea	("esp",&DWP(-96,"esp"));	# reserve the 96-byte area described above
1185e1051a39Sopenharmony_ci	&vzeroall	();
1186e1051a39Sopenharmony_ci	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
1187e1051a39Sopenharmony_ci	&mov	($AH[0],&DWP(0,"esi"));
1188e1051a39Sopenharmony_ci	&mov	($AH[1],&DWP(4,"esi"));
1189e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(8,"esi"));
1190e1051a39Sopenharmony_ci	&mov	("edi",&DWP(12,"esi"));
1191e1051a39Sopenharmony_ci	#&mov	(&DWP(0,"esp"),$AH[0]);
1192e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esp"),$AH[1]);
1193e1051a39Sopenharmony_ci	&xor	($AH[1],"ecx");			# magic: $AH[1] = b^c, the value bodyx_00_15 expects on entry
1194e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esp"),"ecx");
1195e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esp"),"edi");
1196e1051a39Sopenharmony_ci	&mov	($E,&DWP(16,"esi"));
1197e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esi"));
1198e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esi"));
1199e1051a39Sopenharmony_ci	&mov	("esi",&DWP(28,"esi"));		# last read through esi; the ctx pointer is overwritten here
1200e1051a39Sopenharmony_ci	#&mov	(&DWP(16,"esp"),$E);
1201e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esp"),"edi");
1202e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
1203e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esp"),"ecx");
1204e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esp"),"esi");
1205e1051a39Sopenharmony_ci	&vmovdqa	($t3,&QWP(256,$K256));	# shuffle mask used by the vpshufb calls in grand_avx_bmi
1206e1051a39Sopenharmony_ci	&jmp	(&label("grand_avx_bmi"));
1207e1051a39Sopenharmony_ci
# Top of the per-block loop: fetch one 64-byte input block and prepare
# X[0..15]+K[0..15] on the stack for the first 16 rounds.
# On entry edi = inp and $t3 = shuffle mask.  $t3 is reused as a plain
# temporary by the last vpaddd, so the mask must be reloaded before the
# next pass through this label.
1208e1051a39Sopenharmony_ci&set_label("grand_avx_bmi",32);
1209e1051a39Sopenharmony_ci	# load input, reverse byte order, add K256[0..15], save to stack
1210e1051a39Sopenharmony_ci	&vmovdqu	(@X[0],&QWP(0,"edi"));
1211e1051a39Sopenharmony_ci	&vmovdqu	(@X[1],&QWP(16,"edi"));
1212e1051a39Sopenharmony_ci	&vmovdqu	(@X[2],&QWP(32,"edi"));
1213e1051a39Sopenharmony_ci	&vmovdqu	(@X[3],&QWP(48,"edi"));
1214e1051a39Sopenharmony_ci	&add		("edi",64);
1215e1051a39Sopenharmony_ci	&vpshufb	(@X[0],@X[0],$t3);
1216e1051a39Sopenharmony_ci	&mov		(&DWP(96+4,"esp"),"edi");	# write the advanced inp back to its stack slot
1217e1051a39Sopenharmony_ci	&vpshufb	(@X[1],@X[1],$t3);
1218e1051a39Sopenharmony_ci	&vpshufb	(@X[2],@X[2],$t3);
1219e1051a39Sopenharmony_ci	&vpaddd		($t0,@X[0],&QWP(0,$K256));
1220e1051a39Sopenharmony_ci	&vpshufb	(@X[3],@X[3],$t3);		# last use of $t3 as the mask
1221e1051a39Sopenharmony_ci	&vpaddd		($t1,@X[1],&QWP(16,$K256));
1222e1051a39Sopenharmony_ci	&vpaddd		($t2,@X[2],&QWP(32,$K256));
1223e1051a39Sopenharmony_ci	&vpaddd		($t3,@X[3],&QWP(48,$K256));	# overwrites the mask
1224e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+0,"esp"),$t0);
1225e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+16,"esp"),$t1);
1226e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+32,"esp"),$t2);
1227e1051a39Sopenharmony_ci	&vmovdqa	(&QWP(32+48,"esp"),$t3);
1228e1051a39Sopenharmony_ci	&jmp		(&label("avx_bmi_00_47"));
1229e1051a39Sopenharmony_ci
# Round code.  The Perl loops below run at generation time and unroll
# the rounds; the only run-time loop is the jne back to avx_bmi_00_47.
1230e1051a39Sopenharmony_ci&set_label("avx_bmi_00_47",16);
1231e1051a39Sopenharmony_ci	&add		($K256,64);		# advance $K256 by 64 bytes (16 dwords)
1232e1051a39Sopenharmony_ci
	# Four AVX_00_47 expansions, rotating @X after each; $i restarts at 0.
	# NOTE(review): AVX_00_47 is defined outside this excerpt; presumably
	# each call interleaves bodyx_00_15 rounds with the message-schedule
	# update for one X vector -- confirm.
1233e1051a39Sopenharmony_ci    for ($i=0,$j=0; $j<4; $j++) {
1234e1051a39Sopenharmony_ci	&AVX_00_47($j,\&bodyx_00_15,@X);
1235e1051a39Sopenharmony_ci	push(@X,shift(@X));		# rotate(@X)
1236e1051a39Sopenharmony_ci    }
	# $j is 4 here, so this tests the dword at 64($K256).
	# NOTE(review): 0x00010203 presumably matches the first dword of the
	# shuffle mask loaded from 256($K256) at AVX_BMI, serving as the
	# end-of-table marker -- confirm against the K256 table definition.
1237e1051a39Sopenharmony_ci	&cmp	(&DWP(16*$j,$K256),0x00010203);
1238e1051a39Sopenharmony_ci	&jne	(&label("avx_bmi_00_47"));
1239e1051a39Sopenharmony_ci
	# Sixteen more rounds with bodyx_00_15 alone.  The loop header has no
	# increment clause because the eval'd snippets do $i++ themselves.
1240e1051a39Sopenharmony_ci    for ($i=0; $i<16; ) {
1241e1051a39Sopenharmony_ci	foreach(bodyx_00_15()) { eval; }
1242e1051a39Sopenharmony_ci    }
1243e1051a39Sopenharmony_ci
# End of one block: add the working variables into ctx->h[0..7], refresh
# the on-stack copies for the next block, then loop or return.
# On entry $AH[0] = a, $AH[1] = b^c, edi = c and $E = e are still live
# from the last round, which is why their stack loads are commented out.
1244e1051a39Sopenharmony_ci	&mov	("esi",&DWP(96,"esp"));	#ctx
1245e1051a39Sopenharmony_ci					#&mov	($AH[0],&DWP(0,"esp"));
1246e1051a39Sopenharmony_ci	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
1247e1051a39Sopenharmony_ci					#&mov	("edi", &DWP(8,"esp"));
1248e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(12,"esp"));
1249e1051a39Sopenharmony_ci	&add	($AH[0],&DWP(0,"esi"));
1250e1051a39Sopenharmony_ci	&add	($AH[1],&DWP(4,"esi"));
1251e1051a39Sopenharmony_ci	&add	("edi",&DWP(8,"esi"));
1252e1051a39Sopenharmony_ci	&add	("ecx",&DWP(12,"esi"));
1253e1051a39Sopenharmony_ci	&mov	(&DWP(0,"esi"),$AH[0]);
1254e1051a39Sopenharmony_ci	&mov	(&DWP(4,"esi"),$AH[1]);
1255e1051a39Sopenharmony_ci	&mov	(&DWP(8,"esi"),"edi");
1256e1051a39Sopenharmony_ci	&mov	(&DWP(12,"esi"),"ecx");
	 # extra-indented lines refresh the stack copies for the next block
1257e1051a39Sopenharmony_ci	 #&mov	(&DWP(0,"esp"),$AH[0]);
1258e1051a39Sopenharmony_ci	 &mov	(&DWP(4,"esp"),$AH[1]);
1259e1051a39Sopenharmony_ci	 &xor	($AH[1],"edi");			# magic: back to b^c for the next block's first round
1260e1051a39Sopenharmony_ci	 &mov	(&DWP(8,"esp"),"edi");
1261e1051a39Sopenharmony_ci	 &mov	(&DWP(12,"esp"),"ecx");
1262e1051a39Sopenharmony_ci	#&mov	($E,&DWP(16,"esp"));
1263e1051a39Sopenharmony_ci	&mov	("edi",&DWP(20,"esp"));
1264e1051a39Sopenharmony_ci	&mov	("ecx",&DWP(24,"esp"));
1265e1051a39Sopenharmony_ci	&add	($E,&DWP(16,"esi"));
1266e1051a39Sopenharmony_ci	&add	("edi",&DWP(20,"esi"));
1267e1051a39Sopenharmony_ci	&add	("ecx",&DWP(24,"esi"));
1268e1051a39Sopenharmony_ci	&mov	(&DWP(16,"esi"),$E);
1269e1051a39Sopenharmony_ci	&mov	(&DWP(20,"esi"),"edi");
1270e1051a39Sopenharmony_ci	 &mov	(&DWP(20,"esp"),"edi");
1271e1051a39Sopenharmony_ci	&mov	("edi",&DWP(28,"esp"));		# edi recycled: now h
1272e1051a39Sopenharmony_ci	&mov	(&DWP(24,"esi"),"ecx");
1273e1051a39Sopenharmony_ci	 #&mov	(&DWP(16,"esp"),$E);
1274e1051a39Sopenharmony_ci	&add	("edi",&DWP(28,"esi"));
1275e1051a39Sopenharmony_ci	 &mov	(&DWP(24,"esp"),"ecx");
1276e1051a39Sopenharmony_ci	&mov	(&DWP(28,"esi"),"edi");
1277e1051a39Sopenharmony_ci	 &mov	(&DWP(28,"esp"),"edi");
1278e1051a39Sopenharmony_ci	&mov	("edi",&DWP(96+4,"esp"));	# inp
1279e1051a39Sopenharmony_ci
	# Reload the shuffle mask that grand_avx_bmi overwrote.  $K256 is
	# still advanced by 3*64, so 64($K256) is the same address as the
	# 256($K256) load at AVX_BMI.
1280e1051a39Sopenharmony_ci	&vmovdqa	($t3,&QWP(64,$K256));
1281e1051a39Sopenharmony_ci	&sub	($K256,3*64);			# rewind K
1282e1051a39Sopenharmony_ci	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
1283e1051a39Sopenharmony_ci	&jb	(&label("grand_avx_bmi"));
1284e1051a39Sopenharmony_ci
1285e1051a39Sopenharmony_ci	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
1286e1051a39Sopenharmony_ci	&vzeroall	();
	# NOTE(review): function_end_A is a perlasm helper defined elsewhere;
	# presumably it emits the register restore and return -- confirm.
1287e1051a39Sopenharmony_ci&function_end_A();
1288e1051a39Sopenharmony_ci						}
1289e1051a39Sopenharmony_ci						}
1290e1051a39Sopenharmony_ci						}}}
# Close the sha256_block_data_order function and finish generation.
# NOTE(review): function_end_B and asm_finish are perlasm helpers defined
# elsewhere; presumably asm_finish writes the generated assembly to
# STDOUT -- confirm.
1291e1051a39Sopenharmony_ci&function_end_B("sha256_block_data_order");
1292e1051a39Sopenharmony_ci
1293e1051a39Sopenharmony_ci&asm_finish();
1294e1051a39Sopenharmony_ci
# Checking close() turns a failed write of the output into a hard error.
1295e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
1296