#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86 MMX.
#
# June 2017.
#
# The code below is a KECCAK_2X implementation (see sha/keccak1600.c)
# with C[5] held in the register bank and D[5] offloaded to memory.
# However, instead of actually unrolling the loop pair-wise, I simply
# flip the pointers to T[][] and A[][] at the end of each round. Since
# the number of rounds is even, the last round writes to A[][] and
# everything works out.
# It's argued that MMX is the only code path meaningful to implement
# for x86. This is because non-MMX-capable processors are an extinct
# breed, and they can just as well lurk executing compiler-generated
# code. For reference, gcc-5.x-generated KECCAK_2X code takes 89 cycles
# per processed byte on Pentium, which is a fair result, but older
# compilers produce worse code. On the other hand, one can wonder why
# not 128-bit SSE2? Well, SSE2 won't provide a twofold improvement,
# rather far from that, if any at all on some processors, because it
# would take extra permutations and inter-bank data transfers. Besides,
# contemporary CPUs are better off executing 64-bit code, and it makes
# less sense to invest in fancy 32-bit code. And the decision doesn't
# seem to be inadequate, if one compares the results below to the
# "64-bit platforms in 32-bit mode" SIMD data points available at
# http://keccak.noekeon.org/sw_performance.html.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(i)
#
# PIII			30/+150%
# Pentium M		27/+150%
# P4			40/+85%
# Core 2		19/+170%
# Sandy Bridge(ii)	18/+140%
# Atom			33/+180%
# Silvermont(ii)	30/+180%
# VIA Nano(ii)		43/+60%
# Sledgehammer(ii)(iii)	24/+130%
#
# (i)	Corresponds to SHA3-256. Numbers after slash are improvement
#	coefficients over KECCAK_2X [with bit interleave and lane
#	complementing] position-independent *scalar* code generated
#	by gcc-5.x. It's not exactly a fair comparison, but it's a
#	datapoint...
# (ii)	64-bit processor executing 32-bit code.
# (iii)	Result is considered to be representative even for older AMD
#	processors.

# Locate the directory this script lives in so that x86asm.pl can be
# loaded either from next to the script or from the shared perlasm
# directory two levels up.  When $0 carries no directory part, fall back
# to an empty prefix explicitly instead of relying on a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, if any, names the output file; STDOUT
# is redirected there.  Use the three-argument form of open so that the
# file name is taken literally, and fail loudly rather than silently
# writing the output to the original STDOUT.
$output = pop;
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# NOTE(review): the arguments are interpreted by x86asm.pl's asm_init;
# presumably the first is the assembler flavour/file name and the second
# a "plain 386 only" flag -- confirm against x86asm.pl.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# MMX register allocation: mm0..mm4 carry the five lanes C[0..4] being
# worked on, mm5..mm7 are temporaries.
my @C = map("mm$_",(0..4));
my @T = map("mm$_",(5..7));

# $A[$y][$x] is the byte displacement of 64-bit lane (y,x) of the 5x5
# state: lane index 5*y+x, 8 bytes per lane, minus a bias of 100.  The
# code that uses these displacements advances its base pointers by 100
# ("size optimization"), which keeps every displacement in -100..+92.
my @A = map { my $y = $_; [ map { 8*(5*$y+$_)-100 } (0..4) ] } (0..4);

# Displacements of the five 64-bit D[] spill slots relative to esp inside
# _KeccakF1600.  NOTE(review): the +4 presumably skips the return address
# pushed by the call into that routine -- confirm.
my @D = map(8*$_+4, (0..4));

# Left-rotation amounts of the Rho step, indexed as $rhotates[$y][$x].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

&static_label("iotas");

# _KeccakF1600 - internal routine running 24 rounds of the permutation.
# It is entered with "call" from the public functions in this file and
# relies on the registers they set up:
#   esi - base of the state that is read,    biased by +100 (see @A)
#   edi - base of the state that is written, biased the same way
#   ebx - address of the round constant for the current round
#   esp - D[0..4] are spilled at the @D displacements
# ecx serves as the round counter; mm0..mm7 are used throughout.
&function_begin_B("_KeccakF1600");
	# Start the column sums with the last row: C[x] = A[4][x].  The end
	# of every round leaves the freshly computed row 4 in C[0..4], so
	# this load is only needed once, before the loop.
	for my $x (0..4) {
	    &movq	($C[$x],&QWP($A[4][$x],"esi"));
	}

	&mov	("ecx",24);			# loop counter
	&jmp	(&label("loop"));

    &set_label("loop",16);
	######################################### Theta
	# C[x] ^= A[y][x] for rows 0..2, one full row at a time.
	for my $y (0..2) {
	    for my $x (0..4) {
		&pxor	($C[$x],&QWP($A[$y][$x],"esi"));
	    }
	}

	# Row 3 is folded in while the computation of D[] already starts.
	&pxor	(@C[2],&QWP($A[3][2],"esi"));
	&pxor	(@C[0],&QWP($A[3][0],"esi"));
	&pxor	(@C[1],&QWP($A[3][1],"esi"));
	&pxor	(@C[3],&QWP($A[3][3],"esi"));
	 &movq	(@T[0],@C[2]);
	&pxor	(@C[4],&QWP($A[3][4],"esi"));

	# MMX has no 64-bit rotate, so ROL64(v,1) is assembled from
	# (v >> 63) and (v << 1); two instruction streams are interleaved
	# (marked by the extra leading space).
	 &movq	(@T[2],@C[2]);
	 &psrlq	(@T[0],63);
	&movq	(@T[1],@C[0]);
	 &psllq	(@T[2],1);
	 &pxor	(@T[0],@C[0]);
	&psrlq	(@C[0],63);
	 &pxor	(@T[0],@T[2]);
	&psllq	(@T[1],1);
	 &movq	(@T[2],@C[1]);
	 &movq	(&QWP(@D[1],"esp"),@T[0]);	# D[1] = E[0] = ROL64(C[2], 1) ^ C[0];

	&pxor	(@T[1],@C[0]);
	 &psrlq	(@T[2],63);
	&pxor	(@T[1],@C[3]);
	 &movq	(@C[0],@C[1]);
	&movq	(&QWP(@D[4],"esp"),@T[1]);	# D[4] = E[1] = ROL64(C[0], 1) ^ C[3];

	 &psllq	(@C[0],1);
	 &pxor	(@T[2],@C[4]);
	 &pxor	(@C[0],@T[2]);

	&movq	(@T[2],@C[3]);
	&psrlq	(@C[3],63);
	 &movq	(&QWP(@D[0],"esp"),@C[0]);	# D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
	&psllq	(@T[2],1);
	 &movq	(@T[0],@C[4]);
	 &psrlq	(@C[4],63);
	&pxor	(@C[1],@C[3]);
	 &psllq	(@T[0],1);
	&pxor	(@C[1],@T[2]);
	 &pxor	(@C[2],@C[4]);
	&movq	(&QWP(@D[2],"esp"),@C[1]);	# D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
	 &pxor	(@C[2],@T[0]);

	######################################### first Rho(0) is special
	# D[0], D[2], D[3] and D[4] are still in registers at this point
	# (C[0], C[1], C[2] and T[1]); only D[1] is read back from memory.
	# The lanes processed are the diagonal A[x][x].
	&movq	(@C[3],&QWP($A[3][3],"esi"));
	 &movq	(&QWP(@D[3],"esp"),@C[2]);	# D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
	&pxor	(@C[3],@C[2]);
	 &movq	(@C[4],&QWP($A[4][4],"esi"));
	&movq	(@T[2],@C[3]);
	&psrlq	(@C[3],64-$rhotates[3][3]);
	 &pxor	(@C[4],@T[1]);
	&psllq	(@T[2],$rhotates[3][3]);
	 &movq	(@T[1],@C[4]);
	 &psrlq	(@C[4],64-$rhotates[4][4]);
	&por	(@C[3],@T[2]);		# C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	 &psllq	(@T[1],$rhotates[4][4]);

	&movq	(@C[2],&QWP($A[2][2],"esi"));
	 &por	(@C[4],@T[1]);		# C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	&pxor	(@C[2],@C[1]);
	 &movq	(@C[1],&QWP($A[1][1],"esi"));
	&movq	(@T[1],@C[2]);
	&psrlq	(@C[2],64-$rhotates[2][2]);
	 &pxor	(@C[1],&QWP(@D[1],"esp"));
	&psllq	(@T[1],$rhotates[2][2]);

	 &movq	(@T[2],@C[1]);
	 &psrlq	(@C[1],64-$rhotates[1][1]);
	&por	(@C[2],@T[1]);		# C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	 &psllq	(@T[2],$rhotates[1][1]);
	&pxor	(@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */  /* D[0] */
	 &por	(@C[1],@T[2]);		# C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);

# Chi($y, $xrho) - emit the Chi step for output row $y.
#
# On entry C[0..4] hold the five rotated lanes produced by the preceding
# Rho step.  The five results R[$y][0..4] are stored relative to edi.
# For $y == 0 the current round constant at (ebx) is XORed into
# R[0][0] (Iota) and ebx is advanced to the next constant.
#
# $xrho - when defined, the first two instructions of the following
#         Rho($xrho) step (load A[0][$xrho], XOR D[$xrho]) are issued
#         here, leaving the value in T[2] for that Rho call.
#
# No empty prototype is declared: the sub takes arguments, and it is
# always invoked with the & sigil, which ignores prototypes anyway.
sub Chi {				######### regular Chi step
    my ($y,$xrho) = @_;

	# pandn dst,src computes dst = ~dst & src.
	&movq	(@T[0],@C[1]);
	 &movq	(@T[1],@C[2]);		# keep original C[2] for R[y][2]
	&pandn	(@T[0],@C[2]);
	 &pandn	(@C[2],@C[3]);
	&pxor	(@T[0],@C[0]);
	 &pxor	(@C[2],@C[1]);
	&pxor	(@T[0],&QWP(0,"ebx"))		if ($y == 0);	# Iota
	&lea	("ebx",&DWP(8,"ebx"))		if ($y == 0);	# next iotas[] entry

	&movq	(@T[2],@C[3]);		# keep original C[3] for R[y][3]
	&movq	(&QWP($A[$y][0],"edi"),@T[0]);	# R[y][0] = C[0] ^ (~C[1] & C[2]) [^ iotas[i] if y == 0];
	 &movq	(@T[0],@C[4]);		# keep original C[4] for R[y][4]
	&pandn	(@C[3],@C[4]);
	 &pandn	(@C[4],@C[0]);
	&pxor	(@C[3],@T[1]);
	 &movq	(&QWP($A[$y][1],"edi"),@C[2]);	# R[y][1] = C[1] ^ (~C[2] & C[3]);
	 &pxor	(@C[4],@T[2]);
	  &movq	(@T[2],&QWP($A[0][$xrho],"esi"))	if (defined($xrho));

	 &movq	(&QWP($A[$y][2],"edi"),@C[3]);	# R[y][2] = C[2] ^ (~C[3] & C[4]);
	&pandn	(@C[0],@C[1]);
	 &movq	(&QWP($A[$y][3],"edi"),@C[4]);	# R[y][3] = C[3] ^ (~C[4] & C[0]);
	&pxor	(@C[0],@T[0]);
	  &pxor	(@T[2],&QWP(@D[$xrho],"esp"))		if (defined($xrho));
	&movq	(&QWP($A[$y][4],"edi"),@C[0]);	# R[y][4] = C[4] ^ (~C[0] & C[1]);
}
	# Row 0 of the current round; also pre-loads A[0][3] ^ D[3] for Rho(3).
	&Chi	(0, 3);

# Rho($x) - emit the combined Theta-apply/Rho step for one diagonal.
#
# Produces, for y = 0..4 and column (x+y)%5:
#     C[y] = ROL64(A[y][(x+y)%5] ^ D[(x+y)%5], rhotates[y][(x+y)%5])
# Lanes are read relative to esi, D[] relative to esp.  The y = 0 input,
# A[0][$x] ^ D[$x], is expected in T[2] on entry: it is loaded by the
# preceding Chi call (its $xrho argument).  Each 64-bit rotation is
# composed from a right shift, a left shift and por.
#
# No empty prototype is declared: the sub takes an argument, and it is
# always invoked with the & sigil, which ignores prototypes anyway.
sub Rho {				######### regular Rho step
    my $x = shift;

	#&movq	(@T[2],&QWP($A[0][$x],"esi"));	# moved to Chi
	#&pxor	(@T[2],&QWP(@D[$x],"esp"));	# moved to Chi
	&movq	(@C[0],@T[2]);
	&psrlq	(@T[2],64-$rhotates[0][$x]);
	 &movq	(@C[1],&QWP($A[1][($x+1)%5],"esi"));
	&psllq	(@C[0],$rhotates[0][$x]);
	 &pxor	(@C[1],&QWP(@D[($x+1)%5],"esp"));
	&por	(@C[0],@T[2]);		# C[0] = ROL64(A[0][x] ^ D[x], rhotates[0][x]);

	 &movq	(@T[1],@C[1]);
	 &psrlq	(@C[1],64-$rhotates[1][($x+1)%5]);
	&movq	(@C[2],&QWP($A[2][($x+2)%5],"esi"));
	 &psllq	(@T[1],$rhotates[1][($x+1)%5]);
	&pxor	(@C[2],&QWP(@D[($x+2)%5],"esp"));
	 &por	(@C[1],@T[1]);		# C[1] = ROL64(A[1][x+1] ^ D[x+1], rhotates[1][x+1]);

	&movq	(@T[2],@C[2]);
	&psrlq	(@C[2],64-$rhotates[2][($x+2)%5]);
	 &movq	(@C[3],&QWP($A[3][($x+3)%5],"esi"));
	&psllq	(@T[2],$rhotates[2][($x+2)%5]);
	 &pxor	(@C[3],&QWP(@D[($x+3)%5],"esp"));
	&por	(@C[2],@T[2]);		# C[2] = ROL64(A[2][x+2] ^ D[x+2], rhotates[2][x+2]);

	 &movq	(@T[0],@C[3]);
	 &psrlq	(@C[3],64-$rhotates[3][($x+3)%5]);
	&movq	(@C[4],&QWP($A[4][($x+4)%5],"esi"));
	 &psllq	(@T[0],$rhotates[3][($x+3)%5]);
	&pxor	(@C[4],&QWP(@D[($x+4)%5],"esp"));
	 &por	(@C[3],@T[0]);		# C[3] = ROL64(A[3][x+3] ^ D[x+3], rhotates[3][x+3]);

	&movq	(@T[1],@C[4]);
	&psrlq	(@C[4],64-$rhotates[4][($x+4)%5]);
	&psllq	(@T[1],$rhotates[4][($x+4)%5]);
	&por	(@C[4],@T[1]);		# C[4] = ROL64(A[4][x+4] ^ D[x+4], rhotates[4][x+4]);
					# (column indices above are taken mod 5)
}
255e1051a39Sopenharmony_ci	&Rho	(3);	&Chi	(1, 1);
256e1051a39Sopenharmony_ci	&Rho	(1);	&Chi	(2, 4);
257e1051a39Sopenharmony_ci	&Rho	(4);	&Chi	(3, 2);
258e1051a39Sopenharmony_ci	&Rho	(2);	###&Chi	(4);
259e1051a39Sopenharmony_ci
260e1051a39Sopenharmony_ci	&movq	(@T[0],@C[0]);		######### last Chi(4) is special
261e1051a39Sopenharmony_ci	 &xor	("edi","esi");		# &xchg	("esi","edi");
262e1051a39Sopenharmony_ci	&movq	(&QWP(@D[1],"esp"),@C[1]);
263e1051a39Sopenharmony_ci	 &xor	("esi","edi");
264e1051a39Sopenharmony_ci	 &xor	("edi","esi");
265e1051a39Sopenharmony_ci
266e1051a39Sopenharmony_ci	&movq	(@T[1],@C[1]);
267e1051a39Sopenharmony_ci	 &movq	(@T[2],@C[2]);
268e1051a39Sopenharmony_ci	&pandn	(@T[1],@C[2]);
269e1051a39Sopenharmony_ci	 &pandn	(@T[2],@C[3]);
270e1051a39Sopenharmony_ci	&pxor	(@C[0],@T[1]);
271e1051a39Sopenharmony_ci	 &pxor	(@C[1],@T[2]);
272e1051a39Sopenharmony_ci
273e1051a39Sopenharmony_ci	&movq	(@T[1],@C[3]);
274e1051a39Sopenharmony_ci	 &movq	(&QWP($A[4][0],"esi"),@C[0]);	# R[4][0] = C[0] ^= (~C[1] & C[2]);
275e1051a39Sopenharmony_ci	&pandn	(@T[1],@C[4]);
276e1051a39Sopenharmony_ci	 &movq	(&QWP($A[4][1],"esi"),@C[1]);	# R[4][1] = C[1] ^= (~C[2] & C[3]);
277e1051a39Sopenharmony_ci	&pxor	(@C[2],@T[1]);
278e1051a39Sopenharmony_ci	 &movq	(@T[2],@C[4]);
279e1051a39Sopenharmony_ci	&movq	(&QWP($A[4][2],"esi"),@C[2]);	# R[4][2] = C[2] ^= (~C[3] & C[4]);
280e1051a39Sopenharmony_ci
281e1051a39Sopenharmony_ci	&pandn	(@T[2],@T[0]);
282e1051a39Sopenharmony_ci	 &pandn	(@T[0],&QWP(@D[1],"esp"));
283e1051a39Sopenharmony_ci	&pxor	(@C[3],@T[2]);
284e1051a39Sopenharmony_ci	 &pxor	(@C[4],@T[0]);
285e1051a39Sopenharmony_ci	&movq	(&QWP($A[4][3],"esi"),@C[3]);	# R[4][3] = C[3] ^= (~C[4] & D[0]);
286e1051a39Sopenharmony_ci	&sub	("ecx",1);
287e1051a39Sopenharmony_ci	 &movq	(&QWP($A[4][4],"esi"),@C[4]);	# R[4][4] = C[4] ^= (~D[0] & D[1]);
288e1051a39Sopenharmony_ci	&jnz	(&label("loop"));
289e1051a39Sopenharmony_ci
290e1051a39Sopenharmony_ci	&lea	("ebx",&DWP(-192,"ebx"));	# rewind iotas
291e1051a39Sopenharmony_ci	&ret	();
292e1051a39Sopenharmony_ci&function_end_B("_KeccakF1600");
293e1051a39Sopenharmony_ci
# KeccakF1600(A) - apply the permutation to the 200-byte state whose
# address is the first argument.  Because _KeccakF1600 exchanges its
# source and destination pointers after each of its 24 rounds, the final
# result ends up back in the caller's state.
&function_begin("KeccakF1600");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("ebp","esp");			# esp is restored from ebp on exit
	# 240 bytes of scratch: 8*5 for D[] at the bottom, followed by
	# 8*25 for the temporary state T[][].
	&sub	("esp",240);
	# Position-independent address of iotas: call/pop yields the
	# address of pic_point, to which the link-time distance is added.
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);			# 8-byte align the scratch area
	# Both pointers are advanced by 100 to match the -100 bias built
	# into the @A displacements.
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization

	&call	("_KeccakF1600");

	&mov	("esp","ebp");
	&emms	();				# leave MMX state before returning
&function_end("KeccakF1600");

# SHA3_absorb(A, inp, len, bsz) - XOR input into the state one bsz-byte
# block at a time, running the permutation after every block.  Only
# whole blocks are consumed; the number of input bytes left over
# (less than bsz) is returned in eax.
&function_begin("SHA3_absorb");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("eax",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
	&mov	("edx",&wparam(3));		# bsz
	&mov	("ebp","esp");
	# 240 bytes of scratch for _KeccakF1600 (D[] and T[][]) plus 8
	# bytes just below ebp where len and bsz are kept across the call.
	&sub	("esp",240+8);
	# Position-independent address of iotas.
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);

	&mov	("edi","esi");			# edi walks the state while absorbing
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&mov	(&DWP(-4,"ebp"),"edx");		# save bsz
	&jmp	(&label("loop"));

&set_label("loop",16);
	&cmp	("ecx","edx");			# len < bsz?
	&jc	(&label("absorbed"));

	&shr	("edx",3);			# bsz /= 8
&set_label("block");
	# A[i] ^= inp[i], one 64-bit lane per iteration.
	&movq	("mm0",&QWP(0,"eax"));
	&lea	("eax",&DWP(8,"eax"));
	&pxor	("mm0",&QWP(0,"edi"));
	&lea	("edi",&DWP(8,"edi"));
	&sub	("ecx",8);			# len -= 8
	&movq	(&QWP(-8,"edi"),"mm0");
	&dec	("edx");			# bsz--
	&jnz	(&label("block"));

	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization
	&mov	(&DWP(-8,"ebp"),"ecx");		# save len
	&call	("_KeccakF1600");
	&mov	("ecx",&DWP(-8,"ebp"));		# pull len
	&mov	("edx",&DWP(-4,"ebp"));		# pull bsz
	&lea	("edi",&DWP(-100,"esi"));	# back to the start of A[][]
	&jmp	(&label("loop"));

&set_label("absorbed",16);
	&mov	("eax","ecx");			# return value
	&mov	("esp","ebp");
	&emms	();				# leave MMX state before returning
&function_end("SHA3_absorb");

# SHA3_squeeze(A, out, len, bsz) - copy len bytes of output from the
# state to out.  Data is copied one 64-bit lane at a time; after bsz/8
# lanes have been taken the permutation is run and copying restarts at
# the beginning of the state.  A final piece shorter than 8 bytes is
# copied byte-wise.
&function_begin("SHA3_squeeze");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("eax",&wparam(1));		# out
	&mov	("ecx",&wparam(2));		# len
	&mov	("edx",&wparam(3));		# bsz
	&mov	("ebp","esp");
	# 240 bytes of scratch for _KeccakF1600 (D[] and T[][]) plus 8
	# bytes just below ebp where len and bsz/8 are kept across the call.
	&sub	("esp",240+8);
	# Position-independent address of iotas.
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);

	&shr	("edx",3);			# bsz /= 8
	&mov	("edi","esi");			# edi walks the state while squeezing
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&mov	(&DWP(-4,"ebp"),"edx");		# save bsz
	&jmp	(&label("loop"));

&set_label("loop",16);
	&cmp	("ecx",8);			# len < 8?
	&jc	(&label("tail"));

	&movq	("mm0",&QWP(0,"edi"));
	&lea	("edi",&DWP(8,"edi"));
	&movq	(&QWP(0,"eax"),"mm0");
	&lea	("eax",&DWP(8,"eax"));
	&sub	("ecx",8);			# len -= 8
	&jz	(&label("done"));

	&dec	("edx");			# bsz--
	&jnz	(&label("loop"));

	# A whole block has been output: permute and start over.
	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization
	&mov	(&DWP(-8,"ebp"),"ecx");		# save len
	&call	("_KeccakF1600");
	&mov	("ecx",&DWP(-8,"ebp"));		# pull len
	&mov	("edx",&DWP(-4,"ebp"));		# pull bsz
	&lea	("edi",&DWP(-100,"esi"));	# back to the start of A[][]
	&jmp	(&label("loop"));

&set_label("tail",16);
	# Copy the remaining ecx (< 8) bytes from the state to out.
	&mov	("esi","edi");
	&mov	("edi","eax");
	# Emitted as raw bytes 66 90 F3 A4 (little-endian): a two-byte NOP
	# followed by "rep movsb".
	&data_word("0xA4F39066");		# rep movsb

&set_label("done");
	&mov	("esp","ebp");
	&emms	();				# leave MMX state before returning
&function_end("SHA3_squeeze");

# The 24 64-bit round constants consumed by the Iota step (Chi with
# $y == 0), one per round.  Each entry is emitted as two 32-bit words,
# low half first; 24 entries of 8 bytes match the 192-byte rewind at the
# end of _KeccakF1600.
my @iotas = (
	[ 0x00000001, 0x00000000 ],
	[ 0x00008082, 0x00000000 ],
	[ 0x0000808a, 0x80000000 ],
	[ 0x80008000, 0x80000000 ],
	[ 0x0000808b, 0x00000000 ],
	[ 0x80000001, 0x00000000 ],
	[ 0x80008081, 0x80000000 ],
	[ 0x00008009, 0x80000000 ],
	[ 0x0000008a, 0x00000000 ],
	[ 0x00000088, 0x00000000 ],
	[ 0x80008009, 0x00000000 ],
	[ 0x8000000a, 0x00000000 ],
	[ 0x8000808b, 0x00000000 ],
	[ 0x0000008b, 0x80000000 ],
	[ 0x00008089, 0x80000000 ],
	[ 0x00008003, 0x80000000 ],
	[ 0x00008002, 0x80000000 ],
	[ 0x00000080, 0x80000000 ],
	[ 0x0000800a, 0x00000000 ],
	[ 0x8000000a, 0x80000000 ],
	[ 0x80008081, 0x80000000 ],
	[ 0x00008080, 0x80000000 ],
	[ 0x80000001, 0x00000000 ],
	[ 0x80008008, 0x80000000 ],
);

&set_label("iotas",32);
foreach my $rc (@iotas) {
	&data_word($rc->[0],$rc->[1]);
}
&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";