1e1051a39Sopenharmony_ci#!/usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci#
9e1051a39Sopenharmony_ci# ====================================================================
10e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
12e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
13e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
14e1051a39Sopenharmony_ci# ====================================================================
15e1051a39Sopenharmony_ci#
16e1051a39Sopenharmony_ci# Keccak-1600 for x86_64.
17e1051a39Sopenharmony_ci#
18e1051a39Sopenharmony_ci# June 2017.
19e1051a39Sopenharmony_ci#
20e1051a39Sopenharmony_ci# Below code is [lane complementing] KECCAK_2X implementation (see
21e1051a39Sopenharmony_ci# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22e1051a39Sopenharmony_ci# instead of actually unrolling the loop pair-wise I simply flip
23e1051a39Sopenharmony_ci# pointers to T[][] and A[][] at the end of round. Since number of
24e1051a39Sopenharmony_ci# rounds is even, last round writes to A[][] and everything works out.
25e1051a39Sopenharmony_ci# How does it compare to x86_64 assembly module in Keccak Code Package?
26e1051a39Sopenharmony_ci# Depending on processor it's either as fast or faster by up to 15%...
27e1051a39Sopenharmony_ci#
28e1051a39Sopenharmony_ci########################################################################
29e1051a39Sopenharmony_ci# Numbers are cycles per processed byte out of large message.
30e1051a39Sopenharmony_ci#
31e1051a39Sopenharmony_ci#			r=1088(*)
32e1051a39Sopenharmony_ci#
33e1051a39Sopenharmony_ci# P4			25.8
34e1051a39Sopenharmony_ci# Core 2		12.9
35e1051a39Sopenharmony_ci# Westmere		13.7
36e1051a39Sopenharmony_ci# Sandy Bridge		12.9(**)
37e1051a39Sopenharmony_ci# Haswell		9.6
38e1051a39Sopenharmony_ci# Skylake		9.4
39e1051a39Sopenharmony_ci# Silvermont		22.8
40e1051a39Sopenharmony_ci# Goldmont		15.8
41e1051a39Sopenharmony_ci# VIA Nano		17.3
42e1051a39Sopenharmony_ci# Sledgehammer		13.3
43e1051a39Sopenharmony_ci# Bulldozer		16.5
44e1051a39Sopenharmony_ci# Ryzen			8.8
45e1051a39Sopenharmony_ci#
46e1051a39Sopenharmony_ci# (*)	Corresponds to SHA3-256. Improvement over compiler-generated
47e1051a39Sopenharmony_ci#	code varies a lot, most common coefficient is 15% in comparison to
48e1051a39Sopenharmony_ci#	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49e1051a39Sopenharmony_ci# (**)	Sandy Bridge has broken rotate instruction. Performance can be
50e1051a39Sopenharmony_ci#	improved by 14% by replacing rotates with double-precision
51e1051a39Sopenharmony_ci#	shift with same register as source and destination.
52e1051a39Sopenharmony_ci
53e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension)
54e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file
55e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
56e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
57e1051a39Sopenharmony_ci
# $win64 is set when the flavour names nasm, masm or mingw64, or when the
# output file name ends in ".asm".
58e1051a39Sopenharmony_ci$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
59e1051a39Sopenharmony_ci
# Take the directory part of this script's own path ($0) and look for the
# perlasm translator first beside the script, then in ../../perlasm/
# relative to it; give up if neither file exists.
60e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
61e1051a39Sopenharmony_ci( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
62e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
63e1051a39Sopenharmony_cidie "can't locate x86_64-xlate.pl";
64e1051a39Sopenharmony_ci
# Start the translator under the same perl interpreter ($^X), passing it the
# flavour and output name, then alias STDOUT to the pipe so that everything
# printed below is fed through the translator.
65e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
66e1051a39Sopenharmony_ci    or die "can't call $xlate: $!";
67e1051a39Sopenharmony_ci*STDOUT=*OUT;
68e1051a39Sopenharmony_ci
# $A[i][j] is the byte displacement of lane (i,j) in a 5x5 array of 8-byte
# lanes, biased by -100. The generated code adds 100 to each state pointer
# before using these displacements (the "lea 100(...)" lines marked
# "size optimization"), so every displacement lies in -100..92.
69e1051a39Sopenharmony_cimy @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
70e1051a39Sopenharmony_ci              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
71e1051a39Sopenharmony_ci
# Register allocation used by the templates below: five registers for C[],
# five (%r8..%r12) for D[], two temporaries (%r13,%r14) and %r15 as the
# pointer into the iotas table. @C and @D are re-mapped at generation time
# between template chunks.
72e1051a39Sopenharmony_cimy @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
73e1051a39Sopenharmony_cimy @D = map("%r$_",(8..12));
74e1051a39Sopenharmony_cimy @T = map("%r$_",(13..14));
75e1051a39Sopenharmony_cimy $iotas = "%r15";
76e1051a39Sopenharmony_ci
# Per-lane rotation counts, indexed [row][column]; each one is emitted as the
# immediate operand of a "rol" instruction.
77e1051a39Sopenharmony_cimy @rhotates = ([  0,  1, 62, 28, 27 ],
78e1051a39Sopenharmony_ci                [ 36, 44,  6, 55, 20 ],
79e1051a39Sopenharmony_ci                [  3, 10, 43, 25, 39 ],
80e1051a39Sopenharmony_ci                [ 41, 45, 15, 21,  8 ],
81e1051a39Sopenharmony_ci                [ 18,  2, 61, 56, 14 ]);
82e1051a39Sopenharmony_ci
# First template chunk: entry of __KeccakF1600 and the top of its round loop.
# Before the loop the @C registers are loaded with row 4 of the state at
# %rdi. Each iteration then XORs rows 0..3 into them, so @C[x] becomes the
# XOR of the five lanes A[0..4][x], while the @D registers are preloaded with
# A[0][0], A[1][1], A[2][2], A[3][3] and a copy of the incoming @C[4].
# The five rol/xor pairs at the end leave the D[] values in the @C registers,
# as spelled out by the inline comments.
83e1051a39Sopenharmony_ci$code.=<<___;
84e1051a39Sopenharmony_ci.text
85e1051a39Sopenharmony_ci
86e1051a39Sopenharmony_ci.type	__KeccakF1600,\@abi-omnipotent
87e1051a39Sopenharmony_ci.align	32
88e1051a39Sopenharmony_ci__KeccakF1600:
89e1051a39Sopenharmony_ci.cfi_startproc
90e1051a39Sopenharmony_ci	mov	$A[4][0](%rdi),@C[0]
91e1051a39Sopenharmony_ci	mov	$A[4][1](%rdi),@C[1]
92e1051a39Sopenharmony_ci	mov	$A[4][2](%rdi),@C[2]
93e1051a39Sopenharmony_ci	mov	$A[4][3](%rdi),@C[3]
94e1051a39Sopenharmony_ci	mov	$A[4][4](%rdi),@C[4]
95e1051a39Sopenharmony_ci	jmp	.Loop
96e1051a39Sopenharmony_ci
97e1051a39Sopenharmony_ci.align	32
98e1051a39Sopenharmony_ci.Loop:
99e1051a39Sopenharmony_ci	mov	$A[0][0](%rdi),@D[0]
100e1051a39Sopenharmony_ci	mov	$A[1][1](%rdi),@D[1]
101e1051a39Sopenharmony_ci	mov	$A[2][2](%rdi),@D[2]
102e1051a39Sopenharmony_ci	mov	$A[3][3](%rdi),@D[3]
103e1051a39Sopenharmony_ci
104e1051a39Sopenharmony_ci	xor	$A[0][2](%rdi),@C[2]
105e1051a39Sopenharmony_ci	xor	$A[0][3](%rdi),@C[3]
106e1051a39Sopenharmony_ci	xor	@D[0],         @C[0]
107e1051a39Sopenharmony_ci	xor	$A[0][1](%rdi),@C[1]
108e1051a39Sopenharmony_ci	 xor	$A[1][2](%rdi),@C[2]
109e1051a39Sopenharmony_ci	 xor	$A[1][0](%rdi),@C[0]
110e1051a39Sopenharmony_ci	mov	@C[4],@D[4]
111e1051a39Sopenharmony_ci	xor	$A[0][4](%rdi),@C[4]
112e1051a39Sopenharmony_ci
113e1051a39Sopenharmony_ci	xor	@D[2],         @C[2]
114e1051a39Sopenharmony_ci	xor	$A[2][0](%rdi),@C[0]
115e1051a39Sopenharmony_ci	 xor	$A[1][3](%rdi),@C[3]
116e1051a39Sopenharmony_ci	 xor	@D[1],         @C[1]
117e1051a39Sopenharmony_ci	 xor	$A[1][4](%rdi),@C[4]
118e1051a39Sopenharmony_ci
119e1051a39Sopenharmony_ci	xor	$A[3][2](%rdi),@C[2]
120e1051a39Sopenharmony_ci	xor	$A[3][0](%rdi),@C[0]
121e1051a39Sopenharmony_ci	 xor	$A[2][3](%rdi),@C[3]
122e1051a39Sopenharmony_ci	 xor	$A[2][1](%rdi),@C[1]
123e1051a39Sopenharmony_ci	 xor	$A[2][4](%rdi),@C[4]
124e1051a39Sopenharmony_ci
125e1051a39Sopenharmony_ci	mov	@C[2],@T[0]
126e1051a39Sopenharmony_ci	rol	\$1,@C[2]
127e1051a39Sopenharmony_ci	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
128e1051a39Sopenharmony_ci	 xor	@D[3],         @C[3]
129e1051a39Sopenharmony_ci
130e1051a39Sopenharmony_ci	rol	\$1,@C[0]
131e1051a39Sopenharmony_ci	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
132e1051a39Sopenharmony_ci	 xor	$A[3][1](%rdi),@C[1]
133e1051a39Sopenharmony_ci
134e1051a39Sopenharmony_ci	rol	\$1,@C[3]
135e1051a39Sopenharmony_ci	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
136e1051a39Sopenharmony_ci	 xor	$A[3][4](%rdi),@C[4]
137e1051a39Sopenharmony_ci
138e1051a39Sopenharmony_ci	rol	\$1,@C[1]
139e1051a39Sopenharmony_ci	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]
140e1051a39Sopenharmony_ci
141e1051a39Sopenharmony_ci	rol	\$1,@C[4]
142e1051a39Sopenharmony_ci	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
143e1051a39Sopenharmony_ci___
# Generation-time register renaming; no instruction is emitted for it.
# After the previous chunk the registers named @C[1],@C[2],@C[3],@C[4],@C[0]
# hold D[0],D[1],D[2],D[3],D[4] respectively, so they become the new @D.
# The old @D registers (loaded at the loop top with A[0][0], A[1][1],
# A[2][2], A[3][3] and the copy of @C[4]) become the new @C.
144e1051a39Sopenharmony_ci	(@D[0..4], @C) = (@C[1..4,0], @D);
# Second template chunk: rows 0..3 of the round output. Lanes are read from
# the state at %rdi, combined with @D, rotated by the @rhotates immediates and
# written as R[i][j] to the buffer at %rsi. The round constant is XORed in
# from ($iotas), which then advances by 8. The chunk ends by folding the
# row-4 input lanes into the @D registers and exchanging %rsi and %rdi.
145e1051a39Sopenharmony_ci$code.=<<___;
146e1051a39Sopenharmony_ci	xor	@D[1],@C[1]
147e1051a39Sopenharmony_ci	xor	@D[2],@C[2]
148e1051a39Sopenharmony_ci	rol	\$$rhotates[1][1],@C[1]
149e1051a39Sopenharmony_ci	xor	@D[3],@C[3]
150e1051a39Sopenharmony_ci	xor	@D[4],@C[4]
151e1051a39Sopenharmony_ci	rol	\$$rhotates[2][2],@C[2]
152e1051a39Sopenharmony_ci	xor	@D[0],@C[0]
153e1051a39Sopenharmony_ci	 mov	@C[1],@T[0]
154e1051a39Sopenharmony_ci	rol	\$$rhotates[3][3],@C[3]
155e1051a39Sopenharmony_ci	 or	@C[2],@C[1]
156e1051a39Sopenharmony_ci	 xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
157e1051a39Sopenharmony_ci	rol	\$$rhotates[4][4],@C[4]
158e1051a39Sopenharmony_ci
159e1051a39Sopenharmony_ci	 xor	($iotas),@C[1]
160e1051a39Sopenharmony_ci	 lea	8($iotas),$iotas
161e1051a39Sopenharmony_ci
162e1051a39Sopenharmony_ci	mov	@C[4],@T[1]
163e1051a39Sopenharmony_ci	and	@C[3],@C[4]
164e1051a39Sopenharmony_ci	 mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
165e1051a39Sopenharmony_ci	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
166e1051a39Sopenharmony_ci	not	@C[2]
167e1051a39Sopenharmony_ci	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])
168e1051a39Sopenharmony_ci
169e1051a39Sopenharmony_ci	or	@C[3],@C[2]
170e1051a39Sopenharmony_ci	  mov	$A[4][2](%rdi),@C[4]
171e1051a39Sopenharmony_ci	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
172e1051a39Sopenharmony_ci	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])
173e1051a39Sopenharmony_ci
174e1051a39Sopenharmony_ci	and	@C[0],@T[0]
175e1051a39Sopenharmony_ci	  mov	$A[1][4](%rdi),@C[1]
176e1051a39Sopenharmony_ci	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
177e1051a39Sopenharmony_ci	  mov	$A[2][0](%rdi),@C[2]
178e1051a39Sopenharmony_ci	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])
179e1051a39Sopenharmony_ci
180e1051a39Sopenharmony_ci	or	@C[0],@T[1]
181e1051a39Sopenharmony_ci	  mov	$A[0][3](%rdi),@C[0]
182e1051a39Sopenharmony_ci	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
183e1051a39Sopenharmony_ci	  mov	$A[3][1](%rdi),@C[3]
184e1051a39Sopenharmony_ci	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])
185e1051a39Sopenharmony_ci
186e1051a39Sopenharmony_ci
187e1051a39Sopenharmony_ci	xor	@D[3],@C[0]
188e1051a39Sopenharmony_ci	xor	@D[2],@C[4]
189e1051a39Sopenharmony_ci	rol	\$$rhotates[0][3],@C[0]
190e1051a39Sopenharmony_ci	xor	@D[1],@C[3]
191e1051a39Sopenharmony_ci	xor	@D[4],@C[1]
192e1051a39Sopenharmony_ci	rol	\$$rhotates[4][2],@C[4]
193e1051a39Sopenharmony_ci	rol	\$$rhotates[3][1],@C[3]
194e1051a39Sopenharmony_ci	xor	@D[0],@C[2]
195e1051a39Sopenharmony_ci	rol	\$$rhotates[1][4],@C[1]
196e1051a39Sopenharmony_ci	 mov	@C[0],@T[0]
197e1051a39Sopenharmony_ci	 or	@C[4],@C[0]
198e1051a39Sopenharmony_ci	rol	\$$rhotates[2][0],@C[2]
199e1051a39Sopenharmony_ci
200e1051a39Sopenharmony_ci	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
201e1051a39Sopenharmony_ci	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])
202e1051a39Sopenharmony_ci
203e1051a39Sopenharmony_ci	mov	@C[1],@T[1]
204e1051a39Sopenharmony_ci	and	@T[0],@C[1]
205e1051a39Sopenharmony_ci	  mov	$A[0][1](%rdi),@C[0]
206e1051a39Sopenharmony_ci	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
207e1051a39Sopenharmony_ci	not	@C[4]
208e1051a39Sopenharmony_ci	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])
209e1051a39Sopenharmony_ci
210e1051a39Sopenharmony_ci	or	@C[3],@C[4]
211e1051a39Sopenharmony_ci	  mov	$A[1][2](%rdi),@C[1]
212e1051a39Sopenharmony_ci	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
213e1051a39Sopenharmony_ci	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])
214e1051a39Sopenharmony_ci
215e1051a39Sopenharmony_ci	and	@C[2],@C[3]
216e1051a39Sopenharmony_ci	  mov	$A[4][0](%rdi),@C[4]
217e1051a39Sopenharmony_ci	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
218e1051a39Sopenharmony_ci	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])
219e1051a39Sopenharmony_ci
220e1051a39Sopenharmony_ci	or	@C[2],@T[1]
221e1051a39Sopenharmony_ci	  mov	$A[2][3](%rdi),@C[2]
222e1051a39Sopenharmony_ci	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
223e1051a39Sopenharmony_ci	  mov	$A[3][4](%rdi),@C[3]
224e1051a39Sopenharmony_ci	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])
225e1051a39Sopenharmony_ci
226e1051a39Sopenharmony_ci
227e1051a39Sopenharmony_ci	xor	@D[3],@C[2]
228e1051a39Sopenharmony_ci	xor	@D[4],@C[3]
229e1051a39Sopenharmony_ci	rol	\$$rhotates[2][3],@C[2]
230e1051a39Sopenharmony_ci	xor	@D[2],@C[1]
231e1051a39Sopenharmony_ci	rol	\$$rhotates[3][4],@C[3]
232e1051a39Sopenharmony_ci	xor	@D[0],@C[4]
233e1051a39Sopenharmony_ci	rol	\$$rhotates[1][2],@C[1]
234e1051a39Sopenharmony_ci	xor	@D[1],@C[0]
235e1051a39Sopenharmony_ci	rol	\$$rhotates[4][0],@C[4]
236e1051a39Sopenharmony_ci	 mov	@C[2],@T[0]
237e1051a39Sopenharmony_ci	 and	@C[3],@C[2]
238e1051a39Sopenharmony_ci	rol	\$$rhotates[0][1],@C[0]
239e1051a39Sopenharmony_ci
240e1051a39Sopenharmony_ci	not	@C[3]
241e1051a39Sopenharmony_ci	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
242e1051a39Sopenharmony_ci	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])
243e1051a39Sopenharmony_ci
244e1051a39Sopenharmony_ci	mov	@C[4],@T[1]
245e1051a39Sopenharmony_ci	and	@C[3],@C[4]
246e1051a39Sopenharmony_ci	  mov	$A[2][1](%rdi),@C[2]
247e1051a39Sopenharmony_ci	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
248e1051a39Sopenharmony_ci	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])
249e1051a39Sopenharmony_ci
250e1051a39Sopenharmony_ci	or	@C[1],@T[0]
251e1051a39Sopenharmony_ci	  mov	$A[4][3](%rdi),@C[4]
252e1051a39Sopenharmony_ci	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
253e1051a39Sopenharmony_ci	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])
254e1051a39Sopenharmony_ci
255e1051a39Sopenharmony_ci	and	@C[0],@C[1]
256e1051a39Sopenharmony_ci	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
257e1051a39Sopenharmony_ci	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])
258e1051a39Sopenharmony_ci
259e1051a39Sopenharmony_ci	or	@C[0],@T[1]
260e1051a39Sopenharmony_ci	  mov	$A[1][0](%rdi),@C[1]
261e1051a39Sopenharmony_ci	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
262e1051a39Sopenharmony_ci	  mov	$A[3][2](%rdi),@C[3]
263e1051a39Sopenharmony_ci	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
264e1051a39Sopenharmony_ci
265e1051a39Sopenharmony_ci
266e1051a39Sopenharmony_ci	mov	$A[0][4](%rdi),@C[0]
267e1051a39Sopenharmony_ci
268e1051a39Sopenharmony_ci	xor	@D[1],@C[2]
269e1051a39Sopenharmony_ci	xor	@D[2],@C[3]
270e1051a39Sopenharmony_ci	rol	\$$rhotates[2][1],@C[2]
271e1051a39Sopenharmony_ci	xor	@D[0],@C[1]
272e1051a39Sopenharmony_ci	rol	\$$rhotates[3][2],@C[3]
273e1051a39Sopenharmony_ci	xor	@D[3],@C[4]
274e1051a39Sopenharmony_ci	rol	\$$rhotates[1][0],@C[1]
275e1051a39Sopenharmony_ci	xor	@D[4],@C[0]
276e1051a39Sopenharmony_ci	rol	\$$rhotates[4][3],@C[4]
277e1051a39Sopenharmony_ci	 mov	@C[2],@T[0]
278e1051a39Sopenharmony_ci	 or	@C[3],@C[2]
279e1051a39Sopenharmony_ci	rol	\$$rhotates[0][4],@C[0]
280e1051a39Sopenharmony_ci
281e1051a39Sopenharmony_ci	not	@C[3]
282e1051a39Sopenharmony_ci	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
283e1051a39Sopenharmony_ci	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])
284e1051a39Sopenharmony_ci
285e1051a39Sopenharmony_ci	mov	@C[4],@T[1]
286e1051a39Sopenharmony_ci	or	@C[3],@C[4]
287e1051a39Sopenharmony_ci	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
288e1051a39Sopenharmony_ci	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])
289e1051a39Sopenharmony_ci
290e1051a39Sopenharmony_ci	and	@C[1],@T[0]
291e1051a39Sopenharmony_ci	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
292e1051a39Sopenharmony_ci	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])
293e1051a39Sopenharmony_ci
294e1051a39Sopenharmony_ci	or	@C[0],@C[1]
295e1051a39Sopenharmony_ci	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
296e1051a39Sopenharmony_ci	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])
297e1051a39Sopenharmony_ci
298e1051a39Sopenharmony_ci	and	@T[1],@C[0]
299e1051a39Sopenharmony_ci	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
300e1051a39Sopenharmony_ci	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
301e1051a39Sopenharmony_ci
302e1051a39Sopenharmony_ci
303e1051a39Sopenharmony_ci	xor	$A[0][2](%rdi),@D[2]
304e1051a39Sopenharmony_ci	xor	$A[1][3](%rdi),@D[3]
305e1051a39Sopenharmony_ci	rol	\$$rhotates[0][2],@D[2]
306e1051a39Sopenharmony_ci	xor	$A[4][1](%rdi),@D[1]
307e1051a39Sopenharmony_ci	rol	\$$rhotates[1][3],@D[3]
308e1051a39Sopenharmony_ci	xor	$A[2][4](%rdi),@D[4]
309e1051a39Sopenharmony_ci	rol	\$$rhotates[4][1],@D[1]
310e1051a39Sopenharmony_ci	xor	$A[3][0](%rdi),@D[0]
311e1051a39Sopenharmony_ci	xchg	%rsi,%rdi
312e1051a39Sopenharmony_ci	rol	\$$rhotates[2][4],@D[4]
313e1051a39Sopenharmony_ci	rol	\$$rhotates[3][0],@D[0]
314e1051a39Sopenharmony_ci___
# Second generation-time renaming; no instruction is emitted for it.
# The previous chunk ended by XORing A[0][2], A[1][3], A[2][4], A[3][0] and
# A[4][1] into @D[2], @D[3], @D[4], @D[0] and @D[1] and rotating them. Those
# registers, in that order, become @C[0..4] for the last row.
315e1051a39Sopenharmony_ci	@C = @D[2..4,0,1];
# Third template chunk: row 4 of the round output (stored through %rdi, which
# was exchanged with %rsi at the end of the previous chunk), then the loop
# close. The loop repeats until the low 8 bits of $iotas are zero, then
# $iotas is rewound by 192 bytes. The chunk also emits the KeccakF1600
# wrapper, for which no .globl directive is emitted in this file. It saves
# six registers, reserves 200 bytes of stack as the second buffer, and
# complements six lanes of the state before and after calling __KeccakF1600.
316e1051a39Sopenharmony_ci$code.=<<___;
317e1051a39Sopenharmony_ci	mov	@C[0],@T[0]
318e1051a39Sopenharmony_ci	and	@C[1],@C[0]
319e1051a39Sopenharmony_ci	not	@C[1]
320e1051a39Sopenharmony_ci	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
321e1051a39Sopenharmony_ci	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])
322e1051a39Sopenharmony_ci
323e1051a39Sopenharmony_ci	mov	@C[2],@T[1]
324e1051a39Sopenharmony_ci	and	@C[1],@C[2]
325e1051a39Sopenharmony_ci	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
326e1051a39Sopenharmony_ci	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])
327e1051a39Sopenharmony_ci
328e1051a39Sopenharmony_ci	or	@C[4],@T[0]
329e1051a39Sopenharmony_ci	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
330e1051a39Sopenharmony_ci	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])
331e1051a39Sopenharmony_ci
332e1051a39Sopenharmony_ci	and	@C[3],@C[4]
333e1051a39Sopenharmony_ci	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
334e1051a39Sopenharmony_ci	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])
335e1051a39Sopenharmony_ci
336e1051a39Sopenharmony_ci	or	@T[1],@C[3]
337e1051a39Sopenharmony_ci	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
338e1051a39Sopenharmony_ci	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
339e1051a39Sopenharmony_ci
340e1051a39Sopenharmony_ci	mov	@C[0],@C[1]		# harmonize with the loop top
341e1051a39Sopenharmony_ci	mov	@T[0],@C[0]
342e1051a39Sopenharmony_ci
343e1051a39Sopenharmony_ci	test	\$255,$iotas
344e1051a39Sopenharmony_ci	jnz	.Loop
345e1051a39Sopenharmony_ci
346e1051a39Sopenharmony_ci	lea	-192($iotas),$iotas	# rewind iotas
347e1051a39Sopenharmony_ci	ret
348e1051a39Sopenharmony_ci.cfi_endproc
349e1051a39Sopenharmony_ci.size	__KeccakF1600,.-__KeccakF1600
350e1051a39Sopenharmony_ci
351e1051a39Sopenharmony_ci.type	KeccakF1600,\@abi-omnipotent
352e1051a39Sopenharmony_ci.align	32
353e1051a39Sopenharmony_ciKeccakF1600:
354e1051a39Sopenharmony_ci.cfi_startproc
355e1051a39Sopenharmony_ci	push	%rbx
356e1051a39Sopenharmony_ci.cfi_push	%rbx
357e1051a39Sopenharmony_ci	push	%rbp
358e1051a39Sopenharmony_ci.cfi_push	%rbp
359e1051a39Sopenharmony_ci	push	%r12
360e1051a39Sopenharmony_ci.cfi_push	%r12
361e1051a39Sopenharmony_ci	push	%r13
362e1051a39Sopenharmony_ci.cfi_push	%r13
363e1051a39Sopenharmony_ci	push	%r14
364e1051a39Sopenharmony_ci.cfi_push	%r14
365e1051a39Sopenharmony_ci	push	%r15
366e1051a39Sopenharmony_ci.cfi_push	%r15
367e1051a39Sopenharmony_ci
368e1051a39Sopenharmony_ci	lea	100(%rdi),%rdi		# size optimization
369e1051a39Sopenharmony_ci	sub	\$200,%rsp
370e1051a39Sopenharmony_ci.cfi_adjust_cfa_offset	200
371e1051a39Sopenharmony_ci
372e1051a39Sopenharmony_ci	notq	$A[0][1](%rdi)
373e1051a39Sopenharmony_ci	notq	$A[0][2](%rdi)
374e1051a39Sopenharmony_ci	notq	$A[1][3](%rdi)
375e1051a39Sopenharmony_ci	notq	$A[2][2](%rdi)
376e1051a39Sopenharmony_ci	notq	$A[3][2](%rdi)
377e1051a39Sopenharmony_ci	notq	$A[4][0](%rdi)
378e1051a39Sopenharmony_ci
379e1051a39Sopenharmony_ci	lea	iotas(%rip),$iotas
380e1051a39Sopenharmony_ci	lea	100(%rsp),%rsi		# size optimization
381e1051a39Sopenharmony_ci
382e1051a39Sopenharmony_ci	call	__KeccakF1600
383e1051a39Sopenharmony_ci
384e1051a39Sopenharmony_ci	notq	$A[0][1](%rdi)
385e1051a39Sopenharmony_ci	notq	$A[0][2](%rdi)
386e1051a39Sopenharmony_ci	notq	$A[1][3](%rdi)
387e1051a39Sopenharmony_ci	notq	$A[2][2](%rdi)
388e1051a39Sopenharmony_ci	notq	$A[3][2](%rdi)
389e1051a39Sopenharmony_ci	notq	$A[4][0](%rdi)
390e1051a39Sopenharmony_ci	lea	-100(%rdi),%rdi		# preserve A[][]
391e1051a39Sopenharmony_ci
392e1051a39Sopenharmony_ci	add	\$200,%rsp
393e1051a39Sopenharmony_ci.cfi_adjust_cfa_offset	-200
394e1051a39Sopenharmony_ci
395e1051a39Sopenharmony_ci	pop	%r15
396e1051a39Sopenharmony_ci.cfi_pop	%r15
397e1051a39Sopenharmony_ci	pop	%r14
398e1051a39Sopenharmony_ci.cfi_pop	%r14
399e1051a39Sopenharmony_ci	pop	%r13
400e1051a39Sopenharmony_ci.cfi_pop	%r13
401e1051a39Sopenharmony_ci	pop	%r12
402e1051a39Sopenharmony_ci.cfi_pop	%r12
403e1051a39Sopenharmony_ci	pop	%rbp
404e1051a39Sopenharmony_ci.cfi_pop	%rbp
405e1051a39Sopenharmony_ci	pop	%rbx
406e1051a39Sopenharmony_ci.cfi_pop	%rbx
407e1051a39Sopenharmony_ci	ret
408e1051a39Sopenharmony_ci.cfi_endproc
409e1051a39Sopenharmony_ci.size	KeccakF1600,.-KeccakF1600
410e1051a39Sopenharmony_ci___
411e1051a39Sopenharmony_ci
# SHA3_absorb template (declared with four arguments: state, input pointer,
# length, block size). While at least $bsz bytes remain, it XORs $bsz/8
# 8-byte words of input into the start of the state and calls __KeccakF1600.
# The leftover length is returned in %rax.
# The 232-byte stack frame holds the 200-byte second buffer addressed through
# %rsi, plus three save slots at offsets 200, 208 and 216 for inp, len and
# bsz, which are stored before and reloaded after each __KeccakF1600 call.
412e1051a39Sopenharmony_ci{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# %rdi and %rsi are used as the two biased buffer pointers handed to
# __KeccakF1600, so the state cursor and the input pointer use %r8/%r9.
413e1051a39Sopenharmony_ci     ($A_flat,$inp) = ("%r8","%r9");
414e1051a39Sopenharmony_ci$code.=<<___;
415e1051a39Sopenharmony_ci.globl	SHA3_absorb
416e1051a39Sopenharmony_ci.type	SHA3_absorb,\@function,4
417e1051a39Sopenharmony_ci.align	32
418e1051a39Sopenharmony_ciSHA3_absorb:
419e1051a39Sopenharmony_ci.cfi_startproc
420e1051a39Sopenharmony_ci	push	%rbx
421e1051a39Sopenharmony_ci.cfi_push	%rbx
422e1051a39Sopenharmony_ci	push	%rbp
423e1051a39Sopenharmony_ci.cfi_push	%rbp
424e1051a39Sopenharmony_ci	push	%r12
425e1051a39Sopenharmony_ci.cfi_push	%r12
426e1051a39Sopenharmony_ci	push	%r13
427e1051a39Sopenharmony_ci.cfi_push	%r13
428e1051a39Sopenharmony_ci	push	%r14
429e1051a39Sopenharmony_ci.cfi_push	%r14
430e1051a39Sopenharmony_ci	push	%r15
431e1051a39Sopenharmony_ci.cfi_push	%r15
432e1051a39Sopenharmony_ci
433e1051a39Sopenharmony_ci	lea	100(%rdi),%rdi		# size optimization
434e1051a39Sopenharmony_ci	sub	\$232,%rsp
435e1051a39Sopenharmony_ci.cfi_adjust_cfa_offset	232
436e1051a39Sopenharmony_ci
437e1051a39Sopenharmony_ci	mov	%rsi,$inp
438e1051a39Sopenharmony_ci	lea	100(%rsp),%rsi		# size optimization
439e1051a39Sopenharmony_ci
440e1051a39Sopenharmony_ci	notq	$A[0][1](%rdi)
441e1051a39Sopenharmony_ci	notq	$A[0][2](%rdi)
442e1051a39Sopenharmony_ci	notq	$A[1][3](%rdi)
443e1051a39Sopenharmony_ci	notq	$A[2][2](%rdi)
444e1051a39Sopenharmony_ci	notq	$A[3][2](%rdi)
445e1051a39Sopenharmony_ci	notq	$A[4][0](%rdi)
446e1051a39Sopenharmony_ci	lea	iotas(%rip),$iotas
447e1051a39Sopenharmony_ci
448e1051a39Sopenharmony_ci	mov	$bsz,216-100(%rsi)	# save bsz
449e1051a39Sopenharmony_ci
450e1051a39Sopenharmony_ci.Loop_absorb:
451e1051a39Sopenharmony_ci	cmp	$bsz,$len
452e1051a39Sopenharmony_ci	jc	.Ldone_absorb
453e1051a39Sopenharmony_ci
454e1051a39Sopenharmony_ci	shr	\$3,$bsz
455e1051a39Sopenharmony_ci	lea	-100(%rdi),$A_flat
456e1051a39Sopenharmony_ci
457e1051a39Sopenharmony_ci.Lblock_absorb:
458e1051a39Sopenharmony_ci	mov	($inp),%rax
459e1051a39Sopenharmony_ci	lea	8($inp),$inp
460e1051a39Sopenharmony_ci	xor	($A_flat),%rax
461e1051a39Sopenharmony_ci	lea	8($A_flat),$A_flat
462e1051a39Sopenharmony_ci	sub	\$8,$len
463e1051a39Sopenharmony_ci	mov	%rax,-8($A_flat)
464e1051a39Sopenharmony_ci	sub	\$1,$bsz
465e1051a39Sopenharmony_ci	jnz	.Lblock_absorb
466e1051a39Sopenharmony_ci
467e1051a39Sopenharmony_ci	mov	$inp,200-100(%rsi)	# save inp
468e1051a39Sopenharmony_ci	mov	$len,208-100(%rsi)	# save len
469e1051a39Sopenharmony_ci	call	__KeccakF1600
470e1051a39Sopenharmony_ci	mov	200-100(%rsi),$inp	# pull inp
471e1051a39Sopenharmony_ci	mov	208-100(%rsi),$len	# pull len
472e1051a39Sopenharmony_ci	mov	216-100(%rsi),$bsz	# pull bsz
473e1051a39Sopenharmony_ci	jmp	.Loop_absorb
474e1051a39Sopenharmony_ci
475e1051a39Sopenharmony_ci.align	32
476e1051a39Sopenharmony_ci.Ldone_absorb:
477e1051a39Sopenharmony_ci	mov	$len,%rax		# return value
478e1051a39Sopenharmony_ci
479e1051a39Sopenharmony_ci	notq	$A[0][1](%rdi)
480e1051a39Sopenharmony_ci	notq	$A[0][2](%rdi)
481e1051a39Sopenharmony_ci	notq	$A[1][3](%rdi)
482e1051a39Sopenharmony_ci	notq	$A[2][2](%rdi)
483e1051a39Sopenharmony_ci	notq	$A[3][2](%rdi)
484e1051a39Sopenharmony_ci	notq	$A[4][0](%rdi)
485e1051a39Sopenharmony_ci
486e1051a39Sopenharmony_ci	add	\$232,%rsp
487e1051a39Sopenharmony_ci.cfi_adjust_cfa_offset	-232
488e1051a39Sopenharmony_ci
489e1051a39Sopenharmony_ci	pop	%r15
490e1051a39Sopenharmony_ci.cfi_pop	%r15
491e1051a39Sopenharmony_ci	pop	%r14
492e1051a39Sopenharmony_ci.cfi_pop	%r14
493e1051a39Sopenharmony_ci	pop	%r13
494e1051a39Sopenharmony_ci.cfi_pop	%r13
495e1051a39Sopenharmony_ci	pop	%r12
496e1051a39Sopenharmony_ci.cfi_pop	%r12
497e1051a39Sopenharmony_ci	pop	%rbp
498e1051a39Sopenharmony_ci.cfi_pop	%rbp
499e1051a39Sopenharmony_ci	pop	%rbx
500e1051a39Sopenharmony_ci.cfi_pop	%rbx
501e1051a39Sopenharmony_ci	ret
502e1051a39Sopenharmony_ci.cfi_endproc
503e1051a39Sopenharmony_ci.size	SHA3_absorb,.-SHA3_absorb
504e1051a39Sopenharmony_ci___
505e1051a39Sopenharmony_ci}
# SHA3_squeeze template (declared with four arguments: state, output pointer,
# length, block size). It copies the state out 8 bytes at a time; once a whole
# block of bsz/8 words has been emitted and output is still wanted, it calls
# KeccakF1600 and restarts from the beginning of the state. A final tail of
# fewer than 8 bytes is copied with "rep movsb".
#
# Fix relative to the previous revision: the epilogue's "pop %r12" was
# annotated with ".cfi_pop %r13", so the unwind information restored %r13
# twice and never %r12. It now says ".cfi_pop %r12", mirroring the prologue.
506e1051a39Sopenharmony_ci{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# out, len and bsz must survive the call to KeccakF1600, so they are moved
# into %r12..%r14, which KeccakF1600 pushes on entry and pops on exit.
507e1051a39Sopenharmony_ci     ($out,$len,$bsz) = ("%r12","%r13","%r14");
508e1051a39Sopenharmony_ci
509e1051a39Sopenharmony_ci$code.=<<___;
510e1051a39Sopenharmony_ci.globl	SHA3_squeeze
511e1051a39Sopenharmony_ci.type	SHA3_squeeze,\@function,4
512e1051a39Sopenharmony_ci.align	32
513e1051a39Sopenharmony_ciSHA3_squeeze:
514e1051a39Sopenharmony_ci.cfi_startproc
515e1051a39Sopenharmony_ci	push	%r12
516e1051a39Sopenharmony_ci.cfi_push	%r12
517e1051a39Sopenharmony_ci	push	%r13
518e1051a39Sopenharmony_ci.cfi_push	%r13
519e1051a39Sopenharmony_ci	push	%r14
520e1051a39Sopenharmony_ci.cfi_push	%r14
521e1051a39Sopenharmony_ci
522e1051a39Sopenharmony_ci	shr	\$3,%rcx
523e1051a39Sopenharmony_ci	mov	$A_flat,%r8
524e1051a39Sopenharmony_ci	mov	%rsi,$out
525e1051a39Sopenharmony_ci	mov	%rdx,$len
526e1051a39Sopenharmony_ci	mov	%rcx,$bsz
527e1051a39Sopenharmony_ci	jmp	.Loop_squeeze
528e1051a39Sopenharmony_ci
529e1051a39Sopenharmony_ci.align	32
530e1051a39Sopenharmony_ci.Loop_squeeze:
531e1051a39Sopenharmony_ci	cmp	\$8,$len
532e1051a39Sopenharmony_ci	jb	.Ltail_squeeze
533e1051a39Sopenharmony_ci
534e1051a39Sopenharmony_ci	mov	(%r8),%rax
535e1051a39Sopenharmony_ci	lea	8(%r8),%r8
536e1051a39Sopenharmony_ci	mov	%rax,($out)
537e1051a39Sopenharmony_ci	lea	8($out),$out
538e1051a39Sopenharmony_ci	sub	\$8,$len		# len -= 8
539e1051a39Sopenharmony_ci	jz	.Ldone_squeeze
540e1051a39Sopenharmony_ci
541e1051a39Sopenharmony_ci	sub	\$1,%rcx		# bsz--
542e1051a39Sopenharmony_ci	jnz	.Loop_squeeze
543e1051a39Sopenharmony_ci
544e1051a39Sopenharmony_ci	call	KeccakF1600
545e1051a39Sopenharmony_ci	mov	$A_flat,%r8
546e1051a39Sopenharmony_ci	mov	$bsz,%rcx
547e1051a39Sopenharmony_ci	jmp	.Loop_squeeze
548e1051a39Sopenharmony_ci
549e1051a39Sopenharmony_ci.Ltail_squeeze:
550e1051a39Sopenharmony_ci	mov	%r8, %rsi
551e1051a39Sopenharmony_ci	mov	$out,%rdi
552e1051a39Sopenharmony_ci	mov	$len,%rcx
553e1051a39Sopenharmony_ci	.byte	0xf3,0xa4		# rep	movsb
554e1051a39Sopenharmony_ci
555e1051a39Sopenharmony_ci.Ldone_squeeze:
556e1051a39Sopenharmony_ci	pop	%r14
557e1051a39Sopenharmony_ci.cfi_pop	%r14
558e1051a39Sopenharmony_ci	pop	%r13
559e1051a39Sopenharmony_ci.cfi_pop	%r13
560e1051a39Sopenharmony_ci	pop	%r12
561e1051a39Sopenharmony_ci.cfi_pop	%r12
562e1051a39Sopenharmony_ci	ret
563e1051a39Sopenharmony_ci.cfi_endproc
564e1051a39Sopenharmony_ci.size	SHA3_squeeze,.-SHA3_squeeze
565e1051a39Sopenharmony_ci___
566e1051a39Sopenharmony_ci}
# Round-constant table. The ".align 256" followed by eight zero quads
# (64 bytes) places the 24-entry table (192 bytes) so that it ends exactly on
# a 256-byte boundary. __KeccakF1600 depends on this layout: it advances
# $iotas by 8 per round, leaves its loop when "test $255,$iotas" finds the low
# byte zero, and then rewinds the pointer by 192.
567e1051a39Sopenharmony_ci$code.=<<___;
568e1051a39Sopenharmony_ci.align	256
569e1051a39Sopenharmony_ci	.quad	0,0,0,0,0,0,0,0
570e1051a39Sopenharmony_ci.type	iotas,\@object
571e1051a39Sopenharmony_ciiotas:
572e1051a39Sopenharmony_ci	.quad	0x0000000000000001
573e1051a39Sopenharmony_ci	.quad	0x0000000000008082
574e1051a39Sopenharmony_ci	.quad	0x800000000000808a
575e1051a39Sopenharmony_ci	.quad	0x8000000080008000
576e1051a39Sopenharmony_ci	.quad	0x000000000000808b
577e1051a39Sopenharmony_ci	.quad	0x0000000080000001
578e1051a39Sopenharmony_ci	.quad	0x8000000080008081
579e1051a39Sopenharmony_ci	.quad	0x8000000000008009
580e1051a39Sopenharmony_ci	.quad	0x000000000000008a
581e1051a39Sopenharmony_ci	.quad	0x0000000000000088
582e1051a39Sopenharmony_ci	.quad	0x0000000080008009
583e1051a39Sopenharmony_ci	.quad	0x000000008000000a
584e1051a39Sopenharmony_ci	.quad	0x000000008000808b
585e1051a39Sopenharmony_ci	.quad	0x800000000000008b
586e1051a39Sopenharmony_ci	.quad	0x8000000000008089
587e1051a39Sopenharmony_ci	.quad	0x8000000000008003
588e1051a39Sopenharmony_ci	.quad	0x8000000000008002
589e1051a39Sopenharmony_ci	.quad	0x8000000000000080
590e1051a39Sopenharmony_ci	.quad	0x000000000000800a
591e1051a39Sopenharmony_ci	.quad	0x800000008000000a
592e1051a39Sopenharmony_ci	.quad	0x8000000080008081
593e1051a39Sopenharmony_ci	.quad	0x8000000000008080
594e1051a39Sopenharmony_ci	.quad	0x0000000080000001
595e1051a39Sopenharmony_ci	.quad	0x8000000080008008
596e1051a39Sopenharmony_ci.size	iotas,.-iotas
597e1051a39Sopenharmony_ci.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
598e1051a39Sopenharmony_ci___
599e1051a39Sopenharmony_ci
# Emit the accumulated template one line at a time. The two substitutions
# inside the loop are alternative encodings of "rol" that are kept for
# reference only; both are commented out, so every line is printed unchanged.
600e1051a39Sopenharmony_ciforeach (split("\n",$code)) {
601e1051a39Sopenharmony_ci	# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
602e1051a39Sopenharmony_ci	# Haswell, but it hurts other processors by up to 2-3-4x...
603e1051a39Sopenharmony_ci	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
604e1051a39Sopenharmony_ci	# Below replacement results in 9.3 on Haswell [as well as
605e1051a39Sopenharmony_ci	# on Ryzen, i.e. it *hurts* Ryzen]...
606e1051a39Sopenharmony_ci	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
607e1051a39Sopenharmony_ci
608e1051a39Sopenharmony_ci	print $_, "\n";
609e1051a39Sopenharmony_ci}
610e1051a39Sopenharmony_ci
# STDOUT was aliased to the pipe feeding x86_64-xlate.pl near the top of the
# script, so a failure to close it is treated as fatal.
611e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
612