1e1051a39Sopenharmony_ci#!/usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci#
9e1051a39Sopenharmony_ci# ====================================================================
10e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
12e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
13e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
14e1051a39Sopenharmony_ci# ====================================================================
15e1051a39Sopenharmony_ci#
16e1051a39Sopenharmony_ci# Keccak-1600 for AVX2.
17e1051a39Sopenharmony_ci#
18e1051a39Sopenharmony_ci# July 2017.
19e1051a39Sopenharmony_ci#
20e1051a39Sopenharmony_ci# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
21e1051a39Sopenharmony_ci# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
22e1051a39Sopenharmony_ci# other than A[0][0] in magic order into 6 [256-bit] registers, *each
23e1051a39Sopenharmony_ci# dedicated to one axis*, Pi permutation is reduced to intra-register
24e1051a39Sopenharmony_ci# shuffles...
25e1051a39Sopenharmony_ci#
26e1051a39Sopenharmony_ci# It makes other steps more intricate, but overall, is it a win? To be
27e1051a39Sopenharmony_ci# more specific index permutations organized by quadruples are:
28e1051a39Sopenharmony_ci#
29e1051a39Sopenharmony_ci#       [4][4] [3][3] [2][2] [1][1]<-+
30e1051a39Sopenharmony_ci#       [0][4] [0][3] [0][2] [0][1]<-+
31e1051a39Sopenharmony_ci#       [3][0] [1][0] [4][0] [2][0]  |
32e1051a39Sopenharmony_ci#       [4][3] [3][1] [2][4] [1][2]  |
33e1051a39Sopenharmony_ci#       [3][4] [1][3] [4][2] [2][1]  |
34e1051a39Sopenharmony_ci#       [2][3] [4][1] [1][4] [3][2]  |
35e1051a39Sopenharmony_ci#       [2][2] [4][4] [1][1] [3][3] -+
36e1051a39Sopenharmony_ci#
37e1051a39Sopenharmony_ci# This however is highly impractical for Theta and Chi. What would help
38e1051a39Sopenharmony_ci# Theta is if x indices were aligned column-wise, or in other words:
39e1051a39Sopenharmony_ci#
40e1051a39Sopenharmony_ci#       [0][4] [0][3] [0][2] [0][1]
41e1051a39Sopenharmony_ci#       [3][0] [1][0] [4][0] [2][0]
42e1051a39Sopenharmony_ci#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
43e1051a39Sopenharmony_ci#       [2][4] [4][3] [1][2] [3][1]
44e1051a39Sopenharmony_ci#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
45e1051a39Sopenharmony_ci#       [3][4] [1][3] [4][2] [2][1]
46e1051a39Sopenharmony_ci#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
47e1051a39Sopenharmony_ci#       [1][4] [2][3] [3][2] [4][1]
48e1051a39Sopenharmony_ci#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
49e1051a39Sopenharmony_ci#       [4][4] [3][3] [2][2] [1][1]
50e1051a39Sopenharmony_ci#
51e1051a39Sopenharmony_ci# So here we have it, lines not marked with vpermq() represent the magic
52e1051a39Sopenharmony_ci# order in which data is to be loaded and maintained. [And lines marked
53e1051a39Sopenharmony_ci# with vpermq() represent Pi circular permutation in chosen layout. Note
54e1051a39Sopenharmony_ci# that first step is permutation-free.] A[0][0] is loaded to register of
55e1051a39Sopenharmony_ci# its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
56e1051a39Sopenharmony_ci# Digits in variables' names denote right-most coordinates:
57e1051a39Sopenharmony_ci
# The seven registers that hold the whole state.  Each trailing comment
# lists the A[y][x] lanes kept in that register, highest quadword first.
my ($A00,	# %ymm0: [0][0] [0][0] [0][0] [0][0]
    $A01,	# %ymm1: [0][4] [0][3] [0][2] [0][1]
    $A20,	# %ymm2: [3][0] [1][0] [4][0] [2][0]
    $A31,	# %ymm3: [2][4] [4][3] [1][2] [3][1]
    $A21,	# %ymm4: [3][4] [1][3] [4][2] [2][1]
    $A41,	# %ymm5: [1][4] [2][3] [3][2] [4][1]
    $A11	# %ymm6: [4][4] [3][3] [2][2] [1][1]
   ) = map { "%ymm$_" } 0 .. 6;
66e1051a39Sopenharmony_ci
67e1051a39Sopenharmony_ci# We also need to map the magic order into offsets within structure:
68e1051a39Sopenharmony_ci
# Byte offset of every lane inside the register-ordered ("jagged") image
# of the state, indexed by the canonical lane number 5*y+x.  Each entry
# starts out as a [register row, quadword within the row] pair and is
# flattened to 8*(4*row+quadword) in the same pass.
my @A_jagged = map { 8 * (4 * $_->[0] + $_->[1]) } (
    [0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
    [2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
    [2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
    [2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
    [2,1], [5,0], [4,1], [3,2], [6,3],	# [4][0..4]
);
75e1051a39Sopenharmony_ci
# On the other hand, Chi is much better off if y indices are aligned
# column-wise, not x. For this reason we have to shuffle the data prior
# to Chi and revert it afterwards. The prior shuffle is naturally merged
# with Pi itself:
80e1051a39Sopenharmony_ci#
81e1051a39Sopenharmony_ci#       [0][4] [0][3] [0][2] [0][1]
82e1051a39Sopenharmony_ci#       [3][0] [1][0] [4][0] [2][0]
83e1051a39Sopenharmony_ci#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
84e1051a39Sopenharmony_ci#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
85e1051a39Sopenharmony_ci#       [3][1] [1][2] [4][3] [2][4]
86e1051a39Sopenharmony_ci#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
87e1051a39Sopenharmony_ci#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
88e1051a39Sopenharmony_ci#       [3][4] [1][3] [4][2] [2][1]
89e1051a39Sopenharmony_ci#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
90e1051a39Sopenharmony_ci#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
91e1051a39Sopenharmony_ci#       [3][2] [1][4] [4][1] [2][3]
92e1051a39Sopenharmony_ci#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
93e1051a39Sopenharmony_ci#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
94e1051a39Sopenharmony_ci#       [3][3] [1][1] [4][4] [2][2]
95e1051a39Sopenharmony_ci#
96e1051a39Sopenharmony_ci# And reverse post-Chi permutation:
97e1051a39Sopenharmony_ci#
98e1051a39Sopenharmony_ci#       [0][4] [0][3] [0][2] [0][1]
99e1051a39Sopenharmony_ci#       [3][0] [1][0] [4][0] [2][0]
100e1051a39Sopenharmony_ci#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
101e1051a39Sopenharmony_ci#       [2][4] [4][3] [1][2] [3][1]
102e1051a39Sopenharmony_ci#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
103e1051a39Sopenharmony_ci#       [3][4] [1][3] [4][2] [2][1]
104e1051a39Sopenharmony_ci#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
105e1051a39Sopenharmony_ci#       [1][4] [2][3] [3][2] [4][1]
106e1051a39Sopenharmony_ci#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
107e1051a39Sopenharmony_ci#       [4][4] [3][3] [2][2] [1][1]
108e1051a39Sopenharmony_ci#
109e1051a39Sopenharmony_ci########################################################################
110e1051a39Sopenharmony_ci# Numbers are cycles per processed byte out of large message.
111e1051a39Sopenharmony_ci#
112e1051a39Sopenharmony_ci#			r=1088(*)
113e1051a39Sopenharmony_ci#
114e1051a39Sopenharmony_ci# Haswell		8.7/+10%
115e1051a39Sopenharmony_ci# Skylake		7.8/+20%
116e1051a39Sopenharmony_ci# Ryzen			17(**)
117e1051a39Sopenharmony_ci#
118e1051a39Sopenharmony_ci# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
119e1051a39Sopenharmony_ci#	coefficient in comparison to scalar keccak1600-x86_64.pl.
120e1051a39Sopenharmony_ci# (**)	It's expected that Ryzen performs poorly, because instruction
121e1051a39Sopenharmony_ci#	issue rate is limited to two AVX2 instructions per cycle and
122e1051a39Sopenharmony_ci#	in addition vpblendd is reportedly bound to specific port.
123e1051a39Sopenharmony_ci#	Obviously this code path should not be executed on Ryzen.
124e1051a39Sopenharmony_ci
# Scratch registers %ymm7..%ymm15.  The Theta intermediates ($C14, $C00,
# $D00, $D14) are aliases for @T[5..8], i.e. %ymm12..%ymm15; the Rho/Pi
# and Chi code below overwrites those same registers only after the last
# read of the corresponding Theta value, so the instruction order matters.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

# __KeccakF1600: local (non-.globl) routine that runs 24 rounds over the
# state held in $A00..$A11.
#
#   in/out : $A00, $A01, $A20, $A31, $A21, $A41, $A11 (%ymm0..%ymm6)
#   writes : %eax (round counter, 24 down to 0), %r8 and %r9 (pointers to
#            rhotates_left+96 and rhotates_right+96), %r10 (cursor into
#            iotas, advanced by 32 bytes per round), and all of @T
#
# Each iteration of .Loop_avx2 performs Theta, then Rho + Pi fused with the
# pre-Chi shuffle, then Chi with the post-Chi shuffle, then Iota.  The
# rotation tables are addressed as N*32-96 off the biased pointers; row 0
# is applied to $A20, row 1 to $A01 and rows 2..5 to $A31, $A21, $A41,
# $A11.
#
# Everything between the heredoc markers is emitted verbatim after
# interpolation, which is why `$` immediates and `@function` are written
# with a backslash.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	 vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	 vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	 vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	 vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	 vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	 vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	 vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	 vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	 vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	 vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	 vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	 vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	 vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	 vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	 vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	 vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	  vpermq	\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	  vpandn	@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	 vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	 vpermq		\$0b10001101,$A41,$A41
	 vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
# Registers carrying the arguments of the two public entry points:
# state pointer, input pointer, byte count and block size.  %rdi, %rsi,
# %rdx and %rcx are the first four integer-argument registers of the
# System V AMD64 calling convention.  SHA3_squeeze reuses the second
# register as its output pointer, hence the $out alias.
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

# SHA3_absorb, part 1: prologue and the head of the per-block loop.
#
# - Saves the caller's %rsp in %r11, reserves 240 bytes and aligns the
#   stack pointer down to 32 bytes; the aligned area is the "transfer
#   area" addressed through %r10 (= %rsp+96).
# - Adds 96 to the state and input pointers, so every later displacement
#   is written as offset-96.
# - Loads the state: the quadword at offset 0 is broadcast into $A00, and
#   the six following 32-byte rows (starting at offset 8) are loaded
#   unaligned into $A01, $A20, $A31, $A21, $A41, $A11.
# - Zeroes rows 2..6 of the transfer area; these are the rows XORed into
#   $A20..$A11 after each block has been copied in.
# - .Loop_absorb_avx2 subtracts one block from $len and leaves through
#   .Ldone_absorb_avx2 on borrow.  Otherwise the first input quadword is
#   broadcast into @T[0], the next four are loaded into @T[1], and %eax is
#   set to $bsz/8-4 as the countdown for the unrolled per-lane copies
#   generated next.
# NOTE(review): lanes 0..4 are read unconditionally, so this looks like it
# expects $bsz to be at least 40 bytes - confirm against callers.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx2

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
# Unrolled copy of input lanes 5..24 into the stack transfer area.  Every
# copy is preceded by a countdown check, so the emitted code stops at
# .Labsorved_avx2 as soon as the block's last lane has been moved.  The
# destination is the lane's slot in the register-ordered (@A_jagged)
# layout.
foreach my $lane (5 .. 24) {
    my $slot = $A_jagged[$lane];
    $code .= <<___;
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*$lane-96($inp),%r8
	mov	%r8,$slot-96(%r10)
___
}
# SHA3_absorb, part 2, followed by the prologue of SHA3_squeeze.
#
# .Labsorved_avx2: advances the input pointer by one block, XORs the
# block into the state (lane 0 from @T[0], lanes 1..4 from @T[1], the rest
# from rows 2..6 of the stack transfer area), runs the permutation,
# re-derives %r10 (the permutation uses it as its round-constant cursor)
# and loops.
#
# .Ldone_absorb_avx2: writes the state back in the same flat layout it
# was loaded from, restores %rsp from %r11 and returns $len+$bsz, i.e. the
# number of input bytes that did not make up a whole block.
#
# SHA3_squeeze prologue: biases the state pointer by 96, converts $bsz
# from bytes to 64-bit lanes, loads the state into $A00..$A11 (needed only
# if more output than one block is requested) and primes %r8 with the
# first lane.
#
# The first lane is A[0][0], i.e. $A_jagged[0].  The index is spelled out
# because no loop variable is in scope at this point; the emitted operand
# is "0-96($A_flat)".
$code.=<<___;
.Labsorved_avx2:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	$A_jagged[0]-96($A_flat),%r8
___
# Unrolled squeeze: one copy per 64-bit lane, in canonical 5*y+x order.
# On entry to each copy %r8 already holds the lane to be written.
#
#   sub/jc  - fewer than 8 bytes are wanted: finish byte-wise in the tail
#   mov/lea - store the lane and advance the output pointer
#   je      - exactly 8 bytes were wanted (flags still come from the sub,
#             since mov and lea leave them untouched): done
#   dec/je  - the block is exhausted: permute the state and start over
#   mov     - otherwise preload the next lane
#
# Lanes other than A[0][0] sit 24 bytes lower in memory than their
# @A_jagged offset says (A[0][0] takes 8 bytes in memory but a whole
# 32-byte row in the jagged numbering); together with the +96 bias on
# $A_flat that gives the -120 displacement.
for my $i (0 .. 24) {
    $code .= <<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
___
    # There is no lane after the 25th.  Indexing past the end of
    # @A_jagged would interpolate an empty offset and emit a load from 24
    # bytes before the state, whose result nothing would ever read:
    # execution falls through to .Lextend_output_avx2, and the loop head
    # reloads %r8 from lane 0.
    $code .= <<___ if $i < 24;
	mov	$A_jagged[$i+1]-120($A_flat),%r8
___
}
# Remainder of SHA3_squeeze, followed by the constant tables.
#
# .Lextend_output_avx2: the current block has been fully emitted but more
# output is wanted - permute the state, write it back to memory in the
# flat layout, reload the per-block lane counter and resume at the loop
# head.
#
# .Ltail_squeeze_avx2: fewer than 8 bytes remain; undo the speculative
# subtraction of 8 and store the low bytes of %r8 one at a time.
#
# rhotates_left / rhotates_right: per-lane rotation counts, one 32-byte
# row per state register; each right-hand entry is 64 minus the matching
# left-hand one, so a variable left shift ORed with a variable right shift
# makes a rotate.  Row 0 belongs to $A20 and row 1 to $A01 (see how
# __KeccakF1600 indexes them); the trailing comments give the lane order
# inside each row, lowest quadword first.
#
# iotas: the 24 round constants, each replicated four times so that one
# 32-byte row can be XORed into the broadcast $A00 register.
$code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2


.Ltail_squeeze_avx2:
	add	\$8,$len
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___
478e1051a39Sopenharmony_ci
# Emit the generated assembly.  The last command-line argument, when
# present and true, names the output file and STDOUT is redirected to it;
# otherwise the code goes to the inherited STDOUT.
#
# The three-argument open takes the file name literally (no mode or pipe
# characters are interpreted from it), and a failed open is fatal rather
# than letting the code silently go to the wrong stream.
$output = pop;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";
482