1e1051a39Sopenharmony_ci#!/usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci#
9e1051a39Sopenharmony_ci# ====================================================================
10e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
12e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
13e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
14e1051a39Sopenharmony_ci# ====================================================================
15e1051a39Sopenharmony_ci#
16e1051a39Sopenharmony_ci# Keccak-1600 for AVX-512F.
17e1051a39Sopenharmony_ci#
18e1051a39Sopenharmony_ci# July 2017.
19e1051a39Sopenharmony_ci#
20e1051a39Sopenharmony_ci# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
21e1051a39Sopenharmony_ci# Pretty straightforward, the only "magic" is data layout in registers.
22e1051a39Sopenharmony_ci# It's impossible to have one that is optimal for every step, hence
23e1051a39Sopenharmony_ci# it's changing as algorithm progresses. Data is saved in linear order,
24e1051a39Sopenharmony_ci# but in-register order morphs between rounds. Even rounds take in
# linear layout, and odd rounds - transposed, or "vertically-shaped"...
26e1051a39Sopenharmony_ci#
27e1051a39Sopenharmony_ci########################################################################
28e1051a39Sopenharmony_ci# Numbers are cycles per processed byte out of large message.
29e1051a39Sopenharmony_ci#
30e1051a39Sopenharmony_ci#			r=1088(*)
31e1051a39Sopenharmony_ci#
32e1051a39Sopenharmony_ci# Knights Landing	7.6
33e1051a39Sopenharmony_ci# Skylake-X		5.7
34e1051a39Sopenharmony_ci#
35e1051a39Sopenharmony_ci# (*)	Corresponds to SHA3-256.
36e1051a39Sopenharmony_ci
37e1051a39Sopenharmony_ci########################################################################
38e1051a39Sopenharmony_ci# Below code is combination of two ideas. One is taken from Keccak Code
39e1051a39Sopenharmony_ci# Package, hereafter KCP, and another one from initial version of this
40e1051a39Sopenharmony_ci# module. What is common is observation that Pi's input and output are
41e1051a39Sopenharmony_ci# "mostly transposed", i.e. if input is aligned by x coordinate, then
42e1051a39Sopenharmony_ci# output is [mostly] aligned by y. Both versions, KCP and predecessor,
43e1051a39Sopenharmony_ci# were trying to use one of them from round to round, which resulted in
44e1051a39Sopenharmony_ci# some kind of transposition in each round. This version still does
45e1051a39Sopenharmony_ci# transpose data, but only every second round. Another essential factor
46e1051a39Sopenharmony_ci# is that KCP transposition has to be performed with instructions that
# turned out to be rather expensive on Knights Landing, both latency- and
48e1051a39Sopenharmony_ci# throughput-wise. Not to mention that some of them have to depend on
49e1051a39Sopenharmony_ci# each other. On the other hand initial version of this module was
50e1051a39Sopenharmony_ci# relying heavily on blend instructions. There were lots of them,
51e1051a39Sopenharmony_ci# resulting in higher instruction count, yet it performed better on
52e1051a39Sopenharmony_ci# Knights Landing, because processor can execute pair of them each
53e1051a39Sopenharmony_ci# cycle and they have minimal latency. This module is an attempt to
54e1051a39Sopenharmony_ci# bring best parts together:-)
55e1051a39Sopenharmony_ci#
56e1051a39Sopenharmony_ci# Coordinates below correspond to those in sha/keccak1600.c. Input
57e1051a39Sopenharmony_ci# layout is straight linear:
58e1051a39Sopenharmony_ci#
59e1051a39Sopenharmony_ci# [0][4] [0][3] [0][2] [0][1] [0][0]
60e1051a39Sopenharmony_ci# [1][4] [1][3] [1][2] [1][1] [1][0]
61e1051a39Sopenharmony_ci# [2][4] [2][3] [2][2] [2][1] [2][0]
62e1051a39Sopenharmony_ci# [3][4] [3][3] [3][2] [3][1] [3][0]
63e1051a39Sopenharmony_ci# [4][4] [4][3] [4][2] [4][1] [4][0]
64e1051a39Sopenharmony_ci#
65e1051a39Sopenharmony_ci# It's perfect for Theta, while Pi is reduced to intra-register
66e1051a39Sopenharmony_ci# permutations which yield layout perfect for Chi:
67e1051a39Sopenharmony_ci#
68e1051a39Sopenharmony_ci# [4][0] [3][0] [2][0] [1][0] [0][0]
69e1051a39Sopenharmony_ci# [4][1] [3][1] [2][1] [1][1] [0][1]
70e1051a39Sopenharmony_ci# [4][2] [3][2] [2][2] [1][2] [0][2]
71e1051a39Sopenharmony_ci# [4][3] [3][3] [2][3] [1][3] [0][3]
72e1051a39Sopenharmony_ci# [4][4] [3][4] [2][4] [1][4] [0][4]
73e1051a39Sopenharmony_ci#
74e1051a39Sopenharmony_ci# Now instead of performing full transposition and feeding it to next
75e1051a39Sopenharmony_ci# identical round, we perform kind of diagonal transposition to layout
76e1051a39Sopenharmony_ci# from initial version of this module, and make it suitable for Theta:
77e1051a39Sopenharmony_ci#
78e1051a39Sopenharmony_ci# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
79e1051a39Sopenharmony_ci# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
80e1051a39Sopenharmony_ci# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
81e1051a39Sopenharmony_ci# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
82e1051a39Sopenharmony_ci# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
83e1051a39Sopenharmony_ci#
84e1051a39Sopenharmony_ci# Now intra-register permutations yield initial [almost] straight
85e1051a39Sopenharmony_ci# linear layout:
86e1051a39Sopenharmony_ci#
87e1051a39Sopenharmony_ci# [4][4] [3][3] [2][2] [1][1] [0][0]
88e1051a39Sopenharmony_ci##[0][4] [0][3] [0][2] [0][1] [0][0]
89e1051a39Sopenharmony_ci# [3][4] [2][3] [1][2] [0][1] [4][0]
90e1051a39Sopenharmony_ci##[2][3] [2][2] [2][1] [2][0] [2][4]
91e1051a39Sopenharmony_ci# [2][4] [1][3] [0][2] [4][1] [3][0]
92e1051a39Sopenharmony_ci##[4][2] [4][1] [4][0] [4][4] [4][3]
93e1051a39Sopenharmony_ci# [1][4] [0][3] [4][2] [3][1] [2][0]
94e1051a39Sopenharmony_ci##[1][1] [1][0] [1][4] [1][3] [1][2]
95e1051a39Sopenharmony_ci# [0][4] [4][3] [3][2] [2][1] [1][0]
96e1051a39Sopenharmony_ci##[3][0] [3][4] [3][3] [3][2] [3][1]
97e1051a39Sopenharmony_ci#
98e1051a39Sopenharmony_ci# This means that odd round Chi is performed in less suitable layout,
# with a number of additional permutations. But overall it turned out to be
# a win. Permutations are the fastest possible on Knights Landing and they
# are laid out to be independent of each other. In essence I traded
102e1051a39Sopenharmony_ci# 20 blend instructions for 3 permutations. The result is 13% faster
103e1051a39Sopenharmony_ci# than KCP on Skylake-X, and >40% on Knights Landing.
104e1051a39Sopenharmony_ci#
105e1051a39Sopenharmony_ci# As implied, data is loaded in straight linear order. Digits in
106e1051a39Sopenharmony_ci# variables' names represent coordinates of right-most element of
107e1051a39Sopenharmony_ci# loaded data chunk:
108e1051a39Sopenharmony_ci
# Register names holding the five state rows.  Elements are listed
# right-to-left, i.e. the right-most coordinate sits in the lowest lane:
#
#   $A00: [0][4] [0][3] [0][2] [0][1] [0][0]
#   $A10: [1][4] [1][3] [1][2] [1][1] [1][0]
#   $A20: [2][4] [2][3] [2][2] [2][1] [2][0]
#   $A30: [3][4] [3][3] [3][2] [3][1] [3][0]
#   $A40: [4][4] [4][3] [4][2] [4][1] [4][0]
my ($A00, $A10, $A20, $A30, $A40) = map { "%zmm$_" } 0 .. 4;
115e1051a39Sopenharmony_ci
# Byte offset of every state element inside the on-stack transfer area,
# listed in linear input order ([0][0], [0][1], ... [4][4]).  A row starts
# every 8 quadwords (64 bytes) and each element within it takes 8 bytes.
my @A_jagged = map {
    my $row = $_;
    map { 8 * ($row * 8 + $_) } 0 .. 4;
} 0 .. 4;
124e1051a39Sopenharmony_ci
# Scratch registers used inside the round function.
my @T = map { "%zmm$_" } 5 .. 12;

# Permutation-index registers.  Element 0 is deliberately an invalid
# register name (not a typo): every instruction that would use it is
# commented out in the generated code.
my @Theta = map { "%zmm$_" } 33, 13 .. 16;

# Pi permutation indices and per-lane rotation counts for the even
# (@Rhotate0) and odd (@Rhotate1) rounds.
my @Pi0      = map { "%zmm$_" } 17 .. 21;
my @Rhotate0 = map { "%zmm$_" } 22 .. 26;
my @Rhotate1 = map { "%zmm$_" } 27 .. 31;

# $C00/$D00 are aliases for the first two scratch registers.
my ($C00, $D00) = ($T[0], $T[1]);

# Opmask registers: one per single lane plus one covering all five lanes.
my ($k00001, $k00010, $k00100, $k01000, $k10000, $k11111) =
    map { "%k$_" } 1 .. 6;
133e1051a39Sopenharmony_ci
# __KeccakF1600: internal round routine, emitted as AT&T-syntax assembly.
#
# It works purely on registers.  The state is held in $A00..$A40, @T is
# scratch, and the index/rotation registers (@Theta[1..4], @Pi0, @Rhotate0,
# @Rhotate1) as well as the single-lane opmasks are expected to have been
# loaded by the caller (see the load sequences in SHA3_absorb and
# SHA3_squeeze further down).
#
# Each pass through .Loop_avx512 performs two rounds: an "even" round, a
# "Harmonize rounds" re-layout, then an "odd" round.  %eax counts 12
# passes, i.e. 24 rounds in total.  %r10 walks the iotas table and is
# advanced by 16 bytes (two round constants) per pass; the odd round
# picks up its constant at -8(%r10).
#
# Registers written: %eax, %r10, @T[0..7] and the state registers.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	 vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	 vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	 vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	 vpermq		$A00,@Theta[4],@T[5]
	 vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
278e1051a39Sopenharmony_ci
# Registers in which SHA3_absorb / SHA3_squeeze take their arguments:
# state pointer, input (or output) pointer, byte length, block size.
my ($A_flat, $inp, $len, $bsz) = qw(%rdi %rsi %rdx %rcx);

# SHA3_squeeze writes through the same register SHA3_absorb reads from.
my $out = $inp;
281e1051a39Sopenharmony_ci
# SHA3_absorb, first part: prologue and head of the per-block loop.
#
# Prologue:
#  - saves the incoming %rsp in %r11 and carves out a 64-byte-aligned
#    320-byte transfer area (five 64-byte rows); %r9 points 128 bytes
#    into it, so rows are addressed as 64*n-128(%r9);
#  - advances $A_flat and $inp by 96; all later accesses compensate with
#    a -96 displacement (presumably to keep displacements short - TODO
#    confirm);
#  - builds the opmasks: kxnorw yields all ones, shifting right by 15
#    leaves bit 0 ($k00001), shifting right by 11 leaves the low five
#    bits ($k11111), and the remaining single-lane masks are $k00001
#    shifted left by 1..4;
#  - loads the index/rotation registers from consecutive 64-byte rows
#    starting at theta_perm (row 0 is skipped, @Theta[0] is never used);
#  - loads the five 40-byte state rows with zero-masking under $k11111
#    and zeroes the transfer area.
#
# Loop head: %rax = $bsz, then $len -= $bsz; a borrow means less than a
# full block is left and control goes to .Ldone_absorb_avx512.  Otherwise
# %eax becomes $bsz/8, the number of 64-bit words the unrolled copy that
# follows has to move.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-320(%rsp),%rsp
	and	\$-64,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512

	shr		\$3,%eax
___
# Fully unrolled copy of up to 25 input quadwords into the transfer area.
# Quadword N is read from 8*N-96($inp) and stored at its row-padded slot
# $A_jagged[N]; after every word %eax is decremented and the sequence is
# left as soon as it reaches zero.
for my $qword (0 .. 24) {
    $code .= <<___;
	mov	8*$qword-96($inp),%r8
	mov	%r8,$A_jagged[$qword]-128(%r9)
	dec	%eax
	jz	.Labsorved_avx512
___
}
# Remainder of SHA3_absorb, all of SHA3_squeeze, and the constant tables.
#
# SHA3_absorb (continued): .Labsorved_avx512 is where the unrolled copy
# above exits.  $inp is advanced by $bsz, the transfer area is XORed into
# the state registers, __KeccakF1600 is called, and the loop restarts.
# At .Ldone_absorb_avx512 the state is written back under $k11111 (five
# quadwords per row), %rsp is restored from %r11 and %rax is set to
# $len+$bsz, which undoes the subtraction that borrowed at the loop head.
#
# SHA3_squeeze: uses the same argument registers, with $out aliasing
# $inp.  Only when $len exceeds $bsz are the masks, index/rotation
# registers and state loaded, because only then can __KeccakF1600 be
# reached.  $bsz is converted to a quadword count; output is copied eight
# bytes at a time straight from the state in memory (%r9).  Once a whole
# block has been emitted with output still pending, __KeccakF1600 is
# called, the state is stored back and copying restarts at the beginning
# of the state.  A final chunk shorter than 8 bytes is copied with
# "rep movsb".
#
# Tables: every row of theta_perm, rhotates1, rhotates0 and pi0_perm is
# eight quadwords (64 bytes).  The load sequences address all of them as
# 64*n(%r8) relative to theta_perm, so their order and row counts must
# not change.  iotas holds the 24 round constants consumed by
# __KeccakF1600.
$code.=<<___;
.Labsorved_avx512:
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

	jmp	.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov	$out,%rdi
	mov	%r9,%rsi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,	62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	 0,  1, 62, 28, 27, 0, 0, 0
	.quad	36, 44,  6, 55, 20, 0, 0, 0
	.quad	 3, 10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21,  8, 0, 0, 0
	.quad	18,  2, 61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7


iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___
547e1051a39Sopenharmony_ci
# Emit the generated assembly.
#
# The last command-line argument, if present and true, names the output
# file and STDOUT is redirected to it; otherwise the code is written to
# the inherited STDOUT.
$output = pop;
if ($output && $output ne '-') {
    # Three-argument open: the name is taken literally instead of being
    # scanned for mode characters, and a failure to create the file is
    # fatal rather than silently sending the assembly to the inherited
    # STDOUT.  "-" is skipped because the two-argument form treated ">-"
    # as "standard output", i.e. no redirection.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";
551