#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward, the only "magic" is data layout in registers.
# It's impossible to have one that is optimal for every step, hence
# it's changing as algorithm progresses. Data is saved in linear order,
# but in-register order morphs between rounds. Even rounds take in
# linear layout, and odd rounds - transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.

########################################################################
# Below code is combination of two ideas. One is taken from Keccak Code
# Package, hereafter KCP, and another one from initial version of this
# module. What is common is observation that Pi's input and output are
# "mostly transposed", i.e. if input is aligned by x coordinate, then
# output is [mostly] aligned by y. Both versions, KCP and predecessor,
# were trying to use one of them from round to round, which resulted in
# some kind of transposition in each round. This version still does
# transpose data, but only every second round. Another essential factor
# is that KCP transposition has to be performed with instructions that
# turned to be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand initial version of this module was
# relying heavily on blend instructions. There were lots of them,
# resulting in higher instruction count, yet it performed better on
# Knights Landing, because processor can execute pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring best parts together:-)
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
#	[0][4] [0][3] [0][2] [0][1] [0][0]
#	[1][4] [1][3] [1][2] [1][1] [1][0]
#	[2][4] [2][3] [2][2] [2][1] [2][0]
#	[3][4] [3][3] [3][2] [3][1] [3][0]
#	[4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
#	[4][0] [3][0] [2][0] [1][0] [0][0]
#	[4][1] [3][1] [2][1] [1][1] [0][1]
#	[4][2] [3][2] [2][2] [1][2] [0][2]
#	[4][3] [3][3] [2][3] [1][3] [0][3]
#	[4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now instead of performing full transposition and feeding it to next
# identical round, we perform kind of diagonal transposition to layout
# from initial version of this module, and make it suitable for Theta:
#
#	[4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
#	[4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
#	[4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
#	[4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
#	[4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Now intra-register permutations yield initial [almost] straight
# linear layout:
#
#	[4][4] [3][3] [2][2] [1][1] [0][0]
##	[0][4] [0][3] [0][2] [0][1] [0][0]
#	[3][4] [2][3] [1][2] [0][1] [4][0]
##	[2][3] [2][2] [2][1] [2][0] [2][4]
#	[2][4] [1][3] [0][2] [4][1] [3][0]
##	[4][2] [4][1] [4][0] [4][4] [4][3]
#	[1][4] [0][3] [4][2] [3][1] [2][0]
##	[1][1] [1][0] [1][4] [1][3] [1][2]
#	[0][4] [4][3] [3][2] [2][1] [1][0]
##	[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that odd round Chi is performed in less suitable layout,
# with a number of additional permutations. But overall it turned to be
# a win. Permutations are fastest possible on Knights Landing and they
# are laid down to be independent of each other. In the essence I traded
# 20 blend instructions for 3 permutations. The result is 13% faster
# than KCP on Skylake-X, and >40% on Knights Landing.
#
# As implied, data is loaded in straight linear order. Digits in
# variables' names represent coordinates of right-most element of
# loaded data chunk:

my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
    map("%zmm$_",(0..4));

# We also need to map the magic order into offsets within structure:

my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
		[1,0], [1,1], [1,2], [1,3], [1,4],
		[2,0], [2,1], [2,2], [2,3], [2,4],
		[3,0], [3,1], [3,2], [3,3], [3,4],
		[4,0], [4,1], [4,2], [4,3], [4,4]);
   # Each row of five lanes is given an 8-quadword (64-byte) slot, so a
   # row can later be picked up with one 64-byte memory operand.
   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear

# Register aliases interpolated into the assembly templates below.
my @T = map("%zmm$_",(5..12));
my @Theta = map("%zmm$_",(33,13..16));	# invalid @Theta[0] is not typo
my @Pi0 = map("%zmm$_",(17..21));
my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));

my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));

# __KeccakF1600: internal permutation routine. It runs 12 iterations of
# an even+odd round pair (24 rounds total) on the state held in
# $A00..$A40, and expects the mask registers and the Theta/Rhotate/Pi0
# tables to have been loaded by the caller. Clobbers %rax, %r10 and @T.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	 vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	 vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	 vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	 vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	 vpermq		$A00,@Theta[4],@T[5]
	 vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___

# Argument registers of the two public entry points below: %rdi, %rsi,
# %rdx and %rcx are the first four integer argument registers of the
# System V AMD64 calling convention.
#
# SHA3_absorb($A_flat, $inp, $len, $bsz)
#	Loads the state from $A_flat, then for as long as at least $bsz
#	bytes of input remain: copies $bsz/8 quadwords into a zeroed,
#	64-byte aligned transfer area on the stack (at @A_jagged offsets),
#	XORs the area into the state and calls __KeccakF1600. Finally the
#	state is written back and %rax returns the number of input bytes
#	that were left unprocessed.
#
# SHA3_squeeze($A_flat, $out, $len, $bsz)
#	Copies $len bytes from the state at $A_flat to $out. Every time
#	$bsz bytes have been emitted and more are wanted, __KeccakF1600 is
#	called and the state is stored back. The vector set-up is skipped
#	when $len <= $bsz, as no permutation is needed in that case.
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-320(%rsp),%rsp
	and	\$-64,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512

	shr		\$3,%eax
___
# Unrolled copy of up to 25 input quadwords into the transfer area; the
# emitted code leaves the sequence as soon as $bsz/8 of them are moved.
for(my $i=0; $i<25; $i++) {
$code.=<<___;
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-128(%r9)
	dec	%eax
	jz	.Labsorved_avx512
___
}
$code.=<<___;
.Labsorved_avx512:
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

	jmp	.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov	$out,%rdi
	mov	%r9,%rsi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,  62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	0,  1,  62, 28, 27, 0, 0, 0
	.quad	36, 44, 6,  55, 20, 0, 0, 0
	.quad	3,  10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21, 8,  0, 0, 0
	.quad	18, 2,  61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7


iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___

# The last command-line argument, if present (and not a false value),
# names the output file; otherwise the generated assembly is written to
# the inherited STDOUT. Failure to open the file is fatal rather than
# silently falling back to the inherited STDOUT.
$output=pop;
if ($output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";