#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA1 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
# Skylake	(8.70	+5.00=13.7)/n	3.64	4.20		+145%
# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 8.00+4.44=12.4;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 30% to 100% (on Haswell);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 targets need the extra %xmm6-%xmm15 save/restore sequences that
# the generator emits under "if ($win64)".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator, first next to this script and
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

# $avx ends up as 0, 1 or 2 depending on the version reported by the
# assembler/compiler probes below; a non-zero value enables the
# _avx_shortcut dispatch emitted in sha1_multi_block.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything subsequently printed to STDOUT is piped through the
# translator, which writes the final assembly to $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha1_multi_block (
#	struct {	unsigned int A[8];
#			unsigned int B[8];
#			unsigned int C[8];
#			unsigned int D[8];
#			unsigned int E[8];	} *ctx,
#	struct {	void *ptr; int blocks;	} inp[8],
#	int num);		/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";
$inp_elm_size=2*$ptr_size;

@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
@Xi=map("%xmm$_",(10..14));
$K="%xmm15";

if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]
    @Xi=map("%xmm$_",(0..4));
    ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
    @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
}

$REG_SZ=16;

# Xi_off(index) returns the memory operand of the stack slot holding
# message-schedule word X[index mod 16]. Slots are $REG_SZ bytes wide;
# the first 256 bytes are addressed off %rax (biased by -128), anything
# beyond that off %rbx. With $REG_SZ==16 the largest offset is 240, so
# only the %rax form is produced here.
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

# BODY_00_19(i,a,b,c,d,e) appends the code for round i (0..19, Ch
# function) to $code. For i<15 it also gathers and byte-swaps input
# words from the four streams; from i==15 on it applies Xupdate. The
# @Xi register window is rotated by one on exit.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;

# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
# of 4 words you would expect to be loaded per given iteration one is
# spilled to next iteration. In other words indices in four input
# streams are distributed as following:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
	movd		(@ptr[0]),@Xi[0]
	lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
$code.=<<___ if ($i<14);			# just load input
	movd		`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i==14);			# just load input
	movd		`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
$code.=<<___ if ($i>=15);			# apply Xupdate
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	psrld	\$31,$tx
	paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# BODY_20_39(i,a,b,c,d,e) appends the code for a Parity round to $code.
# It is used for rounds 20..39 and again for 60..79. Xupdate is emitted
# for every round but the last (i==79), and the store of X[i] back to
# the stack is emitted only while i<72. @Xi is rotated by one on exit.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___ if ($i<79);
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
$code.=<<___ if ($i<72);
	movdqa	@Xi[0],`&Xi_off($i)`
___
$code.=<<___ if ($i<79);
	paddd	@Xi[0],$e			# e+=X[i]
	pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
$code.=<<___ if ($i==79);
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# BODY_40_59(i,a,b,c,d,e) appends the code for round i (40..59, Maj
# function) including Xupdate to $code, then rotates @Xi by one.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

# The trailing "rol" comment used to interpolate the non-existent @X
# array and rendered as "rol(,1)"; it now names @Xi[1] as intended.
$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# Entry point: dispatch to the SHA-extension or AVX variants when the
# capability bits say so, otherwise fall through to the SSSE3 code.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Each push is annotated with the register actually pushed. The second
# annotation used to name %rbx again, which left %rbp's save slot
# undescribed in the unwind information.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per-lane setup: load each stream's pointer and block count, track the
# maximum count in $num, and redirect lanes with no blocks to $Tbl.
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___
# 80 rounds; @V is rotated after every round so that the round bodies
# always see the working variables in (a,b,c,d,e) order.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.="	movdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
526e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 527e1051a39Sopenharmony_ci.Lepilogue: 528e1051a39Sopenharmony_ci ret 529e1051a39Sopenharmony_ci.cfi_endproc 530e1051a39Sopenharmony_ci.size sha1_multi_block,.-sha1_multi_block 531e1051a39Sopenharmony_ci___ 532e1051a39Sopenharmony_ci {{{ 533e1051a39Sopenharmony_cimy ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10)); 534e1051a39Sopenharmony_cimy @MSG0=map("%xmm$_",(4..7)); 535e1051a39Sopenharmony_cimy @MSG1=map("%xmm$_",(11..14)); 536e1051a39Sopenharmony_ci 537e1051a39Sopenharmony_ci$code.=<<___; 538e1051a39Sopenharmony_ci.type sha1_multi_block_shaext,\@function,3 539e1051a39Sopenharmony_ci.align 32 540e1051a39Sopenharmony_cisha1_multi_block_shaext: 541e1051a39Sopenharmony_ci.cfi_startproc 542e1051a39Sopenharmony_ci_shaext_shortcut: 543e1051a39Sopenharmony_ci mov %rsp,%rax 544e1051a39Sopenharmony_ci.cfi_def_cfa_register %rax 545e1051a39Sopenharmony_ci push %rbx 546e1051a39Sopenharmony_ci.cfi_push %rbx 547e1051a39Sopenharmony_ci push %rbp 548e1051a39Sopenharmony_ci.cfi_push %rbp 549e1051a39Sopenharmony_ci___ 550e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 551e1051a39Sopenharmony_ci lea -0xa8(%rsp),%rsp 552e1051a39Sopenharmony_ci movaps %xmm6,(%rsp) 553e1051a39Sopenharmony_ci movaps %xmm7,0x10(%rsp) 554e1051a39Sopenharmony_ci movaps %xmm8,0x20(%rsp) 555e1051a39Sopenharmony_ci movaps %xmm9,0x30(%rsp) 556e1051a39Sopenharmony_ci movaps %xmm10,-0x78(%rax) 557e1051a39Sopenharmony_ci movaps %xmm11,-0x68(%rax) 558e1051a39Sopenharmony_ci movaps %xmm12,-0x58(%rax) 559e1051a39Sopenharmony_ci movaps %xmm13,-0x48(%rax) 560e1051a39Sopenharmony_ci movaps %xmm14,-0x38(%rax) 561e1051a39Sopenharmony_ci movaps %xmm15,-0x28(%rax) 562e1051a39Sopenharmony_ci___ 563e1051a39Sopenharmony_ci$code.=<<___; 564e1051a39Sopenharmony_ci sub \$`$REG_SZ*18`,%rsp 565e1051a39Sopenharmony_ci shl \$1,$num # we process pair at a time 566e1051a39Sopenharmony_ci and \$-256,%rsp 567e1051a39Sopenharmony_ci lea 0x40($ctx),$ctx # size 
optimization 568e1051a39Sopenharmony_ci mov %rax,`$REG_SZ*17`(%rsp) # original %rsp 569e1051a39Sopenharmony_ci.Lbody_shaext: 570e1051a39Sopenharmony_ci lea `$REG_SZ*16`(%rsp),%rbx 571e1051a39Sopenharmony_ci movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap 572e1051a39Sopenharmony_ci 573e1051a39Sopenharmony_ci.Loop_grande_shaext: 574e1051a39Sopenharmony_ci mov $num,`$REG_SZ*17+8`(%rsp) # original $num 575e1051a39Sopenharmony_ci xor $num,$num 576e1051a39Sopenharmony_ci___ 577e1051a39Sopenharmony_cifor($i=0;$i<2;$i++) { 578e1051a39Sopenharmony_ci $ptr_reg=&pointer_register($flavour,@ptr[$i]); 579e1051a39Sopenharmony_ci $code.=<<___; 580e1051a39Sopenharmony_ci # input pointer 581e1051a39Sopenharmony_ci mov `$inp_elm_size*$i+0`($inp),$ptr_reg 582e1051a39Sopenharmony_ci # number of blocks 583e1051a39Sopenharmony_ci mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx 584e1051a39Sopenharmony_ci cmp $num,%ecx 585e1051a39Sopenharmony_ci cmovg %ecx,$num # find maximum 586e1051a39Sopenharmony_ci test %ecx,%ecx 587e1051a39Sopenharmony_ci mov %ecx,`4*$i`(%rbx) # initialize counters 588e1051a39Sopenharmony_ci cmovle %rsp,@ptr[$i] # cancel input 589e1051a39Sopenharmony_ci___ 590e1051a39Sopenharmony_ci} 591e1051a39Sopenharmony_ci$code.=<<___; 592e1051a39Sopenharmony_ci test $num,$num 593e1051a39Sopenharmony_ci jz .Ldone_shaext 594e1051a39Sopenharmony_ci 595e1051a39Sopenharmony_ci movq 0x00-0x40($ctx),$ABCD0 # a1.a0 596e1051a39Sopenharmony_ci movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 597e1051a39Sopenharmony_ci movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 598e1051a39Sopenharmony_ci movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 599e1051a39Sopenharmony_ci movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 600e1051a39Sopenharmony_ci 601e1051a39Sopenharmony_ci punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 602e1051a39Sopenharmony_ci punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 603e1051a39Sopenharmony_ci 604e1051a39Sopenharmony_ci movdqa $ABCD0,$ABCD1 605e1051a39Sopenharmony_ci punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 
606e1051a39Sopenharmony_ci punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 607e1051a39Sopenharmony_ci 608e1051a39Sopenharmony_ci pshufd \$0b00111111,@MSG0[3],$E0 609e1051a39Sopenharmony_ci pshufd \$0b01111111,@MSG0[3],$E1 610e1051a39Sopenharmony_ci pshufd \$0b00011011,$ABCD0,$ABCD0 611e1051a39Sopenharmony_ci pshufd \$0b00011011,$ABCD1,$ABCD1 612e1051a39Sopenharmony_ci jmp .Loop_shaext 613e1051a39Sopenharmony_ci 614e1051a39Sopenharmony_ci.align 32 615e1051a39Sopenharmony_ci.Loop_shaext: 616e1051a39Sopenharmony_ci movdqu 0x00(@ptr[0]),@MSG0[0] 617e1051a39Sopenharmony_ci movdqu 0x00(@ptr[1]),@MSG1[0] 618e1051a39Sopenharmony_ci movdqu 0x10(@ptr[0]),@MSG0[1] 619e1051a39Sopenharmony_ci movdqu 0x10(@ptr[1]),@MSG1[1] 620e1051a39Sopenharmony_ci movdqu 0x20(@ptr[0]),@MSG0[2] 621e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG0[0] 622e1051a39Sopenharmony_ci movdqu 0x20(@ptr[1]),@MSG1[2] 623e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG1[0] 624e1051a39Sopenharmony_ci movdqu 0x30(@ptr[0]),@MSG0[3] 625e1051a39Sopenharmony_ci lea 0x40(@ptr[0]),@ptr[0] 626e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG0[1] 627e1051a39Sopenharmony_ci movdqu 0x30(@ptr[1]),@MSG1[3] 628e1051a39Sopenharmony_ci lea 0x40(@ptr[1]),@ptr[1] 629e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG1[1] 630e1051a39Sopenharmony_ci 631e1051a39Sopenharmony_ci movdqa $E0,0x50(%rsp) # offload 632e1051a39Sopenharmony_ci paddd @MSG0[0],$E0 633e1051a39Sopenharmony_ci movdqa $E1,0x70(%rsp) 634e1051a39Sopenharmony_ci paddd @MSG1[0],$E1 635e1051a39Sopenharmony_ci movdqa $ABCD0,0x40(%rsp) # offload 636e1051a39Sopenharmony_ci movdqa $ABCD0,$E0_ 637e1051a39Sopenharmony_ci movdqa $ABCD1,0x60(%rsp) 638e1051a39Sopenharmony_ci movdqa $ABCD1,$E1_ 639e1051a39Sopenharmony_ci sha1rnds4 \$0,$E0,$ABCD0 # 0-3 640e1051a39Sopenharmony_ci sha1nexte @MSG0[1],$E0_ 641e1051a39Sopenharmony_ci sha1rnds4 \$0,$E1,$ABCD1 # 0-3 642e1051a39Sopenharmony_ci sha1nexte @MSG1[1],$E1_ 643e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG0[2] 644e1051a39Sopenharmony_ci prefetcht0 127(@ptr[0]) 
645e1051a39Sopenharmony_ci sha1msg1 @MSG0[1],@MSG0[0] 646e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG1[2] 647e1051a39Sopenharmony_ci prefetcht0 127(@ptr[1]) 648e1051a39Sopenharmony_ci sha1msg1 @MSG1[1],@MSG1[0] 649e1051a39Sopenharmony_ci 650e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG0[3] 651e1051a39Sopenharmony_ci movdqa $ABCD0,$E0 652e1051a39Sopenharmony_ci pshufb $BSWAP,@MSG1[3] 653e1051a39Sopenharmony_ci movdqa $ABCD1,$E1 654e1051a39Sopenharmony_ci sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 655e1051a39Sopenharmony_ci sha1nexte @MSG0[2],$E0 656e1051a39Sopenharmony_ci sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 657e1051a39Sopenharmony_ci sha1nexte @MSG1[2],$E1 658e1051a39Sopenharmony_ci pxor @MSG0[2],@MSG0[0] 659e1051a39Sopenharmony_ci sha1msg1 @MSG0[2],@MSG0[1] 660e1051a39Sopenharmony_ci pxor @MSG1[2],@MSG1[0] 661e1051a39Sopenharmony_ci sha1msg1 @MSG1[2],@MSG1[1] 662e1051a39Sopenharmony_ci___ 663e1051a39Sopenharmony_cifor($i=2;$i<20-4;$i++) { 664e1051a39Sopenharmony_ci$code.=<<___; 665e1051a39Sopenharmony_ci movdqa $ABCD0,$E0_ 666e1051a39Sopenharmony_ci movdqa $ABCD1,$E1_ 667e1051a39Sopenharmony_ci sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 668e1051a39Sopenharmony_ci sha1nexte @MSG0[3],$E0_ 669e1051a39Sopenharmony_ci sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 670e1051a39Sopenharmony_ci sha1nexte @MSG1[3],$E1_ 671e1051a39Sopenharmony_ci sha1msg2 @MSG0[3],@MSG0[0] 672e1051a39Sopenharmony_ci sha1msg2 @MSG1[3],@MSG1[0] 673e1051a39Sopenharmony_ci pxor @MSG0[3],@MSG0[1] 674e1051a39Sopenharmony_ci sha1msg1 @MSG0[3],@MSG0[2] 675e1051a39Sopenharmony_ci pxor @MSG1[3],@MSG1[1] 676e1051a39Sopenharmony_ci sha1msg1 @MSG1[3],@MSG1[2] 677e1051a39Sopenharmony_ci___ 678e1051a39Sopenharmony_ci ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); 679e1051a39Sopenharmony_ci push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); 680e1051a39Sopenharmony_ci} 681e1051a39Sopenharmony_ci$code.=<<___; 682e1051a39Sopenharmony_ci movdqa $ABCD0,$E0_ 683e1051a39Sopenharmony_ci movdqa $ABCD1,$E1_ 684e1051a39Sopenharmony_ci sha1rnds4 
\$3,$E0,$ABCD0 # 64-67 685e1051a39Sopenharmony_ci sha1nexte @MSG0[3],$E0_ 686e1051a39Sopenharmony_ci sha1rnds4 \$3,$E1,$ABCD1 # 64-67 687e1051a39Sopenharmony_ci sha1nexte @MSG1[3],$E1_ 688e1051a39Sopenharmony_ci sha1msg2 @MSG0[3],@MSG0[0] 689e1051a39Sopenharmony_ci sha1msg2 @MSG1[3],@MSG1[0] 690e1051a39Sopenharmony_ci pxor @MSG0[3],@MSG0[1] 691e1051a39Sopenharmony_ci pxor @MSG1[3],@MSG1[1] 692e1051a39Sopenharmony_ci 693e1051a39Sopenharmony_ci mov \$1,%ecx 694e1051a39Sopenharmony_ci pxor @MSG0[2],@MSG0[2] # zero 695e1051a39Sopenharmony_ci cmp 4*0(%rbx),%ecx # examine counters 696e1051a39Sopenharmony_ci cmovge %rsp,@ptr[0] # cancel input 697e1051a39Sopenharmony_ci 698e1051a39Sopenharmony_ci movdqa $ABCD0,$E0 699e1051a39Sopenharmony_ci movdqa $ABCD1,$E1 700e1051a39Sopenharmony_ci sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 701e1051a39Sopenharmony_ci sha1nexte @MSG0[0],$E0 702e1051a39Sopenharmony_ci sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 703e1051a39Sopenharmony_ci sha1nexte @MSG1[0],$E1 704e1051a39Sopenharmony_ci sha1msg2 @MSG0[0],@MSG0[1] 705e1051a39Sopenharmony_ci sha1msg2 @MSG1[0],@MSG1[1] 706e1051a39Sopenharmony_ci 707e1051a39Sopenharmony_ci cmp 4*1(%rbx),%ecx 708e1051a39Sopenharmony_ci cmovge %rsp,@ptr[1] 709e1051a39Sopenharmony_ci movq (%rbx),@MSG0[0] # pull counters 710e1051a39Sopenharmony_ci 711e1051a39Sopenharmony_ci movdqa $ABCD0,$E0_ 712e1051a39Sopenharmony_ci movdqa $ABCD1,$E1_ 713e1051a39Sopenharmony_ci sha1rnds4 \$3,$E0,$ABCD0 # 72-75 714e1051a39Sopenharmony_ci sha1nexte @MSG0[1],$E0_ 715e1051a39Sopenharmony_ci sha1rnds4 \$3,$E1,$ABCD1 # 72-75 716e1051a39Sopenharmony_ci sha1nexte @MSG1[1],$E1_ 717e1051a39Sopenharmony_ci 718e1051a39Sopenharmony_ci pshufd \$0x00,@MSG0[0],@MSG1[2] 719e1051a39Sopenharmony_ci pshufd \$0x55,@MSG0[0],@MSG1[3] 720e1051a39Sopenharmony_ci movdqa @MSG0[0],@MSG0[1] 721e1051a39Sopenharmony_ci pcmpgtd @MSG0[2],@MSG1[2] 722e1051a39Sopenharmony_ci pcmpgtd @MSG0[2],@MSG1[3] 723e1051a39Sopenharmony_ci 724e1051a39Sopenharmony_ci movdqa $ABCD0,$E0 
725e1051a39Sopenharmony_ci movdqa $ABCD1,$E1 726e1051a39Sopenharmony_ci sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 727e1051a39Sopenharmony_ci sha1nexte $MSG0[2],$E0 728e1051a39Sopenharmony_ci sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 729e1051a39Sopenharmony_ci sha1nexte $MSG0[2],$E1 730e1051a39Sopenharmony_ci 731e1051a39Sopenharmony_ci pcmpgtd @MSG0[2],@MSG0[1] # counter mask 732e1051a39Sopenharmony_ci pand @MSG1[2],$ABCD0 733e1051a39Sopenharmony_ci pand @MSG1[2],$E0 734e1051a39Sopenharmony_ci pand @MSG1[3],$ABCD1 735e1051a39Sopenharmony_ci pand @MSG1[3],$E1 736e1051a39Sopenharmony_ci paddd @MSG0[1],@MSG0[0] # counters-- 737e1051a39Sopenharmony_ci 738e1051a39Sopenharmony_ci paddd 0x40(%rsp),$ABCD0 739e1051a39Sopenharmony_ci paddd 0x50(%rsp),$E0 740e1051a39Sopenharmony_ci paddd 0x60(%rsp),$ABCD1 741e1051a39Sopenharmony_ci paddd 0x70(%rsp),$E1 742e1051a39Sopenharmony_ci 743e1051a39Sopenharmony_ci movq @MSG0[0],(%rbx) # save counters 744e1051a39Sopenharmony_ci dec $num 745e1051a39Sopenharmony_ci jnz .Loop_shaext 746e1051a39Sopenharmony_ci 747e1051a39Sopenharmony_ci mov `$REG_SZ*17+8`(%rsp),$num 748e1051a39Sopenharmony_ci 749e1051a39Sopenharmony_ci pshufd \$0b00011011,$ABCD0,$ABCD0 750e1051a39Sopenharmony_ci pshufd \$0b00011011,$ABCD1,$ABCD1 751e1051a39Sopenharmony_ci 752e1051a39Sopenharmony_ci movdqa $ABCD0,@MSG0[0] 753e1051a39Sopenharmony_ci punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 754e1051a39Sopenharmony_ci punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 755e1051a39Sopenharmony_ci punpckhdq $E1,$E0 # e1.e0.xx.xx 756e1051a39Sopenharmony_ci movq $ABCD0,0x00-0x40($ctx) # a1.a0 757e1051a39Sopenharmony_ci psrldq \$8,$ABCD0 758e1051a39Sopenharmony_ci movq @MSG0[0],0x40-0x40($ctx)# c1.c0 759e1051a39Sopenharmony_ci psrldq \$8,@MSG0[0] 760e1051a39Sopenharmony_ci movq $ABCD0,0x20-0x40($ctx) # b1.b0 761e1051a39Sopenharmony_ci psrldq \$8,$E0 762e1051a39Sopenharmony_ci movq @MSG0[0],0x60-0x40($ctx)# d1.d0 763e1051a39Sopenharmony_ci movq $E0,0x80-0x40($ctx) # e1.e0 764e1051a39Sopenharmony_ci 
lea	`$REG_SZ/2`($ctx),$ctx
	lea	`$inp_elm_size*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
						}}}

						if ($avx) {{{
# Emit one SHA-1 round for rounds 0..19 (Ch function, constant in $K) of
# the AVX/AVX2 multi-buffer code.
#
#   $i              round number
#   $a,$b,$c,$d,$e  registers holding the five working variables
#
# While $i<15 the round is interleaved with gathering upcoming input words
# from the per-lane pointers in @ptr (4 lanes when $REG_SZ==16, 8 lanes
# when $REG_SZ==32) and byte-swapping them with the mask held in $tx.
# From $i>=15 on the message-schedule update ("Xupdate") is emitted
# instead; it uses $tx as a scratch register, which is why the callers
# reload the byte-swap mask at the bottom of their loops.
# @Xi is rotated by one position on exit.
sub BODY_00_19_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $k=$i+2;
# instruction that merges the two half-gathered registers, and the lane
# pointer that feeds the second of them
my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
my $ptr_n = $REG_SZ==16 ? $ptr[1] : $ptr[4];

# Round 0, 4 lanes: gather word 0 of every lane, step each input pointer
# past the 64-byte block and start gathering word 1.
$code.=<<___ if ($i==0 && $REG_SZ==16);
	vmovd	(@ptr[0]),@Xi[0]
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	(@ptr[1]),@Xi[2]	# borrow Xi[2]
	lea	`16*4`(@ptr[1]),@ptr[1]
	vpinsrd	\$1,(@ptr[2]),@Xi[0],@Xi[0]
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,(@ptr[3]),@Xi[2],@Xi[2]
	lea	`16*4`(@ptr[3]),@ptr[3]
	vmovd	`4*$j-16*4`(@ptr[0]),@Xi[1]
	vpunpckldq	@Xi[2],@Xi[0],@Xi[0]
	vmovd	`4*$j-16*4`($ptr_n),$t3
	vpshufb	$tx,@Xi[0],@Xi[0]
___
$code.=<<___ if ($i<15 && $REG_SZ==16);		# just load input
	vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
___
# Round 0, 8 lanes: same as above, with lanes 4..7 collected in the
# borrowed @Xi[2] and merged in with vinserti128.
$code.=<<___ if ($i==0 && $REG_SZ==32);
	vmovd	(@ptr[0]),@Xi[0]
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	(@ptr[4]),@Xi[2]	# borrow Xi[2]
	lea	`16*4`(@ptr[4]),@ptr[4]
	vmovd	(@ptr[1]),$t2
	lea	`16*4`(@ptr[1]),@ptr[1]
	vmovd	(@ptr[5]),$t1
	lea	`16*4`(@ptr[5]),@ptr[5]
	vpinsrd	\$1,(@ptr[2]),@Xi[0],@Xi[0]
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,(@ptr[6]),@Xi[2],@Xi[2]
	lea	`16*4`(@ptr[6]),@ptr[6]
	vpinsrd	\$1,(@ptr[3]),$t2,$t2
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,@Xi[0],@Xi[0]
	vpinsrd	\$1,(@ptr[7]),$t1,$t1
	lea	`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t1,@Xi[2],@Xi[2]
	vmovd	`4*$j-16*4`(@ptr[0]),@Xi[1]
	vinserti128	@Xi[2],@Xi[0],@Xi[0]
	vmovd	`4*$j-16*4`($ptr_n),$t3
	vpshufb	$tx,@Xi[0],@Xi[0]
___
$code.=<<___ if ($i<15 && $REG_SZ==32);		# just load input
	vmovd	`4*$j-16*4`(@ptr[1]),$t2
	vmovd	`4*$j-16*4`(@ptr[5]),$t1
	vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	vpinsrd	\$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
	vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,@Xi[1],@Xi[1]
	vpinsrd	\$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
	vpunpckldq	$t1,$t3,$t3
___
# Rounds 0..13: round arithmetic interleaved with finishing word $j and
# starting the gather of word $k.
$code.=<<___ if ($i<14);
	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	$vpack	$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	vmovd	`4*$k-16*4`(@ptr[0]),@Xi[2]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vmovd	`4*$k-16*4`($ptr_n),$t3
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Round 14: no further word to start gathering; the freed slots carry
# prefetches of the first four input streams instead.
$code.=<<___ if ($i==14);
	vpaddd	$K,$e,$e			# e+=K_00_19
	prefetcht0	63(@ptr[0])
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	$vpack	$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	prefetcht0	63(@ptr[1])
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	prefetcht0	63(@ptr[2])
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	prefetcht0	63(@ptr[3])
	vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# preload "X[2]"
___
# Rounds 15..19: round arithmetic interleaved with the schedule update
# of X[$j]; in the 8-lane build round 15 also prefetches streams 4..7.
$code.=<<___ if ($i>=15);			# apply Xupdate
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	`"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	vpxor	@Xi[3],@Xi[1],@Xi[1]
	`"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
	`"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
	vpsrld	\$31,@Xi[1],$tx
	vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	`"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$tx,@Xi[1],@Xi[1]		# rol	\$1,@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# Emit one SHA-1 round using the Parity function.  The callers below use
# it for rounds 20..39 and, with a different constant in $K, for rounds
# 60..79.
#
#   $i              round number
#   $a,$b,$c,$d,$e  registers holding the five working variables
#
# Rounds before 79 also compute the next schedule word; X[i] is written
# back to the stack only for $i<72 (presumably because later words are
# never read back - confirm against Xi_off()).  Round 79 is the bare round.
# @Xi is rotated by one position on exit.
sub BODY_20_39_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___ if ($i<79);
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e			# e+=K_20_39
	vpxor	$b,$d,$t0
___
$code.=<<___ if ($i<72);
	vmovdqa	@Xi[0],`&Xi_off($i)`
___
$code.=<<___ if ($i<79);
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$c,$t0,$t0			# Parity(b,c,d)
	vpxor	@Xi[3],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
	vpsrld	\$31,@Xi[1],$tx
	vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
$code.=<<___ if ($i==79);
	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e			# e+=K_20_39
	vpxor	$b,$d,$t0

	vpsrld	\$27,$a,$t3
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpxor	$c,$t0,$t0			# Parity(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# Emit one SHA-1 round for rounds 40..59 (Maj function, accumulated into
# $e as (c&d) plus ((c^d)&b)), together with the schedule update of the
# next word.
#
#   $i              round number
#   $a,$b,$c,$d,$e  registers holding the five working variables
#
# Note that X[i] is stored with vmovdqu here, whereas the other two round
# bodies use vmovdqa.  @Xi is rotated by one position on exit.
sub BODY_40_59_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

$code.=<<___;
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_40_59
	vpslld	\$5,$a,$t2
	vpand	$c,$d,$t1
	vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]

	vpaddd	$t1,$e,$e
	vpsrld	\$27,$a,$t3
	vpxor	$c,$d,$t0
	vpxor	@Xi[3],@Xi[1],@Xi[1]

	vmovdqu	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpsrld	\$31,@Xi[1],$tx
	vpand	$b,$t0,$t0
	vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpaddd	$t0,$e,$e			# e+=Maj(b,d,c)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$tx,@Xi[1],@Xi[1]		# rol(@X[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));
}

# ---- sha1_multi_block_avx: 128-bit path, four lanes per pass ----------
$code.=<<___;
.type	sha1_multi_block_avx,\@function,3
.align	32
sha1_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
# Only when the 256-bit code is generated as well: with $num of at least 2
# and bit 5 set in the upper half of %rcx, jump to the AVX2 entry point.
# NOTE(review): %rcx is loaded before _avx_shortcut, outside this section;
# bit 5 is presumably the AVX2 capability flag - confirm.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: save %xmm6-%xmm15 below the two pushed registers; the
# epilogue reloads them from the same %rax-relative offsets.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame layout, relative to the 256-byte aligned %rsp:
#   $REG_SZ*16    per-lane block counters (%rbx points here)
#   $REG_SZ*17    caller's %rsp
#   $REG_SZ*17+8  outer-loop count
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

	vzeroupper
.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: load the input pointer and block count, keep the largest count
# in $num, record the count as the lane's counter, and point lanes without
# any blocks at the constant table so that their loads stay harmless.
# pointer_register() is defined elsewhere in the file.
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,$ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00($ctx),$A			# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	jmp	.Loop_avx

.align	32
.Loop_avx:
___
# 80 rounds; @V is rotated after every round so that each round body sees
# the working variables in a,b,c,d,e order.
$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	\$1,%ecx
___
# Lanes whose counter is 1 or less have no further block after this one:
# redirect their input pointer to the constant table.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Build a per-lane mask from the counters (all-ones where the counter is
# positive), decrement the live counters, zero the state of finished lanes
# and add the previous context, so finished lanes keep their old digest.
# The byte-swap mask is reloaded because rounds 15..79 used $tx as scratch.
$code.=<<___;
	vmovdqu	(%rbx),$t0			# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd	$t2,$t1,$t1		# mask value
	vpaddd	$t1,$t0,$t0			# counters--

	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)

	vmovdqu	$t0,(%rbx)			# save counters
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_multi_block_avx,.-sha1_multi_block_avx
___

						if ($avx>1) {
# Evaluate every backtick expression accumulated so far before $REG_SZ is
# changed below, so the code already emitted keeps the value it was
# written for.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# ---- sha1_multi_block_avx2: 256-bit path, eight lanes per pass --------
$REG_SZ=32;

@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
@Xi=map("%ymm$_",(10..14));
$K="%ymm15";

$code.=<<___;
.type	sha1_multi_block_avx2,\@function,3
.align	32
sha1_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 below the six pushed registers; the
# epilogue reloads them from the same %rax-relative offsets.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Same frame layout as the 128-bit path.  %rbx points at the counters
# only while they are being set up or examined; inside the round loop it
# holds 256+128(%rsp) (presumably a second base register for Xi_off() -
# confirm against its definition).
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K_XX_XX(%rip),$Tbl
	shr	\$1,$num

	vzeroupper
.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Per-lane setup, as in the 128-bit path but for eight lanes.
for($i=0;$i<8;$i++) {
    $ptr_reg=&pointer_register($flavour,$ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00($ctx),$A			# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	lea	256+128(%rsp),%rbx
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
___
# 80 rounds, emitted by the same round bodies with $REG_SZ==32.
$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Counter mask, counter decrement and masked context update, as in the
# 128-bit path.  The outer "grande" loop is commented out in the emitted
# code, so this function makes a single pass over eight lanes.
$code.=<<___;
	vmovdqu	(%rbx),$t0			# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd	$t2,$t1,$t1		# mask value
	vpaddd	$t1,$t0,$t0			# counters--

	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)

	vmovdqu	$t0,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
						}			}}}
# Constant table.  Every row is 32 bytes and the K_XX_XX label sits on the
# K_20_39 row, so relative to the label K_00_19 is at -0x20, K_40_59 at
# 0x20, K_60_79 at 0x40 and the byte-swap mask at 0x60 - the displacements
# the round loops above use with $Tbl.
$code.=<<___;

.align	256
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
K_XX_XX:
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
1399e1051a39Sopenharmony_ci$context="%r8"; 1400e1051a39Sopenharmony_ci$disp="%r9"; 1401e1051a39Sopenharmony_ci 1402e1051a39Sopenharmony_ci$code.=<<___; 1403e1051a39Sopenharmony_ci.extern __imp_RtlVirtualUnwind 1404e1051a39Sopenharmony_ci.type se_handler,\@abi-omnipotent 1405e1051a39Sopenharmony_ci.align 16 1406e1051a39Sopenharmony_cise_handler: 1407e1051a39Sopenharmony_ci push %rsi 1408e1051a39Sopenharmony_ci push %rdi 1409e1051a39Sopenharmony_ci push %rbx 1410e1051a39Sopenharmony_ci push %rbp 1411e1051a39Sopenharmony_ci push %r12 1412e1051a39Sopenharmony_ci push %r13 1413e1051a39Sopenharmony_ci push %r14 1414e1051a39Sopenharmony_ci push %r15 1415e1051a39Sopenharmony_ci pushfq 1416e1051a39Sopenharmony_ci sub \$64,%rsp 1417e1051a39Sopenharmony_ci 1418e1051a39Sopenharmony_ci mov 120($context),%rax # pull context->Rax 1419e1051a39Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 1420e1051a39Sopenharmony_ci 1421e1051a39Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 1422e1051a39Sopenharmony_ci mov 56($disp),%r11 # disp->HandlerData 1423e1051a39Sopenharmony_ci 1424e1051a39Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 1425e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # end of prologue label 1426e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<.Lbody 1427e1051a39Sopenharmony_ci jb .Lin_prologue 1428e1051a39Sopenharmony_ci 1429e1051a39Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 1430e1051a39Sopenharmony_ci 1431e1051a39Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 1432e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 1433e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip>=.Lepilogue 1434e1051a39Sopenharmony_ci jae .Lin_prologue 1435e1051a39Sopenharmony_ci 1436e1051a39Sopenharmony_ci mov `16*17`(%rax),%rax # pull saved stack pointer 1437e1051a39Sopenharmony_ci 1438e1051a39Sopenharmony_ci mov -8(%rax),%rbx 1439e1051a39Sopenharmony_ci mov -16(%rax),%rbp 1440e1051a39Sopenharmony_ci mov %rbx,144($context) # restore 
context->Rbx 1441e1051a39Sopenharmony_ci mov %rbp,160($context) # restore context->Rbp 1442e1051a39Sopenharmony_ci 1443e1051a39Sopenharmony_ci lea -24-10*16(%rax),%rsi 1444e1051a39Sopenharmony_ci lea 512($context),%rdi # &context.Xmm6 1445e1051a39Sopenharmony_ci mov \$20,%ecx 1446e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 1447e1051a39Sopenharmony_ci 1448e1051a39Sopenharmony_ci.Lin_prologue: 1449e1051a39Sopenharmony_ci mov 8(%rax),%rdi 1450e1051a39Sopenharmony_ci mov 16(%rax),%rsi 1451e1051a39Sopenharmony_ci mov %rax,152($context) # restore context->Rsp 1452e1051a39Sopenharmony_ci mov %rsi,168($context) # restore context->Rsi 1453e1051a39Sopenharmony_ci mov %rdi,176($context) # restore context->Rdi 1454e1051a39Sopenharmony_ci 1455e1051a39Sopenharmony_ci mov 40($disp),%rdi # disp->ContextRecord 1456e1051a39Sopenharmony_ci mov $context,%rsi # context 1457e1051a39Sopenharmony_ci mov \$154,%ecx # sizeof(CONTEXT) 1458e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 1459e1051a39Sopenharmony_ci 1460e1051a39Sopenharmony_ci mov $disp,%rsi 1461e1051a39Sopenharmony_ci xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1462e1051a39Sopenharmony_ci mov 8(%rsi),%rdx # arg2, disp->ImageBase 1463e1051a39Sopenharmony_ci mov 0(%rsi),%r8 # arg3, disp->ControlPc 1464e1051a39Sopenharmony_ci mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1465e1051a39Sopenharmony_ci mov 40(%rsi),%r10 # disp->ContextRecord 1466e1051a39Sopenharmony_ci lea 56(%rsi),%r11 # &disp->HandlerData 1467e1051a39Sopenharmony_ci lea 24(%rsi),%r12 # &disp->EstablisherFrame 1468e1051a39Sopenharmony_ci mov %r10,32(%rsp) # arg5 1469e1051a39Sopenharmony_ci mov %r11,40(%rsp) # arg6 1470e1051a39Sopenharmony_ci mov %r12,48(%rsp) # arg7 1471e1051a39Sopenharmony_ci mov %rcx,56(%rsp) # arg8, (NULL) 1472e1051a39Sopenharmony_ci call *__imp_RtlVirtualUnwind(%rip) 1473e1051a39Sopenharmony_ci 1474e1051a39Sopenharmony_ci mov \$1,%eax # ExceptionContinueSearch 1475e1051a39Sopenharmony_ci add \$64,%rsp 
1476e1051a39Sopenharmony_ci popfq 1477e1051a39Sopenharmony_ci pop %r15 1478e1051a39Sopenharmony_ci pop %r14 1479e1051a39Sopenharmony_ci pop %r13 1480e1051a39Sopenharmony_ci pop %r12 1481e1051a39Sopenharmony_ci pop %rbp 1482e1051a39Sopenharmony_ci pop %rbx 1483e1051a39Sopenharmony_ci pop %rdi 1484e1051a39Sopenharmony_ci pop %rsi 1485e1051a39Sopenharmony_ci ret 1486e1051a39Sopenharmony_ci.size se_handler,.-se_handler 1487e1051a39Sopenharmony_ci___ 1488e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 1489e1051a39Sopenharmony_ci.type avx2_handler,\@abi-omnipotent 1490e1051a39Sopenharmony_ci.align 16 1491e1051a39Sopenharmony_ciavx2_handler: 1492e1051a39Sopenharmony_ci push %rsi 1493e1051a39Sopenharmony_ci push %rdi 1494e1051a39Sopenharmony_ci push %rbx 1495e1051a39Sopenharmony_ci push %rbp 1496e1051a39Sopenharmony_ci push %r12 1497e1051a39Sopenharmony_ci push %r13 1498e1051a39Sopenharmony_ci push %r14 1499e1051a39Sopenharmony_ci push %r15 1500e1051a39Sopenharmony_ci pushfq 1501e1051a39Sopenharmony_ci sub \$64,%rsp 1502e1051a39Sopenharmony_ci 1503e1051a39Sopenharmony_ci mov 120($context),%rax # pull context->Rax 1504e1051a39Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 1505e1051a39Sopenharmony_ci 1506e1051a39Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 1507e1051a39Sopenharmony_ci mov 56($disp),%r11 # disp->HandlerData 1508e1051a39Sopenharmony_ci 1509e1051a39Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 1510e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # end of prologue label 1511e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<body label 1512e1051a39Sopenharmony_ci jb .Lin_prologue 1513e1051a39Sopenharmony_ci 1514e1051a39Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 1515e1051a39Sopenharmony_ci 1516e1051a39Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 1517e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 1518e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip>=epilogue label 1519e1051a39Sopenharmony_ci jae 
.Lin_prologue 1520e1051a39Sopenharmony_ci 1521e1051a39Sopenharmony_ci mov `32*17`($context),%rax # pull saved stack pointer 1522e1051a39Sopenharmony_ci 1523e1051a39Sopenharmony_ci mov -8(%rax),%rbx 1524e1051a39Sopenharmony_ci mov -16(%rax),%rbp 1525e1051a39Sopenharmony_ci mov -24(%rax),%r12 1526e1051a39Sopenharmony_ci mov -32(%rax),%r13 1527e1051a39Sopenharmony_ci mov -40(%rax),%r14 1528e1051a39Sopenharmony_ci mov -48(%rax),%r15 1529e1051a39Sopenharmony_ci mov %rbx,144($context) # restore context->Rbx 1530e1051a39Sopenharmony_ci mov %rbp,160($context) # restore context->Rbp 1531e1051a39Sopenharmony_ci mov %r12,216($context) # restore context->R12 1532e1051a39Sopenharmony_ci mov %r13,224($context) # restore context->R13 1533e1051a39Sopenharmony_ci mov %r14,232($context) # restore context->R14 1534e1051a39Sopenharmony_ci mov %r15,240($context) # restore context->R15 1535e1051a39Sopenharmony_ci 1536e1051a39Sopenharmony_ci lea -56-10*16(%rax),%rsi 1537e1051a39Sopenharmony_ci lea 512($context),%rdi # &context.Xmm6 1538e1051a39Sopenharmony_ci mov \$20,%ecx 1539e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 1540e1051a39Sopenharmony_ci 1541e1051a39Sopenharmony_ci jmp .Lin_prologue 1542e1051a39Sopenharmony_ci.size avx2_handler,.-avx2_handler 1543e1051a39Sopenharmony_ci___ 1544e1051a39Sopenharmony_ci$code.=<<___; 1545e1051a39Sopenharmony_ci.section .pdata 1546e1051a39Sopenharmony_ci.align 4 1547e1051a39Sopenharmony_ci .rva .LSEH_begin_sha1_multi_block 1548e1051a39Sopenharmony_ci .rva .LSEH_end_sha1_multi_block 1549e1051a39Sopenharmony_ci .rva .LSEH_info_sha1_multi_block 1550e1051a39Sopenharmony_ci .rva .LSEH_begin_sha1_multi_block_shaext 1551e1051a39Sopenharmony_ci .rva .LSEH_end_sha1_multi_block_shaext 1552e1051a39Sopenharmony_ci .rva .LSEH_info_sha1_multi_block_shaext 1553e1051a39Sopenharmony_ci___ 1554e1051a39Sopenharmony_ci$code.=<<___ if ($avx); 1555e1051a39Sopenharmony_ci .rva .LSEH_begin_sha1_multi_block_avx 1556e1051a39Sopenharmony_ci .rva 
.LSEH_end_sha1_multi_block_avx 1557e1051a39Sopenharmony_ci .rva .LSEH_info_sha1_multi_block_avx 1558e1051a39Sopenharmony_ci___ 1559e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 1560e1051a39Sopenharmony_ci .rva .LSEH_begin_sha1_multi_block_avx2 1561e1051a39Sopenharmony_ci .rva .LSEH_end_sha1_multi_block_avx2 1562e1051a39Sopenharmony_ci .rva .LSEH_info_sha1_multi_block_avx2 1563e1051a39Sopenharmony_ci___ 1564e1051a39Sopenharmony_ci$code.=<<___; 1565e1051a39Sopenharmony_ci.section .xdata 1566e1051a39Sopenharmony_ci.align 8 1567e1051a39Sopenharmony_ci.LSEH_info_sha1_multi_block: 1568e1051a39Sopenharmony_ci .byte 9,0,0,0 1569e1051a39Sopenharmony_ci .rva se_handler 1570e1051a39Sopenharmony_ci .rva .Lbody,.Lepilogue # HandlerData[] 1571e1051a39Sopenharmony_ci.LSEH_info_sha1_multi_block_shaext: 1572e1051a39Sopenharmony_ci .byte 9,0,0,0 1573e1051a39Sopenharmony_ci .rva se_handler 1574e1051a39Sopenharmony_ci .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] 1575e1051a39Sopenharmony_ci___ 1576e1051a39Sopenharmony_ci$code.=<<___ if ($avx); 1577e1051a39Sopenharmony_ci.LSEH_info_sha1_multi_block_avx: 1578e1051a39Sopenharmony_ci .byte 9,0,0,0 1579e1051a39Sopenharmony_ci .rva se_handler 1580e1051a39Sopenharmony_ci .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] 1581e1051a39Sopenharmony_ci___ 1582e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 1583e1051a39Sopenharmony_ci.LSEH_info_sha1_multi_block_avx2: 1584e1051a39Sopenharmony_ci .byte 9,0,0,0 1585e1051a39Sopenharmony_ci .rva avx2_handler 1586e1051a39Sopenharmony_ci .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] 1587e1051a39Sopenharmony_ci___ 1588e1051a39Sopenharmony_ci} 1589e1051a39Sopenharmony_ci#################################################################### 1590e1051a39Sopenharmony_ci 1591e1051a39Sopenharmony_cisub rex { 1592e1051a39Sopenharmony_ci local *opcode=shift; 1593e1051a39Sopenharmony_ci my ($dst,$src)=@_; 1594e1051a39Sopenharmony_ci my $rex=0; 1595e1051a39Sopenharmony_ci 1596e1051a39Sopenharmony_ci 
$rex|=0x04 if ($dst>=8); 1597e1051a39Sopenharmony_ci $rex|=0x01 if ($src>=8); 1598e1051a39Sopenharmony_ci unshift @opcode,$rex|0x40 if ($rex); 1599e1051a39Sopenharmony_ci} 1600e1051a39Sopenharmony_ci 1601e1051a39Sopenharmony_cisub sha1rnds4 { 1602e1051a39Sopenharmony_ci if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1603e1051a39Sopenharmony_ci my @opcode=(0x0f,0x3a,0xcc); 1604e1051a39Sopenharmony_ci rex(\@opcode,$3,$2); 1605e1051a39Sopenharmony_ci push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 1606e1051a39Sopenharmony_ci my $c=$1; 1607e1051a39Sopenharmony_ci push @opcode,$c=~/^0/?oct($c):$c; 1608e1051a39Sopenharmony_ci return ".byte\t".join(',',@opcode); 1609e1051a39Sopenharmony_ci } else { 1610e1051a39Sopenharmony_ci return "sha1rnds4\t".@_[0]; 1611e1051a39Sopenharmony_ci } 1612e1051a39Sopenharmony_ci} 1613e1051a39Sopenharmony_ci 1614e1051a39Sopenharmony_cisub sha1op38 { 1615e1051a39Sopenharmony_ci my $instr = shift; 1616e1051a39Sopenharmony_ci my %opcodelet = ( 1617e1051a39Sopenharmony_ci "sha1nexte" => 0xc8, 1618e1051a39Sopenharmony_ci "sha1msg1" => 0xc9, 1619e1051a39Sopenharmony_ci "sha1msg2" => 0xca ); 1620e1051a39Sopenharmony_ci 1621e1051a39Sopenharmony_ci if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1622e1051a39Sopenharmony_ci my @opcode=(0x0f,0x38); 1623e1051a39Sopenharmony_ci rex(\@opcode,$2,$1); 1624e1051a39Sopenharmony_ci push @opcode,$opcodelet{$instr}; 1625e1051a39Sopenharmony_ci push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1626e1051a39Sopenharmony_ci return ".byte\t".join(',',@opcode); 1627e1051a39Sopenharmony_ci } else { 1628e1051a39Sopenharmony_ci return $instr."\t".@_[0]; 1629e1051a39Sopenharmony_ci } 1630e1051a39Sopenharmony_ci} 1631e1051a39Sopenharmony_ci 1632e1051a39Sopenharmony_ciforeach (split("\n",$code)) { 1633e1051a39Sopenharmony_ci s/\`([^\`]*)\`/eval($1)/ge; 1634e1051a39Sopenharmony_ci 1635e1051a39Sopenharmony_ci s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or 1636e1051a39Sopenharmony_ci 
s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or 1637e1051a39Sopenharmony_ci 1638e1051a39Sopenharmony_ci s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1639e1051a39Sopenharmony_ci s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1640e1051a39Sopenharmony_ci s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or 1641e1051a39Sopenharmony_ci s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1642e1051a39Sopenharmony_ci s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or 1643e1051a39Sopenharmony_ci s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1644e1051a39Sopenharmony_ci 1645e1051a39Sopenharmony_ci print $_,"\n"; 1646e1051a39Sopenharmony_ci} 1647e1051a39Sopenharmony_ci 1648e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!"; 1649