#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 block transform for x86. September 2007.
#
# Performance improvement over compiler generated code varies from
# 10% to 40% [see below]. Not very impressive on some µ-archs, but
# it's 5 times smaller and optimizes amount of writes.
#
# May 2012.
#
# Optimization including two of Pavel Semjanov's ideas, alternative
# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost
# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not
# on P4, where it kills performance, nor Sandy Bridge, where folded
# loop is approximately as fast...
#
# June 2012.
#
# Add AMD XOP-specific code path, >30% improvement on Bulldozer over
# May version, >60% over original. Add AVX+shrd code path, >25%
# improvement on Sandy Bridge over May version, 60% over original.
#
# May 2013.
#
# Replace AMD XOP code path with SSSE3 to cover more processors.
# (Biggest improvement coefficient is on upcoming Atom Silvermont,
# not shown.) Add AVX+BMI code path.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
#
# Performance in clock cycles per processed byte (less is better):
#
#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
# Pentium	46	57	40/38		-	-
# PIII		36	33	27/24		-	-
# P4		41	38	28		-	17.3
# AMD K8	27	25	19/15.5		-	14.9
# Core2		26	23	18/15.6		14.3	13.8
# Westmere	27	-	19/15.7		13.4	12.3
# Sandy Bridge	25	-	15.9		12.4	11.6
# Ivy Bridge	24	-	15.0		11.4	10.3
# Haswell	22	-	13.9		9.46	7.80
# Skylake	20	-	14.9		9.50	7.70
# Bulldozer	36	-	27/22		17.0	13.6
# VIA Nano	36	-	25/22		16.8	16.5
# Atom		50	-	30/25		21.9	18.9
# Silvermont	40	-	34/31		22.9	20.6
# Goldmont	29	-	20		16.3(***)
#
# (*)	numbers after slash are for unrolled loop, where applicable;
# (**)	x86_64 assembly performance is presented for reference
#	purposes, results are best-available;
# (***)	SHAEXT result is 4.1, strangely enough better than 64-bit one;

# Find the directory this script was started from, so that the shared
# perlasm helper can be loaded either from there or from ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  It is opened
# with three-argument open so the name can never be read as a mode, and
# a failure is fatal instead of silently leaving the generated code on
# whatever STDOUT was inherited.  A false/absent argument skips the
# redirection, exactly as before.
$output=pop and
	(open STDOUT,">",$output or die "can't open $output: $!");

# First argument is handed to asm_init unchanged; the second is true when
# the last remaining argument is the literal "386".
# NOTE(review): $i386, tested throughout this file, is never assigned
# here - presumably asm_init (x86asm.pl) sets it from that flag; confirm.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# $xmm: set when -DOPENSSL_IA32_SSE2 appears anywhere on the command line.
# $avx: 0, 1 or 2 - the number of version thresholds met by the first
#	assembler/compiler probe below that recognises its tool.
$xmm=$avx=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

# Probe 1: GNU assembler, reached through the compiler driver.
if ($xmm &&	`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# Probe 2: NASM, only for the "win32n" flavour.
if ($xmm && !$avx && $ARGV[0] eq "win32n" &&
		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.03) + ($1>=2.10);
}

# Probe 3: Microsoft ml, only for the "win32" flavour.
if ($xmm && !$avx && $ARGV[0] eq "win32" &&
		`ml 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# Probe 4: clang/LLVM, identified from the compiler's own version banner.
if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# The SHA-extension code path is generated whenever SSE2 code is.
$shaext=$xmm;	### set to zero if compiling for 1.0.1

# Input length in bytes (four 64-byte blocks) at or above which the
# dispatcher jumps to the fully unrolled loop; a false value disables
# generation of the unrolled loop altogether.
$unroll_after = 64*4;	# If pre-evicted from L1P cache first spin of
			# fully unrolled loop was measured to run about
			# 3-4x slower. If slowdown coefficient is N and
			# unrolled loop is m times faster, then you break
			# even at (N-1)/(m-1) blocks. Then it needs to be
			# adjusted for probability of code being evicted,
			# code size/cache size=1/4. Typical m is 1.15...

# Register and stack-slot assignment for the compact (rolled) loop.
# a and e live in registers, $T is the per-round temporary, and the
# remaining working variables are addressed relative to esp.
$A="eax";
$E="edx";
$T="ebx";
$Aoff=&DWP(4,"esp");
$Boff=&DWP(8,"esp");
$Coff=&DWP(12,"esp");
$Doff=&DWP(16,"esp");
$Eoff=&DWP(20,"esp");
$Foff=&DWP(24,"esp");
$Goff=&DWP(28,"esp");
$Hoff=&DWP(32,"esp");
$Xoff=&DWP(36,"esp");	# NOTE(review): not referenced in the code visible here
$K256="ebp";		# pointer into the K256 constant table

# Emit the message-schedule half of one round for rounds 16..63 of the
# compact loop, then the round itself.
#
# Expects X[-15] preloaded in ecx.  Builds T = sigma0(X[-15]) + X[-16] +
# X[-7] and leaves sigma1(X[-2]) half-finished in esi/edi; the final xor,
# the addition into T and the store of the new schedule word are
# interleaved into BODY_00_15(1), called last.
#
# NOTE: both BODY_* subs carry an empty prototype although BODY_00_15
# takes an argument; this works because every call uses the &name(...)
# form, which bypasses prototypes.
sub BODY_16_63() {
	&mov	($T,"ecx");			# "ecx" is preloaded
	&mov	("esi",&DWP(4*(9+15+16-14),"esp"));
	&ror	("ecx",18-7);
	&mov	("edi","esi");
	&ror	("esi",19-17);
	&xor	("ecx",$T);
	&shr	($T,3);
	&ror	("ecx",7);
	&xor	("esi","edi");
	&xor	($T,"ecx");			# T = sigma0(X[-15])
	&ror	("esi",17);
	&add	($T,&DWP(4*(9+15+16),"esp"));	# T += X[-16]
	&shr	("edi",10);
	&add	($T,&DWP(4*(9+15+16-9),"esp"));	# T += X[-7]
	# The three steps below are emitted by BODY_00_15(1) instead:
	#&xor	("edi","esi")			# sigma1(X[-2])
	# &add	($T,"edi");			# T += sigma1(X[-2])
	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]

	&BODY_00_15(1);
}

# Emit one SHA-256 round for the compact loop.
#
#   $in_16_63 - true when called from BODY_16_63: finish sigma1(X[-2]),
#               add it to T, store T as the new schedule word and
#               preload ecx for the following round.  False (rounds
#               0..15): T is loaded from the stack instead.
#
# Each emitted round lowers esp by 4, so the value just stored through
# $Aoff/$Eoff is found at $Boff/$Foff in the next round (and so on down
# the list) without any data being moved.  It also advances $K256 by 4
# and leaves K[i] in esi, which COMPACT_LOOP compares against the last
# constant of each phase to terminate its loops.
sub BODY_00_15() {
    my $in_16_63=shift;

	&mov	("ecx",$E);
	&xor	("edi","esi")			if ($in_16_63);	# sigma1(X[-2])
	&mov	("esi",$Foff);
	&ror	("ecx",25-11);
	&add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
	&mov	("edi",$Goff);
	&xor	("ecx",$E);
	&xor	("esi","edi");
	&mov	($T,&DWP(4*(9+15),"esp"))	if (!$in_16_63);
	&mov	(&DWP(4*(9+15),"esp"),$T)	if ($in_16_63);	# save X[0]
	&ror	("ecx",11-6);
	&and	("esi",$E);
	&mov	($Eoff,$E);			# modulo-scheduled
	&xor	($E,"ecx");
	&add	($T,$Hoff);			# T += h
	&xor	("esi","edi");			# Ch(e,f,g)
	&ror	($E,6);				# Sigma1(e)
	&mov	("ecx",$A);
	&add	($T,"esi");			# T += Ch(e,f,g)

	&ror	("ecx",22-13);
	&add	($T,$E);			# T += Sigma1(e)
	&mov	("edi",$Boff);
	&xor	("ecx",$A);
	&mov	($Aoff,$A);			# modulo-scheduled
	&lea	("esp",&DWP(-4,"esp"));		# slide the frame by one slot
	&ror	("ecx",13-2);
	&mov	("esi",&DWP(0,$K256));
	&xor	("ecx",$A);
	&mov	($E,$Eoff);			# e in next iteration, d in this one
	&xor	($A,"edi");			# a ^= b
	&ror	("ecx",2);			# Sigma0(a)

	&add	($T,"esi");			# T+= K[i]
	&mov	(&DWP(0,"esp"),$A);		# (b^c) in next round
	&add	($E,$T);			# d += T
	&and	($A,&DWP(4,"esp"));		# a &= (b^c)
	&add	($T,"ecx");			# T += Sigma0(a)
	&xor	($A,"edi");			# h = Maj(a,b,c) = Ch(a^b,c,b)
	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"))	if ($in_16_63);	# preload T
	&add	($K256,4);
	&add	($A,$T);			# h += T
}

&external_label("OPENSSL_ia32cap_P")		if (!$i386);

# sha256_block_data_order(ctx, inp, num): prologue and code-path dispatch.
&function_begin("sha256_block_data_order");
	&mov	("esi",wparam(0));	# ctx
	&mov	("edi",wparam(1));	# inp
	&mov	("eax",wparam(2));	# num
	&mov	("ebx","esp");		# saved sp

	&call	(&label("pic_point"));	# make it PIC!
&set_label("pic_point");
	&blindpop($K256);
	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));

	# Carve out a 16-byte, 64-byte-aligned base frame:
	#   0(esp)=ctx  4(esp)=inp  8(esp)=end of input  12(esp)=caller's esp
	&sub	("esp",16);
	&and	("esp",-64);

	&shl	("eax",6);		# num blocks -> bytes (64 per block)
	&add	("eax","edi");
	&mov	(&DWP(0,"esp"),"esi");	# ctx
	&mov	(&DWP(4,"esp"),"edi");	# inp
	&mov	(&DWP(8,"esp"),"eax");	# inp+num*64
	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
						if (!$i386 && $xmm) {
	# Run-time dispatch on the three OPENSSL_ia32cap_P words, loaded
	# into ecx (word 0), ebx (word 1) and edx (word 2).
	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
	&mov	("ecx",&DWP(0,"edx"));
	&mov	("ebx",&DWP(4,"edx"));
	&test	("ecx",1<<20);		# check for P4
	&jnz	(&label("loop"));
	# NOTE(review): the "if ($xmm)" below is always true inside this block.
	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
	&test	("ecx",1<<24);		# check for FXSR
	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
	&and	("ecx",1<<30);		# mask "Intel CPU" bit
	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
	&test	("edx",1<<29)		if ($shaext);	# check for SHA
	&jnz	(&label("shaext"))	if ($shaext);
	&or	("ecx","ebx");
	&and	("ecx",1<<28|1<<30);
	&cmp	("ecx",1<<28|1<<30);	# equal only if both AVX and "Intel CPU"
					if ($xmm) {
	&je	(&label("AVX"))		if ($avx);
	&test	("ebx",1<<9);		# check for SSSE3
	&jnz	(&label("SSSE3"));
					} else {
	# NOTE(review): unreachable at generation time - the enclosing
	# condition already requires $xmm.
	&je	(&label("loop_shrd"));
					}
						if ($unroll_after) {
&set_label("no_xmm");
	&sub	("eax","edi");		# eax = input length in bytes
	&cmp	("eax",$unroll_after);
	&jae	(&label("unrolled"));
						} }
	&jmp	(&label("loop"));

# Emit the compact (rolled) block loop.
#
#   $suffix - appended to every label so the body can be emitted more than
#             once ("" and "_shrd").  A non-empty suffix also passes 32
#             rather than 16 as set_label's second argument (presumably
#             the alignment - confirm against x86asm.pl).
#
# Per block: pushes the 16 byte-swapped input words, reserves 9 more
# dwords, loads the state from ctx (esi), runs 16 + 48 rounds - each of
# which lowers esp by one dword - then adds the result back into ctx and
# releases all 9+16+64 dwords at once.
sub COMPACT_LOOP() {
my $suffix=shift;

&set_label("loop$suffix",$suffix?32:16);
    # copy input block to stack reversing byte and dword order
    for($i=0;$i<4;$i++) {
	&mov	("eax",&DWP($i*16+0,"edi"));
	&mov	("ebx",&DWP($i*16+4,"edi"));
	&mov	("ecx",&DWP($i*16+8,"edi"));
	&bswap	("eax");
	&mov	("edx",&DWP($i*16+12,"edi"));
	&bswap	("ebx");
	&push	("eax");
	&bswap	("ecx");
	&push	("ebx");
	&bswap	("edx");
	&push	("ecx");
	&push	("edx");
    }
	&add	("edi",64);
	&lea	("esp",&DWP(-4*9,"esp"));	# place for A,B,C,D,E,F,G,H
	&mov	(&DWP(4*(9+16)+4,"esp"),"edi");	# advanced inp -> base frame slot 4

	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($A,&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	# &mov	($Aoff,$A);		# a stays in its register
	&mov	($Boff,"ebx");
	&xor	("ebx","ecx");
	&mov	($Coff,"ecx");
	&mov	($Doff,"edi");
	&mov	(&DWP(0,"esp"),"ebx");	# magic: b^c, consumed by the first round's Maj
	&mov	($E,&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("edi",&DWP(28,"esi"));
	# &mov	($Eoff,$E);		# e stays in its register
	&mov	($Foff,"ebx");
	&mov	($Goff,"ecx");
	&mov	($Hoff,"edi");

&set_label("00_15$suffix",16);

	&BODY_00_15();

	&cmp	("esi",0xc19bf174);	# esi holds K[i]; this is K256[15]
	&jne	(&label("00_15$suffix"));

	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"));	# preloaded in BODY_00_15(1)
	&jmp	(&label("16_63$suffix"));

&set_label("16_63$suffix",16);

	&BODY_16_63();

	&cmp	("esi",0xc67178f2);	# esi holds K[i]; this is K256[63]
	&jne	(&label("16_63$suffix"));

	# 64 rounds have moved esp down by 64 dwords; the base frame now
	# sits 9+16+64 dwords above esp.
	&mov	("esi",&DWP(4*(9+16+64)+0,"esp"));	# ctx
	# &mov	($A,$Aoff);		# a is still in its register
	&mov	("ebx",$Boff);
	# &mov	("edi",$Coff);		# c is still in edi
	&mov	("ecx",$Doff);
	&add	($A,&DWP(0,"esi"));
	&add	("ebx",&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$A);
	&mov	(&DWP(4,"esi"),"ebx");
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	# &mov	($E,$Eoff);		# e is still in its register
	&mov	("eax",$Foff);
	&mov	("ebx",$Goff);
	&mov	("ecx",$Hoff);
	&mov	("edi",&DWP(4*(9+16+64)+4,"esp"));	# inp
	&add	($E,&DWP(16,"esi"));
	&add	("eax",&DWP(20,"esi"));
	&add	("ebx",&DWP(24,"esi"));
	&add	("ecx",&DWP(28,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"eax");
	&mov	(&DWP(24,"esi"),"ebx");
	&mov	(&DWP(28,"esi"),"ecx");

	&lea	("esp",&DWP(4*(9+16+64),"esp"));	# destroy frame
	&sub	($K256,4*64);			# rewind K

	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
	&jb	(&label("loop$suffix"));
}
	# Default compact loop, followed by the function epilogue.
	&COMPACT_LOOP();
	&mov	("esp",&DWP(12,"esp"));		# restore sp
&function_end_A();
						if (!$i386 && !$xmm) {
	# ~20% improvement on Sandy Bridge
	# Temporarily redefine ror so that every &ror(reg,n) issued by the
	# loop body is emitted as shrd(reg,reg,n) instead.
	local *ror = sub { &shrd(@_[0],@_) };
	&COMPACT_LOOP("_shrd");
	&mov	("esp",&DWP(12,"esp"));		# restore sp
&function_end_A();
						}

# The 64 SHA-256 round constants.  K256[15] and K256[63] double as the
# loop-termination sentinels in COMPACT_LOOP.
&set_label("K256",64);	# Yes! I keep it in the code segment!
@K256=(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	);
&data_word(@K256);
# 16-byte shuffle mask stored immediately after the 256-byte K256 table
# (i.e. at K256+0x100); the SHA-extension path loads it for pshufb.
&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# byte swap mask
&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

# Working-variable numbers for the unrolled loop, and the mapping from a
# variable number to its stack slot in the current round.
($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
# off(var): slot ((var - $i) mod 8) of the 8-dword state area at esp.
# Because the slot index depends on the round counter, the variables
# rotate through the slots without being copied.  Reads the package
# variable $i, which the enclosing round loop must therefore not
# localise with "my".
sub off { &DWP(4*(((shift)-$i)&7),"esp"); }

if (!$i386 && $unroll_after) {
# a and h swap between these two registers every round.  ebp ($K256) is
# free for this because the unrolled rounds embed K[i] as an immediate.
my @AH=($A,$K256);

# Frame after the lea below:
#   0..31(esp)   working variables a..h (see off())
#   32..95(esp)  16-entry ring buffer of schedule words X[]
#   96(esp)=ctx  100(esp)=inp  104(esp)=end of input  108(esp)=caller's esp
&set_label("unrolled",16);
	&lea	("esp",&DWP(-96,"esp"));
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("ebx",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"ecx");		# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"ebx");
	&mov	($E,&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"ebx");
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	&jmp	(&label("grand_loop"));

&set_label("grand_loop",16);
    # copy input block to stack reversing byte order
    # (five groups of three words, then the sixteenth word on its own)
    for($i=0;$i<5;$i++) {
	&mov	("ebx",&DWP(12*$i+0,"edi"));
	&mov	("ecx",&DWP(12*$i+4,"edi"));
	&bswap	("ebx");
	&mov	("esi",&DWP(12*$i+8,"edi"));
	&bswap	("ecx");
	&mov	(&DWP(32+12*$i+0,"esp"),"ebx");
	&bswap	("esi");
	&mov	(&DWP(32+12*$i+4,"esp"),"ecx");
	&mov	(&DWP(32+12*$i+8,"esp"),"esi");
    }
	&mov	("ebx",&DWP($i*12,"edi"));
	&add	("edi",64);
	&bswap	("ebx");
	&mov	(&DWP(96+4,"esp"),"edi");
	&mov	(&DWP(32+12*$i,"esp"),"ebx");

    # Two scratch registers that trade roles every round.
    my ($t1,$t2) = ("ecx","esi");

    # Emit all 64 rounds back to back.  $i is deliberately the package
    # variable: off() reads it.
    for ($i=0;$i<64;$i++) {

      if ($i>=16) {
	&mov	($T,$t1);			# $t1 is preloaded
	# &mov	($t2,&DWP(32+4*(($i+14)&15),"esp"));
	&ror	($t1,18-7);
	&mov	("edi",$t2);
	&ror	($t2,19-17);
	&xor	($t1,$T);
	&shr	($T,3);
	&ror	($t1,7);
	&xor	($t2,"edi");
	&xor	($T,$t1);			# T = sigma0(X[-15])
	&ror	($t2,17);
	&add	($T,&DWP(32+4*($i&15),"esp"));	# T += X[-16]
	&shr	("edi",10);
	&add	($T,&DWP(32+4*(($i+9)&15),"esp"));	# T += X[-7]
	#&xor	("edi",$t2)			# sigma1(X[-2])
	# &add	($T,"edi");			# T += sigma1(X[-2])
	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
      }
	&mov	($t1,$E);
	&xor	("edi",$t2)			if ($i>=16);	# sigma1(X[-2])
	&mov	($t2,&off($f));
	&ror	($E,25-11);
	&add	($T,"edi")			if ($i>=16);	# T += sigma1(X[-2])
	&mov	("edi",&off($g));
	&xor	($E,$t1);
	&mov	($T,&DWP(32+4*($i&15),"esp"))	if ($i<16);	# X[i]
	&mov	(&DWP(32+4*($i&15),"esp"),$T)	if ($i>=16 && $i<62);	# save X[0]
	&xor	($t2,"edi");
	&ror	($E,11-6);
	&and	($t2,$t1);
	&mov	(&off($e),$t1);		# save $E, modulo-scheduled
	&xor	($E,$t1);
	&add	($T,&off($h));		# T += h
	&xor	("edi",$t2);			# Ch(e,f,g)
	&ror	($E,6);				# Sigma1(e)
	&mov	($t1,$AH[0]);
	&add	($T,"edi");			# T += Ch(e,f,g)

	&ror	($t1,22-13);
	&mov	($t2,$AH[0]);
	&mov	("edi",&off($b));
	&xor	($t1,$AH[0]);
	&mov	(&off($a),$AH[0]);	# save $A, modulo-scheduled
	&xor	($AH[0],"edi");			# a ^= b, (b^c) in next round
	&ror	($t1,13-2);
	&and	($AH[1],$AH[0]);		# (b^c) &= (a^b)
	&lea	($E,&DWP(@K256[$i],$T,$E));	# T += Sigma1(e)+K[i]
	&xor	($t1,$t2);
	&xor	($AH[1],"edi");			# h = Maj(a,b,c) = Ch(a^b,c,b)
	&mov	($t2,&DWP(32+4*(($i+2)&15),"esp"))	if ($i>=15 && $i<63);
	&ror	($t1,2);			# Sigma0(a)

	&add	($AH[1],$E);			# h += T
	&add	($E,&off($d));			# d += T
	&add	($AH[1],$t1);			# h += Sigma0(a)
	&mov	($t1,&DWP(32+4*(($i+15)&15),"esp"))	if ($i>=15 && $i<63);

	@AH = reverse(@AH);			# rotate(a,h)
	($t1,$t2) = ($t2,$t1);			# rotate(t1,t2)
    }
	# Fold the working variables back into ctx and, at the same time,
	# re-seed the on-stack copies for the next block.
	&mov	("esi",&DWP(96,"esp"));	#ctx
					#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
					#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	&xor	($AH[1],"edi");		# magic
	&mov	(&DWP(8,"esp"),"edi");
	&mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ebx",&DWP(24,"esp"));
	&mov	("ecx",&DWP(28,"esp"));
	&add	($E,&DWP(16,"esi"));
	&add	("edi",&DWP(20,"esi"));
	&add	("ebx",&DWP(24,"esi"));
	&add	("ecx",&DWP(28,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	&mov	(&DWP(24,"esi"),"ebx");
	&mov	(&DWP(28,"esi"),"ecx");
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ebx");
	&mov	(&DWP(28,"esp"),"ecx");

	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_loop"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
&function_end_A();
}
						if (!$i386 && $xmm) {{{
if ($shaext) {
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$end)=("esi","edi","eax");
my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
my @MSG=map("xmm$_",(3..6));

# Hand-encode a three-byte-opcode instruction 0F 38 <opcodelet> with a
# register-register ModR/M byte, for assemblers that do not know the
# mnemonic.  Emits nothing unless both operands are xmm0..xmm7.
sub sha256op38 {
 my ($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
}
sub sha256rnds2	{ sha256op38(0xcb,@_); }
sub sha256msg1	{ sha256op38(0xcc,@_); }
sub sha256msg2	{ sha256op38(0xcd,@_); }

&set_label("shaext",32);
	&sub		("esp",32);		# two 16-byte slots for the state "offload"

	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
	# Bias the table pointer by 0x80; all table accesses below are
	# written relative to that (presumably to keep displacements short).
	&lea		($K256,&DWP(0x80,$K256));
	&movdqu		($CDGH,&QWP(16,$ctx));		# HGFE
	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask

	&pshufd		($Wi,$ABEF,0x1b);		# ABCD
	&pshufd		($ABEF,$ABEF,0xb1);		# CDAB
	&pshufd		($CDGH,$CDGH,0x1b);		# EFGH
	&palignr	($ABEF,$CDGH,8);		# ABEF
	&punpcklqdq	($CDGH,$Wi);			# CDGH
	&jmp		(&label("loop_shaext"));

&set_label("loop_shaext",16);
	&movdqu		(@MSG[0],&QWP(0,$inp));
	&movdqu		(@MSG[1],&QWP(0x10,$inp));
	&movdqu		(@MSG[2],&QWP(0x20,$inp));
	&pshufb		(@MSG[0],$TMP);
	&movdqu		(@MSG[3],&QWP(0x30,$inp));
	&movdqa		(&QWP(16,"esp"),$CDGH);		# offload

	&movdqa		($Wi,&QWP(0*16-0x80,$K256));
	&paddd		($Wi,@MSG[0]);
	&pshufb		(@MSG[1],$TMP);
	&sha256rnds2	($CDGH,$ABEF);			# 0-3
	&pshufd		($Wi,$Wi,0x0e);
	&nop		();
	&movdqa		(&QWP(0,"esp"),$ABEF);		# offload
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(1*16-0x80,$K256));
	&paddd		($Wi,@MSG[1]);
	&pshufb		(@MSG[2],$TMP);
	&sha256rnds2	($CDGH,$ABEF);			# 4-7
	&pshufd		($Wi,$Wi,0x0e);
	&lea		($inp,&DWP(0x40,$inp));
	&sha256msg1	(@MSG[0],@MSG[1]);
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(2*16-0x80,$K256));
	&paddd		($Wi,@MSG[2]);
	&pshufb		(@MSG[3],$TMP);
	&sha256rnds2	($CDGH,$ABEF);			# 8-11
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[3]);
	&palignr	($TMP,@MSG[2],4);
	&nop		();
	&paddd		(@MSG[0],$TMP);
	&sha256msg1	(@MSG[1],@MSG[2]);
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(3*16-0x80,$K256));
	&paddd		($Wi,@MSG[3]);
	&sha256msg2	(@MSG[0],@MSG[3]);
	&sha256rnds2	($CDGH,$ABEF);			# 12-15
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[0]);
	&palignr	($TMP,@MSG[3],4);
	&nop		();
	&paddd		(@MSG[1],$TMP);
	&sha256msg1	(@MSG[2],@MSG[3]);
	&sha256rnds2	($ABEF,$CDGH);

# Rounds 16..51: nine identical 4-round groups; @MSG is rotated after each
# group so that @MSG[0] always names the oldest schedule register.
for($i=4;$i<16-3;$i++) {
	&movdqa		($Wi,&QWP($i*16-0x80,$K256));
	&paddd		($Wi,@MSG[0]);
	&sha256msg2	(@MSG[1],@MSG[0]);
	&sha256rnds2	($CDGH,$ABEF);			# 16-19...
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[1]);
	&palignr	($TMP,@MSG[0],4);
	&nop		();
	&paddd		(@MSG[2],$TMP);
	&sha256msg1	(@MSG[3],@MSG[0]);
	&sha256rnds2	($ABEF,$CDGH);

	push(@MSG,shift(@MSG));
}
	&movdqa		($Wi,&QWP(13*16-0x80,$K256));
	&paddd		($Wi,@MSG[0]);
	&sha256msg2	(@MSG[1],@MSG[0]);
	&sha256rnds2	($CDGH,$ABEF);			# 52-55
	&pshufd		($Wi,$Wi,0x0e);
	&movdqa		($TMP,@MSG[1]);
	&palignr	($TMP,@MSG[0],4);
	&sha256rnds2	($ABEF,$CDGH);
	&paddd		(@MSG[2],$TMP);

	&movdqa		($Wi,&QWP(14*16-0x80,$K256));
	&paddd		($Wi,@MSG[1]);
	&sha256rnds2	($CDGH,$ABEF);			# 56-59
	&pshufd		($Wi,$Wi,0x0e);
	&sha256msg2	(@MSG[2],@MSG[1]);
	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
	&sha256rnds2	($ABEF,$CDGH);

	&movdqa		($Wi,&QWP(15*16-0x80,$K256));
	&paddd		($Wi,@MSG[2]);
	&nop		();
	&sha256rnds2	($CDGH,$ABEF);			# 60-63
	&pshufd		($Wi,$Wi,0x0e);
	&cmp		($end,$inp);
	&nop		();
	&sha256rnds2	($ABEF,$CDGH);

	# Add the state saved ("offloaded") at the top of the loop.
	&paddd		($CDGH,&QWP(16,"esp"));
	&paddd		($ABEF,&QWP(0,"esp"));
# Loop back while the earlier `cmp $end,$inp` reported "not equal"; the
# SIMD instructions issued in between do not modify the flags.
	&jnz		(&label("loop_shaext"));

	# Undo the ABEF/CDGH layout so the state is stored in DCBA/HGFE
	# register order.
	&pshufd		($CDGH,$CDGH,0xb1);		# DCHG
	&pshufd		($TMP,$ABEF,0x1b);		# FEBA
	&pshufd		($ABEF,$ABEF,0xb1);		# BAFE
	&punpckhqdq	($ABEF,$CDGH);			# DCBA
	&palignr	($CDGH,$TMP,8);			# HGFE

	&mov		("esp",&DWP(32+12,"esp"));	# reload sp from the frame slot
	&movdqu		(&QWP(0,$ctx),$ABEF);
	&movdqu		(&QWP(16,$ctx),$CDGH);
&function_end_A();
}

# Register assignment shared by the SIMD code paths below:
#   @X       - four xmm registers holding the 16 message-schedule words
#   $t0..$t3 - xmm scratch registers
#   @AH      - the two scalar registers that alternate between the roles
#              of "a" and "h"; the pair is swapped after every round
my @X = map("xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
my @AH = ($A,$T);

# SSSE3 code path.  The frame is extended by 96 bytes: offsets 0..28 hold
# the working copy of a..h, offsets 32..95 hold the sixteen X[i]+K[i]
# words.  The previous frame contents therefore sit at 96 and up: 96 = ctx,
# 96+4 = inp, 96+8 = the value inp is compared against to finish, and
# 96+12 = the saved stack pointer.
&set_label("SSSE3",32);
	&lea	("esp",&DWP(-96,"esp"));
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	# (A and E stay in registers; their stores are left commented out)
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	# $AH[1] becomes b^c, which the round body expects on entry
	&xor	($AH[1],"ecx");			# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"edi");
	&mov	($E,&DWP(16,"esi"));
	&mov	("edi",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	# pshufb mask, located 256 bytes past the start of the constants
	&movdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_ssse3"));

# Outer loop: one iteration per 64-byte input block.
&set_label("grand_ssse3",16);
	# load input, reverse byte order, add K256[0..15], save to stack
	&movdqu	(@X[0],&QWP(0,"edi"));
	&movdqu	(@X[1],&QWP(16,"edi"));
	&movdqu	(@X[2],&QWP(32,"edi"));
	&movdqu	(@X[3],&QWP(48,"edi"));
	&add	("edi",64);
	&pshufb	(@X[0],$t3);
	&mov	(&DWP(96+4,"esp"),"edi");	# save advanced inp
	&pshufb	(@X[1],$t3);
	&movdqa	($t0,&QWP(0,$K256));
	&pshufb	(@X[2],$t3);
	&movdqa	($t1,&QWP(16,$K256));
	&paddd	($t0,@X[0]);
	&pshufb	(@X[3],$t3);
	&movdqa	($t2,&QWP(32,$K256));
	&paddd	($t1,@X[1]);
	&movdqa	($t3,&QWP(48,$K256));		# mask no longer needed, reuse $t3
	&movdqa	(&QWP(32+0,"esp"),$t0);
	&paddd	($t2,@X[2]);
	&movdqa	(&QWP(32+16,"esp"),$t1);
	&paddd	($t3,@X[3]);
	&movdqa	(&QWP(32+32,"esp"),$t2);
	&movdqa	(&QWP(32+48,"esp"),$t3);
	&jmp	(&label("ssse3_00_47"));

# Inner loop: each pass advances $K256 by 64 bytes and runs 16 rounds
# while computing the next 16 schedule words.
&set_label("ssse3_00_47",16);
	&add		($K256,64);

# SSSE3_00_47($j, $body, @X)
#
# Emits four scalar rounds interleaved with the SIMD computation of the
# next four message-schedule words.
#
#   $j    - index of the 16-byte group; selects the constants at
#           16*$j($K256) and the stack slot 32+16*$j(%esp)
#   $body - code ref returning the list of instruction strings for one
#           round; it is called four times and the strings are eval'ed one
#           at a time between the SIMD instructions
#   @X    - the four schedule registers, oldest first; @X[0] is updated
#           in place
#
# The exact position of every eval relative to the SIMD instructions is
# the instruction schedule; do not reorder.
sub SSSE3_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions

	  eval(shift(@insns));
	&movdqa		($t0,@X[1]);
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	&movdqa		($t3,@X[3]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&palignr	($t0,@X[0],4);		# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	 &palignr	($t3,@X[2],4);		# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t1,$t0);
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	&movdqa		($t2,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t0,3);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t2,7);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,32-18);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t2,18-7);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&pxor		($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,18-7);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &movdqa	($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&pxor		($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrld		($t3,10);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrlq		($t2,17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	 &pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrlq		($t2,19-17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	 &pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	 &psrldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &movdqa	($t2,$t3);
	  eval(shift(@insns));			# @
	 &psrld		($t3,10);
	  eval(shift(@insns));
	 &psrlq		($t2,17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	 &pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrlq		($t2,19-17);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	 &pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&movdqa		($t2,&QWP(16*$j,$K256));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pslldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));			# @
	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		($t2,@X[0]);
	  eval(shift(@insns));			# @

	foreach (@insns) { eval; }		# remaining instructions

	# store the four new X[i]+K[i] words for the scalar rounds
	&movdqa		(&QWP(32+16*$j,"esp"),$t2);
}

# body_00_15()
#
# Returns the scalar code for one round as a list of 30 strings, each a
# piece of Perl source meant to be eval'ed by the caller.  The strings are
# single-quoted on purpose: $E, @AH, $i and the &off() arguments are
# resolved at eval time, so the same list yields different registers and
# stack offsets as the round counter advances.  The final element both
# emits the last add and, at generation time, swaps @AH and increments $i.
sub body_00_15 () {
	(
	'&mov	("ecx",$E);',
	'&ror	($E,25-11);',
	 '&mov	("esi",&off($f));',
	'&xor	($E,"ecx");',
	 '&mov	("edi",&off($g));',
	 '&xor	("esi","edi");',
	'&ror	($E,11-6);',
	 '&and	("esi","ecx");',
	 '&mov	(&off($e),"ecx");',	# save $E, modulo-scheduled
	'&xor	($E,"ecx");',
	 '&xor	("edi","esi");',	# Ch(e,f,g)
	'&ror	($E,6);',		# T = Sigma1(e)
	 '&mov	("ecx",$AH[0]);',
	 '&add	($E,"edi");',		# T += Ch(e,f,g)
	 '&mov	("edi",&off($b));',
	'&mov	("esi",$AH[0]);',

	'&ror	("ecx",22-13);',
	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
	'&xor	("ecx",$AH[0]);',
	 '&xor	($AH[0],"edi");',	# a ^= b, (b^c) in next round
	 '&add	($E,&off($h));',	# T += h
	'&ror	("ecx",13-2);',
	 '&and	($AH[1],$AH[0]);',	# (b^c) &= (a^b)
	'&xor	("ecx","esi");',
	 '&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
	 '&xor	($AH[1],"edi");',	# h = Maj(a,b,c) = Ch(a^b,c,b)
	'&ror	("ecx",2);',		# Sigma0(a)

	 '&add	($AH[1],$E);',		# h += T
	 '&add	($E,&off($d));',	# d += T
	'&add	($AH[1],"ecx");'.	# h += Sigma0(a)

	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
	);
}

	# Emit the loop body: four calls, each producing four rounds plus the
	# schedule update for the oldest register, then rotate @X.
	for ($i=0,$j=0; $j<4; $j++) {
	    &SSSE3_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
	# $j is 4 here.  The dword at 64($K256) equals 0x00010203 only once
	# $K256 has advanced to the point where the byte-swap mask follows
	# (the same address is reloaded as the mask after the last rounds).
	&cmp	(&DWP(16*$j,$K256),0x00010203);
	&jne	(&label("ssse3_00_47"));

	# Final 16 rounds: no further schedule words are needed.  $i is
	# advanced by the eval'ed round body itself.
	for ($i=0; $i<16; ) {
	    foreach(body_00_15()) { eval; }
	}

	# Add the working variables into ctx->h[] and refresh the stack
	# copy for the next block.  The xor on $AH[1] stands in for the
	# commented-out reload next to it.
	&mov	("esi",&DWP(96,"esp"));	#ctx
					#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
					#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	 #&mov	(&DWP(0,"esp"),$AH[0]);
	 &mov	(&DWP(4,"esp"),$AH[1]);
	 &xor	($AH[1],"edi");			# magic
	 &mov	(&DWP(8,"esp"),"edi");
	 &mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ecx",&DWP(24,"esp"));
	&add	($E,&DWP(16,"esi"));
# Remainder of the SSSE3 per-block epilogue: finish adding f, g and h into
# ctx->h[] and refresh their stack copies.
	&add	("edi",&DWP(20,"esi"));
	&add	("ecx",&DWP(24,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	 &mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(28,"esp"));
	&mov	(&DWP(24,"esi"),"ecx");
	 #&mov	(&DWP(16,"esp"),$E);
	&add	("edi",&DWP(28,"esi"));
	 &mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esi"),"edi");
	 &mov	(&DWP(28,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp

	# $K256 was advanced three times by 64 in the inner loop, so the
	# byte-swap mask is 64 bytes ahead; reload it, then rewind.
	&movdqa	($t3,&QWP(64,$K256));
	&sub	($K256,3*64);			# rewind K
	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_ssse3"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
&function_end_A();
						if ($avx) {
# AVX code path.  Same frame layout and register roles as the SSSE3 path;
# when $avx>1 the generated code first branches to AVX_BMI if both tested
# bits of edx are set.
&set_label("AVX",32);
						if ($avx>1) {
	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
	&cmp	("edx",1<<8|1<<3);
	&je	(&label("AVX_BMI"));
						}
	&lea	("esp",&DWP(-96,"esp"));
	&vzeroall	();
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	# (A and E stay in registers; their stores are left commented out)
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	# $AH[1] becomes b^c, which the round body expects on entry
	&xor	($AH[1],"ecx");			# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"edi");
	&mov	($E,&DWP(16,"esi"));
	&mov	("edi",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	# vpshufb mask, located 256 bytes past the start of the constants
	&vmovdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_avx"));

# Outer loop: one iteration per 64-byte input block.
&set_label("grand_avx",32);
	# load input, reverse byte order, add K256[0..15], save to stack
	&vmovdqu	(@X[0],&QWP(0,"edi"));
	&vmovdqu	(@X[1],&QWP(16,"edi"));
	&vmovdqu	(@X[2],&QWP(32,"edi"));
	&vmovdqu	(@X[3],&QWP(48,"edi"));
	&add		("edi",64);
	&vpshufb	(@X[0],@X[0],$t3);
	&mov		(&DWP(96+4,"esp"),"edi");	# save advanced inp
	&vpshufb	(@X[1],@X[1],$t3);
	&vpshufb	(@X[2],@X[2],$t3);
	&vpaddd		($t0,@X[0],&QWP(0,$K256));
	&vpshufb	(@X[3],@X[3],$t3);
	&vpaddd		($t1,@X[1],&QWP(16,$K256));
	&vpaddd		($t2,@X[2],&QWP(32,$K256));
	&vpaddd		($t3,@X[3],&QWP(48,$K256));
	&vmovdqa	(&QWP(32+0,"esp"),$t0);
	&vmovdqa	(&QWP(32+16,"esp"),$t1);
	&vmovdqa	(&QWP(32+32,"esp"),$t2);
	&vmovdqa	(&QWP(32+48,"esp"),$t3);
	&jmp		(&label("avx_00_47"));

# Inner loop: each pass advances $K256 by 64 bytes and runs 16 rounds
# while computing the next 16 schedule words.
&set_label("avx_00_47",16);
	&add		($K256,64);

# Xupdate_AVX()
#
# Returns the 31 instruction strings that compute four new message-schedule
# words into @X[0].  The strings are single-quoted and eval'ed by
# AVX_00_47, so @X and $t0..$t3 are resolved in the caller's scope at that
# time.
sub Xupdate_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],4);',	# X[1..4]
	 '&vpalignr	($t3,@X[3],@X[2],4);',	# X[9..12]
	'&vpsrld	($t2,$t0,7);',
	 '&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,3);',
	'&vpslld	($t1,$t0,14);',
	'&vpxor		($t0,$t3,$t2);',
	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,18-7);',
	'&vpxor		($t0,$t0,$t1);',
	'&vpslld	($t1,$t1,25-14);',
	'&vpxor		($t0,$t0,$t2);',
	 '&vpsrld	($t2,$t3,10);',
	'&vpxor		($t0,$t0,$t1);',	# sigma0(X[1..4])
	 '&vpsrlq	($t1,$t3,17);',
	'&vpaddd	(@X[0],@X[0],$t0);',	# X[0..3] += sigma0(X[1..4])
	 '&vpxor	($t2,$t2,$t1);',
	 '&vpsrlq	($t3,$t3,19);',
	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[14..15])
	 '&vpshufd	($t3,$t2,0b10000100);',
	'&vpsrldq	($t3,$t3,8);',
	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..1] += sigma1(X[14..15])
	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	 '&vpsrld	($t2,$t3,10);',
	 '&vpsrlq	($t1,$t3,17);',
	 '&vpxor	($t2,$t2,$t1);',
	 '&vpsrlq	($t3,$t3,19);',
	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[16..17])
	 '&vpshufd	($t3,$t2,0b11101000);',
	'&vpslldq	($t3,$t3,8);',
	'&vpaddd	(@X[0],@X[0],$t3);'	# X[2..3] += sigma1(X[16..17])
	);
}

# For the rest of the enclosing block, ror(reg,n) is emitted as
# shrd(reg,reg,n): the '&ror' strings returned by body_00_15 are eval'ed
# later and therefore pick up this override.
local *ror = sub { &shrd(@_[0],@_) };

# AVX_00_47($j, $body, @X)
#
# AVX counterpart of SSSE3_00_47: emits four scalar rounds interleaved with
# the Xupdate_AVX instruction stream.
#
#   $j    - index of the 16-byte group; selects the constants at
#           16*$j($K256) and the stack slot 32+16*$j(%esp)
#   $body - code ref returning the instruction strings for one round
#   @X    - the four schedule registers, oldest first
#
# After every SIMD instruction three scalar instructions are issued; a
# fourth follows when the last one issued and the next one pending are
# both rorx.
sub AVX_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
my $insn;

	foreach (Xupdate_AVX()) {		# 31 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval($insn = shift(@insns));
	    eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
	}
	&vpaddd		($t2,@X[0],&QWP(16*$j,$K256));
	foreach (@insns) { eval; }		# remaining instructions
	# store the four new X[i]+K[i] words for the scalar rounds
	&vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
}

	# Emit the loop body: four calls, each producing four rounds plus the
	# schedule update for the oldest register, then rotate @X.
	for ($i=0,$j=0; $j<4; $j++) {
	    &AVX_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));		# rotate(@X)
	}
	# $j is 4 here.  The dword at 64($K256) equals 0x00010203 only once
	# $K256 has advanced to the point where the byte-swap mask follows.
	&cmp	(&DWP(16*$j,$K256),0x00010203);
	&jne	(&label("avx_00_47"));

	# Final 16 rounds: no further schedule words are needed.  $i is
	# advanced by the eval'ed round body itself.
	for ($i=0; $i<16; ) {
	    foreach(body_00_15()) { eval; }
	}

	&mov	("esi",&DWP(96,"esp"));	#ctx
# AVX per-block epilogue: add the working variables into ctx->h[] (esi was
# just loaded with ctx) and refresh the stack copy of b..d and f..h for the
# next block.  a and e stay in $AH[0] and $E; their loads and stores are
# left commented out.  The first xor on $AH[1] stands in for the
# commented-out reload next to it; the second one, marked "magic",
# re-creates the b^c value that the round body expects on entry.
					#&mov	($AH[0],&DWP(0,"esp"));
	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
					#&mov	("edi", &DWP(8,"esp"));
	&mov	("ecx",&DWP(12,"esp"));
	&add	($AH[0],&DWP(0,"esi"));
	&add	($AH[1],&DWP(4,"esi"));
	&add	("edi",&DWP(8,"esi"));
	&add	("ecx",&DWP(12,"esi"));
	&mov	(&DWP(0,"esi"),$AH[0]);
	&mov	(&DWP(4,"esi"),$AH[1]);
	&mov	(&DWP(8,"esi"),"edi");
	&mov	(&DWP(12,"esi"),"ecx");
	 #&mov	(&DWP(0,"esp"),$AH[0]);
	 &mov	(&DWP(4,"esp"),$AH[1]);
	 &xor	($AH[1],"edi");			# magic
	 &mov	(&DWP(8,"esp"),"edi");
	 &mov	(&DWP(12,"esp"),"ecx");
	#&mov	($E,&DWP(16,"esp"));
	&mov	("edi",&DWP(20,"esp"));
	&mov	("ecx",&DWP(24,"esp"));
	&add	($E,&DWP(16,"esi"));
	&add	("edi",&DWP(20,"esi"));
	&add	("ecx",&DWP(24,"esi"));
	&mov	(&DWP(16,"esi"),$E);
	&mov	(&DWP(20,"esi"),"edi");
	 &mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(28,"esp"));
	&mov	(&DWP(24,"esi"),"ecx");
	 #&mov	(&DWP(16,"esp"),$E);
	&add	("edi",&DWP(28,"esi"));
	 &mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esi"),"edi");
	 &mov	(&DWP(28,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp

# End of the AVX outer loop.  $K256 was advanced three times by 64 in the
# inner loop, so the byte-swap mask is 64 bytes ahead; reload it, rewind,
# and continue while inp is below the limit stored at 96+8(%esp).
	&vmovdqa	($t3,&QWP(64,$K256));
	&sub	($K256,3*64);			# rewind K
	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
	&jb	(&label("grand_avx"));

	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
	&vzeroall	();
&function_end_A();
						if ($avx>1) {
# bodyx_00_15()
#
# BMI variant of the scalar round body: returns 25 instruction strings for
# one round, using rorx/andn instead of the mov+ror+xor sequences.  As
# with the non-BMI body, the strings are single-quoted so that $E, @AH, $i
# and the &off() arguments are resolved when the caller evals them, and
# the final element also swaps @AH and increments $i at generation time.
sub bodyx_00_15 () {			# +10%
	(
	'&rorx	("ecx",$E,6)',
	'&rorx	("esi",$E,11)',
	 '&mov	(&off($e),$E)',		# save $E, modulo-scheduled
	'&rorx	("edi",$E,25)',
	'&xor	("ecx","esi")',
	 '&andn	("esi",$E,&off($g))',
	'&xor	("ecx","edi")',		# Sigma1(e)
	 '&and	($E,&off($f))',
	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
	 '&or	($E,"esi")',		# T = Ch(e,f,g)

	'&rorx	("edi",$AH[0],2)',
	'&rorx	("esi",$AH[0],13)',
	 '&lea	($E,&DWP(0,$E,"ecx"))',	# T += Sigma1(e)
	'&rorx	("ecx",$AH[0],22)',
	'&xor	("esi","edi")',
	 '&mov	("edi",&off($b))',
	'&xor	("ecx","esi")',		# Sigma0(a)

	 '&xor	($AH[0],"edi")',	# a ^= b, (b^c) in next round
	 '&add	($E,&off($h))',		# T += h
	 '&and	($AH[1],$AH[0])',	# (b^c) &= (a^b)
	 '&add	($E,&DWP(32+4*($i&15),"esp"))',	# T += K[i]+X[i]
	 '&xor	($AH[1],"edi")',	# h = Maj(a,b,c) = Ch(a^b,c,b)

	 '&add	("ecx",$E)',		# h += T
	 '&add	($E,&off($d))',		# d += T
	'&lea	($AH[1],&DWP(0,$AH[1],"ecx"));'.	# h += Sigma0(a)

	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
	);
}

# AVX+BMI code path.  Frame layout and register roles match the plain AVX
# path: offsets 0..28 hold the working copy of a..h, offsets 32..95 the
# X[i]+K[i] words, and the previous frame contents start at 96.
&set_label("AVX_BMI",32);
	&lea	("esp",&DWP(-96,"esp"));
	&vzeroall	();
	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
	# (A and E stay in registers; their stores are left commented out)
	&mov	($AH[0],&DWP(0,"esi"));
	&mov	($AH[1],&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edi",&DWP(12,"esi"));
	#&mov	(&DWP(0,"esp"),$AH[0]);
	&mov	(&DWP(4,"esp"),$AH[1]);
	# $AH[1] becomes b^c, which the round body expects on entry
	&xor	($AH[1],"ecx");			# magic
	&mov	(&DWP(8,"esp"),"ecx");
	&mov	(&DWP(12,"esp"),"edi");
	&mov	($E,&DWP(16,"esi"));
	&mov	("edi",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("esi",&DWP(28,"esi"));
	#&mov	(&DWP(16,"esp"),$E);
	&mov	(&DWP(20,"esp"),"edi");
	&mov	("edi",&DWP(96+4,"esp"));	# inp
	&mov	(&DWP(24,"esp"),"ecx");
	&mov	(&DWP(28,"esp"),"esi");
	# vpshufb mask, located 256 bytes past the start of the constants
	&vmovdqa	($t3,&QWP(256,$K256));
	&jmp	(&label("grand_avx_bmi"));

# Outer loop: one iteration per 64-byte input block.
&set_label("grand_avx_bmi",32);
	# load input, reverse byte order, add K256[0..15], save to stack
&vmovdqu (@X[0],&QWP(0,"edi")); 1211e1051a39Sopenharmony_ci &vmovdqu (@X[1],&QWP(16,"edi")); 1212e1051a39Sopenharmony_ci &vmovdqu (@X[2],&QWP(32,"edi")); 1213e1051a39Sopenharmony_ci &vmovdqu (@X[3],&QWP(48,"edi")); 1214e1051a39Sopenharmony_ci &add ("edi",64); 1215e1051a39Sopenharmony_ci &vpshufb (@X[0],@X[0],$t3); 1216e1051a39Sopenharmony_ci &mov (&DWP(96+4,"esp"),"edi"); 1217e1051a39Sopenharmony_ci &vpshufb (@X[1],@X[1],$t3); 1218e1051a39Sopenharmony_ci &vpshufb (@X[2],@X[2],$t3); 1219e1051a39Sopenharmony_ci &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1220e1051a39Sopenharmony_ci &vpshufb (@X[3],@X[3],$t3); 1221e1051a39Sopenharmony_ci &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1222e1051a39Sopenharmony_ci &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1223e1051a39Sopenharmony_ci &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1224e1051a39Sopenharmony_ci &vmovdqa (&QWP(32+0,"esp"),$t0); 1225e1051a39Sopenharmony_ci &vmovdqa (&QWP(32+16,"esp"),$t1); 1226e1051a39Sopenharmony_ci &vmovdqa (&QWP(32+32,"esp"),$t2); 1227e1051a39Sopenharmony_ci &vmovdqa (&QWP(32+48,"esp"),$t3); 1228e1051a39Sopenharmony_ci &jmp (&label("avx_bmi_00_47")); 1229e1051a39Sopenharmony_ci 1230e1051a39Sopenharmony_ci&set_label("avx_bmi_00_47",16); 1231e1051a39Sopenharmony_ci &add ($K256,64); 1232e1051a39Sopenharmony_ci 1233e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<4; $j++) { 1234e1051a39Sopenharmony_ci &AVX_00_47($j,\&bodyx_00_15,@X); 1235e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 1236e1051a39Sopenharmony_ci } 1237e1051a39Sopenharmony_ci &cmp (&DWP(16*$j,$K256),0x00010203); 1238e1051a39Sopenharmony_ci &jne (&label("avx_bmi_00_47")); 1239e1051a39Sopenharmony_ci 1240e1051a39Sopenharmony_ci for ($i=0; $i<16; ) { 1241e1051a39Sopenharmony_ci foreach(bodyx_00_15()) { eval; } 1242e1051a39Sopenharmony_ci } 1243e1051a39Sopenharmony_ci 1244e1051a39Sopenharmony_ci &mov ("esi",&DWP(96,"esp")); #ctx 1245e1051a39Sopenharmony_ci #&mov ($AH[0],&DWP(0,"esp")); 1246e1051a39Sopenharmony_ci &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 
1247e1051a39Sopenharmony_ci #&mov ("edi", &DWP(8,"esp")); 1248e1051a39Sopenharmony_ci &mov ("ecx",&DWP(12,"esp")); 1249e1051a39Sopenharmony_ci &add ($AH[0],&DWP(0,"esi")); 1250e1051a39Sopenharmony_ci &add ($AH[1],&DWP(4,"esi")); 1251e1051a39Sopenharmony_ci &add ("edi",&DWP(8,"esi")); 1252e1051a39Sopenharmony_ci &add ("ecx",&DWP(12,"esi")); 1253e1051a39Sopenharmony_ci &mov (&DWP(0,"esi"),$AH[0]); 1254e1051a39Sopenharmony_ci &mov (&DWP(4,"esi"),$AH[1]); 1255e1051a39Sopenharmony_ci &mov (&DWP(8,"esi"),"edi"); 1256e1051a39Sopenharmony_ci &mov (&DWP(12,"esi"),"ecx"); 1257e1051a39Sopenharmony_ci #&mov (&DWP(0,"esp"),$AH[0]); 1258e1051a39Sopenharmony_ci &mov (&DWP(4,"esp"),$AH[1]); 1259e1051a39Sopenharmony_ci &xor ($AH[1],"edi"); # magic 1260e1051a39Sopenharmony_ci &mov (&DWP(8,"esp"),"edi"); 1261e1051a39Sopenharmony_ci &mov (&DWP(12,"esp"),"ecx"); 1262e1051a39Sopenharmony_ci #&mov ($E,&DWP(16,"esp")); 1263e1051a39Sopenharmony_ci &mov ("edi",&DWP(20,"esp")); 1264e1051a39Sopenharmony_ci &mov ("ecx",&DWP(24,"esp")); 1265e1051a39Sopenharmony_ci &add ($E,&DWP(16,"esi")); 1266e1051a39Sopenharmony_ci &add ("edi",&DWP(20,"esi")); 1267e1051a39Sopenharmony_ci &add ("ecx",&DWP(24,"esi")); 1268e1051a39Sopenharmony_ci &mov (&DWP(16,"esi"),$E); 1269e1051a39Sopenharmony_ci &mov (&DWP(20,"esi"),"edi"); 1270e1051a39Sopenharmony_ci &mov (&DWP(20,"esp"),"edi"); 1271e1051a39Sopenharmony_ci &mov ("edi",&DWP(28,"esp")); 1272e1051a39Sopenharmony_ci &mov (&DWP(24,"esi"),"ecx"); 1273e1051a39Sopenharmony_ci #&mov (&DWP(16,"esp"),$E); 1274e1051a39Sopenharmony_ci &add ("edi",&DWP(28,"esi")); 1275e1051a39Sopenharmony_ci &mov (&DWP(24,"esp"),"ecx"); 1276e1051a39Sopenharmony_ci &mov (&DWP(28,"esi"),"edi"); 1277e1051a39Sopenharmony_ci &mov (&DWP(28,"esp"),"edi"); 1278e1051a39Sopenharmony_ci &mov ("edi",&DWP(96+4,"esp")); # inp 1279e1051a39Sopenharmony_ci 1280e1051a39Sopenharmony_ci &vmovdqa ($t3,&QWP(64,$K256)); 1281e1051a39Sopenharmony_ci &sub ($K256,3*64); # rewind K 1282e1051a39Sopenharmony_ci &cmp 
("edi",&DWP(96+8,"esp")); # are we done yet? 1283e1051a39Sopenharmony_ci &jb (&label("grand_avx_bmi")); 1284e1051a39Sopenharmony_ci 1285e1051a39Sopenharmony_ci &mov ("esp",&DWP(96+12,"esp")); # restore sp 1286e1051a39Sopenharmony_ci &vzeroall (); 1287e1051a39Sopenharmony_ci&function_end_A(); 1288e1051a39Sopenharmony_ci } 1289e1051a39Sopenharmony_ci } 1290e1051a39Sopenharmony_ci }}} 1291e1051a39Sopenharmony_ci&function_end_B("sha256_block_data_order"); 1292e1051a39Sopenharmony_ci 1293e1051a39Sopenharmony_ci&asm_finish(); 1294e1051a39Sopenharmony_ci 1295e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!"; 1296