#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
# same instruction sequence used for both SHA-256 and SHA-512. In
# former case the instructions operate on 32-bit operands, while in
# latter - on 64-bit ones. All I had to do is to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. SSSE3
# code path was not attempted for SHA512, because improvement is not
# estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with exclusion
# for VIA Nano, but it has SHA512 instruction that is faster and
# should be used instead.] For reference, corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# P4            17.3    -           -               30.8    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake       11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen         11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
# Silvermont    27.4    20.6(+33%)  -               17.5    -
# Knights L     27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont      18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 output is assumed for nasm/masm/mingw64 flavours or a .asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the translator script, first next to this script, then under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler to decide which SIMD code paths may be emitted.
# $avx ends up as 0, 1 or 2: each probe adds one for each of two version
# thresholds.  The probes are tried in order and later ones run only while
# $avx is still false.  Versions are compared numerically as "major.minor".
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);	# without SHAEXT cap $avx at level 1

# Everything printed to STDOUT from here on is piped through the translator,
# which is run with the same perl binary and writes to $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Select SHA-512 or SHA-256 parameters from the output file name: function
# name, constant table label, word size in bytes, working registers, and the
# rotate/shift amounts of the Sigma0/Sigma1/sigma0/sigma1 functions.
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

# Stack frame layout: X[0..15] occupy the first 16*$SZ bytes at (%rsp),
# followed by four 8-byte slots for ctx, inp, the end pointer and the saved
# copy of the caller's %rsp.
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";


# ROUND_00_15($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one SHA-2 round to $code.  $i is the round index;
# $a..$h are the register names currently playing the roles a..h ($c is
# accepted for symmetry but not referenced: Maj is computed from a^b and the
# b^c value carried in $a3).
#
# The emitted code expects, on entry:
#   $T1 = message word for this round (it is stored to X[$i&15] on the stack),
#   $a0 = copy of e, $a1 = copy of a, $a3 = b^c (a^b of the previous round).
# It computes T1 += h + Ch(e,f,g) + K[round] + Sigma1(e), d += T1 and
# h = Maj(a,b,c) + T1, and leaves Sigma0(a) in $a1.  Adding Sigma0(a) to h is
# emitted here only for $i<15; for later rounds the addition is deferred to
# the code that follows ("modulo-scheduled").
#
# Declared without a prototype: the sub takes nine arguments, so an empty
# "()" prototype would be wrong for any call not made with a leading "&".
sub ROUND_00_15
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  # Advance the table pointer by one word; after each 16 bytes of constants
  # skip another 16, because every 16-byte row of the K table is stored twice.
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	# Swap the scratch register names so that this round's a^b (in $a2)
	# is what the next round sees as b^c (in $a3).
	($a2,$a3) = ($a3,$a2);
}

# ROUND_16_XX($i, $a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one round with message-schedule expansion to
# $code.  The emitted code first applies the Sigma0(a) addition left pending
# by the previous round (Sigma0 is in $a1, and the previous round's h is this
# round's $a), then computes
#   T1 = sigma0(X[(i+1)&15]) + sigma1(X[(i+14)&15]) + X[(i+9)&15] + X[i&15]
# from the 16-word window kept on the stack, reloads $a0 = e and $a1 = a,
# and hands over to ROUND_00_15, which stores T1 back to X[i&15] and performs
# the round proper.
#
# Declared without a prototype: the sub takes nine arguments, so an empty
# "()" prototype would be wrong for any call not made with a leading "&".
sub ROUND_16_XX
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

# ---------------------------------------------------------------------
# Scalar (integer-only) code path.  $code is started afresh here.
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
# Run-time dispatch on OPENSSL_ia32cap_P.  Each fragment below is emitted
# only when the corresponding alternative path can exist for this build
# (word size, $shaext, $avx level); the jump targets are defined by the
# SIMD sections later in the file.
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
# Prologue: save callee-saved registers, carve out a 64-byte aligned frame
# (layout given by $_ctx/$_inp/$_end/$_rsp), convert the block count in %rdx
# into an end-of-input pointer, and load the eight state words from ctx.
# .Lloop then starts one block: it points $Tbl at the constant table and
# seeds $a3 with b^c for the first round.
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	# Rounds 0..15, fully unrolled: load the big-endian input word, set up
	# the copies of e and a that ROUND_00_15 expects, byte-swap the word,
	# emit the round, then rotate the register-name list so that the roles
	# a..h shift by one for the next round.
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	# Sixteen more rounds are unrolled ($i continues from 16 to 31); the
	# assembled loop repeats this body until the table terminator is hit.
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

# Loop control and epilogue.  The rounds loop repeats while the most
# significant byte of the table entry at $Tbl is non-zero; the entry that
# follows the round constants in the tables below has a zero top byte.  Then
# the pending Sigma0(a) is applied, the working variables are added into the
# state in ctx, and the outer loop continues until inp reaches the saved end
# pointer.  Finally the callee-saved registers and %rsp are restored.
$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	$func,.-$func
___

# Round-constant tables.  Every 16-byte row is stored twice in a row, which
# is why ROUND_00_15 skips an extra 16 bytes after each 16 bytes of
# constants.  The constants are followed by shuffle masks (also doubled)
# whose first entry has a zero most-significant byte and thereby terminates
# the scalar rounds loop.
if ($SZ==4) {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
} else {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
}

######################################################################
# SIMD code paths
#
if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
# Appends the body of sha256_block_data_order_shaext to $code.  The
# generated routine keeps the hash state in two vector registers
# ($ABEF/$CDGH), consumes one 64-byte block per .Loop_shaext iteration
# ($inp advances by 0x40, $num is decremented until zero) and finally
# stores the state back to ($ctx).
#
# Argument registers.  $Tbl is pointed at K256+0x80 by the generated code,
# so every table access below is written as an offset minus 0x80.
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

# %xmm0-2 and %xmm7-10 carry working values and saved copies, %xmm3-6 the
# four message vectors.  %xmm6-%xmm10 are the ones saved/restored on Win64.
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	sha256_block_data_order_shaext,\@function,3
.align	64
sha256_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
# Win64 only: reserve stack space and save %xmm6-%xmm10.  The stores are
# addressed through %rax, which nothing in this section sets.
# NOTE(review): presumably %rax already holds the entry %rsp at this
# point -- confirm against whatever emits the Win64 function prologue.
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the state and shuffle it into ABEF/CDGH form, keep a second copy of
# the byte swap mask in $BSWAP, then emit rounds 0-15, which work on the
# byte-swapped input block.  $TMP doubles as scratch from round 8 onwards.
$code.=<<___;
	lea		K256+0x80(%rip),$Tbl
	movdqu		($ctx),$ABEF		# DCBA
	movdqu		16($ctx),$CDGH		# HGFE
	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa		$TMP,$BSWAP		# offload
	palignr		\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH
	jmp		.Loop_shaext

.align	16
.Loop_shaext:
	movdqu		($inp),@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqu		0x20($inp),@MSG[2]
	pshufb		$TMP,@MSG[0]
	movdqu		0x30($inp),@MSG[3]

	movdqa		0*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	pshufb		$TMP,@MSG[1]
	movdqa		$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd		\$0x0e,$Wi,$Wi
	nop
	movdqa		$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	pshufb		$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd		\$0x0e,$Wi,$Wi
	lea		0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	pshufb		$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[3],$TMP
	palignr		\$4,@MSG[2],$TMP
	nop
	paddd		$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	paddd		@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[0],$TMP
	palignr		\$4,@MSG[3],$TMP
	nop
	paddd		$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
# Rounds 16-51: nine copies of the same 4-round template ($i = 4..12), each
# reading table row $i*32.  @MSG is rotated after every copy so the fixed
# indices in the template address the next set of message vectors.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$i*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	nop
	paddd		$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
# Rounds 52-63, then: reload the byte swap mask from $BSWAP into $TMP for
# the next iteration, add the saved state back in, loop while $num is
# non-zero, and shuffle the state back to its in-memory order before
# storing it to ($ctx).
$code.=<<___;
	movdqa		13*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd		$TMP,@MSG[2]

	movdqa		14*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd		\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa		$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd		\$0x0e,$Wi,$Wi
	dec		$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF
	jnz		.Loop_shaext

	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
# Win64 only: restore %xmm6-%xmm10 from the slots written in the prologue
# and reset %rsp from %rax.
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}
# Opens a fresh lexical scope for the declarations that follow.
{{{

# $a4 names the same register as $T1 ($T1 is set outside this section).
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);	# per-round names of the eight working variables; assigned from @ROT by the strings body_00_15 returns

# Catch-all for calls to undefined subs: a call such as &ror($dst,$n)
# appends one instruction line to $code.
#   - The opcode is the called sub's name with its package prefix removed.
#   - The last argument is emitted first, prefixed with '$' (immediate)
#     when its numeric value stringifies back to itself.
#   - The remaining arguments follow in reverse order, so the
#     destination-first order used at call sites becomes source-first in
#     the emitted line.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Returns one round as a list of 26 strings of Perl code.  Callers eval
# the strings one by one, which lets them interleave the round's scalar
# instructions with other code.  The first string loads $a..$h from @ROT;
# the last one swaps $a2/$a3, rotates @ROT by one position and increments
# $i, which selects the stack slot read by the h+=X[i]+K[i] step.
sub body_00_15 () {
    (
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov ($a,$a1)',
	'&mov ($a4,$f)',

	'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor ($a0,$e)',
	'&xor ($a4,$g)',			# f^g

	'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor ($a1,$a)',
	'&and ($a4,$e)',			# (f^g)&e

	'&xor ($a0,$e)',
	'&add ($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov ($a2,$a)',

	'&xor ($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor ($a2,$b)',			# a^b, b^c in next round

	'&add ($h,$a4)',			# h+=Ch(e,f,g)
	'&ror ($a0,$Sigma1[0])',		# Sigma1(e)
	'&and ($a3,$a2)',			# (b^c)&(a^b)

	'&xor ($a1,$a)',
	'&add ($h,$a0)',			# h+=Sigma1(e)
	'&xor ($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&ror ($a1,$Sigma0[0])',		# Sigma0(a)
	'&add ($d,$h)',				# d+=h
	'&add ($h,$a3)',			# h+=Maj(a,b,c)

	'&mov ($a0,$d)',
	'&add ($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}

######################################################################
# SSSE3 code path
#
if ($SZ==4) {	# SHA256 only
# @X holds the 16-word message window as four vectors (%xmm0-3); $t0..$t3
# are scratch.  $t4/$t5 are referenced only by Xupdate_256_SSSE3 and by
# commented-out instructions.  %xmm6-%xmm9 ($t2..$t5) are saved on Win64.
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

# Prologue: push the callee-saved general registers, turn the block count
# in %rdx into an end pointer, carve out a 64-byte-aligned frame and stash
# ctx, inp, the end pointer and the original %rsp in it.
$code.=<<___;
.type	${func}_ssse3,\@function,3
.align	64
${func}_ssse3:
.cfi_startproc
.Lssse3_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
# Win64 only: save %xmm6-%xmm9 in the frame, above the 16*$SZ-byte area.
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
# Load the eight state words from the context into $A..$H.
$code.=<<___;
.Lprologue_ssse3:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___

# Per-block setup: load 64 input bytes into @X, byte-shuffle them with the
# mask stored after the round constants, add the first four table rows and
# park the sums at 0x00..0x30(%rsp), where the round code picks them up.
$code.=<<___;
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_ssse3
.align	16
.Lloop_ssse3:
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	pshufb	$t3,@X[0]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	pshufb	$t3,@X[1]
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	pshufb	$t3,@X[2]
	paddd	@X[0],$t0
	movdqa	0x40($Tbl),$t2
	pshufb	$t3,@X[3]
	movdqa	0x60($Tbl),$t3
	paddd	@X[1],$t1
	paddd	@X[2],$t2
	paddd	@X[3],$t3
	movdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	movdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	movdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	movdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lssse3_00_47

.align	16
.Lssse3_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
# Straight-line form of the message-schedule update for one vector, as a
# list of code strings.  It is consumed only by the disabled `if (0)`
# branch of SSSE3_256_00_47; the hand-scheduled branch there follows the
# same steps but uses pshufd+psrldq / pshufd+pslldq in place of the two
# pshufb steps.
sub Xupdate_256_SSSE3 () {
	(
	'&movdqa ($t0,@X[1]);',
	'&movdqa ($t3,@X[3])',
	'&palignr ($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr ($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa ($t1,$t0)',
	'&movdqa ($t2,$t0);',
	'&psrld ($t0,$sigma0[2])',
	'&paddd (@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld ($t2,$sigma0[0])',
	'&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld ($t1,8*$SZ-$sigma0[1]);'.
	'&pxor ($t0,$t2)',
	'&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
	'&pxor ($t0,$t1)',
	'&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor ($t0,$t2);',
	'&movdqa ($t2,$t3)',
	'&pxor ($t0,$t1);',		# sigma0(X[1..4])
	'&psrld ($t3,$sigma1[2])',
	'&paddd (@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq ($t2,$sigma1[0])',
	'&pxor ($t3,$t2);',
	'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
	'&pxor ($t3,$t2)',
	'&pshufb ($t3,$t4)',		# sigma1(X[14..15])
	'&paddd (@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa ($t2,$t3);',
	'&psrld ($t3,$sigma1[2])',
	'&psrlq ($t2,$sigma1[0])',
	'&pxor ($t3,$t2);',
	'&psrlq ($t2,$sigma1[1]-$sigma1[0])',
	'&pxor ($t3,$t2);',
	'&movdqa ($t2,16*2*$j."($Tbl)")',
	'&pshufb ($t3,$t5)',
	'&paddd (@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
	);
}

# Emits four rounds interleaved with the schedule update of one vector.
#   $j    - index of the vector within the current 16-round pass; selects
#           table row 16*2*$j($Tbl) and stack slot 16*$j(%rsp)
#   $body - code ref returning one round as a list of code strings
#   @X    - the four schedule vectors, the one being updated first
# Because of the empty prototype, callers invoke it as
# &SSSE3_256_00_47(...), which bypasses prototype checking.
sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    if (0) {
	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	# Hand-scheduled counterpart of Xupdate_256_SSSE3: each vector
	# instruction is emitted directly, with round instructions popped
	# from @insns in between.
	  eval(shift(@insns));	#@
	&movdqa		($t0,@X[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t3,@X[3]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t1,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,$t0);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t2,$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	&pxor		($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	#&pshufb	($t3,$t4);		# sigma1(X[14..15])
	&pshufd		($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrldq		($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&movdqa		($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	#&pshufb	($t3,$t5);
	&pshufd		($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&pslldq		($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
    }
	# $t2 was loaded from table row $j above; add the updated vector,
	# flush whatever round code is still queued, then store the sum in
	# stack slot $j for the round code to read later.
	&paddd		($t2,@X[0]);
	  foreach (@insns) { eval; }		# remaining instructions
	&movdqa		(16*$j."(%rsp)",$t2);
}

    # One pass of the .Lssse3_00_47 loop body: four calls, i.e. 16 rounds,
    # each call updating the vector at the front of @X before @X is rotated.
    for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	# Repeat the pass while the byte tested here is non-zero.
	# NOTE(review): presumably this detects the end of the round
	# constants in the table -- confirm against the table layout.
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

    # Final 16 rounds with no schedule update; $i is advanced by the
    # strings body_00_15 returns, not by the loop header.
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
# Add the working variables into the context, advance $inp by one block,
# loop while $inp is below the saved end pointer, then fetch the saved
# copy of %rsp into %rsi for the epilogue.
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_ssse3

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
# Win64 only: restore %xmm6-%xmm9 from the frame slots used in the prologue.
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
# Epilogue: reload the pushed general registers relative to the original
# %rsp (now in %rsi), restore %rsp and return.
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	${func}_ssse3,.-${func}_ssse3
___
}

if ($avx) {{
######################################################################
# XOP code path
#
if ($SZ==8) {	# SHA512 only
$code.=<<___;
.type	${func}_xop,\@function,3
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
1169e1051a39Sopenharmony_ci mov $ctx,$_ctx # save ctx, 1st arg 1170e1051a39Sopenharmony_ci mov $inp,$_inp # save inp, 2nd arh 1171e1051a39Sopenharmony_ci mov %rdx,$_end # save end pointer, "3rd" arg 1172e1051a39Sopenharmony_ci mov %rax,$_rsp # save copy of %rsp 1173e1051a39Sopenharmony_ci.cfi_cfa_expression $_rsp,deref,+8 1174e1051a39Sopenharmony_ci___ 1175e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 1176e1051a39Sopenharmony_ci movaps %xmm6,16*$SZ+32(%rsp) 1177e1051a39Sopenharmony_ci movaps %xmm7,16*$SZ+48(%rsp) 1178e1051a39Sopenharmony_ci movaps %xmm8,16*$SZ+64(%rsp) 1179e1051a39Sopenharmony_ci movaps %xmm9,16*$SZ+80(%rsp) 1180e1051a39Sopenharmony_ci___ 1181e1051a39Sopenharmony_ci$code.=<<___ if ($win64 && $SZ>4); 1182e1051a39Sopenharmony_ci movaps %xmm10,16*$SZ+96(%rsp) 1183e1051a39Sopenharmony_ci movaps %xmm11,16*$SZ+112(%rsp) 1184e1051a39Sopenharmony_ci___ 1185e1051a39Sopenharmony_ci$code.=<<___; 1186e1051a39Sopenharmony_ci.Lprologue_xop: 1187e1051a39Sopenharmony_ci 1188e1051a39Sopenharmony_ci vzeroupper 1189e1051a39Sopenharmony_ci mov $SZ*0($ctx),$A 1190e1051a39Sopenharmony_ci mov $SZ*1($ctx),$B 1191e1051a39Sopenharmony_ci mov $SZ*2($ctx),$C 1192e1051a39Sopenharmony_ci mov $SZ*3($ctx),$D 1193e1051a39Sopenharmony_ci mov $SZ*4($ctx),$E 1194e1051a39Sopenharmony_ci mov $SZ*5($ctx),$F 1195e1051a39Sopenharmony_ci mov $SZ*6($ctx),$G 1196e1051a39Sopenharmony_ci mov $SZ*7($ctx),$H 1197e1051a39Sopenharmony_ci jmp .Lloop_xop 1198e1051a39Sopenharmony_ci___ 1199e1051a39Sopenharmony_ci if ($SZ==4) { # SHA256 1200e1051a39Sopenharmony_ci my @X = map("%xmm$_",(0..3)); 1201e1051a39Sopenharmony_ci my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1202e1051a39Sopenharmony_ci 1203e1051a39Sopenharmony_ci$code.=<<___; 1204e1051a39Sopenharmony_ci.align 16 1205e1051a39Sopenharmony_ci.Lloop_xop: 1206e1051a39Sopenharmony_ci vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1207e1051a39Sopenharmony_ci vmovdqu 0x00($inp),@X[0] 1208e1051a39Sopenharmony_ci vmovdqu 0x10($inp),@X[1] 
1209e1051a39Sopenharmony_ci vmovdqu 0x20($inp),@X[2] 1210e1051a39Sopenharmony_ci vmovdqu 0x30($inp),@X[3] 1211e1051a39Sopenharmony_ci vpshufb $t3,@X[0],@X[0] 1212e1051a39Sopenharmony_ci lea $TABLE(%rip),$Tbl 1213e1051a39Sopenharmony_ci vpshufb $t3,@X[1],@X[1] 1214e1051a39Sopenharmony_ci vpshufb $t3,@X[2],@X[2] 1215e1051a39Sopenharmony_ci vpaddd 0x00($Tbl),@X[0],$t0 1216e1051a39Sopenharmony_ci vpshufb $t3,@X[3],@X[3] 1217e1051a39Sopenharmony_ci vpaddd 0x20($Tbl),@X[1],$t1 1218e1051a39Sopenharmony_ci vpaddd 0x40($Tbl),@X[2],$t2 1219e1051a39Sopenharmony_ci vpaddd 0x60($Tbl),@X[3],$t3 1220e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 1221e1051a39Sopenharmony_ci mov $A,$a1 1222e1051a39Sopenharmony_ci vmovdqa $t1,0x10(%rsp) 1223e1051a39Sopenharmony_ci mov $B,$a3 1224e1051a39Sopenharmony_ci vmovdqa $t2,0x20(%rsp) 1225e1051a39Sopenharmony_ci xor $C,$a3 # magic 1226e1051a39Sopenharmony_ci vmovdqa $t3,0x30(%rsp) 1227e1051a39Sopenharmony_ci mov $E,$a0 1228e1051a39Sopenharmony_ci jmp .Lxop_00_47 1229e1051a39Sopenharmony_ci 1230e1051a39Sopenharmony_ci.align 16 1231e1051a39Sopenharmony_ci.Lxop_00_47: 1232e1051a39Sopenharmony_ci sub \$`-16*2*$SZ`,$Tbl # size optimization 1233e1051a39Sopenharmony_ci___ 1234e1051a39Sopenharmony_cisub XOP_256_00_47 () { 1235e1051a39Sopenharmony_cimy $j = shift; 1236e1051a39Sopenharmony_cimy $body = shift; 1237e1051a39Sopenharmony_cimy @X = @_; 1238e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1239e1051a39Sopenharmony_ci 1240e1051a39Sopenharmony_ci &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1241e1051a39Sopenharmony_ci eval(shift(@insns)); 1242e1051a39Sopenharmony_ci eval(shift(@insns)); 1243e1051a39Sopenharmony_ci &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1244e1051a39Sopenharmony_ci eval(shift(@insns)); 1245e1051a39Sopenharmony_ci eval(shift(@insns)); 1246e1051a39Sopenharmony_ci &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1247e1051a39Sopenharmony_ci eval(shift(@insns)); 1248e1051a39Sopenharmony_ci 
eval(shift(@insns)); 1249e1051a39Sopenharmony_ci &vpsrld ($t0,$t0,$sigma0[2]); 1250e1051a39Sopenharmony_ci eval(shift(@insns)); 1251e1051a39Sopenharmony_ci eval(shift(@insns)); 1252e1051a39Sopenharmony_ci &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1253e1051a39Sopenharmony_ci eval(shift(@insns)); 1254e1051a39Sopenharmony_ci eval(shift(@insns)); 1255e1051a39Sopenharmony_ci eval(shift(@insns)); 1256e1051a39Sopenharmony_ci eval(shift(@insns)); 1257e1051a39Sopenharmony_ci &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1258e1051a39Sopenharmony_ci eval(shift(@insns)); 1259e1051a39Sopenharmony_ci eval(shift(@insns)); 1260e1051a39Sopenharmony_ci &vpxor ($t0,$t0,$t1); 1261e1051a39Sopenharmony_ci eval(shift(@insns)); 1262e1051a39Sopenharmony_ci eval(shift(@insns)); 1263e1051a39Sopenharmony_ci eval(shift(@insns)); 1264e1051a39Sopenharmony_ci eval(shift(@insns)); 1265e1051a39Sopenharmony_ci &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1266e1051a39Sopenharmony_ci eval(shift(@insns)); 1267e1051a39Sopenharmony_ci eval(shift(@insns)); 1268e1051a39Sopenharmony_ci &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1269e1051a39Sopenharmony_ci eval(shift(@insns)); 1270e1051a39Sopenharmony_ci eval(shift(@insns)); 1271e1051a39Sopenharmony_ci &vpsrld ($t2,@X[3],$sigma1[2]); 1272e1051a39Sopenharmony_ci eval(shift(@insns)); 1273e1051a39Sopenharmony_ci eval(shift(@insns)); 1274e1051a39Sopenharmony_ci &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1275e1051a39Sopenharmony_ci eval(shift(@insns)); 1276e1051a39Sopenharmony_ci eval(shift(@insns)); 1277e1051a39Sopenharmony_ci &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1278e1051a39Sopenharmony_ci eval(shift(@insns)); 1279e1051a39Sopenharmony_ci eval(shift(@insns)); 1280e1051a39Sopenharmony_ci &vpxor ($t3,$t3,$t2); 1281e1051a39Sopenharmony_ci eval(shift(@insns)); 1282e1051a39Sopenharmony_ci eval(shift(@insns)); 1283e1051a39Sopenharmony_ci eval(shift(@insns)); 1284e1051a39Sopenharmony_ci eval(shift(@insns)); 1285e1051a39Sopenharmony_ci &vpxor ($t3,$t3,$t1); # 
sigma1(X[14..15]) 1286e1051a39Sopenharmony_ci eval(shift(@insns)); 1287e1051a39Sopenharmony_ci eval(shift(@insns)); 1288e1051a39Sopenharmony_ci eval(shift(@insns)); 1289e1051a39Sopenharmony_ci eval(shift(@insns)); 1290e1051a39Sopenharmony_ci &vpsrldq ($t3,$t3,8); 1291e1051a39Sopenharmony_ci eval(shift(@insns)); 1292e1051a39Sopenharmony_ci eval(shift(@insns)); 1293e1051a39Sopenharmony_ci eval(shift(@insns)); 1294e1051a39Sopenharmony_ci eval(shift(@insns)); 1295e1051a39Sopenharmony_ci &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1296e1051a39Sopenharmony_ci eval(shift(@insns)); 1297e1051a39Sopenharmony_ci eval(shift(@insns)); 1298e1051a39Sopenharmony_ci eval(shift(@insns)); 1299e1051a39Sopenharmony_ci eval(shift(@insns)); 1300e1051a39Sopenharmony_ci &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1301e1051a39Sopenharmony_ci eval(shift(@insns)); 1302e1051a39Sopenharmony_ci eval(shift(@insns)); 1303e1051a39Sopenharmony_ci &vpsrld ($t2,@X[0],$sigma1[2]); 1304e1051a39Sopenharmony_ci eval(shift(@insns)); 1305e1051a39Sopenharmony_ci eval(shift(@insns)); 1306e1051a39Sopenharmony_ci &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1307e1051a39Sopenharmony_ci eval(shift(@insns)); 1308e1051a39Sopenharmony_ci eval(shift(@insns)); 1309e1051a39Sopenharmony_ci &vpxor ($t3,$t3,$t2); 1310e1051a39Sopenharmony_ci eval(shift(@insns)); 1311e1051a39Sopenharmony_ci eval(shift(@insns)); 1312e1051a39Sopenharmony_ci eval(shift(@insns)); 1313e1051a39Sopenharmony_ci eval(shift(@insns)); 1314e1051a39Sopenharmony_ci &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1315e1051a39Sopenharmony_ci eval(shift(@insns)); 1316e1051a39Sopenharmony_ci eval(shift(@insns)); 1317e1051a39Sopenharmony_ci eval(shift(@insns)); 1318e1051a39Sopenharmony_ci eval(shift(@insns)); 1319e1051a39Sopenharmony_ci &vpslldq ($t3,$t3,8); # 22 instructions 1320e1051a39Sopenharmony_ci eval(shift(@insns)); 1321e1051a39Sopenharmony_ci eval(shift(@insns)); 1322e1051a39Sopenharmony_ci eval(shift(@insns)); 1323e1051a39Sopenharmony_ci 
eval(shift(@insns)); 1324e1051a39Sopenharmony_ci &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1325e1051a39Sopenharmony_ci eval(shift(@insns)); 1326e1051a39Sopenharmony_ci eval(shift(@insns)); 1327e1051a39Sopenharmony_ci eval(shift(@insns)); 1328e1051a39Sopenharmony_ci eval(shift(@insns)); 1329e1051a39Sopenharmony_ci &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1330e1051a39Sopenharmony_ci foreach (@insns) { eval; } # remaining instructions 1331e1051a39Sopenharmony_ci &vmovdqa (16*$j."(%rsp)",$t2); 1332e1051a39Sopenharmony_ci} 1333e1051a39Sopenharmony_ci 1334e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<4; $j++) { 1335e1051a39Sopenharmony_ci &XOP_256_00_47($j,\&body_00_15,@X); 1336e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 1337e1051a39Sopenharmony_ci } 1338e1051a39Sopenharmony_ci &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1339e1051a39Sopenharmony_ci &jne (".Lxop_00_47"); 1340e1051a39Sopenharmony_ci 1341e1051a39Sopenharmony_ci for ($i=0; $i<16; ) { 1342e1051a39Sopenharmony_ci foreach(body_00_15()) { eval; } 1343e1051a39Sopenharmony_ci } 1344e1051a39Sopenharmony_ci 1345e1051a39Sopenharmony_ci } else { # SHA512 1346e1051a39Sopenharmony_ci my @X = map("%xmm$_",(0..7)); 1347e1051a39Sopenharmony_ci my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1348e1051a39Sopenharmony_ci 1349e1051a39Sopenharmony_ci$code.=<<___; 1350e1051a39Sopenharmony_ci.align 16 1351e1051a39Sopenharmony_ci.Lloop_xop: 1352e1051a39Sopenharmony_ci vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1353e1051a39Sopenharmony_ci vmovdqu 0x00($inp),@X[0] 1354e1051a39Sopenharmony_ci lea $TABLE+0x80(%rip),$Tbl # size optimization 1355e1051a39Sopenharmony_ci vmovdqu 0x10($inp),@X[1] 1356e1051a39Sopenharmony_ci vmovdqu 0x20($inp),@X[2] 1357e1051a39Sopenharmony_ci vpshufb $t3,@X[0],@X[0] 1358e1051a39Sopenharmony_ci vmovdqu 0x30($inp),@X[3] 1359e1051a39Sopenharmony_ci vpshufb $t3,@X[1],@X[1] 1360e1051a39Sopenharmony_ci vmovdqu 0x40($inp),@X[4] 1361e1051a39Sopenharmony_ci vpshufb $t3,@X[2],@X[2] 
1362e1051a39Sopenharmony_ci vmovdqu 0x50($inp),@X[5] 1363e1051a39Sopenharmony_ci vpshufb $t3,@X[3],@X[3] 1364e1051a39Sopenharmony_ci vmovdqu 0x60($inp),@X[6] 1365e1051a39Sopenharmony_ci vpshufb $t3,@X[4],@X[4] 1366e1051a39Sopenharmony_ci vmovdqu 0x70($inp),@X[7] 1367e1051a39Sopenharmony_ci vpshufb $t3,@X[5],@X[5] 1368e1051a39Sopenharmony_ci vpaddq -0x80($Tbl),@X[0],$t0 1369e1051a39Sopenharmony_ci vpshufb $t3,@X[6],@X[6] 1370e1051a39Sopenharmony_ci vpaddq -0x60($Tbl),@X[1],$t1 1371e1051a39Sopenharmony_ci vpshufb $t3,@X[7],@X[7] 1372e1051a39Sopenharmony_ci vpaddq -0x40($Tbl),@X[2],$t2 1373e1051a39Sopenharmony_ci vpaddq -0x20($Tbl),@X[3],$t3 1374e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 1375e1051a39Sopenharmony_ci vpaddq 0x00($Tbl),@X[4],$t0 1376e1051a39Sopenharmony_ci vmovdqa $t1,0x10(%rsp) 1377e1051a39Sopenharmony_ci vpaddq 0x20($Tbl),@X[5],$t1 1378e1051a39Sopenharmony_ci vmovdqa $t2,0x20(%rsp) 1379e1051a39Sopenharmony_ci vpaddq 0x40($Tbl),@X[6],$t2 1380e1051a39Sopenharmony_ci vmovdqa $t3,0x30(%rsp) 1381e1051a39Sopenharmony_ci vpaddq 0x60($Tbl),@X[7],$t3 1382e1051a39Sopenharmony_ci vmovdqa $t0,0x40(%rsp) 1383e1051a39Sopenharmony_ci mov $A,$a1 1384e1051a39Sopenharmony_ci vmovdqa $t1,0x50(%rsp) 1385e1051a39Sopenharmony_ci mov $B,$a3 1386e1051a39Sopenharmony_ci vmovdqa $t2,0x60(%rsp) 1387e1051a39Sopenharmony_ci xor $C,$a3 # magic 1388e1051a39Sopenharmony_ci vmovdqa $t3,0x70(%rsp) 1389e1051a39Sopenharmony_ci mov $E,$a0 1390e1051a39Sopenharmony_ci jmp .Lxop_00_47 1391e1051a39Sopenharmony_ci 1392e1051a39Sopenharmony_ci.align 16 1393e1051a39Sopenharmony_ci.Lxop_00_47: 1394e1051a39Sopenharmony_ci add \$`16*2*$SZ`,$Tbl 1395e1051a39Sopenharmony_ci___ 1396e1051a39Sopenharmony_cisub XOP_512_00_47 () { 1397e1051a39Sopenharmony_cimy $j = shift; 1398e1051a39Sopenharmony_cimy $body = shift; 1399e1051a39Sopenharmony_cimy @X = @_; 1400e1051a39Sopenharmony_cimy @insns = (&$body,&$body); # 52 instructions 1401e1051a39Sopenharmony_ci 1402e1051a39Sopenharmony_ci &vpalignr 
($t0,@X[1],@X[0],$SZ); # X[1..2] 1403e1051a39Sopenharmony_ci eval(shift(@insns)); 1404e1051a39Sopenharmony_ci eval(shift(@insns)); 1405e1051a39Sopenharmony_ci &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1406e1051a39Sopenharmony_ci eval(shift(@insns)); 1407e1051a39Sopenharmony_ci eval(shift(@insns)); 1408e1051a39Sopenharmony_ci &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1409e1051a39Sopenharmony_ci eval(shift(@insns)); 1410e1051a39Sopenharmony_ci eval(shift(@insns)); 1411e1051a39Sopenharmony_ci &vpsrlq ($t0,$t0,$sigma0[2]); 1412e1051a39Sopenharmony_ci eval(shift(@insns)); 1413e1051a39Sopenharmony_ci eval(shift(@insns)); 1414e1051a39Sopenharmony_ci &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1415e1051a39Sopenharmony_ci eval(shift(@insns)); 1416e1051a39Sopenharmony_ci eval(shift(@insns)); 1417e1051a39Sopenharmony_ci eval(shift(@insns)); 1418e1051a39Sopenharmony_ci eval(shift(@insns)); 1419e1051a39Sopenharmony_ci &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1420e1051a39Sopenharmony_ci eval(shift(@insns)); 1421e1051a39Sopenharmony_ci eval(shift(@insns)); 1422e1051a39Sopenharmony_ci &vpxor ($t0,$t0,$t1); 1423e1051a39Sopenharmony_ci eval(shift(@insns)); 1424e1051a39Sopenharmony_ci eval(shift(@insns)); 1425e1051a39Sopenharmony_ci eval(shift(@insns)); 1426e1051a39Sopenharmony_ci eval(shift(@insns)); 1427e1051a39Sopenharmony_ci &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1428e1051a39Sopenharmony_ci eval(shift(@insns)); 1429e1051a39Sopenharmony_ci eval(shift(@insns)); 1430e1051a39Sopenharmony_ci &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1431e1051a39Sopenharmony_ci eval(shift(@insns)); 1432e1051a39Sopenharmony_ci eval(shift(@insns)); 1433e1051a39Sopenharmony_ci &vpsrlq ($t2,@X[7],$sigma1[2]); 1434e1051a39Sopenharmony_ci eval(shift(@insns)); 1435e1051a39Sopenharmony_ci eval(shift(@insns)); 1436e1051a39Sopenharmony_ci &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1437e1051a39Sopenharmony_ci eval(shift(@insns)); 1438e1051a39Sopenharmony_ci eval(shift(@insns)); 
1439e1051a39Sopenharmony_ci &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1440e1051a39Sopenharmony_ci eval(shift(@insns)); 1441e1051a39Sopenharmony_ci eval(shift(@insns)); 1442e1051a39Sopenharmony_ci &vpxor ($t3,$t3,$t2); 1443e1051a39Sopenharmony_ci eval(shift(@insns)); 1444e1051a39Sopenharmony_ci eval(shift(@insns)); 1445e1051a39Sopenharmony_ci eval(shift(@insns)); 1446e1051a39Sopenharmony_ci eval(shift(@insns)); 1447e1051a39Sopenharmony_ci &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1448e1051a39Sopenharmony_ci eval(shift(@insns)); 1449e1051a39Sopenharmony_ci eval(shift(@insns)); 1450e1051a39Sopenharmony_ci eval(shift(@insns)); 1451e1051a39Sopenharmony_ci eval(shift(@insns)); 1452e1051a39Sopenharmony_ci &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1453e1051a39Sopenharmony_ci eval(shift(@insns)); 1454e1051a39Sopenharmony_ci eval(shift(@insns)); 1455e1051a39Sopenharmony_ci eval(shift(@insns)); 1456e1051a39Sopenharmony_ci eval(shift(@insns)); 1457e1051a39Sopenharmony_ci &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1458e1051a39Sopenharmony_ci foreach (@insns) { eval; } # remaining instructions 1459e1051a39Sopenharmony_ci &vmovdqa (16*$j."(%rsp)",$t2); 1460e1051a39Sopenharmony_ci} 1461e1051a39Sopenharmony_ci 1462e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<8; $j++) { 1463e1051a39Sopenharmony_ci &XOP_512_00_47($j,\&body_00_15,@X); 1464e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 1465e1051a39Sopenharmony_ci } 1466e1051a39Sopenharmony_ci &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1467e1051a39Sopenharmony_ci &jne (".Lxop_00_47"); 1468e1051a39Sopenharmony_ci 1469e1051a39Sopenharmony_ci for ($i=0; $i<16; ) { 1470e1051a39Sopenharmony_ci foreach(body_00_15()) { eval; } 1471e1051a39Sopenharmony_ci } 1472e1051a39Sopenharmony_ci} 1473e1051a39Sopenharmony_ci$code.=<<___; 1474e1051a39Sopenharmony_ci mov $_ctx,$ctx 1475e1051a39Sopenharmony_ci mov $a1,$A 1476e1051a39Sopenharmony_ci 1477e1051a39Sopenharmony_ci add $SZ*0($ctx),$A 1478e1051a39Sopenharmony_ci lea 
16*$SZ($inp),$inp 1479e1051a39Sopenharmony_ci add $SZ*1($ctx),$B 1480e1051a39Sopenharmony_ci add $SZ*2($ctx),$C 1481e1051a39Sopenharmony_ci add $SZ*3($ctx),$D 1482e1051a39Sopenharmony_ci add $SZ*4($ctx),$E 1483e1051a39Sopenharmony_ci add $SZ*5($ctx),$F 1484e1051a39Sopenharmony_ci add $SZ*6($ctx),$G 1485e1051a39Sopenharmony_ci add $SZ*7($ctx),$H 1486e1051a39Sopenharmony_ci 1487e1051a39Sopenharmony_ci cmp $_end,$inp 1488e1051a39Sopenharmony_ci 1489e1051a39Sopenharmony_ci mov $A,$SZ*0($ctx) 1490e1051a39Sopenharmony_ci mov $B,$SZ*1($ctx) 1491e1051a39Sopenharmony_ci mov $C,$SZ*2($ctx) 1492e1051a39Sopenharmony_ci mov $D,$SZ*3($ctx) 1493e1051a39Sopenharmony_ci mov $E,$SZ*4($ctx) 1494e1051a39Sopenharmony_ci mov $F,$SZ*5($ctx) 1495e1051a39Sopenharmony_ci mov $G,$SZ*6($ctx) 1496e1051a39Sopenharmony_ci mov $H,$SZ*7($ctx) 1497e1051a39Sopenharmony_ci jb .Lloop_xop 1498e1051a39Sopenharmony_ci 1499e1051a39Sopenharmony_ci mov $_rsp,%rsi 1500e1051a39Sopenharmony_ci.cfi_def_cfa %rsi,8 1501e1051a39Sopenharmony_ci vzeroupper 1502e1051a39Sopenharmony_ci___ 1503e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 1504e1051a39Sopenharmony_ci movaps 16*$SZ+32(%rsp),%xmm6 1505e1051a39Sopenharmony_ci movaps 16*$SZ+48(%rsp),%xmm7 1506e1051a39Sopenharmony_ci movaps 16*$SZ+64(%rsp),%xmm8 1507e1051a39Sopenharmony_ci movaps 16*$SZ+80(%rsp),%xmm9 1508e1051a39Sopenharmony_ci___ 1509e1051a39Sopenharmony_ci$code.=<<___ if ($win64 && $SZ>4); 1510e1051a39Sopenharmony_ci movaps 16*$SZ+96(%rsp),%xmm10 1511e1051a39Sopenharmony_ci movaps 16*$SZ+112(%rsp),%xmm11 1512e1051a39Sopenharmony_ci___ 1513e1051a39Sopenharmony_ci$code.=<<___; 1514e1051a39Sopenharmony_ci mov -48(%rsi),%r15 1515e1051a39Sopenharmony_ci.cfi_restore %r15 1516e1051a39Sopenharmony_ci mov -40(%rsi),%r14 1517e1051a39Sopenharmony_ci.cfi_restore %r14 1518e1051a39Sopenharmony_ci mov -32(%rsi),%r13 1519e1051a39Sopenharmony_ci.cfi_restore %r13 1520e1051a39Sopenharmony_ci mov -24(%rsi),%r12 1521e1051a39Sopenharmony_ci.cfi_restore %r12 
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
}
######################################################################
# AVX+shrd code path
#
# Override the ror thunk: &ror($reg,$n) is forwarded to &shrd with the
# first operand repeated, i.e. it is emitted as shrd($reg,$reg,$n).
# `local` keeps the override in effect only for the dynamic scope of the
# enclosing block.
local *ror = sub { &shrd(@_[0],@_) };

# Function entry: keep the caller's %rsp in %rax, push the six callee-saved
# general registers, turn the block count in %rdx into an end pointer,
# carve out a 64-byte aligned frame and stash ctx/inp/end/%rsp in it.
$code.=<<___;
.type	${func}_avx,\@function,3
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
# Win64 only: store %xmm6-%xmm9 in the frame; the matching reloads are in
# the epilogue of this function
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
# the SHA512 flavour ($SZ>4) uses %xmm8-%xmm11 as temporaries, so
# %xmm10/%xmm11 are stored as well
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
# load the eight state words from the context into the working registers
$code.=<<___;
.Lprologue_avx:

	vzeroupper
	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
					if ($SZ==4) {	# SHA256
    # schedule registers and temporaries; $t4/$t5 receive the two vectors
    # stored 32 and 64 bytes past $TABLE+$SZ*2*$rounds and are consumed by
    # the vpshufb instructions in Xupdate_256_AVX
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1] 1605e1051a39Sopenharmony_ci vmovdqu 0x20($inp),@X[2] 1606e1051a39Sopenharmony_ci vmovdqu 0x30($inp),@X[3] 1607e1051a39Sopenharmony_ci vpshufb $t3,@X[0],@X[0] 1608e1051a39Sopenharmony_ci lea $TABLE(%rip),$Tbl 1609e1051a39Sopenharmony_ci vpshufb $t3,@X[1],@X[1] 1610e1051a39Sopenharmony_ci vpshufb $t3,@X[2],@X[2] 1611e1051a39Sopenharmony_ci vpaddd 0x00($Tbl),@X[0],$t0 1612e1051a39Sopenharmony_ci vpshufb $t3,@X[3],@X[3] 1613e1051a39Sopenharmony_ci vpaddd 0x20($Tbl),@X[1],$t1 1614e1051a39Sopenharmony_ci vpaddd 0x40($Tbl),@X[2],$t2 1615e1051a39Sopenharmony_ci vpaddd 0x60($Tbl),@X[3],$t3 1616e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 1617e1051a39Sopenharmony_ci mov $A,$a1 1618e1051a39Sopenharmony_ci vmovdqa $t1,0x10(%rsp) 1619e1051a39Sopenharmony_ci mov $B,$a3 1620e1051a39Sopenharmony_ci vmovdqa $t2,0x20(%rsp) 1621e1051a39Sopenharmony_ci xor $C,$a3 # magic 1622e1051a39Sopenharmony_ci vmovdqa $t3,0x30(%rsp) 1623e1051a39Sopenharmony_ci mov $E,$a0 1624e1051a39Sopenharmony_ci jmp .Lavx_00_47 1625e1051a39Sopenharmony_ci 1626e1051a39Sopenharmony_ci.align 16 1627e1051a39Sopenharmony_ci.Lavx_00_47: 1628e1051a39Sopenharmony_ci sub \$`-16*2*$SZ`,$Tbl # size optimization 1629e1051a39Sopenharmony_ci___ 1630e1051a39Sopenharmony_cisub Xupdate_256_AVX () { 1631e1051a39Sopenharmony_ci ( 1632e1051a39Sopenharmony_ci '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1633e1051a39Sopenharmony_ci '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1634e1051a39Sopenharmony_ci '&vpsrld ($t2,$t0,$sigma0[0]);', 1635e1051a39Sopenharmony_ci '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1636e1051a39Sopenharmony_ci '&vpsrld ($t3,$t0,$sigma0[2])', 1637e1051a39Sopenharmony_ci '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1638e1051a39Sopenharmony_ci '&vpxor ($t0,$t3,$t2)', 1639e1051a39Sopenharmony_ci '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1640e1051a39Sopenharmony_ci '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1641e1051a39Sopenharmony_ci '&vpxor ($t0,$t0,$t1)', 
1642e1051a39Sopenharmony_ci '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1643e1051a39Sopenharmony_ci '&vpxor ($t0,$t0,$t2)', 1644e1051a39Sopenharmony_ci '&vpsrld ($t2,$t3,$sigma1[2]);', 1645e1051a39Sopenharmony_ci '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1646e1051a39Sopenharmony_ci '&vpsrlq ($t3,$t3,$sigma1[0]);', 1647e1051a39Sopenharmony_ci '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1648e1051a39Sopenharmony_ci '&vpxor ($t2,$t2,$t3);', 1649e1051a39Sopenharmony_ci '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1650e1051a39Sopenharmony_ci '&vpxor ($t2,$t2,$t3)', 1651e1051a39Sopenharmony_ci '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1652e1051a39Sopenharmony_ci '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1653e1051a39Sopenharmony_ci '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1654e1051a39Sopenharmony_ci '&vpsrld ($t2,$t3,$sigma1[2])', 1655e1051a39Sopenharmony_ci '&vpsrlq ($t3,$t3,$sigma1[0])', 1656e1051a39Sopenharmony_ci '&vpxor ($t2,$t2,$t3);', 1657e1051a39Sopenharmony_ci '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1658e1051a39Sopenharmony_ci '&vpxor ($t2,$t2,$t3)', 1659e1051a39Sopenharmony_ci '&vpshufb ($t2,$t2,$t5)', 1660e1051a39Sopenharmony_ci '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1661e1051a39Sopenharmony_ci ); 1662e1051a39Sopenharmony_ci} 1663e1051a39Sopenharmony_ci 1664e1051a39Sopenharmony_cisub AVX_256_00_47 () { 1665e1051a39Sopenharmony_cimy $j = shift; 1666e1051a39Sopenharmony_cimy $body = shift; 1667e1051a39Sopenharmony_cimy @X = @_; 1668e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1669e1051a39Sopenharmony_ci 1670e1051a39Sopenharmony_ci foreach (Xupdate_256_AVX()) { # 29 instructions 1671e1051a39Sopenharmony_ci eval; 1672e1051a39Sopenharmony_ci eval(shift(@insns)); 1673e1051a39Sopenharmony_ci eval(shift(@insns)); 1674e1051a39Sopenharmony_ci eval(shift(@insns)); 1675e1051a39Sopenharmony_ci } 1676e1051a39Sopenharmony_ci &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 
1677e1051a39Sopenharmony_ci foreach (@insns) { eval; } # remaining instructions 1678e1051a39Sopenharmony_ci &vmovdqa (16*$j."(%rsp)",$t2); 1679e1051a39Sopenharmony_ci} 1680e1051a39Sopenharmony_ci 1681e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<4; $j++) { 1682e1051a39Sopenharmony_ci &AVX_256_00_47($j,\&body_00_15,@X); 1683e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 1684e1051a39Sopenharmony_ci } 1685e1051a39Sopenharmony_ci &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1686e1051a39Sopenharmony_ci &jne (".Lavx_00_47"); 1687e1051a39Sopenharmony_ci 1688e1051a39Sopenharmony_ci for ($i=0; $i<16; ) { 1689e1051a39Sopenharmony_ci foreach(body_00_15()) { eval; } 1690e1051a39Sopenharmony_ci } 1691e1051a39Sopenharmony_ci 1692e1051a39Sopenharmony_ci } else { # SHA512 1693e1051a39Sopenharmony_ci my @X = map("%xmm$_",(0..7)); 1694e1051a39Sopenharmony_ci my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1695e1051a39Sopenharmony_ci 1696e1051a39Sopenharmony_ci$code.=<<___; 1697e1051a39Sopenharmony_ci jmp .Lloop_avx 1698e1051a39Sopenharmony_ci.align 16 1699e1051a39Sopenharmony_ci.Lloop_avx: 1700e1051a39Sopenharmony_ci vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1701e1051a39Sopenharmony_ci vmovdqu 0x00($inp),@X[0] 1702e1051a39Sopenharmony_ci lea $TABLE+0x80(%rip),$Tbl # size optimization 1703e1051a39Sopenharmony_ci vmovdqu 0x10($inp),@X[1] 1704e1051a39Sopenharmony_ci vmovdqu 0x20($inp),@X[2] 1705e1051a39Sopenharmony_ci vpshufb $t3,@X[0],@X[0] 1706e1051a39Sopenharmony_ci vmovdqu 0x30($inp),@X[3] 1707e1051a39Sopenharmony_ci vpshufb $t3,@X[1],@X[1] 1708e1051a39Sopenharmony_ci vmovdqu 0x40($inp),@X[4] 1709e1051a39Sopenharmony_ci vpshufb $t3,@X[2],@X[2] 1710e1051a39Sopenharmony_ci vmovdqu 0x50($inp),@X[5] 1711e1051a39Sopenharmony_ci vpshufb $t3,@X[3],@X[3] 1712e1051a39Sopenharmony_ci vmovdqu 0x60($inp),@X[6] 1713e1051a39Sopenharmony_ci vpshufb $t3,@X[4],@X[4] 1714e1051a39Sopenharmony_ci vmovdqu 0x70($inp),@X[7] 1715e1051a39Sopenharmony_ci vpshufb $t3,@X[5],@X[5] 1716e1051a39Sopenharmony_ci vpaddq 
-0x80($Tbl),@X[0],$t0 1717e1051a39Sopenharmony_ci vpshufb $t3,@X[6],@X[6] 1718e1051a39Sopenharmony_ci vpaddq -0x60($Tbl),@X[1],$t1 1719e1051a39Sopenharmony_ci vpshufb $t3,@X[7],@X[7] 1720e1051a39Sopenharmony_ci vpaddq -0x40($Tbl),@X[2],$t2 1721e1051a39Sopenharmony_ci vpaddq -0x20($Tbl),@X[3],$t3 1722e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 1723e1051a39Sopenharmony_ci vpaddq 0x00($Tbl),@X[4],$t0 1724e1051a39Sopenharmony_ci vmovdqa $t1,0x10(%rsp) 1725e1051a39Sopenharmony_ci vpaddq 0x20($Tbl),@X[5],$t1 1726e1051a39Sopenharmony_ci vmovdqa $t2,0x20(%rsp) 1727e1051a39Sopenharmony_ci vpaddq 0x40($Tbl),@X[6],$t2 1728e1051a39Sopenharmony_ci vmovdqa $t3,0x30(%rsp) 1729e1051a39Sopenharmony_ci vpaddq 0x60($Tbl),@X[7],$t3 1730e1051a39Sopenharmony_ci vmovdqa $t0,0x40(%rsp) 1731e1051a39Sopenharmony_ci mov $A,$a1 1732e1051a39Sopenharmony_ci vmovdqa $t1,0x50(%rsp) 1733e1051a39Sopenharmony_ci mov $B,$a3 1734e1051a39Sopenharmony_ci vmovdqa $t2,0x60(%rsp) 1735e1051a39Sopenharmony_ci xor $C,$a3 # magic 1736e1051a39Sopenharmony_ci vmovdqa $t3,0x70(%rsp) 1737e1051a39Sopenharmony_ci mov $E,$a0 1738e1051a39Sopenharmony_ci jmp .Lavx_00_47 1739e1051a39Sopenharmony_ci 1740e1051a39Sopenharmony_ci.align 16 1741e1051a39Sopenharmony_ci.Lavx_00_47: 1742e1051a39Sopenharmony_ci add \$`16*2*$SZ`,$Tbl 1743e1051a39Sopenharmony_ci___ 1744e1051a39Sopenharmony_cisub Xupdate_512_AVX () { 1745e1051a39Sopenharmony_ci ( 1746e1051a39Sopenharmony_ci '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1747e1051a39Sopenharmony_ci '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1748e1051a39Sopenharmony_ci '&vpsrlq ($t2,$t0,$sigma0[0])', 1749e1051a39Sopenharmony_ci '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1750e1051a39Sopenharmony_ci '&vpsrlq ($t3,$t0,$sigma0[2])', 1751e1051a39Sopenharmony_ci '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1752e1051a39Sopenharmony_ci '&vpxor ($t0,$t3,$t2)', 1753e1051a39Sopenharmony_ci '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1754e1051a39Sopenharmony_ci '&vpxor ($t0,$t0,$t1)', 
1755e1051a39Sopenharmony_ci '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1756e1051a39Sopenharmony_ci '&vpxor ($t0,$t0,$t2)', 1757e1051a39Sopenharmony_ci '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1758e1051a39Sopenharmony_ci '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1759e1051a39Sopenharmony_ci '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1760e1051a39Sopenharmony_ci '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1761e1051a39Sopenharmony_ci '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1762e1051a39Sopenharmony_ci '&vpxor ($t3,$t3,$t2)', 1763e1051a39Sopenharmony_ci '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1764e1051a39Sopenharmony_ci '&vpxor ($t3,$t3,$t1)', 1765e1051a39Sopenharmony_ci '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1766e1051a39Sopenharmony_ci '&vpxor ($t3,$t3,$t2)', 1767e1051a39Sopenharmony_ci '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1768e1051a39Sopenharmony_ci '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1769e1051a39Sopenharmony_ci ); 1770e1051a39Sopenharmony_ci} 1771e1051a39Sopenharmony_ci 1772e1051a39Sopenharmony_cisub AVX_512_00_47 () { 1773e1051a39Sopenharmony_cimy $j = shift; 1774e1051a39Sopenharmony_cimy $body = shift; 1775e1051a39Sopenharmony_cimy @X = @_; 1776e1051a39Sopenharmony_cimy @insns = (&$body,&$body); # 52 instructions 1777e1051a39Sopenharmony_ci 1778e1051a39Sopenharmony_ci foreach (Xupdate_512_AVX()) { # 23 instructions 1779e1051a39Sopenharmony_ci eval; 1780e1051a39Sopenharmony_ci eval(shift(@insns)); 1781e1051a39Sopenharmony_ci eval(shift(@insns)); 1782e1051a39Sopenharmony_ci } 1783e1051a39Sopenharmony_ci &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1784e1051a39Sopenharmony_ci foreach (@insns) { eval; } # remaining instructions 1785e1051a39Sopenharmony_ci &vmovdqa (16*$j."(%rsp)",$t2); 1786e1051a39Sopenharmony_ci} 1787e1051a39Sopenharmony_ci 1788e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<8; $j++) { 1789e1051a39Sopenharmony_ci &AVX_512_00_47($j,\&body_00_15,@X); 1790e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 
    }
	# loop back to .Lavx_00_47 while the probed table byte is non-zero
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

    # no increment in the loop header: $i is presumably advanced by the
    # eval'ed body strings (bodyx_00_15's strings do so) - TODO confirm
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
}
# Common tail of the AVX loop: add the working registers into the context,
# advance the input pointer by one block and loop while below the saved
# end pointer.
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_avx

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
# Win64 only: reload the vector registers the prologue stored in the frame
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
# the SHA512 flavour ($SZ>4) additionally uses %xmm10 and %xmm11
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
# restore the callee-saved general registers relative to the original
# %rsp (held in %rsi) and return
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;		# modulus applied to the X[i]+K[i] offset in bodyx_00_15
# NOTE(review): presumably enabled so that the `/` in bodyx_00_15's offset
# expression truncates; that only holds if the strings are eval'ed inside
# this lexical scope - confirm at the eval site.
use integer;

# bodyx_00_15()
#
# Returns one round as a list of strings for the caller to eval; each
# string emits instructions through the instruction thunks.  The first
# string also unpacks ($a..$h) from @ROT, the last one swaps $a2/$a3,
# rotates @ROT and increments $i.  Two pairs of neighbouring strings are
# joined with `.` and therefore count as a single list element each.
# $base in the first &add is defined outside this sub.
sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
	'&and ($a4,$e)',		# f&e
	'&rorx ($a0,$e,$Sigma1[2])',
	'&rorx ($a2,$e,$Sigma1[1])',

	'&lea ($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea ($h,"($h,$a4)")',
	'&andn ($a4,$e,$g)',		# ~e&g
	'&xor ($a0,$a2)',

	'&rorx ($a1,$e,$Sigma1[0])',
	'&lea ($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor ($a0,$a1)',		# Sigma1(e)
	'&mov ($a2,$a)',

	'&rorx ($a4,$a,$Sigma0[2])',
	'&lea ($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor ($a2,$b)',		# a^b, b^c in next round
	'&rorx ($a1,$a,$Sigma0[1])',

	'&rorx ($a0,$a,$Sigma0[0])',
	'&lea ($d,"($d,$h)")',		# d+=h
	'&and ($a3,$a2)',		# (b^c)&(a^b)
	'&xor ($a1,$a4)',

	'&xor ($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor ($a1,$a0)',		# Sigma0(a)
	'&lea ($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov ($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}

# Function entry for the AVX2 flavour: keep the caller's %rsp in %rax,
# push the six callee-saved general registers, build the aligned frame,
# turn the block count in %rdx into an end pointer and stash
# ctx/inp/end/%rsp in the frame.
$code.=<<___;
.type	${func}_avx2,\@function,3
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
# Win64 only: store the vector registers in the frame
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
1941e1051a39Sopenharmony_ci movaps %xmm7,16*$SZ+48(%rsp) 1942e1051a39Sopenharmony_ci movaps %xmm8,16*$SZ+64(%rsp) 1943e1051a39Sopenharmony_ci movaps %xmm9,16*$SZ+80(%rsp) 1944e1051a39Sopenharmony_ci___ 1945e1051a39Sopenharmony_ci$code.=<<___ if ($win64 && $SZ>4); 1946e1051a39Sopenharmony_ci movaps %xmm10,16*$SZ+96(%rsp) 1947e1051a39Sopenharmony_ci movaps %xmm11,16*$SZ+112(%rsp) 1948e1051a39Sopenharmony_ci___ 1949e1051a39Sopenharmony_ci$code.=<<___; 1950e1051a39Sopenharmony_ci.Lprologue_avx2: 1951e1051a39Sopenharmony_ci 1952e1051a39Sopenharmony_ci vzeroupper 1953e1051a39Sopenharmony_ci sub \$-16*$SZ,$inp # inp++, size optimization 1954e1051a39Sopenharmony_ci mov $SZ*0($ctx),$A 1955e1051a39Sopenharmony_ci mov $inp,%r12 # borrow $T1 1956e1051a39Sopenharmony_ci mov $SZ*1($ctx),$B 1957e1051a39Sopenharmony_ci cmp %rdx,$inp # $_end 1958e1051a39Sopenharmony_ci mov $SZ*2($ctx),$C 1959e1051a39Sopenharmony_ci cmove %rsp,%r12 # next block or random data 1960e1051a39Sopenharmony_ci mov $SZ*3($ctx),$D 1961e1051a39Sopenharmony_ci mov $SZ*4($ctx),$E 1962e1051a39Sopenharmony_ci mov $SZ*5($ctx),$F 1963e1051a39Sopenharmony_ci mov $SZ*6($ctx),$G 1964e1051a39Sopenharmony_ci mov $SZ*7($ctx),$H 1965e1051a39Sopenharmony_ci___ 1966e1051a39Sopenharmony_ci if ($SZ==4) { # SHA256 1967e1051a39Sopenharmony_ci my @X = map("%ymm$_",(0..3)); 1968e1051a39Sopenharmony_ci my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1969e1051a39Sopenharmony_ci 1970e1051a39Sopenharmony_ci$code.=<<___; 1971e1051a39Sopenharmony_ci vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1972e1051a39Sopenharmony_ci vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1973e1051a39Sopenharmony_ci jmp .Loop_avx2 1974e1051a39Sopenharmony_ci.align 16 1975e1051a39Sopenharmony_ci.Loop_avx2: 1976e1051a39Sopenharmony_ci vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1977e1051a39Sopenharmony_ci vmovdqu -16*$SZ+0($inp),%xmm0 1978e1051a39Sopenharmony_ci vmovdqu -16*$SZ+16($inp),%xmm1 1979e1051a39Sopenharmony_ci vmovdqu -16*$SZ+32($inp),%xmm2 
1980e1051a39Sopenharmony_ci vmovdqu -16*$SZ+48($inp),%xmm3 1981e1051a39Sopenharmony_ci #mov $inp,$_inp # offload $inp 1982e1051a39Sopenharmony_ci vinserti128 \$1,(%r12),@X[0],@X[0] 1983e1051a39Sopenharmony_ci vinserti128 \$1,16(%r12),@X[1],@X[1] 1984e1051a39Sopenharmony_ci vpshufb $t3,@X[0],@X[0] 1985e1051a39Sopenharmony_ci vinserti128 \$1,32(%r12),@X[2],@X[2] 1986e1051a39Sopenharmony_ci vpshufb $t3,@X[1],@X[1] 1987e1051a39Sopenharmony_ci vinserti128 \$1,48(%r12),@X[3],@X[3] 1988e1051a39Sopenharmony_ci 1989e1051a39Sopenharmony_ci lea $TABLE(%rip),$Tbl 1990e1051a39Sopenharmony_ci vpshufb $t3,@X[2],@X[2] 1991e1051a39Sopenharmony_ci vpaddd 0x00($Tbl),@X[0],$t0 1992e1051a39Sopenharmony_ci vpshufb $t3,@X[3],@X[3] 1993e1051a39Sopenharmony_ci vpaddd 0x20($Tbl),@X[1],$t1 1994e1051a39Sopenharmony_ci vpaddd 0x40($Tbl),@X[2],$t2 1995e1051a39Sopenharmony_ci vpaddd 0x60($Tbl),@X[3],$t3 1996e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 1997e1051a39Sopenharmony_ci xor $a1,$a1 1998e1051a39Sopenharmony_ci vmovdqa $t1,0x20(%rsp) 1999e1051a39Sopenharmony_ci___ 2000e1051a39Sopenharmony_ci$code.=<<___ if (!$win64); 2001e1051a39Sopenharmony_ci# temporarily use %rdi as frame pointer 2002e1051a39Sopenharmony_ci mov $_rsp,%rdi 2003e1051a39Sopenharmony_ci.cfi_def_cfa %rdi,8 2004e1051a39Sopenharmony_ci___ 2005e1051a39Sopenharmony_ci$code.=<<___; 2006e1051a39Sopenharmony_ci lea -$PUSH8(%rsp),%rsp 2007e1051a39Sopenharmony_ci___ 2008e1051a39Sopenharmony_ci$code.=<<___ if (!$win64); 2009e1051a39Sopenharmony_ci# the frame info is at $_rsp, but the stack is moving... 
2010e1051a39Sopenharmony_ci# so a second frame pointer is saved at -8(%rsp) 2011e1051a39Sopenharmony_ci# that is in the red zone 2012e1051a39Sopenharmony_ci mov %rdi,-8(%rsp) 2013e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp-8,deref,+8 2014e1051a39Sopenharmony_ci___ 2015e1051a39Sopenharmony_ci$code.=<<___; 2016e1051a39Sopenharmony_ci mov $B,$a3 2017e1051a39Sopenharmony_ci vmovdqa $t2,0x00(%rsp) 2018e1051a39Sopenharmony_ci xor $C,$a3 # magic 2019e1051a39Sopenharmony_ci vmovdqa $t3,0x20(%rsp) 2020e1051a39Sopenharmony_ci mov $F,$a4 2021e1051a39Sopenharmony_ci sub \$-16*2*$SZ,$Tbl # size optimization 2022e1051a39Sopenharmony_ci jmp .Lavx2_00_47 2023e1051a39Sopenharmony_ci 2024e1051a39Sopenharmony_ci.align 16 2025e1051a39Sopenharmony_ci.Lavx2_00_47: 2026e1051a39Sopenharmony_ci___ 2027e1051a39Sopenharmony_ci 2028e1051a39Sopenharmony_cisub AVX2_256_00_47 () { 2029e1051a39Sopenharmony_cimy $j = shift; 2030e1051a39Sopenharmony_cimy $body = shift; 2031e1051a39Sopenharmony_cimy @X = @_; 2032e1051a39Sopenharmony_cimy @insns = (&$body,&$body,&$body,&$body); # 96 instructions 2033e1051a39Sopenharmony_cimy $base = "+2*$PUSH8(%rsp)"; 2034e1051a39Sopenharmony_ci 2035e1051a39Sopenharmony_ci if (($j%2)==0) { 2036e1051a39Sopenharmony_ci &lea ("%rsp","-$PUSH8(%rsp)"); 2037e1051a39Sopenharmony_ci$code.=<<___ if (!$win64); 2038e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 2039e1051a39Sopenharmony_ci# copy secondary frame pointer to new location again at -8(%rsp) 2040e1051a39Sopenharmony_ci pushq $PUSH8-8(%rsp) 2041e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp,deref,+8 2042e1051a39Sopenharmony_ci lea 8(%rsp),%rsp 2043e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp-8,deref,+8 2044e1051a39Sopenharmony_ci___ 2045e1051a39Sopenharmony_ci } 2046e1051a39Sopenharmony_ci 2047e1051a39Sopenharmony_ci foreach (Xupdate_256_AVX()) { # 29 instructions 2048e1051a39Sopenharmony_ci eval; 2049e1051a39Sopenharmony_ci eval(shift(@insns)); 2050e1051a39Sopenharmony_ci 
eval(shift(@insns)); 2051e1051a39Sopenharmony_ci eval(shift(@insns)); 2052e1051a39Sopenharmony_ci } 2053e1051a39Sopenharmony_ci &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 2054e1051a39Sopenharmony_ci foreach (@insns) { eval; } # remaining instructions 2055e1051a39Sopenharmony_ci &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2056e1051a39Sopenharmony_ci} 2057e1051a39Sopenharmony_ci 2058e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<4; $j++) { 2059e1051a39Sopenharmony_ci &AVX2_256_00_47($j,\&bodyx_00_15,@X); 2060e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 2061e1051a39Sopenharmony_ci } 2062e1051a39Sopenharmony_ci &lea ($Tbl,16*2*$SZ."($Tbl)"); 2063e1051a39Sopenharmony_ci &cmpb (($SZ-1)."($Tbl)",0); 2064e1051a39Sopenharmony_ci &jne (".Lavx2_00_47"); 2065e1051a39Sopenharmony_ci 2066e1051a39Sopenharmony_ci for ($i=0; $i<16; ) { 2067e1051a39Sopenharmony_ci my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2068e1051a39Sopenharmony_ci foreach(bodyx_00_15()) { eval; } 2069e1051a39Sopenharmony_ci } 2070e1051a39Sopenharmony_ci } else { # SHA512 2071e1051a39Sopenharmony_ci my @X = map("%ymm$_",(0..7)); 2072e1051a39Sopenharmony_ci my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 2073e1051a39Sopenharmony_ci 2074e1051a39Sopenharmony_ci$code.=<<___; 2075e1051a39Sopenharmony_ci jmp .Loop_avx2 2076e1051a39Sopenharmony_ci.align 16 2077e1051a39Sopenharmony_ci.Loop_avx2: 2078e1051a39Sopenharmony_ci vmovdqu -16*$SZ($inp),%xmm0 2079e1051a39Sopenharmony_ci vmovdqu -16*$SZ+16($inp),%xmm1 2080e1051a39Sopenharmony_ci vmovdqu -16*$SZ+32($inp),%xmm2 2081e1051a39Sopenharmony_ci lea $TABLE+0x80(%rip),$Tbl # size optimization 2082e1051a39Sopenharmony_ci vmovdqu -16*$SZ+48($inp),%xmm3 2083e1051a39Sopenharmony_ci vmovdqu -16*$SZ+64($inp),%xmm4 2084e1051a39Sopenharmony_ci vmovdqu -16*$SZ+80($inp),%xmm5 2085e1051a39Sopenharmony_ci vmovdqu -16*$SZ+96($inp),%xmm6 2086e1051a39Sopenharmony_ci vmovdqu -16*$SZ+112($inp),%xmm7 2087e1051a39Sopenharmony_ci #mov $inp,$_inp # offload $inp 2088e1051a39Sopenharmony_ci vmovdqa 
`$SZ*2*$rounds-0x80`($Tbl),$t2 2089e1051a39Sopenharmony_ci vinserti128 \$1,(%r12),@X[0],@X[0] 2090e1051a39Sopenharmony_ci vinserti128 \$1,16(%r12),@X[1],@X[1] 2091e1051a39Sopenharmony_ci vpshufb $t2,@X[0],@X[0] 2092e1051a39Sopenharmony_ci vinserti128 \$1,32(%r12),@X[2],@X[2] 2093e1051a39Sopenharmony_ci vpshufb $t2,@X[1],@X[1] 2094e1051a39Sopenharmony_ci vinserti128 \$1,48(%r12),@X[3],@X[3] 2095e1051a39Sopenharmony_ci vpshufb $t2,@X[2],@X[2] 2096e1051a39Sopenharmony_ci vinserti128 \$1,64(%r12),@X[4],@X[4] 2097e1051a39Sopenharmony_ci vpshufb $t2,@X[3],@X[3] 2098e1051a39Sopenharmony_ci vinserti128 \$1,80(%r12),@X[5],@X[5] 2099e1051a39Sopenharmony_ci vpshufb $t2,@X[4],@X[4] 2100e1051a39Sopenharmony_ci vinserti128 \$1,96(%r12),@X[6],@X[6] 2101e1051a39Sopenharmony_ci vpshufb $t2,@X[5],@X[5] 2102e1051a39Sopenharmony_ci vinserti128 \$1,112(%r12),@X[7],@X[7] 2103e1051a39Sopenharmony_ci 2104e1051a39Sopenharmony_ci vpaddq -0x80($Tbl),@X[0],$t0 2105e1051a39Sopenharmony_ci vpshufb $t2,@X[6],@X[6] 2106e1051a39Sopenharmony_ci vpaddq -0x60($Tbl),@X[1],$t1 2107e1051a39Sopenharmony_ci vpshufb $t2,@X[7],@X[7] 2108e1051a39Sopenharmony_ci vpaddq -0x40($Tbl),@X[2],$t2 2109e1051a39Sopenharmony_ci vpaddq -0x20($Tbl),@X[3],$t3 2110e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 2111e1051a39Sopenharmony_ci vpaddq 0x00($Tbl),@X[4],$t0 2112e1051a39Sopenharmony_ci vmovdqa $t1,0x20(%rsp) 2113e1051a39Sopenharmony_ci vpaddq 0x20($Tbl),@X[5],$t1 2114e1051a39Sopenharmony_ci vmovdqa $t2,0x40(%rsp) 2115e1051a39Sopenharmony_ci vpaddq 0x40($Tbl),@X[6],$t2 2116e1051a39Sopenharmony_ci vmovdqa $t3,0x60(%rsp) 2117e1051a39Sopenharmony_ci___ 2118e1051a39Sopenharmony_ci$code.=<<___ if (!$win64); 2119e1051a39Sopenharmony_ci# temporarily use %rdi as frame pointer 2120e1051a39Sopenharmony_ci mov $_rsp,%rdi 2121e1051a39Sopenharmony_ci.cfi_def_cfa %rdi,8 2122e1051a39Sopenharmony_ci___ 2123e1051a39Sopenharmony_ci$code.=<<___; 2124e1051a39Sopenharmony_ci lea -$PUSH8(%rsp),%rsp 2125e1051a39Sopenharmony_ci___ 
2126e1051a39Sopenharmony_ci$code.=<<___ if (!$win64); 2127e1051a39Sopenharmony_ci# the frame info is at $_rsp, but the stack is moving... 2128e1051a39Sopenharmony_ci# so a second frame pointer is saved at -8(%rsp) 2129e1051a39Sopenharmony_ci# that is in the red zone 2130e1051a39Sopenharmony_ci mov %rdi,-8(%rsp) 2131e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp-8,deref,+8 2132e1051a39Sopenharmony_ci___ 2133e1051a39Sopenharmony_ci$code.=<<___; 2134e1051a39Sopenharmony_ci vpaddq 0x60($Tbl),@X[7],$t3 2135e1051a39Sopenharmony_ci vmovdqa $t0,0x00(%rsp) 2136e1051a39Sopenharmony_ci xor $a1,$a1 2137e1051a39Sopenharmony_ci vmovdqa $t1,0x20(%rsp) 2138e1051a39Sopenharmony_ci mov $B,$a3 2139e1051a39Sopenharmony_ci vmovdqa $t2,0x40(%rsp) 2140e1051a39Sopenharmony_ci xor $C,$a3 # magic 2141e1051a39Sopenharmony_ci vmovdqa $t3,0x60(%rsp) 2142e1051a39Sopenharmony_ci mov $F,$a4 2143e1051a39Sopenharmony_ci add \$16*2*$SZ,$Tbl 2144e1051a39Sopenharmony_ci jmp .Lavx2_00_47 2145e1051a39Sopenharmony_ci 2146e1051a39Sopenharmony_ci.align 16 2147e1051a39Sopenharmony_ci.Lavx2_00_47: 2148e1051a39Sopenharmony_ci___ 2149e1051a39Sopenharmony_ci 2150e1051a39Sopenharmony_cisub AVX2_512_00_47 () { 2151e1051a39Sopenharmony_cimy $j = shift; 2152e1051a39Sopenharmony_cimy $body = shift; 2153e1051a39Sopenharmony_cimy @X = @_; 2154e1051a39Sopenharmony_cimy @insns = (&$body,&$body); # 48 instructions 2155e1051a39Sopenharmony_cimy $base = "+2*$PUSH8(%rsp)"; 2156e1051a39Sopenharmony_ci 2157e1051a39Sopenharmony_ci if (($j%4)==0) { 2158e1051a39Sopenharmony_ci &lea ("%rsp","-$PUSH8(%rsp)"); 2159e1051a39Sopenharmony_ci$code.=<<___ if (!$win64); 2160e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 2161e1051a39Sopenharmony_ci# copy secondary frame pointer to new location again at -8(%rsp) 2162e1051a39Sopenharmony_ci pushq $PUSH8-8(%rsp) 2163e1051a39Sopenharmony_ci.cfi_cfa_expression %rsp,deref,+8 2164e1051a39Sopenharmony_ci lea 8(%rsp),%rsp 2165e1051a39Sopenharmony_ci.cfi_cfa_expression 
%rsp-8,deref,+8 2166e1051a39Sopenharmony_ci___ 2167e1051a39Sopenharmony_ci } 2168e1051a39Sopenharmony_ci 2169e1051a39Sopenharmony_ci foreach (Xupdate_512_AVX()) { # 23 instructions 2170e1051a39Sopenharmony_ci eval; 2171e1051a39Sopenharmony_ci if ($_ !~ /\;$/) { 2172e1051a39Sopenharmony_ci eval(shift(@insns)); 2173e1051a39Sopenharmony_ci eval(shift(@insns)); 2174e1051a39Sopenharmony_ci eval(shift(@insns)); 2175e1051a39Sopenharmony_ci } 2176e1051a39Sopenharmony_ci } 2177e1051a39Sopenharmony_ci &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 2178e1051a39Sopenharmony_ci foreach (@insns) { eval; } # remaining instructions 2179e1051a39Sopenharmony_ci &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2180e1051a39Sopenharmony_ci} 2181e1051a39Sopenharmony_ci 2182e1051a39Sopenharmony_ci for ($i=0,$j=0; $j<8; $j++) { 2183e1051a39Sopenharmony_ci &AVX2_512_00_47($j,\&bodyx_00_15,@X); 2184e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 2185e1051a39Sopenharmony_ci } 2186e1051a39Sopenharmony_ci &lea ($Tbl,16*2*$SZ."($Tbl)"); 2187e1051a39Sopenharmony_ci &cmpb (($SZ-1-0x80)."($Tbl)",0); 2188e1051a39Sopenharmony_ci &jne (".Lavx2_00_47"); 2189e1051a39Sopenharmony_ci 2190e1051a39Sopenharmony_ci for ($i=0; $i<16; ) { 2191e1051a39Sopenharmony_ci my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2192e1051a39Sopenharmony_ci foreach(bodyx_00_15()) { eval; } 2193e1051a39Sopenharmony_ci } 2194e1051a39Sopenharmony_ci} 2195e1051a39Sopenharmony_ci$code.=<<___; 2196e1051a39Sopenharmony_ci mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2197e1051a39Sopenharmony_ci add $a1,$A 2198e1051a39Sopenharmony_ci #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2199e1051a39Sopenharmony_ci lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 2200e1051a39Sopenharmony_ci 2201e1051a39Sopenharmony_ci add $SZ*0($ctx),$A 2202e1051a39Sopenharmony_ci add $SZ*1($ctx),$B 2203e1051a39Sopenharmony_ci add $SZ*2($ctx),$C 2204e1051a39Sopenharmony_ci add $SZ*3($ctx),$D 2205e1051a39Sopenharmony_ci add $SZ*4($ctx),$E 2206e1051a39Sopenharmony_ci add $SZ*5($ctx),$F 
2207e1051a39Sopenharmony_ci add $SZ*6($ctx),$G 2208e1051a39Sopenharmony_ci add $SZ*7($ctx),$H 2209e1051a39Sopenharmony_ci 2210e1051a39Sopenharmony_ci mov $A,$SZ*0($ctx) 2211e1051a39Sopenharmony_ci mov $B,$SZ*1($ctx) 2212e1051a39Sopenharmony_ci mov $C,$SZ*2($ctx) 2213e1051a39Sopenharmony_ci mov $D,$SZ*3($ctx) 2214e1051a39Sopenharmony_ci mov $E,$SZ*4($ctx) 2215e1051a39Sopenharmony_ci mov $F,$SZ*5($ctx) 2216e1051a39Sopenharmony_ci mov $G,$SZ*6($ctx) 2217e1051a39Sopenharmony_ci mov $H,$SZ*7($ctx) 2218e1051a39Sopenharmony_ci 2219e1051a39Sopenharmony_ci cmp `$PUSH8+2*8`($Tbl),$inp # $_end 2220e1051a39Sopenharmony_ci je .Ldone_avx2 2221e1051a39Sopenharmony_ci 2222e1051a39Sopenharmony_ci xor $a1,$a1 2223e1051a39Sopenharmony_ci mov $B,$a3 2224e1051a39Sopenharmony_ci xor $C,$a3 # magic 2225e1051a39Sopenharmony_ci mov $F,$a4 2226e1051a39Sopenharmony_ci jmp .Lower_avx2 2227e1051a39Sopenharmony_ci.align 16 2228e1051a39Sopenharmony_ci.Lower_avx2: 2229e1051a39Sopenharmony_ci___ 2230e1051a39Sopenharmony_ci for ($i=0; $i<8; ) { 2231e1051a39Sopenharmony_ci my $base="+16($Tbl)"; 2232e1051a39Sopenharmony_ci foreach(bodyx_00_15()) { eval; } 2233e1051a39Sopenharmony_ci } 2234e1051a39Sopenharmony_ci$code.=<<___; 2235e1051a39Sopenharmony_ci lea -$PUSH8($Tbl),$Tbl 2236e1051a39Sopenharmony_ci cmp %rsp,$Tbl 2237e1051a39Sopenharmony_ci jae .Lower_avx2 2238e1051a39Sopenharmony_ci 2239e1051a39Sopenharmony_ci mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2240e1051a39Sopenharmony_ci add $a1,$A 2241e1051a39Sopenharmony_ci #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2242e1051a39Sopenharmony_ci lea `2*$SZ*($rounds-8)`(%rsp),%rsp 2243e1051a39Sopenharmony_ci# restore frame pointer to original location at $_rsp 2244e1051a39Sopenharmony_ci.cfi_cfa_expression $_rsp,deref,+8 2245e1051a39Sopenharmony_ci 2246e1051a39Sopenharmony_ci add $SZ*0($ctx),$A 2247e1051a39Sopenharmony_ci add $SZ*1($ctx),$B 2248e1051a39Sopenharmony_ci add $SZ*2($ctx),$C 2249e1051a39Sopenharmony_ci add $SZ*3($ctx),$D 2250e1051a39Sopenharmony_ci 
add $SZ*4($ctx),$E 2251e1051a39Sopenharmony_ci add $SZ*5($ctx),$F 2252e1051a39Sopenharmony_ci lea `2*16*$SZ`($inp),$inp # inp+=2 2253e1051a39Sopenharmony_ci add $SZ*6($ctx),$G 2254e1051a39Sopenharmony_ci mov $inp,%r12 2255e1051a39Sopenharmony_ci add $SZ*7($ctx),$H 2256e1051a39Sopenharmony_ci cmp $_end,$inp 2257e1051a39Sopenharmony_ci 2258e1051a39Sopenharmony_ci mov $A,$SZ*0($ctx) 2259e1051a39Sopenharmony_ci cmove %rsp,%r12 # next block or stale data 2260e1051a39Sopenharmony_ci mov $B,$SZ*1($ctx) 2261e1051a39Sopenharmony_ci mov $C,$SZ*2($ctx) 2262e1051a39Sopenharmony_ci mov $D,$SZ*3($ctx) 2263e1051a39Sopenharmony_ci mov $E,$SZ*4($ctx) 2264e1051a39Sopenharmony_ci mov $F,$SZ*5($ctx) 2265e1051a39Sopenharmony_ci mov $G,$SZ*6($ctx) 2266e1051a39Sopenharmony_ci mov $H,$SZ*7($ctx) 2267e1051a39Sopenharmony_ci 2268e1051a39Sopenharmony_ci jbe .Loop_avx2 2269e1051a39Sopenharmony_ci lea (%rsp),$Tbl 2270e1051a39Sopenharmony_ci# temporarily use $Tbl as index to $_rsp 2271e1051a39Sopenharmony_ci# this avoids the need to save a secondary frame pointer at -8(%rsp) 2272e1051a39Sopenharmony_ci.cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8 2273e1051a39Sopenharmony_ci 2274e1051a39Sopenharmony_ci.Ldone_avx2: 2275e1051a39Sopenharmony_ci mov `16*$SZ+3*8`($Tbl),%rsi 2276e1051a39Sopenharmony_ci.cfi_def_cfa %rsi,8 2277e1051a39Sopenharmony_ci vzeroupper 2278e1051a39Sopenharmony_ci___ 2279e1051a39Sopenharmony_ci$code.=<<___ if ($win64); 2280e1051a39Sopenharmony_ci movaps 16*$SZ+32($Tbl),%xmm6 2281e1051a39Sopenharmony_ci movaps 16*$SZ+48($Tbl),%xmm7 2282e1051a39Sopenharmony_ci movaps 16*$SZ+64($Tbl),%xmm8 2283e1051a39Sopenharmony_ci movaps 16*$SZ+80($Tbl),%xmm9 2284e1051a39Sopenharmony_ci___ 2285e1051a39Sopenharmony_ci$code.=<<___ if ($win64 && $SZ>4); 2286e1051a39Sopenharmony_ci movaps 16*$SZ+96($Tbl),%xmm10 2287e1051a39Sopenharmony_ci movaps 16*$SZ+112($Tbl),%xmm11 2288e1051a39Sopenharmony_ci___ 2289e1051a39Sopenharmony_ci$code.=<<___; 2290e1051a39Sopenharmony_ci mov -48(%rsi),%r15 
2291e1051a39Sopenharmony_ci.cfi_restore %r15 2292e1051a39Sopenharmony_ci mov -40(%rsi),%r14 2293e1051a39Sopenharmony_ci.cfi_restore %r14 2294e1051a39Sopenharmony_ci mov -32(%rsi),%r13 2295e1051a39Sopenharmony_ci.cfi_restore %r13 2296e1051a39Sopenharmony_ci mov -24(%rsi),%r12 2297e1051a39Sopenharmony_ci.cfi_restore %r12 2298e1051a39Sopenharmony_ci mov -16(%rsi),%rbp 2299e1051a39Sopenharmony_ci.cfi_restore %rbp 2300e1051a39Sopenharmony_ci mov -8(%rsi),%rbx 2301e1051a39Sopenharmony_ci.cfi_restore %rbx 2302e1051a39Sopenharmony_ci lea (%rsi),%rsp 2303e1051a39Sopenharmony_ci.cfi_def_cfa_register %rsp 2304e1051a39Sopenharmony_ci.Lepilogue_avx2: 2305e1051a39Sopenharmony_ci ret 2306e1051a39Sopenharmony_ci.cfi_endproc 2307e1051a39Sopenharmony_ci.size ${func}_avx2,.-${func}_avx2 2308e1051a39Sopenharmony_ci___ 2309e1051a39Sopenharmony_ci}} 2310e1051a39Sopenharmony_ci}}}}} 2311e1051a39Sopenharmony_ci 2312e1051a39Sopenharmony_ci# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2313e1051a39Sopenharmony_ci# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2314e1051a39Sopenharmony_ciif ($win64) { 2315e1051a39Sopenharmony_ci$rec="%rcx"; 2316e1051a39Sopenharmony_ci$frame="%rdx"; 2317e1051a39Sopenharmony_ci$context="%r8"; 2318e1051a39Sopenharmony_ci$disp="%r9"; 2319e1051a39Sopenharmony_ci 2320e1051a39Sopenharmony_ci$code.=<<___; 2321e1051a39Sopenharmony_ci.extern __imp_RtlVirtualUnwind 2322e1051a39Sopenharmony_ci.type se_handler,\@abi-omnipotent 2323e1051a39Sopenharmony_ci.align 16 2324e1051a39Sopenharmony_cise_handler: 2325e1051a39Sopenharmony_ci push %rsi 2326e1051a39Sopenharmony_ci push %rdi 2327e1051a39Sopenharmony_ci push %rbx 2328e1051a39Sopenharmony_ci push %rbp 2329e1051a39Sopenharmony_ci push %r12 2330e1051a39Sopenharmony_ci push %r13 2331e1051a39Sopenharmony_ci push %r14 2332e1051a39Sopenharmony_ci push %r15 2333e1051a39Sopenharmony_ci pushfq 2334e1051a39Sopenharmony_ci sub \$64,%rsp 2335e1051a39Sopenharmony_ci 2336e1051a39Sopenharmony_ci mov 120($context),%rax 
# pull context->Rax 2337e1051a39Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 2338e1051a39Sopenharmony_ci 2339e1051a39Sopenharmony_ci mov 8($disp),%rsi # disp->ImageBase 2340e1051a39Sopenharmony_ci mov 56($disp),%r11 # disp->HanderlData 2341e1051a39Sopenharmony_ci 2342e1051a39Sopenharmony_ci mov 0(%r11),%r10d # HandlerData[0] 2343e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # prologue label 2344e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<prologue label 2345e1051a39Sopenharmony_ci jb .Lin_prologue 2346e1051a39Sopenharmony_ci 2347e1051a39Sopenharmony_ci mov 152($context),%rax # pull context->Rsp 2348e1051a39Sopenharmony_ci 2349e1051a39Sopenharmony_ci mov 4(%r11),%r10d # HandlerData[1] 2350e1051a39Sopenharmony_ci lea (%rsi,%r10),%r10 # epilogue label 2351e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip>=epilogue label 2352e1051a39Sopenharmony_ci jae .Lin_prologue 2353e1051a39Sopenharmony_ci___ 2354e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 2355e1051a39Sopenharmony_ci lea .Lavx2_shortcut(%rip),%r10 2356e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<avx2_shortcut 2357e1051a39Sopenharmony_ci jb .Lnot_in_avx2 2358e1051a39Sopenharmony_ci 2359e1051a39Sopenharmony_ci and \$-256*$SZ,%rax 2360e1051a39Sopenharmony_ci add \$`2*$SZ*($rounds-8)`,%rax 2361e1051a39Sopenharmony_ci.Lnot_in_avx2: 2362e1051a39Sopenharmony_ci___ 2363e1051a39Sopenharmony_ci$code.=<<___; 2364e1051a39Sopenharmony_ci mov %rax,%rsi # put aside Rsp 2365e1051a39Sopenharmony_ci mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 2366e1051a39Sopenharmony_ci 2367e1051a39Sopenharmony_ci mov -8(%rax),%rbx 2368e1051a39Sopenharmony_ci mov -16(%rax),%rbp 2369e1051a39Sopenharmony_ci mov -24(%rax),%r12 2370e1051a39Sopenharmony_ci mov -32(%rax),%r13 2371e1051a39Sopenharmony_ci mov -40(%rax),%r14 2372e1051a39Sopenharmony_ci mov -48(%rax),%r15 2373e1051a39Sopenharmony_ci mov %rbx,144($context) # restore context->Rbx 2374e1051a39Sopenharmony_ci mov %rbp,160($context) # restore context->Rbp 
2375e1051a39Sopenharmony_ci mov %r12,216($context) # restore context->R12 2376e1051a39Sopenharmony_ci mov %r13,224($context) # restore context->R13 2377e1051a39Sopenharmony_ci mov %r14,232($context) # restore context->R14 2378e1051a39Sopenharmony_ci mov %r15,240($context) # restore context->R15 2379e1051a39Sopenharmony_ci 2380e1051a39Sopenharmony_ci lea .Lepilogue(%rip),%r10 2381e1051a39Sopenharmony_ci cmp %r10,%rbx 2382e1051a39Sopenharmony_ci jb .Lin_prologue # non-AVX code 2383e1051a39Sopenharmony_ci 2384e1051a39Sopenharmony_ci lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 2385e1051a39Sopenharmony_ci lea 512($context),%rdi # &context.Xmm6 2386e1051a39Sopenharmony_ci mov \$`$SZ==4?8:12`,%ecx 2387e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 2388e1051a39Sopenharmony_ci 2389e1051a39Sopenharmony_ci.Lin_prologue: 2390e1051a39Sopenharmony_ci mov 8(%rax),%rdi 2391e1051a39Sopenharmony_ci mov 16(%rax),%rsi 2392e1051a39Sopenharmony_ci mov %rax,152($context) # restore context->Rsp 2393e1051a39Sopenharmony_ci mov %rsi,168($context) # restore context->Rsi 2394e1051a39Sopenharmony_ci mov %rdi,176($context) # restore context->Rdi 2395e1051a39Sopenharmony_ci 2396e1051a39Sopenharmony_ci mov 40($disp),%rdi # disp->ContextRecord 2397e1051a39Sopenharmony_ci mov $context,%rsi # context 2398e1051a39Sopenharmony_ci mov \$154,%ecx # sizeof(CONTEXT) 2399e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 2400e1051a39Sopenharmony_ci 2401e1051a39Sopenharmony_ci mov $disp,%rsi 2402e1051a39Sopenharmony_ci xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2403e1051a39Sopenharmony_ci mov 8(%rsi),%rdx # arg2, disp->ImageBase 2404e1051a39Sopenharmony_ci mov 0(%rsi),%r8 # arg3, disp->ControlPc 2405e1051a39Sopenharmony_ci mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2406e1051a39Sopenharmony_ci mov 40(%rsi),%r10 # disp->ContextRecord 2407e1051a39Sopenharmony_ci lea 56(%rsi),%r11 # &disp->HandlerData 2408e1051a39Sopenharmony_ci lea 24(%rsi),%r12 # &disp->EstablisherFrame 
2409e1051a39Sopenharmony_ci mov %r10,32(%rsp) # arg5 2410e1051a39Sopenharmony_ci mov %r11,40(%rsp) # arg6 2411e1051a39Sopenharmony_ci mov %r12,48(%rsp) # arg7 2412e1051a39Sopenharmony_ci mov %rcx,56(%rsp) # arg8, (NULL) 2413e1051a39Sopenharmony_ci call *__imp_RtlVirtualUnwind(%rip) 2414e1051a39Sopenharmony_ci 2415e1051a39Sopenharmony_ci mov \$1,%eax # ExceptionContinueSearch 2416e1051a39Sopenharmony_ci add \$64,%rsp 2417e1051a39Sopenharmony_ci popfq 2418e1051a39Sopenharmony_ci pop %r15 2419e1051a39Sopenharmony_ci pop %r14 2420e1051a39Sopenharmony_ci pop %r13 2421e1051a39Sopenharmony_ci pop %r12 2422e1051a39Sopenharmony_ci pop %rbp 2423e1051a39Sopenharmony_ci pop %rbx 2424e1051a39Sopenharmony_ci pop %rdi 2425e1051a39Sopenharmony_ci pop %rsi 2426e1051a39Sopenharmony_ci ret 2427e1051a39Sopenharmony_ci.size se_handler,.-se_handler 2428e1051a39Sopenharmony_ci___ 2429e1051a39Sopenharmony_ci 2430e1051a39Sopenharmony_ci$code.=<<___ if ($SZ==4 && $shaext); 2431e1051a39Sopenharmony_ci.type shaext_handler,\@abi-omnipotent 2432e1051a39Sopenharmony_ci.align 16 2433e1051a39Sopenharmony_cishaext_handler: 2434e1051a39Sopenharmony_ci push %rsi 2435e1051a39Sopenharmony_ci push %rdi 2436e1051a39Sopenharmony_ci push %rbx 2437e1051a39Sopenharmony_ci push %rbp 2438e1051a39Sopenharmony_ci push %r12 2439e1051a39Sopenharmony_ci push %r13 2440e1051a39Sopenharmony_ci push %r14 2441e1051a39Sopenharmony_ci push %r15 2442e1051a39Sopenharmony_ci pushfq 2443e1051a39Sopenharmony_ci sub \$64,%rsp 2444e1051a39Sopenharmony_ci 2445e1051a39Sopenharmony_ci mov 120($context),%rax # pull context->Rax 2446e1051a39Sopenharmony_ci mov 248($context),%rbx # pull context->Rip 2447e1051a39Sopenharmony_ci 2448e1051a39Sopenharmony_ci lea .Lprologue_shaext(%rip),%r10 2449e1051a39Sopenharmony_ci cmp %r10,%rbx # context->Rip<.Lprologue 2450e1051a39Sopenharmony_ci jb .Lin_prologue 2451e1051a39Sopenharmony_ci 2452e1051a39Sopenharmony_ci lea .Lepilogue_shaext(%rip),%r10 2453e1051a39Sopenharmony_ci cmp %r10,%rbx # 
context->Rip>=.Lepilogue 2454e1051a39Sopenharmony_ci jae .Lin_prologue 2455e1051a39Sopenharmony_ci 2456e1051a39Sopenharmony_ci lea -8-5*16(%rax),%rsi 2457e1051a39Sopenharmony_ci lea 512($context),%rdi # &context.Xmm6 2458e1051a39Sopenharmony_ci mov \$10,%ecx 2459e1051a39Sopenharmony_ci .long 0xa548f3fc # cld; rep movsq 2460e1051a39Sopenharmony_ci 2461e1051a39Sopenharmony_ci jmp .Lin_prologue 2462e1051a39Sopenharmony_ci.size shaext_handler,.-shaext_handler 2463e1051a39Sopenharmony_ci___ 2464e1051a39Sopenharmony_ci 2465e1051a39Sopenharmony_ci$code.=<<___; 2466e1051a39Sopenharmony_ci.section .pdata 2467e1051a39Sopenharmony_ci.align 4 2468e1051a39Sopenharmony_ci .rva .LSEH_begin_$func 2469e1051a39Sopenharmony_ci .rva .LSEH_end_$func 2470e1051a39Sopenharmony_ci .rva .LSEH_info_$func 2471e1051a39Sopenharmony_ci___ 2472e1051a39Sopenharmony_ci$code.=<<___ if ($SZ==4 && $shaext); 2473e1051a39Sopenharmony_ci .rva .LSEH_begin_${func}_shaext 2474e1051a39Sopenharmony_ci .rva .LSEH_end_${func}_shaext 2475e1051a39Sopenharmony_ci .rva .LSEH_info_${func}_shaext 2476e1051a39Sopenharmony_ci___ 2477e1051a39Sopenharmony_ci$code.=<<___ if ($SZ==4); 2478e1051a39Sopenharmony_ci .rva .LSEH_begin_${func}_ssse3 2479e1051a39Sopenharmony_ci .rva .LSEH_end_${func}_ssse3 2480e1051a39Sopenharmony_ci .rva .LSEH_info_${func}_ssse3 2481e1051a39Sopenharmony_ci___ 2482e1051a39Sopenharmony_ci$code.=<<___ if ($avx && $SZ==8); 2483e1051a39Sopenharmony_ci .rva .LSEH_begin_${func}_xop 2484e1051a39Sopenharmony_ci .rva .LSEH_end_${func}_xop 2485e1051a39Sopenharmony_ci .rva .LSEH_info_${func}_xop 2486e1051a39Sopenharmony_ci___ 2487e1051a39Sopenharmony_ci$code.=<<___ if ($avx); 2488e1051a39Sopenharmony_ci .rva .LSEH_begin_${func}_avx 2489e1051a39Sopenharmony_ci .rva .LSEH_end_${func}_avx 2490e1051a39Sopenharmony_ci .rva .LSEH_info_${func}_avx 2491e1051a39Sopenharmony_ci___ 2492e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 2493e1051a39Sopenharmony_ci .rva .LSEH_begin_${func}_avx2 2494e1051a39Sopenharmony_ci 
.rva .LSEH_end_${func}_avx2 2495e1051a39Sopenharmony_ci .rva .LSEH_info_${func}_avx2 2496e1051a39Sopenharmony_ci___ 2497e1051a39Sopenharmony_ci$code.=<<___; 2498e1051a39Sopenharmony_ci.section .xdata 2499e1051a39Sopenharmony_ci.align 8 2500e1051a39Sopenharmony_ci.LSEH_info_$func: 2501e1051a39Sopenharmony_ci .byte 9,0,0,0 2502e1051a39Sopenharmony_ci .rva se_handler 2503e1051a39Sopenharmony_ci .rva .Lprologue,.Lepilogue # HandlerData[] 2504e1051a39Sopenharmony_ci___ 2505e1051a39Sopenharmony_ci$code.=<<___ if ($SZ==4 && $shaext); 2506e1051a39Sopenharmony_ci.LSEH_info_${func}_shaext: 2507e1051a39Sopenharmony_ci .byte 9,0,0,0 2508e1051a39Sopenharmony_ci .rva shaext_handler 2509e1051a39Sopenharmony_ci___ 2510e1051a39Sopenharmony_ci$code.=<<___ if ($SZ==4); 2511e1051a39Sopenharmony_ci.LSEH_info_${func}_ssse3: 2512e1051a39Sopenharmony_ci .byte 9,0,0,0 2513e1051a39Sopenharmony_ci .rva se_handler 2514e1051a39Sopenharmony_ci .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] 2515e1051a39Sopenharmony_ci___ 2516e1051a39Sopenharmony_ci$code.=<<___ if ($avx && $SZ==8); 2517e1051a39Sopenharmony_ci.LSEH_info_${func}_xop: 2518e1051a39Sopenharmony_ci .byte 9,0,0,0 2519e1051a39Sopenharmony_ci .rva se_handler 2520e1051a39Sopenharmony_ci .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 2521e1051a39Sopenharmony_ci___ 2522e1051a39Sopenharmony_ci$code.=<<___ if ($avx); 2523e1051a39Sopenharmony_ci.LSEH_info_${func}_avx: 2524e1051a39Sopenharmony_ci .byte 9,0,0,0 2525e1051a39Sopenharmony_ci .rva se_handler 2526e1051a39Sopenharmony_ci .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 2527e1051a39Sopenharmony_ci___ 2528e1051a39Sopenharmony_ci$code.=<<___ if ($avx>1); 2529e1051a39Sopenharmony_ci.LSEH_info_${func}_avx2: 2530e1051a39Sopenharmony_ci .byte 9,0,0,0 2531e1051a39Sopenharmony_ci .rva se_handler 2532e1051a39Sopenharmony_ci .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 2533e1051a39Sopenharmony_ci___ 2534e1051a39Sopenharmony_ci} 2535e1051a39Sopenharmony_ci 
# sha256op38 - hand-encode an SHA-256 extension instruction as raw bytes so
# that the generated file can be assembled by tools that do not know the
# mnemonics.
#
# Arguments:
#   $instr    - the mnemonic captured by the output loop below (sha256...)
#   $operands - the remainder of the source line (AT&T operand list, possibly
#               followed by a trailing comment)
#
# Returns:
#   ".byte\t0x0f,0x38,<opcode>,<ModR/M>" (printed as decimal numbers) when
#   $instr is one of the three known mnemonics and both operands are
#   registers %xmm0-%xmm7; otherwise "$instr\t$operands", leaving the
#   instruction for the assembler to encode.
sub sha256op38 {
    my ($instr, $operands) = @_;
    my %opcodelet = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd,
    );

    # Only the register-to-register form is translated: each register number
    # has to fit the 3-bit ModR/M fields built below.  The (?!\d) guard keeps
    # a two-digit register such as %xmm12 from being read as %xmm1 and
    # silently encoded for the wrong register; such lines fall through and
    # are emitted as a plain mnemonic instead.
    if (defined($opcodelet{$instr})
        && $operands =~ /%xmm([0-7]),\s*%xmm([0-7])(?!\d)/) {
        my @opcode = (0x0f, 0x38, $opcodelet{$instr});
        push @opcode, 0xc0 | ($1 & 7) | (($2 & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }

    return $instr . "\t" . $operands;
}

# Post-process the accumulated assembly text line by line and print it.
foreach (split("\n",$code)) {
    # Replace every `...` span with the result of evaluating it as Perl.
    s/\`([^\`]*)\`/eval $1/geo;

    # Route sha256* instructions through sha256op38().
    s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

    print $_,"\n";
}
# Buffered write errors only surface at close, so fail loudly here.
close STDOUT or die "error closing STDOUT: $!";