1e1051a39Sopenharmony_ci#! /usr/bin/env perl 2e1051a39Sopenharmony_ci# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. 3e1051a39Sopenharmony_ci# 4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License"). You may not use 5e1051a39Sopenharmony_ci# this file except in compliance with the License. You can obtain a copy 6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at 7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html 8e1051a39Sopenharmony_ci 9e1051a39Sopenharmony_ci# 10e1051a39Sopenharmony_ci# ==================================================================== 11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and 13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further 14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/. 15e1051a39Sopenharmony_ci# ==================================================================== 16e1051a39Sopenharmony_ci# 17e1051a39Sopenharmony_ci# SHA512 block transform for x86. September 2007. 18e1051a39Sopenharmony_ci# 19e1051a39Sopenharmony_ci# May 2013. 20e1051a39Sopenharmony_ci# 21e1051a39Sopenharmony_ci# Add SSSE3 code path, 20-25% improvement [over original SSE2 code]. 
22e1051a39Sopenharmony_ci# 23e1051a39Sopenharmony_ci# Performance in clock cycles per processed byte (less is better): 24e1051a39Sopenharmony_ci# 25e1051a39Sopenharmony_ci# gcc icc x86 asm SIMD(*) x86_64(**) 26e1051a39Sopenharmony_ci# Pentium 100 97 61 - - 27e1051a39Sopenharmony_ci# PIII 75 77 56 - - 28e1051a39Sopenharmony_ci# P4 116 95 82 34.6 30.8 29e1051a39Sopenharmony_ci# AMD K8 54 55 36 20.7 9.57 30e1051a39Sopenharmony_ci# Core2 66 57 40 15.9 9.97 31e1051a39Sopenharmony_ci# Westmere 70 - 38 12.2 9.58 32e1051a39Sopenharmony_ci# Sandy Bridge 58 - 35 11.9 11.2 33e1051a39Sopenharmony_ci# Ivy Bridge 50 - 33 11.5 8.17 34e1051a39Sopenharmony_ci# Haswell 46 - 29 11.3 7.66 35e1051a39Sopenharmony_ci# Skylake 40 - 26 13.3 7.25 36e1051a39Sopenharmony_ci# Bulldozer 121 - 50 14.0 13.5 37e1051a39Sopenharmony_ci# VIA Nano 91 - 52 33 14.7 38e1051a39Sopenharmony_ci# Atom 126 - 68 48(***) 14.7 39e1051a39Sopenharmony_ci# Silvermont 97 - 58 42(***) 17.5 40e1051a39Sopenharmony_ci# Goldmont 80 - 48 19.5 12.0 41e1051a39Sopenharmony_ci# 42e1051a39Sopenharmony_ci# (*) whichever best applicable. 43e1051a39Sopenharmony_ci# (**) x86_64 assembler performance is presented for reference 44e1051a39Sopenharmony_ci# purposes, the results are for integer-only code. 45e1051a39Sopenharmony_ci# (***) paddq is incredibly slow on Atom. 46e1051a39Sopenharmony_ci# 47e1051a39Sopenharmony_ci# IALU code-path is optimized for elder Pentiums. On vanilla Pentium 48e1051a39Sopenharmony_ci# performance improvement over compiler generated code reaches ~60%, 49e1051a39Sopenharmony_ci# while on PIII - ~35%. On newer µ-archs improvement varies from 15% 50e1051a39Sopenharmony_ci# to 50%, but it's less important as they are expected to execute SSE2 51e1051a39Sopenharmony_ci# code-path, which is commonly ~2-3x faster [than compiler generated 52e1051a39Sopenharmony_ci# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even 53e1051a39Sopenharmony_ci# though it does not use 128-bit operations. 
The latter means that 54e1051a39Sopenharmony_ci# SSE2-aware kernel is no longer required to execute the code. Another 55e1051a39Sopenharmony_ci# difference is that new code optimizes amount of writes, but at the 56e1051a39Sopenharmony_ci# cost of increased data cache "footprint" by 1/2KB. 57e1051a39Sopenharmony_ci 58e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 59e1051a39Sopenharmony_cipush(@INC,"${dir}","${dir}../../perlasm"); 60e1051a39Sopenharmony_cirequire "x86asm.pl"; 61e1051a39Sopenharmony_ci 62e1051a39Sopenharmony_ci$output=pop and open STDOUT,">$output"; 63e1051a39Sopenharmony_ci 64e1051a39Sopenharmony_ci&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); 65e1051a39Sopenharmony_ci 66e1051a39Sopenharmony_ci$sse2=0; 67e1051a39Sopenharmony_cifor (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } 68e1051a39Sopenharmony_ci 69e1051a39Sopenharmony_ci&external_label("OPENSSL_ia32cap_P") if ($sse2); 70e1051a39Sopenharmony_ci 71e1051a39Sopenharmony_ci$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp"); 72e1051a39Sopenharmony_ci$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp"); 73e1051a39Sopenharmony_ci$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp"); 74e1051a39Sopenharmony_ci$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp"); 75e1051a39Sopenharmony_ci$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp"); 76e1051a39Sopenharmony_ci$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp"); 77e1051a39Sopenharmony_ci$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp"); 78e1051a39Sopenharmony_ci$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp"); 79e1051a39Sopenharmony_ci$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp"); 80e1051a39Sopenharmony_ci$K512="ebp"; 81e1051a39Sopenharmony_ci 82e1051a39Sopenharmony_ci$Asse2=&QWP(0,"esp"); 83e1051a39Sopenharmony_ci$Bsse2=&QWP(8,"esp"); 84e1051a39Sopenharmony_ci$Csse2=&QWP(16,"esp"); 85e1051a39Sopenharmony_ci$Dsse2=&QWP(24,"esp"); 86e1051a39Sopenharmony_ci$Esse2=&QWP(32,"esp"); 87e1051a39Sopenharmony_ci$Fsse2=&QWP(40,"esp"); 88e1051a39Sopenharmony_ci$Gsse2=&QWP(48,"esp"); 
89e1051a39Sopenharmony_ci$Hsse2=&QWP(56,"esp"); 90e1051a39Sopenharmony_ci 91e1051a39Sopenharmony_ci$A="mm0"; # B-D and 92e1051a39Sopenharmony_ci$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and 93e1051a39Sopenharmony_ci # mm5-mm7, but it's done on on-demand basis... 94e1051a39Sopenharmony_ci$BxC="mm2"; # ... except for B^C 95e1051a39Sopenharmony_ci 96e1051a39Sopenharmony_cisub BODY_00_15_sse2 { 97e1051a39Sopenharmony_ci my $phase=shift; 98e1051a39Sopenharmony_ci 99e1051a39Sopenharmony_ci #&movq ("mm5",$Fsse2); # load f 100e1051a39Sopenharmony_ci #&movq ("mm6",$Gsse2); # load g 101e1051a39Sopenharmony_ci 102e1051a39Sopenharmony_ci &movq ("mm1",$E); # %mm1 is sliding right 103e1051a39Sopenharmony_ci &pxor ("mm5","mm6"); # f^=g 104e1051a39Sopenharmony_ci &psrlq ("mm1",14); 105e1051a39Sopenharmony_ci &movq ($Esse2,$E); # modulo-scheduled save e 106e1051a39Sopenharmony_ci &pand ("mm5",$E); # f&=e 107e1051a39Sopenharmony_ci &psllq ($E,23); # $E is sliding left 108e1051a39Sopenharmony_ci &movq ($A,"mm3") if ($phase<2); 109e1051a39Sopenharmony_ci &movq (&QWP(8*9,"esp"),"mm7") # save X[i] 110e1051a39Sopenharmony_ci &movq ("mm3","mm1"); # %mm3 is T1 111e1051a39Sopenharmony_ci &psrlq ("mm1",4); 112e1051a39Sopenharmony_ci &pxor ("mm5","mm6"); # Ch(e,f,g) 113e1051a39Sopenharmony_ci &pxor ("mm3",$E); 114e1051a39Sopenharmony_ci &psllq ($E,23); 115e1051a39Sopenharmony_ci &pxor ("mm3","mm1"); 116e1051a39Sopenharmony_ci &movq ($Asse2,$A); # modulo-scheduled save a 117e1051a39Sopenharmony_ci &paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g) 118e1051a39Sopenharmony_ci &pxor ("mm3",$E); 119e1051a39Sopenharmony_ci &psrlq ("mm1",23); 120e1051a39Sopenharmony_ci &paddq ("mm7",$Hsse2); # X[i]+=h 121e1051a39Sopenharmony_ci &pxor ("mm3","mm1"); 122e1051a39Sopenharmony_ci &psllq ($E,4); 123e1051a39Sopenharmony_ci &paddq ("mm7",QWP(0,$K512)); # X[i]+=K512[i] 124e1051a39Sopenharmony_ci &pxor ("mm3",$E); # T1=Sigma1_512(e) 125e1051a39Sopenharmony_ci 126e1051a39Sopenharmony_ci &movq 
($E,$Dsse2); # e = load d, e in next round 127e1051a39Sopenharmony_ci &paddq ("mm3","mm7"); # T1+=X[i] 128e1051a39Sopenharmony_ci &movq ("mm5",$A); # %mm5 is sliding right 129e1051a39Sopenharmony_ci &psrlq ("mm5",28); 130e1051a39Sopenharmony_ci &paddq ($E,"mm3"); # d += T1 131e1051a39Sopenharmony_ci &movq ("mm6",$A); # %mm6 is sliding left 132e1051a39Sopenharmony_ci &movq ("mm7","mm5"); 133e1051a39Sopenharmony_ci &psllq ("mm6",25); 134e1051a39Sopenharmony_ci &movq ("mm1",$Bsse2); # load b 135e1051a39Sopenharmony_ci &psrlq ("mm5",6); 136e1051a39Sopenharmony_ci &pxor ("mm7","mm6"); 137e1051a39Sopenharmony_ci &sub ("esp",8); 138e1051a39Sopenharmony_ci &psllq ("mm6",5); 139e1051a39Sopenharmony_ci &pxor ("mm7","mm5"); 140e1051a39Sopenharmony_ci &pxor ($A,"mm1"); # a^b, b^c in next round 141e1051a39Sopenharmony_ci &psrlq ("mm5",5); 142e1051a39Sopenharmony_ci &pxor ("mm7","mm6"); 143e1051a39Sopenharmony_ci &pand ($BxC,$A); # (b^c)&(a^b) 144e1051a39Sopenharmony_ci &psllq ("mm6",6); 145e1051a39Sopenharmony_ci &pxor ("mm7","mm5"); 146e1051a39Sopenharmony_ci &pxor ($BxC,"mm1"); # [h=]Maj(a,b,c) 147e1051a39Sopenharmony_ci &pxor ("mm6","mm7"); # Sigma0_512(a) 148e1051a39Sopenharmony_ci &movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch 149e1051a39Sopenharmony_ci &movq ("mm5",$Fsse2) if ($phase==0); # load f 150e1051a39Sopenharmony_ci 151e1051a39Sopenharmony_ci if ($phase>1) { 152e1051a39Sopenharmony_ci &paddq ($BxC,"mm6"); # h+=Sigma0(a) 153e1051a39Sopenharmony_ci &add ($K512,8); 154e1051a39Sopenharmony_ci #&paddq ($BxC,"mm3"); # h+=T1 155e1051a39Sopenharmony_ci 156e1051a39Sopenharmony_ci ($A,$BxC) = ($BxC,$A); # rotate registers 157e1051a39Sopenharmony_ci } else { 158e1051a39Sopenharmony_ci &paddq ("mm3",$BxC); # T1+=Maj(a,b,c) 159e1051a39Sopenharmony_ci &movq ($BxC,$A); 160e1051a39Sopenharmony_ci &add ($K512,8); 161e1051a39Sopenharmony_ci &paddq ("mm3","mm6"); # T1+=Sigma0(a) 162e1051a39Sopenharmony_ci &movq ("mm6",$Gsse2) if ($phase==0); # load g 
163e1051a39Sopenharmony_ci #&movq ($A,"mm3"); # h=T1 164e1051a39Sopenharmony_ci } 165e1051a39Sopenharmony_ci} 166e1051a39Sopenharmony_ci 167e1051a39Sopenharmony_cisub BODY_00_15_x86 { 168e1051a39Sopenharmony_ci #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) 169e1051a39Sopenharmony_ci # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 170e1051a39Sopenharmony_ci # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 171e1051a39Sopenharmony_ci &mov ("ecx",$Elo); 172e1051a39Sopenharmony_ci &mov ("edx",$Ehi); 173e1051a39Sopenharmony_ci &mov ("esi","ecx"); 174e1051a39Sopenharmony_ci 175e1051a39Sopenharmony_ci &shr ("ecx",9); # lo>>9 176e1051a39Sopenharmony_ci &mov ("edi","edx"); 177e1051a39Sopenharmony_ci &shr ("edx",9); # hi>>9 178e1051a39Sopenharmony_ci &mov ("ebx","ecx"); 179e1051a39Sopenharmony_ci &shl ("esi",14); # lo<<14 180e1051a39Sopenharmony_ci &mov ("eax","edx"); 181e1051a39Sopenharmony_ci &shl ("edi",14); # hi<<14 182e1051a39Sopenharmony_ci &xor ("ebx","esi"); 183e1051a39Sopenharmony_ci 184e1051a39Sopenharmony_ci &shr ("ecx",14-9); # lo>>14 185e1051a39Sopenharmony_ci &xor ("eax","edi"); 186e1051a39Sopenharmony_ci &shr ("edx",14-9); # hi>>14 187e1051a39Sopenharmony_ci &xor ("eax","ecx"); 188e1051a39Sopenharmony_ci &shl ("esi",18-14); # lo<<18 189e1051a39Sopenharmony_ci &xor ("ebx","edx"); 190e1051a39Sopenharmony_ci &shl ("edi",18-14); # hi<<18 191e1051a39Sopenharmony_ci &xor ("ebx","esi"); 192e1051a39Sopenharmony_ci 193e1051a39Sopenharmony_ci &shr ("ecx",18-14); # lo>>18 194e1051a39Sopenharmony_ci &xor ("eax","edi"); 195e1051a39Sopenharmony_ci &shr ("edx",18-14); # hi>>18 196e1051a39Sopenharmony_ci &xor ("eax","ecx"); 197e1051a39Sopenharmony_ci &shl ("esi",23-18); # lo<<23 198e1051a39Sopenharmony_ci &xor ("ebx","edx"); 199e1051a39Sopenharmony_ci &shl ("edi",23-18); # hi<<23 200e1051a39Sopenharmony_ci &xor ("eax","esi"); 201e1051a39Sopenharmony_ci &xor ("ebx","edi"); # T1 = Sigma1(e) 202e1051a39Sopenharmony_ci 203e1051a39Sopenharmony_ci &mov 
("ecx",$Flo); 204e1051a39Sopenharmony_ci &mov ("edx",$Fhi); 205e1051a39Sopenharmony_ci &mov ("esi",$Glo); 206e1051a39Sopenharmony_ci &mov ("edi",$Ghi); 207e1051a39Sopenharmony_ci &add ("eax",$Hlo); 208e1051a39Sopenharmony_ci &adc ("ebx",$Hhi); # T1 += h 209e1051a39Sopenharmony_ci &xor ("ecx","esi"); 210e1051a39Sopenharmony_ci &xor ("edx","edi"); 211e1051a39Sopenharmony_ci &and ("ecx",$Elo); 212e1051a39Sopenharmony_ci &and ("edx",$Ehi); 213e1051a39Sopenharmony_ci &add ("eax",&DWP(8*(9+15)+0,"esp")); 214e1051a39Sopenharmony_ci &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0] 215e1051a39Sopenharmony_ci &xor ("ecx","esi"); 216e1051a39Sopenharmony_ci &xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g 217e1051a39Sopenharmony_ci 218e1051a39Sopenharmony_ci &mov ("esi",&DWP(0,$K512)); 219e1051a39Sopenharmony_ci &mov ("edi",&DWP(4,$K512)); # K[i] 220e1051a39Sopenharmony_ci &add ("eax","ecx"); 221e1051a39Sopenharmony_ci &adc ("ebx","edx"); # T1 += Ch(e,f,g) 222e1051a39Sopenharmony_ci &mov ("ecx",$Dlo); 223e1051a39Sopenharmony_ci &mov ("edx",$Dhi); 224e1051a39Sopenharmony_ci &add ("eax","esi"); 225e1051a39Sopenharmony_ci &adc ("ebx","edi"); # T1 += K[i] 226e1051a39Sopenharmony_ci &mov ($Tlo,"eax"); 227e1051a39Sopenharmony_ci &mov ($Thi,"ebx"); # put T1 away 228e1051a39Sopenharmony_ci &add ("eax","ecx"); 229e1051a39Sopenharmony_ci &adc ("ebx","edx"); # d += T1 230e1051a39Sopenharmony_ci 231e1051a39Sopenharmony_ci #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) 232e1051a39Sopenharmony_ci # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 233e1051a39Sopenharmony_ci # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 234e1051a39Sopenharmony_ci &mov ("ecx",$Alo); 235e1051a39Sopenharmony_ci &mov ("edx",$Ahi); 236e1051a39Sopenharmony_ci &mov ($Dlo,"eax"); 237e1051a39Sopenharmony_ci &mov ($Dhi,"ebx"); 238e1051a39Sopenharmony_ci &mov ("esi","ecx"); 239e1051a39Sopenharmony_ci 240e1051a39Sopenharmony_ci &shr ("ecx",2); # lo>>2 241e1051a39Sopenharmony_ci &mov ("edi","edx"); 
242e1051a39Sopenharmony_ci &shr ("edx",2); # hi>>2 243e1051a39Sopenharmony_ci &mov ("ebx","ecx"); 244e1051a39Sopenharmony_ci &shl ("esi",4); # lo<<4 245e1051a39Sopenharmony_ci &mov ("eax","edx"); 246e1051a39Sopenharmony_ci &shl ("edi",4); # hi<<4 247e1051a39Sopenharmony_ci &xor ("ebx","esi"); 248e1051a39Sopenharmony_ci 249e1051a39Sopenharmony_ci &shr ("ecx",7-2); # lo>>7 250e1051a39Sopenharmony_ci &xor ("eax","edi"); 251e1051a39Sopenharmony_ci &shr ("edx",7-2); # hi>>7 252e1051a39Sopenharmony_ci &xor ("ebx","ecx"); 253e1051a39Sopenharmony_ci &shl ("esi",25-4); # lo<<25 254e1051a39Sopenharmony_ci &xor ("eax","edx"); 255e1051a39Sopenharmony_ci &shl ("edi",25-4); # hi<<25 256e1051a39Sopenharmony_ci &xor ("eax","esi"); 257e1051a39Sopenharmony_ci 258e1051a39Sopenharmony_ci &shr ("ecx",28-7); # lo>>28 259e1051a39Sopenharmony_ci &xor ("ebx","edi"); 260e1051a39Sopenharmony_ci &shr ("edx",28-7); # hi>>28 261e1051a39Sopenharmony_ci &xor ("eax","ecx"); 262e1051a39Sopenharmony_ci &shl ("esi",30-25); # lo<<30 263e1051a39Sopenharmony_ci &xor ("ebx","edx"); 264e1051a39Sopenharmony_ci &shl ("edi",30-25); # hi<<30 265e1051a39Sopenharmony_ci &xor ("eax","esi"); 266e1051a39Sopenharmony_ci &xor ("ebx","edi"); # Sigma0(a) 267e1051a39Sopenharmony_ci 268e1051a39Sopenharmony_ci &mov ("ecx",$Alo); 269e1051a39Sopenharmony_ci &mov ("edx",$Ahi); 270e1051a39Sopenharmony_ci &mov ("esi",$Blo); 271e1051a39Sopenharmony_ci &mov ("edi",$Bhi); 272e1051a39Sopenharmony_ci &add ("eax",$Tlo); 273e1051a39Sopenharmony_ci &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1 274e1051a39Sopenharmony_ci &or ("ecx","esi"); 275e1051a39Sopenharmony_ci &or ("edx","edi"); 276e1051a39Sopenharmony_ci &and ("ecx",$Clo); 277e1051a39Sopenharmony_ci &and ("edx",$Chi); 278e1051a39Sopenharmony_ci &and ("esi",$Alo); 279e1051a39Sopenharmony_ci &and ("edi",$Ahi); 280e1051a39Sopenharmony_ci &or ("ecx","esi"); 281e1051a39Sopenharmony_ci &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b) 282e1051a39Sopenharmony_ci 283e1051a39Sopenharmony_ci 
&add ("eax","ecx"); 284e1051a39Sopenharmony_ci &adc ("ebx","edx"); # T1 += Maj(a,b,c) 285e1051a39Sopenharmony_ci &mov ($Tlo,"eax"); 286e1051a39Sopenharmony_ci &mov ($Thi,"ebx"); 287e1051a39Sopenharmony_ci 288e1051a39Sopenharmony_ci &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K 289e1051a39Sopenharmony_ci &sub ("esp",8); 290e1051a39Sopenharmony_ci &lea ($K512,&DWP(8,$K512)); # K++ 291e1051a39Sopenharmony_ci} 292e1051a39Sopenharmony_ci 293e1051a39Sopenharmony_ci 294e1051a39Sopenharmony_ci&function_begin("sha512_block_data_order"); 295e1051a39Sopenharmony_ci &mov ("esi",wparam(0)); # ctx 296e1051a39Sopenharmony_ci &mov ("edi",wparam(1)); # inp 297e1051a39Sopenharmony_ci &mov ("eax",wparam(2)); # num 298e1051a39Sopenharmony_ci &mov ("ebx","esp"); # saved sp 299e1051a39Sopenharmony_ci 300e1051a39Sopenharmony_ci &call (&label("pic_point")); # make it PIC! 301e1051a39Sopenharmony_ci&set_label("pic_point"); 302e1051a39Sopenharmony_ci &blindpop($K512); 303e1051a39Sopenharmony_ci &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512)); 304e1051a39Sopenharmony_ci 305e1051a39Sopenharmony_ci &sub ("esp",16); 306e1051a39Sopenharmony_ci &and ("esp",-64); 307e1051a39Sopenharmony_ci 308e1051a39Sopenharmony_ci &shl ("eax",7); 309e1051a39Sopenharmony_ci &add ("eax","edi"); 310e1051a39Sopenharmony_ci &mov (&DWP(0,"esp"),"esi"); # ctx 311e1051a39Sopenharmony_ci &mov (&DWP(4,"esp"),"edi"); # inp 312e1051a39Sopenharmony_ci &mov (&DWP(8,"esp"),"eax"); # inp+num*128 313e1051a39Sopenharmony_ci &mov (&DWP(12,"esp"),"ebx"); # saved sp 314e1051a39Sopenharmony_ci 315e1051a39Sopenharmony_ciif ($sse2) { 316e1051a39Sopenharmony_ci &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512")); 317e1051a39Sopenharmony_ci &mov ("ecx",&DWP(0,"edx")); 318e1051a39Sopenharmony_ci &test ("ecx",1<<26); 319e1051a39Sopenharmony_ci &jz (&label("loop_x86")); 320e1051a39Sopenharmony_ci 321e1051a39Sopenharmony_ci &mov ("edx",&DWP(4,"edx")); 322e1051a39Sopenharmony_ci 323e1051a39Sopenharmony_ci # load 
ctx->h[0-7] 324e1051a39Sopenharmony_ci &movq ($A,&QWP(0,"esi")); 325e1051a39Sopenharmony_ci &and ("ecx",1<<24); # XMM registers availability 326e1051a39Sopenharmony_ci &movq ("mm1",&QWP(8,"esi")); 327e1051a39Sopenharmony_ci &and ("edx",1<<9); # SSSE3 bit 328e1051a39Sopenharmony_ci &movq ($BxC,&QWP(16,"esi")); 329e1051a39Sopenharmony_ci &or ("ecx","edx"); 330e1051a39Sopenharmony_ci &movq ("mm3",&QWP(24,"esi")); 331e1051a39Sopenharmony_ci &movq ($E,&QWP(32,"esi")); 332e1051a39Sopenharmony_ci &movq ("mm5",&QWP(40,"esi")); 333e1051a39Sopenharmony_ci &movq ("mm6",&QWP(48,"esi")); 334e1051a39Sopenharmony_ci &movq ("mm7",&QWP(56,"esi")); 335e1051a39Sopenharmony_ci &cmp ("ecx",1<<24|1<<9); 336e1051a39Sopenharmony_ci &je (&label("SSSE3")); 337e1051a39Sopenharmony_ci &sub ("esp",8*10); 338e1051a39Sopenharmony_ci &jmp (&label("loop_sse2")); 339e1051a39Sopenharmony_ci 340e1051a39Sopenharmony_ci&set_label("loop_sse2",16); 341e1051a39Sopenharmony_ci #&movq ($Asse2,$A); 342e1051a39Sopenharmony_ci &movq ($Bsse2,"mm1"); 343e1051a39Sopenharmony_ci &movq ($Csse2,$BxC); 344e1051a39Sopenharmony_ci &movq ($Dsse2,"mm3"); 345e1051a39Sopenharmony_ci #&movq ($Esse2,$E); 346e1051a39Sopenharmony_ci &movq ($Fsse2,"mm5"); 347e1051a39Sopenharmony_ci &movq ($Gsse2,"mm6"); 348e1051a39Sopenharmony_ci &pxor ($BxC,"mm1"); # magic 349e1051a39Sopenharmony_ci &movq ($Hsse2,"mm7"); 350e1051a39Sopenharmony_ci &movq ("mm3",$A); # magic 351e1051a39Sopenharmony_ci 352e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"edi")); 353e1051a39Sopenharmony_ci &mov ("ebx",&DWP(4,"edi")); 354e1051a39Sopenharmony_ci &add ("edi",8); 355e1051a39Sopenharmony_ci &mov ("edx",15); # counter 356e1051a39Sopenharmony_ci &bswap ("eax"); 357e1051a39Sopenharmony_ci &bswap ("ebx"); 358e1051a39Sopenharmony_ci &jmp (&label("00_14_sse2")); 359e1051a39Sopenharmony_ci 360e1051a39Sopenharmony_ci&set_label("00_14_sse2",16); 361e1051a39Sopenharmony_ci &movd ("mm1","eax"); 362e1051a39Sopenharmony_ci &mov ("eax",&DWP(0,"edi")); 
363e1051a39Sopenharmony_ci &movd ("mm7","ebx"); 364e1051a39Sopenharmony_ci &mov ("ebx",&DWP(4,"edi")); 365e1051a39Sopenharmony_ci &add ("edi",8); 366e1051a39Sopenharmony_ci &bswap ("eax"); 367e1051a39Sopenharmony_ci &bswap ("ebx"); 368e1051a39Sopenharmony_ci &punpckldq("mm7","mm1"); 369e1051a39Sopenharmony_ci 370e1051a39Sopenharmony_ci &BODY_00_15_sse2(); 371e1051a39Sopenharmony_ci 372e1051a39Sopenharmony_ci &dec ("edx"); 373e1051a39Sopenharmony_ci &jnz (&label("00_14_sse2")); 374e1051a39Sopenharmony_ci 375e1051a39Sopenharmony_ci &movd ("mm1","eax"); 376e1051a39Sopenharmony_ci &movd ("mm7","ebx"); 377e1051a39Sopenharmony_ci &punpckldq("mm7","mm1"); 378e1051a39Sopenharmony_ci 379e1051a39Sopenharmony_ci &BODY_00_15_sse2(1); 380e1051a39Sopenharmony_ci 381e1051a39Sopenharmony_ci &pxor ($A,$A); # A is in %mm3 382e1051a39Sopenharmony_ci &mov ("edx",32); # counter 383e1051a39Sopenharmony_ci &jmp (&label("16_79_sse2")); 384e1051a39Sopenharmony_ci 385e1051a39Sopenharmony_ci&set_label("16_79_sse2",16); 386e1051a39Sopenharmony_ci for ($j=0;$j<2;$j++) { # 2x unroll 387e1051a39Sopenharmony_ci #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 388e1051a39Sopenharmony_ci &movq ("mm5",&QWP(8*(9+16-14),"esp")); 389e1051a39Sopenharmony_ci &movq ("mm1","mm7"); 390e1051a39Sopenharmony_ci &psrlq ("mm7",1); 391e1051a39Sopenharmony_ci &movq ("mm6","mm5"); 392e1051a39Sopenharmony_ci &psrlq ("mm5",6); 393e1051a39Sopenharmony_ci &psllq ("mm1",56); 394e1051a39Sopenharmony_ci &paddq ($A,"mm3"); # from BODY_00_15 395e1051a39Sopenharmony_ci &movq ("mm3","mm7"); 396e1051a39Sopenharmony_ci &psrlq ("mm7",7-1); 397e1051a39Sopenharmony_ci &pxor ("mm3","mm1"); 398e1051a39Sopenharmony_ci &psllq ("mm1",63-56); 399e1051a39Sopenharmony_ci &pxor ("mm3","mm7"); 400e1051a39Sopenharmony_ci &psrlq ("mm7",8-7); 401e1051a39Sopenharmony_ci &pxor ("mm3","mm1"); 402e1051a39Sopenharmony_ci &movq ("mm1","mm5"); 403e1051a39Sopenharmony_ci &psrlq ("mm5",19-6); 404e1051a39Sopenharmony_ci &pxor 
("mm7","mm3"); # sigma0 405e1051a39Sopenharmony_ci 406e1051a39Sopenharmony_ci &psllq ("mm6",3); 407e1051a39Sopenharmony_ci &pxor ("mm1","mm5"); 408e1051a39Sopenharmony_ci &paddq ("mm7",&QWP(8*(9+16),"esp")); 409e1051a39Sopenharmony_ci &pxor ("mm1","mm6"); 410e1051a39Sopenharmony_ci &psrlq ("mm5",61-19); 411e1051a39Sopenharmony_ci &paddq ("mm7",&QWP(8*(9+16-9),"esp")); 412e1051a39Sopenharmony_ci &pxor ("mm1","mm5"); 413e1051a39Sopenharmony_ci &psllq ("mm6",45-3); 414e1051a39Sopenharmony_ci &movq ("mm5",$Fsse2); # load f 415e1051a39Sopenharmony_ci &pxor ("mm1","mm6"); # sigma1 416e1051a39Sopenharmony_ci &movq ("mm6",$Gsse2); # load g 417e1051a39Sopenharmony_ci 418e1051a39Sopenharmony_ci &paddq ("mm7","mm1"); # X[i] 419e1051a39Sopenharmony_ci #&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15 420e1051a39Sopenharmony_ci 421e1051a39Sopenharmony_ci &BODY_00_15_sse2(2); 422e1051a39Sopenharmony_ci } 423e1051a39Sopenharmony_ci &dec ("edx"); 424e1051a39Sopenharmony_ci &jnz (&label("16_79_sse2")); 425e1051a39Sopenharmony_ci 426e1051a39Sopenharmony_ci #&movq ($A,$Asse2); 427e1051a39Sopenharmony_ci &paddq ($A,"mm3"); # from BODY_00_15 428e1051a39Sopenharmony_ci &movq ("mm1",$Bsse2); 429e1051a39Sopenharmony_ci #&movq ($BxC,$Csse2); 430e1051a39Sopenharmony_ci &movq ("mm3",$Dsse2); 431e1051a39Sopenharmony_ci #&movq ($E,$Esse2); 432e1051a39Sopenharmony_ci &movq ("mm5",$Fsse2); 433e1051a39Sopenharmony_ci &movq ("mm6",$Gsse2); 434e1051a39Sopenharmony_ci &movq ("mm7",$Hsse2); 435e1051a39Sopenharmony_ci 436e1051a39Sopenharmony_ci &pxor ($BxC,"mm1"); # de-magic 437e1051a39Sopenharmony_ci &paddq ($A,&QWP(0,"esi")); 438e1051a39Sopenharmony_ci &paddq ("mm1",&QWP(8,"esi")); 439e1051a39Sopenharmony_ci &paddq ($BxC,&QWP(16,"esi")); 440e1051a39Sopenharmony_ci &paddq ("mm3",&QWP(24,"esi")); 441e1051a39Sopenharmony_ci &paddq ($E,&QWP(32,"esi")); 442e1051a39Sopenharmony_ci &paddq ("mm5",&QWP(40,"esi")); 443e1051a39Sopenharmony_ci &paddq ("mm6",&QWP(48,"esi")); 444e1051a39Sopenharmony_ci &paddq 
("mm7",&QWP(56,"esi")); 445e1051a39Sopenharmony_ci 446e1051a39Sopenharmony_ci &mov ("eax",8*80); 447e1051a39Sopenharmony_ci &movq (&QWP(0,"esi"),$A); 448e1051a39Sopenharmony_ci &movq (&QWP(8,"esi"),"mm1"); 449e1051a39Sopenharmony_ci &movq (&QWP(16,"esi"),$BxC); 450e1051a39Sopenharmony_ci &movq (&QWP(24,"esi"),"mm3"); 451e1051a39Sopenharmony_ci &movq (&QWP(32,"esi"),$E); 452e1051a39Sopenharmony_ci &movq (&QWP(40,"esi"),"mm5"); 453e1051a39Sopenharmony_ci &movq (&QWP(48,"esi"),"mm6"); 454e1051a39Sopenharmony_ci &movq (&QWP(56,"esi"),"mm7"); 455e1051a39Sopenharmony_ci 456e1051a39Sopenharmony_ci &lea ("esp",&DWP(0,"esp","eax")); # destroy frame 457e1051a39Sopenharmony_ci &sub ($K512,"eax"); # rewind K 458e1051a39Sopenharmony_ci 459e1051a39Sopenharmony_ci &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet? 460e1051a39Sopenharmony_ci &jb (&label("loop_sse2")); 461e1051a39Sopenharmony_ci 462e1051a39Sopenharmony_ci &mov ("esp",&DWP(8*10+12,"esp")); # restore sp 463e1051a39Sopenharmony_ci &emms (); 464e1051a39Sopenharmony_ci&function_end_A(); 465e1051a39Sopenharmony_ci 466e1051a39Sopenharmony_ci&set_label("SSSE3",32); 467e1051a39Sopenharmony_ci{ my ($cnt,$frame)=("ecx","edx"); 468e1051a39Sopenharmony_ci my @X=map("xmm$_",(0..7)); 469e1051a39Sopenharmony_ci my $j; 470e1051a39Sopenharmony_ci my $i=0; 471e1051a39Sopenharmony_ci 472e1051a39Sopenharmony_ci &lea ($frame,&DWP(-64,"esp")); 473e1051a39Sopenharmony_ci &sub ("esp",256); 474e1051a39Sopenharmony_ci 475e1051a39Sopenharmony_ci # fixed stack frame layout 476e1051a39Sopenharmony_ci # 477e1051a39Sopenharmony_ci # +0 A B C D E F G H # backing store 478e1051a39Sopenharmony_ci # +64 X[0]+K[i] .. 
X[15]+K[i] # XMM->MM xfer area 479e1051a39Sopenharmony_ci # +192 # XMM off-load ring buffer 480e1051a39Sopenharmony_ci # +256 # saved parameters 481e1051a39Sopenharmony_ci 482e1051a39Sopenharmony_ci &movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask 483e1051a39Sopenharmony_ci &movdqu (@X[0],&QWP(0,"edi")); 484e1051a39Sopenharmony_ci &pshufb (@X[0],@X[1]); 485e1051a39Sopenharmony_ci for ($j=0;$j<8;$j++) { 486e1051a39Sopenharmony_ci &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load 487e1051a39Sopenharmony_ci &movdqa (@X[3],&QWP(16*($j%8),$K512)); 488e1051a39Sopenharmony_ci &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask 489e1051a39Sopenharmony_ci &movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input 490e1051a39Sopenharmony_ci &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0] 491e1051a39Sopenharmony_ci &paddq (@X[3],@X[0]); 492e1051a39Sopenharmony_ci &pshufb (@X[1],@X[2]) if ($j<7); 493e1051a39Sopenharmony_ci &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i] 494e1051a39Sopenharmony_ci 495e1051a39Sopenharmony_ci push(@X,shift(@X)); # rotate(@X) 496e1051a39Sopenharmony_ci } 497e1051a39Sopenharmony_ci #&jmp (&label("loop_ssse3")); 498e1051a39Sopenharmony_ci &nop (); 499e1051a39Sopenharmony_ci 500e1051a39Sopenharmony_ci&set_label("loop_ssse3",32); 501e1051a39Sopenharmony_ci &movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1] 502e1051a39Sopenharmony_ci &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3] 503e1051a39Sopenharmony_ci &lea ($K512,&DWP(16*8,$K512)); 504e1051a39Sopenharmony_ci 505e1051a39Sopenharmony_ci #&movq ($Asse2,$A); # off-load A-H 506e1051a39Sopenharmony_ci &movq ($Bsse2,"mm1"); 507e1051a39Sopenharmony_ci &mov ("ebx","edi"); 508e1051a39Sopenharmony_ci &movq ($Csse2,$BxC); 509e1051a39Sopenharmony_ci &lea ("edi",&DWP(128,"edi")); # advance input 510e1051a39Sopenharmony_ci &movq ($Dsse2,"mm3"); 511e1051a39Sopenharmony_ci &cmp ("edi","eax"); 512e1051a39Sopenharmony_ci 
#&movq ($Esse2,$E); 513e1051a39Sopenharmony_ci &movq ($Fsse2,"mm5"); 514e1051a39Sopenharmony_ci &cmovb ("ebx","edi"); 515e1051a39Sopenharmony_ci &movq ($Gsse2,"mm6"); 516e1051a39Sopenharmony_ci &mov ("ecx",4); # loop counter 517e1051a39Sopenharmony_ci &pxor ($BxC,"mm1"); # magic 518e1051a39Sopenharmony_ci &movq ($Hsse2,"mm7"); 519e1051a39Sopenharmony_ci &pxor ("mm3","mm3"); # magic 520e1051a39Sopenharmony_ci 521e1051a39Sopenharmony_ci &jmp (&label("00_47_ssse3")); 522e1051a39Sopenharmony_ci 523e1051a39Sopenharmony_cisub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2 524e1051a39Sopenharmony_ci ( 525e1051a39Sopenharmony_ci '&movq ("mm1",$E)', # %mm1 is sliding right 526e1051a39Sopenharmony_ci '&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i] 527e1051a39Sopenharmony_ci '&pxor ("mm5","mm6")', # f^=g 528e1051a39Sopenharmony_ci '&psrlq ("mm1",14)', 529e1051a39Sopenharmony_ci '&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e 530e1051a39Sopenharmony_ci '&pand ("mm5",$E)', # f&=e 531e1051a39Sopenharmony_ci '&psllq ($E,23)', # $E is sliding left 532e1051a39Sopenharmony_ci '&paddq ($A,"mm3")', # [h+=Maj(a,b,c)] 533e1051a39Sopenharmony_ci '&movq ("mm3","mm1")', # %mm3 is T1 534e1051a39Sopenharmony_ci '&psrlq("mm1",4)', 535e1051a39Sopenharmony_ci '&pxor ("mm5","mm6")', # Ch(e,f,g) 536e1051a39Sopenharmony_ci '&pxor ("mm3",$E)', 537e1051a39Sopenharmony_ci '&psllq($E,23)', 538e1051a39Sopenharmony_ci '&pxor ("mm3","mm1")', 539e1051a39Sopenharmony_ci '&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a 540e1051a39Sopenharmony_ci '&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g) 541e1051a39Sopenharmony_ci '&pxor ("mm3",$E)', 542e1051a39Sopenharmony_ci '&psrlq("mm1",23)', 543e1051a39Sopenharmony_ci '&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h 544e1051a39Sopenharmony_ci '&pxor ("mm3","mm1")', 545e1051a39Sopenharmony_ci '&psllq($E,4)', 546e1051a39Sopenharmony_ci '&pxor ("mm3",$E)', # T1=Sigma1_512(e) 547e1051a39Sopenharmony_ci 
# NOTE(review): this span opens inside a list of instruction strings whose
# beginning lies above this chunk (presumably the body of BODY_00_15_ssse3,
# which is called below).  Each element is Perl source that is eval'd later
# by the loops in this span.
'&movq ($E,&QWP(8*($i+3)%64,"esp"))',                   # e = load d, e in next round
'&paddq ("mm3","mm7")',                                 # T1+=X[i]
'&movq ("mm5",$A)',                                     # %mm5 is sliding right
'&psrlq("mm5",28)',
'&paddq ($E,"mm3")',                                    # d += T1
'&movq ("mm6",$A)',                                     # %mm6 is sliding left
'&movq ("mm7","mm5")',
'&psllq("mm6",25)',
'&movq ("mm1",&QWP(8*($i+1)%64,"esp"))',                # load b
'&psrlq("mm5",6)',
'&pxor ("mm7","mm6")',
'&psllq("mm6",5)',
'&pxor ("mm7","mm5")',
'&pxor ($A,"mm1")',                                     # a^b, b^c in next round
'&psrlq("mm5",5)',
'&pxor ("mm7","mm6")',
'&pand ($BxC,$A)',                                      # (b^c)&(a^b)
'&psllq("mm6",6)',
'&pxor ("mm7","mm5")',
'&pxor ($BxC,"mm1")',                                   # [h=]Maj(a,b,c)
'&pxor ("mm6","mm7")',                                  # Sigma0_512(a)
'&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))',              # pre-load f
'&paddq ($BxC,"mm6")',                                  # h+=Sigma0(a)
'&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))',              # pre-load g

# The last element is generator-side bookkeeping, not an instruction: when
# eval'd it swaps the Perl variables $A and $BxC and decrements $i.
'($A,$BxC) = ($BxC,$A); $i--;'
);
}

# Run-time loop head, aligned to 32; the &jnz below branches back here while
# the counter in %ecx is non-zero.
&set_label("00_47_ssse3",32);

# Generator loop: every iteration emits the message-schedule update for one
# pair of 64-bit words, X[0..1] += X[9..10] + sigma0(X[1..2]) + sigma1(X[14..15]),
# interleaved with the instruction strings of two BODY_00_15_ssse3() calls.
# $j is not initialised here; it continues from the value left by code above
# this chunk.
for(;$j<16;$j++) {
    # Scratch registers are borrowed from @X[2..4] (note the order: t0,t2,t1).
    my ($t0,$t2,$t1)=@X[2..4];
    my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());

    &movdqa     ($t2,@X[5]);
    &movdqa     (@X[1],$t0);                            # restore @X[1]
    &palignr    ($t0,@X[0],8);                          # X[1..2]
    &movdqa     (&QWP(16*($j%4),$frame),@X[4]);         # off-load @X[4]
    &palignr    ($t2,@X[4],8);                          # X[9..10]

    # sigma0: shifts by 7, 1 and 8 combined with left shifts by 64-8 and 64-1
    # to build the two rotations.
    &movdqa     ($t1,$t0);
    &psrlq      ($t0,7);
    &paddq      (@X[0],$t2);                            # X[0..1] += X[9..10]
    &movdqa     ($t2,$t1);
    &psrlq      ($t1,1);
    &psllq      ($t2,64-8);
    &pxor       ($t0,$t1);
    &psrlq      ($t1,8-1);
    &pxor       ($t0,$t2);
    &psllq      ($t2,8-1);
    &pxor       ($t0,$t1);
    &movdqa     ($t1,@X[7]);
    &pxor       ($t0,$t2);                              # sigma0(X[1..2])
    &movdqa     ($t2,@X[7]);
    &psrlq      ($t1,6);
    &paddq      (@X[0],$t0);                            # X[0..1] += sigma0(X[1..2])

    # sigma1: shifts by 6, 19 and 61 combined with left shifts by 64-61 and
    # 64-19 to build the two rotations.
    &movdqa     ($t0,@X[7]);
    &psrlq      ($t2,19);
    &psllq      ($t0,64-61);
    &pxor       ($t1,$t2);
    &psrlq      ($t2,61-19);
    &pxor       ($t1,$t0);
    &psllq      ($t0,61-19);
    &pxor       ($t1,$t2);
    &movdqa     ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
    &pxor       ($t1,$t0);                              # sigma1(X[14..15])
    &movdqa     ($t0,&QWP(16*($j%8),$K512));
    # The first five round instructions are emitted one at a time around the
    # final schedule additions; the remainder follow in bulk.
    eval(shift(@insns));
    &paddq      (@X[0],$t1);                            # X[0..1] += sigma1(X[14..15])
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &paddq      ($t0,@X[0]);
    foreach(@insns) { eval; }
    &movdqa     (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]

    push(@X,shift(@X));                                 # rotate(@X)
}
# Advance the K512 pointer by 16*8 bytes and loop while %ecx is non-zero.
&lea    ($K512,&DWP(16*8,$K512));
&dec    ("ecx");
&jnz    (&label("00_47_ssse3"));

&movdqa (@X[1],&QWP(0,$K512));                          # byte swap mask
&lea    ($K512,&DWP(-80*8,$K512));                      # rewind
&movdqu (@X[0],&QWP(0,"ebx"));
&pshufb (@X[0],@X[1]);

# Eight more generator iterations, each emitting two BODY_00_15_ssse3 copies
# overlapped with loading 16-byte input chunks from (%ebx), byte-swapping them
# with pshufb and adding the matching K512 entries.  The sums are stored at
# $frame+16*$j-128.
for ($j=0;$j<8;$j++) {  # load next or same block
    my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());

    &movdqa     (&QWP(16*(($j-1)%4),$frame),@X[3])      if ($j>4); # off-load
    &movdqa     (@X[3],&QWP(16*($j%8),$K512));
    &movdqa     (@X[2],@X[1])                           if ($j<7); # perpetuate byte swap mask
    &movdqu     (@X[1],&QWP(16*($j+1),"ebx"))           if ($j<7); # next input
    &movdqa     (@X[1],&QWP(16*(($j+1)%4),$frame))      if ($j==7);# restore @X[0]
    &paddq      (@X[3],@X[0]);
    &pshufb     (@X[1],@X[2])                           if ($j<7);
    foreach(@insns) { eval; }
    &movdqa     (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]

    push(@X,shift(@X));                                 # rotate(@X)
}

# Start of the state update: only B is reloaded here; the load of A is left
# commented out, and A first receives the pending addition from mm3.
#&movq  ($A,$Asse2);                                    # load A-H
&movq   ("mm1",$Bsse2);
&paddq  ($A,"mm3");                                     # from BODY_00_15
#&movq  ($BxC,$Csse2);
# Fold the working variables back into the 8x64-bit hash state at (%esi).
# Only D and H are reloaded from their stack slots here; the loads for E, F
# and G are left commented out, so $E, mm5 and mm6 are used as they stand.
&movq   ("mm3",$Dsse2);
#&movq  ($E,$Esse2);
#&movq  ("mm5",$Fsse2);
#&movq  ("mm6",$Gsse2);
&movq   ("mm7",$Hsse2);

# NOTE(review): $BxC presumably still holds b^c from the round code, so the
# XOR with mm1 (B) yields plain C again -- confirm against BODY_00_15_ssse3.
&pxor   ($BxC,"mm1");                   # de-magic

# state[i] += working variable i, in the order A,B,C,D,E,F,G,H.
&paddq  ($A,&QWP(0,"esi"));
&paddq  ("mm1",&QWP(8,"esi"));
&paddq  ($BxC,&QWP(16,"esi"));
&paddq  ("mm3",&QWP(24,"esi"));
&paddq  ($E,&QWP(32,"esi"));
&paddq  ("mm5",&QWP(40,"esi"));
&paddq  ("mm6",&QWP(48,"esi"));
&paddq  ("mm7",&QWP(56,"esi"));

# Write the updated state back.
&movq   (&QWP(0,"esi"),$A);
&movq   (&QWP(8,"esi"),"mm1");
&movq   (&QWP(16,"esi"),$BxC);
&movq   (&QWP(24,"esi"),"mm3");
&movq   (&QWP(32,"esi"),$E);
&movq   (&QWP(40,"esi"),"mm5");
&movq   (&QWP(48,"esi"),"mm6");
&movq   (&QWP(56,"esi"),"mm7");

# The terminating ';' matters: without it Perl would parse the following
# "&jb (...)" as the right-hand operand of a bitwise AND with this call.
&cmp    ("edi","eax");                  # are we done yet?
# Branch back to loop_ssse3 while %edi is below %eax (unsigned), using the
# flags of the compare emitted just before this point.
&jb     (&label("loop_ssse3"));

&mov    ("esp",&DWP(64+12,$frame));     # restore sp
&emms   ();
}       # closes a block opened above this chunk
&function_end_A();
}       # closes a block opened above this chunk

# ---------------------------------------------------------------------------
# Integer-only code path.  One pass through loop_x86 processes one 128-byte
# input block.
# ---------------------------------------------------------------------------
&set_label("loop_x86",16);
    # copy input block to stack reversing byte and qword order
    # (each generator iteration handles 16 input bytes: four dwords are
    # loaded from (%edi), byte-swapped and pushed)
    for ($i=0;$i<8;$i++) {
        &mov    ("eax",&DWP($i*16+0,"edi"));
        &mov    ("ebx",&DWP($i*16+4,"edi"));
        &mov    ("ecx",&DWP($i*16+8,"edi"));
        &mov    ("edx",&DWP($i*16+12,"edi"));
        &bswap  ("eax");
        &bswap  ("ebx");
        &bswap  ("ecx");
        &bswap  ("edx");
        &push   ("eax");
        &push   ("ebx");
        &push   ("ecx");
        &push   ("edx");
    }
    &add    ("edi",128);                # step input pointer past this block
    &sub    ("esp",9*8);                # place for T,A,B,C,D,E,F,G,H
    &mov    (&DWP(8*(9+16)+4,"esp"),"edi");     # save advanced input pointer in the frame

    # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
    &lea    ("edi",&DWP(8,"esp"));
    &mov    ("ecx",16);                 # 16 dwords = 8 qwords
    # Emitted as a raw dword: bytes 89 F6 F3 A5, i.e. "mov %esi,%esi" followed
    # by "rep movsd".
    &data_word(0xA5F3F689);             # rep movsd

&set_label("00_15_x86",16);
    &BODY_00_15_x86();

    # NOTE(review): presumably BODY_00_15_x86 leaves the low dword of the
    # current K512 entry in %edx; 0x94 is the low byte of the 16th table entry
    # (0xcf692694), so this would end the loop after round 15 -- confirm.
    &cmp    (&LB("edx"),0x94);
    &jne    (&label("00_15_x86"));

&set_label("16_79_x86",16);
    #define sigma0(x)   (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
    #   LO              lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
    #   HI              hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
    # The shifts are applied incrementally (e.g. 1, then 7-1, then 8-7), so
    # each register is shifted in place rather than reloaded.
    &mov    ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
    &mov    ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
    &mov    ("esi","ecx");

    &shr    ("ecx",1);                  # lo>>1
    &mov    ("edi","edx");
    &shr    ("edx",1);                  # hi>>1
    &mov    ("eax","ecx");
    &shl    ("esi",24);                 # lo<<24
    &mov    ("ebx","edx");
    &shl    ("edi",24);                 # hi<<24
    &xor    ("ebx","esi");

    &shr    ("ecx",7-1);                # lo>>7
    &xor    ("eax","edi");
    &shr    ("edx",7-1);                # hi>>7
    &xor    ("eax","ecx");
    &shl    ("esi",31-24);              # lo<<31
    &xor    ("ebx","edx");
    &shl    ("edi",25-24);              # hi<<25
    &xor    ("ebx","esi");

    &shr    ("ecx",8-7);                # lo>>8
    &xor    ("eax","edi");
    &shr    ("edx",8-7);                # hi>>8
    &xor    ("eax","ecx");
    &shl    ("edi",31-25);              # hi<<31
    &xor    ("ebx","edx");
    &xor    ("eax","edi");              # T1 = sigma0(X[-15])

    &mov    (&DWP(0,"esp"),"eax");
    &mov    (&DWP(4,"esp"),"ebx");      # put T1 away

    #define sigma1(x)   (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
    #   LO              lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
    #   HI              hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
    &mov    ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
    &mov    ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
    &mov    ("esi","ecx");

    &shr    ("ecx",6);                  # lo>>6
    &mov    ("edi","edx");
    &shr    ("edx",6);                  # hi>>6
    &mov    ("eax","ecx");
    &shl    ("esi",3);                  # lo<<3
    &mov    ("ebx","edx");
    &shl    ("edi",3);                  # hi<<3
    &xor    ("eax","esi");

    &shr    ("ecx",19-6);               # lo>>19
    &xor    ("ebx","edi");
    &shr    ("edx",19-6);               # hi>>19
    &xor    ("eax","ecx");
    &shl    ("esi",13-3);               # lo<<13
    &xor    ("ebx","edx");
    &shl    ("edi",13-3);               # hi<<13
    &xor    ("ebx","esi");

    &shr    ("ecx",29-19);              # lo>>29
    &xor    ("eax","edi");
    &shr    ("edx",29-19);              # hi>>29
    &xor    ("ebx","ecx");
    &shl    ("edi",26-13);              # hi<<26
    &xor    ("eax","edx");
    &xor    ("eax","edi");              # sigma1(X[-2])

    # 64-bit additions are done as add (low dword) / adc (high dword) pairs.
    &mov    ("ecx",&DWP(8*(9+15+16)+0,"esp"));
    &mov    ("edx",&DWP(8*(9+15+16)+4,"esp"));
    &add    ("eax",&DWP(0,"esp"));
    &adc    ("ebx",&DWP(4,"esp"));      # T1 = sigma1(X[-2])+T1
    &mov    ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
    &mov    ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
    &add    ("eax","ecx");
    &adc    ("ebx","edx");              # T1 += X[-16]
    &add    ("eax","esi");
    &adc    ("ebx","edi");              # T1 += X[-7]
    &mov    (&DWP(8*(9+15)+0,"esp"),"eax");
    &mov    (&DWP(8*(9+15)+4,"esp"),"ebx");     # save X[0]

    &BODY_00_15_x86();

    # NOTE(review): 0x17 is the low byte of the last K512 entry (0x4a475817);
    # as above, this presumably detects the final round -- confirm against
    # BODY_00_15_x86.
    &cmp    (&LB("edx"),0x17);
    &jne    (&label("16_79_x86"));

    &mov    ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
    &mov    ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
    # Add the eight 64-bit working variables stored from 8(%esp) into the
    # state at (%esi), two qwords per generator iteration.
    for($i=0;$i<4;$i++) {
        &mov    ("eax",&DWP($i*16+0,"esi"));
        &mov    ("ebx",&DWP($i*16+4,"esi"));
        &mov    ("ecx",&DWP($i*16+8,"esi"));
        &mov    ("edx",&DWP($i*16+12,"esi"));
        &add    ("eax",&DWP(8+($i*16)+0,"esp"));
        &adc    ("ebx",&DWP(8+($i*16)+4,"esp"));
        &mov    (&DWP($i*16+0,"esi"),"eax");
        &mov    (&DWP($i*16+4,"esi"),"ebx");
        &add    ("ecx",&DWP(8+($i*16)+8,"esp"));
        &adc    ("edx",&DWP(8+($i*16)+12,"esp"));
        &mov    (&DWP($i*16+8,"esi"),"ecx");
        &mov    (&DWP($i*16+12,"esi"),"edx");
    }
    &add    ("esp",8*(9+16+80));        # destroy frame
    &sub    ($K512,8*80);               # rewind K

    &cmp    ("edi",&DWP(8,"esp"));      # are we done yet?
    &jb     (&label("loop_x86"));

    &mov    ("esp",&DWP(12,"esp"));     # restore sp
&function_end_A();

# The constant table follows directly, aligned to 64 bytes.
&set_label("K512",64);  # Yes! I keep it in the code segment!
# SHA-512 round constants K[0..79].  Every 64-bit constant is emitted by one
# &data_word call as two 32-bit words, low dword first, so each table row
# below is [low, high].
foreach my $k_pair (
    [0xd728ae22,0x428a2f98], [0x23ef65cd,0x71374491],   # K[ 0], K[ 1]
    [0xec4d3b2f,0xb5c0fbcf], [0x8189dbbc,0xe9b5dba5],
    [0xf348b538,0x3956c25b], [0xb605d019,0x59f111f1],
    [0xaf194f9b,0x923f82a4], [0xda6d8118,0xab1c5ed5],
    [0xa3030242,0xd807aa98], [0x45706fbe,0x12835b01],   # K[ 8], K[ 9]
    [0x4ee4b28c,0x243185be], [0xd5ffb4e2,0x550c7dc3],
    [0xf27b896f,0x72be5d74], [0x3b1696b1,0x80deb1fe],
    [0x25c71235,0x9bdc06a7], [0xcf692694,0xc19bf174],
    [0x9ef14ad2,0xe49b69c1], [0x384f25e3,0xefbe4786],   # K[16], K[17]
    [0x8b8cd5b5,0x0fc19dc6], [0x77ac9c65,0x240ca1cc],
    [0x592b0275,0x2de92c6f], [0x6ea6e483,0x4a7484aa],
    [0xbd41fbd4,0x5cb0a9dc], [0x831153b5,0x76f988da],
    [0xee66dfab,0x983e5152], [0x2db43210,0xa831c66d],   # K[24], K[25]
    [0x98fb213f,0xb00327c8], [0xbeef0ee4,0xbf597fc7],
    [0x3da88fc2,0xc6e00bf3], [0x930aa725,0xd5a79147],
    [0xe003826f,0x06ca6351], [0x0a0e6e70,0x14292967],
    [0x46d22ffc,0x27b70a85], [0x5c26c926,0x2e1b2138],   # K[32], K[33]
    [0x5ac42aed,0x4d2c6dfc], [0x9d95b3df,0x53380d13],
    [0x8baf63de,0x650a7354], [0x3c77b2a8,0x766a0abb],
    [0x47edaee6,0x81c2c92e], [0x1482353b,0x92722c85],
    [0x4cf10364,0xa2bfe8a1], [0xbc423001,0xa81a664b],   # K[40], K[41]
    [0xd0f89791,0xc24b8b70], [0x0654be30,0xc76c51a3],
    [0xd6ef5218,0xd192e819], [0x5565a910,0xd6990624],
    [0x5771202a,0xf40e3585], [0x32bbd1b8,0x106aa070],
    [0xb8d2d0c8,0x19a4c116], [0x5141ab53,0x1e376c08],   # K[48], K[49]
    [0xdf8eeb99,0x2748774c], [0xe19b48a8,0x34b0bcb5],
    [0xc5c95a63,0x391c0cb3], [0xe3418acb,0x4ed8aa4a],
    [0x7763e373,0x5b9cca4f], [0xd6b2b8a3,0x682e6ff3],
    [0x5defb2fc,0x748f82ee], [0x43172f60,0x78a5636f],   # K[56], K[57]
    [0xa1f0ab72,0x84c87814], [0x1a6439ec,0x8cc70208],
    [0x23631e28,0x90befffa], [0xde82bde9,0xa4506ceb],
    [0xb2c67915,0xbef9a3f7], [0xe372532b,0xc67178f2],
    [0xea26619c,0xca273ece], [0x21c0c207,0xd186b8c7],   # K[64], K[65]
    [0xcde0eb1e,0xeada7dd6], [0xee6ed178,0xf57d4f7f],
    [0x72176fba,0x06f067aa], [0xa2c898a6,0x0a637dc5],
    [0xbef90dae,0x113f9804], [0x131c471b,0x1b710b35],
    [0x23047d84,0x28db77f5], [0x40c72493,0x32caab7b],   # K[72], K[73]
    [0x15c9bebc,0x3c9ebe0a], [0x9c100d4c,0x431d67c4],
    [0xcb3e42b6,0x4cc5d4be], [0xfc657e2a,0x597f299c],
    [0x3ad6faec,0x5fcb6fab], [0x4a475817,0x6c44198c],   # K[78], K[79]
) {
    my ($lo, $hi) = @{$k_pair};
    &data_word($lo, $hi);               # u64
}

# 16-byte byte swap mask stored right after the 80 constants; as little-endian
# bytes it reads 07..00, 0f..08, i.e. it reverses the bytes of each qword.
foreach my $mask_pair ([0x04050607,0x00010203], [0x0c0d0e0f,0x08090a0b]) {
    &data_word(@{$mask_pair});
}

# Close the function, append the identification string and flush the output.
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close(STDOUT) or die "error closing STDOUT: $!";