#! /usr/bin/env perl
# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512_Transform for Itanium.
#
# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
# faster than gcc and >60%(!) faster than code generated by HP-UX
# compiler (yes, HP-UX is generating slower code, because unlike gcc,
# it failed to deploy "shift right pair," 'shrp' instruction, which
# substitutes for 64-bit rotate).
#
# 924 cycles long sha256_block outperforms gcc by over a factor of 2(!)
# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
# this one big time). Note that "formally" 924 is about 100 cycles
# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
# are spent on extra work to provide for 32-bit rotations. 32-bit
# rotations are still handled by 'shrp' instruction and for this
# reason lower 32 bits are deposited to upper half of 64-bit register
# prior to 'shrp' issue. And in order to minimize the amount of such
# operations, X[16] values are *maintained* with copies of lower
# halves in upper halves, which is why you'll spot such instructions
# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
# 32-bit unsigned right shift," 'pshr4.u' instructions here.
#
# Rules of engagement.
#
# There is only one integer shifter meaning that if I have two rotate,
# deposit or extract instructions in adjacent bundles, they shall
# split [at run-time if they have to]. But note that variable and
# parallel shifts are performed by multi-media ALU and *are* pairable
# with rotates [and the like]. On the downside MMALU is rather slow: it
# takes 2 extra cycles before the result of integer operation is
# available *to* MMALU and 2(*) extra cycles before the result of MM
# operation is available "back" *to* integer ALU, not to mention that
# MMALU itself has 2 cycles latency. However! I explicitly scheduled
# these MM instructions to avoid MM stalls, so that all these extra
# latencies get "hidden" in instruction-level parallelism.
#
# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
#     for 2 in order to provide for best *overall* performance,
#     because on Itanium 1 stall on MM result is accompanied by
#     pipeline flush, which takes 6 cycles:-(
#
# June 2012
#
# Improve performance by 15-20%. Note about "rules of engagement"
# above. Contemporary cores are equipped with additional shifter,
# so that they should perform even better than below, presumably
# by ~10%.
#
######################################################################
# Current performance in cycles per processed byte for Itanium 2
# pre-9000 series [little-endian] system:
#
# SHA1(*)	5.7
# SHA256	12.6
# SHA512	6.7
#
# (*) SHA1 result is presented purely for reference purposes.
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.

# $output is the last argument if it looks like a file (it has an
# extension). It is popped off @ARGV, so the loops over @ARGV below
# only ever see the remaining arguments (the compiler flags).
$output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV) : undef;

# Select the SHA flavour from the output file name: it has to contain
# "512" or "256" somewhere ahead of a ".s" or ".asm" extension (matched
# case-insensitively). The extension is spelled as an alternation on
# purpose: a bracketed class such as [s|asm] would match any single one
# of the characters 's', '|', 'a' and 'm' rather than the two suffixes.
if (defined($output) && $output =~ /512.*\.(?:s|asm)/i) {
    $SZ=8;                              # word size in bytes
    $BITS=8*$SZ;
    $LDW="ld8";
    $STW="st8";
    $ADD="add";
    $SHRU="shr.u";
    $TABLE="K512";
    $func="sha512_block_data_order";
    @Sigma0=(28,34,39);
    @Sigma1=(14,18,41);
    @sigma0=(1,  8, 7);
    @sigma1=(19,61, 6);
    $rounds=80;
} elsif (defined($output) && $output =~ /256.*\.(?:s|asm)/i) {
    $SZ=4;                              # word size in bytes
    $BITS=8*$SZ;
    $LDW="ld4";
    $STW="st4";
    $ADD="padd4";
    $SHRU="pshr4.u";
    $TABLE="K256";
    $func="sha256_block_data_order";
    @Sigma0=( 2,13,22);
    @Sigma1=( 6,11,25);
    @sigma0=( 7,18, 3);
    @sigma1=(17,19,10);
    $rounds=64;
} else {
    # Keep the traditional "nonsense" diagnostic, but say what is wrong
    # when no output file name was recognised at all.
    die "nonsense ".(defined($output) ? $output : "(no output file name given)");
}

# $output is necessarily defined and non-empty here (it matched one of
# the patterns above). Three-argument open keeps the file name out of
# the mode string.
open(STDOUT, ">", $output) or die "can't open $output: $!";

# On HP-UX pointers are 32-bit unless a 64-bit data model is requested
# with +DD64 (HP compiler) or -mlp64 (gcc); only then is plain "add"
# used instead of "addp4" for the pointer arguments.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# Explicit -DB_ENDIAN/-DL_ENDIAN flags win (the last one seen decides);
# otherwise fall back to the byte order of the machine running this script.
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }

122e1051a39Sopenharmony_ci$code=<<___; 123e1051a39Sopenharmony_ci.ident \"$output, version 2.0\" 124e1051a39Sopenharmony_ci.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@openssl.org>\" 125e1051a39Sopenharmony_ci.explicit 126e1051a39Sopenharmony_ci.text 127e1051a39Sopenharmony_ci 128e1051a39Sopenharmony_cipfssave=r2; 129e1051a39Sopenharmony_cilcsave=r3; 130e1051a39Sopenharmony_ciprsave=r14; 131e1051a39Sopenharmony_ciK=r15; 132e1051a39Sopenharmony_ciA_=r16; B_=r17; C_=r18; D_=r19; 133e1051a39Sopenharmony_ciE_=r20; F_=r21; G_=r22; H_=r23; 134e1051a39Sopenharmony_ciT1=r24; T2=r25; 135e1051a39Sopenharmony_cis0=r26; s1=r27; t0=r28; t1=r29; 136e1051a39Sopenharmony_ciKtbl=r30; 137e1051a39Sopenharmony_cictx=r31; // 1st arg 138e1051a39Sopenharmony_ciinput=r56; // 2nd arg 139e1051a39Sopenharmony_cinum=r57; // 3rd arg 140e1051a39Sopenharmony_cisgm0=r58; sgm1=r59; // small constants 141e1051a39Sopenharmony_ci 142e1051a39Sopenharmony_ci// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host]) 143e1051a39Sopenharmony_ci.global $func# 144e1051a39Sopenharmony_ci.proc $func# 145e1051a39Sopenharmony_ci.align 32 146e1051a39Sopenharmony_ci.skip 16 147e1051a39Sopenharmony_ci$func: 148e1051a39Sopenharmony_ci .prologue 149e1051a39Sopenharmony_ci .save ar.pfs,pfssave 150e1051a39Sopenharmony_ci{ .mmi; alloc pfssave=ar.pfs,3,25,0,24 151e1051a39Sopenharmony_ci $ADDP ctx=0,r32 // 1st arg 152e1051a39Sopenharmony_ci .save ar.lc,lcsave 153e1051a39Sopenharmony_ci mov lcsave=ar.lc } 154e1051a39Sopenharmony_ci{ .mmi; $ADDP input=0,r33 // 2nd arg 155e1051a39Sopenharmony_ci mov num=r34 // 3rd arg 156e1051a39Sopenharmony_ci .save pr,prsave 157e1051a39Sopenharmony_ci mov prsave=pr };; 158e1051a39Sopenharmony_ci 159e1051a39Sopenharmony_ci .body 160e1051a39Sopenharmony_ci{ .mib; add r8=0*$SZ,ctx 161e1051a39Sopenharmony_ci add r9=1*$SZ,ctx } 162e1051a39Sopenharmony_ci{ .mib; add r10=2*$SZ,ctx 163e1051a39Sopenharmony_ci add r11=3*$SZ,ctx };; 164e1051a39Sopenharmony_ci 
165e1051a39Sopenharmony_ci// load A-H 166e1051a39Sopenharmony_ci.Lpic_point: 167e1051a39Sopenharmony_ci{ .mmi; $LDW A_=[r8],4*$SZ 168e1051a39Sopenharmony_ci $LDW B_=[r9],4*$SZ 169e1051a39Sopenharmony_ci mov Ktbl=ip } 170e1051a39Sopenharmony_ci{ .mmi; $LDW C_=[r10],4*$SZ 171e1051a39Sopenharmony_ci $LDW D_=[r11],4*$SZ 172e1051a39Sopenharmony_ci mov sgm0=$sigma0[2] };; 173e1051a39Sopenharmony_ci{ .mmi; $LDW E_=[r8] 174e1051a39Sopenharmony_ci $LDW F_=[r9] 175e1051a39Sopenharmony_ci add Ktbl=($TABLE#-.Lpic_point),Ktbl } 176e1051a39Sopenharmony_ci{ .mmi; $LDW G_=[r10] 177e1051a39Sopenharmony_ci $LDW H_=[r11] 178e1051a39Sopenharmony_ci cmp.ne p0,p16=0,r0 };; 179e1051a39Sopenharmony_ci___ 180e1051a39Sopenharmony_ci$code.=<<___ if ($BITS==64); 181e1051a39Sopenharmony_ci{ .mii; and r8=7,input 182e1051a39Sopenharmony_ci and input=~7,input;; 183e1051a39Sopenharmony_ci cmp.eq p9,p0=1,r8 } 184e1051a39Sopenharmony_ci{ .mmi; cmp.eq p10,p0=2,r8 185e1051a39Sopenharmony_ci cmp.eq p11,p0=3,r8 186e1051a39Sopenharmony_ci cmp.eq p12,p0=4,r8 } 187e1051a39Sopenharmony_ci{ .mmi; cmp.eq p13,p0=5,r8 188e1051a39Sopenharmony_ci cmp.eq p14,p0=6,r8 189e1051a39Sopenharmony_ci cmp.eq p15,p0=7,r8 };; 190e1051a39Sopenharmony_ci___ 191e1051a39Sopenharmony_ci$code.=<<___; 192e1051a39Sopenharmony_ci.L_outer: 193e1051a39Sopenharmony_ci.rotr R[8],X[16] 194e1051a39Sopenharmony_ciA=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7] 195e1051a39Sopenharmony_ci{ .mmi; ld1 X[15]=[input],$SZ // eliminated in sha512 196e1051a39Sopenharmony_ci mov A=A_ 197e1051a39Sopenharmony_ci mov ar.lc=14 } 198e1051a39Sopenharmony_ci{ .mmi; mov B=B_ 199e1051a39Sopenharmony_ci mov C=C_ 200e1051a39Sopenharmony_ci mov D=D_ } 201e1051a39Sopenharmony_ci{ .mmi; mov E=E_ 202e1051a39Sopenharmony_ci mov F=F_ 203e1051a39Sopenharmony_ci mov ar.ec=2 };; 204e1051a39Sopenharmony_ci{ .mmi; mov G=G_ 205e1051a39Sopenharmony_ci mov H=H_ 206e1051a39Sopenharmony_ci mov sgm1=$sigma1[2] } 207e1051a39Sopenharmony_ci{ .mib; mov r8=0 
208e1051a39Sopenharmony_ci add r9=1-$SZ,input 209e1051a39Sopenharmony_ci brp.loop.imp .L_first16,.L_first16_end-16 };; 210e1051a39Sopenharmony_ci___ 211e1051a39Sopenharmony_ci$t0="A", $t1="E", $code.=<<___ if ($BITS==64); 212e1051a39Sopenharmony_ci// in sha512 case I load whole X[16] at once and take care of alignment... 213e1051a39Sopenharmony_ci{ .mmi; add r8=1*$SZ,input 214e1051a39Sopenharmony_ci add r9=2*$SZ,input 215e1051a39Sopenharmony_ci add r10=3*$SZ,input };; 216e1051a39Sopenharmony_ci{ .mmb; $LDW X[15]=[input],4*$SZ 217e1051a39Sopenharmony_ci $LDW X[14]=[r8],4*$SZ 218e1051a39Sopenharmony_ci(p9) br.cond.dpnt.many .L1byte };; 219e1051a39Sopenharmony_ci{ .mmb; $LDW X[13]=[r9],4*$SZ 220e1051a39Sopenharmony_ci $LDW X[12]=[r10],4*$SZ 221e1051a39Sopenharmony_ci(p10) br.cond.dpnt.many .L2byte };; 222e1051a39Sopenharmony_ci{ .mmb; $LDW X[11]=[input],4*$SZ 223e1051a39Sopenharmony_ci $LDW X[10]=[r8],4*$SZ 224e1051a39Sopenharmony_ci(p11) br.cond.dpnt.many .L3byte };; 225e1051a39Sopenharmony_ci{ .mmb; $LDW X[ 9]=[r9],4*$SZ 226e1051a39Sopenharmony_ci $LDW X[ 8]=[r10],4*$SZ 227e1051a39Sopenharmony_ci(p12) br.cond.dpnt.many .L4byte };; 228e1051a39Sopenharmony_ci{ .mmb; $LDW X[ 7]=[input],4*$SZ 229e1051a39Sopenharmony_ci $LDW X[ 6]=[r8],4*$SZ 230e1051a39Sopenharmony_ci(p13) br.cond.dpnt.many .L5byte };; 231e1051a39Sopenharmony_ci{ .mmb; $LDW X[ 5]=[r9],4*$SZ 232e1051a39Sopenharmony_ci $LDW X[ 4]=[r10],4*$SZ 233e1051a39Sopenharmony_ci(p14) br.cond.dpnt.many .L6byte };; 234e1051a39Sopenharmony_ci{ .mmb; $LDW X[ 3]=[input],4*$SZ 235e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 236e1051a39Sopenharmony_ci(p15) br.cond.dpnt.many .L7byte };; 237e1051a39Sopenharmony_ci{ .mmb; $LDW X[ 1]=[r9],4*$SZ 238e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ } 239e1051a39Sopenharmony_ci{ .mib; mov r8=0 240e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 241e1051a39Sopenharmony_ci br.many .L_first16 };; 242e1051a39Sopenharmony_ci.L1byte: 243e1051a39Sopenharmony_ci{ 
.mmi; $LDW X[13]=[r9],4*$SZ 244e1051a39Sopenharmony_ci $LDW X[12]=[r10],4*$SZ 245e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],56 };; 246e1051a39Sopenharmony_ci{ .mmi; $LDW X[11]=[input],4*$SZ 247e1051a39Sopenharmony_ci $LDW X[10]=[r8],4*$SZ 248e1051a39Sopenharmony_ci shrp X[14]=X[14],X[13],56 } 249e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 9]=[r9],4*$SZ 250e1051a39Sopenharmony_ci $LDW X[ 8]=[r10],4*$SZ 251e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],56 };; 252e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 7]=[input],4*$SZ 253e1051a39Sopenharmony_ci $LDW X[ 6]=[r8],4*$SZ 254e1051a39Sopenharmony_ci shrp X[12]=X[12],X[11],56 } 255e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 5]=[r9],4*$SZ 256e1051a39Sopenharmony_ci $LDW X[ 4]=[r10],4*$SZ 257e1051a39Sopenharmony_ci shrp X[11]=X[11],X[10],56 };; 258e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 3]=[input],4*$SZ 259e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 260e1051a39Sopenharmony_ci shrp X[10]=X[10],X[ 9],56 } 261e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 262e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 263e1051a39Sopenharmony_ci shrp X[ 9]=X[ 9],X[ 8],56 };; 264e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 265e1051a39Sopenharmony_ci shrp X[ 8]=X[ 8],X[ 7],56 266e1051a39Sopenharmony_ci shrp X[ 7]=X[ 7],X[ 6],56 } 267e1051a39Sopenharmony_ci{ .mii; shrp X[ 6]=X[ 6],X[ 5],56 268e1051a39Sopenharmony_ci shrp X[ 5]=X[ 5],X[ 4],56 };; 269e1051a39Sopenharmony_ci{ .mii; shrp X[ 4]=X[ 4],X[ 3],56 270e1051a39Sopenharmony_ci shrp X[ 3]=X[ 3],X[ 2],56 } 271e1051a39Sopenharmony_ci{ .mii; shrp X[ 2]=X[ 2],X[ 1],56 272e1051a39Sopenharmony_ci shrp X[ 1]=X[ 1],X[ 0],56 } 273e1051a39Sopenharmony_ci{ .mib; shrp X[ 0]=X[ 0],T1,56 } 274e1051a39Sopenharmony_ci{ .mib; mov r8=0 275e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 276e1051a39Sopenharmony_ci br.many .L_first16 };; 277e1051a39Sopenharmony_ci.L2byte: 278e1051a39Sopenharmony_ci{ .mmi; $LDW X[11]=[input],4*$SZ 279e1051a39Sopenharmony_ci $LDW X[10]=[r8],4*$SZ 
280e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],48 } 281e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 9]=[r9],4*$SZ 282e1051a39Sopenharmony_ci $LDW X[ 8]=[r10],4*$SZ 283e1051a39Sopenharmony_ci shrp X[14]=X[14],X[13],48 };; 284e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 7]=[input],4*$SZ 285e1051a39Sopenharmony_ci $LDW X[ 6]=[r8],4*$SZ 286e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],48 } 287e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 5]=[r9],4*$SZ 288e1051a39Sopenharmony_ci $LDW X[ 4]=[r10],4*$SZ 289e1051a39Sopenharmony_ci shrp X[12]=X[12],X[11],48 };; 290e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 3]=[input],4*$SZ 291e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 292e1051a39Sopenharmony_ci shrp X[11]=X[11],X[10],48 } 293e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 294e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 295e1051a39Sopenharmony_ci shrp X[10]=X[10],X[ 9],48 };; 296e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 297e1051a39Sopenharmony_ci shrp X[ 9]=X[ 9],X[ 8],48 298e1051a39Sopenharmony_ci shrp X[ 8]=X[ 8],X[ 7],48 } 299e1051a39Sopenharmony_ci{ .mii; shrp X[ 7]=X[ 7],X[ 6],48 300e1051a39Sopenharmony_ci shrp X[ 6]=X[ 6],X[ 5],48 };; 301e1051a39Sopenharmony_ci{ .mii; shrp X[ 5]=X[ 5],X[ 4],48 302e1051a39Sopenharmony_ci shrp X[ 4]=X[ 4],X[ 3],48 } 303e1051a39Sopenharmony_ci{ .mii; shrp X[ 3]=X[ 3],X[ 2],48 304e1051a39Sopenharmony_ci shrp X[ 2]=X[ 2],X[ 1],48 } 305e1051a39Sopenharmony_ci{ .mii; shrp X[ 1]=X[ 1],X[ 0],48 306e1051a39Sopenharmony_ci shrp X[ 0]=X[ 0],T1,48 } 307e1051a39Sopenharmony_ci{ .mib; mov r8=0 308e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 309e1051a39Sopenharmony_ci br.many .L_first16 };; 310e1051a39Sopenharmony_ci.L3byte: 311e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 9]=[r9],4*$SZ 312e1051a39Sopenharmony_ci $LDW X[ 8]=[r10],4*$SZ 313e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],40 };; 314e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 7]=[input],4*$SZ 315e1051a39Sopenharmony_ci $LDW X[ 6]=[r8],4*$SZ 316e1051a39Sopenharmony_ci shrp 
X[14]=X[14],X[13],40 } 317e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 5]=[r9],4*$SZ 318e1051a39Sopenharmony_ci $LDW X[ 4]=[r10],4*$SZ 319e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],40 };; 320e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 3]=[input],4*$SZ 321e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 322e1051a39Sopenharmony_ci shrp X[12]=X[12],X[11],40 } 323e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 324e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 325e1051a39Sopenharmony_ci shrp X[11]=X[11],X[10],40 };; 326e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 327e1051a39Sopenharmony_ci shrp X[10]=X[10],X[ 9],40 328e1051a39Sopenharmony_ci shrp X[ 9]=X[ 9],X[ 8],40 } 329e1051a39Sopenharmony_ci{ .mii; shrp X[ 8]=X[ 8],X[ 7],40 330e1051a39Sopenharmony_ci shrp X[ 7]=X[ 7],X[ 6],40 };; 331e1051a39Sopenharmony_ci{ .mii; shrp X[ 6]=X[ 6],X[ 5],40 332e1051a39Sopenharmony_ci shrp X[ 5]=X[ 5],X[ 4],40 } 333e1051a39Sopenharmony_ci{ .mii; shrp X[ 4]=X[ 4],X[ 3],40 334e1051a39Sopenharmony_ci shrp X[ 3]=X[ 3],X[ 2],40 } 335e1051a39Sopenharmony_ci{ .mii; shrp X[ 2]=X[ 2],X[ 1],40 336e1051a39Sopenharmony_ci shrp X[ 1]=X[ 1],X[ 0],40 } 337e1051a39Sopenharmony_ci{ .mib; shrp X[ 0]=X[ 0],T1,40 } 338e1051a39Sopenharmony_ci{ .mib; mov r8=0 339e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 340e1051a39Sopenharmony_ci br.many .L_first16 };; 341e1051a39Sopenharmony_ci.L4byte: 342e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 7]=[input],4*$SZ 343e1051a39Sopenharmony_ci $LDW X[ 6]=[r8],4*$SZ 344e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],32 } 345e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 5]=[r9],4*$SZ 346e1051a39Sopenharmony_ci $LDW X[ 4]=[r10],4*$SZ 347e1051a39Sopenharmony_ci shrp X[14]=X[14],X[13],32 };; 348e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 3]=[input],4*$SZ 349e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 350e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],32 } 351e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 352e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 
353e1051a39Sopenharmony_ci shrp X[12]=X[12],X[11],32 };; 354e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 355e1051a39Sopenharmony_ci shrp X[11]=X[11],X[10],32 356e1051a39Sopenharmony_ci shrp X[10]=X[10],X[ 9],32 } 357e1051a39Sopenharmony_ci{ .mii; shrp X[ 9]=X[ 9],X[ 8],32 358e1051a39Sopenharmony_ci shrp X[ 8]=X[ 8],X[ 7],32 };; 359e1051a39Sopenharmony_ci{ .mii; shrp X[ 7]=X[ 7],X[ 6],32 360e1051a39Sopenharmony_ci shrp X[ 6]=X[ 6],X[ 5],32 } 361e1051a39Sopenharmony_ci{ .mii; shrp X[ 5]=X[ 5],X[ 4],32 362e1051a39Sopenharmony_ci shrp X[ 4]=X[ 4],X[ 3],32 } 363e1051a39Sopenharmony_ci{ .mii; shrp X[ 3]=X[ 3],X[ 2],32 364e1051a39Sopenharmony_ci shrp X[ 2]=X[ 2],X[ 1],32 } 365e1051a39Sopenharmony_ci{ .mii; shrp X[ 1]=X[ 1],X[ 0],32 366e1051a39Sopenharmony_ci shrp X[ 0]=X[ 0],T1,32 } 367e1051a39Sopenharmony_ci{ .mib; mov r8=0 368e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 369e1051a39Sopenharmony_ci br.many .L_first16 };; 370e1051a39Sopenharmony_ci.L5byte: 371e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 5]=[r9],4*$SZ 372e1051a39Sopenharmony_ci $LDW X[ 4]=[r10],4*$SZ 373e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],24 };; 374e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 3]=[input],4*$SZ 375e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 376e1051a39Sopenharmony_ci shrp X[14]=X[14],X[13],24 } 377e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 378e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 379e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],24 };; 380e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 381e1051a39Sopenharmony_ci shrp X[12]=X[12],X[11],24 382e1051a39Sopenharmony_ci shrp X[11]=X[11],X[10],24 } 383e1051a39Sopenharmony_ci{ .mii; shrp X[10]=X[10],X[ 9],24 384e1051a39Sopenharmony_ci shrp X[ 9]=X[ 9],X[ 8],24 };; 385e1051a39Sopenharmony_ci{ .mii; shrp X[ 8]=X[ 8],X[ 7],24 386e1051a39Sopenharmony_ci shrp X[ 7]=X[ 7],X[ 6],24 } 387e1051a39Sopenharmony_ci{ .mii; shrp X[ 6]=X[ 6],X[ 5],24 388e1051a39Sopenharmony_ci shrp X[ 5]=X[ 5],X[ 4],24 } 
389e1051a39Sopenharmony_ci{ .mii; shrp X[ 4]=X[ 4],X[ 3],24 390e1051a39Sopenharmony_ci shrp X[ 3]=X[ 3],X[ 2],24 } 391e1051a39Sopenharmony_ci{ .mii; shrp X[ 2]=X[ 2],X[ 1],24 392e1051a39Sopenharmony_ci shrp X[ 1]=X[ 1],X[ 0],24 } 393e1051a39Sopenharmony_ci{ .mib; shrp X[ 0]=X[ 0],T1,24 } 394e1051a39Sopenharmony_ci{ .mib; mov r8=0 395e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 396e1051a39Sopenharmony_ci br.many .L_first16 };; 397e1051a39Sopenharmony_ci.L6byte: 398e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 3]=[input],4*$SZ 399e1051a39Sopenharmony_ci $LDW X[ 2]=[r8],4*$SZ 400e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],16 } 401e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 402e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 403e1051a39Sopenharmony_ci shrp X[14]=X[14],X[13],16 };; 404e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 405e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],16 406e1051a39Sopenharmony_ci shrp X[12]=X[12],X[11],16 } 407e1051a39Sopenharmony_ci{ .mii; shrp X[11]=X[11],X[10],16 408e1051a39Sopenharmony_ci shrp X[10]=X[10],X[ 9],16 };; 409e1051a39Sopenharmony_ci{ .mii; shrp X[ 9]=X[ 9],X[ 8],16 410e1051a39Sopenharmony_ci shrp X[ 8]=X[ 8],X[ 7],16 } 411e1051a39Sopenharmony_ci{ .mii; shrp X[ 7]=X[ 7],X[ 6],16 412e1051a39Sopenharmony_ci shrp X[ 6]=X[ 6],X[ 5],16 } 413e1051a39Sopenharmony_ci{ .mii; shrp X[ 5]=X[ 5],X[ 4],16 414e1051a39Sopenharmony_ci shrp X[ 4]=X[ 4],X[ 3],16 } 415e1051a39Sopenharmony_ci{ .mii; shrp X[ 3]=X[ 3],X[ 2],16 416e1051a39Sopenharmony_ci shrp X[ 2]=X[ 2],X[ 1],16 } 417e1051a39Sopenharmony_ci{ .mii; shrp X[ 1]=X[ 1],X[ 0],16 418e1051a39Sopenharmony_ci shrp X[ 0]=X[ 0],T1,16 } 419e1051a39Sopenharmony_ci{ .mib; mov r8=0 420e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev // eliminated on big-endian 421e1051a39Sopenharmony_ci br.many .L_first16 };; 422e1051a39Sopenharmony_ci.L7byte: 423e1051a39Sopenharmony_ci{ .mmi; $LDW X[ 1]=[r9],4*$SZ 424e1051a39Sopenharmony_ci $LDW X[ 0]=[r10],4*$SZ 
425e1051a39Sopenharmony_ci shrp X[15]=X[15],X[14],8 };; 426e1051a39Sopenharmony_ci{ .mii; $LDW T1=[input] 427e1051a39Sopenharmony_ci shrp X[14]=X[14],X[13],8 428e1051a39Sopenharmony_ci shrp X[13]=X[13],X[12],8 } 429e1051a39Sopenharmony_ci{ .mii; shrp X[12]=X[12],X[11],8 430e1051a39Sopenharmony_ci shrp X[11]=X[11],X[10],8 };; 431e1051a39Sopenharmony_ci{ .mii; shrp X[10]=X[10],X[ 9],8 432e1051a39Sopenharmony_ci shrp X[ 9]=X[ 9],X[ 8],8 } 433e1051a39Sopenharmony_ci{ .mii; shrp X[ 8]=X[ 8],X[ 7],8 434e1051a39Sopenharmony_ci shrp X[ 7]=X[ 7],X[ 6],8 } 435e1051a39Sopenharmony_ci{ .mii; shrp X[ 6]=X[ 6],X[ 5],8 436e1051a39Sopenharmony_ci shrp X[ 5]=X[ 5],X[ 4],8 } 437e1051a39Sopenharmony_ci{ .mii; shrp X[ 4]=X[ 4],X[ 3],8 438e1051a39Sopenharmony_ci shrp X[ 3]=X[ 3],X[ 2],8 } 439e1051a39Sopenharmony_ci{ .mii; shrp X[ 2]=X[ 2],X[ 1],8 440e1051a39Sopenharmony_ci shrp X[ 1]=X[ 1],X[ 0],8 } 441e1051a39Sopenharmony_ci{ .mib; shrp X[ 0]=X[ 0],T1,8 } 442e1051a39Sopenharmony_ci{ .mib; mov r8=0 443e1051a39Sopenharmony_ci mux1 X[15]=X[15],\@rev };; // eliminated on big-endian 444e1051a39Sopenharmony_ci 445e1051a39Sopenharmony_ci.align 32 446e1051a39Sopenharmony_ci.L_first16: 447e1051a39Sopenharmony_ci{ .mmi; $LDW K=[Ktbl],$SZ 448e1051a39Sopenharmony_ci add A=A,r8 // H+=Sigma(0) from the past 449e1051a39Sopenharmony_ci _rotr r10=$t1,$Sigma1[0] } // ROTR(e,14) 450e1051a39Sopenharmony_ci{ .mmi; and T1=F,E 451e1051a39Sopenharmony_ci andcm r8=G,E 452e1051a39Sopenharmony_ci (p16) mux1 X[14]=X[14],\@rev };; // eliminated on big-endian 453e1051a39Sopenharmony_ci{ .mmi; and T2=A,B 454e1051a39Sopenharmony_ci and r9=A,C 455e1051a39Sopenharmony_ci _rotr r11=$t1,$Sigma1[1] } // ROTR(e,41) 456e1051a39Sopenharmony_ci{ .mmi; xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) 457e1051a39Sopenharmony_ci and r8=B,C };; 458e1051a39Sopenharmony_ci___ 459e1051a39Sopenharmony_ci$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); 460e1051a39Sopenharmony_ci.align 32 461e1051a39Sopenharmony_ci.L_first16: 
462e1051a39Sopenharmony_ci{ .mmi; add A=A,r8 // H+=Sigma(0) from the past 463e1051a39Sopenharmony_ci add r10=2-$SZ,input 464e1051a39Sopenharmony_ci add r11=3-$SZ,input };; 465e1051a39Sopenharmony_ci{ .mmi; ld1 r9=[r9] 466e1051a39Sopenharmony_ci ld1 r10=[r10] 467e1051a39Sopenharmony_ci dep.z $t1=E,32,32 } 468e1051a39Sopenharmony_ci{ .mmi; ld1 r11=[r11] 469e1051a39Sopenharmony_ci $LDW K=[Ktbl],$SZ 470e1051a39Sopenharmony_ci zxt4 E=E };; 471e1051a39Sopenharmony_ci{ .mii; or $t1=$t1,E 472e1051a39Sopenharmony_ci dep X[15]=X[15],r9,8,8 473e1051a39Sopenharmony_ci mux2 $t0=A,0x44 };; // copy lower half to upper 474e1051a39Sopenharmony_ci{ .mmi; and T1=F,E 475e1051a39Sopenharmony_ci andcm r8=G,E 476e1051a39Sopenharmony_ci dep r11=r10,r11,8,8 };; 477e1051a39Sopenharmony_ci{ .mmi; and T2=A,B 478e1051a39Sopenharmony_ci and r9=A,C 479e1051a39Sopenharmony_ci dep X[15]=X[15],r11,16,16 };; 480e1051a39Sopenharmony_ci{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch 481e1051a39Sopenharmony_ci xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) 482e1051a39Sopenharmony_ci _rotr r10=$t1,$Sigma1[0] } // ROTR(e,14) 483e1051a39Sopenharmony_ci{ .mmi; and r8=B,C 484e1051a39Sopenharmony_ci _rotr r11=$t1,$Sigma1[1] };; // ROTR(e,18) 485e1051a39Sopenharmony_ci___ 486e1051a39Sopenharmony_ci$code.=<<___; 487e1051a39Sopenharmony_ci{ .mmi; add T1=T1,H // T1=Ch(e,f,g)+h 488e1051a39Sopenharmony_ci xor r10=r10,r11 489e1051a39Sopenharmony_ci _rotr r11=$t1,$Sigma1[2] } // ROTR(e,41) 490e1051a39Sopenharmony_ci{ .mmi; xor T2=T2,r9 491e1051a39Sopenharmony_ci add K=K,X[15] };; 492e1051a39Sopenharmony_ci{ .mmi; add T1=T1,K // T1+=K[i]+X[i] 493e1051a39Sopenharmony_ci xor T2=T2,r8 // T2=((a & b) ^ (a & c) ^ (b & c)) 494e1051a39Sopenharmony_ci _rotr r8=$t0,$Sigma0[0] } // ROTR(a,28) 495e1051a39Sopenharmony_ci{ .mmi; xor r11=r11,r10 // Sigma1(e) 496e1051a39Sopenharmony_ci _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34) 497e1051a39Sopenharmony_ci{ .mmi; add T1=T1,r11 // T+=Sigma1(e) 498e1051a39Sopenharmony_ci xor r8=r8,r9 
499e1051a39Sopenharmony_ci _rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39) 500e1051a39Sopenharmony_ci{ .mmi; xor r8=r8,r9 // Sigma0(a) 501e1051a39Sopenharmony_ci add D=D,T1 502e1051a39Sopenharmony_ci mux2 H=X[15],0x44 } // mov H=X[15] in sha512 503e1051a39Sopenharmony_ci{ .mib; (p16) add r9=1-$SZ,input // not used in sha512 504e1051a39Sopenharmony_ci add X[15]=T1,T2 // H=T1+Maj(a,b,c) 505e1051a39Sopenharmony_ci br.ctop.sptk .L_first16 };; 506e1051a39Sopenharmony_ci.L_first16_end: 507e1051a39Sopenharmony_ci 508e1051a39Sopenharmony_ci{ .mib; mov ar.lc=$rounds-17 509e1051a39Sopenharmony_ci brp.loop.imp .L_rest,.L_rest_end-16 } 510e1051a39Sopenharmony_ci{ .mib; mov ar.ec=1 511e1051a39Sopenharmony_ci br.many .L_rest };; 512e1051a39Sopenharmony_ci 513e1051a39Sopenharmony_ci.align 32 514e1051a39Sopenharmony_ci.L_rest: 515e1051a39Sopenharmony_ci{ .mmi; $LDW K=[Ktbl],$SZ 516e1051a39Sopenharmony_ci add A=A,r8 // H+=Sigma0(a) from the past 517e1051a39Sopenharmony_ci _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1) 518e1051a39Sopenharmony_ci{ .mmi; add X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF] 519e1051a39Sopenharmony_ci $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7 520e1051a39Sopenharmony_ci{ .mib; and T1=F,E 521e1051a39Sopenharmony_ci _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8) 522e1051a39Sopenharmony_ci{ .mib; andcm r10=G,E 523e1051a39Sopenharmony_ci $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6 524e1051a39Sopenharmony_ci// Pair of mmi; splits on Itanium 1 and prevents pipeline flush 525e1051a39Sopenharmony_ci// upon $SHRU output usage 526e1051a39Sopenharmony_ci{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g)) 527e1051a39Sopenharmony_ci xor r9=r8,r9 528e1051a39Sopenharmony_ci _rotr r10=X[15-14],$sigma1[0] }// ROTR(s1,19) 529e1051a39Sopenharmony_ci{ .mmi; and T2=A,B 530e1051a39Sopenharmony_ci and r8=A,C 531e1051a39Sopenharmony_ci _rotr r11=X[15-14],$sigma1[1] };;// ROTR(s1,61) 532e1051a39Sopenharmony_ci___ 533e1051a39Sopenharmony_ci$t0="t0", $t1="t1", $code.=<<___ if 
($BITS==32);
{ .mib;	xor	s0=s0,r9		// s0=sigma0(X[(i+1)&0xF])
	dep.z	$t1=E,32,32	}
{ .mib;	xor	r10=r11,r10
	zxt4	E=E		};;
{ .mii;	xor	s1=s1,r10		// s1=sigma1(X[(i+14)&0xF])
	shrp	r9=E,$t1,32+$Sigma1[0]	// ROTR(e,14)
	mux2	$t0=A,0x44	};;	// copy lower half to upper
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon mux2 output usage
{ .mmi;	xor	T2=T2,r8
	shrp	r8=E,$t1,32+$Sigma1[1]}	// ROTR(e,18)
{ .mmi;	and	r10=B,C
	add	T1=T1,H			// T1=Ch(e,f,g)+h
	or	$t1=$t1,E	};;
___
# 64-bit flavour of the same round fragment.  Here $t0 and $t1 are set to
# the register names A and E, and the rotations are written with the _rotr
# pseudo-op, which the post-processing loop further down rewrites into
# "shrp dst=src,src,n".
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib;	xor	s0=s0,r9		// s0=sigma0(X[(i+1)&0xF])
	_rotr	r9=$t1,$Sigma1[0]	}	// ROTR(e,14)
{ .mib;	xor	r10=r11,r10
	xor	T2=T2,r8	};;
{ .mib;	xor	s1=s1,r10		// s1=sigma1(X[(i+14)&0xF])
	_rotr	r8=$t1,$Sigma1[1]	}	// ROTR(e,18)
{ .mib;	and	r10=B,C
	add	T1=T1,H		};;	// T1+=H
___
# Appended unconditionally, i.e. for both values of $BITS:
#   - the rest of the loop that branches back to .L_rest (br.ctop),
#   - after .L_rest_end: add the working registers A..H into A_..H_, rewind
#     Ktbl by $SZ*$rounds, and branch to .L_outer while p16 is set
#     (cmp.ltu p16,p0=1,num), decrementing num,
#   - finally store A_..H_ through ctx with $STW, restore ar.lc and pr from
#     lcsave/prsave, and return.
# NOTE(review): the emitted comment on the "add X[15]=X[15],s1" line says
# sigma0 although s1 is labelled sigma1 a few lines earlier; it is left
# untouched so that the generated output stays byte-for-byte the same.
$code.=<<___;
{ .mib;	xor	r9=r9,r8
	_rotr	r8=$t1,$Sigma1[2]	}	// ROTR(e,41)
{ .mib;	xor	T2=T2,r10		// T2=((a & b) ^ (a & c) ^ (b & c))
	add	X[15]=X[15],s0	};;	// X[i]+=sigma0(X[i+1])
{ .mmi;	xor	r9=r9,r8		// Sigma1(e)
	add	X[15]=X[15],s1		// X[i]+=sigma0(X[i+14])
	_rotr	r8=$t0,$Sigma0[0]	};;	// ROTR(a,28)
{ .mmi;	add	K=K,X[15]
	add	T1=T1,r9		// T1+=Sigma1(e)
	_rotr	r9=$t0,$Sigma0[1]	};;	// ROTR(a,34)
{ .mmi;	add	T1=T1,K			// T1+=K[i]+X[i]
	xor	r8=r8,r9
	_rotr	r9=$t0,$Sigma0[2]	};;	// ROTR(a,39)
{ .mib;	add	D=D,T1
	mux2	H=X[15],0x44	}	// mov H=X[15] in sha512
{ .mib;	xor	r8=r8,r9		// Sigma0(a)
	add	X[15]=T1,T2		// H=T1+Maj(a,b,c)
	br.ctop.sptk	.L_rest	};;
.L_rest_end:

{ .mmi;	add	A=A,r8		};;	// H+=Sigma0(a) from the past
{ .mmi;	add	A_=A_,A
	add	B_=B_,B
	add	C_=C_,C		}
{ .mmi;	add	D_=D_,D
	add	E_=E_,E
	cmp.ltu	p16,p0=1,num	};;
{ .mmi;	add	F_=F_,F
	add	G_=G_,G
	add	H_=H_,H		}
{ .mmb;	add	Ktbl=-$SZ*$rounds,Ktbl
(p16)	add	num=-1,num
(p16)	br.dptk.many	.L_outer	};;

{ .mib;	add	r8=0*$SZ,ctx
	add	r9=1*$SZ,ctx	}
{ .mib;	add	r10=2*$SZ,ctx
	add	r11=3*$SZ,ctx	};;
{ .mmi;	$STW	[r8]=A_,4*$SZ
	$STW	[r9]=B_,4*$SZ
	mov	ar.lc=lcsave	}
{ .mmi;	$STW	[r10]=C_,4*$SZ
	$STW	[r11]=D_,4*$SZ
	mov	pr=prsave,0x1ffff	};;
{ .mmb;	$STW	[r8]=E_
	$STW	[r9]=F_		}
{ .mmb;	$STW	[r10]=G_
	$STW	[r11]=H_
	br.ret.sptk.many	b0	};;
.endp	$func#
___

# Post-process the accumulated template line by line and print the result.
# Every line is rewritten independently, so none of the patterns needs to
# look across a newline.
foreach my $line (split($/,$code)) {
    # `expr` spans are replaced by the value of the Perl expression inside.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # Pseudo-op: "_rotr dst=src,n" becomes "shrp dst=src,src,n".
    $line =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/g;

    if ($BITS==64) {
        # "mux2 dst=src,imm" degenerates into a plain "mov dst=src".
        $line =~ s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/g;

        # On big-endian targets every mux1 is replaced by an I-slot nop.
        $line =~ s/mux1(\s+)\S+/nop.i$1 0x0/g if ($big_endian);

        # On little-endian targets, "shrp X[..]=a,b,n" gets its two source
        # operands swapped and its count replaced by "64-n".  The count is
        # captured as a whole number ([1-9][0-9]*); the former [1-9]+ stopped
        # at the first '0' (e.g. captured only "4" of "40") and produced the
        # right text merely because the leftover digits followed the match.
        $line =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9][0-9]*)/$1=$3,$2,64-$4/g
            if (!$big_endian);

        # "ld1 X[..." is replaced by an M-slot nop.
        $line =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/g;
    }

    print $line,"\n";
}

# Constant table emitted only when $BITS==32: K256, 64 data4 words, followed
# by its .size directive and an identification string.
print<<___ if ($BITS==32);
.align	64
.type	K256#,\@object
K256:	data4	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	data4	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	data4	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	data4	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	data4	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	data4	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	data4	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	data4	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	data4	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	data4	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	data4	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	data4	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	data4	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	data4	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	data4	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	data4	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256#,$SZ*$rounds
stringz	"SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
# Constant table emitted only when $BITS==64: K512, 80 data8 words, followed
# by its .size directive and an identification string.
print<<___ if ($BITS==64);
.align	64
.type	K512#,\@object
K512:	data8	0x428a2f98d728ae22,0x7137449123ef65cd
	data8	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	data8	0x3956c25bf348b538,0x59f111f1b605d019
	data8	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	data8	0xd807aa98a3030242,0x12835b0145706fbe
	data8	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	data8	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	data8	0x9bdc06a725c71235,0xc19bf174cf692694
	data8	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	data8	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	data8	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	data8	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	data8	0x983e5152ee66dfab,0xa831c66d2db43210
	data8	0xb00327c898fb213f,0xbf597fc7beef0ee4
	data8	0xc6e00bf33da88fc2,0xd5a79147930aa725
	data8	0x06ca6351e003826f,0x142929670a0e6e70
	data8	0x27b70a8546d22ffc,0x2e1b21385c26c926
	data8	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	data8	0x650a73548baf63de,0x766a0abb3c77b2a8
	data8	0x81c2c92e47edaee6,0x92722c851482353b
	data8	0xa2bfe8a14cf10364,0xa81a664bbc423001
	data8	0xc24b8b70d0f89791,0xc76c51a30654be30
	data8	0xd192e819d6ef5218,0xd69906245565a910
	data8	0xf40e35855771202a,0x106aa07032bbd1b8
	data8	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	data8	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	data8	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	data8	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	data8	0x748f82ee5defb2fc,0x78a5636f43172f60
	data8	0x84c87814a1f0ab72,0x8cc702081a6439ec
	data8	0x90befffa23631e28,0xa4506cebde82bde9
	data8	0xbef9a3f7b2c67915,0xc67178f2e372532b
	data8	0xca273eceea26619c,0xd186b8c721c0c207
	data8	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	data8	0x06f067aa72176fba,0x0a637dc5a2c898a6
	data8	0x113f9804bef90dae,0x1b710b35131c471b
	data8	0x28db77f523047d84,0x32caab7b40c72493
	data8	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	data8	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	data8	0x5fcb6fab3ad6faec,0x6c44198c4a475817
.size	K512#,$SZ*$rounds
stringz	"SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___