#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA1 block procedure for s390x.

# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized. In
# which case performance can reach further >4.5x for larger chunks.

# January 2009.
#
# Optimize Xupdate for amount of memory references and reschedule
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 23% better than code generated by gcc 4.3.

# This script is a generator: it accumulates s390x assembly text in $code
# and prints it at the very end.  Expressions enclosed in backticks inside
# the templates are evaluated by Perl just before printing.

$kimdfunc=1;	# magic function code for kimd instruction
		# (a false value omits the hardware path from the output)

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# A flavour containing "31" or "32" selects 4-byte pointers and the plain
# (non-"g") load/store mnemonics; anything else, including no flavour at
# all, selects 8-byte pointers and the "g"-suffixed mnemonics.
if (defined $flavour && $flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Redirect STDOUT to the requested file.  Three-argument open keeps the
# file name from being parsed as a mode, and a failure is fatal: otherwise
# the assembly would silently go to the inherited stdout and the script
# would still exit successfully.
if (defined $output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register allocation.  $K names whichever constant register the round
# bodies currently add; it is switched to $K_40_79 before round 40.
$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2";	$prefetch="%r2";	# %r2 doubles as prefetch register;
					# $ctx is saved to the stack on entry
$inp="%r3";
$len="%r4";

$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";	@V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";

# Stack frame: standard frame plus 16*4 bytes addressed by Xupdate as
# $stdframe+4*($i%16), i.e. a 16-entry ring of 32-bit schedule words.
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;

# Xupdate($i) - append the message-schedule code for round $i to $code.
#
# Work is emitted on even rounds only, using 64-bit operations that cover
# two 32-bit schedule words at a time:
#   $i == 15  additionally emits the warm-up load for Xupdate(16);
#   $i <  16  loads 8 bytes of input at offset $i*4 and keeps a copy with
#             the halves swapped in $X[1];
#   $i >= 16  XORs in ring entries ($i+2)%16 and ($i+8)%16 and applies a
#             1-bit rotate to each 32-bit half;
#   $i <= 70  stores the result back into ring slot $i%16.
# On even rounds @X is finally rotated (last element moved to the front),
# so callers must read $X[1] before calling this sub.
sub Xupdate {
my $i=shift;

$code.=<<___ if ($i==15);
	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
	lr	$X[0],$X[2]
___
return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
$code.=<<___ if ($i<16);
	lg	$X[0],`$i*4`($inp)	### Xload($i)
	rllg	$X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
	xgr	$X[0],$prefetch		### Xupdate($i)
	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
	xgr	$X[0],$prefetch
	rll	$X[0],$X[0],1
	rllg	$X[1],$X[0],32
	rll	$X[1],$X[1],1
	rllg	$X[0],$X[1],32
	lr	$X[2],$X[1]		# feedback
___
$code.=<<___ if ($i<=70);
	stg	$X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}

# BODY_00_19($i,$a,$b,$c,$d,$e) - emit round $i using the function
# ((c ^ d) & b) ^ d.  Adds $K, rol(a,5), the schedule word and the
# function result into $e, and rotates $b left by 30.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];	# captured before Xupdate rotates @X

	Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$d
	xr	$t0,$c
	alr	$e,$t1
	nr	$t0,$b
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e) - emit round $i using the function
# b ^ c ^ d.  The driver loops below also use it for rounds 60..79.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];	# captured before Xupdate rotates @X

	Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	xr	$t0,$c
	alr	$e,$xi
	xr	$t0,$d
	rll	$b,$b,30
	alr	$e,$t0
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e) - emit round $i using the function
# ((b | c) & d) | (b & c).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];	# captured before Xupdate rotates @X

	Xupdate($i);
$code.=<<___;
	alr	$e,$K		### $i
	rll	$t1,$a,5
	lr	$t0,$b
	alr	$e,$t1
	or	$t0,$c
	lr	$t1,$b
	nr	$t0,$d
	nr	$t1,$c
	alr	$e,$xi
	or	$t0,$t1
	rll	$b,$b,30
	alr	$e,$t0
___
}

# Constant table and function entry.
$code.=<<___;
#include "s390x_arch.h"

.text
.align	64
.type	Ktable,\@object
Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
.size	Ktable,.-Ktable
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function
sha1_block_data_order:
___
# Hardware path: test the capability bit for function code $kimdfunc and
# fall through to .Lsoftware when it is clear.  Otherwise hand the whole
# input (length in bytes = $len<<6) to kimd, which is emitted as a raw
# opcode word, and retry while it reports partial completion.
$code.=<<___ if ($kimdfunc);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,S390X_KIMD(%r1)	# check kimd capabilities
	tmhh	%r0,`0x8000>>$kimdfunc`
	jz	.Lsoftware
	lghi	%r0,$kimdfunc
	lgr	%r1,$ctx
	lgr	%r2,$inp
	sllg	%r3,$len,6
	.long	0xb93e0002	# kimd %r0,%r2
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	16
.Lsoftware:
___
# Software path prologue: save $ctx and %r6-%r15 in the caller's frame,
# allocate $frame bytes, load the five state words and the round constants.
$code.=<<___;
	lghi	%r1,-$frame
	st${g}	$ctx,`2*$SIZE_T`($sp)
	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	larl	$t0,Ktable
	llgf	$A,0($ctx)
	llgf	$B,4($ctx)
	llgf	$C,8($ctx)
	llgf	$D,12($ctx)
	llgf	$E,16($ctx)

	lg	$K_00_39,0($t0)
	lg	$K_40_79,8($t0)

.Lloop:
	rllg	$K_00_39,$K_00_39,32
___
# 80 rounds in four groups of 20.  Each constant register holds two 32-bit
# constants; the rllg by 32 emitted before each group swaps which one sits
# in the low half used by alr.  @V is rotated after every round so the
# working variables change roles without any register moves.  $i is shared
# by all four loops on purpose: each loop continues where the last ended.
for ($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_00_39,$K_00_39,32
___
for (;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$K=$K_40_79;		# remaining rounds add the second constant register
$code.=<<___;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Per-block epilogue: reload $ctx (its register was reused for prefetch),
# advance the input by 64 bytes, add the working variables into the state,
# and loop until $len blocks have been processed.
$code.=<<___;

	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
	la	$inp,64($inp)
	al	$A,0($ctx)
	al	$B,4($ctx)
	al	$C,8($ctx)
	al	$D,12($ctx)
	al	$E,16($ctx)
	st	$A,0($ctx)
	st	$B,4($ctx)
	st	$C,8($ctx)
	st	$D,12($ctx)
	st	$E,16($ctx)
	brct${g} $len,.Lloop

	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
	br	%r14
.size	sha1_block_data_order,.-sha1_block_data_order
.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `expr` in the generated text with the value of expr.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";