1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci
17e1051a39Sopenharmony_ci# SHA1 block procedure for s390x.
18e1051a39Sopenharmony_ci
19e1051a39Sopenharmony_ci# April 2007.
20e1051a39Sopenharmony_ci#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized, in
# which case performance can improve by a further >4.5x for larger chunks.
24e1051a39Sopenharmony_ci
25e1051a39Sopenharmony_ci# January 2009.
26e1051a39Sopenharmony_ci#
27e1051a39Sopenharmony_ci# Optimize Xupdate for amount of memory references and reschedule
28e1051a39Sopenharmony_ci# instructions to favour dual-issue z10 pipeline. On z10 hardware is
29e1051a39Sopenharmony_ci# "only" ~2.3x faster than software.
30e1051a39Sopenharmony_ci
31e1051a39Sopenharmony_ci# November 2010.
32e1051a39Sopenharmony_ci#
# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in a 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 23% better than code generated by gcc 4.3.
40e1051a39Sopenharmony_ci
$kimdfunc=1;	# magic function code for kimd instruction; a false value
		# makes the generator leave out the hardware (kimd) path

# Command line: [flavour] [output-file]
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# A flavour containing "31" or "32" selects 4-byte pointers and the plain
# (non-"g") load/store mnemonics; anything else, including no flavour at
# all, selects 8-byte pointers and the "g" mnemonic suffix.
if (defined($flavour) && $flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Redirect STDOUT to the requested file.  Three-argument open keeps the
# file name from being interpreted as part of the mode, and a failure is
# fatal instead of silently leaving the output on the original STDOUT.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
57e1051a39Sopenharmony_ci
# Register allocation for the generated code.

# Round-constant registers; $K names the one the round bodies currently add.
($K_00_39,$K_40_79) = ("%r0","%r1");
$K = $K_00_39;

# Arguments of sha1_block_data_order.  $prefetch deliberately names the
# same register as $ctx, so $ctx has to be reloaded once the rounds are done.
($ctx,$inp,$len) = map { "%r$_" } (2..4);
$prefetch = "%r2";

# The five working variables, also kept as a list that is rotated per round.
($A,$B,$C,$D,$E) = map { "%r$_" } (5..9);
@V = ($A,$B,$C,$D,$E);

# Scratch registers, the three-register message-schedule window, and the
# stack pointer.
($t0,$t1) = ("%r10","%r11");
@X = map { "%r$_" } (12..14);
$sp = "%r15";

# $stdframe is the offset of the message-schedule buffer within the frame
# (presumably the size of the ABI-defined standard frame -- confirm);
# $frame adds room for sixteen 4-byte schedule words on top of it.
$stdframe = 16*$SIZE_T + 4*8;
$frame = $stdframe + 16*4;
# Xupdate($round) - append the message-schedule code for one round to $code.
#
# Work is only emitted on even rounds, each handling one 64-bit quantity:
#   round 15     : warm-up for round 16 - loads $prefetch from the first
#                  schedule slot and copies X[2] into X[0]; nothing else
#                  is emitted and @X is not rotated (15 is odd);
#   even, < 16   : loads 8 bytes of input at offset 4*round into X[0] and
#                  puts the 32-bit-rotated copy into X[1];
#   even, >= 16  : XORs X[0] with $prefetch and two schedule slots, rotates
#                  each 32-bit half left by one and feeds X[1] back into X[2];
#   even, <= 70  : stores X[0] into schedule slot round%16.
# After an even round the register window @X is rotated so that its last
# element becomes the first.
sub Xupdate {
    my ($round) = @_;

    if ($round == 15) {
	$code .= <<___;
	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
	lr	$X[0],$X[2]
___
    }

    # Xupdate is vectorized and executed every 2nd cycle
    return if ($round & 1);

    if ($round < 16) {
	$code .= <<___;
	lg	$X[0],`$round*4`($inp)	### Xload($round)
	rllg	$X[1],$X[0],32
___
    }
    else {
	$code .= <<___;
	xgr	$X[0],$prefetch		### Xupdate($round)
	lg	$prefetch,`$stdframe+4*(($round+2)%16)`($sp)
	xg	$X[0],`$stdframe+4*(($round+8)%16)`($sp)
	xgr	$X[0],$prefetch
	rll	$X[0],$X[0],1
	rllg	$X[1],$X[0],32
	rll	$X[1],$X[1],1
	rllg	$X[0],$X[1],32
	lr	$X[2],$X[1]		# feedback
___
    }

    if ($round <= 70) {
	$code .= <<___;
	stg	$X[0],`$stdframe+4*($round%16)`($sp)
___
    }

    unshift(@X,pop(@X));
}
105e1051a39Sopenharmony_ci
# BODY_00_19($round, $a, $b, $c, $d, $e) - append one round of the first
# group (used for rounds 0..19) to $code.  The five register arguments are
# the working variables in their current rotation.
#
# Emitted computation:
#   e += K + rol(a,5) + X + (((d ^ c) & b) ^ d);   b = rol(b,30)
sub BODY_00_19 {
    my ($round,$va,$vb,$vc,$vd,$ve) = @_;
    # Pick the schedule register before Xupdate() gets a chance to rotate @X.
    my $word = $X[1];

    Xupdate($round);
    $code .= <<___;
	alr	$ve,$K		### $round
	rll	$t1,$va,5
	lr	$t0,$vd
	xr	$t0,$vc
	alr	$ve,$t1
	nr	$t0,$vb
	alr	$ve,$word
	xr	$t0,$vd
	rll	$vb,$vb,30
	alr	$ve,$t0
___
}
124e1051a39Sopenharmony_ci
# BODY_20_39($round, $a, $b, $c, $d, $e) - append one parity round to $code.
# The driver uses it for rounds 20..39 and again for rounds 60..79.
#
# Emitted computation:
#   e += K + rol(a,5) + X + (b ^ c ^ d);   b = rol(b,30)
sub BODY_20_39 {
    my ($round,$va,$vb,$vc,$vd,$ve) = @_;
    # Pick the schedule register before Xupdate() gets a chance to rotate @X.
    my $word = $X[1];

    Xupdate($round);
    $code .= <<___;
	alr	$ve,$K		### $round
	rll	$t1,$va,5
	lr	$t0,$vb
	alr	$ve,$t1
	xr	$t0,$vc
	alr	$ve,$word
	xr	$t0,$vd
	rll	$vb,$vb,30
	alr	$ve,$t0
___
}
142e1051a39Sopenharmony_ci
# BODY_40_59($round, $a, $b, $c, $d, $e) - append one majority round (used
# for rounds 40..59) to $code.
#
# Emitted computation:
#   e += K + rol(a,5) + X + (((b | c) & d) | (b & c));   b = rol(b,30)
# $t1 first holds rol(a,5) and is then reused for the (b & c) term.
sub BODY_40_59 {
    my ($round,$va,$vb,$vc,$vd,$ve) = @_;
    # Pick the schedule register before Xupdate() gets a chance to rotate @X.
    my $word = $X[1];

    Xupdate($round);
    $code .= <<___;
	alr	$ve,$K		### $round
	rll	$t1,$va,5
	lr	$t0,$vb
	alr	$ve,$t1
	or	$t0,$vc
	lr	$t1,$vb
	nr	$t0,$vd
	nr	$t1,$vc
	alr	$ve,$word
	or	$t0,$t1
	rll	$vb,$vb,30
	alr	$ve,$t0
___
}
163e1051a39Sopenharmony_ci
# Emit the whole sha1_block_data_order routine into $code, in order:
# constant table and entry label, optional hardware (kimd) path, software
# prologue, the 80 unrolled rounds, and the per-block epilogue/loop.

# Constant table (the four SHA-1 round constants followed by 48 bytes of
# padding) and the function's entry label.
$code.=<<___;
#include "s390x_arch.h"

.text
.align	64
.type	Ktable,\@object
Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
.size	Ktable,.-Ktable
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function
sha1_block_data_order:
___
# Hardware path, emitted only when $kimdfunc is true: test the capability
# bit for function code $kimdfunc in OPENSSL_s390xcap_P and fall through to
# .Lsoftware if it is clear.  Otherwise set up %r0 (function code), %r1
# (ctx), %r2 (input) and %r3 (len shifted left by 6, i.e. len*64), issue
# kimd, repeat it while it reports partial completion, and return.
$code.=<<___ if ($kimdfunc);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,S390X_KIMD(%r1)	# check kimd capabilities
	tmhh	%r0,`0x8000>>$kimdfunc`
	jz	.Lsoftware
	lghi	%r0,$kimdfunc
	lgr	%r1,$ctx
	lgr	%r2,$inp
	sllg	%r3,$len,6
	.long	0xb93e0002	# kimd %r0,%r2
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	16
.Lsoftware:
___
# Software prologue: save $ctx and %r6-%r15 in the caller's frame, allocate
# $frame bytes and store the old stack pointer at the new frame's base, load
# the five state words from the context and load both 64-bit constant pairs
# from Ktable.  Each pass through .Lloop starts by rotating $K_00_39 by 32
# bits, which swaps the two constants held in that register.
$code.=<<___;
	lghi	%r1,-$frame
	st${g}	$ctx,`2*$SIZE_T`($sp)
	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	larl	$t0,Ktable
	llgf	$A,0($ctx)
	llgf	$B,4($ctx)
	llgf	$C,8($ctx)
	llgf	$D,12($ctx)
	llgf	$E,16($ctx)

	lg	$K_00_39,0($t0)
	lg	$K_40_79,8($t0)

.Lloop:
	rllg	$K_00_39,$K_00_39,32
___
# Rounds 0..19.  After every round the working-variable list is rotated so
# the next round sees the registers in their new roles.
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
# Swap the halves of $K_00_39 again for rounds 20..39.
$code.=<<___;
	rllg	$K_00_39,$K_00_39,32
___
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# From here on the round bodies add $K_40_79 instead of $K_00_39; its
# halves are swapped before rounds 40..59 and again before rounds 60..79.
$code.=<<___;	$K=$K_40_79;
	rllg	$K_40_79,$K_40_79,32
___
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	rllg	$K_40_79,$K_40_79,32
___
# Rounds 60..79 reuse the BODY_20_39 round body.
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Per-block epilogue: reload $ctx from its save slot (its register is shared
# with $prefetch and was overwritten by the schedule code), advance the
# input pointer by 64 bytes, add the working variables to the state words
# in the context and store them back, then decrement $len and loop while
# it is non-zero.  Finally restore %r6-%r15 and return.
$code.=<<___;

	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
	la	$inp,64($inp)
	al	$A,0($ctx)
	al	$B,4($ctx)
	al	$C,8($ctx)
	al	$D,12($ctx)
	al	$E,16($ctx)
	st	$A,0($ctx)
	st	$B,4($ctx)
	st	$C,8($ctx)
	st	$D,12($ctx)
	st	$E,16($ctx)
	brct${g} $len,.Lloop

	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
	br	%r14
.size	sha1_block_data_order,.-sha1_block_data_order
.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
247e1051a39Sopenharmony_ci
# Replace every `...` span in the generated text with the value of the Perl
# expression it contains, so the arithmetic written between backticks
# becomes a plain number in the assembly output.
$code =~ s{`([^`]*)`}{eval $1}ge;

# Write the result out; a failed close (e.g. a buffered write error) is fatal.
print $code;
close(STDOUT) || die "error closing STDOUT: $!";
252