#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for longest keys. Well, on a second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize amount of multiplications at
# the cost of other operations increase, bn_mul_mont aim to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...

# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...

# This script is a code generator: it builds the assembly text for
# bn_mul_mont in $code, post-processes it line by line and prints the
# result to the output file (or to STDOUT when no output file is given).
#
# Usage: s390x-mont.pl [flavour] [output.ext]

use strict;
use warnings;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
my $flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : "";

# A flavour containing "31" or "32" selects 4-byte pointers/BN_ULONG and
# the plain (non-"g") store/load mnemonics; anything else selects 8-byte
# quantities and the "g"-suffixed mnemonics (st${g}, stm${g}, l${g}, ...).
my ($SIZE_T, $g);
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Redirect STDOUT to the requested file.  Three-argument open keeps the
# file name from being interpreted as a mode, and a failure is fatal
# instead of silently leaving the output on the original STDOUT.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Base offset used for every stack reference below.
# NOTE(review): presumably the s390 ABI standard frame (16 GPR slots plus
# 4 8-byte FPR slots) -- confirm against the ABI document.
my $stdframe=16*$SIZE_T+4*8;

# Scratch registers.
my $mn0="%r0";
my $num="%r1";

# int bn_mul_mont(
my $rp="%r2";		# BN_ULONG *rp,
my $ap="%r3";		# const BN_ULONG *ap,
my $bp="%r4";		# const BN_ULONG *bp,
my $np="%r5";		# const BN_ULONG *np,
my $n0="%r6";		# const BN_ULONG *n0,
#$num="160(%r15)"	# int num);

my $bi="%r2";	# zaps rp
my $j="%r7";

my $ahi="%r8";
my $alo="%r9";
my $nhi="%r10";
my $nlo="%r11";
my $AHI="%r12";
my $NHI="%r13";
my $count="%r14";
my $sp="%r15";

# Assembly template.  Text between back-ticks is evaluated as a Perl
# expression and "_dswap" is a pseudo-instruction; both are resolved by
# the post-processing loop at the end of this script.
my $code="";

$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8		# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8		# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry
	lghi	$NHI,-1
	xgr	$NHI,$AHI

	la	$j,0
	lgr	$count,$num
.Lcopy:	lg	$ahi,$stdframe($j,$sp)	# conditional copy
	lg	$alo,0($j,$rp)
	ngr	$ahi,$AHI
	ngr	$alo,$NHI
	ogr	$alo,$ahi
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the template one line at a time and emit it.
foreach my $line (split("\n",$code)) {
	# Replace every `expr` with the value of the Perl expression; by now
	# the variables inside have already been interpolated to numbers.
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	# _dswap expands to a 32-bit rotate of the 64-bit register (swapping
	# its two words) when BN_ULONG is 4 bytes, and to nothing otherwise.
	$line =~ s/_dswap\s+(%r[0-9]+)/$SIZE_T==4 ? "rllg\t$1,$1,32" : ""/e;
	print $line,"\n";
}
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";