1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci
17e1051a39Sopenharmony_ci# December 2005
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20e1051a39Sopenharmony_ci# for undertaking the effort are multiple. First of all, UltraSPARC is not
21e1051a39Sopenharmony_ci# the whole SPARCv9 universe and other VIS-free implementations deserve
22e1051a39Sopenharmony_ci# optimized code as much. Secondly, newly introduced UltraSPARC T1,
23e1051a39Sopenharmony_ci# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24e1051a39Sopenharmony_ci# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25e1051a39Sopenharmony_ci# several integrated RSA/DSA accelerator circuits accessible through
26e1051a39Sopenharmony_ci# kernel driver [only(*)], but having decent user-land software
27e1051a39Sopenharmony_ci# implementation is important too. Finally, there was the desire to
28e1051a39Sopenharmony_ci# experiment with a dedicated squaring procedure. Yes, this module
29e1051a39Sopenharmony_ci# implements one, because it was easiest to draft it in SPARCv9
30e1051a39Sopenharmony_ci# instructions...
31e1051a39Sopenharmony_ci
32e1051a39Sopenharmony_ci# (*)	Engine accessing the driver in question is on my TODO list.
33e1051a39Sopenharmony_ci#	For reference, accelerator is estimated to give 6 to 10 times
34e1051a39Sopenharmony_ci#	improvement on single-threaded RSA sign. It should be noted
35e1051a39Sopenharmony_ci#	that 6-10x improvement coefficient does not actually mean
36e1051a39Sopenharmony_ci#	something extraordinary in terms of absolute [single-threaded]
37e1051a39Sopenharmony_ci#	performance, as SPARCv9 instruction set is by all means least
38e1051a39Sopenharmony_ci#	suitable for high performance crypto among other 64 bit
39e1051a39Sopenharmony_ci#	platforms. 6-10x factor simply places T1 in same performance
40e1051a39Sopenharmony_ci#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
41e1051a39Sopenharmony_ci#	appear impressive at all, but it's the sign operation which is
42e1051a39Sopenharmony_ci#	far more critical/interesting.
43e1051a39Sopenharmony_ci
44e1051a39Sopenharmony_ci# You might notice that inner loops are modulo-scheduled:-) This has
45e1051a39Sopenharmony_ci# essentially negligible impact on UltraSPARC performance, it's
46e1051a39Sopenharmony_ci# Fujitsu SPARC64 V users who should notice and hopefully appreciate
47e1051a39Sopenharmony_ci# the advantage... Currently this module surpasses sparcv9a-mont.pl
48e1051a39Sopenharmony_ci# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
49e1051a39Sopenharmony_ci# module still has hidden potential [see TODO list there], which is
50e1051a39Sopenharmony_ci# estimated to be larger than 20%...
51e1051a39Sopenharmony_ci
# Take the output file name from the last command-line argument.  When one
# is present, STDOUT is reopened onto that file so the final `print $code`
# lands there; with no argument the text goes to the inherited STDOUT.
52e1051a39Sopenharmony_ci$output = pop and open STDOUT,">$output";
53e1051a39Sopenharmony_ci
# Register map for the generated routine.  The six C arguments are named by
# their %i registers, i.e. as they are seen after the `save` at .Lenter.
54e1051a39Sopenharmony_ci# int bn_mul_mont(
55e1051a39Sopenharmony_ci$rp="%i0";	# BN_ULONG *rp,
56e1051a39Sopenharmony_ci$ap="%i1";	# const BN_ULONG *ap,
57e1051a39Sopenharmony_ci$bp="%i2";	# const BN_ULONG *bp,
58e1051a39Sopenharmony_ci$np="%i3";	# const BN_ULONG *np,
59e1051a39Sopenharmony_ci$n0="%i4";	# const BN_ULONG *n0,
60e1051a39Sopenharmony_ci$num="%i5";	# int num);
61e1051a39Sopenharmony_ci
# Frame size and stack bias are emitted as bare identifiers, so they have to
# be resolved when the generated file is preprocessed (presumably by
# crypto/sparc_arch.h, which the generated code includes -- confirm there).
62e1051a39Sopenharmony_ci$frame="STACK_FRAME";
63e1051a39Sopenharmony_ci$bias="STACK_BIAS";
64e1051a39Sopenharmony_ci
# Working registers.  $car0/$car1 are the running carries of the ap[]*b and
# np[]*m product chains, $car2 is the overflow bit kept between passes, and
# $acc0/$acc1 with $tmp0/$tmp1 hold the current and the look-ahead products.
# $mask is loaded with 0xffffffff at function entry and is used to cut 64-bit
# results down to their low 32-bit word.
65e1051a39Sopenharmony_ci$car0="%o0";
66e1051a39Sopenharmony_ci$car1="%o1";
67e1051a39Sopenharmony_ci$car2="%o2";	# 1 bit
68e1051a39Sopenharmony_ci$acc0="%o3";
69e1051a39Sopenharmony_ci$acc1="%o4";
70e1051a39Sopenharmony_ci$mask="%g1";	# 32 bits, what a waste...
71e1051a39Sopenharmony_ci$tmp0="%g4";
72e1051a39Sopenharmony_ci$tmp1="%g5";
73e1051a39Sopenharmony_ci
# $i and $j are byte offsets (word index * 4) used to address ap/bp/np.
# $mul0 is the multiplier word of the current pass, $mul1 the per-pass
# reduction factor ("t[0]"*n0 cut to 32 bits).  $tp walks the temporary
# vector on the stack; $apj/$npj/$tpj hold pre-loaded ap[j]/np[j]/tp[j].
74e1051a39Sopenharmony_ci$i="%l0";
75e1051a39Sopenharmony_ci$j="%l1";
76e1051a39Sopenharmony_ci$mul0="%l2";
77e1051a39Sopenharmony_ci$mul1="%l3";
78e1051a39Sopenharmony_ci$tp="%l4";
79e1051a39Sopenharmony_ci$apj="%l5";
80e1051a39Sopenharmony_ci$npj="%l6";
81e1051a39Sopenharmony_ci$tpj="%l7";
82e1051a39Sopenharmony_ci
# Name of the emitted global symbol.  NOTE(review): it differs from the
# bn_mul_mont name in the prototype comment above; the code that calls it
# is not visible in this file.
83e1051a39Sopenharmony_ci$fname="bn_mul_mont_int";
84e1051a39Sopenharmony_ci
# ----------------------------------------------------------------------
# First half of the generated assembly: entry check, stack setup, the
# general multiplication path and the final reduction (.Ltail) that the
# squaring path also branches back to.
#
# $fname (entry)
#	The length is still in %o5 here (no register window opened yet).
#	If num < 4 the routine returns 0 at once (retl / clr %o0); the sethi
#	in the branch delay slot starts building $mask.
# .Lenter
#	Opens a register window, turns $num into a byte count (num*4, cut to
#	32 bits), replaces the $n0 pointer by the word it points to, and moves
#	%sp down by num*4 bytes, rounded down to a 1024-byte boundary.  The
#	temporary vector tp[] then starts at %sp+$bias+$frame.  bp[0], ap[0],
#	ap[1], np[0] and np[1] are loaded on the way.  If ap == bp, control
#	goes to .Lbn_sqr_mont (emitted by the second heredoc) with $j = 12
#	set in the delay slot.
# .L1st
#	First pass: tp[] = (ap[]*bp[0] + np[]*m) >> 32, where
#	m = low32(low32(ap[0]*bp[0]) * n0).  The loop is software-pipelined:
#	the products for the next word are issued one iteration ahead (see the
#	!prologue! / !epilogue! markers), which is why $j starts at 12.
# .Louter / .Linner
#	One pass per remaining bp[i] ($i starts at byte offset 4, i.e. i = 1):
#	tp[] = (tp[] + ap[]*bp[i] + np[]*m) >> 32, with m recomputed from
#	tp[0] + ap[0]*bp[i].  $car2 carries the bit above the top word from
#	one pass to the next.
# .Ltail / .Lsub / .Lcopy
#	$tp, $np and $rp are advanced to one past their last word and indexed
#	with a negative byte offset in %o7 that counts up to zero.
#	.Lsub stores tp[] - np[] into rp[], chaining the borrow through
#	%icc.c with subccc; the loop counter uses add/brnz rather than
#	addcc/bcc, so the borrow survives from one word to the next.
#	The borrow is then subtracted from $car2.  In .Lcopy, if that left the
#	carry set, tp[] is copied into rp[] (movcs); otherwise rp[] keeps the
#	difference.  tp[] is overwritten with zeros in the same loop.
#	The routine returns 1.
# ----------------------------------------------------------------------
85e1051a39Sopenharmony_ci$code=<<___;
86e1051a39Sopenharmony_ci#ifndef __ASSEMBLER__
87e1051a39Sopenharmony_ci# define __ASSEMBLER__ 1
88e1051a39Sopenharmony_ci#endif
89e1051a39Sopenharmony_ci#include "crypto/sparc_arch.h"
90e1051a39Sopenharmony_ci
91e1051a39Sopenharmony_ci.section	".text",#alloc,#execinstr
92e1051a39Sopenharmony_ci
93e1051a39Sopenharmony_ci.global	$fname
94e1051a39Sopenharmony_ci.align	32
95e1051a39Sopenharmony_ci$fname:
96e1051a39Sopenharmony_ci	cmp	%o5,4			! 128 bits minimum
97e1051a39Sopenharmony_ci	bge,pt	%icc,.Lenter
98e1051a39Sopenharmony_ci	sethi	%hi(0xffffffff),$mask
99e1051a39Sopenharmony_ci	retl
100e1051a39Sopenharmony_ci	clr	%o0
101e1051a39Sopenharmony_ci.align	32
102e1051a39Sopenharmony_ci.Lenter:
103e1051a39Sopenharmony_ci	save	%sp,-$frame,%sp
104e1051a39Sopenharmony_ci	sll	$num,2,$num		! num*=4
105e1051a39Sopenharmony_ci	or	$mask,%lo(0xffffffff),$mask
106e1051a39Sopenharmony_ci	ld	[$n0],$n0
107e1051a39Sopenharmony_ci	cmp	$ap,$bp
108e1051a39Sopenharmony_ci	and	$num,$mask,$num
109e1051a39Sopenharmony_ci	ld	[$bp],$mul0		! bp[0]
110e1051a39Sopenharmony_ci	nop
111e1051a39Sopenharmony_ci
112e1051a39Sopenharmony_ci	add	%sp,$bias,%o7		! real top of stack
113e1051a39Sopenharmony_ci	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
114e1051a39Sopenharmony_ci	sub	%o7,$num,%o7
115e1051a39Sopenharmony_ci	ld	[$ap+4],$apj		! ap[1]
116e1051a39Sopenharmony_ci	and	%o7,-1024,%o7
117e1051a39Sopenharmony_ci	ld	[$np],$car1		! np[0]
118e1051a39Sopenharmony_ci	sub	%o7,$bias,%sp		! alloca
119e1051a39Sopenharmony_ci	ld	[$np+4],$npj		! np[1]
120e1051a39Sopenharmony_ci	be,pt	SIZE_T_CC,.Lbn_sqr_mont
121e1051a39Sopenharmony_ci	mov	12,$j
122e1051a39Sopenharmony_ci
123e1051a39Sopenharmony_ci	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
124e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
125e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
126e1051a39Sopenharmony_ci	add	%sp,$bias+$frame,$tp
127e1051a39Sopenharmony_ci	ld	[$ap+8],$apj		!prologue!
128e1051a39Sopenharmony_ci
129e1051a39Sopenharmony_ci	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
130e1051a39Sopenharmony_ci	and	$mul1,$mask,$mul1
131e1051a39Sopenharmony_ci
132e1051a39Sopenharmony_ci	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
133e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
134e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
135e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
136e1051a39Sopenharmony_ci	ld	[$np+8],$npj		!prologue!
137e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
138e1051a39Sopenharmony_ci	mov	$tmp0,$acc0		!prologue!
139e1051a39Sopenharmony_ci
140e1051a39Sopenharmony_ci.L1st:
141e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0
142e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$tmp1
143e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
144e1051a39Sopenharmony_ci	ld	[$ap+$j],$apj		! ap[j]
145e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
146e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
147e1051a39Sopenharmony_ci	ld	[$np+$j],$npj		! np[j]
148e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
149e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
150e1051a39Sopenharmony_ci	add	$j,4,$j			! j++
151e1051a39Sopenharmony_ci	mov	$tmp0,$acc0
152e1051a39Sopenharmony_ci	st	$car1,[$tp]
153e1051a39Sopenharmony_ci	cmp	$j,$num
154e1051a39Sopenharmony_ci	mov	$tmp1,$acc1
155e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
156e1051a39Sopenharmony_ci	bl	%icc,.L1st
157e1051a39Sopenharmony_ci	add	$tp,4,$tp		! tp++
158e1051a39Sopenharmony_ci!.L1st
159e1051a39Sopenharmony_ci
160e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0	!epilogue!
161e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$tmp1
162e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
163e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
164e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
165e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
166e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
167e1051a39Sopenharmony_ci	st	$car1,[$tp]
168e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
169e1051a39Sopenharmony_ci
170e1051a39Sopenharmony_ci	add	$tmp0,$car0,$car0
171e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
172e1051a39Sopenharmony_ci	add	$tmp1,$car1,$car1
173e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
174e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
175e1051a39Sopenharmony_ci	st	$car1,[$tp+4]
176e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
177e1051a39Sopenharmony_ci
178e1051a39Sopenharmony_ci	add	$car0,$car1,$car1
179e1051a39Sopenharmony_ci	st	$car1,[$tp+8]
180e1051a39Sopenharmony_ci	srlx	$car1,32,$car2
181e1051a39Sopenharmony_ci
182e1051a39Sopenharmony_ci	mov	4,$i			! i++
183e1051a39Sopenharmony_ci	ld	[$bp+4],$mul0		! bp[1]
184e1051a39Sopenharmony_ci.Louter:
185e1051a39Sopenharmony_ci	add	%sp,$bias+$frame,$tp
186e1051a39Sopenharmony_ci	ld	[$ap],$car0		! ap[0]
187e1051a39Sopenharmony_ci	ld	[$ap+4],$apj		! ap[1]
188e1051a39Sopenharmony_ci	ld	[$np],$car1		! np[0]
189e1051a39Sopenharmony_ci	ld	[$np+4],$npj		! np[1]
190e1051a39Sopenharmony_ci	ld	[$tp],$tmp1		! tp[0]
191e1051a39Sopenharmony_ci	ld	[$tp+4],$tpj		! tp[1]
192e1051a39Sopenharmony_ci	mov	12,$j
193e1051a39Sopenharmony_ci
194e1051a39Sopenharmony_ci	mulx	$car0,$mul0,$car0
195e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0	!prologue!
196e1051a39Sopenharmony_ci	add	$tmp1,$car0,$car0
197e1051a39Sopenharmony_ci	ld	[$ap+8],$apj		!prologue!
198e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
199e1051a39Sopenharmony_ci
200e1051a39Sopenharmony_ci	mulx	$n0,$acc0,$mul1
201e1051a39Sopenharmony_ci	and	$mul1,$mask,$mul1
202e1051a39Sopenharmony_ci
203e1051a39Sopenharmony_ci	mulx	$car1,$mul1,$car1
204e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1	!prologue!
205e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
206e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
207e1051a39Sopenharmony_ci	ld	[$np+8],$npj		!prologue!
208e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
209e1051a39Sopenharmony_ci	mov	$tmp0,$acc0		!prologue!
210e1051a39Sopenharmony_ci
211e1051a39Sopenharmony_ci.Linner:
212e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0
213e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$tmp1
214e1051a39Sopenharmony_ci	add	$tpj,$car0,$car0
215e1051a39Sopenharmony_ci	ld	[$ap+$j],$apj		! ap[j]
216e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
217e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
218e1051a39Sopenharmony_ci	ld	[$np+$j],$npj		! np[j]
219e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
220e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj		! tp[j]
221e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
222e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
223e1051a39Sopenharmony_ci	add	$j,4,$j			! j++
224e1051a39Sopenharmony_ci	mov	$tmp0,$acc0
225e1051a39Sopenharmony_ci	st	$car1,[$tp]		! tp[j-1]
226e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
227e1051a39Sopenharmony_ci	mov	$tmp1,$acc1
228e1051a39Sopenharmony_ci	cmp	$j,$num
229e1051a39Sopenharmony_ci	bl	%icc,.Linner
230e1051a39Sopenharmony_ci	add	$tp,4,$tp		! tp++
231e1051a39Sopenharmony_ci!.Linner
232e1051a39Sopenharmony_ci
233e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0	!epilogue!
234e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$tmp1
235e1051a39Sopenharmony_ci	add	$tpj,$car0,$car0
236e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
237e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj		! tp[j]
238e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
239e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
240e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
241e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
242e1051a39Sopenharmony_ci	st	$car1,[$tp]		! tp[j-1]
243e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
244e1051a39Sopenharmony_ci
245e1051a39Sopenharmony_ci	add	$tpj,$car0,$car0
246e1051a39Sopenharmony_ci	add	$tmp0,$car0,$car0
247e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
248e1051a39Sopenharmony_ci	add	$tmp1,$car1,$car1
249e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
250e1051a39Sopenharmony_ci	st	$car1,[$tp+4]		! tp[j-1]
251e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
252e1051a39Sopenharmony_ci	add	$i,4,$i			! i++
253e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
254e1051a39Sopenharmony_ci
255e1051a39Sopenharmony_ci	add	$car0,$car1,$car1
256e1051a39Sopenharmony_ci	cmp	$i,$num
257e1051a39Sopenharmony_ci	add	$car2,$car1,$car1
258e1051a39Sopenharmony_ci	st	$car1,[$tp+8]
259e1051a39Sopenharmony_ci
260e1051a39Sopenharmony_ci	srlx	$car1,32,$car2
261e1051a39Sopenharmony_ci	bl,a	%icc,.Louter
262e1051a39Sopenharmony_ci	ld	[$bp+$i],$mul0		! bp[i]
263e1051a39Sopenharmony_ci!.Louter
264e1051a39Sopenharmony_ci
265e1051a39Sopenharmony_ci	add	$tp,12,$tp
266e1051a39Sopenharmony_ci
267e1051a39Sopenharmony_ci.Ltail:
268e1051a39Sopenharmony_ci	add	$np,$num,$np
269e1051a39Sopenharmony_ci	add	$rp,$num,$rp
270e1051a39Sopenharmony_ci	sub	%g0,$num,%o7		! k=-num
271e1051a39Sopenharmony_ci	ba	.Lsub
272e1051a39Sopenharmony_ci	subcc	%g0,%g0,%g0		! clear %icc.c
273e1051a39Sopenharmony_ci.align	16
274e1051a39Sopenharmony_ci.Lsub:
275e1051a39Sopenharmony_ci	ld	[$tp+%o7],%o0
276e1051a39Sopenharmony_ci	ld	[$np+%o7],%o1
277e1051a39Sopenharmony_ci	subccc	%o0,%o1,%o1		! tp[j]-np[j]
278e1051a39Sopenharmony_ci	add	$rp,%o7,$i
279e1051a39Sopenharmony_ci	add	%o7,4,%o7
280e1051a39Sopenharmony_ci	brnz	%o7,.Lsub
281e1051a39Sopenharmony_ci	st	%o1,[$i]
282e1051a39Sopenharmony_ci	subccc	$car2,0,$car2		! handle upmost overflow bit
283e1051a39Sopenharmony_ci	sub	%g0,$num,%o7
284e1051a39Sopenharmony_ci
285e1051a39Sopenharmony_ci.Lcopy:
286e1051a39Sopenharmony_ci	ld	[$tp+%o7],%o1		! conditional copy
287e1051a39Sopenharmony_ci	ld	[$rp+%o7],%o0
288e1051a39Sopenharmony_ci	st	%g0,[$tp+%o7]		! zap tp
289e1051a39Sopenharmony_ci	movcs	%icc,%o1,%o0
290e1051a39Sopenharmony_ci	st	%o0,[$rp+%o7]
291e1051a39Sopenharmony_ci	add	%o7,4,%o7
292e1051a39Sopenharmony_ci	brnz	%o7,.Lcopy
293e1051a39Sopenharmony_ci	nop
294e1051a39Sopenharmony_ci	mov	1,%i0
295e1051a39Sopenharmony_ci	ret
296e1051a39Sopenharmony_ci	restore
297e1051a39Sopenharmony_ci___
298e1051a39Sopenharmony_ci
299e1051a39Sopenharmony_ci########
300e1051a39Sopenharmony_ci######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
301e1051a39Sopenharmony_ci######## code without following dedicated squaring procedure.
302e1051a39Sopenharmony_ci########
# Extra register used only by the squaring path: it holds the bit that moves
# from one 32-bit word into the next while products are doubled, and in the
# later passes tp[j] is added into it as well.  %o5 is free for this because
# after `save` the length argument is addressed as %i5.
303e1051a39Sopenharmony_ci$sbit="%o5";
304e1051a39Sopenharmony_ci
# ----------------------------------------------------------------------
# Second half of the generated assembly: the dedicated squaring path,
# reached from .Lenter when ap == bp.  On entry the loads done before the
# branch are still live: $mul0 = bp[0] (the same word as ap[0] here),
# $apj = ap[1], $car1 = np[0], $npj = np[1], and $j = 12.
#
# Technique visible in the code: a cross product ap[j]*ap[i] is multiplied
# once and doubled as its 32-bit word is emitted (add $acc0,$acc0,$acc0).
# The upper half of the square ap[i]*ap[i] is kept halved in $car0
# (and ...,1 into $sbit, then srlx ...,1) so the same doubling applies to
# it; $sbit carries the bit that crosses from one doubled word to the next.
# $car0 is doubled back at the end of every pass.
#
# .Lsqr_1st	pass for ap[0]; stores the first tp[].
# .Lsqr_2nd	pass for ap[1]; additionally adds the previous tp[].
# .Lsqr_outer	passes for ap[i], i = 2 .. num-2 ($i is a byte offset):
#   .Lsqr_inner1   words below the diagonal (j < i): only tp[j] and
#		   np[j]*$mul1 are accumulated, no ap[] product is formed;
#   (diagonal)	   adds the low word of ap[i]*ap[i] held in $acc0;
#   .Lsqr_inner2 / .Lsqr_no_inner2
#		   words above the diagonal: doubled ap[j]*ap[i] plus
#		   np[j]*$mul1 plus tp[j].
# .Lsqr_last	final pass, i = num-1: no words above the diagonal.
#
# The path ends with `ba .Ltail`, which is defined in the first heredoc; the
# delay-slot add leaves $tp one past the last tp[] word, as .Ltail expects.
# The trailing directives set the symbol type and size and embed an
# identification string.
# ----------------------------------------------------------------------
305e1051a39Sopenharmony_ci$code.=<<___;
306e1051a39Sopenharmony_ci.align	32
307e1051a39Sopenharmony_ci.Lbn_sqr_mont:
308e1051a39Sopenharmony_ci	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
309e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0		!prologue!
310e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
311e1051a39Sopenharmony_ci	add	%sp,$bias+$frame,$tp
312e1051a39Sopenharmony_ci	ld	[$ap+8],$apj			!prologue!
313e1051a39Sopenharmony_ci
314e1051a39Sopenharmony_ci	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
315e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
316e1051a39Sopenharmony_ci	and	$mul1,$mask,$mul1
317e1051a39Sopenharmony_ci
318e1051a39Sopenharmony_ci	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
319e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1		!prologue!
320e1051a39Sopenharmony_ci	and	$car0,1,$sbit
321e1051a39Sopenharmony_ci	ld	[$np+8],$npj			!prologue!
322e1051a39Sopenharmony_ci	srlx	$car0,1,$car0
323e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
324e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
325e1051a39Sopenharmony_ci	mov	$tmp0,$acc0			!prologue!
326e1051a39Sopenharmony_ci
327e1051a39Sopenharmony_ci.Lsqr_1st:
328e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0
329e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$tmp1
330e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0		! ap[j]*a0+c0
331e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
332e1051a39Sopenharmony_ci	ld	[$ap+$j],$apj			! ap[j]
333e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
334e1051a39Sopenharmony_ci	ld	[$np+$j],$npj			! np[j]
335e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
336e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
337e1051a39Sopenharmony_ci	or	$sbit,$acc0,$acc0
338e1051a39Sopenharmony_ci	mov	$tmp1,$acc1
339e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
340e1051a39Sopenharmony_ci	add	$j,4,$j				! j++
341e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
342e1051a39Sopenharmony_ci	cmp	$j,$num
343e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
344e1051a39Sopenharmony_ci	st	$car1,[$tp]
345e1051a39Sopenharmony_ci	mov	$tmp0,$acc0
346e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
347e1051a39Sopenharmony_ci	bl	%icc,.Lsqr_1st
348e1051a39Sopenharmony_ci	add	$tp,4,$tp			! tp++
349e1051a39Sopenharmony_ci!.Lsqr_1st
350e1051a39Sopenharmony_ci
351e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$tmp0		! epilogue
352e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$tmp1
353e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0		! ap[j]*a0+c0
354e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
355e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
356e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
357e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
358e1051a39Sopenharmony_ci	or	$sbit,$acc0,$acc0
359e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
360e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
361e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
362e1051a39Sopenharmony_ci	st	$car1,[$tp]
363e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
364e1051a39Sopenharmony_ci
365e1051a39Sopenharmony_ci	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
366e1051a39Sopenharmony_ci	add	$tmp1,$car1,$car1
367e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
368e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
369e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
370e1051a39Sopenharmony_ci	or	$sbit,$acc0,$acc0
371e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
372e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
373e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
374e1051a39Sopenharmony_ci	st	$car1,[$tp+4]
375e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
376e1051a39Sopenharmony_ci
377e1051a39Sopenharmony_ci	add	$car0,$car0,$car0
378e1051a39Sopenharmony_ci	or	$sbit,$car0,$car0
379e1051a39Sopenharmony_ci	add	$car0,$car1,$car1
380e1051a39Sopenharmony_ci	st	$car1,[$tp+8]
381e1051a39Sopenharmony_ci	srlx	$car1,32,$car2
382e1051a39Sopenharmony_ci
383e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
384e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
385e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
386e1051a39Sopenharmony_ci	ld	[$ap+4],$mul0			! ap[1]
387e1051a39Sopenharmony_ci	ld	[$ap+8],$apj			! ap[2]
388e1051a39Sopenharmony_ci	ld	[$np],$car1			! np[0]
389e1051a39Sopenharmony_ci	ld	[$np+4],$npj			! np[1]
390e1051a39Sopenharmony_ci	mulx	$n0,$tmp0,$mul1
391e1051a39Sopenharmony_ci
392e1051a39Sopenharmony_ci	mulx	$mul0,$mul0,$car0
393e1051a39Sopenharmony_ci	and	$mul1,$mask,$mul1
394e1051a39Sopenharmony_ci
395e1051a39Sopenharmony_ci	mulx	$car1,$mul1,$car1
396e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
397e1051a39Sopenharmony_ci	add	$tmp0,$car1,$car1
398e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
399e1051a39Sopenharmony_ci	ld	[$np+8],$npj			! np[2]
400e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
401e1051a39Sopenharmony_ci	add	$tmp1,$car1,$car1
402e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
403e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
404e1051a39Sopenharmony_ci	and	$car0,1,$sbit
405e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
406e1051a39Sopenharmony_ci	srlx	$car0,1,$car0
407e1051a39Sopenharmony_ci	mov	12,$j
408e1051a39Sopenharmony_ci	st	$car1,[%sp+$bias+$frame]	! tp[0]=
409e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
410e1051a39Sopenharmony_ci	add	%sp,$bias+$frame+4,$tp
411e1051a39Sopenharmony_ci
412e1051a39Sopenharmony_ci.Lsqr_2nd:
413e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$acc0
414e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
415e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
416e1051a39Sopenharmony_ci	add	$tpj,$sbit,$sbit
417e1051a39Sopenharmony_ci	ld	[$ap+$j],$apj			! ap[j]
418e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
419e1051a39Sopenharmony_ci	ld	[$np+$j],$npj			! np[j]
420e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
421e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
422e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj			! tp[j]
423e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
424e1051a39Sopenharmony_ci	add	$j,4,$j				! j++
425e1051a39Sopenharmony_ci	add	$sbit,$acc0,$acc0
426e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
427e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
428e1051a39Sopenharmony_ci	cmp	$j,$num
429e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
430e1051a39Sopenharmony_ci	st	$car1,[$tp]			! tp[j-1]
431e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
432e1051a39Sopenharmony_ci	bl	%icc,.Lsqr_2nd
433e1051a39Sopenharmony_ci	add	$tp,4,$tp			! tp++
434e1051a39Sopenharmony_ci!.Lsqr_2nd
435e1051a39Sopenharmony_ci
436e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$acc0
437e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
438e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
439e1051a39Sopenharmony_ci	add	$tpj,$sbit,$sbit
440e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
441e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
442e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
443e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
444e1051a39Sopenharmony_ci	add	$sbit,$acc0,$acc0
445e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
446e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
447e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
448e1051a39Sopenharmony_ci	st	$car1,[$tp]			! tp[j-1]
449e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
450e1051a39Sopenharmony_ci
451e1051a39Sopenharmony_ci	add	$car0,$car0,$car0
452e1051a39Sopenharmony_ci	add	$sbit,$car0,$car0
453e1051a39Sopenharmony_ci	add	$car0,$car1,$car1
454e1051a39Sopenharmony_ci	add	$car2,$car1,$car1
455e1051a39Sopenharmony_ci	st	$car1,[$tp+4]
456e1051a39Sopenharmony_ci	srlx	$car1,32,$car2
457e1051a39Sopenharmony_ci
458e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
459e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
460e1051a39Sopenharmony_ci	ld	[$ap+8],$mul0			! ap[2]
461e1051a39Sopenharmony_ci	ld	[$np],$car1			! np[0]
462e1051a39Sopenharmony_ci	ld	[$np+4],$npj			! np[1]
463e1051a39Sopenharmony_ci	mulx	$n0,$tmp1,$mul1
464e1051a39Sopenharmony_ci	and	$mul1,$mask,$mul1
465e1051a39Sopenharmony_ci	mov	8,$i
466e1051a39Sopenharmony_ci
467e1051a39Sopenharmony_ci	mulx	$mul0,$mul0,$car0
468e1051a39Sopenharmony_ci	mulx	$car1,$mul1,$car1
469e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
470e1051a39Sopenharmony_ci	add	$tmp1,$car1,$car1
471e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
472e1051a39Sopenharmony_ci	add	%sp,$bias+$frame,$tp
473e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
474e1051a39Sopenharmony_ci	and	$car0,1,$sbit
475e1051a39Sopenharmony_ci	srlx	$car0,1,$car0
476e1051a39Sopenharmony_ci	mov	4,$j
477e1051a39Sopenharmony_ci
478e1051a39Sopenharmony_ci.Lsqr_outer:
479e1051a39Sopenharmony_ci.Lsqr_inner1:
480e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
481e1051a39Sopenharmony_ci	add	$tpj,$car1,$car1
482e1051a39Sopenharmony_ci	add	$j,4,$j
483e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj
484e1051a39Sopenharmony_ci	cmp	$j,$i
485e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
486e1051a39Sopenharmony_ci	ld	[$np+$j],$npj
487e1051a39Sopenharmony_ci	st	$car1,[$tp]
488e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
489e1051a39Sopenharmony_ci	bl	%icc,.Lsqr_inner1
490e1051a39Sopenharmony_ci	add	$tp,4,$tp
491e1051a39Sopenharmony_ci!.Lsqr_inner1
492e1051a39Sopenharmony_ci
493e1051a39Sopenharmony_ci	add	$j,4,$j
494e1051a39Sopenharmony_ci	ld	[$ap+$j],$apj			! ap[j]
495e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
496e1051a39Sopenharmony_ci	add	$tpj,$car1,$car1
497e1051a39Sopenharmony_ci	ld	[$np+$j],$npj			! np[j]
498e1051a39Sopenharmony_ci	srlx	$car1,32,$tmp0
499e1051a39Sopenharmony_ci	and	$car1,$mask,$car1
500e1051a39Sopenharmony_ci	add	$tmp0,$sbit,$sbit
501e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
502e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj			! tp[j]
503e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
504e1051a39Sopenharmony_ci	st	$car1,[$tp]
505e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
506e1051a39Sopenharmony_ci
507e1051a39Sopenharmony_ci	add	$j,4,$j
508e1051a39Sopenharmony_ci	cmp	$j,$num
509e1051a39Sopenharmony_ci	be,pn	%icc,.Lsqr_no_inner2
510e1051a39Sopenharmony_ci	add	$tp,4,$tp
511e1051a39Sopenharmony_ci
512e1051a39Sopenharmony_ci.Lsqr_inner2:
513e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$acc0
514e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
515e1051a39Sopenharmony_ci	add	$tpj,$sbit,$sbit
516e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
517e1051a39Sopenharmony_ci	ld	[$ap+$j],$apj			! ap[j]
518e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
519e1051a39Sopenharmony_ci	ld	[$np+$j],$npj			! np[j]
520e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
521e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
522e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj			! tp[j]
523e1051a39Sopenharmony_ci	add	$sbit,$acc0,$acc0
524e1051a39Sopenharmony_ci	add	$j,4,$j				! j++
525e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
526e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
527e1051a39Sopenharmony_ci	cmp	$j,$num
528e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
529e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
530e1051a39Sopenharmony_ci	st	$car1,[$tp]			! tp[j-1]
531e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
532e1051a39Sopenharmony_ci	bl	%icc,.Lsqr_inner2
533e1051a39Sopenharmony_ci	add	$tp,4,$tp			! tp++
534e1051a39Sopenharmony_ci
535e1051a39Sopenharmony_ci.Lsqr_no_inner2:
536e1051a39Sopenharmony_ci	mulx	$apj,$mul0,$acc0
537e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
538e1051a39Sopenharmony_ci	add	$tpj,$sbit,$sbit
539e1051a39Sopenharmony_ci	add	$acc0,$car0,$car0
540e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
541e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
542e1051a39Sopenharmony_ci	add	$acc0,$acc0,$acc0
543e1051a39Sopenharmony_ci	add	$sbit,$acc0,$acc0
544e1051a39Sopenharmony_ci	srlx	$acc0,32,$sbit
545e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
546e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
547e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
548e1051a39Sopenharmony_ci	st	$car1,[$tp]			! tp[j-1]
549e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
550e1051a39Sopenharmony_ci
551e1051a39Sopenharmony_ci	add	$car0,$car0,$car0
552e1051a39Sopenharmony_ci	add	$sbit,$car0,$car0
553e1051a39Sopenharmony_ci	add	$car0,$car1,$car1
554e1051a39Sopenharmony_ci	add	$car2,$car1,$car1
555e1051a39Sopenharmony_ci	st	$car1,[$tp+4]
556e1051a39Sopenharmony_ci	srlx	$car1,32,$car2
557e1051a39Sopenharmony_ci
558e1051a39Sopenharmony_ci	add	$i,4,$i				! i++
559e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
560e1051a39Sopenharmony_ci	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
561e1051a39Sopenharmony_ci	ld	[$ap+$i],$mul0			! ap[j]
562e1051a39Sopenharmony_ci	ld	[$np],$car1			! np[0]
563e1051a39Sopenharmony_ci	ld	[$np+4],$npj			! np[1]
564e1051a39Sopenharmony_ci	mulx	$n0,$tmp1,$mul1
565e1051a39Sopenharmony_ci	and	$mul1,$mask,$mul1
566e1051a39Sopenharmony_ci	add	$i,4,$tmp0
567e1051a39Sopenharmony_ci
568e1051a39Sopenharmony_ci	mulx	$mul0,$mul0,$car0
569e1051a39Sopenharmony_ci	mulx	$car1,$mul1,$car1
570e1051a39Sopenharmony_ci	and	$car0,$mask,$acc0
571e1051a39Sopenharmony_ci	add	$tmp1,$car1,$car1
572e1051a39Sopenharmony_ci	srlx	$car0,32,$car0
573e1051a39Sopenharmony_ci	add	%sp,$bias+$frame,$tp
574e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
575e1051a39Sopenharmony_ci	and	$car0,1,$sbit
576e1051a39Sopenharmony_ci	srlx	$car0,1,$car0
577e1051a39Sopenharmony_ci
578e1051a39Sopenharmony_ci	cmp	$tmp0,$num			! i<num-1
579e1051a39Sopenharmony_ci	bl	%icc,.Lsqr_outer
580e1051a39Sopenharmony_ci	mov	4,$j
581e1051a39Sopenharmony_ci
582e1051a39Sopenharmony_ci.Lsqr_last:
583e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
584e1051a39Sopenharmony_ci	add	$tpj,$car1,$car1
585e1051a39Sopenharmony_ci	add	$j,4,$j
586e1051a39Sopenharmony_ci	ld	[$tp+8],$tpj
587e1051a39Sopenharmony_ci	cmp	$j,$i
588e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
589e1051a39Sopenharmony_ci	ld	[$np+$j],$npj
590e1051a39Sopenharmony_ci	st	$car1,[$tp]
591e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
592e1051a39Sopenharmony_ci	bl	%icc,.Lsqr_last
593e1051a39Sopenharmony_ci	add	$tp,4,$tp
594e1051a39Sopenharmony_ci!.Lsqr_last
595e1051a39Sopenharmony_ci
596e1051a39Sopenharmony_ci	mulx	$npj,$mul1,$acc1
597e1051a39Sopenharmony_ci	add	$tpj,$acc0,$acc0
598e1051a39Sopenharmony_ci	srlx	$acc0,32,$tmp0
599e1051a39Sopenharmony_ci	and	$acc0,$mask,$acc0
600e1051a39Sopenharmony_ci	add	$tmp0,$sbit,$sbit
601e1051a39Sopenharmony_ci	add	$acc0,$car1,$car1
602e1051a39Sopenharmony_ci	add	$acc1,$car1,$car1
603e1051a39Sopenharmony_ci	st	$car1,[$tp]
604e1051a39Sopenharmony_ci	srlx	$car1,32,$car1
605e1051a39Sopenharmony_ci
606e1051a39Sopenharmony_ci	add	$car0,$car0,$car0		! recover $car0
607e1051a39Sopenharmony_ci	add	$sbit,$car0,$car0
608e1051a39Sopenharmony_ci	add	$car0,$car1,$car1
609e1051a39Sopenharmony_ci	add	$car2,$car1,$car1
610e1051a39Sopenharmony_ci	st	$car1,[$tp+4]
611e1051a39Sopenharmony_ci	srlx	$car1,32,$car2
612e1051a39Sopenharmony_ci
613e1051a39Sopenharmony_ci	ba	.Ltail
614e1051a39Sopenharmony_ci	add	$tp,8,$tp
615e1051a39Sopenharmony_ci.type	$fname,#function
616e1051a39Sopenharmony_ci.size	$fname,(.-$fname)
617e1051a39Sopenharmony_ci.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
618e1051a39Sopenharmony_ci.align	32
619e1051a39Sopenharmony_ci___
# Replace every backtick-quoted span in the generated text with the result of
# evaluating it as a Perl expression (the two heredocs visible in this file
# do not appear to contain any such span).
620e1051a39Sopenharmony_ci$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Emit the assembly on STDOUT (the output file, if one was opened above).
621e1051a39Sopenharmony_ciprint $code;
# Closing explicitly lets a failed flush be detected; it is treated as fatal.
622e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";
623