1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by David S. Miller and Andy Polyakov
12e1051a39Sopenharmony_ci# The module is licensed under 2-clause BSD license.
13e1051a39Sopenharmony_ci# November 2012. All rights reserved.
14e1051a39Sopenharmony_ci# ====================================================================
15e1051a39Sopenharmony_ci
16e1051a39Sopenharmony_ci######################################################################
17e1051a39Sopenharmony_ci# Montgomery squaring-n-multiplication module for SPARC T4.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# The module consists of three parts:
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# 1) collection of "single-op" subroutines that perform single
22e1051a39Sopenharmony_ci#    operation, Montgomery squaring or multiplication, on 512-,
23e1051a39Sopenharmony_ci#    1024-, 1536- and 2048-bit operands;
24e1051a39Sopenharmony_ci# 2) collection of "multi-op" subroutines that perform 5 squaring and
25e1051a39Sopenharmony_ci#    1 multiplication operations on operands of above lengths;
26e1051a39Sopenharmony_ci# 3) fall-back and helper VIS3 subroutines.
27e1051a39Sopenharmony_ci#
28e1051a39Sopenharmony_ci# RSA sign is dominated by multi-op subroutine, while RSA verify and
29e1051a39Sopenharmony_ci# DSA - by single-op. Special note about 4096-bit RSA verify result.
30e1051a39Sopenharmony_ci# Operands are too long for dedicated hardware and it's handled by
31e1051a39Sopenharmony_ci# VIS3 code, which is why you don't see any improvement. It's surely
32e1051a39Sopenharmony_ci# possible to improve it [by deploying 'mpmul' instruction], maybe in
33e1051a39Sopenharmony_ci# the future...
34e1051a39Sopenharmony_ci#
35e1051a39Sopenharmony_ci# Performance improvement.
36e1051a39Sopenharmony_ci#
37e1051a39Sopenharmony_ci# 64-bit process, VIS3:
38e1051a39Sopenharmony_ci#                   sign    verify    sign/s verify/s
39e1051a39Sopenharmony_ci# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
40e1051a39Sopenharmony_ci# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
41e1051a39Sopenharmony_ci# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
42e1051a39Sopenharmony_ci# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
43e1051a39Sopenharmony_ci# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
44e1051a39Sopenharmony_ci#
45e1051a39Sopenharmony_ci# 64-bit process, this module:
46e1051a39Sopenharmony_ci#                   sign    verify    sign/s verify/s
47e1051a39Sopenharmony_ci# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
48e1051a39Sopenharmony_ci# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
49e1051a39Sopenharmony_ci# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
50e1051a39Sopenharmony_ci# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
51e1051a39Sopenharmony_ci# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
52e1051a39Sopenharmony_ci#
53e1051a39Sopenharmony_ci######################################################################
54e1051a39Sopenharmony_ci# 32-bit process, VIS3:
55e1051a39Sopenharmony_ci#                   sign    verify    sign/s verify/s
56e1051a39Sopenharmony_ci# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
57e1051a39Sopenharmony_ci# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
58e1051a39Sopenharmony_ci# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
59e1051a39Sopenharmony_ci# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
60e1051a39Sopenharmony_ci# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
61e1051a39Sopenharmony_ci#
62e1051a39Sopenharmony_ci# 32-bit process, this module:
63e1051a39Sopenharmony_ci#                   sign    verify    sign/s verify/s
64e1051a39Sopenharmony_ci# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
65e1051a39Sopenharmony_ci# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
66e1051a39Sopenharmony_ci# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
67e1051a39Sopenharmony_ci# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
68e1051a39Sopenharmony_ci# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
69e1051a39Sopenharmony_ci#
70e1051a39Sopenharmony_ci# 32-bit code is prone to performance degradation as interrupt rate
71e1051a39Sopenharmony_ci# dispatched to CPU executing the code grows. This is because in
72e1051a39Sopenharmony_ci# standard process of handling interrupt in 32-bit process context
73e1051a39Sopenharmony_ci# upper halves of most integer registers used as input or output are
74e1051a39Sopenharmony_ci# zeroed. This renders result invalid, and operation has to be re-run.
75e1051a39Sopenharmony_ci# If CPU is "bothered" with timer interrupts only, the penalty is
76e1051a39Sopenharmony_ci# hardly measurable. But in order to mitigate this problem for higher
77e1051a39Sopenharmony_ci# interrupt rates contemporary Linux kernel recognizes biased stack
78e1051a39Sopenharmony_ci# even in 32-bit process context and preserves full register contents.
79e1051a39Sopenharmony_ci# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
80e1051a39Sopenharmony_ci# for details.
81e1051a39Sopenharmony_ci
# Work out the directory this script was invoked from (everything up to and
# including the last '/' or '\' of $0) so the helper file required below can
# be found via @INC.  When $0 carries no directory component the prefix is
# empty, which is what the unchecked "$dir=$1" used to yield implicitly.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# The last command-line argument, when present, names the output file.
# Use three-argument open so the name is never parsed for a mode, and fail
# loudly instead of silently writing the assembly to the inherited STDOUT.
$output = pop;
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
87e1051a39Sopenharmony_ci
# Fixed assembler preamble: makes sure __ASSEMBLER__ is defined before
# crypto/sparc_arch.h is included, declares %g2/%g3 as scratch for 64-bit
# builds, selects the text section and, for PIC builds, emits the PIC thunk.
# Nothing in here refers to a Perl variable, hence the quoted terminator.
$code .= <<'END_OF_PREAMBLE';
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
END_OF_PREAMBLE
105e1051a39Sopenharmony_ci
106e1051a39Sopenharmony_ci########################################################################
107e1051a39Sopenharmony_ci# Register layout for mont[mul|sqr] instructions.
108e1051a39Sopenharmony_ci# For details see "Oracle SPARC Architecture 2011" manual at
109e1051a39Sopenharmony_ci# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
110e1051a39Sopenharmony_ci#
# Register name tables, 32 entries each, indexed by 64-bit limb number.
#   @R - even-numbered floating-point registers; note that slots 12 and 13
#        are %f60/%f62 and the sequence then resumes at %f24.
#   @N - %l0-%l7,%o0-%o5 repeated: 14 + 14 + 4 names (the loaders below
#        switch register window with 'save' after every 14 limbs).
#   @A - first 14 limbs in the integer names of @N, the rest in @R.
#   @B - %i0-%i5,%l0-%l7 twice (14 + 14), followed by %o0-%o3.
my @R = map { sprintf("%%f%d", 2 * $_) } (0 .. 11, 30, 31, 12 .. 29);
my @N = do {
    my @window = ((map { "%l$_" } 0 .. 7), (map { "%o$_" } 0 .. 5));
    (@window, @window, @window[0 .. 3]);
};
my @A = (@N[0 .. 13], @R[14 .. 31]);
my @B = do {
    my @window = ((map { "%i$_" } 0 .. 5), (map { "%l$_" } 0 .. 7));
    (@window, @window, (map { "%o$_" } 0 .. 3));
};
115e1051a39Sopenharmony_ci
116e1051a39Sopenharmony_ci########################################################################
117e1051a39Sopenharmony_ci# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
118e1051a39Sopenharmony_ci#			  const u64 *np,const BN_ULONG *n0);
119e1051a39Sopenharmony_ci#
# Append to $code the assembly for bn_mul_mont_t4_$NUM, a Montgomery
# multiplication/squaring routine for operands of $NUM 64-bit limbs.
#
# Argument:
#   $NUM - limb count; becomes part of the symbol and label names and of
#          the montmul/montsqr opcode words.
#
# Shape of the emitted routine, as laid out below:
#   * prologue: sets up $sentinel (zero for __arch64__, otherwise -1<<32)
#     and ORs it into %fp of every register window that is opened;
#   * ap[] goes to @A, np[] to @N and, unless ap == bp, bp[] to @B, each
#     limb being assembled from two 32-bit loads;
#   * ap == bp branches to .Lmsquare_$NUM, which issues montsqr instead of
#     montmul and rejoins at .Lmresume_$NUM;
#   * in non-__arch64__ builds %fp is ANDed with $sentinel after the
#     operation and while unwinding; a zero result leads to the failure
#     return;
#   * the result is stored through $rp and the routine returns 1, or 0 on
#     any of the abort paths.
#
# The loop counter is lexical, so the caller's global $i is left alone, and
# the sub carries no prototype since it does take one argument.  Existing
# &generate_bn_mul_mont_t4($n) calls keep working.
sub generate_bn_mul_mont_t4 {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
my $i;		# limb index, shared by consecutive loops below

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
# First 14 limbs live in integer registers.  The low half is loaded into
# the next limb's register (or %o7 for the last slot) as a temporary.
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?$A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
# Remaining limbs go to floating-point registers via a rotating pair of
# single-precision halves.
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?$N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?$N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?$N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
# Same pointer for both operands means squaring.
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?$B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?$B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
# Limbs held in integer registers are first moved to their @R slots.
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
# The first 12 @R slots can be stored as two 32-bit halves directly: the
# low half is the odd-numbered register following the even one in @R.
for($i=0; $i<12 && $i<$NUM; $i++) {
my ($even) = $R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($even+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
# The remaining slots are first copied into one of %f0/%f2/%f4/%f6.
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word   0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}
354e1051a39Sopenharmony_ci
# Emit bn_mul_mont_t4_8, _16, _24 and _32.  The generator is invoked with a
# leading '&' because it is declared with an empty prototype.
$i = 8;
while ($i <= 32) {
	&generate_bn_mul_mont_t4($i);
	$i += 8;
}
358e1051a39Sopenharmony_ci
359e1051a39Sopenharmony_ci########################################################################
360e1051a39Sopenharmony_ci#
# Append code that splits a table index held in a register:
#   - its two low bits, times 8, are added to the table pointer register;
#   - bits 2..4 pick a bit position, and 1 shifted left by that amount is
#     left in the condition-code image register.
# %o4 and %o5 are used as scratch by the emitted code.
#
# Arguments (all register names):
#   table pointer (updated in place), index, destination for the image,
#   and an optional flag; when the flag is true the final write of the
#   image to %ccr is not emitted.
sub load_ccr {
my ($tbl_reg,$idx_reg,$img_reg,$defer_wr)=@_;

$code.=<<___;
	srl	$idx_reg,	2,	%o4
	and	$idx_reg,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$tbl_reg,	$tbl_reg	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$img_reg
___
unless ($defer_wr) {
$code.=<<___;
	wr	$img_reg,	%g0,	%ccr
___
}
}
# Append code that fetches two values from the table in one pass.
# Candidates for the first value are read at offsets 0*32 .. 7*32 from the
# table pointer, candidates for the second at 8*32 .. 15*32.  Every
# candidate is loaded, and conditional moves on %icc/%xcc flags decide
# which one ends up in the destination, so the sequence of memory accesses
# is the same whichever candidate is chosen.  The table pointer is then
# advanced by 16*32.  %o4 and %o5 are used as scratch.
#
# Arguments (register names): table pointer, first destination, second
# destination.
sub load_b_pair {
my ($tbl,$first,$second)=@_;

$code.=<<___;
	ldx	[$tbl+0*32],	$first
	ldx	[$tbl+8*32],	$second
	ldx	[$tbl+1*32],	%o4
	ldx	[$tbl+9*32],	%o5
	movvs	%icc,	%o4,	$first
	ldx	[$tbl+2*32],	%o4
	movvs	%icc,	%o5,	$second
	ldx	[$tbl+10*32],%o5
	move	%icc,	%o4,	$first
	ldx	[$tbl+3*32],	%o4
	move	%icc,	%o5,	$second
	ldx	[$tbl+11*32],%o5
	movneg	%icc,	%o4,	$first
	ldx	[$tbl+4*32],	%o4
	movneg	%icc,	%o5,	$second
	ldx	[$tbl+12*32],%o5
	movcs	%xcc,	%o4,	$first
	ldx	[$tbl+5*32],%o4
	movcs	%xcc,	%o5,	$second
	ldx	[$tbl+13*32],%o5
	movvs	%xcc,	%o4,	$first
	ldx	[$tbl+6*32],	%o4
	movvs	%xcc,	%o5,	$second
	ldx	[$tbl+14*32],%o5
	move	%xcc,	%o4,	$first
	ldx	[$tbl+7*32],	%o4
	move	%xcc,	%o5,	$second
	ldx	[$tbl+15*32],%o5
	movneg	%xcc,	%o4,	$first
	add	$tbl,16*32,	$tbl
	movneg	%xcc,	%o5,	$second
___
}
# Single-value counterpart of the pair loader: append code that reads all
# eight candidates at offsets 0*32 .. 7*32 from the table pointer and lets
# conditional moves on %icc/%xcc flags pick the one that stays in the
# destination register.  The table pointer is advanced by 8*32 afterwards.
# %o4 and %o5 are used as scratch.
#
# Arguments (register names): table pointer, destination.
sub load_b {
my ($tbl,$dest)=@_;

$code.=<<___;
	ldx	[$tbl+0*32],	$dest
	ldx	[$tbl+1*32],	%o4
	ldx	[$tbl+2*32],	%o5
	movvs	%icc,	%o4,	$dest
	ldx	[$tbl+3*32],	%o4
	move	%icc,	%o5,	$dest
	ldx	[$tbl+4*32],	%o5
	movneg	%icc,	%o4,	$dest
	ldx	[$tbl+5*32],	%o4
	movcs	%xcc,	%o5,	$dest
	ldx	[$tbl+6*32],	%o5
	movvs	%xcc,	%o4,	$dest
	ldx	[$tbl+7*32],	%o4
	move	%xcc,	%o5,	$dest
	add	$tbl,8*32,	$tbl
	movneg	%xcc,	%o4,	$dest
___
}
435e1051a39Sopenharmony_ci
436e1051a39Sopenharmony_ci########################################################################
437e1051a39Sopenharmony_ci# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
438e1051a39Sopenharmony_ci#			   const u64 *pwrtbl,int pwr,int stride);
439e1051a39Sopenharmony_ci#
# Append to $code the assembly for bn_pwr5_mont_t4_$NUM, the "multi-op"
# routine for operands of $NUM 64-bit limbs.
#
# Argument:
#   $NUM - limb count; becomes part of the symbol and label names and of
#          the montsqr/montmul opcode words.
#
# Shape of the emitted routine, as laid out below:
#   * prologue identical to the single-op routine ($sentinel set up and
#     ORed into %fp of every register window that is opened);
#   * the last two arguments are packed into one global register: 'pwr' in
#     the low 32 bits, 'stride' in the high 32 bits;
#   * tp[] is loaded into @A and np[] into @N with 64-bit loads;
#   * each pass through .Lstride_$NUM subtracts 5 from the packed stride,
#     shifts pwr right by the result, loads the selected table entry into
#     @B through the conditional-move loaders, then issues five montsqr
#     and one montmul; the pass repeats while the remaining stride is
#     non-negative;
#   * the result is written back through $tp and the routine returns 1, or
#     0 on any of the abort paths.
#
# The loop counter is lexical, so the caller's global $i is left alone, and
# the sub carries no prototype since it does take one argument.  Existing
# &generate_bn_pwr5_mont_t4($n) calls keep working.
sub generate_bn_pwr5_mont_t4 {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
my $i;		# limb index, shared by consecutive loops below

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
# First 14 limbs go to integer registers, the rest to FP registers.
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
# Table entry is fetched two limbs at a time; the table pointer is %o7 in
# the first window and the same register seen as %i7 after the 'save'.
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	load_b_pair("%o7",$B[$i],$B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	load_b_pair("%i7",$B[$i],$B[$i+1]);
}
# Prepare index and table pointer for the next pass; the condition-code
# image is only computed here (4th argument) and written to %ccr after the
# squarings.
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
# Limbs held in integer registers are first moved to their @R slots.
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}
660e1051a39Sopenharmony_ci
# Emit one bn_pwr5_mont_t4_<NUM> routine for each of NUM = 8, 16, 24 and 32.
#
# The counter is lexical on purpose: the visible part of the generator runs
# "for($i=0; ...; $i++)" loops, so a shared package-level $i would only step
# correctly as long as the callee happens to leave $i equal to its argument.
# NOTE(review): the '&' call form is kept deliberately; presumably the
# generator is declared with an empty prototype - confirm before dropping it.
661e1051a39Sopenharmony_cifor my $num (8, 16, 24, 32) {
662e1051a39Sopenharmony_ci	&generate_bn_pwr5_mont_t4($num);
663e1051a39Sopenharmony_ci}
664e1051a39Sopenharmony_ci
665e1051a39Sopenharmony_ci{
666e1051a39Sopenharmony_ci########################################################################
667e1051a39Sopenharmony_ci# Fall-back subroutines
668e1051a39Sopenharmony_ci#
669e1051a39Sopenharmony_ci# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
670e1051a39Sopenharmony_ci#
# bn_mul_mont_t4: Montgomery multiplication on vectors of 64-bit limbs.
#
# Outline of the emitted routine:
#   * prologue - carve a 64-byte aligned scratch vector tp[] (num limbs,
#     rounded up to 64 bytes) plus a stack frame out of the stack and open a
#     new register window;
#   * .L1st    - first pass, tp[] = ap[]*bp[0] + np[]*m1;
#   * .Louter/.Linner - one pass per remaining word of bp[], accumulating
#     ap[]*bp[i] + np[]*m1 + tp[];
#   * .Lsub    - rp[] = tp[] - np[], stored as pairs of 32-bit words;
#   * .Lcopy   - depending on the final borrow, keep rp[] or overwrite it with
#     tp[]; tp[] is zeroed on the way.
#
# The success code is written to %i0, not %o0: the routine runs in its own
# register window, and "ret; restore" hands the callee's %i0 back to the caller
# as %o0, while anything left in the callee's %o0 is dropped with the window.
# $rp (%i0) is no longer needed at that point.
#
# Register map shared by both fall-back routines in this scope.
671e1051a39Sopenharmony_ci($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
672e1051a39Sopenharmony_ci	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
673e1051a39Sopenharmony_ci
674e1051a39Sopenharmony_ci# int bn_mul_mont_t4(
675e1051a39Sopenharmony_ci$rp="%o0";	# u64 *rp,
676e1051a39Sopenharmony_ci$ap="%o1";	# const u64 *ap,
677e1051a39Sopenharmony_ci$bp="%o2";	# const u64 *bp,
678e1051a39Sopenharmony_ci$np="%o3";	# const u64 *np,
679e1051a39Sopenharmony_ci$n0p="%o4";	# const BN_ULONG *n0,
680e1051a39Sopenharmony_ci$num="%o5";	# int num);	# caller ensures that num is >=3
681e1051a39Sopenharmony_ci$code.=<<___;
682e1051a39Sopenharmony_ci.globl	bn_mul_mont_t4
683e1051a39Sopenharmony_ci.align	32
684e1051a39Sopenharmony_cibn_mul_mont_t4:
685e1051a39Sopenharmony_ci	add	%sp,	STACK_BIAS,	%g4	! real top of stack
686e1051a39Sopenharmony_ci	sll	$num,	3,	$num		! size in bytes
687e1051a39Sopenharmony_ci	add	$num,	63,	%g1
688e1051a39Sopenharmony_ci	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
689e1051a39Sopenharmony_ci	sub	%g4,	%g1,	%g1
690e1051a39Sopenharmony_ci	andn	%g1,	63,	%g1		! align at 64 byte
691e1051a39Sopenharmony_ci	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
692e1051a39Sopenharmony_ci	sub	%g1,	%g4,	%g1
693e1051a39Sopenharmony_ci
694e1051a39Sopenharmony_ci	save	%sp,	%g1,	%sp
695e1051a39Sopenharmony_ci___
696e1051a39Sopenharmony_ci#	+-------------------------------+<-----	%sp
697e1051a39Sopenharmony_ci#	.				.
698e1051a39Sopenharmony_ci#	+-------------------------------+<-----	aligned at 64 bytes
699e1051a39Sopenharmony_ci#	| __int64 tmp[0]		|
700e1051a39Sopenharmony_ci#	+-------------------------------+
701e1051a39Sopenharmony_ci#	.				.
702e1051a39Sopenharmony_ci#	.				.
703e1051a39Sopenharmony_ci#	+-------------------------------+<-----	aligned at 64 bytes
704e1051a39Sopenharmony_ci#	.				.
# After "save" the incoming arguments are visible as %i0..%i5.
705e1051a39Sopenharmony_ci($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
706e1051a39Sopenharmony_ci($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
# $ovf and $i alias $t0/$t1, which are free once n0 has been assembled.
707e1051a39Sopenharmony_ci($ovf,$i)=($t0,$t1);
708e1051a39Sopenharmony_ci$code.=<<___;
709e1051a39Sopenharmony_ci	ld	[$n0p+0],	$t0	! pull n0[0..1] value
710e1051a39Sopenharmony_ci	ld	[$n0p+4],	$t1
711e1051a39Sopenharmony_ci	add	%sp, STACK_BIAS+STACK_FRAME, $tp
712e1051a39Sopenharmony_ci	ldx	[$bp+0],	$m0	! m0=bp[0]
713e1051a39Sopenharmony_ci	sllx	$t1,	32,	$n0
714e1051a39Sopenharmony_ci	add	$bp,	8,	$bp
715e1051a39Sopenharmony_ci	or	$t0,	$n0,	$n0
716e1051a39Sopenharmony_ci
717e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[0]
718e1051a39Sopenharmony_ci
719e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
720e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$hi0
721e1051a39Sopenharmony_ci
722e1051a39Sopenharmony_ci	ldx	[$ap+8],	$aj	! ap[1]
723e1051a39Sopenharmony_ci	add	$ap,	16,	$ap
724e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[0]
725e1051a39Sopenharmony_ci
726e1051a39Sopenharmony_ci	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
727e1051a39Sopenharmony_ci
728e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
729e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
730e1051a39Sopenharmony_ci
731e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$lo1	! np[0]*m1
732e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$hi1
733e1051a39Sopenharmony_ci
734e1051a39Sopenharmony_ci	ldx	[$np+8],	$nj	! np[1]
735e1051a39Sopenharmony_ci
736e1051a39Sopenharmony_ci	addcc	$lo0,	$lo1,	$lo1
737e1051a39Sopenharmony_ci	add	$np,	16,	$np
738e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
739e1051a39Sopenharmony_ci
740e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[1]*m1
741e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
742e1051a39Sopenharmony_ci
743e1051a39Sopenharmony_ci	ba	.L1st
744e1051a39Sopenharmony_ci	sub	$num,	24,	$cnt	! cnt=num-3
745e1051a39Sopenharmony_ci
746e1051a39Sopenharmony_ci.align	16
747e1051a39Sopenharmony_ci.L1st:
748e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
749e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0
750e1051a39Sopenharmony_ci
751e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[j]
752e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
753e1051a39Sopenharmony_ci	add	$ap,	8,	$ap
754e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1	! nhi=nj
755e1051a39Sopenharmony_ci
756e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[j]
757e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
758e1051a39Sopenharmony_ci	add	$np,	8,	$np
759e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
760e1051a39Sopenharmony_ci
761e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[j]*m1
762e1051a39Sopenharmony_ci	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
763e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
764e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
765e1051a39Sopenharmony_ci	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
766e1051a39Sopenharmony_ci	add	$tp,	8,	$tp	! tp++
767e1051a39Sopenharmony_ci
768e1051a39Sopenharmony_ci	brnz,pt	$cnt,	.L1st
769e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt	! j--
770e1051a39Sopenharmony_ci!.L1st
771e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
772e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0	! ahi=aj
773e1051a39Sopenharmony_ci
774e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
775e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1
776e1051a39Sopenharmony_ci	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
777e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
778e1051a39Sopenharmony_ci	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
779e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
780e1051a39Sopenharmony_ci
781e1051a39Sopenharmony_ci	addcc	$hi0,	$hi1,	$hi1
782e1051a39Sopenharmony_ci	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
783e1051a39Sopenharmony_ci	stxa	$hi1,	[$tp]0xe2
784e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
785e1051a39Sopenharmony_ci
786e1051a39Sopenharmony_ci	ba	.Louter
787e1051a39Sopenharmony_ci	sub	$num,	16,	$i	! i=num-2
788e1051a39Sopenharmony_ci
789e1051a39Sopenharmony_ci.align	16
790e1051a39Sopenharmony_ci.Louter:
791e1051a39Sopenharmony_ci	ldx	[$bp+0],	$m0	! m0=bp[i]
792e1051a39Sopenharmony_ci	add	$bp,	8,	$bp
793e1051a39Sopenharmony_ci
794e1051a39Sopenharmony_ci	sub	$ap,	$num,	$ap	! rewind
795e1051a39Sopenharmony_ci	sub	$np,	$num,	$np
796e1051a39Sopenharmony_ci	sub	$tp,	$num,	$tp
797e1051a39Sopenharmony_ci
798e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[0]
799e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[0]
800e1051a39Sopenharmony_ci
801e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
802e1051a39Sopenharmony_ci	ldx	[$tp],		$tj	! tp[0]
803e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$hi0
804e1051a39Sopenharmony_ci	ldx	[$ap+8],	$aj	! ap[1]
805e1051a39Sopenharmony_ci	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
806e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
807e1051a39Sopenharmony_ci	addxc	%g0,	$hi0,	$hi0
808e1051a39Sopenharmony_ci	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
809e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
810e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$lo1	! np[0]*m1
811e1051a39Sopenharmony_ci	add	$ap,	16,	$ap
812e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$hi1
813e1051a39Sopenharmony_ci	ldx	[$np+8],	$nj	! np[1]
814e1051a39Sopenharmony_ci	add	$np,	16,	$np
815e1051a39Sopenharmony_ci	addcc	$lo1,	$lo0,	$lo1
816e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[1]*m1
817e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
818e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
819e1051a39Sopenharmony_ci
820e1051a39Sopenharmony_ci	ba	.Linner
821e1051a39Sopenharmony_ci	sub	$num,	24,	$cnt	! cnt=num-3
822e1051a39Sopenharmony_ci.align	16
823e1051a39Sopenharmony_ci.Linner:
824e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
825e1051a39Sopenharmony_ci	ldx	[$tp+8],	$tj	! tp[j]
826e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0	! ahi=aj
827e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[j]
828e1051a39Sopenharmony_ci	add	$ap,	8,	$ap
829e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
830e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
831e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1	! nhi=nj
832e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[j]
833e1051a39Sopenharmony_ci	add	$np,	8,	$np
834e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
835e1051a39Sopenharmony_ci	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
836e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[j]*m1
837e1051a39Sopenharmony_ci	addxc	%g0,	$hi0,	$hi0
838e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
839e1051a39Sopenharmony_ci	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
840e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
841e1051a39Sopenharmony_ci	stx	$lo1,	[$tp]		! tp[j-1]
842e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
843e1051a39Sopenharmony_ci	brnz,pt	$cnt,	.Linner
844e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt
845e1051a39Sopenharmony_ci!.Linner
846e1051a39Sopenharmony_ci	ldx	[$tp+8],	$tj	! tp[j]
847e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
848e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0	! ahi=aj
849e1051a39Sopenharmony_ci	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
850e1051a39Sopenharmony_ci	addxc	%g0,	$hi0,	$hi0
851e1051a39Sopenharmony_ci
852e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
853e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1	! nhi=nj
854e1051a39Sopenharmony_ci	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
855e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
856e1051a39Sopenharmony_ci	stx	$lo1,	[$tp]		! tp[j-1]
857e1051a39Sopenharmony_ci
858e1051a39Sopenharmony_ci	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
859e1051a39Sopenharmony_ci	addxccc	$hi1,	$hi0,	$hi1
860e1051a39Sopenharmony_ci	addxc	%g0,	%g0,	$ovf
861e1051a39Sopenharmony_ci	stx	$hi1,	[$tp+8]
862e1051a39Sopenharmony_ci	add	$tp,	16,	$tp
863e1051a39Sopenharmony_ci
864e1051a39Sopenharmony_ci	brnz,pt	$i,	.Louter
865e1051a39Sopenharmony_ci	sub	$i,	8,	$i
866e1051a39Sopenharmony_ci
867e1051a39Sopenharmony_ci	sub	$ap,	$num,	$ap	! rewind
868e1051a39Sopenharmony_ci	sub	$np,	$num,	$np
869e1051a39Sopenharmony_ci	sub	$tp,	$num,	$tp
870e1051a39Sopenharmony_ci	ba	.Lsub
871e1051a39Sopenharmony_ci	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc
872e1051a39Sopenharmony_ci
873e1051a39Sopenharmony_ci.align	16
874e1051a39Sopenharmony_ci.Lsub:
875e1051a39Sopenharmony_ci	ldx	[$tp],		$tj
876e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
877e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj
878e1051a39Sopenharmony_ci	add	$np,	8,	$np
879e1051a39Sopenharmony_ci	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
880e1051a39Sopenharmony_ci	srlx	$tj,	32,	$tj
881e1051a39Sopenharmony_ci	srlx	$nj,	32,	$nj
882e1051a39Sopenharmony_ci	subccc	$tj,	$nj,	$t3
883e1051a39Sopenharmony_ci	add	$rp,	8,	$rp
884e1051a39Sopenharmony_ci	st	$t2,	[$rp-4]		! reverse order
885e1051a39Sopenharmony_ci	st	$t3,	[$rp-8]
886e1051a39Sopenharmony_ci	brnz,pt	$cnt,	.Lsub
887e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt
888e1051a39Sopenharmony_ci
889e1051a39Sopenharmony_ci	sub	$np,	$num,	$np	! rewind
890e1051a39Sopenharmony_ci	sub	$tp,	$num,	$tp
891e1051a39Sopenharmony_ci	sub	$rp,	$num,	$rp
892e1051a39Sopenharmony_ci
893e1051a39Sopenharmony_ci	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
894e1051a39Sopenharmony_ci	ba	.Lcopy
895e1051a39Sopenharmony_ci	sub	$num,	8,	$cnt
896e1051a39Sopenharmony_ci
897e1051a39Sopenharmony_ci.align	16
898e1051a39Sopenharmony_ci.Lcopy:					! conditional copy
899e1051a39Sopenharmony_ci	ldx	[$tp],		$tj
900e1051a39Sopenharmony_ci	ldx	[$rp+0],	$t2
901e1051a39Sopenharmony_ci	stx	%g0,	[$tp]		! zap
902e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
903e1051a39Sopenharmony_ci	movcs	%icc,	$tj,	$t2
904e1051a39Sopenharmony_ci	stx	$t2,	[$rp+0]
905e1051a39Sopenharmony_ci	add	$rp,	8,	$rp
906e1051a39Sopenharmony_ci	brnz	$cnt,	.Lcopy
907e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt
908e1051a39Sopenharmony_ci
909e1051a39Sopenharmony_ci	mov	1,	%i0		! return success
910e1051a39Sopenharmony_ci	ret
911e1051a39Sopenharmony_ci	restore
912e1051a39Sopenharmony_ci.type	bn_mul_mont_t4, #function
913e1051a39Sopenharmony_ci.size	bn_mul_mont_t4, .-bn_mul_mont_t4
914e1051a39Sopenharmony_ci___
915e1051a39Sopenharmony_ci
# bn_mul_mont_gather5_t4: same multiplication loop structure as the plain
# fall-back routine (.L1st_g5, .Louter_g5/.Linner_g5, .Lsub_g5, .Lcopy_g5),
# except that each word of the multiplier is fetched from a power table via
# the load_ccr/load_b helpers instead of a plain load.
# NOTE(review): load_ccr and load_b are defined outside this section; their
# exact table layout and register usage should be confirmed there.
#
# The success code is written to %i0, not %o0: the routine runs in its own
# register window, and "ret; restore" hands the callee's %i0 back to the caller
# as %o0, while anything left in the callee's %o0 is dropped with the window.
# $rp (%i0) is no longer needed at that point.
916e1051a39Sopenharmony_ci# int bn_mul_mont_gather5_t4(
917e1051a39Sopenharmony_ci$rp="%o0";	# u64 *rp,
918e1051a39Sopenharmony_ci$ap="%o1";	# const u64 *ap,
919e1051a39Sopenharmony_ci$bp="%o2";	# const u64 *pwrtbl,
920e1051a39Sopenharmony_ci$np="%o3";	# const u64 *np,
921e1051a39Sopenharmony_ci$n0p="%o4";	# const BN_ULONG *n0,
922e1051a39Sopenharmony_ci$num="%o5";	# int num,	# caller ensures that num is >=3
923e1051a39Sopenharmony_ci		# int power);
924e1051a39Sopenharmony_ci$code.=<<___;
925e1051a39Sopenharmony_ci.globl	bn_mul_mont_gather5_t4
926e1051a39Sopenharmony_ci.align	32
927e1051a39Sopenharmony_cibn_mul_mont_gather5_t4:
928e1051a39Sopenharmony_ci	add	%sp,	STACK_BIAS,	%g4	! real top of stack
929e1051a39Sopenharmony_ci	sll	$num,	3,	$num		! size in bytes
930e1051a39Sopenharmony_ci	add	$num,	63,	%g1
931e1051a39Sopenharmony_ci	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
932e1051a39Sopenharmony_ci	sub	%g4,	%g1,	%g1
933e1051a39Sopenharmony_ci	andn	%g1,	63,	%g1		! align at 64 byte
934e1051a39Sopenharmony_ci	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
935e1051a39Sopenharmony_ci	sub	%g1,	%g4,	%g1
936e1051a39Sopenharmony_ci	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument
937e1051a39Sopenharmony_ci
938e1051a39Sopenharmony_ci	save	%sp,	%g1,	%sp
939e1051a39Sopenharmony_ci___
940e1051a39Sopenharmony_ci#	+-------------------------------+<-----	%sp
941e1051a39Sopenharmony_ci#	.				.
942e1051a39Sopenharmony_ci#	+-------------------------------+<-----	aligned at 64 bytes
943e1051a39Sopenharmony_ci#	| __int64 tmp[0]		|
944e1051a39Sopenharmony_ci#	+-------------------------------+
945e1051a39Sopenharmony_ci#	.				.
946e1051a39Sopenharmony_ci#	.				.
947e1051a39Sopenharmony_ci#	+-------------------------------+<-----	aligned at 64 bytes
948e1051a39Sopenharmony_ci#	.				.
# After "save" the incoming arguments are visible as %i0..%i5; the power
# index was fetched into the global %g4 beforehand so it survives the switch.
949e1051a39Sopenharmony_ci($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
950e1051a39Sopenharmony_ci($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
951e1051a39Sopenharmony_ci($ovf,$i)=($t0,$t1);
# $ccr keeps the selector produced by load_ccr so that %ccr can be re-armed at
# the top of every outer iteration (the arithmetic in between clobbers it).
952e1051a39Sopenharmony_ci	&load_ccr($bp,"%g4",$ccr);
953e1051a39Sopenharmony_ci	&load_b($bp,$m0,"%o7");		# m0=bp[0]
954e1051a39Sopenharmony_ci
955e1051a39Sopenharmony_ci$code.=<<___;
956e1051a39Sopenharmony_ci	ld	[$n0p+0],	$t0	! pull n0[0..1] value
957e1051a39Sopenharmony_ci	ld	[$n0p+4],	$t1
958e1051a39Sopenharmony_ci	add	%sp, STACK_BIAS+STACK_FRAME, $tp
959e1051a39Sopenharmony_ci	sllx	$t1,	32,	$n0
960e1051a39Sopenharmony_ci	or	$t0,	$n0,	$n0
961e1051a39Sopenharmony_ci
962e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[0]
963e1051a39Sopenharmony_ci
964e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
965e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$hi0
966e1051a39Sopenharmony_ci
967e1051a39Sopenharmony_ci	ldx	[$ap+8],	$aj	! ap[1]
968e1051a39Sopenharmony_ci	add	$ap,	16,	$ap
969e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[0]
970e1051a39Sopenharmony_ci
971e1051a39Sopenharmony_ci	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
972e1051a39Sopenharmony_ci
973e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
974e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
975e1051a39Sopenharmony_ci
976e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$lo1	! np[0]*m1
977e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$hi1
978e1051a39Sopenharmony_ci
979e1051a39Sopenharmony_ci	ldx	[$np+8],	$nj	! np[1]
980e1051a39Sopenharmony_ci
981e1051a39Sopenharmony_ci	addcc	$lo0,	$lo1,	$lo1
982e1051a39Sopenharmony_ci	add	$np,	16,	$np
983e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
984e1051a39Sopenharmony_ci
985e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[1]*m1
986e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
987e1051a39Sopenharmony_ci
988e1051a39Sopenharmony_ci	ba	.L1st_g5
989e1051a39Sopenharmony_ci	sub	$num,	24,	$cnt	! cnt=num-3
990e1051a39Sopenharmony_ci
991e1051a39Sopenharmony_ci.align	16
992e1051a39Sopenharmony_ci.L1st_g5:
993e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
994e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0
995e1051a39Sopenharmony_ci
996e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[j]
997e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
998e1051a39Sopenharmony_ci	add	$ap,	8,	$ap
999e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1	! nhi=nj
1000e1051a39Sopenharmony_ci
1001e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[j]
1002e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
1003e1051a39Sopenharmony_ci	add	$np,	8,	$np
1004e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
1005e1051a39Sopenharmony_ci
1006e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[j]*m1
1007e1051a39Sopenharmony_ci	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
1008e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
1009e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
1010e1051a39Sopenharmony_ci	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
1011e1051a39Sopenharmony_ci	add	$tp,	8,	$tp	! tp++
1012e1051a39Sopenharmony_ci
1013e1051a39Sopenharmony_ci	brnz,pt	$cnt,	.L1st_g5
1014e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt	! j--
1015e1051a39Sopenharmony_ci!.L1st_g5
1016e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
1017e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0	! ahi=aj
1018e1051a39Sopenharmony_ci
1019e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
1020e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1
1021e1051a39Sopenharmony_ci	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
1022e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
1023e1051a39Sopenharmony_ci	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
1024e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
1025e1051a39Sopenharmony_ci
1026e1051a39Sopenharmony_ci	addcc	$hi0,	$hi1,	$hi1
1027e1051a39Sopenharmony_ci	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
1028e1051a39Sopenharmony_ci	stxa	$hi1,	[$tp]0xe2
1029e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
1030e1051a39Sopenharmony_ci
1031e1051a39Sopenharmony_ci	ba	.Louter_g5
1032e1051a39Sopenharmony_ci	sub	$num,	16,	$i	! i=num-2
1033e1051a39Sopenharmony_ci
1034e1051a39Sopenharmony_ci.align	16
1035e1051a39Sopenharmony_ci.Louter_g5:
1036e1051a39Sopenharmony_ci	wr	$ccr,	%g0,	%ccr
1037e1051a39Sopenharmony_ci___
1038e1051a39Sopenharmony_ci	&load_b($bp,$m0);		# m0=bp[i]
1039e1051a39Sopenharmony_ci$code.=<<___;
1040e1051a39Sopenharmony_ci	sub	$ap,	$num,	$ap	! rewind
1041e1051a39Sopenharmony_ci	sub	$np,	$num,	$np
1042e1051a39Sopenharmony_ci	sub	$tp,	$num,	$tp
1043e1051a39Sopenharmony_ci
1044e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[0]
1045e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[0]
1046e1051a39Sopenharmony_ci
1047e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
1048e1051a39Sopenharmony_ci	ldx	[$tp],		$tj	! tp[0]
1049e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$hi0
1050e1051a39Sopenharmony_ci	ldx	[$ap+8],	$aj	! ap[1]
1051e1051a39Sopenharmony_ci	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
1052e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
1053e1051a39Sopenharmony_ci	addxc	%g0,	$hi0,	$hi0
1054e1051a39Sopenharmony_ci	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
1055e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
1056e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$lo1	! np[0]*m1
1057e1051a39Sopenharmony_ci	add	$ap,	16,	$ap
1058e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$hi1
1059e1051a39Sopenharmony_ci	ldx	[$np+8],	$nj	! np[1]
1060e1051a39Sopenharmony_ci	add	$np,	16,	$np
1061e1051a39Sopenharmony_ci	addcc	$lo1,	$lo0,	$lo1
1062e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[1]*m1
1063e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
1064e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
1065e1051a39Sopenharmony_ci
1066e1051a39Sopenharmony_ci	ba	.Linner_g5
1067e1051a39Sopenharmony_ci	sub	$num,	24,	$cnt	! cnt=num-3
1068e1051a39Sopenharmony_ci.align	16
1069e1051a39Sopenharmony_ci.Linner_g5:
1070e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
1071e1051a39Sopenharmony_ci	ldx	[$tp+8],	$tj	! tp[j]
1072e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0	! ahi=aj
1073e1051a39Sopenharmony_ci	ldx	[$ap+0],	$aj	! ap[j]
1074e1051a39Sopenharmony_ci	add	$ap,	8,	$ap
1075e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
1076e1051a39Sopenharmony_ci	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
1077e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1	! nhi=nj
1078e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj	! np[j]
1079e1051a39Sopenharmony_ci	add	$np,	8,	$np
1080e1051a39Sopenharmony_ci	umulxhi	$aj,	$m0,	$aj	! ahi=aj
1081e1051a39Sopenharmony_ci	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
1082e1051a39Sopenharmony_ci	mulx	$nj,	$m1,	$nlo	! np[j]*m1
1083e1051a39Sopenharmony_ci	addxc	%g0,	$hi0,	$hi0
1084e1051a39Sopenharmony_ci	umulxhi	$nj,	$m1,	$nj	! nhi=nj
1085e1051a39Sopenharmony_ci	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
1086e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
1087e1051a39Sopenharmony_ci	stx	$lo1,	[$tp]		! tp[j-1]
1088e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
1089e1051a39Sopenharmony_ci	brnz,pt	$cnt,	.Linner_g5
1090e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt
1091e1051a39Sopenharmony_ci!.Linner_g5
1092e1051a39Sopenharmony_ci	ldx	[$tp+8],	$tj	! tp[j]
1093e1051a39Sopenharmony_ci	addcc	$alo,	$hi0,	$lo0
1094e1051a39Sopenharmony_ci	addxc	$aj,	%g0,	$hi0	! ahi=aj
1095e1051a39Sopenharmony_ci	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
1096e1051a39Sopenharmony_ci	addxc	%g0,	$hi0,	$hi0
1097e1051a39Sopenharmony_ci
1098e1051a39Sopenharmony_ci	addcc	$nlo,	$hi1,	$lo1
1099e1051a39Sopenharmony_ci	addxc	$nj,	%g0,	$hi1	! nhi=nj
1100e1051a39Sopenharmony_ci	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
1101e1051a39Sopenharmony_ci	addxc	%g0,	$hi1,	$hi1
1102e1051a39Sopenharmony_ci	stx	$lo1,	[$tp]		! tp[j-1]
1103e1051a39Sopenharmony_ci
1104e1051a39Sopenharmony_ci	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
1105e1051a39Sopenharmony_ci	addxccc	$hi1,	$hi0,	$hi1
1106e1051a39Sopenharmony_ci	addxc	%g0,	%g0,	$ovf
1107e1051a39Sopenharmony_ci	stx	$hi1,	[$tp+8]
1108e1051a39Sopenharmony_ci	add	$tp,	16,	$tp
1109e1051a39Sopenharmony_ci
1110e1051a39Sopenharmony_ci	brnz,pt	$i,	.Louter_g5
1111e1051a39Sopenharmony_ci	sub	$i,	8,	$i
1112e1051a39Sopenharmony_ci
1113e1051a39Sopenharmony_ci	sub	$ap,	$num,	$ap	! rewind
1114e1051a39Sopenharmony_ci	sub	$np,	$num,	$np
1115e1051a39Sopenharmony_ci	sub	$tp,	$num,	$tp
1116e1051a39Sopenharmony_ci	ba	.Lsub_g5
1117e1051a39Sopenharmony_ci	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc
1118e1051a39Sopenharmony_ci
1119e1051a39Sopenharmony_ci.align	16
1120e1051a39Sopenharmony_ci.Lsub_g5:
1121e1051a39Sopenharmony_ci	ldx	[$tp],		$tj
1122e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
1123e1051a39Sopenharmony_ci	ldx	[$np+0],	$nj
1124e1051a39Sopenharmony_ci	add	$np,	8,	$np
1125e1051a39Sopenharmony_ci	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
1126e1051a39Sopenharmony_ci	srlx	$tj,	32,	$tj
1127e1051a39Sopenharmony_ci	srlx	$nj,	32,	$nj
1128e1051a39Sopenharmony_ci	subccc	$tj,	$nj,	$t3
1129e1051a39Sopenharmony_ci	add	$rp,	8,	$rp
1130e1051a39Sopenharmony_ci	st	$t2,	[$rp-4]		! reverse order
1131e1051a39Sopenharmony_ci	st	$t3,	[$rp-8]
1132e1051a39Sopenharmony_ci	brnz,pt	$cnt,	.Lsub_g5
1133e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt
1134e1051a39Sopenharmony_ci
1135e1051a39Sopenharmony_ci	sub	$np,	$num,	$np	! rewind
1136e1051a39Sopenharmony_ci	sub	$tp,	$num,	$tp
1137e1051a39Sopenharmony_ci	sub	$rp,	$num,	$rp
1138e1051a39Sopenharmony_ci
1139e1051a39Sopenharmony_ci	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
1140e1051a39Sopenharmony_ci	ba	.Lcopy_g5
1141e1051a39Sopenharmony_ci	sub	$num,	8,	$cnt
1142e1051a39Sopenharmony_ci
1143e1051a39Sopenharmony_ci.align	16
1144e1051a39Sopenharmony_ci.Lcopy_g5:				! conditional copy
1145e1051a39Sopenharmony_ci	ldx	[$tp],		$tj
1146e1051a39Sopenharmony_ci	ldx	[$rp+0],	$t2
1147e1051a39Sopenharmony_ci	stx	%g0,	[$tp]		! zap
1148e1051a39Sopenharmony_ci	add	$tp,	8,	$tp
1149e1051a39Sopenharmony_ci	movcs	%icc,	$tj,	$t2
1150e1051a39Sopenharmony_ci	stx	$t2,	[$rp+0]
1151e1051a39Sopenharmony_ci	add	$rp,	8,	$rp
1152e1051a39Sopenharmony_ci	brnz	$cnt,	.Lcopy_g5
1153e1051a39Sopenharmony_ci	sub	$cnt,	8,	$cnt
1154e1051a39Sopenharmony_ci
1155e1051a39Sopenharmony_ci	mov	1,	%i0		! return success
1156e1051a39Sopenharmony_ci	ret
1157e1051a39Sopenharmony_ci	restore
1158e1051a39Sopenharmony_ci.type	bn_mul_mont_gather5_t4, #function
1159e1051a39Sopenharmony_ci.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
1160e1051a39Sopenharmony_ci___
1161e1051a39Sopenharmony_ci}
1162e1051a39Sopenharmony_ci
# Helper routines. All three are leaf functions: they never execute "save"
# and return with "retl", so their arguments stay in %o0..%o3 throughout.
#
# bn_flip_t4: %o0 = destination, %o1 = source, %o2 = number of 8-byte
#   elements. Each iteration reads two 32-bit words and stores them in swapped
#   order. The count is decremented and tested only after the first element
#   has been processed, so a count of 0 is not handled.
#
# bn_flip_n_scatter5_t4: %o0 = source, %o1 = number of 32-bit words (halved
#   to obtain the number of 64-bit elements), %o2 = table base, %o3 = power
#   index. Each pair of 32-bit words is merged into one 64-bit value (word at
#   offset 4 in the upper half) and stored at table + 8*power, advancing by
#   32*8 bytes per element.
1163e1051a39Sopenharmony_ci$code.=<<___;
1164e1051a39Sopenharmony_ci.globl	bn_flip_t4
1165e1051a39Sopenharmony_ci.align	32
1166e1051a39Sopenharmony_cibn_flip_t4:
1167e1051a39Sopenharmony_ci.Loop_flip:
1168e1051a39Sopenharmony_ci	ld	[%o1+0],	%o4
1169e1051a39Sopenharmony_ci	sub	%o2,	1,	%o2
1170e1051a39Sopenharmony_ci	ld	[%o1+4],	%o5
1171e1051a39Sopenharmony_ci	add	%o1,	8,	%o1
1172e1051a39Sopenharmony_ci	st	%o5,	[%o0+0]
1173e1051a39Sopenharmony_ci	st	%o4,	[%o0+4]
1174e1051a39Sopenharmony_ci	brnz	%o2,	.Loop_flip
1175e1051a39Sopenharmony_ci	add	%o0,	8,	%o0
1176e1051a39Sopenharmony_ci	retl
1177e1051a39Sopenharmony_ci	nop
1178e1051a39Sopenharmony_ci.type	bn_flip_t4, #function
1179e1051a39Sopenharmony_ci.size	bn_flip_t4, .-bn_flip_t4
1180e1051a39Sopenharmony_ci
1181e1051a39Sopenharmony_ci.globl	bn_flip_n_scatter5_t4
1182e1051a39Sopenharmony_ci.align	32
1183e1051a39Sopenharmony_cibn_flip_n_scatter5_t4:
1184e1051a39Sopenharmony_ci	sll	%o3,	3,	%o3
1185e1051a39Sopenharmony_ci	srl	%o1,	1,	%o1
1186e1051a39Sopenharmony_ci	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
1187e1051a39Sopenharmony_ci	sub	%o1,	1,	%o1
1188e1051a39Sopenharmony_ci.Loop_flip_n_scatter5:
1189e1051a39Sopenharmony_ci	ld	[%o0+0],	%o4	! inp[i]
1190e1051a39Sopenharmony_ci	ld	[%o0+4],	%o5
1191e1051a39Sopenharmony_ci	add	%o0,	8,	%o0
1192e1051a39Sopenharmony_ci	sllx	%o5,	32,	%o5
1193e1051a39Sopenharmony_ci	or	%o4,	%o5,	%o5
1194e1051a39Sopenharmony_ci	stx	%o5,	[%o2]
1195e1051a39Sopenharmony_ci	add	%o2,	32*8,	%o2
1196e1051a39Sopenharmony_ci	brnz	%o1,	.Loop_flip_n_scatter5
1197e1051a39Sopenharmony_ci	sub	%o1,	1,	%o1
1198e1051a39Sopenharmony_ci	retl
1199e1051a39Sopenharmony_ci	nop
1200e1051a39Sopenharmony_ci.type	bn_flip_n_scatter5_t4, #function
1201e1051a39Sopenharmony_ci.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
1202e1051a39Sopenharmony_ci
1203e1051a39Sopenharmony_ci.globl	bn_gather5_t4
1204e1051a39Sopenharmony_ci.align	32
1205e1051a39Sopenharmony_cibn_gather5_t4:
1206e1051a39Sopenharmony_ci___
# bn_gather5_t4: %o0 = output, %o1 = number of 64-bit elements, %o2 = table,
# %o3 = power index. The selection itself is emitted by load_ccr/load_b.
# NOTE(review): both helpers are defined outside this section; presumably
# load_ccr prepares the selector for power %o3 (using %g1 as scratch) and
# load_b then leaves the selected table word in %g1 - confirm there.
1207e1051a39Sopenharmony_ci	&load_ccr("%o2","%o3","%g1");
1208e1051a39Sopenharmony_ci$code.=<<___;
1209e1051a39Sopenharmony_ci	sub	%o1,	1,	%o1
1210e1051a39Sopenharmony_ci.Loop_gather5:
1211e1051a39Sopenharmony_ci___
# One table word per iteration, stored to the output vector right after.
1212e1051a39Sopenharmony_ci	&load_b("%o2","%g1");
1213e1051a39Sopenharmony_ci$code.=<<___;
1214e1051a39Sopenharmony_ci	stx	%g1,	[%o0]
1215e1051a39Sopenharmony_ci	add	%o0,	8,	%o0
1216e1051a39Sopenharmony_ci	brnz	%o1,	.Loop_gather5
1217e1051a39Sopenharmony_ci	sub	%o1,	1,	%o1
1218e1051a39Sopenharmony_ci
1219e1051a39Sopenharmony_ci	retl
1220e1051a39Sopenharmony_ci	nop
1221e1051a39Sopenharmony_ci.type	bn_gather5_t4, #function
1222e1051a39Sopenharmony_ci.size	bn_gather5_t4, .-bn_gather5_t4
1223e1051a39Sopenharmony_ci
1224e1051a39Sopenharmony_ci.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
1225e1051a39Sopenharmony_ci.align	4
1226e1051a39Sopenharmony_ci___
1227e1051a39Sopenharmony_ci
# Hand the accumulated $code over to emit_assembler.
# NOTE(review): emit_assembler is defined outside this section; presumably it
# post-processes the listing and prints it to STDOUT - confirm there.
1228e1051a39Sopenharmony_ci&emit_assembler();
1229e1051a39Sopenharmony_ci
# Buffered write errors only surface at close, so a failing close is fatal
# rather than letting a truncated listing pass for a complete one.
1230e1051a39Sopenharmony_ciclose(STDOUT) || die "error closing STDOUT: $!";
1231