#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter,
# it won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit	+65%
# 1024-bit	+35%
# 2048-bit	+18%
# 4096-bit	+4%

# September 2016
#
# Add multiplication procedure operating on lengths divisible by 4
# and squaring procedure operating on lengths divisible by 8. Length
# is expressed in number of limbs. RSA private key operations are
# ~35-50% faster (more for longer keys) on contemporary high-end POWER
# processors in 64-bit builds, [mysteriously enough] more in 32-bit
# builds. On low-end 32-bit processors performance improvement turned
# out to be marginal...

# ----------------------------------------------------------------------
# Generator set-up: parse the command line, pick the 32- or 64-bit
# instruction vocabulary, derive the stack-frame constants and redirect
# STDOUT through the ppc-xlate.pl translator.
# ----------------------------------------------------------------------

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# The flavour string selects the word size.  Each branch defines the same
# set of globals; the heredocs below interpolate them so that one assembly
# template serves both builds.  A flavour mentioning neither "32" nor "64"
# (including a missing flavour) is fatal.
if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;	# bytes per bignum limb
	$SIZE_T=4;		# bytes per saved register / pointer slot
	$RZONE=	224;		# added to the frame size in $FRAME below
				# (presumably the ABI red-zone allowance)

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$LDX=	"lwzx";		# load indexed
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$STX=	"stwx";		# store indexed
	$STUX=	"stwux";	# store indexed and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UCMP=	"cmplw";	# unsigned compare
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;	# bytes per bignum limb
	$SIZE_T=8;		# bytes per saved register / pointer slot
	$RZONE=	288;		# added to the frame size in $FRAME below
				# (presumably the ABI red-zone allowance)

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$LDX=	"ldx";		# load indexed
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$STX=	"stdx";		# store indexed
	$STUX=	"stdux";	# store indexed and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UCMP=	"cmpld";	# unsigned compare
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# unsigned shift left by immediate
	$PUSH=	$ST;
	$POP=	$LD;
} else { die "nonsense $flavour"; }

# $FRAME is the fixed part of bn_mul_mont_int's stack allocation (the
# variable part is num limbs); $LOCALS is the offset from the new stack
# pointer at which the temporary vector tp[] starts.
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;

# Locate the translator: first next to this script, then in the shared
# perlasm directory two levels up.  $dir keeps its trailing separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything written to STDOUT is piped through the
# translator, which receives the flavour and the output file name.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# Symbolic names for the registers shared by the routines generated below.
# r3..r8 carry the six arguments in the order rp, ap, bp, np, n0, num;
# the generated code reads them under these names.
$sp="r1";	# stack pointer
$toc="r2";
$rp="r3";	# result vector
$ap="r4";	# first multiplicand
$bp="r5";	# second multiplicand
$np="r6";	# modulus
$n0="r7";	# pointer to n0; the routines load the word it points to
$num="r8";	# vector length in limbs

# ----------------------------------------------------------------------
# Generator for .bn_mul_mont_int, the word-by-word Montgomery
# multiplication routine.  The emitted code:
#   * moves the result pointer from r3 to r9 and preloads r3 with 0; on
#     builds with 4-byte limbs it returns that 0 at once when num >= 32;
#   * carves num limbs plus $FRAME bytes off the stack, rounding the new
#     stack pointer down to a 4096-byte boundary, and saves r20-r31 just
#     below the caller's stack pointer;
#   * runs the first pass (L1st) for bp[0], then Louter/Linner for the
#     remaining bp[i], accumulating into the temporary vector tp[] that
#     starts at $LOCALS($sp);
#   * subtracts the modulus into rp[] (Lsub) and then selects, by mask,
#     either tp[] or that difference as the final rp[] (Lcopy), overwriting
#     tp[] as it goes;
#   * restores r20-r31 and the stack pointer and returns 1 in r3.
# The assembly text is appended to the global $code.
# ----------------------------------------------------------------------
{
my $ovf=$rp;	# r3: scratch for the frame size, then the topmost overflow bit
my $rp="r9";	# $rp is reassigned
my $aj="r10";	# current ap[j] word
my $nj="r11";	# current np[j] word
my $tj="r12";	# current tp[j] word; also general scratch
# non-volatile registers
my $i="r20";	# outer-loop byte offset into bp[]
my $j="r21";	# inner-loop byte offset into ap[]/np[]
my $tp="r22";	# running pointer into tp[]
my $m0="r23";	# bp[i]
my $m1="r24";	# tp[0]*n0
my $lo0="r25";	# low/high words of the running ap[j]*bp[i] sum
my $hi0="r26";
my $lo1="r27";	# low/high words of the running np[j]*m1 sum
my $hi1="r28";
my $alo="r29";	# low/high words of the product ap[j]*m0
my $ahi="r30";
my $nlo="r31";	# low word of the product np[j]*m1
#
my $nhi="r0";	# high word of np[j]*m1; r0 is not saved/restored here

$code=<<___;
.machine "any"
.text

.globl	.bn_mul_mont_int
.align	5
.bn_mul_mont_int:
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
___
# 4-byte limbs only: bail out (returning the 0 loaded above) for num >= 32.
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit

.align	4
Lcopy:				; conditional copy
	$LDX	$tj,$tp,$j
	$LDX	$aj,$rp,$j
	and	$tj,$tj,$ovf
	andc	$aj,$aj,$ovf
	$STX	$j,$tp,$j	; zap at once
	or	$aj,$aj,$tj
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0
.size	.bn_mul_mont_int,.-.bn_mul_mont_int
___
}
356e1051a39Sopenharmony_ciif (1) {
357e1051a39Sopenharmony_cimy ($a0,$a1,$a2,$a3,
358e1051a39Sopenharmony_ci    $t0,$t1,$t2,$t3,
359e1051a39Sopenharmony_ci    $m0,$m1,$m2,$m3,
360e1051a39Sopenharmony_ci    $acc0,$acc1,$acc2,$acc3,$acc4,
361e1051a39Sopenharmony_ci    $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
362e1051a39Sopenharmony_cimy  ($carry,$zero) = ($rp,"r0");
363e1051a39Sopenharmony_ci
364e1051a39Sopenharmony_ci# sp----------->+-------------------------------+
365e1051a39Sopenharmony_ci#		| saved sp			|
366e1051a39Sopenharmony_ci#		+-------------------------------+
367e1051a39Sopenharmony_ci#		.				.
368e1051a39Sopenharmony_ci# +8*size_t	+-------------------------------+
369e1051a39Sopenharmony_ci#		| 4 "n0*t0"			|
370e1051a39Sopenharmony_ci#		.				.
371e1051a39Sopenharmony_ci#		.				.
372e1051a39Sopenharmony_ci# +12*size_t	+-------------------------------+
373e1051a39Sopenharmony_ci#		| size_t tmp[num]		|
374e1051a39Sopenharmony_ci#		.				.
375e1051a39Sopenharmony_ci#		.				.
376e1051a39Sopenharmony_ci#		.				.
377e1051a39Sopenharmony_ci#		+-------------------------------+
378e1051a39Sopenharmony_ci#		| topmost carry			|
379e1051a39Sopenharmony_ci#		.				.
380e1051a39Sopenharmony_ci# -18*size_t	+-------------------------------+
381e1051a39Sopenharmony_ci#		| 18 saved gpr, r14-r31		|
382e1051a39Sopenharmony_ci#		.				.
383e1051a39Sopenharmony_ci#		.				.
384e1051a39Sopenharmony_ci#		+-------------------------------+
385e1051a39Sopenharmony_ci$code.=<<___;
386e1051a39Sopenharmony_ci.globl	.bn_mul4x_mont_int
387e1051a39Sopenharmony_ci.align	5
388e1051a39Sopenharmony_ci.bn_mul4x_mont_int:
389e1051a39Sopenharmony_ci	andi.	r0,$num,7
390e1051a39Sopenharmony_ci	bne	.Lmul4x_do
391e1051a39Sopenharmony_ci	$UCMP	$ap,$bp
392e1051a39Sopenharmony_ci	bne	.Lmul4x_do
393e1051a39Sopenharmony_ci	b	.Lsqr8x_do
394e1051a39Sopenharmony_ci.Lmul4x_do:
395e1051a39Sopenharmony_ci	slwi	$num,$num,`log($SIZE_T)/log(2)`
396e1051a39Sopenharmony_ci	mr	$a0,$sp
397e1051a39Sopenharmony_ci	li	$a1,-32*$SIZE_T
398e1051a39Sopenharmony_ci	sub	$a1,$a1,$num
399e1051a39Sopenharmony_ci	$STUX	$sp,$sp,$a1		# alloca
400e1051a39Sopenharmony_ci
401e1051a39Sopenharmony_ci	$PUSH	r14,-$SIZE_T*18($a0)
402e1051a39Sopenharmony_ci	$PUSH	r15,-$SIZE_T*17($a0)
403e1051a39Sopenharmony_ci	$PUSH	r16,-$SIZE_T*16($a0)
404e1051a39Sopenharmony_ci	$PUSH	r17,-$SIZE_T*15($a0)
405e1051a39Sopenharmony_ci	$PUSH	r18,-$SIZE_T*14($a0)
406e1051a39Sopenharmony_ci	$PUSH	r19,-$SIZE_T*13($a0)
407e1051a39Sopenharmony_ci	$PUSH	r20,-$SIZE_T*12($a0)
408e1051a39Sopenharmony_ci	$PUSH	r21,-$SIZE_T*11($a0)
409e1051a39Sopenharmony_ci	$PUSH	r22,-$SIZE_T*10($a0)
410e1051a39Sopenharmony_ci	$PUSH	r23,-$SIZE_T*9($a0)
411e1051a39Sopenharmony_ci	$PUSH	r24,-$SIZE_T*8($a0)
412e1051a39Sopenharmony_ci	$PUSH	r25,-$SIZE_T*7($a0)
413e1051a39Sopenharmony_ci	$PUSH	r26,-$SIZE_T*6($a0)
414e1051a39Sopenharmony_ci	$PUSH	r27,-$SIZE_T*5($a0)
415e1051a39Sopenharmony_ci	$PUSH	r28,-$SIZE_T*4($a0)
416e1051a39Sopenharmony_ci	$PUSH	r29,-$SIZE_T*3($a0)
417e1051a39Sopenharmony_ci	$PUSH	r30,-$SIZE_T*2($a0)
418e1051a39Sopenharmony_ci	$PUSH	r31,-$SIZE_T*1($a0)
419e1051a39Sopenharmony_ci
420e1051a39Sopenharmony_ci	subi	$ap,$ap,$SIZE_T		# bias by -1
421e1051a39Sopenharmony_ci	subi	$np,$np,$SIZE_T		# bias by -1
422e1051a39Sopenharmony_ci	subi	$rp,$rp,$SIZE_T		# bias by -1
423e1051a39Sopenharmony_ci	$LD	$n0,0($n0)		# *n0
424e1051a39Sopenharmony_ci
425e1051a39Sopenharmony_ci	add	$t0,$bp,$num
426e1051a39Sopenharmony_ci	add	$ap_end,$ap,$num
427e1051a39Sopenharmony_ci	subi	$t0,$t0,$SIZE_T*4	# &b[num-4]
428e1051a39Sopenharmony_ci
429e1051a39Sopenharmony_ci	$LD	$bi,$SIZE_T*0($bp)	# b[0]
430e1051a39Sopenharmony_ci	li	$acc0,0
431e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)	# a[0..3]
432e1051a39Sopenharmony_ci	li	$acc1,0
433e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
434e1051a39Sopenharmony_ci	li	$acc2,0
435e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
436e1051a39Sopenharmony_ci	li	$acc3,0
437e1051a39Sopenharmony_ci	$LDU	$a3,$SIZE_T*4($ap)
438e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)	# n[0..3]
439e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
440e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
441e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
442e1051a39Sopenharmony_ci
443e1051a39Sopenharmony_ci	$PUSH	$rp,$SIZE_T*6($sp)	# offload rp and &b[num-4]
444e1051a39Sopenharmony_ci	$PUSH	$t0,$SIZE_T*7($sp)
445e1051a39Sopenharmony_ci	li	$carry,0
446e1051a39Sopenharmony_ci	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
447e1051a39Sopenharmony_ci	li	$cnt,0
448e1051a39Sopenharmony_ci	li	$zero,0
449e1051a39Sopenharmony_ci	b	.Loop_mul4x_1st_reduction
450e1051a39Sopenharmony_ci
451e1051a39Sopenharmony_ci.align	5
452e1051a39Sopenharmony_ci.Loop_mul4x_1st_reduction:
453e1051a39Sopenharmony_ci	$UMULL	$t0,$a0,$bi		# lo(a[0..3]*b[0])
454e1051a39Sopenharmony_ci	addze	$carry,$carry		# modulo-scheduled
455e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$bi
456e1051a39Sopenharmony_ci	addi	$cnt,$cnt,$SIZE_T
457e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$bi
458e1051a39Sopenharmony_ci	andi.	$cnt,$cnt,$SIZE_T*4-1
459e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$bi
460e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
461e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$bi		# hi(a[0..3]*b[0])
462e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
463e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$bi
464e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
465e1051a39Sopenharmony_ci	$UMULL	$mi,$acc0,$n0		# t[0]*n0
466e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
467e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$bi
468e1051a39Sopenharmony_ci	addze	$acc4,$zero
469e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$bi
470e1051a39Sopenharmony_ci	$LDX	$bi,$bp,$cnt		# next b[i] (or b[0])
471e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0
472e1051a39Sopenharmony_ci	# (*)	mul	$t0,$m0,$mi	# lo(n[0..3]*t[0]*n0)
473e1051a39Sopenharmony_ci	$STU	$mi,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
474e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
475e1051a39Sopenharmony_ci	$UMULL	$t1,$m1,$mi
476e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
477e1051a39Sopenharmony_ci	$UMULL	$t2,$m2,$mi
478e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3		# can't overflow
479e1051a39Sopenharmony_ci	$UMULL	$t3,$m3,$mi
480e1051a39Sopenharmony_ci	# (*)	addc	$acc0,$acc0,$t0
481e1051a39Sopenharmony_ci	# (*)	As for removal of first multiplication and addition
482e1051a39Sopenharmony_ci	#	instructions. The outcome of first addition is
483e1051a39Sopenharmony_ci	#	guaranteed to be zero, which leaves two computationally
484e1051a39Sopenharmony_ci	#	significant outcomes: it either carries or not. Then
485e1051a39Sopenharmony_ci	#	question is when does it carry? Is there alternative
486e1051a39Sopenharmony_ci	#	way to deduce it? If you follow operations, you can
487e1051a39Sopenharmony_ci	#	observe that condition for carry is quite simple:
488e1051a39Sopenharmony_ci	#	$acc0 being non-zero. So that carry can be calculated
489e1051a39Sopenharmony_ci	#	by adding -1 to $acc0. That's what next instruction does.
490e1051a39Sopenharmony_ci	addic	$acc0,$acc0,-1		# (*), discarded
491e1051a39Sopenharmony_ci	$UMULH	$t0,$m0,$mi		# hi(n[0..3]*t[0]*n0)
492e1051a39Sopenharmony_ci	adde	$acc0,$acc1,$t1
493e1051a39Sopenharmony_ci	$UMULH	$t1,$m1,$mi
494e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t2
495e1051a39Sopenharmony_ci	$UMULH	$t2,$m2,$mi
496e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t3
497e1051a39Sopenharmony_ci	$UMULH	$t3,$m3,$mi
498e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$carry
499e1051a39Sopenharmony_ci	addze	$carry,$zero
500e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
501e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
502e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
503e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
504e1051a39Sopenharmony_ci	#addze	$carry,$carry
505e1051a39Sopenharmony_ci	bne	.Loop_mul4x_1st_reduction
506e1051a39Sopenharmony_ci
507e1051a39Sopenharmony_ci	$UCMP	$ap_end,$ap
508e1051a39Sopenharmony_ci	beq	.Lmul4x4_post_condition
509e1051a39Sopenharmony_ci
510e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)	# a[4..7]
511e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
512e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
513e1051a39Sopenharmony_ci	$LDU	$a3,$SIZE_T*4($ap)
514e1051a39Sopenharmony_ci	$LD	$mi,$SIZE_T*8($sp)	# a[0]*n0
515e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
516e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
517e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
518e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
519e1051a39Sopenharmony_ci	b	.Loop_mul4x_1st_tail
520e1051a39Sopenharmony_ci
521e1051a39Sopenharmony_ci.align	5
522e1051a39Sopenharmony_ci.Loop_mul4x_1st_tail:
523e1051a39Sopenharmony_ci	$UMULL	$t0,$a0,$bi		# lo(a[4..7]*b[i])
524e1051a39Sopenharmony_ci	addze	$carry,$carry		# modulo-scheduled
525e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$bi
526e1051a39Sopenharmony_ci	addi	$cnt,$cnt,$SIZE_T
527e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$bi
528e1051a39Sopenharmony_ci	andi.	$cnt,$cnt,$SIZE_T*4-1
529e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$bi
530e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
531e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$bi		# hi(a[4..7]*b[i])
532e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
533e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$bi
534e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
535e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$bi
536e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
537e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$bi
538e1051a39Sopenharmony_ci	addze	$acc4,$zero
539e1051a39Sopenharmony_ci	$LDX	$bi,$bp,$cnt		# next b[i] (or b[0])
540e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0
541e1051a39Sopenharmony_ci	$UMULL	$t0,$m0,$mi		# lo(n[4..7]*a[0]*n0)
542e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
543e1051a39Sopenharmony_ci	$UMULL	$t1,$m1,$mi
544e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
545e1051a39Sopenharmony_ci	$UMULL	$t2,$m2,$mi
546e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3		# can't overflow
547e1051a39Sopenharmony_ci	$UMULL	$t3,$m3,$mi
548e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
549e1051a39Sopenharmony_ci	$UMULH	$t0,$m0,$mi		# hi(n[4..7]*a[0]*n0)
550e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
551e1051a39Sopenharmony_ci	$UMULH	$t1,$m1,$mi
552e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
553e1051a39Sopenharmony_ci	$UMULH	$t2,$m2,$mi
554e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
555e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$carry
556e1051a39Sopenharmony_ci	$UMULH	$t3,$m3,$mi
557e1051a39Sopenharmony_ci	addze	$carry,$zero
558e1051a39Sopenharmony_ci	addi	$mi,$sp,$SIZE_T*8
559e1051a39Sopenharmony_ci	$LDX	$mi,$mi,$cnt		# next t[0]*n0
560e1051a39Sopenharmony_ci	$STU	$acc0,$SIZE_T($tp)	# word of result
561e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0
562e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
563e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2
564e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t3
565e1051a39Sopenharmony_ci	#addze	$carry,$carry
566e1051a39Sopenharmony_ci	bne	.Loop_mul4x_1st_tail
567e1051a39Sopenharmony_ci
568e1051a39Sopenharmony_ci	sub	$t1,$ap_end,$num	# rewinded $ap
569e1051a39Sopenharmony_ci	$UCMP	$ap_end,$ap		# done yet?
570e1051a39Sopenharmony_ci	beq	.Lmul4x_proceed
571e1051a39Sopenharmony_ci
572e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)
573e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
574e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
575e1051a39Sopenharmony_ci	$LDU	$a3,$SIZE_T*4($ap)
576e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)
577e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
578e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
579e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
580e1051a39Sopenharmony_ci	b	.Loop_mul4x_1st_tail
581e1051a39Sopenharmony_ci
582e1051a39Sopenharmony_ci.align	5
583e1051a39Sopenharmony_ci.Lmul4x_proceed:
584e1051a39Sopenharmony_ci	$LDU	$bi,$SIZE_T*4($bp)	# *++b
585e1051a39Sopenharmony_ci	addze	$carry,$carry		# topmost carry
586e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($t1)
587e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($t1)
588e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($t1)
589e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($t1)
590e1051a39Sopenharmony_ci	addi	$ap,$t1,$SIZE_T*4
591e1051a39Sopenharmony_ci	sub	$np,$np,$num		# rewind np
592e1051a39Sopenharmony_ci
593e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($tp)	# result
594e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($tp)
595e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)
596e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)
597e1051a39Sopenharmony_ci	$ST	$carry,$SIZE_T*5($tp)	# save topmost carry
598e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*12($sp)	# t[0..3]
599e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*13($sp)
600e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*14($sp)
601e1051a39Sopenharmony_ci	$LD	$acc3,$SIZE_T*15($sp)
602e1051a39Sopenharmony_ci
603e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)	# n[0..3]
604e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
605e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
606e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
607e1051a39Sopenharmony_ci	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
608e1051a39Sopenharmony_ci	li	$carry,0
609e1051a39Sopenharmony_ci	b	.Loop_mul4x_reduction
610e1051a39Sopenharmony_ci
611e1051a39Sopenharmony_ci.align	5
612e1051a39Sopenharmony_ci.Loop_mul4x_reduction:
613e1051a39Sopenharmony_ci	$UMULL	$t0,$a0,$bi		# lo(a[0..3]*b[4])
614e1051a39Sopenharmony_ci	addze	$carry,$carry		# modulo-scheduled
615e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$bi
616e1051a39Sopenharmony_ci	addi	$cnt,$cnt,$SIZE_T
617e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$bi
618e1051a39Sopenharmony_ci	andi.	$cnt,$cnt,$SIZE_T*4-1
619e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$bi
620e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
621e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$bi		# hi(a[0..3]*b[4])
622e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
623e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$bi
624e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
625e1051a39Sopenharmony_ci	$UMULL	$mi,$acc0,$n0		# t[0]*n0
626e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
627e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$bi
628e1051a39Sopenharmony_ci	addze	$acc4,$zero
629e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$bi
630e1051a39Sopenharmony_ci	$LDX	$bi,$bp,$cnt		# next b[i]
631e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0
632e1051a39Sopenharmony_ci	# (*)	mul	$t0,$m0,$mi
633e1051a39Sopenharmony_ci	$STU	$mi,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
634e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
635e1051a39Sopenharmony_ci	$UMULL	$t1,$m1,$mi		# lo(n[0..3]*t[0]*n0)
636e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
637e1051a39Sopenharmony_ci	$UMULL	$t2,$m2,$mi
638e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3		# can't overflow
639e1051a39Sopenharmony_ci	$UMULL	$t3,$m3,$mi
640e1051a39Sopenharmony_ci	# (*)	addc	$acc0,$acc0,$t0
641e1051a39Sopenharmony_ci	addic	$acc0,$acc0,-1		# (*), discarded
642e1051a39Sopenharmony_ci	$UMULH	$t0,$m0,$mi		# hi(n[0..3]*t[0]*n0)
643e1051a39Sopenharmony_ci	adde	$acc0,$acc1,$t1
644e1051a39Sopenharmony_ci	$UMULH	$t1,$m1,$mi
645e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t2
646e1051a39Sopenharmony_ci	$UMULH	$t2,$m2,$mi
647e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t3
648e1051a39Sopenharmony_ci	$UMULH	$t3,$m3,$mi
649e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$carry
650e1051a39Sopenharmony_ci	addze	$carry,$zero
651e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
652e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
653e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
654e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
655e1051a39Sopenharmony_ci	#addze	$carry,$carry
656e1051a39Sopenharmony_ci	bne	.Loop_mul4x_reduction
657e1051a39Sopenharmony_ci
658e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*5($tp)	# t[4..7]
659e1051a39Sopenharmony_ci	addze	$carry,$carry
660e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*6($tp)
661e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*7($tp)
662e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*8($tp)
663e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)	# a[4..7]
664e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
665e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
666e1051a39Sopenharmony_ci	$LDU	$a3,$SIZE_T*4($ap)
667e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
668e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
669e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
670e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
671e1051a39Sopenharmony_ci	#addze	$carry,$carry
672e1051a39Sopenharmony_ci
673e1051a39Sopenharmony_ci	$LD	$mi,$SIZE_T*8($sp)	# t[0]*n0
674e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
675e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
676e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
677e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
678e1051a39Sopenharmony_ci	b	.Loop_mul4x_tail
679e1051a39Sopenharmony_ci
680e1051a39Sopenharmony_ci.align	5
681e1051a39Sopenharmony_ci.Loop_mul4x_tail:
682e1051a39Sopenharmony_ci	$UMULL	$t0,$a0,$bi		# lo(a[4..7]*b[4])
683e1051a39Sopenharmony_ci	addze	$carry,$carry		# modulo-scheduled
684e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$bi
685e1051a39Sopenharmony_ci	addi	$cnt,$cnt,$SIZE_T
686e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$bi
687e1051a39Sopenharmony_ci	andi.	$cnt,$cnt,$SIZE_T*4-1
688e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$bi
689e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
690e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$bi		# hi(a[4..7]*b[4])
691e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
692e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$bi
693e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
694e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$bi
695e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
696e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$bi
697e1051a39Sopenharmony_ci	addze	$acc4,$zero
698e1051a39Sopenharmony_ci	$LDX	$bi,$bp,$cnt		# next b[i]
699e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0
700e1051a39Sopenharmony_ci	$UMULL	$t0,$m0,$mi		# lo(n[4..7]*t[0]*n0)
701e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
702e1051a39Sopenharmony_ci	$UMULL	$t1,$m1,$mi
703e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
704e1051a39Sopenharmony_ci	$UMULL	$t2,$m2,$mi
705e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3		# can't overflow
706e1051a39Sopenharmony_ci	$UMULL	$t3,$m3,$mi
707e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
708e1051a39Sopenharmony_ci	$UMULH	$t0,$m0,$mi		# hi(n[4..7]*t[0]*n0)
709e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
710e1051a39Sopenharmony_ci	$UMULH	$t1,$m1,$mi
711e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
712e1051a39Sopenharmony_ci	$UMULH	$t2,$m2,$mi
713e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
714e1051a39Sopenharmony_ci	$UMULH	$t3,$m3,$mi
715e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$carry
716e1051a39Sopenharmony_ci	addi	$mi,$sp,$SIZE_T*8
717e1051a39Sopenharmony_ci	$LDX	$mi,$mi,$cnt		# next t[0]*n0
718e1051a39Sopenharmony_ci	addze	$carry,$zero
719e1051a39Sopenharmony_ci	$STU	$acc0,$SIZE_T($tp)	# word of result
720e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0
721e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
722e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2
723e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t3
724e1051a39Sopenharmony_ci	#addze	$carry,$carry
725e1051a39Sopenharmony_ci	bne	.Loop_mul4x_tail
726e1051a39Sopenharmony_ci
727e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*5($tp)	# next t[i] or topmost carry
728e1051a39Sopenharmony_ci	sub	$t1,$np,$num		# rewound np?
729e1051a39Sopenharmony_ci	addze	$carry,$carry
730e1051a39Sopenharmony_ci	$UCMP	$ap_end,$ap		# done yet?
731e1051a39Sopenharmony_ci	beq	.Loop_mul4x_break
732e1051a39Sopenharmony_ci
733e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*6($tp)
734e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*7($tp)
735e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*8($tp)
736e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)
737e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
738e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
739e1051a39Sopenharmony_ci	$LDU	$a3,$SIZE_T*4($ap)
740e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
741e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
742e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
743e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
744e1051a39Sopenharmony_ci	#addze	$carry,$carry
745e1051a39Sopenharmony_ci
746e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
747e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
748e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
749e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
750e1051a39Sopenharmony_ci	b	.Loop_mul4x_tail
751e1051a39Sopenharmony_ci
752e1051a39Sopenharmony_ci.align	5
753e1051a39Sopenharmony_ci.Loop_mul4x_break:
754e1051a39Sopenharmony_ci	$POP	$t2,$SIZE_T*6($sp)	# pull rp and &b[num-4]
755e1051a39Sopenharmony_ci	$POP	$t3,$SIZE_T*7($sp)
756e1051a39Sopenharmony_ci	addc	$a0,$acc0,$t0		# accumulate topmost carry
757e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*12($sp)	# t[0..3]
758e1051a39Sopenharmony_ci	addze	$a1,$acc1
759e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*13($sp)
760e1051a39Sopenharmony_ci	addze	$a2,$acc2
761e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*14($sp)
762e1051a39Sopenharmony_ci	addze	$a3,$acc3
763e1051a39Sopenharmony_ci	$LD	$acc3,$SIZE_T*15($sp)
764e1051a39Sopenharmony_ci	addze	$carry,$carry		# topmost carry
765e1051a39Sopenharmony_ci	$ST	$a0,$SIZE_T*1($tp)	# result
766e1051a39Sopenharmony_ci	sub	$ap,$ap_end,$num	# rewind ap
767e1051a39Sopenharmony_ci	$ST	$a1,$SIZE_T*2($tp)
768e1051a39Sopenharmony_ci	$ST	$a2,$SIZE_T*3($tp)
769e1051a39Sopenharmony_ci	$ST	$a3,$SIZE_T*4($tp)
770e1051a39Sopenharmony_ci	$ST	$carry,$SIZE_T*5($tp)	# store topmost carry
771e1051a39Sopenharmony_ci
772e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($t1)	# n[0..3]
773e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($t1)
774e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($t1)
775e1051a39Sopenharmony_ci	$LD	$m3,$SIZE_T*4($t1)
776e1051a39Sopenharmony_ci	addi	$np,$t1,$SIZE_T*4
777e1051a39Sopenharmony_ci	$UCMP	$bp,$t3			# done yet?
778e1051a39Sopenharmony_ci	beq	.Lmul4x_post
779e1051a39Sopenharmony_ci
780e1051a39Sopenharmony_ci	$LDU	$bi,$SIZE_T*4($bp)
781e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)	# a[0..3]
782e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
783e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
784e1051a39Sopenharmony_ci	$LDU	$a3,$SIZE_T*4($ap)
785e1051a39Sopenharmony_ci	li	$carry,0
786e1051a39Sopenharmony_ci	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
787e1051a39Sopenharmony_ci	b	.Loop_mul4x_reduction
788e1051a39Sopenharmony_ci
789e1051a39Sopenharmony_ci.align	5
790e1051a39Sopenharmony_ci.Lmul4x_post:
791e1051a39Sopenharmony_ci	# Final step. We see if result is larger than modulus, and
792e1051a39Sopenharmony_ci	# if it is, subtract the modulus. But comparison implies
793e1051a39Sopenharmony_ci	# subtraction. So we subtract modulus, see if it borrowed,
794e1051a39Sopenharmony_ci	# and conditionally copy original value.
795e1051a39Sopenharmony_ci	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
796e1051a39Sopenharmony_ci	mr	$bp,$t2			# &rp[-1]
797e1051a39Sopenharmony_ci	subi	$cnt,$cnt,1
798e1051a39Sopenharmony_ci	mr	$ap_end,$t2		# &rp[-1] copy
799e1051a39Sopenharmony_ci	subfc	$t0,$m0,$acc0
800e1051a39Sopenharmony_ci	addi	$tp,$sp,$SIZE_T*15
801e1051a39Sopenharmony_ci	subfe	$t1,$m1,$acc1
802e1051a39Sopenharmony_ci
803e1051a39Sopenharmony_ci	mtctr	$cnt
804e1051a39Sopenharmony_ci.Lmul4x_sub:
805e1051a39Sopenharmony_ci	$LD	$m0,$SIZE_T*1($np)
806e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*1($tp)
807e1051a39Sopenharmony_ci	subfe	$t2,$m2,$acc2
808e1051a39Sopenharmony_ci	$LD	$m1,$SIZE_T*2($np)
809e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*2($tp)
810e1051a39Sopenharmony_ci	subfe	$t3,$m3,$acc3
811e1051a39Sopenharmony_ci	$LD	$m2,$SIZE_T*3($np)
812e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*3($tp)
813e1051a39Sopenharmony_ci	$LDU	$m3,$SIZE_T*4($np)
814e1051a39Sopenharmony_ci	$LDU	$acc3,$SIZE_T*4($tp)
815e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($bp)
816e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($bp)
817e1051a39Sopenharmony_ci	subfe	$t0,$m0,$acc0
818e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*3($bp)
819e1051a39Sopenharmony_ci	$STU	$t3,$SIZE_T*4($bp)
820e1051a39Sopenharmony_ci	subfe	$t1,$m1,$acc1
821e1051a39Sopenharmony_ci	bdnz	.Lmul4x_sub
822e1051a39Sopenharmony_ci
823e1051a39Sopenharmony_ci	 $LD	$a0,$SIZE_T*1($ap_end)
824e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($bp)
825e1051a39Sopenharmony_ci	 $LD	$t0,$SIZE_T*12($sp)
826e1051a39Sopenharmony_ci	subfe	$t2,$m2,$acc2
827e1051a39Sopenharmony_ci	 $LD	$a1,$SIZE_T*2($ap_end)
828e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($bp)
829e1051a39Sopenharmony_ci	 $LD	$t1,$SIZE_T*13($sp)
830e1051a39Sopenharmony_ci	subfe	$t3,$m3,$acc3
831e1051a39Sopenharmony_ci	subfe	$carry,$zero,$carry	# did it borrow?
832e1051a39Sopenharmony_ci	 addi	$tp,$sp,$SIZE_T*12
833e1051a39Sopenharmony_ci	 $LD	$a2,$SIZE_T*3($ap_end)
834e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*3($bp)
835e1051a39Sopenharmony_ci	 $LD	$t2,$SIZE_T*14($sp)
836e1051a39Sopenharmony_ci	 $LD	$a3,$SIZE_T*4($ap_end)
837e1051a39Sopenharmony_ci	$ST	$t3,$SIZE_T*4($bp)
838e1051a39Sopenharmony_ci	 $LD	$t3,$SIZE_T*15($sp)
839e1051a39Sopenharmony_ci
840e1051a39Sopenharmony_ci	mtctr	$cnt
841e1051a39Sopenharmony_ci.Lmul4x_cond_copy:
842e1051a39Sopenharmony_ci	and	$t0,$t0,$carry
843e1051a39Sopenharmony_ci	andc	$a0,$a0,$carry
844e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*0($tp)	# wipe stack clean
845e1051a39Sopenharmony_ci	and	$t1,$t1,$carry
846e1051a39Sopenharmony_ci	andc	$a1,$a1,$carry
847e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*1($tp)
848e1051a39Sopenharmony_ci	and	$t2,$t2,$carry
849e1051a39Sopenharmony_ci	andc	$a2,$a2,$carry
850e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*2($tp)
851e1051a39Sopenharmony_ci	and	$t3,$t3,$carry
852e1051a39Sopenharmony_ci	andc	$a3,$a3,$carry
853e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*3($tp)
854e1051a39Sopenharmony_ci	or	$acc0,$t0,$a0
855e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*5($ap_end)
856e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*4($tp)
857e1051a39Sopenharmony_ci	or	$acc1,$t1,$a1
858e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*6($ap_end)
859e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*5($tp)
860e1051a39Sopenharmony_ci	or	$acc2,$t2,$a2
861e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*7($ap_end)
862e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*6($tp)
863e1051a39Sopenharmony_ci	or	$acc3,$t3,$a3
864e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*8($ap_end)
865e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*7($tp)
866e1051a39Sopenharmony_ci	addi	$tp,$tp,$SIZE_T*4
867e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($ap_end)
868e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($ap_end)
869e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($ap_end)
870e1051a39Sopenharmony_ci	$STU	$acc3,$SIZE_T*4($ap_end)
871e1051a39Sopenharmony_ci	bdnz	.Lmul4x_cond_copy
872e1051a39Sopenharmony_ci
873e1051a39Sopenharmony_ci	$POP	$bp,0($sp)		# pull saved sp
874e1051a39Sopenharmony_ci	and	$t0,$t0,$carry
875e1051a39Sopenharmony_ci	andc	$a0,$a0,$carry
876e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*0($tp)
877e1051a39Sopenharmony_ci	and	$t1,$t1,$carry
878e1051a39Sopenharmony_ci	andc	$a1,$a1,$carry
879e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*1($tp)
880e1051a39Sopenharmony_ci	and	$t2,$t2,$carry
881e1051a39Sopenharmony_ci	andc	$a2,$a2,$carry
882e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*2($tp)
883e1051a39Sopenharmony_ci	and	$t3,$t3,$carry
884e1051a39Sopenharmony_ci	andc	$a3,$a3,$carry
885e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*3($tp)
886e1051a39Sopenharmony_ci	or	$acc0,$t0,$a0
887e1051a39Sopenharmony_ci	or	$acc1,$t1,$a1
888e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*4($tp)
889e1051a39Sopenharmony_ci	or	$acc2,$t2,$a2
890e1051a39Sopenharmony_ci	or	$acc3,$t3,$a3
891e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($ap_end)
892e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($ap_end)
893e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($ap_end)
894e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($ap_end)
895e1051a39Sopenharmony_ci
896e1051a39Sopenharmony_ci	b	.Lmul4x_done
897e1051a39Sopenharmony_ci
898e1051a39Sopenharmony_ci.align	4
899e1051a39Sopenharmony_ci.Lmul4x4_post_condition:
900e1051a39Sopenharmony_ci	$POP	$ap,$SIZE_T*6($sp)	# pull &rp[-1]
901e1051a39Sopenharmony_ci	$POP	$bp,0($sp)		# pull saved sp
902e1051a39Sopenharmony_ci	addze	$carry,$carry		# modulo-scheduled
903e1051a39Sopenharmony_ci	# $acc0-3,$carry hold result, $m0-3 hold modulus
904e1051a39Sopenharmony_ci	subfc	$a0,$m0,$acc0
905e1051a39Sopenharmony_ci	subfe	$a1,$m1,$acc1
906e1051a39Sopenharmony_ci	subfe	$a2,$m2,$acc2
907e1051a39Sopenharmony_ci	subfe	$a3,$m3,$acc3
908e1051a39Sopenharmony_ci	subfe	$carry,$zero,$carry	# did it borrow?
909e1051a39Sopenharmony_ci
910e1051a39Sopenharmony_ci	and	$m0,$m0,$carry
911e1051a39Sopenharmony_ci	and	$m1,$m1,$carry
912e1051a39Sopenharmony_ci	addc	$a0,$a0,$m0
913e1051a39Sopenharmony_ci	and	$m2,$m2,$carry
914e1051a39Sopenharmony_ci	adde	$a1,$a1,$m1
915e1051a39Sopenharmony_ci	and	$m3,$m3,$carry
916e1051a39Sopenharmony_ci	adde	$a2,$a2,$m2
917e1051a39Sopenharmony_ci	adde	$a3,$a3,$m3
918e1051a39Sopenharmony_ci
919e1051a39Sopenharmony_ci	$ST	$a0,$SIZE_T*1($ap)	# write result
920e1051a39Sopenharmony_ci	$ST	$a1,$SIZE_T*2($ap)
921e1051a39Sopenharmony_ci	$ST	$a2,$SIZE_T*3($ap)
922e1051a39Sopenharmony_ci	$ST	$a3,$SIZE_T*4($ap)
923e1051a39Sopenharmony_ci
924e1051a39Sopenharmony_ci.Lmul4x_done:
925e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*8($sp)	# wipe stack clean
926e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*9($sp)
927e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*10($sp)
928e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*11($sp)
929e1051a39Sopenharmony_ci	li	r3,1			# signal "done"
930e1051a39Sopenharmony_ci	$POP	r14,-$SIZE_T*18($bp)
931e1051a39Sopenharmony_ci	$POP	r15,-$SIZE_T*17($bp)
932e1051a39Sopenharmony_ci	$POP	r16,-$SIZE_T*16($bp)
933e1051a39Sopenharmony_ci	$POP	r17,-$SIZE_T*15($bp)
934e1051a39Sopenharmony_ci	$POP	r18,-$SIZE_T*14($bp)
935e1051a39Sopenharmony_ci	$POP	r19,-$SIZE_T*13($bp)
936e1051a39Sopenharmony_ci	$POP	r20,-$SIZE_T*12($bp)
937e1051a39Sopenharmony_ci	$POP	r21,-$SIZE_T*11($bp)
938e1051a39Sopenharmony_ci	$POP	r22,-$SIZE_T*10($bp)
939e1051a39Sopenharmony_ci	$POP	r23,-$SIZE_T*9($bp)
940e1051a39Sopenharmony_ci	$POP	r24,-$SIZE_T*8($bp)
941e1051a39Sopenharmony_ci	$POP	r25,-$SIZE_T*7($bp)
942e1051a39Sopenharmony_ci	$POP	r26,-$SIZE_T*6($bp)
943e1051a39Sopenharmony_ci	$POP	r27,-$SIZE_T*5($bp)
944e1051a39Sopenharmony_ci	$POP	r28,-$SIZE_T*4($bp)
945e1051a39Sopenharmony_ci	$POP	r29,-$SIZE_T*3($bp)
946e1051a39Sopenharmony_ci	$POP	r30,-$SIZE_T*2($bp)
947e1051a39Sopenharmony_ci	$POP	r31,-$SIZE_T*1($bp)
948e1051a39Sopenharmony_ci	mr	$sp,$bp
949e1051a39Sopenharmony_ci	blr
950e1051a39Sopenharmony_ci	.long	0
951e1051a39Sopenharmony_ci	.byte	0,12,4,0x20,0x80,18,6,0
952e1051a39Sopenharmony_ci	.long	0
953e1051a39Sopenharmony_ci.size	.bn_mul4x_mont_int,.-.bn_mul4x_mont_int
954e1051a39Sopenharmony_ci___
955e1051a39Sopenharmony_ci}
956e1051a39Sopenharmony_ci
957e1051a39Sopenharmony_ciif (1) {
958e1051a39Sopenharmony_ci########################################################################
959e1051a39Sopenharmony_ci# The following is a PPC adaptation of sqrx8x_mont from the x86_64-mont5 module.
960e1051a39Sopenharmony_ci
961e1051a39Sopenharmony_cimy ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
962e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
963e1051a39Sopenharmony_cimy ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
964e1051a39Sopenharmony_cimy ($cnt,$carry,$zero)=("r30","r31","r0");
965e1051a39Sopenharmony_cimy ($tp,$ap_end,$na0)=($bp,$np,$carry);
966e1051a39Sopenharmony_ci
967e1051a39Sopenharmony_ci# sp----------->+-------------------------------+
968e1051a39Sopenharmony_ci#		| saved sp			|
969e1051a39Sopenharmony_ci#		+-------------------------------+
970e1051a39Sopenharmony_ci#		.				.
971e1051a39Sopenharmony_ci# +12*size_t	+-------------------------------+
972e1051a39Sopenharmony_ci#		| size_t tmp[2*num]		|
973e1051a39Sopenharmony_ci#		.				.
974e1051a39Sopenharmony_ci#		.				.
975e1051a39Sopenharmony_ci#		.				.
976e1051a39Sopenharmony_ci#		+-------------------------------+
977e1051a39Sopenharmony_ci#		.				.
978e1051a39Sopenharmony_ci# -18*size_t	+-------------------------------+
979e1051a39Sopenharmony_ci#		| 18 saved gpr, r14-r31		|
980e1051a39Sopenharmony_ci#		.				.
981e1051a39Sopenharmony_ci#		.				.
982e1051a39Sopenharmony_ci#		+-------------------------------+
983e1051a39Sopenharmony_ci$code.=<<___;
984e1051a39Sopenharmony_ci.align	5
985e1051a39Sopenharmony_ci__bn_sqr8x_mont:
986e1051a39Sopenharmony_ci.Lsqr8x_do:
987e1051a39Sopenharmony_ci	mr	$a0,$sp
988e1051a39Sopenharmony_ci	slwi	$a1,$num,`log($SIZE_T)/log(2)+1`
989e1051a39Sopenharmony_ci	li	$a2,-32*$SIZE_T
990e1051a39Sopenharmony_ci	sub	$a1,$a2,$a1
991e1051a39Sopenharmony_ci	slwi	$num,$num,`log($SIZE_T)/log(2)`
992e1051a39Sopenharmony_ci	$STUX	$sp,$sp,$a1		# alloca
993e1051a39Sopenharmony_ci
994e1051a39Sopenharmony_ci	$PUSH	r14,-$SIZE_T*18($a0)
995e1051a39Sopenharmony_ci	$PUSH	r15,-$SIZE_T*17($a0)
996e1051a39Sopenharmony_ci	$PUSH	r16,-$SIZE_T*16($a0)
997e1051a39Sopenharmony_ci	$PUSH	r17,-$SIZE_T*15($a0)
998e1051a39Sopenharmony_ci	$PUSH	r18,-$SIZE_T*14($a0)
999e1051a39Sopenharmony_ci	$PUSH	r19,-$SIZE_T*13($a0)
1000e1051a39Sopenharmony_ci	$PUSH	r20,-$SIZE_T*12($a0)
1001e1051a39Sopenharmony_ci	$PUSH	r21,-$SIZE_T*11($a0)
1002e1051a39Sopenharmony_ci	$PUSH	r22,-$SIZE_T*10($a0)
1003e1051a39Sopenharmony_ci	$PUSH	r23,-$SIZE_T*9($a0)
1004e1051a39Sopenharmony_ci	$PUSH	r24,-$SIZE_T*8($a0)
1005e1051a39Sopenharmony_ci	$PUSH	r25,-$SIZE_T*7($a0)
1006e1051a39Sopenharmony_ci	$PUSH	r26,-$SIZE_T*6($a0)
1007e1051a39Sopenharmony_ci	$PUSH	r27,-$SIZE_T*5($a0)
1008e1051a39Sopenharmony_ci	$PUSH	r28,-$SIZE_T*4($a0)
1009e1051a39Sopenharmony_ci	$PUSH	r29,-$SIZE_T*3($a0)
1010e1051a39Sopenharmony_ci	$PUSH	r30,-$SIZE_T*2($a0)
1011e1051a39Sopenharmony_ci	$PUSH	r31,-$SIZE_T*1($a0)
1012e1051a39Sopenharmony_ci
1013e1051a39Sopenharmony_ci	subi	$ap,$ap,$SIZE_T		# bias by -1
1014e1051a39Sopenharmony_ci	subi	$t0,$np,$SIZE_T		# bias by -1
1015e1051a39Sopenharmony_ci	subi	$rp,$rp,$SIZE_T		# bias by -1
1016e1051a39Sopenharmony_ci	$LD	$n0,0($n0)		# *n0
1017e1051a39Sopenharmony_ci	li	$zero,0
1018e1051a39Sopenharmony_ci
1019e1051a39Sopenharmony_ci	add	$ap_end,$ap,$num
1020e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)
1021e1051a39Sopenharmony_ci	#li	$acc0,0
1022e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
1023e1051a39Sopenharmony_ci	li	$acc1,0
1024e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
1025e1051a39Sopenharmony_ci	li	$acc2,0
1026e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($ap)
1027e1051a39Sopenharmony_ci	li	$acc3,0
1028e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($ap)
1029e1051a39Sopenharmony_ci	li	$acc4,0
1030e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($ap)
1031e1051a39Sopenharmony_ci	li	$acc5,0
1032e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($ap)
1033e1051a39Sopenharmony_ci	li	$acc6,0
1034e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($ap)
1035e1051a39Sopenharmony_ci	li	$acc7,0
1036e1051a39Sopenharmony_ci
1037e1051a39Sopenharmony_ci	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1038e1051a39Sopenharmony_ci	subic.	$cnt,$num,$SIZE_T*8
1039e1051a39Sopenharmony_ci	b	.Lsqr8x_zero_start
1040e1051a39Sopenharmony_ci
1041e1051a39Sopenharmony_ci.align	5
1042e1051a39Sopenharmony_ci.Lsqr8x_zero:
1043e1051a39Sopenharmony_ci	subic.	$cnt,$cnt,$SIZE_T*8
1044e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*1($tp)
1045e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*2($tp)
1046e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*3($tp)
1047e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*4($tp)
1048e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*5($tp)
1049e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*6($tp)
1050e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*7($tp)
1051e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*8($tp)
1052e1051a39Sopenharmony_ci.Lsqr8x_zero_start:
1053e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*9($tp)
1054e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*10($tp)
1055e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*11($tp)
1056e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*12($tp)
1057e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*13($tp)
1058e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*14($tp)
1059e1051a39Sopenharmony_ci	$ST	$zero,$SIZE_T*15($tp)
1060e1051a39Sopenharmony_ci	$STU	$zero,$SIZE_T*16($tp)
1061e1051a39Sopenharmony_ci	bne	.Lsqr8x_zero
1062e1051a39Sopenharmony_ci
1063e1051a39Sopenharmony_ci	$PUSH	$rp,$SIZE_T*6($sp)	# offload &rp[-1]
1064e1051a39Sopenharmony_ci	$PUSH	$t0,$SIZE_T*7($sp)	# offload &np[-1]
1065e1051a39Sopenharmony_ci	$PUSH	$n0,$SIZE_T*8($sp)	# offload n0
1066e1051a39Sopenharmony_ci	$PUSH	$tp,$SIZE_T*9($sp)	# &tp[2*num-1]
1067e1051a39Sopenharmony_ci	$PUSH	$zero,$SIZE_T*10($sp)	# initial top-most carry
1068e1051a39Sopenharmony_ci	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1069e1051a39Sopenharmony_ci
1070e1051a39Sopenharmony_ci	# Multiply everything but a[i]*a[i]
1071e1051a39Sopenharmony_ci.align	5
1072e1051a39Sopenharmony_ci.Lsqr8x_outer_loop:
1073e1051a39Sopenharmony_ci	#						  a[1]a[0]     (i)
1074e1051a39Sopenharmony_ci	#					      a[2]a[0]
1075e1051a39Sopenharmony_ci	#					  a[3]a[0]
1076e1051a39Sopenharmony_ci	#				      a[4]a[0]
1077e1051a39Sopenharmony_ci	#				  a[5]a[0]
1078e1051a39Sopenharmony_ci	#			      a[6]a[0]
1079e1051a39Sopenharmony_ci	#			  a[7]a[0]
1080e1051a39Sopenharmony_ci	#					  a[2]a[1]	       (ii)
1081e1051a39Sopenharmony_ci	#				      a[3]a[1]
1082e1051a39Sopenharmony_ci	#				  a[4]a[1]
1083e1051a39Sopenharmony_ci	#			      a[5]a[1]
1084e1051a39Sopenharmony_ci	#			  a[6]a[1]
1085e1051a39Sopenharmony_ci	#		      a[7]a[1]
1086e1051a39Sopenharmony_ci	#				  a[3]a[2]		       (iii)
1087e1051a39Sopenharmony_ci	#			      a[4]a[2]
1088e1051a39Sopenharmony_ci	#			  a[5]a[2]
1089e1051a39Sopenharmony_ci	#		      a[6]a[2]
1090e1051a39Sopenharmony_ci	#		  a[7]a[2]
1091e1051a39Sopenharmony_ci	#			  a[4]a[3]			       (iv)
1092e1051a39Sopenharmony_ci	#		      a[5]a[3]
1093e1051a39Sopenharmony_ci	#		  a[6]a[3]
1094e1051a39Sopenharmony_ci	#	      a[7]a[3]
1095e1051a39Sopenharmony_ci	#		  a[5]a[4]				       (v)
1096e1051a39Sopenharmony_ci	#	      a[6]a[4]
1097e1051a39Sopenharmony_ci	#	  a[7]a[4]
1098e1051a39Sopenharmony_ci	#	  a[6]a[5]					       (vi)
1099e1051a39Sopenharmony_ci	#     a[7]a[5]
1100e1051a39Sopenharmony_ci	# a[7]a[6]						       (vii)
1101e1051a39Sopenharmony_ci
1102e1051a39Sopenharmony_ci	$UMULL	$t0,$a1,$a0		# lo(a[1..7]*a[0])		(i)
1103e1051a39Sopenharmony_ci	$UMULL	$t1,$a2,$a0
1104e1051a39Sopenharmony_ci	$UMULL	$t2,$a3,$a0
1105e1051a39Sopenharmony_ci	$UMULL	$t3,$a4,$a0
1106e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0		# t[1]+lo(a[1]*a[0])
1107e1051a39Sopenharmony_ci	$UMULL	$t0,$a5,$a0
1108e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
1109e1051a39Sopenharmony_ci	$UMULL	$t1,$a6,$a0
1110e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
1111e1051a39Sopenharmony_ci	$UMULL	$t2,$a7,$a0
1112e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3
1113e1051a39Sopenharmony_ci	$UMULH	$t3,$a1,$a0		# hi(a[1..7]*a[0])
1114e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t0
1115e1051a39Sopenharmony_ci	$UMULH	$t0,$a2,$a0
1116e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t1
1117e1051a39Sopenharmony_ci	$UMULH	$t1,$a3,$a0
1118e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t2
1119e1051a39Sopenharmony_ci	$UMULH	$t2,$a4,$a0
1120e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($tp)	# t[0]
1121e1051a39Sopenharmony_ci	addze	$acc0,$zero		# t[8]
1122e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($tp)	# t[1]
1123e1051a39Sopenharmony_ci	addc	$acc2,$acc2,$t3		# t[2]+hi(a[1]*a[0])
1124e1051a39Sopenharmony_ci	$UMULH	$t3,$a5,$a0
1125e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t0
1126e1051a39Sopenharmony_ci	$UMULH	$t0,$a6,$a0
1127e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t1
1128e1051a39Sopenharmony_ci	$UMULH	$t1,$a7,$a0
1129e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t2
1130e1051a39Sopenharmony_ci	 $UMULL	$t2,$a2,$a1		# lo(a[2..7]*a[1])		(ii)
1131e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t3
1132e1051a39Sopenharmony_ci	 $UMULL	$t3,$a3,$a1
1133e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t0
1134e1051a39Sopenharmony_ci	 $UMULL	$t0,$a4,$a1
1135e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$t1
1136e1051a39Sopenharmony_ci
1137e1051a39Sopenharmony_ci	$UMULL	$t1,$a5,$a1
1138e1051a39Sopenharmony_ci	addc	$acc3,$acc3,$t2
1139e1051a39Sopenharmony_ci	$UMULL	$t2,$a6,$a1
1140e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3
1141e1051a39Sopenharmony_ci	$UMULL	$t3,$a7,$a1
1142e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t0
1143e1051a39Sopenharmony_ci	$UMULH	$t0,$a2,$a1		# hi(a[2..7]*a[1])
1144e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t1
1145e1051a39Sopenharmony_ci	$UMULH	$t1,$a3,$a1
1146e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t2
1147e1051a39Sopenharmony_ci	$UMULH	$t2,$a4,$a1
1148e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$t3
1149e1051a39Sopenharmony_ci	$UMULH	$t3,$a5,$a1
1150e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)	# t[2]
1151e1051a39Sopenharmony_ci	addze	$acc1,$zero		# t[9]
1152e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)	# t[3]
1153e1051a39Sopenharmony_ci	addc	$acc4,$acc4,$t0
1154e1051a39Sopenharmony_ci	$UMULH	$t0,$a6,$a1
1155e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t1
1156e1051a39Sopenharmony_ci	$UMULH	$t1,$a7,$a1
1157e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t2
1158e1051a39Sopenharmony_ci	 $UMULL	$t2,$a3,$a2		# lo(a[3..7]*a[2])		(iii)
1159e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t3
1160e1051a39Sopenharmony_ci	 $UMULL	$t3,$a4,$a2
1161e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$t0
1162e1051a39Sopenharmony_ci	 $UMULL	$t0,$a5,$a2
1163e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1164e1051a39Sopenharmony_ci
1165e1051a39Sopenharmony_ci	$UMULL	$t1,$a6,$a2
1166e1051a39Sopenharmony_ci	addc	$acc5,$acc5,$t2
1167e1051a39Sopenharmony_ci	$UMULL	$t2,$a7,$a2
1168e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t3
1169e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$a2		# hi(a[3..7]*a[2])
1170e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t0
1171e1051a39Sopenharmony_ci	$UMULH	$t0,$a4,$a2
1172e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$t1
1173e1051a39Sopenharmony_ci	$UMULH	$t1,$a5,$a2
1174e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t2
1175e1051a39Sopenharmony_ci	$UMULH	$t2,$a6,$a2
1176e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($tp)	# t[4]
1177e1051a39Sopenharmony_ci	addze	$acc2,$zero		# t[10]
1178e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($tp)	# t[5]
1179e1051a39Sopenharmony_ci	addc	$acc6,$acc6,$t3
1180e1051a39Sopenharmony_ci	$UMULH	$t3,$a7,$a2
1181e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t0
1182e1051a39Sopenharmony_ci	 $UMULL	$t0,$a4,$a3		# lo(a[4..7]*a[3])		(iv)
1183e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$t1
1184e1051a39Sopenharmony_ci	 $UMULL	$t1,$a5,$a3
1185e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t2
1186e1051a39Sopenharmony_ci	 $UMULL	$t2,$a6,$a3
1187e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t3
1188e1051a39Sopenharmony_ci
1189e1051a39Sopenharmony_ci	$UMULL	$t3,$a7,$a3
1190e1051a39Sopenharmony_ci	addc	$acc7,$acc7,$t0
1191e1051a39Sopenharmony_ci	$UMULH	$t0,$a4,$a3		# hi(a[4..7]*a[3])
1192e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$t1
1193e1051a39Sopenharmony_ci	$UMULH	$t1,$a5,$a3
1194e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t2
1195e1051a39Sopenharmony_ci	$UMULH	$t2,$a6,$a3
1196e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t3
1197e1051a39Sopenharmony_ci	$UMULH	$t3,$a7,$a3
1198e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($tp)	# t[6]
1199e1051a39Sopenharmony_ci	addze	$acc3,$zero		# t[11]
1200e1051a39Sopenharmony_ci	$STU	$acc7,$SIZE_T*8($tp)	# t[7]
1201e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
1202e1051a39Sopenharmony_ci	 $UMULL	$t0,$a5,$a4		# lo(a[5..7]*a[4])		(v)
1203e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1204e1051a39Sopenharmony_ci	 $UMULL	$t1,$a6,$a4
1205e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
1206e1051a39Sopenharmony_ci	 $UMULL	$t2,$a7,$a4
1207e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1208e1051a39Sopenharmony_ci
1209e1051a39Sopenharmony_ci	$UMULH	$t3,$a5,$a4		# hi(a[5..7]*a[4])
1210e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0
1211e1051a39Sopenharmony_ci	$UMULH	$t0,$a6,$a4
1212e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
1213e1051a39Sopenharmony_ci	$UMULH	$t1,$a7,$a4
1214e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
1215e1051a39Sopenharmony_ci	 $UMULL	$t2,$a6,$a5		# lo(a[6..7]*a[5])		(vi)
1216e1051a39Sopenharmony_ci	addze	$acc4,$zero		# t[12]
1217e1051a39Sopenharmony_ci	addc	$acc2,$acc2,$t3
1218e1051a39Sopenharmony_ci	 $UMULL	$t3,$a7,$a5
1219e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t0
1220e1051a39Sopenharmony_ci	 $UMULH	$t0,$a6,$a5		# hi(a[6..7]*a[5])
1221e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t1
1222e1051a39Sopenharmony_ci
1223e1051a39Sopenharmony_ci	$UMULH	$t1,$a7,$a5
1224e1051a39Sopenharmony_ci	addc	$acc3,$acc3,$t2
1225e1051a39Sopenharmony_ci	 $UMULL	$t2,$a7,$a6		# lo(a[7]*a[6])			(vii)
1226e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3
1227e1051a39Sopenharmony_ci	 $UMULH	$t3,$a7,$a6		# hi(a[7]*a[6])
1228e1051a39Sopenharmony_ci	addze	$acc5,$zero		# t[13]
1229e1051a39Sopenharmony_ci	addc	$acc4,$acc4,$t0
1230e1051a39Sopenharmony_ci	$UCMP	$ap_end,$ap		# done yet?
1231e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t1
1232e1051a39Sopenharmony_ci
1233e1051a39Sopenharmony_ci	addc	$acc5,$acc5,$t2
1234e1051a39Sopenharmony_ci	sub	$t0,$ap_end,$num	# rewound ap
1235e1051a39Sopenharmony_ci	addze	$acc6,$zero		# t[14]
1236e1051a39Sopenharmony_ci	add	$acc6,$acc6,$t3
1237e1051a39Sopenharmony_ci
1238e1051a39Sopenharmony_ci	beq	.Lsqr8x_outer_break
1239e1051a39Sopenharmony_ci
1240e1051a39Sopenharmony_ci	mr	$n0,$a0
1241e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($tp)
1242e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($tp)
1243e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($tp)
1244e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($tp)
1245e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($tp)
1246e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($tp)
1247e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($tp)
1248e1051a39Sopenharmony_ci	$LD	$a7,$SIZE_T*8($tp)
1249e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$a0
1250e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)
1251e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$a1
1252e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
1253e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1254e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
1255e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1256e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($ap)
1257e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1258e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($ap)
1259e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1260e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($ap)
1261e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1262e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($ap)
1263e1051a39Sopenharmony_ci	subi	$rp,$ap,$SIZE_T*7
1264e1051a39Sopenharmony_ci	addze	$acc7,$a7
1265e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($ap)
1266e1051a39Sopenharmony_ci	#addze	$carry,$zero		# moved below
1267e1051a39Sopenharmony_ci	li	$cnt,0
1268e1051a39Sopenharmony_ci	b	.Lsqr8x_mul
1269e1051a39Sopenharmony_ci
1270e1051a39Sopenharmony_ci	#                                                          a[8]a[0]
1271e1051a39Sopenharmony_ci	#                                                      a[9]a[0]
1272e1051a39Sopenharmony_ci	#                                                  a[a]a[0]
1273e1051a39Sopenharmony_ci	#                                              a[b]a[0]
1274e1051a39Sopenharmony_ci	#                                          a[c]a[0]
1275e1051a39Sopenharmony_ci	#                                      a[d]a[0]
1276e1051a39Sopenharmony_ci	#                                  a[e]a[0]
1277e1051a39Sopenharmony_ci	#                              a[f]a[0]
1278e1051a39Sopenharmony_ci	#                                                      a[8]a[1]
1279e1051a39Sopenharmony_ci	#                          a[f]a[1]........................
1280e1051a39Sopenharmony_ci	#                                                  a[8]a[2]
1281e1051a39Sopenharmony_ci	#                      a[f]a[2]........................
1282e1051a39Sopenharmony_ci	#                                              a[8]a[3]
1283e1051a39Sopenharmony_ci	#                  a[f]a[3]........................
1284e1051a39Sopenharmony_ci	#                                          a[8]a[4]
1285e1051a39Sopenharmony_ci	#              a[f]a[4]........................
1286e1051a39Sopenharmony_ci	#                                      a[8]a[5]
1287e1051a39Sopenharmony_ci	#          a[f]a[5]........................
1288e1051a39Sopenharmony_ci	#                                  a[8]a[6]
1289e1051a39Sopenharmony_ci	#      a[f]a[6]........................
1290e1051a39Sopenharmony_ci	#                              a[8]a[7]
1291e1051a39Sopenharmony_ci	#  a[f]a[7]........................
1292e1051a39Sopenharmony_ci.align	5
	# Inner multiply-accumulate loop. Each pass multiplies the eight
	# limbs held in a0-a7 by the single limb in n0, adds the lo() and
	# hi() halves of the products to the acc0-acc7 window, stores the
	# finished lowest limb through tp and slides the window down by
	# one limb. cnt steps by one limb per pass and wraps after eight,
	# so the body runs eight times before falling through.
	# NOTE(review): UMULL/UMULH/STU/LDU/LDX are macros defined outside
	# this chunk; they are read here as multiply-low, multiply-high,
	# store/load with update and indexed load - confirm.
1293e1051a39Sopenharmony_ci.Lsqr8x_mul:
1294e1051a39Sopenharmony_ci	$UMULL	$t0,$a0,$n0		# lo() halves of a0-a7 times n0
1295e1051a39Sopenharmony_ci	addze	$carry,$zero		# carry bit, modulo-scheduled
1296e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$n0
1297e1051a39Sopenharmony_ci	addi	$cnt,$cnt,$SIZE_T	# offset of the next multiplier limb
1298e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$n0
1299e1051a39Sopenharmony_ci	andi.	$cnt,$cnt,$SIZE_T*8-1	# wrap after 8 limbs, CR0 feeds bne below
1300e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$n0
1301e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# start of the lo() carry chain
1302e1051a39Sopenharmony_ci	$UMULL	$t0,$a4,$n0
1303e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1304e1051a39Sopenharmony_ci	$UMULL	$t1,$a5,$n0
1305e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
1306e1051a39Sopenharmony_ci	$UMULL	$t2,$a6,$n0
1307e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1308e1051a39Sopenharmony_ci	$UMULL	$t3,$a7,$n0
1309e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t0
1310e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$n0		# hi() halves of a0-a7 times n0
1311e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t1
1312e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$n0
1313e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t2
1314e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$n0
1315e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t3
1316e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$n0
1317e1051a39Sopenharmony_ci	addze	$carry,$carry		# collect carry out of the lo() chain
1318e1051a39Sopenharmony_ci	$STU	$acc0,$SIZE_T($tp)	# lowest limb is final, write it out
1319e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0		# add hi() while shifting window down
1320e1051a39Sopenharmony_ci	$UMULH	$t0,$a4,$n0
1321e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
1322e1051a39Sopenharmony_ci	$UMULH	$t1,$a5,$n0
1323e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2
1324e1051a39Sopenharmony_ci	$UMULH	$t2,$a6,$n0
1325e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t3
1326e1051a39Sopenharmony_ci	$UMULH	$t3,$a7,$n0
1327e1051a39Sopenharmony_ci	$LDX	$n0,$rp,$cnt		# next multiplier limb, from rp+cnt
1328e1051a39Sopenharmony_ci	adde	$acc4,$acc5,$t0
1329e1051a39Sopenharmony_ci	adde	$acc5,$acc6,$t1
1330e1051a39Sopenharmony_ci	adde	$acc6,$acc7,$t2
1331e1051a39Sopenharmony_ci	adde	$acc7,$carry,$t3	# new top limb starts from saved carry
1332e1051a39Sopenharmony_ci	#addze	$carry,$zero		# moved above
1333e1051a39Sopenharmony_ci	bne	.Lsqr8x_mul		# until cnt has wrapped to zero
1334e1051a39Sopenharmony_ci					# note that carry flag is guaranteed
1335e1051a39Sopenharmony_ci					# to be zero at this point
1336e1051a39Sopenharmony_ci	$UCMP	$ap,$ap_end		# done yet?
1337e1051a39Sopenharmony_ci	beq	.Lsqr8x_break
1338e1051a39Sopenharmony_ci
1339e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($tp)	# not done: fetch next 8 limbs of tp...
1340e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($tp)
1341e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($tp)
1342e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($tp)
1343e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($tp)
1344e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($tp)
1345e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($tp)
1346e1051a39Sopenharmony_ci	$LD	$a7,$SIZE_T*8($tp)
1347e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$a0		# ...add them to the window...
1348e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($ap)	# ...and refill a0-a7 from ap
1349e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$a1
1350e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($ap)
1351e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1352e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($ap)
1353e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1354e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($ap)
1355e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1356e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($ap)
1357e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1358e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($ap)
1359e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1360e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($ap)
1361e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a7
1362e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($ap)	# presumably also advances ap by 8 limbs
1363e1051a39Sopenharmony_ci	#addze	$carry,$zero		# moved above
1364e1051a39Sopenharmony_ci	b	.Lsqr8x_mul		# carry of the chain is picked up there
1365e1051a39Sopenharmony_ci
1366e1051a39Sopenharmony_ci.align	5
	# Entered from the inner loop once ap has reached ap_end. Loads
	# the eight limbs at rp+8..rp+15 into a0-a7 and moves ap to
	# rp+15 limbs. If ap then equals ap_end this was the last
	# iteration and the accumulators are carried into the outer loop
	# as they are; otherwise the window is written out to tp, tp is
	# moved back by the number of bytes still left (ap_end-ap) and
	# the accumulators are reloaded from that position.
1367e1051a39Sopenharmony_ci.Lsqr8x_break:
1368e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*8($rp)
1369e1051a39Sopenharmony_ci	addi	$ap,$rp,$SIZE_T*15	# ap = rp + 15 limbs
1370e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*9($rp)
1371e1051a39Sopenharmony_ci	sub.	$t0,$ap_end,$ap		# is it last iteration?
1372e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*10($rp)
1373e1051a39Sopenharmony_ci	sub	$t1,$tp,$t0		# t1 = tp - (ap_end - ap)
1374e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*11($rp)
1375e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*12($rp)
1376e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*13($rp)
1377e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*14($rp)
1378e1051a39Sopenharmony_ci	$LD	$a7,$SIZE_T*15($rp)
1379e1051a39Sopenharmony_ci	beq	.Lsqr8x_outer_loop	# yes: keep window in registers
1380e1051a39Sopenharmony_ci
1381e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($tp)	# no: spill the window at tp...
1382e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*1($t1)	# ...and reload it from t1
1383e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($tp)
1384e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*2($t1)
1385e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)
1386e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*3($t1)
1387e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)
1388e1051a39Sopenharmony_ci	$LD	$acc3,$SIZE_T*4($t1)
1389e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($tp)
1390e1051a39Sopenharmony_ci	$LD	$acc4,$SIZE_T*5($t1)
1391e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($tp)
1392e1051a39Sopenharmony_ci	$LD	$acc5,$SIZE_T*6($t1)
1393e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($tp)
1394e1051a39Sopenharmony_ci	$LD	$acc6,$SIZE_T*7($t1)
1395e1051a39Sopenharmony_ci	$ST	$acc7,$SIZE_T*8($tp)
1396e1051a39Sopenharmony_ci	$LD	$acc7,$SIZE_T*8($t1)
1397e1051a39Sopenharmony_ci	mr	$tp,$t1			# continue from the moved-back tp
1398e1051a39Sopenharmony_ci	b	.Lsqr8x_outer_loop
1399e1051a39Sopenharmony_ci
1400e1051a39Sopenharmony_ci.align	5
1401e1051a39Sopenharmony_ci.Lsqr8x_outer_break:
1402e1051a39Sopenharmony_ci	####################################################################
1403e1051a39Sopenharmony_ci	# Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	#
	# Every tp limb is doubled as (t<<1)|(top bit of the limb below
	# it), and the lo()/hi() halves of the squares are added in one
	# addc/adde carry chain. The loop handles four limbs of a[], that
	# is eight limbs of tp, per pass; ctr counts the passes.
	# NOTE(review): num looks like a byte count (it is subtracted from
	# a pointer where ap is rewound), which would make cnt the number
	# of four-limb groups minus one - confirm against the prologue.
1404e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*1($t0)	# recall that $t0 is &a[-1]
1405e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*2($t0)
1406e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*3($t0)
1407e1051a39Sopenharmony_ci	$LD	$a7,$SIZE_T*4($t0)
1408e1051a39Sopenharmony_ci	addi	$ap,$t0,$SIZE_T*4	# ap = &a[3]
1409e1051a39Sopenharmony_ci					# "tp[x]" comments are for num==8 case
1410e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*13($sp)	# =tp[1], t[0] is not interesting
1411e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*14($sp)
1412e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*15($sp)
1413e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*16($sp)
1414e1051a39Sopenharmony_ci
1415e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
1416e1051a39Sopenharmony_ci	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
1417e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($tp)
1418e1051a39Sopenharmony_ci	subi	$cnt,$cnt,1		# final pass is peeled after the loop
1419e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)
1420e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)
1421e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($tp)
1422e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($tp)
1423e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($tp)
1424e1051a39Sopenharmony_ci	#$ST	$acc7,$SIZE_T*8($tp)	# tp[15] is not interesting
1425e1051a39Sopenharmony_ci	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1426e1051a39Sopenharmony_ci	$UMULL	$acc0,$a1,$a1		# lo(a[0]*a[0]) is result limb 0
1427e1051a39Sopenharmony_ci	$UMULH	$a1,$a1,$a1		# hi(a[0]*a[0])
1428e1051a39Sopenharmony_ci	add	$acc1,$t1,$t1		# <<1
1429e1051a39Sopenharmony_ci	$SHRI	$t1,$t1,$BITS-1		# top bit, moves into the next limb
1430e1051a39Sopenharmony_ci	$UMULL	$a2,$a3,$a3
1431e1051a39Sopenharmony_ci	$UMULH	$a3,$a3,$a3
1432e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$a1		# carry chain starts here
1433e1051a39Sopenharmony_ci	add	$acc2,$t2,$t2
1434e1051a39Sopenharmony_ci	$SHRI	$t2,$t2,$BITS-1
1435e1051a39Sopenharmony_ci	add	$acc3,$t3,$t3
1436e1051a39Sopenharmony_ci	$SHRI	$t3,$t3,$BITS-1
1437e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t1
1438e1051a39Sopenharmony_ci
1439e1051a39Sopenharmony_ci	mtctr	$cnt			# pass count for bdnz
1440e1051a39Sopenharmony_ci.Lsqr4x_shift_n_add:
1441e1051a39Sopenharmony_ci	$UMULL	$a4,$a5,$a5
1442e1051a39Sopenharmony_ci	$UMULH	$a5,$a5,$a5
1443e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*6($tp)	# =tp[5]
1444e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*1($ap)
1445e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1446e1051a39Sopenharmony_ci	add	$acc4,$t0,$t0
1447e1051a39Sopenharmony_ci	$SHRI	$t0,$t0,$BITS-1
1448e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t2
1449e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*7($tp)	# =tp[6]
1450e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1451e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*2($ap)
1452e1051a39Sopenharmony_ci	add	$acc5,$t1,$t1
1453e1051a39Sopenharmony_ci	$SHRI	$t1,$t1,$BITS-1
1454e1051a39Sopenharmony_ci	or	$acc4,$acc4,$t3
1455e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*8($tp)	# =tp[7]
1456e1051a39Sopenharmony_ci	$UMULL	$a6,$a7,$a7
1457e1051a39Sopenharmony_ci	$UMULH	$a7,$a7,$a7
1458e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1459e1051a39Sopenharmony_ci	add	$acc6,$t2,$t2
1460e1051a39Sopenharmony_ci	$SHRI	$t2,$t2,$BITS-1
1461e1051a39Sopenharmony_ci	or	$acc5,$acc5,$t0
1462e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*9($tp)	# =tp[8]
1463e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1464e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*3($ap)
1465e1051a39Sopenharmony_ci	add	$acc7,$t3,$t3
1466e1051a39Sopenharmony_ci	$SHRI	$t3,$t3,$BITS-1
1467e1051a39Sopenharmony_ci	or	$acc6,$acc6,$t1
1468e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*10($tp)	# =tp[9]
1469e1051a39Sopenharmony_ci	$UMULL	$a0,$a1,$a1
1470e1051a39Sopenharmony_ci	$UMULH	$a1,$a1,$a1
1471e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1472e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($tp)	# tp[0]=
1473e1051a39Sopenharmony_ci	add	$acc0,$t0,$t0
1474e1051a39Sopenharmony_ci	$SHRI	$t0,$t0,$BITS-1
1475e1051a39Sopenharmony_ci	or	$acc7,$acc7,$t2
1476e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*11($tp)	# =tp[10]
1477e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a7
1478e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*4($ap)	# last of the next four limbs of a[]
1479e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($tp)	# tp[1]=
1480e1051a39Sopenharmony_ci	add	$acc1,$t1,$t1
1481e1051a39Sopenharmony_ci	$SHRI	$t1,$t1,$BITS-1
1482e1051a39Sopenharmony_ci	or	$acc0,$acc0,$t3
1483e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*12($tp)	# =tp[11]
1484e1051a39Sopenharmony_ci	$UMULL	$a2,$a3,$a3
1485e1051a39Sopenharmony_ci	$UMULH	$a3,$a3,$a3
1486e1051a39Sopenharmony_ci	adde	$acc0,$acc0,$a0
1487e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)	# tp[2]=
1488e1051a39Sopenharmony_ci	add	$acc2,$t2,$t2
1489e1051a39Sopenharmony_ci	$SHRI	$t2,$t2,$BITS-1
1490e1051a39Sopenharmony_ci	or	$acc1,$acc1,$t0
1491e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*13($tp)	# =tp[12]
1492e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$a1
1493e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)	# tp[3]=
1494e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($tp)	# tp[4]=
1495e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($tp)	# tp[5]=
1496e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($tp)	# tp[6]=
1497e1051a39Sopenharmony_ci	$STU	$acc7,$SIZE_T*8($tp)	# tp[7]=
1498e1051a39Sopenharmony_ci	add	$acc3,$t3,$t3
1499e1051a39Sopenharmony_ci	$SHRI	$t3,$t3,$BITS-1
1500e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t1
1501e1051a39Sopenharmony_ci	bdnz	.Lsqr4x_shift_n_add	# CA stays live across the branch
1502e1051a39Sopenharmony_ci___
1503e1051a39Sopenharmony_cimy ($np,$np_end)=($ap,$ap_end);	# np/np_end take over the values of ap/ap_end for the code emitted below
1504e1051a39Sopenharmony_ci$code.=<<___;
1505e1051a39Sopenharmony_ci	 $POP	$np,$SIZE_T*7($sp)	# pull &np[-1] and n0
1506e1051a39Sopenharmony_ci	 $POP	$n0,$SIZE_T*8($sp)
1507e1051a39Sopenharmony_ci
	# Last, peeled pass of the shift-and-add loop: finishes doubling
	# the top limbs of tp and adding the remaining squares. The
	# instructions indented by an extra space are interleaved work
	# for the reduction that follows: they load the modulus limbs
	# from np and the low limbs of tp.
1508e1051a39Sopenharmony_ci	$UMULL	$a4,$a5,$a5
1509e1051a39Sopenharmony_ci	$UMULH	$a5,$a5,$a5
1510e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
1511e1051a39Sopenharmony_ci	 $LD	$acc0,$SIZE_T*12($sp)	# =tp[0]
1512e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*6($tp)	# =tp[13]
1513e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1514e1051a39Sopenharmony_ci	add	$acc4,$t0,$t0
1515e1051a39Sopenharmony_ci	$SHRI	$t0,$t0,$BITS-1
1516e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t2
1517e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*7($tp)	# =tp[14]
1518e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1519e1051a39Sopenharmony_ci	add	$acc5,$t1,$t1
1520e1051a39Sopenharmony_ci	$SHRI	$t1,$t1,$BITS-1
1521e1051a39Sopenharmony_ci	or	$acc4,$acc4,$t3
1522e1051a39Sopenharmony_ci	$UMULL	$a6,$a7,$a7
1523e1051a39Sopenharmony_ci	$UMULH	$a7,$a7,$a7
1524e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1525e1051a39Sopenharmony_ci	add	$acc6,$t2,$t2
1526e1051a39Sopenharmony_ci	$SHRI	$t2,$t2,$BITS-1
1527e1051a39Sopenharmony_ci	or	$acc5,$acc5,$t0
1528e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($tp)	# tp[9]=
1529e1051a39Sopenharmony_ci	 $LD	$acc1,$SIZE_T*13($sp)	# =tp[1]
1530e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1531e1051a39Sopenharmony_ci	or	$acc6,$acc6,$t1
1532e1051a39Sopenharmony_ci	 $LD	$a0,$SIZE_T*1($np)
1533e1051a39Sopenharmony_ci	 $LD	$a1,$SIZE_T*2($np)
1534e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1535e1051a39Sopenharmony_ci	 $LD	$a2,$SIZE_T*3($np)
1536e1051a39Sopenharmony_ci	 $LD	$a3,$SIZE_T*4($np)
1537e1051a39Sopenharmony_ci	adde	$acc7,$a7,$t2		# top limb: hi() of last square + shifted-out bit
1538e1051a39Sopenharmony_ci	 $LD	$a4,$SIZE_T*5($np)
1539e1051a39Sopenharmony_ci	 $LD	$a5,$SIZE_T*6($np)
1540e1051a39Sopenharmony_ci
1541e1051a39Sopenharmony_ci	################################################################
1542e1051a39Sopenharmony_ci	# Reduce by 8 limbs per iteration
1543e1051a39Sopenharmony_ci	$UMULL	$na0,$n0,$acc0		# t[0]*n0
1544e1051a39Sopenharmony_ci	li	$cnt,8			# eight passes per window (via ctr)
1545e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($np)
1546e1051a39Sopenharmony_ci	add	$np_end,$np,$num	# np_end = np + num
1547e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($np)
1548e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)	# tp[10]=
1549e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*14($sp)	# =tp[2]
1550e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)	# tp[11]=
1551e1051a39Sopenharmony_ci	$LD	$acc3,$SIZE_T*15($sp)	# =tp[3]
1552e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($tp)	# tp[12]=
1553e1051a39Sopenharmony_ci	$LD	$acc4,$SIZE_T*16($sp)	# =tp[4]
1554e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($tp)	# tp[13]=
1555e1051a39Sopenharmony_ci	$LD	$acc5,$SIZE_T*17($sp)	# =tp[5]
1556e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($tp)	# tp[14]=
1557e1051a39Sopenharmony_ci	$LD	$acc6,$SIZE_T*18($sp)	# =tp[6]
1558e1051a39Sopenharmony_ci	$ST	$acc7,$SIZE_T*8($tp)	# tp[15]=
1559e1051a39Sopenharmony_ci	$LD	$acc7,$SIZE_T*19($sp)	# =tp[7]
1560e1051a39Sopenharmony_ci	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1561e1051a39Sopenharmony_ci	mtctr	$cnt
1562e1051a39Sopenharmony_ci	b	.Lsqr8x_reduction
1563e1051a39Sopenharmony_ci
1564e1051a39Sopenharmony_ci.align	5
	# One window of the reduction; ctr counts eight passes. Each pass
	# adds n[0-7]*na0 to the acc0-acc7 window, with na0 = lo(acc0*n0),
	# and slides the window down by one limb. Every na0 is also
	# stored through tp so that the tail loop can reuse it as a
	# multiplier for the remaining limbs of the modulus.
	#
	# (*) The lowest multiply-and-add is not executed. NOTE(review):
	# assuming n0 is the usual Montgomery constant, lo(n[0]*na0)+acc0
	# is zero modulo the word size, so only its carry matters; addic
	# with -1 sets CA exactly when acc0 is non-zero, which would be
	# that carry - confirm against the notes elsewhere in the file.
1565e1051a39Sopenharmony_ci.Lsqr8x_reduction:
1566e1051a39Sopenharmony_ci	# (*)	$UMULL	$t0,$a0,$na0	# lo(n[0-7])*lo(t[0]*n0)
1567e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$na0
1568e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$na0
1569e1051a39Sopenharmony_ci	$STU	$na0,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
1570e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$na0
1571e1051a39Sopenharmony_ci	# (*)	addc	$acc0,$acc0,$t0
1572e1051a39Sopenharmony_ci	addic	$acc0,$acc0,-1		# (*)
1573e1051a39Sopenharmony_ci	$UMULL	$t0,$a4,$na0
1574e1051a39Sopenharmony_ci	adde	$acc0,$acc1,$t1		# window slides down while adding lo()
1575e1051a39Sopenharmony_ci	$UMULL	$t1,$a5,$na0
1576e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t2
1577e1051a39Sopenharmony_ci	$UMULL	$t2,$a6,$na0
1578e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t3
1579e1051a39Sopenharmony_ci	$UMULL	$t3,$a7,$na0
1580e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t0
1581e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$na0		# hi(n[0-7])*lo(t[0]*n0)
1582e1051a39Sopenharmony_ci	adde	$acc4,$acc5,$t1
1583e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$na0
1584e1051a39Sopenharmony_ci	adde	$acc5,$acc6,$t2
1585e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$na0
1586e1051a39Sopenharmony_ci	adde	$acc6,$acc7,$t3
1587e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$na0
1588e1051a39Sopenharmony_ci	addze	$acc7,$zero		# carry out of the lo() chain
1589e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# second chain adds the hi() halves
1590e1051a39Sopenharmony_ci	$UMULH	$t0,$a4,$na0
1591e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1592e1051a39Sopenharmony_ci	$UMULH	$t1,$a5,$na0
1593e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
1594e1051a39Sopenharmony_ci	$UMULH	$t2,$a6,$na0
1595e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1596e1051a39Sopenharmony_ci	$UMULH	$t3,$a7,$na0
1597e1051a39Sopenharmony_ci	$UMULL	$na0,$n0,$acc0		# next t[0]*n0
1598e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t0
1599e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t1
1600e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t2
1601e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t3
1602e1051a39Sopenharmony_ci	bdnz	.Lsqr8x_reduction
1603e1051a39Sopenharmony_ci
1604e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*1($tp)	# add the next eight limbs of tp
1605e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*2($tp)
1606e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*3($tp)
1607e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*4($tp)
1608e1051a39Sopenharmony_ci	subi	$rp,$tp,$SIZE_T*7	# rp = tp - 7 limbs, base for saved na0 lookups
1609e1051a39Sopenharmony_ci	$UCMP	$np_end,$np		# done yet?
1610e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
1611e1051a39Sopenharmony_ci	$LD	$t0,$SIZE_T*5($tp)
1612e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1613e1051a39Sopenharmony_ci	$LD	$t1,$SIZE_T*6($tp)
1614e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
1615e1051a39Sopenharmony_ci	$LD	$t2,$SIZE_T*7($tp)
1616e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1617e1051a39Sopenharmony_ci	$LD	$t3,$SIZE_T*8($tp)
1618e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t0
1619e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t1
1620e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t2
1621e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t3
1622e1051a39Sopenharmony_ci	#addze	$carry,$zero		# moved below
1623e1051a39Sopenharmony_ci	beq	.Lsqr8x8_post_condition	# np == np_end: no tail to process
1624e1051a39Sopenharmony_ci
1625e1051a39Sopenharmony_ci	$LD	$n0,$SIZE_T*0($rp)	# n0 now serves as multiplier register
1626e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($np)	# next eight limbs of the modulus
1627e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($np)
1628e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($np)
1629e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($np)
1630e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($np)
1631e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($np)
1632e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($np)
1633e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($np)
1634e1051a39Sopenharmony_ci	li	$cnt,0			# multiplier offset for the tail loop
1635e1051a39Sopenharmony_ci
1636e1051a39Sopenharmony_ci.align	5
	# Tail of the reduction. Multiplies the eight modulus limbs held
	# in a0-a7 by one multiplier per pass (n0, fetched from rp+cnt)
	# and accumulates the products into acc0-acc7, storing one
	# finished limb through tp per pass. cnt wraps after eight limbs,
	# so the body runs eight times before the np/np_end check.
1637e1051a39Sopenharmony_ci.Lsqr8x_tail:
1638e1051a39Sopenharmony_ci	$UMULL	$t0,$a0,$n0		# lo() halves of a0-a7 times n0
1639e1051a39Sopenharmony_ci	addze	$carry,$zero		# carry bit, modulo-scheduled
1640e1051a39Sopenharmony_ci	$UMULL	$t1,$a1,$n0
1641e1051a39Sopenharmony_ci	addi	$cnt,$cnt,$SIZE_T	# offset of the next multiplier
1642e1051a39Sopenharmony_ci	$UMULL	$t2,$a2,$n0
1643e1051a39Sopenharmony_ci	andi.	$cnt,$cnt,$SIZE_T*8-1	# wrap after 8 limbs, CR0 feeds bne below
1644e1051a39Sopenharmony_ci	$UMULL	$t3,$a3,$n0
1645e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0
1646e1051a39Sopenharmony_ci	$UMULL	$t0,$a4,$n0
1647e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1648e1051a39Sopenharmony_ci	$UMULL	$t1,$a5,$n0
1649e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
1650e1051a39Sopenharmony_ci	$UMULL	$t2,$a6,$n0
1651e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1652e1051a39Sopenharmony_ci	$UMULL	$t3,$a7,$n0
1653e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t0
1654e1051a39Sopenharmony_ci	$UMULH	$t0,$a0,$n0		# hi() halves of a0-a7 times n0
1655e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t1
1656e1051a39Sopenharmony_ci	$UMULH	$t1,$a1,$n0
1657e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t2
1658e1051a39Sopenharmony_ci	$UMULH	$t2,$a2,$n0
1659e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$t3
1660e1051a39Sopenharmony_ci	$UMULH	$t3,$a3,$n0
1661e1051a39Sopenharmony_ci	addze	$carry,$carry		# collect carry out of the lo() chain
1662e1051a39Sopenharmony_ci	$STU	$acc0,$SIZE_T($tp)	# lowest limb is final, write it out
1663e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0		# add hi() while shifting window down
1664e1051a39Sopenharmony_ci	$UMULH	$t0,$a4,$n0
1665e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
1666e1051a39Sopenharmony_ci	$UMULH	$t1,$a5,$n0
1667e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2
1668e1051a39Sopenharmony_ci	$UMULH	$t2,$a6,$n0
1669e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t3
1670e1051a39Sopenharmony_ci	$UMULH	$t3,$a7,$n0
1671e1051a39Sopenharmony_ci	$LDX	$n0,$rp,$cnt		# next multiplier, from rp+cnt
1672e1051a39Sopenharmony_ci	adde	$acc4,$acc5,$t0
1673e1051a39Sopenharmony_ci	adde	$acc5,$acc6,$t1
1674e1051a39Sopenharmony_ci	adde	$acc6,$acc7,$t2
1675e1051a39Sopenharmony_ci	adde	$acc7,$carry,$t3	# new top limb starts from saved carry
1676e1051a39Sopenharmony_ci	#addze	$carry,$zero		# moved above
1677e1051a39Sopenharmony_ci	bne	.Lsqr8x_tail		# until cnt has wrapped to zero
1678e1051a39Sopenharmony_ci					# note that carry flag is guaranteed
1679e1051a39Sopenharmony_ci					# to be zero at this point
1680e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($tp)	# next eight limbs of tp, needed on both paths
1681e1051a39Sopenharmony_ci	$POP	$carry,$SIZE_T*10($sp)	# pull top-most carry in case we break
1682e1051a39Sopenharmony_ci	$UCMP	$np_end,$np		# done yet?
1683e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($tp)
1684e1051a39Sopenharmony_ci	sub	$t2,$np_end,$num	# rewound np
1685e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($tp)
1686e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($tp)
1687e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($tp)
1688e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($tp)
1689e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($tp)
1690e1051a39Sopenharmony_ci	$LD	$a7,$SIZE_T*8($tp)
1691e1051a39Sopenharmony_ci	beq	.Lsqr8x_tail_break
1692e1051a39Sopenharmony_ci
1693e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$a0		# not done: add tp limbs to the window...
1694e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($np)	# ...and refill a0-a7 from np
1695e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$a1
1696e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($np)
1697e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1698e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($np)
1699e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1700e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($np)
1701e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1702e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($np)
1703e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1704e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($np)
1705e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1706e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($np)
1707e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a7
1708e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($np)
1709e1051a39Sopenharmony_ci	#addze	$carry,$zero		# moved above
1710e1051a39Sopenharmony_ci	b	.Lsqr8x_tail		# carry of the chain is picked up there
1711e1051a39Sopenharmony_ci
1712e1051a39Sopenharmony_ci.align	5
	# End of one tail sweep over the modulus. Adds the eight tp limbs
	# already loaded into a0-a7, plus the top-most carry saved on the
	# stack, to the window and stores the sums at tp. It then reloads
	# acc0-acc7 from rp+8..rp+15 and a0-a7 with the first eight limbs
	# of the modulus, saves the new top-most carry and, unless the
	# end of the current window has reached the address kept in t3,
	# starts the next reduction window.
1713e1051a39Sopenharmony_ci.Lsqr8x_tail_break:
1714e1051a39Sopenharmony_ci	$POP	$n0,$SIZE_T*8($sp)	# pull n0
1715e1051a39Sopenharmony_ci	$POP	$t3,$SIZE_T*9($sp)	# &tp[2*num-1]
1716e1051a39Sopenharmony_ci	addi	$cnt,$tp,$SIZE_T*8	# end of current t[num] window
1717e1051a39Sopenharmony_ci
1718e1051a39Sopenharmony_ci	addic	$carry,$carry,-1	# "move" top-most carry to carry bit
1719e1051a39Sopenharmony_ci	adde	$t0,$acc0,$a0
1720e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*8($rp)
1721e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($t2)	# recall that $t2 is &n[-1]
1722e1051a39Sopenharmony_ci	adde	$t1,$acc1,$a1
1723e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*9($rp)
1724e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($t2)
1725e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1726e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($t2)
1727e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1728e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($t2)
1729e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1730e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($t2)
1731e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1732e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($t2)
1733e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1734e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($t2)
1735e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a7
1736e1051a39Sopenharmony_ci	$LD	$a7,$SIZE_T*8($t2)
1737e1051a39Sopenharmony_ci	addi	$np,$t2,$SIZE_T*8	# np = &n[7]
1738e1051a39Sopenharmony_ci	addze	$t2,$zero		# top-most carry
1739e1051a39Sopenharmony_ci	$UMULL	$na0,$n0,$acc0		# t[0]*n0 for the next window
1740e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($tp)
1741e1051a39Sopenharmony_ci	$UCMP	$cnt,$t3		# did we hit the bottom?
1742e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($tp)
1743e1051a39Sopenharmony_ci	li	$cnt,8			# eight reduction passes (via ctr)
1744e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($tp)
1745e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*10($rp)
1746e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($tp)
1747e1051a39Sopenharmony_ci	$LD	$acc3,$SIZE_T*11($rp)
1748e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($tp)
1749e1051a39Sopenharmony_ci	$LD	$acc4,$SIZE_T*12($rp)
1750e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($tp)
1751e1051a39Sopenharmony_ci	$LD	$acc5,$SIZE_T*13($rp)
1752e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($tp)
1753e1051a39Sopenharmony_ci	$LD	$acc6,$SIZE_T*14($rp)
1754e1051a39Sopenharmony_ci	$ST	$acc7,$SIZE_T*8($tp)
1755e1051a39Sopenharmony_ci	$LD	$acc7,$SIZE_T*15($rp)
1756e1051a39Sopenharmony_ci	$PUSH	$t2,$SIZE_T*10($sp)	# off-load top-most carry
1757e1051a39Sopenharmony_ci	addi	$tp,$rp,$SIZE_T*7	# slide the window
1758e1051a39Sopenharmony_ci	mtctr	$cnt
1759e1051a39Sopenharmony_ci	bne	.Lsqr8x_reduction	# not at the bottom: reduce next window
1760e1051a39Sopenharmony_ci
1761e1051a39Sopenharmony_ci	################################################################
1762e1051a39Sopenharmony_ci	# Final step. We see if result is larger than modulus, and
1763e1051a39Sopenharmony_ci	# if it is, subtract the modulus. But comparison implies
1764e1051a39Sopenharmony_ci	# subtraction. So we subtract modulus, see if it borrowed,
1765e1051a39Sopenharmony_ci	# and conditionally copy original value.
	#
	# The difference is written to rp eight limbs per pass, with the
	# last eight limbs peeled after the loop. The borrow travels in
	# CA through the subfc/subfe chain; loads and stores are
	# interleaved with it. On entry acc0-acc7 and a0-a7 already hold
	# the first eight limbs of the result and of the modulus. At the
	# end carry is turned into the select mask for the copy loop:
	# all ones picks the unsubtracted value from tp, zero keeps the
	# difference already stored in rp.
1766e1051a39Sopenharmony_ci	$POP	$rp,$SIZE_T*6($sp)	# pull &rp[-1]
1767e1051a39Sopenharmony_ci	srwi	$cnt,$num,`log($SIZE_T)/log(2)+3`
1768e1051a39Sopenharmony_ci	mr	$n0,$tp			# put tp aside
1769e1051a39Sopenharmony_ci	addi	$tp,$tp,$SIZE_T*8
1770e1051a39Sopenharmony_ci	subi	$cnt,$cnt,1		# last pass is peeled after the loop
1771e1051a39Sopenharmony_ci	subfc	$t0,$a0,$acc0		# t0 = acc0 - a0, starts borrow chain
1772e1051a39Sopenharmony_ci	subfe	$t1,$a1,$acc1
1773e1051a39Sopenharmony_ci	mr	$carry,$t2		# top-most carry, t2 is reused below
1774e1051a39Sopenharmony_ci	mr	$ap_end,$rp		# $rp copy
1775e1051a39Sopenharmony_ci
1776e1051a39Sopenharmony_ci	mtctr	$cnt
1777e1051a39Sopenharmony_ci	b	.Lsqr8x_sub
1778e1051a39Sopenharmony_ci
1779e1051a39Sopenharmony_ci.align	5
1780e1051a39Sopenharmony_ci.Lsqr8x_sub:
1781e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*1($np)	# prefetch next eight limbs of both
1782e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*1($tp)	# operands while subtracting
1783e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*2($np)
1784e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*2($tp)
1785e1051a39Sopenharmony_ci	subfe	$t2,$a2,$acc2
1786e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*3($np)
1787e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*3($tp)
1788e1051a39Sopenharmony_ci	subfe	$t3,$a3,$acc3
1789e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*4($np)
1790e1051a39Sopenharmony_ci	$LD	$acc3,$SIZE_T*4($tp)
1791e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($rp)
1792e1051a39Sopenharmony_ci	subfe	$t0,$a4,$acc4
1793e1051a39Sopenharmony_ci	$LD	$a4,$SIZE_T*5($np)
1794e1051a39Sopenharmony_ci	$LD	$acc4,$SIZE_T*5($tp)
1795e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($rp)
1796e1051a39Sopenharmony_ci	subfe	$t1,$a5,$acc5
1797e1051a39Sopenharmony_ci	$LD	$a5,$SIZE_T*6($np)
1798e1051a39Sopenharmony_ci	$LD	$acc5,$SIZE_T*6($tp)
1799e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*3($rp)
1800e1051a39Sopenharmony_ci	subfe	$t2,$a6,$acc6
1801e1051a39Sopenharmony_ci	$LD	$a6,$SIZE_T*7($np)
1802e1051a39Sopenharmony_ci	$LD	$acc6,$SIZE_T*7($tp)
1803e1051a39Sopenharmony_ci	$ST	$t3,$SIZE_T*4($rp)
1804e1051a39Sopenharmony_ci	subfe	$t3,$a7,$acc7
1805e1051a39Sopenharmony_ci	$LDU	$a7,$SIZE_T*8($np)
1806e1051a39Sopenharmony_ci	$LDU	$acc7,$SIZE_T*8($tp)
1807e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*5($rp)
1808e1051a39Sopenharmony_ci	subfe	$t0,$a0,$acc0		# first two limbs of the next group
1809e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*6($rp)
1810e1051a39Sopenharmony_ci	subfe	$t1,$a1,$acc1
1811e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*7($rp)
1812e1051a39Sopenharmony_ci	$STU	$t3,$SIZE_T*8($rp)
1813e1051a39Sopenharmony_ci	bdnz	.Lsqr8x_sub
1814e1051a39Sopenharmony_ci
1815e1051a39Sopenharmony_ci	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
1816e1051a39Sopenharmony_ci	 $LD	$a0,$SIZE_T*1($ap_end)	# original $rp
1817e1051a39Sopenharmony_ci	 $LD	$acc0,$SIZE_T*1($n0)	# original $tp
1818e1051a39Sopenharmony_ci	subi	$cnt,$cnt,1		# pass count for the copy loop
1819e1051a39Sopenharmony_ci	 $LD	$a1,$SIZE_T*2($ap_end)
1820e1051a39Sopenharmony_ci	 $LD	$acc1,$SIZE_T*2($n0)
1821e1051a39Sopenharmony_ci	subfe	$t2,$a2,$acc2
1822e1051a39Sopenharmony_ci	 $LD	$a2,$SIZE_T*3($ap_end)
1823e1051a39Sopenharmony_ci	 $LD	$acc2,$SIZE_T*3($n0)
1824e1051a39Sopenharmony_ci	subfe	$t3,$a3,$acc3
1825e1051a39Sopenharmony_ci	 $LD	$a3,$SIZE_T*4($ap_end)
1826e1051a39Sopenharmony_ci	 $LDU	$acc3,$SIZE_T*4($n0)
1827e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($rp)
1828e1051a39Sopenharmony_ci	subfe	$t0,$a4,$acc4
1829e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($rp)
1830e1051a39Sopenharmony_ci	subfe	$t1,$a5,$acc5
1831e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*3($rp)
1832e1051a39Sopenharmony_ci	subfe	$t2,$a6,$acc6
1833e1051a39Sopenharmony_ci	$ST	$t3,$SIZE_T*4($rp)
1834e1051a39Sopenharmony_ci	subfe	$t3,$a7,$acc7
1835e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*5($rp)
1836e1051a39Sopenharmony_ci	subfe	$carry,$zero,$carry	# did it borrow?
1837e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*6($rp)
1838e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*7($rp)
1839e1051a39Sopenharmony_ci	$ST	$t3,$SIZE_T*8($rp)
1840e1051a39Sopenharmony_ci
1841e1051a39Sopenharmony_ci	addi	$tp,$sp,$SIZE_T*11
	# Branch-free selection, four limbs per pass: a0-a3 hold the
	# difference read back from the result buffer (via ap_end),
	# acc0-acc3 the unsubtracted limbs read via n0. With carry as
	# mask, (a andc carry) | (acc and carry) is written back through
	# ap_end. The space-indented stores zero the temporary vector
	# through two pointers, tp and n0, as the loop advances. The
	# last four limbs are handled after the loop.
1842e1051a39Sopenharmony_ci	mtctr	$cnt
1843e1051a39Sopenharmony_ci
1844e1051a39Sopenharmony_ci.Lsqr4x_cond_copy:
1845e1051a39Sopenharmony_ci	andc	$a0,$a0,$carry		# difference survives when mask is zero
1846e1051a39Sopenharmony_ci	 $ST	$zero,-$SIZE_T*3($n0)	# wipe stack clean
1847e1051a39Sopenharmony_ci	and	$acc0,$acc0,$carry	# original survives when mask is ones
1848e1051a39Sopenharmony_ci	 $ST	$zero,-$SIZE_T*2($n0)
1849e1051a39Sopenharmony_ci	andc	$a1,$a1,$carry
1850e1051a39Sopenharmony_ci	 $ST	$zero,-$SIZE_T*1($n0)
1851e1051a39Sopenharmony_ci	and	$acc1,$acc1,$carry
1852e1051a39Sopenharmony_ci	 $ST	$zero,-$SIZE_T*0($n0)
1853e1051a39Sopenharmony_ci	andc	$a2,$a2,$carry
1854e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*1($tp)
1855e1051a39Sopenharmony_ci	and	$acc2,$acc2,$carry
1856e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*2($tp)
1857e1051a39Sopenharmony_ci	andc	$a3,$a3,$carry
1858e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*3($tp)
1859e1051a39Sopenharmony_ci	and	$acc3,$acc3,$carry
1860e1051a39Sopenharmony_ci	 $STU	$zero,$SIZE_T*4($tp)
1861e1051a39Sopenharmony_ci	or	$t0,$a0,$acc0		# merge the two masked halves
1862e1051a39Sopenharmony_ci	$LD	$a0,$SIZE_T*5($ap_end)	# next four limbs of the difference
1863e1051a39Sopenharmony_ci	$LD	$acc0,$SIZE_T*1($n0)	# next four unsubtracted limbs
1864e1051a39Sopenharmony_ci	or	$t1,$a1,$acc1
1865e1051a39Sopenharmony_ci	$LD	$a1,$SIZE_T*6($ap_end)
1866e1051a39Sopenharmony_ci	$LD	$acc1,$SIZE_T*2($n0)
1867e1051a39Sopenharmony_ci	or	$t2,$a2,$acc2
1868e1051a39Sopenharmony_ci	$LD	$a2,$SIZE_T*7($ap_end)
1869e1051a39Sopenharmony_ci	$LD	$acc2,$SIZE_T*3($n0)
1870e1051a39Sopenharmony_ci	or	$t3,$a3,$acc3
1871e1051a39Sopenharmony_ci	$LD	$a3,$SIZE_T*8($ap_end)
1872e1051a39Sopenharmony_ci	$LDU	$acc3,$SIZE_T*4($n0)
1873e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($ap_end)
1874e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($ap_end)
1875e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*3($ap_end)
1876e1051a39Sopenharmony_ci	$STU	$t3,$SIZE_T*4($ap_end)
1877e1051a39Sopenharmony_ci	bdnz	.Lsqr4x_cond_copy
1878e1051a39Sopenharmony_ci
1879e1051a39Sopenharmony_ci	$POP	$ap,0($sp)		# pull saved sp
1880e1051a39Sopenharmony_ci	andc	$a0,$a0,$carry		# last four limbs, already loaded
1881e1051a39Sopenharmony_ci	and	$acc0,$acc0,$carry
1882e1051a39Sopenharmony_ci	andc	$a1,$a1,$carry
1883e1051a39Sopenharmony_ci	and	$acc1,$acc1,$carry
1884e1051a39Sopenharmony_ci	andc	$a2,$a2,$carry
1885e1051a39Sopenharmony_ci	and	$acc2,$acc2,$carry
1886e1051a39Sopenharmony_ci	andc	$a3,$a3,$carry
1887e1051a39Sopenharmony_ci	and	$acc3,$acc3,$carry
1888e1051a39Sopenharmony_ci	or	$t0,$a0,$acc0
1889e1051a39Sopenharmony_ci	or	$t1,$a1,$acc1
1890e1051a39Sopenharmony_ci	or	$t2,$a2,$acc2
1891e1051a39Sopenharmony_ci	or	$t3,$a3,$acc3
1892e1051a39Sopenharmony_ci	$ST	$t0,$SIZE_T*1($ap_end)
1893e1051a39Sopenharmony_ci	$ST	$t1,$SIZE_T*2($ap_end)
1894e1051a39Sopenharmony_ci	$ST	$t2,$SIZE_T*3($ap_end)
1895e1051a39Sopenharmony_ci	$ST	$t3,$SIZE_T*4($ap_end)
1896e1051a39Sopenharmony_ci
1897e1051a39Sopenharmony_ci	b	.Lsqr8x_done
1898e1051a39Sopenharmony_ci
1899e1051a39Sopenharmony_ci.align	5
1900e1051a39Sopenharmony_ci.Lsqr8x8_post_condition:
1901e1051a39Sopenharmony_ci	$POP	$rp,$SIZE_T*6($sp)	# pull rp
1902e1051a39Sopenharmony_ci	$POP	$ap,0($sp)		# pull saved sp
1903e1051a39Sopenharmony_ci	addze	$carry,$zero
1904e1051a39Sopenharmony_ci
1905e1051a39Sopenharmony_ci	# $acc0-7,$carry hold result, $a0-7 hold modulus
1906e1051a39Sopenharmony_ci	subfc	$acc0,$a0,$acc0
1907e1051a39Sopenharmony_ci	subfe	$acc1,$a1,$acc1
1908e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*12($sp)	# wipe stack clean
1909e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*13($sp)
1910e1051a39Sopenharmony_ci	subfe	$acc2,$a2,$acc2
1911e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*14($sp)
1912e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*15($sp)
1913e1051a39Sopenharmony_ci	subfe	$acc3,$a3,$acc3
1914e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*16($sp)
1915e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*17($sp)
1916e1051a39Sopenharmony_ci	subfe	$acc4,$a4,$acc4
1917e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*18($sp)
1918e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*19($sp)
1919e1051a39Sopenharmony_ci	subfe	$acc5,$a5,$acc5
1920e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*20($sp)
1921e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*21($sp)
1922e1051a39Sopenharmony_ci	subfe	$acc6,$a6,$acc6
1923e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*22($sp)
1924e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*23($sp)
1925e1051a39Sopenharmony_ci	subfe	$acc7,$a7,$acc7
1926e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*24($sp)
1927e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*25($sp)
1928e1051a39Sopenharmony_ci	subfe	$carry,$zero,$carry	# did it borrow?
1929e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*26($sp)
1930e1051a39Sopenharmony_ci	 $ST	$zero,$SIZE_T*27($sp)
1931e1051a39Sopenharmony_ci
1932e1051a39Sopenharmony_ci	and	$a0,$a0,$carry
1933e1051a39Sopenharmony_ci	and	$a1,$a1,$carry
1934e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$a0		# add modulus back if borrowed
1935e1051a39Sopenharmony_ci	and	$a2,$a2,$carry
1936e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$a1
1937e1051a39Sopenharmony_ci	and	$a3,$a3,$carry
1938e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$a2
1939e1051a39Sopenharmony_ci	and	$a4,$a4,$carry
1940e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a3
1941e1051a39Sopenharmony_ci	and	$a5,$a5,$carry
1942e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$a4
1943e1051a39Sopenharmony_ci	and	$a6,$a6,$carry
1944e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a5
1945e1051a39Sopenharmony_ci	and	$a7,$a7,$carry
1946e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$a6
1947e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a7
1948e1051a39Sopenharmony_ci	$ST	$acc0,$SIZE_T*1($rp)
1949e1051a39Sopenharmony_ci	$ST	$acc1,$SIZE_T*2($rp)
1950e1051a39Sopenharmony_ci	$ST	$acc2,$SIZE_T*3($rp)
1951e1051a39Sopenharmony_ci	$ST	$acc3,$SIZE_T*4($rp)
1952e1051a39Sopenharmony_ci	$ST	$acc4,$SIZE_T*5($rp)
1953e1051a39Sopenharmony_ci	$ST	$acc5,$SIZE_T*6($rp)
1954e1051a39Sopenharmony_ci	$ST	$acc6,$SIZE_T*7($rp)
1955e1051a39Sopenharmony_ci	$ST	$acc7,$SIZE_T*8($rp)
1956e1051a39Sopenharmony_ci
1957e1051a39Sopenharmony_ci.Lsqr8x_done:
1958e1051a39Sopenharmony_ci	$PUSH	$zero,$SIZE_T*8($sp)
1959e1051a39Sopenharmony_ci	$PUSH	$zero,$SIZE_T*10($sp)
1960e1051a39Sopenharmony_ci
1961e1051a39Sopenharmony_ci	$POP	r14,-$SIZE_T*18($ap)
1962e1051a39Sopenharmony_ci	li	r3,1			# signal "done"
1963e1051a39Sopenharmony_ci	$POP	r15,-$SIZE_T*17($ap)
1964e1051a39Sopenharmony_ci	$POP	r16,-$SIZE_T*16($ap)
1965e1051a39Sopenharmony_ci	$POP	r17,-$SIZE_T*15($ap)
1966e1051a39Sopenharmony_ci	$POP	r18,-$SIZE_T*14($ap)
1967e1051a39Sopenharmony_ci	$POP	r19,-$SIZE_T*13($ap)
1968e1051a39Sopenharmony_ci	$POP	r20,-$SIZE_T*12($ap)
1969e1051a39Sopenharmony_ci	$POP	r21,-$SIZE_T*11($ap)
1970e1051a39Sopenharmony_ci	$POP	r22,-$SIZE_T*10($ap)
1971e1051a39Sopenharmony_ci	$POP	r23,-$SIZE_T*9($ap)
1972e1051a39Sopenharmony_ci	$POP	r24,-$SIZE_T*8($ap)
1973e1051a39Sopenharmony_ci	$POP	r25,-$SIZE_T*7($ap)
1974e1051a39Sopenharmony_ci	$POP	r26,-$SIZE_T*6($ap)
1975e1051a39Sopenharmony_ci	$POP	r27,-$SIZE_T*5($ap)
1976e1051a39Sopenharmony_ci	$POP	r28,-$SIZE_T*4($ap)
1977e1051a39Sopenharmony_ci	$POP	r29,-$SIZE_T*3($ap)
1978e1051a39Sopenharmony_ci	$POP	r30,-$SIZE_T*2($ap)
1979e1051a39Sopenharmony_ci	$POP	r31,-$SIZE_T*1($ap)
1980e1051a39Sopenharmony_ci	mr	$sp,$ap
1981e1051a39Sopenharmony_ci	blr
1982e1051a39Sopenharmony_ci	.long	0
1983e1051a39Sopenharmony_ci	.byte	0,12,4,0x20,0x80,18,6,0
1984e1051a39Sopenharmony_ci	.long	0
1985e1051a39Sopenharmony_ci.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1986e1051a39Sopenharmony_ci___
1987e1051a39Sopenharmony_ci}
# Trailer: append an .asciz directive carrying the module's attribution
# string.  The here-document interpolates, so "\@" below is emitted as a
# plain "@" rather than being parsed as an array.
$code .= <<"___";
.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the accumulated text with the result of
# evaluating the enclosed text as a Perl expression.
$code =~ s{\`([^\`]*)\`}{eval $1}gem;

# Emit the finished text.  The close is checked and fatal on failure, so an
# output error reported only at close time is not silently dropped.
print $code;
close(STDOUT) or die "error closing STDOUT: $!";
1995