1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# ECP_NISTZ256 module for SPARCv9.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# February 2015.
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22e1051a39Sopenharmony_ci# http://eprint.iacr.org/2013/816. In the process of adaptation
23e1051a39Sopenharmony_ci# original .c module was made 32-bit savvy in order to make this
24e1051a39Sopenharmony_ci# implementation possible.
25e1051a39Sopenharmony_ci#
26e1051a39Sopenharmony_ci#			with/without -DECP_NISTZ256_ASM
27e1051a39Sopenharmony_ci# UltraSPARC III	+12-18%
28e1051a39Sopenharmony_ci# SPARC T4		+99-550% (+66-150% on 32-bit Solaris)
29e1051a39Sopenharmony_ci#
30e1051a39Sopenharmony_ci# Ranges denote minimum and maximum improvement coefficients depending
31e1051a39Sopenharmony_ci# on benchmark. Lower coefficients are for ECDSA sign, server-side
32e1051a39Sopenharmony_ci# operation. Keep in mind that +200% means 3x improvement.
33e1051a39Sopenharmony_ci
# The last command-line argument, if present (and true), names the file the
# generated assembly is written to; otherwise output stays on the inherited
# STDOUT.  Three-argument open keeps the file name from being interpreted as
# an open mode, and a failed open aborts the run instead of letting the
# output silently go to the original STDOUT.
$output = pop;
if ($output) {
	open STDOUT, '>', $output
	    or die "can't open $output: $!";
}
35e1051a39Sopenharmony_ci
# Assembler preamble, appended verbatim to the generated source:
#  - makes sure __ASSEMBLER__ is defined before pulling in
#    crypto/sparc_arch.h;
#  - defines LOCALS, plus STACK64_FRAME/LOCALS64 which differ between
#    __arch64__ and non-__arch64__ builds;
#  - for __arch64__, declares %g2 and %g3 as scratch registers;
#  - opens the .text section.
# Nothing in this text needs interpolation, hence the single-quoted
# here-document tag.
$code .= <<'___';
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#define LOCALS	(STACK_BIAS+STACK_FRAME)
#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
# define STACK64_FRAME	STACK_FRAME
# define LOCALS64	LOCALS
#else
# define STACK64_FRAME	(2047+192)
# define LOCALS64	STACK64_FRAME
#endif

.section	".text",#alloc,#execinstr
___
55e1051a39Sopenharmony_ci########################################################################
56e1051a39Sopenharmony_ci# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
57e1051a39Sopenharmony_ci#
# Directory part of the path this script was invoked with (including the
# trailing separator), or the empty string when $0 has no directory part.
# $1 is only consulted when the match actually succeeded.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Lexically scoped pragma, deliberately left at file scope so that integer
# arithmetic stays in effect for all code that follows.
use integer;

{
	# Look for the table first in the current directory, then one level
	# above the script's own directory.
	my $table;
	open($table, '<', "ecp_nistz256_table.c")		or
	open($table, '<', "${dir}../ecp_nistz256_table.c")	or
	die "failed to open ecp_nistz256_table.c:",$!;

	# Every TOBN(hi,lo) found in the table contributes two elements to
	# @arr, low half first.  Reading line by line avoids holding the
	# whole file in memory.
	while (my $line = <$table>) {
		while ($line =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
			push @arr,hex($2),hex($1);
		}
	}
	close $table;
}

# See ecp_nistz256_table.c for explanation for why it's 64*16*37
# elements; anything else means the table was not parsed as expected.
die "insane number of elements" if (@arr != 64*16*37);
74e1051a39Sopenharmony_ci
$code.=<<___;
.globl	ecp_nistz256_precomputed
.align	4096
ecp_nistz256_precomputed:
___
########################################################################
# Emit the precomputed table, consuming @arr (filled from
# ecp_nistz256_table.c earlier in this script) in 37 chunks of 64 entries,
# each entry being 16 32-bit words, i.e. 64 bytes.
#
# Within a chunk the entries are smashed by individual bytes: output row
# number N is a .byte directive holding byte N of entry 0, byte N of
# entry 1, ... byte N of entry 63.  Consecutive bytes of one entry thus
# end up 64 bytes apart, similar to
#	1111222233334444
#	1234123412341234
for my $window (1..37) {
	my @tbl = splice(@arr,0,64*16);
	for my $byte (0..63) {
		my $word  = $byte >> 2;		# 32-bit word holding this byte
		my $shift = ($byte & 3) * 8;	# bit offset, least significant byte first
		my @line  = map { ($tbl[$_*16+$word] >> $shift) & 0xff } (0..63);
		$code .= ".byte\t" . join(',', map { sprintf "0x%02x",$_ } @line) . "\n";
	}
}
97e1051a39Sopenharmony_ci
98e1051a39Sopenharmony_ci{{{
99e1051a39Sopenharmony_cimy ($rp,$ap,$bp)=map("%i$_",(0..2));
100e1051a39Sopenharmony_cimy @acc=map("%l$_",(0..7));
101e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
102e1051a39Sopenharmony_cimy ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
103e1051a39Sopenharmony_cimy ($rp_real,$ap_real)=("%g2","%g3");
104e1051a39Sopenharmony_ci
105e1051a39Sopenharmony_ci$code.=<<___;
106e1051a39Sopenharmony_ci.type	ecp_nistz256_precomputed,#object
107e1051a39Sopenharmony_ci.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
108e1051a39Sopenharmony_ci.align	64
109e1051a39Sopenharmony_ci.LRR:	! 2^512 mod P precomputed for NIST P256 polynomial
110e1051a39Sopenharmony_ci.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
111e1051a39Sopenharmony_ci.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
112e1051a39Sopenharmony_ci.Lone:
113e1051a39Sopenharmony_ci.long	1,0,0,0,0,0,0,0
114e1051a39Sopenharmony_ci.asciz	"ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
115e1051a39Sopenharmony_ci
116e1051a39Sopenharmony_ci! void	ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
117e1051a39Sopenharmony_ci.globl	ecp_nistz256_to_mont
118e1051a39Sopenharmony_ci.align	64
119e1051a39Sopenharmony_ciecp_nistz256_to_mont:
120e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
121e1051a39Sopenharmony_ci	nop
122e1051a39Sopenharmony_ci1:	call	.+8
123e1051a39Sopenharmony_ci	add	%o7,.LRR-1b,$bp
124e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont
125e1051a39Sopenharmony_ci	nop
126e1051a39Sopenharmony_ci	ret
127e1051a39Sopenharmony_ci	restore
128e1051a39Sopenharmony_ci.type	ecp_nistz256_to_mont,#function
129e1051a39Sopenharmony_ci.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
130e1051a39Sopenharmony_ci
131e1051a39Sopenharmony_ci! void	ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
132e1051a39Sopenharmony_ci.globl	ecp_nistz256_from_mont
133e1051a39Sopenharmony_ci.align	32
134e1051a39Sopenharmony_ciecp_nistz256_from_mont:
135e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
136e1051a39Sopenharmony_ci	nop
137e1051a39Sopenharmony_ci1:	call	.+8
138e1051a39Sopenharmony_ci	add	%o7,.Lone-1b,$bp
139e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont
140e1051a39Sopenharmony_ci	nop
141e1051a39Sopenharmony_ci	ret
142e1051a39Sopenharmony_ci	restore
143e1051a39Sopenharmony_ci.type	ecp_nistz256_from_mont,#function
144e1051a39Sopenharmony_ci.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
145e1051a39Sopenharmony_ci
146e1051a39Sopenharmony_ci! void	ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
147e1051a39Sopenharmony_ci!					      const BN_ULONG %i2[8]);
148e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_mont
149e1051a39Sopenharmony_ci.align	32
150e1051a39Sopenharmony_ciecp_nistz256_mul_mont:
151e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
152e1051a39Sopenharmony_ci	nop
153e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont
154e1051a39Sopenharmony_ci	nop
155e1051a39Sopenharmony_ci	ret
156e1051a39Sopenharmony_ci	restore
157e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_mont,#function
158e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
159e1051a39Sopenharmony_ci
160e1051a39Sopenharmony_ci! void	ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]);
161e1051a39Sopenharmony_ci.globl	ecp_nistz256_sqr_mont
162e1051a39Sopenharmony_ci.align	32
163e1051a39Sopenharmony_ciecp_nistz256_sqr_mont:
164e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
165e1051a39Sopenharmony_ci	mov	$ap,$bp
166e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont
167e1051a39Sopenharmony_ci	nop
168e1051a39Sopenharmony_ci	ret
169e1051a39Sopenharmony_ci	restore
170e1051a39Sopenharmony_ci.type	ecp_nistz256_sqr_mont,#function
171e1051a39Sopenharmony_ci.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
172e1051a39Sopenharmony_ci___
173e1051a39Sopenharmony_ci
174e1051a39Sopenharmony_ci########################################################################
175e1051a39Sopenharmony_ci# Special thing to keep in mind is that $t0-$t7 hold 64-bit values,
176e1051a39Sopenharmony_ci# while all others are meant to keep 32. "Meant to" means that additions
177e1051a39Sopenharmony_ci# to @acc[0-7] do "contaminate" upper bits, but they are cleared before
178e1051a39Sopenharmony_ci# they can affect outcome (follow 'and' with $mask). Also keep in mind
179e1051a39Sopenharmony_ci# that addition with carry is addition with 32-bit carry, even though
180e1051a39Sopenharmony_ci# CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see
181e1051a39Sopenharmony_ci# below for VIS3 code paths.]
182e1051a39Sopenharmony_ci
183e1051a39Sopenharmony_ci$code.=<<___;
184e1051a39Sopenharmony_ci.align	32
185e1051a39Sopenharmony_ci__ecp_nistz256_mul_mont:
186e1051a39Sopenharmony_ci	ld	[$bp+0],$bi		! b[0]
187e1051a39Sopenharmony_ci	mov	-1,$mask
188e1051a39Sopenharmony_ci	ld	[$ap+0],$a0
189e1051a39Sopenharmony_ci	srl	$mask,0,$mask		! 0xffffffff
190e1051a39Sopenharmony_ci	ld	[$ap+4],$t1
191e1051a39Sopenharmony_ci	ld	[$ap+8],$t2
192e1051a39Sopenharmony_ci	ld	[$ap+12],$t3
193e1051a39Sopenharmony_ci	ld	[$ap+16],$t4
194e1051a39Sopenharmony_ci	ld	[$ap+20],$t5
195e1051a39Sopenharmony_ci	ld	[$ap+24],$t6
196e1051a39Sopenharmony_ci	ld	[$ap+28],$t7
197e1051a39Sopenharmony_ci	mulx	$a0,$bi,$t0		! a[0-7]*b[0], 64-bit results
198e1051a39Sopenharmony_ci	mulx	$t1,$bi,$t1
199e1051a39Sopenharmony_ci	mulx	$t2,$bi,$t2
200e1051a39Sopenharmony_ci	mulx	$t3,$bi,$t3
201e1051a39Sopenharmony_ci	mulx	$t4,$bi,$t4
202e1051a39Sopenharmony_ci	mulx	$t5,$bi,$t5
203e1051a39Sopenharmony_ci	mulx	$t6,$bi,$t6
204e1051a39Sopenharmony_ci	mulx	$t7,$bi,$t7
205e1051a39Sopenharmony_ci	srlx	$t0,32,@acc[1]		! extract high parts
206e1051a39Sopenharmony_ci	srlx	$t1,32,@acc[2]
207e1051a39Sopenharmony_ci	srlx	$t2,32,@acc[3]
208e1051a39Sopenharmony_ci	srlx	$t3,32,@acc[4]
209e1051a39Sopenharmony_ci	srlx	$t4,32,@acc[5]
210e1051a39Sopenharmony_ci	srlx	$t5,32,@acc[6]
211e1051a39Sopenharmony_ci	srlx	$t6,32,@acc[7]
212e1051a39Sopenharmony_ci	srlx	$t7,32,@acc[0]		! "@acc[8]"
213e1051a39Sopenharmony_ci	mov	0,$carry
214e1051a39Sopenharmony_ci___
215e1051a39Sopenharmony_cifor($i=1;$i<8;$i++) {
216e1051a39Sopenharmony_ci$code.=<<___;
217e1051a39Sopenharmony_ci	addcc	@acc[1],$t1,@acc[1]	! accumulate high parts
218e1051a39Sopenharmony_ci	ld	[$bp+4*$i],$bi		! b[$i]
219e1051a39Sopenharmony_ci	ld	[$ap+4],$t1		! re-load a[1-7]
220e1051a39Sopenharmony_ci	addccc	@acc[2],$t2,@acc[2]
221e1051a39Sopenharmony_ci	addccc	@acc[3],$t3,@acc[3]
222e1051a39Sopenharmony_ci	ld	[$ap+8],$t2
223e1051a39Sopenharmony_ci	ld	[$ap+12],$t3
224e1051a39Sopenharmony_ci	addccc	@acc[4],$t4,@acc[4]
225e1051a39Sopenharmony_ci	addccc	@acc[5],$t5,@acc[5]
226e1051a39Sopenharmony_ci	ld	[$ap+16],$t4
227e1051a39Sopenharmony_ci	ld	[$ap+20],$t5
228e1051a39Sopenharmony_ci	addccc	@acc[6],$t6,@acc[6]
229e1051a39Sopenharmony_ci	addccc	@acc[7],$t7,@acc[7]
230e1051a39Sopenharmony_ci	ld	[$ap+24],$t6
231e1051a39Sopenharmony_ci	ld	[$ap+28],$t7
232e1051a39Sopenharmony_ci	addccc	@acc[0],$carry,@acc[0]	! "@acc[8]"
233e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
234e1051a39Sopenharmony_ci___
235e1051a39Sopenharmony_ci	# Reduction iteration is normally performed by accumulating
236e1051a39Sopenharmony_ci	# result of multiplication of modulus by "magic" digit [and
237e1051a39Sopenharmony_ci	# omitting least significant word, which is guaranteed to
238e1051a39Sopenharmony_ci	# be 0], but thanks to special form of modulus and "magic"
239e1051a39Sopenharmony_ci	# digit being equal to least significant word, it can be
240e1051a39Sopenharmony_ci	# performed with additions and subtractions alone. Indeed:
241e1051a39Sopenharmony_ci	#
242e1051a39Sopenharmony_ci	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
243e1051a39Sopenharmony_ci	# *                                         abcd
244e1051a39Sopenharmony_ci	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
245e1051a39Sopenharmony_ci	#
246e1051a39Sopenharmony_ci	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
247e1051a39Sopenharmony_ci	# rewrite above as:
248e1051a39Sopenharmony_ci	#
249e1051a39Sopenharmony_ci	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
250e1051a39Sopenharmony_ci	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
251e1051a39Sopenharmony_ci	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
252e1051a39Sopenharmony_ci	#
253e1051a39Sopenharmony_ci	# or marking redundant operations:
254e1051a39Sopenharmony_ci	#
255e1051a39Sopenharmony_ci	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
256e1051a39Sopenharmony_ci	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
257e1051a39Sopenharmony_ci	# -      abcd.----.----.----.----.----.----.----
258e1051a39Sopenharmony_ci
259e1051a39Sopenharmony_ci$code.=<<___;
260e1051a39Sopenharmony_ci	! multiplication-less reduction
261e1051a39Sopenharmony_ci	addcc	@acc[3],$t0,@acc[3]	! r[3]+=r[0]
262e1051a39Sopenharmony_ci	addccc	@acc[4],%g0,@acc[4]	! r[4]+=0
263e1051a39Sopenharmony_ci	 and	@acc[1],$mask,@acc[1]
264e1051a39Sopenharmony_ci	 and	@acc[2],$mask,@acc[2]
265e1051a39Sopenharmony_ci	addccc	@acc[5],%g0,@acc[5]	! r[5]+=0
266e1051a39Sopenharmony_ci	addccc	@acc[6],$t0,@acc[6]	! r[6]+=r[0]
267e1051a39Sopenharmony_ci	 and	@acc[3],$mask,@acc[3]
268e1051a39Sopenharmony_ci	 and	@acc[4],$mask,@acc[4]
269e1051a39Sopenharmony_ci	addccc	@acc[7],%g0,@acc[7]	! r[7]+=0
270e1051a39Sopenharmony_ci	addccc	@acc[0],$t0,@acc[0]	! r[8]+=r[0]	"@acc[8]"
271e1051a39Sopenharmony_ci	 and	@acc[5],$mask,@acc[5]
272e1051a39Sopenharmony_ci	 and	@acc[6],$mask,@acc[6]
273e1051a39Sopenharmony_ci	addc	$carry,%g0,$carry	! top-most carry
274e1051a39Sopenharmony_ci	subcc	@acc[7],$t0,@acc[7]	! r[7]-=r[0]
275e1051a39Sopenharmony_ci	subccc	@acc[0],%g0,@acc[0]	! r[8]-=0	"@acc[8]"
276e1051a39Sopenharmony_ci	subc	$carry,%g0,$carry	! top-most carry
277e1051a39Sopenharmony_ci	 and	@acc[7],$mask,@acc[7]
278e1051a39Sopenharmony_ci	 and	@acc[0],$mask,@acc[0]	! "@acc[8]"
279e1051a39Sopenharmony_ci___
280e1051a39Sopenharmony_ci	push(@acc,shift(@acc));		# rotate registers to "omit" acc[0]
281e1051a39Sopenharmony_ci$code.=<<___;
282e1051a39Sopenharmony_ci	mulx	$a0,$bi,$t0		! a[0-7]*b[$i], 64-bit results
283e1051a39Sopenharmony_ci	mulx	$t1,$bi,$t1
284e1051a39Sopenharmony_ci	mulx	$t2,$bi,$t2
285e1051a39Sopenharmony_ci	mulx	$t3,$bi,$t3
286e1051a39Sopenharmony_ci	mulx	$t4,$bi,$t4
287e1051a39Sopenharmony_ci	mulx	$t5,$bi,$t5
288e1051a39Sopenharmony_ci	mulx	$t6,$bi,$t6
289e1051a39Sopenharmony_ci	mulx	$t7,$bi,$t7
290e1051a39Sopenharmony_ci	add	@acc[0],$t0,$t0		! accumulate low parts, can't overflow
291e1051a39Sopenharmony_ci	add	@acc[1],$t1,$t1
292e1051a39Sopenharmony_ci	srlx	$t0,32,@acc[1]		! extract high parts
293e1051a39Sopenharmony_ci	add	@acc[2],$t2,$t2
294e1051a39Sopenharmony_ci	srlx	$t1,32,@acc[2]
295e1051a39Sopenharmony_ci	add	@acc[3],$t3,$t3
296e1051a39Sopenharmony_ci	srlx	$t2,32,@acc[3]
297e1051a39Sopenharmony_ci	add	@acc[4],$t4,$t4
298e1051a39Sopenharmony_ci	srlx	$t3,32,@acc[4]
299e1051a39Sopenharmony_ci	add	@acc[5],$t5,$t5
300e1051a39Sopenharmony_ci	srlx	$t4,32,@acc[5]
301e1051a39Sopenharmony_ci	add	@acc[6],$t6,$t6
302e1051a39Sopenharmony_ci	srlx	$t5,32,@acc[6]
303e1051a39Sopenharmony_ci	add	@acc[7],$t7,$t7
304e1051a39Sopenharmony_ci	srlx	$t6,32,@acc[7]
305e1051a39Sopenharmony_ci	srlx	$t7,32,@acc[0]		! "@acc[8]"
306e1051a39Sopenharmony_ci___
307e1051a39Sopenharmony_ci}
308e1051a39Sopenharmony_ci$code.=<<___;
309e1051a39Sopenharmony_ci	addcc	@acc[1],$t1,@acc[1]	! accumulate high parts
310e1051a39Sopenharmony_ci	addccc	@acc[2],$t2,@acc[2]
311e1051a39Sopenharmony_ci	addccc	@acc[3],$t3,@acc[3]
312e1051a39Sopenharmony_ci	addccc	@acc[4],$t4,@acc[4]
313e1051a39Sopenharmony_ci	addccc	@acc[5],$t5,@acc[5]
314e1051a39Sopenharmony_ci	addccc	@acc[6],$t6,@acc[6]
315e1051a39Sopenharmony_ci	addccc	@acc[7],$t7,@acc[7]
316e1051a39Sopenharmony_ci	addccc	@acc[0],$carry,@acc[0]	! "@acc[8]"
317e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
318e1051a39Sopenharmony_ci
319e1051a39Sopenharmony_ci	addcc	@acc[3],$t0,@acc[3]	! multiplication-less reduction
320e1051a39Sopenharmony_ci	addccc	@acc[4],%g0,@acc[4]
321e1051a39Sopenharmony_ci	addccc	@acc[5],%g0,@acc[5]
322e1051a39Sopenharmony_ci	addccc	@acc[6],$t0,@acc[6]
323e1051a39Sopenharmony_ci	addccc	@acc[7],%g0,@acc[7]
324e1051a39Sopenharmony_ci	addccc	@acc[0],$t0,@acc[0]	! "@acc[8]"
325e1051a39Sopenharmony_ci	addc	$carry,%g0,$carry
326e1051a39Sopenharmony_ci	subcc	@acc[7],$t0,@acc[7]
327e1051a39Sopenharmony_ci	subccc	@acc[0],%g0,@acc[0]	! "@acc[8]"
328e1051a39Sopenharmony_ci	subc	$carry,%g0,$carry	! top-most carry
329e1051a39Sopenharmony_ci___
330e1051a39Sopenharmony_ci	push(@acc,shift(@acc));		# rotate registers to omit acc[0]
331e1051a39Sopenharmony_ci$code.=<<___;
332e1051a39Sopenharmony_ci	! Final step is "if result > mod, subtract mod", but we do it
333e1051a39Sopenharmony_ci	! "other way around", namely subtract modulus from result
334e1051a39Sopenharmony_ci	! and if it borrowed, add modulus back.
335e1051a39Sopenharmony_ci
336e1051a39Sopenharmony_ci	subcc	@acc[0],-1,@acc[0]	! subtract modulus
337e1051a39Sopenharmony_ci	subccc	@acc[1],-1,@acc[1]
338e1051a39Sopenharmony_ci	subccc	@acc[2],-1,@acc[2]
339e1051a39Sopenharmony_ci	subccc	@acc[3],0,@acc[3]
340e1051a39Sopenharmony_ci	subccc	@acc[4],0,@acc[4]
341e1051a39Sopenharmony_ci	subccc	@acc[5],0,@acc[5]
342e1051a39Sopenharmony_ci	subccc	@acc[6],1,@acc[6]
343e1051a39Sopenharmony_ci	subccc	@acc[7],-1,@acc[7]
344e1051a39Sopenharmony_ci	subc	$carry,0,$carry		! broadcast borrow bit
345e1051a39Sopenharmony_ci
346e1051a39Sopenharmony_ci	! Note that because mod has special form, i.e. consists of
347e1051a39Sopenharmony_ci	! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
348e1051a39Sopenharmony_ci	! using value of broadcasted borrow and the borrow bit itself.
349e1051a39Sopenharmony_ci	! To minimize dependency chain we first broadcast and then
350e1051a39Sopenharmony_ci	! extract the bit by negating (follow $bi).
351e1051a39Sopenharmony_ci
352e1051a39Sopenharmony_ci	addcc	@acc[0],$carry,@acc[0]	! add modulus or zero
353e1051a39Sopenharmony_ci	addccc	@acc[1],$carry,@acc[1]
354e1051a39Sopenharmony_ci	neg	$carry,$bi
355e1051a39Sopenharmony_ci	st	@acc[0],[$rp]
356e1051a39Sopenharmony_ci	addccc	@acc[2],$carry,@acc[2]
357e1051a39Sopenharmony_ci	st	@acc[1],[$rp+4]
358e1051a39Sopenharmony_ci	addccc	@acc[3],0,@acc[3]
359e1051a39Sopenharmony_ci	st	@acc[2],[$rp+8]
360e1051a39Sopenharmony_ci	addccc	@acc[4],0,@acc[4]
361e1051a39Sopenharmony_ci	st	@acc[3],[$rp+12]
362e1051a39Sopenharmony_ci	addccc	@acc[5],0,@acc[5]
363e1051a39Sopenharmony_ci	st	@acc[4],[$rp+16]
364e1051a39Sopenharmony_ci	addccc	@acc[6],$bi,@acc[6]
365e1051a39Sopenharmony_ci	st	@acc[5],[$rp+20]
366e1051a39Sopenharmony_ci	addc	@acc[7],$carry,@acc[7]
367e1051a39Sopenharmony_ci	st	@acc[6],[$rp+24]
368e1051a39Sopenharmony_ci	retl
369e1051a39Sopenharmony_ci	st	@acc[7],[$rp+28]
370e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_mont,#function
371e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
372e1051a39Sopenharmony_ci
373e1051a39Sopenharmony_ci! void	ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
374e1051a39Sopenharmony_ci!					 const BN_ULONG %i2[8]);
375e1051a39Sopenharmony_ci.globl	ecp_nistz256_add
376e1051a39Sopenharmony_ci.align	32
377e1051a39Sopenharmony_ciecp_nistz256_add:
378e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
379e1051a39Sopenharmony_ci	ld	[$ap],@acc[0]
380e1051a39Sopenharmony_ci	ld	[$ap+4],@acc[1]
381e1051a39Sopenharmony_ci	ld	[$ap+8],@acc[2]
382e1051a39Sopenharmony_ci	ld	[$ap+12],@acc[3]
383e1051a39Sopenharmony_ci	ld	[$ap+16],@acc[4]
384e1051a39Sopenharmony_ci	ld	[$ap+20],@acc[5]
385e1051a39Sopenharmony_ci	ld	[$ap+24],@acc[6]
386e1051a39Sopenharmony_ci	call	__ecp_nistz256_add
387e1051a39Sopenharmony_ci	ld	[$ap+28],@acc[7]
388e1051a39Sopenharmony_ci	ret
389e1051a39Sopenharmony_ci	restore
390e1051a39Sopenharmony_ci.type	ecp_nistz256_add,#function
391e1051a39Sopenharmony_ci.size	ecp_nistz256_add,.-ecp_nistz256_add
392e1051a39Sopenharmony_ci
393e1051a39Sopenharmony_ci.align	32
394e1051a39Sopenharmony_ci__ecp_nistz256_add:
395e1051a39Sopenharmony_ci	ld	[$bp+0],$t0		! b[0]
396e1051a39Sopenharmony_ci	ld	[$bp+4],$t1
397e1051a39Sopenharmony_ci	ld	[$bp+8],$t2
398e1051a39Sopenharmony_ci	ld	[$bp+12],$t3
399e1051a39Sopenharmony_ci	addcc	@acc[0],$t0,@acc[0]
400e1051a39Sopenharmony_ci	ld	[$bp+16],$t4
401e1051a39Sopenharmony_ci	ld	[$bp+20],$t5
402e1051a39Sopenharmony_ci	addccc	@acc[1],$t1,@acc[1]
403e1051a39Sopenharmony_ci	ld	[$bp+24],$t6
404e1051a39Sopenharmony_ci	ld	[$bp+28],$t7
405e1051a39Sopenharmony_ci	addccc	@acc[2],$t2,@acc[2]
406e1051a39Sopenharmony_ci	addccc	@acc[3],$t3,@acc[3]
407e1051a39Sopenharmony_ci	addccc	@acc[4],$t4,@acc[4]
408e1051a39Sopenharmony_ci	addccc	@acc[5],$t5,@acc[5]
409e1051a39Sopenharmony_ci	addccc	@acc[6],$t6,@acc[6]
410e1051a39Sopenharmony_ci	addccc	@acc[7],$t7,@acc[7]
411e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
412e1051a39Sopenharmony_ci
413e1051a39Sopenharmony_ci.Lreduce_by_sub:
414e1051a39Sopenharmony_ci
415e1051a39Sopenharmony_ci	! if a+b >= modulus, subtract modulus.
416e1051a39Sopenharmony_ci	!
417e1051a39Sopenharmony_ci	! But since comparison implies subtraction, we subtract
418e1051a39Sopenharmony_ci	! modulus and then add it back if subtraction borrowed.
419e1051a39Sopenharmony_ci
420e1051a39Sopenharmony_ci	subcc	@acc[0],-1,@acc[0]
421e1051a39Sopenharmony_ci	subccc	@acc[1],-1,@acc[1]
422e1051a39Sopenharmony_ci	subccc	@acc[2],-1,@acc[2]
423e1051a39Sopenharmony_ci	subccc	@acc[3], 0,@acc[3]
424e1051a39Sopenharmony_ci	subccc	@acc[4], 0,@acc[4]
425e1051a39Sopenharmony_ci	subccc	@acc[5], 0,@acc[5]
426e1051a39Sopenharmony_ci	subccc	@acc[6], 1,@acc[6]
427e1051a39Sopenharmony_ci	subccc	@acc[7],-1,@acc[7]
428e1051a39Sopenharmony_ci	subc	$carry,0,$carry
429e1051a39Sopenharmony_ci
430e1051a39Sopenharmony_ci	! Note that because mod has special form, i.e. consists of
431e1051a39Sopenharmony_ci	! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
432e1051a39Sopenharmony_ci	! using value of borrow and its negative.
433e1051a39Sopenharmony_ci
434e1051a39Sopenharmony_ci	addcc	@acc[0],$carry,@acc[0]	! add synthesized modulus
435e1051a39Sopenharmony_ci	addccc	@acc[1],$carry,@acc[1]
436e1051a39Sopenharmony_ci	neg	$carry,$bi
437e1051a39Sopenharmony_ci	st	@acc[0],[$rp]
438e1051a39Sopenharmony_ci	addccc	@acc[2],$carry,@acc[2]
439e1051a39Sopenharmony_ci	st	@acc[1],[$rp+4]
440e1051a39Sopenharmony_ci	addccc	@acc[3],0,@acc[3]
441e1051a39Sopenharmony_ci	st	@acc[2],[$rp+8]
442e1051a39Sopenharmony_ci	addccc	@acc[4],0,@acc[4]
443e1051a39Sopenharmony_ci	st	@acc[3],[$rp+12]
444e1051a39Sopenharmony_ci	addccc	@acc[5],0,@acc[5]
445e1051a39Sopenharmony_ci	st	@acc[4],[$rp+16]
446e1051a39Sopenharmony_ci	addccc	@acc[6],$bi,@acc[6]
447e1051a39Sopenharmony_ci	st	@acc[5],[$rp+20]
448e1051a39Sopenharmony_ci	addc	@acc[7],$carry,@acc[7]
449e1051a39Sopenharmony_ci	st	@acc[6],[$rp+24]
450e1051a39Sopenharmony_ci	retl
451e1051a39Sopenharmony_ci	st	@acc[7],[$rp+28]
452e1051a39Sopenharmony_ci.type	__ecp_nistz256_add,#function
453e1051a39Sopenharmony_ci.size	__ecp_nistz256_add,.-__ecp_nistz256_add
454e1051a39Sopenharmony_ci
455e1051a39Sopenharmony_ci! void	ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
456e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_2
457e1051a39Sopenharmony_ci.align	32
458e1051a39Sopenharmony_ciecp_nistz256_mul_by_2:
459e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
460e1051a39Sopenharmony_ci	ld	[$ap],@acc[0]
461e1051a39Sopenharmony_ci	ld	[$ap+4],@acc[1]
462e1051a39Sopenharmony_ci	ld	[$ap+8],@acc[2]
463e1051a39Sopenharmony_ci	ld	[$ap+12],@acc[3]
464e1051a39Sopenharmony_ci	ld	[$ap+16],@acc[4]
465e1051a39Sopenharmony_ci	ld	[$ap+20],@acc[5]
466e1051a39Sopenharmony_ci	ld	[$ap+24],@acc[6]
467e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2
468e1051a39Sopenharmony_ci	ld	[$ap+28],@acc[7]
469e1051a39Sopenharmony_ci	ret
470e1051a39Sopenharmony_ci	restore
471e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_by_2,#function
472e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
473e1051a39Sopenharmony_ci
474e1051a39Sopenharmony_ci.align	32
475e1051a39Sopenharmony_ci__ecp_nistz256_mul_by_2:
476e1051a39Sopenharmony_ci	addcc	@acc[0],@acc[0],@acc[0]	! a+a=2*a
477e1051a39Sopenharmony_ci	addccc	@acc[1],@acc[1],@acc[1]
478e1051a39Sopenharmony_ci	addccc	@acc[2],@acc[2],@acc[2]
479e1051a39Sopenharmony_ci	addccc	@acc[3],@acc[3],@acc[3]
480e1051a39Sopenharmony_ci	addccc	@acc[4],@acc[4],@acc[4]
481e1051a39Sopenharmony_ci	addccc	@acc[5],@acc[5],@acc[5]
482e1051a39Sopenharmony_ci	addccc	@acc[6],@acc[6],@acc[6]
483e1051a39Sopenharmony_ci	addccc	@acc[7],@acc[7],@acc[7]
484e1051a39Sopenharmony_ci	b	.Lreduce_by_sub
485e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
486e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_by_2,#function
487e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
488e1051a39Sopenharmony_ci
489e1051a39Sopenharmony_ci! void	ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
490e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_3
491e1051a39Sopenharmony_ci.align	32
492e1051a39Sopenharmony_ciecp_nistz256_mul_by_3:
493e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
494e1051a39Sopenharmony_ci	ld	[$ap],@acc[0]
495e1051a39Sopenharmony_ci	ld	[$ap+4],@acc[1]
496e1051a39Sopenharmony_ci	ld	[$ap+8],@acc[2]
497e1051a39Sopenharmony_ci	ld	[$ap+12],@acc[3]
498e1051a39Sopenharmony_ci	ld	[$ap+16],@acc[4]
499e1051a39Sopenharmony_ci	ld	[$ap+20],@acc[5]
500e1051a39Sopenharmony_ci	ld	[$ap+24],@acc[6]
501e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_3
502e1051a39Sopenharmony_ci	ld	[$ap+28],@acc[7]
503e1051a39Sopenharmony_ci	ret
504e1051a39Sopenharmony_ci	restore
505e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_by_3,#function
506e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
507e1051a39Sopenharmony_ci
508e1051a39Sopenharmony_ci.align	32
509e1051a39Sopenharmony_ci__ecp_nistz256_mul_by_3:
510e1051a39Sopenharmony_ci	addcc	@acc[0],@acc[0],$t0	! a+a=2*a
511e1051a39Sopenharmony_ci	addccc	@acc[1],@acc[1],$t1
512e1051a39Sopenharmony_ci	addccc	@acc[2],@acc[2],$t2
513e1051a39Sopenharmony_ci	addccc	@acc[3],@acc[3],$t3
514e1051a39Sopenharmony_ci	addccc	@acc[4],@acc[4],$t4
515e1051a39Sopenharmony_ci	addccc	@acc[5],@acc[5],$t5
516e1051a39Sopenharmony_ci	addccc	@acc[6],@acc[6],$t6
517e1051a39Sopenharmony_ci	addccc	@acc[7],@acc[7],$t7
518e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
519e1051a39Sopenharmony_ci
520e1051a39Sopenharmony_ci	subcc	$t0,-1,$t0		! .Lreduce_by_sub but without stores
521e1051a39Sopenharmony_ci	subccc	$t1,-1,$t1
522e1051a39Sopenharmony_ci	subccc	$t2,-1,$t2
523e1051a39Sopenharmony_ci	subccc	$t3, 0,$t3
524e1051a39Sopenharmony_ci	subccc	$t4, 0,$t4
525e1051a39Sopenharmony_ci	subccc	$t5, 0,$t5
526e1051a39Sopenharmony_ci	subccc	$t6, 1,$t6
527e1051a39Sopenharmony_ci	subccc	$t7,-1,$t7
528e1051a39Sopenharmony_ci	subc	$carry,0,$carry
529e1051a39Sopenharmony_ci
530e1051a39Sopenharmony_ci	addcc	$t0,$carry,$t0		! add synthesized modulus
531e1051a39Sopenharmony_ci	addccc	$t1,$carry,$t1
532e1051a39Sopenharmony_ci	neg	$carry,$bi
533e1051a39Sopenharmony_ci	addccc	$t2,$carry,$t2
534e1051a39Sopenharmony_ci	addccc	$t3,0,$t3
535e1051a39Sopenharmony_ci	addccc	$t4,0,$t4
536e1051a39Sopenharmony_ci	addccc	$t5,0,$t5
537e1051a39Sopenharmony_ci	addccc	$t6,$bi,$t6
538e1051a39Sopenharmony_ci	addc	$t7,$carry,$t7
539e1051a39Sopenharmony_ci
540e1051a39Sopenharmony_ci	addcc	$t0,@acc[0],@acc[0]	! 2*a+a=3*a
541e1051a39Sopenharmony_ci	addccc	$t1,@acc[1],@acc[1]
542e1051a39Sopenharmony_ci	addccc	$t2,@acc[2],@acc[2]
543e1051a39Sopenharmony_ci	addccc	$t3,@acc[3],@acc[3]
544e1051a39Sopenharmony_ci	addccc	$t4,@acc[4],@acc[4]
545e1051a39Sopenharmony_ci	addccc	$t5,@acc[5],@acc[5]
546e1051a39Sopenharmony_ci	addccc	$t6,@acc[6],@acc[6]
547e1051a39Sopenharmony_ci	addccc	$t7,@acc[7],@acc[7]
548e1051a39Sopenharmony_ci	b	.Lreduce_by_sub
549e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
550e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_by_3,#function
551e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
552e1051a39Sopenharmony_ci
553e1051a39Sopenharmony_ci! void	ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8],
554e1051a39Sopenharmony_ci!				         const BN_ULONG %i2[8]);
555e1051a39Sopenharmony_ci.globl	ecp_nistz256_sub
556e1051a39Sopenharmony_ci.align	32
557e1051a39Sopenharmony_ciecp_nistz256_sub:
558e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
559e1051a39Sopenharmony_ci	ld	[$ap],@acc[0]
560e1051a39Sopenharmony_ci	ld	[$ap+4],@acc[1]
561e1051a39Sopenharmony_ci	ld	[$ap+8],@acc[2]
562e1051a39Sopenharmony_ci	ld	[$ap+12],@acc[3]
563e1051a39Sopenharmony_ci	ld	[$ap+16],@acc[4]
564e1051a39Sopenharmony_ci	ld	[$ap+20],@acc[5]
565e1051a39Sopenharmony_ci	ld	[$ap+24],@acc[6]
566e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from
567e1051a39Sopenharmony_ci	ld	[$ap+28],@acc[7]
568e1051a39Sopenharmony_ci	ret
569e1051a39Sopenharmony_ci	restore
570e1051a39Sopenharmony_ci.type	ecp_nistz256_sub,#function
571e1051a39Sopenharmony_ci.size	ecp_nistz256_sub,.-ecp_nistz256_sub
572e1051a39Sopenharmony_ci
573e1051a39Sopenharmony_ci! void	ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
574e1051a39Sopenharmony_ci.globl	ecp_nistz256_neg
575e1051a39Sopenharmony_ci.align	32
576e1051a39Sopenharmony_ciecp_nistz256_neg:
577e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
578e1051a39Sopenharmony_ci	mov	$ap,$bp
579e1051a39Sopenharmony_ci	mov	0,@acc[0]
580e1051a39Sopenharmony_ci	mov	0,@acc[1]
581e1051a39Sopenharmony_ci	mov	0,@acc[2]
582e1051a39Sopenharmony_ci	mov	0,@acc[3]
583e1051a39Sopenharmony_ci	mov	0,@acc[4]
584e1051a39Sopenharmony_ci	mov	0,@acc[5]
585e1051a39Sopenharmony_ci	mov	0,@acc[6]
586e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from
587e1051a39Sopenharmony_ci	mov	0,@acc[7]
588e1051a39Sopenharmony_ci	ret
589e1051a39Sopenharmony_ci	restore
590e1051a39Sopenharmony_ci.type	ecp_nistz256_neg,#function
591e1051a39Sopenharmony_ci.size	ecp_nistz256_neg,.-ecp_nistz256_neg
592e1051a39Sopenharmony_ci
593e1051a39Sopenharmony_ci.align	32
! Internal leaf helper, called from ecp_nistz256_neg and from the point
! routines in this file.
! In:   acc[0..7] = minuend a, least significant word in acc[0],
!       bp        = address of the eight 32-bit words of subtrahend b,
!       rp        = address of the eight-word result.
! Out:  a - b, plus the modulus if the subtraction borrowed, stored
!       at rp and also left in acc[0..7].
! Clobbers t0-t7, carry, bi and the integer condition codes.
! The loads of b are interleaved with the subtract-with-borrow chain.
594e1051a39Sopenharmony_ci__ecp_nistz256_sub_from:
595e1051a39Sopenharmony_ci	ld	[$bp+0],$t0		! b[0]
596e1051a39Sopenharmony_ci	ld	[$bp+4],$t1
597e1051a39Sopenharmony_ci	ld	[$bp+8],$t2
598e1051a39Sopenharmony_ci	ld	[$bp+12],$t3
599e1051a39Sopenharmony_ci	subcc	@acc[0],$t0,@acc[0]
600e1051a39Sopenharmony_ci	ld	[$bp+16],$t4
601e1051a39Sopenharmony_ci	ld	[$bp+20],$t5
602e1051a39Sopenharmony_ci	subccc	@acc[1],$t1,@acc[1]
603e1051a39Sopenharmony_ci	subccc	@acc[2],$t2,@acc[2]
604e1051a39Sopenharmony_ci	ld	[$bp+24],$t6
605e1051a39Sopenharmony_ci	ld	[$bp+28],$t7
606e1051a39Sopenharmony_ci	subccc	@acc[3],$t3,@acc[3]
607e1051a39Sopenharmony_ci	subccc	@acc[4],$t4,@acc[4]
608e1051a39Sopenharmony_ci	subccc	@acc[5],$t5,@acc[5]
609e1051a39Sopenharmony_ci	subccc	@acc[6],$t6,@acc[6]
610e1051a39Sopenharmony_ci	subccc	@acc[7],$t7,@acc[7]
611e1051a39Sopenharmony_ci	subc	%g0,%g0,$carry		! broadcast borrow bit
612e1051a39Sopenharmony_ci
	! Shared tail: also entered from __ecp_nistz256_sub_morf, with the
	! difference in acc[0..7] and the broadcast borrow in carry.
613e1051a39Sopenharmony_ci.Lreduce_by_add:
614e1051a39Sopenharmony_ci
615e1051a39Sopenharmony_ci	! if a-b borrows, add modulus.
616e1051a39Sopenharmony_ci	!
617e1051a39Sopenharmony_ci	! Note that because mod has special form, i.e. consists of
618e1051a39Sopenharmony_ci	! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
619e1051a39Sopenharmony_ci	! using value of broadcasted borrow and the borrow bit itself.
620e1051a39Sopenharmony_ci	! To minimize dependency chain we first broadcast and then
621e1051a39Sopenharmony_ci	! extract the bit by negating (follow $bi).
622e1051a39Sopenharmony_ci
	! carry is 0 or 0xffffffff and bi = -carry is 0 or 1. Words 0-2
	! and 7 are given carry, words 3-5 are given 0 and word 6 is given
	! bi, so the addend is either zero or the modulus. Each store is
	! issued right after the add that produces the word before it.
623e1051a39Sopenharmony_ci	addcc	@acc[0],$carry,@acc[0]	! add synthesized modulus
624e1051a39Sopenharmony_ci	addccc	@acc[1],$carry,@acc[1]
625e1051a39Sopenharmony_ci	neg	$carry,$bi
626e1051a39Sopenharmony_ci	st	@acc[0],[$rp]
627e1051a39Sopenharmony_ci	addccc	@acc[2],$carry,@acc[2]
628e1051a39Sopenharmony_ci	st	@acc[1],[$rp+4]
629e1051a39Sopenharmony_ci	addccc	@acc[3],0,@acc[3]
630e1051a39Sopenharmony_ci	st	@acc[2],[$rp+8]
631e1051a39Sopenharmony_ci	addccc	@acc[4],0,@acc[4]
632e1051a39Sopenharmony_ci	st	@acc[3],[$rp+12]
633e1051a39Sopenharmony_ci	addccc	@acc[5],0,@acc[5]
634e1051a39Sopenharmony_ci	st	@acc[4],[$rp+16]
635e1051a39Sopenharmony_ci	addccc	@acc[6],$bi,@acc[6]
636e1051a39Sopenharmony_ci	st	@acc[5],[$rp+20]
637e1051a39Sopenharmony_ci	addc	@acc[7],$carry,@acc[7]
638e1051a39Sopenharmony_ci	st	@acc[6],[$rp+24]
639e1051a39Sopenharmony_ci	retl
	! delay slot of the return: stores the most significant word
640e1051a39Sopenharmony_ci	st	@acc[7],[$rp+28]
641e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_from,#function
642e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
643e1051a39Sopenharmony_ci
644e1051a39Sopenharmony_ci.align	32
! Internal leaf helper with the operands of __ecp_nistz256_sub_from
! swapped: it computes b - acc instead of acc - b.
! In:   acc[0..7] = subtrahend, least significant word in acc[0],
!       bp        = address of the eight 32-bit words of minuend b,
!       rp        = address of the eight-word result.
! Out:  b - acc, plus the modulus if the subtraction borrowed; the
!       add-back, the stores to rp and the return all happen in the
!       shared .Lreduce_by_add tail of __ecp_nistz256_sub_from.
! Clobbers t0-t7, carry, bi and the integer condition codes.
645e1051a39Sopenharmony_ci__ecp_nistz256_sub_morf:
646e1051a39Sopenharmony_ci	ld	[$bp+0],$t0		! b[0]
647e1051a39Sopenharmony_ci	ld	[$bp+4],$t1
648e1051a39Sopenharmony_ci	ld	[$bp+8],$t2
649e1051a39Sopenharmony_ci	ld	[$bp+12],$t3
650e1051a39Sopenharmony_ci	subcc	$t0,@acc[0],@acc[0]
651e1051a39Sopenharmony_ci	ld	[$bp+16],$t4
652e1051a39Sopenharmony_ci	ld	[$bp+20],$t5
653e1051a39Sopenharmony_ci	subccc	$t1,@acc[1],@acc[1]
654e1051a39Sopenharmony_ci	subccc	$t2,@acc[2],@acc[2]
655e1051a39Sopenharmony_ci	ld	[$bp+24],$t6
656e1051a39Sopenharmony_ci	ld	[$bp+28],$t7
657e1051a39Sopenharmony_ci	subccc	$t3,@acc[3],@acc[3]
658e1051a39Sopenharmony_ci	subccc	$t4,@acc[4],@acc[4]
659e1051a39Sopenharmony_ci	subccc	$t5,@acc[5],@acc[5]
660e1051a39Sopenharmony_ci	subccc	$t6,@acc[6],@acc[6]
661e1051a39Sopenharmony_ci	subccc	$t7,@acc[7],@acc[7]
662e1051a39Sopenharmony_ci	b	.Lreduce_by_add
	! delay slot of the branch: executed before the shared tail runs
663e1051a39Sopenharmony_ci	subc	%g0,%g0,$carry		! broadcast borrow bit
664e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_morf,#function
665e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
666e1051a39Sopenharmony_ci
667e1051a39Sopenharmony_ci! void	ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
!
! Public wrapper: opens a register window, loads the eight input words
! into acc[0..7] and calls __ecp_nistz256_div_by_2, which halves the
! value and stores the result through the result pointer.
668e1051a39Sopenharmony_ci.globl	ecp_nistz256_div_by_2
669e1051a39Sopenharmony_ci.align	32
670e1051a39Sopenharmony_ciecp_nistz256_div_by_2:
671e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
672e1051a39Sopenharmony_ci	ld	[$ap],@acc[0]
673e1051a39Sopenharmony_ci	ld	[$ap+4],@acc[1]
674e1051a39Sopenharmony_ci	ld	[$ap+8],@acc[2]
675e1051a39Sopenharmony_ci	ld	[$ap+12],@acc[3]
676e1051a39Sopenharmony_ci	ld	[$ap+16],@acc[4]
677e1051a39Sopenharmony_ci	ld	[$ap+20],@acc[5]
678e1051a39Sopenharmony_ci	ld	[$ap+24],@acc[6]
679e1051a39Sopenharmony_ci	call	__ecp_nistz256_div_by_2
	! delay slot of the call: loads the last input word
680e1051a39Sopenharmony_ci	ld	[$ap+28],@acc[7]
681e1051a39Sopenharmony_ci	ret
682e1051a39Sopenharmony_ci	restore
683e1051a39Sopenharmony_ci.type	ecp_nistz256_div_by_2,#function
684e1051a39Sopenharmony_ci.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
685e1051a39Sopenharmony_ci
686e1051a39Sopenharmony_ci.align	32
! Internal leaf helper.
! In:   acc[0..7] = a, least significant word in acc[0],
!       rp        = address of the eight-word result.
! Out:  the halved value, stored at rp and also left in acc[0..7].
! Clobbers t0-t7, bi, carry and the integer condition codes.
687e1051a39Sopenharmony_ci__ecp_nistz256_div_by_2:
688e1051a39Sopenharmony_ci	! ret = (a is odd ? a+mod : a) >> 1
689e1051a39Sopenharmony_ci
	! bi is the low bit of a and carry = -bi is all-ones when a is odd.
	! Words 0-2 and 7 are given carry, words 3-5 are given 0 and word 6
	! is given bi, so the addend is either zero or the modulus (same
	! word pattern as in .Lreduce_by_add). The closing addc records the
	! carry out of word 7, which is bit 256 of the sum.
690e1051a39Sopenharmony_ci	and	@acc[0],1,$bi
691e1051a39Sopenharmony_ci	neg	$bi,$carry
692e1051a39Sopenharmony_ci	addcc	@acc[0],$carry,@acc[0]
693e1051a39Sopenharmony_ci	addccc	@acc[1],$carry,@acc[1]
694e1051a39Sopenharmony_ci	addccc	@acc[2],$carry,@acc[2]
695e1051a39Sopenharmony_ci	addccc	@acc[3],0,@acc[3]
696e1051a39Sopenharmony_ci	addccc	@acc[4],0,@acc[4]
697e1051a39Sopenharmony_ci	addccc	@acc[5],0,@acc[5]
698e1051a39Sopenharmony_ci	addccc	@acc[6],$bi,@acc[6]
699e1051a39Sopenharmony_ci	addccc	@acc[7],$carry,@acc[7]
700e1051a39Sopenharmony_ci	addc	%g0,%g0,$carry
701e1051a39Sopenharmony_ci
702e1051a39Sopenharmony_ci	! ret >>= 1
703e1051a39Sopenharmony_ci
	! Each word is shifted right by one and receives, as its new top
	! bit, the low bit of the next more significant word; the top word
	! receives the saved carry-out instead. Stores are interleaved
	! with the shifts of the following words.
704e1051a39Sopenharmony_ci	srl	@acc[0],1,@acc[0]
705e1051a39Sopenharmony_ci	sll	@acc[1],31,$t0
706e1051a39Sopenharmony_ci	srl	@acc[1],1,@acc[1]
707e1051a39Sopenharmony_ci	or	@acc[0],$t0,@acc[0]
708e1051a39Sopenharmony_ci	sll	@acc[2],31,$t1
709e1051a39Sopenharmony_ci	srl	@acc[2],1,@acc[2]
710e1051a39Sopenharmony_ci	or	@acc[1],$t1,@acc[1]
711e1051a39Sopenharmony_ci	sll	@acc[3],31,$t2
712e1051a39Sopenharmony_ci	st	@acc[0],[$rp]
713e1051a39Sopenharmony_ci	srl	@acc[3],1,@acc[3]
714e1051a39Sopenharmony_ci	or	@acc[2],$t2,@acc[2]
715e1051a39Sopenharmony_ci	sll	@acc[4],31,$t3
716e1051a39Sopenharmony_ci	st	@acc[1],[$rp+4]
717e1051a39Sopenharmony_ci	srl	@acc[4],1,@acc[4]
718e1051a39Sopenharmony_ci	or	@acc[3],$t3,@acc[3]
719e1051a39Sopenharmony_ci	sll	@acc[5],31,$t4
720e1051a39Sopenharmony_ci	st	@acc[2],[$rp+8]
721e1051a39Sopenharmony_ci	srl	@acc[5],1,@acc[5]
722e1051a39Sopenharmony_ci	or	@acc[4],$t4,@acc[4]
723e1051a39Sopenharmony_ci	sll	@acc[6],31,$t5
724e1051a39Sopenharmony_ci	st	@acc[3],[$rp+12]
725e1051a39Sopenharmony_ci	srl	@acc[6],1,@acc[6]
726e1051a39Sopenharmony_ci	or	@acc[5],$t5,@acc[5]
727e1051a39Sopenharmony_ci	sll	@acc[7],31,$t6
728e1051a39Sopenharmony_ci	st	@acc[4],[$rp+16]
729e1051a39Sopenharmony_ci	srl	@acc[7],1,@acc[7]
730e1051a39Sopenharmony_ci	or	@acc[6],$t6,@acc[6]
731e1051a39Sopenharmony_ci	sll	$carry,31,$t7
732e1051a39Sopenharmony_ci	st	@acc[5],[$rp+20]
733e1051a39Sopenharmony_ci	or	@acc[7],$t7,@acc[7]
734e1051a39Sopenharmony_ci	st	@acc[6],[$rp+24]
735e1051a39Sopenharmony_ci	retl
	! delay slot of the return: stores the most significant word
736e1051a39Sopenharmony_ci	st	@acc[7],[$rp+28]
737e1051a39Sopenharmony_ci.type	__ecp_nistz256_div_by_2,#function
738e1051a39Sopenharmony_ci.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
739e1051a39Sopenharmony_ci___
740e1051a39Sopenharmony_ci
741e1051a39Sopenharmony_ci########################################################################
742e1051a39Sopenharmony_ci# The following subroutines are "literal" implementations of those found in
743e1051a39Sopenharmony_ci# ecp_nistz256.c
744e1051a39Sopenharmony_ci#
745e1051a39Sopenharmony_ci########################################################################
746e1051a39Sopenharmony_ci# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
747e1051a39Sopenharmony_ci#
748e1051a39Sopenharmony_ci{
749e1051a39Sopenharmony_cimy ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
750e1051a39Sopenharmony_ci# above map() describes stack layout with 4 temporary
751e1051a39Sopenharmony_ci# 256-bit vectors on top.
# $S, $M, $Zsqr and $tmp0 are therefore the byte offsets 0, 32, 64 and 96,
# always used below in the form LOCALS+offset relative to %sp.
752e1051a39Sopenharmony_ci
753e1051a39Sopenharmony_ci$code.=<<___;
754e1051a39Sopenharmony_ci#ifdef __PIC__
755e1051a39Sopenharmony_ciSPARC_PIC_THUNK(%g1)
756e1051a39Sopenharmony_ci#endif
757e1051a39Sopenharmony_ci
758e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_double
759e1051a39Sopenharmony_ci.align	32
760e1051a39Sopenharmony_ciecp_nistz256_point_double:
	! Dispatch: if both the VIS3 and the 64-bit-stack capability bits
	! are set in the first capability word, branch to the _vis3
	! variant before any register window is opened here.
761e1051a39Sopenharmony_ci	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
762e1051a39Sopenharmony_ci	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
763e1051a39Sopenharmony_ci	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
764e1051a39Sopenharmony_ci	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
765e1051a39Sopenharmony_ci	be	ecp_nistz256_point_double_vis3
766e1051a39Sopenharmony_ci	nop
767e1051a39Sopenharmony_ci
	! frame reserves room for the four 32-byte temporaries
768e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME-32*4,%sp
769e1051a39Sopenharmony_ci
	! keep the caller pointers; rp, ap and bp are re-pointed per call
770e1051a39Sopenharmony_ci	mov	$rp,$rp_real
771e1051a39Sopenharmony_ci	mov	$ap,$ap_real
772e1051a39Sopenharmony_ci
	! Also reached from .Ladd_double in ecp_nistz256_point_add, which
	! loads rp_real and ap and adjusts sp to this frame size first.
	!
	! Convention used below: the instruction following each call sits
	! in the delay slot and sets the result pointer. Where no operand
	! loads precede a call, the helper works on the value the previous
	! helper left in the accumulator registers. That holds for
	! __ecp_nistz256_sub_from and __ecp_nistz256_div_by_2 as defined in
	! this file; presumably also for the helpers defined elsewhere
	! (mul_mont, mul_by_2, mul_by_3, add) - TODO confirm.
773e1051a39Sopenharmony_ci.Lpoint_double_shortcut:
774e1051a39Sopenharmony_ci	ld	[$ap+32],@acc[0]
775e1051a39Sopenharmony_ci	ld	[$ap+32+4],@acc[1]
776e1051a39Sopenharmony_ci	ld	[$ap+32+8],@acc[2]
777e1051a39Sopenharmony_ci	ld	[$ap+32+12],@acc[3]
778e1051a39Sopenharmony_ci	ld	[$ap+32+16],@acc[4]
779e1051a39Sopenharmony_ci	ld	[$ap+32+20],@acc[5]
780e1051a39Sopenharmony_ci	ld	[$ap+32+24],@acc[6]
781e1051a39Sopenharmony_ci	ld	[$ap+32+28],@acc[7]
782e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(S, in_y);
783e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$rp
784e1051a39Sopenharmony_ci
	! squaring is done by passing the same address in both bp and ap
785e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
786e1051a39Sopenharmony_ci	add	$ap_real,64,$ap
787e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Zsqr, in_z);
788e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Zsqr,$rp
789e1051a39Sopenharmony_ci
790e1051a39Sopenharmony_ci	add	$ap_real,0,$bp
791e1051a39Sopenharmony_ci	call	__ecp_nistz256_add	! p256_add(M, Zsqr, in_x);
792e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$rp
793e1051a39Sopenharmony_ci
794e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$bp
795e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$ap
796e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(S, S);
797e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$rp
798e1051a39Sopenharmony_ci
	! in_x is loaded into the accumulator as the minuend; Zsqr on the
	! stack is the subtrahend and is overwritten with the difference
799e1051a39Sopenharmony_ci	ld	[$ap_real],@acc[0]
800e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Zsqr,$bp
801e1051a39Sopenharmony_ci	ld	[$ap_real+4],@acc[1]
802e1051a39Sopenharmony_ci	ld	[$ap_real+8],@acc[2]
803e1051a39Sopenharmony_ci	ld	[$ap_real+12],@acc[3]
804e1051a39Sopenharmony_ci	ld	[$ap_real+16],@acc[4]
805e1051a39Sopenharmony_ci	ld	[$ap_real+20],@acc[5]
806e1051a39Sopenharmony_ci	ld	[$ap_real+24],@acc[6]
807e1051a39Sopenharmony_ci	ld	[$ap_real+28],@acc[7]
808e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(Zsqr, in_x, Zsqr);
809e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Zsqr,$rp
810e1051a39Sopenharmony_ci
811e1051a39Sopenharmony_ci	add	$ap_real,32,$bp
812e1051a39Sopenharmony_ci	add	$ap_real,64,$ap
813e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(tmp0, in_z, in_y);
814e1051a39Sopenharmony_ci	add	%sp,LOCALS+$tmp0,$rp
815e1051a39Sopenharmony_ci
816e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(res_z, tmp0);
817e1051a39Sopenharmony_ci	add	$rp_real,64,$rp
818e1051a39Sopenharmony_ci
819e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Zsqr,$bp
820e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$ap
821e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(M, M, Zsqr);
822e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$rp
823e1051a39Sopenharmony_ci
824e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_3	! p256_mul_by_3(M, M);
825e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$rp
826e1051a39Sopenharmony_ci
827e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$bp
828e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$ap
829e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(tmp0, S);
830e1051a39Sopenharmony_ci	add	%sp,LOCALS+$tmp0,$rp
831e1051a39Sopenharmony_ci
832e1051a39Sopenharmony_ci	call	__ecp_nistz256_div_by_2	! p256_div_by_2(res_y, tmp0);
833e1051a39Sopenharmony_ci	add	$rp_real,32,$rp
834e1051a39Sopenharmony_ci
835e1051a39Sopenharmony_ci	add	$ap_real,0,$bp
836e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$ap
837e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S, S, in_x);
838e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$rp
839e1051a39Sopenharmony_ci
840e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(tmp0, S);
841e1051a39Sopenharmony_ci	add	%sp,LOCALS+$tmp0,$rp
842e1051a39Sopenharmony_ci
843e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$bp
844e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$ap
845e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(res_x, M);
846e1051a39Sopenharmony_ci	add	$rp_real,0,$rp
847e1051a39Sopenharmony_ci
848e1051a39Sopenharmony_ci	add	%sp,LOCALS+$tmp0,$bp
849e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(res_x, res_x, tmp0);
850e1051a39Sopenharmony_ci	add	$rp_real,0,$rp
851e1051a39Sopenharmony_ci
	! sub_morf computes [bp] - accumulator, so S is the minuend here
	! and the res_x value still held in the accumulator is subtracted
852e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$bp
853e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf	! p256_sub(S, S, res_x);
854e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$rp
855e1051a39Sopenharmony_ci
856e1051a39Sopenharmony_ci	add	%sp,LOCALS+$M,$bp
857e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$ap
858e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S, S, M);
859e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S,$rp
860e1051a39Sopenharmony_ci
861e1051a39Sopenharmony_ci	add	$rp_real,32,$bp
862e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(res_y, S, res_y);
863e1051a39Sopenharmony_ci	add	$rp_real,32,$rp
864e1051a39Sopenharmony_ci
865e1051a39Sopenharmony_ci	ret
866e1051a39Sopenharmony_ci	restore
867e1051a39Sopenharmony_ci.type	ecp_nistz256_point_double,#function
868e1051a39Sopenharmony_ci.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
869e1051a39Sopenharmony_ci___
870e1051a39Sopenharmony_ci}
871e1051a39Sopenharmony_ci
872e1051a39Sopenharmony_ci########################################################################
873e1051a39Sopenharmony_ci# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
874e1051a39Sopenharmony_ci#			      const P256_POINT *in2);
875e1051a39Sopenharmony_ci{
876e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
877e1051a39Sopenharmony_ci    $H,$Hsqr,$R,$Rsqr,$Hcub,
878e1051a39Sopenharmony_ci    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
879e1051a39Sopenharmony_cimy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# $Z1sqr and $Z2sqr get no slots of their own: they reuse the offsets of
# $Hsqr and $Rsqr.
880e1051a39Sopenharmony_ci
881e1051a39Sopenharmony_ci# above map() describes stack layout with 12 temporary
882e1051a39Sopenharmony_ci# 256-bit vectors on top. Then we reserve some space for
883e1051a39Sopenharmony_ci# !in1infty, !in2infty, result of check for zero and return pointer.
884e1051a39Sopenharmony_ci
885e1051a39Sopenharmony_cimy $bp_real=$rp_real;
# $bp_real names the same register as $rp_real, which is why the generated
# code parks the result pointer on the stack ("off-load $rp") and reloads
# it only when the output is written.
886e1051a39Sopenharmony_ci
887e1051a39Sopenharmony_ci$code.=<<___;
888e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add
889e1051a39Sopenharmony_ci.align	32
890e1051a39Sopenharmony_ciecp_nistz256_point_add:
	! Dispatch: if both the VIS3 and the 64-bit-stack capability bits
	! are set in the first capability word, branch to the _vis3
	! variant before any register window is opened here.
891e1051a39Sopenharmony_ci	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
892e1051a39Sopenharmony_ci	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
893e1051a39Sopenharmony_ci	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
894e1051a39Sopenharmony_ci	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
895e1051a39Sopenharmony_ci	be	ecp_nistz256_point_add_vis3
896e1051a39Sopenharmony_ci	nop
897e1051a39Sopenharmony_ci
	! frame reserves twelve 32-byte temporaries plus 32 bytes of
	! scratch, of which the following offsets from fp+STACK_BIAS
	! are used:
	!    -8  result pointer (64-bit store)
	!   -12  !in2infty: all-ones if any word of in2_z is non-zero
	!   -16  !in1infty: all-ones if any word of in1_z is non-zero
	!   -20  OR of the words of R = S2 - S1 (zero when S1 equals S2)
898e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME-32*12-32,%sp
899e1051a39Sopenharmony_ci
900e1051a39Sopenharmony_ci	stx	$rp,[%fp+STACK_BIAS-8]	! off-load $rp
901e1051a39Sopenharmony_ci	mov	$ap,$ap_real
902e1051a39Sopenharmony_ci	mov	$bp,$bp_real
903e1051a39Sopenharmony_ci
904e1051a39Sopenharmony_ci	ld	[$bp+64],$t0		! in2_z
905e1051a39Sopenharmony_ci	ld	[$bp+64+4],$t1
906e1051a39Sopenharmony_ci	ld	[$bp+64+8],$t2
907e1051a39Sopenharmony_ci	ld	[$bp+64+12],$t3
908e1051a39Sopenharmony_ci	ld	[$bp+64+16],$t4
909e1051a39Sopenharmony_ci	ld	[$bp+64+20],$t5
910e1051a39Sopenharmony_ci	ld	[$bp+64+24],$t6
911e1051a39Sopenharmony_ci	ld	[$bp+64+28],$t7
912e1051a39Sopenharmony_ci	or	$t1,$t0,$t0
913e1051a39Sopenharmony_ci	or	$t3,$t2,$t2
914e1051a39Sopenharmony_ci	or	$t5,$t4,$t4
915e1051a39Sopenharmony_ci	or	$t7,$t6,$t6
916e1051a39Sopenharmony_ci	or	$t2,$t0,$t0
917e1051a39Sopenharmony_ci	or	$t6,$t4,$t4
918e1051a39Sopenharmony_ci	or	$t4,$t0,$t0		! !in2infty
919e1051a39Sopenharmony_ci	movrnz	$t0,-1,$t0
920e1051a39Sopenharmony_ci	st	$t0,[%fp+STACK_BIAS-12]
921e1051a39Sopenharmony_ci
922e1051a39Sopenharmony_ci	ld	[$ap+64],$t0		! in1_z
923e1051a39Sopenharmony_ci	ld	[$ap+64+4],$t1
924e1051a39Sopenharmony_ci	ld	[$ap+64+8],$t2
925e1051a39Sopenharmony_ci	ld	[$ap+64+12],$t3
926e1051a39Sopenharmony_ci	ld	[$ap+64+16],$t4
927e1051a39Sopenharmony_ci	ld	[$ap+64+20],$t5
928e1051a39Sopenharmony_ci	ld	[$ap+64+24],$t6
929e1051a39Sopenharmony_ci	ld	[$ap+64+28],$t7
930e1051a39Sopenharmony_ci	or	$t1,$t0,$t0
931e1051a39Sopenharmony_ci	or	$t3,$t2,$t2
932e1051a39Sopenharmony_ci	or	$t5,$t4,$t4
933e1051a39Sopenharmony_ci	or	$t7,$t6,$t6
934e1051a39Sopenharmony_ci	or	$t2,$t0,$t0
935e1051a39Sopenharmony_ci	or	$t6,$t4,$t4
936e1051a39Sopenharmony_ci	or	$t4,$t0,$t0		! !in1infty
937e1051a39Sopenharmony_ci	movrnz	$t0,-1,$t0
938e1051a39Sopenharmony_ci	st	$t0,[%fp+STACK_BIAS-16]
939e1051a39Sopenharmony_ci
	! Convention used below: the instruction following each call sits
	! in the delay slot and sets the result pointer; squaring passes
	! the same address in both bp and ap. Where no operand loads
	! precede a call, the helper works on the value the previous
	! helper left in the accumulator registers (true for the sub
	! helpers defined in this file; presumably also for mul_mont and
	! mul_by_2, which are defined elsewhere - TODO confirm).
940e1051a39Sopenharmony_ci	add	$bp_real,64,$bp
941e1051a39Sopenharmony_ci	add	$bp_real,64,$ap
942e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z2sqr, in2_z);
943e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z2sqr,$rp
944e1051a39Sopenharmony_ci
945e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
946e1051a39Sopenharmony_ci	add	$ap_real,64,$ap
947e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z1sqr, in1_z);
948e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z1sqr,$rp
949e1051a39Sopenharmony_ci
950e1051a39Sopenharmony_ci	add	$bp_real,64,$bp
951e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z2sqr,$ap
952e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S1, Z2sqr, in2_z);
953e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S1,$rp
954e1051a39Sopenharmony_ci
955e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
956e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z1sqr,$ap
957e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, Z1sqr, in1_z);
958e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$rp
959e1051a39Sopenharmony_ci
960e1051a39Sopenharmony_ci	add	$ap_real,32,$bp
961e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S1,$ap
962e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S1, S1, in1_y);
963e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S1,$rp
964e1051a39Sopenharmony_ci
965e1051a39Sopenharmony_ci	add	$bp_real,32,$bp
966e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$ap
967e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S2, in2_y);
968e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$rp
969e1051a39Sopenharmony_ci
970e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S1,$bp
971e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(R, S2, S1);
972e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$rp
973e1051a39Sopenharmony_ci
	! sub_from leaves R in the accumulator; fold its eight words into
	! one and save that for the S1/S2 equality test further down
974e1051a39Sopenharmony_ci	or	@acc[1],@acc[0],@acc[0]	! see if result is zero
975e1051a39Sopenharmony_ci	or	@acc[3],@acc[2],@acc[2]
976e1051a39Sopenharmony_ci	or	@acc[5],@acc[4],@acc[4]
977e1051a39Sopenharmony_ci	or	@acc[7],@acc[6],@acc[6]
978e1051a39Sopenharmony_ci	or	@acc[2],@acc[0],@acc[0]
979e1051a39Sopenharmony_ci	or	@acc[6],@acc[4],@acc[4]
980e1051a39Sopenharmony_ci	or	@acc[4],@acc[0],@acc[0]
981e1051a39Sopenharmony_ci	st	@acc[0],[%fp+STACK_BIAS-20]
982e1051a39Sopenharmony_ci
983e1051a39Sopenharmony_ci	add	$ap_real,0,$bp
984e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z2sqr,$ap
985e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U1, in1_x, Z2sqr);
986e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U1,$rp
987e1051a39Sopenharmony_ci
988e1051a39Sopenharmony_ci	add	$bp_real,0,$bp
989e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z1sqr,$ap
990e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, in2_x, Z1sqr);
991e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U2,$rp
992e1051a39Sopenharmony_ci
993e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U1,$bp
994e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(H, U2, U1);
995e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$rp
996e1051a39Sopenharmony_ci
	! same fold for H, this time setting the condition codes directly
997e1051a39Sopenharmony_ci	or	@acc[1],@acc[0],@acc[0]	! see if result is zero
998e1051a39Sopenharmony_ci	or	@acc[3],@acc[2],@acc[2]
999e1051a39Sopenharmony_ci	or	@acc[5],@acc[4],@acc[4]
1000e1051a39Sopenharmony_ci	or	@acc[7],@acc[6],@acc[6]
1001e1051a39Sopenharmony_ci	or	@acc[2],@acc[0],@acc[0]
1002e1051a39Sopenharmony_ci	or	@acc[6],@acc[4],@acc[4]
1003e1051a39Sopenharmony_ci	orcc	@acc[4],@acc[0],@acc[0]
1004e1051a39Sopenharmony_ci
	! H non-zero (U1 differs from U2): take the generic addition path
1005e1051a39Sopenharmony_ci	bne,pt	%icc,.Ladd_proceed	! is_equal(U1,U2)?
1006e1051a39Sopenharmony_ci	nop
1007e1051a39Sopenharmony_ci
	! H is zero. If either input is at infinity its mask is zero, the
	! AND below is zero and the generic path is still taken; the
	! conditional moves at the end then select the other input.
1008e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-12],$t0
1009e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-16],$t1
1010e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-20],$t2
1011e1051a39Sopenharmony_ci	andcc	$t0,$t1,%g0
1012e1051a39Sopenharmony_ci	be,pt	%icc,.Ladd_proceed	! (in1infty || in2infty)?
1013e1051a39Sopenharmony_ci	nop
	! saved OR of the R words is zero when S1 equals S2: double in1
1014e1051a39Sopenharmony_ci	andcc	$t2,$t2,%g0
1015e1051a39Sopenharmony_ci	be,pt	%icc,.Ladd_double	! is_equal(S1,S2)?
1016e1051a39Sopenharmony_ci	nop
1017e1051a39Sopenharmony_ci
	! U1 equals U2, neither input is at infinity and S1 differs from
	! S2: write 24 zero words to the output, i.e. x, y and z all zero
	! (an all-zero z is what the entry code above treats as infinity).
1018e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$rp
1019e1051a39Sopenharmony_ci	st	%g0,[$rp]
1020e1051a39Sopenharmony_ci	st	%g0,[$rp+4]
1021e1051a39Sopenharmony_ci	st	%g0,[$rp+8]
1022e1051a39Sopenharmony_ci	st	%g0,[$rp+12]
1023e1051a39Sopenharmony_ci	st	%g0,[$rp+16]
1024e1051a39Sopenharmony_ci	st	%g0,[$rp+20]
1025e1051a39Sopenharmony_ci	st	%g0,[$rp+24]
1026e1051a39Sopenharmony_ci	st	%g0,[$rp+28]
1027e1051a39Sopenharmony_ci	st	%g0,[$rp+32]
1028e1051a39Sopenharmony_ci	st	%g0,[$rp+32+4]
1029e1051a39Sopenharmony_ci	st	%g0,[$rp+32+8]
1030e1051a39Sopenharmony_ci	st	%g0,[$rp+32+12]
1031e1051a39Sopenharmony_ci	st	%g0,[$rp+32+16]
1032e1051a39Sopenharmony_ci	st	%g0,[$rp+32+20]
1033e1051a39Sopenharmony_ci	st	%g0,[$rp+32+24]
1034e1051a39Sopenharmony_ci	st	%g0,[$rp+32+28]
1035e1051a39Sopenharmony_ci	st	%g0,[$rp+64]
1036e1051a39Sopenharmony_ci	st	%g0,[$rp+64+4]
1037e1051a39Sopenharmony_ci	st	%g0,[$rp+64+8]
1038e1051a39Sopenharmony_ci	st	%g0,[$rp+64+12]
1039e1051a39Sopenharmony_ci	st	%g0,[$rp+64+16]
1040e1051a39Sopenharmony_ci	st	%g0,[$rp+64+20]
1041e1051a39Sopenharmony_ci	st	%g0,[$rp+64+24]
1042e1051a39Sopenharmony_ci	st	%g0,[$rp+64+28]
1043e1051a39Sopenharmony_ci	b	.Ladd_done
1044e1051a39Sopenharmony_ci	nop
1045e1051a39Sopenharmony_ci
1046e1051a39Sopenharmony_ci.align	16
	! Hand-over to the doubling code: reload the result pointer into
	! rp_real, point ap at in1, and (in the branch delay slot) raise
	! sp so the frame matches the one ecp_nistz256_point_double uses.
1047e1051a39Sopenharmony_ci.Ladd_double:
1048e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$rp_real
1049e1051a39Sopenharmony_ci	mov	$ap_real,$ap
1050e1051a39Sopenharmony_ci	b	.Lpoint_double_shortcut
1051e1051a39Sopenharmony_ci	add	%sp,32*(12-4)+32,%sp	! difference in frame sizes
1052e1051a39Sopenharmony_ci
1053e1051a39Sopenharmony_ci.align	16
	! Generic addition; R, H, U1 and S1 are already on the stack
1054e1051a39Sopenharmony_ci.Ladd_proceed:
1055e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$bp
1056e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$ap
1057e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Rsqr, R);
1058e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Rsqr,$rp
1059e1051a39Sopenharmony_ci
1060e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
1061e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$ap
1062e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, H, in1_z);
1063e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_z,$rp
1064e1051a39Sopenharmony_ci
1065e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$bp
1066e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$ap
1067e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Hsqr, H);
1068e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$rp
1069e1051a39Sopenharmony_ci
1070e1051a39Sopenharmony_ci	add	$bp_real,64,$bp
1071e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_z,$ap
1072e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, res_z, in2_z);
1073e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_z,$rp
1074e1051a39Sopenharmony_ci
1075e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$bp
1076e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$ap
1077e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(Hcub, Hsqr, H);
1078e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hcub,$rp
1079e1051a39Sopenharmony_ci
1080e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U1,$bp
1081e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$ap
1082e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, U1, Hsqr);
1083e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U2,$rp
1084e1051a39Sopenharmony_ci
1085e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(Hsqr, U2);
1086e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$rp
1087e1051a39Sopenharmony_ci
	! sub_morf computes [bp] - accumulator: Rsqr minus the doubled
	! value still held in the accumulator
1088e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Rsqr,$bp
1089e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf	! p256_sub(res_x, Rsqr, Hsqr);
1090e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_x,$rp
1091e1051a39Sopenharmony_ci
1092e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hcub,$bp
1093e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	!  p256_sub(res_x, res_x, Hcub);
1094e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_x,$rp
1095e1051a39Sopenharmony_ci
1096e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U2,$bp
1097e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf	! p256_sub(res_y, U2, res_x);
1098e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$rp
1099e1051a39Sopenharmony_ci
1100e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hcub,$bp
1101e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S1,$ap
1102e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S1, Hcub);
1103e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$rp
1104e1051a39Sopenharmony_ci
1105e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$bp
1106e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$ap
1107e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_y, res_y, R);
1108e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$rp
1109e1051a39Sopenharmony_ci
1110e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$bp
1111e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(res_y, res_y, S2);
1112e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$rp
1113e1051a39Sopenharmony_ci
	! reload the infinity masks and the real result pointer for the
	! copy-out sequence that follows
1114e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-16],$t1	! !in1infty
1115e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-12],$t2	! !in2infty
1116e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$rp
1117e1051a39Sopenharmony_ci___
# Copy the 96-byte result (res_x, res_y, res_z at offsets 0, 32 and 64 of
# the locals area) to the output, two 32-bit words per iteration. The
# first pair of movrz replaces the computed words with those of in2 when
# $t1 (!in1infty) is zero; the second pair replaces them with those of in1
# when $t2 (!in2infty) is zero.
1118e1051a39Sopenharmony_cifor($i=0;$i<96;$i+=8) {			# conditional moves
1119e1051a39Sopenharmony_ci$code.=<<___;
1120e1051a39Sopenharmony_ci	ld	[%sp+LOCALS+$i],@acc[0]		! res
1121e1051a39Sopenharmony_ci	ld	[%sp+LOCALS+$i+4],@acc[1]
1122e1051a39Sopenharmony_ci	ld	[$bp_real+$i],@acc[2]		! in2
1123e1051a39Sopenharmony_ci	ld	[$bp_real+$i+4],@acc[3]
1124e1051a39Sopenharmony_ci	ld	[$ap_real+$i],@acc[4]		! in1
1125e1051a39Sopenharmony_ci	ld	[$ap_real+$i+4],@acc[5]
1126e1051a39Sopenharmony_ci	movrz	$t1,@acc[2],@acc[0]
1127e1051a39Sopenharmony_ci	movrz	$t1,@acc[3],@acc[1]
1128e1051a39Sopenharmony_ci	movrz	$t2,@acc[4],@acc[0]
1129e1051a39Sopenharmony_ci	movrz	$t2,@acc[5],@acc[1]
1130e1051a39Sopenharmony_ci	st	@acc[0],[$rp+$i]
1131e1051a39Sopenharmony_ci	st	@acc[1],[$rp+$i+4]
1132e1051a39Sopenharmony_ci___
1133e1051a39Sopenharmony_ci}
1134e1051a39Sopenharmony_ci$code.=<<___;
1135e1051a39Sopenharmony_ci.Ladd_done:
1136e1051a39Sopenharmony_ci	ret
1137e1051a39Sopenharmony_ci	restore
1138e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add,#function
1139e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1140e1051a39Sopenharmony_ci___
1141e1051a39Sopenharmony_ci}
1142e1051a39Sopenharmony_ci
1143e1051a39Sopenharmony_ci########################################################################
1144e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1145e1051a39Sopenharmony_ci#				     const P256_POINT_AFFINE *in2);
1146e1051a39Sopenharmony_ci{
1147e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
1148e1051a39Sopenharmony_ci    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1149e1051a39Sopenharmony_cimy $Z1sqr = $S2;
1150e1051a39Sopenharmony_ci# above map() describes stack layout with 10 temporary
1151e1051a39Sopenharmony_ci# 256-bit vectors on top. Then we reserve some space for
1152e1051a39Sopenharmony_ci# !in1infty, !in2infty, result of check for zero and return pointer.
1153e1051a39Sopenharmony_ci
1154e1051a39Sopenharmony_cimy @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1155e1051a39Sopenharmony_cimy $bp_real=$rp_real;
1156e1051a39Sopenharmony_ci
1157e1051a39Sopenharmony_ci$code.=<<___;
1158e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add_affine
1159e1051a39Sopenharmony_ci.align	32
1160e1051a39Sopenharmony_ciecp_nistz256_point_add_affine:
1161e1051a39Sopenharmony_ci	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1162e1051a39Sopenharmony_ci	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
1163e1051a39Sopenharmony_ci	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1164e1051a39Sopenharmony_ci	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1165e1051a39Sopenharmony_ci	be	ecp_nistz256_point_add_affine_vis3
1166e1051a39Sopenharmony_ci	nop
1167e1051a39Sopenharmony_ci
1168e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME-32*10-32,%sp
1169e1051a39Sopenharmony_ci
1170e1051a39Sopenharmony_ci	stx	$rp,[%fp+STACK_BIAS-8]	! off-load $rp
1171e1051a39Sopenharmony_ci	mov	$ap,$ap_real
1172e1051a39Sopenharmony_ci	mov	$bp,$bp_real
1173e1051a39Sopenharmony_ci
1174e1051a39Sopenharmony_ci	ld	[$ap+64],$t0		! in1_z
1175e1051a39Sopenharmony_ci	ld	[$ap+64+4],$t1
1176e1051a39Sopenharmony_ci	ld	[$ap+64+8],$t2
1177e1051a39Sopenharmony_ci	ld	[$ap+64+12],$t3
1178e1051a39Sopenharmony_ci	ld	[$ap+64+16],$t4
1179e1051a39Sopenharmony_ci	ld	[$ap+64+20],$t5
1180e1051a39Sopenharmony_ci	ld	[$ap+64+24],$t6
1181e1051a39Sopenharmony_ci	ld	[$ap+64+28],$t7
1182e1051a39Sopenharmony_ci	or	$t1,$t0,$t0
1183e1051a39Sopenharmony_ci	or	$t3,$t2,$t2
1184e1051a39Sopenharmony_ci	or	$t5,$t4,$t4
1185e1051a39Sopenharmony_ci	or	$t7,$t6,$t6
1186e1051a39Sopenharmony_ci	or	$t2,$t0,$t0
1187e1051a39Sopenharmony_ci	or	$t6,$t4,$t4
1188e1051a39Sopenharmony_ci	or	$t4,$t0,$t0		! !in1infty
1189e1051a39Sopenharmony_ci	movrnz	$t0,-1,$t0
1190e1051a39Sopenharmony_ci	st	$t0,[%fp+STACK_BIAS-16]
1191e1051a39Sopenharmony_ci
1192e1051a39Sopenharmony_ci	ld	[$bp],@acc[0]		! in2_x
1193e1051a39Sopenharmony_ci	ld	[$bp+4],@acc[1]
1194e1051a39Sopenharmony_ci	ld	[$bp+8],@acc[2]
1195e1051a39Sopenharmony_ci	ld	[$bp+12],@acc[3]
1196e1051a39Sopenharmony_ci	ld	[$bp+16],@acc[4]
1197e1051a39Sopenharmony_ci	ld	[$bp+20],@acc[5]
1198e1051a39Sopenharmony_ci	ld	[$bp+24],@acc[6]
1199e1051a39Sopenharmony_ci	ld	[$bp+28],@acc[7]
1200e1051a39Sopenharmony_ci	ld	[$bp+32],$t0		! in2_y
1201e1051a39Sopenharmony_ci	ld	[$bp+32+4],$t1
1202e1051a39Sopenharmony_ci	ld	[$bp+32+8],$t2
1203e1051a39Sopenharmony_ci	ld	[$bp+32+12],$t3
1204e1051a39Sopenharmony_ci	ld	[$bp+32+16],$t4
1205e1051a39Sopenharmony_ci	ld	[$bp+32+20],$t5
1206e1051a39Sopenharmony_ci	ld	[$bp+32+24],$t6
1207e1051a39Sopenharmony_ci	ld	[$bp+32+28],$t7
1208e1051a39Sopenharmony_ci	or	@acc[1],@acc[0],@acc[0]
1209e1051a39Sopenharmony_ci	or	@acc[3],@acc[2],@acc[2]
1210e1051a39Sopenharmony_ci	or	@acc[5],@acc[4],@acc[4]
1211e1051a39Sopenharmony_ci	or	@acc[7],@acc[6],@acc[6]
1212e1051a39Sopenharmony_ci	or	@acc[2],@acc[0],@acc[0]
1213e1051a39Sopenharmony_ci	or	@acc[6],@acc[4],@acc[4]
1214e1051a39Sopenharmony_ci	or	@acc[4],@acc[0],@acc[0]
1215e1051a39Sopenharmony_ci	or	$t1,$t0,$t0
1216e1051a39Sopenharmony_ci	or	$t3,$t2,$t2
1217e1051a39Sopenharmony_ci	or	$t5,$t4,$t4
1218e1051a39Sopenharmony_ci	or	$t7,$t6,$t6
1219e1051a39Sopenharmony_ci	or	$t2,$t0,$t0
1220e1051a39Sopenharmony_ci	or	$t6,$t4,$t4
1221e1051a39Sopenharmony_ci	or	$t4,$t0,$t0
1222e1051a39Sopenharmony_ci	or	@acc[0],$t0,$t0		! !in2infty
1223e1051a39Sopenharmony_ci	movrnz	$t0,-1,$t0
1224e1051a39Sopenharmony_ci	st	$t0,[%fp+STACK_BIAS-12]
1225e1051a39Sopenharmony_ci
1226e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
1227e1051a39Sopenharmony_ci	add	$ap_real,64,$ap
1228e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z1sqr, in1_z);
1229e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z1sqr,$rp
1230e1051a39Sopenharmony_ci
1231e1051a39Sopenharmony_ci	add	$bp_real,0,$bp
1232e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z1sqr,$ap
1233e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, Z1sqr, in2_x);
1234e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U2,$rp
1235e1051a39Sopenharmony_ci
1236e1051a39Sopenharmony_ci	add	$ap_real,0,$bp
1237e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(H, U2, in1_x);
1238e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$rp
1239e1051a39Sopenharmony_ci
1240e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
1241e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Z1sqr,$ap
1242e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, Z1sqr, in1_z);
1243e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$rp
1244e1051a39Sopenharmony_ci
1245e1051a39Sopenharmony_ci	add	$ap_real,64,$bp
1246e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$ap
1247e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, H, in1_z);
1248e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_z,$rp
1249e1051a39Sopenharmony_ci
1250e1051a39Sopenharmony_ci	add	$bp_real,32,$bp
1251e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$ap
1252e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S2, in2_y);
1253e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$rp
1254e1051a39Sopenharmony_ci
1255e1051a39Sopenharmony_ci	add	$ap_real,32,$bp
1256e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(R, S2, in1_y);
1257e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$rp
1258e1051a39Sopenharmony_ci
1259e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$bp
1260e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$ap
1261e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Hsqr, H);
1262e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$rp
1263e1051a39Sopenharmony_ci
1264e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$bp
1265e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$ap
1266e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Rsqr, R);
1267e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Rsqr,$rp
1268e1051a39Sopenharmony_ci
1269e1051a39Sopenharmony_ci	add	%sp,LOCALS+$H,$bp
1270e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$ap
1271e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(Hcub, Hsqr, H);
1272e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hcub,$rp
1273e1051a39Sopenharmony_ci
1274e1051a39Sopenharmony_ci	add	$ap_real,0,$bp
1275e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$ap
1276e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, in1_x, Hsqr);
1277e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U2,$rp
1278e1051a39Sopenharmony_ci
1279e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(Hsqr, U2);
1280e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hsqr,$rp
1281e1051a39Sopenharmony_ci
1282e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Rsqr,$bp
1283e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf	! p256_sub(res_x, Rsqr, Hsqr);
1284e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_x,$rp
1285e1051a39Sopenharmony_ci
1286e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hcub,$bp
1287e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	!  p256_sub(res_x, res_x, Hcub);
1288e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_x,$rp
1289e1051a39Sopenharmony_ci
1290e1051a39Sopenharmony_ci	add	%sp,LOCALS+$U2,$bp
1291e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf	! p256_sub(res_y, U2, res_x);
1292e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$rp
1293e1051a39Sopenharmony_ci
1294e1051a39Sopenharmony_ci	add	$ap_real,32,$bp
1295e1051a39Sopenharmony_ci	add	%sp,LOCALS+$Hcub,$ap
1296e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, in1_y, Hcub);
1297e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$rp
1298e1051a39Sopenharmony_ci
1299e1051a39Sopenharmony_ci	add	%sp,LOCALS+$R,$bp
1300e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$ap
1301e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_y, res_y, R);
1302e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$rp
1303e1051a39Sopenharmony_ci
1304e1051a39Sopenharmony_ci	add	%sp,LOCALS+$S2,$bp
1305e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from	! p256_sub(res_y, res_y, S2);
1306e1051a39Sopenharmony_ci	add	%sp,LOCALS+$res_y,$rp
1307e1051a39Sopenharmony_ci
1308e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-16],$t1	! !in1infty
1309e1051a39Sopenharmony_ci	ld	[%fp+STACK_BIAS-12],$t2	! !in2infty
1310e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$rp
1311e1051a39Sopenharmony_ci___
1312e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=8) {			# conditional moves
1313e1051a39Sopenharmony_ci$code.=<<___;
1314e1051a39Sopenharmony_ci	ld	[%sp+LOCALS+$i],@acc[0]		! res
1315e1051a39Sopenharmony_ci	ld	[%sp+LOCALS+$i+4],@acc[1]
1316e1051a39Sopenharmony_ci	ld	[$bp_real+$i],@acc[2]		! in2
1317e1051a39Sopenharmony_ci	ld	[$bp_real+$i+4],@acc[3]
1318e1051a39Sopenharmony_ci	ld	[$ap_real+$i],@acc[4]		! in1
1319e1051a39Sopenharmony_ci	ld	[$ap_real+$i+4],@acc[5]
1320e1051a39Sopenharmony_ci	movrz	$t1,@acc[2],@acc[0]
1321e1051a39Sopenharmony_ci	movrz	$t1,@acc[3],@acc[1]
1322e1051a39Sopenharmony_ci	movrz	$t2,@acc[4],@acc[0]
1323e1051a39Sopenharmony_ci	movrz	$t2,@acc[5],@acc[1]
1324e1051a39Sopenharmony_ci	st	@acc[0],[$rp+$i]
1325e1051a39Sopenharmony_ci	st	@acc[1],[$rp+$i+4]
1326e1051a39Sopenharmony_ci___
1327e1051a39Sopenharmony_ci}
1328e1051a39Sopenharmony_cifor(;$i<96;$i+=8) {
1329e1051a39Sopenharmony_cimy $j=($i-64)/4;
1330e1051a39Sopenharmony_ci$code.=<<___;
1331e1051a39Sopenharmony_ci	ld	[%sp+LOCALS+$i],@acc[0]		! res
1332e1051a39Sopenharmony_ci	ld	[%sp+LOCALS+$i+4],@acc[1]
1333e1051a39Sopenharmony_ci	ld	[$ap_real+$i],@acc[4]		! in1
1334e1051a39Sopenharmony_ci	ld	[$ap_real+$i+4],@acc[5]
1335e1051a39Sopenharmony_ci	movrz	$t1,@ONE_mont[$j],@acc[0]
1336e1051a39Sopenharmony_ci	movrz	$t1,@ONE_mont[$j+1],@acc[1]
1337e1051a39Sopenharmony_ci	movrz	$t2,@acc[4],@acc[0]
1338e1051a39Sopenharmony_ci	movrz	$t2,@acc[5],@acc[1]
1339e1051a39Sopenharmony_ci	st	@acc[0],[$rp+$i]
1340e1051a39Sopenharmony_ci	st	@acc[1],[$rp+$i+4]
1341e1051a39Sopenharmony_ci___
1342e1051a39Sopenharmony_ci}
1343e1051a39Sopenharmony_ci$code.=<<___;
1344e1051a39Sopenharmony_ci	ret
1345e1051a39Sopenharmony_ci	restore
1346e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add_affine,#function
1347e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1348e1051a39Sopenharmony_ci___
1349e1051a39Sopenharmony_ci}								}}}
1350e1051a39Sopenharmony_ci{{{
# Table scatter/gather helpers.  Everything between the here-document
# markers below is emitted as SPARC assembly.  Every routine opens its
# own register window with "save", so the C arguments are seen in
# %i0-%i2 (see the prototypes in the "!" comments).
#
# ecp_nistz256_scatter_w5: copies the 24 32-bit words of a point (X, Y
#   and Z, 8 words each) into the table.  Word k is stored at byte
#   offset 64*k + 4*index - 4 from the table base, i.e. the words of one
#   point are 64 bytes apart and index is 1-based.
# ecp_nistz256_gather_w5: reads the words back from 64*k + 4*(index-1).
#   $mask is 0 for index 0 and all-ones for a positive index (neg
#   followed by srax 63).  Adding it to index converts 1-based to
#   0-based, and every loaded word is ANDed with it, so index 0 yields
#   an all-zero point.
# ecp_nistz256_scatter_w7: takes 16 32-bit words (64 bytes) of input,
#   splits each word into its four bytes and stores them 64 bytes apart,
#   least significant byte first, starting at table + index.
# ecp_nistz256_gather_w7: reassembles the 16 words from bytes read 64
#   bytes apart, starting at table + index - 1 for a positive index;
#   index 0 is handled with $mask exactly as in gather_w5.  Each load is
#   paired with a prefetch 3840 bytes further on.
#   NOTE(review): scatter_w7 adds index unadjusted while gather_w7
#   subtracts one; presumably the callers' numbering accounts for
#   this -- confirm against the C callers.
1351e1051a39Sopenharmony_cimy ($out,$inp,$index)=map("%i$_",(0..2));	# %i0, %i1, %i2
1352e1051a39Sopenharmony_cimy $mask="%o0";	# 0 / all-ones select mask of the gather routines
1353e1051a39Sopenharmony_ci
1354e1051a39Sopenharmony_ci$code.=<<___;
1355e1051a39Sopenharmony_ci! void	ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
1356e1051a39Sopenharmony_ci!					  int %i2);
1357e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w5
1358e1051a39Sopenharmony_ci.align	32
1359e1051a39Sopenharmony_ciecp_nistz256_scatter_w5:
1360e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
1361e1051a39Sopenharmony_ci
1362e1051a39Sopenharmony_ci	sll	$index,2,$index
1363e1051a39Sopenharmony_ci	add	$out,$index,$out
1364e1051a39Sopenharmony_ci
1365e1051a39Sopenharmony_ci	ld	[$inp],%l0		! X
1366e1051a39Sopenharmony_ci	ld	[$inp+4],%l1
1367e1051a39Sopenharmony_ci	ld	[$inp+8],%l2
1368e1051a39Sopenharmony_ci	ld	[$inp+12],%l3
1369e1051a39Sopenharmony_ci	ld	[$inp+16],%l4
1370e1051a39Sopenharmony_ci	ld	[$inp+20],%l5
1371e1051a39Sopenharmony_ci	ld	[$inp+24],%l6
1372e1051a39Sopenharmony_ci	ld	[$inp+28],%l7
1373e1051a39Sopenharmony_ci	add	$inp,32,$inp
1374e1051a39Sopenharmony_ci	st	%l0,[$out+64*0-4]
1375e1051a39Sopenharmony_ci	st	%l1,[$out+64*1-4]
1376e1051a39Sopenharmony_ci	st	%l2,[$out+64*2-4]
1377e1051a39Sopenharmony_ci	st	%l3,[$out+64*3-4]
1378e1051a39Sopenharmony_ci	st	%l4,[$out+64*4-4]
1379e1051a39Sopenharmony_ci	st	%l5,[$out+64*5-4]
1380e1051a39Sopenharmony_ci	st	%l6,[$out+64*6-4]
1381e1051a39Sopenharmony_ci	st	%l7,[$out+64*7-4]
1382e1051a39Sopenharmony_ci	add	$out,64*8,$out
1383e1051a39Sopenharmony_ci
1384e1051a39Sopenharmony_ci	ld	[$inp],%l0		! Y
1385e1051a39Sopenharmony_ci	ld	[$inp+4],%l1
1386e1051a39Sopenharmony_ci	ld	[$inp+8],%l2
1387e1051a39Sopenharmony_ci	ld	[$inp+12],%l3
1388e1051a39Sopenharmony_ci	ld	[$inp+16],%l4
1389e1051a39Sopenharmony_ci	ld	[$inp+20],%l5
1390e1051a39Sopenharmony_ci	ld	[$inp+24],%l6
1391e1051a39Sopenharmony_ci	ld	[$inp+28],%l7
1392e1051a39Sopenharmony_ci	add	$inp,32,$inp
1393e1051a39Sopenharmony_ci	st	%l0,[$out+64*0-4]
1394e1051a39Sopenharmony_ci	st	%l1,[$out+64*1-4]
1395e1051a39Sopenharmony_ci	st	%l2,[$out+64*2-4]
1396e1051a39Sopenharmony_ci	st	%l3,[$out+64*3-4]
1397e1051a39Sopenharmony_ci	st	%l4,[$out+64*4-4]
1398e1051a39Sopenharmony_ci	st	%l5,[$out+64*5-4]
1399e1051a39Sopenharmony_ci	st	%l6,[$out+64*6-4]
1400e1051a39Sopenharmony_ci	st	%l7,[$out+64*7-4]
1401e1051a39Sopenharmony_ci	add	$out,64*8,$out
1402e1051a39Sopenharmony_ci
1403e1051a39Sopenharmony_ci	ld	[$inp],%l0		! Z
1404e1051a39Sopenharmony_ci	ld	[$inp+4],%l1
1405e1051a39Sopenharmony_ci	ld	[$inp+8],%l2
1406e1051a39Sopenharmony_ci	ld	[$inp+12],%l3
1407e1051a39Sopenharmony_ci	ld	[$inp+16],%l4
1408e1051a39Sopenharmony_ci	ld	[$inp+20],%l5
1409e1051a39Sopenharmony_ci	ld	[$inp+24],%l6
1410e1051a39Sopenharmony_ci	ld	[$inp+28],%l7
1411e1051a39Sopenharmony_ci	st	%l0,[$out+64*0-4]
1412e1051a39Sopenharmony_ci	st	%l1,[$out+64*1-4]
1413e1051a39Sopenharmony_ci	st	%l2,[$out+64*2-4]
1414e1051a39Sopenharmony_ci	st	%l3,[$out+64*3-4]
1415e1051a39Sopenharmony_ci	st	%l4,[$out+64*4-4]
1416e1051a39Sopenharmony_ci	st	%l5,[$out+64*5-4]
1417e1051a39Sopenharmony_ci	st	%l6,[$out+64*6-4]
1418e1051a39Sopenharmony_ci	st	%l7,[$out+64*7-4]
1419e1051a39Sopenharmony_ci
1420e1051a39Sopenharmony_ci	ret
1421e1051a39Sopenharmony_ci	restore
1422e1051a39Sopenharmony_ci.type	ecp_nistz256_scatter_w5,#function
1423e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1424e1051a39Sopenharmony_ci
1425e1051a39Sopenharmony_ci! void	ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
1426e1051a39Sopenharmony_ci!					       int %i2);
1427e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w5
1428e1051a39Sopenharmony_ci.align	32
1429e1051a39Sopenharmony_ciecp_nistz256_gather_w5:
1430e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
1431e1051a39Sopenharmony_ci
1432e1051a39Sopenharmony_ci	neg	$index,$mask
1433e1051a39Sopenharmony_ci	srax	$mask,63,$mask
1434e1051a39Sopenharmony_ci
1435e1051a39Sopenharmony_ci	add	$index,$mask,$index
1436e1051a39Sopenharmony_ci	sll	$index,2,$index
1437e1051a39Sopenharmony_ci	add	$inp,$index,$inp
1438e1051a39Sopenharmony_ci
1439e1051a39Sopenharmony_ci	ld	[$inp+64*0],%l0
1440e1051a39Sopenharmony_ci	ld	[$inp+64*1],%l1
1441e1051a39Sopenharmony_ci	ld	[$inp+64*2],%l2
1442e1051a39Sopenharmony_ci	ld	[$inp+64*3],%l3
1443e1051a39Sopenharmony_ci	ld	[$inp+64*4],%l4
1444e1051a39Sopenharmony_ci	ld	[$inp+64*5],%l5
1445e1051a39Sopenharmony_ci	ld	[$inp+64*6],%l6
1446e1051a39Sopenharmony_ci	ld	[$inp+64*7],%l7
1447e1051a39Sopenharmony_ci	add	$inp,64*8,$inp
1448e1051a39Sopenharmony_ci	and	%l0,$mask,%l0
1449e1051a39Sopenharmony_ci	and	%l1,$mask,%l1
1450e1051a39Sopenharmony_ci	st	%l0,[$out]		! X
1451e1051a39Sopenharmony_ci	and	%l2,$mask,%l2
1452e1051a39Sopenharmony_ci	st	%l1,[$out+4]
1453e1051a39Sopenharmony_ci	and	%l3,$mask,%l3
1454e1051a39Sopenharmony_ci	st	%l2,[$out+8]
1455e1051a39Sopenharmony_ci	and	%l4,$mask,%l4
1456e1051a39Sopenharmony_ci	st	%l3,[$out+12]
1457e1051a39Sopenharmony_ci	and	%l5,$mask,%l5
1458e1051a39Sopenharmony_ci	st	%l4,[$out+16]
1459e1051a39Sopenharmony_ci	and	%l6,$mask,%l6
1460e1051a39Sopenharmony_ci	st	%l5,[$out+20]
1461e1051a39Sopenharmony_ci	and	%l7,$mask,%l7
1462e1051a39Sopenharmony_ci	st	%l6,[$out+24]
1463e1051a39Sopenharmony_ci	st	%l7,[$out+28]
1464e1051a39Sopenharmony_ci	add	$out,32,$out
1465e1051a39Sopenharmony_ci
1466e1051a39Sopenharmony_ci	ld	[$inp+64*0],%l0
1467e1051a39Sopenharmony_ci	ld	[$inp+64*1],%l1
1468e1051a39Sopenharmony_ci	ld	[$inp+64*2],%l2
1469e1051a39Sopenharmony_ci	ld	[$inp+64*3],%l3
1470e1051a39Sopenharmony_ci	ld	[$inp+64*4],%l4
1471e1051a39Sopenharmony_ci	ld	[$inp+64*5],%l5
1472e1051a39Sopenharmony_ci	ld	[$inp+64*6],%l6
1473e1051a39Sopenharmony_ci	ld	[$inp+64*7],%l7
1474e1051a39Sopenharmony_ci	add	$inp,64*8,$inp
1475e1051a39Sopenharmony_ci	and	%l0,$mask,%l0
1476e1051a39Sopenharmony_ci	and	%l1,$mask,%l1
1477e1051a39Sopenharmony_ci	st	%l0,[$out]		! Y
1478e1051a39Sopenharmony_ci	and	%l2,$mask,%l2
1479e1051a39Sopenharmony_ci	st	%l1,[$out+4]
1480e1051a39Sopenharmony_ci	and	%l3,$mask,%l3
1481e1051a39Sopenharmony_ci	st	%l2,[$out+8]
1482e1051a39Sopenharmony_ci	and	%l4,$mask,%l4
1483e1051a39Sopenharmony_ci	st	%l3,[$out+12]
1484e1051a39Sopenharmony_ci	and	%l5,$mask,%l5
1485e1051a39Sopenharmony_ci	st	%l4,[$out+16]
1486e1051a39Sopenharmony_ci	and	%l6,$mask,%l6
1487e1051a39Sopenharmony_ci	st	%l5,[$out+20]
1488e1051a39Sopenharmony_ci	and	%l7,$mask,%l7
1489e1051a39Sopenharmony_ci	st	%l6,[$out+24]
1490e1051a39Sopenharmony_ci	st	%l7,[$out+28]
1491e1051a39Sopenharmony_ci	add	$out,32,$out
1492e1051a39Sopenharmony_ci
1493e1051a39Sopenharmony_ci	ld	[$inp+64*0],%l0
1494e1051a39Sopenharmony_ci	ld	[$inp+64*1],%l1
1495e1051a39Sopenharmony_ci	ld	[$inp+64*2],%l2
1496e1051a39Sopenharmony_ci	ld	[$inp+64*3],%l3
1497e1051a39Sopenharmony_ci	ld	[$inp+64*4],%l4
1498e1051a39Sopenharmony_ci	ld	[$inp+64*5],%l5
1499e1051a39Sopenharmony_ci	ld	[$inp+64*6],%l6
1500e1051a39Sopenharmony_ci	ld	[$inp+64*7],%l7
1501e1051a39Sopenharmony_ci	and	%l0,$mask,%l0
1502e1051a39Sopenharmony_ci	and	%l1,$mask,%l1
1503e1051a39Sopenharmony_ci	st	%l0,[$out]		! Z
1504e1051a39Sopenharmony_ci	and	%l2,$mask,%l2
1505e1051a39Sopenharmony_ci	st	%l1,[$out+4]
1506e1051a39Sopenharmony_ci	and	%l3,$mask,%l3
1507e1051a39Sopenharmony_ci	st	%l2,[$out+8]
1508e1051a39Sopenharmony_ci	and	%l4,$mask,%l4
1509e1051a39Sopenharmony_ci	st	%l3,[$out+12]
1510e1051a39Sopenharmony_ci	and	%l5,$mask,%l5
1511e1051a39Sopenharmony_ci	st	%l4,[$out+16]
1512e1051a39Sopenharmony_ci	and	%l6,$mask,%l6
1513e1051a39Sopenharmony_ci	st	%l5,[$out+20]
1514e1051a39Sopenharmony_ci	and	%l7,$mask,%l7
1515e1051a39Sopenharmony_ci	st	%l6,[$out+24]
1516e1051a39Sopenharmony_ci	st	%l7,[$out+28]
1517e1051a39Sopenharmony_ci
1518e1051a39Sopenharmony_ci	ret
1519e1051a39Sopenharmony_ci	restore
1520e1051a39Sopenharmony_ci.type	ecp_nistz256_gather_w5,#function
1521e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1522e1051a39Sopenharmony_ci
1523e1051a39Sopenharmony_ci! void	ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
1524e1051a39Sopenharmony_ci!					  int %i2);
1525e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w7
1526e1051a39Sopenharmony_ci.align	32
1527e1051a39Sopenharmony_ciecp_nistz256_scatter_w7:
1528e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
1529e1051a39Sopenharmony_ci	nop
1530e1051a39Sopenharmony_ci	add	$out,$index,$out
1531e1051a39Sopenharmony_ci	mov	64/4,$index
1532e1051a39Sopenharmony_ci.Loop_scatter_w7:
1533e1051a39Sopenharmony_ci	ld	[$inp],%l0
1534e1051a39Sopenharmony_ci	add	$inp,4,$inp
1535e1051a39Sopenharmony_ci	subcc	$index,1,$index
1536e1051a39Sopenharmony_ci	stb	%l0,[$out+64*0]
1537e1051a39Sopenharmony_ci	srl	%l0,8,%l1
1538e1051a39Sopenharmony_ci	stb	%l1,[$out+64*1]
1539e1051a39Sopenharmony_ci	srl	%l0,16,%l2
1540e1051a39Sopenharmony_ci	stb	%l2,[$out+64*2]
1541e1051a39Sopenharmony_ci	srl	%l0,24,%l3
1542e1051a39Sopenharmony_ci	stb	%l3,[$out+64*3]
1543e1051a39Sopenharmony_ci	bne	.Loop_scatter_w7
1544e1051a39Sopenharmony_ci	add	$out,64*4,$out
1545e1051a39Sopenharmony_ci
1546e1051a39Sopenharmony_ci	ret
1547e1051a39Sopenharmony_ci	restore
1548e1051a39Sopenharmony_ci.type	ecp_nistz256_scatter_w7,#function
1549e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1550e1051a39Sopenharmony_ci
1551e1051a39Sopenharmony_ci! void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1552e1051a39Sopenharmony_ci!						      int %i2);
1553e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w7
1554e1051a39Sopenharmony_ci.align	32
1555e1051a39Sopenharmony_ciecp_nistz256_gather_w7:
1556e1051a39Sopenharmony_ci	save	%sp,-STACK_FRAME,%sp
1557e1051a39Sopenharmony_ci
1558e1051a39Sopenharmony_ci	neg	$index,$mask
1559e1051a39Sopenharmony_ci	srax	$mask,63,$mask
1560e1051a39Sopenharmony_ci
1561e1051a39Sopenharmony_ci	add	$index,$mask,$index
1562e1051a39Sopenharmony_ci	add	$inp,$index,$inp
1563e1051a39Sopenharmony_ci	mov	64/4,$index
1564e1051a39Sopenharmony_ci
1565e1051a39Sopenharmony_ci.Loop_gather_w7:
1566e1051a39Sopenharmony_ci	ldub	[$inp+64*0],%l0
1567e1051a39Sopenharmony_ci	prefetch [$inp+3840+64*0],1
1568e1051a39Sopenharmony_ci	subcc	$index,1,$index
1569e1051a39Sopenharmony_ci	ldub	[$inp+64*1],%l1
1570e1051a39Sopenharmony_ci	prefetch [$inp+3840+64*1],1
1571e1051a39Sopenharmony_ci	ldub	[$inp+64*2],%l2
1572e1051a39Sopenharmony_ci	prefetch [$inp+3840+64*2],1
1573e1051a39Sopenharmony_ci	ldub	[$inp+64*3],%l3
1574e1051a39Sopenharmony_ci	prefetch [$inp+3840+64*3],1
1575e1051a39Sopenharmony_ci	add	$inp,64*4,$inp
1576e1051a39Sopenharmony_ci	sll	%l1,8,%l1
1577e1051a39Sopenharmony_ci	sll	%l2,16,%l2
1578e1051a39Sopenharmony_ci	or	%l0,%l1,%l0
1579e1051a39Sopenharmony_ci	sll	%l3,24,%l3
1580e1051a39Sopenharmony_ci	or	%l0,%l2,%l0
1581e1051a39Sopenharmony_ci	or	%l0,%l3,%l0
1582e1051a39Sopenharmony_ci	and	%l0,$mask,%l0
1583e1051a39Sopenharmony_ci	st	%l0,[$out]
1584e1051a39Sopenharmony_ci	bne	.Loop_gather_w7
1585e1051a39Sopenharmony_ci	add	$out,4,$out
1586e1051a39Sopenharmony_ci
1587e1051a39Sopenharmony_ci	ret
1588e1051a39Sopenharmony_ci	restore
1589e1051a39Sopenharmony_ci.type	ecp_nistz256_gather_w7,#function
1590e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1591e1051a39Sopenharmony_ci___
1592e1051a39Sopenharmony_ci}}}
1593e1051a39Sopenharmony_ci{{{
1594e1051a39Sopenharmony_ci########################################################################
1595e1051a39Sopenharmony_ci# Following subroutines are VIS3 counterparts of those above that
1596e1051a39Sopenharmony_ci# implement ones found in ecp_nistz256.c. Key difference is that they
1597e1051a39Sopenharmony_ci# use 128-bit multiplication and addition with 64-bit carry, and in order
1598e1051a39Sopenharmony_ci# to do that they perform conversion from uint32_t[8] to uint64_t[4] upon
1599e1051a39Sopenharmony_ci# entry and vice versa on return.
1600e1051a39Sopenharmony_ci#
1601e1051a39Sopenharmony_cimy ($rp,$ap,$bp)=map("%i$_",(0..2));	# %i0-%i2; results go to [$rp], b is read from [$bp]
1602e1051a39Sopenharmony_cimy ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));	# %l0-%l3 scratch, %l4-%l7 = a[0..3] for mul_mont
1603e1051a39Sopenharmony_cimy ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));	# %o0-%o5: 64-bit accumulator words
1604e1051a39Sopenharmony_cimy ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");	# %i3 = current b[i]; %i4, %i5, %g1 = constants set up outside this chunk
1605e1051a39Sopenharmony_cimy ($rp_real,$ap_real)=("%g2","%g3");	# not referenced by the helpers emitted below
1606e1051a39Sopenharmony_cimy ($acc6,$acc7)=($bp,$bi);	# aliases of %i2/%i3, used in squaring
1607e1051a39Sopenharmony_ci
# The here-document below emits the VIS3 leaf helpers.  None of them
# opens a register window; they return with "retl".  Except for
# mul_mont, each takes its 256-bit operand as four 64-bit words in
# $acc0-$acc3 (least significant word in $acc0) and stores the result
# to [$rp].
#
#   __ecp_nistz256_mul_by_2_vis3
#	Doubles $acc0-$acc3, captures the carry in $acc4 (in the branch
#	delay slot) and joins .Lreduce_by_sub_vis3.
#   __ecp_nistz256_add_vis3 / __ecp_nistz256_add_noload_vis3
#	The first entry loads b from [$bp] into $t0-$t3, the second
#	expects $t0-$t3 to be preloaded.  Adds b to the accumulator with
#	the carry in $acc4, then at .Lreduce_by_sub_vis3 adds the
#	constants 1, $poly1, $minus1, $poly3 (annotated "add -modulus")
#	and uses movrz on $acc4 to choose between the two candidates.
#   __ecp_nistz256_sub_from_vis3
#	Computes acc - b.  The subtraction is carried out on 32-bit
#	halves (see the "!" comment preceding it); $poly1 is inverted
#	for the duration and used as the mask for the low halves.
#	$acc4 records the borrow; control continues at
#	.Lreduce_by_add_vis3.
#   __ecp_nistz256_sub_morf_vis3
#	Same with the operands swapped (b - acc); falls through to
#	.Lreduce_by_add_vis3, which adds the modulus, restores $poly1
#	and keeps the sum only when $acc4 is non-zero.
#   __ecp_nistz256_div_by_2_vis3
#	Adds the modulus when bit 0 of $acc0 is set, then shifts the
#	257-bit value ($acc4 holding the carry) right by one.
#   __ecp_nistz256_mul_mont_vis3
#	Only the first round (a[0..3] * b[0], then fetching b[1]) is in
#	this here-document; the remaining rounds are generated by the
#	Perl code that follows it.
#
# NOTE(review): judging by the "0xFFFFFFFF00000001" annotation on
# "not $poly3" and the "add -modulus" sequence, $poly1/$poly3/$minus1
# appear to hold words of the negated modulus -- confirm where they are
# initialised.
1608e1051a39Sopenharmony_ci$code.=<<___;
1609e1051a39Sopenharmony_ci.align	32
1610e1051a39Sopenharmony_ci__ecp_nistz256_mul_by_2_vis3:
1611e1051a39Sopenharmony_ci	addcc	$acc0,$acc0,$acc0
1612e1051a39Sopenharmony_ci	addxccc	$acc1,$acc1,$acc1
1613e1051a39Sopenharmony_ci	addxccc	$acc2,$acc2,$acc2
1614e1051a39Sopenharmony_ci	addxccc	$acc3,$acc3,$acc3
1615e1051a39Sopenharmony_ci	b	.Lreduce_by_sub_vis3
1616e1051a39Sopenharmony_ci	addxc	%g0,%g0,$acc4		! did it carry?
1617e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_by_2_vis3,#function
1618e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1619e1051a39Sopenharmony_ci
1620e1051a39Sopenharmony_ci.align	32
1621e1051a39Sopenharmony_ci__ecp_nistz256_add_vis3:
1622e1051a39Sopenharmony_ci	ldx	[$bp+0],$t0
1623e1051a39Sopenharmony_ci	ldx	[$bp+8],$t1
1624e1051a39Sopenharmony_ci	ldx	[$bp+16],$t2
1625e1051a39Sopenharmony_ci	ldx	[$bp+24],$t3
1626e1051a39Sopenharmony_ci
1627e1051a39Sopenharmony_ci__ecp_nistz256_add_noload_vis3:
1628e1051a39Sopenharmony_ci
1629e1051a39Sopenharmony_ci	addcc	$t0,$acc0,$acc0
1630e1051a39Sopenharmony_ci	addxccc	$t1,$acc1,$acc1
1631e1051a39Sopenharmony_ci	addxccc	$t2,$acc2,$acc2
1632e1051a39Sopenharmony_ci	addxccc	$t3,$acc3,$acc3
1633e1051a39Sopenharmony_ci	addxc	%g0,%g0,$acc4		! did it carry?
1634e1051a39Sopenharmony_ci
1635e1051a39Sopenharmony_ci.Lreduce_by_sub_vis3:
1636e1051a39Sopenharmony_ci
1637e1051a39Sopenharmony_ci	addcc	$acc0,1,$t0		! add -modulus, i.e. subtract
1638e1051a39Sopenharmony_ci	addxccc	$acc1,$poly1,$t1
1639e1051a39Sopenharmony_ci	addxccc	$acc2,$minus1,$t2
1640e1051a39Sopenharmony_ci	addxccc	$acc3,$poly3,$t3
1641e1051a39Sopenharmony_ci	addxc	$acc4,$minus1,$acc4
1642e1051a39Sopenharmony_ci
1643e1051a39Sopenharmony_ci	movrz	$acc4,$t0,$acc0		! ret = borrow ? ret : ret-modulus
1644e1051a39Sopenharmony_ci	movrz	$acc4,$t1,$acc1
1645e1051a39Sopenharmony_ci	stx	$acc0,[$rp]
1646e1051a39Sopenharmony_ci	movrz	$acc4,$t2,$acc2
1647e1051a39Sopenharmony_ci	stx	$acc1,[$rp+8]
1648e1051a39Sopenharmony_ci	movrz	$acc4,$t3,$acc3
1649e1051a39Sopenharmony_ci	stx	$acc2,[$rp+16]
1650e1051a39Sopenharmony_ci	retl
1651e1051a39Sopenharmony_ci	stx	$acc3,[$rp+24]
1652e1051a39Sopenharmony_ci.type	__ecp_nistz256_add_vis3,#function
1653e1051a39Sopenharmony_ci.size	__ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1654e1051a39Sopenharmony_ci
1655e1051a39Sopenharmony_ci! Trouble with subtraction is that there is no subtraction with 64-bit
1656e1051a39Sopenharmony_ci! borrow, only with 32-bit one. For this reason we "decompose" 64-bit
1657e1051a39Sopenharmony_ci! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
1658e1051a39Sopenharmony_ci! recall that SPARC is big-endian, which is why you'll observe that
1659e1051a39Sopenharmony_ci! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior reduction we
1660e1051a39Sopenharmony_ci! "collect" result back to 64-bit $acc0-$acc3.
1661e1051a39Sopenharmony_ci.align	32
1662e1051a39Sopenharmony_ci__ecp_nistz256_sub_from_vis3:
1663e1051a39Sopenharmony_ci	ld	[$bp+4],$t0
1664e1051a39Sopenharmony_ci	ld	[$bp+0],$t1
1665e1051a39Sopenharmony_ci	ld	[$bp+12],$t2
1666e1051a39Sopenharmony_ci	ld	[$bp+8],$t3
1667e1051a39Sopenharmony_ci
1668e1051a39Sopenharmony_ci	srlx	$acc0,32,$acc4
1669e1051a39Sopenharmony_ci	not	$poly1,$poly1
1670e1051a39Sopenharmony_ci	srlx	$acc1,32,$acc5
1671e1051a39Sopenharmony_ci	subcc	$acc0,$t0,$acc0
1672e1051a39Sopenharmony_ci	ld	[$bp+20],$t0
1673e1051a39Sopenharmony_ci	subccc	$acc4,$t1,$acc4
1674e1051a39Sopenharmony_ci	ld	[$bp+16],$t1
1675e1051a39Sopenharmony_ci	subccc	$acc1,$t2,$acc1
1676e1051a39Sopenharmony_ci	ld	[$bp+28],$t2
1677e1051a39Sopenharmony_ci	and	$acc0,$poly1,$acc0
1678e1051a39Sopenharmony_ci	subccc	$acc5,$t3,$acc5
1679e1051a39Sopenharmony_ci	ld	[$bp+24],$t3
1680e1051a39Sopenharmony_ci	sllx	$acc4,32,$acc4
1681e1051a39Sopenharmony_ci	and	$acc1,$poly1,$acc1
1682e1051a39Sopenharmony_ci	sllx	$acc5,32,$acc5
1683e1051a39Sopenharmony_ci	or	$acc0,$acc4,$acc0
1684e1051a39Sopenharmony_ci	srlx	$acc2,32,$acc4
1685e1051a39Sopenharmony_ci	or	$acc1,$acc5,$acc1
1686e1051a39Sopenharmony_ci	srlx	$acc3,32,$acc5
1687e1051a39Sopenharmony_ci	subccc	$acc2,$t0,$acc2
1688e1051a39Sopenharmony_ci	subccc	$acc4,$t1,$acc4
1689e1051a39Sopenharmony_ci	subccc	$acc3,$t2,$acc3
1690e1051a39Sopenharmony_ci	and	$acc2,$poly1,$acc2
1691e1051a39Sopenharmony_ci	subccc	$acc5,$t3,$acc5
1692e1051a39Sopenharmony_ci	sllx	$acc4,32,$acc4
1693e1051a39Sopenharmony_ci	and	$acc3,$poly1,$acc3
1694e1051a39Sopenharmony_ci	sllx	$acc5,32,$acc5
1695e1051a39Sopenharmony_ci	or	$acc2,$acc4,$acc2
1696e1051a39Sopenharmony_ci	subc	%g0,%g0,$acc4		! did it borrow?
1697e1051a39Sopenharmony_ci	b	.Lreduce_by_add_vis3
1698e1051a39Sopenharmony_ci	or	$acc3,$acc5,$acc3
1699e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_from_vis3,#function
1700e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1701e1051a39Sopenharmony_ci
1702e1051a39Sopenharmony_ci.align	32
1703e1051a39Sopenharmony_ci__ecp_nistz256_sub_morf_vis3:
1704e1051a39Sopenharmony_ci	ld	[$bp+4],$t0
1705e1051a39Sopenharmony_ci	ld	[$bp+0],$t1
1706e1051a39Sopenharmony_ci	ld	[$bp+12],$t2
1707e1051a39Sopenharmony_ci	ld	[$bp+8],$t3
1708e1051a39Sopenharmony_ci
1709e1051a39Sopenharmony_ci	srlx	$acc0,32,$acc4
1710e1051a39Sopenharmony_ci	not	$poly1,$poly1
1711e1051a39Sopenharmony_ci	srlx	$acc1,32,$acc5
1712e1051a39Sopenharmony_ci	subcc	$t0,$acc0,$acc0
1713e1051a39Sopenharmony_ci	ld	[$bp+20],$t0
1714e1051a39Sopenharmony_ci	subccc	$t1,$acc4,$acc4
1715e1051a39Sopenharmony_ci	ld	[$bp+16],$t1
1716e1051a39Sopenharmony_ci	subccc	$t2,$acc1,$acc1
1717e1051a39Sopenharmony_ci	ld	[$bp+28],$t2
1718e1051a39Sopenharmony_ci	and	$acc0,$poly1,$acc0
1719e1051a39Sopenharmony_ci	subccc	$t3,$acc5,$acc5
1720e1051a39Sopenharmony_ci	ld	[$bp+24],$t3
1721e1051a39Sopenharmony_ci	sllx	$acc4,32,$acc4
1722e1051a39Sopenharmony_ci	and	$acc1,$poly1,$acc1
1723e1051a39Sopenharmony_ci	sllx	$acc5,32,$acc5
1724e1051a39Sopenharmony_ci	or	$acc0,$acc4,$acc0
1725e1051a39Sopenharmony_ci	srlx	$acc2,32,$acc4
1726e1051a39Sopenharmony_ci	or	$acc1,$acc5,$acc1
1727e1051a39Sopenharmony_ci	srlx	$acc3,32,$acc5
1728e1051a39Sopenharmony_ci	subccc	$t0,$acc2,$acc2
1729e1051a39Sopenharmony_ci	subccc	$t1,$acc4,$acc4
1730e1051a39Sopenharmony_ci	subccc	$t2,$acc3,$acc3
1731e1051a39Sopenharmony_ci	and	$acc2,$poly1,$acc2
1732e1051a39Sopenharmony_ci	subccc	$t3,$acc5,$acc5
1733e1051a39Sopenharmony_ci	sllx	$acc4,32,$acc4
1734e1051a39Sopenharmony_ci	and	$acc3,$poly1,$acc3
1735e1051a39Sopenharmony_ci	sllx	$acc5,32,$acc5
1736e1051a39Sopenharmony_ci	or	$acc2,$acc4,$acc2
1737e1051a39Sopenharmony_ci	subc	%g0,%g0,$acc4		! did it borrow?
1738e1051a39Sopenharmony_ci	or	$acc3,$acc5,$acc3
1739e1051a39Sopenharmony_ci
1740e1051a39Sopenharmony_ci.Lreduce_by_add_vis3:
1741e1051a39Sopenharmony_ci
1742e1051a39Sopenharmony_ci	addcc	$acc0,-1,$t0		! add modulus
1743e1051a39Sopenharmony_ci	not	$poly3,$t3
1744e1051a39Sopenharmony_ci	addxccc	$acc1,$poly1,$t1
1745e1051a39Sopenharmony_ci	not	$poly1,$poly1		! restore $poly1
1746e1051a39Sopenharmony_ci	addxccc	$acc2,%g0,$t2
1747e1051a39Sopenharmony_ci	addxc	$acc3,$t3,$t3
1748e1051a39Sopenharmony_ci
1749e1051a39Sopenharmony_ci	movrnz	$acc4,$t0,$acc0		! if a-b borrowed, ret = ret+mod
1750e1051a39Sopenharmony_ci	movrnz	$acc4,$t1,$acc1
1751e1051a39Sopenharmony_ci	stx	$acc0,[$rp]
1752e1051a39Sopenharmony_ci	movrnz	$acc4,$t2,$acc2
1753e1051a39Sopenharmony_ci	stx	$acc1,[$rp+8]
1754e1051a39Sopenharmony_ci	movrnz	$acc4,$t3,$acc3
1755e1051a39Sopenharmony_ci	stx	$acc2,[$rp+16]
1756e1051a39Sopenharmony_ci	retl
1757e1051a39Sopenharmony_ci	stx	$acc3,[$rp+24]
1758e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_morf_vis3,#function
1759e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1760e1051a39Sopenharmony_ci
1761e1051a39Sopenharmony_ci.align	32
1762e1051a39Sopenharmony_ci__ecp_nistz256_div_by_2_vis3:
1763e1051a39Sopenharmony_ci	! ret = (a is odd ? a+mod : a) >> 1
1764e1051a39Sopenharmony_ci
1765e1051a39Sopenharmony_ci	not	$poly1,$t1
1766e1051a39Sopenharmony_ci	not	$poly3,$t3
1767e1051a39Sopenharmony_ci	and	$acc0,1,$acc5
1768e1051a39Sopenharmony_ci	addcc	$acc0,-1,$t0		! add modulus
1769e1051a39Sopenharmony_ci	addxccc	$acc1,$t1,$t1
1770e1051a39Sopenharmony_ci	addxccc	$acc2,%g0,$t2
1771e1051a39Sopenharmony_ci	addxccc	$acc3,$t3,$t3
1772e1051a39Sopenharmony_ci	addxc	%g0,%g0,$acc4		! carry bit
1773e1051a39Sopenharmony_ci
1774e1051a39Sopenharmony_ci	movrnz	$acc5,$t0,$acc0
1775e1051a39Sopenharmony_ci	movrnz	$acc5,$t1,$acc1
1776e1051a39Sopenharmony_ci	movrnz	$acc5,$t2,$acc2
1777e1051a39Sopenharmony_ci	movrnz	$acc5,$t3,$acc3
1778e1051a39Sopenharmony_ci	movrz	$acc5,%g0,$acc4
1779e1051a39Sopenharmony_ci
1780e1051a39Sopenharmony_ci	! ret >>= 1
1781e1051a39Sopenharmony_ci
1782e1051a39Sopenharmony_ci	srlx	$acc0,1,$acc0
1783e1051a39Sopenharmony_ci	sllx	$acc1,63,$t0
1784e1051a39Sopenharmony_ci	srlx	$acc1,1,$acc1
1785e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
1786e1051a39Sopenharmony_ci	sllx	$acc2,63,$t1
1787e1051a39Sopenharmony_ci	srlx	$acc2,1,$acc2
1788e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
1789e1051a39Sopenharmony_ci	sllx	$acc3,63,$t2
1790e1051a39Sopenharmony_ci	stx	$acc0,[$rp]
1791e1051a39Sopenharmony_ci	srlx	$acc3,1,$acc3
1792e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
1793e1051a39Sopenharmony_ci	sllx	$acc4,63,$t3		! don't forget carry bit
1794e1051a39Sopenharmony_ci	stx	$acc1,[$rp+8]
1795e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
1796e1051a39Sopenharmony_ci	stx	$acc2,[$rp+16]
1797e1051a39Sopenharmony_ci	retl
1798e1051a39Sopenharmony_ci	stx	$acc3,[$rp+24]
1799e1051a39Sopenharmony_ci.type	__ecp_nistz256_div_by_2_vis3,#function
1800e1051a39Sopenharmony_ci.size	__ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1801e1051a39Sopenharmony_ci
1802e1051a39Sopenharmony_ci! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1803e1051a39Sopenharmony_ci! 4x faster [on T4]...
1804e1051a39Sopenharmony_ci.align	32
1805e1051a39Sopenharmony_ci__ecp_nistz256_mul_mont_vis3:
1806e1051a39Sopenharmony_ci	mulx	$a0,$bi,$acc0
1807e1051a39Sopenharmony_ci	not	$poly3,$poly3		! 0xFFFFFFFF00000001
1808e1051a39Sopenharmony_ci	umulxhi	$a0,$bi,$t0
1809e1051a39Sopenharmony_ci	mulx	$a1,$bi,$acc1
1810e1051a39Sopenharmony_ci	umulxhi	$a1,$bi,$t1
1811e1051a39Sopenharmony_ci	mulx	$a2,$bi,$acc2
1812e1051a39Sopenharmony_ci	umulxhi	$a2,$bi,$t2
1813e1051a39Sopenharmony_ci	mulx	$a3,$bi,$acc3
1814e1051a39Sopenharmony_ci	umulxhi	$a3,$bi,$t3
1815e1051a39Sopenharmony_ci	ldx	[$bp+8],$bi		! b[1]
1816e1051a39Sopenharmony_ci
1817e1051a39Sopenharmony_ci	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
1818e1051a39Sopenharmony_ci	 sllx	$acc0,32,$t0
1819e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc2
1820e1051a39Sopenharmony_ci	 srlx	$acc0,32,$t1
1821e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc3
1822e1051a39Sopenharmony_ci	addxc	%g0,$t3,$acc4
1823e1051a39Sopenharmony_ci	mov	0,$acc5
1824e1051a39Sopenharmony_ci___
1825e1051a39Sopenharmony_cifor($i=1;$i<4;$i++) {
	# Generates rounds 1..3 of __ecp_nistz256_mul_mont_vis3, one per
	# remaining 64-bit multiplier word b[$i] (already held in $bi when
	# the round starts). Each round emits, in this order:
	#   1. the reduction step for the accumulator left by the previous
	#      round, which also drops the lowest word ($acc1 becomes the
	#      new $acc0 and so on), interleaved with the four mulx of
	#      a[0..3]*b[$i];
	#   2. accumulation of the low product halves, interleaved with the
	#      matching umulxhi;
	#   3. for $i<3 only, the load of b[$i+1];
	#   4. accumulation of the high product halves, with the carry-out
	#      kept in $acc5 and $t0/$t1 preset to acc0<<32 / acc0>>32 for
	#      the next reduction step.
	#
1826e1051a39Sopenharmony_ci	# Reduction iteration is normally performed by accumulating
1827e1051a39Sopenharmony_ci	# result of multiplication of modulus by "magic" digit [and
1828e1051a39Sopenharmony_ci	# omitting least significant word, which is guaranteed to
1829e1051a39Sopenharmony_ci	# be 0], but thanks to special form of modulus and "magic"
1830e1051a39Sopenharmony_ci	# digit being equal to least significant word, it can be
1831e1051a39Sopenharmony_ci	# performed with additions and subtractions alone. Indeed:
1832e1051a39Sopenharmony_ci	#
1833e1051a39Sopenharmony_ci	#            ffff0001.00000000.0000ffff.ffffffff
1834e1051a39Sopenharmony_ci	# *                                     abcdefgh
1835e1051a39Sopenharmony_ci	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1836e1051a39Sopenharmony_ci	#
1837e1051a39Sopenharmony_ci	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1838e1051a39Sopenharmony_ci	# rewrite above as:
1839e1051a39Sopenharmony_ci	#
1840e1051a39Sopenharmony_ci	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1841e1051a39Sopenharmony_ci	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1842e1051a39Sopenharmony_ci	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1843e1051a39Sopenharmony_ci	#
1844e1051a39Sopenharmony_ci	# or marking redundant operations:
1845e1051a39Sopenharmony_ci	#
1846e1051a39Sopenharmony_ci	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1847e1051a39Sopenharmony_ci	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1848e1051a39Sopenharmony_ci	# - 0000abcd.efgh0000.--------.--------.--------
1849e1051a39Sopenharmony_ci	#   ^^^^^^^^ but this word is calculated with umulxhi, because
1850e1051a39Sopenharmony_ci	#            there is no subtract with 64-bit borrow:-(
1851e1051a39Sopenharmony_ci
	# Steps 1 and 2: reduce the previous partial sum and add the low
	# halves of a[0..3]*b[$i].
1852e1051a39Sopenharmony_ci$code.=<<___;
1853e1051a39Sopenharmony_ci	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1854e1051a39Sopenharmony_ci	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1855e1051a39Sopenharmony_ci	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1856e1051a39Sopenharmony_ci	mulx	$a0,$bi,$t0
1857e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc1
1858e1051a39Sopenharmony_ci	mulx	$a1,$bi,$t1
1859e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1860e1051a39Sopenharmony_ci	mulx	$a2,$bi,$t2
1861e1051a39Sopenharmony_ci	addxccc	$acc4,$t3,$acc3
1862e1051a39Sopenharmony_ci	mulx	$a3,$bi,$t3
1863e1051a39Sopenharmony_ci	addxc	$acc5,%g0,$acc4
1864e1051a39Sopenharmony_ci
1865e1051a39Sopenharmony_ci	addcc	$acc0,$t0,$acc0		! accumulate low parts of multiplication
1866e1051a39Sopenharmony_ci	umulxhi	$a0,$bi,$t0
1867e1051a39Sopenharmony_ci	addxccc	$acc1,$t1,$acc1
1868e1051a39Sopenharmony_ci	umulxhi	$a1,$bi,$t1
1869e1051a39Sopenharmony_ci	addxccc	$acc2,$t2,$acc2
1870e1051a39Sopenharmony_ci	umulxhi	$a2,$bi,$t2
1871e1051a39Sopenharmony_ci	addxccc	$acc3,$t3,$acc3
1872e1051a39Sopenharmony_ci	umulxhi	$a3,$bi,$t3
1873e1051a39Sopenharmony_ci	addxc	$acc4,%g0,$acc4
1874e1051a39Sopenharmony_ci___
	# Step 3: fetch the next multiplier word; nothing is loaded in the
	# last round ($i == 3).
1875e1051a39Sopenharmony_ci$code.=<<___	if ($i<3);
1876e1051a39Sopenharmony_ci	ldx	[$bp+8*($i+1)],$bi	! bp[$i+1]
1877e1051a39Sopenharmony_ci___
	# Step 4: add the high halves and prepare $t0/$t1 for the next
	# reduction step.
1878e1051a39Sopenharmony_ci$code.=<<___;
1879e1051a39Sopenharmony_ci	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
1880e1051a39Sopenharmony_ci	 sllx	$acc0,32,$t0
1881e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc2
1882e1051a39Sopenharmony_ci	 srlx	$acc0,32,$t1
1883e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc3
1884e1051a39Sopenharmony_ci	addxccc	$acc4,$t3,$acc4
1885e1051a39Sopenharmony_ci	addxc	%g0,%g0,$acc5
1886e1051a39Sopenharmony_ci___
1887e1051a39Sopenharmony_ci}
# Tail of __ecp_nistz256_mul_mont_vis3 followed by the whole of
# __ecp_nistz256_sqr_mont_vis3; both are appended to $code as SPARC assembly.
#
# The first eight instructions are the closing reduction step of the
# multiplication. No mulx/umulxhi for another b[i] word is interleaved here;
# the step leaves its sum in $acc0-$acc4 and branches to .Lmul_final_vis3,
# the conditional-subtraction tail located at the end of the squaring
# routine. The last addxc follows the branch, i.e. it sits in the delay slot.
#
# __ecp_nistz256_sqr_mont_vis3 squares the four 64-bit limbs in $a0-$a3:
#   1. the six cross products a[i]*a[j] (i<j) are summed into $acc1-$acc6;
#   2. that sum is doubled, the carry out going to $acc7;
#   3. the squares a[i]*a[i] are added, giving the full product in
#      $acc0-$acc7;
#   4. four reduction steps each fold away the current lowest limb ($acc0);
#   5. the upper half ($acc4-$acc7) is added and the shared tail does the
#      conditional subtraction, stores the result at [$rp] and also leaves
#      it in $acc0-$acc3.
# $a0-$a3 are overwritten along the way (umulxhi writes the high halves of
# the squares back into them). $poly3 is complemented before the reductions
# and complemented back in the shared tail.
1888e1051a39Sopenharmony_ci$code.=<<___;
1889e1051a39Sopenharmony_ci	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1890e1051a39Sopenharmony_ci	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1891e1051a39Sopenharmony_ci	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1892e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc1
1893e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1894e1051a39Sopenharmony_ci	addxccc	$acc4,$t3,$acc3
1895e1051a39Sopenharmony_ci	b	.Lmul_final_vis3	! see below
1896e1051a39Sopenharmony_ci	addxc	$acc5,%g0,$acc4
1897e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_mont_vis3,#function
1898e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1899e1051a39Sopenharmony_ci
1900e1051a39Sopenharmony_ci! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less
1901e1051a39Sopenharmony_ci! instructions, but only 14% faster [on T4]...
1902e1051a39Sopenharmony_ci.align	32
1903e1051a39Sopenharmony_ci__ecp_nistz256_sqr_mont_vis3:
1904e1051a39Sopenharmony_ci	!  |  |  |  |  |  |a1*a0|  |
1905e1051a39Sopenharmony_ci	!  |  |  |  |  |a2*a0|  |  |
1906e1051a39Sopenharmony_ci	!  |  |a3*a2|a3*a0|  |  |  |
1907e1051a39Sopenharmony_ci	!  |  |  |  |a2*a1|  |  |  |
1908e1051a39Sopenharmony_ci	!  |  |  |a3*a1|  |  |  |  |
1909e1051a39Sopenharmony_ci	! *|  |  |  |  |  |  |  | 2|
1910e1051a39Sopenharmony_ci	! +|a3*a3|a2*a2|a1*a1|a0*a0|
1911e1051a39Sopenharmony_ci	!  |--+--+--+--+--+--+--+--|
1912e1051a39Sopenharmony_ci	!  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1913e1051a39Sopenharmony_ci	!
1914e1051a39Sopenharmony_ci	!  "can't overflow" below mark carrying into high part of
1915e1051a39Sopenharmony_ci	!  multiplication result, which can't overflow, because it
1916e1051a39Sopenharmony_ci	!  can never be all ones.
1917e1051a39Sopenharmony_ci
1918e1051a39Sopenharmony_ci	mulx	$a1,$a0,$acc1		! a[1]*a[0]
1919e1051a39Sopenharmony_ci	umulxhi	$a1,$a0,$t1
1920e1051a39Sopenharmony_ci	mulx	$a2,$a0,$acc2		! a[2]*a[0]
1921e1051a39Sopenharmony_ci	umulxhi	$a2,$a0,$t2
1922e1051a39Sopenharmony_ci	mulx	$a3,$a0,$acc3		! a[3]*a[0]
1923e1051a39Sopenharmony_ci	umulxhi	$a3,$a0,$acc4
1924e1051a39Sopenharmony_ci
1925e1051a39Sopenharmony_ci	addcc	$acc2,$t1,$acc2		! accumulate high parts of multiplication
1926e1051a39Sopenharmony_ci	mulx	$a2,$a1,$t0		! a[2]*a[1]
1927e1051a39Sopenharmony_ci	umulxhi	$a2,$a1,$t1
1928e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc3
1929e1051a39Sopenharmony_ci	mulx	$a3,$a1,$t2		! a[3]*a[1]
1930e1051a39Sopenharmony_ci	umulxhi	$a3,$a1,$t3
1931e1051a39Sopenharmony_ci	addxc	$acc4,%g0,$acc4		! can't overflow
1932e1051a39Sopenharmony_ci
1933e1051a39Sopenharmony_ci	mulx	$a3,$a2,$acc5		! a[3]*a[2]
1934e1051a39Sopenharmony_ci	not	$poly3,$poly3		! 0xFFFFFFFF00000001
1935e1051a39Sopenharmony_ci	umulxhi	$a3,$a2,$acc6
1936e1051a39Sopenharmony_ci
1937e1051a39Sopenharmony_ci	addcc	$t2,$t1,$t1		! accumulate high parts of multiplication
1938e1051a39Sopenharmony_ci	mulx	$a0,$a0,$acc0		! a[0]*a[0]
1939e1051a39Sopenharmony_ci	addxc	$t3,%g0,$t2		! can't overflow
1940e1051a39Sopenharmony_ci
1941e1051a39Sopenharmony_ci	addcc	$acc3,$t0,$acc3		! accumulate low parts of multiplication
1942e1051a39Sopenharmony_ci	umulxhi	$a0,$a0,$a0
1943e1051a39Sopenharmony_ci	addxccc	$acc4,$t1,$acc4
1944e1051a39Sopenharmony_ci	mulx	$a1,$a1,$t1		! a[1]*a[1]
1945e1051a39Sopenharmony_ci	addxccc	$acc5,$t2,$acc5
1946e1051a39Sopenharmony_ci	umulxhi	$a1,$a1,$a1
1947e1051a39Sopenharmony_ci	addxc	$acc6,%g0,$acc6		! can't overflow
1948e1051a39Sopenharmony_ci
1949e1051a39Sopenharmony_ci	addcc	$acc1,$acc1,$acc1	! acc[1-6]*=2
1950e1051a39Sopenharmony_ci	mulx	$a2,$a2,$t2		! a[2]*a[2]
1951e1051a39Sopenharmony_ci	addxccc	$acc2,$acc2,$acc2
1952e1051a39Sopenharmony_ci	umulxhi	$a2,$a2,$a2
1953e1051a39Sopenharmony_ci	addxccc	$acc3,$acc3,$acc3
1954e1051a39Sopenharmony_ci	mulx	$a3,$a3,$t3		! a[3]*a[3]
1955e1051a39Sopenharmony_ci	addxccc	$acc4,$acc4,$acc4
1956e1051a39Sopenharmony_ci	umulxhi	$a3,$a3,$a3
1957e1051a39Sopenharmony_ci	addxccc	$acc5,$acc5,$acc5
1958e1051a39Sopenharmony_ci	addxccc	$acc6,$acc6,$acc6
1959e1051a39Sopenharmony_ci	addxc	%g0,%g0,$acc7
1960e1051a39Sopenharmony_ci
1961e1051a39Sopenharmony_ci	addcc	$acc1,$a0,$acc1		! +a[i]*a[i]
1962e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc2
1963e1051a39Sopenharmony_ci	addxccc	$acc3,$a1,$acc3
1964e1051a39Sopenharmony_ci	addxccc	$acc4,$t2,$acc4
1965e1051a39Sopenharmony_ci	 sllx	$acc0,32,$t0
1966e1051a39Sopenharmony_ci	addxccc	$acc5,$a2,$acc5
1967e1051a39Sopenharmony_ci	 srlx	$acc0,32,$t1
1968e1051a39Sopenharmony_ci	addxccc	$acc6,$t3,$acc6
1969e1051a39Sopenharmony_ci	 sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1970e1051a39Sopenharmony_ci	addxc	$acc7,$a3,$acc7
1971e1051a39Sopenharmony_ci___
# First three reduction steps, emitted by unrolling at generation time. Each
# step also derives, from the $acc0 it has just produced, the shifted halves
# ($t0, $t1) and the low product word ($t2) consumed by the next step; that
# is why the sllx/srlx/sub trio appears once before the loop and again in
# every iteration.
1972e1051a39Sopenharmony_cifor($i=0;$i<3;$i++) {			# reductions, see commentary
1973e1051a39Sopenharmony_ci					# in multiplication for details
1974e1051a39Sopenharmony_ci$code.=<<___;
1975e1051a39Sopenharmony_ci	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1976e1051a39Sopenharmony_ci	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1977e1051a39Sopenharmony_ci	 sllx	$acc0,32,$t0
1978e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc1
1979e1051a39Sopenharmony_ci	 srlx	$acc0,32,$t1
1980e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1981e1051a39Sopenharmony_ci	 sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1982e1051a39Sopenharmony_ci	addxc	%g0,$t3,$acc3		! can't overflow
1983e1051a39Sopenharmony_ci___
1984e1051a39Sopenharmony_ci}
# Fourth reduction step: nothing follows it, so the look-ahead sllx/srlx/sub
# are left out. Afterwards the upper half of the square is added in and
# control falls into .Lmul_final_vis3, the tail that the multiplication
# routine also branches to. The tail adds the constants 1, $poly1, $minus1,
# $poly3 (and $minus1 for the fifth word) and keeps that sum, via movcs, only
# when the addition carried out; otherwise $acc0-$acc3 are stored unchanged.
# The last store sits in the delay slot of retl.
1985e1051a39Sopenharmony_ci$code.=<<___;
1986e1051a39Sopenharmony_ci	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1987e1051a39Sopenharmony_ci	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1988e1051a39Sopenharmony_ci	addxccc	$acc2,$t1,$acc1
1989e1051a39Sopenharmony_ci	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1990e1051a39Sopenharmony_ci	addxc	%g0,$t3,$acc3		! can't overflow
1991e1051a39Sopenharmony_ci
1992e1051a39Sopenharmony_ci	addcc	$acc0,$acc4,$acc0	! accumulate upper half
1993e1051a39Sopenharmony_ci	addxccc	$acc1,$acc5,$acc1
1994e1051a39Sopenharmony_ci	addxccc	$acc2,$acc6,$acc2
1995e1051a39Sopenharmony_ci	addxccc	$acc3,$acc7,$acc3
1996e1051a39Sopenharmony_ci	addxc	%g0,%g0,$acc4
1997e1051a39Sopenharmony_ci
1998e1051a39Sopenharmony_ci.Lmul_final_vis3:
1999e1051a39Sopenharmony_ci
2000e1051a39Sopenharmony_ci	! Final step is "if result > mod, subtract mod", but as comparison
2001e1051a39Sopenharmony_ci	! means subtraction, we do the subtraction and then copy outcome
2002e1051a39Sopenharmony_ci	! if it didn't borrow. But note that as we [have to] replace
2003e1051a39Sopenharmony_ci	! subtraction with addition with negative, carry/borrow logic is
2004e1051a39Sopenharmony_ci	! inverse.
2005e1051a39Sopenharmony_ci
2006e1051a39Sopenharmony_ci	addcc	$acc0,1,$t0		! add -modulus, i.e. subtract
2007e1051a39Sopenharmony_ci	not	$poly3,$poly3		! restore 0x00000000FFFFFFFE
2008e1051a39Sopenharmony_ci	addxccc	$acc1,$poly1,$t1
2009e1051a39Sopenharmony_ci	addxccc	$acc2,$minus1,$t2
2010e1051a39Sopenharmony_ci	addxccc	$acc3,$poly3,$t3
2011e1051a39Sopenharmony_ci	addxccc	$acc4,$minus1,%g0	! did it carry?
2012e1051a39Sopenharmony_ci
2013e1051a39Sopenharmony_ci	movcs	%xcc,$t0,$acc0
2014e1051a39Sopenharmony_ci	movcs	%xcc,$t1,$acc1
2015e1051a39Sopenharmony_ci	stx	$acc0,[$rp]
2016e1051a39Sopenharmony_ci	movcs	%xcc,$t2,$acc2
2017e1051a39Sopenharmony_ci	stx	$acc1,[$rp+8]
2018e1051a39Sopenharmony_ci	movcs	%xcc,$t3,$acc3
2019e1051a39Sopenharmony_ci	stx	$acc2,[$rp+16]
2020e1051a39Sopenharmony_ci	retl
2021e1051a39Sopenharmony_ci	stx	$acc3,[$rp+24]
2022e1051a39Sopenharmony_ci.type	__ecp_nistz256_sqr_mont_vis3,#function
2023e1051a39Sopenharmony_ci.size	__ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
2024e1051a39Sopenharmony_ci___
2025e1051a39Sopenharmony_ci
2026e1051a39Sopenharmony_ci########################################################################
2027e1051a39Sopenharmony_ci# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
2028e1051a39Sopenharmony_ci#
2029e1051a39Sopenharmony_ci{
# Each name below is the byte offset (a multiple of 32) of one 256-bit
# temporary inside the routine's stack frame; the generated code addresses
# them as %sp+LOCALS64+<offset>.
2030e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
2031e1051a39Sopenharmony_ci    $in_x,$in_y,$in_z,
2032e1051a39Sopenharmony_ci    $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
2033e1051a39Sopenharmony_ci# above map() describes stack layout with 10 temporary
2034e1051a39Sopenharmony_ci# 256-bit vectors on top.
2035e1051a39Sopenharmony_ci
# Emits ecp_nistz256_point_double_vis3. Outline of the generated routine:
#  - open a register window with room for the ten temporaries and keep the
#    output pointer in $rp_real, because $rp is re-pointed before every call;
#  - load the constants $minus1, $poly1 and $poly3;
#  - read the input point through $ap as 32-bit words (the word at the lower
#    address is the low half), pack each coordinate into four 64-bit limbs
#    and spill in_x, in_y and in_z to the frame;
#  - run the chain of field operations; the C-level meaning of every call is
#    given by the "! p256_...()" comment on the call line;
#  - split res_x, res_y and res_z back into 32-bit words and store them at
#    $rp_real, $rp_real+32 and $rp_real+64.
# The instruction written after each call occupies its delay slot and
# normally sets $rp for the callee. Loads marked "forward load" fetch
# operands for a later call while an earlier one is still pending.
# The label .Ldouble_shortcut_vis3 sits after the frame setup; presumably it
# is an entry point used from elsewhere in the file - confirm against the
# other routines.
#
# NOTE(review): after in_z has been merged into $a0-$a3 there are four more
# "sllx $t0..$t3,32" instructions; no instruction in this routine reads
# those registers before they are rewritten, so they look redundant -
# confirm that the callees do not depend on them before removing.
# NOTE(review): the delay slot of the last __ecp_nistz256_sub_from_vis3 call
# sets $bp a second time instead of $rp. res_y is written to the caller's
# buffer from $acc0-$acc3, and the frame copy is not read again here, so
# this appears harmless - confirm where the callee stores before changing.
2036e1051a39Sopenharmony_ci$code.=<<___;
2037e1051a39Sopenharmony_ci.align	32
2038e1051a39Sopenharmony_ciecp_nistz256_point_double_vis3:
2039e1051a39Sopenharmony_ci	save	%sp,-STACK64_FRAME-32*10,%sp
2040e1051a39Sopenharmony_ci
2041e1051a39Sopenharmony_ci	mov	$rp,$rp_real
2042e1051a39Sopenharmony_ci.Ldouble_shortcut_vis3:
2043e1051a39Sopenharmony_ci	mov	-1,$minus1
2044e1051a39Sopenharmony_ci	mov	-2,$poly3
2045e1051a39Sopenharmony_ci	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
2046e1051a39Sopenharmony_ci	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE
2047e1051a39Sopenharmony_ci
2048e1051a39Sopenharmony_ci	! convert input to uint64_t[4]
2049e1051a39Sopenharmony_ci	ld	[$ap],$a0			! in_x
2050e1051a39Sopenharmony_ci	ld	[$ap+4],$t0
2051e1051a39Sopenharmony_ci	ld	[$ap+8],$a1
2052e1051a39Sopenharmony_ci	ld	[$ap+12],$t1
2053e1051a39Sopenharmony_ci	ld	[$ap+16],$a2
2054e1051a39Sopenharmony_ci	ld	[$ap+20],$t2
2055e1051a39Sopenharmony_ci	ld	[$ap+24],$a3
2056e1051a39Sopenharmony_ci	ld	[$ap+28],$t3
2057e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2058e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2059e1051a39Sopenharmony_ci	ld	[$ap+32],$acc0			! in_y
2060e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2061e1051a39Sopenharmony_ci	ld	[$ap+32+4],$t0
2062e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2063e1051a39Sopenharmony_ci	ld	[$ap+32+8],$acc1
2064e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2065e1051a39Sopenharmony_ci	ld	[$ap+32+12],$t1
2066e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2067e1051a39Sopenharmony_ci	ld	[$ap+32+16],$acc2
2068e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2069e1051a39Sopenharmony_ci	ld	[$ap+32+20],$t2
2070e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2071e1051a39Sopenharmony_ci	ld	[$ap+32+24],$acc3
2072e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2073e1051a39Sopenharmony_ci	ld	[$ap+32+28],$t3
2074e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2075e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in_x]
2076e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2077e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in_x+8]
2078e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2079e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in_x+16]
2080e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2081e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in_x+24]
2082e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2083e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in_y]
2084e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2085e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in_y+8]
2086e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2087e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in_y+16]
2088e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in_y+24]
2089e1051a39Sopenharmony_ci
2090e1051a39Sopenharmony_ci	ld	[$ap+64],$a0			! in_z
2091e1051a39Sopenharmony_ci	ld	[$ap+64+4],$t0
2092e1051a39Sopenharmony_ci	ld	[$ap+64+8],$a1
2093e1051a39Sopenharmony_ci	ld	[$ap+64+12],$t1
2094e1051a39Sopenharmony_ci	ld	[$ap+64+16],$a2
2095e1051a39Sopenharmony_ci	ld	[$ap+64+20],$t2
2096e1051a39Sopenharmony_ci	ld	[$ap+64+24],$a3
2097e1051a39Sopenharmony_ci	ld	[$ap+64+28],$t3
2098e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2099e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2100e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2101e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2102e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2103e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2104e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2105e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2106e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2107e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2108e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in_z]
2109e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2110e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in_z+8]
2111e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2112e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in_z+16]
2113e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in_z+24]
2114e1051a39Sopenharmony_ci
2115e1051a39Sopenharmony_ci	! in_y is still in $acc0-$acc3
2116e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(S, in_y);
2117e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$rp
2118e1051a39Sopenharmony_ci
2119e1051a39Sopenharmony_ci	! in_z is still in $a0-$a3
2120e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Zsqr, in_z);
2121e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Zsqr,$rp
2122e1051a39Sopenharmony_ci
2123e1051a39Sopenharmony_ci	mov	$acc0,$a0			! put Zsqr aside
2124e1051a39Sopenharmony_ci	mov	$acc1,$a1
2125e1051a39Sopenharmony_ci	mov	$acc2,$a2
2126e1051a39Sopenharmony_ci	mov	$acc3,$a3
2127e1051a39Sopenharmony_ci
2128e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$in_x,$bp
2129e1051a39Sopenharmony_ci	call	__ecp_nistz256_add_vis3		! p256_add(M, Zsqr, in_x);
2130e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$M,$rp
2131e1051a39Sopenharmony_ci
2132e1051a39Sopenharmony_ci	mov	$a0,$acc0			! restore Zsqr
2133e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S],$a0		! forward load
2134e1051a39Sopenharmony_ci	mov	$a1,$acc1
2135e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S+8],$a1
2136e1051a39Sopenharmony_ci	mov	$a2,$acc2
2137e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S+16],$a2
2138e1051a39Sopenharmony_ci	mov	$a3,$acc3
2139e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S+24],$a3
2140e1051a39Sopenharmony_ci
2141e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$in_x,$bp
2142e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(Zsqr, in_x, Zsqr);
2143e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Zsqr,$rp
2144e1051a39Sopenharmony_ci
2145e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(S, S);
2146e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$rp
2147e1051a39Sopenharmony_ci
2148e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_z],$bi
2149e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_y],$a0
2150e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_y+8],$a1
2151e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_y+16],$a2
2152e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_y+24],$a3
2153e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$in_z,$bp
2154e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(tmp0, in_z, in_y);
2155e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$tmp0,$rp
2156e1051a39Sopenharmony_ci
2157e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M],$bi		! forward load
2158e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Zsqr],$a0
2159e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Zsqr+8],$a1
2160e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Zsqr+16],$a2
2161e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Zsqr+24],$a3
2162e1051a39Sopenharmony_ci
2163e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(res_z, tmp0);
2164e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_z,$rp
2165e1051a39Sopenharmony_ci
2166e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$M,$bp
2167e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(M, M, Zsqr);
2168e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$M,$rp
2169e1051a39Sopenharmony_ci
2170e1051a39Sopenharmony_ci	mov	$acc0,$a0			! put aside M
2171e1051a39Sopenharmony_ci	mov	$acc1,$a1
2172e1051a39Sopenharmony_ci	mov	$acc2,$a2
2173e1051a39Sopenharmony_ci	mov	$acc3,$a3
2174e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2_vis3
2175e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$M,$rp
2176e1051a39Sopenharmony_ci	mov	$a0,$t0				! copy M
2177e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S],$a0		! forward load
2178e1051a39Sopenharmony_ci	mov	$a1,$t1
2179e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S+8],$a1
2180e1051a39Sopenharmony_ci	mov	$a2,$t2
2181e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S+16],$a2
2182e1051a39Sopenharmony_ci	mov	$a3,$t3
2183e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S+24],$a3
2184e1051a39Sopenharmony_ci	call	__ecp_nistz256_add_noload_vis3	! p256_mul_by_3(M, M);
2185e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$M,$rp
2186e1051a39Sopenharmony_ci
2187e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(tmp0, S);
2188e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$tmp0,$rp
2189e1051a39Sopenharmony_ci
2190e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S],$bi		! forward load
2191e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_x],$a0
2192e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_x+8],$a1
2193e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_x+16],$a2
2194e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in_x+24],$a3
2195e1051a39Sopenharmony_ci
2196e1051a39Sopenharmony_ci	call	__ecp_nistz256_div_by_2_vis3	! p256_div_by_2(res_y, tmp0);
2197e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2198e1051a39Sopenharmony_ci
2199e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$bp
2200e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S, S, in_x);
2201e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$rp
2202e1051a39Sopenharmony_ci
2203e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M],$a0		! forward load
2204e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M+8],$a1
2205e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M+16],$a2
2206e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M+24],$a3
2207e1051a39Sopenharmony_ci
2208e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(tmp0, S);
2209e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$tmp0,$rp
2210e1051a39Sopenharmony_ci
2211e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(res_x, M);
2212e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_x,$rp
2213e1051a39Sopenharmony_ci
2214e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$tmp0,$bp
2215e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_x, res_x, tmp0);
2216e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_x,$rp
2217e1051a39Sopenharmony_ci
2218e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M],$a0		! forward load
2219e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M+8],$a1
2220e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M+16],$a2
2221e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$M+24],$a3
2222e1051a39Sopenharmony_ci
2223e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$bp
2224e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(S, S, res_x);
2225e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$rp
2226e1051a39Sopenharmony_ci
2227e1051a39Sopenharmony_ci	mov	$acc0,$bi
2228e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S, S, M);
2229e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S,$rp
2230e1051a39Sopenharmony_ci
2231e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_x],$a0	! forward load
2232e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_x+8],$a1
2233e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_x+16],$a2
2234e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_x+24],$a3
2235e1051a39Sopenharmony_ci
2236e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$bp
2237e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, S, res_y);
2238e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$bp
2239e1051a39Sopenharmony_ci
2240e1051a39Sopenharmony_ci	! convert output to uint_32[8]
2241e1051a39Sopenharmony_ci	srlx	$a0,32,$t0
2242e1051a39Sopenharmony_ci	srlx	$a1,32,$t1
2243e1051a39Sopenharmony_ci	st	$a0,[$rp_real]			! res_x
2244e1051a39Sopenharmony_ci	srlx	$a2,32,$t2
2245e1051a39Sopenharmony_ci	st	$t0,[$rp_real+4]
2246e1051a39Sopenharmony_ci	srlx	$a3,32,$t3
2247e1051a39Sopenharmony_ci	st	$a1,[$rp_real+8]
2248e1051a39Sopenharmony_ci	st	$t1,[$rp_real+12]
2249e1051a39Sopenharmony_ci	st	$a2,[$rp_real+16]
2250e1051a39Sopenharmony_ci	st	$t2,[$rp_real+20]
2251e1051a39Sopenharmony_ci	st	$a3,[$rp_real+24]
2252e1051a39Sopenharmony_ci	st	$t3,[$rp_real+28]
2253e1051a39Sopenharmony_ci
2254e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_z],$a0	! forward load
2255e1051a39Sopenharmony_ci	srlx	$acc0,32,$t0
2256e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_z+8],$a1
2257e1051a39Sopenharmony_ci	srlx	$acc1,32,$t1
2258e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_z+16],$a2
2259e1051a39Sopenharmony_ci	srlx	$acc2,32,$t2
2260e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_z+24],$a3
2261e1051a39Sopenharmony_ci	srlx	$acc3,32,$t3
2262e1051a39Sopenharmony_ci	st	$acc0,[$rp_real+32]		! res_y
2263e1051a39Sopenharmony_ci	st	$t0,  [$rp_real+32+4]
2264e1051a39Sopenharmony_ci	st	$acc1,[$rp_real+32+8]
2265e1051a39Sopenharmony_ci	st	$t1,  [$rp_real+32+12]
2266e1051a39Sopenharmony_ci	st	$acc2,[$rp_real+32+16]
2267e1051a39Sopenharmony_ci	st	$t2,  [$rp_real+32+20]
2268e1051a39Sopenharmony_ci	st	$acc3,[$rp_real+32+24]
2269e1051a39Sopenharmony_ci	st	$t3,  [$rp_real+32+28]
2270e1051a39Sopenharmony_ci
2271e1051a39Sopenharmony_ci	srlx	$a0,32,$t0
2272e1051a39Sopenharmony_ci	srlx	$a1,32,$t1
2273e1051a39Sopenharmony_ci	st	$a0,[$rp_real+64]		! res_z
2274e1051a39Sopenharmony_ci	srlx	$a2,32,$t2
2275e1051a39Sopenharmony_ci	st	$t0,[$rp_real+64+4]
2276e1051a39Sopenharmony_ci	srlx	$a3,32,$t3
2277e1051a39Sopenharmony_ci	st	$a1,[$rp_real+64+8]
2278e1051a39Sopenharmony_ci	st	$t1,[$rp_real+64+12]
2279e1051a39Sopenharmony_ci	st	$a2,[$rp_real+64+16]
2280e1051a39Sopenharmony_ci	st	$t2,[$rp_real+64+20]
2281e1051a39Sopenharmony_ci	st	$a3,[$rp_real+64+24]
2282e1051a39Sopenharmony_ci	st	$t3,[$rp_real+64+28]
2283e1051a39Sopenharmony_ci
2284e1051a39Sopenharmony_ci	ret
2285e1051a39Sopenharmony_ci	restore
2286e1051a39Sopenharmony_ci.type	ecp_nistz256_point_double_vis3,#function
2287e1051a39Sopenharmony_ci.size	ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2288e1051a39Sopenharmony_ci___
2289e1051a39Sopenharmony_ci}
2290e1051a39Sopenharmony_ci########################################################################
2291e1051a39Sopenharmony_ci# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2292e1051a39Sopenharmony_ci#			      const P256_POINT *in2);
2293e1051a39Sopenharmony_ci{
2294e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
2295e1051a39Sopenharmony_ci    $in1_x,$in1_y,$in1_z,
2296e1051a39Sopenharmony_ci    $in2_x,$in2_y,$in2_z,
2297e1051a39Sopenharmony_ci    $H,$Hsqr,$R,$Rsqr,$Hcub,
2298e1051a39Sopenharmony_ci    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2299e1051a39Sopenharmony_cimy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2300e1051a39Sopenharmony_ci
2301e1051a39Sopenharmony_ci# above map() describes stack layout with 18 temporary
2302e1051a39Sopenharmony_ci# 256-bit vectors on top. Then we reserve some space for
2303e1051a39Sopenharmony_ci# !in1infty, !in2infty and result of check for zero.
2304e1051a39Sopenharmony_ci
2305e1051a39Sopenharmony_ci$code.=<<___;
2306e1051a39Sopenharmony_ci.align	32
2307e1051a39Sopenharmony_ciecp_nistz256_point_add_vis3:
2308e1051a39Sopenharmony_ci	save	%sp,-STACK64_FRAME-32*18-32,%sp
2309e1051a39Sopenharmony_ci
2310e1051a39Sopenharmony_ci	mov	$rp,$rp_real
2311e1051a39Sopenharmony_ci	mov	-1,$minus1
2312e1051a39Sopenharmony_ci	mov	-2,$poly3
2313e1051a39Sopenharmony_ci	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
2314e1051a39Sopenharmony_ci	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE
2315e1051a39Sopenharmony_ci
2316e1051a39Sopenharmony_ci	! convert input to uint64_t[4]
2317e1051a39Sopenharmony_ci	ld	[$bp],$a0			! in2_x
2318e1051a39Sopenharmony_ci	ld	[$bp+4],$t0
2319e1051a39Sopenharmony_ci	ld	[$bp+8],$a1
2320e1051a39Sopenharmony_ci	ld	[$bp+12],$t1
2321e1051a39Sopenharmony_ci	ld	[$bp+16],$a2
2322e1051a39Sopenharmony_ci	ld	[$bp+20],$t2
2323e1051a39Sopenharmony_ci	ld	[$bp+24],$a3
2324e1051a39Sopenharmony_ci	ld	[$bp+28],$t3
2325e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2326e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2327e1051a39Sopenharmony_ci	ld	[$bp+32],$acc0			! in2_y
2328e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2329e1051a39Sopenharmony_ci	ld	[$bp+32+4],$t0
2330e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2331e1051a39Sopenharmony_ci	ld	[$bp+32+8],$acc1
2332e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2333e1051a39Sopenharmony_ci	ld	[$bp+32+12],$t1
2334e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2335e1051a39Sopenharmony_ci	ld	[$bp+32+16],$acc2
2336e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2337e1051a39Sopenharmony_ci	ld	[$bp+32+20],$t2
2338e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2339e1051a39Sopenharmony_ci	ld	[$bp+32+24],$acc3
2340e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2341e1051a39Sopenharmony_ci	ld	[$bp+32+28],$t3
2342e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2343e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in2_x]
2344e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2345e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in2_x+8]
2346e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2347e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in2_x+16]
2348e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2349e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in2_x+24]
2350e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2351e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in2_y]
2352e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2353e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in2_y+8]
2354e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2355e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in2_y+16]
2356e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in2_y+24]
2357e1051a39Sopenharmony_ci
2358e1051a39Sopenharmony_ci	ld	[$bp+64],$acc0			! in2_z
2359e1051a39Sopenharmony_ci	ld	[$bp+64+4],$t0
2360e1051a39Sopenharmony_ci	ld	[$bp+64+8],$acc1
2361e1051a39Sopenharmony_ci	ld	[$bp+64+12],$t1
2362e1051a39Sopenharmony_ci	ld	[$bp+64+16],$acc2
2363e1051a39Sopenharmony_ci	ld	[$bp+64+20],$t2
2364e1051a39Sopenharmony_ci	ld	[$bp+64+24],$acc3
2365e1051a39Sopenharmony_ci	ld	[$bp+64+28],$t3
2366e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2367e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2368e1051a39Sopenharmony_ci	ld	[$ap],$a0			! in1_x
2369e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2370e1051a39Sopenharmony_ci	ld	[$ap+4],$t0
2371e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2372e1051a39Sopenharmony_ci	ld	[$ap+8],$a1
2373e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2374e1051a39Sopenharmony_ci	ld	[$ap+12],$t1
2375e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2376e1051a39Sopenharmony_ci	ld	[$ap+16],$a2
2377e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2378e1051a39Sopenharmony_ci	ld	[$ap+20],$t2
2379e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2380e1051a39Sopenharmony_ci	ld	[$ap+24],$a3
2381e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2382e1051a39Sopenharmony_ci	ld	[$ap+28],$t3
2383e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2384e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in2_z]
2385e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2386e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in2_z+8]
2387e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2388e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in2_z+16]
2389e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in2_z+24]
2390e1051a39Sopenharmony_ci
2391e1051a39Sopenharmony_ci	or	$acc1,$acc0,$acc0
2392e1051a39Sopenharmony_ci	or	$acc3,$acc2,$acc2
2393e1051a39Sopenharmony_ci	or	$acc2,$acc0,$acc0
2394e1051a39Sopenharmony_ci	movrnz	$acc0,-1,$acc0			! !in2infty
2395e1051a39Sopenharmony_ci	stx	$acc0,[%fp+STACK_BIAS-8]
2396e1051a39Sopenharmony_ci
2397e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2398e1051a39Sopenharmony_ci	ld	[$ap+32],$acc0			! in1_y
2399e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2400e1051a39Sopenharmony_ci	ld	[$ap+32+4],$t0
2401e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2402e1051a39Sopenharmony_ci	ld	[$ap+32+8],$acc1
2403e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2404e1051a39Sopenharmony_ci	ld	[$ap+32+12],$t1
2405e1051a39Sopenharmony_ci	ld	[$ap+32+16],$acc2
2406e1051a39Sopenharmony_ci	ld	[$ap+32+20],$t2
2407e1051a39Sopenharmony_ci	ld	[$ap+32+24],$acc3
2408e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2409e1051a39Sopenharmony_ci	ld	[$ap+32+28],$t3
2410e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2411e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in1_x]
2412e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2413e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in1_x+8]
2414e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2415e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in1_x+16]
2416e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2417e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in1_x+24]
2418e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2419e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in1_y]
2420e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2421e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in1_y+8]
2422e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2423e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in1_y+16]
2424e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in1_y+24]
2425e1051a39Sopenharmony_ci
2426e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z],$a0	! forward load
2427e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+8],$a1
2428e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+16],$a2
2429e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+24],$a3
2430e1051a39Sopenharmony_ci
2431e1051a39Sopenharmony_ci	ld	[$ap+64],$acc0			! in1_z
2432e1051a39Sopenharmony_ci	ld	[$ap+64+4],$t0
2433e1051a39Sopenharmony_ci	ld	[$ap+64+8],$acc1
2434e1051a39Sopenharmony_ci	ld	[$ap+64+12],$t1
2435e1051a39Sopenharmony_ci	ld	[$ap+64+16],$acc2
2436e1051a39Sopenharmony_ci	ld	[$ap+64+20],$t2
2437e1051a39Sopenharmony_ci	ld	[$ap+64+24],$acc3
2438e1051a39Sopenharmony_ci	ld	[$ap+64+28],$t3
2439e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2440e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2441e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2442e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2443e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2444e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2445e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in1_z]
2446e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2447e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in1_z+8]
2448e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2449e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in1_z+16]
2450e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in1_z+24]
2451e1051a39Sopenharmony_ci
2452e1051a39Sopenharmony_ci	or	$acc1,$acc0,$acc0
2453e1051a39Sopenharmony_ci	or	$acc3,$acc2,$acc2
2454e1051a39Sopenharmony_ci	or	$acc2,$acc0,$acc0
2455e1051a39Sopenharmony_ci	movrnz	$acc0,-1,$acc0			! !in1infty
2456e1051a39Sopenharmony_ci	stx	$acc0,[%fp+STACK_BIAS-16]
2457e1051a39Sopenharmony_ci
2458e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z2sqr, in2_z);
2459e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z2sqr,$rp
2460e1051a39Sopenharmony_ci
2461e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z],$a0
2462e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2463e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2464e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2465e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z1sqr, in1_z);
2466e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z1sqr,$rp
2467e1051a39Sopenharmony_ci
2468e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Z2sqr],$bi
2469e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z],$a0
2470e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+8],$a1
2471e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+16],$a2
2472e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+24],$a3
2473e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z2sqr,$bp
2474e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S1, Z2sqr, in2_z);
2475e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S1,$rp
2476e1051a39Sopenharmony_ci
2477e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Z1sqr],$bi
2478e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z],$a0
2479e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2480e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2481e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2482e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z1sqr,$bp
2483e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, Z1sqr, in1_z);
2484e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$rp
2485e1051a39Sopenharmony_ci
2486e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S1],$bi
2487e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y],$a0
2488e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y+8],$a1
2489e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y+16],$a2
2490e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y+24],$a3
2491e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S1,$bp
2492e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S1, S1, in1_y);
2493e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S1,$rp
2494e1051a39Sopenharmony_ci
2495e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S2],$bi
2496e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y],$a0
2497e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y+8],$a1
2498e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y+16],$a2
2499e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y+24],$a3
2500e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$bp
2501e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S2, in2_y);
2502e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$rp
2503e1051a39Sopenharmony_ci
2504e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Z2sqr],$bi	! forward load
2505e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x],$a0
2506e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+8],$a1
2507e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+16],$a2
2508e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+24],$a3
2509e1051a39Sopenharmony_ci
2510e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S1,$bp
2511e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(R, S2, S1);
2512e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$R,$rp
2513e1051a39Sopenharmony_ci
2514e1051a39Sopenharmony_ci	or	$acc1,$acc0,$acc0		! see if result is zero
2515e1051a39Sopenharmony_ci	or	$acc3,$acc2,$acc2
2516e1051a39Sopenharmony_ci	or	$acc2,$acc0,$acc0
2517e1051a39Sopenharmony_ci	stx	$acc0,[%fp+STACK_BIAS-24]
2518e1051a39Sopenharmony_ci
2519e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z2sqr,$bp
2520e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U1, in1_x, Z2sqr);
2521e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U1,$rp
2522e1051a39Sopenharmony_ci
2523e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Z1sqr],$bi
2524e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x],$a0
2525e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x+8],$a1
2526e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x+16],$a2
2527e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x+24],$a3
2528e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z1sqr,$bp
2529e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, in2_x, Z1sqr);
2530e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U2,$rp
2531e1051a39Sopenharmony_ci
2532e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R],$a0		! forward load
2533e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R+8],$a1
2534e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R+16],$a2
2535e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R+24],$a3
2536e1051a39Sopenharmony_ci
2537e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U1,$bp
2538e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(H, U2, U1);
2539e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$H,$rp
2540e1051a39Sopenharmony_ci
2541e1051a39Sopenharmony_ci	or	$acc1,$acc0,$acc0		! see if result is zero
2542e1051a39Sopenharmony_ci	or	$acc3,$acc2,$acc2
2543e1051a39Sopenharmony_ci	orcc	$acc2,$acc0,$acc0
2544e1051a39Sopenharmony_ci
2545e1051a39Sopenharmony_ci	bne,pt	%xcc,.Ladd_proceed_vis3		! is_equal(U1,U2)?
2546e1051a39Sopenharmony_ci	nop
2547e1051a39Sopenharmony_ci
2548e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$t0
2549e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-16],$t1
2550e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-24],$t2
2551e1051a39Sopenharmony_ci	andcc	$t0,$t1,%g0
2552e1051a39Sopenharmony_ci	be,pt	%xcc,.Ladd_proceed_vis3		! (in1infty || in2infty)?
2553e1051a39Sopenharmony_ci	nop
2554e1051a39Sopenharmony_ci	andcc	$t2,$t2,%g0
2555e1051a39Sopenharmony_ci	be,a,pt	%xcc,.Ldouble_shortcut_vis3	! is_equal(S1,S2)?
2556e1051a39Sopenharmony_ci	add	%sp,32*(12-10)+32,%sp		! difference in frame sizes
2557e1051a39Sopenharmony_ci
2558e1051a39Sopenharmony_ci	st	%g0,[$rp_real]
2559e1051a39Sopenharmony_ci	st	%g0,[$rp_real+4]
2560e1051a39Sopenharmony_ci	st	%g0,[$rp_real+8]
2561e1051a39Sopenharmony_ci	st	%g0,[$rp_real+12]
2562e1051a39Sopenharmony_ci	st	%g0,[$rp_real+16]
2563e1051a39Sopenharmony_ci	st	%g0,[$rp_real+20]
2564e1051a39Sopenharmony_ci	st	%g0,[$rp_real+24]
2565e1051a39Sopenharmony_ci	st	%g0,[$rp_real+28]
2566e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32]
2567e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+4]
2568e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+8]
2569e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+12]
2570e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+16]
2571e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+20]
2572e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+24]
2573e1051a39Sopenharmony_ci	st	%g0,[$rp_real+32+28]
2574e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64]
2575e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+4]
2576e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+8]
2577e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+12]
2578e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+16]
2579e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+20]
2580e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+24]
2581e1051a39Sopenharmony_ci	st	%g0,[$rp_real+64+28]
2582e1051a39Sopenharmony_ci	b	.Ladd_done_vis3
2583e1051a39Sopenharmony_ci	nop
2584e1051a39Sopenharmony_ci
2585e1051a39Sopenharmony_ci.align	16
2586e1051a39Sopenharmony_ci.Ladd_proceed_vis3:
2587e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Rsqr, R);
2588e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Rsqr,$rp
2589e1051a39Sopenharmony_ci
2590e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H],$bi
2591e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z],$a0
2592e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2593e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2594e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2595e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$H,$bp
2596e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, H, in1_z);
2597e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_z,$rp
2598e1051a39Sopenharmony_ci
2599e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H],$a0
2600e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H+8],$a1
2601e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H+16],$a2
2602e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H+24],$a3
2603e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Hsqr, H);
2604e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hsqr,$rp
2605e1051a39Sopenharmony_ci
2606e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_z],$bi
2607e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z],$a0
2608e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+8],$a1
2609e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+16],$a2
2610e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_z+24],$a3
2611e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_z,$bp
2612e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, res_z, in2_z);
2613e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_z,$rp
2614e1051a39Sopenharmony_ci
2615e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H],$bi
2616e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr],$a0
2617e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
2618e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
2619e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
2620e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$H,$bp
2621e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(Hcub, Hsqr, H);
2622e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hcub,$rp
2623e1051a39Sopenharmony_ci
2624e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$U1],$bi
2625e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr],$a0
2626e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
2627e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
2628e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
2629e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U1,$bp
2630e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, U1, Hsqr);
2631e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U2,$rp
2632e1051a39Sopenharmony_ci
2633e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(Hsqr, U2);
2634e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hsqr,$rp
2635e1051a39Sopenharmony_ci
2636e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Rsqr,$bp
2637e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_x, Rsqr, Hsqr);
2638e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_x,$rp
2639e1051a39Sopenharmony_ci
2640e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hcub,$bp
2641e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	!  p256_sub(res_x, res_x, Hcub);
2642e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_x,$rp
2643e1051a39Sopenharmony_ci
2644e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S1],$bi		! forward load
2645e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hcub],$a0
2646e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hcub+8],$a1
2647e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hcub+16],$a2
2648e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hcub+24],$a3
2649e1051a39Sopenharmony_ci
2650e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U2,$bp
2651e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_y, U2, res_x);
2652e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2653e1051a39Sopenharmony_ci
2654e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S1,$bp
2655e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S1, Hcub);
2656e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$rp
2657e1051a39Sopenharmony_ci
2658e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R],$bi
2659e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y],$a0
2660e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y+8],$a1
2661e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y+16],$a2
2662e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y+24],$a3
2663e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$R,$bp
2664e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_y, res_y, R);
2665e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2666e1051a39Sopenharmony_ci
2667e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$bp
2668e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, res_y, S2);
2669e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2670e1051a39Sopenharmony_ci
2671e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-16],$t1		! !in1infty
2672e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$t2		! !in2infty
2673e1051a39Sopenharmony_ci___
2674e1051a39Sopenharmony_cifor($i=0;$i<96;$i+=16) {			# conditional moves
2675e1051a39Sopenharmony_ci$code.=<<___;
2676e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
2677e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
2678e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x+$i],$acc2	! in2
2679e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x+$i+8],$acc3
2680e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
2681e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
2682e1051a39Sopenharmony_ci	movrz	$t1,$acc2,$acc0
2683e1051a39Sopenharmony_ci	movrz	$t1,$acc3,$acc1
2684e1051a39Sopenharmony_ci	movrz	$t2,$acc4,$acc0
2685e1051a39Sopenharmony_ci	movrz	$t2,$acc5,$acc1
2686e1051a39Sopenharmony_ci	srlx	$acc0,32,$acc2
2687e1051a39Sopenharmony_ci	srlx	$acc1,32,$acc3
2688e1051a39Sopenharmony_ci	st	$acc0,[$rp_real+$i]
2689e1051a39Sopenharmony_ci	st	$acc2,[$rp_real+$i+4]
2690e1051a39Sopenharmony_ci	st	$acc1,[$rp_real+$i+8]
2691e1051a39Sopenharmony_ci	st	$acc3,[$rp_real+$i+12]
2692e1051a39Sopenharmony_ci___
2693e1051a39Sopenharmony_ci}
2694e1051a39Sopenharmony_ci$code.=<<___;
2695e1051a39Sopenharmony_ci.Ladd_done_vis3:
2696e1051a39Sopenharmony_ci	ret
2697e1051a39Sopenharmony_ci	restore
2698e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add_vis3,#function
2699e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2700e1051a39Sopenharmony_ci___
2701e1051a39Sopenharmony_ci}
2702e1051a39Sopenharmony_ci########################################################################
2703e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2704e1051a39Sopenharmony_ci#				     const P256_POINT_AFFINE *in2);
2705e1051a39Sopenharmony_ci{
# Stack layout for point_add_affine: fifteen consecutive 32-byte
# (256-bit) slots, named below in frame order. Each variable holds the
# byte offset of its slot (0, 32, ..., 448); the emitted code addresses
# a slot as %sp+LOCALS64+offset. The res_*, in1_* and in2_* groups are
# adjacent so that the final copy loops can walk them with one running
# offset. The frame additionally reserves some space for the !in1infty
# and !in2infty flags.
my @slot_offsets = map { 32 * $_ } 0 .. 14;
my ($res_x, $res_y, $res_z,
    $in1_x, $in1_y, $in1_z,
    $in2_x, $in2_y,
    $U2, $S2, $H, $R, $Hsqr, $Hcub, $Rsqr) = @slot_offsets;
# Z1sqr has no slot of its own: it reuses the offset of S2.
my $Z1sqr = $S2;
2714e1051a39Sopenharmony_ci
# Emit the first part of ecp_nistz256_point_add_affine_vis3: everything
# up to (but not including) the final conditional moves into the result.
# The emitted code, in order:
#   1. executes `save` with a frame of STACK64_FRAME+32*15+32 bytes (the
#      fifteen 32-byte slots declared above plus 32 more bytes) and sets
#      up $minus1, $poly1 and $poly3;
#   2. packs each pair of 32-bit input words into one 64-bit word (the
#      word at the higher address becomes the upper half) and stores
#      in2_x/in2_y and in1_x/in1_y/in1_z in their slots;
#   3. records two flags: all-ones if any bit of in2_x|in2_y is set,
#      else 0, at [%fp+STACK_BIAS-8] ("!in2infty"), and the same test on
#      in1_z at [%fp+STACK_BIAS-16] ("!in1infty");
#   4. runs the field operations named in the "!" comments, handing
#      operands to the __ecp_nistz256_*_vis3 helpers in $a0-$a3, $bi,
#      $bp and $rp ("forward load" appears to mark operands loaded ahead
#      of the call that consumes them -- the helpers' register usage is
#      not visible in this part of the file);
#   5. reloads both flags into $t1/$t2 and points $bp at .Lone_mont_vis3
#      (`call .+8` leaves the address of label 1 in %o7, and the delay
#      slot adds the label distance).
# Everything between the heredoc markers is assembler text; only the
# Perl variables are interpolated.
2715e1051a39Sopenharmony_ci$code.=<<___;
2716e1051a39Sopenharmony_ci.align	32
2717e1051a39Sopenharmony_ciecp_nistz256_point_add_affine_vis3:
2718e1051a39Sopenharmony_ci	save	%sp,-STACK64_FRAME-32*15-32,%sp
2719e1051a39Sopenharmony_ci
2720e1051a39Sopenharmony_ci	mov	$rp,$rp_real
2721e1051a39Sopenharmony_ci	mov	-1,$minus1
2722e1051a39Sopenharmony_ci	mov	-2,$poly3
2723e1051a39Sopenharmony_ci	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
2724e1051a39Sopenharmony_ci	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE
2725e1051a39Sopenharmony_ci
2726e1051a39Sopenharmony_ci	! convert input to uint64_t[4]
2727e1051a39Sopenharmony_ci	ld	[$bp],$a0			! in2_x
2728e1051a39Sopenharmony_ci	ld	[$bp+4],$t0
2729e1051a39Sopenharmony_ci	ld	[$bp+8],$a1
2730e1051a39Sopenharmony_ci	ld	[$bp+12],$t1
2731e1051a39Sopenharmony_ci	ld	[$bp+16],$a2
2732e1051a39Sopenharmony_ci	ld	[$bp+20],$t2
2733e1051a39Sopenharmony_ci	ld	[$bp+24],$a3
2734e1051a39Sopenharmony_ci	ld	[$bp+28],$t3
2735e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2736e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2737e1051a39Sopenharmony_ci	ld	[$bp+32],$acc0			! in2_y
2738e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2739e1051a39Sopenharmony_ci	ld	[$bp+32+4],$t0
2740e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2741e1051a39Sopenharmony_ci	ld	[$bp+32+8],$acc1
2742e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2743e1051a39Sopenharmony_ci	ld	[$bp+32+12],$t1
2744e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2745e1051a39Sopenharmony_ci	ld	[$bp+32+16],$acc2
2746e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2747e1051a39Sopenharmony_ci	ld	[$bp+32+20],$t2
2748e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2749e1051a39Sopenharmony_ci	ld	[$bp+32+24],$acc3
2750e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2751e1051a39Sopenharmony_ci	ld	[$bp+32+28],$t3
2752e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2753e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in2_x]
2754e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2755e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in2_x+8]
2756e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2757e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in2_x+16]
2758e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2759e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in2_x+24]
2760e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2761e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in2_y]
2762e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2763e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in2_y+8]
2764e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2765e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in2_y+16]
2766e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in2_y+24]
2767e1051a39Sopenharmony_ci
2768e1051a39Sopenharmony_ci	or	$a1,$a0,$a0
2769e1051a39Sopenharmony_ci	or	$a3,$a2,$a2
2770e1051a39Sopenharmony_ci	or	$acc1,$acc0,$acc0
2771e1051a39Sopenharmony_ci	or	$acc3,$acc2,$acc2
2772e1051a39Sopenharmony_ci	or	$a2,$a0,$a0
2773e1051a39Sopenharmony_ci	or	$acc2,$acc0,$acc0
2774e1051a39Sopenharmony_ci	or	$acc0,$a0,$a0
2775e1051a39Sopenharmony_ci	movrnz	$a0,-1,$a0			! !in2infty
2776e1051a39Sopenharmony_ci	stx	$a0,[%fp+STACK_BIAS-8]
2777e1051a39Sopenharmony_ci
2778e1051a39Sopenharmony_ci	ld	[$ap],$a0			! in1_x
2779e1051a39Sopenharmony_ci	ld	[$ap+4],$t0
2780e1051a39Sopenharmony_ci	ld	[$ap+8],$a1
2781e1051a39Sopenharmony_ci	ld	[$ap+12],$t1
2782e1051a39Sopenharmony_ci	ld	[$ap+16],$a2
2783e1051a39Sopenharmony_ci	ld	[$ap+20],$t2
2784e1051a39Sopenharmony_ci	ld	[$ap+24],$a3
2785e1051a39Sopenharmony_ci	ld	[$ap+28],$t3
2786e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2787e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2788e1051a39Sopenharmony_ci	ld	[$ap+32],$acc0			! in1_y
2789e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2790e1051a39Sopenharmony_ci	ld	[$ap+32+4],$t0
2791e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2792e1051a39Sopenharmony_ci	ld	[$ap+32+8],$acc1
2793e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2794e1051a39Sopenharmony_ci	ld	[$ap+32+12],$t1
2795e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2796e1051a39Sopenharmony_ci	ld	[$ap+32+16],$acc2
2797e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2798e1051a39Sopenharmony_ci	ld	[$ap+32+20],$t2
2799e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2800e1051a39Sopenharmony_ci	ld	[$ap+32+24],$acc3
2801e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2802e1051a39Sopenharmony_ci	ld	[$ap+32+28],$t3
2803e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2804e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in1_x]
2805e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2806e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in1_x+8]
2807e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2808e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in1_x+16]
2809e1051a39Sopenharmony_ci	or	$acc0,$t0,$acc0
2810e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in1_x+24]
2811e1051a39Sopenharmony_ci	or	$acc1,$t1,$acc1
2812e1051a39Sopenharmony_ci	stx	$acc0,[%sp+LOCALS64+$in1_y]
2813e1051a39Sopenharmony_ci	or	$acc2,$t2,$acc2
2814e1051a39Sopenharmony_ci	stx	$acc1,[%sp+LOCALS64+$in1_y+8]
2815e1051a39Sopenharmony_ci	or	$acc3,$t3,$acc3
2816e1051a39Sopenharmony_ci	stx	$acc2,[%sp+LOCALS64+$in1_y+16]
2817e1051a39Sopenharmony_ci	stx	$acc3,[%sp+LOCALS64+$in1_y+24]
2818e1051a39Sopenharmony_ci
2819e1051a39Sopenharmony_ci	ld	[$ap+64],$a0			! in1_z
2820e1051a39Sopenharmony_ci	ld	[$ap+64+4],$t0
2821e1051a39Sopenharmony_ci	ld	[$ap+64+8],$a1
2822e1051a39Sopenharmony_ci	ld	[$ap+64+12],$t1
2823e1051a39Sopenharmony_ci	ld	[$ap+64+16],$a2
2824e1051a39Sopenharmony_ci	ld	[$ap+64+20],$t2
2825e1051a39Sopenharmony_ci	ld	[$ap+64+24],$a3
2826e1051a39Sopenharmony_ci	ld	[$ap+64+28],$t3
2827e1051a39Sopenharmony_ci	sllx	$t0,32,$t0
2828e1051a39Sopenharmony_ci	sllx	$t1,32,$t1
2829e1051a39Sopenharmony_ci	or	$a0,$t0,$a0
2830e1051a39Sopenharmony_ci	sllx	$t2,32,$t2
2831e1051a39Sopenharmony_ci	or	$a1,$t1,$a1
2832e1051a39Sopenharmony_ci	sllx	$t3,32,$t3
2833e1051a39Sopenharmony_ci	stx	$a0,[%sp+LOCALS64+$in1_z]
2834e1051a39Sopenharmony_ci	or	$a2,$t2,$a2
2835e1051a39Sopenharmony_ci	stx	$a1,[%sp+LOCALS64+$in1_z+8]
2836e1051a39Sopenharmony_ci	or	$a3,$t3,$a3
2837e1051a39Sopenharmony_ci	stx	$a2,[%sp+LOCALS64+$in1_z+16]
2838e1051a39Sopenharmony_ci	stx	$a3,[%sp+LOCALS64+$in1_z+24]
2839e1051a39Sopenharmony_ci
2840e1051a39Sopenharmony_ci	or	$a1,$a0,$t0
2841e1051a39Sopenharmony_ci	or	$a3,$a2,$t2
2842e1051a39Sopenharmony_ci	or	$t2,$t0,$t0
2843e1051a39Sopenharmony_ci	movrnz	$t0,-1,$t0			! !in1infty
2844e1051a39Sopenharmony_ci	stx	$t0,[%fp+STACK_BIAS-16]
2845e1051a39Sopenharmony_ci
2846e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z1sqr, in1_z);
2847e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z1sqr,$rp
2848e1051a39Sopenharmony_ci
2849e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_x],$bi
2850e1051a39Sopenharmony_ci	mov	$acc0,$a0
2851e1051a39Sopenharmony_ci	mov	$acc1,$a1
2852e1051a39Sopenharmony_ci	mov	$acc2,$a2
2853e1051a39Sopenharmony_ci	mov	$acc3,$a3
2854e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$in2_x,$bp
2855e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, Z1sqr, in2_x);
2856e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U2,$rp
2857e1051a39Sopenharmony_ci
2858e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Z1sqr],$bi	! forward load
2859e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z],$a0
2860e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2861e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2862e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2863e1051a39Sopenharmony_ci
2864e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$in1_x,$bp
2865e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(H, U2, in1_x);
2866e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$H,$rp
2867e1051a39Sopenharmony_ci
2868e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Z1sqr,$bp
2869e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, Z1sqr, in1_z);
2870e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$rp
2871e1051a39Sopenharmony_ci
2872e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H],$bi
2873e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z],$a0
2874e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2875e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2876e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2877e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$H,$bp
2878e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, H, in1_z);
2879e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_z,$rp
2880e1051a39Sopenharmony_ci
2881e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$S2],$bi
2882e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y],$a0
2883e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y+8],$a1
2884e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y+16],$a2
2885e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in2_y+24],$a3
2886e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$bp
2887e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S2, in2_y);
2888e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$rp
2889e1051a39Sopenharmony_ci
2890e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H],$a0		! forward load
2891e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H+8],$a1
2892e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H+16],$a2
2893e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H+24],$a3
2894e1051a39Sopenharmony_ci
2895e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$in1_y,$bp
2896e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(R, S2, in1_y);
2897e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$R,$rp
2898e1051a39Sopenharmony_ci
2899e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Hsqr, H);
2900e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hsqr,$rp
2901e1051a39Sopenharmony_ci
2902e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R],$a0
2903e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R+8],$a1
2904e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R+16],$a2
2905e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R+24],$a3
2906e1051a39Sopenharmony_ci	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Rsqr, R);
2907e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Rsqr,$rp
2908e1051a39Sopenharmony_ci
2909e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$H],$bi
2910e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr],$a0
2911e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
2912e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
2913e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
2914e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$H,$bp
2915e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(Hcub, Hsqr, H);
2916e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hcub,$rp
2917e1051a39Sopenharmony_ci
2918e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hsqr],$bi
2919e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x],$a0
2920e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+8],$a1
2921e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+16],$a2
2922e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_x+24],$a3
2923e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hsqr,$bp
2924e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, in1_x, Hsqr);
2925e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U2,$rp
2926e1051a39Sopenharmony_ci
2927e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(Hsqr, U2);
2928e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hsqr,$rp
2929e1051a39Sopenharmony_ci
2930e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Rsqr,$bp
2931e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_x, Rsqr, Hsqr);
2932e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_x,$rp
2933e1051a39Sopenharmony_ci
2934e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hcub,$bp
2935e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	!  p256_sub(res_x, res_x, Hcub);
2936e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_x,$rp
2937e1051a39Sopenharmony_ci
2938e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$Hcub],$bi	! forward load
2939e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y],$a0
2940e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y+8],$a1
2941e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y+16],$a2
2942e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$in1_y+24],$a3
2943e1051a39Sopenharmony_ci
2944e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$U2,$bp
2945e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_y, U2, res_x);
2946e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2947e1051a39Sopenharmony_ci
2948e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$Hcub,$bp
2949e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, in1_y, Hcub);
2950e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$rp
2951e1051a39Sopenharmony_ci
2952e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$R],$bi
2953e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y],$a0
2954e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y+8],$a1
2955e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y+16],$a2
2956e1051a39Sopenharmony_ci	ldx	[%sp+LOCALS64+$res_y+24],$a3
2957e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$R,$bp
2958e1051a39Sopenharmony_ci	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_y, res_y, R);
2959e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2960e1051a39Sopenharmony_ci
2961e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$S2,$bp
2962e1051a39Sopenharmony_ci	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, res_y, S2);
2963e1051a39Sopenharmony_ci	add	%sp,LOCALS64+$res_y,$rp
2964e1051a39Sopenharmony_ci
2965e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-16],$t1		! !in1infty
2966e1051a39Sopenharmony_ci	ldx	[%fp+STACK_BIAS-8],$t2		! !in2infty
2967e1051a39Sopenharmony_ci1:	call	.+8
2968e1051a39Sopenharmony_ci	add	%o7,.Lone_mont_vis3-1b,$bp
2969e1051a39Sopenharmony_ci___
# Conditional moves, 16 bytes of the result per iteration. Each chunk
# starts out as the computed res value, is replaced by the in2 chunk
# when the !in1infty flag ($t1) is zero, then by the in1 chunk when the
# !in2infty flag ($t2) is zero, and is finally written to the caller's
# buffer as four 32-bit words (low half of each 64-bit word first).
#
# Bytes 0..63 (x and y): all three candidates come from the stack frame.
for my $off (0, 16, 32, 48) {
$code.=<<___;
	ldx	[%sp+LOCALS64+$res_x+$off],$acc0	! res
	ldx	[%sp+LOCALS64+$res_x+$off+8],$acc1
	ldx	[%sp+LOCALS64+$in2_x+$off],$acc2	! in2
	ldx	[%sp+LOCALS64+$in2_x+$off+8],$acc3
	ldx	[%sp+LOCALS64+$in1_x+$off],$acc4	! in1
	ldx	[%sp+LOCALS64+$in1_x+$off+8],$acc5
	movrz	$t1,$acc2,$acc0
	movrz	$t1,$acc3,$acc1
	movrz	$t2,$acc4,$acc0
	movrz	$t2,$acc5,$acc1
	srlx	$acc0,32,$acc2
	srlx	$acc1,32,$acc3
	st	$acc0,[$rp_real+$off]
	st	$acc2,[$rp_real+$off+4]
	st	$acc1,[$rp_real+$off+8]
	st	$acc3,[$rp_real+$off+12]
___
}
# Bytes 64..95 (z): in2 has no z slot in the frame, so the "in2"
# candidate is read through $bp at offsets 0..31 instead.
for my $off (64, 80) {
$code.=<<___;
	ldx	[%sp+LOCALS64+$res_x+$off],$acc0	! res
	ldx	[%sp+LOCALS64+$res_x+$off+8],$acc1
	ldx	[$bp+$off-64],$acc2		! "in2"
	ldx	[$bp+$off-64+8],$acc3
	ldx	[%sp+LOCALS64+$in1_x+$off],$acc4	! in1
	ldx	[%sp+LOCALS64+$in1_x+$off+8],$acc5
	movrz	$t1,$acc2,$acc0
	movrz	$t1,$acc3,$acc1
	movrz	$t2,$acc4,$acc0
	movrz	$t2,$acc5,$acc1
	srlx	$acc0,32,$acc2
	srlx	$acc1,32,$acc3
	st	$acc0,[$rp_real+$off]
	st	$acc2,[$rp_real+$off+4]
	st	$acc1,[$rp_real+$off+8]
	st	$acc3,[$rp_real+$off+12]
___
}
# Emit the return sequence (ret, with restore in its delay slot), the
# symbol type/size directives for ecp_nistz256_point_add_affine_vis3,
# and the 64-byte-aligned .Lone_mont_vis3 table: eight 32-bit words
# (32 bytes), which the conditional moves above read as four 64-bit
# words in place of an in2 z coordinate. Judging by the label this is
# the value 1 in Montgomery form -- not verified here.
3010e1051a39Sopenharmony_ci$code.=<<___;
3011e1051a39Sopenharmony_ci	ret
3012e1051a39Sopenharmony_ci	restore
3013e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add_affine_vis3,#function
3014e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
3015e1051a39Sopenharmony_ci.align	64
3016e1051a39Sopenharmony_ci.Lone_mont_vis3:
3017e1051a39Sopenharmony_ci.long	0x00000000,0x00000001, 0xffffffff,0x00000000
3018e1051a39Sopenharmony_ci.long	0xffffffff,0xffffffff, 0x00000000,0xfffffffe
3019e1051a39Sopenharmony_ci.align	64
3020e1051a39Sopenharmony_ci___
3021e1051a39Sopenharmony_ci}								}}}
3022e1051a39Sopenharmony_ci
# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that the module can be compiled without having to specify
# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
# -xarch=v9a. The idea is to keep open the option of producing a
# "universal" binary and to let the programmer detect at run-time whether
# the current CPU is VIS capable.
# unvis3 -- hand-assemble one three-register VIS3 instruction.
#
# Arguments: ($mnemonic, $rs1, $rs2, $rd), where the three operands are
# register names as written in the assembly source, e.g. "%g1" or "%o5".
#
# Returns ".word\t0x<8 hex digits> !<original instruction text>" when
# $mnemonic is addxc, addxccc or umulxhi and every operand is exactly one
# of %g0-%g7, %o0-%o7, %l0-%l7 or %i0-%i7. In every other case the
# instruction text "$mnemonic\t$rs1,$rs2,$rd" is returned unchanged, so
# that the assembler deals with it (or rejects it) itself.
#
# Operands are matched in full and limited to digits 0-7: a looser match
# would quietly turn "%g8" into register 8 (%o0) and "%g10" into %g1,
# emitting a valid-looking but wrong opcode.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # First register number of each window-relative bank.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    # Value of the opf field for each supported mnemonic.
    my %visopf = (
        "addxc"   => 0x011,
        "addxccc" => 0x013,
        "umulxhi" => 0x016,
    );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    # Translate "%<bank><digit>" into a register number 0..31, in the
    # order rs1, rs2, rd.
    my @regno;
    foreach my $operand ($rs1, $rs2, $rd) {
        return $ref unless $operand =~ /\A%([goli])([0-7])\z/;
        push @regno, $bias{$1} + $2;
    }
    my ($rs1_no, $rs2_no, $rd_no) = @regno;

    # Field positions: rd at bit 25, rs1 at bit 14, opf at bit 5, rs2 at
    # bit 0; 0x81b00000 supplies the fixed bits (op = 2, op3 = 0x36).
    return sprintf ".word\t0x%08x !%s",
        0x81b00000 | $rd_no << 25 | $rs1_no << 14 | $opf << 5 | $rs2_no,
        $ref;
}
3051e1051a39Sopenharmony_ci
# Post-process the accumulated assembly one line at a time and write it
# to STDOUT.
my $backtick_expr = qr/\`([^\`]*)\`/;
my $vis3_insn     = qr/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/;

for (split("\n", $code)) {
    # Each `...` span is replaced by the result of evaluating its
    # contents as Perl.
    s/$backtick_expr/eval $1/ge;

    # Three-register umulxhi/addxc* instructions are handed to unvis3,
    # which returns either an encoded .word or the original text.
    s/$vis3_insn/unvis3($1, $2, $3, $4)/ge;

    print $_, "\n";
}

# Output errors on a buffered handle may only surface at close time.
close(STDOUT) or die "error closing STDOUT: $!";
3063