1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci#
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# ECP_NISTZ256 module for PPC64.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# August 2016.
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22e1051a39Sopenharmony_ci# http://eprint.iacr.org/2013/816.
23e1051a39Sopenharmony_ci#
24e1051a39Sopenharmony_ci#			with/without -DECP_NISTZ256_ASM
25e1051a39Sopenharmony_ci# POWER7		+260-530%
26e1051a39Sopenharmony_ci# POWER8		+220-340%
27e1051a39Sopenharmony_ci
# ----------------------------------------------------------------------
# Command line and output plumbing: take the output file name and the
# assembler flavour from @ARGV, locate the ppc-xlate.pl translator and
# redirect STDOUT into it.
# ----------------------------------------------------------------------
28e1051a39Sopenharmony_ci# $output is the last argument if it looks like a file (it has an extension)
29e1051a39Sopenharmony_ci# $flavour is the first argument if it doesn't look like a file
30e1051a39Sopenharmony_ci$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
31e1051a39Sopenharmony_ci$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
32e1051a39Sopenharmony_ci
# $dir becomes the directory part of the script path in $0, including the
# trailing '/' or '\'.  The translator is looked for next to this script
# first and then in ../../perlasm/ relative to it; the script dies if
# neither candidate is a plain file.
33e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34e1051a39Sopenharmony_ci( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
35e1051a39Sopenharmony_ci( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
36e1051a39Sopenharmony_cidie "can't locate ppc-xlate.pl";
37e1051a39Sopenharmony_ci
# Start the translator as a child process (run by the current perl, $^X)
# with $flavour and the quoted $output as arguments, then alias STDOUT to
# the pipe so that everything printed to STDOUT afterwards is fed to it.
# NOTE(review): $flavour and $output are undef when not supplied on the
# command line and are interpolated into the command as-is.
38e1051a39Sopenharmony_ciopen OUT,"| \"$^X\" $xlate $flavour \"$output\""
39e1051a39Sopenharmony_ci    or die "can't call $xlate: $!";
40e1051a39Sopenharmony_ci*STDOUT=*OUT;
41e1051a39Sopenharmony_ci
# Symbolic name for r1, which the emitted code uses as the stack pointer
# (see the "stdu $sp,-128($sp)" / "addi $sp,$sp,128" pairs below).
42e1051a39Sopenharmony_cimy $sp="r1";
43e1051a39Sopenharmony_ci
# Opens a lexical scope for the register names declared next; the matching
# closing brace lies beyond the part of the file shown here.
44e1051a39Sopenharmony_ci{
# Register allocation, assigned in list order:
#   r3-r12  : $rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3
#   r22-r31 : $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3
45e1051a39Sopenharmony_cimy ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
46e1051a39Sopenharmony_ci    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
47e1051a39Sopenharmony_ci    map("r$_",(3..12,22..31));
48e1051a39Sopenharmony_ci
# $acc6/$acc7 are aliases for the registers of $bp/$bi (r5/r6), so code
# that writes $acc6/$acc7 overwrites the b pointer and the current b word.
49e1051a39Sopenharmony_cimy ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont
50e1051a39Sopenharmony_ci
# Generated assembly is accumulated in $code; it starts with the
# .machine and .text directives.
51e1051a39Sopenharmony_ci$code.=<<___;
52e1051a39Sopenharmony_ci.machine	"any"
53e1051a39Sopenharmony_ci.text
54e1051a39Sopenharmony_ci___
55e1051a39Sopenharmony_ci########################################################################
56e1051a39Sopenharmony_ci# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
57e1051a39Sopenharmony_ci#
# Recompute $dir from $0 (same expression as at the top of the script) and
# open the table source: first from the current directory, then from the
# parent of the script's directory.  Die with the OS error if both fail.
58e1051a39Sopenharmony_ci$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59e1051a39Sopenharmony_ciopen TABLE,"<ecp_nistz256_table.c"		or
60e1051a39Sopenharmony_ciopen TABLE,"<${dir}../ecp_nistz256_table.c"	or
61e1051a39Sopenharmony_cidie "failed to open ecp_nistz256_table.c:",$!;
62e1051a39Sopenharmony_ci
# Integer arithmetic from here on, so that the $i/4 used when emitting the
# table is an integer division.
63e1051a39Sopenharmony_ciuse integer;
64e1051a39Sopenharmony_ci
# Read every line of the table and, for each TOBN(hi, lo) occurrence, push
# the second argument followed by the first onto @arr.  The substitution is
# used only for its /e side effect (the push); the modified line is
# discarded.
65e1051a39Sopenharmony_ciforeach(<TABLE>) {
66e1051a39Sopenharmony_ci	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
67e1051a39Sopenharmony_ci}
68e1051a39Sopenharmony_ciclose TABLE;
69e1051a39Sopenharmony_ci
70e1051a39Sopenharmony_ci# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
71e1051a39Sopenharmony_ci# 64*16*37-1 is because $#arr returns last valid index of @arr, not
72e1051a39Sopenharmony_ci# number of elements.
73e1051a39Sopenharmony_cidie "insane number of elements" if ($#arr != 64*16*37-1);
74e1051a39Sopenharmony_ci
# Emit the header of the global data object ecp_nistz256_precomputed; the
# .byte rows generated by the loop below form its contents.
75e1051a39Sopenharmony_ci$code.=<<___;
76e1051a39Sopenharmony_ci.type	ecp_nistz256_precomputed,\@object
77e1051a39Sopenharmony_ci.globl	ecp_nistz256_precomputed
78e1051a39Sopenharmony_ci.align	12
79e1051a39Sopenharmony_ciecp_nistz256_precomputed:
80e1051a39Sopenharmony_ci___
81e1051a39Sopenharmony_ci########################################################################
82e1051a39Sopenharmony_ci# this conversion smashes P256_POINT_AFFINE by individual bytes with
83e1051a39Sopenharmony_ci# 64 byte interval, similar to
84e1051a39Sopenharmony_ci#	1111222233334444
85e1051a39Sopenharmony_ci#	1234123412341234
# For each of the 37 sub-tables, take 64*16 words off the front of @arr
# (64 entries of 16 words each) and emit 64 rows of 64 bytes: row $i holds
# byte $i of every entry $j.  Byte $i is taken from word $i/4 of the entry,
# shifted right by ($i%4)*8, i.e. least significant byte of each word first.
# NOTE(review): @tbl[...] is a one-element slice; $tbl[...] would be the
# conventional spelling for a single element.
86e1051a39Sopenharmony_cifor(1..37) {
87e1051a39Sopenharmony_ci	@tbl = splice(@arr,0,64*16);
88e1051a39Sopenharmony_ci	for($i=0;$i<64;$i++) {
89e1051a39Sopenharmony_ci		undef @line;
90e1051a39Sopenharmony_ci		for($j=0;$j<64;$j++) {
91e1051a39Sopenharmony_ci			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
92e1051a39Sopenharmony_ci		}
93e1051a39Sopenharmony_ci		$code.=".byte\t";
94e1051a39Sopenharmony_ci		$code.=join(',',map { sprintf "0x%02x",$_} @line);
95e1051a39Sopenharmony_ci		$code.="\n";
96e1051a39Sopenharmony_ci	}
97e1051a39Sopenharmony_ci}
98e1051a39Sopenharmony_ci
# ----------------------------------------------------------------------
# One heredoc that closes the precomputed table object (.size) and emits
# the public entry points ecp_nistz256_mul_mont, _sqr_mont, _add,
# _div_by_2, _mul_by_2, _mul_by_3, _sub and _neg, followed by the first
# pass of the internal __ecp_nistz256_mul_mont.
#
# Every entry point follows the same pattern:
#   - allocate a 128-byte frame with stdu and copy the link register to r0
#     (it stays in r0 across the bl and is moved back with mtlr);
#   - save r22-r31 (mul_mont, sqr_mont) or only r28-r31 (the others) in
#     the frame;
#   - load the operand words the helper expects in registers;
#   - build $poly1 = 0x00000000ffffffff and $poly3 = 0xffffffff00000001;
#   - call the internal helper, restore the saved registers, pop the frame.
#
# ecp_nistz256_mul_by_2 copies a into $t0-$t3 and calls __ecp_nistz256_add
# (a+a).  ecp_nistz256_mul_by_3 additionally spills a to 64..88($sp),
# reloads it into $t0-$t3 after the first add and adds once more.
# ecp_nistz256_neg zeroes $acc0-$acc3, points $bp at the input and calls
# __ecp_nistz256_sub_from.
# NOTE(review): __ecp_nistz256_sub_from and __ecp_nistz256_div_by_2 are
# not defined in this part of the file; from the call sites they presumably
# take their first operand in $acc0-$acc3 (sub_from reading the second via
# $bp) -- confirm against their definitions.
#
# The heredoc ends inside __ecp_nistz256_mul_mont after a[0..3]*b[0] has
# been formed in $acc0-$acc4, b[1] has been loaded into $bi and $t0/$t1
# hold $acc0<<32 and $acc0>>32 for the first reduction step.
# ----------------------------------------------------------------------
99e1051a39Sopenharmony_ci$code.=<<___;
100e1051a39Sopenharmony_ci.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
101e1051a39Sopenharmony_ci.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
102e1051a39Sopenharmony_ci
103e1051a39Sopenharmony_ci# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
104e1051a39Sopenharmony_ci#					     const BN_ULONG x2[4]);
105e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_mont
106e1051a39Sopenharmony_ci.align	5
107e1051a39Sopenharmony_ciecp_nistz256_mul_mont:
108e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
109e1051a39Sopenharmony_ci	mflr	r0
110e1051a39Sopenharmony_ci	std	r22,48($sp)
111e1051a39Sopenharmony_ci	std	r23,56($sp)
112e1051a39Sopenharmony_ci	std	r24,64($sp)
113e1051a39Sopenharmony_ci	std	r25,72($sp)
114e1051a39Sopenharmony_ci	std	r26,80($sp)
115e1051a39Sopenharmony_ci	std	r27,88($sp)
116e1051a39Sopenharmony_ci	std	r28,96($sp)
117e1051a39Sopenharmony_ci	std	r29,104($sp)
118e1051a39Sopenharmony_ci	std	r30,112($sp)
119e1051a39Sopenharmony_ci	std	r31,120($sp)
120e1051a39Sopenharmony_ci
121e1051a39Sopenharmony_ci	ld	$a0,0($ap)
122e1051a39Sopenharmony_ci	ld	$bi,0($bp)
123e1051a39Sopenharmony_ci	ld	$a1,8($ap)
124e1051a39Sopenharmony_ci	ld	$a2,16($ap)
125e1051a39Sopenharmony_ci	ld	$a3,24($ap)
126e1051a39Sopenharmony_ci
127e1051a39Sopenharmony_ci	li	$poly1,-1
128e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
129e1051a39Sopenharmony_ci	li	$poly3,1
130e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
131e1051a39Sopenharmony_ci
132e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont
133e1051a39Sopenharmony_ci
134e1051a39Sopenharmony_ci	mtlr	r0
135e1051a39Sopenharmony_ci	ld	r22,48($sp)
136e1051a39Sopenharmony_ci	ld	r23,56($sp)
137e1051a39Sopenharmony_ci	ld	r24,64($sp)
138e1051a39Sopenharmony_ci	ld	r25,72($sp)
139e1051a39Sopenharmony_ci	ld	r26,80($sp)
140e1051a39Sopenharmony_ci	ld	r27,88($sp)
141e1051a39Sopenharmony_ci	ld	r28,96($sp)
142e1051a39Sopenharmony_ci	ld	r29,104($sp)
143e1051a39Sopenharmony_ci	ld	r30,112($sp)
144e1051a39Sopenharmony_ci	ld	r31,120($sp)
145e1051a39Sopenharmony_ci	addi	$sp,$sp,128
146e1051a39Sopenharmony_ci	blr
147e1051a39Sopenharmony_ci	.long	0
148e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,10,3,0
149e1051a39Sopenharmony_ci	.long	0
150e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
151e1051a39Sopenharmony_ci
152e1051a39Sopenharmony_ci# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
153e1051a39Sopenharmony_ci.globl	ecp_nistz256_sqr_mont
154e1051a39Sopenharmony_ci.align	4
155e1051a39Sopenharmony_ciecp_nistz256_sqr_mont:
156e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
157e1051a39Sopenharmony_ci	mflr	r0
158e1051a39Sopenharmony_ci	std	r22,48($sp)
159e1051a39Sopenharmony_ci	std	r23,56($sp)
160e1051a39Sopenharmony_ci	std	r24,64($sp)
161e1051a39Sopenharmony_ci	std	r25,72($sp)
162e1051a39Sopenharmony_ci	std	r26,80($sp)
163e1051a39Sopenharmony_ci	std	r27,88($sp)
164e1051a39Sopenharmony_ci	std	r28,96($sp)
165e1051a39Sopenharmony_ci	std	r29,104($sp)
166e1051a39Sopenharmony_ci	std	r30,112($sp)
167e1051a39Sopenharmony_ci	std	r31,120($sp)
168e1051a39Sopenharmony_ci
169e1051a39Sopenharmony_ci	ld	$a0,0($ap)
170e1051a39Sopenharmony_ci	ld	$a1,8($ap)
171e1051a39Sopenharmony_ci	ld	$a2,16($ap)
172e1051a39Sopenharmony_ci	ld	$a3,24($ap)
173e1051a39Sopenharmony_ci
174e1051a39Sopenharmony_ci	li	$poly1,-1
175e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
176e1051a39Sopenharmony_ci	li	$poly3,1
177e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
178e1051a39Sopenharmony_ci
179e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont
180e1051a39Sopenharmony_ci
181e1051a39Sopenharmony_ci	mtlr	r0
182e1051a39Sopenharmony_ci	ld	r22,48($sp)
183e1051a39Sopenharmony_ci	ld	r23,56($sp)
184e1051a39Sopenharmony_ci	ld	r24,64($sp)
185e1051a39Sopenharmony_ci	ld	r25,72($sp)
186e1051a39Sopenharmony_ci	ld	r26,80($sp)
187e1051a39Sopenharmony_ci	ld	r27,88($sp)
188e1051a39Sopenharmony_ci	ld	r28,96($sp)
189e1051a39Sopenharmony_ci	ld	r29,104($sp)
190e1051a39Sopenharmony_ci	ld	r30,112($sp)
191e1051a39Sopenharmony_ci	ld	r31,120($sp)
192e1051a39Sopenharmony_ci	addi	$sp,$sp,128
193e1051a39Sopenharmony_ci	blr
194e1051a39Sopenharmony_ci	.long	0
195e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,10,2,0
196e1051a39Sopenharmony_ci	.long	0
197e1051a39Sopenharmony_ci.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
198e1051a39Sopenharmony_ci
199e1051a39Sopenharmony_ci# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
200e1051a39Sopenharmony_ci#					const BN_ULONG x2[4]);
201e1051a39Sopenharmony_ci.globl	ecp_nistz256_add
202e1051a39Sopenharmony_ci.align	4
203e1051a39Sopenharmony_ciecp_nistz256_add:
204e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
205e1051a39Sopenharmony_ci	mflr	r0
206e1051a39Sopenharmony_ci	std	r28,96($sp)
207e1051a39Sopenharmony_ci	std	r29,104($sp)
208e1051a39Sopenharmony_ci	std	r30,112($sp)
209e1051a39Sopenharmony_ci	std	r31,120($sp)
210e1051a39Sopenharmony_ci
211e1051a39Sopenharmony_ci	ld	$acc0,0($ap)
212e1051a39Sopenharmony_ci	ld	$t0,  0($bp)
213e1051a39Sopenharmony_ci	ld	$acc1,8($ap)
214e1051a39Sopenharmony_ci	ld	$t1,  8($bp)
215e1051a39Sopenharmony_ci	ld	$acc2,16($ap)
216e1051a39Sopenharmony_ci	ld	$t2,  16($bp)
217e1051a39Sopenharmony_ci	ld	$acc3,24($ap)
218e1051a39Sopenharmony_ci	ld	$t3,  24($bp)
219e1051a39Sopenharmony_ci
220e1051a39Sopenharmony_ci	li	$poly1,-1
221e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
222e1051a39Sopenharmony_ci	li	$poly3,1
223e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
224e1051a39Sopenharmony_ci
225e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add
226e1051a39Sopenharmony_ci
227e1051a39Sopenharmony_ci	mtlr	r0
228e1051a39Sopenharmony_ci	ld	r28,96($sp)
229e1051a39Sopenharmony_ci	ld	r29,104($sp)
230e1051a39Sopenharmony_ci	ld	r30,112($sp)
231e1051a39Sopenharmony_ci	ld	r31,120($sp)
232e1051a39Sopenharmony_ci	addi	$sp,$sp,128
233e1051a39Sopenharmony_ci	blr
234e1051a39Sopenharmony_ci	.long	0
235e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,4,3,0
236e1051a39Sopenharmony_ci	.long	0
237e1051a39Sopenharmony_ci.size	ecp_nistz256_add,.-ecp_nistz256_add
238e1051a39Sopenharmony_ci
239e1051a39Sopenharmony_ci# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
240e1051a39Sopenharmony_ci.globl	ecp_nistz256_div_by_2
241e1051a39Sopenharmony_ci.align	4
242e1051a39Sopenharmony_ciecp_nistz256_div_by_2:
243e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
244e1051a39Sopenharmony_ci	mflr	r0
245e1051a39Sopenharmony_ci	std	r28,96($sp)
246e1051a39Sopenharmony_ci	std	r29,104($sp)
247e1051a39Sopenharmony_ci	std	r30,112($sp)
248e1051a39Sopenharmony_ci	std	r31,120($sp)
249e1051a39Sopenharmony_ci
250e1051a39Sopenharmony_ci	ld	$acc0,0($ap)
251e1051a39Sopenharmony_ci	ld	$acc1,8($ap)
252e1051a39Sopenharmony_ci	ld	$acc2,16($ap)
253e1051a39Sopenharmony_ci	ld	$acc3,24($ap)
254e1051a39Sopenharmony_ci
255e1051a39Sopenharmony_ci	li	$poly1,-1
256e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
257e1051a39Sopenharmony_ci	li	$poly3,1
258e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
259e1051a39Sopenharmony_ci
260e1051a39Sopenharmony_ci	bl	__ecp_nistz256_div_by_2
261e1051a39Sopenharmony_ci
262e1051a39Sopenharmony_ci	mtlr	r0
263e1051a39Sopenharmony_ci	ld	r28,96($sp)
264e1051a39Sopenharmony_ci	ld	r29,104($sp)
265e1051a39Sopenharmony_ci	ld	r30,112($sp)
266e1051a39Sopenharmony_ci	ld	r31,120($sp)
267e1051a39Sopenharmony_ci	addi	$sp,$sp,128
268e1051a39Sopenharmony_ci	blr
269e1051a39Sopenharmony_ci	.long	0
270e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,4,2,0
271e1051a39Sopenharmony_ci	.long	0
272e1051a39Sopenharmony_ci.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
273e1051a39Sopenharmony_ci
274e1051a39Sopenharmony_ci# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
275e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_2
276e1051a39Sopenharmony_ci.align	4
277e1051a39Sopenharmony_ciecp_nistz256_mul_by_2:
278e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
279e1051a39Sopenharmony_ci	mflr	r0
280e1051a39Sopenharmony_ci	std	r28,96($sp)
281e1051a39Sopenharmony_ci	std	r29,104($sp)
282e1051a39Sopenharmony_ci	std	r30,112($sp)
283e1051a39Sopenharmony_ci	std	r31,120($sp)
284e1051a39Sopenharmony_ci
285e1051a39Sopenharmony_ci	ld	$acc0,0($ap)
286e1051a39Sopenharmony_ci	ld	$acc1,8($ap)
287e1051a39Sopenharmony_ci	ld	$acc2,16($ap)
288e1051a39Sopenharmony_ci	ld	$acc3,24($ap)
289e1051a39Sopenharmony_ci
290e1051a39Sopenharmony_ci	mr	$t0,$acc0
291e1051a39Sopenharmony_ci	mr	$t1,$acc1
292e1051a39Sopenharmony_ci	mr	$t2,$acc2
293e1051a39Sopenharmony_ci	mr	$t3,$acc3
294e1051a39Sopenharmony_ci
295e1051a39Sopenharmony_ci	li	$poly1,-1
296e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
297e1051a39Sopenharmony_ci	li	$poly3,1
298e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
299e1051a39Sopenharmony_ci
300e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# ret = a+a	// 2*a
301e1051a39Sopenharmony_ci
302e1051a39Sopenharmony_ci	mtlr	r0
303e1051a39Sopenharmony_ci	ld	r28,96($sp)
304e1051a39Sopenharmony_ci	ld	r29,104($sp)
305e1051a39Sopenharmony_ci	ld	r30,112($sp)
306e1051a39Sopenharmony_ci	ld	r31,120($sp)
307e1051a39Sopenharmony_ci	addi	$sp,$sp,128
308e1051a39Sopenharmony_ci	blr
309e1051a39Sopenharmony_ci	.long	0
310e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,4,3,0
311e1051a39Sopenharmony_ci	.long	0
312e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
313e1051a39Sopenharmony_ci
314e1051a39Sopenharmony_ci# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
315e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_3
316e1051a39Sopenharmony_ci.align	4
317e1051a39Sopenharmony_ciecp_nistz256_mul_by_3:
318e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
319e1051a39Sopenharmony_ci	mflr	r0
320e1051a39Sopenharmony_ci	std	r28,96($sp)
321e1051a39Sopenharmony_ci	std	r29,104($sp)
322e1051a39Sopenharmony_ci	std	r30,112($sp)
323e1051a39Sopenharmony_ci	std	r31,120($sp)
324e1051a39Sopenharmony_ci
325e1051a39Sopenharmony_ci	ld	$acc0,0($ap)
326e1051a39Sopenharmony_ci	ld	$acc1,8($ap)
327e1051a39Sopenharmony_ci	ld	$acc2,16($ap)
328e1051a39Sopenharmony_ci	ld	$acc3,24($ap)
329e1051a39Sopenharmony_ci
330e1051a39Sopenharmony_ci	mr	$t0,$acc0
331e1051a39Sopenharmony_ci	std	$acc0,64($sp)
332e1051a39Sopenharmony_ci	mr	$t1,$acc1
333e1051a39Sopenharmony_ci	std	$acc1,72($sp)
334e1051a39Sopenharmony_ci	mr	$t2,$acc2
335e1051a39Sopenharmony_ci	std	$acc2,80($sp)
336e1051a39Sopenharmony_ci	mr	$t3,$acc3
337e1051a39Sopenharmony_ci	std	$acc3,88($sp)
338e1051a39Sopenharmony_ci
339e1051a39Sopenharmony_ci	li	$poly1,-1
340e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
341e1051a39Sopenharmony_ci	li	$poly3,1
342e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
343e1051a39Sopenharmony_ci
344e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# ret = a+a	// 2*a
345e1051a39Sopenharmony_ci
346e1051a39Sopenharmony_ci	ld	$t0,64($sp)
347e1051a39Sopenharmony_ci	ld	$t1,72($sp)
348e1051a39Sopenharmony_ci	ld	$t2,80($sp)
349e1051a39Sopenharmony_ci	ld	$t3,88($sp)
350e1051a39Sopenharmony_ci
351e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a
352e1051a39Sopenharmony_ci
353e1051a39Sopenharmony_ci	mtlr	r0
354e1051a39Sopenharmony_ci	ld	r28,96($sp)
355e1051a39Sopenharmony_ci	ld	r29,104($sp)
356e1051a39Sopenharmony_ci	ld	r30,112($sp)
357e1051a39Sopenharmony_ci	ld	r31,120($sp)
358e1051a39Sopenharmony_ci	addi	$sp,$sp,128
359e1051a39Sopenharmony_ci	blr
360e1051a39Sopenharmony_ci	.long	0
361e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,4,2,0
362e1051a39Sopenharmony_ci	.long	0
363e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
364e1051a39Sopenharmony_ci
365e1051a39Sopenharmony_ci# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
366e1051a39Sopenharmony_ci#				        const BN_ULONG x2[4]);
367e1051a39Sopenharmony_ci.globl	ecp_nistz256_sub
368e1051a39Sopenharmony_ci.align	4
369e1051a39Sopenharmony_ciecp_nistz256_sub:
370e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
371e1051a39Sopenharmony_ci	mflr	r0
372e1051a39Sopenharmony_ci	std	r28,96($sp)
373e1051a39Sopenharmony_ci	std	r29,104($sp)
374e1051a39Sopenharmony_ci	std	r30,112($sp)
375e1051a39Sopenharmony_ci	std	r31,120($sp)
376e1051a39Sopenharmony_ci
377e1051a39Sopenharmony_ci	ld	$acc0,0($ap)
378e1051a39Sopenharmony_ci	ld	$acc1,8($ap)
379e1051a39Sopenharmony_ci	ld	$acc2,16($ap)
380e1051a39Sopenharmony_ci	ld	$acc3,24($ap)
381e1051a39Sopenharmony_ci
382e1051a39Sopenharmony_ci	li	$poly1,-1
383e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
384e1051a39Sopenharmony_ci	li	$poly3,1
385e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
386e1051a39Sopenharmony_ci
387e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from
388e1051a39Sopenharmony_ci
389e1051a39Sopenharmony_ci	mtlr	r0
390e1051a39Sopenharmony_ci	ld	r28,96($sp)
391e1051a39Sopenharmony_ci	ld	r29,104($sp)
392e1051a39Sopenharmony_ci	ld	r30,112($sp)
393e1051a39Sopenharmony_ci	ld	r31,120($sp)
394e1051a39Sopenharmony_ci	addi	$sp,$sp,128
395e1051a39Sopenharmony_ci	blr
396e1051a39Sopenharmony_ci	.long	0
397e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,4,3,0
398e1051a39Sopenharmony_ci	.long	0
399e1051a39Sopenharmony_ci.size	ecp_nistz256_sub,.-ecp_nistz256_sub
400e1051a39Sopenharmony_ci
401e1051a39Sopenharmony_ci# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
402e1051a39Sopenharmony_ci.globl	ecp_nistz256_neg
403e1051a39Sopenharmony_ci.align	4
404e1051a39Sopenharmony_ciecp_nistz256_neg:
405e1051a39Sopenharmony_ci	stdu	$sp,-128($sp)
406e1051a39Sopenharmony_ci	mflr	r0
407e1051a39Sopenharmony_ci	std	r28,96($sp)
408e1051a39Sopenharmony_ci	std	r29,104($sp)
409e1051a39Sopenharmony_ci	std	r30,112($sp)
410e1051a39Sopenharmony_ci	std	r31,120($sp)
411e1051a39Sopenharmony_ci
412e1051a39Sopenharmony_ci	mr	$bp,$ap
413e1051a39Sopenharmony_ci	li	$acc0,0
414e1051a39Sopenharmony_ci	li	$acc1,0
415e1051a39Sopenharmony_ci	li	$acc2,0
416e1051a39Sopenharmony_ci	li	$acc3,0
417e1051a39Sopenharmony_ci
418e1051a39Sopenharmony_ci	li	$poly1,-1
419e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
420e1051a39Sopenharmony_ci	li	$poly3,1
421e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
422e1051a39Sopenharmony_ci
423e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from
424e1051a39Sopenharmony_ci
425e1051a39Sopenharmony_ci	mtlr	r0
426e1051a39Sopenharmony_ci	ld	r28,96($sp)
427e1051a39Sopenharmony_ci	ld	r29,104($sp)
428e1051a39Sopenharmony_ci	ld	r30,112($sp)
429e1051a39Sopenharmony_ci	ld	r31,120($sp)
430e1051a39Sopenharmony_ci	addi	$sp,$sp,128
431e1051a39Sopenharmony_ci	blr
432e1051a39Sopenharmony_ci	.long	0
433e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,4,2,0
434e1051a39Sopenharmony_ci	.long	0
435e1051a39Sopenharmony_ci.size	ecp_nistz256_neg,.-ecp_nistz256_neg
436e1051a39Sopenharmony_ci
437e1051a39Sopenharmony_ci# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
438e1051a39Sopenharmony_ci# to $a0-$a3 and b[0] - to $bi
439e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_mont,\@function
440e1051a39Sopenharmony_ci.align	4
441e1051a39Sopenharmony_ci__ecp_nistz256_mul_mont:
442e1051a39Sopenharmony_ci	mulld	$acc0,$a0,$bi		# a[0]*b[0]
443e1051a39Sopenharmony_ci	mulhdu	$t0,$a0,$bi
444e1051a39Sopenharmony_ci
445e1051a39Sopenharmony_ci	mulld	$acc1,$a1,$bi		# a[1]*b[0]
446e1051a39Sopenharmony_ci	mulhdu	$t1,$a1,$bi
447e1051a39Sopenharmony_ci
448e1051a39Sopenharmony_ci	mulld	$acc2,$a2,$bi		# a[2]*b[0]
449e1051a39Sopenharmony_ci	mulhdu	$t2,$a2,$bi
450e1051a39Sopenharmony_ci
451e1051a39Sopenharmony_ci	mulld	$acc3,$a3,$bi		# a[3]*b[0]
452e1051a39Sopenharmony_ci	mulhdu	$t3,$a3,$bi
453e1051a39Sopenharmony_ci	ld	$bi,8($bp)		# b[1]
454e1051a39Sopenharmony_ci
455e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
456e1051a39Sopenharmony_ci	 sldi	$t0,$acc0,32
457e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
458e1051a39Sopenharmony_ci	 srdi	$t1,$acc0,32
459e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
460e1051a39Sopenharmony_ci	addze	$acc4,$t3
461e1051a39Sopenharmony_ci	li	$acc5,0
462e1051a39Sopenharmony_ci___
# Emit rounds 1..3 of __ecp_nistz256_mul_mont.  Each round consists of one
# reduction step on the current accumulator followed by accumulation of
# a[0..3]*b[i]; the instruction text is identical for every round except
# for the load of the next b word.
463e1051a39Sopenharmony_cifor($i=1;$i<4;$i++) {
464e1051a39Sopenharmony_ci	################################################################
465e1051a39Sopenharmony_ci	# Reduction iteration is normally performed by accumulating
466e1051a39Sopenharmony_ci	# result of multiplication of modulus by "magic" digit [and
467e1051a39Sopenharmony_ci	# omitting least significant word, which is guaranteed to
468e1051a39Sopenharmony_ci	# be 0], but thanks to special form of modulus and "magic"
469e1051a39Sopenharmony_ci	# digit being equal to least significant word, it can be
470e1051a39Sopenharmony_ci	# performed with additions and subtractions alone. Indeed:
471e1051a39Sopenharmony_ci	#
472e1051a39Sopenharmony_ci	#            ffff0001.00000000.0000ffff.ffffffff
473e1051a39Sopenharmony_ci	# *                                     abcdefgh
474e1051a39Sopenharmony_ci	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
475e1051a39Sopenharmony_ci	#
476e1051a39Sopenharmony_ci	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
477e1051a39Sopenharmony_ci	# rewrite above as:
478e1051a39Sopenharmony_ci	#
479e1051a39Sopenharmony_ci	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
480e1051a39Sopenharmony_ci	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
481e1051a39Sopenharmony_ci	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
482e1051a39Sopenharmony_ci	#
483e1051a39Sopenharmony_ci	# or marking redundant operations:
484e1051a39Sopenharmony_ci	#
485e1051a39Sopenharmony_ci	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
486e1051a39Sopenharmony_ci	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
487e1051a39Sopenharmony_ci	# - 0000abcd.efgh0000.--------.--------.--------
488e1051a39Sopenharmony_ci
# Reduction step (uses $t0/$t1 = $acc0<<32 / $acc0>>32 prepared by the
# preceding code), then the low halves of a[0..3]*b[i] are added while the
# high halves are computed into $t0-$t3.
489e1051a39Sopenharmony_ci$code.=<<___;
490e1051a39Sopenharmony_ci	subfc	$t2,$t0,$acc0		# "*0xffff0001"
491e1051a39Sopenharmony_ci	subfe	$t3,$t1,$acc0
492e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
493e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
494e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
495e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t3
496e1051a39Sopenharmony_ci	addze	$acc4,$acc5
497e1051a39Sopenharmony_ci
498e1051a39Sopenharmony_ci	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
499e1051a39Sopenharmony_ci	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
500e1051a39Sopenharmony_ci	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
501e1051a39Sopenharmony_ci	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
502e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
503e1051a39Sopenharmony_ci	 mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
504e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
505e1051a39Sopenharmony_ci	 mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
506e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
507e1051a39Sopenharmony_ci	 mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
508e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
509e1051a39Sopenharmony_ci	 mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
510e1051a39Sopenharmony_ci	addze	$acc4,$acc4
511e1051a39Sopenharmony_ci___
# Load the next word of b only while one remains (rounds 1 and 2 load
# b[2] and b[3]).  Only $i is interpolated; the "8*(...)" offset
# expression is passed through in the emitted text.
512e1051a39Sopenharmony_ci$code.=<<___	if ($i<3);
513e1051a39Sopenharmony_ci	ld	$bi,8*($i+1)($bp)	# b[$i+1]
514e1051a39Sopenharmony_ci___
# Add the high halves, capture the final carry in $acc5 and prepare
# $t0/$t1 from the new $acc0 for the next reduction step.
515e1051a39Sopenharmony_ci$code.=<<___;
516e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
517e1051a39Sopenharmony_ci	 sldi	$t0,$acc0,32
518e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
519e1051a39Sopenharmony_ci	 srdi	$t1,$acc0,32
520e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
521e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3
522e1051a39Sopenharmony_ci	li	$acc5,0
523e1051a39Sopenharmony_ci	addze	$acc5,$acc5
524e1051a39Sopenharmony_ci___
525e1051a39Sopenharmony_ci}
# ----------------------------------------------------------------------
# One heredoc covering two pieces of emitted code.
#
# Tail of __ecp_nistz256_mul_mont: the last reduction step, then the
# modulus is subtracted from the five-word value in $acc0-$acc4 and, using
# the resulting $acc4 as an all-ones/zero mask on $poly1/$poly3, added
# back if the subtraction borrowed (no branches are involved).  The four
# result words are stored to $rp.
#
# Head of __ecp_nistz256_sqr_mont: the off-diagonal products a[i]*a[j]
# (i>j) are accumulated in $acc1-$acc6, doubled with the carry collected
# in $acc7, and the squares a[i]*a[i] are added.  Note that $a0-$a3 are
# overwritten with the high halves of the squares, and that $acc6/$acc7
# occupy the registers of $bp/$bi.  The heredoc ends with $t0/$t1 holding
# $acc0<<32 and $acc0>>32, ready for the reduction steps emitted next.
# ----------------------------------------------------------------------
526e1051a39Sopenharmony_ci$code.=<<___;
527e1051a39Sopenharmony_ci	# last reduction
528e1051a39Sopenharmony_ci	subfc	$t2,$t0,$acc0		# "*0xffff0001"
529e1051a39Sopenharmony_ci	subfe	$t3,$t1,$acc0
530e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
531e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
532e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
533e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t3
534e1051a39Sopenharmony_ci	addze	$acc4,$acc5
535e1051a39Sopenharmony_ci
536e1051a39Sopenharmony_ci	li	$t2,0
537e1051a39Sopenharmony_ci	addic	$acc0,$acc0,1		# ret -= modulus
538e1051a39Sopenharmony_ci	subfe	$acc1,$poly1,$acc1
539e1051a39Sopenharmony_ci	subfe	$acc2,$t2,$acc2
540e1051a39Sopenharmony_ci	subfe	$acc3,$poly3,$acc3
541e1051a39Sopenharmony_ci	subfe	$acc4,$t2,$acc4
542e1051a39Sopenharmony_ci
543e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
544e1051a39Sopenharmony_ci	and	$t1,$poly1,$acc4
545e1051a39Sopenharmony_ci	and	$t3,$poly3,$acc4
546e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
547e1051a39Sopenharmony_ci	addze	$acc2,$acc2
548e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
549e1051a39Sopenharmony_ci
550e1051a39Sopenharmony_ci	std	$acc0,0($rp)
551e1051a39Sopenharmony_ci	std	$acc1,8($rp)
552e1051a39Sopenharmony_ci	std	$acc2,16($rp)
553e1051a39Sopenharmony_ci	std	$acc3,24($rp)
554e1051a39Sopenharmony_ci
555e1051a39Sopenharmony_ci	blr
556e1051a39Sopenharmony_ci	.long	0
557e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,1,0
558e1051a39Sopenharmony_ci	.long	0
559e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
560e1051a39Sopenharmony_ci
561e1051a39Sopenharmony_ci# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
562e1051a39Sopenharmony_ci# to $a0-$a3
563e1051a39Sopenharmony_ci.type	__ecp_nistz256_sqr_mont,\@function
564e1051a39Sopenharmony_ci.align	4
565e1051a39Sopenharmony_ci__ecp_nistz256_sqr_mont:
566e1051a39Sopenharmony_ci	################################################################
567e1051a39Sopenharmony_ci	#  |  |  |  |  |  |a1*a0|  |
568e1051a39Sopenharmony_ci	#  |  |  |  |  |a2*a0|  |  |
569e1051a39Sopenharmony_ci	#  |  |a3*a2|a3*a0|  |  |  |
570e1051a39Sopenharmony_ci	#  |  |  |  |a2*a1|  |  |  |
571e1051a39Sopenharmony_ci	#  |  |  |a3*a1|  |  |  |  |
572e1051a39Sopenharmony_ci	# *|  |  |  |  |  |  |  | 2|
573e1051a39Sopenharmony_ci	# +|a3*a3|a2*a2|a1*a1|a0*a0|
574e1051a39Sopenharmony_ci	#  |--+--+--+--+--+--+--+--|
575e1051a39Sopenharmony_ci	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
576e1051a39Sopenharmony_ci	#
577e1051a39Sopenharmony_ci	#  "can't overflow" below mark carrying into high part of
578e1051a39Sopenharmony_ci	#  multiplication result, which can't overflow, because it
579e1051a39Sopenharmony_ci	#  can never be all ones.
580e1051a39Sopenharmony_ci
581e1051a39Sopenharmony_ci	mulld	$acc1,$a1,$a0		# a[1]*a[0]
582e1051a39Sopenharmony_ci	mulhdu	$t1,$a1,$a0
583e1051a39Sopenharmony_ci	mulld	$acc2,$a2,$a0		# a[2]*a[0]
584e1051a39Sopenharmony_ci	mulhdu	$t2,$a2,$a0
585e1051a39Sopenharmony_ci	mulld	$acc3,$a3,$a0		# a[3]*a[0]
586e1051a39Sopenharmony_ci	mulhdu	$acc4,$a3,$a0
587e1051a39Sopenharmony_ci
588e1051a39Sopenharmony_ci	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
589e1051a39Sopenharmony_ci	 mulld	$t0,$a2,$a1		# a[2]*a[1]
590e1051a39Sopenharmony_ci	 mulhdu	$t1,$a2,$a1
591e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
592e1051a39Sopenharmony_ci	 mulld	$t2,$a3,$a1		# a[3]*a[1]
593e1051a39Sopenharmony_ci	 mulhdu	$t3,$a3,$a1
594e1051a39Sopenharmony_ci	addze	$acc4,$acc4		# can't overflow
595e1051a39Sopenharmony_ci
596e1051a39Sopenharmony_ci	mulld	$acc5,$a3,$a2		# a[3]*a[2]
597e1051a39Sopenharmony_ci	mulhdu	$acc6,$a3,$a2
598e1051a39Sopenharmony_ci
599e1051a39Sopenharmony_ci	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
600e1051a39Sopenharmony_ci	addze	$t2,$t3			# can't overflow
601e1051a39Sopenharmony_ci
602e1051a39Sopenharmony_ci	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
603e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t1
604e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t2
605e1051a39Sopenharmony_ci	addze	$acc6,$acc6		# can't overflow
606e1051a39Sopenharmony_ci
607e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
608e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$acc2
609e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$acc3
610e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$acc4
611e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$acc5
612e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$acc6
613e1051a39Sopenharmony_ci	li	$acc7,0
614e1051a39Sopenharmony_ci	addze	$acc7,$acc7
615e1051a39Sopenharmony_ci
616e1051a39Sopenharmony_ci	mulld	$acc0,$a0,$a0		# a[0]*a[0]
617e1051a39Sopenharmony_ci	mulhdu	$a0,$a0,$a0
618e1051a39Sopenharmony_ci	mulld	$t1,$a1,$a1		# a[1]*a[1]
619e1051a39Sopenharmony_ci	mulhdu	$a1,$a1,$a1
620e1051a39Sopenharmony_ci	mulld	$t2,$a2,$a2		# a[2]*a[2]
621e1051a39Sopenharmony_ci	mulhdu	$a2,$a2,$a2
622e1051a39Sopenharmony_ci	mulld	$t3,$a3,$a3		# a[3]*a[3]
623e1051a39Sopenharmony_ci	mulhdu	$a3,$a3,$a3
624e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
625e1051a39Sopenharmony_ci	 sldi	$t0,$acc0,32
626e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
627e1051a39Sopenharmony_ci	 srdi	$t1,$acc0,32
628e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a1
629e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t2
630e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a2
631e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t3
632e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a3
633e1051a39Sopenharmony_ci___
# Emit three identical reduction steps for __ecp_nistz256_sqr_mont.  Each
# step works on the low four accumulator words ($acc0-$acc3) only and,
# interleaved with the carry chain, recomputes $t0/$t1 (<<32 / >>32) from
# the freshly written $acc0 for the step that follows.
634e1051a39Sopenharmony_cifor($i=0;$i<3;$i++) {			# reductions, see commentary in
635e1051a39Sopenharmony_ci					# multiplication for details
636e1051a39Sopenharmony_ci$code.=<<___;
637e1051a39Sopenharmony_ci	subfc	$t2,$t0,$acc0		# "*0xffff0001"
638e1051a39Sopenharmony_ci	subfe	$t3,$t1,$acc0
639e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
640e1051a39Sopenharmony_ci	 sldi	$t0,$acc0,32
641e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
642e1051a39Sopenharmony_ci	 srdi	$t1,$acc0,32
643e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
644e1051a39Sopenharmony_ci	addze	$acc3,$t3		# can't overflow
645e1051a39Sopenharmony_ci___
646e1051a39Sopenharmony_ci}
647e1051a39Sopenharmony_ci$code.=<<___;
648e1051a39Sopenharmony_ci	subfc	$t2,$t0,$acc0		# "*0xffff0001"
649e1051a39Sopenharmony_ci	subfe	$t3,$t1,$acc0
650e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
651e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t1
652e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
653e1051a39Sopenharmony_ci	addze	$acc3,$t3		# can't overflow
654e1051a39Sopenharmony_ci
655e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$acc4	# accumulate upper half
656e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$acc5
657e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$acc6
658e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$acc7
659e1051a39Sopenharmony_ci	li	$t2,0
660e1051a39Sopenharmony_ci	addze	$acc4,$t2
661e1051a39Sopenharmony_ci
662e1051a39Sopenharmony_ci	addic	$acc0,$acc0,1		# ret -= modulus
663e1051a39Sopenharmony_ci	subfe	$acc1,$poly1,$acc1
664e1051a39Sopenharmony_ci	subfe	$acc2,$t2,$acc2
665e1051a39Sopenharmony_ci	subfe	$acc3,$poly3,$acc3
666e1051a39Sopenharmony_ci	subfe	$acc4,$t2,$acc4
667e1051a39Sopenharmony_ci
668e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
669e1051a39Sopenharmony_ci	and	$t1,$poly1,$acc4
670e1051a39Sopenharmony_ci	and	$t3,$poly3,$acc4
671e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
672e1051a39Sopenharmony_ci	addze	$acc2,$acc2
673e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
674e1051a39Sopenharmony_ci
675e1051a39Sopenharmony_ci	std	$acc0,0($rp)
676e1051a39Sopenharmony_ci	std	$acc1,8($rp)
677e1051a39Sopenharmony_ci	std	$acc2,16($rp)
678e1051a39Sopenharmony_ci	std	$acc3,24($rp)
679e1051a39Sopenharmony_ci
680e1051a39Sopenharmony_ci	blr
681e1051a39Sopenharmony_ci	.long	0
682e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,1,0
683e1051a39Sopenharmony_ci	.long	0
684e1051a39Sopenharmony_ci.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
685e1051a39Sopenharmony_ci
686e1051a39Sopenharmony_ci# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
687e1051a39Sopenharmony_ci# $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
688e1051a39Sopenharmony_ci# contexts, e.g. in multiplication by 2 and 3...
# The sum, reduced by at most one subtraction of the modulus, is stored
# to 0..24($rp) and is also left in $acc0-$acc3; $t0-$t3 are clobbered.
# $poly1 and $poly3 are read as words 1 and 3 of the modulus; word 0 is
# treated as all ones and word 2 as zero.
# NOTE(review): presumably both inputs are already reduced - confirm.
689e1051a39Sopenharmony_ci.type	__ecp_nistz256_add,\@function
690e1051a39Sopenharmony_ci.align	4
691e1051a39Sopenharmony_ci__ecp_nistz256_add:
692e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# ret = a+b
693e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
694e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
695e1051a39Sopenharmony_ci	li	$t2,0			# zero, also used as modulus word 2
696e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
697e1051a39Sopenharmony_ci	addze	$t0,$t2			# t0 = carry out of a+b
698e1051a39Sopenharmony_ci
699e1051a39Sopenharmony_ci	# if a+b >= modulus, subtract modulus
700e1051a39Sopenharmony_ci	#
701e1051a39Sopenharmony_ci	# But since comparison implies subtraction, we subtract
702e1051a39Sopenharmony_ci	# modulus and then add it back if subtraction borrowed.
703e1051a39Sopenharmony_ci
704e1051a39Sopenharmony_ci	subic	$acc0,$acc0,-1		# ret -= modulus, low word is all ones
705e1051a39Sopenharmony_ci	subfe	$acc1,$poly1,$acc1
706e1051a39Sopenharmony_ci	subfe	$acc2,$t2,$acc2
707e1051a39Sopenharmony_ci	subfe	$acc3,$poly3,$acc3
708e1051a39Sopenharmony_ci	subfe	$t0,$t2,$t0		# t0 = -1 if a+b < modulus
709e1051a39Sopenharmony_ci
710e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# ret += modulus & t0
711e1051a39Sopenharmony_ci	and	$t1,$poly1,$t0
712e1051a39Sopenharmony_ci	and	$t3,$poly3,$t0
713e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
714e1051a39Sopenharmony_ci	addze	$acc2,$acc2		# modulus word 2 is zero, carry only
715e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
716e1051a39Sopenharmony_ci
717e1051a39Sopenharmony_ci	std	$acc0,0($rp)
718e1051a39Sopenharmony_ci	std	$acc1,8($rp)
719e1051a39Sopenharmony_ci	std	$acc2,16($rp)
720e1051a39Sopenharmony_ci	std	$acc3,24($rp)
721e1051a39Sopenharmony_ci
722e1051a39Sopenharmony_ci	blr
723e1051a39Sopenharmony_ci	.long	0
724e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
725e1051a39Sopenharmony_ci	.long	0
726e1051a39Sopenharmony_ci.size	__ecp_nistz256_add,.-__ecp_nistz256_add
727e1051a39Sopenharmony_ci
# __ecp_nistz256_sub_from computes ret = a-b, where a is pre-loaded in
# $acc0-$acc3 and b is read from 0..24($bp), and adds the modulus back
# once if the subtraction borrowed. The result is stored to 0..24($rp)
# and is also left in $acc0-$acc3; $t0-$t3 are clobbered.
728e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_from,\@function
729e1051a39Sopenharmony_ci.align	4
730e1051a39Sopenharmony_ci__ecp_nistz256_sub_from:
731e1051a39Sopenharmony_ci	ld	$t0,0($bp)
732e1051a39Sopenharmony_ci	ld	$t1,8($bp)
733e1051a39Sopenharmony_ci	ld	$t2,16($bp)
734e1051a39Sopenharmony_ci	ld	$t3,24($bp)
735e1051a39Sopenharmony_ci	subfc	$acc0,$t0,$acc0		# ret = a-b
736e1051a39Sopenharmony_ci	subfe	$acc1,$t1,$acc1
737e1051a39Sopenharmony_ci	subfe	$acc2,$t2,$acc2
738e1051a39Sopenharmony_ci	subfe	$acc3,$t3,$acc3
739e1051a39Sopenharmony_ci	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0
740e1051a39Sopenharmony_ci
741e1051a39Sopenharmony_ci	# if a-b borrowed, add modulus
742e1051a39Sopenharmony_ci
743e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# ret += modulus & t0
744e1051a39Sopenharmony_ci	and	$t1,$poly1,$t0
745e1051a39Sopenharmony_ci	and	$t3,$poly3,$t0
746e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
747e1051a39Sopenharmony_ci	addze	$acc2,$acc2		# modulus word 2 is zero, carry only
748e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
749e1051a39Sopenharmony_ci
750e1051a39Sopenharmony_ci	std	$acc0,0($rp)
751e1051a39Sopenharmony_ci	std	$acc1,8($rp)
752e1051a39Sopenharmony_ci	std	$acc2,16($rp)
753e1051a39Sopenharmony_ci	std	$acc3,24($rp)
754e1051a39Sopenharmony_ci
755e1051a39Sopenharmony_ci	blr
756e1051a39Sopenharmony_ci	.long	0
757e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
758e1051a39Sopenharmony_ci	.long	0
759e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
760e1051a39Sopenharmony_ci
# __ecp_nistz256_sub_morf is __ecp_nistz256_sub_from with the operands
# swapped: it computes ret = b-a, where a is pre-loaded in $acc0-$acc3
# and b is read from 0..24($bp), and adds the modulus back once if the
# subtraction borrowed. The result is stored to 0..24($rp) and is also
# left in $acc0-$acc3; $t0-$t3 are clobbered.
761e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_morf,\@function
762e1051a39Sopenharmony_ci.align	4
763e1051a39Sopenharmony_ci__ecp_nistz256_sub_morf:
764e1051a39Sopenharmony_ci	ld	$t0,0($bp)
765e1051a39Sopenharmony_ci	ld	$t1,8($bp)
766e1051a39Sopenharmony_ci	ld	$t2,16($bp)
767e1051a39Sopenharmony_ci	ld	$t3,24($bp)
768e1051a39Sopenharmony_ci	subfc	$acc0,$acc0,$t0 	# ret = b-a
769e1051a39Sopenharmony_ci	subfe	$acc1,$acc1,$t1
770e1051a39Sopenharmony_ci	subfe	$acc2,$acc2,$t2
771e1051a39Sopenharmony_ci	subfe	$acc3,$acc3,$t3
772e1051a39Sopenharmony_ci	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0
773e1051a39Sopenharmony_ci
774e1051a39Sopenharmony_ci	# if b-a borrowed, add modulus
775e1051a39Sopenharmony_ci
776e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# ret += modulus & t0
777e1051a39Sopenharmony_ci	and	$t1,$poly1,$t0
778e1051a39Sopenharmony_ci	and	$t3,$poly3,$t0
779e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
780e1051a39Sopenharmony_ci	addze	$acc2,$acc2		# modulus word 2 is zero, carry only
781e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
782e1051a39Sopenharmony_ci
783e1051a39Sopenharmony_ci	std	$acc0,0($rp)
784e1051a39Sopenharmony_ci	std	$acc1,8($rp)
785e1051a39Sopenharmony_ci	std	$acc2,16($rp)
786e1051a39Sopenharmony_ci	std	$acc3,24($rp)
787e1051a39Sopenharmony_ci
788e1051a39Sopenharmony_ci	blr
789e1051a39Sopenharmony_ci	.long	0
790e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
791e1051a39Sopenharmony_ci	.long	0
792e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
793e1051a39Sopenharmony_ci
# __ecp_nistz256_div_by_2 halves the value pre-loaded in $acc0-$acc3.
# It adds the modulus unconditionally, takes it back if the input was
# even, and then shifts the resulting 257-bit value right by one bit.
# The result is stored to 0..24($rp) and is also left in $acc0-$acc3;
# $t0-$t3 and $ap are clobbered ($ap holds the 257th bit).
794e1051a39Sopenharmony_ci.type	__ecp_nistz256_div_by_2,\@function
795e1051a39Sopenharmony_ci.align	4
796e1051a39Sopenharmony_ci__ecp_nistz256_div_by_2:
797e1051a39Sopenharmony_ci	andi.	$t0,$acc0,1		# t0 = a & 1
798e1051a39Sopenharmony_ci	addic	$acc0,$acc0,-1		# a += modulus
799e1051a39Sopenharmony_ci	 neg	$t0,$t0
800e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$poly1
801e1051a39Sopenharmony_ci	 not	$t0,$t0			# t0 = a was even ? -1 : 0
802e1051a39Sopenharmony_ci	addze	$acc2,$acc2
803e1051a39Sopenharmony_ci	 li	$t2,0
804e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$poly3
805e1051a39Sopenharmony_ci	 and	$t1,$poly1,$t0
806e1051a39Sopenharmony_ci	addze	$ap,$t2			# ap = carry
807e1051a39Sopenharmony_ci	 and	$t3,$poly3,$t0
808e1051a39Sopenharmony_ci
809e1051a39Sopenharmony_ci	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
810e1051a39Sopenharmony_ci	subfe	$acc1,$t1,$acc1
811e1051a39Sopenharmony_ci	subfe	$acc2,$t2,$acc2
812e1051a39Sopenharmony_ci	subfe	$acc3,$t3,$acc3
813e1051a39Sopenharmony_ci	subfe	$ap,  $t2,$ap
814e1051a39Sopenharmony_ci
815e1051a39Sopenharmony_ci	srdi	$acc0,$acc0,1		# shift right by 1 across all words
816e1051a39Sopenharmony_ci	sldi	$t0,$acc1,63
817e1051a39Sopenharmony_ci	srdi	$acc1,$acc1,1
818e1051a39Sopenharmony_ci	sldi	$t1,$acc2,63
819e1051a39Sopenharmony_ci	srdi	$acc2,$acc2,1
820e1051a39Sopenharmony_ci	sldi	$t2,$acc3,63
821e1051a39Sopenharmony_ci	srdi	$acc3,$acc3,1
822e1051a39Sopenharmony_ci	sldi	$t3,$ap,63		# carry word supplies the top bit
823e1051a39Sopenharmony_ci	or	$acc0,$acc0,$t0
824e1051a39Sopenharmony_ci	or	$acc1,$acc1,$t1
825e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t2
826e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t3
827e1051a39Sopenharmony_ci
828e1051a39Sopenharmony_ci	std	$acc0,0($rp)
829e1051a39Sopenharmony_ci	std	$acc1,8($rp)
830e1051a39Sopenharmony_ci	std	$acc2,16($rp)
831e1051a39Sopenharmony_ci	std	$acc3,24($rp)
832e1051a39Sopenharmony_ci
833e1051a39Sopenharmony_ci	blr
834e1051a39Sopenharmony_ci	.long	0
835e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,1,0
836e1051a39Sopenharmony_ci	.long	0
837e1051a39Sopenharmony_ci.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
838e1051a39Sopenharmony_ci___
839e1051a39Sopenharmony_ci########################################################################
840e1051a39Sopenharmony_ci# following subroutines are "literal" implementation of those found in
841e1051a39Sopenharmony_ci# ecp_nistz256.c
842e1051a39Sopenharmony_ci#
843e1051a39Sopenharmony_ci########################################################################
844e1051a39Sopenharmony_ci# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
845e1051a39Sopenharmony_ci#
# Emits ecp_nistz256_point_double. The input point is read as three
# 32-byte coordinates at offsets 0 (in_x), 32 (in_y) and 64 (in_z) from
# inp, and the result is written with the same layout to out. Each
# helper call below is annotated with the ecp_nistz256.c statement it
# stands for.
#
# Frame layout, from the stack pointer up: 64 bytes below the
# temporaries, then four 32-byte temporaries (S, M, Zsqr, tmp0), then
# twelve 8-byte slots in which r20-r31 are saved.
#
# The return address is kept in r0 from entry to exit instead of being
# stored in the frame, which relies on the helpers leaving r0 alone.
# .Ldouble_shortcut is also branched to from the .Ladd_double path of
# ecp_nistz256_point_add.
846e1051a39Sopenharmony_ciif (1) {
847e1051a39Sopenharmony_cimy $FRAME=64+32*4+12*8;
848e1051a39Sopenharmony_cimy ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
849e1051a39Sopenharmony_ci# above map() describes stack layout with 4 temporary
850e1051a39Sopenharmony_ci# 256-bit vectors on top.
# r20 and r21 keep the caller's out and inp pointers, because $rp, $ap
# and $bp are reused as arguments to the helpers.
851e1051a39Sopenharmony_cimy ($rp_real,$ap_real) = map("r$_",(20,21));
852e1051a39Sopenharmony_ci
853e1051a39Sopenharmony_ci$code.=<<___;
854e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_double
855e1051a39Sopenharmony_ci.align	5
856e1051a39Sopenharmony_ciecp_nistz256_point_double:
857e1051a39Sopenharmony_ci	stdu	$sp,-$FRAME($sp)	# allocate frame
858e1051a39Sopenharmony_ci	mflr	r0			# return address stays in r0
859e1051a39Sopenharmony_ci	std	r20,$FRAME-8*12($sp)
860e1051a39Sopenharmony_ci	std	r21,$FRAME-8*11($sp)
861e1051a39Sopenharmony_ci	std	r22,$FRAME-8*10($sp)
862e1051a39Sopenharmony_ci	std	r23,$FRAME-8*9($sp)
863e1051a39Sopenharmony_ci	std	r24,$FRAME-8*8($sp)
864e1051a39Sopenharmony_ci	std	r25,$FRAME-8*7($sp)
865e1051a39Sopenharmony_ci	std	r26,$FRAME-8*6($sp)
866e1051a39Sopenharmony_ci	std	r27,$FRAME-8*5($sp)
867e1051a39Sopenharmony_ci	std	r28,$FRAME-8*4($sp)
868e1051a39Sopenharmony_ci	std	r29,$FRAME-8*3($sp)
869e1051a39Sopenharmony_ci	std	r30,$FRAME-8*2($sp)
870e1051a39Sopenharmony_ci	std	r31,$FRAME-8*1($sp)
871e1051a39Sopenharmony_ci
872e1051a39Sopenharmony_ci	li	$poly1,-1
873e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
874e1051a39Sopenharmony_ci	li	$poly3,1
875e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
876e1051a39Sopenharmony_ci.Ldouble_shortcut:
877e1051a39Sopenharmony_ci	ld	$acc0,32($ap)		# in_y
878e1051a39Sopenharmony_ci	ld	$acc1,40($ap)
879e1051a39Sopenharmony_ci	ld	$acc2,48($ap)
880e1051a39Sopenharmony_ci	ld	$acc3,56($ap)
881e1051a39Sopenharmony_ci	mr	$t0,$acc0		# second copy, so that add doubles it
882e1051a39Sopenharmony_ci	mr	$t1,$acc1
883e1051a39Sopenharmony_ci	mr	$t2,$acc2
884e1051a39Sopenharmony_ci	mr	$t3,$acc3
885e1051a39Sopenharmony_ci	 ld	$a0,64($ap)		# forward load for p256_sqr_mont
886e1051a39Sopenharmony_ci	 ld	$a1,72($ap)
887e1051a39Sopenharmony_ci	 ld	$a2,80($ap)
888e1051a39Sopenharmony_ci	 ld	$a3,88($ap)
889e1051a39Sopenharmony_ci	 mr	$rp_real,$rp
890e1051a39Sopenharmony_ci	 mr	$ap_real,$ap
891e1051a39Sopenharmony_ci	addi	$rp,$sp,$S
892e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);
893e1051a39Sopenharmony_ci
894e1051a39Sopenharmony_ci	addi	$rp,$sp,$Zsqr
895e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);
896e1051a39Sopenharmony_ci
897e1051a39Sopenharmony_ci	ld	$t0,0($ap_real)		# in_x
898e1051a39Sopenharmony_ci	ld	$t1,8($ap_real)
899e1051a39Sopenharmony_ci	ld	$t2,16($ap_real)
900e1051a39Sopenharmony_ci	ld	$t3,24($ap_real)
901e1051a39Sopenharmony_ci	mr	$a0,$acc0		# put Zsqr aside for p256_sub
902e1051a39Sopenharmony_ci	mr	$a1,$acc1
903e1051a39Sopenharmony_ci	mr	$a2,$acc2
904e1051a39Sopenharmony_ci	mr	$a3,$acc3
905e1051a39Sopenharmony_ci	addi	$rp,$sp,$M
906e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);
907e1051a39Sopenharmony_ci
908e1051a39Sopenharmony_ci	addi	$bp,$ap_real,0		# bp = in_x
909e1051a39Sopenharmony_ci	mr	$acc0,$a0		# restore Zsqr
910e1051a39Sopenharmony_ci	mr	$acc1,$a1
911e1051a39Sopenharmony_ci	mr	$acc2,$a2
912e1051a39Sopenharmony_ci	mr	$acc3,$a3
913e1051a39Sopenharmony_ci	 ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
914e1051a39Sopenharmony_ci	 ld	$a1,$S+8($sp)
915e1051a39Sopenharmony_ci	 ld	$a2,$S+16($sp)
916e1051a39Sopenharmony_ci	 ld	$a3,$S+24($sp)
917e1051a39Sopenharmony_ci	addi	$rp,$sp,$Zsqr
918e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);
919e1051a39Sopenharmony_ci
920e1051a39Sopenharmony_ci	addi	$rp,$sp,$S
921e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);
922e1051a39Sopenharmony_ci
923e1051a39Sopenharmony_ci	ld	$bi,32($ap_real)	# first word of in_y
924e1051a39Sopenharmony_ci	ld	$a0,64($ap_real)	# in_z
925e1051a39Sopenharmony_ci	ld	$a1,72($ap_real)
926e1051a39Sopenharmony_ci	ld	$a2,80($ap_real)
927e1051a39Sopenharmony_ci	ld	$a3,88($ap_real)
928e1051a39Sopenharmony_ci	addi	$bp,$ap_real,32
929e1051a39Sopenharmony_ci	addi	$rp,$sp,$tmp0
930e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);
931e1051a39Sopenharmony_ci
932e1051a39Sopenharmony_ci	mr	$t0,$acc0		# second copy, so that add doubles it
933e1051a39Sopenharmony_ci	mr	$t1,$acc1
934e1051a39Sopenharmony_ci	mr	$t2,$acc2
935e1051a39Sopenharmony_ci	mr	$t3,$acc3
936e1051a39Sopenharmony_ci	 ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
937e1051a39Sopenharmony_ci	 ld	$a1,$S+8($sp)
938e1051a39Sopenharmony_ci	 ld	$a2,$S+16($sp)
939e1051a39Sopenharmony_ci	 ld	$a3,$S+24($sp)
940e1051a39Sopenharmony_ci	addi	$rp,$rp_real,64
941e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);
942e1051a39Sopenharmony_ci
943e1051a39Sopenharmony_ci	addi	$rp,$sp,$tmp0
944e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);
945e1051a39Sopenharmony_ci
946e1051a39Sopenharmony_ci	 ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
947e1051a39Sopenharmony_ci	 ld	$a0,$M+0($sp)
948e1051a39Sopenharmony_ci	 ld	$a1,$M+8($sp)
949e1051a39Sopenharmony_ci	 ld	$a2,$M+16($sp)
950e1051a39Sopenharmony_ci	 ld	$a3,$M+24($sp)
951e1051a39Sopenharmony_ci	addi	$rp,$rp_real,32
952e1051a39Sopenharmony_ci	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);
953e1051a39Sopenharmony_ci
954e1051a39Sopenharmony_ci	addi	$bp,$sp,$Zsqr
955e1051a39Sopenharmony_ci	addi	$rp,$sp,$M
956e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);
957e1051a39Sopenharmony_ci
958e1051a39Sopenharmony_ci	mr	$t0,$acc0		# duplicate M
959e1051a39Sopenharmony_ci	mr	$t1,$acc1
960e1051a39Sopenharmony_ci	mr	$t2,$acc2
961e1051a39Sopenharmony_ci	mr	$t3,$acc3
962e1051a39Sopenharmony_ci	mr	$a0,$acc0		# put M aside
963e1051a39Sopenharmony_ci	mr	$a1,$acc1
964e1051a39Sopenharmony_ci	mr	$a2,$acc2
965e1051a39Sopenharmony_ci	mr	$a3,$acc3
966e1051a39Sopenharmony_ci	addi	$rp,$sp,$M
967e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# M+M, first half of p256_mul_by_3
968e1051a39Sopenharmony_ci	mr	$t0,$a0			# restore M
969e1051a39Sopenharmony_ci	mr	$t1,$a1
970e1051a39Sopenharmony_ci	mr	$t2,$a2
971e1051a39Sopenharmony_ci	mr	$t3,$a3
972e1051a39Sopenharmony_ci	 ld	$bi,0($ap_real)		# forward load for p256_mul_mont
973e1051a39Sopenharmony_ci	 ld	$a0,$S+0($sp)
974e1051a39Sopenharmony_ci	 ld	$a1,$S+8($sp)
975e1051a39Sopenharmony_ci	 ld	$a2,$S+16($sp)
976e1051a39Sopenharmony_ci	 ld	$a3,$S+24($sp)
977e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);
978e1051a39Sopenharmony_ci
979e1051a39Sopenharmony_ci	addi	$bp,$ap_real,0		# bp = in_x
980e1051a39Sopenharmony_ci	addi	$rp,$sp,$S
981e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);
982e1051a39Sopenharmony_ci
983e1051a39Sopenharmony_ci	mr	$t0,$acc0		# second copy, so that add doubles it
984e1051a39Sopenharmony_ci	mr	$t1,$acc1
985e1051a39Sopenharmony_ci	mr	$t2,$acc2
986e1051a39Sopenharmony_ci	mr	$t3,$acc3
987e1051a39Sopenharmony_ci	 ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
988e1051a39Sopenharmony_ci	 ld	$a1,$M+8($sp)
989e1051a39Sopenharmony_ci	 ld	$a2,$M+16($sp)
990e1051a39Sopenharmony_ci	 ld	$a3,$M+24($sp)
991e1051a39Sopenharmony_ci	addi	$rp,$sp,$tmp0
992e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);
993e1051a39Sopenharmony_ci
994e1051a39Sopenharmony_ci	addi	$rp,$rp_real,0		# rp = res_x
995e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);
996e1051a39Sopenharmony_ci
997e1051a39Sopenharmony_ci	addi	$bp,$sp,$tmp0
998e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);
999e1051a39Sopenharmony_ci
1000e1051a39Sopenharmony_ci	addi	$bp,$sp,$S
1001e1051a39Sopenharmony_ci	addi	$rp,$sp,$S
1002e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);
1003e1051a39Sopenharmony_ci
1004e1051a39Sopenharmony_ci	ld	$bi,$M($sp)		# first word of M
1005e1051a39Sopenharmony_ci	mr	$a0,$acc0		# copy S
1006e1051a39Sopenharmony_ci	mr	$a1,$acc1
1007e1051a39Sopenharmony_ci	mr	$a2,$acc2
1008e1051a39Sopenharmony_ci	mr	$a3,$acc3
1009e1051a39Sopenharmony_ci	addi	$bp,$sp,$M
1010e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);
1011e1051a39Sopenharmony_ci
1012e1051a39Sopenharmony_ci	addi	$bp,$rp_real,32		# bp = res_y
1013e1051a39Sopenharmony_ci	addi	$rp,$rp_real,32		# rp = res_y
1014e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);
1015e1051a39Sopenharmony_ci
1016e1051a39Sopenharmony_ci	mtlr	r0			# return address kept in r0 since entry
1017e1051a39Sopenharmony_ci	ld	r20,$FRAME-8*12($sp)
1018e1051a39Sopenharmony_ci	ld	r21,$FRAME-8*11($sp)
1019e1051a39Sopenharmony_ci	ld	r22,$FRAME-8*10($sp)
1020e1051a39Sopenharmony_ci	ld	r23,$FRAME-8*9($sp)
1021e1051a39Sopenharmony_ci	ld	r24,$FRAME-8*8($sp)
1022e1051a39Sopenharmony_ci	ld	r25,$FRAME-8*7($sp)
1023e1051a39Sopenharmony_ci	ld	r26,$FRAME-8*6($sp)
1024e1051a39Sopenharmony_ci	ld	r27,$FRAME-8*5($sp)
1025e1051a39Sopenharmony_ci	ld	r28,$FRAME-8*4($sp)
1026e1051a39Sopenharmony_ci	ld	r29,$FRAME-8*3($sp)
1027e1051a39Sopenharmony_ci	ld	r30,$FRAME-8*2($sp)
1028e1051a39Sopenharmony_ci	ld	r31,$FRAME-8*1($sp)
1029e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME		# release frame
1030e1051a39Sopenharmony_ci	blr
1031e1051a39Sopenharmony_ci	.long	0
1032e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,12,2,0
1033e1051a39Sopenharmony_ci	.long	0
1034e1051a39Sopenharmony_ci.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
1035e1051a39Sopenharmony_ci___
1036e1051a39Sopenharmony_ci}
1037e1051a39Sopenharmony_ci
1038e1051a39Sopenharmony_ci########################################################################
1039e1051a39Sopenharmony_ci# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1040e1051a39Sopenharmony_ci#			      const P256_POINT *in2);
1041e1051a39Sopenharmony_ciif (1) {
1042e1051a39Sopenharmony_cimy $FRAME = 64 + 32*12 + 16*8;
1043e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
1044e1051a39Sopenharmony_ci    $H,$Hsqr,$R,$Rsqr,$Hcub,
1045e1051a39Sopenharmony_ci    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
1046e1051a39Sopenharmony_cimy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1047e1051a39Sopenharmony_ci# above map() describes stack layout with 12 temporary
1048e1051a39Sopenharmony_ci# 256-bit vectors on top.
1049e1051a39Sopenharmony_cimy ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1050e1051a39Sopenharmony_ci
1051e1051a39Sopenharmony_ci$code.=<<___;
1052e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add
1053e1051a39Sopenharmony_ci.align	5
1054e1051a39Sopenharmony_ciecp_nistz256_point_add:
1055e1051a39Sopenharmony_ci	stdu	$sp,-$FRAME($sp)
1056e1051a39Sopenharmony_ci	mflr	r0
1057e1051a39Sopenharmony_ci	std	r16,$FRAME-8*16($sp)
1058e1051a39Sopenharmony_ci	std	r17,$FRAME-8*15($sp)
1059e1051a39Sopenharmony_ci	std	r18,$FRAME-8*14($sp)
1060e1051a39Sopenharmony_ci	std	r19,$FRAME-8*13($sp)
1061e1051a39Sopenharmony_ci	std	r20,$FRAME-8*12($sp)
1062e1051a39Sopenharmony_ci	std	r21,$FRAME-8*11($sp)
1063e1051a39Sopenharmony_ci	std	r22,$FRAME-8*10($sp)
1064e1051a39Sopenharmony_ci	std	r23,$FRAME-8*9($sp)
1065e1051a39Sopenharmony_ci	std	r24,$FRAME-8*8($sp)
1066e1051a39Sopenharmony_ci	std	r25,$FRAME-8*7($sp)
1067e1051a39Sopenharmony_ci	std	r26,$FRAME-8*6($sp)
1068e1051a39Sopenharmony_ci	std	r27,$FRAME-8*5($sp)
1069e1051a39Sopenharmony_ci	std	r28,$FRAME-8*4($sp)
1070e1051a39Sopenharmony_ci	std	r29,$FRAME-8*3($sp)
1071e1051a39Sopenharmony_ci	std	r30,$FRAME-8*2($sp)
1072e1051a39Sopenharmony_ci	std	r31,$FRAME-8*1($sp)
1073e1051a39Sopenharmony_ci
1074e1051a39Sopenharmony_ci	li	$poly1,-1
1075e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
1076e1051a39Sopenharmony_ci	li	$poly3,1
1077e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
1078e1051a39Sopenharmony_ci
1079e1051a39Sopenharmony_ci	ld	$a0,64($bp)		# in2_z
1080e1051a39Sopenharmony_ci	ld	$a1,72($bp)
1081e1051a39Sopenharmony_ci	ld	$a2,80($bp)
1082e1051a39Sopenharmony_ci	ld	$a3,88($bp)
1083e1051a39Sopenharmony_ci	 mr	$rp_real,$rp
1084e1051a39Sopenharmony_ci	 mr	$ap_real,$ap
1085e1051a39Sopenharmony_ci	 mr	$bp_real,$bp
1086e1051a39Sopenharmony_ci	or	$t0,$a0,$a1
1087e1051a39Sopenharmony_ci	or	$t2,$a2,$a3
1088e1051a39Sopenharmony_ci	or	$in2infty,$t0,$t2
1089e1051a39Sopenharmony_ci	neg	$t0,$in2infty
1090e1051a39Sopenharmony_ci	or	$in2infty,$in2infty,$t0
1091e1051a39Sopenharmony_ci	sradi	$in2infty,$in2infty,63	# !in2infty
1092e1051a39Sopenharmony_ci	addi	$rp,$sp,$Z2sqr
1093e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);
1094e1051a39Sopenharmony_ci
1095e1051a39Sopenharmony_ci	ld	$a0,64($ap_real)	# in1_z
1096e1051a39Sopenharmony_ci	ld	$a1,72($ap_real)
1097e1051a39Sopenharmony_ci	ld	$a2,80($ap_real)
1098e1051a39Sopenharmony_ci	ld	$a3,88($ap_real)
1099e1051a39Sopenharmony_ci	or	$t0,$a0,$a1
1100e1051a39Sopenharmony_ci	or	$t2,$a2,$a3
1101e1051a39Sopenharmony_ci	or	$in1infty,$t0,$t2
1102e1051a39Sopenharmony_ci	neg	$t0,$in1infty
1103e1051a39Sopenharmony_ci	or	$in1infty,$in1infty,$t0
1104e1051a39Sopenharmony_ci	sradi	$in1infty,$in1infty,63	# !in1infty
1105e1051a39Sopenharmony_ci	addi	$rp,$sp,$Z1sqr
1106e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);
1107e1051a39Sopenharmony_ci
1108e1051a39Sopenharmony_ci	ld	$bi,64($bp_real)
1109e1051a39Sopenharmony_ci	ld	$a0,$Z2sqr+0($sp)
1110e1051a39Sopenharmony_ci	ld	$a1,$Z2sqr+8($sp)
1111e1051a39Sopenharmony_ci	ld	$a2,$Z2sqr+16($sp)
1112e1051a39Sopenharmony_ci	ld	$a3,$Z2sqr+24($sp)
1113e1051a39Sopenharmony_ci	addi	$bp,$bp_real,64
1114e1051a39Sopenharmony_ci	addi	$rp,$sp,$S1
1115e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);
1116e1051a39Sopenharmony_ci
1117e1051a39Sopenharmony_ci	ld	$bi,64($ap_real)
1118e1051a39Sopenharmony_ci	ld	$a0,$Z1sqr+0($sp)
1119e1051a39Sopenharmony_ci	ld	$a1,$Z1sqr+8($sp)
1120e1051a39Sopenharmony_ci	ld	$a2,$Z1sqr+16($sp)
1121e1051a39Sopenharmony_ci	ld	$a3,$Z1sqr+24($sp)
1122e1051a39Sopenharmony_ci	addi	$bp,$ap_real,64
1123e1051a39Sopenharmony_ci	addi	$rp,$sp,$S2
1124e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);
1125e1051a39Sopenharmony_ci
1126e1051a39Sopenharmony_ci	ld	$bi,32($ap_real)
1127e1051a39Sopenharmony_ci	ld	$a0,$S1+0($sp)
1128e1051a39Sopenharmony_ci	ld	$a1,$S1+8($sp)
1129e1051a39Sopenharmony_ci	ld	$a2,$S1+16($sp)
1130e1051a39Sopenharmony_ci	ld	$a3,$S1+24($sp)
1131e1051a39Sopenharmony_ci	addi	$bp,$ap_real,32
1132e1051a39Sopenharmony_ci	addi	$rp,$sp,$S1
1133e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);
1134e1051a39Sopenharmony_ci
1135e1051a39Sopenharmony_ci	ld	$bi,32($bp_real)
1136e1051a39Sopenharmony_ci	ld	$a0,$S2+0($sp)
1137e1051a39Sopenharmony_ci	ld	$a1,$S2+8($sp)
1138e1051a39Sopenharmony_ci	ld	$a2,$S2+16($sp)
1139e1051a39Sopenharmony_ci	ld	$a3,$S2+24($sp)
1140e1051a39Sopenharmony_ci	addi	$bp,$bp_real,32
1141e1051a39Sopenharmony_ci	addi	$rp,$sp,$S2
1142e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);
1143e1051a39Sopenharmony_ci
1144e1051a39Sopenharmony_ci	addi	$bp,$sp,$S1
1145e1051a39Sopenharmony_ci	 ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
1146e1051a39Sopenharmony_ci	 ld	$a0,0($ap_real)
1147e1051a39Sopenharmony_ci	 ld	$a1,8($ap_real)
1148e1051a39Sopenharmony_ci	 ld	$a2,16($ap_real)
1149e1051a39Sopenharmony_ci	 ld	$a3,24($ap_real)
1150e1051a39Sopenharmony_ci	addi	$rp,$sp,$R
1151e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);
1152e1051a39Sopenharmony_ci
1153e1051a39Sopenharmony_ci	or	$acc0,$acc0,$acc1	# see if result is zero
1154e1051a39Sopenharmony_ci	or	$acc2,$acc2,$acc3
1155e1051a39Sopenharmony_ci	or	$temp,$acc0,$acc2
1156e1051a39Sopenharmony_ci
1157e1051a39Sopenharmony_ci	addi	$bp,$sp,$Z2sqr
1158e1051a39Sopenharmony_ci	addi	$rp,$sp,$U1
1159e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);
1160e1051a39Sopenharmony_ci
1161e1051a39Sopenharmony_ci	ld	$bi,$Z1sqr($sp)
1162e1051a39Sopenharmony_ci	ld	$a0,0($bp_real)
1163e1051a39Sopenharmony_ci	ld	$a1,8($bp_real)
1164e1051a39Sopenharmony_ci	ld	$a2,16($bp_real)
1165e1051a39Sopenharmony_ci	ld	$a3,24($bp_real)
1166e1051a39Sopenharmony_ci	addi	$bp,$sp,$Z1sqr
1167e1051a39Sopenharmony_ci	addi	$rp,$sp,$U2
1168e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);
1169e1051a39Sopenharmony_ci
1170e1051a39Sopenharmony_ci	addi	$bp,$sp,$U1
1171e1051a39Sopenharmony_ci	 ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
1172e1051a39Sopenharmony_ci	 ld	$a1,$R+8($sp)
1173e1051a39Sopenharmony_ci	 ld	$a2,$R+16($sp)
1174e1051a39Sopenharmony_ci	 ld	$a3,$R+24($sp)
1175e1051a39Sopenharmony_ci	addi	$rp,$sp,$H
1176e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);
1177e1051a39Sopenharmony_ci
1178e1051a39Sopenharmony_ci	or	$acc0,$acc0,$acc1	# see if result is zero
1179e1051a39Sopenharmony_ci	or	$acc2,$acc2,$acc3
1180e1051a39Sopenharmony_ci	or.	$acc0,$acc0,$acc2
1181e1051a39Sopenharmony_ci	bne	.Ladd_proceed		# is_equal(U1,U2)?
1182e1051a39Sopenharmony_ci
1183e1051a39Sopenharmony_ci	and.	$t0,$in1infty,$in2infty
1184e1051a39Sopenharmony_ci	beq	.Ladd_proceed		# (in1infty || in2infty)?
1185e1051a39Sopenharmony_ci
1186e1051a39Sopenharmony_ci	cmpldi	$temp,0
1187e1051a39Sopenharmony_ci	beq	.Ladd_double		# is_equal(S1,S2)?
1188e1051a39Sopenharmony_ci
1189e1051a39Sopenharmony_ci	xor	$a0,$a0,$a0
1190e1051a39Sopenharmony_ci	std	$a0,0($rp_real)
1191e1051a39Sopenharmony_ci	std	$a0,8($rp_real)
1192e1051a39Sopenharmony_ci	std	$a0,16($rp_real)
1193e1051a39Sopenharmony_ci	std	$a0,24($rp_real)
1194e1051a39Sopenharmony_ci	std	$a0,32($rp_real)
1195e1051a39Sopenharmony_ci	std	$a0,40($rp_real)
1196e1051a39Sopenharmony_ci	std	$a0,48($rp_real)
1197e1051a39Sopenharmony_ci	std	$a0,56($rp_real)
1198e1051a39Sopenharmony_ci	std	$a0,64($rp_real)
1199e1051a39Sopenharmony_ci	std	$a0,72($rp_real)
1200e1051a39Sopenharmony_ci	std	$a0,80($rp_real)
1201e1051a39Sopenharmony_ci	std	$a0,88($rp_real)
1202e1051a39Sopenharmony_ci	b	.Ladd_done
1203e1051a39Sopenharmony_ci
1204e1051a39Sopenharmony_ci.align	4
1205e1051a39Sopenharmony_ci.Ladd_double:
1206e1051a39Sopenharmony_ci	ld	$bp,0($sp)		# back-link
1207e1051a39Sopenharmony_ci	mr	$ap,$ap_real
1208e1051a39Sopenharmony_ci	mr	$rp,$rp_real
1209e1051a39Sopenharmony_ci	ld	r16,$FRAME-8*16($sp)
1210e1051a39Sopenharmony_ci	ld	r17,$FRAME-8*15($sp)
1211e1051a39Sopenharmony_ci	ld	r18,$FRAME-8*14($sp)
1212e1051a39Sopenharmony_ci	ld	r19,$FRAME-8*13($sp)
1213e1051a39Sopenharmony_ci	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
1214e1051a39Sopenharmony_ci	b	.Ldouble_shortcut
1215e1051a39Sopenharmony_ci
1216e1051a39Sopenharmony_ci.align	4
1217e1051a39Sopenharmony_ci.Ladd_proceed:
1218e1051a39Sopenharmony_ci	addi	$rp,$sp,$Rsqr
1219e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);
1220e1051a39Sopenharmony_ci
1221e1051a39Sopenharmony_ci	ld	$bi,64($ap_real)
1222e1051a39Sopenharmony_ci	ld	$a0,$H+0($sp)
1223e1051a39Sopenharmony_ci	ld	$a1,$H+8($sp)
1224e1051a39Sopenharmony_ci	ld	$a2,$H+16($sp)
1225e1051a39Sopenharmony_ci	ld	$a3,$H+24($sp)
1226e1051a39Sopenharmony_ci	addi	$bp,$ap_real,64
1227e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_z
1228e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);
1229e1051a39Sopenharmony_ci
1230e1051a39Sopenharmony_ci	ld	$a0,$H+0($sp)
1231e1051a39Sopenharmony_ci	ld	$a1,$H+8($sp)
1232e1051a39Sopenharmony_ci	ld	$a2,$H+16($sp)
1233e1051a39Sopenharmony_ci	ld	$a3,$H+24($sp)
1234e1051a39Sopenharmony_ci	addi	$rp,$sp,$Hsqr
1235e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);
1236e1051a39Sopenharmony_ci
1237e1051a39Sopenharmony_ci	ld	$bi,64($bp_real)
1238e1051a39Sopenharmony_ci	ld	$a0,$res_z+0($sp)
1239e1051a39Sopenharmony_ci	ld	$a1,$res_z+8($sp)
1240e1051a39Sopenharmony_ci	ld	$a2,$res_z+16($sp)
1241e1051a39Sopenharmony_ci	ld	$a3,$res_z+24($sp)
1242e1051a39Sopenharmony_ci	addi	$bp,$bp_real,64
1243e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_z
1244e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);
1245e1051a39Sopenharmony_ci
1246e1051a39Sopenharmony_ci	ld	$bi,$H($sp)
1247e1051a39Sopenharmony_ci	ld	$a0,$Hsqr+0($sp)
1248e1051a39Sopenharmony_ci	ld	$a1,$Hsqr+8($sp)
1249e1051a39Sopenharmony_ci	ld	$a2,$Hsqr+16($sp)
1250e1051a39Sopenharmony_ci	ld	$a3,$Hsqr+24($sp)
1251e1051a39Sopenharmony_ci	addi	$bp,$sp,$H
1252e1051a39Sopenharmony_ci	addi	$rp,$sp,$Hcub
1253e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);
1254e1051a39Sopenharmony_ci
1255e1051a39Sopenharmony_ci	ld	$bi,$Hsqr($sp)
1256e1051a39Sopenharmony_ci	ld	$a0,$U1+0($sp)
1257e1051a39Sopenharmony_ci	ld	$a1,$U1+8($sp)
1258e1051a39Sopenharmony_ci	ld	$a2,$U1+16($sp)
1259e1051a39Sopenharmony_ci	ld	$a3,$U1+24($sp)
1260e1051a39Sopenharmony_ci	addi	$bp,$sp,$Hsqr
1261e1051a39Sopenharmony_ci	addi	$rp,$sp,$U2
1262e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);
1263e1051a39Sopenharmony_ci
1264e1051a39Sopenharmony_ci	mr	$t0,$acc0
1265e1051a39Sopenharmony_ci	mr	$t1,$acc1
1266e1051a39Sopenharmony_ci	mr	$t2,$acc2
1267e1051a39Sopenharmony_ci	mr	$t3,$acc3
1268e1051a39Sopenharmony_ci	addi	$rp,$sp,$Hsqr
1269e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);
1270e1051a39Sopenharmony_ci
1271e1051a39Sopenharmony_ci	addi	$bp,$sp,$Rsqr
1272e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_x
1273e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);
1274e1051a39Sopenharmony_ci
1275e1051a39Sopenharmony_ci	addi	$bp,$sp,$Hcub
1276e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);
1277e1051a39Sopenharmony_ci
1278e1051a39Sopenharmony_ci	addi	$bp,$sp,$U2
1279e1051a39Sopenharmony_ci	 ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
1280e1051a39Sopenharmony_ci	 ld	$a0,$S1+0($sp)
1281e1051a39Sopenharmony_ci	 ld	$a1,$S1+8($sp)
1282e1051a39Sopenharmony_ci	 ld	$a2,$S1+16($sp)
1283e1051a39Sopenharmony_ci	 ld	$a3,$S1+24($sp)
1284e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_y
1285e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);
1286e1051a39Sopenharmony_ci
1287e1051a39Sopenharmony_ci	addi	$bp,$sp,$Hcub
1288e1051a39Sopenharmony_ci	addi	$rp,$sp,$S2
1289e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);
1290e1051a39Sopenharmony_ci
1291e1051a39Sopenharmony_ci	ld	$bi,$R($sp)
1292e1051a39Sopenharmony_ci	ld	$a0,$res_y+0($sp)
1293e1051a39Sopenharmony_ci	ld	$a1,$res_y+8($sp)
1294e1051a39Sopenharmony_ci	ld	$a2,$res_y+16($sp)
1295e1051a39Sopenharmony_ci	ld	$a3,$res_y+24($sp)
1296e1051a39Sopenharmony_ci	addi	$bp,$sp,$R
1297e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_y
1298e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);
1299e1051a39Sopenharmony_ci
1300e1051a39Sopenharmony_ci	addi	$bp,$sp,$S2
1301e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);
1302e1051a39Sopenharmony_ci
1303e1051a39Sopenharmony_ci	ld	$t0,0($bp_real)		# in2
1304e1051a39Sopenharmony_ci	ld	$t1,8($bp_real)
1305e1051a39Sopenharmony_ci	ld	$t2,16($bp_real)
1306e1051a39Sopenharmony_ci	ld	$t3,24($bp_real)
1307e1051a39Sopenharmony_ci	ld	$a0,$res_x+0($sp)	# res
1308e1051a39Sopenharmony_ci	ld	$a1,$res_x+8($sp)
1309e1051a39Sopenharmony_ci	ld	$a2,$res_x+16($sp)
1310e1051a39Sopenharmony_ci	ld	$a3,$res_x+24($sp)
1311e1051a39Sopenharmony_ci___
1312e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=32) {		# conditional moves
1313e1051a39Sopenharmony_ci$code.=<<___;
1314e1051a39Sopenharmony_ci	ld	$acc0,$i+0($ap_real)	# in1
1315e1051a39Sopenharmony_ci	ld	$acc1,$i+8($ap_real)
1316e1051a39Sopenharmony_ci	ld	$acc2,$i+16($ap_real)
1317e1051a39Sopenharmony_ci	ld	$acc3,$i+24($ap_real)
1318e1051a39Sopenharmony_ci	andc	$t0,$t0,$in1infty
1319e1051a39Sopenharmony_ci	andc	$t1,$t1,$in1infty
1320e1051a39Sopenharmony_ci	andc	$t2,$t2,$in1infty
1321e1051a39Sopenharmony_ci	andc	$t3,$t3,$in1infty
1322e1051a39Sopenharmony_ci	and	$a0,$a0,$in1infty
1323e1051a39Sopenharmony_ci	and	$a1,$a1,$in1infty
1324e1051a39Sopenharmony_ci	and	$a2,$a2,$in1infty
1325e1051a39Sopenharmony_ci	and	$a3,$a3,$in1infty
1326e1051a39Sopenharmony_ci	or	$t0,$t0,$a0
1327e1051a39Sopenharmony_ci	or	$t1,$t1,$a1
1328e1051a39Sopenharmony_ci	or	$t2,$t2,$a2
1329e1051a39Sopenharmony_ci	or	$t3,$t3,$a3
1330e1051a39Sopenharmony_ci	andc	$acc0,$acc0,$in2infty
1331e1051a39Sopenharmony_ci	andc	$acc1,$acc1,$in2infty
1332e1051a39Sopenharmony_ci	andc	$acc2,$acc2,$in2infty
1333e1051a39Sopenharmony_ci	andc	$acc3,$acc3,$in2infty
1334e1051a39Sopenharmony_ci	and	$t0,$t0,$in2infty
1335e1051a39Sopenharmony_ci	and	$t1,$t1,$in2infty
1336e1051a39Sopenharmony_ci	and	$t2,$t2,$in2infty
1337e1051a39Sopenharmony_ci	and	$t3,$t3,$in2infty
1338e1051a39Sopenharmony_ci	or	$acc0,$acc0,$t0
1339e1051a39Sopenharmony_ci	or	$acc1,$acc1,$t1
1340e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t2
1341e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t3
1342e1051a39Sopenharmony_ci
1343e1051a39Sopenharmony_ci	ld	$t0,$i+32($bp_real)	# in2
1344e1051a39Sopenharmony_ci	ld	$t1,$i+40($bp_real)
1345e1051a39Sopenharmony_ci	ld	$t2,$i+48($bp_real)
1346e1051a39Sopenharmony_ci	ld	$t3,$i+56($bp_real)
1347e1051a39Sopenharmony_ci	ld	$a0,$res_x+$i+32($sp)
1348e1051a39Sopenharmony_ci	ld	$a1,$res_x+$i+40($sp)
1349e1051a39Sopenharmony_ci	ld	$a2,$res_x+$i+48($sp)
1350e1051a39Sopenharmony_ci	ld	$a3,$res_x+$i+56($sp)
1351e1051a39Sopenharmony_ci	std	$acc0,$i+0($rp_real)
1352e1051a39Sopenharmony_ci	std	$acc1,$i+8($rp_real)
1353e1051a39Sopenharmony_ci	std	$acc2,$i+16($rp_real)
1354e1051a39Sopenharmony_ci	std	$acc3,$i+24($rp_real)
1355e1051a39Sopenharmony_ci___
1356e1051a39Sopenharmony_ci}
1357e1051a39Sopenharmony_ci$code.=<<___;
1358e1051a39Sopenharmony_ci	ld	$acc0,$i+0($ap_real)	# in1
1359e1051a39Sopenharmony_ci	ld	$acc1,$i+8($ap_real)
1360e1051a39Sopenharmony_ci	ld	$acc2,$i+16($ap_real)
1361e1051a39Sopenharmony_ci	ld	$acc3,$i+24($ap_real)
1362e1051a39Sopenharmony_ci	andc	$t0,$t0,$in1infty
1363e1051a39Sopenharmony_ci	andc	$t1,$t1,$in1infty
1364e1051a39Sopenharmony_ci	andc	$t2,$t2,$in1infty
1365e1051a39Sopenharmony_ci	andc	$t3,$t3,$in1infty
1366e1051a39Sopenharmony_ci	and	$a0,$a0,$in1infty
1367e1051a39Sopenharmony_ci	and	$a1,$a1,$in1infty
1368e1051a39Sopenharmony_ci	and	$a2,$a2,$in1infty
1369e1051a39Sopenharmony_ci	and	$a3,$a3,$in1infty
1370e1051a39Sopenharmony_ci	or	$t0,$t0,$a0
1371e1051a39Sopenharmony_ci	or	$t1,$t1,$a1
1372e1051a39Sopenharmony_ci	or	$t2,$t2,$a2
1373e1051a39Sopenharmony_ci	or	$t3,$t3,$a3
1374e1051a39Sopenharmony_ci	andc	$acc0,$acc0,$in2infty
1375e1051a39Sopenharmony_ci	andc	$acc1,$acc1,$in2infty
1376e1051a39Sopenharmony_ci	andc	$acc2,$acc2,$in2infty
1377e1051a39Sopenharmony_ci	andc	$acc3,$acc3,$in2infty
1378e1051a39Sopenharmony_ci	and	$t0,$t0,$in2infty
1379e1051a39Sopenharmony_ci	and	$t1,$t1,$in2infty
1380e1051a39Sopenharmony_ci	and	$t2,$t2,$in2infty
1381e1051a39Sopenharmony_ci	and	$t3,$t3,$in2infty
1382e1051a39Sopenharmony_ci	or	$acc0,$acc0,$t0
1383e1051a39Sopenharmony_ci	or	$acc1,$acc1,$t1
1384e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t2
1385e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t3
1386e1051a39Sopenharmony_ci	std	$acc0,$i+0($rp_real)
1387e1051a39Sopenharmony_ci	std	$acc1,$i+8($rp_real)
1388e1051a39Sopenharmony_ci	std	$acc2,$i+16($rp_real)
1389e1051a39Sopenharmony_ci	std	$acc3,$i+24($rp_real)
1390e1051a39Sopenharmony_ci
1391e1051a39Sopenharmony_ci.Ladd_done:
1392e1051a39Sopenharmony_ci	mtlr	r0
1393e1051a39Sopenharmony_ci	ld	r16,$FRAME-8*16($sp)
1394e1051a39Sopenharmony_ci	ld	r17,$FRAME-8*15($sp)
1395e1051a39Sopenharmony_ci	ld	r18,$FRAME-8*14($sp)
1396e1051a39Sopenharmony_ci	ld	r19,$FRAME-8*13($sp)
1397e1051a39Sopenharmony_ci	ld	r20,$FRAME-8*12($sp)
1398e1051a39Sopenharmony_ci	ld	r21,$FRAME-8*11($sp)
1399e1051a39Sopenharmony_ci	ld	r22,$FRAME-8*10($sp)
1400e1051a39Sopenharmony_ci	ld	r23,$FRAME-8*9($sp)
1401e1051a39Sopenharmony_ci	ld	r24,$FRAME-8*8($sp)
1402e1051a39Sopenharmony_ci	ld	r25,$FRAME-8*7($sp)
1403e1051a39Sopenharmony_ci	ld	r26,$FRAME-8*6($sp)
1404e1051a39Sopenharmony_ci	ld	r27,$FRAME-8*5($sp)
1405e1051a39Sopenharmony_ci	ld	r28,$FRAME-8*4($sp)
1406e1051a39Sopenharmony_ci	ld	r29,$FRAME-8*3($sp)
1407e1051a39Sopenharmony_ci	ld	r30,$FRAME-8*2($sp)
1408e1051a39Sopenharmony_ci	ld	r31,$FRAME-8*1($sp)
1409e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME
1410e1051a39Sopenharmony_ci	blr
1411e1051a39Sopenharmony_ci	.long	0
1412e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,16,3,0
1413e1051a39Sopenharmony_ci	.long	0
1414e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1415e1051a39Sopenharmony_ci___
1416e1051a39Sopenharmony_ci}
1417e1051a39Sopenharmony_ci
1418e1051a39Sopenharmony_ci########################################################################
1419e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1420e1051a39Sopenharmony_ci#				     const P256_POINT_AFFINE *in2);
1421e1051a39Sopenharmony_ciif (1) {
# Generator block for ecp_nistz256_point_add_affine (prototype in the
# comment just above this block).  Everything between the heredoc markers
# is assembly text appended to $code; Perl only interpolates register
# names and offsets there, so sums such as "$res_x+$i+32" reach the output
# as literal expressions (e.g. 64+0+32) instead of being folded here.
#
# Frame layout: 64 bytes at the bottom that this block never touches
# (presumably the ABI linkage area -- TODO confirm), ten 32-byte
# temporaries, and 16*8 bytes at the top where r16-r31 are saved on entry
# and reloaded before returning.
1422e1051a39Sopenharmony_cimy $FRAME = 64 + 32*10 + 16*8;
1423e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
1424e1051a39Sopenharmony_ci    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
1425e1051a39Sopenharmony_cimy $Z1sqr = $S2;
1426e1051a39Sopenharmony_ci# above map() describes stack layout with 10 temporary
1427e1051a39Sopenharmony_ci# 256-bit vectors on top.
# $Z1sqr shares the $S2 slot: Z1sqr is last read as the input of
# p256_mul_mont(S2, Z1sqr, in1_z), whose result is written to that slot.
# $res_x, $res_y and $res_z are consecutive slots, which the
# conditional-move code relies on when it addresses them as $res_x+$i+32.
#
# r16-r18 keep the caller's out/in1/in2 pointers (copied from $rp/$ap/$bp
# on entry) because $rp and $bp are re-pointed before the helper calls.
# $temp (r21) is not referenced anywhere in this block.
#
# $in1infty is built from the OR of the four in1_z words and $in2infty from
# the OR of all eight in2_x/in2_y words; the neg/or/sradi-by-63 sequence
# turns any non-zero value into all-ones and leaves zero as zero.  Despite
# the names, a set mask therefore means "NOT infinity" (the inline
# comments tag them !in1infty / !in2infty).
#
# The arithmetic in the middle follows the p256_* pseudo-code given in the
# inline comments.  Lines tagged "forward load" fetch operands for the next
# p256_mul_mont/p256_sqr_mont ahead of the intervening p256_sub call, and
# the "mr" sequences pick a helper's result up from $acc0-$acc3 when the
# next operation wants it in registers.
#
# Final selection, done with and/andc/or on every 64-bit word (no
# branches):
#	tmp = $in1infty ? res : in2
#	out = $in2infty ? tmp : in1
# i.e. in1 is copied out when in2 is all-zero, otherwise in2 when in1_z is
# zero, otherwise the computed sum.  in2 is affine and has no z, so for the
# z coordinate the constants 1, ~$poly1, -1, ~$poly3 stand in for it (the
# inline comment calls them Lone_mont; presumably one in Montgomery form
# -- TODO confirm).  The for loop runs with $i = 0 and 32, emitting
# select+store for x and y plus the loads for the following coordinate;
# the heredoc after the loop uses the leftover $i == 64 to finish z.
#
# NOTE(review): the return address stays in r0 from mflr until mtlr and is
# never spilled in this block, so the called __ecp_nistz256_* helpers must
# leave r0 intact; that cannot be checked from this block alone.
1428e1051a39Sopenharmony_cimy ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1429e1051a39Sopenharmony_ci
1430e1051a39Sopenharmony_ci$code.=<<___;
1431e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add_affine
1432e1051a39Sopenharmony_ci.align	5
1433e1051a39Sopenharmony_ciecp_nistz256_point_add_affine:
1434e1051a39Sopenharmony_ci	stdu	$sp,-$FRAME($sp)
1435e1051a39Sopenharmony_ci	mflr	r0
1436e1051a39Sopenharmony_ci	std	r16,$FRAME-8*16($sp)
1437e1051a39Sopenharmony_ci	std	r17,$FRAME-8*15($sp)
1438e1051a39Sopenharmony_ci	std	r18,$FRAME-8*14($sp)
1439e1051a39Sopenharmony_ci	std	r19,$FRAME-8*13($sp)
1440e1051a39Sopenharmony_ci	std	r20,$FRAME-8*12($sp)
1441e1051a39Sopenharmony_ci	std	r21,$FRAME-8*11($sp)
1442e1051a39Sopenharmony_ci	std	r22,$FRAME-8*10($sp)
1443e1051a39Sopenharmony_ci	std	r23,$FRAME-8*9($sp)
1444e1051a39Sopenharmony_ci	std	r24,$FRAME-8*8($sp)
1445e1051a39Sopenharmony_ci	std	r25,$FRAME-8*7($sp)
1446e1051a39Sopenharmony_ci	std	r26,$FRAME-8*6($sp)
1447e1051a39Sopenharmony_ci	std	r27,$FRAME-8*5($sp)
1448e1051a39Sopenharmony_ci	std	r28,$FRAME-8*4($sp)
1449e1051a39Sopenharmony_ci	std	r29,$FRAME-8*3($sp)
1450e1051a39Sopenharmony_ci	std	r30,$FRAME-8*2($sp)
1451e1051a39Sopenharmony_ci	std	r31,$FRAME-8*1($sp)
1452e1051a39Sopenharmony_ci
1453e1051a39Sopenharmony_ci	li	$poly1,-1
1454e1051a39Sopenharmony_ci	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
1455e1051a39Sopenharmony_ci	li	$poly3,1
1456e1051a39Sopenharmony_ci	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
1457e1051a39Sopenharmony_ci
1458e1051a39Sopenharmony_ci	mr	$rp_real,$rp
1459e1051a39Sopenharmony_ci	mr	$ap_real,$ap
1460e1051a39Sopenharmony_ci	mr	$bp_real,$bp
1461e1051a39Sopenharmony_ci
1462e1051a39Sopenharmony_ci	ld	$a0,64($ap)		# in1_z
1463e1051a39Sopenharmony_ci	ld	$a1,72($ap)
1464e1051a39Sopenharmony_ci	ld	$a2,80($ap)
1465e1051a39Sopenharmony_ci	ld	$a3,88($ap)
1466e1051a39Sopenharmony_ci	or	$t0,$a0,$a1
1467e1051a39Sopenharmony_ci	or	$t2,$a2,$a3
1468e1051a39Sopenharmony_ci	or	$in1infty,$t0,$t2
1469e1051a39Sopenharmony_ci	neg	$t0,$in1infty
1470e1051a39Sopenharmony_ci	or	$in1infty,$in1infty,$t0
1471e1051a39Sopenharmony_ci	sradi	$in1infty,$in1infty,63	# !in1infty
1472e1051a39Sopenharmony_ci
1473e1051a39Sopenharmony_ci	ld	$acc0,0($bp)		# in2_x
1474e1051a39Sopenharmony_ci	ld	$acc1,8($bp)
1475e1051a39Sopenharmony_ci	ld	$acc2,16($bp)
1476e1051a39Sopenharmony_ci	ld	$acc3,24($bp)
1477e1051a39Sopenharmony_ci	ld	$t0,32($bp)		# in2_y
1478e1051a39Sopenharmony_ci	ld	$t1,40($bp)
1479e1051a39Sopenharmony_ci	ld	$t2,48($bp)
1480e1051a39Sopenharmony_ci	ld	$t3,56($bp)
1481e1051a39Sopenharmony_ci	or	$acc0,$acc0,$acc1
1482e1051a39Sopenharmony_ci	or	$acc2,$acc2,$acc3
1483e1051a39Sopenharmony_ci	or	$acc0,$acc0,$acc2
1484e1051a39Sopenharmony_ci	or	$t0,$t0,$t1
1485e1051a39Sopenharmony_ci	or	$t2,$t2,$t3
1486e1051a39Sopenharmony_ci	or	$t0,$t0,$t2
1487e1051a39Sopenharmony_ci	or	$in2infty,$acc0,$t0
1488e1051a39Sopenharmony_ci	neg	$t0,$in2infty
1489e1051a39Sopenharmony_ci	or	$in2infty,$in2infty,$t0
1490e1051a39Sopenharmony_ci	sradi	$in2infty,$in2infty,63	# !in2infty
1491e1051a39Sopenharmony_ci
1492e1051a39Sopenharmony_ci	addi	$rp,$sp,$Z1sqr
1493e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);
1494e1051a39Sopenharmony_ci
1495e1051a39Sopenharmony_ci	mr	$a0,$acc0
1496e1051a39Sopenharmony_ci	mr	$a1,$acc1
1497e1051a39Sopenharmony_ci	mr	$a2,$acc2
1498e1051a39Sopenharmony_ci	mr	$a3,$acc3
1499e1051a39Sopenharmony_ci	ld	$bi,0($bp_real)
1500e1051a39Sopenharmony_ci	addi	$bp,$bp_real,0
1501e1051a39Sopenharmony_ci	addi	$rp,$sp,$U2
1502e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);
1503e1051a39Sopenharmony_ci
1504e1051a39Sopenharmony_ci	addi	$bp,$ap_real,0
1505e1051a39Sopenharmony_ci	 ld	$bi,64($ap_real)	# forward load for p256_mul_mont
1506e1051a39Sopenharmony_ci	 ld	$a0,$Z1sqr+0($sp)
1507e1051a39Sopenharmony_ci	 ld	$a1,$Z1sqr+8($sp)
1508e1051a39Sopenharmony_ci	 ld	$a2,$Z1sqr+16($sp)
1509e1051a39Sopenharmony_ci	 ld	$a3,$Z1sqr+24($sp)
1510e1051a39Sopenharmony_ci	addi	$rp,$sp,$H
1511e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);
1512e1051a39Sopenharmony_ci
1513e1051a39Sopenharmony_ci	addi	$bp,$ap_real,64
1514e1051a39Sopenharmony_ci	addi	$rp,$sp,$S2
1515e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);
1516e1051a39Sopenharmony_ci
1517e1051a39Sopenharmony_ci	ld	$bi,64($ap_real)
1518e1051a39Sopenharmony_ci	ld	$a0,$H+0($sp)
1519e1051a39Sopenharmony_ci	ld	$a1,$H+8($sp)
1520e1051a39Sopenharmony_ci	ld	$a2,$H+16($sp)
1521e1051a39Sopenharmony_ci	ld	$a3,$H+24($sp)
1522e1051a39Sopenharmony_ci	addi	$bp,$ap_real,64
1523e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_z
1524e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);
1525e1051a39Sopenharmony_ci
1526e1051a39Sopenharmony_ci	ld	$bi,32($bp_real)
1527e1051a39Sopenharmony_ci	ld	$a0,$S2+0($sp)
1528e1051a39Sopenharmony_ci	ld	$a1,$S2+8($sp)
1529e1051a39Sopenharmony_ci	ld	$a2,$S2+16($sp)
1530e1051a39Sopenharmony_ci	ld	$a3,$S2+24($sp)
1531e1051a39Sopenharmony_ci	addi	$bp,$bp_real,32
1532e1051a39Sopenharmony_ci	addi	$rp,$sp,$S2
1533e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);
1534e1051a39Sopenharmony_ci
1535e1051a39Sopenharmony_ci	addi	$bp,$ap_real,32
1536e1051a39Sopenharmony_ci	 ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
1537e1051a39Sopenharmony_ci	 ld	$a1,$H+8($sp)
1538e1051a39Sopenharmony_ci	 ld	$a2,$H+16($sp)
1539e1051a39Sopenharmony_ci	 ld	$a3,$H+24($sp)
1540e1051a39Sopenharmony_ci	addi	$rp,$sp,$R
1541e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);
1542e1051a39Sopenharmony_ci
1543e1051a39Sopenharmony_ci	addi	$rp,$sp,$Hsqr
1544e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);
1545e1051a39Sopenharmony_ci
1546e1051a39Sopenharmony_ci	ld	$a0,$R+0($sp)
1547e1051a39Sopenharmony_ci	ld	$a1,$R+8($sp)
1548e1051a39Sopenharmony_ci	ld	$a2,$R+16($sp)
1549e1051a39Sopenharmony_ci	ld	$a3,$R+24($sp)
1550e1051a39Sopenharmony_ci	addi	$rp,$sp,$Rsqr
1551e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);
1552e1051a39Sopenharmony_ci
1553e1051a39Sopenharmony_ci	ld	$bi,$H($sp)
1554e1051a39Sopenharmony_ci	ld	$a0,$Hsqr+0($sp)
1555e1051a39Sopenharmony_ci	ld	$a1,$Hsqr+8($sp)
1556e1051a39Sopenharmony_ci	ld	$a2,$Hsqr+16($sp)
1557e1051a39Sopenharmony_ci	ld	$a3,$Hsqr+24($sp)
1558e1051a39Sopenharmony_ci	addi	$bp,$sp,$H
1559e1051a39Sopenharmony_ci	addi	$rp,$sp,$Hcub
1560e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);
1561e1051a39Sopenharmony_ci
1562e1051a39Sopenharmony_ci	ld	$bi,0($ap_real)
1563e1051a39Sopenharmony_ci	ld	$a0,$Hsqr+0($sp)
1564e1051a39Sopenharmony_ci	ld	$a1,$Hsqr+8($sp)
1565e1051a39Sopenharmony_ci	ld	$a2,$Hsqr+16($sp)
1566e1051a39Sopenharmony_ci	ld	$a3,$Hsqr+24($sp)
1567e1051a39Sopenharmony_ci	addi	$bp,$ap_real,0
1568e1051a39Sopenharmony_ci	addi	$rp,$sp,$U2
1569e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);
1570e1051a39Sopenharmony_ci
1571e1051a39Sopenharmony_ci	mr	$t0,$acc0
1572e1051a39Sopenharmony_ci	mr	$t1,$acc1
1573e1051a39Sopenharmony_ci	mr	$t2,$acc2
1574e1051a39Sopenharmony_ci	mr	$t3,$acc3
1575e1051a39Sopenharmony_ci	addi	$rp,$sp,$Hsqr
1576e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);
1577e1051a39Sopenharmony_ci
1578e1051a39Sopenharmony_ci	addi	$bp,$sp,$Rsqr
1579e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_x
1580e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);
1581e1051a39Sopenharmony_ci
1582e1051a39Sopenharmony_ci	addi	$bp,$sp,$Hcub
1583e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	#  p256_sub(res_x, res_x, Hcub);
1584e1051a39Sopenharmony_ci
1585e1051a39Sopenharmony_ci	addi	$bp,$sp,$U2
1586e1051a39Sopenharmony_ci	 ld	$bi,32($ap_real)	# forward load for p256_mul_mont
1587e1051a39Sopenharmony_ci	 ld	$a0,$Hcub+0($sp)
1588e1051a39Sopenharmony_ci	 ld	$a1,$Hcub+8($sp)
1589e1051a39Sopenharmony_ci	 ld	$a2,$Hcub+16($sp)
1590e1051a39Sopenharmony_ci	 ld	$a3,$Hcub+24($sp)
1591e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_y
1592e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);
1593e1051a39Sopenharmony_ci
1594e1051a39Sopenharmony_ci	addi	$bp,$ap_real,32
1595e1051a39Sopenharmony_ci	addi	$rp,$sp,$S2
1596e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);
1597e1051a39Sopenharmony_ci
1598e1051a39Sopenharmony_ci	ld	$bi,$R($sp)
1599e1051a39Sopenharmony_ci	ld	$a0,$res_y+0($sp)
1600e1051a39Sopenharmony_ci	ld	$a1,$res_y+8($sp)
1601e1051a39Sopenharmony_ci	ld	$a2,$res_y+16($sp)
1602e1051a39Sopenharmony_ci	ld	$a3,$res_y+24($sp)
1603e1051a39Sopenharmony_ci	addi	$bp,$sp,$R
1604e1051a39Sopenharmony_ci	addi	$rp,$sp,$res_y
1605e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);
1606e1051a39Sopenharmony_ci
1607e1051a39Sopenharmony_ci	addi	$bp,$sp,$S2
1608e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);
1609e1051a39Sopenharmony_ci
1610e1051a39Sopenharmony_ci	ld	$t0,0($bp_real)		# in2
1611e1051a39Sopenharmony_ci	ld	$t1,8($bp_real)
1612e1051a39Sopenharmony_ci	ld	$t2,16($bp_real)
1613e1051a39Sopenharmony_ci	ld	$t3,24($bp_real)
1614e1051a39Sopenharmony_ci	ld	$a0,$res_x+0($sp)	# res
1615e1051a39Sopenharmony_ci	ld	$a1,$res_x+8($sp)
1616e1051a39Sopenharmony_ci	ld	$a2,$res_x+16($sp)
1617e1051a39Sopenharmony_ci	ld	$a3,$res_x+24($sp)
1618e1051a39Sopenharmony_ci___
1619e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=32) {		# conditional moves
1620e1051a39Sopenharmony_ci$code.=<<___;
1621e1051a39Sopenharmony_ci	ld	$acc0,$i+0($ap_real)	# in1
1622e1051a39Sopenharmony_ci	ld	$acc1,$i+8($ap_real)
1623e1051a39Sopenharmony_ci	ld	$acc2,$i+16($ap_real)
1624e1051a39Sopenharmony_ci	ld	$acc3,$i+24($ap_real)
1625e1051a39Sopenharmony_ci	andc	$t0,$t0,$in1infty
1626e1051a39Sopenharmony_ci	andc	$t1,$t1,$in1infty
1627e1051a39Sopenharmony_ci	andc	$t2,$t2,$in1infty
1628e1051a39Sopenharmony_ci	andc	$t3,$t3,$in1infty
1629e1051a39Sopenharmony_ci	and	$a0,$a0,$in1infty
1630e1051a39Sopenharmony_ci	and	$a1,$a1,$in1infty
1631e1051a39Sopenharmony_ci	and	$a2,$a2,$in1infty
1632e1051a39Sopenharmony_ci	and	$a3,$a3,$in1infty
1633e1051a39Sopenharmony_ci	or	$t0,$t0,$a0
1634e1051a39Sopenharmony_ci	or	$t1,$t1,$a1
1635e1051a39Sopenharmony_ci	or	$t2,$t2,$a2
1636e1051a39Sopenharmony_ci	or	$t3,$t3,$a3
1637e1051a39Sopenharmony_ci	andc	$acc0,$acc0,$in2infty
1638e1051a39Sopenharmony_ci	andc	$acc1,$acc1,$in2infty
1639e1051a39Sopenharmony_ci	andc	$acc2,$acc2,$in2infty
1640e1051a39Sopenharmony_ci	andc	$acc3,$acc3,$in2infty
1641e1051a39Sopenharmony_ci	and	$t0,$t0,$in2infty
1642e1051a39Sopenharmony_ci	and	$t1,$t1,$in2infty
1643e1051a39Sopenharmony_ci	and	$t2,$t2,$in2infty
1644e1051a39Sopenharmony_ci	and	$t3,$t3,$in2infty
1645e1051a39Sopenharmony_ci	or	$acc0,$acc0,$t0
1646e1051a39Sopenharmony_ci	or	$acc1,$acc1,$t1
1647e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t2
1648e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t3
1649e1051a39Sopenharmony_ci___
1650e1051a39Sopenharmony_ci$code.=<<___	if ($i==0);
1651e1051a39Sopenharmony_ci	ld	$t0,32($bp_real)	# in2
1652e1051a39Sopenharmony_ci	ld	$t1,40($bp_real)
1653e1051a39Sopenharmony_ci	ld	$t2,48($bp_real)
1654e1051a39Sopenharmony_ci	ld	$t3,56($bp_real)
1655e1051a39Sopenharmony_ci___
1656e1051a39Sopenharmony_ci$code.=<<___	if ($i==32);
1657e1051a39Sopenharmony_ci	li	$t0,1			# Lone_mont
1658e1051a39Sopenharmony_ci	not	$t1,$poly1
1659e1051a39Sopenharmony_ci	li	$t2,-1
1660e1051a39Sopenharmony_ci	not	$t3,$poly3
1661e1051a39Sopenharmony_ci___
1662e1051a39Sopenharmony_ci$code.=<<___;
1663e1051a39Sopenharmony_ci	ld	$a0,$res_x+$i+32($sp)
1664e1051a39Sopenharmony_ci	ld	$a1,$res_x+$i+40($sp)
1665e1051a39Sopenharmony_ci	ld	$a2,$res_x+$i+48($sp)
1666e1051a39Sopenharmony_ci	ld	$a3,$res_x+$i+56($sp)
1667e1051a39Sopenharmony_ci	std	$acc0,$i+0($rp_real)
1668e1051a39Sopenharmony_ci	std	$acc1,$i+8($rp_real)
1669e1051a39Sopenharmony_ci	std	$acc2,$i+16($rp_real)
1670e1051a39Sopenharmony_ci	std	$acc3,$i+24($rp_real)
1671e1051a39Sopenharmony_ci___
1672e1051a39Sopenharmony_ci}
1673e1051a39Sopenharmony_ci$code.=<<___;
1674e1051a39Sopenharmony_ci	ld	$acc0,$i+0($ap_real)	# in1
1675e1051a39Sopenharmony_ci	ld	$acc1,$i+8($ap_real)
1676e1051a39Sopenharmony_ci	ld	$acc2,$i+16($ap_real)
1677e1051a39Sopenharmony_ci	ld	$acc3,$i+24($ap_real)
1678e1051a39Sopenharmony_ci	andc	$t0,$t0,$in1infty
1679e1051a39Sopenharmony_ci	andc	$t1,$t1,$in1infty
1680e1051a39Sopenharmony_ci	andc	$t2,$t2,$in1infty
1681e1051a39Sopenharmony_ci	andc	$t3,$t3,$in1infty
1682e1051a39Sopenharmony_ci	and	$a0,$a0,$in1infty
1683e1051a39Sopenharmony_ci	and	$a1,$a1,$in1infty
1684e1051a39Sopenharmony_ci	and	$a2,$a2,$in1infty
1685e1051a39Sopenharmony_ci	and	$a3,$a3,$in1infty
1686e1051a39Sopenharmony_ci	or	$t0,$t0,$a0
1687e1051a39Sopenharmony_ci	or	$t1,$t1,$a1
1688e1051a39Sopenharmony_ci	or	$t2,$t2,$a2
1689e1051a39Sopenharmony_ci	or	$t3,$t3,$a3
1690e1051a39Sopenharmony_ci	andc	$acc0,$acc0,$in2infty
1691e1051a39Sopenharmony_ci	andc	$acc1,$acc1,$in2infty
1692e1051a39Sopenharmony_ci	andc	$acc2,$acc2,$in2infty
1693e1051a39Sopenharmony_ci	andc	$acc3,$acc3,$in2infty
1694e1051a39Sopenharmony_ci	and	$t0,$t0,$in2infty
1695e1051a39Sopenharmony_ci	and	$t1,$t1,$in2infty
1696e1051a39Sopenharmony_ci	and	$t2,$t2,$in2infty
1697e1051a39Sopenharmony_ci	and	$t3,$t3,$in2infty
1698e1051a39Sopenharmony_ci	or	$acc0,$acc0,$t0
1699e1051a39Sopenharmony_ci	or	$acc1,$acc1,$t1
1700e1051a39Sopenharmony_ci	or	$acc2,$acc2,$t2
1701e1051a39Sopenharmony_ci	or	$acc3,$acc3,$t3
1702e1051a39Sopenharmony_ci	std	$acc0,$i+0($rp_real)
1703e1051a39Sopenharmony_ci	std	$acc1,$i+8($rp_real)
1704e1051a39Sopenharmony_ci	std	$acc2,$i+16($rp_real)
1705e1051a39Sopenharmony_ci	std	$acc3,$i+24($rp_real)
1706e1051a39Sopenharmony_ci
1707e1051a39Sopenharmony_ci	mtlr	r0
1708e1051a39Sopenharmony_ci	ld	r16,$FRAME-8*16($sp)
1709e1051a39Sopenharmony_ci	ld	r17,$FRAME-8*15($sp)
1710e1051a39Sopenharmony_ci	ld	r18,$FRAME-8*14($sp)
1711e1051a39Sopenharmony_ci	ld	r19,$FRAME-8*13($sp)
1712e1051a39Sopenharmony_ci	ld	r20,$FRAME-8*12($sp)
1713e1051a39Sopenharmony_ci	ld	r21,$FRAME-8*11($sp)
1714e1051a39Sopenharmony_ci	ld	r22,$FRAME-8*10($sp)
1715e1051a39Sopenharmony_ci	ld	r23,$FRAME-8*9($sp)
1716e1051a39Sopenharmony_ci	ld	r24,$FRAME-8*8($sp)
1717e1051a39Sopenharmony_ci	ld	r25,$FRAME-8*7($sp)
1718e1051a39Sopenharmony_ci	ld	r26,$FRAME-8*6($sp)
1719e1051a39Sopenharmony_ci	ld	r27,$FRAME-8*5($sp)
1720e1051a39Sopenharmony_ci	ld	r28,$FRAME-8*4($sp)
1721e1051a39Sopenharmony_ci	ld	r29,$FRAME-8*3($sp)
1722e1051a39Sopenharmony_ci	ld	r30,$FRAME-8*2($sp)
1723e1051a39Sopenharmony_ci	ld	r31,$FRAME-8*1($sp)
1724e1051a39Sopenharmony_ci	addi	$sp,$sp,$FRAME
1725e1051a39Sopenharmony_ci	blr
1726e1051a39Sopenharmony_ci	.long	0
1727e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,16,3,0
1728e1051a39Sopenharmony_ci	.long	0
1729e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1730e1051a39Sopenharmony_ci___
1731e1051a39Sopenharmony_ci}
1732e1051a39Sopenharmony_ciif (1) {
1733e1051a39Sopenharmony_cimy ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
1734e1051a39Sopenharmony_cimy ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
1735e1051a39Sopenharmony_ci
1736e1051a39Sopenharmony_ci$code.=<<___;
1737e1051a39Sopenharmony_ci########################################################################
1738e1051a39Sopenharmony_ci# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1739e1051a39Sopenharmony_ci#                                uint64_t b[4]);
1740e1051a39Sopenharmony_ci.globl	ecp_nistz256_ord_mul_mont
1741e1051a39Sopenharmony_ci.align	5
1742e1051a39Sopenharmony_ciecp_nistz256_ord_mul_mont:
1743e1051a39Sopenharmony_ci	stdu	$sp,-160($sp)
1744e1051a39Sopenharmony_ci	std	r18,48($sp)
1745e1051a39Sopenharmony_ci	std	r19,56($sp)
1746e1051a39Sopenharmony_ci	std	r20,64($sp)
1747e1051a39Sopenharmony_ci	std	r21,72($sp)
1748e1051a39Sopenharmony_ci	std	r22,80($sp)
1749e1051a39Sopenharmony_ci	std	r23,88($sp)
1750e1051a39Sopenharmony_ci	std	r24,96($sp)
1751e1051a39Sopenharmony_ci	std	r25,104($sp)
1752e1051a39Sopenharmony_ci	std	r26,112($sp)
1753e1051a39Sopenharmony_ci	std	r27,120($sp)
1754e1051a39Sopenharmony_ci	std	r28,128($sp)
1755e1051a39Sopenharmony_ci	std	r29,136($sp)
1756e1051a39Sopenharmony_ci	std	r30,144($sp)
1757e1051a39Sopenharmony_ci	std	r31,152($sp)
1758e1051a39Sopenharmony_ci
1759e1051a39Sopenharmony_ci	ld	$a0,0($ap)
1760e1051a39Sopenharmony_ci	ld	$bi,0($bp)
1761e1051a39Sopenharmony_ci	ld	$a1,8($ap)
1762e1051a39Sopenharmony_ci	ld	$a2,16($ap)
1763e1051a39Sopenharmony_ci	ld	$a3,24($ap)
1764e1051a39Sopenharmony_ci
1765e1051a39Sopenharmony_ci	lis	$ordk,0xccd1
1766e1051a39Sopenharmony_ci	lis	$ord0,0xf3b9
1767e1051a39Sopenharmony_ci	lis	$ord1,0xbce6
1768e1051a39Sopenharmony_ci	ori	$ordk,$ordk,0xc8aa
1769e1051a39Sopenharmony_ci	ori	$ord0,$ord0,0xcac2
1770e1051a39Sopenharmony_ci	ori	$ord1,$ord1,0xfaad
1771e1051a39Sopenharmony_ci	sldi	$ordk,$ordk,32
1772e1051a39Sopenharmony_ci	sldi	$ord0,$ord0,32
1773e1051a39Sopenharmony_ci	sldi	$ord1,$ord1,32
1774e1051a39Sopenharmony_ci	oris	$ordk,$ordk,0xee00
1775e1051a39Sopenharmony_ci	oris	$ord0,$ord0,0xfc63
1776e1051a39Sopenharmony_ci	oris	$ord1,$ord1,0xa717
1777e1051a39Sopenharmony_ci	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
1778e1051a39Sopenharmony_ci	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
1779e1051a39Sopenharmony_ci	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
1780e1051a39Sopenharmony_ci	li	$ord2,-1		# 0xffffffffffffffff
1781e1051a39Sopenharmony_ci	sldi	$ord3,$ord2,32		# 0xffffffff00000000
1782e1051a39Sopenharmony_ci	li	$zr,0
1783e1051a39Sopenharmony_ci
1784e1051a39Sopenharmony_ci	mulld	$acc0,$a0,$bi		# a[0]*b[0]
1785e1051a39Sopenharmony_ci	mulhdu	$t0,$a0,$bi
1786e1051a39Sopenharmony_ci
1787e1051a39Sopenharmony_ci	mulld	$acc1,$a1,$bi		# a[1]*b[0]
1788e1051a39Sopenharmony_ci	mulhdu	$t1,$a1,$bi
1789e1051a39Sopenharmony_ci
1790e1051a39Sopenharmony_ci	mulld	$acc2,$a2,$bi		# a[2]*b[0]
1791e1051a39Sopenharmony_ci	mulhdu	$t2,$a2,$bi
1792e1051a39Sopenharmony_ci
1793e1051a39Sopenharmony_ci	mulld	$acc3,$a3,$bi		# a[3]*b[0]
1794e1051a39Sopenharmony_ci	mulhdu	$acc4,$a3,$bi
1795e1051a39Sopenharmony_ci
1796e1051a39Sopenharmony_ci	mulld	$t4,$acc0,$ordk
1797e1051a39Sopenharmony_ci
1798e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
1799e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
1800e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
1801e1051a39Sopenharmony_ci	addze	$acc4,$acc4
1802e1051a39Sopenharmony_ci	li	$acc5,0
1803e1051a39Sopenharmony_ci___
1804e1051a39Sopenharmony_cifor ($i=1;$i<4;$i++) {
1805e1051a39Sopenharmony_ci	################################################################
1806e1051a39Sopenharmony_ci	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1807e1051a39Sopenharmony_ci	# *                                     abcdefgh
1808e1051a39Sopenharmony_ci	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1809e1051a39Sopenharmony_ci	#
1810e1051a39Sopenharmony_ci	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1811e1051a39Sopenharmony_ci	# rewrite above as:
1812e1051a39Sopenharmony_ci	#
1813e1051a39Sopenharmony_ci	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1814e1051a39Sopenharmony_ci	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1815e1051a39Sopenharmony_ci	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1816e1051a39Sopenharmony_ci$code.=<<___;
1817e1051a39Sopenharmony_ci	ld	$bi,8*$i($bp)		# b[i]
1818e1051a39Sopenharmony_ci
1819e1051a39Sopenharmony_ci	sldi	$t0,$t4,32
1820e1051a39Sopenharmony_ci	subfc	$acc2,$t4,$acc2
1821e1051a39Sopenharmony_ci	srdi	$t1,$t4,32
1822e1051a39Sopenharmony_ci	subfe	$acc3,$t0,$acc3
1823e1051a39Sopenharmony_ci	subfe	$acc4,$t1,$acc4
1824e1051a39Sopenharmony_ci	subfe	$acc5,$zr,$acc5
1825e1051a39Sopenharmony_ci
1826e1051a39Sopenharmony_ci	addic	$t0,$acc0,-1		# discarded
1827e1051a39Sopenharmony_ci	mulhdu	$t1,$ord0,$t4
1828e1051a39Sopenharmony_ci	mulld	$t2,$ord1,$t4
1829e1051a39Sopenharmony_ci	mulhdu	$t3,$ord1,$t4
1830e1051a39Sopenharmony_ci
1831e1051a39Sopenharmony_ci	adde	$t2,$t2,$t1
1832e1051a39Sopenharmony_ci	 mulld	$t0,$a0,$bi
1833e1051a39Sopenharmony_ci	addze	$t3,$t3
1834e1051a39Sopenharmony_ci	 mulld	$t1,$a1,$bi
1835e1051a39Sopenharmony_ci
1836e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t2
1837e1051a39Sopenharmony_ci	 mulld	$t2,$a2,$bi
1838e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t3
1839e1051a39Sopenharmony_ci	 mulld	$t3,$a3,$bi
1840e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t4
1841e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t4
1842e1051a39Sopenharmony_ci	addze	$acc4,$acc5
1843e1051a39Sopenharmony_ci
1844e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# accumulate low parts
1845e1051a39Sopenharmony_ci	mulhdu	$t0,$a0,$bi
1846e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1847e1051a39Sopenharmony_ci	mulhdu	$t1,$a1,$bi
1848e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t2
1849e1051a39Sopenharmony_ci	mulhdu	$t2,$a2,$bi
1850e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1851e1051a39Sopenharmony_ci	mulhdu	$t3,$a3,$bi
1852e1051a39Sopenharmony_ci	addze	$acc4,$acc4
1853e1051a39Sopenharmony_ci	mulld	$t4,$acc0,$ordk
1854e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$t0		# accumulate high parts
1855e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
1856e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
1857e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t3
1858e1051a39Sopenharmony_ci	addze	$acc5,$zr
1859e1051a39Sopenharmony_ci___
1860e1051a39Sopenharmony_ci}
1861e1051a39Sopenharmony_ci$code.=<<___;
1862e1051a39Sopenharmony_ci	sldi	$t0,$t4,32		# last reduction
1863e1051a39Sopenharmony_ci	subfc	$acc2,$t4,$acc2
1864e1051a39Sopenharmony_ci	srdi	$t1,$t4,32
1865e1051a39Sopenharmony_ci	subfe	$acc3,$t0,$acc3
1866e1051a39Sopenharmony_ci	subfe	$acc4,$t1,$acc4
1867e1051a39Sopenharmony_ci	subfe	$acc5,$zr,$acc5
1868e1051a39Sopenharmony_ci
1869e1051a39Sopenharmony_ci	addic	$t0,$acc0,-1		# discarded
1870e1051a39Sopenharmony_ci	mulhdu	$t1,$ord0,$t4
1871e1051a39Sopenharmony_ci	mulld	$t2,$ord1,$t4
1872e1051a39Sopenharmony_ci	mulhdu	$t3,$ord1,$t4
1873e1051a39Sopenharmony_ci
1874e1051a39Sopenharmony_ci	adde	$t2,$t2,$t1
1875e1051a39Sopenharmony_ci	addze	$t3,$t3
1876e1051a39Sopenharmony_ci
1877e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t2
1878e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t3
1879e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t4
1880e1051a39Sopenharmony_ci	adde	$acc3,$acc4,$t4
1881e1051a39Sopenharmony_ci	addze	$acc4,$acc5
1882e1051a39Sopenharmony_ci
1883e1051a39Sopenharmony_ci	subfc	$acc0,$ord0,$acc0	# ret -= modulus
1884e1051a39Sopenharmony_ci	subfe	$acc1,$ord1,$acc1
1885e1051a39Sopenharmony_ci	subfe	$acc2,$ord2,$acc2
1886e1051a39Sopenharmony_ci	subfe	$acc3,$ord3,$acc3
1887e1051a39Sopenharmony_ci	subfe	$acc4,$zr,$acc4
1888e1051a39Sopenharmony_ci
1889e1051a39Sopenharmony_ci	and	$t0,$ord0,$acc4
1890e1051a39Sopenharmony_ci	and	$t1,$ord1,$acc4
1891e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
1892e1051a39Sopenharmony_ci	and	$t3,$ord3,$acc4
1893e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$t1
1894e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$acc4
1895e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t3
1896e1051a39Sopenharmony_ci
1897e1051a39Sopenharmony_ci	std	$acc0,0($rp)
1898e1051a39Sopenharmony_ci	std	$acc1,8($rp)
1899e1051a39Sopenharmony_ci	std	$acc2,16($rp)
1900e1051a39Sopenharmony_ci	std	$acc3,24($rp)
1901e1051a39Sopenharmony_ci
1902e1051a39Sopenharmony_ci	ld	r18,48($sp)
1903e1051a39Sopenharmony_ci	ld	r19,56($sp)
1904e1051a39Sopenharmony_ci	ld	r20,64($sp)
1905e1051a39Sopenharmony_ci	ld	r21,72($sp)
1906e1051a39Sopenharmony_ci	ld	r22,80($sp)
1907e1051a39Sopenharmony_ci	ld	r23,88($sp)
1908e1051a39Sopenharmony_ci	ld	r24,96($sp)
1909e1051a39Sopenharmony_ci	ld	r25,104($sp)
1910e1051a39Sopenharmony_ci	ld	r26,112($sp)
1911e1051a39Sopenharmony_ci	ld	r27,120($sp)
1912e1051a39Sopenharmony_ci	ld	r28,128($sp)
1913e1051a39Sopenharmony_ci	ld	r29,136($sp)
1914e1051a39Sopenharmony_ci	ld	r30,144($sp)
1915e1051a39Sopenharmony_ci	ld	r31,152($sp)
1916e1051a39Sopenharmony_ci	addi	$sp,$sp,160
1917e1051a39Sopenharmony_ci	blr
1918e1051a39Sopenharmony_ci	.long	0
1919e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,14,3,0
1920e1051a39Sopenharmony_ci	.long	0
1921e1051a39Sopenharmony_ci.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1922e1051a39Sopenharmony_ci
1923e1051a39Sopenharmony_ci################################################################################
1924e1051a39Sopenharmony_ci# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1925e1051a39Sopenharmony_ci#                                uint64_t rep);
#
# Loads a[0..3] once, then runs the body at .Loop_ord_sqr under control
# of CTR.  Each pass squares the four-limb value into acc0..acc7, folds
# the low four limbs away in four reduction steps driven by ordk, adds
# the upper half, and conditionally subtracts the modulus ord0..ord3.
# The result is written back into a0..a3, so consecutive passes chain;
# only the value left after the last pass is stored.
#
# NOTE(review): bdnz decrements CTR before testing it, so a count of 0
# would not skip the loop.  Callers presumably always pass rep >= 1.
1926e1051a39Sopenharmony_ci.globl	ecp_nistz256_ord_sqr_mont
1927e1051a39Sopenharmony_ci.align	5
1928e1051a39Sopenharmony_ciecp_nistz256_ord_sqr_mont:
	# Open a 160-byte frame and save r18-r31 in it (restored before blr).
1929e1051a39Sopenharmony_ci	stdu	$sp,-160($sp)
1930e1051a39Sopenharmony_ci	std	r18,48($sp)
1931e1051a39Sopenharmony_ci	std	r19,56($sp)
1932e1051a39Sopenharmony_ci	std	r20,64($sp)
1933e1051a39Sopenharmony_ci	std	r21,72($sp)
1934e1051a39Sopenharmony_ci	std	r22,80($sp)
1935e1051a39Sopenharmony_ci	std	r23,88($sp)
1936e1051a39Sopenharmony_ci	std	r24,96($sp)
1937e1051a39Sopenharmony_ci	std	r25,104($sp)
1938e1051a39Sopenharmony_ci	std	r26,112($sp)
1939e1051a39Sopenharmony_ci	std	r27,120($sp)
1940e1051a39Sopenharmony_ci	std	r28,128($sp)
1941e1051a39Sopenharmony_ci	std	r29,136($sp)
1942e1051a39Sopenharmony_ci	std	r30,144($sp)
1943e1051a39Sopenharmony_ci	std	r31,152($sp)
1944e1051a39Sopenharmony_ci
	# CTR = number of passes; presumably the rep argument (confirm against
	# the register aliases chosen for this routine).
1945e1051a39Sopenharmony_ci	mtctr	$bp
1946e1051a39Sopenharmony_ci
1947e1051a39Sopenharmony_ci	ld	$a0,0($ap)
1948e1051a39Sopenharmony_ci	ld	$a1,8($ap)
1949e1051a39Sopenharmony_ci	ld	$a2,16($ap)
1950e1051a39Sopenharmony_ci	ld	$a3,24($ap)
1951e1051a39Sopenharmony_ci
	# Assemble the 64-bit constants 16 bits at a time: lis/ori build the
	# upper word, sldi moves it into place, oris/ori fill in the lower word.
	# ord0..ord3 are the limbs subtracted as "modulus" at the end of the
	# loop body, least significant first; ordk scales acc0 into the
	# multiplier used by each reduction step.
1952e1051a39Sopenharmony_ci	lis	$ordk,0xccd1
1953e1051a39Sopenharmony_ci	lis	$ord0,0xf3b9
1954e1051a39Sopenharmony_ci	lis	$ord1,0xbce6
1955e1051a39Sopenharmony_ci	ori	$ordk,$ordk,0xc8aa
1956e1051a39Sopenharmony_ci	ori	$ord0,$ord0,0xcac2
1957e1051a39Sopenharmony_ci	ori	$ord1,$ord1,0xfaad
1958e1051a39Sopenharmony_ci	sldi	$ordk,$ordk,32
1959e1051a39Sopenharmony_ci	sldi	$ord0,$ord0,32
1960e1051a39Sopenharmony_ci	sldi	$ord1,$ord1,32
1961e1051a39Sopenharmony_ci	oris	$ordk,$ordk,0xee00
1962e1051a39Sopenharmony_ci	oris	$ord0,$ord0,0xfc63
1963e1051a39Sopenharmony_ci	oris	$ord1,$ord1,0xa717
1964e1051a39Sopenharmony_ci	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
1965e1051a39Sopenharmony_ci	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
1966e1051a39Sopenharmony_ci	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
1967e1051a39Sopenharmony_ci	li	$ord2,-1		# 0xffffffffffffffff
1968e1051a39Sopenharmony_ci	sldi	$ord3,$ord2,32		# 0xffffffff00000000
1969e1051a39Sopenharmony_ci	li	$zr,0
	# The loop head sits behind an .align directive; branch straight to it
	# rather than falling through the padding.
1970e1051a39Sopenharmony_ci	b	.Loop_ord_sqr
1971e1051a39Sopenharmony_ci
1972e1051a39Sopenharmony_ci.align	5
1973e1051a39Sopenharmony_ci.Loop_ord_sqr:
1974e1051a39Sopenharmony_ci	################################################################
1975e1051a39Sopenharmony_ci	#  |  |  |  |  |  |a1*a0|  |
1976e1051a39Sopenharmony_ci	#  |  |  |  |  |a2*a0|  |  |
1977e1051a39Sopenharmony_ci	#  |  |a3*a2|a3*a0|  |  |  |
1978e1051a39Sopenharmony_ci	#  |  |  |  |a2*a1|  |  |  |
1979e1051a39Sopenharmony_ci	#  |  |  |a3*a1|  |  |  |  |
1980e1051a39Sopenharmony_ci	# *|  |  |  |  |  |  |  | 2|
1981e1051a39Sopenharmony_ci	# +|a3*a3|a2*a2|a1*a1|a0*a0|
1982e1051a39Sopenharmony_ci	#  |--+--+--+--+--+--+--+--|
1983e1051a39Sopenharmony_ci	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1984e1051a39Sopenharmony_ci	#
1985e1051a39Sopenharmony_ci	#  "can't overflow" below mark carrying into high part of
1986e1051a39Sopenharmony_ci	#  multiplication result, which can't overflow, because it
1987e1051a39Sopenharmony_ci	#  can never be all ones.
1988e1051a39Sopenharmony_ci
1989e1051a39Sopenharmony_ci	mulld	$acc1,$a1,$a0		# a[1]*a[0]
1990e1051a39Sopenharmony_ci	mulhdu	$t1,$a1,$a0
1991e1051a39Sopenharmony_ci	mulld	$acc2,$a2,$a0		# a[2]*a[0]
1992e1051a39Sopenharmony_ci	mulhdu	$t2,$a2,$a0
1993e1051a39Sopenharmony_ci	mulld	$acc3,$a3,$a0		# a[3]*a[0]
1994e1051a39Sopenharmony_ci	mulhdu	$acc4,$a3,$a0
1995e1051a39Sopenharmony_ci
1996e1051a39Sopenharmony_ci	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
1997e1051a39Sopenharmony_ci	 mulld	$t0,$a2,$a1		# a[2]*a[1]
1998e1051a39Sopenharmony_ci	 mulhdu	$t1,$a2,$a1
1999e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$t2
2000e1051a39Sopenharmony_ci	 mulld	$t2,$a3,$a1		# a[3]*a[1]
2001e1051a39Sopenharmony_ci	 mulhdu	$t3,$a3,$a1
2002e1051a39Sopenharmony_ci	addze	$acc4,$acc4		# can't overflow
2003e1051a39Sopenharmony_ci
2004e1051a39Sopenharmony_ci	mulld	$acc5,$a3,$a2		# a[3]*a[2]
2005e1051a39Sopenharmony_ci	mulhdu	$acc6,$a3,$a2
2006e1051a39Sopenharmony_ci
	# All cross products are done.  From here the squares a[i]*a[i] are
	# computed in between the carry chains: low halves go to acc0, t1, t2
	# and t3, while the high halves overwrite a0..a3, which are no longer
	# needed as inputs once their own square has been taken.
2007e1051a39Sopenharmony_ci	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
2008e1051a39Sopenharmony_ci	 mulld	$acc0,$a0,$a0		# a[0]*a[0]
2009e1051a39Sopenharmony_ci	addze	$t2,$t3			# can't overflow
2010e1051a39Sopenharmony_ci
2011e1051a39Sopenharmony_ci	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
2012e1051a39Sopenharmony_ci	 mulhdu	$a0,$a0,$a0
2013e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t1
2014e1051a39Sopenharmony_ci	 mulld	$t1,$a1,$a1		# a[1]*a[1]
2015e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$t2
2016e1051a39Sopenharmony_ci	 mulhdu	$a1,$a1,$a1
2017e1051a39Sopenharmony_ci	addze	$acc6,$acc6		# can't overflow
2018e1051a39Sopenharmony_ci
2019e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
2020e1051a39Sopenharmony_ci	 mulld	$t2,$a2,$a2		# a[2]*a[2]
2021e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$acc2
2022e1051a39Sopenharmony_ci	 mulhdu	$a2,$a2,$a2
2023e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$acc3
2024e1051a39Sopenharmony_ci	 mulld	$t3,$a3,$a3		# a[3]*a[3]
2025e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$acc4
2026e1051a39Sopenharmony_ci	 mulhdu	$a3,$a3,$a3
2027e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$acc5
2028e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$acc6
	# acc7 = carry out of the doubling
2029e1051a39Sopenharmony_ci	addze	$acc7,$zr
2030e1051a39Sopenharmony_ci
	# Add the squares.  The multiplier for the first reduction step,
	# acc0*ordk, is computed in between the additions.
2031e1051a39Sopenharmony_ci	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
2032e1051a39Sopenharmony_ci	 mulld	$t4,$acc0,$ordk
2033e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$t1
2034e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$a1
2035e1051a39Sopenharmony_ci	adde	$acc4,$acc4,$t2
2036e1051a39Sopenharmony_ci	adde	$acc5,$acc5,$a2
2037e1051a39Sopenharmony_ci	adde	$acc6,$acc6,$t3
2038e1051a39Sopenharmony_ci	adde	$acc7,$acc7,$a3
2039e1051a39Sopenharmony_ci___
# Emit four reduction steps.  Each step consumes the multiplier named by
# $t4, retires the lowest limb and moves acc1..acc3 down one position.
# All but the last step also compute the next multiplier into $t3; the
# Perl-level swap at the bottom of the loop then renames it to $t4 for the
# following step.  Four swaps leave $t3/$t4 as they were before the loop.
2040e1051a39Sopenharmony_cifor($i=0; $i<4; $i++) {			# reductions
2041e1051a39Sopenharmony_ci$code.=<<___;
	# CA = (acc0 != 0).  This stands in for the carry out of adding the
	# low half of ord0*multiplier to acc0; that sum itself is never
	# formed.  It relies on the multiplier making the low 64 bits of the
	# sum zero (presumably ordk = -1/ord0 mod 2^64 -- TODO confirm).
2042e1051a39Sopenharmony_ci	addic	$t0,$acc0,-1		# discarded
2043e1051a39Sopenharmony_ci	mulhdu	$t1,$ord0,$t4
2044e1051a39Sopenharmony_ci	mulld	$t2,$ord1,$t4
2045e1051a39Sopenharmony_ci	mulhdu	$t3,$ord1,$t4
2046e1051a39Sopenharmony_ci
2047e1051a39Sopenharmony_ci	adde	$t2,$t2,$t1
2048e1051a39Sopenharmony_ci	addze	$t3,$t3
2049e1051a39Sopenharmony_ci
	# ord2 is all ones and ord3 is 2^64-2^32, so their products with the
	# multiplier need no multiply instructions: the multiplier is added
	# into the two upper limbs here, and the negative terms are
	# subtracted further down.
2050e1051a39Sopenharmony_ci	addc	$acc0,$acc1,$t2
2051e1051a39Sopenharmony_ci	adde	$acc1,$acc2,$t3
2052e1051a39Sopenharmony_ci	adde	$acc2,$acc3,$t4
2053e1051a39Sopenharmony_ci	adde	$acc3,$zr,$t4		# can't overflow
2054e1051a39Sopenharmony_ci___
# Multiplier for the next step; the last step has no successor.
2055e1051a39Sopenharmony_ci$code.=<<___	if ($i<3);
2056e1051a39Sopenharmony_ci	mulld	$t3,$acc0,$ordk
2057e1051a39Sopenharmony_ci___
2058e1051a39Sopenharmony_ci$code.=<<___;
	# Negative terms of the ord2/ord3 products: the multiplier itself
	# comes off acc1, its value shifted left by 32 off acc2, and its
	# upper 32 bits off acc3.
2059e1051a39Sopenharmony_ci	sldi	$t0,$t4,32
2060e1051a39Sopenharmony_ci	subfc	$acc1,$t4,$acc1
2061e1051a39Sopenharmony_ci	srdi	$t1,$t4,32
2062e1051a39Sopenharmony_ci	subfe	$acc2,$t0,$acc2
2063e1051a39Sopenharmony_ci	subfe	$acc3,$t1,$acc3		# can't borrow
2064e1051a39Sopenharmony_ci___
	# Swap the Perl names so that the register just loaded with the next
	# multiplier is the one the following step refers to as $t4.
2065e1051a39Sopenharmony_ci	($t3,$t4) = ($t4,$t3);
2066e1051a39Sopenharmony_ci}
2067e1051a39Sopenharmony_ci$code.=<<___;
2068e1051a39Sopenharmony_ci	addc	$acc0,$acc0,$acc4	# accumulate upper half
2069e1051a39Sopenharmony_ci	adde	$acc1,$acc1,$acc5
2070e1051a39Sopenharmony_ci	adde	$acc2,$acc2,$acc6
2071e1051a39Sopenharmony_ci	adde	$acc3,$acc3,$acc7
2072e1051a39Sopenharmony_ci	addze	$acc4,$zr
2073e1051a39Sopenharmony_ci
	# acc4 holds the carry out of the addition above and takes part in the
	# subtraction as a fifth limb.  Afterwards it is used as an AND mask on
	# the modulus limbs (it is expected to be 0 or all ones at that point),
	# so the modulus is added back only when the subtraction went negative.
2074e1051a39Sopenharmony_ci	subfc	$acc0,$ord0,$acc0	# ret -= modulus
2075e1051a39Sopenharmony_ci	subfe	$acc1,$ord1,$acc1
2076e1051a39Sopenharmony_ci	subfe	$acc2,$ord2,$acc2
2077e1051a39Sopenharmony_ci	subfe	$acc3,$ord3,$acc3
2078e1051a39Sopenharmony_ci	subfe	$acc4,$zr,$acc4
2079e1051a39Sopenharmony_ci
2080e1051a39Sopenharmony_ci	and	$t0,$ord0,$acc4
2081e1051a39Sopenharmony_ci	and	$t1,$ord1,$acc4
2082e1051a39Sopenharmony_ci	addc	$a0,$acc0,$t0		# ret += modulus if borrow
2083e1051a39Sopenharmony_ci	and	$t3,$ord3,$acc4
2084e1051a39Sopenharmony_ci	adde	$a1,$acc1,$t1
	# ord2 is all ones, so ord2 AND mask is the mask itself
2085e1051a39Sopenharmony_ci	adde	$a2,$acc2,$acc4
2086e1051a39Sopenharmony_ci	adde	$a3,$acc3,$t3
2087e1051a39Sopenharmony_ci
	# a0..a3 now hold this pass's result and are the next pass's input
2088e1051a39Sopenharmony_ci	bdnz	.Loop_ord_sqr
2089e1051a39Sopenharmony_ci
2090e1051a39Sopenharmony_ci	std	$a0,0($rp)
2091e1051a39Sopenharmony_ci	std	$a1,8($rp)
2092e1051a39Sopenharmony_ci	std	$a2,16($rp)
2093e1051a39Sopenharmony_ci	std	$a3,24($rp)
2094e1051a39Sopenharmony_ci
	# Restore r18-r31 and release the frame.
2095e1051a39Sopenharmony_ci	ld	r18,48($sp)
2096e1051a39Sopenharmony_ci	ld	r19,56($sp)
2097e1051a39Sopenharmony_ci	ld	r20,64($sp)
2098e1051a39Sopenharmony_ci	ld	r21,72($sp)
2099e1051a39Sopenharmony_ci	ld	r22,80($sp)
2100e1051a39Sopenharmony_ci	ld	r23,88($sp)
2101e1051a39Sopenharmony_ci	ld	r24,96($sp)
2102e1051a39Sopenharmony_ci	ld	r25,104($sp)
2103e1051a39Sopenharmony_ci	ld	r26,112($sp)
2104e1051a39Sopenharmony_ci	ld	r27,120($sp)
2105e1051a39Sopenharmony_ci	ld	r28,128($sp)
2106e1051a39Sopenharmony_ci	ld	r29,136($sp)
2107e1051a39Sopenharmony_ci	ld	r30,144($sp)
2108e1051a39Sopenharmony_ci	ld	r31,152($sp)
2109e1051a39Sopenharmony_ci	addi	$sp,$sp,160
2110e1051a39Sopenharmony_ci	blr
2111e1051a39Sopenharmony_ci	.long	0
2112e1051a39Sopenharmony_ci	.byte	0,12,4,0,0x80,14,3,0
2113e1051a39Sopenharmony_ci	.long	0
2114e1051a39Sopenharmony_ci.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
2115e1051a39Sopenharmony_ci___
2116e1051a39Sopenharmony_ci}	}
2117e1051a39Sopenharmony_ci
2118e1051a39Sopenharmony_ci########################################################################
2119e1051a39Sopenharmony_ci# scatter-gather subroutines
#
# Register aliases shared by the four routines in this scope: $out=r3,
# $inp=r4, $index=r5, $mask=r6.  map() yields five names (r3..r7) but only
# four variables receive them, so r7 is dropped; $mask is not referenced
# by any of the routines in this scope.
2120e1051a39Sopenharmony_ci{
2121e1051a39Sopenharmony_cimy ($out,$inp,$index,$mask)=map("r$_",(3..7));
2122e1051a39Sopenharmony_ci$code.=<<___;
2123e1051a39Sopenharmony_ci########################################################################
2124e1051a39Sopenharmony_ci# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
2125e1051a39Sopenharmony_ci#				int index);
#
# Stores the 96 bytes at inp (three groups of four doublewords, labelled
# X, Y and Z below) as 24 32-bit words, one word per 64-byte row of out.
# Within a group, rows 0-3 take the low words of the four doublewords and
# rows 4-7 the high words.  Every word lands at byte offset 4*index-4
# within its row, which makes index look 1-based (TODO confirm against
# the callers).
2126e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w5
2127e1051a39Sopenharmony_ci.align	4
2128e1051a39Sopenharmony_ciecp_nistz256_scatter_w5:
	# out += 4*index; every store below carries a -4 displacement
2129e1051a39Sopenharmony_ci	slwi	$index,$index,2
2130e1051a39Sopenharmony_ci	add	$out,$out,$index
2131e1051a39Sopenharmony_ci
2132e1051a39Sopenharmony_ci	ld	r8, 0($inp)		# X
2133e1051a39Sopenharmony_ci	ld	r9, 8($inp)
2134e1051a39Sopenharmony_ci	ld	r10,16($inp)
2135e1051a39Sopenharmony_ci	ld	r11,24($inp)
2136e1051a39Sopenharmony_ci
	# stw writes the low 32 bits; srdi then brings the high 32 bits down
	# so that the second run of stores can write them to rows 4-7
2137e1051a39Sopenharmony_ci	stw	r8, 64*0-4($out)
2138e1051a39Sopenharmony_ci	srdi	r8, r8, 32
2139e1051a39Sopenharmony_ci	stw	r9, 64*1-4($out)
2140e1051a39Sopenharmony_ci	srdi	r9, r9, 32
2141e1051a39Sopenharmony_ci	stw	r10,64*2-4($out)
2142e1051a39Sopenharmony_ci	srdi	r10,r10,32
2143e1051a39Sopenharmony_ci	stw	r11,64*3-4($out)
2144e1051a39Sopenharmony_ci	srdi	r11,r11,32
2145e1051a39Sopenharmony_ci	stw	r8, 64*4-4($out)
2146e1051a39Sopenharmony_ci	stw	r9, 64*5-4($out)
2147e1051a39Sopenharmony_ci	stw	r10,64*6-4($out)
2148e1051a39Sopenharmony_ci	stw	r11,64*7-4($out)
	# advance to the next group of eight rows
2149e1051a39Sopenharmony_ci	addi	$out,$out,64*8
2150e1051a39Sopenharmony_ci
2151e1051a39Sopenharmony_ci	ld	r8, 32($inp)		# Y
2152e1051a39Sopenharmony_ci	ld	r9, 40($inp)
2153e1051a39Sopenharmony_ci	ld	r10,48($inp)
2154e1051a39Sopenharmony_ci	ld	r11,56($inp)
2155e1051a39Sopenharmony_ci
	# same low-words-then-high-words pattern as for X
2156e1051a39Sopenharmony_ci	stw	r8, 64*0-4($out)
2157e1051a39Sopenharmony_ci	srdi	r8, r8, 32
2158e1051a39Sopenharmony_ci	stw	r9, 64*1-4($out)
2159e1051a39Sopenharmony_ci	srdi	r9, r9, 32
2160e1051a39Sopenharmony_ci	stw	r10,64*2-4($out)
2161e1051a39Sopenharmony_ci	srdi	r10,r10,32
2162e1051a39Sopenharmony_ci	stw	r11,64*3-4($out)
2163e1051a39Sopenharmony_ci	srdi	r11,r11,32
2164e1051a39Sopenharmony_ci	stw	r8, 64*4-4($out)
2165e1051a39Sopenharmony_ci	stw	r9, 64*5-4($out)
2166e1051a39Sopenharmony_ci	stw	r10,64*6-4($out)
2167e1051a39Sopenharmony_ci	stw	r11,64*7-4($out)
2168e1051a39Sopenharmony_ci	addi	$out,$out,64*8
2169e1051a39Sopenharmony_ci
2170e1051a39Sopenharmony_ci	ld	r8, 64($inp)		# Z
2171e1051a39Sopenharmony_ci	ld	r9, 72($inp)
2172e1051a39Sopenharmony_ci	ld	r10,80($inp)
2173e1051a39Sopenharmony_ci	ld	r11,88($inp)
2174e1051a39Sopenharmony_ci
	# same pattern again; out is not advanced after the last group
2175e1051a39Sopenharmony_ci	stw	r8, 64*0-4($out)
2176e1051a39Sopenharmony_ci	srdi	r8, r8, 32
2177e1051a39Sopenharmony_ci	stw	r9, 64*1-4($out)
2178e1051a39Sopenharmony_ci	srdi	r9, r9, 32
2179e1051a39Sopenharmony_ci	stw	r10,64*2-4($out)
2180e1051a39Sopenharmony_ci	srdi	r10,r10,32
2181e1051a39Sopenharmony_ci	stw	r11,64*3-4($out)
2182e1051a39Sopenharmony_ci	srdi	r11,r11,32
2183e1051a39Sopenharmony_ci	stw	r8, 64*4-4($out)
2184e1051a39Sopenharmony_ci	stw	r9, 64*5-4($out)
2185e1051a39Sopenharmony_ci	stw	r10,64*6-4($out)
2186e1051a39Sopenharmony_ci	stw	r11,64*7-4($out)
2187e1051a39Sopenharmony_ci
2188e1051a39Sopenharmony_ci	blr
2189e1051a39Sopenharmony_ci	.long	0
2190e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
2191e1051a39Sopenharmony_ci	.long	0
2192e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2193e1051a39Sopenharmony_ci
2194e1051a39Sopenharmony_ci########################################################################
2195e1051a39Sopenharmony_ci# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
2196e1051a39Sopenharmony_ci#				int index);
#
# Reads 24 32-bit words, one from each 64-byte row of inp, and
# reassembles them into twelve doublewords stored at byte offsets 0..88
# of out.  This is the row layout that ecp_nistz256_scatter_w5 writes.
# For index > 0 the words come from byte offset 4*(index-1) of each row.
# For index == 0 the loads use offset 0 and every result is masked to
# zero before it is stored.
2197e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w5
2198e1051a39Sopenharmony_ci.align	4
2199e1051a39Sopenharmony_ciecp_nistz256_gather_w5:
	# r0 = all ones if index > 0, else 0 (sign of -index spread over 64 bits)
2200e1051a39Sopenharmony_ci	neg	r0,$index
2201e1051a39Sopenharmony_ci	sradi	r0,r0,63
2202e1051a39Sopenharmony_ci
	# adding the mask turns index into index-1 when index > 0; the result
	# is then scaled to a byte offset
2203e1051a39Sopenharmony_ci	add	$index,$index,r0
2204e1051a39Sopenharmony_ci	slwi	$index,$index,2
2205e1051a39Sopenharmony_ci	add	$inp,$inp,$index
2206e1051a39Sopenharmony_ci
	# The index register has been consumed and is reused for data below.
	# r5-r8 receive the low words (rows 0-3), r9-r12 the high words
	# (rows 4-7) of four doublewords.
2207e1051a39Sopenharmony_ci	lwz	r5, 64*0($inp)
2208e1051a39Sopenharmony_ci	lwz	r6, 64*1($inp)
2209e1051a39Sopenharmony_ci	lwz	r7, 64*2($inp)
2210e1051a39Sopenharmony_ci	lwz	r8, 64*3($inp)
2211e1051a39Sopenharmony_ci	lwz	r9, 64*4($inp)
2212e1051a39Sopenharmony_ci	lwz	r10,64*5($inp)
2213e1051a39Sopenharmony_ci	lwz	r11,64*6($inp)
2214e1051a39Sopenharmony_ci	lwz	r12,64*7($inp)
2215e1051a39Sopenharmony_ci	addi	$inp,$inp,64*8
2216e1051a39Sopenharmony_ci	sldi	r9, r9, 32
2217e1051a39Sopenharmony_ci	sldi	r10,r10,32
2218e1051a39Sopenharmony_ci	sldi	r11,r11,32
2219e1051a39Sopenharmony_ci	sldi	r12,r12,32
2220e1051a39Sopenharmony_ci	or	r5,r5,r9
2221e1051a39Sopenharmony_ci	or	r6,r6,r10
2222e1051a39Sopenharmony_ci	or	r7,r7,r11
2223e1051a39Sopenharmony_ci	or	r8,r8,r12
	# apply the index mask: unchanged for index > 0, zero otherwise
2224e1051a39Sopenharmony_ci	and	r5,r5,r0
2225e1051a39Sopenharmony_ci	and	r6,r6,r0
2226e1051a39Sopenharmony_ci	and	r7,r7,r0
2227e1051a39Sopenharmony_ci	and	r8,r8,r0
2228e1051a39Sopenharmony_ci	std	r5,0($out)		# X
2229e1051a39Sopenharmony_ci	std	r6,8($out)
2230e1051a39Sopenharmony_ci	std	r7,16($out)
2231e1051a39Sopenharmony_ci	std	r8,24($out)
2232e1051a39Sopenharmony_ci
	# second group of eight rows, same recombination
2233e1051a39Sopenharmony_ci	lwz	r5, 64*0($inp)
2234e1051a39Sopenharmony_ci	lwz	r6, 64*1($inp)
2235e1051a39Sopenharmony_ci	lwz	r7, 64*2($inp)
2236e1051a39Sopenharmony_ci	lwz	r8, 64*3($inp)
2237e1051a39Sopenharmony_ci	lwz	r9, 64*4($inp)
2238e1051a39Sopenharmony_ci	lwz	r10,64*5($inp)
2239e1051a39Sopenharmony_ci	lwz	r11,64*6($inp)
2240e1051a39Sopenharmony_ci	lwz	r12,64*7($inp)
2241e1051a39Sopenharmony_ci	addi	$inp,$inp,64*8
2242e1051a39Sopenharmony_ci	sldi	r9, r9, 32
2243e1051a39Sopenharmony_ci	sldi	r10,r10,32
2244e1051a39Sopenharmony_ci	sldi	r11,r11,32
2245e1051a39Sopenharmony_ci	sldi	r12,r12,32
2246e1051a39Sopenharmony_ci	or	r5,r5,r9
2247e1051a39Sopenharmony_ci	or	r6,r6,r10
2248e1051a39Sopenharmony_ci	or	r7,r7,r11
2249e1051a39Sopenharmony_ci	or	r8,r8,r12
2250e1051a39Sopenharmony_ci	and	r5,r5,r0
2251e1051a39Sopenharmony_ci	and	r6,r6,r0
2252e1051a39Sopenharmony_ci	and	r7,r7,r0
2253e1051a39Sopenharmony_ci	and	r8,r8,r0
2254e1051a39Sopenharmony_ci	std	r5,32($out)		# Y
2255e1051a39Sopenharmony_ci	std	r6,40($out)
2256e1051a39Sopenharmony_ci	std	r7,48($out)
2257e1051a39Sopenharmony_ci	std	r8,56($out)
2258e1051a39Sopenharmony_ci
	# third group; inp is not advanced afterwards
2259e1051a39Sopenharmony_ci	lwz	r5, 64*0($inp)
2260e1051a39Sopenharmony_ci	lwz	r6, 64*1($inp)
2261e1051a39Sopenharmony_ci	lwz	r7, 64*2($inp)
2262e1051a39Sopenharmony_ci	lwz	r8, 64*3($inp)
2263e1051a39Sopenharmony_ci	lwz	r9, 64*4($inp)
2264e1051a39Sopenharmony_ci	lwz	r10,64*5($inp)
2265e1051a39Sopenharmony_ci	lwz	r11,64*6($inp)
2266e1051a39Sopenharmony_ci	lwz	r12,64*7($inp)
2267e1051a39Sopenharmony_ci	sldi	r9, r9, 32
2268e1051a39Sopenharmony_ci	sldi	r10,r10,32
2269e1051a39Sopenharmony_ci	sldi	r11,r11,32
2270e1051a39Sopenharmony_ci	sldi	r12,r12,32
2271e1051a39Sopenharmony_ci	or	r5,r5,r9
2272e1051a39Sopenharmony_ci	or	r6,r6,r10
2273e1051a39Sopenharmony_ci	or	r7,r7,r11
2274e1051a39Sopenharmony_ci	or	r8,r8,r12
2275e1051a39Sopenharmony_ci	and	r5,r5,r0
2276e1051a39Sopenharmony_ci	and	r6,r6,r0
2277e1051a39Sopenharmony_ci	and	r7,r7,r0
2278e1051a39Sopenharmony_ci	and	r8,r8,r0
2279e1051a39Sopenharmony_ci	std	r5,64($out)		# Z
2280e1051a39Sopenharmony_ci	std	r6,72($out)
2281e1051a39Sopenharmony_ci	std	r7,80($out)
2282e1051a39Sopenharmony_ci	std	r8,88($out)
2283e1051a39Sopenharmony_ci
2284e1051a39Sopenharmony_ci	blr
2285e1051a39Sopenharmony_ci	.long	0
2286e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
2287e1051a39Sopenharmony_ci	.long	0
2288e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2289e1051a39Sopenharmony_ci
2290e1051a39Sopenharmony_ci########################################################################
2291e1051a39Sopenharmony_ci# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
2292e1051a39Sopenharmony_ci#				int index);
#
# Spreads the eight doublewords at inp over 64 rows of out, one byte per
# 64-byte row, taking the least significant byte of each doubleword
# first.  Every byte lands at byte offset index within its row.
#
# NOTE(review): index is used unadjusted here, whereas
# ecp_nistz256_gather_w7 reads offset index-1 for index > 0.  Presumably
# the callers number the two differently -- confirm.
2293e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w7
2294e1051a39Sopenharmony_ci.align	4
2295e1051a39Sopenharmony_ciecp_nistz256_scatter_w7:
	# eight doublewords to process; inp is biased by -8 because ldu
	# advances the pointer by 8 before each load
2296e1051a39Sopenharmony_ci	li	r0,8
2297e1051a39Sopenharmony_ci	mtctr	r0
2298e1051a39Sopenharmony_ci	add	$out,$out,$index
2299e1051a39Sopenharmony_ci	subi	$inp,$inp,8
2300e1051a39Sopenharmony_ci
2301e1051a39Sopenharmony_ci.Loop_scatter_w7:
	# peel the doubleword apart one byte at a time, lowest byte first,
	# writing each byte to the next row
2302e1051a39Sopenharmony_ci	ldu	r0,8($inp)
2303e1051a39Sopenharmony_ci	stb	r0,64*0($out)
2304e1051a39Sopenharmony_ci	srdi	r0,r0,8
2305e1051a39Sopenharmony_ci	stb	r0,64*1($out)
2306e1051a39Sopenharmony_ci	srdi	r0,r0,8
2307e1051a39Sopenharmony_ci	stb	r0,64*2($out)
2308e1051a39Sopenharmony_ci	srdi	r0,r0,8
2309e1051a39Sopenharmony_ci	stb	r0,64*3($out)
2310e1051a39Sopenharmony_ci	srdi	r0,r0,8
2311e1051a39Sopenharmony_ci	stb	r0,64*4($out)
2312e1051a39Sopenharmony_ci	srdi	r0,r0,8
2313e1051a39Sopenharmony_ci	stb	r0,64*5($out)
2314e1051a39Sopenharmony_ci	srdi	r0,r0,8
2315e1051a39Sopenharmony_ci	stb	r0,64*6($out)
2316e1051a39Sopenharmony_ci	srdi	r0,r0,8
2317e1051a39Sopenharmony_ci	stb	r0,64*7($out)
	# next doubleword goes to the next eight rows
2318e1051a39Sopenharmony_ci	addi	$out,$out,64*8
2319e1051a39Sopenharmony_ci	bdnz	.Loop_scatter_w7
2320e1051a39Sopenharmony_ci
2321e1051a39Sopenharmony_ci	blr
2322e1051a39Sopenharmony_ci	.long	0
2323e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
2324e1051a39Sopenharmony_ci	.long	0
2325e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2326e1051a39Sopenharmony_ci
2327e1051a39Sopenharmony_ci########################################################################
2328e1051a39Sopenharmony_ci# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
2329e1051a39Sopenharmony_ci#				int index);
#
# Rebuilds eight doublewords at out from 64 bytes, one taken from each
# 64-byte row of inp (the row layout that ecp_nistz256_scatter_w7
# writes).  For index > 0 the bytes come from offset index-1 of each row.
# For index == 0 offset 0 is read and every doubleword is masked to zero
# before it is stored.
2330e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w7
2331e1051a39Sopenharmony_ci.align	4
2332e1051a39Sopenharmony_ciecp_nistz256_gather_w7:
	# eight doublewords to produce; r0 is free again once CTR is loaded
2333e1051a39Sopenharmony_ci	li	r0,8
2334e1051a39Sopenharmony_ci	mtctr	r0
	# r0 = all ones if index > 0, else 0 (sign of -index spread over 64 bits)
2335e1051a39Sopenharmony_ci	neg	r0,$index
2336e1051a39Sopenharmony_ci	sradi	r0,r0,63
2337e1051a39Sopenharmony_ci
	# adding the mask turns index into index-1 when index > 0; out is
	# biased by -8 because stdu advances the pointer by 8 before each store
2338e1051a39Sopenharmony_ci	add	$index,$index,r0
2339e1051a39Sopenharmony_ci	add	$inp,$inp,$index
2340e1051a39Sopenharmony_ci	subi	$out,$out,8
2341e1051a39Sopenharmony_ci
2342e1051a39Sopenharmony_ci.Loop_gather_w7:
	# one byte from each of eight consecutive rows; the index register
	# has been consumed and is reused for data
2343e1051a39Sopenharmony_ci	lbz	r5, 64*0($inp)
2344e1051a39Sopenharmony_ci	lbz	r6, 64*1($inp)
2345e1051a39Sopenharmony_ci	lbz	r7, 64*2($inp)
2346e1051a39Sopenharmony_ci	lbz	r8, 64*3($inp)
2347e1051a39Sopenharmony_ci	lbz	r9, 64*4($inp)
2348e1051a39Sopenharmony_ci	lbz	r10,64*5($inp)
2349e1051a39Sopenharmony_ci	lbz	r11,64*6($inp)
2350e1051a39Sopenharmony_ci	lbz	r12,64*7($inp)
2351e1051a39Sopenharmony_ci	addi	$inp,$inp,64*8
2352e1051a39Sopenharmony_ci
	# move the byte from row k to bit position 8*k (row 0 stays in place)
2353e1051a39Sopenharmony_ci	sldi	r6, r6, 8
2354e1051a39Sopenharmony_ci	sldi	r7, r7, 16
2355e1051a39Sopenharmony_ci	sldi	r8, r8, 24
2356e1051a39Sopenharmony_ci	sldi	r9, r9, 32
2357e1051a39Sopenharmony_ci	sldi	r10,r10,40
2358e1051a39Sopenharmony_ci	sldi	r11,r11,48
2359e1051a39Sopenharmony_ci	sldi	r12,r12,56
2360e1051a39Sopenharmony_ci
	# merge pairwise into r5, then apply the index mask
2361e1051a39Sopenharmony_ci	or	r5,r5,r6
2362e1051a39Sopenharmony_ci	or	r7,r7,r8
2363e1051a39Sopenharmony_ci	or	r9,r9,r10
2364e1051a39Sopenharmony_ci	or	r11,r11,r12
2365e1051a39Sopenharmony_ci	or	r5,r5,r7
2366e1051a39Sopenharmony_ci	or	r9,r9,r11
2367e1051a39Sopenharmony_ci	or	r5,r5,r9
2368e1051a39Sopenharmony_ci	and	r5,r5,r0
2369e1051a39Sopenharmony_ci	stdu	r5,8($out)
2370e1051a39Sopenharmony_ci	bdnz	.Loop_gather_w7
2371e1051a39Sopenharmony_ci
2372e1051a39Sopenharmony_ci	blr
2373e1051a39Sopenharmony_ci	.long	0
2374e1051a39Sopenharmony_ci	.byte	0,12,0x14,0,0,0,3,0
2375e1051a39Sopenharmony_ci	.long	0
2376e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
2377e1051a39Sopenharmony_ci___
2378e1051a39Sopenharmony_ci}
2379e1051a39Sopenharmony_ci
# Post-process the accumulated assembly text one line at a time: every
# backtick-quoted span is replaced by the result of evaluating its
# contents as a Perl expression, and the line is then printed followed by
# a newline.
2380e1051a39Sopenharmony_ciforeach (split("\n",$code)) {
2381e1051a39Sopenharmony_ci	s/\`([^\`]*)\`/eval $1/ge;
2382e1051a39Sopenharmony_ci
2383e1051a39Sopenharmony_ci	print $_,"\n";
2384e1051a39Sopenharmony_ci}
# Close explicitly and die on failure, so that an error while flushing
# the generated output is reported instead of going unnoticed.
2385e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";	# enforce flush
2386