1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# ECP_NISTZ256 module for ARMv8.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# February 2015.
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22e1051a39Sopenharmony_ci# http://eprint.iacr.org/2013/816.
23e1051a39Sopenharmony_ci#
24e1051a39Sopenharmony_ci#			with/without -DECP_NISTZ256_ASM
25e1051a39Sopenharmony_ci# Apple A7		+190-360%
26e1051a39Sopenharmony_ci# Cortex-A53		+190-400%
27e1051a39Sopenharmony_ci# Cortex-A57		+190-350%
28e1051a39Sopenharmony_ci# Denver		+230-400%
29e1051a39Sopenharmony_ci#
30e1051a39Sopenharmony_ci# Ranges denote minimum and maximum improvement coefficients depending
31e1051a39Sopenharmony_ci# on benchmark. Lower coefficients are for ECDSA sign, server-side
32e1051a39Sopenharmony_ci# operation. Keep in mind that +400% means 5x improvement.
33e1051a39Sopenharmony_ci
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Directory part of this script's path, including the trailing separator.
# If $0 has no directory part the match fails, so use an empty prefix
# explicitly rather than picking up whatever $1 happened to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# arm-xlate.pl lives either next to this script or in the shared perlasm
# directory two levels up.
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything printed to STDOUT through the translator.  The command is
# run by the shell, so the translator path is quoted just like the
# interpreter and output paths; otherwise a source tree located under a
# directory whose name contains whitespace breaks the build.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
47e1051a39Sopenharmony_ci
48e1051a39Sopenharmony_ci{
# Symbolic register names.  The list maps, in order, onto x0-x17, x19 and
# x20: $rp=x0, $ap=x1, $bp=x2, $bi=x3, $a0-$a3=x4-x7, $t0-$t3=x8-x11,
# $poly1=x12, $poly3=x13, $acc0-$acc3=x14-x17, $acc4=x19, $acc5=x20.
# x18 is deliberately left out (presumably because it is the AAPCS64
# platform register -- TODO confirm).
49e1051a39Sopenharmony_cimy ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
50e1051a39Sopenharmony_ci    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
51e1051a39Sopenharmony_ci    map("x$_",(0..17,19,20));
52e1051a39Sopenharmony_ci
# $acc6/$acc7 reuse the registers of $ap/$bp (x1/x2), so code that writes
# them destroys the two input pointers.
53e1051a39Sopenharmony_cimy ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont
54e1051a39Sopenharmony_ci
# Start the generated file: include arm_arch.h and switch to the read-only
# data section for the table and constants emitted next.
55e1051a39Sopenharmony_ci$code.=<<___;
56e1051a39Sopenharmony_ci#include "arm_arch.h"
57e1051a39Sopenharmony_ci
58e1051a39Sopenharmony_ci.rodata
59e1051a39Sopenharmony_ci___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked up first in the current directory and then one
# level above the directory that holds this script.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $table_fh;
open($table_fh, "<", "ecp_nistz256_table.c")		or
open($table_fh, "<", "${dir}../ecp_nistz256_table.c")	or
die "failed to open ecp_nistz256_table.c:",$!;

# Integer arithmetic from here to the end of the enclosing block; the byte
# index division below relies on it.
use integer;

# Collect every TOBN(x,y) entry as two 32-bit values, second argument first
# (presumably the low half of a 64-bit limb -- see ecp_nistz256_table.c).
# The file is read line by line instead of being slurped into a list.
while (my $text = <$table_fh>) {
	while ($text =~ m/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr, hex($2), hex($1);
	}
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
die "insane number of elements" if (scalar(@arr) != 64*16*37);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# This conversion scatters each P256_POINT_AFFINE byte by byte with a
# 64-byte interval: output row k holds byte k of each of the 64 entries
# of a sub-table, similar to
#	1111222233334444
#	1234123412341234
for (1..37) {
	# One sub-table: 64 entries of 16 32-bit words each.
	my @words = splice(@arr,0,64*16);
	for my $byte (0..63) {
		# Byte $byte of an entry sits in word $byte/4 at bit offset
		# ($byte%4)*8.
		my ($word, $shift) = ($byte/4, ($byte%4)*8);
		my @row = map { ($words[$_*16+$word] >> $shift) & 0xff } (0..63);
		$code .= ".byte\t" . join(',', map { sprintf "0x%02x", $_ } @row) . "\n";
	}
}
# Close the precomputed table, then emit the field constants and the public
# entry points.  The constants are .Lpoly (of which the code in this heredoc
# only ever loads words 1 and 3, at offsets #8 and #24, into $poly1/$poly3),
# .LRR, .Lone_mont, .Lone, .Lord and .LordK.  Every public wrapper emits
# paciasp/autiasp as raw .inst words, saves the frame (plus x19/x20 where the
# mul/sqr helpers are called, since $acc4/$acc5 live there), sets up the
# operand registers and $poly1/$poly3, and branches to an internal
# __ecp_nistz256_* helper.  The heredoc stops inside __ecp_nistz256_mul_mont
# after the a[0-3]*b[0] row; the remaining rows are appended by the loop that
# follows.
103e1051a39Sopenharmony_ci$code.=<<___;
104e1051a39Sopenharmony_ci.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
105e1051a39Sopenharmony_ci.align	5
106e1051a39Sopenharmony_ci.Lpoly:
107e1051a39Sopenharmony_ci.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
108e1051a39Sopenharmony_ci.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
109e1051a39Sopenharmony_ci.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
110e1051a39Sopenharmony_ci.Lone_mont:
111e1051a39Sopenharmony_ci.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
112e1051a39Sopenharmony_ci.Lone:
113e1051a39Sopenharmony_ci.quad	1,0,0,0
114e1051a39Sopenharmony_ci.Lord:
115e1051a39Sopenharmony_ci.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
116e1051a39Sopenharmony_ci.LordK:
117e1051a39Sopenharmony_ci.quad	0xccd1c8aaee00bc4f
118e1051a39Sopenharmony_ci.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
119e1051a39Sopenharmony_ci
120e1051a39Sopenharmony_ci.text
121e1051a39Sopenharmony_ci
122e1051a39Sopenharmony_ci// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
123e1051a39Sopenharmony_ci.globl	ecp_nistz256_to_mont
124e1051a39Sopenharmony_ci.type	ecp_nistz256_to_mont,%function
125e1051a39Sopenharmony_ci.align	6
126e1051a39Sopenharmony_ciecp_nistz256_to_mont:
127e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
128e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-32]!
129e1051a39Sopenharmony_ci	add	x29,sp,#0
130e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
131e1051a39Sopenharmony_ci
132e1051a39Sopenharmony_ci	adrp	$bi,.LRR
133e1051a39Sopenharmony_ci	ldr	$bi,[$bi,:lo12:.LRR]	// bp[0]
134e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap]
135e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#16]
136e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
137e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
138e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
139e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
140e1051a39Sopenharmony_ci	adrp	$bp,.LRR		// &bp[0]
141e1051a39Sopenharmony_ci	add	$bp,$bp,:lo12:.LRR
142e1051a39Sopenharmony_ci
143e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont
144e1051a39Sopenharmony_ci
145e1051a39Sopenharmony_ci	ldp	x19,x20,[sp,#16]
146e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#32
147e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
148e1051a39Sopenharmony_ci	ret
149e1051a39Sopenharmony_ci.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
150e1051a39Sopenharmony_ci
151e1051a39Sopenharmony_ci// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
152e1051a39Sopenharmony_ci.globl	ecp_nistz256_from_mont
153e1051a39Sopenharmony_ci.type	ecp_nistz256_from_mont,%function
154e1051a39Sopenharmony_ci.align	4
155e1051a39Sopenharmony_ciecp_nistz256_from_mont:
156e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
157e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-32]!
158e1051a39Sopenharmony_ci	add	x29,sp,#0
159e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
160e1051a39Sopenharmony_ci
161e1051a39Sopenharmony_ci	mov	$bi,#1			// bp[0]
162e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap]
163e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#16]
164e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
165e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
166e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
167e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
168e1051a39Sopenharmony_ci	adrp	$bp,.Lone		// &bp[0]
169e1051a39Sopenharmony_ci	add	$bp,$bp,:lo12:.Lone
170e1051a39Sopenharmony_ci
171e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont
172e1051a39Sopenharmony_ci
173e1051a39Sopenharmony_ci	ldp	x19,x20,[sp,#16]
174e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#32
175e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
176e1051a39Sopenharmony_ci	ret
177e1051a39Sopenharmony_ci.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
178e1051a39Sopenharmony_ci
179e1051a39Sopenharmony_ci// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
180e1051a39Sopenharmony_ci//					     const BN_ULONG x2[4]);
181e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_mont
182e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_mont,%function
183e1051a39Sopenharmony_ci.align	4
184e1051a39Sopenharmony_ciecp_nistz256_mul_mont:
185e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
186e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-32]!
187e1051a39Sopenharmony_ci	add	x29,sp,#0
188e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
189e1051a39Sopenharmony_ci
190e1051a39Sopenharmony_ci	ldr	$bi,[$bp]		// bp[0]
191e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap]
192e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#16]
193e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
194e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
195e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
196e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
197e1051a39Sopenharmony_ci
198e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont
199e1051a39Sopenharmony_ci
200e1051a39Sopenharmony_ci	ldp	x19,x20,[sp,#16]
201e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#32
202e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
203e1051a39Sopenharmony_ci	ret
204e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
205e1051a39Sopenharmony_ci
206e1051a39Sopenharmony_ci// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
207e1051a39Sopenharmony_ci.globl	ecp_nistz256_sqr_mont
208e1051a39Sopenharmony_ci.type	ecp_nistz256_sqr_mont,%function
209e1051a39Sopenharmony_ci.align	4
210e1051a39Sopenharmony_ciecp_nistz256_sqr_mont:
211e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
212e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-32]!
213e1051a39Sopenharmony_ci	add	x29,sp,#0
214e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
215e1051a39Sopenharmony_ci
216e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap]
217e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#16]
218e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
219e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
220e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
221e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
222e1051a39Sopenharmony_ci
223e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont
224e1051a39Sopenharmony_ci
225e1051a39Sopenharmony_ci	ldp	x19,x20,[sp,#16]
226e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#32
227e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
228e1051a39Sopenharmony_ci	ret
229e1051a39Sopenharmony_ci.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
230e1051a39Sopenharmony_ci
231e1051a39Sopenharmony_ci// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
232e1051a39Sopenharmony_ci//					const BN_ULONG x2[4]);
233e1051a39Sopenharmony_ci.globl	ecp_nistz256_add
234e1051a39Sopenharmony_ci.type	ecp_nistz256_add,%function
235e1051a39Sopenharmony_ci.align	4
236e1051a39Sopenharmony_ciecp_nistz256_add:
237e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
238e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
239e1051a39Sopenharmony_ci	add	x29,sp,#0
240e1051a39Sopenharmony_ci
241e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap]
242e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp]
243e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap,#16]
244e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp,#16]
245e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
246e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
247e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
248e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
249e1051a39Sopenharmony_ci
250e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add
251e1051a39Sopenharmony_ci
252e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#16
253e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
254e1051a39Sopenharmony_ci	ret
255e1051a39Sopenharmony_ci.size	ecp_nistz256_add,.-ecp_nistz256_add
256e1051a39Sopenharmony_ci
257e1051a39Sopenharmony_ci// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
258e1051a39Sopenharmony_ci.globl	ecp_nistz256_div_by_2
259e1051a39Sopenharmony_ci.type	ecp_nistz256_div_by_2,%function
260e1051a39Sopenharmony_ci.align	4
261e1051a39Sopenharmony_ciecp_nistz256_div_by_2:
262e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
263e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
264e1051a39Sopenharmony_ci	add	x29,sp,#0
265e1051a39Sopenharmony_ci
266e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap]
267e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap,#16]
268e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
269e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
270e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
271e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
272e1051a39Sopenharmony_ci
273e1051a39Sopenharmony_ci	bl	__ecp_nistz256_div_by_2
274e1051a39Sopenharmony_ci
275e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#16
276e1051a39Sopenharmony_ci	.inst	0xd50323bf		//  autiasp
277e1051a39Sopenharmony_ci	ret
278e1051a39Sopenharmony_ci.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
279e1051a39Sopenharmony_ci
280e1051a39Sopenharmony_ci// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
281e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_2
282e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_by_2,%function
283e1051a39Sopenharmony_ci.align	4
284e1051a39Sopenharmony_ciecp_nistz256_mul_by_2:
285e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
286e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
287e1051a39Sopenharmony_ci	add	x29,sp,#0
288e1051a39Sopenharmony_ci
289e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap]
290e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap,#16]
291e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
292e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
293e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
294e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
295e1051a39Sopenharmony_ci	mov	$t0,$acc0
296e1051a39Sopenharmony_ci	mov	$t1,$acc1
297e1051a39Sopenharmony_ci	mov	$t2,$acc2
298e1051a39Sopenharmony_ci	mov	$t3,$acc3
299e1051a39Sopenharmony_ci
300e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// ret = a+a	// 2*a
301e1051a39Sopenharmony_ci
302e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#16
303e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
304e1051a39Sopenharmony_ci	ret
305e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
306e1051a39Sopenharmony_ci
307e1051a39Sopenharmony_ci// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
308e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_3
309e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_by_3,%function
310e1051a39Sopenharmony_ci.align	4
311e1051a39Sopenharmony_ciecp_nistz256_mul_by_3:
312e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
313e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
314e1051a39Sopenharmony_ci	add	x29,sp,#0
315e1051a39Sopenharmony_ci
316e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap]
317e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap,#16]
318e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
319e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
320e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
321e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
322e1051a39Sopenharmony_ci	mov	$t0,$acc0
323e1051a39Sopenharmony_ci	mov	$t1,$acc1
324e1051a39Sopenharmony_ci	mov	$t2,$acc2
325e1051a39Sopenharmony_ci	mov	$t3,$acc3
326e1051a39Sopenharmony_ci	mov	$a0,$acc0
327e1051a39Sopenharmony_ci	mov	$a1,$acc1
328e1051a39Sopenharmony_ci	mov	$a2,$acc2
329e1051a39Sopenharmony_ci	mov	$a3,$acc3
330e1051a39Sopenharmony_ci
331e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// ret = a+a	// 2*a
332e1051a39Sopenharmony_ci
333e1051a39Sopenharmony_ci	mov	$t0,$a0
334e1051a39Sopenharmony_ci	mov	$t1,$a1
335e1051a39Sopenharmony_ci	mov	$t2,$a2
336e1051a39Sopenharmony_ci	mov	$t3,$a3
337e1051a39Sopenharmony_ci
338e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a
339e1051a39Sopenharmony_ci
340e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#16
341e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
342e1051a39Sopenharmony_ci	ret
343e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
344e1051a39Sopenharmony_ci
345e1051a39Sopenharmony_ci// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
346e1051a39Sopenharmony_ci//				        const BN_ULONG x2[4]);
347e1051a39Sopenharmony_ci.globl	ecp_nistz256_sub
348e1051a39Sopenharmony_ci.type	ecp_nistz256_sub,%function
349e1051a39Sopenharmony_ci.align	4
350e1051a39Sopenharmony_ciecp_nistz256_sub:
351e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
352e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
353e1051a39Sopenharmony_ci	add	x29,sp,#0
354e1051a39Sopenharmony_ci
355e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap]
356e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap,#16]
357e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
358e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
359e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
360e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
361e1051a39Sopenharmony_ci
362e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from
363e1051a39Sopenharmony_ci
364e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#16
365e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
366e1051a39Sopenharmony_ci	ret
367e1051a39Sopenharmony_ci.size	ecp_nistz256_sub,.-ecp_nistz256_sub
368e1051a39Sopenharmony_ci
369e1051a39Sopenharmony_ci// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
370e1051a39Sopenharmony_ci.globl	ecp_nistz256_neg
371e1051a39Sopenharmony_ci.type	ecp_nistz256_neg,%function
372e1051a39Sopenharmony_ci.align	4
373e1051a39Sopenharmony_ciecp_nistz256_neg:
374e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
375e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
376e1051a39Sopenharmony_ci	add	x29,sp,#0
377e1051a39Sopenharmony_ci
378e1051a39Sopenharmony_ci	mov	$bp,$ap
379e1051a39Sopenharmony_ci	mov	$acc0,xzr		// a = 0
380e1051a39Sopenharmony_ci	mov	$acc1,xzr
381e1051a39Sopenharmony_ci	mov	$acc2,xzr
382e1051a39Sopenharmony_ci	mov	$acc3,xzr
383e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
384e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
385e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
386e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
387e1051a39Sopenharmony_ci
388e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from
389e1051a39Sopenharmony_ci
390e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#16
391e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
392e1051a39Sopenharmony_ci	ret
393e1051a39Sopenharmony_ci.size	ecp_nistz256_neg,.-ecp_nistz256_neg
394e1051a39Sopenharmony_ci
395e1051a39Sopenharmony_ci// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
396e1051a39Sopenharmony_ci// to $a0-$a3 and b[0] - to $bi
397e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_mont,%function
398e1051a39Sopenharmony_ci.align	4
399e1051a39Sopenharmony_ci__ecp_nistz256_mul_mont:
400e1051a39Sopenharmony_ci	mul	$acc0,$a0,$bi		// a[0]*b[0]
401e1051a39Sopenharmony_ci	umulh	$t0,$a0,$bi
402e1051a39Sopenharmony_ci
403e1051a39Sopenharmony_ci	mul	$acc1,$a1,$bi		// a[1]*b[0]
404e1051a39Sopenharmony_ci	umulh	$t1,$a1,$bi
405e1051a39Sopenharmony_ci
406e1051a39Sopenharmony_ci	mul	$acc2,$a2,$bi		// a[2]*b[0]
407e1051a39Sopenharmony_ci	umulh	$t2,$a2,$bi
408e1051a39Sopenharmony_ci
409e1051a39Sopenharmony_ci	mul	$acc3,$a3,$bi		// a[3]*b[0]
410e1051a39Sopenharmony_ci	umulh	$t3,$a3,$bi
411e1051a39Sopenharmony_ci	ldr	$bi,[$bp,#8]		// b[1]
412e1051a39Sopenharmony_ci
413e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
414e1051a39Sopenharmony_ci	 lsl	$t0,$acc0,#32
415e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t1
416e1051a39Sopenharmony_ci	 lsr	$t1,$acc0,#32
417e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t2
418e1051a39Sopenharmony_ci	adc	$acc4,xzr,$t3
419e1051a39Sopenharmony_ci	mov	$acc5,xzr
420e1051a39Sopenharmony_ci___
# Rows b[1]..b[3] of the Montgomery multiplication.  Each pass emits, in
# order: one reduction step on the current low word $acc0 (using the
# $acc0<<32 and $acc0>>32 values left in $t0/$t1 by the preceding code),
# the low and then the high halves of a[0-3]*b[i], and the two shifts that
# prepare the next reduction step.  The next word of b is pre-loaded only
# while one remains ($i<3).
421e1051a39Sopenharmony_cifor($i=1;$i<4;$i++) {
422e1051a39Sopenharmony_ci        # Reduction iteration is normally performed by accumulating
423e1051a39Sopenharmony_ci        # result of multiplication of modulus by "magic" digit [and
424e1051a39Sopenharmony_ci        # omitting least significant word, which is guaranteed to
425e1051a39Sopenharmony_ci        # be 0], but thanks to special form of modulus and "magic"
426e1051a39Sopenharmony_ci        # digit being equal to least significant word, it can be
427e1051a39Sopenharmony_ci        # performed with additions and subtractions alone. Indeed:
428e1051a39Sopenharmony_ci        #
429e1051a39Sopenharmony_ci        #            ffff0001.00000000.0000ffff.ffffffff
430e1051a39Sopenharmony_ci        # *                                     abcdefgh
431e1051a39Sopenharmony_ci        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
432e1051a39Sopenharmony_ci        #
433e1051a39Sopenharmony_ci        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
434e1051a39Sopenharmony_ci        # rewrite above as:
435e1051a39Sopenharmony_ci        #
436e1051a39Sopenharmony_ci        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
437e1051a39Sopenharmony_ci        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
438e1051a39Sopenharmony_ci        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
439e1051a39Sopenharmony_ci        #
440e1051a39Sopenharmony_ci        # or marking redundant operations:
441e1051a39Sopenharmony_ci        #
442e1051a39Sopenharmony_ci        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
443e1051a39Sopenharmony_ci        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
444e1051a39Sopenharmony_ci        # - 0000abcd.efgh0000.--------.--------.--------
445e1051a39Sopenharmony_ci
446e1051a39Sopenharmony_ci$code.=<<___;
447e1051a39Sopenharmony_ci	subs	$t2,$acc0,$t0		// "*0xffff0001"
448e1051a39Sopenharmony_ci	sbc	$t3,$acc0,$t1
449e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
450e1051a39Sopenharmony_ci	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
451e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t1
452e1051a39Sopenharmony_ci	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
453e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
454e1051a39Sopenharmony_ci	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
455e1051a39Sopenharmony_ci	adcs	$acc3,$acc4,$t3
456e1051a39Sopenharmony_ci	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
457e1051a39Sopenharmony_ci	adc	$acc4,$acc5,xzr
458e1051a39Sopenharmony_ci
459e1051a39Sopenharmony_ci	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
460e1051a39Sopenharmony_ci	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
461e1051a39Sopenharmony_ci	adcs	$acc1,$acc1,$t1
462e1051a39Sopenharmony_ci	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
463e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t2
464e1051a39Sopenharmony_ci	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
465e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t3
466e1051a39Sopenharmony_ci	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
467e1051a39Sopenharmony_ci	adc	$acc4,$acc4,xzr
468e1051a39Sopenharmony_ci___
469e1051a39Sopenharmony_ci$code.=<<___	if ($i<3);
470e1051a39Sopenharmony_ci	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
471e1051a39Sopenharmony_ci___
472e1051a39Sopenharmony_ci$code.=<<___;
473e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
474e1051a39Sopenharmony_ci	 lsl	$t0,$acc0,#32
475e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t1
476e1051a39Sopenharmony_ci	 lsr	$t1,$acc0,#32
477e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t2
478e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$t3
479e1051a39Sopenharmony_ci	adc	$acc5,xzr,xzr
480e1051a39Sopenharmony_ci___
481e1051a39Sopenharmony_ci}
# Finish __ecp_nistz256_mul_mont: the fourth reduction step, then compute
# ret-modulus into $t0-$t3, keep the unsubtracted value if that borrowed,
# and store the four result words to [$rp].  Then begin
# __ecp_nistz256_sqr_mont: build the eight-word square in $acc0-$acc7 and
# leave $acc0<<32 / $acc0>>32 in $t0/$t1 for the reduction steps emitted by
# the loop that follows.  $acc6/$acc7 share registers with $ap/$bp, so both
# pointer registers are overwritten here.
482e1051a39Sopenharmony_ci$code.=<<___;
483e1051a39Sopenharmony_ci	// last reduction
484e1051a39Sopenharmony_ci	subs	$t2,$acc0,$t0		// "*0xffff0001"
485e1051a39Sopenharmony_ci	sbc	$t3,$acc0,$t1
486e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
487e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t1
488e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
489e1051a39Sopenharmony_ci	adcs	$acc3,$acc4,$t3
490e1051a39Sopenharmony_ci	adc	$acc4,$acc5,xzr
491e1051a39Sopenharmony_ci
492e1051a39Sopenharmony_ci	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
493e1051a39Sopenharmony_ci	sbcs	$t1,$acc1,$poly1
494e1051a39Sopenharmony_ci	sbcs	$t2,$acc2,xzr
495e1051a39Sopenharmony_ci	sbcs	$t3,$acc3,$poly3
496e1051a39Sopenharmony_ci	sbcs	xzr,$acc4,xzr		// did it borrow?
497e1051a39Sopenharmony_ci
498e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
499e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,lo
500e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,lo
501e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
502e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,lo
503e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
504e1051a39Sopenharmony_ci
505e1051a39Sopenharmony_ci	ret
506e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
507e1051a39Sopenharmony_ci
508e1051a39Sopenharmony_ci// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
509e1051a39Sopenharmony_ci// to $a0-$a3
510e1051a39Sopenharmony_ci.type	__ecp_nistz256_sqr_mont,%function
511e1051a39Sopenharmony_ci.align	4
512e1051a39Sopenharmony_ci__ecp_nistz256_sqr_mont:
513e1051a39Sopenharmony_ci	//  |  |  |  |  |  |a1*a0|  |
514e1051a39Sopenharmony_ci	//  |  |  |  |  |a2*a0|  |  |
515e1051a39Sopenharmony_ci	//  |  |a3*a2|a3*a0|  |  |  |
516e1051a39Sopenharmony_ci	//  |  |  |  |a2*a1|  |  |  |
517e1051a39Sopenharmony_ci	//  |  |  |a3*a1|  |  |  |  |
518e1051a39Sopenharmony_ci	// *|  |  |  |  |  |  |  | 2|
519e1051a39Sopenharmony_ci	// +|a3*a3|a2*a2|a1*a1|a0*a0|
520e1051a39Sopenharmony_ci	//  |--+--+--+--+--+--+--+--|
521e1051a39Sopenharmony_ci	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
522e1051a39Sopenharmony_ci	//
523e1051a39Sopenharmony_ci	//  "can't overflow" below mark carrying into high part of
524e1051a39Sopenharmony_ci	//  multiplication result, which can't overflow, because it
525e1051a39Sopenharmony_ci	//  can never be all ones.
526e1051a39Sopenharmony_ci
527e1051a39Sopenharmony_ci	mul	$acc1,$a1,$a0		// a[1]*a[0]
528e1051a39Sopenharmony_ci	umulh	$t1,$a1,$a0
529e1051a39Sopenharmony_ci	mul	$acc2,$a2,$a0		// a[2]*a[0]
530e1051a39Sopenharmony_ci	umulh	$t2,$a2,$a0
531e1051a39Sopenharmony_ci	mul	$acc3,$a3,$a0		// a[3]*a[0]
532e1051a39Sopenharmony_ci	umulh	$acc4,$a3,$a0
533e1051a39Sopenharmony_ci
534e1051a39Sopenharmony_ci	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
535e1051a39Sopenharmony_ci	 mul	$t0,$a2,$a1		// a[2]*a[1]
536e1051a39Sopenharmony_ci	 umulh	$t1,$a2,$a1
537e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t2
538e1051a39Sopenharmony_ci	 mul	$t2,$a3,$a1		// a[3]*a[1]
539e1051a39Sopenharmony_ci	 umulh	$t3,$a3,$a1
540e1051a39Sopenharmony_ci	adc	$acc4,$acc4,xzr		// can't overflow
541e1051a39Sopenharmony_ci
542e1051a39Sopenharmony_ci	mul	$acc5,$a3,$a2		// a[3]*a[2]
543e1051a39Sopenharmony_ci	umulh	$acc6,$a3,$a2
544e1051a39Sopenharmony_ci
545e1051a39Sopenharmony_ci	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
546e1051a39Sopenharmony_ci	 mul	$acc0,$a0,$a0		// a[0]*a[0]
547e1051a39Sopenharmony_ci	adc	$t2,$t3,xzr		// can't overflow
548e1051a39Sopenharmony_ci
549e1051a39Sopenharmony_ci	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
550e1051a39Sopenharmony_ci	 umulh	$a0,$a0,$a0
551e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$t1
552e1051a39Sopenharmony_ci	 mul	$t1,$a1,$a1		// a[1]*a[1]
553e1051a39Sopenharmony_ci	adcs	$acc5,$acc5,$t2
554e1051a39Sopenharmony_ci	 umulh	$a1,$a1,$a1
555e1051a39Sopenharmony_ci	adc	$acc6,$acc6,xzr		// can't overflow
556e1051a39Sopenharmony_ci
557e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
558e1051a39Sopenharmony_ci	 mul	$t2,$a2,$a2		// a[2]*a[2]
559e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$acc2
560e1051a39Sopenharmony_ci	 umulh	$a2,$a2,$a2
561e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$acc3
562e1051a39Sopenharmony_ci	 mul	$t3,$a3,$a3		// a[3]*a[3]
563e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$acc4
564e1051a39Sopenharmony_ci	 umulh	$a3,$a3,$a3
565e1051a39Sopenharmony_ci	adcs	$acc5,$acc5,$acc5
566e1051a39Sopenharmony_ci	adcs	$acc6,$acc6,$acc6
567e1051a39Sopenharmony_ci	adc	$acc7,xzr,xzr
568e1051a39Sopenharmony_ci
569e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
570e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t1
571e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$a1
572e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$t2
573e1051a39Sopenharmony_ci	adcs	$acc5,$acc5,$a2
574e1051a39Sopenharmony_ci	 lsl	$t0,$acc0,#32
575e1051a39Sopenharmony_ci	adcs	$acc6,$acc6,$t3
576e1051a39Sopenharmony_ci	 lsr	$t1,$acc0,#32
577e1051a39Sopenharmony_ci	adc	$acc7,$acc7,$a3
578e1051a39Sopenharmony_ci___
# Emit three identical reduction steps for the squaring.  Only the low four
# words $acc0-$acc3 take part: each step retires the current $acc0, moves
# the remaining words down by one position, and recomputes the $t0/$t1
# shifts of the new $acc0 for the step after it.
579e1051a39Sopenharmony_cifor($i=0;$i<3;$i++) {			# reductions, see commentary in
580e1051a39Sopenharmony_ci					# multiplication for details
581e1051a39Sopenharmony_ci$code.=<<___;
582e1051a39Sopenharmony_ci	subs	$t2,$acc0,$t0		// "*0xffff0001"
583e1051a39Sopenharmony_ci	sbc	$t3,$acc0,$t1
584e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
585e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t1
586e1051a39Sopenharmony_ci	 lsl	$t0,$acc0,#32
587e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
588e1051a39Sopenharmony_ci	 lsr	$t1,$acc0,#32
589e1051a39Sopenharmony_ci	adc	$acc3,$t3,xzr		// can't overflow
590e1051a39Sopenharmony_ci___
591e1051a39Sopenharmony_ci}
592e1051a39Sopenharmony_ci$code.=<<___;
593e1051a39Sopenharmony_ci	subs	$t2,$acc0,$t0		// "*0xffff0001"
594e1051a39Sopenharmony_ci	sbc	$t3,$acc0,$t1
595e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
596e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t1
597e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
598e1051a39Sopenharmony_ci	adc	$acc3,$t3,xzr		// can't overflow
599e1051a39Sopenharmony_ci
600e1051a39Sopenharmony_ci	adds	$acc0,$acc0,$acc4	// accumulate upper half
601e1051a39Sopenharmony_ci	adcs	$acc1,$acc1,$acc5
602e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$acc6
603e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$acc7
604e1051a39Sopenharmony_ci	adc	$acc4,xzr,xzr
605e1051a39Sopenharmony_ci
606e1051a39Sopenharmony_ci	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
607e1051a39Sopenharmony_ci	sbcs	$t1,$acc1,$poly1
608e1051a39Sopenharmony_ci	sbcs	$t2,$acc2,xzr
609e1051a39Sopenharmony_ci	sbcs	$t3,$acc3,$poly3
610e1051a39Sopenharmony_ci	sbcs	xzr,$acc4,xzr		// did it borrow?
611e1051a39Sopenharmony_ci
612e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
613e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,lo
614e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,lo
615e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
616e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,lo
617e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
618e1051a39Sopenharmony_ci
619e1051a39Sopenharmony_ci	ret
620e1051a39Sopenharmony_ci.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
621e1051a39Sopenharmony_ci
622e1051a39Sopenharmony_ci// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
623e1051a39Sopenharmony_ci// $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
624e1051a39Sopenharmony_ci// contexts, e.g. in multiplication by 2 and 3...
//
// Computes ret = a+b followed by one conditional subtraction of the
// modulus. The result is stored to [$rp] and is also left in
// $acc0-$acc3; $t0-$t3 and $ap are clobbered. Only words 1 and 3 of the
// modulus come from registers ($poly1, $poly3): word 0 is all-ones and is
// handled by adding 1 instead of subtracting -1, word 2 is zero (xzr).
// NOTE(review): a single subtraction only gives a fully reduced value if
// both inputs were already below the modulus - confirm against callers.
625e1051a39Sopenharmony_ci.type	__ecp_nistz256_add,%function
626e1051a39Sopenharmony_ci.align	4
627e1051a39Sopenharmony_ci__ecp_nistz256_add:
628e1051a39Sopenharmony_ci	adds	$acc0,$acc0,$t0		// ret = a+b
629e1051a39Sopenharmony_ci	adcs	$acc1,$acc1,$t1
630e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t2
631e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t3
632e1051a39Sopenharmony_ci	adc	$ap,xzr,xzr		// zap $ap, it now holds the carry out of a+b
633e1051a39Sopenharmony_ci
634e1051a39Sopenharmony_ci	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
635e1051a39Sopenharmony_ci	sbcs	$t1,$acc1,$poly1
636e1051a39Sopenharmony_ci	sbcs	$t2,$acc2,xzr
637e1051a39Sopenharmony_ci	sbcs	$t3,$acc3,$poly3
638e1051a39Sopenharmony_ci	sbcs	xzr,$ap,xzr		// did subtraction borrow?
639e1051a39Sopenharmony_ci
640e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
641e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,lo
642e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,lo
643e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
644e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,lo
645e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
646e1051a39Sopenharmony_ci
647e1051a39Sopenharmony_ci	ret
648e1051a39Sopenharmony_ci.size	__ecp_nistz256_add,.-__ecp_nistz256_add
649e1051a39Sopenharmony_ci
// __ecp_nistz256_sub_from computes ret = a-b, where a is pre-loaded to
// $acc0-$acc3 and b is read from memory at [$bp]. If the subtraction
// borrows, the modulus is added back once. The result is stored to [$rp]
// and is also left in $acc0-$acc3; $t0-$t3 and $ap are clobbered.
650e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_from,%function
651e1051a39Sopenharmony_ci.align	4
652e1051a39Sopenharmony_ci__ecp_nistz256_sub_from:
653e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp]
654e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp,#16]
655e1051a39Sopenharmony_ci	subs	$acc0,$acc0,$t0		// ret = a-b
656e1051a39Sopenharmony_ci	sbcs	$acc1,$acc1,$t1
657e1051a39Sopenharmony_ci	sbcs	$acc2,$acc2,$t2
658e1051a39Sopenharmony_ci	sbcs	$acc3,$acc3,$t3
659e1051a39Sopenharmony_ci	sbc	$ap,xzr,xzr		// zap $ap, it is now -1 on borrow and 0 otherwise
660e1051a39Sopenharmony_ci
661e1051a39Sopenharmony_ci	subs	$t0,$acc0,#1		// adds	$t0,$acc0,#-1 // tmp = ret+modulus
662e1051a39Sopenharmony_ci	adcs	$t1,$acc1,$poly1
663e1051a39Sopenharmony_ci	adcs	$t2,$acc2,xzr
664e1051a39Sopenharmony_ci	adc	$t3,$acc3,$poly3
665e1051a39Sopenharmony_ci	cmp	$ap,xzr			// did subtraction borrow?
666e1051a39Sopenharmony_ci
667e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
668e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,eq
669e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,eq
670e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
671e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,eq
672e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
673e1051a39Sopenharmony_ci
674e1051a39Sopenharmony_ci	ret
675e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
676e1051a39Sopenharmony_ci
// __ecp_nistz256_sub_morf computes ret = b-a, i.e. the same subtraction as
// __ecp_nistz256_sub_from with the operands swapped: a is pre-loaded to
// $acc0-$acc3 and b is read from memory at [$bp]. If the subtraction
// borrows, the modulus is added back once. The result is stored to [$rp]
// and is also left in $acc0-$acc3; $t0-$t3 and $ap are clobbered.
677e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_morf,%function
678e1051a39Sopenharmony_ci.align	4
679e1051a39Sopenharmony_ci__ecp_nistz256_sub_morf:
680e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp]
681e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp,#16]
682e1051a39Sopenharmony_ci	subs	$acc0,$t0,$acc0		// ret = b-a
683e1051a39Sopenharmony_ci	sbcs	$acc1,$t1,$acc1
684e1051a39Sopenharmony_ci	sbcs	$acc2,$t2,$acc2
685e1051a39Sopenharmony_ci	sbcs	$acc3,$t3,$acc3
686e1051a39Sopenharmony_ci	sbc	$ap,xzr,xzr		// zap $ap, it is now -1 on borrow and 0 otherwise
687e1051a39Sopenharmony_ci
688e1051a39Sopenharmony_ci	subs	$t0,$acc0,#1		// adds	$t0,$acc0,#-1 // tmp = ret+modulus
689e1051a39Sopenharmony_ci	adcs	$t1,$acc1,$poly1
690e1051a39Sopenharmony_ci	adcs	$t2,$acc2,xzr
691e1051a39Sopenharmony_ci	adc	$t3,$acc3,$poly3
692e1051a39Sopenharmony_ci	cmp	$ap,xzr			// did subtraction borrow?
693e1051a39Sopenharmony_ci
694e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
695e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,eq
696e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,eq
697e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
698e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,eq
699e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
700e1051a39Sopenharmony_ci
701e1051a39Sopenharmony_ci	ret
702e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
703e1051a39Sopenharmony_ci
// __ecp_nistz256_div_by_2 halves the value pre-loaded to $acc0-$acc3.
// An odd input is first made even by adding the modulus; the carry out of
// that addition is kept in $ap and shifted back in as the top bit of the
// result. The result is stored to [$rp] and is also left in $acc0-$acc3;
// $t0-$t3 and $ap are clobbered.
704e1051a39Sopenharmony_ci.type	__ecp_nistz256_div_by_2,%function
705e1051a39Sopenharmony_ci.align	4
706e1051a39Sopenharmony_ci__ecp_nistz256_div_by_2:
707e1051a39Sopenharmony_ci	subs	$t0,$acc0,#1		// adds	$t0,$acc0,#-1 // tmp = a+modulus
708e1051a39Sopenharmony_ci	adcs	$t1,$acc1,$poly1
709e1051a39Sopenharmony_ci	adcs	$t2,$acc2,xzr
710e1051a39Sopenharmony_ci	adcs	$t3,$acc3,$poly3
711e1051a39Sopenharmony_ci	adc	$ap,xzr,xzr		// zap $ap, it now holds the carry out of a+modulus
712e1051a39Sopenharmony_ci	tst	$acc0,#1		// is a even?
713e1051a39Sopenharmony_ci
714e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
715e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,eq
716e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,eq
717e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,eq
718e1051a39Sopenharmony_ci	csel	$ap,xzr,$ap,eq		// the carry only applies when the modulus was added
719e1051a39Sopenharmony_ci
720e1051a39Sopenharmony_ci	lsr	$acc0,$acc0,#1		// ret >>= 1
721e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$acc1,lsl#63
722e1051a39Sopenharmony_ci	lsr	$acc1,$acc1,#1
723e1051a39Sopenharmony_ci	orr	$acc1,$acc1,$acc2,lsl#63
724e1051a39Sopenharmony_ci	lsr	$acc2,$acc2,#1
725e1051a39Sopenharmony_ci	orr	$acc2,$acc2,$acc3,lsl#63
726e1051a39Sopenharmony_ci	lsr	$acc3,$acc3,#1
727e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
728e1051a39Sopenharmony_ci	orr	$acc3,$acc3,$ap,lsl#63
729e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
730e1051a39Sopenharmony_ci
731e1051a39Sopenharmony_ci	ret
732e1051a39Sopenharmony_ci.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
733e1051a39Sopenharmony_ci___
734e1051a39Sopenharmony_ci########################################################################
735e1051a39Sopenharmony_ci# following subroutines are "literal" implementation of those found in
736e1051a39Sopenharmony_ci# ecp_nistz256.c
737e1051a39Sopenharmony_ci#
738e1051a39Sopenharmony_ci########################################################################
739e1051a39Sopenharmony_ci# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
740e1051a39Sopenharmony_ci#
741e1051a39Sopenharmony_ci{
# Generator scope for ecp_nistz256_point_double. $S, $M, $Zsqr and $tmp0
# are byte offsets (0, 32, 64, 96) from sp of the four scratch vectors the
# routine reserves with "sub sp,sp,#32*4".
742e1051a39Sopenharmony_cimy ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
743e1051a39Sopenharmony_ci# above map() describes stack layout with 4 temporary
744e1051a39Sopenharmony_ci# 256-bit vectors on top.
# x21/x22 keep the caller's output and input pointers, because $rp and $bp
# are re-pointed before almost every helper call in the body below.
745e1051a39Sopenharmony_cimy ($rp_real,$ap_real) = map("x$_",(21,22));
746e1051a39Sopenharmony_ci
# Reading notes for the assembly emitted below:
#  * The input point is read as three 32-byte coordinates at offsets
#    0 (in_x), 32 (in_y) and 64 (in_z) from $ap_real; the result is written
#    with the same layout through $rp_real.
#  * The trailing "// p256_...(...)" comment on each "bl" names the C-level
#    operation that the call implements.
#  * The add/sub/sqr helpers visible in this file leave their result in
#    $acc0-$acc3 as well as storing it to [$rp]; steps such as
#    "mov $t0,$acc0" right after a call rely on that.
#  * Loads marked "forward load" fetch operands for the call after the
#    next one; they target $a0-$a3/$bi, which the intervening
#    add/sub/div_by_2 helpers do not touch.
#  * .Ldouble_shortcut is also entered from ecp_nistz256_point_add (label
#    .Ladd_double), which restores x23-x28 and shrinks its 12-vector
#    scratch area to 4 vectors first. Both functions push a 96-byte save
#    area, so the epilogue here is valid for either entry even though this
#    function itself only stores x19-x22.
747e1051a39Sopenharmony_ci$code.=<<___;
748e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_double
749e1051a39Sopenharmony_ci.type	ecp_nistz256_point_double,%function
750e1051a39Sopenharmony_ci.align	5
751e1051a39Sopenharmony_ciecp_nistz256_point_double:
752e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
753e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-96]!
754e1051a39Sopenharmony_ci	add	x29,sp,#0
755e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
756e1051a39Sopenharmony_ci	stp	x21,x22,[sp,#32]
757e1051a39Sopenharmony_ci	sub	sp,sp,#32*4
758e1051a39Sopenharmony_ci
759e1051a39Sopenharmony_ci.Ldouble_shortcut:
760e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap,#32]
761e1051a39Sopenharmony_ci	 mov	$rp_real,$rp
762e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap,#48]
763e1051a39Sopenharmony_ci	 mov	$ap_real,$ap
764e1051a39Sopenharmony_ci	 adrp	$poly3,.Lpoly
765e1051a39Sopenharmony_ci	 add	$poly3,$poly3,:lo12:.Lpoly
766e1051a39Sopenharmony_ci	 ldr	$poly1,[$poly3,#8]
767e1051a39Sopenharmony_ci	mov	$t0,$acc0
768e1051a39Sopenharmony_ci	 ldr	$poly3,[$poly3,#24]
769e1051a39Sopenharmony_ci	mov	$t1,$acc1
770e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
771e1051a39Sopenharmony_ci	mov	$t2,$acc2
772e1051a39Sopenharmony_ci	mov	$t3,$acc3
773e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[$ap_real,#64+16]
774e1051a39Sopenharmony_ci	add	$rp,sp,#$S
775e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);
776e1051a39Sopenharmony_ci
777e1051a39Sopenharmony_ci	add	$rp,sp,#$Zsqr
778e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
779e1051a39Sopenharmony_ci
780e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$ap_real]
781e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$ap_real,#16]
782e1051a39Sopenharmony_ci	mov	$a0,$acc0		// put Zsqr aside for p256_sub
783e1051a39Sopenharmony_ci	mov	$a1,$acc1
784e1051a39Sopenharmony_ci	mov	$a2,$acc2
785e1051a39Sopenharmony_ci	mov	$a3,$acc3
786e1051a39Sopenharmony_ci	add	$rp,sp,#$M
787e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);
788e1051a39Sopenharmony_ci
789e1051a39Sopenharmony_ci	add	$bp,$ap_real,#0
790e1051a39Sopenharmony_ci	mov	$acc0,$a0		// restore Zsqr
791e1051a39Sopenharmony_ci	mov	$acc1,$a1
792e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
793e1051a39Sopenharmony_ci	mov	$acc2,$a2
794e1051a39Sopenharmony_ci	mov	$acc3,$a3
795e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$S+16]
796e1051a39Sopenharmony_ci	add	$rp,sp,#$Zsqr
797e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
798e1051a39Sopenharmony_ci
799e1051a39Sopenharmony_ci	add	$rp,sp,#$S
800e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
801e1051a39Sopenharmony_ci
802e1051a39Sopenharmony_ci	ldr	$bi,[$ap_real,#32]
803e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap_real,#64]
804e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap_real,#64+16]
805e1051a39Sopenharmony_ci	add	$bp,$ap_real,#32
806e1051a39Sopenharmony_ci	add	$rp,sp,#$tmp0
807e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
808e1051a39Sopenharmony_ci
809e1051a39Sopenharmony_ci	mov	$t0,$acc0
810e1051a39Sopenharmony_ci	mov	$t1,$acc1
811e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
812e1051a39Sopenharmony_ci	mov	$t2,$acc2
813e1051a39Sopenharmony_ci	mov	$t3,$acc3
814e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$S+16]
815e1051a39Sopenharmony_ci	add	$rp,$rp_real,#64
816e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);
817e1051a39Sopenharmony_ci
818e1051a39Sopenharmony_ci	add	$rp,sp,#$tmp0
819e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
820e1051a39Sopenharmony_ci
821e1051a39Sopenharmony_ci	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
822e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$M]
823e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$M+16]
824e1051a39Sopenharmony_ci	add	$rp,$rp_real,#32
825e1051a39Sopenharmony_ci	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
826e1051a39Sopenharmony_ci
827e1051a39Sopenharmony_ci	add	$bp,sp,#$Zsqr
828e1051a39Sopenharmony_ci	add	$rp,sp,#$M
829e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
830e1051a39Sopenharmony_ci
831e1051a39Sopenharmony_ci	mov	$t0,$acc0		// duplicate M
832e1051a39Sopenharmony_ci	mov	$t1,$acc1
833e1051a39Sopenharmony_ci	mov	$t2,$acc2
834e1051a39Sopenharmony_ci	mov	$t3,$acc3
835e1051a39Sopenharmony_ci	mov	$a0,$acc0		// put M aside
836e1051a39Sopenharmony_ci	mov	$a1,$acc1
837e1051a39Sopenharmony_ci	mov	$a2,$acc2
838e1051a39Sopenharmony_ci	mov	$a3,$acc3
839e1051a39Sopenharmony_ci	add	$rp,sp,#$M
840e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add
841e1051a39Sopenharmony_ci	mov	$t0,$a0			// restore M
842e1051a39Sopenharmony_ci	mov	$t1,$a1
843e1051a39Sopenharmony_ci	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
844e1051a39Sopenharmony_ci	mov	$t2,$a2
845e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$S]
846e1051a39Sopenharmony_ci	mov	$t3,$a3
847e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$S+16]
848e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);
849e1051a39Sopenharmony_ci
850e1051a39Sopenharmony_ci	add	$bp,$ap_real,#0
851e1051a39Sopenharmony_ci	add	$rp,sp,#$S
852e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
853e1051a39Sopenharmony_ci
854e1051a39Sopenharmony_ci	mov	$t0,$acc0
855e1051a39Sopenharmony_ci	mov	$t1,$acc1
856e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
857e1051a39Sopenharmony_ci	mov	$t2,$acc2
858e1051a39Sopenharmony_ci	mov	$t3,$acc3
859e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$M+16]
860e1051a39Sopenharmony_ci	add	$rp,sp,#$tmp0
861e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);
862e1051a39Sopenharmony_ci
863e1051a39Sopenharmony_ci	add	$rp,$rp_real,#0
864e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
865e1051a39Sopenharmony_ci
866e1051a39Sopenharmony_ci	add	$bp,sp,#$tmp0
867e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
868e1051a39Sopenharmony_ci
869e1051a39Sopenharmony_ci	add	$bp,sp,#$S
870e1051a39Sopenharmony_ci	add	$rp,sp,#$S
871e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
872e1051a39Sopenharmony_ci
873e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$M]
874e1051a39Sopenharmony_ci	mov	$a0,$acc0		// copy S
875e1051a39Sopenharmony_ci	mov	$a1,$acc1
876e1051a39Sopenharmony_ci	mov	$a2,$acc2
877e1051a39Sopenharmony_ci	mov	$a3,$acc3
878e1051a39Sopenharmony_ci	add	$bp,sp,#$M
879e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
880e1051a39Sopenharmony_ci
881e1051a39Sopenharmony_ci	add	$bp,$rp_real,#32
882e1051a39Sopenharmony_ci	add	$rp,$rp_real,#32
883e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
884e1051a39Sopenharmony_ci
885e1051a39Sopenharmony_ci	add	sp,x29,#0		// destroy frame
886e1051a39Sopenharmony_ci	ldp	x19,x20,[x29,#16]
887e1051a39Sopenharmony_ci	ldp	x21,x22,[x29,#32]
888e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#96
889e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
890e1051a39Sopenharmony_ci	ret
891e1051a39Sopenharmony_ci.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
892e1051a39Sopenharmony_ci___
893e1051a39Sopenharmony_ci}
894e1051a39Sopenharmony_ci
895e1051a39Sopenharmony_ci########################################################################
896e1051a39Sopenharmony_ci# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
897e1051a39Sopenharmony_ci#			      const P256_POINT *in2);
898e1051a39Sopenharmony_ci{
# Generator scope for ecp_nistz256_point_add. The names below are byte
# offsets from sp of twelve 32-byte scratch vectors reserved with
# "sub sp,sp,#32*12". The first three (res_x, res_y, res_z) hold the
# computed sum until the final conditional copy to the output.
899e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
900e1051a39Sopenharmony_ci    $H,$Hsqr,$R,$Rsqr,$Hcub,
901e1051a39Sopenharmony_ci    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
902e1051a39Sopenharmony_cimy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
903e1051a39Sopenharmony_ci# above map() describes stack layout with 12 temporary
904e1051a39Sopenharmony_ci# 256-bit vectors on top.
# Z1sqr/Z2sqr share the Hsqr/Rsqr slots: both squares are last read by the
# U1/U2 multiplications, before Rsqr and Hsqr are first written.
#
# x21-x28 hold state that must survive the helper calls: the three
# caller pointers, two masks and three scratch words. $in1infty/$in2infty
# are set with "csetm ...,ne" on the OR of the four words of the
# respective Z coordinate, so each mask is all-ones when that Z is
# non-zero and zero when it is all-zero (hence the "~in1infty" wording
# in the assembly comments).
905e1051a39Sopenharmony_cimy ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));
906e1051a39Sopenharmony_ci
# Reading notes for the assembly emitted below:
#  * The trailing "// p256_...(...)" comment on each "bl" names the C-level
#    operation that the call implements.
#  * After H = U2-U1 and R = S2-S1 are formed, the code ORs together the
#    words of H, the words of R and the complements of both masks. Only if
#    that is zero (H == 0, R == 0, neither Z is zero) does it fall into
#    .Ladd_double, which restores x23-x28, shrinks the scratch area from 12
#    to 4 vectors and jumps to .Ldouble_shortcut in
#    ecp_nistz256_point_double.
#  * The final "conditional moves" section selects, per 32-byte coordinate,
#    res when the in1 mask is set and in2 otherwise, then keeps that value
#    when the in2 mask is set and takes in1 otherwise. The loop emits the
#    code for offsets 0 and 32; the here-doc after the loop reuses the
#    package variable $i, which is 64 at that point, for the last
#    coordinate.
907e1051a39Sopenharmony_ci$code.=<<___;
908e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add
909e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add,%function
910e1051a39Sopenharmony_ci.align	5
911e1051a39Sopenharmony_ciecp_nistz256_point_add:
912e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
913e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-96]!
914e1051a39Sopenharmony_ci	add	x29,sp,#0
915e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
916e1051a39Sopenharmony_ci	stp	x21,x22,[sp,#32]
917e1051a39Sopenharmony_ci	stp	x23,x24,[sp,#48]
918e1051a39Sopenharmony_ci	stp	x25,x26,[sp,#64]
919e1051a39Sopenharmony_ci	stp	x27,x28,[sp,#80]
920e1051a39Sopenharmony_ci	sub	sp,sp,#32*12
921e1051a39Sopenharmony_ci
922e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$bp,#64]	// in2_z
923e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$bp,#64+16]
924e1051a39Sopenharmony_ci	 mov	$rp_real,$rp
925e1051a39Sopenharmony_ci	 mov	$ap_real,$ap
926e1051a39Sopenharmony_ci	 mov	$bp_real,$bp
927e1051a39Sopenharmony_ci	 adrp	$poly3,.Lpoly
928e1051a39Sopenharmony_ci	 add	$poly3,$poly3,:lo12:.Lpoly
929e1051a39Sopenharmony_ci	 ldr	$poly1,[$poly3,#8]
930e1051a39Sopenharmony_ci	 ldr	$poly3,[$poly3,#24]
931e1051a39Sopenharmony_ci	orr	$t0,$a0,$a1
932e1051a39Sopenharmony_ci	orr	$t2,$a2,$a3
933e1051a39Sopenharmony_ci	orr	$in2infty,$t0,$t2
934e1051a39Sopenharmony_ci	cmp	$in2infty,#0
935e1051a39Sopenharmony_ci	csetm	$in2infty,ne		// ~in2infty
936e1051a39Sopenharmony_ci	add	$rp,sp,#$Z2sqr
937e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
938e1051a39Sopenharmony_ci
939e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
940e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap_real,#64+16]
941e1051a39Sopenharmony_ci	orr	$t0,$a0,$a1
942e1051a39Sopenharmony_ci	orr	$t2,$a2,$a3
943e1051a39Sopenharmony_ci	orr	$in1infty,$t0,$t2
944e1051a39Sopenharmony_ci	cmp	$in1infty,#0
945e1051a39Sopenharmony_ci	csetm	$in1infty,ne		// ~in1infty
946e1051a39Sopenharmony_ci	add	$rp,sp,#$Z1sqr
947e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
948e1051a39Sopenharmony_ci
949e1051a39Sopenharmony_ci	ldr	$bi,[$bp_real,#64]
950e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$Z2sqr]
951e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$Z2sqr+16]
952e1051a39Sopenharmony_ci	add	$bp,$bp_real,#64
953e1051a39Sopenharmony_ci	add	$rp,sp,#$S1
954e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
955e1051a39Sopenharmony_ci
956e1051a39Sopenharmony_ci	ldr	$bi,[$ap_real,#64]
957e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$Z1sqr]
958e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$Z1sqr+16]
959e1051a39Sopenharmony_ci	add	$bp,$ap_real,#64
960e1051a39Sopenharmony_ci	add	$rp,sp,#$S2
961e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
962e1051a39Sopenharmony_ci
963e1051a39Sopenharmony_ci	ldr	$bi,[$ap_real,#32]
964e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$S1]
965e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$S1+16]
966e1051a39Sopenharmony_ci	add	$bp,$ap_real,#32
967e1051a39Sopenharmony_ci	add	$rp,sp,#$S1
968e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
969e1051a39Sopenharmony_ci
970e1051a39Sopenharmony_ci	ldr	$bi,[$bp_real,#32]
971e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$S2]
972e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$S2+16]
973e1051a39Sopenharmony_ci	add	$bp,$bp_real,#32
974e1051a39Sopenharmony_ci	add	$rp,sp,#$S2
975e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
976e1051a39Sopenharmony_ci
977e1051a39Sopenharmony_ci	add	$bp,sp,#$S1
978e1051a39Sopenharmony_ci	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
979e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[$ap_real]
980e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[$ap_real,#16]
981e1051a39Sopenharmony_ci	add	$rp,sp,#$R
982e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
983e1051a39Sopenharmony_ci
984e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$acc1	// see if result is zero
985e1051a39Sopenharmony_ci	orr	$acc2,$acc2,$acc3
986e1051a39Sopenharmony_ci	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)
987e1051a39Sopenharmony_ci
988e1051a39Sopenharmony_ci	add	$bp,sp,#$Z2sqr
989e1051a39Sopenharmony_ci	add	$rp,sp,#$U1
990e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
991e1051a39Sopenharmony_ci
992e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$Z1sqr]
993e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$bp_real]
994e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$bp_real,#16]
995e1051a39Sopenharmony_ci	add	$bp,sp,#$Z1sqr
996e1051a39Sopenharmony_ci	add	$rp,sp,#$U2
997e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
998e1051a39Sopenharmony_ci
999e1051a39Sopenharmony_ci	add	$bp,sp,#$U1
1000e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
1001e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$R+16]
1002e1051a39Sopenharmony_ci	add	$rp,sp,#$H
1003e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
1004e1051a39Sopenharmony_ci
1005e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$acc1	// see if result is zero
1006e1051a39Sopenharmony_ci	orr	$acc2,$acc2,$acc3
1007e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)
1008e1051a39Sopenharmony_ci
1009e1051a39Sopenharmony_ci	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
1010e1051a39Sopenharmony_ci	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
1011e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$temp1
1012e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$temp2
1013e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$temp0
1014e1051a39Sopenharmony_ci	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
1015e1051a39Sopenharmony_ci
1016e1051a39Sopenharmony_ci.Ladd_double:
1017e1051a39Sopenharmony_ci	mov	$ap,$ap_real
1018e1051a39Sopenharmony_ci	mov	$rp,$rp_real
1019e1051a39Sopenharmony_ci	ldp	x23,x24,[x29,#48]
1020e1051a39Sopenharmony_ci	ldp	x25,x26,[x29,#64]
1021e1051a39Sopenharmony_ci	ldp	x27,x28,[x29,#80]
1022e1051a39Sopenharmony_ci	add	sp,sp,#32*(12-4)	// difference in stack frames
1023e1051a39Sopenharmony_ci	b	.Ldouble_shortcut
1024e1051a39Sopenharmony_ci
1025e1051a39Sopenharmony_ci.align	4
1026e1051a39Sopenharmony_ci.Ladd_proceed:
1027e1051a39Sopenharmony_ci	add	$rp,sp,#$Rsqr
1028e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
1029e1051a39Sopenharmony_ci
1030e1051a39Sopenharmony_ci	ldr	$bi,[$ap_real,#64]
1031e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$H]
1032e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$H+16]
1033e1051a39Sopenharmony_ci	add	$bp,$ap_real,#64
1034e1051a39Sopenharmony_ci	add	$rp,sp,#$res_z
1035e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
1036e1051a39Sopenharmony_ci
1037e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$H]
1038e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$H+16]
1039e1051a39Sopenharmony_ci	add	$rp,sp,#$Hsqr
1040e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
1041e1051a39Sopenharmony_ci
1042e1051a39Sopenharmony_ci	ldr	$bi,[$bp_real,#64]
1043e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_z]
1044e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_z+16]
1045e1051a39Sopenharmony_ci	add	$bp,$bp_real,#64
1046e1051a39Sopenharmony_ci	add	$rp,sp,#$res_z
1047e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
1048e1051a39Sopenharmony_ci
1049e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$H]
1050e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$Hsqr]
1051e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$Hsqr+16]
1052e1051a39Sopenharmony_ci	add	$bp,sp,#$H
1053e1051a39Sopenharmony_ci	add	$rp,sp,#$Hcub
1054e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
1055e1051a39Sopenharmony_ci
1056e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$Hsqr]
1057e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$U1]
1058e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$U1+16]
1059e1051a39Sopenharmony_ci	add	$bp,sp,#$Hsqr
1060e1051a39Sopenharmony_ci	add	$rp,sp,#$U2
1061e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
1062e1051a39Sopenharmony_ci
1063e1051a39Sopenharmony_ci	mov	$t0,$acc0
1064e1051a39Sopenharmony_ci	mov	$t1,$acc1
1065e1051a39Sopenharmony_ci	mov	$t2,$acc2
1066e1051a39Sopenharmony_ci	mov	$t3,$acc3
1067e1051a39Sopenharmony_ci	add	$rp,sp,#$Hsqr
1068e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);
1069e1051a39Sopenharmony_ci
1070e1051a39Sopenharmony_ci	add	$bp,sp,#$Rsqr
1071e1051a39Sopenharmony_ci	add	$rp,sp,#$res_x
1072e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
1073e1051a39Sopenharmony_ci
1074e1051a39Sopenharmony_ci	add	$bp,sp,#$Hcub
1075e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
1076e1051a39Sopenharmony_ci
1077e1051a39Sopenharmony_ci	add	$bp,sp,#$U2
1078e1051a39Sopenharmony_ci	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
1079e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$S1]
1080e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$S1+16]
1081e1051a39Sopenharmony_ci	add	$rp,sp,#$res_y
1082e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
1083e1051a39Sopenharmony_ci
1084e1051a39Sopenharmony_ci	add	$bp,sp,#$Hcub
1085e1051a39Sopenharmony_ci	add	$rp,sp,#$S2
1086e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
1087e1051a39Sopenharmony_ci
1088e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$R]
1089e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_y]
1090e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_y+16]
1091e1051a39Sopenharmony_ci	add	$bp,sp,#$R
1092e1051a39Sopenharmony_ci	add	$rp,sp,#$res_y
1093e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
1094e1051a39Sopenharmony_ci
1095e1051a39Sopenharmony_ci	add	$bp,sp,#$S2
1096e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
1097e1051a39Sopenharmony_ci
1098e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_x]		// res
1099e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_x+16]
1100e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp_real]		// in2
1101e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp_real,#16]
1102e1051a39Sopenharmony_ci___
1103e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=32) {		# conditional moves
1104e1051a39Sopenharmony_ci$code.=<<___;
1105e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1106e1051a39Sopenharmony_ci	cmp	$in1infty,#0			// ~in1infty, remember?
1107e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1108e1051a39Sopenharmony_ci	csel	$t0,$a0,$t0,ne
1109e1051a39Sopenharmony_ci	csel	$t1,$a1,$t1,ne
1110e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
1111e1051a39Sopenharmony_ci	csel	$t2,$a2,$t2,ne
1112e1051a39Sopenharmony_ci	csel	$t3,$a3,$t3,ne
1113e1051a39Sopenharmony_ci	cmp	$in2infty,#0			// ~in2infty, remember?
1114e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_x+$i+48]
1115e1051a39Sopenharmony_ci	csel	$acc0,$t0,$acc0,ne
1116e1051a39Sopenharmony_ci	csel	$acc1,$t1,$acc1,ne
1117e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
1118e1051a39Sopenharmony_ci	csel	$acc2,$t2,$acc2,ne
1119e1051a39Sopenharmony_ci	csel	$acc3,$t3,$acc3,ne
1120e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp_real,#$i+48]
1121e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp_real,#$i]
1122e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp_real,#$i+16]
1123e1051a39Sopenharmony_ci___
1124e1051a39Sopenharmony_ci}
1125e1051a39Sopenharmony_ci$code.=<<___;
1126e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1127e1051a39Sopenharmony_ci	cmp	$in1infty,#0			// ~in1infty, remember?
1128e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1129e1051a39Sopenharmony_ci	csel	$t0,$a0,$t0,ne
1130e1051a39Sopenharmony_ci	csel	$t1,$a1,$t1,ne
1131e1051a39Sopenharmony_ci	csel	$t2,$a2,$t2,ne
1132e1051a39Sopenharmony_ci	csel	$t3,$a3,$t3,ne
1133e1051a39Sopenharmony_ci	cmp	$in2infty,#0			// ~in2infty, remember?
1134e1051a39Sopenharmony_ci	csel	$acc0,$t0,$acc0,ne
1135e1051a39Sopenharmony_ci	csel	$acc1,$t1,$acc1,ne
1136e1051a39Sopenharmony_ci	csel	$acc2,$t2,$acc2,ne
1137e1051a39Sopenharmony_ci	csel	$acc3,$t3,$acc3,ne
1138e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp_real,#$i]
1139e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp_real,#$i+16]
1140e1051a39Sopenharmony_ci
1141e1051a39Sopenharmony_ci.Ladd_done:
1142e1051a39Sopenharmony_ci	add	sp,x29,#0		// destroy frame
1143e1051a39Sopenharmony_ci	ldp	x19,x20,[x29,#16]
1144e1051a39Sopenharmony_ci	ldp	x21,x22,[x29,#32]
1145e1051a39Sopenharmony_ci	ldp	x23,x24,[x29,#48]
1146e1051a39Sopenharmony_ci	ldp	x25,x26,[x29,#64]
1147e1051a39Sopenharmony_ci	ldp	x27,x28,[x29,#80]
1148e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#96
1149e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
1150e1051a39Sopenharmony_ci	ret
1151e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1152e1051a39Sopenharmony_ci___
1153e1051a39Sopenharmony_ci}
1154e1051a39Sopenharmony_ci
1155e1051a39Sopenharmony_ci########################################################################
1156e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1157e1051a39Sopenharmony_ci#				     const P256_POINT_AFFINE *in2);
1158e1051a39Sopenharmony_ci{
1159e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
1160e1051a39Sopenharmony_ci    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1161e1051a39Sopenharmony_cimy $Z1sqr = $S2;
1162e1051a39Sopenharmony_ci# above map() describes stack layout with 10 temporary
1163e1051a39Sopenharmony_ci# 256-bit vectors on top.
1164e1051a39Sopenharmony_cimy ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
1165e1051a39Sopenharmony_ci
1166e1051a39Sopenharmony_ci$code.=<<___;
1167e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add_affine
1168e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add_affine,%function
1169e1051a39Sopenharmony_ci.align	5
1170e1051a39Sopenharmony_ciecp_nistz256_point_add_affine:
1171e1051a39Sopenharmony_ci	.inst	0xd503233f		// paciasp
1172e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-80]!
1173e1051a39Sopenharmony_ci	add	x29,sp,#0
1174e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
1175e1051a39Sopenharmony_ci	stp	x21,x22,[sp,#32]
1176e1051a39Sopenharmony_ci	stp	x23,x24,[sp,#48]
1177e1051a39Sopenharmony_ci	stp	x25,x26,[sp,#64]
1178e1051a39Sopenharmony_ci	sub	sp,sp,#32*10
1179e1051a39Sopenharmony_ci
1180e1051a39Sopenharmony_ci	mov	$rp_real,$rp
1181e1051a39Sopenharmony_ci	mov	$ap_real,$ap
1182e1051a39Sopenharmony_ci	mov	$bp_real,$bp
1183e1051a39Sopenharmony_ci	adrp	$poly3,.Lpoly
1184e1051a39Sopenharmony_ci	add	$poly3,$poly3,:lo12:.Lpoly
1185e1051a39Sopenharmony_ci	ldr	$poly1,[$poly3,#8]
1186e1051a39Sopenharmony_ci	ldr	$poly3,[$poly3,#24]
1187e1051a39Sopenharmony_ci
1188e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap,#64]	// in1_z
1189e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#64+16]
1190e1051a39Sopenharmony_ci	orr	$t0,$a0,$a1
1191e1051a39Sopenharmony_ci	orr	$t2,$a2,$a3
1192e1051a39Sopenharmony_ci	orr	$in1infty,$t0,$t2
1193e1051a39Sopenharmony_ci	cmp	$in1infty,#0
1194e1051a39Sopenharmony_ci	csetm	$in1infty,ne		// ~in1infty
1195e1051a39Sopenharmony_ci
1196e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$bp]	// in2_x
1197e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$bp,#16]
1198e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp,#32]	// in2_y
1199e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp,#48]
1200e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$acc1
1201e1051a39Sopenharmony_ci	orr	$acc2,$acc2,$acc3
1202e1051a39Sopenharmony_ci	orr	$t0,$t0,$t1
1203e1051a39Sopenharmony_ci	orr	$t2,$t2,$t3
1204e1051a39Sopenharmony_ci	orr	$acc0,$acc0,$acc2
1205e1051a39Sopenharmony_ci	orr	$t0,$t0,$t2
1206e1051a39Sopenharmony_ci	orr	$in2infty,$acc0,$t0
1207e1051a39Sopenharmony_ci	cmp	$in2infty,#0
1208e1051a39Sopenharmony_ci	csetm	$in2infty,ne		// ~in2infty
1209e1051a39Sopenharmony_ci
1210e1051a39Sopenharmony_ci	add	$rp,sp,#$Z1sqr
1211e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
1212e1051a39Sopenharmony_ci
1213e1051a39Sopenharmony_ci	mov	$a0,$acc0
1214e1051a39Sopenharmony_ci	mov	$a1,$acc1
1215e1051a39Sopenharmony_ci	mov	$a2,$acc2
1216e1051a39Sopenharmony_ci	mov	$a3,$acc3
1217e1051a39Sopenharmony_ci	ldr	$bi,[$bp_real]
1218e1051a39Sopenharmony_ci	add	$bp,$bp_real,#0
1219e1051a39Sopenharmony_ci	add	$rp,sp,#$U2
1220e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
1221e1051a39Sopenharmony_ci
1222e1051a39Sopenharmony_ci	add	$bp,$ap_real,#0
1223e1051a39Sopenharmony_ci	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
1224e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$Z1sqr]
1225e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
1226e1051a39Sopenharmony_ci	add	$rp,sp,#$H
1227e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
1228e1051a39Sopenharmony_ci
1229e1051a39Sopenharmony_ci	add	$bp,$ap_real,#64
1230e1051a39Sopenharmony_ci	add	$rp,sp,#$S2
1231e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
1232e1051a39Sopenharmony_ci
1233e1051a39Sopenharmony_ci	ldr	$bi,[$ap_real,#64]
1234e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$H]
1235e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$H+16]
1236e1051a39Sopenharmony_ci	add	$bp,$ap_real,#64
1237e1051a39Sopenharmony_ci	add	$rp,sp,#$res_z
1238e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
1239e1051a39Sopenharmony_ci
1240e1051a39Sopenharmony_ci	ldr	$bi,[$bp_real,#32]
1241e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$S2]
1242e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$S2+16]
1243e1051a39Sopenharmony_ci	add	$bp,$bp_real,#32
1244e1051a39Sopenharmony_ci	add	$rp,sp,#$S2
1245e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
1246e1051a39Sopenharmony_ci
1247e1051a39Sopenharmony_ci	add	$bp,$ap_real,#32
1248e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
1249e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$H+16]
1250e1051a39Sopenharmony_ci	add	$rp,sp,#$R
1251e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
1252e1051a39Sopenharmony_ci
1253e1051a39Sopenharmony_ci	add	$rp,sp,#$Hsqr
1254e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
1255e1051a39Sopenharmony_ci
1256e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$R]
1257e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$R+16]
1258e1051a39Sopenharmony_ci	add	$rp,sp,#$Rsqr
1259e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
1260e1051a39Sopenharmony_ci
1261e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$H]
1262e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$Hsqr]
1263e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$Hsqr+16]
1264e1051a39Sopenharmony_ci	add	$bp,sp,#$H
1265e1051a39Sopenharmony_ci	add	$rp,sp,#$Hcub
1266e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
1267e1051a39Sopenharmony_ci
1268e1051a39Sopenharmony_ci	ldr	$bi,[$ap_real]
1269e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$Hsqr]
1270e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$Hsqr+16]
1271e1051a39Sopenharmony_ci	add	$bp,$ap_real,#0
1272e1051a39Sopenharmony_ci	add	$rp,sp,#$U2
1273e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
1274e1051a39Sopenharmony_ci
1275e1051a39Sopenharmony_ci	mov	$t0,$acc0
1276e1051a39Sopenharmony_ci	mov	$t1,$acc1
1277e1051a39Sopenharmony_ci	mov	$t2,$acc2
1278e1051a39Sopenharmony_ci	mov	$t3,$acc3
1279e1051a39Sopenharmony_ci	add	$rp,sp,#$Hsqr
1280e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);
1281e1051a39Sopenharmony_ci
1282e1051a39Sopenharmony_ci	add	$bp,sp,#$Rsqr
1283e1051a39Sopenharmony_ci	add	$rp,sp,#$res_x
1284e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
1285e1051a39Sopenharmony_ci
1286e1051a39Sopenharmony_ci	add	$bp,sp,#$Hcub
1287e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
1288e1051a39Sopenharmony_ci
1289e1051a39Sopenharmony_ci	add	$bp,sp,#$U2
1290e1051a39Sopenharmony_ci	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
1291e1051a39Sopenharmony_ci	 ldp	$a0,$a1,[sp,#$Hcub]
1292e1051a39Sopenharmony_ci	 ldp	$a2,$a3,[sp,#$Hcub+16]
1293e1051a39Sopenharmony_ci	add	$rp,sp,#$res_y
1294e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
1295e1051a39Sopenharmony_ci
1296e1051a39Sopenharmony_ci	add	$bp,$ap_real,#32
1297e1051a39Sopenharmony_ci	add	$rp,sp,#$S2
1298e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
1299e1051a39Sopenharmony_ci
1300e1051a39Sopenharmony_ci	ldr	$bi,[sp,#$R]
1301e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_y]
1302e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_y+16]
1303e1051a39Sopenharmony_ci	add	$bp,sp,#$R
1304e1051a39Sopenharmony_ci	add	$rp,sp,#$res_y
1305e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
1306e1051a39Sopenharmony_ci
1307e1051a39Sopenharmony_ci	add	$bp,sp,#$S2
1308e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
1309e1051a39Sopenharmony_ci
1310e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_x]		// res
1311e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_x+16]
1312e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp_real]		// in2
1313e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp_real,#16]
1314e1051a39Sopenharmony_ci___
1315e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=32) {		# conditional moves
1316e1051a39Sopenharmony_ci$code.=<<___;
1317e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1318e1051a39Sopenharmony_ci	cmp	$in1infty,#0			// ~$in1intfy, remember?
1319e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1320e1051a39Sopenharmony_ci	csel	$t0,$a0,$t0,ne
1321e1051a39Sopenharmony_ci	csel	$t1,$a1,$t1,ne
1322e1051a39Sopenharmony_ci	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
1323e1051a39Sopenharmony_ci	csel	$t2,$a2,$t2,ne
1324e1051a39Sopenharmony_ci	csel	$t3,$a3,$t3,ne
1325e1051a39Sopenharmony_ci	cmp	$in2infty,#0			// ~$in2intfy, remember?
1326e1051a39Sopenharmony_ci	ldp	$a2,$a3,[sp,#$res_x+$i+48]
1327e1051a39Sopenharmony_ci	csel	$acc0,$t0,$acc0,ne
1328e1051a39Sopenharmony_ci	csel	$acc1,$t1,$acc1,ne
1329e1051a39Sopenharmony_ci	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
1330e1051a39Sopenharmony_ci	csel	$acc2,$t2,$acc2,ne
1331e1051a39Sopenharmony_ci	csel	$acc3,$t3,$acc3,ne
1332e1051a39Sopenharmony_ci	ldp	$t2,$t3,[$bp_real,#$i+48]
1333e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp_real,#$i]
1334e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp_real,#$i+16]
1335e1051a39Sopenharmony_ci___
1336e1051a39Sopenharmony_ci$code.=<<___	if ($i == 0);
1337e1051a39Sopenharmony_ci	adrp	$bp_real,.Lone_mont-64
1338e1051a39Sopenharmony_ci	add	$bp_real,$bp_real,:lo12:.Lone_mont-64
1339e1051a39Sopenharmony_ci___
1340e1051a39Sopenharmony_ci}
1341e1051a39Sopenharmony_ci$code.=<<___;
1342e1051a39Sopenharmony_ci	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1343e1051a39Sopenharmony_ci	cmp	$in1infty,#0			// ~$in1intfy, remember?
1344e1051a39Sopenharmony_ci	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1345e1051a39Sopenharmony_ci	csel	$t0,$a0,$t0,ne
1346e1051a39Sopenharmony_ci	csel	$t1,$a1,$t1,ne
1347e1051a39Sopenharmony_ci	csel	$t2,$a2,$t2,ne
1348e1051a39Sopenharmony_ci	csel	$t3,$a3,$t3,ne
1349e1051a39Sopenharmony_ci	cmp	$in2infty,#0			// ~$in2intfy, remember?
1350e1051a39Sopenharmony_ci	csel	$acc0,$t0,$acc0,ne
1351e1051a39Sopenharmony_ci	csel	$acc1,$t1,$acc1,ne
1352e1051a39Sopenharmony_ci	csel	$acc2,$t2,$acc2,ne
1353e1051a39Sopenharmony_ci	csel	$acc3,$t3,$acc3,ne
1354e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp_real,#$i]
1355e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp_real,#$i+16]
1356e1051a39Sopenharmony_ci
1357e1051a39Sopenharmony_ci	add	sp,x29,#0		// destroy frame
1358e1051a39Sopenharmony_ci	ldp	x19,x20,[x29,#16]
1359e1051a39Sopenharmony_ci	ldp	x21,x22,[x29,#32]
1360e1051a39Sopenharmony_ci	ldp	x23,x24,[x29,#48]
1361e1051a39Sopenharmony_ci	ldp	x25,x26,[x29,#64]
1362e1051a39Sopenharmony_ci	ldp	x29,x30,[sp],#80
1363e1051a39Sopenharmony_ci	.inst	0xd50323bf		// autiasp
1364e1051a39Sopenharmony_ci	ret
1365e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1366e1051a39Sopenharmony_ci___
1367e1051a39Sopenharmony_ci}
# Emits ecp_nistz256_ord_mul_mont and ecp_nistz256_ord_sqr_mont.
# Both routines load four 64-bit words from .Lord into $ord0-$ord3 and a
# fifth word (offset 32) into $ordk; $ordk is multiplied by the lowest
# accumulator word to obtain the per-round reduction multiplier ($t4).
# NOTE(review): .Lord is defined outside this section; presumably it holds
# the P-256 group order followed by its Montgomery constant -- confirm.
1368e1051a39Sopenharmony_ciif (1) {
# $ord0/$ord1 reuse the registers named by $poly1/$poly3, which are
# assigned earlier in the file.
1369e1051a39Sopenharmony_cimy ($ord0,$ord1) = ($poly1,$poly3);
# x21-x24 are among the registers saved and restored by both prologues
# and epilogues below.
1370e1051a39Sopenharmony_cimy ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
# $acc7 shares a register with $bi.  Only the squaring routine uses $acc7,
# and it never refers to $bi, so the two names do not collide.
1371e1051a39Sopenharmony_cimy $acc7 = $bi;
1372e1051a39Sopenharmony_ci
# Prologue of ecp_nistz256_ord_mul_mont plus round 0: a[0..3]*b[0] and the
# first reduction multiplier.
1373e1051a39Sopenharmony_ci$code.=<<___;
1374e1051a39Sopenharmony_ci////////////////////////////////////////////////////////////////////////
1375e1051a39Sopenharmony_ci// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1376e1051a39Sopenharmony_ci//                                uint64_t b[4]);
1377e1051a39Sopenharmony_ci.globl	ecp_nistz256_ord_mul_mont
1378e1051a39Sopenharmony_ci.type	ecp_nistz256_ord_mul_mont,%function
1379e1051a39Sopenharmony_ci.align	4
1380e1051a39Sopenharmony_ciecp_nistz256_ord_mul_mont:
1381e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-64]!
1382e1051a39Sopenharmony_ci	add	x29,sp,#0
1383e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
1384e1051a39Sopenharmony_ci	stp	x21,x22,[sp,#32]
1385e1051a39Sopenharmony_ci	stp	x23,x24,[sp,#48]
1386e1051a39Sopenharmony_ci
1387e1051a39Sopenharmony_ci	adrp	$ordk,.Lord
1388e1051a39Sopenharmony_ci	add	$ordk,$ordk,:lo12:.Lord
1389e1051a39Sopenharmony_ci	ldr	$bi,[$bp]		// bp[0]
1390e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap]
1391e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#16]
1392e1051a39Sopenharmony_ci
1393e1051a39Sopenharmony_ci	ldp	$ord0,$ord1,[$ordk,#0]
1394e1051a39Sopenharmony_ci	ldp	$ord2,$ord3,[$ordk,#16]
1395e1051a39Sopenharmony_ci	ldr	$ordk,[$ordk,#32]
1396e1051a39Sopenharmony_ci
1397e1051a39Sopenharmony_ci	mul	$acc0,$a0,$bi		// a[0]*b[0]
1398e1051a39Sopenharmony_ci	umulh	$t0,$a0,$bi
1399e1051a39Sopenharmony_ci
1400e1051a39Sopenharmony_ci	mul	$acc1,$a1,$bi		// a[1]*b[0]
1401e1051a39Sopenharmony_ci	umulh	$t1,$a1,$bi
1402e1051a39Sopenharmony_ci
1403e1051a39Sopenharmony_ci	mul	$acc2,$a2,$bi		// a[2]*b[0]
1404e1051a39Sopenharmony_ci	umulh	$t2,$a2,$bi
1405e1051a39Sopenharmony_ci
1406e1051a39Sopenharmony_ci	mul	$acc3,$a3,$bi		// a[3]*b[0]
1407e1051a39Sopenharmony_ci	umulh	$acc4,$a3,$bi
1408e1051a39Sopenharmony_ci
1409e1051a39Sopenharmony_ci	mul	$t4,$acc0,$ordk
1410e1051a39Sopenharmony_ci
1411e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
1412e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t1
1413e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t2
1414e1051a39Sopenharmony_ci	adc	$acc4,$acc4,xzr
1415e1051a39Sopenharmony_ci	mov	$acc5,xzr
1416e1051a39Sopenharmony_ci___
# Rounds 1..3.  Each emitted round completes the reduction step for the
# multiplier already held in $t4, interleaved with the multiplication of
# a[0..3] by b[$i], and then derives the next multiplier into $t4.
# $i is interpolated by Perl, so "#8*$i" reaches the assembler as "#8*1",
# "#8*2" and "#8*3".
1417e1051a39Sopenharmony_cifor ($i=1;$i<4;$i++) {
1418e1051a39Sopenharmony_ci	################################################################
1419e1051a39Sopenharmony_ci	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1420e1051a39Sopenharmony_ci	# *                                     abcdefgh
1421e1051a39Sopenharmony_ci	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1422e1051a39Sopenharmony_ci	#
1423e1051a39Sopenharmony_ci	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1424e1051a39Sopenharmony_ci	# rewrite above as:
1425e1051a39Sopenharmony_ci	#
1426e1051a39Sopenharmony_ci	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1427e1051a39Sopenharmony_ci	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1428e1051a39Sopenharmony_ci	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1429e1051a39Sopenharmony_ci$code.=<<___;
1430e1051a39Sopenharmony_ci	ldr	$bi,[$bp,#8*$i]		// b[i]
1431e1051a39Sopenharmony_ci
1432e1051a39Sopenharmony_ci	lsl	$t0,$t4,#32
1433e1051a39Sopenharmony_ci	subs	$acc2,$acc2,$t4
1434e1051a39Sopenharmony_ci	lsr	$t1,$t4,#32
1435e1051a39Sopenharmony_ci	sbcs	$acc3,$acc3,$t0
1436e1051a39Sopenharmony_ci	sbcs	$acc4,$acc4,$t1
1437e1051a39Sopenharmony_ci	sbc	$acc5,$acc5,xzr
1438e1051a39Sopenharmony_ci
1439e1051a39Sopenharmony_ci	subs	xzr,$acc0,#1
1440e1051a39Sopenharmony_ci	umulh	$t1,$ord0,$t4
1441e1051a39Sopenharmony_ci	mul	$t2,$ord1,$t4
1442e1051a39Sopenharmony_ci	umulh	$t3,$ord1,$t4
1443e1051a39Sopenharmony_ci
1444e1051a39Sopenharmony_ci	adcs	$t2,$t2,$t1
1445e1051a39Sopenharmony_ci	 mul	$t0,$a0,$bi
1446e1051a39Sopenharmony_ci	adc	$t3,$t3,xzr
1447e1051a39Sopenharmony_ci	 mul	$t1,$a1,$bi
1448e1051a39Sopenharmony_ci
1449e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t2
1450e1051a39Sopenharmony_ci	 mul	$t2,$a2,$bi
1451e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t3
1452e1051a39Sopenharmony_ci	 mul	$t3,$a3,$bi
1453e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t4
1454e1051a39Sopenharmony_ci	adcs	$acc3,$acc4,$t4
1455e1051a39Sopenharmony_ci	adc	$acc4,$acc5,xzr
1456e1051a39Sopenharmony_ci
1457e1051a39Sopenharmony_ci	adds	$acc0,$acc0,$t0		// accumulate low parts
1458e1051a39Sopenharmony_ci	umulh	$t0,$a0,$bi
1459e1051a39Sopenharmony_ci	adcs	$acc1,$acc1,$t1
1460e1051a39Sopenharmony_ci	umulh	$t1,$a1,$bi
1461e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t2
1462e1051a39Sopenharmony_ci	umulh	$t2,$a2,$bi
1463e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t3
1464e1051a39Sopenharmony_ci	umulh	$t3,$a3,$bi
1465e1051a39Sopenharmony_ci	adc	$acc4,$acc4,xzr
1466e1051a39Sopenharmony_ci	mul	$t4,$acc0,$ordk
1467e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$t0		// accumulate high parts
1468e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t1
1469e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t2
1470e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$t3
1471e1051a39Sopenharmony_ci	adc	$acc5,xzr,xzr
1472e1051a39Sopenharmony_ci___
1473e1051a39Sopenharmony_ci}
# This heredoc holds two things:
#  * the tail of ecp_nistz256_ord_mul_mont: last reduction round,
#    conditional subtraction of the modulus, store to $rp, epilogue;
#  * the prologue and squaring part of ecp_nistz256_ord_sqr_mont.
# In the squaring routine the third argument (rep, in $bp) is the loop
# counter: it is decremented at the top of .Loop_ord_sqr and tested with
# cbnz at the bottom, so the loop body always runs at least once.
# NOTE(review): rep == 0 would wrap the counter instead of skipping the
# loop; callers presumably always pass rep >= 1 -- confirm.
# Neither routine contains a call, which is why the epilogues reload only
# x29 and not x30.
1474e1051a39Sopenharmony_ci$code.=<<___;
1475e1051a39Sopenharmony_ci	lsl	$t0,$t4,#32		// last reduction
1476e1051a39Sopenharmony_ci	subs	$acc2,$acc2,$t4
1477e1051a39Sopenharmony_ci	lsr	$t1,$t4,#32
1478e1051a39Sopenharmony_ci	sbcs	$acc3,$acc3,$t0
1479e1051a39Sopenharmony_ci	sbcs	$acc4,$acc4,$t1
1480e1051a39Sopenharmony_ci	sbc	$acc5,$acc5,xzr
1481e1051a39Sopenharmony_ci
1482e1051a39Sopenharmony_ci	subs	xzr,$acc0,#1
1483e1051a39Sopenharmony_ci	umulh	$t1,$ord0,$t4
1484e1051a39Sopenharmony_ci	mul	$t2,$ord1,$t4
1485e1051a39Sopenharmony_ci	umulh	$t3,$ord1,$t4
1486e1051a39Sopenharmony_ci
1487e1051a39Sopenharmony_ci	adcs	$t2,$t2,$t1
1488e1051a39Sopenharmony_ci	adc	$t3,$t3,xzr
1489e1051a39Sopenharmony_ci
1490e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t2
1491e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t3
1492e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t4
1493e1051a39Sopenharmony_ci	adcs	$acc3,$acc4,$t4
1494e1051a39Sopenharmony_ci	adc	$acc4,$acc5,xzr
1495e1051a39Sopenharmony_ci
1496e1051a39Sopenharmony_ci	subs	$t0,$acc0,$ord0		// ret -= modulus
1497e1051a39Sopenharmony_ci	sbcs	$t1,$acc1,$ord1
1498e1051a39Sopenharmony_ci	sbcs	$t2,$acc2,$ord2
1499e1051a39Sopenharmony_ci	sbcs	$t3,$acc3,$ord3
1500e1051a39Sopenharmony_ci	sbcs	xzr,$acc4,xzr
1501e1051a39Sopenharmony_ci
1502e1051a39Sopenharmony_ci	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
1503e1051a39Sopenharmony_ci	csel	$acc1,$acc1,$t1,lo
1504e1051a39Sopenharmony_ci	csel	$acc2,$acc2,$t2,lo
1505e1051a39Sopenharmony_ci	stp	$acc0,$acc1,[$rp]
1506e1051a39Sopenharmony_ci	csel	$acc3,$acc3,$t3,lo
1507e1051a39Sopenharmony_ci	stp	$acc2,$acc3,[$rp,#16]
1508e1051a39Sopenharmony_ci
1509e1051a39Sopenharmony_ci	ldp	x19,x20,[sp,#16]
1510e1051a39Sopenharmony_ci	ldp	x21,x22,[sp,#32]
1511e1051a39Sopenharmony_ci	ldp	x23,x24,[sp,#48]
1512e1051a39Sopenharmony_ci	ldr	x29,[sp],#64
1513e1051a39Sopenharmony_ci	ret
1514e1051a39Sopenharmony_ci.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1515e1051a39Sopenharmony_ci
1516e1051a39Sopenharmony_ci////////////////////////////////////////////////////////////////////////
1517e1051a39Sopenharmony_ci// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1518e1051a39Sopenharmony_ci//                                uint64_t rep);
1519e1051a39Sopenharmony_ci.globl	ecp_nistz256_ord_sqr_mont
1520e1051a39Sopenharmony_ci.type	ecp_nistz256_ord_sqr_mont,%function
1521e1051a39Sopenharmony_ci.align	4
1522e1051a39Sopenharmony_ciecp_nistz256_ord_sqr_mont:
1523e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-64]!
1524e1051a39Sopenharmony_ci	add	x29,sp,#0
1525e1051a39Sopenharmony_ci	stp	x19,x20,[sp,#16]
1526e1051a39Sopenharmony_ci	stp	x21,x22,[sp,#32]
1527e1051a39Sopenharmony_ci	stp	x23,x24,[sp,#48]
1528e1051a39Sopenharmony_ci
1529e1051a39Sopenharmony_ci	adrp	$ordk,.Lord
1530e1051a39Sopenharmony_ci	add	$ordk,$ordk,:lo12:.Lord
1531e1051a39Sopenharmony_ci	ldp	$a0,$a1,[$ap]
1532e1051a39Sopenharmony_ci	ldp	$a2,$a3,[$ap,#16]
1533e1051a39Sopenharmony_ci
1534e1051a39Sopenharmony_ci	ldp	$ord0,$ord1,[$ordk,#0]
1535e1051a39Sopenharmony_ci	ldp	$ord2,$ord3,[$ordk,#16]
1536e1051a39Sopenharmony_ci	ldr	$ordk,[$ordk,#32]
1537e1051a39Sopenharmony_ci	b	.Loop_ord_sqr
1538e1051a39Sopenharmony_ci
1539e1051a39Sopenharmony_ci.align	4
1540e1051a39Sopenharmony_ci.Loop_ord_sqr:
1541e1051a39Sopenharmony_ci	sub	$bp,$bp,#1
1542e1051a39Sopenharmony_ci	////////////////////////////////////////////////////////////////
1543e1051a39Sopenharmony_ci	//  |  |  |  |  |  |a1*a0|  |
1544e1051a39Sopenharmony_ci	//  |  |  |  |  |a2*a0|  |  |
1545e1051a39Sopenharmony_ci	//  |  |a3*a2|a3*a0|  |  |  |
1546e1051a39Sopenharmony_ci	//  |  |  |  |a2*a1|  |  |  |
1547e1051a39Sopenharmony_ci	//  |  |  |a3*a1|  |  |  |  |
1548e1051a39Sopenharmony_ci	// *|  |  |  |  |  |  |  | 2|
1549e1051a39Sopenharmony_ci	// +|a3*a3|a2*a2|a1*a1|a0*a0|
1550e1051a39Sopenharmony_ci	//  |--+--+--+--+--+--+--+--|
1551e1051a39Sopenharmony_ci	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1552e1051a39Sopenharmony_ci	//
1553e1051a39Sopenharmony_ci	//  "can't overflow" below mark carrying into high part of
1554e1051a39Sopenharmony_ci	//  multiplication result, which can't overflow, because it
1555e1051a39Sopenharmony_ci	//  can never be all ones.
1556e1051a39Sopenharmony_ci
1557e1051a39Sopenharmony_ci	mul	$acc1,$a1,$a0		// a[1]*a[0]
1558e1051a39Sopenharmony_ci	umulh	$t1,$a1,$a0
1559e1051a39Sopenharmony_ci	mul	$acc2,$a2,$a0		// a[2]*a[0]
1560e1051a39Sopenharmony_ci	umulh	$t2,$a2,$a0
1561e1051a39Sopenharmony_ci	mul	$acc3,$a3,$a0		// a[3]*a[0]
1562e1051a39Sopenharmony_ci	umulh	$acc4,$a3,$a0
1563e1051a39Sopenharmony_ci
1564e1051a39Sopenharmony_ci	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
1565e1051a39Sopenharmony_ci	 mul	$t0,$a2,$a1		// a[2]*a[1]
1566e1051a39Sopenharmony_ci	 umulh	$t1,$a2,$a1
1567e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$t2
1568e1051a39Sopenharmony_ci	 mul	$t2,$a3,$a1		// a[3]*a[1]
1569e1051a39Sopenharmony_ci	 umulh	$t3,$a3,$a1
1570e1051a39Sopenharmony_ci	adc	$acc4,$acc4,xzr		// can't overflow
1571e1051a39Sopenharmony_ci
1572e1051a39Sopenharmony_ci	mul	$acc5,$a3,$a2		// a[3]*a[2]
1573e1051a39Sopenharmony_ci	umulh	$acc6,$a3,$a2
1574e1051a39Sopenharmony_ci
1575e1051a39Sopenharmony_ci	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
1576e1051a39Sopenharmony_ci	 mul	$acc0,$a0,$a0		// a[0]*a[0]
1577e1051a39Sopenharmony_ci	adc	$t2,$t3,xzr		// can't overflow
1578e1051a39Sopenharmony_ci
1579e1051a39Sopenharmony_ci	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
1580e1051a39Sopenharmony_ci	 umulh	$a0,$a0,$a0
1581e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$t1
1582e1051a39Sopenharmony_ci	 mul	$t1,$a1,$a1		// a[1]*a[1]
1583e1051a39Sopenharmony_ci	adcs	$acc5,$acc5,$t2
1584e1051a39Sopenharmony_ci	 umulh	$a1,$a1,$a1
1585e1051a39Sopenharmony_ci	adc	$acc6,$acc6,xzr		// can't overflow
1586e1051a39Sopenharmony_ci
1587e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
1588e1051a39Sopenharmony_ci	 mul	$t2,$a2,$a2		// a[2]*a[2]
1589e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$acc2
1590e1051a39Sopenharmony_ci	 umulh	$a2,$a2,$a2
1591e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$acc3
1592e1051a39Sopenharmony_ci	 mul	$t3,$a3,$a3		// a[3]*a[3]
1593e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$acc4
1594e1051a39Sopenharmony_ci	 umulh	$a3,$a3,$a3
1595e1051a39Sopenharmony_ci	adcs	$acc5,$acc5,$acc5
1596e1051a39Sopenharmony_ci	adcs	$acc6,$acc6,$acc6
1597e1051a39Sopenharmony_ci	adc	$acc7,xzr,xzr
1598e1051a39Sopenharmony_ci
1599e1051a39Sopenharmony_ci	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
1600e1051a39Sopenharmony_ci	 mul	$t4,$acc0,$ordk
1601e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$t1
1602e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$a1
1603e1051a39Sopenharmony_ci	adcs	$acc4,$acc4,$t2
1604e1051a39Sopenharmony_ci	adcs	$acc5,$acc5,$a2
1605e1051a39Sopenharmony_ci	adcs	$acc6,$acc6,$t3
1606e1051a39Sopenharmony_ci	adc	$acc7,$acc7,$a3
1607e1051a39Sopenharmony_ci___
# Four emitted reduction rounds over the low half ($acc0-$acc3).  Each
# round uses the multiplier currently named $t4.
1608e1051a39Sopenharmony_cifor($i=0; $i<4; $i++) {			# reductions
1609e1051a39Sopenharmony_ci$code.=<<___;
1610e1051a39Sopenharmony_ci	subs	xzr,$acc0,#1
1611e1051a39Sopenharmony_ci	umulh	$t1,$ord0,$t4
1612e1051a39Sopenharmony_ci	mul	$t2,$ord1,$t4
1613e1051a39Sopenharmony_ci	umulh	$t3,$ord1,$t4
1614e1051a39Sopenharmony_ci
1615e1051a39Sopenharmony_ci	adcs	$t2,$t2,$t1
1616e1051a39Sopenharmony_ci	adc	$t3,$t3,xzr
1617e1051a39Sopenharmony_ci
1618e1051a39Sopenharmony_ci	adds	$acc0,$acc1,$t2
1619e1051a39Sopenharmony_ci	adcs	$acc1,$acc2,$t3
1620e1051a39Sopenharmony_ci	adcs	$acc2,$acc3,$t4
1621e1051a39Sopenharmony_ci	adc	$acc3,xzr,$t4		// can't overflow
1622e1051a39Sopenharmony_ci___
# The multiplier for the following round is computed early, into the
# register currently named $t3.  The last round has no successor, so it
# emits nothing here.
1623e1051a39Sopenharmony_ci$code.=<<___	if ($i<3);
1624e1051a39Sopenharmony_ci	mul	$t3,$acc0,$ordk
1625e1051a39Sopenharmony_ci___
1626e1051a39Sopenharmony_ci$code.=<<___;
1627e1051a39Sopenharmony_ci	lsl	$t0,$t4,#32
1628e1051a39Sopenharmony_ci	subs	$acc1,$acc1,$t4
1629e1051a39Sopenharmony_ci	lsr	$t1,$t4,#32
1630e1051a39Sopenharmony_ci	sbcs	$acc2,$acc2,$t0
1631e1051a39Sopenharmony_ci	sbc	$acc3,$acc3,$t1		// can't borrow
1632e1051a39Sopenharmony_ci___
# Swap the Perl-level names so that the next round refers to the freshly
# computed multiplier as $t4.  The loop runs four times, so the names end
# up as they started.
1633e1051a39Sopenharmony_ci	($t3,$t4) = ($t4,$t3);
1634e1051a39Sopenharmony_ci}
# Add the upper half ($acc4-$acc7), conditionally subtract the modulus
# into $a0-$a3 (the input of the next squaring), loop while $bp is
# non-zero, then store the result to $rp and restore saved registers.
1635e1051a39Sopenharmony_ci$code.=<<___;
1636e1051a39Sopenharmony_ci	adds	$acc0,$acc0,$acc4	// accumulate upper half
1637e1051a39Sopenharmony_ci	adcs	$acc1,$acc1,$acc5
1638e1051a39Sopenharmony_ci	adcs	$acc2,$acc2,$acc6
1639e1051a39Sopenharmony_ci	adcs	$acc3,$acc3,$acc7
1640e1051a39Sopenharmony_ci	adc	$acc4,xzr,xzr
1641e1051a39Sopenharmony_ci
1642e1051a39Sopenharmony_ci	subs	$t0,$acc0,$ord0		// ret -= modulus
1643e1051a39Sopenharmony_ci	sbcs	$t1,$acc1,$ord1
1644e1051a39Sopenharmony_ci	sbcs	$t2,$acc2,$ord2
1645e1051a39Sopenharmony_ci	sbcs	$t3,$acc3,$ord3
1646e1051a39Sopenharmony_ci	sbcs	xzr,$acc4,xzr
1647e1051a39Sopenharmony_ci
1648e1051a39Sopenharmony_ci	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
1649e1051a39Sopenharmony_ci	csel	$a1,$acc1,$t1,lo
1650e1051a39Sopenharmony_ci	csel	$a2,$acc2,$t2,lo
1651e1051a39Sopenharmony_ci	csel	$a3,$acc3,$t3,lo
1652e1051a39Sopenharmony_ci
1653e1051a39Sopenharmony_ci	cbnz	$bp,.Loop_ord_sqr
1654e1051a39Sopenharmony_ci
1655e1051a39Sopenharmony_ci	stp	$a0,$a1,[$rp]
1656e1051a39Sopenharmony_ci	stp	$a2,$a3,[$rp,#16]
1657e1051a39Sopenharmony_ci
1658e1051a39Sopenharmony_ci	ldp	x19,x20,[sp,#16]
1659e1051a39Sopenharmony_ci	ldp	x21,x22,[sp,#32]
1660e1051a39Sopenharmony_ci	ldp	x23,x24,[sp,#48]
1661e1051a39Sopenharmony_ci	ldr	x29,[sp],#64
1662e1051a39Sopenharmony_ci	ret
1663e1051a39Sopenharmony_ci.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1664e1051a39Sopenharmony_ci___
# The second brace on the next line closes a block opened earlier in the file.
1665e1051a39Sopenharmony_ci}	}
1666e1051a39Sopenharmony_ci
1667e1051a39Sopenharmony_ci########################################################################
1668e1051a39Sopenharmony_ci# scatter-gather subroutines
# Emits the four table helpers ecp_nistz256_scatter_w5, _gather_w5,
# _scatter_w7 and _gather_w7 from a single heredoc.
#
# Layout, as fixed by the offsets used below:
#  * w5: every 64-bit word is split into two 32-bit pieces; the pieces of
#    one coordinate are stored 64 bytes apart (eight pieces, then the
#    pointer advances by 64*8).  The index is scaled by 4 (lsl#2).
#  * w7: every 64-bit word is split into eight bytes stored 64 bytes
#    apart; the loop handles 64/8 = 8 words.  The index is a byte offset.
#
# Index handling:
#  * scatter_w5 adds index*4 and then stores at offset -4, so index 1 maps
#    to the first slot.
#  * gather_w5 and gather_w7 subtract one from a non-zero index.  For
#    index 0 they still perform the loads but return all-zero words
#    (csel ...,xzr,ne in w5; and with the x3 mask in w7).
#  * scatter_w7 adds the index unchanged.
#
# Flag dependencies that make instruction order significant:
#  * gather_w5: every csel ...,ne consumes the flags of the initial cmp;
#    nothing in between sets flags.
#  * the w7 loops: b.ne at the bottom consumes the flags of the subs near
#    the top of the loop body.
1669e1051a39Sopenharmony_ci{
# Argument registers x0-x3.  $mask is declared for x3, but the code below
# names x3 directly and never interpolates $mask.
1670e1051a39Sopenharmony_cimy ($out,$inp,$index,$mask)=map("x$_",(0..3));
1671e1051a39Sopenharmony_ci$code.=<<___;
1672e1051a39Sopenharmony_ci// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
1673e1051a39Sopenharmony_ci//					 int x2);
1674e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w5
1675e1051a39Sopenharmony_ci.type	ecp_nistz256_scatter_w5,%function
1676e1051a39Sopenharmony_ci.align	4
1677e1051a39Sopenharmony_ciecp_nistz256_scatter_w5:
1678e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
1679e1051a39Sopenharmony_ci	add	x29,sp,#0
1680e1051a39Sopenharmony_ci
1681e1051a39Sopenharmony_ci	add	$out,$out,$index,lsl#2
1682e1051a39Sopenharmony_ci
1683e1051a39Sopenharmony_ci	ldp	x4,x5,[$inp]		// X
1684e1051a39Sopenharmony_ci	ldp	x6,x7,[$inp,#16]
1685e1051a39Sopenharmony_ci	stur	w4,[$out,#64*0-4]
1686e1051a39Sopenharmony_ci	lsr	x4,x4,#32
1687e1051a39Sopenharmony_ci	str	w5,[$out,#64*1-4]
1688e1051a39Sopenharmony_ci	lsr	x5,x5,#32
1689e1051a39Sopenharmony_ci	str	w6,[$out,#64*2-4]
1690e1051a39Sopenharmony_ci	lsr	x6,x6,#32
1691e1051a39Sopenharmony_ci	str	w7,[$out,#64*3-4]
1692e1051a39Sopenharmony_ci	lsr	x7,x7,#32
1693e1051a39Sopenharmony_ci	str	w4,[$out,#64*4-4]
1694e1051a39Sopenharmony_ci	str	w5,[$out,#64*5-4]
1695e1051a39Sopenharmony_ci	str	w6,[$out,#64*6-4]
1696e1051a39Sopenharmony_ci	str	w7,[$out,#64*7-4]
1697e1051a39Sopenharmony_ci	add	$out,$out,#64*8
1698e1051a39Sopenharmony_ci
1699e1051a39Sopenharmony_ci	ldp	x4,x5,[$inp,#32]	// Y
1700e1051a39Sopenharmony_ci	ldp	x6,x7,[$inp,#48]
1701e1051a39Sopenharmony_ci	stur	w4,[$out,#64*0-4]
1702e1051a39Sopenharmony_ci	lsr	x4,x4,#32
1703e1051a39Sopenharmony_ci	str	w5,[$out,#64*1-4]
1704e1051a39Sopenharmony_ci	lsr	x5,x5,#32
1705e1051a39Sopenharmony_ci	str	w6,[$out,#64*2-4]
1706e1051a39Sopenharmony_ci	lsr	x6,x6,#32
1707e1051a39Sopenharmony_ci	str	w7,[$out,#64*3-4]
1708e1051a39Sopenharmony_ci	lsr	x7,x7,#32
1709e1051a39Sopenharmony_ci	str	w4,[$out,#64*4-4]
1710e1051a39Sopenharmony_ci	str	w5,[$out,#64*5-4]
1711e1051a39Sopenharmony_ci	str	w6,[$out,#64*6-4]
1712e1051a39Sopenharmony_ci	str	w7,[$out,#64*7-4]
1713e1051a39Sopenharmony_ci	add	$out,$out,#64*8
1714e1051a39Sopenharmony_ci
1715e1051a39Sopenharmony_ci	ldp	x4,x5,[$inp,#64]	// Z
1716e1051a39Sopenharmony_ci	ldp	x6,x7,[$inp,#80]
1717e1051a39Sopenharmony_ci	stur	w4,[$out,#64*0-4]
1718e1051a39Sopenharmony_ci	lsr	x4,x4,#32
1719e1051a39Sopenharmony_ci	str	w5,[$out,#64*1-4]
1720e1051a39Sopenharmony_ci	lsr	x5,x5,#32
1721e1051a39Sopenharmony_ci	str	w6,[$out,#64*2-4]
1722e1051a39Sopenharmony_ci	lsr	x6,x6,#32
1723e1051a39Sopenharmony_ci	str	w7,[$out,#64*3-4]
1724e1051a39Sopenharmony_ci	lsr	x7,x7,#32
1725e1051a39Sopenharmony_ci	str	w4,[$out,#64*4-4]
1726e1051a39Sopenharmony_ci	str	w5,[$out,#64*5-4]
1727e1051a39Sopenharmony_ci	str	w6,[$out,#64*6-4]
1728e1051a39Sopenharmony_ci	str	w7,[$out,#64*7-4]
1729e1051a39Sopenharmony_ci
1730e1051a39Sopenharmony_ci	ldr	x29,[sp],#16
1731e1051a39Sopenharmony_ci	ret
1732e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1733e1051a39Sopenharmony_ci
1734e1051a39Sopenharmony_ci// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
1735e1051a39Sopenharmony_ci//					      int x2);
1736e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w5
1737e1051a39Sopenharmony_ci.type	ecp_nistz256_gather_w5,%function
1738e1051a39Sopenharmony_ci.align	4
1739e1051a39Sopenharmony_ciecp_nistz256_gather_w5:
1740e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
1741e1051a39Sopenharmony_ci	add	x29,sp,#0
1742e1051a39Sopenharmony_ci
1743e1051a39Sopenharmony_ci	cmp	$index,xzr
1744e1051a39Sopenharmony_ci	csetm	x3,ne
1745e1051a39Sopenharmony_ci	add	$index,$index,x3
1746e1051a39Sopenharmony_ci	add	$inp,$inp,$index,lsl#2
1747e1051a39Sopenharmony_ci
1748e1051a39Sopenharmony_ci	ldr	w4,[$inp,#64*0]
1749e1051a39Sopenharmony_ci	ldr	w5,[$inp,#64*1]
1750e1051a39Sopenharmony_ci	ldr	w6,[$inp,#64*2]
1751e1051a39Sopenharmony_ci	ldr	w7,[$inp,#64*3]
1752e1051a39Sopenharmony_ci	ldr	w8,[$inp,#64*4]
1753e1051a39Sopenharmony_ci	ldr	w9,[$inp,#64*5]
1754e1051a39Sopenharmony_ci	ldr	w10,[$inp,#64*6]
1755e1051a39Sopenharmony_ci	ldr	w11,[$inp,#64*7]
1756e1051a39Sopenharmony_ci	add	$inp,$inp,#64*8
1757e1051a39Sopenharmony_ci	orr	x4,x4,x8,lsl#32
1758e1051a39Sopenharmony_ci	orr	x5,x5,x9,lsl#32
1759e1051a39Sopenharmony_ci	orr	x6,x6,x10,lsl#32
1760e1051a39Sopenharmony_ci	orr	x7,x7,x11,lsl#32
1761e1051a39Sopenharmony_ci	csel	x4,x4,xzr,ne
1762e1051a39Sopenharmony_ci	csel	x5,x5,xzr,ne
1763e1051a39Sopenharmony_ci	csel	x6,x6,xzr,ne
1764e1051a39Sopenharmony_ci	csel	x7,x7,xzr,ne
1765e1051a39Sopenharmony_ci	stp	x4,x5,[$out]		// X
1766e1051a39Sopenharmony_ci	stp	x6,x7,[$out,#16]
1767e1051a39Sopenharmony_ci
1768e1051a39Sopenharmony_ci	ldr	w4,[$inp,#64*0]
1769e1051a39Sopenharmony_ci	ldr	w5,[$inp,#64*1]
1770e1051a39Sopenharmony_ci	ldr	w6,[$inp,#64*2]
1771e1051a39Sopenharmony_ci	ldr	w7,[$inp,#64*3]
1772e1051a39Sopenharmony_ci	ldr	w8,[$inp,#64*4]
1773e1051a39Sopenharmony_ci	ldr	w9,[$inp,#64*5]
1774e1051a39Sopenharmony_ci	ldr	w10,[$inp,#64*6]
1775e1051a39Sopenharmony_ci	ldr	w11,[$inp,#64*7]
1776e1051a39Sopenharmony_ci	add	$inp,$inp,#64*8
1777e1051a39Sopenharmony_ci	orr	x4,x4,x8,lsl#32
1778e1051a39Sopenharmony_ci	orr	x5,x5,x9,lsl#32
1779e1051a39Sopenharmony_ci	orr	x6,x6,x10,lsl#32
1780e1051a39Sopenharmony_ci	orr	x7,x7,x11,lsl#32
1781e1051a39Sopenharmony_ci	csel	x4,x4,xzr,ne
1782e1051a39Sopenharmony_ci	csel	x5,x5,xzr,ne
1783e1051a39Sopenharmony_ci	csel	x6,x6,xzr,ne
1784e1051a39Sopenharmony_ci	csel	x7,x7,xzr,ne
1785e1051a39Sopenharmony_ci	stp	x4,x5,[$out,#32]	// Y
1786e1051a39Sopenharmony_ci	stp	x6,x7,[$out,#48]
1787e1051a39Sopenharmony_ci
1788e1051a39Sopenharmony_ci	ldr	w4,[$inp,#64*0]
1789e1051a39Sopenharmony_ci	ldr	w5,[$inp,#64*1]
1790e1051a39Sopenharmony_ci	ldr	w6,[$inp,#64*2]
1791e1051a39Sopenharmony_ci	ldr	w7,[$inp,#64*3]
1792e1051a39Sopenharmony_ci	ldr	w8,[$inp,#64*4]
1793e1051a39Sopenharmony_ci	ldr	w9,[$inp,#64*5]
1794e1051a39Sopenharmony_ci	ldr	w10,[$inp,#64*6]
1795e1051a39Sopenharmony_ci	ldr	w11,[$inp,#64*7]
1796e1051a39Sopenharmony_ci	orr	x4,x4,x8,lsl#32
1797e1051a39Sopenharmony_ci	orr	x5,x5,x9,lsl#32
1798e1051a39Sopenharmony_ci	orr	x6,x6,x10,lsl#32
1799e1051a39Sopenharmony_ci	orr	x7,x7,x11,lsl#32
1800e1051a39Sopenharmony_ci	csel	x4,x4,xzr,ne
1801e1051a39Sopenharmony_ci	csel	x5,x5,xzr,ne
1802e1051a39Sopenharmony_ci	csel	x6,x6,xzr,ne
1803e1051a39Sopenharmony_ci	csel	x7,x7,xzr,ne
1804e1051a39Sopenharmony_ci	stp	x4,x5,[$out,#64]	// Z
1805e1051a39Sopenharmony_ci	stp	x6,x7,[$out,#80]
1806e1051a39Sopenharmony_ci
1807e1051a39Sopenharmony_ci	ldr	x29,[sp],#16
1808e1051a39Sopenharmony_ci	ret
1809e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1810e1051a39Sopenharmony_ci
1811e1051a39Sopenharmony_ci// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
1812e1051a39Sopenharmony_ci//					 int x2);
1813e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w7
1814e1051a39Sopenharmony_ci.type	ecp_nistz256_scatter_w7,%function
1815e1051a39Sopenharmony_ci.align	4
1816e1051a39Sopenharmony_ciecp_nistz256_scatter_w7:
1817e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
1818e1051a39Sopenharmony_ci	add	x29,sp,#0
1819e1051a39Sopenharmony_ci
1820e1051a39Sopenharmony_ci	add	$out,$out,$index
1821e1051a39Sopenharmony_ci	mov	$index,#64/8
1822e1051a39Sopenharmony_ci.Loop_scatter_w7:
1823e1051a39Sopenharmony_ci	ldr	x3,[$inp],#8
1824e1051a39Sopenharmony_ci	subs	$index,$index,#1
1825e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*0]
1826e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*1]
1827e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*2]
1828e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*3]
1829e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*4]
1830e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*5]
1831e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*6]
1832e1051a39Sopenharmony_ci	prfm	pstl1strm,[$out,#4096+64*7]
1833e1051a39Sopenharmony_ci	strb	w3,[$out,#64*0]
1834e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1835e1051a39Sopenharmony_ci	strb	w3,[$out,#64*1]
1836e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1837e1051a39Sopenharmony_ci	strb	w3,[$out,#64*2]
1838e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1839e1051a39Sopenharmony_ci	strb	w3,[$out,#64*3]
1840e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1841e1051a39Sopenharmony_ci	strb	w3,[$out,#64*4]
1842e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1843e1051a39Sopenharmony_ci	strb	w3,[$out,#64*5]
1844e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1845e1051a39Sopenharmony_ci	strb	w3,[$out,#64*6]
1846e1051a39Sopenharmony_ci	lsr	x3,x3,#8
1847e1051a39Sopenharmony_ci	strb	w3,[$out,#64*7]
1848e1051a39Sopenharmony_ci	add	$out,$out,#64*8
1849e1051a39Sopenharmony_ci	b.ne	.Loop_scatter_w7
1850e1051a39Sopenharmony_ci
1851e1051a39Sopenharmony_ci	ldr	x29,[sp],#16
1852e1051a39Sopenharmony_ci	ret
1853e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1854e1051a39Sopenharmony_ci
1855e1051a39Sopenharmony_ci// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
1856e1051a39Sopenharmony_ci//						     int x2);
1857e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w7
1858e1051a39Sopenharmony_ci.type	ecp_nistz256_gather_w7,%function
1859e1051a39Sopenharmony_ci.align	4
1860e1051a39Sopenharmony_ciecp_nistz256_gather_w7:
1861e1051a39Sopenharmony_ci	stp	x29,x30,[sp,#-16]!
1862e1051a39Sopenharmony_ci	add	x29,sp,#0
1863e1051a39Sopenharmony_ci
1864e1051a39Sopenharmony_ci	cmp	$index,xzr
1865e1051a39Sopenharmony_ci	csetm	x3,ne
1866e1051a39Sopenharmony_ci	add	$index,$index,x3
1867e1051a39Sopenharmony_ci	add	$inp,$inp,$index
1868e1051a39Sopenharmony_ci	mov	$index,#64/8
1869e1051a39Sopenharmony_ci	nop
1870e1051a39Sopenharmony_ci.Loop_gather_w7:
1871e1051a39Sopenharmony_ci	ldrb	w4,[$inp,#64*0]
1872e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*0]
1873e1051a39Sopenharmony_ci	subs	$index,$index,#1
1874e1051a39Sopenharmony_ci	ldrb	w5,[$inp,#64*1]
1875e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*1]
1876e1051a39Sopenharmony_ci	ldrb	w6,[$inp,#64*2]
1877e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*2]
1878e1051a39Sopenharmony_ci	ldrb	w7,[$inp,#64*3]
1879e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*3]
1880e1051a39Sopenharmony_ci	ldrb	w8,[$inp,#64*4]
1881e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*4]
1882e1051a39Sopenharmony_ci	ldrb	w9,[$inp,#64*5]
1883e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*5]
1884e1051a39Sopenharmony_ci	ldrb	w10,[$inp,#64*6]
1885e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*6]
1886e1051a39Sopenharmony_ci	ldrb	w11,[$inp,#64*7]
1887e1051a39Sopenharmony_ci	prfm	pldl1strm,[$inp,#4096+64*7]
1888e1051a39Sopenharmony_ci	add	$inp,$inp,#64*8
1889e1051a39Sopenharmony_ci	orr	x4,x4,x5,lsl#8
1890e1051a39Sopenharmony_ci	orr	x6,x6,x7,lsl#8
1891e1051a39Sopenharmony_ci	orr	x8,x8,x9,lsl#8
1892e1051a39Sopenharmony_ci	orr	x4,x4,x6,lsl#16
1893e1051a39Sopenharmony_ci	orr	x10,x10,x11,lsl#8
1894e1051a39Sopenharmony_ci	orr	x4,x4,x8,lsl#32
1895e1051a39Sopenharmony_ci	orr	x4,x4,x10,lsl#48
1896e1051a39Sopenharmony_ci	and	x4,x4,x3
1897e1051a39Sopenharmony_ci	str	x4,[$out],#8
1898e1051a39Sopenharmony_ci	b.ne	.Loop_gather_w7
1899e1051a39Sopenharmony_ci
1900e1051a39Sopenharmony_ci	ldr	x29,[sp],#16
1901e1051a39Sopenharmony_ci	ret
1902e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1903e1051a39Sopenharmony_ci___
1904e1051a39Sopenharmony_ci}
1905e1051a39Sopenharmony_ci
# Post-process and emit the accumulated assembly text, one line at a time.
1906e1051a39Sopenharmony_ciforeach (split("\n",$code)) {
# Replace every `...` span in the line with the result of evaluating its
# contents as Perl (string eval).
1907e1051a39Sopenharmony_ci	s/\`([^\`]*)\`/eval $1/ge;
1908e1051a39Sopenharmony_ci
1909e1051a39Sopenharmony_ci	print $_,"\n";
1910e1051a39Sopenharmony_ci}
# Closing STDOUT explicitly lets a failed flush abort the script with an
# error message instead of passing unnoticed.
1911e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";	# enforce flush
1912