1e1051a39Sopenharmony_ci#! /usr/bin/env perl
2e1051a39Sopenharmony_ci# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3e1051a39Sopenharmony_ci#
4e1051a39Sopenharmony_ci# Licensed under the Apache License 2.0 (the "License").  You may not use
5e1051a39Sopenharmony_ci# this file except in compliance with the License.  You can obtain a copy
6e1051a39Sopenharmony_ci# in the file LICENSE in the source distribution or at
7e1051a39Sopenharmony_ci# https://www.openssl.org/source/license.html
8e1051a39Sopenharmony_ci
9e1051a39Sopenharmony_ci
10e1051a39Sopenharmony_ci# ====================================================================
11e1051a39Sopenharmony_ci# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e1051a39Sopenharmony_ci# project. The module is, however, dual licensed under OpenSSL and
13e1051a39Sopenharmony_ci# CRYPTOGAMS licenses depending on where you obtain it. For further
14e1051a39Sopenharmony_ci# details see http://www.openssl.org/~appro/cryptogams/.
15e1051a39Sopenharmony_ci# ====================================================================
16e1051a39Sopenharmony_ci#
17e1051a39Sopenharmony_ci# ECP_NISTZ256 module for ARMv4.
18e1051a39Sopenharmony_ci#
19e1051a39Sopenharmony_ci# October 2014.
20e1051a39Sopenharmony_ci#
21e1051a39Sopenharmony_ci# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22e1051a39Sopenharmony_ci# http://eprint.iacr.org/2013/816. In the process of adaptation
23e1051a39Sopenharmony_ci# original .c module was made 32-bit savvy in order to make this
24e1051a39Sopenharmony_ci# implementation possible.
25e1051a39Sopenharmony_ci#
26e1051a39Sopenharmony_ci#			with/without -DECP_NISTZ256_ASM
27e1051a39Sopenharmony_ci# Cortex-A8		+53-170%
28e1051a39Sopenharmony_ci# Cortex-A9		+76-205%
29e1051a39Sopenharmony_ci# Cortex-A15		+100-316%
30e1051a39Sopenharmony_ci# Snapdragon S4		+66-187%
31e1051a39Sopenharmony_ci#
32e1051a39Sopenharmony_ci# Ranges denote minimum and maximum improvement coefficients depending
33e1051a39Sopenharmony_ci# on benchmark. Lower coefficients are for ECDSA sign, server-side
34e1051a39Sopenharmony_ci# operation. Keep in mind that +200% means 3x improvement.
35e1051a39Sopenharmony_ci
# Command-line handling:
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # A real flavour was given: everything printed to STDOUT is piped
    # through arm-xlate.pl, which is looked up next to this script first
    # and then in ../../perlasm/ relative to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call  $xlate: $!";
} elsif ($output) {
    # No translation step: write straight into the requested file.
    # open() closes the previous STDOUT before trying the new target, so
    # an unchecked failure would silently throw the generated code away;
    # die instead.
    open STDOUT,">",$output
        or die "can't open $output: $!";
}
52e1051a39Sopenharmony_ci
# Assembly prologue: include arm_arch.h and pick the instruction set,
# unified-syntax Thumb when __thumb2__ is defined, 32-bit ARM otherwise.
# Every element below is one output line; the trailing empty element
# supplies the final newline.
$code.=join("\n",
	'#include "arm_arch.h"',
	'',
	'#if defined(__thumb2__)',
	".syntax\tunified",
	'.thumb',
	'#else',
	".code\t32",
	'#endif',
	'');
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked for in the current directory first and then
# one level above the directory this script was started from.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Every TOBN(first,second) occurrence contributes two words to @arr,
# the second argument being stored ahead of the first one.
while (my $line = <TABLE>) {
	while ($line =~ m/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr,hex($2),hex($1);
	}
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# The comparison is made against the number of elements of @arr.
die "insane number of elements" if (scalar(@arr) != 64*16*37);
82e1051a39Sopenharmony_ci
# Header of the table object: a global symbol placed in .rodata.
$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# @arr is consumed in 37 slices of 64*16 words. Within a slice, output
# row number $byte holds byte number $byte of each of the 64 16-word
# entries, so one .byte line carries 64 values.
for my $slice (1..37) {
	my @tbl = splice(@arr,0,64*16);
	for my $byte (0..63) {
		# word $byte/4 of an entry, shifted so the wanted byte is lowest
		my $shift = ($byte%4)*8;
		my @line = map { ($tbl[$_*16+$byte/4]>>$shift)&0xff } (0..63);
		$code.=".byte\t".join(',',map { sprintf "0x%02x",$_ } @line)."\n";
	}
}
# Close the precomputed table object, switch to .text and emit the two
# constants the Montgomery conversion entry points point at: .LRR and
# .Lone, followed by the identification string. Every element below is
# one output line; the trailing empty element supplies the final newline.
$code.=join("\n",
	".size\tecp_nistz256_precomputed,.-ecp_nistz256_precomputed",
	'',
	'.text',
	".align\t5",
	".LRR:\t".'@ 2^512 mod P precomputed for NIST P256 polynomial',
	".long\t0x00000003, 0x00000000, 0xffffffff, 0xfffffffb",
	".long\t0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004",
	'.Lone:',
	".long\t1,0,0,0,0,0,0,0",
	".asciz\t".'"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro@openssl.org>"',
	".align\t6",
	'');
120e1051a39Sopenharmony_ci
########################################################################
# Common register layout used by the routines that follow. Note that
# $t2 is r14, the link register, so an internal subroutine that uses
# $t2 has to offload lr first.

# pointer arguments and the carry/borrow scratch register
($r_ptr,$a_ptr,$b_ptr,$ff)=("r0","r1","r2","r3");
# eight working words
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map { "r$_" } (4..11);
($t1,$t2)=("r12","r14");
# $t0 and $t3 are second names for the registers of $ff and $a_ptr.
$t0=$ff;
$t3=$a_ptr;
128e1051a39Sopenharmony_ci
129e1051a39Sopenharmony_ci$code.=<<___;
130e1051a39Sopenharmony_ci@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
131e1051a39Sopenharmony_ci.globl	ecp_nistz256_to_mont
132e1051a39Sopenharmony_ci.type	ecp_nistz256_to_mont,%function
133e1051a39Sopenharmony_ciecp_nistz256_to_mont:
134e1051a39Sopenharmony_ci	adr	$b_ptr,.LRR
135e1051a39Sopenharmony_ci	b	.Lecp_nistz256_mul_mont
136e1051a39Sopenharmony_ci.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
137e1051a39Sopenharmony_ci
138e1051a39Sopenharmony_ci@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
139e1051a39Sopenharmony_ci.globl	ecp_nistz256_from_mont
140e1051a39Sopenharmony_ci.type	ecp_nistz256_from_mont,%function
141e1051a39Sopenharmony_ciecp_nistz256_from_mont:
142e1051a39Sopenharmony_ci	adr	$b_ptr,.Lone
143e1051a39Sopenharmony_ci	b	.Lecp_nistz256_mul_mont
144e1051a39Sopenharmony_ci.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
145e1051a39Sopenharmony_ci
146e1051a39Sopenharmony_ci@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
147e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_2
148e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_by_2,%function
149e1051a39Sopenharmony_ci.align	4
150e1051a39Sopenharmony_ciecp_nistz256_mul_by_2:
151e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
152e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_by_2
153e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
154e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
155e1051a39Sopenharmony_ci#else
156e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
157e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
158e1051a39Sopenharmony_ci#endif
159e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
160e1051a39Sopenharmony_ci
161e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_by_2,%function
162e1051a39Sopenharmony_ci.align	4
163e1051a39Sopenharmony_ci__ecp_nistz256_mul_by_2:
164e1051a39Sopenharmony_ci	ldr	$a0,[$a_ptr,#0]
165e1051a39Sopenharmony_ci	ldr	$a1,[$a_ptr,#4]
166e1051a39Sopenharmony_ci	ldr	$a2,[$a_ptr,#8]
167e1051a39Sopenharmony_ci	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
168e1051a39Sopenharmony_ci	ldr	$a3,[$a_ptr,#12]
169e1051a39Sopenharmony_ci	adcs	$a1,$a1,$a1
170e1051a39Sopenharmony_ci	ldr	$a4,[$a_ptr,#16]
171e1051a39Sopenharmony_ci	adcs	$a2,$a2,$a2
172e1051a39Sopenharmony_ci	ldr	$a5,[$a_ptr,#20]
173e1051a39Sopenharmony_ci	adcs	$a3,$a3,$a3
174e1051a39Sopenharmony_ci	ldr	$a6,[$a_ptr,#24]
175e1051a39Sopenharmony_ci	adcs	$a4,$a4,$a4
176e1051a39Sopenharmony_ci	ldr	$a7,[$a_ptr,#28]
177e1051a39Sopenharmony_ci	adcs	$a5,$a5,$a5
178e1051a39Sopenharmony_ci	adcs	$a6,$a6,$a6
179e1051a39Sopenharmony_ci	mov	$ff,#0
180e1051a39Sopenharmony_ci	adcs	$a7,$a7,$a7
181e1051a39Sopenharmony_ci	adc	$ff,$ff,#0
182e1051a39Sopenharmony_ci
183e1051a39Sopenharmony_ci	b	.Lreduce_by_sub
184e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
185e1051a39Sopenharmony_ci
186e1051a39Sopenharmony_ci@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
187e1051a39Sopenharmony_ci@					const BN_ULONG r2[8]);
188e1051a39Sopenharmony_ci.globl	ecp_nistz256_add
189e1051a39Sopenharmony_ci.type	ecp_nistz256_add,%function
190e1051a39Sopenharmony_ci.align	4
191e1051a39Sopenharmony_ciecp_nistz256_add:
192e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
193e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add
194e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
195e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
196e1051a39Sopenharmony_ci#else
197e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
198e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
199e1051a39Sopenharmony_ci#endif
200e1051a39Sopenharmony_ci.size	ecp_nistz256_add,.-ecp_nistz256_add
201e1051a39Sopenharmony_ci
202e1051a39Sopenharmony_ci.type	__ecp_nistz256_add,%function
203e1051a39Sopenharmony_ci.align	4
204e1051a39Sopenharmony_ci__ecp_nistz256_add:
205e1051a39Sopenharmony_ci	str	lr,[sp,#-4]!		@ push lr
206e1051a39Sopenharmony_ci
207e1051a39Sopenharmony_ci	ldr	$a0,[$a_ptr,#0]
208e1051a39Sopenharmony_ci	ldr	$a1,[$a_ptr,#4]
209e1051a39Sopenharmony_ci	ldr	$a2,[$a_ptr,#8]
210e1051a39Sopenharmony_ci	ldr	$a3,[$a_ptr,#12]
211e1051a39Sopenharmony_ci	ldr	$a4,[$a_ptr,#16]
212e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#0]
213e1051a39Sopenharmony_ci	ldr	$a5,[$a_ptr,#20]
214e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#4]
215e1051a39Sopenharmony_ci	ldr	$a6,[$a_ptr,#24]
216e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#8]
217e1051a39Sopenharmony_ci	ldr	$a7,[$a_ptr,#28]
218e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#12]
219e1051a39Sopenharmony_ci	adds	$a0,$a0,$t0
220e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#16]
221e1051a39Sopenharmony_ci	adcs	$a1,$a1,$t1
222e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#20]
223e1051a39Sopenharmony_ci	adcs	$a2,$a2,$t2
224e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#24]
225e1051a39Sopenharmony_ci	adcs	$a3,$a3,$t3
226e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#28]
227e1051a39Sopenharmony_ci	adcs	$a4,$a4,$t0
228e1051a39Sopenharmony_ci	adcs	$a5,$a5,$t1
229e1051a39Sopenharmony_ci	adcs	$a6,$a6,$t2
230e1051a39Sopenharmony_ci	mov	$ff,#0
231e1051a39Sopenharmony_ci	adcs	$a7,$a7,$t3
232e1051a39Sopenharmony_ci	adc	$ff,$ff,#0
233e1051a39Sopenharmony_ci	ldr	lr,[sp],#4		@ pop lr
234e1051a39Sopenharmony_ci
235e1051a39Sopenharmony_ci.Lreduce_by_sub:
236e1051a39Sopenharmony_ci
237e1051a39Sopenharmony_ci	@ if a+b >= modulus, subtract modulus.
238e1051a39Sopenharmony_ci	@
239e1051a39Sopenharmony_ci	@ But since comparison implies subtraction, we subtract
240e1051a39Sopenharmony_ci	@ modulus and then add it back if subtraction borrowed.
241e1051a39Sopenharmony_ci
242e1051a39Sopenharmony_ci	subs	$a0,$a0,#-1
243e1051a39Sopenharmony_ci	sbcs	$a1,$a1,#-1
244e1051a39Sopenharmony_ci	sbcs	$a2,$a2,#-1
245e1051a39Sopenharmony_ci	sbcs	$a3,$a3,#0
246e1051a39Sopenharmony_ci	sbcs	$a4,$a4,#0
247e1051a39Sopenharmony_ci	sbcs	$a5,$a5,#0
248e1051a39Sopenharmony_ci	sbcs	$a6,$a6,#1
249e1051a39Sopenharmony_ci	sbcs	$a7,$a7,#-1
250e1051a39Sopenharmony_ci	sbc	$ff,$ff,#0
251e1051a39Sopenharmony_ci
252e1051a39Sopenharmony_ci	@ Note that because mod has special form, i.e. consists of
253e1051a39Sopenharmony_ci	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
254e1051a39Sopenharmony_ci	@ using value of borrow as a whole or extracting single bit.
255e1051a39Sopenharmony_ci	@ Follow $ff register...
256e1051a39Sopenharmony_ci
257e1051a39Sopenharmony_ci	adds	$a0,$a0,$ff		@ add synthesized modulus
258e1051a39Sopenharmony_ci	adcs	$a1,$a1,$ff
259e1051a39Sopenharmony_ci	str	$a0,[$r_ptr,#0]
260e1051a39Sopenharmony_ci	adcs	$a2,$a2,$ff
261e1051a39Sopenharmony_ci	str	$a1,[$r_ptr,#4]
262e1051a39Sopenharmony_ci	adcs	$a3,$a3,#0
263e1051a39Sopenharmony_ci	str	$a2,[$r_ptr,#8]
264e1051a39Sopenharmony_ci	adcs	$a4,$a4,#0
265e1051a39Sopenharmony_ci	str	$a3,[$r_ptr,#12]
266e1051a39Sopenharmony_ci	adcs	$a5,$a5,#0
267e1051a39Sopenharmony_ci	str	$a4,[$r_ptr,#16]
268e1051a39Sopenharmony_ci	adcs	$a6,$a6,$ff,lsr#31
269e1051a39Sopenharmony_ci	str	$a5,[$r_ptr,#20]
270e1051a39Sopenharmony_ci	adcs	$a7,$a7,$ff
271e1051a39Sopenharmony_ci	str	$a6,[$r_ptr,#24]
272e1051a39Sopenharmony_ci	str	$a7,[$r_ptr,#28]
273e1051a39Sopenharmony_ci
274e1051a39Sopenharmony_ci	mov	pc,lr
275e1051a39Sopenharmony_ci.size	__ecp_nistz256_add,.-__ecp_nistz256_add
276e1051a39Sopenharmony_ci
277e1051a39Sopenharmony_ci@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
278e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_by_3
279e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_by_3,%function
280e1051a39Sopenharmony_ci.align	4
281e1051a39Sopenharmony_ciecp_nistz256_mul_by_3:
282e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
283e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_by_3
284e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
285e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
286e1051a39Sopenharmony_ci#else
287e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
288e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
289e1051a39Sopenharmony_ci#endif
290e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
291e1051a39Sopenharmony_ci
.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	 ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	 ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	 ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
361e1051a39Sopenharmony_ci
362e1051a39Sopenharmony_ci@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
363e1051a39Sopenharmony_ci.globl	ecp_nistz256_div_by_2
364e1051a39Sopenharmony_ci.type	ecp_nistz256_div_by_2,%function
365e1051a39Sopenharmony_ci.align	4
366e1051a39Sopenharmony_ciecp_nistz256_div_by_2:
367e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
368e1051a39Sopenharmony_ci	bl	__ecp_nistz256_div_by_2
369e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
370e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
371e1051a39Sopenharmony_ci#else
372e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
373e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
374e1051a39Sopenharmony_ci#endif
375e1051a39Sopenharmony_ci.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
376e1051a39Sopenharmony_ci
377e1051a39Sopenharmony_ci.type	__ecp_nistz256_div_by_2,%function
378e1051a39Sopenharmony_ci.align	4
379e1051a39Sopenharmony_ci__ecp_nistz256_div_by_2:
380e1051a39Sopenharmony_ci	@ ret = (a is odd ? a+mod : a) >> 1
381e1051a39Sopenharmony_ci
382e1051a39Sopenharmony_ci	ldr	$a0,[$a_ptr,#0]
383e1051a39Sopenharmony_ci	ldr	$a1,[$a_ptr,#4]
384e1051a39Sopenharmony_ci	ldr	$a2,[$a_ptr,#8]
385e1051a39Sopenharmony_ci	mov	$ff,$a0,lsl#31		@ place least significant bit to most
386e1051a39Sopenharmony_ci					@ significant position, now arithmetic
387e1051a39Sopenharmony_ci					@ right shift by 31 will produce -1 or
388e1051a39Sopenharmony_ci					@ 0, while logical right shift 1 or 0,
389e1051a39Sopenharmony_ci					@ this is how modulus is conditionally
390e1051a39Sopenharmony_ci					@ synthesized in this case...
391e1051a39Sopenharmony_ci	ldr	$a3,[$a_ptr,#12]
392e1051a39Sopenharmony_ci	adds	$a0,$a0,$ff,asr#31
393e1051a39Sopenharmony_ci	ldr	$a4,[$a_ptr,#16]
394e1051a39Sopenharmony_ci	adcs	$a1,$a1,$ff,asr#31
395e1051a39Sopenharmony_ci	ldr	$a5,[$a_ptr,#20]
396e1051a39Sopenharmony_ci	adcs	$a2,$a2,$ff,asr#31
397e1051a39Sopenharmony_ci	ldr	$a6,[$a_ptr,#24]
398e1051a39Sopenharmony_ci	adcs	$a3,$a3,#0
399e1051a39Sopenharmony_ci	ldr	$a7,[$a_ptr,#28]
400e1051a39Sopenharmony_ci	adcs	$a4,$a4,#0
401e1051a39Sopenharmony_ci	 mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
402e1051a39Sopenharmony_ci					@ because it doesn't affect flags
403e1051a39Sopenharmony_ci	adcs	$a5,$a5,#0
404e1051a39Sopenharmony_ci	 orr	$a0,$a0,$a1,lsl#31
405e1051a39Sopenharmony_ci	adcs	$a6,$a6,$ff,lsr#31
406e1051a39Sopenharmony_ci	mov	$b_ptr,#0
407e1051a39Sopenharmony_ci	adcs	$a7,$a7,$ff,asr#31
408e1051a39Sopenharmony_ci	 mov	$a1,$a1,lsr#1
409e1051a39Sopenharmony_ci	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition
410e1051a39Sopenharmony_ci
411e1051a39Sopenharmony_ci	orr	$a1,$a1,$a2,lsl#31
412e1051a39Sopenharmony_ci	mov	$a2,$a2,lsr#1
413e1051a39Sopenharmony_ci	str	$a0,[$r_ptr,#0]
414e1051a39Sopenharmony_ci	orr	$a2,$a2,$a3,lsl#31
415e1051a39Sopenharmony_ci	mov	$a3,$a3,lsr#1
416e1051a39Sopenharmony_ci	str	$a1,[$r_ptr,#4]
417e1051a39Sopenharmony_ci	orr	$a3,$a3,$a4,lsl#31
418e1051a39Sopenharmony_ci	mov	$a4,$a4,lsr#1
419e1051a39Sopenharmony_ci	str	$a2,[$r_ptr,#8]
420e1051a39Sopenharmony_ci	orr	$a4,$a4,$a5,lsl#31
421e1051a39Sopenharmony_ci	mov	$a5,$a5,lsr#1
422e1051a39Sopenharmony_ci	str	$a3,[$r_ptr,#12]
423e1051a39Sopenharmony_ci	orr	$a5,$a5,$a6,lsl#31
424e1051a39Sopenharmony_ci	mov	$a6,$a6,lsr#1
425e1051a39Sopenharmony_ci	str	$a4,[$r_ptr,#16]
426e1051a39Sopenharmony_ci	orr	$a6,$a6,$a7,lsl#31
427e1051a39Sopenharmony_ci	mov	$a7,$a7,lsr#1
428e1051a39Sopenharmony_ci	str	$a5,[$r_ptr,#20]
429e1051a39Sopenharmony_ci	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
430e1051a39Sopenharmony_ci	str	$a6,[$r_ptr,#24]
431e1051a39Sopenharmony_ci	str	$a7,[$r_ptr,#28]
432e1051a39Sopenharmony_ci
433e1051a39Sopenharmony_ci	mov	pc,lr
434e1051a39Sopenharmony_ci.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
435e1051a39Sopenharmony_ci
436e1051a39Sopenharmony_ci@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
437e1051a39Sopenharmony_ci@				        const BN_ULONG r2[8]);
438e1051a39Sopenharmony_ci.globl	ecp_nistz256_sub
439e1051a39Sopenharmony_ci.type	ecp_nistz256_sub,%function
440e1051a39Sopenharmony_ci.align	4
441e1051a39Sopenharmony_ciecp_nistz256_sub:
442e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
443e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub
444e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
445e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
446e1051a39Sopenharmony_ci#else
447e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
448e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
449e1051a39Sopenharmony_ci#endif
450e1051a39Sopenharmony_ci.size	ecp_nistz256_sub,.-ecp_nistz256_sub
451e1051a39Sopenharmony_ci
452e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub,%function
453e1051a39Sopenharmony_ci.align	4
454e1051a39Sopenharmony_ci__ecp_nistz256_sub:
455e1051a39Sopenharmony_ci	str	lr,[sp,#-4]!		@ push lr
456e1051a39Sopenharmony_ci
457e1051a39Sopenharmony_ci	ldr	$a0,[$a_ptr,#0]
458e1051a39Sopenharmony_ci	ldr	$a1,[$a_ptr,#4]
459e1051a39Sopenharmony_ci	ldr	$a2,[$a_ptr,#8]
460e1051a39Sopenharmony_ci	ldr	$a3,[$a_ptr,#12]
461e1051a39Sopenharmony_ci	ldr	$a4,[$a_ptr,#16]
462e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#0]
463e1051a39Sopenharmony_ci	ldr	$a5,[$a_ptr,#20]
464e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#4]
465e1051a39Sopenharmony_ci	ldr	$a6,[$a_ptr,#24]
466e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#8]
467e1051a39Sopenharmony_ci	ldr	$a7,[$a_ptr,#28]
468e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#12]
469e1051a39Sopenharmony_ci	subs	$a0,$a0,$t0
470e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#16]
471e1051a39Sopenharmony_ci	sbcs	$a1,$a1,$t1
472e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#20]
473e1051a39Sopenharmony_ci	sbcs	$a2,$a2,$t2
474e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#24]
475e1051a39Sopenharmony_ci	sbcs	$a3,$a3,$t3
476e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#28]
477e1051a39Sopenharmony_ci	sbcs	$a4,$a4,$t0
478e1051a39Sopenharmony_ci	sbcs	$a5,$a5,$t1
479e1051a39Sopenharmony_ci	sbcs	$a6,$a6,$t2
480e1051a39Sopenharmony_ci	sbcs	$a7,$a7,$t3
481e1051a39Sopenharmony_ci	sbc	$ff,$ff,$ff		@ broadcast borrow bit
482e1051a39Sopenharmony_ci	ldr	lr,[sp],#4		@ pop lr
483e1051a39Sopenharmony_ci
484e1051a39Sopenharmony_ci.Lreduce_by_add:
485e1051a39Sopenharmony_ci
486e1051a39Sopenharmony_ci	@ if a-b borrows, add modulus.
487e1051a39Sopenharmony_ci	@
488e1051a39Sopenharmony_ci	@ Note that because mod has special form, i.e. consists of
489e1051a39Sopenharmony_ci	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
490e1051a39Sopenharmony_ci	@ broadcasting borrow bit to a register, $ff, and using it as
491e1051a39Sopenharmony_ci	@ a whole or extracting single bit.
492e1051a39Sopenharmony_ci
493e1051a39Sopenharmony_ci	adds	$a0,$a0,$ff		@ add synthesized modulus
494e1051a39Sopenharmony_ci	adcs	$a1,$a1,$ff
495e1051a39Sopenharmony_ci	str	$a0,[$r_ptr,#0]
496e1051a39Sopenharmony_ci	adcs	$a2,$a2,$ff
497e1051a39Sopenharmony_ci	str	$a1,[$r_ptr,#4]
498e1051a39Sopenharmony_ci	adcs	$a3,$a3,#0
499e1051a39Sopenharmony_ci	str	$a2,[$r_ptr,#8]
500e1051a39Sopenharmony_ci	adcs	$a4,$a4,#0
501e1051a39Sopenharmony_ci	str	$a3,[$r_ptr,#12]
502e1051a39Sopenharmony_ci	adcs	$a5,$a5,#0
503e1051a39Sopenharmony_ci	str	$a4,[$r_ptr,#16]
504e1051a39Sopenharmony_ci	adcs	$a6,$a6,$ff,lsr#31
505e1051a39Sopenharmony_ci	str	$a5,[$r_ptr,#20]
506e1051a39Sopenharmony_ci	adcs	$a7,$a7,$ff
507e1051a39Sopenharmony_ci	str	$a6,[$r_ptr,#24]
508e1051a39Sopenharmony_ci	str	$a7,[$r_ptr,#28]
509e1051a39Sopenharmony_ci
510e1051a39Sopenharmony_ci	mov	pc,lr
511e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub
512e1051a39Sopenharmony_ci
513e1051a39Sopenharmony_ci@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
514e1051a39Sopenharmony_ci.globl	ecp_nistz256_neg
515e1051a39Sopenharmony_ci.type	ecp_nistz256_neg,%function
516e1051a39Sopenharmony_ci.align	4
517e1051a39Sopenharmony_ciecp_nistz256_neg:
518e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
519e1051a39Sopenharmony_ci	bl	__ecp_nistz256_neg
520e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
521e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
522e1051a39Sopenharmony_ci#else
523e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
524e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
525e1051a39Sopenharmony_ci#endif
526e1051a39Sopenharmony_ci.size	ecp_nistz256_neg,.-ecp_nistz256_neg
527e1051a39Sopenharmony_ci
528e1051a39Sopenharmony_ci.type	__ecp_nistz256_neg,%function
529e1051a39Sopenharmony_ci.align	4
530e1051a39Sopenharmony_ci__ecp_nistz256_neg:
531e1051a39Sopenharmony_ci	ldr	$a0,[$a_ptr,#0]
532e1051a39Sopenharmony_ci	eor	$ff,$ff,$ff
533e1051a39Sopenharmony_ci	ldr	$a1,[$a_ptr,#4]
534e1051a39Sopenharmony_ci	ldr	$a2,[$a_ptr,#8]
535e1051a39Sopenharmony_ci	subs	$a0,$ff,$a0
536e1051a39Sopenharmony_ci	ldr	$a3,[$a_ptr,#12]
537e1051a39Sopenharmony_ci	sbcs	$a1,$ff,$a1
538e1051a39Sopenharmony_ci	ldr	$a4,[$a_ptr,#16]
539e1051a39Sopenharmony_ci	sbcs	$a2,$ff,$a2
540e1051a39Sopenharmony_ci	ldr	$a5,[$a_ptr,#20]
541e1051a39Sopenharmony_ci	sbcs	$a3,$ff,$a3
542e1051a39Sopenharmony_ci	ldr	$a6,[$a_ptr,#24]
543e1051a39Sopenharmony_ci	sbcs	$a4,$ff,$a4
544e1051a39Sopenharmony_ci	ldr	$a7,[$a_ptr,#28]
545e1051a39Sopenharmony_ci	sbcs	$a5,$ff,$a5
546e1051a39Sopenharmony_ci	sbcs	$a6,$ff,$a6
547e1051a39Sopenharmony_ci	sbcs	$a7,$ff,$a7
548e1051a39Sopenharmony_ci	sbc	$ff,$ff,$ff
549e1051a39Sopenharmony_ci
550e1051a39Sopenharmony_ci	b	.Lreduce_by_add
551e1051a39Sopenharmony_ci.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
552e1051a39Sopenharmony_ci___
553e1051a39Sopenharmony_ci{
554e1051a39Sopenharmony_cimy @acc=map("r$_",(3..11));
555e1051a39Sopenharmony_cimy ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
556e1051a39Sopenharmony_ci
557e1051a39Sopenharmony_ci$code.=<<___;
558e1051a39Sopenharmony_ci@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
559e1051a39Sopenharmony_ci.globl	ecp_nistz256_sqr_mont
560e1051a39Sopenharmony_ci.type	ecp_nistz256_sqr_mont,%function
561e1051a39Sopenharmony_ci.align	4
562e1051a39Sopenharmony_ciecp_nistz256_sqr_mont:
563e1051a39Sopenharmony_ci	mov	$b_ptr,$a_ptr
564e1051a39Sopenharmony_ci	b	.Lecp_nistz256_mul_mont
565e1051a39Sopenharmony_ci.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
566e1051a39Sopenharmony_ci
567e1051a39Sopenharmony_ci@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
568e1051a39Sopenharmony_ci@					     const BN_ULONG r2[8]);
569e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_mont
570e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_mont,%function
571e1051a39Sopenharmony_ci.align	4
572e1051a39Sopenharmony_ciecp_nistz256_mul_mont:
573e1051a39Sopenharmony_ci.Lecp_nistz256_mul_mont:
574e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r12,lr}
575e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont
576e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
577e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
578e1051a39Sopenharmony_ci#else
579e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
580e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
581e1051a39Sopenharmony_ci#endif
582e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
583e1051a39Sopenharmony_ci
584e1051a39Sopenharmony_ci.type	__ecp_nistz256_mul_mont,%function
585e1051a39Sopenharmony_ci.align	4
586e1051a39Sopenharmony_ci__ecp_nistz256_mul_mont:
587e1051a39Sopenharmony_ci	stmdb	sp!,{r0-r2,lr}			@ make a copy of arguments too
588e1051a39Sopenharmony_ci
589e1051a39Sopenharmony_ci	ldr	$bj,[$b_ptr,#0]			@ b[0]
590e1051a39Sopenharmony_ci	ldmia	$a_ptr,{@acc[1]-@acc[8]}
591e1051a39Sopenharmony_ci
592e1051a39Sopenharmony_ci	umull	@acc[0],$t3,@acc[1],$bj		@ r[0]=a[0]*b[0]
593e1051a39Sopenharmony_ci	stmdb	sp!,{$acc[1]-@acc[8]}		@ copy a[0-7] to stack, so
594e1051a39Sopenharmony_ci						@ that it can be addressed
595e1051a39Sopenharmony_ci						@ without spending register
596e1051a39Sopenharmony_ci						@ on address
597e1051a39Sopenharmony_ci	umull	@acc[1],$t0,@acc[2],$bj		@ r[1]=a[1]*b[0]
598e1051a39Sopenharmony_ci	umull	@acc[2],$t1,@acc[3],$bj
599e1051a39Sopenharmony_ci	adds	@acc[1],@acc[1],$t3		@ accumulate high part of mult
600e1051a39Sopenharmony_ci	umull	@acc[3],$t2,@acc[4],$bj
601e1051a39Sopenharmony_ci	adcs	@acc[2],@acc[2],$t0
602e1051a39Sopenharmony_ci	umull	@acc[4],$t3,@acc[5],$bj
603e1051a39Sopenharmony_ci	adcs	@acc[3],@acc[3],$t1
604e1051a39Sopenharmony_ci	umull	@acc[5],$t0,@acc[6],$bj
605e1051a39Sopenharmony_ci	adcs	@acc[4],@acc[4],$t2
606e1051a39Sopenharmony_ci	umull	@acc[6],$t1,@acc[7],$bj
607e1051a39Sopenharmony_ci	adcs	@acc[5],@acc[5],$t3
608e1051a39Sopenharmony_ci	umull	@acc[7],$t2,@acc[8],$bj
609e1051a39Sopenharmony_ci	adcs	@acc[6],@acc[6],$t0
610e1051a39Sopenharmony_ci	adcs	@acc[7],@acc[7],$t1
611e1051a39Sopenharmony_ci	eor	$t3,$t3,$t3			@ first overflow bit is zero
612e1051a39Sopenharmony_ci	adc	@acc[8],$t2,#0
613e1051a39Sopenharmony_ci___
614e1051a39Sopenharmony_cifor(my $i=1;$i<8;$i++) {
615e1051a39Sopenharmony_cimy $t4=@acc[0];
616e1051a39Sopenharmony_ci
617e1051a39Sopenharmony_ci	# Reduction iteration is normally performed by accumulating
618e1051a39Sopenharmony_ci	# result of multiplication of modulus by "magic" digit [and
619e1051a39Sopenharmony_ci	# omitting least significant word, which is guaranteed to
620e1051a39Sopenharmony_ci	# be 0], but thanks to special form of modulus and "magic"
621e1051a39Sopenharmony_ci	# digit being equal to least significant word, it can be
622e1051a39Sopenharmony_ci	# performed with additions and subtractions alone. Indeed:
623e1051a39Sopenharmony_ci	#
624e1051a39Sopenharmony_ci	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
625e1051a39Sopenharmony_ci	# *                                         abcd
626e1051a39Sopenharmony_ci	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
627e1051a39Sopenharmony_ci	#
628e1051a39Sopenharmony_ci	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
629e1051a39Sopenharmony_ci	# rewrite above as:
630e1051a39Sopenharmony_ci	#
631e1051a39Sopenharmony_ci	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
632e1051a39Sopenharmony_ci	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
633e1051a39Sopenharmony_ci	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
634e1051a39Sopenharmony_ci	#
635e1051a39Sopenharmony_ci	# or marking redundant operations:
636e1051a39Sopenharmony_ci	#
637e1051a39Sopenharmony_ci	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
638e1051a39Sopenharmony_ci	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
639e1051a39Sopenharmony_ci	# -      abcd.----.----.----.----.----.----.----
640e1051a39Sopenharmony_ci
641e1051a39Sopenharmony_ci$code.=<<___;
642e1051a39Sopenharmony_ci	@ multiplication-less reduction $i
643e1051a39Sopenharmony_ci	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
644e1051a39Sopenharmony_ci	 ldr	$bj,[sp,#40]			@ restore b_ptr
645e1051a39Sopenharmony_ci	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
646e1051a39Sopenharmony_ci	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
647e1051a39Sopenharmony_ci	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
648e1051a39Sopenharmony_ci	 ldr	$t1,[sp,#0]			@ load a[0]
649e1051a39Sopenharmony_ci	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
650e1051a39Sopenharmony_ci	 ldr	$bj,[$bj,#4*$i]			@ load b[i]
651e1051a39Sopenharmony_ci	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
652e1051a39Sopenharmony_ci	 eor	$t0,$t0,$t0
653e1051a39Sopenharmony_ci	adc	$t3,$t3,#0			@ overflow bit
654e1051a39Sopenharmony_ci	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
655e1051a39Sopenharmony_ci	 ldr	$t2,[sp,#4]			@ a[1]
656e1051a39Sopenharmony_ci	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
657e1051a39Sopenharmony_ci	 umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
658e1051a39Sopenharmony_ci	 eor	$t1,$t1,$t1
659e1051a39Sopenharmony_ci	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
660e1051a39Sopenharmony_ci						@ that netto result is
661e1051a39Sopenharmony_ci						@ addition of a value which
662e1051a39Sopenharmony_ci						@ makes underflow impossible
663e1051a39Sopenharmony_ci
664e1051a39Sopenharmony_ci	ldr	$t3,[sp,#8]			@ a[2]
665e1051a39Sopenharmony_ci	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
666e1051a39Sopenharmony_ci	 str	@acc[0],[sp,#36]		@ temporarily offload overflow
667e1051a39Sopenharmony_ci	eor	$t2,$t2,$t2
668e1051a39Sopenharmony_ci	ldr	$t4,[sp,#12]			@ a[3], $t4 is alias @acc[0]
669e1051a39Sopenharmony_ci	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
670e1051a39Sopenharmony_ci	eor	$t3,$t3,$t3
671e1051a39Sopenharmony_ci	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
672e1051a39Sopenharmony_ci	ldr	$t0,[sp,#16]			@ a[4]
673e1051a39Sopenharmony_ci	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
674e1051a39Sopenharmony_ci	eor	$t4,$t4,$t4
675e1051a39Sopenharmony_ci	adcs	@acc[3],@acc[3],$t1
676e1051a39Sopenharmony_ci	ldr	$t1,[sp,#20]			@ a[5]
677e1051a39Sopenharmony_ci	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
678e1051a39Sopenharmony_ci	eor	$t0,$t0,$t0
679e1051a39Sopenharmony_ci	adcs	@acc[4],@acc[4],$t2
680e1051a39Sopenharmony_ci	ldr	$t2,[sp,#24]			@ a[6]
681e1051a39Sopenharmony_ci	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
682e1051a39Sopenharmony_ci	eor	$t1,$t1,$t1
683e1051a39Sopenharmony_ci	adcs	@acc[5],@acc[5],$t3
684e1051a39Sopenharmony_ci	ldr	$t3,[sp,#28]			@ a[7]
685e1051a39Sopenharmony_ci	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
686e1051a39Sopenharmony_ci	eor	$t2,$t2,$t2
687e1051a39Sopenharmony_ci	adcs	@acc[6],@acc[6],$t4
688e1051a39Sopenharmony_ci	 ldr	@acc[0],[sp,#36]		@ restore overflow bit
689e1051a39Sopenharmony_ci	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
690e1051a39Sopenharmony_ci	eor	$t3,$t3,$t3
691e1051a39Sopenharmony_ci	adcs	@acc[7],@acc[7],$t0
692e1051a39Sopenharmony_ci	adcs	@acc[8],@acc[8],$t1
693e1051a39Sopenharmony_ci	adcs	@acc[0],$acc[0],$t2
694e1051a39Sopenharmony_ci	adc	$t3,$t3,#0			@ new overflow bit
695e1051a39Sopenharmony_ci___
696e1051a39Sopenharmony_ci	push(@acc,shift(@acc));			# rotate registers, so that
697e1051a39Sopenharmony_ci						# "r[i]" becomes r[i]
698e1051a39Sopenharmony_ci}
699e1051a39Sopenharmony_ci$code.=<<___;
700e1051a39Sopenharmony_ci	@ last multiplication-less reduction
701e1051a39Sopenharmony_ci	adds	@acc[3],@acc[3],@acc[0]
702e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32]			@ restore r_ptr
703e1051a39Sopenharmony_ci	adcs	@acc[4],@acc[4],#0
704e1051a39Sopenharmony_ci	adcs	@acc[5],@acc[5],#0
705e1051a39Sopenharmony_ci	adcs	@acc[6],@acc[6],@acc[0]
706e1051a39Sopenharmony_ci	adcs	@acc[7],@acc[7],#0
707e1051a39Sopenharmony_ci	adcs	@acc[8],@acc[8],@acc[0]
708e1051a39Sopenharmony_ci	adc	$t3,$t3,#0
709e1051a39Sopenharmony_ci	subs	@acc[7],@acc[7],@acc[0]
710e1051a39Sopenharmony_ci	sbcs	@acc[8],@acc[8],#0
711e1051a39Sopenharmony_ci	sbc	@acc[0],$t3,#0			@ overflow bit
712e1051a39Sopenharmony_ci
713e1051a39Sopenharmony_ci	@ Final step is "if result > mod, subtract mod", but we do it
714e1051a39Sopenharmony_ci	@ "other way around", namely subtract modulus from result
715e1051a39Sopenharmony_ci	@ and if it borrowed, add modulus back.
716e1051a39Sopenharmony_ci
717e1051a39Sopenharmony_ci	adds	@acc[1],@acc[1],#1		@ subs	@acc[1],@acc[1],#-1
718e1051a39Sopenharmony_ci	adcs	@acc[2],@acc[2],#0		@ sbcs	@acc[2],@acc[2],#-1
719e1051a39Sopenharmony_ci	adcs	@acc[3],@acc[3],#0		@ sbcs	@acc[3],@acc[3],#-1
720e1051a39Sopenharmony_ci	sbcs	@acc[4],@acc[4],#0
721e1051a39Sopenharmony_ci	sbcs	@acc[5],@acc[5],#0
722e1051a39Sopenharmony_ci	sbcs	@acc[6],@acc[6],#0
723e1051a39Sopenharmony_ci	sbcs	@acc[7],@acc[7],#1
724e1051a39Sopenharmony_ci	adcs	@acc[8],@acc[8],#0		@ sbcs	@acc[8],@acc[8],#-1
725e1051a39Sopenharmony_ci	ldr	lr,[sp,#44]			@ restore lr
726e1051a39Sopenharmony_ci	sbc	@acc[0],@acc[0],#0		@ broadcast borrow bit
727e1051a39Sopenharmony_ci	add	sp,sp,#48
728e1051a39Sopenharmony_ci
729e1051a39Sopenharmony_ci	@ Note that because mod has special form, i.e. consists of
730e1051a39Sopenharmony_ci	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
731e1051a39Sopenharmony_ci	@ broadcasting borrow bit to a register, @acc[0], and using it as
732e1051a39Sopenharmony_ci	@ a whole or extracting single bit.
733e1051a39Sopenharmony_ci
734e1051a39Sopenharmony_ci	adds	@acc[1],@acc[1],@acc[0]		@ add modulus or zero
735e1051a39Sopenharmony_ci	adcs	@acc[2],@acc[2],@acc[0]
736e1051a39Sopenharmony_ci	str	@acc[1],[$r_ptr,#0]
737e1051a39Sopenharmony_ci	adcs	@acc[3],@acc[3],@acc[0]
738e1051a39Sopenharmony_ci	str	@acc[2],[$r_ptr,#4]
739e1051a39Sopenharmony_ci	adcs	@acc[4],@acc[4],#0
740e1051a39Sopenharmony_ci	str	@acc[3],[$r_ptr,#8]
741e1051a39Sopenharmony_ci	adcs	@acc[5],@acc[5],#0
742e1051a39Sopenharmony_ci	str	@acc[4],[$r_ptr,#12]
743e1051a39Sopenharmony_ci	adcs	@acc[6],@acc[6],#0
744e1051a39Sopenharmony_ci	str	@acc[5],[$r_ptr,#16]
745e1051a39Sopenharmony_ci	adcs	@acc[7],@acc[7],@acc[0],lsr#31
746e1051a39Sopenharmony_ci	str	@acc[6],[$r_ptr,#20]
747e1051a39Sopenharmony_ci	adc	@acc[8],@acc[8],@acc[0]
748e1051a39Sopenharmony_ci	str	@acc[7],[$r_ptr,#24]
749e1051a39Sopenharmony_ci	str	@acc[8],[$r_ptr,#28]
750e1051a39Sopenharmony_ci
751e1051a39Sopenharmony_ci	mov	pc,lr
752e1051a39Sopenharmony_ci.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
753e1051a39Sopenharmony_ci___
754e1051a39Sopenharmony_ci}
755e1051a39Sopenharmony_ci
756e1051a39Sopenharmony_ci{
# Table scatter/gather helpers (w5 and w7 variants).
#
# Register names used in the assembly below:
#   $out   (r0) - destination pointer
#   $inp   (r1) - source pointer
#   $index (r2) - table index; the w7 routines reuse it as loop counter
#   $mask  (r3) - 0 / -1 mask in the gather routines, data scratch in
#                 scatter_w7
#
# w5 layout: a point is three groups (X, Y, Z) of eight 32-bit words.
# Consecutive words of one point are stored 64 bytes apart and groups
# are 64*8 bytes apart.  scatter_w5 stores at out+index*4-4 and
# gather_w5 decrements a non-zero index before loading, so both address
# slot index-1.
#
# w7 layout: 64 bytes of input are handled as 16 words; the four bytes
# of each word are stored 64 bytes apart.  scatter_w7 stores at
# out+index with no adjustment, while gather_w7 loads from inp+index-1
# for a non-zero index.
# NOTE(review): this implies callers pass a zero-based index to
# scatter_w7 and a one-based index to gather_w7 - confirm against the
# C callers.
#
# In both gather routines index 0 is not an error: $mask stays 0 and
# the loaded words are ANDed with it, so the output is all zeros.
757e1051a39Sopenharmony_cimy ($out,$inp,$index,$mask)=map("r$_",(0..3));
758e1051a39Sopenharmony_ci$code.=<<___;
759e1051a39Sopenharmony_ci@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
760e1051a39Sopenharmony_ci@					 int r2);
761e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w5
762e1051a39Sopenharmony_ci.type	ecp_nistz256_scatter_w5,%function
763e1051a39Sopenharmony_ci.align	5
764e1051a39Sopenharmony_ciecp_nistz256_scatter_w5:
765e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r11}
766e1051a39Sopenharmony_ci
767e1051a39Sopenharmony_ci	add	$out,$out,$index,lsl#2
768e1051a39Sopenharmony_ci
769e1051a39Sopenharmony_ci	ldmia	$inp!,{r4-r11}		@ X
770e1051a39Sopenharmony_ci	str	r4,[$out,#64*0-4]
771e1051a39Sopenharmony_ci	str	r5,[$out,#64*1-4]
772e1051a39Sopenharmony_ci	str	r6,[$out,#64*2-4]
773e1051a39Sopenharmony_ci	str	r7,[$out,#64*3-4]
774e1051a39Sopenharmony_ci	str	r8,[$out,#64*4-4]
775e1051a39Sopenharmony_ci	str	r9,[$out,#64*5-4]
776e1051a39Sopenharmony_ci	str	r10,[$out,#64*6-4]
777e1051a39Sopenharmony_ci	str	r11,[$out,#64*7-4]
778e1051a39Sopenharmony_ci	add	$out,$out,#64*8
779e1051a39Sopenharmony_ci
780e1051a39Sopenharmony_ci	ldmia	$inp!,{r4-r11}		@ Y
781e1051a39Sopenharmony_ci	str	r4,[$out,#64*0-4]
782e1051a39Sopenharmony_ci	str	r5,[$out,#64*1-4]
783e1051a39Sopenharmony_ci	str	r6,[$out,#64*2-4]
784e1051a39Sopenharmony_ci	str	r7,[$out,#64*3-4]
785e1051a39Sopenharmony_ci	str	r8,[$out,#64*4-4]
786e1051a39Sopenharmony_ci	str	r9,[$out,#64*5-4]
787e1051a39Sopenharmony_ci	str	r10,[$out,#64*6-4]
788e1051a39Sopenharmony_ci	str	r11,[$out,#64*7-4]
789e1051a39Sopenharmony_ci	add	$out,$out,#64*8
790e1051a39Sopenharmony_ci
791e1051a39Sopenharmony_ci	ldmia	$inp,{r4-r11}		@ Z
792e1051a39Sopenharmony_ci	str	r4,[$out,#64*0-4]
793e1051a39Sopenharmony_ci	str	r5,[$out,#64*1-4]
794e1051a39Sopenharmony_ci	str	r6,[$out,#64*2-4]
795e1051a39Sopenharmony_ci	str	r7,[$out,#64*3-4]
796e1051a39Sopenharmony_ci	str	r8,[$out,#64*4-4]
797e1051a39Sopenharmony_ci	str	r9,[$out,#64*5-4]
798e1051a39Sopenharmony_ci	str	r10,[$out,#64*6-4]
799e1051a39Sopenharmony_ci	str	r11,[$out,#64*7-4]
800e1051a39Sopenharmony_ci
801e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r11}
802e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__)
803e1051a39Sopenharmony_ci	bx	lr
804e1051a39Sopenharmony_ci#else
805e1051a39Sopenharmony_ci	mov	pc,lr
806e1051a39Sopenharmony_ci#endif
807e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
808e1051a39Sopenharmony_ci
809e1051a39Sopenharmony_ci@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
810e1051a39Sopenharmony_ci@					      int r2);
811e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w5
812e1051a39Sopenharmony_ci.type	ecp_nistz256_gather_w5,%function
813e1051a39Sopenharmony_ci.align	5
814e1051a39Sopenharmony_ciecp_nistz256_gather_w5:
815e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r11}
816e1051a39Sopenharmony_ci
817e1051a39Sopenharmony_ci	cmp	$index,#0
818e1051a39Sopenharmony_ci	mov	$mask,#0
819e1051a39Sopenharmony_ci#ifdef	__thumb2__
820e1051a39Sopenharmony_ci	itt	ne
821e1051a39Sopenharmony_ci#endif
822e1051a39Sopenharmony_ci	subne	$index,$index,#1
823e1051a39Sopenharmony_ci	movne	$mask,#-1
824e1051a39Sopenharmony_ci	add	$inp,$inp,$index,lsl#2
825e1051a39Sopenharmony_ci
826e1051a39Sopenharmony_ci	ldr	r4,[$inp,#64*0]
827e1051a39Sopenharmony_ci	ldr	r5,[$inp,#64*1]
828e1051a39Sopenharmony_ci	ldr	r6,[$inp,#64*2]
829e1051a39Sopenharmony_ci	and	r4,r4,$mask
830e1051a39Sopenharmony_ci	ldr	r7,[$inp,#64*3]
831e1051a39Sopenharmony_ci	and	r5,r5,$mask
832e1051a39Sopenharmony_ci	ldr	r8,[$inp,#64*4]
833e1051a39Sopenharmony_ci	and	r6,r6,$mask
834e1051a39Sopenharmony_ci	ldr	r9,[$inp,#64*5]
835e1051a39Sopenharmony_ci	and	r7,r7,$mask
836e1051a39Sopenharmony_ci	ldr	r10,[$inp,#64*6]
837e1051a39Sopenharmony_ci	and	r8,r8,$mask
838e1051a39Sopenharmony_ci	ldr	r11,[$inp,#64*7]
839e1051a39Sopenharmony_ci	add	$inp,$inp,#64*8
840e1051a39Sopenharmony_ci	and	r9,r9,$mask
841e1051a39Sopenharmony_ci	and	r10,r10,$mask
842e1051a39Sopenharmony_ci	and	r11,r11,$mask
843e1051a39Sopenharmony_ci	stmia	$out!,{r4-r11}	@ X
844e1051a39Sopenharmony_ci
845e1051a39Sopenharmony_ci	ldr	r4,[$inp,#64*0]
846e1051a39Sopenharmony_ci	ldr	r5,[$inp,#64*1]
847e1051a39Sopenharmony_ci	ldr	r6,[$inp,#64*2]
848e1051a39Sopenharmony_ci	and	r4,r4,$mask
849e1051a39Sopenharmony_ci	ldr	r7,[$inp,#64*3]
850e1051a39Sopenharmony_ci	and	r5,r5,$mask
851e1051a39Sopenharmony_ci	ldr	r8,[$inp,#64*4]
852e1051a39Sopenharmony_ci	and	r6,r6,$mask
853e1051a39Sopenharmony_ci	ldr	r9,[$inp,#64*5]
854e1051a39Sopenharmony_ci	and	r7,r7,$mask
855e1051a39Sopenharmony_ci	ldr	r10,[$inp,#64*6]
856e1051a39Sopenharmony_ci	and	r8,r8,$mask
857e1051a39Sopenharmony_ci	ldr	r11,[$inp,#64*7]
858e1051a39Sopenharmony_ci	add	$inp,$inp,#64*8
859e1051a39Sopenharmony_ci	and	r9,r9,$mask
860e1051a39Sopenharmony_ci	and	r10,r10,$mask
861e1051a39Sopenharmony_ci	and	r11,r11,$mask
862e1051a39Sopenharmony_ci	stmia	$out!,{r4-r11}	@ Y
863e1051a39Sopenharmony_ci
864e1051a39Sopenharmony_ci	ldr	r4,[$inp,#64*0]
865e1051a39Sopenharmony_ci	ldr	r5,[$inp,#64*1]
866e1051a39Sopenharmony_ci	ldr	r6,[$inp,#64*2]
867e1051a39Sopenharmony_ci	and	r4,r4,$mask
868e1051a39Sopenharmony_ci	ldr	r7,[$inp,#64*3]
869e1051a39Sopenharmony_ci	and	r5,r5,$mask
870e1051a39Sopenharmony_ci	ldr	r8,[$inp,#64*4]
871e1051a39Sopenharmony_ci	and	r6,r6,$mask
872e1051a39Sopenharmony_ci	ldr	r9,[$inp,#64*5]
873e1051a39Sopenharmony_ci	and	r7,r7,$mask
874e1051a39Sopenharmony_ci	ldr	r10,[$inp,#64*6]
875e1051a39Sopenharmony_ci	and	r8,r8,$mask
876e1051a39Sopenharmony_ci	ldr	r11,[$inp,#64*7]
877e1051a39Sopenharmony_ci	and	r9,r9,$mask
878e1051a39Sopenharmony_ci	and	r10,r10,$mask
879e1051a39Sopenharmony_ci	and	r11,r11,$mask
880e1051a39Sopenharmony_ci	stmia	$out,{r4-r11}		@ Z
881e1051a39Sopenharmony_ci
882e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r11}
883e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__)
884e1051a39Sopenharmony_ci	bx	lr
885e1051a39Sopenharmony_ci#else
886e1051a39Sopenharmony_ci	mov	pc,lr
887e1051a39Sopenharmony_ci#endif
888e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
889e1051a39Sopenharmony_ci
890e1051a39Sopenharmony_ci@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
891e1051a39Sopenharmony_ci@					 int r2);
892e1051a39Sopenharmony_ci.globl	ecp_nistz256_scatter_w7
893e1051a39Sopenharmony_ci.type	ecp_nistz256_scatter_w7,%function
894e1051a39Sopenharmony_ci.align	5
895e1051a39Sopenharmony_ciecp_nistz256_scatter_w7:
896e1051a39Sopenharmony_ci	add	$out,$out,$index
897e1051a39Sopenharmony_ci	mov	$index,#64/4
898e1051a39Sopenharmony_ci.Loop_scatter_w7:
899e1051a39Sopenharmony_ci	ldr	$mask,[$inp],#4
900e1051a39Sopenharmony_ci	subs	$index,$index,#1
901e1051a39Sopenharmony_ci	strb	$mask,[$out,#64*0]
902e1051a39Sopenharmony_ci	mov	$mask,$mask,lsr#8
903e1051a39Sopenharmony_ci	strb	$mask,[$out,#64*1]
904e1051a39Sopenharmony_ci	mov	$mask,$mask,lsr#8
905e1051a39Sopenharmony_ci	strb	$mask,[$out,#64*2]
906e1051a39Sopenharmony_ci	mov	$mask,$mask,lsr#8
907e1051a39Sopenharmony_ci	strb	$mask,[$out,#64*3]
908e1051a39Sopenharmony_ci	add	$out,$out,#64*4
909e1051a39Sopenharmony_ci	bne	.Loop_scatter_w7
910e1051a39Sopenharmony_ci
911e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__)
912e1051a39Sopenharmony_ci	bx	lr
913e1051a39Sopenharmony_ci#else
914e1051a39Sopenharmony_ci	mov	pc,lr
915e1051a39Sopenharmony_ci#endif
916e1051a39Sopenharmony_ci.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
917e1051a39Sopenharmony_ci
918e1051a39Sopenharmony_ci@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
919e1051a39Sopenharmony_ci@						     int r2);
920e1051a39Sopenharmony_ci.globl	ecp_nistz256_gather_w7
921e1051a39Sopenharmony_ci.type	ecp_nistz256_gather_w7,%function
922e1051a39Sopenharmony_ci.align	5
923e1051a39Sopenharmony_ciecp_nistz256_gather_w7:
924e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r7}
925e1051a39Sopenharmony_ci
926e1051a39Sopenharmony_ci	cmp	$index,#0
927e1051a39Sopenharmony_ci	mov	$mask,#0
928e1051a39Sopenharmony_ci#ifdef	__thumb2__
929e1051a39Sopenharmony_ci	itt	ne
930e1051a39Sopenharmony_ci#endif
931e1051a39Sopenharmony_ci	subne	$index,$index,#1
932e1051a39Sopenharmony_ci	movne	$mask,#-1
933e1051a39Sopenharmony_ci	add	$inp,$inp,$index
934e1051a39Sopenharmony_ci	mov	$index,#64/4
935e1051a39Sopenharmony_ci	nop
936e1051a39Sopenharmony_ci.Loop_gather_w7:
937e1051a39Sopenharmony_ci	ldrb	r4,[$inp,#64*0]
938e1051a39Sopenharmony_ci	subs	$index,$index,#1
939e1051a39Sopenharmony_ci	ldrb	r5,[$inp,#64*1]
940e1051a39Sopenharmony_ci	ldrb	r6,[$inp,#64*2]
941e1051a39Sopenharmony_ci	ldrb	r7,[$inp,#64*3]
942e1051a39Sopenharmony_ci	add	$inp,$inp,#64*4
943e1051a39Sopenharmony_ci	orr	r4,r4,r5,lsl#8
944e1051a39Sopenharmony_ci	orr	r4,r4,r6,lsl#16
945e1051a39Sopenharmony_ci	orr	r4,r4,r7,lsl#24
946e1051a39Sopenharmony_ci	and	r4,r4,$mask
947e1051a39Sopenharmony_ci	str	r4,[$out],#4
948e1051a39Sopenharmony_ci	bne	.Loop_gather_w7
949e1051a39Sopenharmony_ci
950e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r7}
951e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || defined(__thumb__)
952e1051a39Sopenharmony_ci	bx	lr
953e1051a39Sopenharmony_ci#else
954e1051a39Sopenharmony_ci	mov	pc,lr
955e1051a39Sopenharmony_ci#endif
956e1051a39Sopenharmony_ci.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
957e1051a39Sopenharmony_ci___
958e1051a39Sopenharmony_ci}
959e1051a39Sopenharmony_ciif (0) {
# NEON variant of the Montgomery multiplication.  The constant-false
# condition above means nothing in this block is ever appended to
# $code; the generator is kept in the file but disabled.
#
960e1051a39Sopenharmony_ci# In comparison to integer-only equivalent of below subroutine:
961e1051a39Sopenharmony_ci#
962e1051a39Sopenharmony_ci# Cortex-A8	+10%
963e1051a39Sopenharmony_ci# Cortex-A9	-10%
964e1051a39Sopenharmony_ci# Snapdragon S4	+5%
965e1051a39Sopenharmony_ci#
966e1051a39Sopenharmony_ci# As not all time is spent in multiplication, overall impact is deemed
967e1051a39Sopenharmony_ci# too low to care about.
968e1051a39Sopenharmony_ci
# NEON register names.  map() yields d0..d7 but only seven names are
# bound, so d7 receives no name here.  $mask (q4) and $mult (q5) are the
# registers the prologue saves with vstmdb and the epilogue restores
# with vldmia.  @AxB names q8..q15, one accumulator per result column;
# the list is rotated once per word of b.
969e1051a39Sopenharmony_cimy ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
970e1051a39Sopenharmony_cimy $mask="q4";
971e1051a39Sopenharmony_cimy $mult="q5";
972e1051a39Sopenharmony_cimy @AxB=map("q$_",(8..15));
973e1051a39Sopenharmony_ci
# Integer registers r0-r3.  $toutptr (r3) is set to sp-40 and then
# copied into sp, so it addresses a 40-byte scratch area on the stack.
974e1051a39Sopenharmony_cimy ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
975e1051a39Sopenharmony_ci
# NOTE(review): the "#lo"/"#hi" suffixes on q registers used below are
# not assembler syntax as written; presumably a post-processing pass
# elsewhere in this file rewrites them to d registers - confirm.
#
# Prologue plus the first column: a[0..7]*b[0].
976e1051a39Sopenharmony_ci$code.=<<___;
977e1051a39Sopenharmony_ci#if __ARM_ARCH__>=7
978e1051a39Sopenharmony_ci.fpu	neon
979e1051a39Sopenharmony_ci
980e1051a39Sopenharmony_ci.globl	ecp_nistz256_mul_mont_neon
981e1051a39Sopenharmony_ci.type	ecp_nistz256_mul_mont_neon,%function
982e1051a39Sopenharmony_ci.align	5
983e1051a39Sopenharmony_ciecp_nistz256_mul_mont_neon:
984e1051a39Sopenharmony_ci	mov	ip,sp
985e1051a39Sopenharmony_ci	stmdb	sp!,{r4-r9}
986e1051a39Sopenharmony_ci	vstmdb	sp!,{q4-q5}		@ ABI specification says so
987e1051a39Sopenharmony_ci
988e1051a39Sopenharmony_ci	sub		$toutptr,sp,#40
989e1051a39Sopenharmony_ci	vld1.32		{${Bi}[0]},[$bptr,:32]!
990e1051a39Sopenharmony_ci	veor		$zero,$zero,$zero
991e1051a39Sopenharmony_ci	vld1.32		{$A0-$A3}, [$aptr]		@ can't specify :32 :-(
992e1051a39Sopenharmony_ci	vzip.16		$Bi,$zero
993e1051a39Sopenharmony_ci	mov		sp,$toutptr			@ alloca
994e1051a39Sopenharmony_ci	vmov.i64	$mask,#0xffff
995e1051a39Sopenharmony_ci
996e1051a39Sopenharmony_ci	vmull.u32	@AxB[0],$Bi,${A0}[0]
997e1051a39Sopenharmony_ci	vmull.u32	@AxB[1],$Bi,${A0}[1]
998e1051a39Sopenharmony_ci	vmull.u32	@AxB[2],$Bi,${A1}[0]
999e1051a39Sopenharmony_ci	vmull.u32	@AxB[3],$Bi,${A1}[1]
1000e1051a39Sopenharmony_ci	 vshr.u64	$temp,@AxB[0]#lo,#16
1001e1051a39Sopenharmony_ci	vmull.u32	@AxB[4],$Bi,${A2}[0]
1002e1051a39Sopenharmony_ci	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
1003e1051a39Sopenharmony_ci	vmull.u32	@AxB[5],$Bi,${A2}[1]
1004e1051a39Sopenharmony_ci	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 32 bits of a[0]*b[0]
1005e1051a39Sopenharmony_ci	vmull.u32	@AxB[6],$Bi,${A3}[0]
1006e1051a39Sopenharmony_ci	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1007e1051a39Sopenharmony_ci	vmull.u32	@AxB[7],$Bi,${A3}[1]
1008e1051a39Sopenharmony_ci___
# Seven more passes, one per remaining word of b: emit the reduction
# for the previous column while loading the next b word, rotate the
# accumulator names, then accumulate a[0..7]*b[i].
# NOTE(review): $i is not declared with "my" in this loop.
1009e1051a39Sopenharmony_cifor($i=1;$i<8;$i++) {
1010e1051a39Sopenharmony_ci$code.=<<___;
1011e1051a39Sopenharmony_ci	 vld1.32	{${Bi}[0]},[$bptr,:32]!
1012e1051a39Sopenharmony_ci	 veor		$zero,$zero,$zero
1013e1051a39Sopenharmony_ci	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
1014e1051a39Sopenharmony_ci	vshl.u64	$mult,@AxB[0],#32
1015e1051a39Sopenharmony_ci	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1016e1051a39Sopenharmony_ci	vsub.u64	$mult,$mult,@AxB[0]
1017e1051a39Sopenharmony_ci	 vzip.16	$Bi,$zero
1018e1051a39Sopenharmony_ci	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1019e1051a39Sopenharmony_ci	vadd.u64	@AxB[7],@AxB[7],$mult
1020e1051a39Sopenharmony_ci___
	# Rotate the names: the consumed column moves to the end of the
	# list, where the vmull below overwrites it as a fresh top column.
1021e1051a39Sopenharmony_ci	push(@AxB,shift(@AxB));
1022e1051a39Sopenharmony_ci$code.=<<___;
1023e1051a39Sopenharmony_ci	vmlal.u32	@AxB[0],$Bi,${A0}[0]
1024e1051a39Sopenharmony_ci	vmlal.u32	@AxB[1],$Bi,${A0}[1]
1025e1051a39Sopenharmony_ci	vmlal.u32	@AxB[2],$Bi,${A1}[0]
1026e1051a39Sopenharmony_ci	vmlal.u32	@AxB[3],$Bi,${A1}[1]
1027e1051a39Sopenharmony_ci	 vshr.u64	$temp,@AxB[0]#lo,#16
1028e1051a39Sopenharmony_ci	vmlal.u32	@AxB[4],$Bi,${A2}[0]
1029e1051a39Sopenharmony_ci	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
1030e1051a39Sopenharmony_ci	vmlal.u32	@AxB[5],$Bi,${A2}[1]
1031e1051a39Sopenharmony_ci	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 33 bits of a[0]*b[i]+t[0]
1032e1051a39Sopenharmony_ci	vmlal.u32	@AxB[6],$Bi,${A3}[0]
1033e1051a39Sopenharmony_ci	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1034e1051a39Sopenharmony_ci	vmull.u32	@AxB[7],$Bi,${A3}[1]
1035e1051a39Sopenharmony_ci___
1036e1051a39Sopenharmony_ci}
# Last reduction, then the start of the "convert" step on column 1.
1037e1051a39Sopenharmony_ci$code.=<<___;
1038e1051a39Sopenharmony_ci	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
1039e1051a39Sopenharmony_ci	vshl.u64	$mult,@AxB[0],#32
1040e1051a39Sopenharmony_ci	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1041e1051a39Sopenharmony_ci	vsub.u64	$mult,$mult,@AxB[0]
1042e1051a39Sopenharmony_ci	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1043e1051a39Sopenharmony_ci	vadd.u64	@AxB[7],@AxB[7],$mult
1044e1051a39Sopenharmony_ci
1045e1051a39Sopenharmony_ci	vshr.u64	$temp,@AxB[1]#lo,#16		@ convert
1046e1051a39Sopenharmony_ci	vadd.u64	@AxB[1]#hi,@AxB[1]#hi,$temp
1047e1051a39Sopenharmony_ci	vshr.u64	$temp,@AxB[1]#hi,#16
1048e1051a39Sopenharmony_ci	vzip.16		@AxB[1]#lo,@AxB[1]#hi
1049e1051a39Sopenharmony_ci___
# Same conversion for columns 2..7; each pass also stores the word
# finished by the previous pass through $toutptr.
1050e1051a39Sopenharmony_ciforeach (2..7) {
1051e1051a39Sopenharmony_ci$code.=<<___;
1052e1051a39Sopenharmony_ci	vadd.u64	@AxB[$_]#lo,@AxB[$_]#lo,$temp
1053e1051a39Sopenharmony_ci	vst1.32		{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
1054e1051a39Sopenharmony_ci	vshr.u64	$temp,@AxB[$_]#lo,#16
1055e1051a39Sopenharmony_ci	vadd.u64	@AxB[$_]#hi,@AxB[$_]#hi,$temp
1056e1051a39Sopenharmony_ci	vshr.u64	$temp,@AxB[$_]#hi,#16
1057e1051a39Sopenharmony_ci	vzip.16		@AxB[$_]#lo,@AxB[$_]#hi
1058e1051a39Sopenharmony_ci___
1059e1051a39Sopenharmony_ci}
# Integer tail: store the last word and the top bits, reload the nine
# words from the scratch area into r1-r9, subtract the modulus, add it
# back under the mask left in r9 by the borrow, store eight words to
# $rptr, then restore q4-q5 and r4-r9 and return.
1060e1051a39Sopenharmony_ci$code.=<<___;
1061e1051a39Sopenharmony_ci	vst1.32		{@AxB[7]#lo[0]},[$toutptr,:32]!
1062e1051a39Sopenharmony_ci	vst1.32		{$temp},[$toutptr]		@ upper 33 bits
1063e1051a39Sopenharmony_ci
1064e1051a39Sopenharmony_ci	ldr	r1,[sp,#0]
1065e1051a39Sopenharmony_ci	ldr	r2,[sp,#4]
1066e1051a39Sopenharmony_ci	ldr	r3,[sp,#8]
1067e1051a39Sopenharmony_ci	subs	r1,r1,#-1
1068e1051a39Sopenharmony_ci	ldr	r4,[sp,#12]
1069e1051a39Sopenharmony_ci	sbcs	r2,r2,#-1
1070e1051a39Sopenharmony_ci	ldr	r5,[sp,#16]
1071e1051a39Sopenharmony_ci	sbcs	r3,r3,#-1
1072e1051a39Sopenharmony_ci	ldr	r6,[sp,#20]
1073e1051a39Sopenharmony_ci	sbcs	r4,r4,#0
1074e1051a39Sopenharmony_ci	ldr	r7,[sp,#24]
1075e1051a39Sopenharmony_ci	sbcs	r5,r5,#0
1076e1051a39Sopenharmony_ci	ldr	r8,[sp,#28]
1077e1051a39Sopenharmony_ci	sbcs	r6,r6,#0
1078e1051a39Sopenharmony_ci	ldr	r9,[sp,#32]				@ top-most bit
1079e1051a39Sopenharmony_ci	sbcs	r7,r7,#1
1080e1051a39Sopenharmony_ci	sub	sp,ip,#40+16
1081e1051a39Sopenharmony_ci	sbcs	r8,r8,#-1
1082e1051a39Sopenharmony_ci	sbc	r9,r9,#0
1083e1051a39Sopenharmony_ci        vldmia  sp!,{q4-q5}
1084e1051a39Sopenharmony_ci
1085e1051a39Sopenharmony_ci	adds	r1,r1,r9
1086e1051a39Sopenharmony_ci	adcs	r2,r2,r9
1087e1051a39Sopenharmony_ci	str	r1,[$rptr,#0]
1088e1051a39Sopenharmony_ci	adcs	r3,r3,r9
1089e1051a39Sopenharmony_ci	str	r2,[$rptr,#4]
1090e1051a39Sopenharmony_ci	adcs	r4,r4,#0
1091e1051a39Sopenharmony_ci	str	r3,[$rptr,#8]
1092e1051a39Sopenharmony_ci	adcs	r5,r5,#0
1093e1051a39Sopenharmony_ci	str	r4,[$rptr,#12]
1094e1051a39Sopenharmony_ci	adcs	r6,r6,#0
1095e1051a39Sopenharmony_ci	str	r5,[$rptr,#16]
1096e1051a39Sopenharmony_ci	adcs	r7,r7,r9,lsr#31
1097e1051a39Sopenharmony_ci	str	r6,[$rptr,#20]
1098e1051a39Sopenharmony_ci	adcs	r8,r8,r9
1099e1051a39Sopenharmony_ci	str	r7,[$rptr,#24]
1100e1051a39Sopenharmony_ci	str	r8,[$rptr,#28]
1101e1051a39Sopenharmony_ci
1102e1051a39Sopenharmony_ci        ldmia   sp!,{r4-r9}
1103e1051a39Sopenharmony_ci	bx	lr
1104e1051a39Sopenharmony_ci.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
1105e1051a39Sopenharmony_ci#endif
1106e1051a39Sopenharmony_ci___
1107e1051a39Sopenharmony_ci}
1108e1051a39Sopenharmony_ci
1109e1051a39Sopenharmony_ci{{{
1110e1051a39Sopenharmony_ci########################################################################
1111e1051a39Sopenharmony_ci# Below $aN assignment matches order in which 256-bit result appears in
1112e1051a39Sopenharmony_ci# register bank at return from __ecp_nistz256_mul_mont, so that we can
1113e1051a39Sopenharmony_ci# skip over reloading it from memory. This means that below functions
1114e1051a39Sopenharmony_ci# use custom calling sequence accepting 256-bit input in registers,
1115e1051a39Sopenharmony_ci# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
1116e1051a39Sopenharmony_ci#
1117e1051a39Sopenharmony_ci# See their "normal" counterparts for insights on calculations.
1118e1051a39Sopenharmony_ci
1119e1051a39Sopenharmony_cimy ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
1120e1051a39Sopenharmony_ci    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
1121e1051a39Sopenharmony_cimy $ff=$b_ptr;
# Register map produced by the map() above:
#   $a0 = r11, $a1..$a7 = r3..r9   256-bit operand/result; $a0 is the
#                                  word stored at offset 0, where every
#                                  carry chain below starts
#   $t0 = r10, $t1 = r12, $t2 = r14, $t3 = r1   scratch
#   $ff = $b_ptr                   borrow mask (0 or -1)
#
# $t2 is r14, i.e. lr, which is why __ecp_nistz256_sub_from and
# __ecp_nistz256_sub_morf push lr on entry and pop it before returning.
# $ff shares a register with $b_ptr, so the pointer is overwritten once
# the last word has been loaded through it; __ecp_nistz256_add_self
# overwrites it as well.
#
# The three internal helpers emitted below:
#   __ecp_nistz256_sub_from   a[0:7] - [$b_ptr]
#   __ecp_nistz256_sub_morf   [$b_ptr] - a[0:7]
#   __ecp_nistz256_add_self   a[0:7] + a[0:7]
# Each one corrects its result by adding the modulus back when the
# subtraction borrowed: the modulus words are synthesized from $ff
# (-1,-1,-1,0,0,0,1,-1 from offset 0 upwards, the 1 being $ff>>31).
# The corrected value is stored to [$r_ptr] and also remains in
# $a0-$a7 on return.
1122e1051a39Sopenharmony_ci
1123e1051a39Sopenharmony_ci$code.=<<___;
1124e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_from,%function
1125e1051a39Sopenharmony_ci.align	5
1126e1051a39Sopenharmony_ci__ecp_nistz256_sub_from:
1127e1051a39Sopenharmony_ci	str	lr,[sp,#-4]!		@ push lr
1128e1051a39Sopenharmony_ci
1129e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#0]
1130e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#4]
1131e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#8]
1132e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#12]
1133e1051a39Sopenharmony_ci	subs	$a0,$a0,$t0
1134e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#16]
1135e1051a39Sopenharmony_ci	sbcs	$a1,$a1,$t1
1136e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#20]
1137e1051a39Sopenharmony_ci	sbcs	$a2,$a2,$t2
1138e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#24]
1139e1051a39Sopenharmony_ci	sbcs	$a3,$a3,$t3
1140e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#28]
1141e1051a39Sopenharmony_ci	sbcs	$a4,$a4,$t0
1142e1051a39Sopenharmony_ci	sbcs	$a5,$a5,$t1
1143e1051a39Sopenharmony_ci	sbcs	$a6,$a6,$t2
1144e1051a39Sopenharmony_ci	sbcs	$a7,$a7,$t3
1145e1051a39Sopenharmony_ci	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1146e1051a39Sopenharmony_ci	ldr	lr,[sp],#4		@ pop lr
1147e1051a39Sopenharmony_ci
1148e1051a39Sopenharmony_ci	adds	$a0,$a0,$ff		@ add synthesized modulus
1149e1051a39Sopenharmony_ci	adcs	$a1,$a1,$ff
1150e1051a39Sopenharmony_ci	str	$a0,[$r_ptr,#0]
1151e1051a39Sopenharmony_ci	adcs	$a2,$a2,$ff
1152e1051a39Sopenharmony_ci	str	$a1,[$r_ptr,#4]
1153e1051a39Sopenharmony_ci	adcs	$a3,$a3,#0
1154e1051a39Sopenharmony_ci	str	$a2,[$r_ptr,#8]
1155e1051a39Sopenharmony_ci	adcs	$a4,$a4,#0
1156e1051a39Sopenharmony_ci	str	$a3,[$r_ptr,#12]
1157e1051a39Sopenharmony_ci	adcs	$a5,$a5,#0
1158e1051a39Sopenharmony_ci	str	$a4,[$r_ptr,#16]
1159e1051a39Sopenharmony_ci	adcs	$a6,$a6,$ff,lsr#31
1160e1051a39Sopenharmony_ci	str	$a5,[$r_ptr,#20]
1161e1051a39Sopenharmony_ci	adcs	$a7,$a7,$ff
1162e1051a39Sopenharmony_ci	str	$a6,[$r_ptr,#24]
1163e1051a39Sopenharmony_ci	str	$a7,[$r_ptr,#28]
1164e1051a39Sopenharmony_ci
1165e1051a39Sopenharmony_ci	mov	pc,lr
1166e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1167e1051a39Sopenharmony_ci
1168e1051a39Sopenharmony_ci.type	__ecp_nistz256_sub_morf,%function
1169e1051a39Sopenharmony_ci.align	5
1170e1051a39Sopenharmony_ci__ecp_nistz256_sub_morf:
1171e1051a39Sopenharmony_ci	str	lr,[sp,#-4]!		@ push lr
1172e1051a39Sopenharmony_ci
1173e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#0]
1174e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#4]
1175e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#8]
1176e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#12]
1177e1051a39Sopenharmony_ci	subs	$a0,$t0,$a0
1178e1051a39Sopenharmony_ci	 ldr	$t0,[$b_ptr,#16]
1179e1051a39Sopenharmony_ci	sbcs	$a1,$t1,$a1
1180e1051a39Sopenharmony_ci	 ldr	$t1,[$b_ptr,#20]
1181e1051a39Sopenharmony_ci	sbcs	$a2,$t2,$a2
1182e1051a39Sopenharmony_ci	 ldr	$t2,[$b_ptr,#24]
1183e1051a39Sopenharmony_ci	sbcs	$a3,$t3,$a3
1184e1051a39Sopenharmony_ci	 ldr	$t3,[$b_ptr,#28]
1185e1051a39Sopenharmony_ci	sbcs	$a4,$t0,$a4
1186e1051a39Sopenharmony_ci	sbcs	$a5,$t1,$a5
1187e1051a39Sopenharmony_ci	sbcs	$a6,$t2,$a6
1188e1051a39Sopenharmony_ci	sbcs	$a7,$t3,$a7
1189e1051a39Sopenharmony_ci	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1190e1051a39Sopenharmony_ci	ldr	lr,[sp],#4		@ pop lr
1191e1051a39Sopenharmony_ci
1192e1051a39Sopenharmony_ci	adds	$a0,$a0,$ff		@ add synthesized modulus
1193e1051a39Sopenharmony_ci	adcs	$a1,$a1,$ff
1194e1051a39Sopenharmony_ci	str	$a0,[$r_ptr,#0]
1195e1051a39Sopenharmony_ci	adcs	$a2,$a2,$ff
1196e1051a39Sopenharmony_ci	str	$a1,[$r_ptr,#4]
1197e1051a39Sopenharmony_ci	adcs	$a3,$a3,#0
1198e1051a39Sopenharmony_ci	str	$a2,[$r_ptr,#8]
1199e1051a39Sopenharmony_ci	adcs	$a4,$a4,#0
1200e1051a39Sopenharmony_ci	str	$a3,[$r_ptr,#12]
1201e1051a39Sopenharmony_ci	adcs	$a5,$a5,#0
1202e1051a39Sopenharmony_ci	str	$a4,[$r_ptr,#16]
1203e1051a39Sopenharmony_ci	adcs	$a6,$a6,$ff,lsr#31
1204e1051a39Sopenharmony_ci	str	$a5,[$r_ptr,#20]
1205e1051a39Sopenharmony_ci	adcs	$a7,$a7,$ff
1206e1051a39Sopenharmony_ci	str	$a6,[$r_ptr,#24]
1207e1051a39Sopenharmony_ci	str	$a7,[$r_ptr,#28]
1208e1051a39Sopenharmony_ci
1209e1051a39Sopenharmony_ci	mov	pc,lr
1210e1051a39Sopenharmony_ci.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1211e1051a39Sopenharmony_ci
1212e1051a39Sopenharmony_ci.type	__ecp_nistz256_add_self,%function
1213e1051a39Sopenharmony_ci.align	4
1214e1051a39Sopenharmony_ci__ecp_nistz256_add_self:
1215e1051a39Sopenharmony_ci	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
1216e1051a39Sopenharmony_ci	adcs	$a1,$a1,$a1
1217e1051a39Sopenharmony_ci	adcs	$a2,$a2,$a2
1218e1051a39Sopenharmony_ci	adcs	$a3,$a3,$a3
1219e1051a39Sopenharmony_ci	adcs	$a4,$a4,$a4
1220e1051a39Sopenharmony_ci	adcs	$a5,$a5,$a5
1221e1051a39Sopenharmony_ci	adcs	$a6,$a6,$a6
1222e1051a39Sopenharmony_ci	mov	$ff,#0
1223e1051a39Sopenharmony_ci	adcs	$a7,$a7,$a7
1224e1051a39Sopenharmony_ci	adc	$ff,$ff,#0
1225e1051a39Sopenharmony_ci
1226e1051a39Sopenharmony_ci	@ if a+b >= modulus, subtract modulus.
1227e1051a39Sopenharmony_ci	@
1228e1051a39Sopenharmony_ci	@ But since comparison implies subtraction, we subtract
1229e1051a39Sopenharmony_ci	@ modulus and then add it back if subtraction borrowed.
1230e1051a39Sopenharmony_ci
1231e1051a39Sopenharmony_ci	subs	$a0,$a0,#-1
1232e1051a39Sopenharmony_ci	sbcs	$a1,$a1,#-1
1233e1051a39Sopenharmony_ci	sbcs	$a2,$a2,#-1
1234e1051a39Sopenharmony_ci	sbcs	$a3,$a3,#0
1235e1051a39Sopenharmony_ci	sbcs	$a4,$a4,#0
1236e1051a39Sopenharmony_ci	sbcs	$a5,$a5,#0
1237e1051a39Sopenharmony_ci	sbcs	$a6,$a6,#1
1238e1051a39Sopenharmony_ci	sbcs	$a7,$a7,#-1
1239e1051a39Sopenharmony_ci	sbc	$ff,$ff,#0
1240e1051a39Sopenharmony_ci
1241e1051a39Sopenharmony_ci	@ Note that because mod has special form, i.e. consists of
1242e1051a39Sopenharmony_ci	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1243e1051a39Sopenharmony_ci	@ using value of borrow as a whole or extracting single bit.
1244e1051a39Sopenharmony_ci	@ Follow $ff register...
1245e1051a39Sopenharmony_ci
1246e1051a39Sopenharmony_ci	adds	$a0,$a0,$ff		@ add synthesized modulus
1247e1051a39Sopenharmony_ci	adcs	$a1,$a1,$ff
1248e1051a39Sopenharmony_ci	str	$a0,[$r_ptr,#0]
1249e1051a39Sopenharmony_ci	adcs	$a2,$a2,$ff
1250e1051a39Sopenharmony_ci	str	$a1,[$r_ptr,#4]
1251e1051a39Sopenharmony_ci	adcs	$a3,$a3,#0
1252e1051a39Sopenharmony_ci	str	$a2,[$r_ptr,#8]
1253e1051a39Sopenharmony_ci	adcs	$a4,$a4,#0
1254e1051a39Sopenharmony_ci	str	$a3,[$r_ptr,#12]
1255e1051a39Sopenharmony_ci	adcs	$a5,$a5,#0
1256e1051a39Sopenharmony_ci	str	$a4,[$r_ptr,#16]
1257e1051a39Sopenharmony_ci	adcs	$a6,$a6,$ff,lsr#31
1258e1051a39Sopenharmony_ci	str	$a5,[$r_ptr,#20]
1259e1051a39Sopenharmony_ci	adcs	$a7,$a7,$ff
1260e1051a39Sopenharmony_ci	str	$a6,[$r_ptr,#24]
1261e1051a39Sopenharmony_ci	str	$a7,[$r_ptr,#28]
1262e1051a39Sopenharmony_ci
1263e1051a39Sopenharmony_ci	mov	pc,lr
1264e1051a39Sopenharmony_ci.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1265e1051a39Sopenharmony_ci
1266e1051a39Sopenharmony_ci___
1267e1051a39Sopenharmony_ci
1268e1051a39Sopenharmony_ci########################################################################
1269e1051a39Sopenharmony_ci# The following subroutines are "literal" implementations of those found in
1270e1051a39Sopenharmony_ci# ecp_nistz256.c
1271e1051a39Sopenharmony_ci#
1272e1051a39Sopenharmony_ci########################################################################
1273e1051a39Sopenharmony_ci# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1274e1051a39Sopenharmony_ci#
1275e1051a39Sopenharmony_ci{
1276e1051a39Sopenharmony_cimy ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1277e1051a39Sopenharmony_ci# The map() above yields the byte offsets of 5 temporary 256-bit
1278e1051a39Sopenharmony_ci# vectors (S, M, Zsqr, in_x, tmp0) at the top of the stack frame.
1279e1051a39Sopenharmony_ci# The prologue pushes registers starting from r0, so copies of the
1280e1051a39Sopenharmony_ci# input arguments sit right past these vectors, at sp+32*5 and up.
# The code reloads the saved r0 from sp+32*5 to address res_x/res_y/res_z
# and the saved r1 from sp+32*5+4 to address in_y/in_z.
# .Lpoint_double_shortcut is also the entry used by the .Ladd_double
# path of ecp_nistz256_point_add once it has shrunk its own frame.
1281e1051a39Sopenharmony_ci
1282e1051a39Sopenharmony_ci$code.=<<___;
1283e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_double
1284e1051a39Sopenharmony_ci.type	ecp_nistz256_point_double,%function
1285e1051a39Sopenharmony_ci.align	5
1286e1051a39Sopenharmony_ciecp_nistz256_point_double:
1287e1051a39Sopenharmony_ci	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1288e1051a39Sopenharmony_ci	sub	sp,sp,#32*5
1289e1051a39Sopenharmony_ci
1290e1051a39Sopenharmony_ci.Lpoint_double_shortcut:
1291e1051a39Sopenharmony_ci	add	r3,sp,#$in_x
1292e1051a39Sopenharmony_ci	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
1293e1051a39Sopenharmony_ci	stmia	r3,{r4-r11}
1294e1051a39Sopenharmony_ci
1295e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S
1296e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);
1297e1051a39Sopenharmony_ci
1298e1051a39Sopenharmony_ci	add	$b_ptr,$a_ptr,#32
1299e1051a39Sopenharmony_ci	add	$a_ptr,$a_ptr,#32
1300e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Zsqr
1301e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);
1302e1051a39Sopenharmony_ci
1303e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$S
1304e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S
1305e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S
1306e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);
1307e1051a39Sopenharmony_ci
1308e1051a39Sopenharmony_ci	ldr	$b_ptr,[sp,#32*5+4]
1309e1051a39Sopenharmony_ci	add	$a_ptr,$b_ptr,#32
1310e1051a39Sopenharmony_ci	add	$b_ptr,$b_ptr,#64
1311e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$tmp0
1312e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);
1313e1051a39Sopenharmony_ci
1314e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32*5]
1315e1051a39Sopenharmony_ci	add	$r_ptr,$r_ptr,#64
1316e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);
1317e1051a39Sopenharmony_ci
1318e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in_x
1319e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Zsqr
1320e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$M
1321e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);
1322e1051a39Sopenharmony_ci
1323e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in_x
1324e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Zsqr
1325e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Zsqr
1326e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);
1327e1051a39Sopenharmony_ci
1328e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$S
1329e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S
1330e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$tmp0
1331e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);
1332e1051a39Sopenharmony_ci
1333e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Zsqr
1334e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$M
1335e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$M
1336e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);
1337e1051a39Sopenharmony_ci
1338e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32*5]
1339e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$tmp0
1340e1051a39Sopenharmony_ci	add	$r_ptr,$r_ptr,#32
1341e1051a39Sopenharmony_ci	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);
1342e1051a39Sopenharmony_ci
1343e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$M
1344e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$M
1345e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);
1346e1051a39Sopenharmony_ci
1347e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in_x
1348e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S
1349e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S
1350e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);
1351e1051a39Sopenharmony_ci
1352e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$tmp0
1353e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);
1354e1051a39Sopenharmony_ci
1355e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32*5]
1356e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$M
1357e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$M
1358e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);
1359e1051a39Sopenharmony_ci
1360e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$tmp0
1361e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);
1362e1051a39Sopenharmony_ci
1363e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S
1364e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S
1365e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);
1366e1051a39Sopenharmony_ci
1367e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$M
1368e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S
1369e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);
1370e1051a39Sopenharmony_ci
1371e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32*5]
1372e1051a39Sopenharmony_ci	add	$b_ptr,$r_ptr,#32
1373e1051a39Sopenharmony_ci	add	$r_ptr,$r_ptr,#32
1374e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);
1375e1051a39Sopenharmony_ci
1376e1051a39Sopenharmony_ci	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
1377e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
1378e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
1379e1051a39Sopenharmony_ci#else
1380e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
1381e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
1382e1051a39Sopenharmony_ci#endif
1383e1051a39Sopenharmony_ci.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
1384e1051a39Sopenharmony_ci___
1385e1051a39Sopenharmony_ci}
1386e1051a39Sopenharmony_ci
1387e1051a39Sopenharmony_ci########################################################################
1388e1051a39Sopenharmony_ci# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1389e1051a39Sopenharmony_ci#			      const P256_POINT *in2);
1390e1051a39Sopenharmony_ci{
1391e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
1392e1051a39Sopenharmony_ci    $in1_x,$in1_y,$in1_z,
1393e1051a39Sopenharmony_ci    $in2_x,$in2_y,$in2_z,
1394e1051a39Sopenharmony_ci    $H,$Hsqr,$R,$Rsqr,$Hcub,
1395e1051a39Sopenharmony_ci    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1396e1051a39Sopenharmony_cimy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1397e1051a39Sopenharmony_ci# The map() above yields the byte offsets of 18 temporary 256-bit
1398e1051a39Sopenharmony_ci# vectors at the top of the stack frame; Z1sqr and Z2sqr share the
1399e1051a39Sopenharmony_ci# Hsqr and Rsqr slots. The frame reserves 16 more bytes past the
1400e1051a39Sopenharmony_ci# vectors: ~in1infty is kept at sp+32*18+4, ~in2infty at +8 and
1401e1051a39Sopenharmony_ci# the "is R zero" word at +12. Since the prologue pushes from r0,
1402e1051a39Sopenharmony_ci# the saved r0 is at sp+32*18+16 and the saved r1 at sp+32*18+20.
# If H and R are both zero and neither input is infinity, .Ladd_double
# reloads the saved r1, shrinks the frame to the point_double layout
# and branches to .Lpoint_double_shortcut.
# The closing Perl loop emits 12 copies (96 bytes, 8 per copy) of a
# branch-free mask-select that stores to the output the computed result
# when neither input is infinity, in2 when only in1 is infinity, or in1
# when in2 is infinity.
1403e1051a39Sopenharmony_ci
1404e1051a39Sopenharmony_ci$code.=<<___;
1405e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add
1406e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add,%function
1407e1051a39Sopenharmony_ci.align	5
1408e1051a39Sopenharmony_ciecp_nistz256_point_add:
1409e1051a39Sopenharmony_ci	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1410e1051a39Sopenharmony_ci	sub	sp,sp,#32*18+16
1411e1051a39Sopenharmony_ci
1412e1051a39Sopenharmony_ci	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1413e1051a39Sopenharmony_ci	add	r3,sp,#$in2_x
1414e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1415e1051a39Sopenharmony_ci	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1416e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1417e1051a39Sopenharmony_ci	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
1418e1051a39Sopenharmony_ci	orr	r12,r4,r5
1419e1051a39Sopenharmony_ci	orr	r12,r12,r6
1420e1051a39Sopenharmony_ci	orr	r12,r12,r7
1421e1051a39Sopenharmony_ci	orr	r12,r12,r8
1422e1051a39Sopenharmony_ci	orr	r12,r12,r9
1423e1051a39Sopenharmony_ci	orr	r12,r12,r10
1424e1051a39Sopenharmony_ci	orr	r12,r12,r11
1425e1051a39Sopenharmony_ci	cmp	r12,#0
1426e1051a39Sopenharmony_ci#ifdef	__thumb2__
1427e1051a39Sopenharmony_ci	it	ne
1428e1051a39Sopenharmony_ci#endif
1429e1051a39Sopenharmony_ci	movne	r12,#-1
1430e1051a39Sopenharmony_ci	stmia	r3,{r4-r11}
1431e1051a39Sopenharmony_ci	str	r12,[sp,#32*18+8]	@ ~in2infty
1432e1051a39Sopenharmony_ci
1433e1051a39Sopenharmony_ci	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1434e1051a39Sopenharmony_ci	add	r3,sp,#$in1_x
1435e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1436e1051a39Sopenharmony_ci	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1437e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1438e1051a39Sopenharmony_ci	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1439e1051a39Sopenharmony_ci	orr	r12,r4,r5
1440e1051a39Sopenharmony_ci	orr	r12,r12,r6
1441e1051a39Sopenharmony_ci	orr	r12,r12,r7
1442e1051a39Sopenharmony_ci	orr	r12,r12,r8
1443e1051a39Sopenharmony_ci	orr	r12,r12,r9
1444e1051a39Sopenharmony_ci	orr	r12,r12,r10
1445e1051a39Sopenharmony_ci	orr	r12,r12,r11
1446e1051a39Sopenharmony_ci	cmp	r12,#0
1447e1051a39Sopenharmony_ci#ifdef	__thumb2__
1448e1051a39Sopenharmony_ci	it	ne
1449e1051a39Sopenharmony_ci#endif
1450e1051a39Sopenharmony_ci	movne	r12,#-1
1451e1051a39Sopenharmony_ci	stmia	r3,{r4-r11}
1452e1051a39Sopenharmony_ci	str	r12,[sp,#32*18+4]	@ ~in1infty
1453e1051a39Sopenharmony_ci
1454e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in2_z
1455e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in2_z
1456e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Z2sqr
1457e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);
1458e1051a39Sopenharmony_ci
1459e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in1_z
1460e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_z
1461e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Z1sqr
1462e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1463e1051a39Sopenharmony_ci
1464e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in2_z
1465e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Z2sqr
1466e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S1
1467e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);
1468e1051a39Sopenharmony_ci
1469e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in1_z
1470e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Z1sqr
1471e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S2
1472e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1473e1051a39Sopenharmony_ci
1474e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in1_y
1475e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S1
1476e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S1
1477e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);
1478e1051a39Sopenharmony_ci
1479e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in2_y
1480e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S2
1481e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S2
1482e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1483e1051a39Sopenharmony_ci
1484e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S1
1485e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$R
1486e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);
1487e1051a39Sopenharmony_ci
1488e1051a39Sopenharmony_ci	orr	$a0,$a0,$a1		@ see if result is zero
1489e1051a39Sopenharmony_ci	orr	$a2,$a2,$a3
1490e1051a39Sopenharmony_ci	orr	$a4,$a4,$a5
1491e1051a39Sopenharmony_ci	orr	$a0,$a0,$a2
1492e1051a39Sopenharmony_ci	orr	$a4,$a4,$a6
1493e1051a39Sopenharmony_ci	orr	$a0,$a0,$a7
1494e1051a39Sopenharmony_ci	 add	$a_ptr,sp,#$in1_x
1495e1051a39Sopenharmony_ci	orr	$a0,$a0,$a4
1496e1051a39Sopenharmony_ci	 add	$b_ptr,sp,#$Z2sqr
1497e1051a39Sopenharmony_ci	str	$a0,[sp,#32*18+12]
1498e1051a39Sopenharmony_ci
1499e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$U1
1500e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);
1501e1051a39Sopenharmony_ci
1502e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in2_x
1503e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Z1sqr
1504e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$U2
1505e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);
1506e1051a39Sopenharmony_ci
1507e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$U1
1508e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$H
1509e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);
1510e1051a39Sopenharmony_ci
1511e1051a39Sopenharmony_ci	orr	$a0,$a0,$a1		@ see if result is zero
1512e1051a39Sopenharmony_ci	orr	$a2,$a2,$a3
1513e1051a39Sopenharmony_ci	orr	$a4,$a4,$a5
1514e1051a39Sopenharmony_ci	orr	$a0,$a0,$a2
1515e1051a39Sopenharmony_ci	orr	$a4,$a4,$a6
1516e1051a39Sopenharmony_ci	orr	$a0,$a0,$a7
1517e1051a39Sopenharmony_ci	orr	$a0,$a0,$a4		@ ~is_equal(U1,U2)
1518e1051a39Sopenharmony_ci
1519e1051a39Sopenharmony_ci	ldr	$t0,[sp,#32*18+4]	@ ~in1infty
1520e1051a39Sopenharmony_ci	ldr	$t1,[sp,#32*18+8]	@ ~in2infty
1521e1051a39Sopenharmony_ci	ldr	$t2,[sp,#32*18+12]	@ ~is_equal(S1,S2)
1522e1051a39Sopenharmony_ci	mvn	$t0,$t0			@ -1/0 -> 0/-1
1523e1051a39Sopenharmony_ci	mvn	$t1,$t1			@ -1/0 -> 0/-1
1524e1051a39Sopenharmony_ci	orr	$a0,$a0,$t0
1525e1051a39Sopenharmony_ci	orr	$a0,$a0,$t1
1526e1051a39Sopenharmony_ci	orrs	$a0,$a0,$t2		@ set flags
1527e1051a39Sopenharmony_ci
1528e1051a39Sopenharmony_ci	@ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
1529e1051a39Sopenharmony_ci	bne	.Ladd_proceed
1530e1051a39Sopenharmony_ci
1531e1051a39Sopenharmony_ci.Ladd_double:
1532e1051a39Sopenharmony_ci	ldr	$a_ptr,[sp,#32*18+20]
1533e1051a39Sopenharmony_ci	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
1534e1051a39Sopenharmony_ci	b	.Lpoint_double_shortcut
1535e1051a39Sopenharmony_ci
1536e1051a39Sopenharmony_ci.align	4
1537e1051a39Sopenharmony_ci.Ladd_proceed:
1538e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$R
1539e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$R
1540e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Rsqr
1541e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1542e1051a39Sopenharmony_ci
1543e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$H
1544e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_z
1545e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_z
1546e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1547e1051a39Sopenharmony_ci
1548e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$H
1549e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$H
1550e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Hsqr
1551e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1552e1051a39Sopenharmony_ci
1553e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in2_z
1554e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$res_z
1555e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_z
1556e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);
1557e1051a39Sopenharmony_ci
1558e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$H
1559e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Hsqr
1560e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Hcub
1561e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1562e1051a39Sopenharmony_ci
1563e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Hsqr
1564e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$U1
1565e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$U2
1566e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);
1567e1051a39Sopenharmony_ci
1568e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Hsqr
1569e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1570e1051a39Sopenharmony_ci
1571e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Rsqr
1572e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_x
1573e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1574e1051a39Sopenharmony_ci
1575e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Hcub
1576e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1577e1051a39Sopenharmony_ci
1578e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$U2
1579e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_y
1580e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1581e1051a39Sopenharmony_ci
1582e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Hcub
1583e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S1
1584e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S2
1585e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);
1586e1051a39Sopenharmony_ci
1587e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$R
1588e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$res_y
1589e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_y
1590e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1591e1051a39Sopenharmony_ci
1592e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S2
1593e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1594e1051a39Sopenharmony_ci
1595e1051a39Sopenharmony_ci	ldr	r11,[sp,#32*18+4]	@ ~in1infty
1596e1051a39Sopenharmony_ci	ldr	r12,[sp,#32*18+8]	@ ~in2infty
1597e1051a39Sopenharmony_ci	add	r1,sp,#$res_x
1598e1051a39Sopenharmony_ci	add	r2,sp,#$in2_x
1599e1051a39Sopenharmony_ci	and	r10,r11,r12		@ ~in1infty & ~in2infty
1600e1051a39Sopenharmony_ci	mvn	r11,r11
1601e1051a39Sopenharmony_ci	add	r3,sp,#$in1_x
1602e1051a39Sopenharmony_ci	and	r11,r11,r12		@ in1infty & ~in2infty
1603e1051a39Sopenharmony_ci	mvn	r12,r12			@ in2infty
1604e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32*18+16]
1605e1051a39Sopenharmony_ci___
1606e1051a39Sopenharmony_cifor($i=0;$i<96;$i+=8) {			# conditional moves
1607e1051a39Sopenharmony_ci$code.=<<___;
1608e1051a39Sopenharmony_ci	ldmia	r1!,{r4-r5}		@ res_x
1609e1051a39Sopenharmony_ci	ldmia	r2!,{r6-r7}		@ in2_x
1610e1051a39Sopenharmony_ci	ldmia	r3!,{r8-r9}		@ in1_x
1611e1051a39Sopenharmony_ci	and	r4,r4,r10		@ ~in1infty & ~in2infty
1612e1051a39Sopenharmony_ci	and	r5,r5,r10
1613e1051a39Sopenharmony_ci	and	r6,r6,r11		@ in1infty & ~in2infty
1614e1051a39Sopenharmony_ci	and	r7,r7,r11
1615e1051a39Sopenharmony_ci	and	r8,r8,r12		@ in2infty
1616e1051a39Sopenharmony_ci	and	r9,r9,r12
1617e1051a39Sopenharmony_ci	orr	r4,r4,r6
1618e1051a39Sopenharmony_ci	orr	r5,r5,r7
1619e1051a39Sopenharmony_ci	orr	r4,r4,r8
1620e1051a39Sopenharmony_ci	orr	r5,r5,r9
1621e1051a39Sopenharmony_ci	stmia	$r_ptr!,{r4-r5}
1622e1051a39Sopenharmony_ci___
1623e1051a39Sopenharmony_ci}
1624e1051a39Sopenharmony_ci$code.=<<___;
1625e1051a39Sopenharmony_ci.Ladd_done:
1626e1051a39Sopenharmony_ci	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
1627e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
1628e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
1629e1051a39Sopenharmony_ci#else
1630e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
1631e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
1632e1051a39Sopenharmony_ci#endif
1633e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1634e1051a39Sopenharmony_ci___
1635e1051a39Sopenharmony_ci}
1636e1051a39Sopenharmony_ci
1637e1051a39Sopenharmony_ci########################################################################
1638e1051a39Sopenharmony_ci# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1639e1051a39Sopenharmony_ci#				     const P256_POINT_AFFINE *in2);
1640e1051a39Sopenharmony_ci{
1641e1051a39Sopenharmony_cimy ($res_x,$res_y,$res_z,
1642e1051a39Sopenharmony_ci    $in1_x,$in1_y,$in1_z,
1643e1051a39Sopenharmony_ci    $in2_x,$in2_y,
1644e1051a39Sopenharmony_ci    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1645e1051a39Sopenharmony_cimy $Z1sqr = $S2;
1646e1051a39Sopenharmony_ci# The map() above yields the byte offsets of 15 temporary 256-bit
1647e1051a39Sopenharmony_ci# vectors at the top of the stack frame; Z1sqr shares the S2 slot.
1648e1051a39Sopenharmony_ci# The prologue pushes registers starting from r0, so copies of the
1649e1051a39Sopenharmony_ci# input arguments start at sp+32*15, right past these vectors.
1650e1051a39Sopenharmony_ci# The slots at +4 and +8 are reused for ~in1infty and ~in2infty.
# in2 is affine (x and y only); it is treated as infinity when all of
# its x and y words are zero.
# The first closing Perl loop emits 8 copies of the branch-free
# mask-select for the x and y coordinates; the second emits 4 copies for
# z, where @ONE_mont words stand in for the z coordinate in2 lacks.
1651e1051a39Sopenharmony_ci
# 2^256 mod p as eight 32-bit words, least significant first
# (p being the modulus subtracted in __ecp_nistz256_add_self).
1652e1051a39Sopenharmony_cimy @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1653e1051a39Sopenharmony_ci
1654e1051a39Sopenharmony_ci$code.=<<___;
1655e1051a39Sopenharmony_ci.globl	ecp_nistz256_point_add_affine
1656e1051a39Sopenharmony_ci.type	ecp_nistz256_point_add_affine,%function
1657e1051a39Sopenharmony_ci.align	5
1658e1051a39Sopenharmony_ciecp_nistz256_point_add_affine:
1659e1051a39Sopenharmony_ci	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1660e1051a39Sopenharmony_ci	sub	sp,sp,#32*15
1661e1051a39Sopenharmony_ci
1662e1051a39Sopenharmony_ci	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1663e1051a39Sopenharmony_ci	add	r3,sp,#$in1_x
1664e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1665e1051a39Sopenharmony_ci	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1666e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1667e1051a39Sopenharmony_ci	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1668e1051a39Sopenharmony_ci	orr	r12,r4,r5
1669e1051a39Sopenharmony_ci	orr	r12,r12,r6
1670e1051a39Sopenharmony_ci	orr	r12,r12,r7
1671e1051a39Sopenharmony_ci	orr	r12,r12,r8
1672e1051a39Sopenharmony_ci	orr	r12,r12,r9
1673e1051a39Sopenharmony_ci	orr	r12,r12,r10
1674e1051a39Sopenharmony_ci	orr	r12,r12,r11
1675e1051a39Sopenharmony_ci	cmp	r12,#0
1676e1051a39Sopenharmony_ci#ifdef	__thumb2__
1677e1051a39Sopenharmony_ci	it	ne
1678e1051a39Sopenharmony_ci#endif
1679e1051a39Sopenharmony_ci	movne	r12,#-1
1680e1051a39Sopenharmony_ci	stmia	r3,{r4-r11}
1681e1051a39Sopenharmony_ci	str	r12,[sp,#32*15+4]	@ ~in1infty
1682e1051a39Sopenharmony_ci
1683e1051a39Sopenharmony_ci	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1684e1051a39Sopenharmony_ci	add	r3,sp,#$in2_x
1685e1051a39Sopenharmony_ci	orr	r12,r4,r5
1686e1051a39Sopenharmony_ci	orr	r12,r12,r6
1687e1051a39Sopenharmony_ci	orr	r12,r12,r7
1688e1051a39Sopenharmony_ci	orr	r12,r12,r8
1689e1051a39Sopenharmony_ci	orr	r12,r12,r9
1690e1051a39Sopenharmony_ci	orr	r12,r12,r10
1691e1051a39Sopenharmony_ci	orr	r12,r12,r11
1692e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1693e1051a39Sopenharmony_ci	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1694e1051a39Sopenharmony_ci	orr	r12,r12,r4
1695e1051a39Sopenharmony_ci	orr	r12,r12,r5
1696e1051a39Sopenharmony_ci	orr	r12,r12,r6
1697e1051a39Sopenharmony_ci	orr	r12,r12,r7
1698e1051a39Sopenharmony_ci	orr	r12,r12,r8
1699e1051a39Sopenharmony_ci	orr	r12,r12,r9
1700e1051a39Sopenharmony_ci	orr	r12,r12,r10
1701e1051a39Sopenharmony_ci	orr	r12,r12,r11
1702e1051a39Sopenharmony_ci	stmia	r3!,{r4-r11}
1703e1051a39Sopenharmony_ci	cmp	r12,#0
1704e1051a39Sopenharmony_ci#ifdef	__thumb2__
1705e1051a39Sopenharmony_ci	it	ne
1706e1051a39Sopenharmony_ci#endif
1707e1051a39Sopenharmony_ci	movne	r12,#-1
1708e1051a39Sopenharmony_ci	str	r12,[sp,#32*15+8]	@ ~in2infty
1709e1051a39Sopenharmony_ci
1710e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in1_z
1711e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_z
1712e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Z1sqr
1713e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1714e1051a39Sopenharmony_ci
1715e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Z1sqr
1716e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in2_x
1717e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$U2
1718e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);
1719e1051a39Sopenharmony_ci
1720e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_x
1721e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$H
1722e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);
1723e1051a39Sopenharmony_ci
1724e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Z1sqr
1725e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_z
1726e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S2
1727e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1728e1051a39Sopenharmony_ci
1729e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$H
1730e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_z
1731e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_z
1732e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1733e1051a39Sopenharmony_ci
1734e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$in2_y
1735e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S2
1736e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S2
1737e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1738e1051a39Sopenharmony_ci
1739e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_y
1740e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$R
1741e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);
1742e1051a39Sopenharmony_ci
1743e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$H
1744e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$H
1745e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Hsqr
1746e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1747e1051a39Sopenharmony_ci
1748e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$R
1749e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$R
1750e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Rsqr
1751e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1752e1051a39Sopenharmony_ci
1753e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$H
1754e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Hsqr
1755e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Hcub
1756e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1757e1051a39Sopenharmony_ci
1758e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Hsqr
1759e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_x
1760e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$U2
1761e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);
1762e1051a39Sopenharmony_ci
1763e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$Hsqr
1764e1051a39Sopenharmony_ci	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1765e1051a39Sopenharmony_ci
1766e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Rsqr
1767e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_x
1768e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1769e1051a39Sopenharmony_ci
1770e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$Hcub
1771e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1772e1051a39Sopenharmony_ci
1773e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$U2
1774e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_y
1775e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1776e1051a39Sopenharmony_ci
1777e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$Hcub
1778e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$in1_y
1779e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$S2
1780e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);
1781e1051a39Sopenharmony_ci
1782e1051a39Sopenharmony_ci	add	$a_ptr,sp,#$R
1783e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$res_y
1784e1051a39Sopenharmony_ci	add	$r_ptr,sp,#$res_y
1785e1051a39Sopenharmony_ci	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1786e1051a39Sopenharmony_ci
1787e1051a39Sopenharmony_ci	add	$b_ptr,sp,#$S2
1788e1051a39Sopenharmony_ci	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1789e1051a39Sopenharmony_ci
1790e1051a39Sopenharmony_ci	ldr	r11,[sp,#32*15+4]	@ ~in1infty
1791e1051a39Sopenharmony_ci	ldr	r12,[sp,#32*15+8]	@ ~in2infty
1792e1051a39Sopenharmony_ci	add	r1,sp,#$res_x
1793e1051a39Sopenharmony_ci	add	r2,sp,#$in2_x
1794e1051a39Sopenharmony_ci	and	r10,r11,r12		@ ~in1infty & ~in2infty
1795e1051a39Sopenharmony_ci	mvn	r11,r11
1796e1051a39Sopenharmony_ci	add	r3,sp,#$in1_x
1797e1051a39Sopenharmony_ci	and	r11,r11,r12		@ in1infty & ~in2infty
1798e1051a39Sopenharmony_ci	mvn	r12,r12			@ in2infty
1799e1051a39Sopenharmony_ci	ldr	$r_ptr,[sp,#32*15]
1800e1051a39Sopenharmony_ci___
1801e1051a39Sopenharmony_cifor($i=0;$i<64;$i+=8) {			# conditional moves
1802e1051a39Sopenharmony_ci$code.=<<___;
1803e1051a39Sopenharmony_ci	ldmia	r1!,{r4-r5}		@ res_x
1804e1051a39Sopenharmony_ci	ldmia	r2!,{r6-r7}		@ in2_x
1805e1051a39Sopenharmony_ci	ldmia	r3!,{r8-r9}		@ in1_x
1806e1051a39Sopenharmony_ci	and	r4,r4,r10		@ ~in1infty & ~in2infty
1807e1051a39Sopenharmony_ci	and	r5,r5,r10
1808e1051a39Sopenharmony_ci	and	r6,r6,r11		@ in1infty & ~in2infty
1809e1051a39Sopenharmony_ci	and	r7,r7,r11
1810e1051a39Sopenharmony_ci	and	r8,r8,r12		@ in2infty
1811e1051a39Sopenharmony_ci	and	r9,r9,r12
1812e1051a39Sopenharmony_ci	orr	r4,r4,r6
1813e1051a39Sopenharmony_ci	orr	r5,r5,r7
1814e1051a39Sopenharmony_ci	orr	r4,r4,r8
1815e1051a39Sopenharmony_ci	orr	r5,r5,r9
1816e1051a39Sopenharmony_ci	stmia	$r_ptr!,{r4-r5}
1817e1051a39Sopenharmony_ci___
1818e1051a39Sopenharmony_ci}
1819e1051a39Sopenharmony_cifor(;$i<96;$i+=8) {
1820e1051a39Sopenharmony_cimy $j=($i-64)/4;
1821e1051a39Sopenharmony_ci$code.=<<___;
1822e1051a39Sopenharmony_ci	ldmia	r1!,{r4-r5}		@ res_z
1823e1051a39Sopenharmony_ci	ldmia	r3!,{r8-r9}		@ in1_z
1824e1051a39Sopenharmony_ci	and	r4,r4,r10
1825e1051a39Sopenharmony_ci	and	r5,r5,r10
1826e1051a39Sopenharmony_ci	and	r6,r11,#@ONE_mont[$j]
1827e1051a39Sopenharmony_ci	and	r7,r11,#@ONE_mont[$j+1]
1828e1051a39Sopenharmony_ci	and	r8,r8,r12
1829e1051a39Sopenharmony_ci	and	r9,r9,r12
1830e1051a39Sopenharmony_ci	orr	r4,r4,r6
1831e1051a39Sopenharmony_ci	orr	r5,r5,r7
1832e1051a39Sopenharmony_ci	orr	r4,r4,r8
1833e1051a39Sopenharmony_ci	orr	r5,r5,r9
1834e1051a39Sopenharmony_ci	stmia	$r_ptr!,{r4-r5}
1835e1051a39Sopenharmony_ci___
1836e1051a39Sopenharmony_ci}
1837e1051a39Sopenharmony_ci$code.=<<___;
1838e1051a39Sopenharmony_ci	add	sp,sp,#32*15+16		@ +16 means "skip even over saved r0-r3"
1839e1051a39Sopenharmony_ci#if __ARM_ARCH__>=5 || !defined(__thumb__)
1840e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,pc}
1841e1051a39Sopenharmony_ci#else
1842e1051a39Sopenharmony_ci	ldmia	sp!,{r4-r12,lr}
1843e1051a39Sopenharmony_ci	bx	lr			@ interoperable with Thumb ISA:-)
1844e1051a39Sopenharmony_ci#endif
1845e1051a39Sopenharmony_ci.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1846e1051a39Sopenharmony_ci___
1847e1051a39Sopenharmony_ci}					}}}
1848e1051a39Sopenharmony_ci
# Post-process the accumulated assembly text line by line and print it:
#  - text between backticks is evaluated as Perl and replaced by the
#    value of that expression;
#  - a "qN#lo" / "qN#hi" token is rewritten to "d<2N>" / "d<2N+1>".
# STDOUT is closed explicitly so that a failed flush aborts the script.
1849e1051a39Sopenharmony_ciforeach (split("\n",$code)) {
1850e1051a39Sopenharmony_ci	s/\`([^\`]*)\`/eval $1/geo;
1851e1051a39Sopenharmony_ci
1852e1051a39Sopenharmony_ci	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1853e1051a39Sopenharmony_ci
1854e1051a39Sopenharmony_ci	print $_,"\n";
1855e1051a39Sopenharmony_ci}
1856e1051a39Sopenharmony_ciclose STDOUT or die "error closing STDOUT: $!";	# enforce flush
1857