1#! /usr/bin/env perl
2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for ARMv8.
18#
19# February 2015.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816.
23#
24#			with/without -DECP_NISTZ256_ASM
25# Apple A7		+190-360%
26# Cortex-A53		+190-400%
27# Cortex-A57		+190-350%
28# Denver		+230-400%
29#
30# Ranges denote minimum and maximum improvement coefficients depending
31# on benchmark. Lower coefficients are for ECDSA sign, server-side
32# operation. Keep in mind that +400% means 5x improvement.
33
34# $output is the last argument if it looks like a file (it has an extension)
35# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator next to this script, or in the
# canonical perlasm directory of an OpenSSL source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through arm-xlate.pl (run with the same perl
# binary, $^X), which adapts the perlasm below to the requested
# $flavour/$output; aliasing STDOUT to the pipe lets the final
# "print $code" reach it transparently.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
47
48{
49my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
50    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
51    map("x$_",(0..17,19,20));
52
53my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont
54
55$code.=<<___;
56#include "arm_arch.h"
57
58.rodata
59___
60########################################################################
61# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
62#
# Locate ecp_nistz256_table.c either in the current directory or one
# level up from this script, and parse its TOBN(hi,lo) constants into
# @arr, least significant 64-bit word first.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
my $table;
open $table, '<', "ecp_nistz256_table.c"		or
open $table, '<', "${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

while (my $line = <$table>) {
	# The /e substitution pushes hex($lo),hex($hi) for every TOBN()
	# encountered; the modified line itself is discarded.
	$line =~ s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close $table or die "close ecp_nistz256_table.c: $!";

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
79
# Emit the header for the precomputed-multiples table; .align 12 makes
# it page-aligned (2^12 bytes).  The raw .byte lines follow below.
$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
# 37 windows, each holding 64 points of 16 64-bit words (512 bytes per
# point).  Transpose each window byte-wise so that the gather routine
# can fetch one point with 64-byte-strided single-byte loads.
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {		# byte position within a point
		my @line;
		for($j=0;$j<64;$j++) {	# point index within the window
			# $i/4 is integer division thanks to "use integer"
			push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
103$code.=<<___;
104.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
105.align	5
106.Lpoly:
107.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
108.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
109.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
110.Lone_mont:
111.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
112.Lone:
113.quad	1,0,0,0
114.Lord:
115.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
116.LordK:
117.quad	0xccd1c8aaee00bc4f
118.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
119
120.text
121
122// void	ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
123.globl	ecp_nistz256_to_mont
124.type	ecp_nistz256_to_mont,%function
125.align	6
126ecp_nistz256_to_mont:
127	.inst	0xd503233f		// paciasp
128	stp	x29,x30,[sp,#-32]!
129	add	x29,sp,#0
130	stp	x19,x20,[sp,#16]
131
132	adrp	$bi,.LRR
133	ldr	$bi,[$bi,:lo12:.LRR]	// bp[0]
134	ldp	$a0,$a1,[$ap]
135	ldp	$a2,$a3,[$ap,#16]
136	adrp	$poly3,.Lpoly
137	add	$poly3,$poly3,:lo12:.Lpoly
138	ldr	$poly1,[$poly3,#8]
139	ldr	$poly3,[$poly3,#24]
140	adrp	$bp,.LRR		// &bp[0]
141	add	$bp,$bp,:lo12:.LRR
142
143	bl	__ecp_nistz256_mul_mont
144
145	ldp	x19,x20,[sp,#16]
146	ldp	x29,x30,[sp],#32
147	.inst	0xd50323bf		// autiasp
148	ret
149.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
150
151// void	ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
152.globl	ecp_nistz256_from_mont
153.type	ecp_nistz256_from_mont,%function
154.align	4
155ecp_nistz256_from_mont:
156	.inst	0xd503233f		// paciasp
157	stp	x29,x30,[sp,#-32]!
158	add	x29,sp,#0
159	stp	x19,x20,[sp,#16]
160
161	mov	$bi,#1			// bp[0]
162	ldp	$a0,$a1,[$ap]
163	ldp	$a2,$a3,[$ap,#16]
164	adrp	$poly3,.Lpoly
165	add	$poly3,$poly3,:lo12:.Lpoly
166	ldr	$poly1,[$poly3,#8]
167	ldr	$poly3,[$poly3,#24]
168	adrp	$bp,.Lone		// &bp[0]
169	add	$bp,$bp,:lo12:.Lone
170
171	bl	__ecp_nistz256_mul_mont
172
173	ldp	x19,x20,[sp,#16]
174	ldp	x29,x30,[sp],#32
175	.inst	0xd50323bf		// autiasp
176	ret
177.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
178
179// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
180//					     const BN_ULONG x2[4]);
181.globl	ecp_nistz256_mul_mont
182.type	ecp_nistz256_mul_mont,%function
183.align	4
184ecp_nistz256_mul_mont:
185	.inst	0xd503233f		// paciasp
186	stp	x29,x30,[sp,#-32]!
187	add	x29,sp,#0
188	stp	x19,x20,[sp,#16]
189
190	ldr	$bi,[$bp]		// bp[0]
191	ldp	$a0,$a1,[$ap]
192	ldp	$a2,$a3,[$ap,#16]
193	adrp	$poly3,.Lpoly
194	add	$poly3,$poly3,:lo12:.Lpoly
195	ldr	$poly1,[$poly3,#8]
196	ldr	$poly3,[$poly3,#24]
197
198	bl	__ecp_nistz256_mul_mont
199
200	ldp	x19,x20,[sp,#16]
201	ldp	x29,x30,[sp],#32
202	.inst	0xd50323bf		// autiasp
203	ret
204.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
205
206// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
207.globl	ecp_nistz256_sqr_mont
208.type	ecp_nistz256_sqr_mont,%function
209.align	4
210ecp_nistz256_sqr_mont:
211	.inst	0xd503233f		// paciasp
212	stp	x29,x30,[sp,#-32]!
213	add	x29,sp,#0
214	stp	x19,x20,[sp,#16]
215
216	ldp	$a0,$a1,[$ap]
217	ldp	$a2,$a3,[$ap,#16]
218	adrp	$poly3,.Lpoly
219	add	$poly3,$poly3,:lo12:.Lpoly
220	ldr	$poly1,[$poly3,#8]
221	ldr	$poly3,[$poly3,#24]
222
223	bl	__ecp_nistz256_sqr_mont
224
225	ldp	x19,x20,[sp,#16]
226	ldp	x29,x30,[sp],#32
227	.inst	0xd50323bf		// autiasp
228	ret
229.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
230
231// void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
232//					const BN_ULONG x2[4]);
233.globl	ecp_nistz256_add
234.type	ecp_nistz256_add,%function
235.align	4
236ecp_nistz256_add:
237	.inst	0xd503233f		// paciasp
238	stp	x29,x30,[sp,#-16]!
239	add	x29,sp,#0
240
241	ldp	$acc0,$acc1,[$ap]
242	ldp	$t0,$t1,[$bp]
243	ldp	$acc2,$acc3,[$ap,#16]
244	ldp	$t2,$t3,[$bp,#16]
245	adrp	$poly3,.Lpoly
246	add	$poly3,$poly3,:lo12:.Lpoly
247	ldr	$poly1,[$poly3,#8]
248	ldr	$poly3,[$poly3,#24]
249
250	bl	__ecp_nistz256_add
251
252	ldp	x29,x30,[sp],#16
253	.inst	0xd50323bf		// autiasp
254	ret
255.size	ecp_nistz256_add,.-ecp_nistz256_add
256
257// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
258.globl	ecp_nistz256_div_by_2
259.type	ecp_nistz256_div_by_2,%function
260.align	4
261ecp_nistz256_div_by_2:
262	.inst	0xd503233f		// paciasp
263	stp	x29,x30,[sp,#-16]!
264	add	x29,sp,#0
265
266	ldp	$acc0,$acc1,[$ap]
267	ldp	$acc2,$acc3,[$ap,#16]
268	adrp	$poly3,.Lpoly
269	add	$poly3,$poly3,:lo12:.Lpoly
270	ldr	$poly1,[$poly3,#8]
271	ldr	$poly3,[$poly3,#24]
272
273	bl	__ecp_nistz256_div_by_2
274
275	ldp	x29,x30,[sp],#16
276	.inst	0xd50323bf		//  autiasp
277	ret
278.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
279
280// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
281.globl	ecp_nistz256_mul_by_2
282.type	ecp_nistz256_mul_by_2,%function
283.align	4
284ecp_nistz256_mul_by_2:
285	.inst	0xd503233f		// paciasp
286	stp	x29,x30,[sp,#-16]!
287	add	x29,sp,#0
288
289	ldp	$acc0,$acc1,[$ap]
290	ldp	$acc2,$acc3,[$ap,#16]
291	adrp	$poly3,.Lpoly
292	add	$poly3,$poly3,:lo12:.Lpoly
293	ldr	$poly1,[$poly3,#8]
294	ldr	$poly3,[$poly3,#24]
295	mov	$t0,$acc0
296	mov	$t1,$acc1
297	mov	$t2,$acc2
298	mov	$t3,$acc3
299
300	bl	__ecp_nistz256_add	// ret = a+a	// 2*a
301
302	ldp	x29,x30,[sp],#16
303	.inst	0xd50323bf		// autiasp
304	ret
305.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
306
307// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
308.globl	ecp_nistz256_mul_by_3
309.type	ecp_nistz256_mul_by_3,%function
310.align	4
311ecp_nistz256_mul_by_3:
312	.inst	0xd503233f		// paciasp
313	stp	x29,x30,[sp,#-16]!
314	add	x29,sp,#0
315
316	ldp	$acc0,$acc1,[$ap]
317	ldp	$acc2,$acc3,[$ap,#16]
318	adrp	$poly3,.Lpoly
319	add	$poly3,$poly3,:lo12:.Lpoly
320	ldr	$poly1,[$poly3,#8]
321	ldr	$poly3,[$poly3,#24]
322	mov	$t0,$acc0
323	mov	$t1,$acc1
324	mov	$t2,$acc2
325	mov	$t3,$acc3
326	mov	$a0,$acc0
327	mov	$a1,$acc1
328	mov	$a2,$acc2
329	mov	$a3,$acc3
330
331	bl	__ecp_nistz256_add	// ret = a+a	// 2*a
332
333	mov	$t0,$a0
334	mov	$t1,$a1
335	mov	$t2,$a2
336	mov	$t3,$a3
337
338	bl	__ecp_nistz256_add	// ret += a	// 2*a+a=3*a
339
340	ldp	x29,x30,[sp],#16
341	.inst	0xd50323bf		// autiasp
342	ret
343.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
344
345// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
346//				        const BN_ULONG x2[4]);
347.globl	ecp_nistz256_sub
348.type	ecp_nistz256_sub,%function
349.align	4
350ecp_nistz256_sub:
351	.inst	0xd503233f		// paciasp
352	stp	x29,x30,[sp,#-16]!
353	add	x29,sp,#0
354
355	ldp	$acc0,$acc1,[$ap]
356	ldp	$acc2,$acc3,[$ap,#16]
357	adrp	$poly3,.Lpoly
358	add	$poly3,$poly3,:lo12:.Lpoly
359	ldr	$poly1,[$poly3,#8]
360	ldr	$poly3,[$poly3,#24]
361
362	bl	__ecp_nistz256_sub_from
363
364	ldp	x29,x30,[sp],#16
365	.inst	0xd50323bf		// autiasp
366	ret
367.size	ecp_nistz256_sub,.-ecp_nistz256_sub
368
369// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
370.globl	ecp_nistz256_neg
371.type	ecp_nistz256_neg,%function
372.align	4
373ecp_nistz256_neg:
374	.inst	0xd503233f		// paciasp
375	stp	x29,x30,[sp,#-16]!
376	add	x29,sp,#0
377
378	mov	$bp,$ap
379	mov	$acc0,xzr		// a = 0
380	mov	$acc1,xzr
381	mov	$acc2,xzr
382	mov	$acc3,xzr
383	adrp	$poly3,.Lpoly
384	add	$poly3,$poly3,:lo12:.Lpoly
385	ldr	$poly1,[$poly3,#8]
386	ldr	$poly3,[$poly3,#24]
387
388	bl	__ecp_nistz256_sub_from
389
390	ldp	x29,x30,[sp],#16
391	.inst	0xd50323bf		// autiasp
392	ret
393.size	ecp_nistz256_neg,.-ecp_nistz256_neg
394
395// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
396// to $a0-$a3 and b[0] - to $bi
397.type	__ecp_nistz256_mul_mont,%function
398.align	4
399__ecp_nistz256_mul_mont:
400	mul	$acc0,$a0,$bi		// a[0]*b[0]
401	umulh	$t0,$a0,$bi
402
403	mul	$acc1,$a1,$bi		// a[1]*b[0]
404	umulh	$t1,$a1,$bi
405
406	mul	$acc2,$a2,$bi		// a[2]*b[0]
407	umulh	$t2,$a2,$bi
408
409	mul	$acc3,$a3,$bi		// a[3]*b[0]
410	umulh	$t3,$a3,$bi
411	ldr	$bi,[$bp,#8]		// b[1]
412
413	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
414	 lsl	$t0,$acc0,#32
415	adcs	$acc2,$acc2,$t1
416	 lsr	$t1,$acc0,#32
417	adcs	$acc3,$acc3,$t2
418	adc	$acc4,xzr,$t3
419	mov	$acc5,xzr
420___
# Rounds 1..3 of the interleaved Montgomery multiplication: each round
# folds acc[0] back in (reduction by the special-form modulus) while
# accumulating a[0..3]*b[$i].  The shifted copies of acc[0] in $t0/$t1
# were prepared at the end of the previous round.
for($i=1;$i<4;$i++) {
        # Reduction iteration is normally performed by accumulating
        # result of multiplication of modulus by "magic" digit [and
        # omitting least significant word, which is guaranteed to
        # be 0], but thanks to special form of modulus and "magic"
        # digit being equal to least significant word, it can be
        # performed with additions and subtractions alone. Indeed:
        #
        #            ffff0001.00000000.0000ffff.ffffffff
        # *                                     abcdefgh
        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        #
        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
        # rewrite above as:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
        #
        # or marking redundant operations:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
        # - 0000abcd.efgh0000.--------.--------.--------

# Reduce by acc[0] and accumulate the low halves of a[0..3]*b[i].
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
# Preload the next b-word; not needed after the last round.
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
# Accumulate the high halves and pre-shift the new acc[0] for the next
# reduction round.
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
482$code.=<<___;
483	// last reduction
484	subs	$t2,$acc0,$t0		// "*0xffff0001"
485	sbc	$t3,$acc0,$t1
486	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
487	adcs	$acc1,$acc2,$t1
488	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
489	adcs	$acc3,$acc4,$t3
490	adc	$acc4,$acc5,xzr
491
492	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
493	sbcs	$t1,$acc1,$poly1
494	sbcs	$t2,$acc2,xzr
495	sbcs	$t3,$acc3,$poly3
496	sbcs	xzr,$acc4,xzr		// did it borrow?
497
498	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
499	csel	$acc1,$acc1,$t1,lo
500	csel	$acc2,$acc2,$t2,lo
501	stp	$acc0,$acc1,[$rp]
502	csel	$acc3,$acc3,$t3,lo
503	stp	$acc2,$acc3,[$rp,#16]
504
505	ret
506.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
507
508// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
509// to $a0-$a3
510.type	__ecp_nistz256_sqr_mont,%function
511.align	4
512__ecp_nistz256_sqr_mont:
513	//  |  |  |  |  |  |a1*a0|  |
514	//  |  |  |  |  |a2*a0|  |  |
515	//  |  |a3*a2|a3*a0|  |  |  |
516	//  |  |  |  |a2*a1|  |  |  |
517	//  |  |  |a3*a1|  |  |  |  |
518	// *|  |  |  |  |  |  |  | 2|
519	// +|a3*a3|a2*a2|a1*a1|a0*a0|
520	//  |--+--+--+--+--+--+--+--|
521	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
522	//
523	//  "can't overflow" below mark carrying into high part of
524	//  multiplication result, which can't overflow, because it
525	//  can never be all ones.
526
527	mul	$acc1,$a1,$a0		// a[1]*a[0]
528	umulh	$t1,$a1,$a0
529	mul	$acc2,$a2,$a0		// a[2]*a[0]
530	umulh	$t2,$a2,$a0
531	mul	$acc3,$a3,$a0		// a[3]*a[0]
532	umulh	$acc4,$a3,$a0
533
534	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
535	 mul	$t0,$a2,$a1		// a[2]*a[1]
536	 umulh	$t1,$a2,$a1
537	adcs	$acc3,$acc3,$t2
538	 mul	$t2,$a3,$a1		// a[3]*a[1]
539	 umulh	$t3,$a3,$a1
540	adc	$acc4,$acc4,xzr		// can't overflow
541
542	mul	$acc5,$a3,$a2		// a[3]*a[2]
543	umulh	$acc6,$a3,$a2
544
545	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
546	 mul	$acc0,$a0,$a0		// a[0]*a[0]
547	adc	$t2,$t3,xzr		// can't overflow
548
549	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
550	 umulh	$a0,$a0,$a0
551	adcs	$acc4,$acc4,$t1
552	 mul	$t1,$a1,$a1		// a[1]*a[1]
553	adcs	$acc5,$acc5,$t2
554	 umulh	$a1,$a1,$a1
555	adc	$acc6,$acc6,xzr		// can't overflow
556
557	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
558	 mul	$t2,$a2,$a2		// a[2]*a[2]
559	adcs	$acc2,$acc2,$acc2
560	 umulh	$a2,$a2,$a2
561	adcs	$acc3,$acc3,$acc3
562	 mul	$t3,$a3,$a3		// a[3]*a[3]
563	adcs	$acc4,$acc4,$acc4
564	 umulh	$a3,$a3,$a3
565	adcs	$acc5,$acc5,$acc5
566	adcs	$acc6,$acc6,$acc6
567	adc	$acc7,xzr,xzr
568
569	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
570	adcs	$acc2,$acc2,$t1
571	adcs	$acc3,$acc3,$a1
572	adcs	$acc4,$acc4,$t2
573	adcs	$acc5,$acc5,$a2
574	 lsl	$t0,$acc0,#32
575	adcs	$acc6,$acc6,$t3
576	 lsr	$t1,$acc0,#32
577	adc	$acc7,$acc7,$a3
578___
# Three of the four reduction rounds of the square; each also
# pre-shifts the new acc[0] ($t0/$t1) for the round that follows.  The
# fourth round is emitted after the loop without the shift setup.
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
592$code.=<<___;
593	subs	$t2,$acc0,$t0		// "*0xffff0001"
594	sbc	$t3,$acc0,$t1
595	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
596	adcs	$acc1,$acc2,$t1
597	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
598	adc	$acc3,$t3,xzr		// can't overflow
599
600	adds	$acc0,$acc0,$acc4	// accumulate upper half
601	adcs	$acc1,$acc1,$acc5
602	adcs	$acc2,$acc2,$acc6
603	adcs	$acc3,$acc3,$acc7
604	adc	$acc4,xzr,xzr
605
606	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
607	sbcs	$t1,$acc1,$poly1
608	sbcs	$t2,$acc2,xzr
609	sbcs	$t3,$acc3,$poly3
610	sbcs	xzr,$acc4,xzr		// did it borrow?
611
612	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
613	csel	$acc1,$acc1,$t1,lo
614	csel	$acc2,$acc2,$t2,lo
615	stp	$acc0,$acc1,[$rp]
616	csel	$acc3,$acc3,$t3,lo
617	stp	$acc2,$acc3,[$rp,#16]
618
619	ret
620.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
621
622// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
623// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
624// contexts, e.g. in multiplication by 2 and 3...
625.type	__ecp_nistz256_add,%function
626.align	4
627__ecp_nistz256_add:
628	adds	$acc0,$acc0,$t0		// ret = a+b
629	adcs	$acc1,$acc1,$t1
630	adcs	$acc2,$acc2,$t2
631	adcs	$acc3,$acc3,$t3
632	adc	$ap,xzr,xzr		// zap $ap
633
634	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
635	sbcs	$t1,$acc1,$poly1
636	sbcs	$t2,$acc2,xzr
637	sbcs	$t3,$acc3,$poly3
638	sbcs	xzr,$ap,xzr		// did subtraction borrow?
639
640	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
641	csel	$acc1,$acc1,$t1,lo
642	csel	$acc2,$acc2,$t2,lo
643	stp	$acc0,$acc1,[$rp]
644	csel	$acc3,$acc3,$t3,lo
645	stp	$acc2,$acc3,[$rp,#16]
646
647	ret
648.size	__ecp_nistz256_add,.-__ecp_nistz256_add
649
650.type	__ecp_nistz256_sub_from,%function
651.align	4
652__ecp_nistz256_sub_from:
653	ldp	$t0,$t1,[$bp]
654	ldp	$t2,$t3,[$bp,#16]
655	subs	$acc0,$acc0,$t0		// ret = a-b
656	sbcs	$acc1,$acc1,$t1
657	sbcs	$acc2,$acc2,$t2
658	sbcs	$acc3,$acc3,$t3
659	sbc	$ap,xzr,xzr		// zap $ap
660
661	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
662	adcs	$t1,$acc1,$poly1
663	adcs	$t2,$acc2,xzr
664	adc	$t3,$acc3,$poly3
665	cmp	$ap,xzr			// did subtraction borrow?
666
667	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
668	csel	$acc1,$acc1,$t1,eq
669	csel	$acc2,$acc2,$t2,eq
670	stp	$acc0,$acc1,[$rp]
671	csel	$acc3,$acc3,$t3,eq
672	stp	$acc2,$acc3,[$rp,#16]
673
674	ret
675.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
676
677.type	__ecp_nistz256_sub_morf,%function
678.align	4
679__ecp_nistz256_sub_morf:
680	ldp	$t0,$t1,[$bp]
681	ldp	$t2,$t3,[$bp,#16]
682	subs	$acc0,$t0,$acc0		// ret = b-a
683	sbcs	$acc1,$t1,$acc1
684	sbcs	$acc2,$t2,$acc2
685	sbcs	$acc3,$t3,$acc3
686	sbc	$ap,xzr,xzr		// zap $ap
687
688	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
689	adcs	$t1,$acc1,$poly1
690	adcs	$t2,$acc2,xzr
691	adc	$t3,$acc3,$poly3
692	cmp	$ap,xzr			// did subtraction borrow?
693
694	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
695	csel	$acc1,$acc1,$t1,eq
696	csel	$acc2,$acc2,$t2,eq
697	stp	$acc0,$acc1,[$rp]
698	csel	$acc3,$acc3,$t3,eq
699	stp	$acc2,$acc3,[$rp,#16]
700
701	ret
702.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
703
704.type	__ecp_nistz256_div_by_2,%function
705.align	4
706__ecp_nistz256_div_by_2:
707	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
708	adcs	$t1,$acc1,$poly1
709	adcs	$t2,$acc2,xzr
710	adcs	$t3,$acc3,$poly3
711	adc	$ap,xzr,xzr		// zap $ap
712	tst	$acc0,#1		// is a even?
713
714	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
715	csel	$acc1,$acc1,$t1,eq
716	csel	$acc2,$acc2,$t2,eq
717	csel	$acc3,$acc3,$t3,eq
718	csel	$ap,xzr,$ap,eq
719
720	lsr	$acc0,$acc0,#1		// ret >>= 1
721	orr	$acc0,$acc0,$acc1,lsl#63
722	lsr	$acc1,$acc1,#1
723	orr	$acc1,$acc1,$acc2,lsl#63
724	lsr	$acc2,$acc2,#1
725	orr	$acc2,$acc2,$acc3,lsl#63
726	lsr	$acc3,$acc3,#1
727	stp	$acc0,$acc1,[$rp]
728	orr	$acc3,$acc3,$ap,lsl#63
729	stp	$acc2,$acc3,[$rp,#16]
730
731	ret
732.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
733___
734########################################################################
735# following subroutines are "literal" implementation of those found in
736# ecp_nistz256.c
737#
738########################################################################
739# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
740#
741{
742my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
743# above map() describes stack layout with 4 temporary
744# 256-bit vectors on top.
745my ($rp_real,$ap_real) = map("x$_",(21,22));
746
747$code.=<<___;
748.globl	ecp_nistz256_point_double
749.type	ecp_nistz256_point_double,%function
750.align	5
751ecp_nistz256_point_double:
752	.inst	0xd503233f		// paciasp
753	stp	x29,x30,[sp,#-96]!
754	add	x29,sp,#0
755	stp	x19,x20,[sp,#16]
756	stp	x21,x22,[sp,#32]
757	sub	sp,sp,#32*4
758
759.Ldouble_shortcut:
760	ldp	$acc0,$acc1,[$ap,#32]
761	 mov	$rp_real,$rp
762	ldp	$acc2,$acc3,[$ap,#48]
763	 mov	$ap_real,$ap
764	 adrp	$poly3,.Lpoly
765	 add	$poly3,$poly3,:lo12:.Lpoly
766	 ldr	$poly1,[$poly3,#8]
767	mov	$t0,$acc0
768	 ldr	$poly3,[$poly3,#24]
769	mov	$t1,$acc1
770	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
771	mov	$t2,$acc2
772	mov	$t3,$acc3
773	 ldp	$a2,$a3,[$ap_real,#64+16]
774	add	$rp,sp,#$S
775	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);
776
777	add	$rp,sp,#$Zsqr
778	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
779
780	ldp	$t0,$t1,[$ap_real]
781	ldp	$t2,$t3,[$ap_real,#16]
782	mov	$a0,$acc0		// put Zsqr aside for p256_sub
783	mov	$a1,$acc1
784	mov	$a2,$acc2
785	mov	$a3,$acc3
786	add	$rp,sp,#$M
787	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);
788
789	add	$bp,$ap_real,#0
790	mov	$acc0,$a0		// restore Zsqr
791	mov	$acc1,$a1
792	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
793	mov	$acc2,$a2
794	mov	$acc3,$a3
795	 ldp	$a2,$a3,[sp,#$S+16]
796	add	$rp,sp,#$Zsqr
797	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
798
799	add	$rp,sp,#$S
800	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
801
802	ldr	$bi,[$ap_real,#32]
803	ldp	$a0,$a1,[$ap_real,#64]
804	ldp	$a2,$a3,[$ap_real,#64+16]
805	add	$bp,$ap_real,#32
806	add	$rp,sp,#$tmp0
807	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
808
809	mov	$t0,$acc0
810	mov	$t1,$acc1
811	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
812	mov	$t2,$acc2
813	mov	$t3,$acc3
814	 ldp	$a2,$a3,[sp,#$S+16]
815	add	$rp,$rp_real,#64
816	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);
817
818	add	$rp,sp,#$tmp0
819	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
820
821	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
822	 ldp	$a0,$a1,[sp,#$M]
823	 ldp	$a2,$a3,[sp,#$M+16]
824	add	$rp,$rp_real,#32
825	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
826
827	add	$bp,sp,#$Zsqr
828	add	$rp,sp,#$M
829	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
830
831	mov	$t0,$acc0		// duplicate M
832	mov	$t1,$acc1
833	mov	$t2,$acc2
834	mov	$t3,$acc3
835	mov	$a0,$acc0		// put M aside
836	mov	$a1,$acc1
837	mov	$a2,$acc2
838	mov	$a3,$acc3
839	add	$rp,sp,#$M
840	bl	__ecp_nistz256_add
841	mov	$t0,$a0			// restore M
842	mov	$t1,$a1
843	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
844	mov	$t2,$a2
845	 ldp	$a0,$a1,[sp,#$S]
846	mov	$t3,$a3
847	 ldp	$a2,$a3,[sp,#$S+16]
848	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);
849
850	add	$bp,$ap_real,#0
851	add	$rp,sp,#$S
852	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
853
854	mov	$t0,$acc0
855	mov	$t1,$acc1
856	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
857	mov	$t2,$acc2
858	mov	$t3,$acc3
859	 ldp	$a2,$a3,[sp,#$M+16]
860	add	$rp,sp,#$tmp0
861	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);
862
863	add	$rp,$rp_real,#0
864	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
865
866	add	$bp,sp,#$tmp0
867	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
868
869	add	$bp,sp,#$S
870	add	$rp,sp,#$S
871	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
872
873	ldr	$bi,[sp,#$M]
874	mov	$a0,$acc0		// copy S
875	mov	$a1,$acc1
876	mov	$a2,$acc2
877	mov	$a3,$acc3
878	add	$bp,sp,#$M
879	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
880
881	add	$bp,$rp_real,#32
882	add	$rp,$rp_real,#32
883	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
884
885	add	sp,x29,#0		// destroy frame
886	ldp	x19,x20,[x29,#16]
887	ldp	x21,x22,[x29,#32]
888	ldp	x29,x30,[sp],#96
889	.inst	0xd50323bf		// autiasp
890	ret
891.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
892___
893}
894
895########################################################################
896# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
897#			      const P256_POINT *in2);
898{
899my ($res_x,$res_y,$res_z,
900    $H,$Hsqr,$R,$Rsqr,$Hcub,
901    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
902my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
903# above map() describes stack layout with 12 temporary
904# 256-bit vectors on top.
905my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));
906
907$code.=<<___;
908.globl	ecp_nistz256_point_add
909.type	ecp_nistz256_point_add,%function
910.align	5
911ecp_nistz256_point_add:
912	.inst	0xd503233f		// paciasp
913	stp	x29,x30,[sp,#-96]!
914	add	x29,sp,#0
915	stp	x19,x20,[sp,#16]
916	stp	x21,x22,[sp,#32]
917	stp	x23,x24,[sp,#48]
918	stp	x25,x26,[sp,#64]
919	stp	x27,x28,[sp,#80]
920	sub	sp,sp,#32*12
921
922	ldp	$a0,$a1,[$bp,#64]	// in2_z
923	ldp	$a2,$a3,[$bp,#64+16]
924	 mov	$rp_real,$rp
925	 mov	$ap_real,$ap
926	 mov	$bp_real,$bp
927	 adrp	$poly3,.Lpoly
928	 add	$poly3,$poly3,:lo12:.Lpoly
929	 ldr	$poly1,[$poly3,#8]
930	 ldr	$poly3,[$poly3,#24]
931	orr	$t0,$a0,$a1
932	orr	$t2,$a2,$a3
933	orr	$in2infty,$t0,$t2
934	cmp	$in2infty,#0
935	csetm	$in2infty,ne		// ~in2infty
936	add	$rp,sp,#$Z2sqr
937	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
938
939	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
940	ldp	$a2,$a3,[$ap_real,#64+16]
941	orr	$t0,$a0,$a1
942	orr	$t2,$a2,$a3
943	orr	$in1infty,$t0,$t2
944	cmp	$in1infty,#0
945	csetm	$in1infty,ne		// ~in1infty
946	add	$rp,sp,#$Z1sqr
947	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
948
949	ldr	$bi,[$bp_real,#64]
950	ldp	$a0,$a1,[sp,#$Z2sqr]
951	ldp	$a2,$a3,[sp,#$Z2sqr+16]
952	add	$bp,$bp_real,#64
953	add	$rp,sp,#$S1
954	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
955
956	ldr	$bi,[$ap_real,#64]
957	ldp	$a0,$a1,[sp,#$Z1sqr]
958	ldp	$a2,$a3,[sp,#$Z1sqr+16]
959	add	$bp,$ap_real,#64
960	add	$rp,sp,#$S2
961	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
962
963	ldr	$bi,[$ap_real,#32]
964	ldp	$a0,$a1,[sp,#$S1]
965	ldp	$a2,$a3,[sp,#$S1+16]
966	add	$bp,$ap_real,#32
967	add	$rp,sp,#$S1
968	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
969
970	ldr	$bi,[$bp_real,#32]
971	ldp	$a0,$a1,[sp,#$S2]
972	ldp	$a2,$a3,[sp,#$S2+16]
973	add	$bp,$bp_real,#32
974	add	$rp,sp,#$S2
975	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
976
977	add	$bp,sp,#$S1
978	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
979	 ldp	$a0,$a1,[$ap_real]
980	 ldp	$a2,$a3,[$ap_real,#16]
981	add	$rp,sp,#$R
982	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
983
984	orr	$acc0,$acc0,$acc1	// see if result is zero
985	orr	$acc2,$acc2,$acc3
986	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)
987
988	add	$bp,sp,#$Z2sqr
989	add	$rp,sp,#$U1
990	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
991
992	ldr	$bi,[sp,#$Z1sqr]
993	ldp	$a0,$a1,[$bp_real]
994	ldp	$a2,$a3,[$bp_real,#16]
995	add	$bp,sp,#$Z1sqr
996	add	$rp,sp,#$U2
997	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
998
999	add	$bp,sp,#$U1
1000	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
1001	 ldp	$a2,$a3,[sp,#$R+16]
1002	add	$rp,sp,#$H
1003	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
1004
1005	orr	$acc0,$acc0,$acc1	// see if result is zero
1006	orr	$acc2,$acc2,$acc3
1007	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)
1008
1009	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
1010	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
1011	orr	$acc0,$acc0,$temp1
1012	orr	$acc0,$acc0,$temp2
1013	orr	$acc0,$acc0,$temp0
1014	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
1015
1016.Ladd_double:
1017	mov	$ap,$ap_real
1018	mov	$rp,$rp_real
1019	ldp	x23,x24,[x29,#48]
1020	ldp	x25,x26,[x29,#64]
1021	ldp	x27,x28,[x29,#80]
1022	add	sp,sp,#32*(12-4)	// difference in stack frames
1023	b	.Ldouble_shortcut
1024
1025.align	4
1026.Ladd_proceed:
1027	add	$rp,sp,#$Rsqr
1028	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
1029
1030	ldr	$bi,[$ap_real,#64]
1031	ldp	$a0,$a1,[sp,#$H]
1032	ldp	$a2,$a3,[sp,#$H+16]
1033	add	$bp,$ap_real,#64
1034	add	$rp,sp,#$res_z
1035	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
1036
1037	ldp	$a0,$a1,[sp,#$H]
1038	ldp	$a2,$a3,[sp,#$H+16]
1039	add	$rp,sp,#$Hsqr
1040	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
1041
1042	ldr	$bi,[$bp_real,#64]
1043	ldp	$a0,$a1,[sp,#$res_z]
1044	ldp	$a2,$a3,[sp,#$res_z+16]
1045	add	$bp,$bp_real,#64
1046	add	$rp,sp,#$res_z
1047	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
1048
1049	ldr	$bi,[sp,#$H]
1050	ldp	$a0,$a1,[sp,#$Hsqr]
1051	ldp	$a2,$a3,[sp,#$Hsqr+16]
1052	add	$bp,sp,#$H
1053	add	$rp,sp,#$Hcub
1054	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
1055
1056	ldr	$bi,[sp,#$Hsqr]
1057	ldp	$a0,$a1,[sp,#$U1]
1058	ldp	$a2,$a3,[sp,#$U1+16]
1059	add	$bp,sp,#$Hsqr
1060	add	$rp,sp,#$U2
1061	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
1062
1063	mov	$t0,$acc0
1064	mov	$t1,$acc1
1065	mov	$t2,$acc2
1066	mov	$t3,$acc3
1067	add	$rp,sp,#$Hsqr
1068	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);
1069
1070	add	$bp,sp,#$Rsqr
1071	add	$rp,sp,#$res_x
1072	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
1073
1074	add	$bp,sp,#$Hcub
1075	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
1076
1077	add	$bp,sp,#$U2
1078	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
1079	 ldp	$a0,$a1,[sp,#$S1]
1080	 ldp	$a2,$a3,[sp,#$S1+16]
1081	add	$rp,sp,#$res_y
1082	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
1083
1084	add	$bp,sp,#$Hcub
1085	add	$rp,sp,#$S2
1086	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
1087
1088	ldr	$bi,[sp,#$R]
1089	ldp	$a0,$a1,[sp,#$res_y]
1090	ldp	$a2,$a3,[sp,#$res_y+16]
1091	add	$bp,sp,#$R
1092	add	$rp,sp,#$res_y
1093	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
1094
1095	add	$bp,sp,#$S2
1096	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
1097
1098	ldp	$a0,$a1,[sp,#$res_x]		// res
1099	ldp	$a2,$a3,[sp,#$res_x+16]
1100	ldp	$t0,$t1,[$bp_real]		// in2
1101	ldp	$t2,$t3,[$bp_real,#16]
1102___
# Constant-time selection for the x and y coordinates (32 bytes each,
# two per iteration): pick res when neither input is infinity, in1
# when in2 is infinity, in2 when in1 is.  The trailing copy of this
# pattern after the loop handles z without the forward loads.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1intfy, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2intfy, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
1125$code.=<<___;
1126	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1127	cmp	$in1infty,#0			// ~$in1intfy, remember?
1128	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1129	csel	$t0,$a0,$t0,ne
1130	csel	$t1,$a1,$t1,ne
1131	csel	$t2,$a2,$t2,ne
1132	csel	$t3,$a3,$t3,ne
1133	cmp	$in2infty,#0			// ~$in2intfy, remember?
1134	csel	$acc0,$t0,$acc0,ne
1135	csel	$acc1,$t1,$acc1,ne
1136	csel	$acc2,$t2,$acc2,ne
1137	csel	$acc3,$t3,$acc3,ne
1138	stp	$acc0,$acc1,[$rp_real,#$i]
1139	stp	$acc2,$acc3,[$rp_real,#$i+16]
1140
1141.Ladd_done:
1142	add	sp,x29,#0		// destroy frame
1143	ldp	x19,x20,[x29,#16]
1144	ldp	x21,x22,[x29,#32]
1145	ldp	x23,x24,[x29,#48]
1146	ldp	x25,x26,[x29,#64]
1147	ldp	x27,x28,[x29,#80]
1148	ldp	x29,x30,[sp],#96
1149	.inst	0xd50323bf		// autiasp
1150	ret
1151.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1152___
1153}
1154
1155########################################################################
1156# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1157#				     const P256_POINT_AFFINE *in2);
1158{
1159my ($res_x,$res_y,$res_z,
1160    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1161my $Z1sqr = $S2;
1162# above map() describes stack layout with 10 temporary
1163# 256-bit vectors on top.
# x21..x26: argument pointers, the two negated infinity masks and a temp,
# all preserved across the __ecp_nistz256_* subroutine calls.
1164my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

# Mixed (Jacobian + affine) addition.  in2 is affine, so its z is
# implicitly 1: U1 = in1_x and S1 = in1_y are used directly (see the
# p256_sub(H, U2, in1_x) / p256_sub(R, S2, in1_y) steps below) instead
# of being computed.  in1 is at infinity iff in1_z == 0; in2 is treated
# as infinity iff all eight words of in2_x|in2_y are zero.
1166$code.=<<___;
1167.globl	ecp_nistz256_point_add_affine
1168.type	ecp_nistz256_point_add_affine,%function
1169.align	5
1170ecp_nistz256_point_add_affine:
1171	.inst	0xd503233f		// paciasp
1172	stp	x29,x30,[sp,#-80]!
1173	add	x29,sp,#0
1174	stp	x19,x20,[sp,#16]
1175	stp	x21,x22,[sp,#32]
1176	stp	x23,x24,[sp,#48]
1177	stp	x25,x26,[sp,#64]
1178	sub	sp,sp,#32*10
1179
1180	mov	$rp_real,$rp
1181	mov	$ap_real,$ap
1182	mov	$bp_real,$bp
1183	adrp	$poly3,.Lpoly
1184	add	$poly3,$poly3,:lo12:.Lpoly
1185	ldr	$poly1,[$poly3,#8]
1186	ldr	$poly3,[$poly3,#24]
1187
1188	ldp	$a0,$a1,[$ap,#64]	// in1_z
1189	ldp	$a2,$a3,[$ap,#64+16]
1190	orr	$t0,$a0,$a1
1191	orr	$t2,$a2,$a3
1192	orr	$in1infty,$t0,$t2
1193	cmp	$in1infty,#0
1194	csetm	$in1infty,ne		// ~in1infty
1195
1196	ldp	$acc0,$acc1,[$bp]	// in2_x
1197	ldp	$acc2,$acc3,[$bp,#16]
1198	ldp	$t0,$t1,[$bp,#32]	// in2_y
1199	ldp	$t2,$t3,[$bp,#48]
1200	orr	$acc0,$acc0,$acc1
1201	orr	$acc2,$acc2,$acc3
1202	orr	$t0,$t0,$t1
1203	orr	$t2,$t2,$t3
1204	orr	$acc0,$acc0,$acc2
1205	orr	$t0,$t0,$t2
1206	orr	$in2infty,$acc0,$t0
1207	cmp	$in2infty,#0
1208	csetm	$in2infty,ne		// ~in2infty
1209
1210	add	$rp,sp,#$Z1sqr
1211	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
1212
1213	mov	$a0,$acc0
1214	mov	$a1,$acc1
1215	mov	$a2,$acc2
1216	mov	$a3,$acc3
1217	ldr	$bi,[$bp_real]
1218	add	$bp,$bp_real,#0
1219	add	$rp,sp,#$U2
1220	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
1221
1222	add	$bp,$ap_real,#0
1223	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
1224	 ldp	$a0,$a1,[sp,#$Z1sqr]
1225	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
1226	add	$rp,sp,#$H
1227	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
1228
1229	add	$bp,$ap_real,#64
1230	add	$rp,sp,#$S2
1231	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
1232
1233	ldr	$bi,[$ap_real,#64]
1234	ldp	$a0,$a1,[sp,#$H]
1235	ldp	$a2,$a3,[sp,#$H+16]
1236	add	$bp,$ap_real,#64
1237	add	$rp,sp,#$res_z
1238	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
1239
1240	ldr	$bi,[$bp_real,#32]
1241	ldp	$a0,$a1,[sp,#$S2]
1242	ldp	$a2,$a3,[sp,#$S2+16]
1243	add	$bp,$bp_real,#32
1244	add	$rp,sp,#$S2
1245	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
1246
1247	add	$bp,$ap_real,#32
1248	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
1249	 ldp	$a2,$a3,[sp,#$H+16]
1250	add	$rp,sp,#$R
1251	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
1252
1253	add	$rp,sp,#$Hsqr
1254	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
1255
1256	ldp	$a0,$a1,[sp,#$R]
1257	ldp	$a2,$a3,[sp,#$R+16]
1258	add	$rp,sp,#$Rsqr
1259	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
1260
1261	ldr	$bi,[sp,#$H]
1262	ldp	$a0,$a1,[sp,#$Hsqr]
1263	ldp	$a2,$a3,[sp,#$Hsqr+16]
1264	add	$bp,sp,#$H
1265	add	$rp,sp,#$Hcub
1266	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
1267
1268	ldr	$bi,[$ap_real]
1269	ldp	$a0,$a1,[sp,#$Hsqr]
1270	ldp	$a2,$a3,[sp,#$Hsqr+16]
1271	add	$bp,$ap_real,#0
1272	add	$rp,sp,#$U2
1273	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
1274
1275	mov	$t0,$acc0
1276	mov	$t1,$acc1
1277	mov	$t2,$acc2
1278	mov	$t3,$acc3
1279	add	$rp,sp,#$Hsqr
1280	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);
1281
1282	add	$bp,sp,#$Rsqr
1283	add	$rp,sp,#$res_x
1284	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
1285
1286	add	$bp,sp,#$Hcub
1287	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
1288
1289	add	$bp,sp,#$U2
1290	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
1291	 ldp	$a0,$a1,[sp,#$Hcub]
1292	 ldp	$a2,$a3,[sp,#$Hcub+16]
1293	add	$rp,sp,#$res_y
1294	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
1295
1296	add	$bp,$ap_real,#32
1297	add	$rp,sp,#$S2
1298	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
1299
1300	ldr	$bi,[sp,#$R]
1301	ldp	$a0,$a1,[sp,#$res_y]
1302	ldp	$a2,$a3,[sp,#$res_y+16]
1303	add	$bp,sp,#$R
1304	add	$rp,sp,#$res_y
1305	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
1306
1307	add	$bp,sp,#$S2
1308	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
1309
1310	ldp	$a0,$a1,[sp,#$res_x]		// res
1311	ldp	$a2,$a3,[sp,#$res_x+16]
1312	ldp	$t0,$t1,[$bp_real]		// in2
1313	ldp	$t2,$t3,[$bp_real,#16]
1314___
# Result selection per 32-byte chunk: computed result when neither input
# is infinity, in2 when in1 is infinity, in1 when in2 is infinity.  The
# masks are all-ones when the input is NOT infinity (csetm above).
1315for($i=0;$i<64;$i+=32) {		# conditional moves
1316$code.=<<___;
1317	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1318	cmp	$in1infty,#0			// ~$in1intfy, remember?
1319	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1320	csel	$t0,$a0,$t0,ne
1321	csel	$t1,$a1,$t1,ne
1322	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
1323	csel	$t2,$a2,$t2,ne
1324	csel	$t3,$a3,$t3,ne
1325	cmp	$in2infty,#0			// ~$in2intfy, remember?
1326	ldp	$a2,$a3,[sp,#$res_x+$i+48]
1327	csel	$acc0,$t0,$acc0,ne
1328	csel	$acc1,$t1,$acc1,ne
1329	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
1330	csel	$acc2,$t2,$acc2,ne
1331	csel	$acc3,$t3,$acc3,ne
1332	ldp	$t2,$t3,[$bp_real,#$i+48]
1333	stp	$acc0,$acc1,[$rp_real,#$i]
1334	stp	$acc2,$acc3,[$rp_real,#$i+16]
1335___
# After the first chunk (x) is emitted, redirect $bp_real to
# .Lone_mont-64 so the "in2" words fetched at offsets 64..95 for the
# z chunk come from one in Montgomery form -- the implicit z coordinate
# of the affine second input.
1336$code.=<<___	if ($i == 0);
1337	adrp	$bp_real,.Lone_mont-64
1338	add	$bp_real,$bp_real,:lo12:.Lone_mont-64
1339___
1340}
1341$code.=<<___;
1342	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
1343	cmp	$in1infty,#0			// ~$in1intfy, remember?
1344	ldp	$acc2,$acc3,[$ap_real,#$i+16]
1345	csel	$t0,$a0,$t0,ne
1346	csel	$t1,$a1,$t1,ne
1347	csel	$t2,$a2,$t2,ne
1348	csel	$t3,$a3,$t3,ne
1349	cmp	$in2infty,#0			// ~$in2intfy, remember?
1350	csel	$acc0,$t0,$acc0,ne
1351	csel	$acc1,$t1,$acc1,ne
1352	csel	$acc2,$t2,$acc2,ne
1353	csel	$acc3,$t3,$acc3,ne
1354	stp	$acc0,$acc1,[$rp_real,#$i]
1355	stp	$acc2,$acc3,[$rp_real,#$i+16]
1356
1357	add	sp,x29,#0		// destroy frame
1358	ldp	x19,x20,[x29,#16]
1359	ldp	x21,x22,[x29,#32]
1360	ldp	x23,x24,[x29,#48]
1361	ldp	x25,x26,[x29,#64]
1362	ldp	x29,x30,[sp],#80
1363	.inst	0xd50323bf		// autiasp
1364	ret
1365.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1366___
1367}
1368if (1) {
1369my ($ord0,$ord1) = ($poly1,$poly3);
1370my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
1371my $acc7 = $bi;
1372
# Montgomery multiplication modulo the P-256 group order.  .Lord
# (defined elsewhere in this file) supplies the order's four 64-bit
# words plus one extra word at offset 32, loaded into $ordk and used to
# derive the per-round reduction factor (mul $t4,$acc0,$ordk) --
# presumably the Montgomery n0' constant; confirm against the .Lord
# definition.  Only $ord0/$ord1 need real multiplies: the top words of
# the order are handled via the shift/subtract trick documented in the
# loop comment below.
1373$code.=<<___;
1374////////////////////////////////////////////////////////////////////////
1375// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1376//                                uint64_t b[4]);
1377.globl	ecp_nistz256_ord_mul_mont
1378.type	ecp_nistz256_ord_mul_mont,%function
1379.align	4
1380ecp_nistz256_ord_mul_mont:
1381	stp	x29,x30,[sp,#-64]!
1382	add	x29,sp,#0
1383	stp	x19,x20,[sp,#16]
1384	stp	x21,x22,[sp,#32]
1385	stp	x23,x24,[sp,#48]
1386
1387	adrp	$ordk,.Lord
1388	add	$ordk,$ordk,:lo12:.Lord
1389	ldr	$bi,[$bp]		// bp[0]
1390	ldp	$a0,$a1,[$ap]
1391	ldp	$a2,$a3,[$ap,#16]
1392
1393	ldp	$ord0,$ord1,[$ordk,#0]
1394	ldp	$ord2,$ord3,[$ordk,#16]
1395	ldr	$ordk,[$ordk,#32]
1396
1397	mul	$acc0,$a0,$bi		// a[0]*b[0]
1398	umulh	$t0,$a0,$bi
1399
1400	mul	$acc1,$a1,$bi		// a[1]*b[0]
1401	umulh	$t1,$a1,$bi
1402
1403	mul	$acc2,$a2,$bi		// a[2]*b[0]
1404	umulh	$t2,$a2,$bi
1405
1406	mul	$acc3,$a3,$bi		// a[3]*b[0]
1407	umulh	$acc4,$a3,$bi
1408
1409	mul	$t4,$acc0,$ordk
1410
1411	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
1412	adcs	$acc2,$acc2,$t1
1413	adcs	$acc3,$acc3,$t2
1414	adc	$acc4,$acc4,xzr
1415	mov	$acc5,xzr
1416___
1417for ($i=1;$i<4;$i++) {
1418	################################################################
1419	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1420	# *                                     abcdefgh
1421	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1422	#
1423	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1424	# rewrite above as:
1425	#
1426	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1427	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1428	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1429$code.=<<___;
1430	ldr	$bi,[$bp,#8*$i]		// b[i]
1431
1432	lsl	$t0,$t4,#32
1433	subs	$acc2,$acc2,$t4
1434	lsr	$t1,$t4,#32
1435	sbcs	$acc3,$acc3,$t0
1436	sbcs	$acc4,$acc4,$t1
1437	sbc	$acc5,$acc5,xzr
1438
1439	subs	xzr,$acc0,#1
1440	umulh	$t1,$ord0,$t4
1441	mul	$t2,$ord1,$t4
1442	umulh	$t3,$ord1,$t4
1443
1444	adcs	$t2,$t2,$t1
1445	 mul	$t0,$a0,$bi
1446	adc	$t3,$t3,xzr
1447	 mul	$t1,$a1,$bi
1448
1449	adds	$acc0,$acc1,$t2
1450	 mul	$t2,$a2,$bi
1451	adcs	$acc1,$acc2,$t3
1452	 mul	$t3,$a3,$bi
1453	adcs	$acc2,$acc3,$t4
1454	adcs	$acc3,$acc4,$t4
1455	adc	$acc4,$acc5,xzr
1456
1457	adds	$acc0,$acc0,$t0		// accumulate low parts
1458	umulh	$t0,$a0,$bi
1459	adcs	$acc1,$acc1,$t1
1460	umulh	$t1,$a1,$bi
1461	adcs	$acc2,$acc2,$t2
1462	umulh	$t2,$a2,$bi
1463	adcs	$acc3,$acc3,$t3
1464	umulh	$t3,$a3,$bi
1465	adc	$acc4,$acc4,xzr
1466	mul	$t4,$acc0,$ordk
1467	adds	$acc1,$acc1,$t0		// accumulate high parts
1468	adcs	$acc2,$acc2,$t1
1469	adcs	$acc3,$acc3,$t2
1470	adcs	$acc4,$acc4,$t3
1471	adc	$acc5,xzr,xzr
1472___
1473}
# Last reduction round and the final conditional subtraction of the
# order for ord_mul_mont; then ecp_nistz256_ord_sqr_mont, which squares
# its input "rep" ($bp, i.e. x2) times in .Loop_ord_sqr.
1474$code.=<<___;
1475	lsl	$t0,$t4,#32		// last reduction
1476	subs	$acc2,$acc2,$t4
1477	lsr	$t1,$t4,#32
1478	sbcs	$acc3,$acc3,$t0
1479	sbcs	$acc4,$acc4,$t1
1480	sbc	$acc5,$acc5,xzr
1481
1482	subs	xzr,$acc0,#1
1483	umulh	$t1,$ord0,$t4
1484	mul	$t2,$ord1,$t4
1485	umulh	$t3,$ord1,$t4
1486
1487	adcs	$t2,$t2,$t1
1488	adc	$t3,$t3,xzr
1489
1490	adds	$acc0,$acc1,$t2
1491	adcs	$acc1,$acc2,$t3
1492	adcs	$acc2,$acc3,$t4
1493	adcs	$acc3,$acc4,$t4
1494	adc	$acc4,$acc5,xzr
1495
1496	subs	$t0,$acc0,$ord0		// ret -= modulus
1497	sbcs	$t1,$acc1,$ord1
1498	sbcs	$t2,$acc2,$ord2
1499	sbcs	$t3,$acc3,$ord3
1500	sbcs	xzr,$acc4,xzr
1501
1502	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
1503	csel	$acc1,$acc1,$t1,lo
1504	csel	$acc2,$acc2,$t2,lo
1505	stp	$acc0,$acc1,[$rp]
1506	csel	$acc3,$acc3,$t3,lo
1507	stp	$acc2,$acc3,[$rp,#16]
1508
1509	ldp	x19,x20,[sp,#16]
1510	ldp	x21,x22,[sp,#32]
1511	ldp	x23,x24,[sp,#48]
1512	ldr	x29,[sp],#64
1513	ret
1514.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1515
1516////////////////////////////////////////////////////////////////////////
1517// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1518//                                uint64_t rep);
1519.globl	ecp_nistz256_ord_sqr_mont
1520.type	ecp_nistz256_ord_sqr_mont,%function
1521.align	4
1522ecp_nistz256_ord_sqr_mont:
1523	stp	x29,x30,[sp,#-64]!
1524	add	x29,sp,#0
1525	stp	x19,x20,[sp,#16]
1526	stp	x21,x22,[sp,#32]
1527	stp	x23,x24,[sp,#48]
1528
1529	adrp	$ordk,.Lord
1530	add	$ordk,$ordk,:lo12:.Lord
1531	ldp	$a0,$a1,[$ap]
1532	ldp	$a2,$a3,[$ap,#16]
1533
1534	ldp	$ord0,$ord1,[$ordk,#0]
1535	ldp	$ord2,$ord3,[$ordk,#16]
1536	ldr	$ordk,[$ordk,#32]
1537	b	.Loop_ord_sqr
1538
1539.align	4
1540.Loop_ord_sqr:
1541	sub	$bp,$bp,#1
1542	////////////////////////////////////////////////////////////////
1543	//  |  |  |  |  |  |a1*a0|  |
1544	//  |  |  |  |  |a2*a0|  |  |
1545	//  |  |a3*a2|a3*a0|  |  |  |
1546	//  |  |  |  |a2*a1|  |  |  |
1547	//  |  |  |a3*a1|  |  |  |  |
1548	// *|  |  |  |  |  |  |  | 2|
1549	// +|a3*a3|a2*a2|a1*a1|a0*a0|
1550	//  |--+--+--+--+--+--+--+--|
1551	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1552	//
1553	//  "can't overflow" below mark carrying into high part of
1554	//  multiplication result, which can't overflow, because it
1555	//  can never be all ones.
1556
1557	mul	$acc1,$a1,$a0		// a[1]*a[0]
1558	umulh	$t1,$a1,$a0
1559	mul	$acc2,$a2,$a0		// a[2]*a[0]
1560	umulh	$t2,$a2,$a0
1561	mul	$acc3,$a3,$a0		// a[3]*a[0]
1562	umulh	$acc4,$a3,$a0
1563
1564	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
1565	 mul	$t0,$a2,$a1		// a[2]*a[1]
1566	 umulh	$t1,$a2,$a1
1567	adcs	$acc3,$acc3,$t2
1568	 mul	$t2,$a3,$a1		// a[3]*a[1]
1569	 umulh	$t3,$a3,$a1
1570	adc	$acc4,$acc4,xzr		// can't overflow
1571
1572	mul	$acc5,$a3,$a2		// a[3]*a[2]
1573	umulh	$acc6,$a3,$a2
1574
1575	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
1576	 mul	$acc0,$a0,$a0		// a[0]*a[0]
1577	adc	$t2,$t3,xzr		// can't overflow
1578
1579	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
1580	 umulh	$a0,$a0,$a0
1581	adcs	$acc4,$acc4,$t1
1582	 mul	$t1,$a1,$a1		// a[1]*a[1]
1583	adcs	$acc5,$acc5,$t2
1584	 umulh	$a1,$a1,$a1
1585	adc	$acc6,$acc6,xzr		// can't overflow
1586
1587	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
1588	 mul	$t2,$a2,$a2		// a[2]*a[2]
1589	adcs	$acc2,$acc2,$acc2
1590	 umulh	$a2,$a2,$a2
1591	adcs	$acc3,$acc3,$acc3
1592	 mul	$t3,$a3,$a3		// a[3]*a[3]
1593	adcs	$acc4,$acc4,$acc4
1594	 umulh	$a3,$a3,$a3
1595	adcs	$acc5,$acc5,$acc5
1596	adcs	$acc6,$acc6,$acc6
1597	adc	$acc7,xzr,xzr
1598
1599	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
1600	 mul	$t4,$acc0,$ordk
1601	adcs	$acc2,$acc2,$t1
1602	adcs	$acc3,$acc3,$a1
1603	adcs	$acc4,$acc4,$t2
1604	adcs	$acc5,$acc5,$a2
1605	adcs	$acc6,$acc6,$t3
1606	adc	$acc7,$acc7,$a3
1607___
# Four Montgomery reduction rounds over the low half.  The Perl-level
# swap of ($t3,$t4) after each round turns the factor computed into $t3
# (mul $t3,$acc0,$ordk, emitted while $i<3) into the $t4 consumed by
# the next round's shift/subtract step.
1608for($i=0; $i<4; $i++) {			# reductions
1609$code.=<<___;
1610	subs	xzr,$acc0,#1
1611	umulh	$t1,$ord0,$t4
1612	mul	$t2,$ord1,$t4
1613	umulh	$t3,$ord1,$t4
1614
1615	adcs	$t2,$t2,$t1
1616	adc	$t3,$t3,xzr
1617
1618	adds	$acc0,$acc1,$t2
1619	adcs	$acc1,$acc2,$t3
1620	adcs	$acc2,$acc3,$t4
1621	adc	$acc3,xzr,$t4		// can't overflow
1622___
1623$code.=<<___	if ($i<3);
1624	mul	$t3,$acc0,$ordk
1625___
1626$code.=<<___;
1627	lsl	$t0,$t4,#32
1628	subs	$acc1,$acc1,$t4
1629	lsr	$t1,$t4,#32
1630	sbcs	$acc2,$acc2,$t0
1631	sbc	$acc3,$acc3,$t1		// can't borrow
1632___
1633	($t3,$t4) = ($t4,$t3);
1634}
# Fold in the upper half of the squaring, subtract the order once if
# the sum reached it, and keep the result in $a0..$a3 so it feeds the
# next .Loop_ord_sqr iteration directly; store only after the loop.
1635$code.=<<___;
1636	adds	$acc0,$acc0,$acc4	// accumulate upper half
1637	adcs	$acc1,$acc1,$acc5
1638	adcs	$acc2,$acc2,$acc6
1639	adcs	$acc3,$acc3,$acc7
1640	adc	$acc4,xzr,xzr
1641
1642	subs	$t0,$acc0,$ord0		// ret -= modulus
1643	sbcs	$t1,$acc1,$ord1
1644	sbcs	$t2,$acc2,$ord2
1645	sbcs	$t3,$acc3,$ord3
1646	sbcs	xzr,$acc4,xzr
1647
1648	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
1649	csel	$a1,$acc1,$t1,lo
1650	csel	$a2,$acc2,$t2,lo
1651	csel	$a3,$acc3,$t3,lo
1652
1653	cbnz	$bp,.Loop_ord_sqr
1654
1655	stp	$a0,$a1,[$rp]
1656	stp	$a2,$a3,[$rp,#16]
1657
1658	ldp	x19,x20,[sp,#16]
1659	ldp	x21,x22,[sp,#32]
1660	ldp	x23,x24,[sp,#48]
1661	ldr	x29,[sp],#64
1662	ret
1663.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1664___
1665}	}
1666
1667########################################################################
1668# scatter-gather subroutines
1669{
# x0..x3: output pointer, input pointer, table index; x3 doubles as the
# index-derived mask/scratch register in the gather routines.
1670my ($out,$inp,$index,$mask)=map("x$_",(0..3));
# Table entries are stored with a 64-byte stride: w5 entries as 32-bit
# word halves (low words first, high words in the following 64*4 bytes),
# w7 entries as individual bytes.  The gather routines treat index 0
# specially -- csetm/csel (w5) or the and-mask (w7) force a zero result
# -- and otherwise fetch entry index-1.
1671$code.=<<___;
1672// void	ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
1673//					 int x2);
1674.globl	ecp_nistz256_scatter_w5
1675.type	ecp_nistz256_scatter_w5,%function
1676.align	4
1677ecp_nistz256_scatter_w5:
1678	stp	x29,x30,[sp,#-16]!
1679	add	x29,sp,#0
1680
1681	add	$out,$out,$index,lsl#2
1682
1683	ldp	x4,x5,[$inp]		// X
1684	ldp	x6,x7,[$inp,#16]
1685	stur	w4,[$out,#64*0-4]
1686	lsr	x4,x4,#32
1687	str	w5,[$out,#64*1-4]
1688	lsr	x5,x5,#32
1689	str	w6,[$out,#64*2-4]
1690	lsr	x6,x6,#32
1691	str	w7,[$out,#64*3-4]
1692	lsr	x7,x7,#32
1693	str	w4,[$out,#64*4-4]
1694	str	w5,[$out,#64*5-4]
1695	str	w6,[$out,#64*6-4]
1696	str	w7,[$out,#64*7-4]
1697	add	$out,$out,#64*8
1698
1699	ldp	x4,x5,[$inp,#32]	// Y
1700	ldp	x6,x7,[$inp,#48]
1701	stur	w4,[$out,#64*0-4]
1702	lsr	x4,x4,#32
1703	str	w5,[$out,#64*1-4]
1704	lsr	x5,x5,#32
1705	str	w6,[$out,#64*2-4]
1706	lsr	x6,x6,#32
1707	str	w7,[$out,#64*3-4]
1708	lsr	x7,x7,#32
1709	str	w4,[$out,#64*4-4]
1710	str	w5,[$out,#64*5-4]
1711	str	w6,[$out,#64*6-4]
1712	str	w7,[$out,#64*7-4]
1713	add	$out,$out,#64*8
1714
1715	ldp	x4,x5,[$inp,#64]	// Z
1716	ldp	x6,x7,[$inp,#80]
1717	stur	w4,[$out,#64*0-4]
1718	lsr	x4,x4,#32
1719	str	w5,[$out,#64*1-4]
1720	lsr	x5,x5,#32
1721	str	w6,[$out,#64*2-4]
1722	lsr	x6,x6,#32
1723	str	w7,[$out,#64*3-4]
1724	lsr	x7,x7,#32
1725	str	w4,[$out,#64*4-4]
1726	str	w5,[$out,#64*5-4]
1727	str	w6,[$out,#64*6-4]
1728	str	w7,[$out,#64*7-4]
1729
1730	ldr	x29,[sp],#16
1731	ret
1732.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1733
1734// void	ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
1735//					      int x2);
1736.globl	ecp_nistz256_gather_w5
1737.type	ecp_nistz256_gather_w5,%function
1738.align	4
1739ecp_nistz256_gather_w5:
1740	stp	x29,x30,[sp,#-16]!
1741	add	x29,sp,#0
1742
1743	cmp	$index,xzr
1744	csetm	x3,ne
1745	add	$index,$index,x3
1746	add	$inp,$inp,$index,lsl#2
1747
1748	ldr	w4,[$inp,#64*0]
1749	ldr	w5,[$inp,#64*1]
1750	ldr	w6,[$inp,#64*2]
1751	ldr	w7,[$inp,#64*3]
1752	ldr	w8,[$inp,#64*4]
1753	ldr	w9,[$inp,#64*5]
1754	ldr	w10,[$inp,#64*6]
1755	ldr	w11,[$inp,#64*7]
1756	add	$inp,$inp,#64*8
1757	orr	x4,x4,x8,lsl#32
1758	orr	x5,x5,x9,lsl#32
1759	orr	x6,x6,x10,lsl#32
1760	orr	x7,x7,x11,lsl#32
1761	csel	x4,x4,xzr,ne
1762	csel	x5,x5,xzr,ne
1763	csel	x6,x6,xzr,ne
1764	csel	x7,x7,xzr,ne
1765	stp	x4,x5,[$out]		// X
1766	stp	x6,x7,[$out,#16]
1767
1768	ldr	w4,[$inp,#64*0]
1769	ldr	w5,[$inp,#64*1]
1770	ldr	w6,[$inp,#64*2]
1771	ldr	w7,[$inp,#64*3]
1772	ldr	w8,[$inp,#64*4]
1773	ldr	w9,[$inp,#64*5]
1774	ldr	w10,[$inp,#64*6]
1775	ldr	w11,[$inp,#64*7]
1776	add	$inp,$inp,#64*8
1777	orr	x4,x4,x8,lsl#32
1778	orr	x5,x5,x9,lsl#32
1779	orr	x6,x6,x10,lsl#32
1780	orr	x7,x7,x11,lsl#32
1781	csel	x4,x4,xzr,ne
1782	csel	x5,x5,xzr,ne
1783	csel	x6,x6,xzr,ne
1784	csel	x7,x7,xzr,ne
1785	stp	x4,x5,[$out,#32]	// Y
1786	stp	x6,x7,[$out,#48]
1787
1788	ldr	w4,[$inp,#64*0]
1789	ldr	w5,[$inp,#64*1]
1790	ldr	w6,[$inp,#64*2]
1791	ldr	w7,[$inp,#64*3]
1792	ldr	w8,[$inp,#64*4]
1793	ldr	w9,[$inp,#64*5]
1794	ldr	w10,[$inp,#64*6]
1795	ldr	w11,[$inp,#64*7]
1796	orr	x4,x4,x8,lsl#32
1797	orr	x5,x5,x9,lsl#32
1798	orr	x6,x6,x10,lsl#32
1799	orr	x7,x7,x11,lsl#32
1800	csel	x4,x4,xzr,ne
1801	csel	x5,x5,xzr,ne
1802	csel	x6,x6,xzr,ne
1803	csel	x7,x7,xzr,ne
1804	stp	x4,x5,[$out,#64]	// Z
1805	stp	x6,x7,[$out,#80]
1806
1807	ldr	x29,[sp],#16
1808	ret
1809.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1810
1811// void	ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
1812//					 int x2);
1813.globl	ecp_nistz256_scatter_w7
1814.type	ecp_nistz256_scatter_w7,%function
1815.align	4
1816ecp_nistz256_scatter_w7:
1817	stp	x29,x30,[sp,#-16]!
1818	add	x29,sp,#0
1819
1820	add	$out,$out,$index
1821	mov	$index,#64/8
1822.Loop_scatter_w7:
1823	ldr	x3,[$inp],#8
1824	subs	$index,$index,#1
1825	prfm	pstl1strm,[$out,#4096+64*0]
1826	prfm	pstl1strm,[$out,#4096+64*1]
1827	prfm	pstl1strm,[$out,#4096+64*2]
1828	prfm	pstl1strm,[$out,#4096+64*3]
1829	prfm	pstl1strm,[$out,#4096+64*4]
1830	prfm	pstl1strm,[$out,#4096+64*5]
1831	prfm	pstl1strm,[$out,#4096+64*6]
1832	prfm	pstl1strm,[$out,#4096+64*7]
1833	strb	w3,[$out,#64*0]
1834	lsr	x3,x3,#8
1835	strb	w3,[$out,#64*1]
1836	lsr	x3,x3,#8
1837	strb	w3,[$out,#64*2]
1838	lsr	x3,x3,#8
1839	strb	w3,[$out,#64*3]
1840	lsr	x3,x3,#8
1841	strb	w3,[$out,#64*4]
1842	lsr	x3,x3,#8
1843	strb	w3,[$out,#64*5]
1844	lsr	x3,x3,#8
1845	strb	w3,[$out,#64*6]
1846	lsr	x3,x3,#8
1847	strb	w3,[$out,#64*7]
1848	add	$out,$out,#64*8
1849	b.ne	.Loop_scatter_w7
1850
1851	ldr	x29,[sp],#16
1852	ret
1853.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1854
1855// void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
1856//						     int x2);
1857.globl	ecp_nistz256_gather_w7
1858.type	ecp_nistz256_gather_w7,%function
1859.align	4
1860ecp_nistz256_gather_w7:
1861	stp	x29,x30,[sp,#-16]!
1862	add	x29,sp,#0
1863
1864	cmp	$index,xzr
1865	csetm	x3,ne
1866	add	$index,$index,x3
1867	add	$inp,$inp,$index
1868	mov	$index,#64/8
1869	nop
1870.Loop_gather_w7:
1871	ldrb	w4,[$inp,#64*0]
1872	prfm	pldl1strm,[$inp,#4096+64*0]
1873	subs	$index,$index,#1
1874	ldrb	w5,[$inp,#64*1]
1875	prfm	pldl1strm,[$inp,#4096+64*1]
1876	ldrb	w6,[$inp,#64*2]
1877	prfm	pldl1strm,[$inp,#4096+64*2]
1878	ldrb	w7,[$inp,#64*3]
1879	prfm	pldl1strm,[$inp,#4096+64*3]
1880	ldrb	w8,[$inp,#64*4]
1881	prfm	pldl1strm,[$inp,#4096+64*4]
1882	ldrb	w9,[$inp,#64*5]
1883	prfm	pldl1strm,[$inp,#4096+64*5]
1884	ldrb	w10,[$inp,#64*6]
1885	prfm	pldl1strm,[$inp,#4096+64*6]
1886	ldrb	w11,[$inp,#64*7]
1887	prfm	pldl1strm,[$inp,#4096+64*7]
1888	add	$inp,$inp,#64*8
1889	orr	x4,x4,x5,lsl#8
1890	orr	x6,x6,x7,lsl#8
1891	orr	x8,x8,x9,lsl#8
1892	orr	x4,x4,x6,lsl#16
1893	orr	x10,x10,x11,lsl#8
1894	orr	x4,x4,x8,lsl#32
1895	orr	x4,x4,x10,lsl#48
1896	and	x4,x4,x3
1897	str	x4,[$out],#8
1898	b.ne	.Loop_gather_w7
1899
1900	ldr	x29,[sp],#16
1901	ret
1902.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1903___
1904}
1905
# Emit the accumulated code: interpolate any `...` snippets by
# evaluating them as Perl, then print the result one line at a time.
for my $line (split "\n", $code) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
1912