#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# April 2019
#
# Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the
# fastest option on most(*), but not all, processors, yet 6+2 is
# retained. This is because its penalties are considered tolerable in
# comparison to the improvement it brings on processors where 6+2
# helps, most notably +37% on ThunderX2, a server-oriented processor
# which has to serve as many requests as possible. The others are
# mostly client processors, where performance doesn't have to be
# absolute top-notch, just fast enough, as the majority of time is
# spent "entertaining" a relatively slow human.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	4xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	2.72		1.60
# Cortex-A53		8.40/+80%	4.06		4.45(*)
# Cortex-A57		8.06/+43%	4.15		4.40(*)
# Denver		4.50/+82%	2.30		2.70(*)
# X-Gene		9.50/+46%	8.20		8.90(*)
# Mongoose		8.00/+44%	2.74		3.12(*)
# Kryo			8.17/+50%	4.47		4.65(*)
# ThunderX2		7.22/+48%	5.64		4.10
#
# (*)	slower than 4+1 :-(

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
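# (The script is typically invoked as, e.g.,
# "perl chacha-armv8.pl linux64 chacha-armv8.S"; $flavour is simply
# passed through to arm-xlate.pl below.)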

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
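# Any call to an undefined sub, e.g. &add_32(...), is routed through
# the AUTOLOAD thunk above: "_" in the sub name becomes ".", a numeric
# last argument gets an immediate "#" prefix, and the instruction is
# appended to $code. So &add_32("x5","x5","x6") emits "add.32 x5,x5,x6"
# and &ror_32("x9","x9",16) emits "ror.32 x9,x9,#16"; the ".32" suffix
# is rewritten to w-register form by the post-processing loop at the
# bottom of this file.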

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

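# ROUND() emits one ChaCha round as four quarter-rounds running in
# parallel on the scalar registers. Each quarter-round on (a,b,c,d) is
#
#	a += b; d ^= a; d <<<= 16;
#	c += d; b ^= c; b <<<= 12;
#	a += b; d ^= a; d <<<=  8;
#	c += d; b ^= c; b <<<=  7;
#
# with the left rotates realized as "ror" by 32-16=16, 32-12=20,
# 32-8=24 and 32-7=25. The index arithmetic below rotates each input
# index within its row of four, so that ROUND(0,4,8,12) covers the
# columns (0,4,8,12)..(3,7,11,15) and ROUND(0,5,10,15) the diagonals.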
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}

$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
#endif

.rodata

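// .Lsigma is the "expand 32-byte k" constant in endian-neutral form,
// .Lone holds the per-lane counter increments {1,2,3,4} for the 4-way
// NEON code path, and .Lrot24 is a tbl index vector that rotates each
// 32-bit lane left by 8 bits (i.e. right by 24).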
.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,2,3,4
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"

.text

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	cmp	$len,#192
	b.lo	.Lshort

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__AARCH64EB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
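# Each pass through .Loop evaluates a column round followed by a
# diagonal round, i.e. one ChaCha double round, so the counter of 10
# set up above yields the full 20 rounds.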
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len
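// The tail is XORed bytewise: input, output and the key-stream block
// on the stack are addressed with the same negative index, which is
// incremented until it reaches zero. The output pointer is
// pre-decremented above because the index is incremented before the
// strb.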

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my @K = map("v$_.4s",(0..3));
my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;

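# In the 4+1 code path below four blocks are kept in NEON registers in
# "transposed" form: each of the sixteen @X vectors holds one of the
# sixteen state words for four consecutive blocks, one block per
# 32-bit lane. NEON_lane_ROUND() therefore mirrors ROUND() above, with
# the rotates built from ushr+sli pairs (shift right, then
# shift-left-and-insert), except for the rotate by 16, which is rev32
# on 16-bit elements, and the rotate by 8, which is a tbl byte shuffle
# through .Lrot24.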
sub NEON_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("'$_'",@X);

	(
	"&add		(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	 "&add		(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	  "&add		(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	   "&add	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&eor		(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor		(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor		(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor	(@x[$d3],@x[$d3],@x[$a3])",
	"&rev32_16	(@x[$d0],@x[$d0])",
	 "&rev32_16	(@x[$d1],@x[$d1])",
	  "&rev32_16	(@x[$d2],@x[$d2])",
	   "&rev32_16	(@x[$d3],@x[$d3])",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	 "&add		(@x[$c1],@x[$c1],@x[$d1])",
	  "&add		(@x[$c2],@x[$c2],@x[$d2])",
	   "&add	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	 "&eor		('$xt1',@x[$b1],@x[$c1])",
	  "&eor		('$xt2',@x[$b2],@x[$c2])",
	   "&eor	('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',20)",
	 "&ushr		(@x[$b1],'$xt1',20)",
	  "&ushr	(@x[$b2],'$xt2',20)",
	   "&ushr	(@x[$b3],'$xt3',20)",
	"&sli		(@x[$b0],'$xt0',12)",
	 "&sli		(@x[$b1],'$xt1',12)",
	  "&sli		(@x[$b2],'$xt2',12)",
	   "&sli	(@x[$b3],'$xt3',12)",

	"&add		(@x[$a0],@x[$a0],@x[$b0])",
	 "&add		(@x[$a1],@x[$a1],@x[$b1])",
	  "&add		(@x[$a2],@x[$a2],@x[$b2])",
	   "&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor		('$xt0',@x[$d0],@x[$a0])",
	 "&eor		('$xt1',@x[$d1],@x[$a1])",
	  "&eor		('$xt2',@x[$d2],@x[$a2])",
	   "&eor	('$xt3',@x[$d3],@x[$a3])",
	"&tbl		(@x[$d0],'{$xt0}','$ROT24')",
	 "&tbl		(@x[$d1],'{$xt1}','$ROT24')",
	  "&tbl		(@x[$d2],'{$xt2}','$ROT24')",
	   "&tbl	(@x[$d3],'{$xt3}','$ROT24')",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	 "&add		(@x[$c1],@x[$c1],@x[$d1])",
	  "&add		(@x[$c2],@x[$c2],@x[$d2])",
	   "&add	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	 "&eor		('$xt1',@x[$b1],@x[$c1])",
	  "&eor		('$xt2',@x[$b2],@x[$c2])",
	   "&eor	('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',25)",
	 "&ushr		(@x[$b1],'$xt1',25)",
	  "&ushr	(@x[$b2],'$xt2',25)",
	   "&ushr	(@x[$b3],'$xt3',25)",
	"&sli		(@x[$b0],'$xt0',7)",
	 "&sli		(@x[$b1],'$xt1',7)",
	  "&sli		(@x[$b2],'$xt2',7)",
	   "&sli	(@x[$b3],'$xt3',7)"
	);
}

$code.=<<___;

#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.LChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{$CTR,$ROT24},[@x[0]]
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer_neon:
	dup	$xa0,@{K[0]}[0]			// unpack key block
	 mov.32	@x[0],@d[0]
	dup	$xa1,@{K[0]}[1]
	 lsr	@x[1],@d[0],#32
	dup	$xa2,@{K[0]}[2]
	 mov.32	@x[2],@d[1]
	dup	$xa3,@{K[0]}[3]
	 lsr	@x[3],@d[1],#32
	dup	$xb0,@{K[1]}[0]
	 mov.32	@x[4],@d[2]
	dup	$xb1,@{K[1]}[1]
	 lsr	@x[5],@d[2],#32
	dup	$xb2,@{K[1]}[2]
	 mov.32	@x[6],@d[3]
	dup	$xb3,@{K[1]}[3]
	 lsr	@x[7],@d[3],#32
	dup	$xd0,@{K[3]}[0]
	 mov.32	@x[8],@d[4]
	dup	$xd1,@{K[3]}[1]
	 lsr	@x[9],@d[4],#32
	dup	$xd2,@{K[3]}[2]
	 mov.32	@x[10],@d[5]
	dup	$xd3,@{K[3]}[3]
	 lsr	@x[11],@d[5],#32
	add	$xd0,$xd0,$CTR
	 mov.32	@x[12],@d[6]
	dup	$xc0,@{K[2]}[0]
	 lsr	@x[13],@d[6],#32
	dup	$xc1,@{K[2]}[1]
	 mov.32	@x[14],@d[7]
	dup	$xc2,@{K[2]}[2]
	 lsr	@x[15],@d[7],#32
	dup	$xc3,@{K[2]}[3]

	mov	$ctr,#10
	subs	$len,$len,#320
.Loop_neon:
	sub	$ctr,$ctr,#1
___
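	# The fifth block is processed in the scalar registers: after
	# each NEON instruction generated by NEON_lane_ROUND() one IALU
	# instruction from ROUND() is evaluated, interleaving the two
	# pipelines ("4+1").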
	my @plus_one=&ROUND(0,4,8,12);
	foreach (&NEON_lane_ROUND(0,4,8,12))  { eval; eval(shift(@plus_one)); }

	@plus_one=&ROUND(0,5,10,15);
	foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add	$xd0,$xd0,$CTR

	zip1	$xt0,$xa0,$xa1			// transpose data
	zip1	$xt1,$xa2,$xa3
	zip2	$xt2,$xa0,$xa1
	zip2	$xt3,$xa2,$xa3
	zip1.64	$xa0,$xt0,$xt1
	zip2.64	$xa1,$xt0,$xt1
	zip1.64	$xa2,$xt2,$xt3
	zip2.64	$xa3,$xt2,$xt3

	zip1	$xt0,$xb0,$xb1
	zip1	$xt1,$xb2,$xb3
	zip2	$xt2,$xb0,$xb1
	zip2	$xt3,$xb2,$xb3
	zip1.64	$xb0,$xt0,$xt1
	zip2.64	$xb1,$xt0,$xt1
	zip1.64	$xb2,$xt2,$xt3
	zip2.64	$xb3,$xt2,$xt3

	zip1	$xt0,$xc0,$xc1
	 add.32	@x[0],@x[0],@d[0]		// accumulate key block
	zip1	$xt1,$xc2,$xc3
	 add	@x[1],@x[1],@d[0],lsr#32
	zip2	$xt2,$xc0,$xc1
	 add.32	@x[2],@x[2],@d[1]
	zip2	$xt3,$xc2,$xc3
	 add	@x[3],@x[3],@d[1],lsr#32
	zip1.64	$xc0,$xt0,$xt1
	 add.32	@x[4],@x[4],@d[2]
	zip2.64	$xc1,$xt0,$xt1
	 add	@x[5],@x[5],@d[2],lsr#32
	zip1.64	$xc2,$xt2,$xt3
	 add.32	@x[6],@x[6],@d[3]
	zip2.64	$xc3,$xt2,$xt3
	 add	@x[7],@x[7],@d[3],lsr#32

	zip1	$xt0,$xd0,$xd1
	 add.32	@x[8],@x[8],@d[4]
	zip1	$xt1,$xd2,$xd3
	 add	@x[9],@x[9],@d[4],lsr#32
	zip2	$xt2,$xd0,$xd1
	 add.32	@x[10],@x[10],@d[5]
	zip2	$xt3,$xd2,$xd3
	 add	@x[11],@x[11],@d[5],lsr#32
	zip1.64	$xd0,$xt0,$xt1
	 add.32	@x[12],@x[12],@d[6]
	zip2.64	$xd1,$xt0,$xt1
	 add	@x[13],@x[13],@d[6],lsr#32
	zip1.64	$xd2,$xt2,$xt3
	 add.32	@x[14],@x[14],@d[7]
	zip2.64	$xd3,$xt2,$xt3
	 add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$xa0,$xa0,@K[0]			// accumulate key block
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$xb0,$xb0,@K[1]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$xc0,$xc0,@K[2]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$xd0,$xd0,@K[3]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	 add	$xa1,$xa1,@K[0]
	eor	@x[2],@x[2],@x[3]
	 add	$xb1,$xb1,@K[1]
	eor	@x[4],@x[4],@x[5]
	 add	$xc1,$xc1,@K[2]
	eor	@x[6],@x[6],@x[7]
	 add	$xd1,$xd1,@K[3]
	eor	@x[8],@x[8],@x[9]
	 eor	$xa0,$xa0,$xt0
	 movi	$xt0,#5
	eor	@x[10],@x[10],@x[11]
	 eor	$xb0,$xb0,$xt1
	eor	@x[12],@x[12],@x[13]
	 eor	$xc0,$xc0,$xt2
	eor	@x[14],@x[14],@x[15]
	 eor	$xd0,$xd0,$xt3
	 add	$CTR,$CTR,$xt0			// += 5
	 ld1.8	{$xt0-$xt3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#5			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	st1.8	{$xa0-$xd0},[$out],#64
	 add	$xa2,$xa2,@K[0]
	 add	$xb2,$xb2,@K[1]
	 add	$xc2,$xc2,@K[2]
	 add	$xd2,$xd2,@K[3]
	ld1.8	{$xa0-$xd0},[$inp],#64

	eor	$xa1,$xa1,$xt0
	eor	$xb1,$xb1,$xt1
	eor	$xc1,$xc1,$xt2
	eor	$xd1,$xd1,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	 add	$xa3,$xa3,@K[0]
	 add	$xb3,$xb3,@K[1]
	 add	$xc3,$xc3,@K[2]
	 add	$xd3,$xd3,@K[3]
	ld1.8	{$xa1-$xd1},[$inp],#64

	eor	$xa2,$xa2,$xa0
	eor	$xb2,$xb2,$xb0
	eor	$xc2,$xc2,$xc0
	eor	$xd2,$xd2,$xd0
	st1.8	{$xa2-$xd2},[$out],#64

	eor	$xa3,$xa3,$xa1
	eor	$xb3,$xb3,$xb1
	eor	$xc3,$xc3,$xc1
	eor	$xd3,$xd3,$xd1
	st1.8	{$xa3-$xd3},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.align	4
.Ltail_neon:
	add	$len,$len,#320
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	$xa0,$xa0,@K[0]			// accumulate key block
	stp	@x[4],@x[6],[$out,#16]
	 add	$xb0,$xb0,@K[1]
	stp	@x[8],@x[10],[$out,#32]
	 add	$xc0,$xc0,@K[2]
	stp	@x[12],@x[14],[$out,#48]
	 add	$xd0,$xd0,@K[3]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa0,$xa0,$xt0
	eor	$xb0,$xb0,$xt1
	eor	$xc0,$xc0,$xt2
	eor	$xd0,$xd0,$xt3
	st1.8	{$xa0-$xd0},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa1,@K[0]
	add	$xb0,$xb1,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc1,@K[2]
	cmp	$len,#64
	add	$xd0,$xd1,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa1,$xa0,$xt0
	eor	$xb1,$xb0,$xt1
	eor	$xc1,$xc0,$xt2
	eor	$xd1,$xd0,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa2,@K[0]
	add	$xb0,$xb2,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc2,@K[2]
	cmp	$len,#64
	add	$xd0,$xd2,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa2,$xa0,$xt0
	eor	$xb2,$xb0,$xt1
	eor	$xc2,$xc0,$xt2
	eor	$xd2,$xd0,$xt3
	st1.8	{$xa2-$xd2},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa3,@K[0]
	add	$xb0,$xb3,@K[1]
	add	$xc0,$xc3,@K[2]
	add	$xd0,$xd3,@K[3]
	sub	$len,$len,#64

.Last_neon:
	st1.8	{$xa0-$xd0},[sp]

	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my @K = map("v$_.4s",(0..6));
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
my $rot24 = @K[6];
my $ONE = "v7.4s";

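# ChaCha20_512_neon keeps whole blocks within single NEON registers
# instead: each (a,b,c,d) register quadruple is one block, and the
# diagonalization between the two halves of a double round is done by
# rotating the b, c and d vectors with "ext" after each half (by 4, 8
# and 12 bytes, reversed on the odd half).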
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&tbl		('$d','{$d}','$rot24')",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	$ONE,$ONE,$ONE
	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE}[0],[@x[0]]
	add	$key,@x[0],#16			// .Lrot24
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4
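// The counter lane of the fourth key-block vector is advanced by 2
// ("not typo") because the two IALU blocks cover counter+0 and
// counter+1; the six NEON blocks then use counter+2 through
// counter+7.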

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo
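// The length is biased by 512 up front ("not typo") so that the
// "subs ... b.hs" pair inside the loop keeps iterating only while at
// least another full 512-byte chunk remains; it is restored with
// "adds" once the loop exits.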

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 stp	@K[5],@K[6],[sp,#80]

	mov	$ctr,#5
	ld1	{$rot24},[$key]
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
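	# Six NEON blocks and two IALU blocks ("6+2") are processed per
	# outer iteration. The six NEON instruction streams are
	# interleaved with each other and with the scalar stream from
	# ROUND(); the scalar registers run at twice the round rate, so
	# they retire one block per 5-iteration half-loop
	# (.Loop_upper_neon/.Loop_lower_neon) while each NEON block
	# spans both halves.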
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 ldr	@K[6],[sp,#96]
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$ONE,$ONE,#1			// 4 -> 2

	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],@K[0],[sp,#0]		// wipe off-load area
	stp	@K[0],@K[0],[sp,#32]
	stp	@K[0],@K[0],[sp,#64]

	b.eq	.Ldone_512_neon

	sub	$key,$key,#16			// .Lone
	cmp	$len,#192
	add	sp,sp,#128
	sub	@K[3],@K[3],$ONE		// -= 2
	ld1	{$CTR,$ROT24},[$key]
	b.hs	.Loop_outer_neon

	ldp	d8,d9,[sp,#0]			// meet ABI requirements
	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

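	# Translate the pseudo-syntax used above into real AArch64
	# mnemonics: ".32" ops move to w-registers, logical/permute ops
	# (eor, ext, mov, tbl) use .16b, ld1/st1 ".8" uses .16b,
	# ldr/ldp/str/stp use q-registers, lane ops keep ".s[n]",
	# "zip.64" uses .2d and "rev32.16" uses .8h.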
	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1))	or
	(s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush