1#! /usr/bin/env perl
2# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44#		CBC enc		CBC dec		CTR
45# Apple A7	2.39		1.20		1.20
46# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
47# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
48# Cortex-A72	1.33		0.85/0.88	0.92/0.96
49# Denver	1.96		0.65/0.86	0.76/0.80
50# Mongoose	1.33		1.23/1.20	1.30/1.20
51# Kryo		1.26		0.87/0.94	1.00/1.00
52# ThunderX2	5.95		1.25		1.30
53#
54# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
55#	and are still same even for updated module;
56# (**)	numbers after slash are for 32-bit code, which is 3x-
57#	interleaved;
58
59# $output is the last argument if it looks like a file (it has an extension)
60# $flavour is the first argument if it doesn't look like a file
61$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
62$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
63
64$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
66( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
67die "can't locate arm-xlate.pl";
68
69open OUT,"| \"$^X\" $xlate $flavour \"$output\""
70    or die "can't call $xlate: $!";
71*STDOUT=*OUT;
72
73$prefix="aes_v8";
74
75$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
76
77$code=<<___;
78#include "arm_arch.h"
79
80#if __ARM_MAX_ARCH__>=7
81___
82$code.=".arch	armv8-a+crypto\n.text\n"		if ($flavour =~ /64/);
83$code.=<<___						if ($flavour !~ /64/);
84.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
85.fpu	neon
86#ifdef	__thumb2__
87.syntax	unified
88.thumb
89# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
90#else
91.code	32
92# define INST(a,b,c,d)	$_byte	a,b,c,d
93#endif
94
95.text
96___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
101# transliterate common code to either flavour with regex vodoo.
102#
103{{{
104my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
105my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
107
108
109#
110# This file generates .s file for 64-bit and 32-bit CPUs.
111# We don't implement .rodata on 32-bit CPUs yet.
112#
113$code.=".rodata\n"	if ($flavour =~ /64/);
114$code.=<<___;
115.align	5
116.Lrcon:
117.long	0x01,0x01,0x01,0x01
118.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
119.long	0x1b,0x1b,0x1b,0x1b
120___
121$code.=".previous\n"	if ($flavour =~ /64/);
122
123$code.=<<___;
124.globl	${prefix}_set_encrypt_key
125.type	${prefix}_set_encrypt_key,%function
126.align	5
127${prefix}_set_encrypt_key:
128.Lenc_key:
129___
130$code.=<<___	if ($flavour =~ /64/);
131	stp	x29,x30,[sp,#-16]!
132	add	x29,sp,#0
133___
134$code.=<<___;
135	mov	$ptr,#-1
136	cmp	$inp,#0
137	b.eq	.Lenc_key_abort
138	cmp	$out,#0
139	b.eq	.Lenc_key_abort
140	mov	$ptr,#-2
141	cmp	$bits,#128
142	b.lt	.Lenc_key_abort
143	cmp	$bits,#256
144	b.gt	.Lenc_key_abort
145	tst	$bits,#0x3f
146	b.ne	.Lenc_key_abort
147
148___
149$code.=<<___	if ($flavour =~ /64/);
150	adrp	$ptr,.Lrcon
151	add	$ptr,$ptr,:lo12:.Lrcon
152___
153$code.=<<___	if ($flavour =~ /32/);
154	adr	$ptr,.Lrcon
155___
156$code.=<<___;
157	cmp	$bits,#192
158
159	veor	$zero,$zero,$zero
160	vld1.8	{$in0},[$inp],#16
161	mov	$bits,#8		// reuse $bits
162	vld1.32	{$rcon,$mask},[$ptr],#32
163
164	b.lt	.Loop128
165	b.eq	.L192
166	b	.L256
167
168.align	4
169.Loop128:
170	vtbl.8	$key,{$in0},$mask
171	vext.8	$tmp,$zero,$in0,#12
172	vst1.32	{$in0},[$out],#16
173	aese	$key,$zero
174	subs	$bits,$bits,#1
175
176	veor	$in0,$in0,$tmp
177	vext.8	$tmp,$zero,$tmp,#12
178	veor	$in0,$in0,$tmp
179	vext.8	$tmp,$zero,$tmp,#12
180	 veor	$key,$key,$rcon
181	veor	$in0,$in0,$tmp
182	vshl.u8	$rcon,$rcon,#1
183	veor	$in0,$in0,$key
184	b.ne	.Loop128
185
186	vld1.32	{$rcon},[$ptr]
187
188	vtbl.8	$key,{$in0},$mask
189	vext.8	$tmp,$zero,$in0,#12
190	vst1.32	{$in0},[$out],#16
191	aese	$key,$zero
192
193	veor	$in0,$in0,$tmp
194	vext.8	$tmp,$zero,$tmp,#12
195	veor	$in0,$in0,$tmp
196	vext.8	$tmp,$zero,$tmp,#12
197	 veor	$key,$key,$rcon
198	veor	$in0,$in0,$tmp
199	vshl.u8	$rcon,$rcon,#1
200	veor	$in0,$in0,$key
201
202	vtbl.8	$key,{$in0},$mask
203	vext.8	$tmp,$zero,$in0,#12
204	vst1.32	{$in0},[$out],#16
205	aese	$key,$zero
206
207	veor	$in0,$in0,$tmp
208	vext.8	$tmp,$zero,$tmp,#12
209	veor	$in0,$in0,$tmp
210	vext.8	$tmp,$zero,$tmp,#12
211	 veor	$key,$key,$rcon
212	veor	$in0,$in0,$tmp
213	veor	$in0,$in0,$key
214	vst1.32	{$in0},[$out]
215	add	$out,$out,#0x50
216
217	mov	$rounds,#10
218	b	.Ldone
219
220.align	4
221.L192:
222	vld1.8	{$in1},[$inp],#8
223	vmov.i8	$key,#8			// borrow $key
224	vst1.32	{$in0},[$out],#16
225	vsub.i8	$mask,$mask,$key	// adjust the mask
226
227.Loop192:
228	vtbl.8	$key,{$in1},$mask
229	vext.8	$tmp,$zero,$in0,#12
230#ifdef __ARMEB__
231	vst1.32	{$in1},[$out],#16
232	sub	$out,$out,#8
233#else
234	vst1.32	{$in1},[$out],#8
235#endif
236	aese	$key,$zero
237	subs	$bits,$bits,#1
238
239	veor	$in0,$in0,$tmp
240	vext.8	$tmp,$zero,$tmp,#12
241	veor	$in0,$in0,$tmp
242	vext.8	$tmp,$zero,$tmp,#12
243	veor	$in0,$in0,$tmp
244
245	vdup.32	$tmp,${in0}[3]
246	veor	$tmp,$tmp,$in1
247	 veor	$key,$key,$rcon
248	vext.8	$in1,$zero,$in1,#12
249	vshl.u8	$rcon,$rcon,#1
250	veor	$in1,$in1,$tmp
251	veor	$in0,$in0,$key
252	veor	$in1,$in1,$key
253	vst1.32	{$in0},[$out],#16
254	b.ne	.Loop192
255
256	mov	$rounds,#12
257	add	$out,$out,#0x20
258	b	.Ldone
259
260.align	4
261.L256:
262	vld1.8	{$in1},[$inp]
263	mov	$bits,#7
264	mov	$rounds,#14
265	vst1.32	{$in0},[$out],#16
266
267.Loop256:
268	vtbl.8	$key,{$in1},$mask
269	vext.8	$tmp,$zero,$in0,#12
270	vst1.32	{$in1},[$out],#16
271	aese	$key,$zero
272	subs	$bits,$bits,#1
273
274	veor	$in0,$in0,$tmp
275	vext.8	$tmp,$zero,$tmp,#12
276	veor	$in0,$in0,$tmp
277	vext.8	$tmp,$zero,$tmp,#12
278	 veor	$key,$key,$rcon
279	veor	$in0,$in0,$tmp
280	vshl.u8	$rcon,$rcon,#1
281	veor	$in0,$in0,$key
282	vst1.32	{$in0},[$out],#16
283	b.eq	.Ldone
284
285	vdup.32	$key,${in0}[3]		// just splat
286	vext.8	$tmp,$zero,$in1,#12
287	aese	$key,$zero
288
289	veor	$in1,$in1,$tmp
290	vext.8	$tmp,$zero,$tmp,#12
291	veor	$in1,$in1,$tmp
292	vext.8	$tmp,$zero,$tmp,#12
293	veor	$in1,$in1,$tmp
294
295	veor	$in1,$in1,$key
296	b	.Loop256
297
298.Ldone:
299	str	$rounds,[$out]
300	mov	$ptr,#0
301
302.Lenc_key_abort:
303	mov	x0,$ptr			// return value
304	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
305	ret
306.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
307
308.globl	${prefix}_set_decrypt_key
309.type	${prefix}_set_decrypt_key,%function
310.align	5
311${prefix}_set_decrypt_key:
312___
313$code.=<<___	if ($flavour =~ /64/);
314	.inst	0xd503233f		// paciasp
315	stp	x29,x30,[sp,#-16]!
316	add	x29,sp,#0
317___
318$code.=<<___	if ($flavour !~ /64/);
319	stmdb	sp!,{r4,lr}
320___
321$code.=<<___;
322	bl	.Lenc_key
323
324	cmp	x0,#0
325	b.ne	.Ldec_key_abort
326
327	sub	$out,$out,#240		// restore original $out
328	mov	x4,#-16
329	add	$inp,$out,x12,lsl#4	// end of key schedule
330
331	vld1.32	{v0.16b},[$out]
332	vld1.32	{v1.16b},[$inp]
333	vst1.32	{v0.16b},[$inp],x4
334	vst1.32	{v1.16b},[$out],#16
335
336.Loop_imc:
337	vld1.32	{v0.16b},[$out]
338	vld1.32	{v1.16b},[$inp]
339	aesimc	v0.16b,v0.16b
340	aesimc	v1.16b,v1.16b
341	vst1.32	{v0.16b},[$inp],x4
342	vst1.32	{v1.16b},[$out],#16
343	cmp	$inp,$out
344	b.hi	.Loop_imc
345
346	vld1.32	{v0.16b},[$out]
347	aesimc	v0.16b,v0.16b
348	vst1.32	{v0.16b},[$inp]
349
350	eor	x0,x0,x0		// return value
351.Ldec_key_abort:
352___
353$code.=<<___	if ($flavour !~ /64/);
354	ldmia	sp!,{r4,pc}
355___
356$code.=<<___	if ($flavour =~ /64/);
357	ldp	x29,x30,[sp],#16
358	.inst	0xd50323bf		// autiasp
359	ret
360___
361$code.=<<___;
362.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
363___
364}}}
365{{{
366sub gen_block () {
367my $dir = shift;
368my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
369my ($inp,$out,$key)=map("x$_",(0..2));
370my $rounds="w3";
371my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
372
373$code.=<<___;
374.globl	${prefix}_${dir}crypt
375.type	${prefix}_${dir}crypt,%function
376.align	5
377${prefix}_${dir}crypt:
378	ldr	$rounds,[$key,#240]
379	vld1.32	{$rndkey0},[$key],#16
380	vld1.8	{$inout},[$inp]
381	sub	$rounds,$rounds,#2
382	vld1.32	{$rndkey1},[$key],#16
383
384.Loop_${dir}c:
385	aes$e	$inout,$rndkey0
386	aes$mc	$inout,$inout
387	vld1.32	{$rndkey0},[$key],#16
388	subs	$rounds,$rounds,#2
389	aes$e	$inout,$rndkey1
390	aes$mc	$inout,$inout
391	vld1.32	{$rndkey1},[$key],#16
392	b.gt	.Loop_${dir}c
393
394	aes$e	$inout,$rndkey0
395	aes$mc	$inout,$inout
396	vld1.32	{$rndkey0},[$key]
397	aes$e	$inout,$rndkey1
398	veor	$inout,$inout,$rndkey0
399
400	vst1.8	{$inout},[$out]
401	ret
402.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
403___
404}
405&gen_block("en");
406&gen_block("de");
407}}}
408
409# Performance in cycles per byte.
410# Processed with AES-ECB different key size.
411# It shows the value before and after optimization as below:
412# (before/after):
413#
414#		AES-128-ECB		AES-192-ECB		AES-256-ECB
415# Cortex-A57	1.85/0.82		2.16/0.96		2.47/1.10
416# Cortex-A72	1.64/0.85		1.82/0.99		2.13/1.14
417
418# Optimization is implemented by loop unrolling and interleaving.
419# Commonly, we choose the unrolling factor as 5, if the input
420# data size smaller than 5 blocks, but not smaller than 3 blocks,
421# choose 3 as the unrolling factor.
422# If the input data size dsize >= 5*16 bytes, then take 5 blocks
423# as one iteration, every loop the left size lsize -= 5*16.
424# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
425# every loop lsize -=3*16.
426# If lsize < 3*16 bytes, treat them as the tail, interleave the
427# two blocks AES instructions.
428# There is one special case, if the original input data size dsize
429# = 16 bytes, we will treat it seperately to improve the
430# performance: one independent code block without LR, FP load and
431# store, just looks like what the original ECB implementation does.
432
433{{{
434my ($inp,$out,$len,$key)=map("x$_",(0..3));
435my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
436my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
437
438my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
439
440### q7	last round key
441### q10-q15	q7 Last 7 round keys
442### q8-q9	preloaded round keys except last 7 keys for big size
443### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte
444
445{
446my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
447
448my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
449my ($dat4,$in4,$tmp4);
450if ($flavour =~ /64/) {
451    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
452}
453
454$code.=<<___;
455.globl	${prefix}_ecb_encrypt
456.type	${prefix}_ecb_encrypt,%function
457.align	5
458${prefix}_ecb_encrypt:
459___
460$code.=<<___	if ($flavour =~ /64/);
461	subs	$len,$len,#16
462	// Original input data size bigger than 16, jump to big size processing.
463	b.ne    .Lecb_big_size
464	vld1.8	{$dat0},[$inp]
465	cmp	$enc,#0					// en- or decrypting?
466	ldr	$rounds,[$key,#240]
467	vld1.32	{q5-q6},[$key],#32			// load key schedule...
468
469	b.eq .Lecb_small_dec
470	aese	$dat0,q5
471	aesmc	$dat0,$dat0
472	vld1.32	{q8-q9},[$key],#32			// load key schedule...
473	aese	$dat0,q6
474	aesmc	$dat0,$dat0
475	subs	$rounds,$rounds,#10			// if rounds==10, jump to aes-128-ecb processing
476	b.eq    .Lecb_128_enc
477.Lecb_round_loop:
478	aese	$dat0,q8
479	aesmc	$dat0,$dat0
480	vld1.32	{q8},[$key],#16				// load key schedule...
481	aese	$dat0,q9
482	aesmc	$dat0,$dat0
483	vld1.32	{q9},[$key],#16				// load key schedule...
484	subs	$rounds,$rounds,#2			// bias
485	b.gt    .Lecb_round_loop
486.Lecb_128_enc:
487	vld1.32	{q10-q11},[$key],#32		// load key schedule...
488	aese	$dat0,q8
489	aesmc	$dat0,$dat0
490	aese	$dat0,q9
491	aesmc	$dat0,$dat0
492	vld1.32	{q12-q13},[$key],#32		// load key schedule...
493	aese	$dat0,q10
494	aesmc	$dat0,$dat0
495	aese	$dat0,q11
496	aesmc	$dat0,$dat0
497	vld1.32	{q14-q15},[$key],#32		// load key schedule...
498	aese	$dat0,q12
499	aesmc	$dat0,$dat0
500	aese	$dat0,q13
501	aesmc	$dat0,$dat0
502	vld1.32	{$rndlast},[$key]
503	aese	$dat0,q14
504	aesmc	$dat0,$dat0
505	aese	$dat0,q15
506	veor	$dat0,$dat0,$rndlast
507	vst1.8	{$dat0},[$out]
508	b	.Lecb_Final_abort
509.Lecb_small_dec:
510	aesd	$dat0,q5
511	aesimc	$dat0,$dat0
512	vld1.32	{q8-q9},[$key],#32			// load key schedule...
513	aesd	$dat0,q6
514	aesimc	$dat0,$dat0
515	subs	$rounds,$rounds,#10			// bias
516	b.eq    .Lecb_128_dec
517.Lecb_dec_round_loop:
518	aesd	$dat0,q8
519	aesimc	$dat0,$dat0
520	vld1.32	{q8},[$key],#16				// load key schedule...
521	aesd	$dat0,q9
522	aesimc	$dat0,$dat0
523	vld1.32	{q9},[$key],#16				// load key schedule...
524	subs	$rounds,$rounds,#2			// bias
525	b.gt    .Lecb_dec_round_loop
526.Lecb_128_dec:
527	vld1.32	{q10-q11},[$key],#32		// load key schedule...
528	aesd	$dat0,q8
529	aesimc	$dat0,$dat0
530	aesd	$dat0,q9
531	aesimc	$dat0,$dat0
532	vld1.32	{q12-q13},[$key],#32		// load key schedule...
533	aesd	$dat0,q10
534	aesimc	$dat0,$dat0
535	aesd	$dat0,q11
536	aesimc	$dat0,$dat0
537	vld1.32	{q14-q15},[$key],#32		// load key schedule...
538	aesd	$dat0,q12
539	aesimc	$dat0,$dat0
540	aesd	$dat0,q13
541	aesimc	$dat0,$dat0
542	vld1.32	{$rndlast},[$key]
543	aesd	$dat0,q14
544	aesimc	$dat0,$dat0
545	aesd	$dat0,q15
546	veor	$dat0,$dat0,$rndlast
547	vst1.8	{$dat0},[$out]
548	b	.Lecb_Final_abort
549.Lecb_big_size:
550___
551$code.=<<___	if ($flavour =~ /64/);
552	stp	x29,x30,[sp,#-16]!
553	add	x29,sp,#0
554___
555$code.=<<___	if ($flavour !~ /64/);
556	mov	ip,sp
557	stmdb	sp!,{r4-r8,lr}
558	vstmdb	sp!,{d8-d15}			@ ABI specification says so
559	ldmia	ip,{r4-r5}			@ load remaining args
560	subs	$len,$len,#16
561___
562$code.=<<___;
563	mov	$step,#16
564	b.lo	.Lecb_done
565	cclr	$step,eq
566
567	cmp	$enc,#0					// en- or decrypting?
568	ldr	$rounds,[$key,#240]
569	and	$len,$len,#-16
570	vld1.8	{$dat},[$inp],$step
571
572	vld1.32	{q8-q9},[$key]				// load key schedule...
573	sub	$rounds,$rounds,#6
574	add	$key_,$key,x5,lsl#4				// pointer to last 7 round keys
575	sub	$rounds,$rounds,#2
576	vld1.32	{q10-q11},[$key_],#32
577	vld1.32	{q12-q13},[$key_],#32
578	vld1.32	{q14-q15},[$key_],#32
579	vld1.32	{$rndlast},[$key_]
580
581	add	$key_,$key,#32
582	mov	$cnt,$rounds
583	b.eq	.Lecb_dec
584
585	vld1.8	{$dat1},[$inp],#16
586	subs	$len,$len,#32				// bias
587	add	$cnt,$rounds,#2
588	vorr	$in1,$dat1,$dat1
589	vorr	$dat2,$dat1,$dat1
590	vorr	$dat1,$dat,$dat
591	b.lo	.Lecb_enc_tail
592
593	vorr	$dat1,$in1,$in1
594	vld1.8	{$dat2},[$inp],#16
595___
596$code.=<<___	if ($flavour =~ /64/);
597	cmp	$len,#32
598	b.lo	.Loop3x_ecb_enc
599
600	vld1.8	{$dat3},[$inp],#16
601	vld1.8	{$dat4},[$inp],#16
602	sub	$len,$len,#32				// bias
603	mov	$cnt,$rounds
604
605.Loop5x_ecb_enc:
606	aese	$dat0,q8
607	aesmc	$dat0,$dat0
608	aese	$dat1,q8
609	aesmc	$dat1,$dat1
610	aese	$dat2,q8
611	aesmc	$dat2,$dat2
612	aese	$dat3,q8
613	aesmc	$dat3,$dat3
614	aese	$dat4,q8
615	aesmc	$dat4,$dat4
616	vld1.32	{q8},[$key_],#16
617	subs	$cnt,$cnt,#2
618	aese	$dat0,q9
619	aesmc	$dat0,$dat0
620	aese	$dat1,q9
621	aesmc	$dat1,$dat1
622	aese	$dat2,q9
623	aesmc	$dat2,$dat2
624	aese	$dat3,q9
625	aesmc	$dat3,$dat3
626	aese	$dat4,q9
627	aesmc	$dat4,$dat4
628	vld1.32	{q9},[$key_],#16
629	b.gt	.Loop5x_ecb_enc
630
631	aese	$dat0,q8
632	aesmc	$dat0,$dat0
633	aese	$dat1,q8
634	aesmc	$dat1,$dat1
635	aese	$dat2,q8
636	aesmc	$dat2,$dat2
637	aese	$dat3,q8
638	aesmc	$dat3,$dat3
639	aese	$dat4,q8
640	aesmc	$dat4,$dat4
641	cmp	$len,#0x40					// because .Lecb_enc_tail4x
642	sub	$len,$len,#0x50
643
644	aese	$dat0,q9
645	aesmc	$dat0,$dat0
646	aese	$dat1,q9
647	aesmc	$dat1,$dat1
648	aese	$dat2,q9
649	aesmc	$dat2,$dat2
650	aese	$dat3,q9
651	aesmc	$dat3,$dat3
652	aese	$dat4,q9
653	aesmc	$dat4,$dat4
654	csel	x6,xzr,$len,gt			// borrow x6, $cnt, "gt" is not typo
655	mov	$key_,$key
656
657	aese	$dat0,q10
658	aesmc	$dat0,$dat0
659	aese	$dat1,q10
660	aesmc	$dat1,$dat1
661	aese	$dat2,q10
662	aesmc	$dat2,$dat2
663	aese	$dat3,q10
664	aesmc	$dat3,$dat3
665	aese	$dat4,q10
666	aesmc	$dat4,$dat4
667	add	$inp,$inp,x6				// $inp is adjusted in such way that
668							// at exit from the loop $dat1-$dat4
669							// are loaded with last "words"
670	add	x6,$len,#0x60		    // because .Lecb_enc_tail4x
671
672	aese	$dat0,q11
673	aesmc	$dat0,$dat0
674	aese	$dat1,q11
675	aesmc	$dat1,$dat1
676	aese	$dat2,q11
677	aesmc	$dat2,$dat2
678	aese	$dat3,q11
679	aesmc	$dat3,$dat3
680	aese	$dat4,q11
681	aesmc	$dat4,$dat4
682
683	aese	$dat0,q12
684	aesmc	$dat0,$dat0
685	aese	$dat1,q12
686	aesmc	$dat1,$dat1
687	aese	$dat2,q12
688	aesmc	$dat2,$dat2
689	aese	$dat3,q12
690	aesmc	$dat3,$dat3
691	aese	$dat4,q12
692	aesmc	$dat4,$dat4
693
694	aese	$dat0,q13
695	aesmc	$dat0,$dat0
696	aese	$dat1,q13
697	aesmc	$dat1,$dat1
698	aese	$dat2,q13
699	aesmc	$dat2,$dat2
700	aese	$dat3,q13
701	aesmc	$dat3,$dat3
702	aese	$dat4,q13
703	aesmc	$dat4,$dat4
704
705	aese	$dat0,q14
706	aesmc	$dat0,$dat0
707	aese	$dat1,q14
708	aesmc	$dat1,$dat1
709	aese	$dat2,q14
710	aesmc	$dat2,$dat2
711	aese	$dat3,q14
712	aesmc	$dat3,$dat3
713	aese	$dat4,q14
714	aesmc	$dat4,$dat4
715
716	aese	$dat0,q15
717	vld1.8	{$in0},[$inp],#16
718	aese	$dat1,q15
719	vld1.8	{$in1},[$inp],#16
720	aese	$dat2,q15
721	vld1.8	{$in2},[$inp],#16
722	aese	$dat3,q15
723	vld1.8	{$in3},[$inp],#16
724	aese	$dat4,q15
725	vld1.8	{$in4},[$inp],#16
726	cbz	x6,.Lecb_enc_tail4x
727	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
728	veor	$tmp0,$rndlast,$dat0
729	vorr	$dat0,$in0,$in0
730	veor	$tmp1,$rndlast,$dat1
731	vorr	$dat1,$in1,$in1
732	veor	$tmp2,$rndlast,$dat2
733	vorr	$dat2,$in2,$in2
734	veor	$tmp3,$rndlast,$dat3
735	vorr	$dat3,$in3,$in3
736	veor	$tmp4,$rndlast,$dat4
737	vst1.8	{$tmp0},[$out],#16
738	vorr	$dat4,$in4,$in4
739	vst1.8	{$tmp1},[$out],#16
740	mov	$cnt,$rounds
741	vst1.8	{$tmp2},[$out],#16
742	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
743	vst1.8	{$tmp3},[$out],#16
744	vst1.8	{$tmp4},[$out],#16
745	b.hs	.Loop5x_ecb_enc
746
747	add	$len,$len,#0x50
748	cbz	$len,.Lecb_done
749
750	add	$cnt,$rounds,#2
751	subs	$len,$len,#0x30
752	vorr	$dat0,$in2,$in2
753	vorr	$dat1,$in3,$in3
754	vorr	$dat2,$in4,$in4
755	b.lo	.Lecb_enc_tail
756
757	b	.Loop3x_ecb_enc
758
759.align	4
760.Lecb_enc_tail4x:
761	veor	$tmp1,$rndlast,$dat1
762	veor	$tmp2,$rndlast,$dat2
763	veor	$tmp3,$rndlast,$dat3
764	veor	$tmp4,$rndlast,$dat4
765	vst1.8	{$tmp1},[$out],#16
766	vst1.8	{$tmp2},[$out],#16
767	vst1.8	{$tmp3},[$out],#16
768	vst1.8	{$tmp4},[$out],#16
769
770	b	.Lecb_done
771.align	4
772___
773$code.=<<___;
774.Loop3x_ecb_enc:
775	aese	$dat0,q8
776	aesmc	$dat0,$dat0
777	aese	$dat1,q8
778	aesmc	$dat1,$dat1
779	aese	$dat2,q8
780	aesmc	$dat2,$dat2
781	vld1.32	{q8},[$key_],#16
782	subs	$cnt,$cnt,#2
783	aese	$dat0,q9
784	aesmc	$dat0,$dat0
785	aese	$dat1,q9
786	aesmc	$dat1,$dat1
787	aese	$dat2,q9
788	aesmc	$dat2,$dat2
789	vld1.32	{q9},[$key_],#16
790	b.gt	.Loop3x_ecb_enc
791
792	aese	$dat0,q8
793	aesmc	$dat0,$dat0
794	aese	$dat1,q8
795	aesmc	$dat1,$dat1
796	aese	$dat2,q8
797	aesmc	$dat2,$dat2
798	subs	$len,$len,#0x30
799	mov.lo	x6,$len				// x6, $cnt, is zero at this point
800	aese	$dat0,q9
801	aesmc	$dat0,$dat0
802	aese	$dat1,q9
803	aesmc	$dat1,$dat1
804	aese	$dat2,q9
805	aesmc	$dat2,$dat2
806	add	$inp,$inp,x6			// $inp is adjusted in such way that
807						// at exit from the loop $dat1-$dat2
808						// are loaded with last "words"
809	mov	$key_,$key
810	aese	$dat0,q12
811	aesmc	$dat0,$dat0
812	aese	$dat1,q12
813	aesmc	$dat1,$dat1
814	aese	$dat2,q12
815	aesmc	$dat2,$dat2
816	vld1.8	{$in0},[$inp],#16
817	aese	$dat0,q13
818	aesmc	$dat0,$dat0
819	aese	$dat1,q13
820	aesmc	$dat1,$dat1
821	aese	$dat2,q13
822	aesmc	$dat2,$dat2
823	vld1.8	{$in1},[$inp],#16
824	aese	$dat0,q14
825	aesmc	$dat0,$dat0
826	aese	$dat1,q14
827	aesmc	$dat1,$dat1
828	aese	$dat2,q14
829	aesmc	$dat2,$dat2
830	vld1.8	{$in2},[$inp],#16
831	aese	$dat0,q15
832	aese	$dat1,q15
833	aese	$dat2,q15
834	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
835	add	$cnt,$rounds,#2
836	veor	$tmp0,$rndlast,$dat0
837	veor	$tmp1,$rndlast,$dat1
838	veor	$dat2,$dat2,$rndlast
839	vld1.32 {q9},[$key_],#16		// re-pre-load rndkey[1]
840	vst1.8	{$tmp0},[$out],#16
841	vorr	$dat0,$in0,$in0
842	vst1.8	{$tmp1},[$out],#16
843	vorr	$dat1,$in1,$in1
844	vst1.8	{$dat2},[$out],#16
845	vorr	$dat2,$in2,$in2
846	b.hs	.Loop3x_ecb_enc
847
848	cmn	$len,#0x30
849	b.eq	.Lecb_done
850	nop
851
852.Lecb_enc_tail:
853	aese	$dat1,q8
854	aesmc	$dat1,$dat1
855	aese	$dat2,q8
856	aesmc	$dat2,$dat2
857	vld1.32	{q8},[$key_],#16
858	subs	$cnt,$cnt,#2
859	aese	$dat1,q9
860	aesmc	$dat1,$dat1
861	aese	$dat2,q9
862	aesmc	$dat2,$dat2
863	vld1.32	{q9},[$key_],#16
864	b.gt	.Lecb_enc_tail
865
866	aese	$dat1,q8
867	aesmc	$dat1,$dat1
868	aese	$dat2,q8
869	aesmc	$dat2,$dat2
870	aese	$dat1,q9
871	aesmc	$dat1,$dat1
872	aese	$dat2,q9
873	aesmc	$dat2,$dat2
874	aese	$dat1,q12
875	aesmc	$dat1,$dat1
876	aese	$dat2,q12
877	aesmc	$dat2,$dat2
878	cmn	$len,#0x20
879	aese	$dat1,q13
880	aesmc	$dat1,$dat1
881	aese	$dat2,q13
882	aesmc	$dat2,$dat2
883	aese	$dat1,q14
884	aesmc	$dat1,$dat1
885	aese	$dat2,q14
886	aesmc	$dat2,$dat2
887	aese	$dat1,q15
888	aese	$dat2,q15
889	b.eq	.Lecb_enc_one
890	veor	$tmp1,$rndlast,$dat1
891	veor	$tmp2,$rndlast,$dat2
892	vst1.8	{$tmp1},[$out],#16
893	vst1.8	{$tmp2},[$out],#16
894	b	.Lecb_done
895
896.Lecb_enc_one:
897	veor	$tmp1,$rndlast,$dat2
898	vst1.8	{$tmp1},[$out],#16
899	b	.Lecb_done
900___
901
902$code.=<<___;
903.align	5
904.Lecb_dec:
905	vld1.8	{$dat1},[$inp],#16
906	subs	$len,$len,#32			// bias
907	add	$cnt,$rounds,#2
908	vorr	$in1,$dat1,$dat1
909	vorr	$dat2,$dat1,$dat1
910	vorr	$dat1,$dat,$dat
911	b.lo	.Lecb_dec_tail
912
913	vorr	$dat1,$in1,$in1
914	vld1.8	{$dat2},[$inp],#16
915___
916$code.=<<___	if ($flavour =~ /64/);
917	cmp	$len,#32
918	b.lo	.Loop3x_ecb_dec
919
920	vld1.8	{$dat3},[$inp],#16
921	vld1.8	{$dat4},[$inp],#16
922	sub	$len,$len,#32				// bias
923	mov	$cnt,$rounds
924
925.Loop5x_ecb_dec:
926	aesd	$dat0,q8
927	aesimc	$dat0,$dat0
928	aesd	$dat1,q8
929	aesimc	$dat1,$dat1
930	aesd	$dat2,q8
931	aesimc	$dat2,$dat2
932	aesd	$dat3,q8
933	aesimc	$dat3,$dat3
934	aesd	$dat4,q8
935	aesimc	$dat4,$dat4
936	vld1.32	{q8},[$key_],#16
937	subs	$cnt,$cnt,#2
938	aesd	$dat0,q9
939	aesimc	$dat0,$dat0
940	aesd	$dat1,q9
941	aesimc	$dat1,$dat1
942	aesd	$dat2,q9
943	aesimc	$dat2,$dat2
944	aesd	$dat3,q9
945	aesimc	$dat3,$dat3
946	aesd	$dat4,q9
947	aesimc	$dat4,$dat4
948	vld1.32	{q9},[$key_],#16
949	b.gt	.Loop5x_ecb_dec
950
951	aesd	$dat0,q8
952	aesimc	$dat0,$dat0
953	aesd	$dat1,q8
954	aesimc	$dat1,$dat1
955	aesd	$dat2,q8
956	aesimc	$dat2,$dat2
957	aesd	$dat3,q8
958	aesimc	$dat3,$dat3
959	aesd	$dat4,q8
960	aesimc	$dat4,$dat4
961	cmp	$len,#0x40				// because .Lecb_tail4x
962	sub	$len,$len,#0x50
963
964	aesd	$dat0,q9
965	aesimc	$dat0,$dat0
966	aesd	$dat1,q9
967	aesimc	$dat1,$dat1
968	aesd	$dat2,q9
969	aesimc	$dat2,$dat2
970	aesd	$dat3,q9
971	aesimc	$dat3,$dat3
972	aesd	$dat4,q9
973	aesimc	$dat4,$dat4
974	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
975	mov	$key_,$key
976
977	aesd	$dat0,q10
978	aesimc	$dat0,$dat0
979	aesd	$dat1,q10
980	aesimc	$dat1,$dat1
981	aesd	$dat2,q10
982	aesimc	$dat2,$dat2
983	aesd	$dat3,q10
984	aesimc	$dat3,$dat3
985	aesd	$dat4,q10
986	aesimc	$dat4,$dat4
987	add	$inp,$inp,x6				// $inp is adjusted in such way that
988							// at exit from the loop $dat1-$dat4
989							// are loaded with last "words"
990	add	x6,$len,#0x60			// because .Lecb_tail4x
991
992	aesd	$dat0,q11
993	aesimc	$dat0,$dat0
994	aesd	$dat1,q11
995	aesimc	$dat1,$dat1
996	aesd	$dat2,q11
997	aesimc	$dat2,$dat2
998	aesd	$dat3,q11
999	aesimc	$dat3,$dat3
1000	aesd	$dat4,q11
1001	aesimc	$dat4,$dat4
1002
1003	aesd	$dat0,q12
1004	aesimc	$dat0,$dat0
1005	aesd	$dat1,q12
1006	aesimc	$dat1,$dat1
1007	aesd	$dat2,q12
1008	aesimc	$dat2,$dat2
1009	aesd	$dat3,q12
1010	aesimc	$dat3,$dat3
1011	aesd	$dat4,q12
1012	aesimc	$dat4,$dat4
1013
1014	aesd	$dat0,q13
1015	aesimc	$dat0,$dat0
1016	aesd	$dat1,q13
1017	aesimc	$dat1,$dat1
1018	aesd	$dat2,q13
1019	aesimc	$dat2,$dat2
1020	aesd	$dat3,q13
1021	aesimc	$dat3,$dat3
1022	aesd	$dat4,q13
1023	aesimc	$dat4,$dat4
1024
1025	aesd	$dat0,q14
1026	aesimc	$dat0,$dat0
1027	aesd	$dat1,q14
1028	aesimc	$dat1,$dat1
1029	aesd	$dat2,q14
1030	aesimc	$dat2,$dat2
1031	aesd	$dat3,q14
1032	aesimc	$dat3,$dat3
1033	aesd	$dat4,q14
1034	aesimc	$dat4,$dat4
1035
1036	aesd	$dat0,q15
1037	vld1.8	{$in0},[$inp],#16
1038	aesd	$dat1,q15
1039	vld1.8	{$in1},[$inp],#16
1040	aesd	$dat2,q15
1041	vld1.8	{$in2},[$inp],#16
1042	aesd	$dat3,q15
1043	vld1.8	{$in3},[$inp],#16
1044	aesd	$dat4,q15
1045	vld1.8	{$in4},[$inp],#16
1046	cbz	x6,.Lecb_tail4x
1047	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
1048	veor	$tmp0,$rndlast,$dat0
1049	vorr	$dat0,$in0,$in0
1050	veor	$tmp1,$rndlast,$dat1
1051	vorr	$dat1,$in1,$in1
1052	veor	$tmp2,$rndlast,$dat2
1053	vorr	$dat2,$in2,$in2
1054	veor	$tmp3,$rndlast,$dat3
1055	vorr	$dat3,$in3,$in3
1056	veor	$tmp4,$rndlast,$dat4
1057	vst1.8	{$tmp0},[$out],#16
1058	vorr	$dat4,$in4,$in4
1059	vst1.8	{$tmp1},[$out],#16
1060	mov	$cnt,$rounds
1061	vst1.8	{$tmp2},[$out],#16
1062	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
1063	vst1.8	{$tmp3},[$out],#16
1064	vst1.8	{$tmp4},[$out],#16
1065	b.hs	.Loop5x_ecb_dec
1066
1067	add	$len,$len,#0x50
1068	cbz	$len,.Lecb_done
1069
1070	add	$cnt,$rounds,#2
1071	subs	$len,$len,#0x30
1072	vorr	$dat0,$in2,$in2
1073	vorr	$dat1,$in3,$in3
1074	vorr	$dat2,$in4,$in4
1075	b.lo	.Lecb_dec_tail
1076
1077	b	.Loop3x_ecb_dec
1078
1079.align	4
1080.Lecb_tail4x:
1081	veor	$tmp1,$rndlast,$dat1
1082	veor	$tmp2,$rndlast,$dat2
1083	veor	$tmp3,$rndlast,$dat3
1084	veor	$tmp4,$rndlast,$dat4
1085	vst1.8	{$tmp1},[$out],#16
1086	vst1.8	{$tmp2},[$out],#16
1087	vst1.8	{$tmp3},[$out],#16
1088	vst1.8	{$tmp4},[$out],#16
1089
1090	b	.Lecb_done
1091.align	4
1092___
1093$code.=<<___;
1094.Loop3x_ecb_dec:
1095	aesd	$dat0,q8
1096	aesimc	$dat0,$dat0
1097	aesd	$dat1,q8
1098	aesimc	$dat1,$dat1
1099	aesd	$dat2,q8
1100	aesimc	$dat2,$dat2
1101	vld1.32	{q8},[$key_],#16
1102	subs	$cnt,$cnt,#2
1103	aesd	$dat0,q9
1104	aesimc	$dat0,$dat0
1105	aesd	$dat1,q9
1106	aesimc	$dat1,$dat1
1107	aesd	$dat2,q9
1108	aesimc	$dat2,$dat2
1109	vld1.32	{q9},[$key_],#16
1110	b.gt	.Loop3x_ecb_dec
1111
1112	aesd	$dat0,q8
1113	aesimc	$dat0,$dat0
1114	aesd	$dat1,q8
1115	aesimc	$dat1,$dat1
1116	aesd	$dat2,q8
1117	aesimc	$dat2,$dat2
1118	subs	$len,$len,#0x30
1119	mov.lo	x6,$len				// x6, $cnt, is zero at this point
1120	aesd	$dat0,q9
1121	aesimc	$dat0,$dat0
1122	aesd	$dat1,q9
1123	aesimc	$dat1,$dat1
1124	aesd	$dat2,q9
1125	aesimc	$dat2,$dat2
1126	add	$inp,$inp,x6 			// $inp is adjusted in such way that
1127						// at exit from the loop $dat1-$dat2
1128						// are loaded with last "words"
1129	mov	$key_,$key
1130	aesd	$dat0,q12
1131	aesimc	$dat0,$dat0
1132	aesd	$dat1,q12
1133	aesimc	$dat1,$dat1
1134	aesd	$dat2,q12
1135	aesimc	$dat2,$dat2
1136	vld1.8	{$in0},[$inp],#16
1137	aesd	$dat0,q13
1138	aesimc	$dat0,$dat0
1139	aesd	$dat1,q13
1140	aesimc	$dat1,$dat1
1141	aesd	$dat2,q13
1142	aesimc	$dat2,$dat2
1143	vld1.8	{$in1},[$inp],#16
1144	aesd	$dat0,q14
1145	aesimc	$dat0,$dat0
1146	aesd	$dat1,q14
1147	aesimc	$dat1,$dat1
1148	aesd	$dat2,q14
1149	aesimc	$dat2,$dat2
1150	vld1.8	{$in2},[$inp],#16
1151	aesd	$dat0,q15
1152	aesd	$dat1,q15
1153	aesd	$dat2,q15
1154	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
1155	add	$cnt,$rounds,#2
1156	veor	$tmp0,$rndlast,$dat0
1157	veor	$tmp1,$rndlast,$dat1
1158	veor	$dat2,$dat2,$rndlast
1159	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
1160	vst1.8	{$tmp0},[$out],#16
1161	vorr	$dat0,$in0,$in0
1162	vst1.8	{$tmp1},[$out],#16
1163	vorr	$dat1,$in1,$in1
1164	vst1.8	{$dat2},[$out],#16
1165	vorr	$dat2,$in2,$in2
1166	b.hs	.Loop3x_ecb_dec
1167
1168	cmn	$len,#0x30
1169	b.eq	.Lecb_done
1170	nop
1171
1172.Lecb_dec_tail:
1173	aesd	$dat1,q8
1174	aesimc	$dat1,$dat1
1175	aesd	$dat2,q8
1176	aesimc	$dat2,$dat2
1177	vld1.32	{q8},[$key_],#16
1178	subs	$cnt,$cnt,#2
1179	aesd	$dat1,q9
1180	aesimc	$dat1,$dat1
1181	aesd	$dat2,q9
1182	aesimc	$dat2,$dat2
1183	vld1.32	{q9},[$key_],#16
1184	b.gt	.Lecb_dec_tail
1185
1186	aesd	$dat1,q8
1187	aesimc	$dat1,$dat1
1188	aesd	$dat2,q8
1189	aesimc	$dat2,$dat2
1190	aesd	$dat1,q9
1191	aesimc	$dat1,$dat1
1192	aesd	$dat2,q9
1193	aesimc	$dat2,$dat2
1194	aesd	$dat1,q12
1195	aesimc	$dat1,$dat1
1196	aesd	$dat2,q12
1197	aesimc	$dat2,$dat2
1198	cmn	$len,#0x20
1199	aesd	$dat1,q13
1200	aesimc	$dat1,$dat1
1201	aesd	$dat2,q13
1202	aesimc	$dat2,$dat2
1203	aesd	$dat1,q14
1204	aesimc	$dat1,$dat1
1205	aesd	$dat2,q14
1206	aesimc	$dat2,$dat2
1207	aesd	$dat1,q15
1208	aesd	$dat2,q15
1209	b.eq	.Lecb_dec_one
1210	veor	$tmp1,$rndlast,$dat1
1211	veor	$tmp2,$rndlast,$dat2
1212	vst1.8	{$tmp1},[$out],#16
1213	vst1.8	{$tmp2},[$out],#16
1214	b	.Lecb_done
1215
1216.Lecb_dec_one:
1217	veor	$tmp1,$rndlast,$dat2
1218	vst1.8	{$tmp1},[$out],#16
1219
1220.Lecb_done:
1221___
1222}
1223$code.=<<___	if ($flavour !~ /64/);
1224	vldmia	sp!,{d8-d15}
1225	ldmia	sp!,{r4-r8,pc}
1226___
1227$code.=<<___	if ($flavour =~ /64/);
1228	ldr	x29,[sp],#16
1229___
1230$code.=<<___	if ($flavour =~ /64/);
1231.Lecb_Final_abort:
1232	ret
1233___
1234$code.=<<___;
1235.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
1236___
1237}}}
1238{{{
1239my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1240my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1241my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1242
1243my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1244my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1245
1246### q8-q15	preloaded key schedule
1247
1248$code.=<<___;
1249.globl	${prefix}_cbc_encrypt
1250.type	${prefix}_cbc_encrypt,%function
1251.align	5
1252${prefix}_cbc_encrypt:
1253___
1254$code.=<<___	if ($flavour =~ /64/);
1255	stp	x29,x30,[sp,#-16]!
1256	add	x29,sp,#0
1257___
1258$code.=<<___	if ($flavour !~ /64/);
1259	mov	ip,sp
1260	stmdb	sp!,{r4-r8,lr}
1261	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1262	ldmia	ip,{r4-r5}		@ load remaining args
1263___
1264$code.=<<___;
1265	subs	$len,$len,#16
1266	mov	$step,#16
1267	b.lo	.Lcbc_abort
1268	cclr	$step,eq
1269
1270	cmp	$enc,#0			// en- or decrypting?
1271	ldr	$rounds,[$key,#240]
1272	and	$len,$len,#-16
1273	vld1.8	{$ivec},[$ivp]
1274	vld1.8	{$dat},[$inp],$step
1275
1276	vld1.32	{q8-q9},[$key]		// load key schedule...
1277	sub	$rounds,$rounds,#6
1278	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
1279	sub	$rounds,$rounds,#2
1280	vld1.32	{q10-q11},[$key_],#32
1281	vld1.32	{q12-q13},[$key_],#32
1282	vld1.32	{q14-q15},[$key_],#32
1283	vld1.32	{$rndlast},[$key_]
1284
1285	add	$key_,$key,#32
1286	mov	$cnt,$rounds
1287	b.eq	.Lcbc_dec
1288
1289	cmp	$rounds,#2
1290	veor	$dat,$dat,$ivec
1291	veor	$rndzero_n_last,q8,$rndlast
1292	b.eq	.Lcbc_enc128
1293
1294	vld1.32	{$in0-$in1},[$key_]
1295	add	$key_,$key,#16
1296	add	$key4,$key,#16*4
1297	add	$key5,$key,#16*5
1298	aese	$dat,q8
1299	aesmc	$dat,$dat
1300	add	$key6,$key,#16*6
1301	add	$key7,$key,#16*7
1302	b	.Lenter_cbc_enc
1303
1304.align	4
1305.Loop_cbc_enc:
1306	aese	$dat,q8
1307	aesmc	$dat,$dat
1308	 vst1.8	{$ivec},[$out],#16
1309.Lenter_cbc_enc:
1310	aese	$dat,q9
1311	aesmc	$dat,$dat
1312	aese	$dat,$in0
1313	aesmc	$dat,$dat
1314	vld1.32	{q8},[$key4]
1315	cmp	$rounds,#4
1316	aese	$dat,$in1
1317	aesmc	$dat,$dat
1318	vld1.32	{q9},[$key5]
1319	b.eq	.Lcbc_enc192
1320
1321	aese	$dat,q8
1322	aesmc	$dat,$dat
1323	vld1.32	{q8},[$key6]
1324	aese	$dat,q9
1325	aesmc	$dat,$dat
1326	vld1.32	{q9},[$key7]
1327	nop
1328
1329.Lcbc_enc192:
1330	aese	$dat,q8
1331	aesmc	$dat,$dat
1332	 subs	$len,$len,#16
1333	aese	$dat,q9
1334	aesmc	$dat,$dat
1335	 cclr	$step,eq
1336	aese	$dat,q10
1337	aesmc	$dat,$dat
1338	aese	$dat,q11
1339	aesmc	$dat,$dat
1340	 vld1.8	{q8},[$inp],$step
1341	aese	$dat,q12
1342	aesmc	$dat,$dat
1343	 veor	q8,q8,$rndzero_n_last
1344	aese	$dat,q13
1345	aesmc	$dat,$dat
1346	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
1347	aese	$dat,q14
1348	aesmc	$dat,$dat
1349	aese	$dat,q15
1350	veor	$ivec,$dat,$rndlast
1351	b.hs	.Loop_cbc_enc
1352
1353	vst1.8	{$ivec},[$out],#16
1354	b	.Lcbc_done
1355
1356.align	5
1357.Lcbc_enc128:
1358	vld1.32	{$in0-$in1},[$key_]
1359	aese	$dat,q8
1360	aesmc	$dat,$dat
1361	b	.Lenter_cbc_enc128
1362.Loop_cbc_enc128:
1363	aese	$dat,q8
1364	aesmc	$dat,$dat
1365	 vst1.8	{$ivec},[$out],#16
1366.Lenter_cbc_enc128:
1367	aese	$dat,q9
1368	aesmc	$dat,$dat
1369	 subs	$len,$len,#16
1370	aese	$dat,$in0
1371	aesmc	$dat,$dat
1372	 cclr	$step,eq
1373	aese	$dat,$in1
1374	aesmc	$dat,$dat
1375	aese	$dat,q10
1376	aesmc	$dat,$dat
1377	aese	$dat,q11
1378	aesmc	$dat,$dat
1379	 vld1.8	{q8},[$inp],$step
1380	aese	$dat,q12
1381	aesmc	$dat,$dat
1382	aese	$dat,q13
1383	aesmc	$dat,$dat
1384	aese	$dat,q14
1385	aesmc	$dat,$dat
1386	 veor	q8,q8,$rndzero_n_last
1387	aese	$dat,q15
1388	veor	$ivec,$dat,$rndlast
1389	b.hs	.Loop_cbc_enc128
1390
1391	vst1.8	{$ivec},[$out],#16
1392	b	.Lcbc_done
1393___
1394{
1395my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1396
1397my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
1398my ($dat4,$in4,$tmp4);
1399if ($flavour =~ /64/) {
1400    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1401}
1402
1403$code.=<<___;
1404.align	5
1405.Lcbc_dec:
1406	vld1.8	{$dat2},[$inp],#16
1407	subs	$len,$len,#32		// bias
1408	add	$cnt,$rounds,#2
1409	vorr	$in1,$dat,$dat
1410	vorr	$dat1,$dat,$dat
1411	vorr	$in2,$dat2,$dat2
1412	b.lo	.Lcbc_dec_tail
1413
1414	vorr	$dat1,$dat2,$dat2
1415	vld1.8	{$dat2},[$inp],#16
1416	vorr	$in0,$dat,$dat
1417	vorr	$in1,$dat1,$dat1
1418	vorr	$in2,$dat2,$dat2
1419___
1420$code.=<<___	if ($flavour =~ /64/);
1421	cmp	$len,#32
1422	b.lo	.Loop3x_cbc_dec
1423
1424	vld1.8	{$dat3},[$inp],#16
1425	vld1.8	{$dat4},[$inp],#16
1426	sub	$len,$len,#32		// bias
1427	mov	$cnt,$rounds
1428	vorr	$in3,$dat3,$dat3
1429	vorr	$in4,$dat4,$dat4
1430
1431.Loop5x_cbc_dec:
1432	aesd	$dat0,q8
1433	aesimc	$dat0,$dat0
1434	aesd	$dat1,q8
1435	aesimc	$dat1,$dat1
1436	aesd	$dat2,q8
1437	aesimc	$dat2,$dat2
1438	aesd	$dat3,q8
1439	aesimc	$dat3,$dat3
1440	aesd	$dat4,q8
1441	aesimc	$dat4,$dat4
1442	vld1.32	{q8},[$key_],#16
1443	subs	$cnt,$cnt,#2
1444	aesd	$dat0,q9
1445	aesimc	$dat0,$dat0
1446	aesd	$dat1,q9
1447	aesimc	$dat1,$dat1
1448	aesd	$dat2,q9
1449	aesimc	$dat2,$dat2
1450	aesd	$dat3,q9
1451	aesimc	$dat3,$dat3
1452	aesd	$dat4,q9
1453	aesimc	$dat4,$dat4
1454	vld1.32	{q9},[$key_],#16
1455	b.gt	.Loop5x_cbc_dec
1456
1457	aesd	$dat0,q8
1458	aesimc	$dat0,$dat0
1459	aesd	$dat1,q8
1460	aesimc	$dat1,$dat1
1461	aesd	$dat2,q8
1462	aesimc	$dat2,$dat2
1463	aesd	$dat3,q8
1464	aesimc	$dat3,$dat3
1465	aesd	$dat4,q8
1466	aesimc	$dat4,$dat4
1467	 cmp	$len,#0x40		// because .Lcbc_tail4x
1468	 sub	$len,$len,#0x50
1469
1470	aesd	$dat0,q9
1471	aesimc	$dat0,$dat0
1472	aesd	$dat1,q9
1473	aesimc	$dat1,$dat1
1474	aesd	$dat2,q9
1475	aesimc	$dat2,$dat2
1476	aesd	$dat3,q9
1477	aesimc	$dat3,$dat3
1478	aesd	$dat4,q9
1479	aesimc	$dat4,$dat4
1480	 csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
1481	 mov	$key_,$key
1482
1483	aesd	$dat0,q10
1484	aesimc	$dat0,$dat0
1485	aesd	$dat1,q10
1486	aesimc	$dat1,$dat1
1487	aesd	$dat2,q10
1488	aesimc	$dat2,$dat2
1489	aesd	$dat3,q10
1490	aesimc	$dat3,$dat3
1491	aesd	$dat4,q10
1492	aesimc	$dat4,$dat4
1493	 add	$inp,$inp,x6		// $inp is adjusted in such way that
1494					// at exit from the loop $dat1-$dat4
1495					// are loaded with last "words"
1496	 add	x6,$len,#0x60		// because .Lcbc_tail4x
1497
1498	aesd	$dat0,q11
1499	aesimc	$dat0,$dat0
1500	aesd	$dat1,q11
1501	aesimc	$dat1,$dat1
1502	aesd	$dat2,q11
1503	aesimc	$dat2,$dat2
1504	aesd	$dat3,q11
1505	aesimc	$dat3,$dat3
1506	aesd	$dat4,q11
1507	aesimc	$dat4,$dat4
1508
1509	aesd	$dat0,q12
1510	aesimc	$dat0,$dat0
1511	aesd	$dat1,q12
1512	aesimc	$dat1,$dat1
1513	aesd	$dat2,q12
1514	aesimc	$dat2,$dat2
1515	aesd	$dat3,q12
1516	aesimc	$dat3,$dat3
1517	aesd	$dat4,q12
1518	aesimc	$dat4,$dat4
1519
1520	aesd	$dat0,q13
1521	aesimc	$dat0,$dat0
1522	aesd	$dat1,q13
1523	aesimc	$dat1,$dat1
1524	aesd	$dat2,q13
1525	aesimc	$dat2,$dat2
1526	aesd	$dat3,q13
1527	aesimc	$dat3,$dat3
1528	aesd	$dat4,q13
1529	aesimc	$dat4,$dat4
1530
1531	aesd	$dat0,q14
1532	aesimc	$dat0,$dat0
1533	aesd	$dat1,q14
1534	aesimc	$dat1,$dat1
1535	aesd	$dat2,q14
1536	aesimc	$dat2,$dat2
1537	aesd	$dat3,q14
1538	aesimc	$dat3,$dat3
1539	aesd	$dat4,q14
1540	aesimc	$dat4,$dat4
1541
1542	 veor	$tmp0,$ivec,$rndlast
1543	aesd	$dat0,q15
1544	 veor	$tmp1,$in0,$rndlast
1545	 vld1.8	{$in0},[$inp],#16
1546	aesd	$dat1,q15
1547	 veor	$tmp2,$in1,$rndlast
1548	 vld1.8	{$in1},[$inp],#16
1549	aesd	$dat2,q15
1550	 veor	$tmp3,$in2,$rndlast
1551	 vld1.8	{$in2},[$inp],#16
1552	aesd	$dat3,q15
1553	 veor	$tmp4,$in3,$rndlast
1554	 vld1.8	{$in3},[$inp],#16
1555	aesd	$dat4,q15
1556	 vorr	$ivec,$in4,$in4
1557	 vld1.8	{$in4},[$inp],#16
1558	cbz	x6,.Lcbc_tail4x
1559	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
1560	veor	$tmp0,$tmp0,$dat0
1561	 vorr	$dat0,$in0,$in0
1562	veor	$tmp1,$tmp1,$dat1
1563	 vorr	$dat1,$in1,$in1
1564	veor	$tmp2,$tmp2,$dat2
1565	 vorr	$dat2,$in2,$in2
1566	veor	$tmp3,$tmp3,$dat3
1567	 vorr	$dat3,$in3,$in3
1568	veor	$tmp4,$tmp4,$dat4
1569	vst1.8	{$tmp0},[$out],#16
1570	 vorr	$dat4,$in4,$in4
1571	vst1.8	{$tmp1},[$out],#16
1572	 mov	$cnt,$rounds
1573	vst1.8	{$tmp2},[$out],#16
1574	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
1575	vst1.8	{$tmp3},[$out],#16
1576	vst1.8	{$tmp4},[$out],#16
1577	b.hs	.Loop5x_cbc_dec
1578
1579	add	$len,$len,#0x50
1580	cbz	$len,.Lcbc_done
1581
1582	add	$cnt,$rounds,#2
1583	subs	$len,$len,#0x30
1584	vorr	$dat0,$in2,$in2
1585	vorr	$in0,$in2,$in2
1586	vorr	$dat1,$in3,$in3
1587	vorr	$in1,$in3,$in3
1588	vorr	$dat2,$in4,$in4
1589	vorr	$in2,$in4,$in4
1590	b.lo	.Lcbc_dec_tail
1591
1592	b	.Loop3x_cbc_dec
1593
1594.align	4
1595.Lcbc_tail4x:
1596	veor	$tmp1,$tmp0,$dat1
1597	veor	$tmp2,$tmp2,$dat2
1598	veor	$tmp3,$tmp3,$dat3
1599	veor	$tmp4,$tmp4,$dat4
1600	vst1.8	{$tmp1},[$out],#16
1601	vst1.8	{$tmp2},[$out],#16
1602	vst1.8	{$tmp3},[$out],#16
1603	vst1.8	{$tmp4},[$out],#16
1604
1605	b	.Lcbc_done
1606.align	4
1607___
1608$code.=<<___;
1609.Loop3x_cbc_dec:
1610	aesd	$dat0,q8
1611	aesimc	$dat0,$dat0
1612	aesd	$dat1,q8
1613	aesimc	$dat1,$dat1
1614	aesd	$dat2,q8
1615	aesimc	$dat2,$dat2
1616	vld1.32	{q8},[$key_],#16
1617	subs	$cnt,$cnt,#2
1618	aesd	$dat0,q9
1619	aesimc	$dat0,$dat0
1620	aesd	$dat1,q9
1621	aesimc	$dat1,$dat1
1622	aesd	$dat2,q9
1623	aesimc	$dat2,$dat2
1624	vld1.32	{q9},[$key_],#16
1625	b.gt	.Loop3x_cbc_dec
1626
1627	aesd	$dat0,q8
1628	aesimc	$dat0,$dat0
1629	aesd	$dat1,q8
1630	aesimc	$dat1,$dat1
1631	aesd	$dat2,q8
1632	aesimc	$dat2,$dat2
1633	 veor	$tmp0,$ivec,$rndlast
1634	 subs	$len,$len,#0x30
1635	 veor	$tmp1,$in0,$rndlast
1636	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
1637	aesd	$dat0,q9
1638	aesimc	$dat0,$dat0
1639	aesd	$dat1,q9
1640	aesimc	$dat1,$dat1
1641	aesd	$dat2,q9
1642	aesimc	$dat2,$dat2
1643	 veor	$tmp2,$in1,$rndlast
1644	 add	$inp,$inp,x6		// $inp is adjusted in such way that
1645					// at exit from the loop $dat1-$dat2
1646					// are loaded with last "words"
1647	 vorr	$ivec,$in2,$in2
1648	 mov	$key_,$key
1649	aesd	$dat0,q12
1650	aesimc	$dat0,$dat0
1651	aesd	$dat1,q12
1652	aesimc	$dat1,$dat1
1653	aesd	$dat2,q12
1654	aesimc	$dat2,$dat2
1655	 vld1.8	{$in0},[$inp],#16
1656	aesd	$dat0,q13
1657	aesimc	$dat0,$dat0
1658	aesd	$dat1,q13
1659	aesimc	$dat1,$dat1
1660	aesd	$dat2,q13
1661	aesimc	$dat2,$dat2
1662	 vld1.8	{$in1},[$inp],#16
1663	aesd	$dat0,q14
1664	aesimc	$dat0,$dat0
1665	aesd	$dat1,q14
1666	aesimc	$dat1,$dat1
1667	aesd	$dat2,q14
1668	aesimc	$dat2,$dat2
1669	 vld1.8	{$in2},[$inp],#16
1670	aesd	$dat0,q15
1671	aesd	$dat1,q15
1672	aesd	$dat2,q15
1673	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
1674	 add	$cnt,$rounds,#2
1675	veor	$tmp0,$tmp0,$dat0
1676	veor	$tmp1,$tmp1,$dat1
1677	veor	$dat2,$dat2,$tmp2
1678	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
1679	vst1.8	{$tmp0},[$out],#16
1680	 vorr	$dat0,$in0,$in0
1681	vst1.8	{$tmp1},[$out],#16
1682	 vorr	$dat1,$in1,$in1
1683	vst1.8	{$dat2},[$out],#16
1684	 vorr	$dat2,$in2,$in2
1685	b.hs	.Loop3x_cbc_dec
1686
1687	cmn	$len,#0x30
1688	b.eq	.Lcbc_done
1689	nop
1690
1691.Lcbc_dec_tail:
1692	aesd	$dat1,q8
1693	aesimc	$dat1,$dat1
1694	aesd	$dat2,q8
1695	aesimc	$dat2,$dat2
1696	vld1.32	{q8},[$key_],#16
1697	subs	$cnt,$cnt,#2
1698	aesd	$dat1,q9
1699	aesimc	$dat1,$dat1
1700	aesd	$dat2,q9
1701	aesimc	$dat2,$dat2
1702	vld1.32	{q9},[$key_],#16
1703	b.gt	.Lcbc_dec_tail
1704
1705	aesd	$dat1,q8
1706	aesimc	$dat1,$dat1
1707	aesd	$dat2,q8
1708	aesimc	$dat2,$dat2
1709	aesd	$dat1,q9
1710	aesimc	$dat1,$dat1
1711	aesd	$dat2,q9
1712	aesimc	$dat2,$dat2
1713	aesd	$dat1,q12
1714	aesimc	$dat1,$dat1
1715	aesd	$dat2,q12
1716	aesimc	$dat2,$dat2
1717	 cmn	$len,#0x20
1718	aesd	$dat1,q13
1719	aesimc	$dat1,$dat1
1720	aesd	$dat2,q13
1721	aesimc	$dat2,$dat2
1722	 veor	$tmp1,$ivec,$rndlast
1723	aesd	$dat1,q14
1724	aesimc	$dat1,$dat1
1725	aesd	$dat2,q14
1726	aesimc	$dat2,$dat2
1727	 veor	$tmp2,$in1,$rndlast
1728	aesd	$dat1,q15
1729	aesd	$dat2,q15
1730	b.eq	.Lcbc_dec_one
1731	veor	$tmp1,$tmp1,$dat1
1732	veor	$tmp2,$tmp2,$dat2
1733	 vorr	$ivec,$in2,$in2
1734	vst1.8	{$tmp1},[$out],#16
1735	vst1.8	{$tmp2},[$out],#16
1736	b	.Lcbc_done
1737
1738.Lcbc_dec_one:
1739	veor	$tmp1,$tmp1,$dat2
1740	 vorr	$ivec,$in2,$in2
1741	vst1.8	{$tmp1},[$out],#16
1742
1743.Lcbc_done:
1744	vst1.8	{$ivec},[$ivp]
1745.Lcbc_abort:
1746___
1747}
1748$code.=<<___	if ($flavour !~ /64/);
1749	vldmia	sp!,{d8-d15}
1750	ldmia	sp!,{r4-r8,pc}
1751___
1752$code.=<<___	if ($flavour =~ /64/);
1753	ldr	x29,[sp],#16
1754	ret
1755___
1756$code.=<<___;
1757.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1758___
1759}}}
1760{{{
1761my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1762my ($rounds,$cnt,$key_)=("w5","w6","x7");
1763my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1764my $step="x12";		# aliases with $tctr2
1765
1766my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1767my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1768
1769# used only in 64-bit mode...
1770my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1771
1772my ($dat,$tmp)=($dat0,$tmp0);
1773
1774### q8-q15	preloaded key schedule
1775
1776$code.=<<___;
1777.globl	${prefix}_ctr32_encrypt_blocks
1778.type	${prefix}_ctr32_encrypt_blocks,%function
1779.align	5
1780${prefix}_ctr32_encrypt_blocks:
1781___
1782$code.=<<___	if ($flavour =~ /64/);
1783	stp		x29,x30,[sp,#-16]!
1784	add		x29,sp,#0
1785___
1786$code.=<<___	if ($flavour !~ /64/);
1787	mov		ip,sp
1788	stmdb		sp!,{r4-r10,lr}
1789	vstmdb		sp!,{d8-d15}            @ ABI specification says so
1790	ldr		r4, [ip]		@ load remaining arg
1791___
1792$code.=<<___;
1793	ldr		$rounds,[$key,#240]
1794
1795	ldr		$ctr, [$ivp, #12]
1796#ifdef __ARMEB__
1797	vld1.8		{$dat0},[$ivp]
1798#else
1799	vld1.32		{$dat0},[$ivp]
1800#endif
1801	vld1.32		{q8-q9},[$key]		// load key schedule...
1802	sub		$rounds,$rounds,#4
1803	mov		$step,#16
1804	cmp		$len,#2
1805	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
1806	sub		$rounds,$rounds,#2
1807	vld1.32		{q12-q13},[$key_],#32
1808	vld1.32		{q14-q15},[$key_],#32
1809	vld1.32		{$rndlast},[$key_]
1810	add		$key_,$key,#32
1811	mov		$cnt,$rounds
1812	cclr		$step,lo
1813#ifndef __ARMEB__
1814	rev		$ctr, $ctr
1815#endif
1816___
1817$code.=<<___	if ($flavour =~ /64/);
1818	vorr		$dat1,$dat0,$dat0
1819	add		$tctr1, $ctr, #1
1820	vorr		$dat2,$dat0,$dat0
1821	add		$ctr, $ctr, #2
1822	vorr		$ivec,$dat0,$dat0
1823	rev		$tctr1, $tctr1
1824	vmov.32		${dat1}[3],$tctr1
1825	b.ls		.Lctr32_tail
1826	rev		$tctr2, $ctr
1827	sub		$len,$len,#3		// bias
1828	vmov.32		${dat2}[3],$tctr2
1829___
1830$code.=<<___	if ($flavour !~ /64/);
1831	add		$tctr1, $ctr, #1
1832	vorr		$ivec,$dat0,$dat0
1833	rev		$tctr1, $tctr1
1834	vmov.32		${ivec}[3],$tctr1
1835	add		$ctr, $ctr, #2
1836	vorr		$dat1,$ivec,$ivec
1837	b.ls		.Lctr32_tail
1838	rev		$tctr2, $ctr
1839	vmov.32		${ivec}[3],$tctr2
1840	sub		$len,$len,#3		// bias
1841	vorr		$dat2,$ivec,$ivec
1842___
1843$code.=<<___	if ($flavour =~ /64/);
1844	cmp		$len,#32
1845	b.lo		.Loop3x_ctr32
1846
1847	add		w13,$ctr,#1
1848	add		w14,$ctr,#2
1849	vorr		$dat3,$dat0,$dat0
1850	rev		w13,w13
1851	vorr		$dat4,$dat0,$dat0
1852	rev		w14,w14
1853	vmov.32		${dat3}[3],w13
1854	sub		$len,$len,#2		// bias
1855	vmov.32		${dat4}[3],w14
1856	add		$ctr,$ctr,#2
1857	b		.Loop5x_ctr32
1858
1859.align	4
1860.Loop5x_ctr32:
1861	aese		$dat0,q8
1862	aesmc		$dat0,$dat0
1863	aese		$dat1,q8
1864	aesmc		$dat1,$dat1
1865	aese		$dat2,q8
1866	aesmc		$dat2,$dat2
1867	aese		$dat3,q8
1868	aesmc		$dat3,$dat3
1869	aese		$dat4,q8
1870	aesmc		$dat4,$dat4
1871	vld1.32		{q8},[$key_],#16
1872	subs		$cnt,$cnt,#2
1873	aese		$dat0,q9
1874	aesmc		$dat0,$dat0
1875	aese		$dat1,q9
1876	aesmc		$dat1,$dat1
1877	aese		$dat2,q9
1878	aesmc		$dat2,$dat2
1879	aese		$dat3,q9
1880	aesmc		$dat3,$dat3
1881	aese		$dat4,q9
1882	aesmc		$dat4,$dat4
1883	vld1.32		{q9},[$key_],#16
1884	b.gt		.Loop5x_ctr32
1885
1886	mov		$key_,$key
1887	aese		$dat0,q8
1888	aesmc		$dat0,$dat0
1889	aese		$dat1,q8
1890	aesmc		$dat1,$dat1
1891	aese		$dat2,q8
1892	aesmc		$dat2,$dat2
1893	aese		$dat3,q8
1894	aesmc		$dat3,$dat3
1895	aese		$dat4,q8
1896	aesmc		$dat4,$dat4
1897	vld1.32	 	{q8},[$key_],#16	// re-pre-load rndkey[0]
1898
1899	aese		$dat0,q9
1900	aesmc		$dat0,$dat0
1901	aese		$dat1,q9
1902	aesmc		$dat1,$dat1
1903	aese		$dat2,q9
1904	aesmc		$dat2,$dat2
1905	aese		$dat3,q9
1906	aesmc		$dat3,$dat3
1907	aese		$dat4,q9
1908	aesmc		$dat4,$dat4
1909	vld1.32	 	{q9},[$key_],#16	// re-pre-load rndkey[1]
1910
1911	aese		$dat0,q12
1912	aesmc		$dat0,$dat0
1913	 add		$tctr0,$ctr,#1
1914	 add		$tctr1,$ctr,#2
1915	aese		$dat1,q12
1916	aesmc		$dat1,$dat1
1917	 add		$tctr2,$ctr,#3
1918	 add		w13,$ctr,#4
1919	aese		$dat2,q12
1920	aesmc		$dat2,$dat2
1921	 add		w14,$ctr,#5
1922	 rev		$tctr0,$tctr0
1923	aese		$dat3,q12
1924	aesmc		$dat3,$dat3
1925	 rev		$tctr1,$tctr1
1926	 rev		$tctr2,$tctr2
1927	aese		$dat4,q12
1928	aesmc		$dat4,$dat4
1929	 rev		w13,w13
1930	 rev		w14,w14
1931
1932	aese		$dat0,q13
1933	aesmc		$dat0,$dat0
1934	aese		$dat1,q13
1935	aesmc		$dat1,$dat1
1936	aese		$dat2,q13
1937	aesmc		$dat2,$dat2
1938	aese		$dat3,q13
1939	aesmc		$dat3,$dat3
1940	aese		$dat4,q13
1941	aesmc		$dat4,$dat4
1942
1943	aese		$dat0,q14
1944	aesmc		$dat0,$dat0
1945	 vld1.8		{$in0},[$inp],#16
1946	aese		$dat1,q14
1947	aesmc		$dat1,$dat1
1948	 vld1.8		{$in1},[$inp],#16
1949	aese		$dat2,q14
1950	aesmc		$dat2,$dat2
1951	 vld1.8		{$in2},[$inp],#16
1952	aese		$dat3,q14
1953	aesmc		$dat3,$dat3
1954	 vld1.8		{$in3},[$inp],#16
1955	aese		$dat4,q14
1956	aesmc		$dat4,$dat4
1957	 vld1.8		{$in4},[$inp],#16
1958
1959	aese		$dat0,q15
1960	 veor		$in0,$in0,$rndlast
1961	aese		$dat1,q15
1962	 veor		$in1,$in1,$rndlast
1963	aese		$dat2,q15
1964	 veor		$in2,$in2,$rndlast
1965	aese		$dat3,q15
1966	 veor		$in3,$in3,$rndlast
1967	aese		$dat4,q15
1968	 veor		$in4,$in4,$rndlast
1969
1970	veor		$in0,$in0,$dat0
1971	 vorr		$dat0,$ivec,$ivec
1972	veor		$in1,$in1,$dat1
1973	 vorr		$dat1,$ivec,$ivec
1974	veor		$in2,$in2,$dat2
1975	 vorr		$dat2,$ivec,$ivec
1976	veor		$in3,$in3,$dat3
1977	 vorr		$dat3,$ivec,$ivec
1978	veor		$in4,$in4,$dat4
1979	 vorr		$dat4,$ivec,$ivec
1980
1981	vst1.8		{$in0},[$out],#16
1982	 vmov.32	${dat0}[3],$tctr0
1983	vst1.8		{$in1},[$out],#16
1984	 vmov.32	${dat1}[3],$tctr1
1985	vst1.8		{$in2},[$out],#16
1986	 vmov.32	${dat2}[3],$tctr2
1987	vst1.8		{$in3},[$out],#16
1988	 vmov.32	${dat3}[3],w13
1989	vst1.8		{$in4},[$out],#16
1990	 vmov.32	${dat4}[3],w14
1991
1992	mov		$cnt,$rounds
1993	cbz		$len,.Lctr32_done
1994
1995	add		$ctr,$ctr,#5
1996	subs		$len,$len,#5
1997	b.hs		.Loop5x_ctr32
1998
1999	add		$len,$len,#5
2000	sub		$ctr,$ctr,#5
2001
2002	cmp		$len,#2
2003	mov		$step,#16
2004	cclr		$step,lo
2005	b.ls		.Lctr32_tail
2006
2007	sub		$len,$len,#3		// bias
2008	add		$ctr,$ctr,#3
2009___
2010$code.=<<___;
2011	b		.Loop3x_ctr32
2012
2013.align	4
2014.Loop3x_ctr32:
2015	aese		$dat0,q8
2016	aesmc		$dat0,$dat0
2017	aese		$dat1,q8
2018	aesmc		$dat1,$dat1
2019	aese		$dat2,q8
2020	aesmc		$dat2,$dat2
2021	vld1.32		{q8},[$key_],#16
2022	subs		$cnt,$cnt,#2
2023	aese		$dat0,q9
2024	aesmc		$dat0,$dat0
2025	aese		$dat1,q9
2026	aesmc		$dat1,$dat1
2027	aese		$dat2,q9
2028	aesmc		$dat2,$dat2
2029	vld1.32		{q9},[$key_],#16
2030	b.gt		.Loop3x_ctr32
2031
2032	aese		$dat0,q8
2033	aesmc		$tmp0,$dat0
2034	aese		$dat1,q8
2035	aesmc		$tmp1,$dat1
2036	 vld1.8		{$in0},[$inp],#16
2037___
2038$code.=<<___	if ($flavour =~ /64/);
2039	 vorr		$dat0,$ivec,$ivec
2040___
2041$code.=<<___	if ($flavour !~ /64/);
2042	 add		$tctr0,$ctr,#1
2043___
2044$code.=<<___;
2045	aese		$dat2,q8
2046	aesmc		$dat2,$dat2
2047	 vld1.8		{$in1},[$inp],#16
2048___
2049$code.=<<___	if ($flavour =~ /64/);
2050	 vorr		$dat1,$ivec,$ivec
2051___
2052$code.=<<___	if ($flavour !~ /64/);
2053	 rev		$tctr0,$tctr0
2054___
2055$code.=<<___;
2056	aese		$tmp0,q9
2057	aesmc		$tmp0,$tmp0
2058	aese		$tmp1,q9
2059	aesmc		$tmp1,$tmp1
2060	 vld1.8		{$in2},[$inp],#16
2061	 mov		$key_,$key
2062	aese		$dat2,q9
2063	aesmc		$tmp2,$dat2
2064___
2065$code.=<<___	if ($flavour =~ /64/);
2066	 vorr		$dat2,$ivec,$ivec
2067	 add		$tctr0,$ctr,#1
2068___
2069$code.=<<___;
2070	aese		$tmp0,q12
2071	aesmc		$tmp0,$tmp0
2072	aese		$tmp1,q12
2073	aesmc		$tmp1,$tmp1
2074	 veor		$in0,$in0,$rndlast
2075	 add		$tctr1,$ctr,#2
2076	aese		$tmp2,q12
2077	aesmc		$tmp2,$tmp2
2078	 veor		$in1,$in1,$rndlast
2079	 add		$ctr,$ctr,#3
2080	aese		$tmp0,q13
2081	aesmc		$tmp0,$tmp0
2082	aese		$tmp1,q13
2083	aesmc		$tmp1,$tmp1
2084	 veor		$in2,$in2,$rndlast
2085___
2086$code.=<<___	if ($flavour =~ /64/);
2087	 rev		$tctr0,$tctr0
2088	aese		$tmp2,q13
2089	aesmc		$tmp2,$tmp2
2090	 vmov.32	${dat0}[3], $tctr0
2091___
2092$code.=<<___	if ($flavour !~ /64/);
2093	 vmov.32	${ivec}[3], $tctr0
2094	aese		$tmp2,q13
2095	aesmc		$tmp2,$tmp2
2096	 vorr		$dat0,$ivec,$ivec
2097___
2098$code.=<<___;
2099	 rev		$tctr1,$tctr1
2100	aese		$tmp0,q14
2101	aesmc		$tmp0,$tmp0
2102___
2103$code.=<<___	if ($flavour !~ /64/);
2104	 vmov.32	${ivec}[3], $tctr1
2105	 rev		$tctr2,$ctr
2106___
2107$code.=<<___;
2108	aese		$tmp1,q14
2109	aesmc		$tmp1,$tmp1
2110___
2111$code.=<<___	if ($flavour =~ /64/);
2112	 vmov.32	${dat1}[3], $tctr1
2113	 rev		$tctr2,$ctr
2114	aese		$tmp2,q14
2115	aesmc		$tmp2,$tmp2
2116	 vmov.32	${dat2}[3], $tctr2
2117___
2118$code.=<<___	if ($flavour !~ /64/);
2119	 vorr		$dat1,$ivec,$ivec
2120	 vmov.32	${ivec}[3], $tctr2
2121	aese		$tmp2,q14
2122	aesmc		$tmp2,$tmp2
2123	 vorr		$dat2,$ivec,$ivec
2124___
2125$code.=<<___;
2126	 subs		$len,$len,#3
2127	aese		$tmp0,q15
2128	aese		$tmp1,q15
2129	aese		$tmp2,q15
2130
2131	veor		$in0,$in0,$tmp0
2132	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
2133	vst1.8		{$in0},[$out],#16
2134	veor		$in1,$in1,$tmp1
2135	 mov		$cnt,$rounds
2136	vst1.8		{$in1},[$out],#16
2137	veor		$in2,$in2,$tmp2
2138	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
2139	vst1.8		{$in2},[$out],#16
2140	b.hs		.Loop3x_ctr32
2141
2142	adds		$len,$len,#3
2143	b.eq		.Lctr32_done
2144	cmp		$len,#1
2145	mov		$step,#16
2146	cclr		$step,eq
2147
2148.Lctr32_tail:
2149	aese		$dat0,q8
2150	aesmc		$dat0,$dat0
2151	aese		$dat1,q8
2152	aesmc		$dat1,$dat1
2153	vld1.32		{q8},[$key_],#16
2154	subs		$cnt,$cnt,#2
2155	aese		$dat0,q9
2156	aesmc		$dat0,$dat0
2157	aese		$dat1,q9
2158	aesmc		$dat1,$dat1
2159	vld1.32		{q9},[$key_],#16
2160	b.gt		.Lctr32_tail
2161
2162	aese		$dat0,q8
2163	aesmc		$dat0,$dat0
2164	aese		$dat1,q8
2165	aesmc		$dat1,$dat1
2166	aese		$dat0,q9
2167	aesmc		$dat0,$dat0
2168	aese		$dat1,q9
2169	aesmc		$dat1,$dat1
2170	 vld1.8		{$in0},[$inp],$step
2171	aese		$dat0,q12
2172	aesmc		$dat0,$dat0
2173	aese		$dat1,q12
2174	aesmc		$dat1,$dat1
2175	 vld1.8		{$in1},[$inp]
2176	aese		$dat0,q13
2177	aesmc		$dat0,$dat0
2178	aese		$dat1,q13
2179	aesmc		$dat1,$dat1
2180	 veor		$in0,$in0,$rndlast
2181	aese		$dat0,q14
2182	aesmc		$dat0,$dat0
2183	aese		$dat1,q14
2184	aesmc		$dat1,$dat1
2185	 veor		$in1,$in1,$rndlast
2186	aese		$dat0,q15
2187	aese		$dat1,q15
2188
2189	cmp		$len,#1
2190	veor		$in0,$in0,$dat0
2191	veor		$in1,$in1,$dat1
2192	vst1.8		{$in0},[$out],#16
2193	b.eq		.Lctr32_done
2194	vst1.8		{$in1},[$out]
2195
2196.Lctr32_done:
2197___
2198$code.=<<___	if ($flavour !~ /64/);
2199	vldmia		sp!,{d8-d15}
2200	ldmia		sp!,{r4-r10,pc}
2201___
2202$code.=<<___	if ($flavour =~ /64/);
2203	ldr		x29,[sp],#16
2204	ret
2205___
2206$code.=<<___;
2207.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2208___
2209}}}
2210# Performance in cycles per byte.
2211# Processed with AES-XTS different key size.
2212# It shows the value before and after optimization as below:
2213# (before/after):
2214#
2215#		AES-128-XTS		AES-256-XTS
2216# Cortex-A57	3.36/1.09		4.02/1.37
2217# Cortex-A72	3.03/1.02		3.28/1.33
2218
2219# Optimization is implemented by loop unrolling and interleaving.
2220# Commonly, we choose the unrolling factor as 5, if the input
2221# data size smaller than 5 blocks, but not smaller than 3 blocks,
2222# choose 3 as the unrolling factor.
2223# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2224# as one iteration, every loop the left size lsize -= 5*16.
2225# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2226# will be processed specially, which be integrated into the 5*16 bytes
2227# loop to improve the efficiency.
2228# There is one special case, if the original input data size dsize
2229# = 16 bytes, we will treat it seperately to improve the
2230# performance: one independent code block without LR, FP load and
2231# store.
2232# Encryption will process the (length -tailcnt) bytes as mentioned
2233# previously, then encrypt the composite block as last second
2234# cipher block.
2235# Decryption will process the (length -tailcnt -1) bytes as mentioned
2236# previously, then decrypt the last second cipher block to get the
2237# last plain block(tail), decrypt the composite block as last second
2238# plain text block.
2239
2240{{{
2241my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2242my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2243my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2244my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2245my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2246my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2247my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2248my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2249my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2250
2251my ($tmpin)=("v26.16b");
2252my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2253
2254# q7	last round key
2255# q10-q15, q7	Last 7 round keys
2256# q8-q9	preloaded round keys except last 7 keys for big size
2257# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
2258
2259
2260my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2261
2262my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
2263my ($dat4,$in4,$tmp4);
2264if ($flavour =~ /64/) {
2265    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2266}
2267
2268$code.=<<___	if ($flavour =~ /64/);
2269.globl	${prefix}_xts_encrypt
2270.type	${prefix}_xts_encrypt,%function
2271.align	5
2272${prefix}_xts_encrypt:
2273___
2274$code.=<<___	if ($flavour =~ /64/);
2275	cmp	$len,#16
2276	// Original input data size bigger than 16, jump to big size processing.
2277	b.ne	.Lxts_enc_big_size
2278	// Encrypt the iv with key2, as the first XEX iv.
2279	ldr	$rounds,[$key2,#240]
2280	vld1.32	{$dat},[$key2],#16
2281	vld1.8	{$iv0},[$ivp]
2282	sub	$rounds,$rounds,#2
2283	vld1.32	{$dat1},[$key2],#16
2284
2285.Loop_enc_iv_enc:
2286	aese	$iv0,$dat
2287	aesmc	$iv0,$iv0
2288	vld1.32	{$dat},[$key2],#16
2289	subs	$rounds,$rounds,#2
2290	aese	$iv0,$dat1
2291	aesmc	$iv0,$iv0
2292	vld1.32	{$dat1},[$key2],#16
2293	b.gt	.Loop_enc_iv_enc
2294
2295	aese	$iv0,$dat
2296	aesmc	$iv0,$iv0
2297	vld1.32	{$dat},[$key2]
2298	aese	$iv0,$dat1
2299	veor	$iv0,$iv0,$dat
2300
2301	vld1.8	{$dat0},[$inp]
2302	veor	$dat0,$iv0,$dat0
2303
2304	ldr	$rounds,[$key1,#240]
2305	vld1.32	{q20-q21},[$key1],#32		// load key schedule...
2306
2307	aese	$dat0,q20
2308	aesmc	$dat0,$dat0
2309	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
2310	aese	$dat0,q21
2311	aesmc	$dat0,$dat0
2312	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
2313	b.eq	.Lxts_128_enc
2314.Lxts_enc_round_loop:
2315	aese	$dat0,q8
2316	aesmc	$dat0,$dat0
2317	vld1.32	{q8},[$key1],#16		// load key schedule...
2318	aese	$dat0,q9
2319	aesmc	$dat0,$dat0
2320	vld1.32	{q9},[$key1],#16		// load key schedule...
2321	subs	$rounds,$rounds,#2		// bias
2322	b.gt	.Lxts_enc_round_loop
2323.Lxts_128_enc:
2324	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
2325	aese	$dat0,q8
2326	aesmc	$dat0,$dat0
2327	aese	$dat0,q9
2328	aesmc	$dat0,$dat0
2329	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
2330	aese	$dat0,q10
2331	aesmc	$dat0,$dat0
2332	aese	$dat0,q11
2333	aesmc	$dat0,$dat0
2334	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
2335	aese	$dat0,q12
2336	aesmc	$dat0,$dat0
2337	aese	$dat0,q13
2338	aesmc	$dat0,$dat0
2339	vld1.32	{$rndlast},[$key1]
2340	aese	$dat0,q14
2341	aesmc	$dat0,$dat0
2342	aese	$dat0,q15
2343	veor	$dat0,$dat0,$rndlast
2344	veor	$dat0,$dat0,$iv0
2345	vst1.8	{$dat0},[$out]
2346	b	.Lxts_enc_final_abort
2347
2348.align	4
2349.Lxts_enc_big_size:
2350___
2351$code.=<<___	if ($flavour =~ /64/);
2352	stp	$constnumx,$tmpinp,[sp,#-64]!
2353	stp	$tailcnt,$midnumx,[sp,#48]
2354	stp	$ivd10,$ivd20,[sp,#32]
2355	stp	$ivd30,$ivd40,[sp,#16]
2356
2357	// tailcnt store the tail value of length%16.
2358	and	$tailcnt,$len,#0xf
2359	and	$len,$len,#-16
2360	subs	$len,$len,#16
2361	mov	$step,#16
2362	b.lo	.Lxts_abort
2363	csel	$step,xzr,$step,eq
2364
2365	// Firstly, encrypt the iv with key2, as the first iv of XEX.
2366	ldr	$rounds,[$key2,#240]
2367	vld1.32	{$dat},[$key2],#16
2368	vld1.8	{$iv0},[$ivp]
2369	sub	$rounds,$rounds,#2
2370	vld1.32	{$dat1},[$key2],#16
2371
2372.Loop_iv_enc:
2373	aese	$iv0,$dat
2374	aesmc	$iv0,$iv0
2375	vld1.32	{$dat},[$key2],#16
2376	subs	$rounds,$rounds,#2
2377	aese	$iv0,$dat1
2378	aesmc	$iv0,$iv0
2379	vld1.32	{$dat1},[$key2],#16
2380	b.gt	.Loop_iv_enc
2381
2382	aese	$iv0,$dat
2383	aesmc	$iv0,$iv0
2384	vld1.32	{$dat},[$key2]
2385	aese	$iv0,$dat1
2386	veor	$iv0,$iv0,$dat
2387
2388	// The iv for second block
2389	// $ivl- iv(low), $ivh - iv(high)
2390	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2391	fmov	$ivl,$ivd00
2392	fmov	$ivh,$ivd01
2393	mov	$constnum,#0x87
2394	extr	$midnumx,$ivh,$ivh,#32
2395	extr	$ivh,$ivh,$ivl,#63
2396	and	$tmpmw,$constnum,$midnum,asr#31
2397	eor	$ivl,$tmpmx,$ivl,lsl#1
2398	fmov	$ivd10,$ivl
2399	fmov	$ivd11,$ivh
2400
2401	ldr	$rounds0,[$key1,#240]		// next starting point
2402	vld1.8	{$dat},[$inp],$step
2403
2404	vld1.32	{q8-q9},[$key1]			// load key schedule...
2405	sub	$rounds0,$rounds0,#6
2406	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
2407	sub	$rounds0,$rounds0,#2
2408	vld1.32	{q10-q11},[$key_],#32
2409	vld1.32	{q12-q13},[$key_],#32
2410	vld1.32	{q14-q15},[$key_],#32
2411	vld1.32	{$rndlast},[$key_]
2412
2413	add	$key_,$key1,#32
2414	mov	$rounds,$rounds0
2415
2416	// Encryption
2417.Lxts_enc:
2418	vld1.8	{$dat2},[$inp],#16
2419	subs	$len,$len,#32			// bias
2420	add	$rounds,$rounds0,#2
2421	vorr	$in1,$dat,$dat
2422	vorr	$dat1,$dat,$dat
2423	vorr	$in3,$dat,$dat
2424	vorr	$in2,$dat2,$dat2
2425	vorr	$in4,$dat2,$dat2
2426	b.lo	.Lxts_inner_enc_tail
2427	veor	$dat,$dat,$iv0			// before encryption, xor with iv
2428	veor	$dat2,$dat2,$iv1
2429
2430	// The iv for third block
2431	extr	$midnumx,$ivh,$ivh,#32
2432	extr	$ivh,$ivh,$ivl,#63
2433	and	$tmpmw,$constnum,$midnum,asr#31
2434	eor	$ivl,$tmpmx,$ivl,lsl#1
2435	fmov	$ivd20,$ivl
2436	fmov	$ivd21,$ivh
2437
2438
2439	vorr	$dat1,$dat2,$dat2
2440	vld1.8	{$dat2},[$inp],#16
2441	vorr	$in0,$dat,$dat
2442	vorr	$in1,$dat1,$dat1
2443	veor	$in2,$dat2,$iv2 		// the third block
2444	veor	$dat2,$dat2,$iv2
2445	cmp	$len,#32
2446	b.lo	.Lxts_outer_enc_tail
2447
2448	// The iv for fourth block
2449	extr	$midnumx,$ivh,$ivh,#32
2450	extr	$ivh,$ivh,$ivl,#63
2451	and	$tmpmw,$constnum,$midnum,asr#31
2452	eor	$ivl,$tmpmx,$ivl,lsl#1
2453	fmov	$ivd30,$ivl
2454	fmov	$ivd31,$ivh
2455
2456	vld1.8	{$dat3},[$inp],#16
2457	// The iv for fifth block
2458	extr	$midnumx,$ivh,$ivh,#32
2459	extr	$ivh,$ivh,$ivl,#63
2460	and	$tmpmw,$constnum,$midnum,asr#31
2461	eor	$ivl,$tmpmx,$ivl,lsl#1
2462	fmov	$ivd40,$ivl
2463	fmov	$ivd41,$ivh
2464
2465	vld1.8	{$dat4},[$inp],#16
2466	veor	$dat3,$dat3,$iv3		// the fourth block
2467	veor	$dat4,$dat4,$iv4
2468	sub	$len,$len,#32			// bias
2469	mov	$rounds,$rounds0
2470	b	.Loop5x_xts_enc
2471
2472.align	4
2473.Loop5x_xts_enc:
2474	aese	$dat0,q8
2475	aesmc	$dat0,$dat0
2476	aese	$dat1,q8
2477	aesmc	$dat1,$dat1
2478	aese	$dat2,q8
2479	aesmc	$dat2,$dat2
2480	aese	$dat3,q8
2481	aesmc	$dat3,$dat3
2482	aese	$dat4,q8
2483	aesmc	$dat4,$dat4
2484	vld1.32	{q8},[$key_],#16
2485	subs	$rounds,$rounds,#2
2486	aese	$dat0,q9
2487	aesmc	$dat0,$dat0
2488	aese	$dat1,q9
2489	aesmc	$dat1,$dat1
2490	aese	$dat2,q9
2491	aesmc	$dat2,$dat2
2492	aese	$dat3,q9
2493	aesmc	$dat3,$dat3
2494	aese	$dat4,q9
2495	aesmc	$dat4,$dat4
2496	vld1.32	{q9},[$key_],#16
2497	b.gt	.Loop5x_xts_enc
2498
2499	aese	$dat0,q8
2500	aesmc	$dat0,$dat0
2501	aese	$dat1,q8
2502	aesmc	$dat1,$dat1
2503	aese	$dat2,q8
2504	aesmc	$dat2,$dat2
2505	aese	$dat3,q8
2506	aesmc	$dat3,$dat3
2507	aese	$dat4,q8
2508	aesmc	$dat4,$dat4
2509	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x
2510
2511	aese	$dat0,q9
2512	aesmc	$dat0,$dat0
2513	aese	$dat1,q9
2514	aesmc	$dat1,$dat1
2515	aese	$dat2,q9
2516	aesmc	$dat2,$dat2
2517	aese	$dat3,q9
2518	aesmc	$dat3,$dat3
2519	aese	$dat4,q9
2520	aesmc	$dat4,$dat4
2521	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
2522	mov	$key_,$key1
2523
2524	aese	$dat0,q10
2525	aesmc	$dat0,$dat0
2526	aese	$dat1,q10
2527	aesmc	$dat1,$dat1
2528	aese	$dat2,q10
2529	aesmc	$dat2,$dat2
2530	aese	$dat3,q10
2531	aesmc	$dat3,$dat3
2532	aese	$dat4,q10
2533	aesmc	$dat4,$dat4
2534	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
2535						// at exit from the loop v1.16b-v26.16b
2536						// are loaded with last "words"
2537	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x
2538
2539	aese	$dat0,q11
2540	aesmc	$dat0,$dat0
2541	aese	$dat1,q11
2542	aesmc	$dat1,$dat1
2543	aese	$dat2,q11
2544	aesmc	$dat2,$dat2
2545	aese	$dat3,q11
2546	aesmc	$dat3,$dat3
2547	aese	$dat4,q11
2548	aesmc	$dat4,$dat4
2549
2550	aese	$dat0,q12
2551	aesmc	$dat0,$dat0
2552	aese	$dat1,q12
2553	aesmc	$dat1,$dat1
2554	aese	$dat2,q12
2555	aesmc	$dat2,$dat2
2556	aese	$dat3,q12
2557	aesmc	$dat3,$dat3
2558	aese	$dat4,q12
2559	aesmc	$dat4,$dat4
2560
2561	aese	$dat0,q13
2562	aesmc	$dat0,$dat0
2563	aese	$dat1,q13
2564	aesmc	$dat1,$dat1
2565	aese	$dat2,q13
2566	aesmc	$dat2,$dat2
2567	aese	$dat3,q13
2568	aesmc	$dat3,$dat3
2569	aese	$dat4,q13
2570	aesmc	$dat4,$dat4
2571
2572	aese	$dat0,q14
2573	aesmc	$dat0,$dat0
2574	aese	$dat1,q14
2575	aesmc	$dat1,$dat1
2576	aese	$dat2,q14
2577	aesmc	$dat2,$dat2
2578	aese	$dat3,q14
2579	aesmc	$dat3,$dat3
2580	aese	$dat4,q14
2581	aesmc	$dat4,$dat4
2582
2583	veor	$tmp0,$rndlast,$iv0
2584	aese	$dat0,q15
2585	// The iv for first block of one iteration
2586	extr	$midnumx,$ivh,$ivh,#32
2587	extr	$ivh,$ivh,$ivl,#63
2588	and	$tmpmw,$constnum,$midnum,asr#31
2589	eor	$ivl,$tmpmx,$ivl,lsl#1
2590	fmov	$ivd00,$ivl
2591	fmov	$ivd01,$ivh
2592	veor	$tmp1,$rndlast,$iv1
2593	vld1.8	{$in0},[$inp],#16
2594	aese	$dat1,q15
2595	// The iv for second block
2596	extr	$midnumx,$ivh,$ivh,#32
2597	extr	$ivh,$ivh,$ivl,#63
2598	and	$tmpmw,$constnum,$midnum,asr#31
2599	eor	$ivl,$tmpmx,$ivl,lsl#1
2600	fmov	$ivd10,$ivl
2601	fmov	$ivd11,$ivh
2602	veor	$tmp2,$rndlast,$iv2
2603	vld1.8	{$in1},[$inp],#16
2604	aese	$dat2,q15
2605	// The iv for third block
2606	extr	$midnumx,$ivh,$ivh,#32
2607	extr	$ivh,$ivh,$ivl,#63
2608	and	$tmpmw,$constnum,$midnum,asr#31
2609	eor	$ivl,$tmpmx,$ivl,lsl#1
2610	fmov	$ivd20,$ivl
2611	fmov	$ivd21,$ivh
2612	veor	$tmp3,$rndlast,$iv3
2613	vld1.8	{$in2},[$inp],#16
2614	aese	$dat3,q15
2615	// The iv for fourth block
2616	extr	$midnumx,$ivh,$ivh,#32
2617	extr	$ivh,$ivh,$ivl,#63
2618	and	$tmpmw,$constnum,$midnum,asr#31
2619	eor	$ivl,$tmpmx,$ivl,lsl#1
2620	fmov	$ivd30,$ivl
2621	fmov	$ivd31,$ivh
2622	veor	$tmp4,$rndlast,$iv4
2623	vld1.8	{$in3},[$inp],#16
2624	aese	$dat4,q15
2625
2626	// The iv for fifth block
2627	extr	$midnumx,$ivh,$ivh,#32
2628	extr	$ivh,$ivh,$ivl,#63
2629	and	$tmpmw,$constnum,$midnum,asr #31
2630	eor	$ivl,$tmpmx,$ivl,lsl #1
2631	fmov	$ivd40,$ivl
2632	fmov	$ivd41,$ivh
2633
2634	vld1.8	{$in4},[$inp],#16
2635	cbz	$xoffset,.Lxts_enc_tail4x
2636	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
2637	veor	$tmp0,$tmp0,$dat0
2638	veor	$dat0,$in0,$iv0
2639	veor	$tmp1,$tmp1,$dat1
2640	veor	$dat1,$in1,$iv1
2641	veor	$tmp2,$tmp2,$dat2
2642	veor	$dat2,$in2,$iv2
2643	veor	$tmp3,$tmp3,$dat3
2644	veor	$dat3,$in3,$iv3
2645	veor	$tmp4,$tmp4,$dat4
2646	vst1.8	{$tmp0},[$out],#16
2647	veor	$dat4,$in4,$iv4
2648	vst1.8	{$tmp1},[$out],#16
2649	mov	$rounds,$rounds0
2650	vst1.8	{$tmp2},[$out],#16
2651	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
2652	vst1.8	{$tmp3},[$out],#16
2653	vst1.8	{$tmp4},[$out],#16
2654	b.hs	.Loop5x_xts_enc
2655
2656
2657	// If left 4 blocks, borrow the five block's processing.
2658	cmn	$len,#0x10
2659	b.ne	.Loop5x_enc_after
2660	vorr	$iv4,$iv3,$iv3
2661	vorr	$iv3,$iv2,$iv2
2662	vorr	$iv2,$iv1,$iv1
2663	vorr	$iv1,$iv0,$iv0
2664	fmov	$ivl,$ivd40
2665	fmov	$ivh,$ivd41
2666	veor	$dat0,$iv0,$in0
2667	veor	$dat1,$iv1,$in1
2668	veor	$dat2,$in2,$iv2
2669	veor	$dat3,$in3,$iv3
2670	veor	$dat4,$in4,$iv4
2671	b.eq	.Loop5x_xts_enc
2672
2673.Loop5x_enc_after:
2674	add	$len,$len,#0x50
2675	cbz	$len,.Lxts_enc_done
2676
2677	add	$rounds,$rounds0,#2
2678	subs	$len,$len,#0x30
2679	b.lo	.Lxts_inner_enc_tail
2680
2681	veor	$dat0,$iv0,$in2
2682	veor	$dat1,$iv1,$in3
2683	veor	$dat2,$in4,$iv2
2684	b	.Lxts_outer_enc_tail
2685
2686.align	4
2687.Lxts_enc_tail4x:
2688	add	$inp,$inp,#16
2689	veor	$tmp1,$dat1,$tmp1
2690	vst1.8	{$tmp1},[$out],#16
2691	veor	$tmp2,$dat2,$tmp2
2692	vst1.8	{$tmp2},[$out],#16
2693	veor	$tmp3,$dat3,$tmp3
2694	veor	$tmp4,$dat4,$tmp4
2695	vst1.8	{$tmp3-$tmp4},[$out],#32
2696
2697	b	.Lxts_enc_done
2698.align	4
2699.Lxts_outer_enc_tail:
2700	aese	$dat0,q8
2701	aesmc	$dat0,$dat0
2702	aese	$dat1,q8
2703	aesmc	$dat1,$dat1
2704	aese	$dat2,q8
2705	aesmc	$dat2,$dat2
2706	vld1.32	{q8},[$key_],#16
2707	subs	$rounds,$rounds,#2
2708	aese	$dat0,q9
2709	aesmc	$dat0,$dat0
2710	aese	$dat1,q9
2711	aesmc	$dat1,$dat1
2712	aese	$dat2,q9
2713	aesmc	$dat2,$dat2
2714	vld1.32	{q9},[$key_],#16
2715	b.gt	.Lxts_outer_enc_tail
2716
2717	aese	$dat0,q8
2718	aesmc	$dat0,$dat0
2719	aese	$dat1,q8
2720	aesmc	$dat1,$dat1
2721	aese	$dat2,q8
2722	aesmc	$dat2,$dat2
2723	veor	$tmp0,$iv0,$rndlast
2724	subs	$len,$len,#0x30
2725	// The iv for first block
2726	fmov	$ivl,$ivd20
2727	fmov	$ivh,$ivd21
2728	//mov	$constnum,#0x87
2729	extr	$midnumx,$ivh,$ivh,#32
2730	extr	$ivh,$ivh,$ivl,#63
2731	and	$tmpmw,$constnum,$midnum,asr#31
2732	eor	$ivl,$tmpmx,$ivl,lsl#1
2733	fmov	$ivd00,$ivl
2734	fmov	$ivd01,$ivh
2735	veor	$tmp1,$iv1,$rndlast
2736	csel	$xoffset,$len,$xoffset,lo       // x6, w6, is zero at this point
2737	aese	$dat0,q9
2738	aesmc	$dat0,$dat0
2739	aese	$dat1,q9
2740	aesmc	$dat1,$dat1
2741	aese	$dat2,q9
2742	aesmc	$dat2,$dat2
2743	veor	$tmp2,$iv2,$rndlast
2744
2745	add	$xoffset,$xoffset,#0x20
2746	add	$inp,$inp,$xoffset
2747	mov	$key_,$key1
2748
2749	aese	$dat0,q12
2750	aesmc	$dat0,$dat0
2751	aese	$dat1,q12
2752	aesmc	$dat1,$dat1
2753	aese	$dat2,q12
2754	aesmc	$dat2,$dat2
2755	aese	$dat0,q13
2756	aesmc	$dat0,$dat0
2757	aese	$dat1,q13
2758	aesmc	$dat1,$dat1
2759	aese	$dat2,q13
2760	aesmc	$dat2,$dat2
2761	aese	$dat0,q14
2762	aesmc	$dat0,$dat0
2763	aese	$dat1,q14
2764	aesmc	$dat1,$dat1
2765	aese	$dat2,q14
2766	aesmc	$dat2,$dat2
2767	aese	$dat0,q15
2768	aese	$dat1,q15
2769	aese	$dat2,q15
2770	vld1.8	{$in2},[$inp],#16
2771	add	$rounds,$rounds0,#2
2772	vld1.32	{q8},[$key_],#16                // re-pre-load rndkey[0]
2773	veor	$tmp0,$tmp0,$dat0
2774	veor	$tmp1,$tmp1,$dat1
2775	veor	$dat2,$dat2,$tmp2
2776	vld1.32	{q9},[$key_],#16                // re-pre-load rndkey[1]
2777	vst1.8	{$tmp0},[$out],#16
2778	vst1.8	{$tmp1},[$out],#16
2779	vst1.8	{$dat2},[$out],#16
2780	cmn	$len,#0x30
2781	b.eq	.Lxts_enc_done
2782.Lxts_encxor_one:
2783	vorr	$in3,$in1,$in1
2784	vorr	$in4,$in2,$in2
2785	nop
2786
2787.Lxts_inner_enc_tail:
2788	cmn	$len,#0x10
2789	veor	$dat1,$in3,$iv0
2790	veor	$dat2,$in4,$iv1
2791	b.eq	.Lxts_enc_tail_loop
2792	veor	$dat2,$in4,$iv0
2793.Lxts_enc_tail_loop:
2794	aese	$dat1,q8
2795	aesmc	$dat1,$dat1
2796	aese	$dat2,q8
2797	aesmc	$dat2,$dat2
2798	vld1.32	{q8},[$key_],#16
2799	subs	$rounds,$rounds,#2
2800	aese	$dat1,q9
2801	aesmc	$dat1,$dat1
2802	aese	$dat2,q9
2803	aesmc	$dat2,$dat2
2804	vld1.32	{q9},[$key_],#16
2805	b.gt	.Lxts_enc_tail_loop
2806
2807	aese	$dat1,q8
2808	aesmc	$dat1,$dat1
2809	aese	$dat2,q8
2810	aesmc	$dat2,$dat2
2811	aese	$dat1,q9
2812	aesmc	$dat1,$dat1
2813	aese	$dat2,q9
2814	aesmc	$dat2,$dat2
2815	aese	$dat1,q12
2816	aesmc	$dat1,$dat1
2817	aese	$dat2,q12
2818	aesmc	$dat2,$dat2
2819	cmn	$len,#0x20
2820	aese	$dat1,q13
2821	aesmc	$dat1,$dat1
2822	aese	$dat2,q13
2823	aesmc	$dat2,$dat2
2824	veor	$tmp1,$iv0,$rndlast
2825	aese	$dat1,q14
2826	aesmc	$dat1,$dat1
2827	aese	$dat2,q14
2828	aesmc	$dat2,$dat2
2829	veor	$tmp2,$iv1,$rndlast
2830	aese	$dat1,q15
2831	aese	$dat2,q15
2832	b.eq	.Lxts_enc_one
2833	veor	$tmp1,$tmp1,$dat1
2834	vst1.8	{$tmp1},[$out],#16
2835	veor	$tmp2,$tmp2,$dat2
2836	vorr	$iv0,$iv1,$iv1
2837	vst1.8	{$tmp2},[$out],#16
2838	fmov	$ivl,$ivd10
2839	fmov	$ivh,$ivd11
2840	mov	$constnum,#0x87
2841	extr	$midnumx,$ivh,$ivh,#32
2842	extr	$ivh,$ivh,$ivl,#63
2843	and	$tmpmw,$constnum,$midnum,asr #31
2844	eor	$ivl,$tmpmx,$ivl,lsl #1
2845	fmov	$ivd00,$ivl
2846	fmov	$ivd01,$ivh
2847	b	.Lxts_enc_done
2848
2849.Lxts_enc_one:
2850	veor	$tmp1,$tmp1,$dat2
2851	vorr	$iv0,$iv0,$iv0
2852	vst1.8	{$tmp1},[$out],#16
2853	fmov	$ivl,$ivd00
2854	fmov	$ivh,$ivd01
2855	mov	$constnum,#0x87
2856	extr	$midnumx,$ivh,$ivh,#32
2857	extr	$ivh,$ivh,$ivl,#63
2858	and	$tmpmw,$constnum,$midnum,asr #31
2859	eor	$ivl,$tmpmx,$ivl,lsl #1
2860	fmov	$ivd00,$ivl
2861	fmov	$ivd01,$ivh
2862	b	.Lxts_enc_done
2863.align	5
2864.Lxts_enc_done:
2865	// Process the tail block with cipher stealing.
2866	tst	$tailcnt,#0xf
2867	b.eq	.Lxts_abort
2868
2869	mov	$tmpinp,$inp
2870	mov	$tmpoutp,$out
2871	sub	$out,$out,#16
2872.composite_enc_loop:
2873	subs	$tailcnt,$tailcnt,#1
2874	ldrb	$l2outp,[$out,$tailcnt]
2875	ldrb	$loutp,[$tmpinp,$tailcnt]
2876	strb	$l2outp,[$tmpoutp,$tailcnt]
2877	strb	$loutp,[$out,$tailcnt]
2878	b.gt	.composite_enc_loop
2879.Lxts_enc_load_done:
2880	vld1.8	{$tmpin},[$out]
2881	veor	$tmpin,$tmpin,$iv0
2882
2883	// Encrypt the composite block to get the last second encrypted text block
2884	ldr	$rounds,[$key1,#240]		// load key schedule...
2885	vld1.32	{$dat},[$key1],#16
2886	sub	$rounds,$rounds,#2
2887	vld1.32	{$dat1},[$key1],#16		// load key schedule...
2888.Loop_final_enc:
2889	aese	$tmpin,$dat0
2890	aesmc	$tmpin,$tmpin
2891	vld1.32	{$dat0},[$key1],#16
2892	subs	$rounds,$rounds,#2
2893	aese	$tmpin,$dat1
2894	aesmc	$tmpin,$tmpin
2895	vld1.32	{$dat1},[$key1],#16
2896	b.gt	.Loop_final_enc
2897
2898	aese	$tmpin,$dat0
2899	aesmc	$tmpin,$tmpin
2900	vld1.32	{$dat0},[$key1]
2901	aese	$tmpin,$dat1
2902	veor	$tmpin,$tmpin,$dat0
2903	veor	$tmpin,$tmpin,$iv0
2904	vst1.8	{$tmpin},[$out]
2905
2906.Lxts_abort:
2907	ldp	$tailcnt,$midnumx,[sp,#48]
2908	ldp	$ivd10,$ivd20,[sp,#32]
2909	ldp	$ivd30,$ivd40,[sp,#16]
2910	ldp	$constnumx,$tmpinp,[sp],#64
2911.Lxts_enc_final_abort:
2912	ret
2913.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2914___
2915
2916}}}
2917{{{
2918my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2919my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2920my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2921my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2922my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2923my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2924my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
2925my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2926my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2927
2928my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2929
2930# q7	last round key
2931# q10-q15, q7	Last 7 round keys
2932# q8-q9	preloaded round keys except last 7 keys for big size
2933# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
2934
2935{
2936my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2937
2938my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
2939my ($dat4,$in4,$tmp4);
2940if ($flavour =~ /64/) {
2941    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2942}
2943
2944$code.=<<___	if ($flavour =~ /64/);
2945.globl	${prefix}_xts_decrypt
2946.type	${prefix}_xts_decrypt,%function
2947.align	5
2948${prefix}_xts_decrypt:
2949___
2950$code.=<<___	if ($flavour =~ /64/);
2951	cmp	$len,#16
2952	// Original input data size bigger than 16, jump to big size processing.
2953	b.ne	.Lxts_dec_big_size
2954	// Encrypt the iv with key2, as the first XEX iv.
2955	ldr	$rounds,[$key2,#240]
2956	vld1.32	{$dat},[$key2],#16
2957	vld1.8	{$iv0},[$ivp]
2958	sub	$rounds,$rounds,#2
2959	vld1.32	{$dat1},[$key2],#16
2960
2961.Loop_dec_small_iv_enc:
2962	aese	$iv0,$dat
2963	aesmc	$iv0,$iv0
2964	vld1.32	{$dat},[$key2],#16
2965	subs	$rounds,$rounds,#2
2966	aese	$iv0,$dat1
2967	aesmc	$iv0,$iv0
2968	vld1.32	{$dat1},[$key2],#16
2969	b.gt	.Loop_dec_small_iv_enc
2970
2971	aese	$iv0,$dat
2972	aesmc	$iv0,$iv0
2973	vld1.32	{$dat},[$key2]
2974	aese	$iv0,$dat1
2975	veor	$iv0,$iv0,$dat
2976
2977	vld1.8	{$dat0},[$inp]
2978	veor	$dat0,$iv0,$dat0
2979
2980	ldr	$rounds,[$key1,#240]
2981	vld1.32	{q20-q21},[$key1],#32			// load key schedule...
2982
2983	aesd	$dat0,q20
2984	aesimc	$dat0,$dat0
2985	vld1.32	{q8-q9},[$key1],#32			// load key schedule...
2986	aesd	$dat0,q21
2987	aesimc	$dat0,$dat0
2988	subs	$rounds,$rounds,#10			// bias
2989	b.eq	.Lxts_128_dec
2990.Lxts_dec_round_loop:
2991	aesd	$dat0,q8
2992	aesimc	$dat0,$dat0
2993	vld1.32	{q8},[$key1],#16			// load key schedule...
2994	aesd	$dat0,q9
2995	aesimc	$dat0,$dat0
2996	vld1.32	{q9},[$key1],#16			// load key schedule...
2997	subs	$rounds,$rounds,#2			// bias
2998	b.gt	.Lxts_dec_round_loop
2999.Lxts_128_dec:
3000	vld1.32	{q10-q11},[$key1],#32			// load key schedule...
3001	aesd	$dat0,q8
3002	aesimc	$dat0,$dat0
3003	aesd	$dat0,q9
3004	aesimc	$dat0,$dat0
3005	vld1.32	{q12-q13},[$key1],#32			// load key schedule...
3006	aesd	$dat0,q10
3007	aesimc	$dat0,$dat0
3008	aesd	$dat0,q11
3009	aesimc	$dat0,$dat0
3010	vld1.32	{q14-q15},[$key1],#32			// load key schedule...
3011	aesd	$dat0,q12
3012	aesimc	$dat0,$dat0
3013	aesd	$dat0,q13
3014	aesimc	$dat0,$dat0
3015	vld1.32	{$rndlast},[$key1]
3016	aesd	$dat0,q14
3017	aesimc	$dat0,$dat0
3018	aesd	$dat0,q15
3019	veor	$dat0,$dat0,$rndlast
3020	veor	$dat0,$iv0,$dat0
3021	vst1.8	{$dat0},[$out]
3022	b	.Lxts_dec_final_abort
3023.Lxts_dec_big_size:
3024___
3025$code.=<<___	if ($flavour =~ /64/);
3026	stp	$constnumx,$tmpinp,[sp,#-64]!
3027	stp	$tailcnt,$midnumx,[sp,#48]
3028	stp	$ivd10,$ivd20,[sp,#32]
3029	stp	$ivd30,$ivd40,[sp,#16]
3030
3031	and	$tailcnt,$len,#0xf
3032	and	$len,$len,#-16
3033	subs	$len,$len,#16
3034	mov	$step,#16
3035	b.lo	.Lxts_dec_abort
3036
3037	// Encrypt the iv with key2, as the first XEX iv
3038	ldr	$rounds,[$key2,#240]
3039	vld1.32	{$dat},[$key2],#16
3040	vld1.8	{$iv0},[$ivp]
3041	sub	$rounds,$rounds,#2
3042	vld1.32	{$dat1},[$key2],#16
3043
3044.Loop_dec_iv_enc:
3045	aese	$iv0,$dat
3046	aesmc	$iv0,$iv0
3047	vld1.32	{$dat},[$key2],#16
3048	subs	$rounds,$rounds,#2
3049	aese	$iv0,$dat1
3050	aesmc	$iv0,$iv0
3051	vld1.32	{$dat1},[$key2],#16
3052	b.gt	.Loop_dec_iv_enc
3053
3054	aese	$iv0,$dat
3055	aesmc	$iv0,$iv0
3056	vld1.32	{$dat},[$key2]
3057	aese	$iv0,$dat1
3058	veor	$iv0,$iv0,$dat
3059
3060	// The iv for second block
3061	// $ivl- iv(low), $ivh - iv(high)
3062	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3063	fmov	$ivl,$ivd00
3064	fmov	$ivh,$ivd01
3065	mov	$constnum,#0x87
3066	extr	$midnumx,$ivh,$ivh,#32
3067	extr	$ivh,$ivh,$ivl,#63
3068	and	$tmpmw,$constnum,$midnum,asr #31
3069	eor	$ivl,$tmpmx,$ivl,lsl #1
3070	fmov	$ivd10,$ivl
3071	fmov	$ivd11,$ivh
3072
3073	ldr	$rounds0,[$key1,#240]		// load rounds number
3074
3075	// The iv for third block
3076	extr	$midnumx,$ivh,$ivh,#32
3077	extr	$ivh,$ivh,$ivl,#63
3078	and	$tmpmw,$constnum,$midnum,asr #31
3079	eor	$ivl,$tmpmx,$ivl,lsl #1
3080	fmov	$ivd20,$ivl
3081	fmov	$ivd21,$ivh
3082
3083	vld1.32	{q8-q9},[$key1]			// load key schedule...
3084	sub	$rounds0,$rounds0,#6
3085	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
3086	sub	$rounds0,$rounds0,#2
3087	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
3088	vld1.32	{q12-q13},[$key_],#32
3089	vld1.32	{q14-q15},[$key_],#32
3090	vld1.32	{$rndlast},[$key_]
3091
3092	// The iv for fourth block
3093	extr	$midnumx,$ivh,$ivh,#32
3094	extr	$ivh,$ivh,$ivl,#63
3095	and	$tmpmw,$constnum,$midnum,asr #31
3096	eor	$ivl,$tmpmx,$ivl,lsl #1
3097	fmov	$ivd30,$ivl
3098	fmov	$ivd31,$ivh
3099
3100	add	$key_,$key1,#32
3101	mov	$rounds,$rounds0
3102	b	.Lxts_dec
3103
3104	// Decryption
3105.align	5
3106.Lxts_dec:
3107	tst	$tailcnt,#0xf
3108	b.eq	.Lxts_dec_begin
3109	subs	$len,$len,#16
3110	csel	$step,xzr,$step,eq
3111	vld1.8	{$dat},[$inp],#16
3112	b.lo	.Lxts_done
3113	sub	$inp,$inp,#16
3114.Lxts_dec_begin:
3115	vld1.8	{$dat},[$inp],$step
3116	subs	$len,$len,#32			// bias
3117	add	$rounds,$rounds0,#2
3118	vorr	$in1,$dat,$dat
3119	vorr	$dat1,$dat,$dat
3120	vorr	$in3,$dat,$dat
3121	vld1.8	{$dat2},[$inp],#16
3122	vorr	$in2,$dat2,$dat2
3123	vorr	$in4,$dat2,$dat2
3124	b.lo	.Lxts_inner_dec_tail
3125	veor	$dat,$dat,$iv0			// before decryt, xor with iv
3126	veor	$dat2,$dat2,$iv1
3127
3128	vorr	$dat1,$dat2,$dat2
3129	vld1.8	{$dat2},[$inp],#16
3130	vorr	$in0,$dat,$dat
3131	vorr	$in1,$dat1,$dat1
3132	veor	$in2,$dat2,$iv2			// third block xox with third iv
3133	veor	$dat2,$dat2,$iv2
3134	cmp	$len,#32
3135	b.lo	.Lxts_outer_dec_tail
3136
3137	vld1.8	{$dat3},[$inp],#16
3138
3139	// The iv for fifth block
3140	extr	$midnumx,$ivh,$ivh,#32
3141	extr	$ivh,$ivh,$ivl,#63
3142	and	$tmpmw,$constnum,$midnum,asr #31
3143	eor	$ivl,$tmpmx,$ivl,lsl #1
3144	fmov	$ivd40,$ivl
3145	fmov	$ivd41,$ivh
3146
3147	vld1.8	{$dat4},[$inp],#16
3148	veor	$dat3,$dat3,$iv3		// the fourth block
3149	veor	$dat4,$dat4,$iv4
3150	sub $len,$len,#32			// bias
3151	mov	$rounds,$rounds0
3152	b	.Loop5x_xts_dec
3153
3154.align	4
3155.Loop5x_xts_dec:
3156	aesd	$dat0,q8
3157	aesimc	$dat0,$dat0
3158	aesd	$dat1,q8
3159	aesimc	$dat1,$dat1
3160	aesd	$dat2,q8
3161	aesimc	$dat2,$dat2
3162	aesd	$dat3,q8
3163	aesimc	$dat3,$dat3
3164	aesd	$dat4,q8
3165	aesimc	$dat4,$dat4
3166	vld1.32	{q8},[$key_],#16		// load key schedule...
3167	subs	$rounds,$rounds,#2
3168	aesd	$dat0,q9
3169	aesimc	$dat0,$dat0
3170	aesd	$dat1,q9
3171	aesimc	$dat1,$dat1
3172	aesd	$dat2,q9
3173	aesimc	$dat2,$dat2
3174	aesd	$dat3,q9
3175	aesimc	$dat3,$dat3
3176	aesd	$dat4,q9
3177	aesimc	$dat4,$dat4
3178	vld1.32	{q9},[$key_],#16		// load key schedule...
3179	b.gt	.Loop5x_xts_dec
3180
3181	aesd	$dat0,q8
3182	aesimc	$dat0,$dat0
3183	aesd	$dat1,q8
3184	aesimc	$dat1,$dat1
3185	aesd	$dat2,q8
3186	aesimc	$dat2,$dat2
3187	aesd	$dat3,q8
3188	aesimc	$dat3,$dat3
3189	aesd	$dat4,q8
3190	aesimc	$dat4,$dat4
3191	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x
3192
3193	aesd	$dat0,q9
3194	aesimc	$dat0,$dat
3195	aesd	$dat1,q9
3196	aesimc	$dat1,$dat1
3197	aesd	$dat2,q9
3198	aesimc	$dat2,$dat2
3199	aesd	$dat3,q9
3200	aesimc	$dat3,$dat3
3201	aesd	$dat4,q9
3202	aesimc	$dat4,$dat4
3203	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
3204	mov	$key_,$key1
3205
3206	aesd	$dat0,q10
3207	aesimc	$dat0,$dat0
3208	aesd	$dat1,q10
3209	aesimc	$dat1,$dat1
3210	aesd	$dat2,q10
3211	aesimc	$dat2,$dat2
3212	aesd	$dat3,q10
3213	aesimc	$dat3,$dat3
3214	aesd	$dat4,q10
3215	aesimc	$dat4,$dat4
3216	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
3217						// at exit from the loop v1.16b-v26.16b
3218						// are loaded with last "words"
3219	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x
3220
3221	aesd	$dat0,q11
3222	aesimc	$dat0,$dat0
3223	aesd	$dat1,q11
3224	aesimc	$dat1,$dat1
3225	aesd	$dat2,q11
3226	aesimc	$dat2,$dat2
3227	aesd	$dat3,q11
3228	aesimc	$dat3,$dat3
3229	aesd	$dat4,q11
3230	aesimc	$dat4,$dat4
3231
3232	aesd	$dat0,q12
3233	aesimc	$dat0,$dat0
3234	aesd	$dat1,q12
3235	aesimc	$dat1,$dat1
3236	aesd	$dat2,q12
3237	aesimc	$dat2,$dat2
3238	aesd	$dat3,q12
3239	aesimc	$dat3,$dat3
3240	aesd	$dat4,q12
3241	aesimc	$dat4,$dat4
3242
3243	aesd	$dat0,q13
3244	aesimc	$dat0,$dat0
3245	aesd	$dat1,q13
3246	aesimc	$dat1,$dat1
3247	aesd	$dat2,q13
3248	aesimc	$dat2,$dat2
3249	aesd	$dat3,q13
3250	aesimc	$dat3,$dat3
3251	aesd	$dat4,q13
3252	aesimc	$dat4,$dat4
3253
3254	aesd	$dat0,q14
3255	aesimc	$dat0,$dat0
3256	aesd	$dat1,q14
3257	aesimc	$dat1,$dat1
3258	aesd	$dat2,q14
3259	aesimc	$dat2,$dat2
3260	aesd	$dat3,q14
3261	aesimc	$dat3,$dat3
3262	aesd	$dat4,q14
3263	aesimc	$dat4,$dat4
3264
3265	veor	$tmp0,$rndlast,$iv0
3266	aesd	$dat0,q15
3267	// The iv for first block of next iteration.
3268	extr	$midnumx,$ivh,$ivh,#32
3269	extr	$ivh,$ivh,$ivl,#63
3270	and	$tmpmw,$constnum,$midnum,asr #31
3271	eor	$ivl,$tmpmx,$ivl,lsl #1
3272	fmov	$ivd00,$ivl
3273	fmov	$ivd01,$ivh
3274	veor	$tmp1,$rndlast,$iv1
3275	vld1.8	{$in0},[$inp],#16
3276	aesd	$dat1,q15
3277	// The iv for second block
3278	extr	$midnumx,$ivh,$ivh,#32
3279	extr	$ivh,$ivh,$ivl,#63
3280	and	$tmpmw,$constnum,$midnum,asr #31
3281	eor	$ivl,$tmpmx,$ivl,lsl #1
3282	fmov	$ivd10,$ivl
3283	fmov	$ivd11,$ivh
3284	veor	$tmp2,$rndlast,$iv2
3285	vld1.8	{$in1},[$inp],#16
3286	aesd	$dat2,q15
3287	// The iv for third block
3288	extr	$midnumx,$ivh,$ivh,#32
3289	extr	$ivh,$ivh,$ivl,#63
3290	and	$tmpmw,$constnum,$midnum,asr #31
3291	eor	$ivl,$tmpmx,$ivl,lsl #1
3292	fmov	$ivd20,$ivl
3293	fmov	$ivd21,$ivh
3294	veor	$tmp3,$rndlast,$iv3
3295	vld1.8	{$in2},[$inp],#16
3296	aesd	$dat3,q15
3297	// The iv for fourth block
3298	extr	$midnumx,$ivh,$ivh,#32
3299	extr	$ivh,$ivh,$ivl,#63
3300	and	$tmpmw,$constnum,$midnum,asr #31
3301	eor	$ivl,$tmpmx,$ivl,lsl #1
3302	fmov	$ivd30,$ivl
3303	fmov	$ivd31,$ivh
3304	veor	$tmp4,$rndlast,$iv4
3305	vld1.8	{$in3},[$inp],#16
3306	aesd	$dat4,q15
3307
3308	// The iv for fifth block
3309	extr	$midnumx,$ivh,$ivh,#32
3310	extr	$ivh,$ivh,$ivl,#63
3311	and	$tmpmw,$constnum,$midnum,asr #31
3312	eor	$ivl,$tmpmx,$ivl,lsl #1
3313	fmov	$ivd40,$ivl
3314	fmov	$ivd41,$ivh
3315
3316	vld1.8	{$in4},[$inp],#16
3317	cbz	$xoffset,.Lxts_dec_tail4x
3318	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
3319	veor	$tmp0,$tmp0,$dat0
3320	veor	$dat0,$in0,$iv0
3321	veor	$tmp1,$tmp1,$dat1
3322	veor	$dat1,$in1,$iv1
3323	veor	$tmp2,$tmp2,$dat2
3324	veor	$dat2,$in2,$iv2
3325	veor	$tmp3,$tmp3,$dat3
3326	veor	$dat3,$in3,$iv3
3327	veor	$tmp4,$tmp4,$dat4
3328	vst1.8	{$tmp0},[$out],#16
3329	veor	$dat4,$in4,$iv4
3330	vst1.8	{$tmp1},[$out],#16
3331	mov	$rounds,$rounds0
3332	vst1.8	{$tmp2},[$out],#16
3333	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
3334	vst1.8	{$tmp3},[$out],#16
3335	vst1.8	{$tmp4},[$out],#16
3336	b.hs	.Loop5x_xts_dec
3337
3338	cmn	$len,#0x10
3339	b.ne	.Loop5x_dec_after
3340	// If x2($len) equal to -0x10, the left blocks is 4.
3341	// After specially processing, utilize the five blocks processing again.
3342	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3343	vorr	$iv4,$iv3,$iv3
3344	vorr	$iv3,$iv2,$iv2
3345	vorr	$iv2,$iv1,$iv1
3346	vorr	$iv1,$iv0,$iv0
3347	fmov	$ivl,$ivd40
3348	fmov	$ivh,$ivd41
3349	veor	$dat0,$iv0,$in0
3350	veor	$dat1,$iv1,$in1
3351	veor	$dat2,$in2,$iv2
3352	veor	$dat3,$in3,$iv3
3353	veor	$dat4,$in4,$iv4
3354	b.eq	.Loop5x_xts_dec
3355
3356.Loop5x_dec_after:
3357	add	$len,$len,#0x50
3358	cbz	$len,.Lxts_done
3359
3360	add	$rounds,$rounds0,#2
3361	subs	$len,$len,#0x30
3362	b.lo	.Lxts_inner_dec_tail
3363
3364	veor	$dat0,$iv0,$in2
3365	veor	$dat1,$iv1,$in3
3366	veor	$dat2,$in4,$iv2
3367	b	.Lxts_outer_dec_tail
3368
3369.align	4
3370.Lxts_dec_tail4x:
3371	add	$inp,$inp,#16
3372	tst	$tailcnt,#0xf
3373	veor	$tmp1,$dat1,$tmp0
3374	vst1.8	{$tmp1},[$out],#16
3375	veor	$tmp2,$dat2,$tmp2
3376	vst1.8	{$tmp2},[$out],#16
3377	veor	$tmp3,$dat3,$tmp3
3378	veor	$tmp4,$dat4,$tmp4
3379	vst1.8	{$tmp3-$tmp4},[$out],#32
3380
3381	b.eq	.Lxts_dec_abort
3382	vld1.8	{$dat0},[$inp],#16
3383	b	.Lxts_done
3384.align	4
3385.Lxts_outer_dec_tail:
3386	aesd	$dat0,q8
3387	aesimc	$dat0,$dat0
3388	aesd	$dat1,q8
3389	aesimc	$dat1,$dat1
3390	aesd	$dat2,q8
3391	aesimc	$dat2,$dat2
3392	vld1.32	{q8},[$key_],#16
3393	subs	$rounds,$rounds,#2
3394	aesd	$dat0,q9
3395	aesimc	$dat0,$dat0
3396	aesd	$dat1,q9
3397	aesimc	$dat1,$dat1
3398	aesd	$dat2,q9
3399	aesimc	$dat2,$dat2
3400	vld1.32	{q9},[$key_],#16
3401	b.gt	.Lxts_outer_dec_tail
3402
3403	aesd	$dat0,q8
3404	aesimc	$dat0,$dat0
3405	aesd	$dat1,q8
3406	aesimc	$dat1,$dat1
3407	aesd	$dat2,q8
3408	aesimc	$dat2,$dat2
3409	veor	$tmp0,$iv0,$rndlast
3410	subs	$len,$len,#0x30
3411	// The iv for first block
3412	fmov	$ivl,$ivd20
3413	fmov	$ivh,$ivd21
3414	mov	$constnum,#0x87
3415	extr	$midnumx,$ivh,$ivh,#32
3416	extr	$ivh,$ivh,$ivl,#63
3417	and	$tmpmw,$constnum,$midnum,asr #31
3418	eor	$ivl,$tmpmx,$ivl,lsl #1
3419	fmov	$ivd00,$ivl
3420	fmov	$ivd01,$ivh
3421	veor	$tmp1,$iv1,$rndlast
3422	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
3423	aesd	$dat0,q9
3424	aesimc	$dat0,$dat0
3425	aesd	$dat1,q9
3426	aesimc	$dat1,$dat1
3427	aesd	$dat2,q9
3428	aesimc	$dat2,$dat2
3429	veor	$tmp2,$iv2,$rndlast
3430	// The iv for second block
3431	extr	$midnumx,$ivh,$ivh,#32
3432	extr	$ivh,$ivh,$ivl,#63
3433	and	$tmpmw,$constnum,$midnum,asr #31
3434	eor	$ivl,$tmpmx,$ivl,lsl #1
3435	fmov	$ivd10,$ivl
3436	fmov	$ivd11,$ivh
3437
3438	add	$xoffset,$xoffset,#0x20
3439	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data
3440
3441	mov	$key_,$key1
3442
3443	// The iv for third block
3444	extr	$midnumx,$ivh,$ivh,#32
3445	extr	$ivh,$ivh,$ivl,#63
3446	and	$tmpmw,$constnum,$midnum,asr #31
3447	eor	$ivl,$tmpmx,$ivl,lsl #1
3448	fmov	$ivd20,$ivl
3449	fmov	$ivd21,$ivh
3450
3451	aesd	$dat0,q12
3452	aesimc	$dat0,$dat0
3453	aesd	$dat1,q12
3454	aesimc	$dat1,$dat1
3455	aesd	$dat2,q12
3456	aesimc	$dat2,$dat2
3457	aesd	$dat0,q13
3458	aesimc	$dat0,$dat0
3459	aesd	$dat1,q13
3460	aesimc	$dat1,$dat1
3461	aesd	$dat2,q13
3462	aesimc	$dat2,$dat2
3463	aesd	$dat0,q14
3464	aesimc	$dat0,$dat0
3465	aesd	$dat1,q14
3466	aesimc	$dat1,$dat1
3467	aesd	$dat2,q14
3468	aesimc	$dat2,$dat2
3469	vld1.8	{$in2},[$inp],#16
3470	aesd	$dat0,q15
3471	aesd	$dat1,q15
3472	aesd	$dat2,q15
3473	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
3474	add	$rounds,$rounds0,#2
3475	veor	$tmp0,$tmp0,$dat0
3476	veor	$tmp1,$tmp1,$dat1
3477	veor	$dat2,$dat2,$tmp2
3478	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
3479	vst1.8	{$tmp0},[$out],#16
3480	vst1.8	{$tmp1},[$out],#16
3481	vst1.8	{$dat2},[$out],#16
3482
3483	cmn	$len,#0x30
3484	add	$len,$len,#0x30
3485	b.eq	.Lxts_done
3486	sub	$len,$len,#0x30
3487	vorr	$in3,$in1,$in1
3488	vorr	$in4,$in2,$in2
3489	nop
3490
3491.Lxts_inner_dec_tail:
3492	// $len == -0x10 means two blocks left.
3493	cmn	$len,#0x10
3494	veor	$dat1,$in3,$iv0
3495	veor	$dat2,$in4,$iv1
3496	b.eq	.Lxts_dec_tail_loop
3497	veor	$dat2,$in4,$iv0
3498.Lxts_dec_tail_loop:
3499	aesd	$dat1,q8
3500	aesimc	$dat1,$dat1
3501	aesd	$dat2,q8
3502	aesimc	$dat2,$dat2
3503	vld1.32	{q8},[$key_],#16
3504	subs	$rounds,$rounds,#2
3505	aesd	$dat1,q9
3506	aesimc	$dat1,$dat1
3507	aesd	$dat2,q9
3508	aesimc	$dat2,$dat2
3509	vld1.32	{q9},[$key_],#16
3510	b.gt	.Lxts_dec_tail_loop
3511
3512	aesd	$dat1,q8
3513	aesimc	$dat1,$dat1
3514	aesd	$dat2,q8
3515	aesimc	$dat2,$dat2
3516	aesd	$dat1,q9
3517	aesimc	$dat1,$dat1
3518	aesd	$dat2,q9
3519	aesimc	$dat2,$dat2
3520	aesd	$dat1,q12
3521	aesimc	$dat1,$dat1
3522	aesd	$dat2,q12
3523	aesimc	$dat2,$dat2
3524	cmn	$len,#0x20
3525	aesd	$dat1,q13
3526	aesimc	$dat1,$dat1
3527	aesd	$dat2,q13
3528	aesimc	$dat2,$dat2
3529	veor	$tmp1,$iv0,$rndlast
3530	aesd	$dat1,q14
3531	aesimc	$dat1,$dat1
3532	aesd	$dat2,q14
3533	aesimc	$dat2,$dat2
3534	veor	$tmp2,$iv1,$rndlast
3535	aesd	$dat1,q15
3536	aesd	$dat2,q15
3537	b.eq	.Lxts_dec_one
3538	veor	$tmp1,$tmp1,$dat1
3539	veor	$tmp2,$tmp2,$dat2
3540	vorr	$iv0,$iv2,$iv2
3541	vorr	$iv1,$iv3,$iv3
3542	vst1.8	{$tmp1},[$out],#16
3543	vst1.8	{$tmp2},[$out],#16
3544	add	$len,$len,#16
3545	b	.Lxts_done
3546
3547.Lxts_dec_one:
3548	veor	$tmp1,$tmp1,$dat2
3549	vorr	$iv0,$iv1,$iv1
3550	vorr	$iv1,$iv2,$iv2
3551	vst1.8	{$tmp1},[$out],#16
3552	add	$len,$len,#32
3553
3554.Lxts_done:
3555	tst	$tailcnt,#0xf
3556	b.eq	.Lxts_dec_abort
3557	// Processing the last two blocks with cipher stealing.
3558	mov	x7,x3
3559	cbnz	x2,.Lxts_dec_1st_done
3560	vld1.8	{$dat0},[$inp],#16
3561
3562	// Decrypt the last secod block to get the last plain text block
3563.Lxts_dec_1st_done:
3564	eor	$tmpin,$dat0,$iv1
3565	ldr	$rounds,[$key1,#240]
3566	vld1.32	{$dat0},[$key1],#16
3567	sub	$rounds,$rounds,#2
3568	vld1.32	{$dat1},[$key1],#16
3569.Loop_final_2nd_dec:
3570	aesd	$tmpin,$dat0
3571	aesimc	$tmpin,$tmpin
3572	vld1.32	{$dat0},[$key1],#16		// load key schedule...
3573	subs	$rounds,$rounds,#2
3574	aesd	$tmpin,$dat1
3575	aesimc	$tmpin,$tmpin
3576	vld1.32	{$dat1},[$key1],#16		// load key schedule...
3577	b.gt	.Loop_final_2nd_dec
3578
3579	aesd	$tmpin,$dat0
3580	aesimc	$tmpin,$tmpin
3581	vld1.32	{$dat0},[$key1]
3582	aesd	$tmpin,$dat1
3583	veor	$tmpin,$tmpin,$dat0
3584	veor	$tmpin,$tmpin,$iv1
3585	vst1.8	{$tmpin},[$out]
3586
3587	mov	$tmpinp,$inp
3588	add	$tmpoutp,$out,#16
3589
3590	// Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
3591	// to get the last encrypted block.
3592.composite_dec_loop:
3593	subs	$tailcnt,$tailcnt,#1
3594	ldrb	$l2outp,[$out,$tailcnt]
3595	ldrb	$loutp,[$tmpinp,$tailcnt]
3596	strb	$l2outp,[$tmpoutp,$tailcnt]
3597	strb	$loutp,[$out,$tailcnt]
3598	b.gt	.composite_dec_loop
3599.Lxts_dec_load_done:
3600	vld1.8	{$tmpin},[$out]
3601	veor	$tmpin,$tmpin,$iv0
3602
3603	// Decrypt the composite block to get the last second plain text block
3604	ldr	$rounds,[$key_,#240]
3605	vld1.32	{$dat},[$key_],#16
3606	sub	$rounds,$rounds,#2
3607	vld1.32	{$dat1},[$key_],#16
3608.Loop_final_dec:
3609	aesd	$tmpin,$dat0
3610	aesimc	$tmpin,$tmpin
3611	vld1.32	{$dat0},[$key_],#16		// load key schedule...
3612	subs	$rounds,$rounds,#2
3613	aesd	$tmpin,$dat1
3614	aesimc	$tmpin,$tmpin
3615	vld1.32	{$dat1},[$key_],#16		// load key schedule...
3616	b.gt	.Loop_final_dec
3617
3618	aesd	$tmpin,$dat0
3619	aesimc	$tmpin,$tmpin
3620	vld1.32	{$dat0},[$key_]
3621	aesd	$tmpin,$dat1
3622	veor	$tmpin,$tmpin,$dat0
3623	veor	$tmpin,$tmpin,$iv0
3624	vst1.8	{$tmpin},[$out]
3625
3626.Lxts_dec_abort:
3627	ldp	$tailcnt,$midnumx,[sp,#48]
3628	ldp	$ivd10,$ivd20,[sp,#32]
3629	ldp	$ivd30,$ivd40,[sp,#16]
3630	ldp	$constnumx,$tmpinp,[sp],#64
3631
3632.Lxts_dec_final_abort:
3633	ret
3634.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3635___
3636}
3637}}}
3638$code.=<<___;
3639#endif
3640___
3641########################################
3642if ($flavour =~ /64/) {			######## 64-bit code
3643    my %opcode = (
3644	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
3645	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
3646
3647    local *unaes = sub {
3648	my ($mnemonic,$arg)=@_;
3649
3650	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
3651	sprintf ".inst\t0x%08x\t//%s %s",
3652			$opcode{$mnemonic}|$1|($2<<5),
3653			$mnemonic,$arg;
3654    };
3655
3656    foreach(split("\n",$code)) {
3657	s/\`([^\`]*)\`/eval($1)/geo;
3658
3659	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
3660	s/@\s/\/\//o;			# old->new style commentary
3661
3662	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
3663	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
3664	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
3665	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
3666	s/vext\.8/ext/o		or
3667	s/vrev32\.8/rev32/o	or
3668	s/vtst\.8/cmtst/o	or
3669	s/vshr/ushr/o		or
3670	s/^(\s+)v/$1/o		or	# strip off v prefix
3671	s/\bbx\s+lr\b/ret/o;
3672
3673	# fix up remaining legacy suffixes
3674	s/\.[ui]?8//o;
3675	m/\],#8/o and s/\.16b/\.8b/go;
3676	s/\.[ui]?32//o and s/\.16b/\.4s/go;
3677	s/\.[ui]?64//o and s/\.16b/\.2d/go;
3678	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
3679
3680	print $_,"\n";
3681    }
3682} else {				######## 32-bit code
3683    my %opcode = (
3684	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
3685	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
3686
3687    local *unaes = sub {
3688	my ($mnemonic,$arg)=@_;
3689
3690	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3691	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3692					 |(($2&7)<<1) |(($2&8)<<2);
3693	    # since ARMv7 instructions are always encoded little-endian.
3694	    # correct solution is to use .inst directive, but older
3695	    # assemblers don't implement it:-(
3696	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3697			$word&0xff,($word>>8)&0xff,
3698			($word>>16)&0xff,($word>>24)&0xff,
3699			$mnemonic,$arg;
3700	}
3701    };
3702
3703    sub unvtbl {
3704	my $arg=shift;
3705
3706	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3707	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
3708		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3709    }
3710
3711    sub unvdup32 {
3712	my $arg=shift;
3713
3714	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3715	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3716    }
3717
3718    sub unvmov32 {
3719	my $arg=shift;
3720
3721	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3722	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3723    }
3724
3725    foreach(split("\n",$code)) {
3726	s/\`([^\`]*)\`/eval($1)/geo;
3727
3728	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
3729	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
3730	s/\/\/\s?/@ /o;				# new->old style commentary
3731
3732	# fix up remaining new-style suffixes
3733	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
3734	s/\],#[0-9]+/]!/o;
3735
3736	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
3737	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
3738	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
3739	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
3740	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
3741	s/^(\s+)b\./$1b/o				or
3742	s/^(\s+)ret/$1bx\tlr/o;
3743
3744	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3745	    print "	it	$2\n";
3746	}
3747
3748	print $_,"\n";
3749    }
3750}
3751
3752close STDOUT or die "error closing STDOUT: $!";
3753