1 #include "arm_arch.h"
2 
3 #if __ARM_MAX_ARCH__>=7
4 
5 .text
6 .align	5
7 Lrcon:
8 .long	0x01,0x01,0x01,0x01
9 .long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
10 .long	0x1b,0x1b,0x1b,0x1b
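// The Lrcon table above supplies the key-schedule constants: the first vector
// is the initial round constant 0x01 (doubled with shl on each iteration), the
// second is the tbl mask that rotates the last key word and splats it across
// all lanes, and the third is 0x1b, the reduced round constant used once 0x80
// can no longer be doubled within a byte.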
11 
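// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   int aes_v8_set_encrypt_key(const unsigned char *userKey, int bits,
//                              AES_KEY *key);
// x0 = userKey, w1 = bits (128/192/256), x2 = key schedule to fill.
// Returns 0 on success, -1 for NULL pointers, -2 for an unsupported bit count.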
12 .globl	_aes_v8_set_encrypt_key
13 
14 .align	5
15 _aes_v8_set_encrypt_key:
16 Lenc_key:
17 	stp	x29,x30,[sp,#-16]!
18 	add	x29,sp,#0
19 	mov	x3,#-1
20 	cmp	x0,#0
21 	b.eq	Lenc_key_abort
22 	cmp	x2,#0
23 	b.eq	Lenc_key_abort
24 	mov	x3,#-2
25 	cmp	w1,#128
26 	b.lt	Lenc_key_abort
27 	cmp	w1,#256
28 	b.gt	Lenc_key_abort
29 	tst	w1,#0x3f
30 	b.ne	Lenc_key_abort
31 
32 	adr	x3,Lrcon
33 	cmp	w1,#192
34 
35 	eor	v0.16b,v0.16b,v0.16b
36 	ld1	{v3.16b},[x0],#16
37 	mov	w1,#8		// reuse w1
38 	ld1	{v1.4s,v2.4s},[x3],#32
39 
40 	b.lt	Loop128
41 	b.eq	L192
42 	b	L256
43 
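// Key-expansion trick used below: tbl rotates the last key word and splats it
// into every lane; aese with the all-zero round key in v0 then amounts to
// SubWord on that word (AddRoundKey with zero changes nothing, and ShiftRows
// is a no-op when all four columns are equal), and the eor with v1 adds the
// round constant.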
44 .align	4
45 Loop128:
46 	tbl	v6.16b,{v3.16b},v2.16b
47 	ext	v5.16b,v0.16b,v3.16b,#12
48 	st1	{v3.4s},[x2],#16
49 	aese	v6.16b,v0.16b
50 	subs	w1,w1,#1
51 
52 	eor	v3.16b,v3.16b,v5.16b
53 	ext	v5.16b,v0.16b,v5.16b,#12
54 	eor	v3.16b,v3.16b,v5.16b
55 	ext	v5.16b,v0.16b,v5.16b,#12
56 	eor	v6.16b,v6.16b,v1.16b
57 	eor	v3.16b,v3.16b,v5.16b
58 	shl	v1.16b,v1.16b,#1
59 	eor	v3.16b,v3.16b,v6.16b
60 	b.ne	Loop128
61 
62 	ld1	{v1.4s},[x3]
63 
64 	tbl	v6.16b,{v3.16b},v2.16b
65 	ext	v5.16b,v0.16b,v3.16b,#12
66 	st1	{v3.4s},[x2],#16
67 	aese	v6.16b,v0.16b
68 
69 	eor	v3.16b,v3.16b,v5.16b
70 	ext	v5.16b,v0.16b,v5.16b,#12
71 	eor	v3.16b,v3.16b,v5.16b
72 	ext	v5.16b,v0.16b,v5.16b,#12
73 	eor	v6.16b,v6.16b,v1.16b
74 	eor	v3.16b,v3.16b,v5.16b
75 	shl	v1.16b,v1.16b,#1
76 	eor	v3.16b,v3.16b,v6.16b
77 
78 	tbl	v6.16b,{v3.16b},v2.16b
79 	ext	v5.16b,v0.16b,v3.16b,#12
80 	st1	{v3.4s},[x2],#16
81 	aese	v6.16b,v0.16b
82 
83 	eor	v3.16b,v3.16b,v5.16b
84 	ext	v5.16b,v0.16b,v5.16b,#12
85 	eor	v3.16b,v3.16b,v5.16b
86 	ext	v5.16b,v0.16b,v5.16b,#12
87 	eor	v6.16b,v6.16b,v1.16b
88 	eor	v3.16b,v3.16b,v5.16b
89 	eor	v3.16b,v3.16b,v6.16b
90 	st1	{v3.4s},[x2]
91 	add	x2,x2,#0x50
92 
93 	mov	w12,#10
94 	b	Ldone
95 
96 .align	4
97 L192:
98 	ld1	{v4.8b},[x0],#8
99 	movi	v6.16b,#8			// borrow v6.16b
100 	st1	{v3.4s},[x2],#16
101 	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
102 
103 Loop192:
104 	tbl	v6.16b,{v4.16b},v2.16b
105 	ext	v5.16b,v0.16b,v3.16b,#12
106 #ifdef __AARCH64EB__
107 	st1	{v4.4s},[x2],#16
108 	sub	x2,x2,#8
109 #else
110 	st1	{v4.8b},[x2],#8
111 #endif
112 	aese	v6.16b,v0.16b
113 	subs	w1,w1,#1
114 
115 	eor	v3.16b,v3.16b,v5.16b
116 	ext	v5.16b,v0.16b,v5.16b,#12
117 	eor	v3.16b,v3.16b,v5.16b
118 	ext	v5.16b,v0.16b,v5.16b,#12
119 	eor	v3.16b,v3.16b,v5.16b
120 
121 	dup	v5.4s,v3.s[3]
122 	eor	v5.16b,v5.16b,v4.16b
123 	eor	v6.16b,v6.16b,v1.16b
124 	ext	v4.16b,v0.16b,v4.16b,#12
125 	shl	v1.16b,v1.16b,#1
126 	eor	v4.16b,v4.16b,v5.16b
127 	eor	v3.16b,v3.16b,v6.16b
128 	eor	v4.16b,v4.16b,v6.16b
129 	st1	{v3.4s},[x2],#16
130 	b.ne	Loop192
131 
132 	mov	w12,#12
133 	add	x2,x2,#0x20
134 	b	Ldone
135 
136 .align	4
137 L256:
138 	ld1	{v4.16b},[x0]
139 	mov	w1,#7
140 	mov	w12,#14
141 	st1	{v3.4s},[x2],#16
142 
143 Loop256:
144 	tbl	v6.16b,{v4.16b},v2.16b
145 	ext	v5.16b,v0.16b,v3.16b,#12
146 	st1	{v4.4s},[x2],#16
147 	aese	v6.16b,v0.16b
148 	subs	w1,w1,#1
149 
150 	eor	v3.16b,v3.16b,v5.16b
151 	ext	v5.16b,v0.16b,v5.16b,#12
152 	eor	v3.16b,v3.16b,v5.16b
153 	ext	v5.16b,v0.16b,v5.16b,#12
154 	eor	v6.16b,v6.16b,v1.16b
155 	eor	v3.16b,v3.16b,v5.16b
156 	shl	v1.16b,v1.16b,#1
157 	eor	v3.16b,v3.16b,v6.16b
158 	st1	{v3.4s},[x2],#16
159 	b.eq	Ldone
160 
161 	dup	v6.4s,v3.s[3]		// just splat
162 	ext	v5.16b,v0.16b,v4.16b,#12
163 	aese	v6.16b,v0.16b
164 
165 	eor	v4.16b,v4.16b,v5.16b
166 	ext	v5.16b,v0.16b,v5.16b,#12
167 	eor	v4.16b,v4.16b,v5.16b
168 	ext	v5.16b,v0.16b,v5.16b,#12
169 	eor	v4.16b,v4.16b,v5.16b
170 
171 	eor	v4.16b,v4.16b,v6.16b
172 	b	Loop256
173 
174 Ldone:
175 	str	w12,[x2]
176 	mov	x3,#0
177 
178 Lenc_key_abort:
179 	mov	x0,x3			// return value
180 	ldr	x29,[sp],#16
181 	ret
182 
183 
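// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   int aes_v8_set_decrypt_key(const unsigned char *userKey, int bits,
//                              AES_KEY *key);
// Builds the encryption schedule via Lenc_key, then converts it in place for
// decryption: the round keys are reversed in order and the inner ones are run
// through aesimc (InvMixColumns). Return values match aes_v8_set_encrypt_key.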
184 .globl	_aes_v8_set_decrypt_key
185 
186 .align	5
187 _aes_v8_set_decrypt_key:
188 .long	0xd503233f		// paciasp
189 	stp	x29,x30,[sp,#-16]!
190 	add	x29,sp,#0
191 	bl	Lenc_key
192 
193 	cmp	x0,#0
194 	b.ne	Ldec_key_abort
195 
196 	sub	x2,x2,#240		// restore original x2
197 	mov	x4,#-16
198 	add	x0,x2,x12,lsl#4	// end of key schedule
199 
200 	ld1	{v0.4s},[x2]
201 	ld1	{v1.4s},[x0]
202 	st1	{v0.4s},[x0],x4
203 	st1	{v1.4s},[x2],#16
204 
205 Loop_imc:
206 	ld1	{v0.4s},[x2]
207 	ld1	{v1.4s},[x0]
208 	aesimc	v0.16b,v0.16b
209 	aesimc	v1.16b,v1.16b
210 	st1	{v0.4s},[x0],x4
211 	st1	{v1.4s},[x2],#16
212 	cmp	x0,x2
213 	b.hi	Loop_imc
214 
215 	ld1	{v0.4s},[x2]
216 	aesimc	v0.16b,v0.16b
217 	st1	{v0.4s},[x0]
218 
219 	eor	x0,x0,x0		// return value
220 Ldec_key_abort:
221 	ldp	x29,x30,[sp],#16
222 .long	0xd50323bf		// autiasp
223 	ret
224 
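// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
//                       const AES_KEY *key);
// Encrypts one 16-byte block: x0 = in, x1 = out, x2 = key schedule; the round
// count is read from offset 240 (key->rounds).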
225 .globl	_aes_v8_encrypt
226 
227 .align	5
228 _aes_v8_encrypt:
229 	ldr	w3,[x2,#240]
230 	ld1	{v0.4s},[x2],#16
231 	ld1	{v2.16b},[x0]
232 	sub	w3,w3,#2
233 	ld1	{v1.4s},[x2],#16
234 
235 Loop_enc:
236 	aese	v2.16b,v0.16b
237 	aesmc	v2.16b,v2.16b
238 	ld1	{v0.4s},[x2],#16
239 	subs	w3,w3,#2
240 	aese	v2.16b,v1.16b
241 	aesmc	v2.16b,v2.16b
242 	ld1	{v1.4s},[x2],#16
243 	b.gt	Loop_enc
244 
245 	aese	v2.16b,v0.16b
246 	aesmc	v2.16b,v2.16b
247 	ld1	{v0.4s},[x2]
248 	aese	v2.16b,v1.16b
249 	eor	v2.16b,v2.16b,v0.16b
250 
251 	st1	{v2.16b},[x1]
252 	ret
253 
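// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
//                       const AES_KEY *key);
// Single-block decryption; same register usage as aes_v8_encrypt, but using
// aesd/aesimc with the decryption key schedule.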
254 .globl	_aes_v8_decrypt
255 
256 .align	5
257 _aes_v8_decrypt:
258 	ldr	w3,[x2,#240]
259 	ld1	{v0.4s},[x2],#16
260 	ld1	{v2.16b},[x0]
261 	sub	w3,w3,#2
262 	ld1	{v1.4s},[x2],#16
263 
264 Loop_dec:
265 	aesd	v2.16b,v0.16b
266 	aesimc	v2.16b,v2.16b
267 	ld1	{v0.4s},[x2],#16
268 	subs	w3,w3,#2
269 	aesd	v2.16b,v1.16b
270 	aesimc	v2.16b,v2.16b
271 	ld1	{v1.4s},[x2],#16
272 	b.gt	Loop_dec
273 
274 	aesd	v2.16b,v0.16b
275 	aesimc	v2.16b,v2.16b
276 	ld1	{v0.4s},[x2]
277 	aesd	v2.16b,v1.16b
278 	eor	v2.16b,v2.16b,v0.16b
279 
280 	st1	{v2.16b},[x1]
281 	ret
282 
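// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   void aes_v8_ecb_encrypt(const unsigned char *in, unsigned char *out,
//                           size_t length, const AES_KEY *key, int enc);
// x0 = in, x1 = out, x2 = length in bytes, x3 = key schedule, w4 = nonzero to
// encrypt / zero to decrypt. A single 16-byte input takes the short path
// below; any other length branches to Lecb_big_size.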
283 .globl	_aes_v8_ecb_encrypt
284 
285 .align	5
286 _aes_v8_ecb_encrypt:
287 	subs	x2,x2,#16
288 	// If the input length is not exactly 16 bytes, take the big-size path.
289 	b.ne	Lecb_big_size
290 	ld1	{v0.16b},[x0]
291 	cmp	w4,#0					// en- or decrypting?
292 	ldr	w5,[x3,#240]
293 	ld1	{v5.4s,v6.4s},[x3],#32			// load key schedule...
294 
295 	b.eq	Lecb_small_dec
296 	aese	v0.16b,v5.16b
297 	aesmc	v0.16b,v0.16b
298 	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
299 	aese	v0.16b,v6.16b
300 	aesmc	v0.16b,v0.16b
301 	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
302 	b.eq	Lecb_128_enc
303 Lecb_round_loop:
304 	aese	v0.16b,v16.16b
305 	aesmc	v0.16b,v0.16b
306 	ld1	{v16.4s},[x3],#16				// load key schedule...
307 	aese	v0.16b,v17.16b
308 	aesmc	v0.16b,v0.16b
309 	ld1	{v17.4s},[x3],#16				// load key schedule...
310 	subs	w5,w5,#2			// bias
311 	b.gt	Lecb_round_loop
312 Lecb_128_enc:
313 	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
314 	aese	v0.16b,v16.16b
315 	aesmc	v0.16b,v0.16b
316 	aese	v0.16b,v17.16b
317 	aesmc	v0.16b,v0.16b
318 	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
319 	aese	v0.16b,v18.16b
320 	aesmc	v0.16b,v0.16b
321 	aese	v0.16b,v19.16b
322 	aesmc	v0.16b,v0.16b
323 	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
324 	aese	v0.16b,v20.16b
325 	aesmc	v0.16b,v0.16b
326 	aese	v0.16b,v21.16b
327 	aesmc	v0.16b,v0.16b
328 	ld1	{v7.4s},[x3]
329 	aese	v0.16b,v22.16b
330 	aesmc	v0.16b,v0.16b
331 	aese	v0.16b,v23.16b
332 	eor	v0.16b,v0.16b,v7.16b
333 	st1	{v0.16b},[x1]
334 	b	Lecb_Final_abort
335 Lecb_small_dec:
336 	aesd	v0.16b,v5.16b
337 	aesimc	v0.16b,v0.16b
338 	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
339 	aesd	v0.16b,v6.16b
340 	aesimc	v0.16b,v0.16b
341 	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb decryption
342 	b.eq	Lecb_128_dec
343 Lecb_dec_round_loop:
344 	aesd	v0.16b,v16.16b
345 	aesimc	v0.16b,v0.16b
346 	ld1	{v16.4s},[x3],#16				// load key schedule...
347 	aesd	v0.16b,v17.16b
348 	aesimc	v0.16b,v0.16b
349 	ld1	{v17.4s},[x3],#16				// load key schedule...
350 	subs	w5,w5,#2			// bias
351 	b.gt	Lecb_dec_round_loop
352 Lecb_128_dec:
353 	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
354 	aesd	v0.16b,v16.16b
355 	aesimc	v0.16b,v0.16b
356 	aesd	v0.16b,v17.16b
357 	aesimc	v0.16b,v0.16b
358 	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
359 	aesd	v0.16b,v18.16b
360 	aesimc	v0.16b,v0.16b
361 	aesd	v0.16b,v19.16b
362 	aesimc	v0.16b,v0.16b
363 	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
364 	aesd	v0.16b,v20.16b
365 	aesimc	v0.16b,v0.16b
366 	aesd	v0.16b,v21.16b
367 	aesimc	v0.16b,v0.16b
368 	ld1	{v7.4s},[x3]
369 	aesd	v0.16b,v22.16b
370 	aesimc	v0.16b,v0.16b
371 	aesd	v0.16b,v23.16b
372 	eor	v0.16b,v0.16b,v7.16b
373 	st1	{v0.16b},[x1]
374 	b	Lecb_Final_abort
375 Lecb_big_size:
376 	stp	x29,x30,[sp,#-16]!
377 	add	x29,sp,#0
378 	mov	x8,#16
379 	b.lo	Lecb_done
380 	csel	x8,xzr,x8,eq
381 
382 	cmp	w4,#0					// en- or decrypting?
383 	ldr	w5,[x3,#240]
384 	and	x2,x2,#-16
385 	ld1	{v0.16b},[x0],x8
386 
387 	ld1	{v16.4s,v17.4s},[x3]				// load key schedule...
388 	sub	w5,w5,#6
389 	add	x7,x3,x5,lsl#4				// pointer to last 7 round keys
390 	sub	w5,w5,#2
391 	ld1	{v18.4s,v19.4s},[x7],#32
392 	ld1	{v20.4s,v21.4s},[x7],#32
393 	ld1	{v22.4s,v23.4s},[x7],#32
394 	ld1	{v7.4s},[x7]
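	// Worked example for rounds==10 (AES-128): x7 pointed at rd_key[4] above,
	// so v18-v23 and v7 now hold rd_key[4]..rd_key[10], the last 7 round keys;
	// the earlier keys are streamed through v16/v17 in the round loops, with
	// w5 = rounds-8 as the loop-counter bias.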
395 
396 	add	x7,x3,#32
397 	mov	w6,w5
398 	b.eq	Lecb_dec
399 
400 	ld1	{v1.16b},[x0],#16
401 	subs	x2,x2,#32				// bias
402 	add	w6,w5,#2
403 	orr	v3.16b,v1.16b,v1.16b
404 	orr	v24.16b,v1.16b,v1.16b
405 	orr	v1.16b,v0.16b,v0.16b
406 	b.lo	Lecb_enc_tail
407 
408 	orr	v1.16b,v3.16b,v3.16b
409 	ld1	{v24.16b},[x0],#16
410 	cmp	x2,#32
411 	b.lo	Loop3x_ecb_enc
412 
413 	ld1	{v25.16b},[x0],#16
414 	ld1	{v26.16b},[x0],#16
415 	sub	x2,x2,#32				// bias
416 	mov	w6,w5
417 
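// Main ECB encrypt path: five blocks are kept in flight per iteration to keep
// the aese/aesmc pipeline busy; leftovers drop into Loop3x_ecb_enc and the
// tail code below.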
418 Loop5x_ecb_enc:
419 	aese	v0.16b,v16.16b
420 	aesmc	v0.16b,v0.16b
421 	aese	v1.16b,v16.16b
422 	aesmc	v1.16b,v1.16b
423 	aese	v24.16b,v16.16b
424 	aesmc	v24.16b,v24.16b
425 	aese	v25.16b,v16.16b
426 	aesmc	v25.16b,v25.16b
427 	aese	v26.16b,v16.16b
428 	aesmc	v26.16b,v26.16b
429 	ld1	{v16.4s},[x7],#16
430 	subs	w6,w6,#2
431 	aese	v0.16b,v17.16b
432 	aesmc	v0.16b,v0.16b
433 	aese	v1.16b,v17.16b
434 	aesmc	v1.16b,v1.16b
435 	aese	v24.16b,v17.16b
436 	aesmc	v24.16b,v24.16b
437 	aese	v25.16b,v17.16b
438 	aesmc	v25.16b,v25.16b
439 	aese	v26.16b,v17.16b
440 	aesmc	v26.16b,v26.16b
441 	ld1	{v17.4s},[x7],#16
442 	b.gt	Loop5x_ecb_enc
443 
444 	aese	v0.16b,v16.16b
445 	aesmc	v0.16b,v0.16b
446 	aese	v1.16b,v16.16b
447 	aesmc	v1.16b,v1.16b
448 	aese	v24.16b,v16.16b
449 	aesmc	v24.16b,v24.16b
450 	aese	v25.16b,v16.16b
451 	aesmc	v25.16b,v25.16b
452 	aese	v26.16b,v16.16b
453 	aesmc	v26.16b,v26.16b
454 	cmp	x2,#0x40					// because Lecb_enc_tail4x
455 	sub	x2,x2,#0x50
456 
457 	aese	v0.16b,v17.16b
458 	aesmc	v0.16b,v0.16b
459 	aese	v1.16b,v17.16b
460 	aesmc	v1.16b,v1.16b
461 	aese	v24.16b,v17.16b
462 	aesmc	v24.16b,v24.16b
463 	aese	v25.16b,v17.16b
464 	aesmc	v25.16b,v25.16b
465 	aese	v26.16b,v17.16b
466 	aesmc	v26.16b,v26.16b
467 	csel	x6,xzr,x2,gt			// borrow x6 (w6); "gt" is not a typo
468 	mov	x7,x3
469 
470 	aese	v0.16b,v18.16b
471 	aesmc	v0.16b,v0.16b
472 	aese	v1.16b,v18.16b
473 	aesmc	v1.16b,v1.16b
474 	aese	v24.16b,v18.16b
475 	aesmc	v24.16b,v24.16b
476 	aese	v25.16b,v18.16b
477 	aesmc	v25.16b,v25.16b
478 	aese	v26.16b,v18.16b
479 	aesmc	v26.16b,v26.16b
480 	add	x0,x0,x6				// x0 is adjusted so that at exit
481 							// from the loop v1.16b-v26.16b are
482 							// loaded with the last "words"
483 	add	x6,x2,#0x60		    // because Lecb_enc_tail4x
484 
485 	aese	v0.16b,v19.16b
486 	aesmc	v0.16b,v0.16b
487 	aese	v1.16b,v19.16b
488 	aesmc	v1.16b,v1.16b
489 	aese	v24.16b,v19.16b
490 	aesmc	v24.16b,v24.16b
491 	aese	v25.16b,v19.16b
492 	aesmc	v25.16b,v25.16b
493 	aese	v26.16b,v19.16b
494 	aesmc	v26.16b,v26.16b
495 
496 	aese	v0.16b,v20.16b
497 	aesmc	v0.16b,v0.16b
498 	aese	v1.16b,v20.16b
499 	aesmc	v1.16b,v1.16b
500 	aese	v24.16b,v20.16b
501 	aesmc	v24.16b,v24.16b
502 	aese	v25.16b,v20.16b
503 	aesmc	v25.16b,v25.16b
504 	aese	v26.16b,v20.16b
505 	aesmc	v26.16b,v26.16b
506 
507 	aese	v0.16b,v21.16b
508 	aesmc	v0.16b,v0.16b
509 	aese	v1.16b,v21.16b
510 	aesmc	v1.16b,v1.16b
511 	aese	v24.16b,v21.16b
512 	aesmc	v24.16b,v24.16b
513 	aese	v25.16b,v21.16b
514 	aesmc	v25.16b,v25.16b
515 	aese	v26.16b,v21.16b
516 	aesmc	v26.16b,v26.16b
517 
518 	aese	v0.16b,v22.16b
519 	aesmc	v0.16b,v0.16b
520 	aese	v1.16b,v22.16b
521 	aesmc	v1.16b,v1.16b
522 	aese	v24.16b,v22.16b
523 	aesmc	v24.16b,v24.16b
524 	aese	v25.16b,v22.16b
525 	aesmc	v25.16b,v25.16b
526 	aese	v26.16b,v22.16b
527 	aesmc	v26.16b,v26.16b
528 
529 	aese	v0.16b,v23.16b
530 	ld1	{v2.16b},[x0],#16
531 	aese	v1.16b,v23.16b
532 	ld1	{v3.16b},[x0],#16
533 	aese	v24.16b,v23.16b
534 	ld1	{v27.16b},[x0],#16
535 	aese	v25.16b,v23.16b
536 	ld1	{v28.16b},[x0],#16
537 	aese	v26.16b,v23.16b
538 	ld1	{v29.16b},[x0],#16
539 	cbz	x6,Lecb_enc_tail4x
540 	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
541 	eor	v4.16b,v7.16b,v0.16b
542 	orr	v0.16b,v2.16b,v2.16b
543 	eor	v5.16b,v7.16b,v1.16b
544 	orr	v1.16b,v3.16b,v3.16b
545 	eor	v17.16b,v7.16b,v24.16b
546 	orr	v24.16b,v27.16b,v27.16b
547 	eor	v30.16b,v7.16b,v25.16b
548 	orr	v25.16b,v28.16b,v28.16b
549 	eor	v31.16b,v7.16b,v26.16b
550 	st1	{v4.16b},[x1],#16
551 	orr	v26.16b,v29.16b,v29.16b
552 	st1	{v5.16b},[x1],#16
553 	mov	w6,w5
554 	st1	{v17.16b},[x1],#16
555 	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
556 	st1	{v30.16b},[x1],#16
557 	st1	{v31.16b},[x1],#16
558 	b.hs	Loop5x_ecb_enc
559 
560 	add	x2,x2,#0x50
561 	cbz	x2,Lecb_done
562 
563 	add	w6,w5,#2
564 	subs	x2,x2,#0x30
565 	orr	v0.16b,v27.16b,v27.16b
566 	orr	v1.16b,v28.16b,v28.16b
567 	orr	v24.16b,v29.16b,v29.16b
568 	b.lo	Lecb_enc_tail
569 
570 	b	Loop3x_ecb_enc
571 
572 .align	4
573 Lecb_enc_tail4x:
574 	eor	v5.16b,v7.16b,v1.16b
575 	eor	v17.16b,v7.16b,v24.16b
576 	eor	v30.16b,v7.16b,v25.16b
577 	eor	v31.16b,v7.16b,v26.16b
578 	st1	{v5.16b},[x1],#16
579 	st1	{v17.16b},[x1],#16
580 	st1	{v30.16b},[x1],#16
581 	st1	{v31.16b},[x1],#16
582 
583 	b	Lecb_done
584 .align	4
585 Loop3x_ecb_enc:
586 	aese	v0.16b,v16.16b
587 	aesmc	v0.16b,v0.16b
588 	aese	v1.16b,v16.16b
589 	aesmc	v1.16b,v1.16b
590 	aese	v24.16b,v16.16b
591 	aesmc	v24.16b,v24.16b
592 	ld1	{v16.4s},[x7],#16
593 	subs	w6,w6,#2
594 	aese	v0.16b,v17.16b
595 	aesmc	v0.16b,v0.16b
596 	aese	v1.16b,v17.16b
597 	aesmc	v1.16b,v1.16b
598 	aese	v24.16b,v17.16b
599 	aesmc	v24.16b,v24.16b
600 	ld1	{v17.4s},[x7],#16
601 	b.gt	Loop3x_ecb_enc
602 
603 	aese	v0.16b,v16.16b
604 	aesmc	v0.16b,v0.16b
605 	aese	v1.16b,v16.16b
606 	aesmc	v1.16b,v1.16b
607 	aese	v24.16b,v16.16b
608 	aesmc	v24.16b,v24.16b
609 	subs	x2,x2,#0x30
610 	csel	x6,x2,x6,lo				// x6 (w6) is zero at this point
611 	aese	v0.16b,v17.16b
612 	aesmc	v0.16b,v0.16b
613 	aese	v1.16b,v17.16b
614 	aesmc	v1.16b,v1.16b
615 	aese	v24.16b,v17.16b
616 	aesmc	v24.16b,v24.16b
617 	add	x0,x0,x6			// x0 is adjusted so that at exit
618 						// from the loop v1.16b-v24.16b are
619 						// loaded with the last "words"
620 	mov	x7,x3
621 	aese	v0.16b,v20.16b
622 	aesmc	v0.16b,v0.16b
623 	aese	v1.16b,v20.16b
624 	aesmc	v1.16b,v1.16b
625 	aese	v24.16b,v20.16b
626 	aesmc	v24.16b,v24.16b
627 	ld1	{v2.16b},[x0],#16
628 	aese	v0.16b,v21.16b
629 	aesmc	v0.16b,v0.16b
630 	aese	v1.16b,v21.16b
631 	aesmc	v1.16b,v1.16b
632 	aese	v24.16b,v21.16b
633 	aesmc	v24.16b,v24.16b
634 	ld1	{v3.16b},[x0],#16
635 	aese	v0.16b,v22.16b
636 	aesmc	v0.16b,v0.16b
637 	aese	v1.16b,v22.16b
638 	aesmc	v1.16b,v1.16b
639 	aese	v24.16b,v22.16b
640 	aesmc	v24.16b,v24.16b
641 	ld1	{v27.16b},[x0],#16
642 	aese	v0.16b,v23.16b
643 	aese	v1.16b,v23.16b
644 	aese	v24.16b,v23.16b
645 	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
646 	add	w6,w5,#2
647 	eor	v4.16b,v7.16b,v0.16b
648 	eor	v5.16b,v7.16b,v1.16b
649 	eor	v24.16b,v24.16b,v7.16b
650 	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
651 	st1	{v4.16b},[x1],#16
652 	orr	v0.16b,v2.16b,v2.16b
653 	st1	{v5.16b},[x1],#16
654 	orr	v1.16b,v3.16b,v3.16b
655 	st1	{v24.16b},[x1],#16
656 	orr	v24.16b,v27.16b,v27.16b
657 	b.hs	Loop3x_ecb_enc
658 
659 	cmn	x2,#0x30
660 	b.eq	Lecb_done
661 	nop
662 
663 Lecb_enc_tail:
664 	aese	v1.16b,v16.16b
665 	aesmc	v1.16b,v1.16b
666 	aese	v24.16b,v16.16b
667 	aesmc	v24.16b,v24.16b
668 	ld1	{v16.4s},[x7],#16
669 	subs	w6,w6,#2
670 	aese	v1.16b,v17.16b
671 	aesmc	v1.16b,v1.16b
672 	aese	v24.16b,v17.16b
673 	aesmc	v24.16b,v24.16b
674 	ld1	{v17.4s},[x7],#16
675 	b.gt	Lecb_enc_tail
676 
677 	aese	v1.16b,v16.16b
678 	aesmc	v1.16b,v1.16b
679 	aese	v24.16b,v16.16b
680 	aesmc	v24.16b,v24.16b
681 	aese	v1.16b,v17.16b
682 	aesmc	v1.16b,v1.16b
683 	aese	v24.16b,v17.16b
684 	aesmc	v24.16b,v24.16b
685 	aese	v1.16b,v20.16b
686 	aesmc	v1.16b,v1.16b
687 	aese	v24.16b,v20.16b
688 	aesmc	v24.16b,v24.16b
689 	cmn	x2,#0x20
690 	aese	v1.16b,v21.16b
691 	aesmc	v1.16b,v1.16b
692 	aese	v24.16b,v21.16b
693 	aesmc	v24.16b,v24.16b
694 	aese	v1.16b,v22.16b
695 	aesmc	v1.16b,v1.16b
696 	aese	v24.16b,v22.16b
697 	aesmc	v24.16b,v24.16b
698 	aese	v1.16b,v23.16b
699 	aese	v24.16b,v23.16b
700 	b.eq	Lecb_enc_one
701 	eor	v5.16b,v7.16b,v1.16b
702 	eor	v17.16b,v7.16b,v24.16b
703 	st1	{v5.16b},[x1],#16
704 	st1	{v17.16b},[x1],#16
705 	b	Lecb_done
706 
707 Lecb_enc_one:
708 	eor	v5.16b,v7.16b,v24.16b
709 	st1	{v5.16b},[x1],#16
710 	b	Lecb_done
711 .align	5
712 Lecb_dec:
713 	ld1	{v1.16b},[x0],#16
714 	subs	x2,x2,#32			// bias
715 	add	w6,w5,#2
716 	orr	v3.16b,v1.16b,v1.16b
717 	orr	v24.16b,v1.16b,v1.16b
718 	orr	v1.16b,v0.16b,v0.16b
719 	b.lo	Lecb_dec_tail
720 
721 	orr	v1.16b,v3.16b,v3.16b
722 	ld1	{v24.16b},[x0],#16
723 	cmp	x2,#32
724 	b.lo	Loop3x_ecb_dec
725 
726 	ld1	{v25.16b},[x0],#16
727 	ld1	{v26.16b},[x0],#16
728 	sub	x2,x2,#32				// bias
729 	mov	w6,w5
730 
731 Loop5x_ecb_dec:
732 	aesd	v0.16b,v16.16b
733 	aesimc	v0.16b,v0.16b
734 	aesd	v1.16b,v16.16b
735 	aesimc	v1.16b,v1.16b
736 	aesd	v24.16b,v16.16b
737 	aesimc	v24.16b,v24.16b
738 	aesd	v25.16b,v16.16b
739 	aesimc	v25.16b,v25.16b
740 	aesd	v26.16b,v16.16b
741 	aesimc	v26.16b,v26.16b
742 	ld1	{v16.4s},[x7],#16
743 	subs	w6,w6,#2
744 	aesd	v0.16b,v17.16b
745 	aesimc	v0.16b,v0.16b
746 	aesd	v1.16b,v17.16b
747 	aesimc	v1.16b,v1.16b
748 	aesd	v24.16b,v17.16b
749 	aesimc	v24.16b,v24.16b
750 	aesd	v25.16b,v17.16b
751 	aesimc	v25.16b,v25.16b
752 	aesd	v26.16b,v17.16b
753 	aesimc	v26.16b,v26.16b
754 	ld1	{v17.4s},[x7],#16
755 	b.gt	Loop5x_ecb_dec
756 
757 	aesd	v0.16b,v16.16b
758 	aesimc	v0.16b,v0.16b
759 	aesd	v1.16b,v16.16b
760 	aesimc	v1.16b,v1.16b
761 	aesd	v24.16b,v16.16b
762 	aesimc	v24.16b,v24.16b
763 	aesd	v25.16b,v16.16b
764 	aesimc	v25.16b,v25.16b
765 	aesd	v26.16b,v16.16b
766 	aesimc	v26.16b,v26.16b
767 	cmp	x2,#0x40				// because Lecb_tail4x
768 	sub	x2,x2,#0x50
769 
770 	aesd	v0.16b,v17.16b
771 	aesimc	v0.16b,v0.16b
772 	aesd	v1.16b,v17.16b
773 	aesimc	v1.16b,v1.16b
774 	aesd	v24.16b,v17.16b
775 	aesimc	v24.16b,v24.16b
776 	aesd	v25.16b,v17.16b
777 	aesimc	v25.16b,v25.16b
778 	aesd	v26.16b,v17.16b
779 	aesimc	v26.16b,v26.16b
780 	csel	x6,xzr,x2,gt		// borrow x6 (w6); "gt" is not a typo
781 	mov	x7,x3
782 
783 	aesd	v0.16b,v18.16b
784 	aesimc	v0.16b,v0.16b
785 	aesd	v1.16b,v18.16b
786 	aesimc	v1.16b,v1.16b
787 	aesd	v24.16b,v18.16b
788 	aesimc	v24.16b,v24.16b
789 	aesd	v25.16b,v18.16b
790 	aesimc	v25.16b,v25.16b
791 	aesd	v26.16b,v18.16b
792 	aesimc	v26.16b,v26.16b
793 	add	x0,x0,x6				// x0 is adjusted so that at exit
794 							// from the loop v1.16b-v26.16b are
795 							// loaded with the last "words"
796 	add	x6,x2,#0x60			// because Lecb_tail4x
797 
798 	aesd	v0.16b,v19.16b
799 	aesimc	v0.16b,v0.16b
800 	aesd	v1.16b,v19.16b
801 	aesimc	v1.16b,v1.16b
802 	aesd	v24.16b,v19.16b
803 	aesimc	v24.16b,v24.16b
804 	aesd	v25.16b,v19.16b
805 	aesimc	v25.16b,v25.16b
806 	aesd	v26.16b,v19.16b
807 	aesimc	v26.16b,v26.16b
808 
809 	aesd	v0.16b,v20.16b
810 	aesimc	v0.16b,v0.16b
811 	aesd	v1.16b,v20.16b
812 	aesimc	v1.16b,v1.16b
813 	aesd	v24.16b,v20.16b
814 	aesimc	v24.16b,v24.16b
815 	aesd	v25.16b,v20.16b
816 	aesimc	v25.16b,v25.16b
817 	aesd	v26.16b,v20.16b
818 	aesimc	v26.16b,v26.16b
819 
820 	aesd	v0.16b,v21.16b
821 	aesimc	v0.16b,v0.16b
822 	aesd	v1.16b,v21.16b
823 	aesimc	v1.16b,v1.16b
824 	aesd	v24.16b,v21.16b
825 	aesimc	v24.16b,v24.16b
826 	aesd	v25.16b,v21.16b
827 	aesimc	v25.16b,v25.16b
828 	aesd	v26.16b,v21.16b
829 	aesimc	v26.16b,v26.16b
830 
831 	aesd	v0.16b,v22.16b
832 	aesimc	v0.16b,v0.16b
833 	aesd	v1.16b,v22.16b
834 	aesimc	v1.16b,v1.16b
835 	aesd	v24.16b,v22.16b
836 	aesimc	v24.16b,v24.16b
837 	aesd	v25.16b,v22.16b
838 	aesimc	v25.16b,v25.16b
839 	aesd	v26.16b,v22.16b
840 	aesimc	v26.16b,v26.16b
841 
842 	aesd	v0.16b,v23.16b
843 	ld1	{v2.16b},[x0],#16
844 	aesd	v1.16b,v23.16b
845 	ld1	{v3.16b},[x0],#16
846 	aesd	v24.16b,v23.16b
847 	ld1	{v27.16b},[x0],#16
848 	aesd	v25.16b,v23.16b
849 	ld1	{v28.16b},[x0],#16
850 	aesd	v26.16b,v23.16b
851 	ld1	{v29.16b},[x0],#16
852 	cbz	x6,Lecb_tail4x
853 	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
854 	eor	v4.16b,v7.16b,v0.16b
855 	orr	v0.16b,v2.16b,v2.16b
856 	eor	v5.16b,v7.16b,v1.16b
857 	orr	v1.16b,v3.16b,v3.16b
858 	eor	v17.16b,v7.16b,v24.16b
859 	orr	v24.16b,v27.16b,v27.16b
860 	eor	v30.16b,v7.16b,v25.16b
861 	orr	v25.16b,v28.16b,v28.16b
862 	eor	v31.16b,v7.16b,v26.16b
863 	st1	{v4.16b},[x1],#16
864 	orr	v26.16b,v29.16b,v29.16b
865 	st1	{v5.16b},[x1],#16
866 	mov	w6,w5
867 	st1	{v17.16b},[x1],#16
868 	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
869 	st1	{v30.16b},[x1],#16
870 	st1	{v31.16b},[x1],#16
871 	b.hs	Loop5x_ecb_dec
872 
873 	add	x2,x2,#0x50
874 	cbz	x2,Lecb_done
875 
876 	add	w6,w5,#2
877 	subs	x2,x2,#0x30
878 	orr	v0.16b,v27.16b,v27.16b
879 	orr	v1.16b,v28.16b,v28.16b
880 	orr	v24.16b,v29.16b,v29.16b
881 	b.lo	Lecb_dec_tail
882 
883 	b	Loop3x_ecb_dec
884 
885 .align	4
886 Lecb_tail4x:
887 	eor	v5.16b,v7.16b,v1.16b
888 	eor	v17.16b,v7.16b,v24.16b
889 	eor	v30.16b,v7.16b,v25.16b
890 	eor	v31.16b,v7.16b,v26.16b
891 	st1	{v5.16b},[x1],#16
892 	st1	{v17.16b},[x1],#16
893 	st1	{v30.16b},[x1],#16
894 	st1	{v31.16b},[x1],#16
895 
896 	b	Lecb_done
897 .align	4
898 Loop3x_ecb_dec:
899 	aesd	v0.16b,v16.16b
900 	aesimc	v0.16b,v0.16b
901 	aesd	v1.16b,v16.16b
902 	aesimc	v1.16b,v1.16b
903 	aesd	v24.16b,v16.16b
904 	aesimc	v24.16b,v24.16b
905 	ld1	{v16.4s},[x7],#16
906 	subs	w6,w6,#2
907 	aesd	v0.16b,v17.16b
908 	aesimc	v0.16b,v0.16b
909 	aesd	v1.16b,v17.16b
910 	aesimc	v1.16b,v1.16b
911 	aesd	v24.16b,v17.16b
912 	aesimc	v24.16b,v24.16b
913 	ld1	{v17.4s},[x7],#16
914 	b.gt	Loop3x_ecb_dec
915 
916 	aesd	v0.16b,v16.16b
917 	aesimc	v0.16b,v0.16b
918 	aesd	v1.16b,v16.16b
919 	aesimc	v1.16b,v1.16b
920 	aesd	v24.16b,v16.16b
921 	aesimc	v24.16b,v24.16b
922 	subs	x2,x2,#0x30
923 	csel	x6,x2,x6,lo				// x6 (w6) is zero at this point
924 	aesd	v0.16b,v17.16b
925 	aesimc	v0.16b,v0.16b
926 	aesd	v1.16b,v17.16b
927 	aesimc	v1.16b,v1.16b
928 	aesd	v24.16b,v17.16b
929 	aesimc	v24.16b,v24.16b
930 	add	x0,x0,x6 			// x0 is adjusted so that at exit
931 						// from the loop v1.16b-v24.16b are
932 						// loaded with the last "words"
933 	mov	x7,x3
934 	aesd	v0.16b,v20.16b
935 	aesimc	v0.16b,v0.16b
936 	aesd	v1.16b,v20.16b
937 	aesimc	v1.16b,v1.16b
938 	aesd	v24.16b,v20.16b
939 	aesimc	v24.16b,v24.16b
940 	ld1	{v2.16b},[x0],#16
941 	aesd	v0.16b,v21.16b
942 	aesimc	v0.16b,v0.16b
943 	aesd	v1.16b,v21.16b
944 	aesimc	v1.16b,v1.16b
945 	aesd	v24.16b,v21.16b
946 	aesimc	v24.16b,v24.16b
947 	ld1	{v3.16b},[x0],#16
948 	aesd	v0.16b,v22.16b
949 	aesimc	v0.16b,v0.16b
950 	aesd	v1.16b,v22.16b
951 	aesimc	v1.16b,v1.16b
952 	aesd	v24.16b,v22.16b
953 	aesimc	v24.16b,v24.16b
954 	ld1	{v27.16b},[x0],#16
955 	aesd	v0.16b,v23.16b
956 	aesd	v1.16b,v23.16b
957 	aesd	v24.16b,v23.16b
958 	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
959 	add	w6,w5,#2
960 	eor	v4.16b,v7.16b,v0.16b
961 	eor	v5.16b,v7.16b,v1.16b
962 	eor	v24.16b,v24.16b,v7.16b
963 	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
964 	st1	{v4.16b},[x1],#16
965 	orr	v0.16b,v2.16b,v2.16b
966 	st1	{v5.16b},[x1],#16
967 	orr	v1.16b,v3.16b,v3.16b
968 	st1	{v24.16b},[x1],#16
969 	orr	v24.16b,v27.16b,v27.16b
970 	b.hs	Loop3x_ecb_dec
971 
972 	cmn	x2,#0x30
973 	b.eq	Lecb_done
974 	nop
975 
976 Lecb_dec_tail:
977 	aesd	v1.16b,v16.16b
978 	aesimc	v1.16b,v1.16b
979 	aesd	v24.16b,v16.16b
980 	aesimc	v24.16b,v24.16b
981 	ld1	{v16.4s},[x7],#16
982 	subs	w6,w6,#2
983 	aesd	v1.16b,v17.16b
984 	aesimc	v1.16b,v1.16b
985 	aesd	v24.16b,v17.16b
986 	aesimc	v24.16b,v24.16b
987 	ld1	{v17.4s},[x7],#16
988 	b.gt	Lecb_dec_tail
989 
990 	aesd	v1.16b,v16.16b
991 	aesimc	v1.16b,v1.16b
992 	aesd	v24.16b,v16.16b
993 	aesimc	v24.16b,v24.16b
994 	aesd	v1.16b,v17.16b
995 	aesimc	v1.16b,v1.16b
996 	aesd	v24.16b,v17.16b
997 	aesimc	v24.16b,v24.16b
998 	aesd	v1.16b,v20.16b
999 	aesimc	v1.16b,v1.16b
1000 	aesd	v24.16b,v20.16b
1001 	aesimc	v24.16b,v24.16b
1002 	cmn	x2,#0x20
1003 	aesd	v1.16b,v21.16b
1004 	aesimc	v1.16b,v1.16b
1005 	aesd	v24.16b,v21.16b
1006 	aesimc	v24.16b,v24.16b
1007 	aesd	v1.16b,v22.16b
1008 	aesimc	v1.16b,v1.16b
1009 	aesd	v24.16b,v22.16b
1010 	aesimc	v24.16b,v24.16b
1011 	aesd	v1.16b,v23.16b
1012 	aesd	v24.16b,v23.16b
1013 	b.eq	Lecb_dec_one
1014 	eor	v5.16b,v7.16b,v1.16b
1015 	eor	v17.16b,v7.16b,v24.16b
1016 	st1	{v5.16b},[x1],#16
1017 	st1	{v17.16b},[x1],#16
1018 	b	Lecb_done
1019 
1020 Lecb_dec_one:
1021 	eor	v5.16b,v7.16b,v24.16b
1022 	st1	{v5.16b},[x1],#16
1023 
1024 Lecb_done:
1025 	ldr	x29,[sp],#16
1026 Lecb_Final_abort:
1027 	ret
1028 
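// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                           size_t length, const AES_KEY *key,
//                           unsigned char *ivec, int enc);
// x0 = in, x1 = out, x2 = length, x3 = key schedule, x4 = ivec (read at entry,
// updated with the next chaining value on exit), w5 = nonzero to encrypt /
// zero to decrypt.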
1029 .globl	_aes_v8_cbc_encrypt
1030 
1031 .align	5
1032 _aes_v8_cbc_encrypt:
1033 	stp	x29,x30,[sp,#-16]!
1034 	add	x29,sp,#0
1035 	subs	x2,x2,#16
1036 	mov	x8,#16
1037 	b.lo	Lcbc_abort
1038 	csel	x8,xzr,x8,eq
1039 
1040 	cmp	w5,#0			// en- or decrypting?
1041 	ldr	w5,[x3,#240]
1042 	and	x2,x2,#-16
1043 	ld1	{v6.16b},[x4]
1044 	ld1	{v0.16b},[x0],x8
1045 
1046 	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1047 	sub	w5,w5,#6
1048 	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
1049 	sub	w5,w5,#2
1050 	ld1	{v18.4s,v19.4s},[x7],#32
1051 	ld1	{v20.4s,v21.4s},[x7],#32
1052 	ld1	{v22.4s,v23.4s},[x7],#32
1053 	ld1	{v7.4s},[x7]
1054 
1055 	add	x7,x3,#32
1056 	mov	w6,w5
1057 	b.eq	Lcbc_dec
1058 
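	// CBC encryption is inherently serial (each block is chained into the
	// next), so the encrypt path below handles one block per iteration, with
	// dedicated paths for 10 rounds (Lcbc_enc128) and 12 rounds (Lcbc_enc192).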
1059 	cmp	w5,#2
1060 	eor	v0.16b,v0.16b,v6.16b
1061 	eor	v5.16b,v16.16b,v7.16b
1062 	b.eq	Lcbc_enc128
1063 
1064 	ld1	{v2.4s,v3.4s},[x7]
1065 	add	x7,x3,#16
1066 	add	x6,x3,#16*4
1067 	add	x12,x3,#16*5
1068 	aese	v0.16b,v16.16b
1069 	aesmc	v0.16b,v0.16b
1070 	add	x14,x3,#16*6
1071 	add	x3,x3,#16*7
1072 	b	Lenter_cbc_enc
1073 
1074 .align	4
1075 Loop_cbc_enc:
1076 	aese	v0.16b,v16.16b
1077 	aesmc	v0.16b,v0.16b
1078 	st1	{v6.16b},[x1],#16
1079 Lenter_cbc_enc:
1080 	aese	v0.16b,v17.16b
1081 	aesmc	v0.16b,v0.16b
1082 	aese	v0.16b,v2.16b
1083 	aesmc	v0.16b,v0.16b
1084 	ld1	{v16.4s},[x6]
1085 	cmp	w5,#4
1086 	aese	v0.16b,v3.16b
1087 	aesmc	v0.16b,v0.16b
1088 	ld1	{v17.4s},[x12]
1089 	b.eq	Lcbc_enc192
1090 
1091 	aese	v0.16b,v16.16b
1092 	aesmc	v0.16b,v0.16b
1093 	ld1	{v16.4s},[x14]
1094 	aese	v0.16b,v17.16b
1095 	aesmc	v0.16b,v0.16b
1096 	ld1	{v17.4s},[x3]
1097 	nop
1098 
1099 Lcbc_enc192:
1100 	aese	v0.16b,v16.16b
1101 	aesmc	v0.16b,v0.16b
1102 	subs	x2,x2,#16
1103 	aese	v0.16b,v17.16b
1104 	aesmc	v0.16b,v0.16b
1105 	csel	x8,xzr,x8,eq
1106 	aese	v0.16b,v18.16b
1107 	aesmc	v0.16b,v0.16b
1108 	aese	v0.16b,v19.16b
1109 	aesmc	v0.16b,v0.16b
1110 	ld1	{v16.16b},[x0],x8
1111 	aese	v0.16b,v20.16b
1112 	aesmc	v0.16b,v0.16b
1113 	eor	v16.16b,v16.16b,v5.16b
1114 	aese	v0.16b,v21.16b
1115 	aesmc	v0.16b,v0.16b
1116 	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
1117 	aese	v0.16b,v22.16b
1118 	aesmc	v0.16b,v0.16b
1119 	aese	v0.16b,v23.16b
1120 	eor	v6.16b,v0.16b,v7.16b
1121 	b.hs	Loop_cbc_enc
1122 
1123 	st1	{v6.16b},[x1],#16
1124 	b	Lcbc_done
1125 
1126 .align	5
1127 Lcbc_enc128:
1128 	ld1	{v2.4s,v3.4s},[x7]
1129 	aese	v0.16b,v16.16b
1130 	aesmc	v0.16b,v0.16b
1131 	b	Lenter_cbc_enc128
1132 Loop_cbc_enc128:
1133 	aese	v0.16b,v16.16b
1134 	aesmc	v0.16b,v0.16b
1135 	st1	{v6.16b},[x1],#16
1136 Lenter_cbc_enc128:
1137 	aese	v0.16b,v17.16b
1138 	aesmc	v0.16b,v0.16b
1139 	subs	x2,x2,#16
1140 	aese	v0.16b,v2.16b
1141 	aesmc	v0.16b,v0.16b
1142 	csel	x8,xzr,x8,eq
1143 	aese	v0.16b,v3.16b
1144 	aesmc	v0.16b,v0.16b
1145 	aese	v0.16b,v18.16b
1146 	aesmc	v0.16b,v0.16b
1147 	aese	v0.16b,v19.16b
1148 	aesmc	v0.16b,v0.16b
1149 	ld1	{v16.16b},[x0],x8
1150 	aese	v0.16b,v20.16b
1151 	aesmc	v0.16b,v0.16b
1152 	aese	v0.16b,v21.16b
1153 	aesmc	v0.16b,v0.16b
1154 	aese	v0.16b,v22.16b
1155 	aesmc	v0.16b,v0.16b
1156 	eor	v16.16b,v16.16b,v5.16b
1157 	aese	v0.16b,v23.16b
1158 	eor	v6.16b,v0.16b,v7.16b
1159 	b.hs	Loop_cbc_enc128
1160 
1161 	st1	{v6.16b},[x1],#16
1162 	b	Lcbc_done
1163 .align	5
1164 Lcbc_dec:
1165 	ld1	{v24.16b},[x0],#16
1166 	subs	x2,x2,#32		// bias
1167 	add	w6,w5,#2
1168 	orr	v3.16b,v0.16b,v0.16b
1169 	orr	v1.16b,v0.16b,v0.16b
1170 	orr	v27.16b,v24.16b,v24.16b
1171 	b.lo	Lcbc_dec_tail
1172 
1173 	orr	v1.16b,v24.16b,v24.16b
1174 	ld1	{v24.16b},[x0],#16
1175 	orr	v2.16b,v0.16b,v0.16b
1176 	orr	v3.16b,v1.16b,v1.16b
1177 	orr	v27.16b,v24.16b,v24.16b
1178 	cmp	x2,#32
1179 	b.lo	Loop3x_cbc_dec
1180 
1181 	ld1	{v25.16b},[x0],#16
1182 	ld1	{v26.16b},[x0],#16
1183 	sub	x2,x2,#32		// bias
1184 	mov	w6,w5
1185 	orr	v28.16b,v25.16b,v25.16b
1186 	orr	v29.16b,v26.16b,v26.16b
1187 
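	// CBC decryption, by contrast, is parallelizable: up to five ciphertext
	// blocks are decrypted at once; the saved ciphertext copies (v2, v3, v27,
	// v28) plus the running iv in v6 provide the chaining XORs, and v29
	// becomes the next iv.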
1188 Loop5x_cbc_dec:
1189 	aesd	v0.16b,v16.16b
1190 	aesimc	v0.16b,v0.16b
1191 	aesd	v1.16b,v16.16b
1192 	aesimc	v1.16b,v1.16b
1193 	aesd	v24.16b,v16.16b
1194 	aesimc	v24.16b,v24.16b
1195 	aesd	v25.16b,v16.16b
1196 	aesimc	v25.16b,v25.16b
1197 	aesd	v26.16b,v16.16b
1198 	aesimc	v26.16b,v26.16b
1199 	ld1	{v16.4s},[x7],#16
1200 	subs	w6,w6,#2
1201 	aesd	v0.16b,v17.16b
1202 	aesimc	v0.16b,v0.16b
1203 	aesd	v1.16b,v17.16b
1204 	aesimc	v1.16b,v1.16b
1205 	aesd	v24.16b,v17.16b
1206 	aesimc	v24.16b,v24.16b
1207 	aesd	v25.16b,v17.16b
1208 	aesimc	v25.16b,v25.16b
1209 	aesd	v26.16b,v17.16b
1210 	aesimc	v26.16b,v26.16b
1211 	ld1	{v17.4s},[x7],#16
1212 	b.gt	Loop5x_cbc_dec
1213 
1214 	aesd	v0.16b,v16.16b
1215 	aesimc	v0.16b,v0.16b
1216 	aesd	v1.16b,v16.16b
1217 	aesimc	v1.16b,v1.16b
1218 	aesd	v24.16b,v16.16b
1219 	aesimc	v24.16b,v24.16b
1220 	aesd	v25.16b,v16.16b
1221 	aesimc	v25.16b,v25.16b
1222 	aesd	v26.16b,v16.16b
1223 	aesimc	v26.16b,v26.16b
1224 	cmp	x2,#0x40		// because Lcbc_tail4x
1225 	sub	x2,x2,#0x50
1226 
1227 	aesd	v0.16b,v17.16b
1228 	aesimc	v0.16b,v0.16b
1229 	aesd	v1.16b,v17.16b
1230 	aesimc	v1.16b,v1.16b
1231 	aesd	v24.16b,v17.16b
1232 	aesimc	v24.16b,v24.16b
1233 	aesd	v25.16b,v17.16b
1234 	aesimc	v25.16b,v25.16b
1235 	aesd	v26.16b,v17.16b
1236 	aesimc	v26.16b,v26.16b
1237 	csel	x6,xzr,x2,gt		// borrow x6 (w6); "gt" is not a typo
1238 	mov	x7,x3
1239 
1240 	aesd	v0.16b,v18.16b
1241 	aesimc	v0.16b,v0.16b
1242 	aesd	v1.16b,v18.16b
1243 	aesimc	v1.16b,v1.16b
1244 	aesd	v24.16b,v18.16b
1245 	aesimc	v24.16b,v24.16b
1246 	aesd	v25.16b,v18.16b
1247 	aesimc	v25.16b,v25.16b
1248 	aesd	v26.16b,v18.16b
1249 	aesimc	v26.16b,v26.16b
1250 	add	x0,x0,x6		// x0 is adjusted so that at exit
1251 					// from the loop v1.16b-v26.16b are
1252 					// loaded with the last "words"
1253 	add	x6,x2,#0x60		// because Lcbc_tail4x
1254 
1255 	aesd	v0.16b,v19.16b
1256 	aesimc	v0.16b,v0.16b
1257 	aesd	v1.16b,v19.16b
1258 	aesimc	v1.16b,v1.16b
1259 	aesd	v24.16b,v19.16b
1260 	aesimc	v24.16b,v24.16b
1261 	aesd	v25.16b,v19.16b
1262 	aesimc	v25.16b,v25.16b
1263 	aesd	v26.16b,v19.16b
1264 	aesimc	v26.16b,v26.16b
1265 
1266 	aesd	v0.16b,v20.16b
1267 	aesimc	v0.16b,v0.16b
1268 	aesd	v1.16b,v20.16b
1269 	aesimc	v1.16b,v1.16b
1270 	aesd	v24.16b,v20.16b
1271 	aesimc	v24.16b,v24.16b
1272 	aesd	v25.16b,v20.16b
1273 	aesimc	v25.16b,v25.16b
1274 	aesd	v26.16b,v20.16b
1275 	aesimc	v26.16b,v26.16b
1276 
1277 	aesd	v0.16b,v21.16b
1278 	aesimc	v0.16b,v0.16b
1279 	aesd	v1.16b,v21.16b
1280 	aesimc	v1.16b,v1.16b
1281 	aesd	v24.16b,v21.16b
1282 	aesimc	v24.16b,v24.16b
1283 	aesd	v25.16b,v21.16b
1284 	aesimc	v25.16b,v25.16b
1285 	aesd	v26.16b,v21.16b
1286 	aesimc	v26.16b,v26.16b
1287 
1288 	aesd	v0.16b,v22.16b
1289 	aesimc	v0.16b,v0.16b
1290 	aesd	v1.16b,v22.16b
1291 	aesimc	v1.16b,v1.16b
1292 	aesd	v24.16b,v22.16b
1293 	aesimc	v24.16b,v24.16b
1294 	aesd	v25.16b,v22.16b
1295 	aesimc	v25.16b,v25.16b
1296 	aesd	v26.16b,v22.16b
1297 	aesimc	v26.16b,v26.16b
1298 
1299 	eor	v4.16b,v6.16b,v7.16b
1300 	aesd	v0.16b,v23.16b
1301 	eor	v5.16b,v2.16b,v7.16b
1302 	ld1	{v2.16b},[x0],#16
1303 	aesd	v1.16b,v23.16b
1304 	eor	v17.16b,v3.16b,v7.16b
1305 	ld1	{v3.16b},[x0],#16
1306 	aesd	v24.16b,v23.16b
1307 	eor	v30.16b,v27.16b,v7.16b
1308 	ld1	{v27.16b},[x0],#16
1309 	aesd	v25.16b,v23.16b
1310 	eor	v31.16b,v28.16b,v7.16b
1311 	ld1	{v28.16b},[x0],#16
1312 	aesd	v26.16b,v23.16b
1313 	orr	v6.16b,v29.16b,v29.16b
1314 	ld1	{v29.16b},[x0],#16
1315 	cbz	x6,Lcbc_tail4x
1316 	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1317 	eor	v4.16b,v4.16b,v0.16b
1318 	orr	v0.16b,v2.16b,v2.16b
1319 	eor	v5.16b,v5.16b,v1.16b
1320 	orr	v1.16b,v3.16b,v3.16b
1321 	eor	v17.16b,v17.16b,v24.16b
1322 	orr	v24.16b,v27.16b,v27.16b
1323 	eor	v30.16b,v30.16b,v25.16b
1324 	orr	v25.16b,v28.16b,v28.16b
1325 	eor	v31.16b,v31.16b,v26.16b
1326 	st1	{v4.16b},[x1],#16
1327 	orr	v26.16b,v29.16b,v29.16b
1328 	st1	{v5.16b},[x1],#16
1329 	mov	w6,w5
1330 	st1	{v17.16b},[x1],#16
1331 	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1332 	st1	{v30.16b},[x1],#16
1333 	st1	{v31.16b},[x1],#16
1334 	b.hs	Loop5x_cbc_dec
1335 
1336 	add	x2,x2,#0x50
1337 	cbz	x2,Lcbc_done
1338 
1339 	add	w6,w5,#2
1340 	subs	x2,x2,#0x30
1341 	orr	v0.16b,v27.16b,v27.16b
1342 	orr	v2.16b,v27.16b,v27.16b
1343 	orr	v1.16b,v28.16b,v28.16b
1344 	orr	v3.16b,v28.16b,v28.16b
1345 	orr	v24.16b,v29.16b,v29.16b
1346 	orr	v27.16b,v29.16b,v29.16b
1347 	b.lo	Lcbc_dec_tail
1348 
1349 	b	Loop3x_cbc_dec
1350 
1351 .align	4
1352 Lcbc_tail4x:
1353 	eor	v5.16b,v4.16b,v1.16b
1354 	eor	v17.16b,v17.16b,v24.16b
1355 	eor	v30.16b,v30.16b,v25.16b
1356 	eor	v31.16b,v31.16b,v26.16b
1357 	st1	{v5.16b},[x1],#16
1358 	st1	{v17.16b},[x1],#16
1359 	st1	{v30.16b},[x1],#16
1360 	st1	{v31.16b},[x1],#16
1361 
1362 	b	Lcbc_done
1363 .align	4
1364 Loop3x_cbc_dec:
1365 	aesd	v0.16b,v16.16b
1366 	aesimc	v0.16b,v0.16b
1367 	aesd	v1.16b,v16.16b
1368 	aesimc	v1.16b,v1.16b
1369 	aesd	v24.16b,v16.16b
1370 	aesimc	v24.16b,v24.16b
1371 	ld1	{v16.4s},[x7],#16
1372 	subs	w6,w6,#2
1373 	aesd	v0.16b,v17.16b
1374 	aesimc	v0.16b,v0.16b
1375 	aesd	v1.16b,v17.16b
1376 	aesimc	v1.16b,v1.16b
1377 	aesd	v24.16b,v17.16b
1378 	aesimc	v24.16b,v24.16b
1379 	ld1	{v17.4s},[x7],#16
1380 	b.gt	Loop3x_cbc_dec
1381 
1382 	aesd	v0.16b,v16.16b
1383 	aesimc	v0.16b,v0.16b
1384 	aesd	v1.16b,v16.16b
1385 	aesimc	v1.16b,v1.16b
1386 	aesd	v24.16b,v16.16b
1387 	aesimc	v24.16b,v24.16b
1388 	eor	v4.16b,v6.16b,v7.16b
1389 	subs	x2,x2,#0x30
1390 	eor	v5.16b,v2.16b,v7.16b
1391 	csel	x6,x2,x6,lo			// x6 (w6) is zero at this point
1392 	aesd	v0.16b,v17.16b
1393 	aesimc	v0.16b,v0.16b
1394 	aesd	v1.16b,v17.16b
1395 	aesimc	v1.16b,v1.16b
1396 	aesd	v24.16b,v17.16b
1397 	aesimc	v24.16b,v24.16b
1398 	eor	v17.16b,v3.16b,v7.16b
1399 	add	x0,x0,x6		// x0 is adjusted so that at exit
1400 					// from the loop v1.16b-v24.16b are
1401 					// loaded with the last "words"
1402 	orr	v6.16b,v27.16b,v27.16b
1403 	mov	x7,x3
1404 	aesd	v0.16b,v20.16b
1405 	aesimc	v0.16b,v0.16b
1406 	aesd	v1.16b,v20.16b
1407 	aesimc	v1.16b,v1.16b
1408 	aesd	v24.16b,v20.16b
1409 	aesimc	v24.16b,v24.16b
1410 	ld1	{v2.16b},[x0],#16
1411 	aesd	v0.16b,v21.16b
1412 	aesimc	v0.16b,v0.16b
1413 	aesd	v1.16b,v21.16b
1414 	aesimc	v1.16b,v1.16b
1415 	aesd	v24.16b,v21.16b
1416 	aesimc	v24.16b,v24.16b
1417 	ld1	{v3.16b},[x0],#16
1418 	aesd	v0.16b,v22.16b
1419 	aesimc	v0.16b,v0.16b
1420 	aesd	v1.16b,v22.16b
1421 	aesimc	v1.16b,v1.16b
1422 	aesd	v24.16b,v22.16b
1423 	aesimc	v24.16b,v24.16b
1424 	ld1	{v27.16b},[x0],#16
1425 	aesd	v0.16b,v23.16b
1426 	aesd	v1.16b,v23.16b
1427 	aesd	v24.16b,v23.16b
1428 	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1429 	add	w6,w5,#2
1430 	eor	v4.16b,v4.16b,v0.16b
1431 	eor	v5.16b,v5.16b,v1.16b
1432 	eor	v24.16b,v24.16b,v17.16b
1433 	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1434 	st1	{v4.16b},[x1],#16
1435 	orr	v0.16b,v2.16b,v2.16b
1436 	st1	{v5.16b},[x1],#16
1437 	orr	v1.16b,v3.16b,v3.16b
1438 	st1	{v24.16b},[x1],#16
1439 	orr	v24.16b,v27.16b,v27.16b
1440 	b.hs	Loop3x_cbc_dec
1441 
1442 	cmn	x2,#0x30
1443 	b.eq	Lcbc_done
1444 	nop
1445 
1446 Lcbc_dec_tail:
1447 	aesd	v1.16b,v16.16b
1448 	aesimc	v1.16b,v1.16b
1449 	aesd	v24.16b,v16.16b
1450 	aesimc	v24.16b,v24.16b
1451 	ld1	{v16.4s},[x7],#16
1452 	subs	w6,w6,#2
1453 	aesd	v1.16b,v17.16b
1454 	aesimc	v1.16b,v1.16b
1455 	aesd	v24.16b,v17.16b
1456 	aesimc	v24.16b,v24.16b
1457 	ld1	{v17.4s},[x7],#16
1458 	b.gt	Lcbc_dec_tail
1459 
1460 	aesd	v1.16b,v16.16b
1461 	aesimc	v1.16b,v1.16b
1462 	aesd	v24.16b,v16.16b
1463 	aesimc	v24.16b,v24.16b
1464 	aesd	v1.16b,v17.16b
1465 	aesimc	v1.16b,v1.16b
1466 	aesd	v24.16b,v17.16b
1467 	aesimc	v24.16b,v24.16b
1468 	aesd	v1.16b,v20.16b
1469 	aesimc	v1.16b,v1.16b
1470 	aesd	v24.16b,v20.16b
1471 	aesimc	v24.16b,v24.16b
1472 	cmn	x2,#0x20
1473 	aesd	v1.16b,v21.16b
1474 	aesimc	v1.16b,v1.16b
1475 	aesd	v24.16b,v21.16b
1476 	aesimc	v24.16b,v24.16b
1477 	eor	v5.16b,v6.16b,v7.16b
1478 	aesd	v1.16b,v22.16b
1479 	aesimc	v1.16b,v1.16b
1480 	aesd	v24.16b,v22.16b
1481 	aesimc	v24.16b,v24.16b
1482 	eor	v17.16b,v3.16b,v7.16b
1483 	aesd	v1.16b,v23.16b
1484 	aesd	v24.16b,v23.16b
1485 	b.eq	Lcbc_dec_one
1486 	eor	v5.16b,v5.16b,v1.16b
1487 	eor	v17.16b,v17.16b,v24.16b
1488 	orr	v6.16b,v27.16b,v27.16b
1489 	st1	{v5.16b},[x1],#16
1490 	st1	{v17.16b},[x1],#16
1491 	b	Lcbc_done
1492 
1493 Lcbc_dec_one:
1494 	eor	v5.16b,v5.16b,v24.16b
1495 	orr	v6.16b,v27.16b,v27.16b
1496 	st1	{v5.16b},[x1],#16
1497 
1498 Lcbc_done:
1499 	st1	{v6.16b},[x4]
1500 Lcbc_abort:
1501 	ldr	x29,[sp],#16
1502 	ret
1503 
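// Assumed C prototype (following OpenSSL's aesv8-armx convention; the
// parameter names are illustrative):
//   void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
//                                    unsigned char *out, size_t blocks,
//                                    const AES_KEY *key,
//                                    const unsigned char ivec[16]);
// x0 = in, x1 = out, x2 = number of 16-byte blocks (not bytes), x3 = key
// schedule, x4 = 16-byte counter block. Only the low 32 bits of the counter
// (the big-endian word at ivec[12]) are incremented.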
1504 .globl	_aes_v8_ctr32_encrypt_blocks
1505 
1506 .align	5
1507 _aes_v8_ctr32_encrypt_blocks:
1508 	stp	x29,x30,[sp,#-16]!
1509 	add	x29,sp,#0
1510 	ldr	w5,[x3,#240]
1511 
1512 	ldr	w8, [x4, #12]
1513 #ifdef __AARCH64EB__
1514 	ld1	{v0.16b},[x4]
1515 #else
1516 	ld1	{v0.4s},[x4]
1517 #endif
1518 	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1519 	sub	w5,w5,#4
1520 	mov	x12,#16
1521 	cmp	x2,#2
1522 	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
1523 	sub	w5,w5,#2
1524 	ld1	{v20.4s,v21.4s},[x7],#32
1525 	ld1	{v22.4s,v23.4s},[x7],#32
1526 	ld1	{v7.4s},[x7]
1527 	add	x7,x3,#32
1528 	mov	w6,w5
1529 	csel	x12,xzr,x12,lo
1530 #ifndef __AARCH64EB__
1531 	rev	w8, w8
1532 #endif
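	// On the little-endian path w8 now holds the 32-bit counter word
	// (ivec[12..15], stored big-endian) in host order; per-block counters are
	// formed by adding to w8, byte-reversing with rev, and inserting the
	// result into lane 3 of each counter block.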
1533 	orr	v1.16b,v0.16b,v0.16b
1534 	add	w10, w8, #1
1535 	orr	v18.16b,v0.16b,v0.16b
1536 	add	w8, w8, #2
1537 	orr	v6.16b,v0.16b,v0.16b
1538 	rev	w10, w10
1539 	mov	v1.s[3],w10
1540 	b.ls	Lctr32_tail
1541 	rev	w12, w8
1542 	sub	x2,x2,#3		// bias
1543 	mov	v18.s[3],w12
1544 	cmp	x2,#32
1545 	b.lo	Loop3x_ctr32
1546 
1547 	add	w13,w8,#1
1548 	add	w14,w8,#2
1549 	orr	v24.16b,v0.16b,v0.16b
1550 	rev	w13,w13
1551 	orr	v25.16b,v0.16b,v0.16b
1552 	rev	w14,w14
1553 	mov	v24.s[3],w13
1554 	sub	x2,x2,#2		// bias
1555 	mov	v25.s[3],w14
1556 	add	w8,w8,#2
1557 	b	Loop5x_ctr32
1558 
1559 .align	4
1560 Loop5x_ctr32:
1561 	aese	v0.16b,v16.16b
1562 	aesmc	v0.16b,v0.16b
1563 	aese	v1.16b,v16.16b
1564 	aesmc	v1.16b,v1.16b
1565 	aese	v18.16b,v16.16b
1566 	aesmc	v18.16b,v18.16b
1567 	aese	v24.16b,v16.16b
1568 	aesmc	v24.16b,v24.16b
1569 	aese	v25.16b,v16.16b
1570 	aesmc	v25.16b,v25.16b
1571 	ld1	{v16.4s},[x7],#16
1572 	subs	w6,w6,#2
1573 	aese	v0.16b,v17.16b
1574 	aesmc	v0.16b,v0.16b
1575 	aese	v1.16b,v17.16b
1576 	aesmc	v1.16b,v1.16b
1577 	aese	v18.16b,v17.16b
1578 	aesmc	v18.16b,v18.16b
1579 	aese	v24.16b,v17.16b
1580 	aesmc	v24.16b,v24.16b
1581 	aese	v25.16b,v17.16b
1582 	aesmc	v25.16b,v25.16b
1583 	ld1	{v17.4s},[x7],#16
1584 	b.gt	Loop5x_ctr32
1585 
1586 	mov	x7,x3
1587 	aese	v0.16b,v16.16b
1588 	aesmc	v0.16b,v0.16b
1589 	aese	v1.16b,v16.16b
1590 	aesmc	v1.16b,v1.16b
1591 	aese	v18.16b,v16.16b
1592 	aesmc	v18.16b,v18.16b
1593 	aese	v24.16b,v16.16b
1594 	aesmc	v24.16b,v24.16b
1595 	aese	v25.16b,v16.16b
1596 	aesmc	v25.16b,v25.16b
1597 	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1598 
1599 	aese	v0.16b,v17.16b
1600 	aesmc	v0.16b,v0.16b
1601 	aese	v1.16b,v17.16b
1602 	aesmc	v1.16b,v1.16b
1603 	aese	v18.16b,v17.16b
1604 	aesmc	v18.16b,v18.16b
1605 	aese	v24.16b,v17.16b
1606 	aesmc	v24.16b,v24.16b
1607 	aese	v25.16b,v17.16b
1608 	aesmc	v25.16b,v25.16b
1609 	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1610 
1611 	aese	v0.16b,v20.16b
1612 	aesmc	v0.16b,v0.16b
1613 	add	w9,w8,#1
1614 	add	w10,w8,#2
1615 	aese	v1.16b,v20.16b
1616 	aesmc	v1.16b,v1.16b
1617 	add	w12,w8,#3
1618 	add	w13,w8,#4
1619 	aese	v18.16b,v20.16b
1620 	aesmc	v18.16b,v18.16b
1621 	add	w14,w8,#5
1622 	rev	w9,w9
1623 	aese	v24.16b,v20.16b
1624 	aesmc	v24.16b,v24.16b
1625 	rev	w10,w10
1626 	rev	w12,w12
1627 	aese	v25.16b,v20.16b
1628 	aesmc	v25.16b,v25.16b
1629 	rev	w13,w13
1630 	rev	w14,w14
1631 
1632 	aese	v0.16b,v21.16b
1633 	aesmc	v0.16b,v0.16b
1634 	aese	v1.16b,v21.16b
1635 	aesmc	v1.16b,v1.16b
1636 	aese	v18.16b,v21.16b
1637 	aesmc	v18.16b,v18.16b
1638 	aese	v24.16b,v21.16b
1639 	aesmc	v24.16b,v24.16b
1640 	aese	v25.16b,v21.16b
1641 	aesmc	v25.16b,v25.16b
1642 
1643 	aese	v0.16b,v22.16b
1644 	aesmc	v0.16b,v0.16b
1645 	ld1	{v2.16b},[x0],#16
1646 	aese	v1.16b,v22.16b
1647 	aesmc	v1.16b,v1.16b
1648 	ld1	{v3.16b},[x0],#16
1649 	aese	v18.16b,v22.16b
1650 	aesmc	v18.16b,v18.16b
1651 	ld1	{v19.16b},[x0],#16
1652 	aese	v24.16b,v22.16b
1653 	aesmc	v24.16b,v24.16b
1654 	ld1	{v26.16b},[x0],#16
1655 	aese	v25.16b,v22.16b
1656 	aesmc	v25.16b,v25.16b
1657 	ld1	{v27.16b},[x0],#16
1658 
1659 	aese	v0.16b,v23.16b
1660 	eor	v2.16b,v2.16b,v7.16b
1661 	aese	v1.16b,v23.16b
1662 	eor	v3.16b,v3.16b,v7.16b
1663 	aese	v18.16b,v23.16b
1664 	eor	v19.16b,v19.16b,v7.16b
1665 	aese	v24.16b,v23.16b
1666 	eor	v26.16b,v26.16b,v7.16b
1667 	aese	v25.16b,v23.16b
1668 	eor	v27.16b,v27.16b,v7.16b
1669 
1670 	eor	v2.16b,v2.16b,v0.16b
1671 	orr	v0.16b,v6.16b,v6.16b
1672 	eor	v3.16b,v3.16b,v1.16b
1673 	orr	v1.16b,v6.16b,v6.16b
1674 	eor	v19.16b,v19.16b,v18.16b
1675 	orr	v18.16b,v6.16b,v6.16b
1676 	eor	v26.16b,v26.16b,v24.16b
1677 	orr	v24.16b,v6.16b,v6.16b
1678 	eor	v27.16b,v27.16b,v25.16b
1679 	orr	v25.16b,v6.16b,v6.16b
1680 
1681 	st1	{v2.16b},[x1],#16
1682 	mov	v0.s[3],w9
1683 	st1	{v3.16b},[x1],#16
1684 	mov	v1.s[3],w10
1685 	st1	{v19.16b},[x1],#16
1686 	mov	v18.s[3],w12
1687 	st1	{v26.16b},[x1],#16
1688 	mov	v24.s[3],w13
1689 	st1	{v27.16b},[x1],#16
1690 	mov	v25.s[3],w14
1691 
1692 	mov	w6,w5
1693 	cbz	x2,Lctr32_done
1694 
1695 	add	w8,w8,#5
1696 	subs	x2,x2,#5
1697 	b.hs	Loop5x_ctr32
1698 
1699 	add	x2,x2,#5
1700 	sub	w8,w8,#5
1701 
1702 	cmp	x2,#2
1703 	mov	x12,#16
1704 	csel	x12,xzr,x12,lo
1705 	b.ls	Lctr32_tail
1706 
1707 	sub	x2,x2,#3		// bias
1708 	add	w8,w8,#3
1709 	b	Loop3x_ctr32
1710 
1711 .align	4
1712 Loop3x_ctr32:
1713 	aese	v0.16b,v16.16b
1714 	aesmc	v0.16b,v0.16b
1715 	aese	v1.16b,v16.16b
1716 	aesmc	v1.16b,v1.16b
1717 	aese	v18.16b,v16.16b
1718 	aesmc	v18.16b,v18.16b
1719 	ld1	{v16.4s},[x7],#16
1720 	subs	w6,w6,#2
1721 	aese	v0.16b,v17.16b
1722 	aesmc	v0.16b,v0.16b
1723 	aese	v1.16b,v17.16b
1724 	aesmc	v1.16b,v1.16b
1725 	aese	v18.16b,v17.16b
1726 	aesmc	v18.16b,v18.16b
1727 	ld1	{v17.4s},[x7],#16
1728 	b.gt	Loop3x_ctr32
1729 
1730 	aese	v0.16b,v16.16b
1731 	aesmc	v4.16b,v0.16b
1732 	aese	v1.16b,v16.16b
1733 	aesmc	v5.16b,v1.16b
1734 	ld1	{v2.16b},[x0],#16
1735 	orr	v0.16b,v6.16b,v6.16b
1736 	aese	v18.16b,v16.16b
1737 	aesmc	v18.16b,v18.16b
1738 	ld1	{v3.16b},[x0],#16
1739 	orr	v1.16b,v6.16b,v6.16b
1740 	aese	v4.16b,v17.16b
1741 	aesmc	v4.16b,v4.16b
1742 	aese	v5.16b,v17.16b
1743 	aesmc	v5.16b,v5.16b
1744 	ld1	{v19.16b},[x0],#16
1745 	mov	x7,x3
1746 	aese	v18.16b,v17.16b
1747 	aesmc	v17.16b,v18.16b
1748 	orr	v18.16b,v6.16b,v6.16b
1749 	add	w9,w8,#1
1750 	aese	v4.16b,v20.16b
1751 	aesmc	v4.16b,v4.16b
1752 	aese	v5.16b,v20.16b
1753 	aesmc	v5.16b,v5.16b
1754 	eor	v2.16b,v2.16b,v7.16b
1755 	add	w10,w8,#2
1756 	aese	v17.16b,v20.16b
1757 	aesmc	v17.16b,v17.16b
1758 	eor	v3.16b,v3.16b,v7.16b
1759 	add	w8,w8,#3
1760 	aese	v4.16b,v21.16b
1761 	aesmc	v4.16b,v4.16b
1762 	aese	v5.16b,v21.16b
1763 	aesmc	v5.16b,v5.16b
1764 	eor	v19.16b,v19.16b,v7.16b
1765 	rev	w9,w9
1766 	aese	v17.16b,v21.16b
1767 	aesmc	v17.16b,v17.16b
1768 	mov	v0.s[3], w9
1769 	rev	w10,w10
1770 	aese	v4.16b,v22.16b
1771 	aesmc	v4.16b,v4.16b
1772 	aese	v5.16b,v22.16b
1773 	aesmc	v5.16b,v5.16b
1774 	mov	v1.s[3], w10
1775 	rev	w12,w8
1776 	aese	v17.16b,v22.16b
1777 	aesmc	v17.16b,v17.16b
1778 	mov	v18.s[3], w12
1779 	subs	x2,x2,#3
1780 	aese	v4.16b,v23.16b
1781 	aese	v5.16b,v23.16b
1782 	aese	v17.16b,v23.16b
1783 
1784 	eor	v2.16b,v2.16b,v4.16b
1785 	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1786 	st1	{v2.16b},[x1],#16
1787 	eor	v3.16b,v3.16b,v5.16b
1788 	mov	w6,w5
1789 	st1	{v3.16b},[x1],#16
1790 	eor	v19.16b,v19.16b,v17.16b
1791 	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1792 	st1	{v19.16b},[x1],#16
1793 	b.hs	Loop3x_ctr32
1794 
1795 	adds	x2,x2,#3
1796 	b.eq	Lctr32_done
1797 	cmp	x2,#1
1798 	mov	x12,#16
1799 	csel	x12,xzr,x12,eq
1800 
1801 Lctr32_tail:
1802 	aese	v0.16b,v16.16b
1803 	aesmc	v0.16b,v0.16b
1804 	aese	v1.16b,v16.16b
1805 	aesmc	v1.16b,v1.16b
1806 	ld1	{v16.4s},[x7],#16
1807 	subs	w6,w6,#2
1808 	aese	v0.16b,v17.16b
1809 	aesmc	v0.16b,v0.16b
1810 	aese	v1.16b,v17.16b
1811 	aesmc	v1.16b,v1.16b
1812 	ld1	{v17.4s},[x7],#16
1813 	b.gt	Lctr32_tail
1814 
1815 	aese	v0.16b,v16.16b
1816 	aesmc	v0.16b,v0.16b
1817 	aese	v1.16b,v16.16b
1818 	aesmc	v1.16b,v1.16b
1819 	aese	v0.16b,v17.16b
1820 	aesmc	v0.16b,v0.16b
1821 	aese	v1.16b,v17.16b
1822 	aesmc	v1.16b,v1.16b
1823 	ld1	{v2.16b},[x0],x12
1824 	aese	v0.16b,v20.16b
1825 	aesmc	v0.16b,v0.16b
1826 	aese	v1.16b,v20.16b
1827 	aesmc	v1.16b,v1.16b
1828 	ld1	{v3.16b},[x0]
1829 	aese	v0.16b,v21.16b
1830 	aesmc	v0.16b,v0.16b
1831 	aese	v1.16b,v21.16b
1832 	aesmc	v1.16b,v1.16b
1833 	eor	v2.16b,v2.16b,v7.16b
1834 	aese	v0.16b,v22.16b
1835 	aesmc	v0.16b,v0.16b
1836 	aese	v1.16b,v22.16b
1837 	aesmc	v1.16b,v1.16b
1838 	eor	v3.16b,v3.16b,v7.16b
1839 	aese	v0.16b,v23.16b
1840 	aese	v1.16b,v23.16b
1841 
1842 	cmp	x2,#1
1843 	eor	v2.16b,v2.16b,v0.16b
1844 	eor	v3.16b,v3.16b,v1.16b
1845 	st1	{v2.16b},[x1],#16
1846 	b.eq	Lctr32_done
1847 	st1	{v3.16b},[x1]
1848 
1849 Lctr32_done:
1850 	ldr	x29,[sp],#16
1851 	ret
1852 
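// Assumed C prototype (following OpenSSL's aesv8-armx convention):
//   void aes_v8_xts_encrypt(const unsigned char *in, unsigned char *out,
//                           size_t length, const AES_KEY *key1,
//                           const AES_KEY *key2, const unsigned char iv[16]);
// x0 = in, x1 = out, x2 = length in bytes, x3 = key1 (data key), x4 = key2
// (tweak key), x5 = 16-byte iv. The length need not be a multiple of 16; the
// remainder is tracked as tailcnt (x21) in the big-size path.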
1853 .globl	_aes_v8_xts_encrypt
1854 
1855 .align	5
1856 _aes_v8_xts_encrypt:
1857 	cmp	x2,#16
1858 	// If the input length is not exactly 16 bytes, take the big-size path.
1859 	b.ne	Lxts_enc_big_size
1860 	// Encrypt the iv with key2 to produce the first XEX tweak.
1861 	ldr	w6,[x4,#240]
1862 	ld1	{v0.4s},[x4],#16
1863 	ld1	{v6.16b},[x5]
1864 	sub	w6,w6,#2
1865 	ld1	{v1.4s},[x4],#16
1866 
1867 Loop_enc_iv_enc:
1868 	aese	v6.16b,v0.16b
1869 	aesmc	v6.16b,v6.16b
1870 	ld1	{v0.4s},[x4],#16
1871 	subs	w6,w6,#2
1872 	aese	v6.16b,v1.16b
1873 	aesmc	v6.16b,v6.16b
1874 	ld1	{v1.4s},[x4],#16
1875 	b.gt	Loop_enc_iv_enc
1876 
1877 	aese	v6.16b,v0.16b
1878 	aesmc	v6.16b,v6.16b
1879 	ld1	{v0.4s},[x4]
1880 	aese	v6.16b,v1.16b
1881 	eor	v6.16b,v6.16b,v0.16b
1882 
1883 	ld1	{v0.16b},[x0]
1884 	eor	v0.16b,v6.16b,v0.16b
1885 
1886 	ldr	w6,[x3,#240]
1887 	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...
1888 
1889 	aese	v0.16b,v28.16b
1890 	aesmc	v0.16b,v0.16b
1891 	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
1892 	aese	v0.16b,v29.16b
1893 	aesmc	v0.16b,v0.16b
1894 	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
1895 	b.eq	Lxts_128_enc
1896 Lxts_enc_round_loop:
1897 	aese	v0.16b,v16.16b
1898 	aesmc	v0.16b,v0.16b
1899 	ld1	{v16.4s},[x3],#16		// load key schedule...
1900 	aese	v0.16b,v17.16b
1901 	aesmc	v0.16b,v0.16b
1902 	ld1	{v17.4s},[x3],#16		// load key schedule...
1903 	subs	w6,w6,#2		// bias
1904 	b.gt	Lxts_enc_round_loop
1905 Lxts_128_enc:
1906 	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
1907 	aese	v0.16b,v16.16b
1908 	aesmc	v0.16b,v0.16b
1909 	aese	v0.16b,v17.16b
1910 	aesmc	v0.16b,v0.16b
1911 	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
1912 	aese	v0.16b,v18.16b
1913 	aesmc	v0.16b,v0.16b
1914 	aese	v0.16b,v19.16b
1915 	aesmc	v0.16b,v0.16b
1916 	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
1917 	aese	v0.16b,v20.16b
1918 	aesmc	v0.16b,v0.16b
1919 	aese	v0.16b,v21.16b
1920 	aesmc	v0.16b,v0.16b
1921 	ld1	{v7.4s},[x3]
1922 	aese	v0.16b,v22.16b
1923 	aesmc	v0.16b,v0.16b
1924 	aese	v0.16b,v23.16b
1925 	eor	v0.16b,v0.16b,v7.16b
1926 	eor	v0.16b,v0.16b,v6.16b
1927 	st1	{v0.16b},[x1]
1928 	b	Lxts_enc_final_abort
1929 
1930 .align	4
1931 Lxts_enc_big_size:
1932 	stp	x19,x20,[sp,#-64]!
1933 	stp	x21,x22,[sp,#48]
1934 	stp	d8,d9,[sp,#32]
1935 	stp	d10,d11,[sp,#16]
1936 
1937 	// x21 (tailcnt) stores the tail length, i.e. length%16.
1938 	and	x21,x2,#0xf
1939 	and	x2,x2,#-16
1940 	subs	x2,x2,#16
1941 	mov	x8,#16
1942 	b.lo	Lxts_abort
1943 	csel	x8,xzr,x8,eq
1944 
1945 	// First, encrypt the iv with key2 to produce the first XEX tweak.
1946 	ldr	w6,[x4,#240]
1947 	ld1	{v0.4s},[x4],#16
1948 	ld1	{v6.16b},[x5]
1949 	sub	w6,w6,#2
1950 	ld1	{v1.4s},[x4],#16
1951 
1952 Loop_iv_enc:
1953 	aese	v6.16b,v0.16b
1954 	aesmc	v6.16b,v6.16b
1955 	ld1	{v0.4s},[x4],#16
1956 	subs	w6,w6,#2
1957 	aese	v6.16b,v1.16b
1958 	aesmc	v6.16b,v6.16b
1959 	ld1	{v1.4s},[x4],#16
1960 	b.gt	Loop_iv_enc
1961 
1962 	aese	v6.16b,v0.16b
1963 	aesmc	v6.16b,v6.16b
1964 	ld1	{v0.4s},[x4]
1965 	aese	v6.16b,v1.16b
1966 	eor	v6.16b,v6.16b,v0.16b
1967 
1968 	// The iv (tweak) for the second block
1969 	// x9 - iv (low 64 bits), x10 - iv (high 64 bits)
1970 	// The five ivs are kept in v6.16b, v8.16b, v9.16b, v10.16b and v11.16b
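	// Each extr/and/eor sequence below doubles the tweak in GF(2^128):
	// tweak' = (tweak << 1) ^ (0x87 if the top bit was set), i.e.
	// multiplication by x modulo x^128 + x^7 + x^2 + x + 1, with the tweak
	// held in x9 (low 64 bits) and x10 (high 64 bits).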
1971 	fmov	x9,d6
1972 	fmov	x10,v6.d[1]
1973 	mov	w19,#0x87
1974 	extr	x22,x10,x10,#32
1975 	extr	x10,x10,x9,#63
1976 	and	w11,w19,w22,asr#31
1977 	eor	x9,x11,x9,lsl#1
1978 	fmov	d8,x9
1979 	fmov	v8.d[1],x10
1980 
1981 	ldr	w5,[x3,#240]		// next starting point
1982 	ld1	{v0.16b},[x0],x8
1983 
1984 	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
1985 	sub	w5,w5,#6
1986 	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
1987 	sub	w5,w5,#2
1988 	ld1	{v18.4s,v19.4s},[x7],#32
1989 	ld1	{v20.4s,v21.4s},[x7],#32
1990 	ld1	{v22.4s,v23.4s},[x7],#32
1991 	ld1	{v7.4s},[x7]
1992 
1993 	add	x7,x3,#32
1994 	mov	w6,w5
1995 
1996 	// Encryption
1997 Lxts_enc:
1998 	ld1	{v24.16b},[x0],#16
1999 	subs	x2,x2,#32			// bias
2000 	add	w6,w5,#2
2001 	orr	v3.16b,v0.16b,v0.16b
2002 	orr	v1.16b,v0.16b,v0.16b
2003 	orr	v28.16b,v0.16b,v0.16b
2004 	orr	v27.16b,v24.16b,v24.16b
2005 	orr	v29.16b,v24.16b,v24.16b
2006 	b.lo	Lxts_inner_enc_tail
2007 	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
2008 	eor	v24.16b,v24.16b,v8.16b
2009 
2010 	// The iv for third block
2011 	extr	x22,x10,x10,#32
2012 	extr	x10,x10,x9,#63
2013 	and	w11,w19,w22,asr#31
2014 	eor	x9,x11,x9,lsl#1
2015 	fmov	d9,x9
2016 	fmov	v9.d[1],x10
2017 
2018 
2019 	orr	v1.16b,v24.16b,v24.16b
2020 	ld1	{v24.16b},[x0],#16
2021 	orr	v2.16b,v0.16b,v0.16b
2022 	orr	v3.16b,v1.16b,v1.16b
2023 	eor	v27.16b,v24.16b,v9.16b 		// the third block
2024 	eor	v24.16b,v24.16b,v9.16b
2025 	cmp	x2,#32
2026 	b.lo	Lxts_outer_enc_tail
2027 
2028 	// The iv for fourth block
2029 	extr	x22,x10,x10,#32
2030 	extr	x10,x10,x9,#63
2031 	and	w11,w19,w22,asr#31
2032 	eor	x9,x11,x9,lsl#1
2033 	fmov	d10,x9
2034 	fmov	v10.d[1],x10
2035 
2036 	ld1	{v25.16b},[x0],#16
2037 	// The iv for fifth block
2038 	extr	x22,x10,x10,#32
2039 	extr	x10,x10,x9,#63
2040 	and	w11,w19,w22,asr#31
2041 	eor	x9,x11,x9,lsl#1
2042 	fmov	d11,x9
2043 	fmov	v11.d[1],x10
2044 
2045 	ld1	{v26.16b},[x0],#16
2046 	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2047 	eor	v26.16b,v26.16b,v11.16b
2048 	sub	x2,x2,#32			// bias
2049 	mov	w6,w5
2050 	b	Loop5x_xts_enc
2051 
2052 .align	4
2053 Loop5x_xts_enc:
2054 	aese	v0.16b,v16.16b
2055 	aesmc	v0.16b,v0.16b
2056 	aese	v1.16b,v16.16b
2057 	aesmc	v1.16b,v1.16b
2058 	aese	v24.16b,v16.16b
2059 	aesmc	v24.16b,v24.16b
2060 	aese	v25.16b,v16.16b
2061 	aesmc	v25.16b,v25.16b
2062 	aese	v26.16b,v16.16b
2063 	aesmc	v26.16b,v26.16b
2064 	ld1	{v16.4s},[x7],#16
2065 	subs	w6,w6,#2
2066 	aese	v0.16b,v17.16b
2067 	aesmc	v0.16b,v0.16b
2068 	aese	v1.16b,v17.16b
2069 	aesmc	v1.16b,v1.16b
2070 	aese	v24.16b,v17.16b
2071 	aesmc	v24.16b,v24.16b
2072 	aese	v25.16b,v17.16b
2073 	aesmc	v25.16b,v25.16b
2074 	aese	v26.16b,v17.16b
2075 	aesmc	v26.16b,v26.16b
2076 	ld1	{v17.4s},[x7],#16
2077 	b.gt	Loop5x_xts_enc
2078 
2079 	aese	v0.16b,v16.16b
2080 	aesmc	v0.16b,v0.16b
2081 	aese	v1.16b,v16.16b
2082 	aesmc	v1.16b,v1.16b
2083 	aese	v24.16b,v16.16b
2084 	aesmc	v24.16b,v24.16b
2085 	aese	v25.16b,v16.16b
2086 	aesmc	v25.16b,v25.16b
2087 	aese	v26.16b,v16.16b
2088 	aesmc	v26.16b,v26.16b
2089 	subs	x2,x2,#0x50			// because Lxts_enc_tail4x
2090 
2091 	aese	v0.16b,v17.16b
2092 	aesmc	v0.16b,v0.16b
2093 	aese	v1.16b,v17.16b
2094 	aesmc	v1.16b,v1.16b
2095 	aese	v24.16b,v17.16b
2096 	aesmc	v24.16b,v24.16b
2097 	aese	v25.16b,v17.16b
2098 	aesmc	v25.16b,v25.16b
2099 	aese	v26.16b,v17.16b
2100 	aesmc	v26.16b,v26.16b
2101 	csel	x6,xzr,x2,gt		// borrow x6 (w6); "gt" is not a typo
2102 	mov	x7,x3
2103 
2104 	aese	v0.16b,v18.16b
2105 	aesmc	v0.16b,v0.16b
2106 	aese	v1.16b,v18.16b
2107 	aesmc	v1.16b,v1.16b
2108 	aese	v24.16b,v18.16b
2109 	aesmc	v24.16b,v24.16b
2110 	aese	v25.16b,v18.16b
2111 	aesmc	v25.16b,v25.16b
2112 	aese	v26.16b,v18.16b
2113 	aesmc	v26.16b,v26.16b
2114 	add	x0,x0,x6		// x0 is adjusted so that at exit
2115 						// from the loop v1.16b-v26.16b are
2116 						// loaded with the last "words"
2117 	add	x6,x2,#0x60		// because Lxts_enc_tail4x
2118 
2119 	aese	v0.16b,v19.16b
2120 	aesmc	v0.16b,v0.16b
2121 	aese	v1.16b,v19.16b
2122 	aesmc	v1.16b,v1.16b
2123 	aese	v24.16b,v19.16b
2124 	aesmc	v24.16b,v24.16b
2125 	aese	v25.16b,v19.16b
2126 	aesmc	v25.16b,v25.16b
2127 	aese	v26.16b,v19.16b
2128 	aesmc	v26.16b,v26.16b
2129 
2130 	aese	v0.16b,v20.16b
2131 	aesmc	v0.16b,v0.16b
2132 	aese	v1.16b,v20.16b
2133 	aesmc	v1.16b,v1.16b
2134 	aese	v24.16b,v20.16b
2135 	aesmc	v24.16b,v24.16b
2136 	aese	v25.16b,v20.16b
2137 	aesmc	v25.16b,v25.16b
2138 	aese	v26.16b,v20.16b
2139 	aesmc	v26.16b,v26.16b
2140 
2141 	aese	v0.16b,v21.16b
2142 	aesmc	v0.16b,v0.16b
2143 	aese	v1.16b,v21.16b
2144 	aesmc	v1.16b,v1.16b
2145 	aese	v24.16b,v21.16b
2146 	aesmc	v24.16b,v24.16b
2147 	aese	v25.16b,v21.16b
2148 	aesmc	v25.16b,v25.16b
2149 	aese	v26.16b,v21.16b
2150 	aesmc	v26.16b,v26.16b
2151 
2152 	aese	v0.16b,v22.16b
2153 	aesmc	v0.16b,v0.16b
2154 	aese	v1.16b,v22.16b
2155 	aesmc	v1.16b,v1.16b
2156 	aese	v24.16b,v22.16b
2157 	aesmc	v24.16b,v24.16b
2158 	aese	v25.16b,v22.16b
2159 	aesmc	v25.16b,v25.16b
2160 	aese	v26.16b,v22.16b
2161 	aesmc	v26.16b,v26.16b
2162 
2163 	eor	v4.16b,v7.16b,v6.16b
2164 	aese	v0.16b,v23.16b
2165 	// The iv for the first block of the next iteration
2166 	extr	x22,x10,x10,#32
2167 	extr	x10,x10,x9,#63
2168 	and	w11,w19,w22,asr#31
2169 	eor	x9,x11,x9,lsl#1
2170 	fmov	d6,x9
2171 	fmov	v6.d[1],x10
2172 	eor	v5.16b,v7.16b,v8.16b
2173 	ld1	{v2.16b},[x0],#16
2174 	aese	v1.16b,v23.16b
2175 	// The iv for second block
2176 	extr	x22,x10,x10,#32
2177 	extr	x10,x10,x9,#63
2178 	and	w11,w19,w22,asr#31
2179 	eor	x9,x11,x9,lsl#1
2180 	fmov	d8,x9
2181 	fmov	v8.d[1],x10
2182 	eor	v17.16b,v7.16b,v9.16b
2183 	ld1	{v3.16b},[x0],#16
2184 	aese	v24.16b,v23.16b
2185 	// The iv for third block
2186 	extr	x22,x10,x10,#32
2187 	extr	x10,x10,x9,#63
2188 	and	w11,w19,w22,asr#31
2189 	eor	x9,x11,x9,lsl#1
2190 	fmov	d9,x9
2191 	fmov	v9.d[1],x10
2192 	eor	v30.16b,v7.16b,v10.16b
2193 	ld1	{v27.16b},[x0],#16
2194 	aese	v25.16b,v23.16b
2195 	// The iv for fourth block
2196 	extr	x22,x10,x10,#32
2197 	extr	x10,x10,x9,#63
2198 	and	w11,w19,w22,asr#31
2199 	eor	x9,x11,x9,lsl#1
2200 	fmov	d10,x9
2201 	fmov	v10.d[1],x10
2202 	eor	v31.16b,v7.16b,v11.16b
2203 	ld1	{v28.16b},[x0],#16
2204 	aese	v26.16b,v23.16b
2205 
2206 	// The iv for fifth block
2207 	extr	x22,x10,x10,#32
2208 	extr	x10,x10,x9,#63
2209 	and	w11,w19,w22,asr #31
2210 	eor	x9,x11,x9,lsl #1
2211 	fmov	d11,x9
2212 	fmov	v11.d[1],x10
2213 
2214 	ld1	{v29.16b},[x0],#16
2215 	cbz	x6,Lxts_enc_tail4x
2216 	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2217 	eor	v4.16b,v4.16b,v0.16b
2218 	eor	v0.16b,v2.16b,v6.16b
2219 	eor	v5.16b,v5.16b,v1.16b
2220 	eor	v1.16b,v3.16b,v8.16b
2221 	eor	v17.16b,v17.16b,v24.16b
2222 	eor	v24.16b,v27.16b,v9.16b
2223 	eor	v30.16b,v30.16b,v25.16b
2224 	eor	v25.16b,v28.16b,v10.16b
2225 	eor	v31.16b,v31.16b,v26.16b
2226 	st1	{v4.16b},[x1],#16
2227 	eor	v26.16b,v29.16b,v11.16b
2228 	st1	{v5.16b},[x1],#16
2229 	mov	w6,w5
2230 	st1	{v17.16b},[x1],#16
2231 	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2232 	st1	{v30.16b},[x1],#16
2233 	st1	{v31.16b},[x1],#16
2234 	b.hs	Loop5x_xts_enc
2235 
2236 
2237 	// If 4 blocks are left, reuse the five-block processing.
2238 	cmn	x2,#0x10
2239 	b.ne	Loop5x_enc_after
2240 	orr	v11.16b,v10.16b,v10.16b
2241 	orr	v10.16b,v9.16b,v9.16b
2242 	orr	v9.16b,v8.16b,v8.16b
2243 	orr	v8.16b,v6.16b,v6.16b
2244 	fmov	x9,d11
2245 	fmov	x10,v11.d[1]
2246 	eor	v0.16b,v6.16b,v2.16b
2247 	eor	v1.16b,v8.16b,v3.16b
2248 	eor	v24.16b,v27.16b,v9.16b
2249 	eor	v25.16b,v28.16b,v10.16b
2250 	eor	v26.16b,v29.16b,v11.16b
2251 	b.eq	Loop5x_xts_enc
2252 
2253 Loop5x_enc_after:
2254 	add	x2,x2,#0x50
2255 	cbz	x2,Lxts_enc_done
2256 
2257 	add	w6,w5,#2
2258 	subs	x2,x2,#0x30
2259 	b.lo	Lxts_inner_enc_tail
2260 
2261 	eor	v0.16b,v6.16b,v27.16b
2262 	eor	v1.16b,v8.16b,v28.16b
2263 	eor	v24.16b,v29.16b,v9.16b
2264 	b	Lxts_outer_enc_tail
2265 
2266 .align	4
2267 Lxts_enc_tail4x:
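	// Final 5x iteration: flush the four remaining ciphertext blocks and
	// finish via Lxts_enc_done (which handles any cipher-stealing tail).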
2268 	add	x0,x0,#16
2269 	eor	v5.16b,v1.16b,v5.16b
2270 	st1	{v5.16b},[x1],#16
2271 	eor	v17.16b,v24.16b,v17.16b
2272 	st1	{v17.16b},[x1],#16
2273 	eor	v30.16b,v25.16b,v30.16b
2274 	eor	v31.16b,v26.16b,v31.16b
2275 	st1	{v30.16b,v31.16b},[x1],#32
2276 
2277 	b	Lxts_enc_done
2278 .align	4
2279 Lxts_outer_enc_tail:
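	// Three remaining blocks (v0,v1,v24): loop the middle rounds two at a
	// time with round keys streamed from x7 (v16/v17), then finish with the
	// resident keys v20-v23; the last round key v7 is folded into the tweaks.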
2280 	aese	v0.16b,v16.16b
2281 	aesmc	v0.16b,v0.16b
2282 	aese	v1.16b,v16.16b
2283 	aesmc	v1.16b,v1.16b
2284 	aese	v24.16b,v16.16b
2285 	aesmc	v24.16b,v24.16b
2286 	ld1	{v16.4s},[x7],#16
2287 	subs	w6,w6,#2
2288 	aese	v0.16b,v17.16b
2289 	aesmc	v0.16b,v0.16b
2290 	aese	v1.16b,v17.16b
2291 	aesmc	v1.16b,v1.16b
2292 	aese	v24.16b,v17.16b
2293 	aesmc	v24.16b,v24.16b
2294 	ld1	{v17.4s},[x7],#16
2295 	b.gt	Lxts_outer_enc_tail
2296 
2297 	aese	v0.16b,v16.16b
2298 	aesmc	v0.16b,v0.16b
2299 	aese	v1.16b,v16.16b
2300 	aesmc	v1.16b,v1.16b
2301 	aese	v24.16b,v16.16b
2302 	aesmc	v24.16b,v24.16b
2303 	eor	v4.16b,v6.16b,v7.16b
2304 	subs	x2,x2,#0x30
2305 	// The iv for first block
2306 	fmov	x9,d9
2307 	fmov	x10,v9.d[1]
2308 	//mov	w19,#0x87
2309 	extr	x22,x10,x10,#32
2310 	extr	x10,x10,x9,#63
2311 	and	w11,w19,w22,asr#31
2312 	eor	x9,x11,x9,lsl#1
2313 	fmov	d6,x9
2314 	fmov	v6.d[1],x10
2315 	eor	v5.16b,v8.16b,v7.16b
2316 	csel	x6,x2,x6,lo       // x6, w6, is zero at this point
2317 	aese	v0.16b,v17.16b
2318 	aesmc	v0.16b,v0.16b
2319 	aese	v1.16b,v17.16b
2320 	aesmc	v1.16b,v1.16b
2321 	aese	v24.16b,v17.16b
2322 	aesmc	v24.16b,v24.16b
2323 	eor	v17.16b,v9.16b,v7.16b
2324 
2325 	add	x6,x6,#0x20
2326 	add	x0,x0,x6
2327 	mov	x7,x3
2328 
2329 	aese	v0.16b,v20.16b
2330 	aesmc	v0.16b,v0.16b
2331 	aese	v1.16b,v20.16b
2332 	aesmc	v1.16b,v1.16b
2333 	aese	v24.16b,v20.16b
2334 	aesmc	v24.16b,v24.16b
2335 	aese	v0.16b,v21.16b
2336 	aesmc	v0.16b,v0.16b
2337 	aese	v1.16b,v21.16b
2338 	aesmc	v1.16b,v1.16b
2339 	aese	v24.16b,v21.16b
2340 	aesmc	v24.16b,v24.16b
2341 	aese	v0.16b,v22.16b
2342 	aesmc	v0.16b,v0.16b
2343 	aese	v1.16b,v22.16b
2344 	aesmc	v1.16b,v1.16b
2345 	aese	v24.16b,v22.16b
2346 	aesmc	v24.16b,v24.16b
2347 	aese	v0.16b,v23.16b
2348 	aese	v1.16b,v23.16b
2349 	aese	v24.16b,v23.16b
2350 	ld1	{v27.16b},[x0],#16
2351 	add	w6,w5,#2
2352 	ld1	{v16.4s},[x7],#16                // re-pre-load rndkey[0]
2353 	eor	v4.16b,v4.16b,v0.16b
2354 	eor	v5.16b,v5.16b,v1.16b
2355 	eor	v24.16b,v24.16b,v17.16b
2356 	ld1	{v17.4s},[x7],#16                // re-pre-load rndkey[1]
2357 	st1	{v4.16b},[x1],#16
2358 	st1	{v5.16b},[x1],#16
2359 	st1	{v24.16b},[x1],#16
2360 	cmn	x2,#0x30
2361 	b.eq	Lxts_enc_done
2362 Lxts_encxor_one:
2363 	orr	v28.16b,v3.16b,v3.16b
2364 	orr	v29.16b,v27.16b,v27.16b
2365 	nop
2366 
2367 Lxts_inner_enc_tail:
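	// x2 == -0x10 means two blocks are left (tweaks v6 and v8);
	// otherwise a single block remains and uses tweak v6.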
2368 	cmn	x2,#0x10
2369 	eor	v1.16b,v28.16b,v6.16b
2370 	eor	v24.16b,v29.16b,v8.16b
2371 	b.eq	Lxts_enc_tail_loop
2372 	eor	v24.16b,v29.16b,v6.16b
2373 Lxts_enc_tail_loop:
2374 	aese	v1.16b,v16.16b
2375 	aesmc	v1.16b,v1.16b
2376 	aese	v24.16b,v16.16b
2377 	aesmc	v24.16b,v24.16b
2378 	ld1	{v16.4s},[x7],#16
2379 	subs	w6,w6,#2
2380 	aese	v1.16b,v17.16b
2381 	aesmc	v1.16b,v1.16b
2382 	aese	v24.16b,v17.16b
2383 	aesmc	v24.16b,v24.16b
2384 	ld1	{v17.4s},[x7],#16
2385 	b.gt	Lxts_enc_tail_loop
2386 
2387 	aese	v1.16b,v16.16b
2388 	aesmc	v1.16b,v1.16b
2389 	aese	v24.16b,v16.16b
2390 	aesmc	v24.16b,v24.16b
2391 	aese	v1.16b,v17.16b
2392 	aesmc	v1.16b,v1.16b
2393 	aese	v24.16b,v17.16b
2394 	aesmc	v24.16b,v24.16b
2395 	aese	v1.16b,v20.16b
2396 	aesmc	v1.16b,v1.16b
2397 	aese	v24.16b,v20.16b
2398 	aesmc	v24.16b,v24.16b
2399 	cmn	x2,#0x20
2400 	aese	v1.16b,v21.16b
2401 	aesmc	v1.16b,v1.16b
2402 	aese	v24.16b,v21.16b
2403 	aesmc	v24.16b,v24.16b
2404 	eor	v5.16b,v6.16b,v7.16b
2405 	aese	v1.16b,v22.16b
2406 	aesmc	v1.16b,v1.16b
2407 	aese	v24.16b,v22.16b
2408 	aesmc	v24.16b,v24.16b
2409 	eor	v17.16b,v8.16b,v7.16b
2410 	aese	v1.16b,v23.16b
2411 	aese	v24.16b,v23.16b
2412 	b.eq	Lxts_enc_one
2413 	eor	v5.16b,v5.16b,v1.16b
2414 	st1	{v5.16b},[x1],#16
2415 	eor	v17.16b,v17.16b,v24.16b
2416 	orr	v6.16b,v8.16b,v8.16b
2417 	st1	{v17.16b},[x1],#16
2418 	fmov	x9,d8
2419 	fmov	x10,v8.d[1]
2420 	mov	w19,#0x87
2421 	extr	x22,x10,x10,#32
2422 	extr	x10,x10,x9,#63
2423 	and	w11,w19,w22,asr #31
2424 	eor	x9,x11,x9,lsl #1
2425 	fmov	d6,x9
2426 	fmov	v6.d[1],x10
2427 	b	Lxts_enc_done
2428 
2429 Lxts_enc_one:
2430 	eor	v5.16b,v5.16b,v24.16b
2431 	orr	v6.16b,v6.16b,v6.16b
2432 	st1	{v5.16b},[x1],#16
2433 	fmov	x9,d6
2434 	fmov	x10,v6.d[1]
2435 	mov	w19,#0x87
2436 	extr	x22,x10,x10,#32
2437 	extr	x10,x10,x9,#63
2438 	and	w11,w19,w22,asr #31
2439 	eor	x9,x11,x9,lsl #1
2440 	fmov	d6,x9
2441 	fmov	v6.d[1],x10
2442 	b	Lxts_enc_done
2443 .align	5
2444 Lxts_enc_done:
2445 	// Process the tail block with cipher stealing.
2446 	tst	x21,#0xf
2447 	b.eq	Lxts_abort
2448 
2449 	mov	x20,x0
2450 	mov	x13,x1
2451 	sub	x1,x1,#16
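	// Ciphertext stealing: x21 = input length mod 16. Swap the leading x21
	// bytes of the last full ciphertext block (at x1) with the remaining
	// input bytes (read via x20): the stolen ciphertext bytes become the
	// final partial output block, and the input bytes complete a 16-byte
	// composite block that is encrypted below with the tweak left in v6.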
2452 .composite_enc_loop:
2453 	subs	x21,x21,#1
2454 	ldrb	w15,[x1,x21]
2455 	ldrb	w14,[x20,x21]
2456 	strb	w15,[x13,x21]
2457 	strb	w14,[x1,x21]
2458 	b.gt	.composite_enc_loop
2459 Lxts_enc_load_done:
2460 	ld1	{v26.16b},[x1]
2461 	eor	v26.16b,v26.16b,v6.16b
2462 
	// Encrypt the composite block to get the second-to-last ciphertext block
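	// The round count is stored at byte offset 240 of the key schedule;
	// the loop below applies two rounds per iteration, streaming round keys from x3.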
2464 	ldr	w6,[x3,#240]		// load key schedule...
2465 	ld1	{v0.4s},[x3],#16
2466 	sub	w6,w6,#2
2467 	ld1	{v1.4s},[x3],#16		// load key schedule...
2468 Loop_final_enc:
2469 	aese	v26.16b,v0.16b
2470 	aesmc	v26.16b,v26.16b
2471 	ld1	{v0.4s},[x3],#16
2472 	subs	w6,w6,#2
2473 	aese	v26.16b,v1.16b
2474 	aesmc	v26.16b,v26.16b
2475 	ld1	{v1.4s},[x3],#16
2476 	b.gt	Loop_final_enc
2477 
2478 	aese	v26.16b,v0.16b
2479 	aesmc	v26.16b,v26.16b
2480 	ld1	{v0.4s},[x3]
2481 	aese	v26.16b,v1.16b
2482 	eor	v26.16b,v26.16b,v0.16b
2483 	eor	v26.16b,v26.16b,v6.16b
2484 	st1	{v26.16b},[x1]
2485 
2486 Lxts_abort:
2487 	ldp	x21,x22,[sp,#48]
2488 	ldp	d8,d9,[sp,#32]
2489 	ldp	d10,d11,[sp,#16]
2490 	ldp	x19,x20,[sp],#64
2491 Lxts_enc_final_abort:
2492 	ret
2493 
2494 .globl	_aes_v8_xts_decrypt
2495 
2496 .align	5
2497 _aes_v8_xts_decrypt:
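	// Arguments as used below: x0 = input, x1 = output, x2 = length in bytes,
	// x3 = key1 (data key schedule), x4 = key2 (tweak key schedule), x5 = iv.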
2498 	cmp	x2,#16
	// If the input is not a single 16-byte block, jump to the big-size path.
2500 	b.ne	Lxts_dec_big_size
2501 	// Encrypt the iv with key2, as the first XEX iv.
2502 	ldr	w6,[x4,#240]
2503 	ld1	{v0.4s},[x4],#16
2504 	ld1	{v6.16b},[x5]
2505 	sub	w6,w6,#2
2506 	ld1	{v1.4s},[x4],#16
2507 
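	// All but the last two rounds, two per iteration; the final aese below has
	// no aesmc, and the very last round key (left in v0) is applied with a plain eor.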
2508 Loop_dec_small_iv_enc:
2509 	aese	v6.16b,v0.16b
2510 	aesmc	v6.16b,v6.16b
2511 	ld1	{v0.4s},[x4],#16
2512 	subs	w6,w6,#2
2513 	aese	v6.16b,v1.16b
2514 	aesmc	v6.16b,v6.16b
2515 	ld1	{v1.4s},[x4],#16
2516 	b.gt	Loop_dec_small_iv_enc
2517 
2518 	aese	v6.16b,v0.16b
2519 	aesmc	v6.16b,v6.16b
2520 	ld1	{v0.4s},[x4]
2521 	aese	v6.16b,v1.16b
2522 	eor	v6.16b,v6.16b,v0.16b
2523 
2524 	ld1	{v0.16b},[x0]
2525 	eor	v0.16b,v6.16b,v0.16b
2526 
2527 	ldr	w6,[x3,#240]
2528 	ld1	{v28.4s,v29.4s},[x3],#32			// load key schedule...
2529 
2530 	aesd	v0.16b,v28.16b
2531 	aesimc	v0.16b,v0.16b
2532 	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
2533 	aesd	v0.16b,v29.16b
2534 	aesimc	v0.16b,v0.16b
2535 	subs	w6,w6,#10			// bias
2536 	b.eq	Lxts_128_dec
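	// AES-128 (10 rounds) skips straight to the common tail; AES-192/256 run
	// one or two extra round pairs in Lxts_dec_round_loop first.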
2537 Lxts_dec_round_loop:
2538 	aesd	v0.16b,v16.16b
2539 	aesimc	v0.16b,v0.16b
2540 	ld1	{v16.4s},[x3],#16			// load key schedule...
2541 	aesd	v0.16b,v17.16b
2542 	aesimc	v0.16b,v0.16b
2543 	ld1	{v17.4s},[x3],#16			// load key schedule...
2544 	subs	w6,w6,#2			// bias
2545 	b.gt	Lxts_dec_round_loop
2546 Lxts_128_dec:
2547 	ld1	{v18.4s,v19.4s},[x3],#32			// load key schedule...
2548 	aesd	v0.16b,v16.16b
2549 	aesimc	v0.16b,v0.16b
2550 	aesd	v0.16b,v17.16b
2551 	aesimc	v0.16b,v0.16b
2552 	ld1	{v20.4s,v21.4s},[x3],#32			// load key schedule...
2553 	aesd	v0.16b,v18.16b
2554 	aesimc	v0.16b,v0.16b
2555 	aesd	v0.16b,v19.16b
2556 	aesimc	v0.16b,v0.16b
2557 	ld1	{v22.4s,v23.4s},[x3],#32			// load key schedule...
2558 	aesd	v0.16b,v20.16b
2559 	aesimc	v0.16b,v0.16b
2560 	aesd	v0.16b,v21.16b
2561 	aesimc	v0.16b,v0.16b
2562 	ld1	{v7.4s},[x3]
2563 	aesd	v0.16b,v22.16b
2564 	aesimc	v0.16b,v0.16b
2565 	aesd	v0.16b,v23.16b
2566 	eor	v0.16b,v0.16b,v7.16b
2567 	eor	v0.16b,v6.16b,v0.16b
2568 	st1	{v0.16b},[x1]
2569 	b	Lxts_dec_final_abort
2570 Lxts_dec_big_size:
2571 	stp	x19,x20,[sp,#-64]!
2572 	stp	x21,x22,[sp,#48]
2573 	stp	d8,d9,[sp,#32]
2574 	stp	d10,d11,[sp,#16]
2575 
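	// x21 = length mod 16 (cipher-stealing tail), x2 = whole-block bytes minus
	// one block; x8 = 16 is the default input post-increment. Bail out if the
	// input is shorter than one block.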
2576 	and	x21,x2,#0xf
2577 	and	x2,x2,#-16
2578 	subs	x2,x2,#16
2579 	mov	x8,#16
2580 	b.lo	Lxts_dec_abort
2581 
2582 	// Encrypt the iv with key2, as the first XEX iv
2583 	ldr	w6,[x4,#240]
2584 	ld1	{v0.4s},[x4],#16
2585 	ld1	{v6.16b},[x5]
2586 	sub	w6,w6,#2
2587 	ld1	{v1.4s},[x4],#16
2588 
2589 Loop_dec_iv_enc:
2590 	aese	v6.16b,v0.16b
2591 	aesmc	v6.16b,v6.16b
2592 	ld1	{v0.4s},[x4],#16
2593 	subs	w6,w6,#2
2594 	aese	v6.16b,v1.16b
2595 	aesmc	v6.16b,v6.16b
2596 	ld1	{v1.4s},[x4],#16
2597 	b.gt	Loop_dec_iv_enc
2598 
2599 	aese	v6.16b,v0.16b
2600 	aesmc	v6.16b,v6.16b
2601 	ld1	{v0.4s},[x4]
2602 	aese	v6.16b,v1.16b
2603 	eor	v6.16b,v6.16b,v0.16b
2604 
2605 	// The iv for second block
	// x9 - iv (low), x10 - iv (high)
	// the five ivs are kept in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
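	// Each new tweak is the previous one times x in GF(2^128): shift the
	// 128-bit value (x10:x9) left by one bit and xor 0x87 (w19) into the
	// low half when the top bit was set.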
2608 	fmov	x9,d6
2609 	fmov	x10,v6.d[1]
2610 	mov	w19,#0x87
2611 	extr	x22,x10,x10,#32
2612 	extr	x10,x10,x9,#63
2613 	and	w11,w19,w22,asr #31
2614 	eor	x9,x11,x9,lsl #1
2615 	fmov	d8,x9
2616 	fmov	v8.d[1],x10
2617 
2618 	ldr	w5,[x3,#240]		// load rounds number
2619 
2620 	// The iv for third block
2621 	extr	x22,x10,x10,#32
2622 	extr	x10,x10,x9,#63
2623 	and	w11,w19,w22,asr #31
2624 	eor	x9,x11,x9,lsl #1
2625 	fmov	d9,x9
2626 	fmov	v9.d[1],x10
2627 
2628 	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
2629 	sub	w5,w5,#6
2630 	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
2631 	sub	w5,w5,#2
2632 	ld1	{v18.4s,v19.4s},[x7],#32		// load key schedule...
2633 	ld1	{v20.4s,v21.4s},[x7],#32
2634 	ld1	{v22.4s,v23.4s},[x7],#32
2635 	ld1	{v7.4s},[x7]
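	// v18-v23 and v7 keep the last seven round keys resident; v16/v17 start as
	// round keys 0 and 1 and are re-streamed from x7 inside the loops.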
2636 
2637 	// The iv for fourth block
2638 	extr	x22,x10,x10,#32
2639 	extr	x10,x10,x9,#63
2640 	and	w11,w19,w22,asr #31
2641 	eor	x9,x11,x9,lsl #1
2642 	fmov	d10,x9
2643 	fmov	v10.d[1],x10
2644 
2645 	add	x7,x3,#32
2646 	mov	w6,w5
2647 	b	Lxts_dec
2648 
2649 	// Decryption
2650 .align	5
2651 Lxts_dec:
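	// When a cipher-stealing tail exists (x21 != 0), hold back one extra block
	// for the tail code: with exactly two full blocks left x8 becomes 0 (so the
	// next load does not advance), and a single full block plus tail goes
	// straight to Lxts_done.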
2652 	tst	x21,#0xf
2653 	b.eq	Lxts_dec_begin
2654 	subs	x2,x2,#16
2655 	csel	x8,xzr,x8,eq
2656 	ld1	{v0.16b},[x0],#16
2657 	b.lo	Lxts_done
2658 	sub	x0,x0,#16
2659 Lxts_dec_begin:
2660 	ld1	{v0.16b},[x0],x8
2661 	subs	x2,x2,#32			// bias
2662 	add	w6,w5,#2
2663 	orr	v3.16b,v0.16b,v0.16b
2664 	orr	v1.16b,v0.16b,v0.16b
2665 	orr	v28.16b,v0.16b,v0.16b
2666 	ld1	{v24.16b},[x0],#16
2667 	orr	v27.16b,v24.16b,v24.16b
2668 	orr	v29.16b,v24.16b,v24.16b
2669 	b.lo	Lxts_inner_dec_tail
	eor	v0.16b,v0.16b,v6.16b			// xor with the iv before decrypting
2671 	eor	v24.16b,v24.16b,v8.16b
2672 
2673 	orr	v1.16b,v24.16b,v24.16b
2674 	ld1	{v24.16b},[x0],#16
2675 	orr	v2.16b,v0.16b,v0.16b
2676 	orr	v3.16b,v1.16b,v1.16b
	eor	v27.16b,v24.16b,v9.16b			// xor the third block with the third iv
2678 	eor	v24.16b,v24.16b,v9.16b
2679 	cmp	x2,#32
2680 	b.lo	Lxts_outer_dec_tail
2681 
2682 	ld1	{v25.16b},[x0],#16
2683 
2684 	// The iv for fifth block
2685 	extr	x22,x10,x10,#32
2686 	extr	x10,x10,x9,#63
2687 	and	w11,w19,w22,asr #31
2688 	eor	x9,x11,x9,lsl #1
2689 	fmov	d11,x9
2690 	fmov	v11.d[1],x10
2691 
2692 	ld1	{v26.16b},[x0],#16
2693 	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2694 	eor	v26.16b,v26.16b,v11.16b
2695 	sub	x2,x2,#32			// bias
2696 	mov	w6,w5
2697 	b	Loop5x_xts_dec
2698 
2699 .align	4
2700 Loop5x_xts_dec:
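	// Five blocks per iteration, rounds applied two at a time with v16/v17
	// streamed from x7; the final rounds below use the resident keys v18-v23,
	// with the last key v7 folded into the tweaks.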
2701 	aesd	v0.16b,v16.16b
2702 	aesimc	v0.16b,v0.16b
2703 	aesd	v1.16b,v16.16b
2704 	aesimc	v1.16b,v1.16b
2705 	aesd	v24.16b,v16.16b
2706 	aesimc	v24.16b,v24.16b
2707 	aesd	v25.16b,v16.16b
2708 	aesimc	v25.16b,v25.16b
2709 	aesd	v26.16b,v16.16b
2710 	aesimc	v26.16b,v26.16b
2711 	ld1	{v16.4s},[x7],#16		// load key schedule...
2712 	subs	w6,w6,#2
2713 	aesd	v0.16b,v17.16b
2714 	aesimc	v0.16b,v0.16b
2715 	aesd	v1.16b,v17.16b
2716 	aesimc	v1.16b,v1.16b
2717 	aesd	v24.16b,v17.16b
2718 	aesimc	v24.16b,v24.16b
2719 	aesd	v25.16b,v17.16b
2720 	aesimc	v25.16b,v25.16b
2721 	aesd	v26.16b,v17.16b
2722 	aesimc	v26.16b,v26.16b
2723 	ld1	{v17.4s},[x7],#16		// load key schedule...
2724 	b.gt	Loop5x_xts_dec
2725 
2726 	aesd	v0.16b,v16.16b
2727 	aesimc	v0.16b,v0.16b
2728 	aesd	v1.16b,v16.16b
2729 	aesimc	v1.16b,v1.16b
2730 	aesd	v24.16b,v16.16b
2731 	aesimc	v24.16b,v24.16b
2732 	aesd	v25.16b,v16.16b
2733 	aesimc	v25.16b,v25.16b
2734 	aesd	v26.16b,v16.16b
2735 	aesimc	v26.16b,v26.16b
2736 	subs	x2,x2,#0x50			// because Lxts_dec_tail4x
2737 
2738 	aesd	v0.16b,v17.16b
2739 	aesimc	v0.16b,v0.16b
2740 	aesd	v1.16b,v17.16b
2741 	aesimc	v1.16b,v1.16b
2742 	aesd	v24.16b,v17.16b
2743 	aesimc	v24.16b,v24.16b
2744 	aesd	v25.16b,v17.16b
2745 	aesimc	v25.16b,v25.16b
2746 	aesd	v26.16b,v17.16b
2747 	aesimc	v26.16b,v26.16b
2748 	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2749 	mov	x7,x3
2750 
2751 	aesd	v0.16b,v18.16b
2752 	aesimc	v0.16b,v0.16b
2753 	aesd	v1.16b,v18.16b
2754 	aesimc	v1.16b,v1.16b
2755 	aesd	v24.16b,v18.16b
2756 	aesimc	v24.16b,v24.16b
2757 	aesd	v25.16b,v18.16b
2758 	aesimc	v25.16b,v25.16b
2759 	aesd	v26.16b,v18.16b
2760 	aesimc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted so that at exit from
						// the loop v1.16b-v26.16b are loaded
						// with the last "words"
2764 	add	x6,x2,#0x60		// because Lxts_dec_tail4x
2765 
2766 	aesd	v0.16b,v19.16b
2767 	aesimc	v0.16b,v0.16b
2768 	aesd	v1.16b,v19.16b
2769 	aesimc	v1.16b,v1.16b
2770 	aesd	v24.16b,v19.16b
2771 	aesimc	v24.16b,v24.16b
2772 	aesd	v25.16b,v19.16b
2773 	aesimc	v25.16b,v25.16b
2774 	aesd	v26.16b,v19.16b
2775 	aesimc	v26.16b,v26.16b
2776 
2777 	aesd	v0.16b,v20.16b
2778 	aesimc	v0.16b,v0.16b
2779 	aesd	v1.16b,v20.16b
2780 	aesimc	v1.16b,v1.16b
2781 	aesd	v24.16b,v20.16b
2782 	aesimc	v24.16b,v24.16b
2783 	aesd	v25.16b,v20.16b
2784 	aesimc	v25.16b,v25.16b
2785 	aesd	v26.16b,v20.16b
2786 	aesimc	v26.16b,v26.16b
2787 
2788 	aesd	v0.16b,v21.16b
2789 	aesimc	v0.16b,v0.16b
2790 	aesd	v1.16b,v21.16b
2791 	aesimc	v1.16b,v1.16b
2792 	aesd	v24.16b,v21.16b
2793 	aesimc	v24.16b,v24.16b
2794 	aesd	v25.16b,v21.16b
2795 	aesimc	v25.16b,v25.16b
2796 	aesd	v26.16b,v21.16b
2797 	aesimc	v26.16b,v26.16b
2798 
2799 	aesd	v0.16b,v22.16b
2800 	aesimc	v0.16b,v0.16b
2801 	aesd	v1.16b,v22.16b
2802 	aesimc	v1.16b,v1.16b
2803 	aesd	v24.16b,v22.16b
2804 	aesimc	v24.16b,v24.16b
2805 	aesd	v25.16b,v22.16b
2806 	aesimc	v25.16b,v25.16b
2807 	aesd	v26.16b,v22.16b
2808 	aesimc	v26.16b,v26.16b
2809 
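	// Same pattern as the encrypt path: fold the last round key (v7) into each
	// tweak and advance the five tweaks in x9/x10 while the last aesd round runs.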
2810 	eor	v4.16b,v7.16b,v6.16b
2811 	aesd	v0.16b,v23.16b
2812 	// The iv for first block of next iteration.
2813 	extr	x22,x10,x10,#32
2814 	extr	x10,x10,x9,#63
2815 	and	w11,w19,w22,asr #31
2816 	eor	x9,x11,x9,lsl #1
2817 	fmov	d6,x9
2818 	fmov	v6.d[1],x10
2819 	eor	v5.16b,v7.16b,v8.16b
2820 	ld1	{v2.16b},[x0],#16
2821 	aesd	v1.16b,v23.16b
2822 	// The iv for second block
2823 	extr	x22,x10,x10,#32
2824 	extr	x10,x10,x9,#63
2825 	and	w11,w19,w22,asr #31
2826 	eor	x9,x11,x9,lsl #1
2827 	fmov	d8,x9
2828 	fmov	v8.d[1],x10
2829 	eor	v17.16b,v7.16b,v9.16b
2830 	ld1	{v3.16b},[x0],#16
2831 	aesd	v24.16b,v23.16b
2832 	// The iv for third block
2833 	extr	x22,x10,x10,#32
2834 	extr	x10,x10,x9,#63
2835 	and	w11,w19,w22,asr #31
2836 	eor	x9,x11,x9,lsl #1
2837 	fmov	d9,x9
2838 	fmov	v9.d[1],x10
2839 	eor	v30.16b,v7.16b,v10.16b
2840 	ld1	{v27.16b},[x0],#16
2841 	aesd	v25.16b,v23.16b
2842 	// The iv for fourth block
2843 	extr	x22,x10,x10,#32
2844 	extr	x10,x10,x9,#63
2845 	and	w11,w19,w22,asr #31
2846 	eor	x9,x11,x9,lsl #1
2847 	fmov	d10,x9
2848 	fmov	v10.d[1],x10
2849 	eor	v31.16b,v7.16b,v11.16b
2850 	ld1	{v28.16b},[x0],#16
2851 	aesd	v26.16b,v23.16b
2852 
2853 	// The iv for fifth block
2854 	extr	x22,x10,x10,#32
2855 	extr	x10,x10,x9,#63
2856 	and	w11,w19,w22,asr #31
2857 	eor	x9,x11,x9,lsl #1
2858 	fmov	d11,x9
2859 	fmov	v11.d[1],x10
2860 
2861 	ld1	{v29.16b},[x0],#16
2862 	cbz	x6,Lxts_dec_tail4x
2863 	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2864 	eor	v4.16b,v4.16b,v0.16b
2865 	eor	v0.16b,v2.16b,v6.16b
2866 	eor	v5.16b,v5.16b,v1.16b
2867 	eor	v1.16b,v3.16b,v8.16b
2868 	eor	v17.16b,v17.16b,v24.16b
2869 	eor	v24.16b,v27.16b,v9.16b
2870 	eor	v30.16b,v30.16b,v25.16b
2871 	eor	v25.16b,v28.16b,v10.16b
2872 	eor	v31.16b,v31.16b,v26.16b
2873 	st1	{v4.16b},[x1],#16
2874 	eor	v26.16b,v29.16b,v11.16b
2875 	st1	{v5.16b},[x1],#16
2876 	mov	w6,w5
2877 	st1	{v17.16b},[x1],#16
2878 	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2879 	st1	{v30.16b},[x1],#16
2880 	st1	{v31.16b},[x1],#16
2881 	b.hs	Loop5x_xts_dec
2882 
2883 	cmn	x2,#0x10
2884 	b.ne	Loop5x_dec_after
	// If x2 is equal to -0x10, there are 4 blocks left.
	// After this special handling, reuse the five-block processing path again.
2887 	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
2888 	orr	v11.16b,v10.16b,v10.16b
2889 	orr	v10.16b,v9.16b,v9.16b
2890 	orr	v9.16b,v8.16b,v8.16b
2891 	orr	v8.16b,v6.16b,v6.16b
2892 	fmov	x9,d11
2893 	fmov	x10,v11.d[1]
2894 	eor	v0.16b,v6.16b,v2.16b
2895 	eor	v1.16b,v8.16b,v3.16b
2896 	eor	v24.16b,v27.16b,v9.16b
2897 	eor	v25.16b,v28.16b,v10.16b
2898 	eor	v26.16b,v29.16b,v11.16b
2899 	b.eq	Loop5x_xts_dec
2900 
2901 Loop5x_dec_after:
2902 	add	x2,x2,#0x50
2903 	cbz	x2,Lxts_done
2904 
2905 	add	w6,w5,#2
2906 	subs	x2,x2,#0x30
2907 	b.lo	Lxts_inner_dec_tail
2908 
2909 	eor	v0.16b,v6.16b,v27.16b
2910 	eor	v1.16b,v8.16b,v28.16b
2911 	eor	v24.16b,v29.16b,v9.16b
2912 	b	Lxts_outer_dec_tail
2913 
2914 .align	4
2915 Lxts_dec_tail4x:
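	// Flush the final four plaintext blocks. If a cipher-stealing tail remains
	// (x21 != 0), reload the last ciphertext block into v0 and continue at
	// Lxts_done; otherwise restore and return.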
2916 	add	x0,x0,#16
2917 	tst	x21,#0xf
2918 	eor	v5.16b,v1.16b,v4.16b
2919 	st1	{v5.16b},[x1],#16
2920 	eor	v17.16b,v24.16b,v17.16b
2921 	st1	{v17.16b},[x1],#16
2922 	eor	v30.16b,v25.16b,v30.16b
2923 	eor	v31.16b,v26.16b,v31.16b
2924 	st1	{v30.16b,v31.16b},[x1],#32
2925 
2926 	b.eq	Lxts_dec_abort
2927 	ld1	{v0.16b},[x0],#16
2928 	b	Lxts_done
2929 .align	4
2930 Lxts_outer_dec_tail:
2931 	aesd	v0.16b,v16.16b
2932 	aesimc	v0.16b,v0.16b
2933 	aesd	v1.16b,v16.16b
2934 	aesimc	v1.16b,v1.16b
2935 	aesd	v24.16b,v16.16b
2936 	aesimc	v24.16b,v24.16b
2937 	ld1	{v16.4s},[x7],#16
2938 	subs	w6,w6,#2
2939 	aesd	v0.16b,v17.16b
2940 	aesimc	v0.16b,v0.16b
2941 	aesd	v1.16b,v17.16b
2942 	aesimc	v1.16b,v1.16b
2943 	aesd	v24.16b,v17.16b
2944 	aesimc	v24.16b,v24.16b
2945 	ld1	{v17.4s},[x7],#16
2946 	b.gt	Lxts_outer_dec_tail
2947 
2948 	aesd	v0.16b,v16.16b
2949 	aesimc	v0.16b,v0.16b
2950 	aesd	v1.16b,v16.16b
2951 	aesimc	v1.16b,v1.16b
2952 	aesd	v24.16b,v16.16b
2953 	aesimc	v24.16b,v24.16b
2954 	eor	v4.16b,v6.16b,v7.16b
2955 	subs	x2,x2,#0x30
2956 	// The iv for first block
2957 	fmov	x9,d9
2958 	fmov	x10,v9.d[1]
2959 	mov	w19,#0x87
2960 	extr	x22,x10,x10,#32
2961 	extr	x10,x10,x9,#63
2962 	and	w11,w19,w22,asr #31
2963 	eor	x9,x11,x9,lsl #1
2964 	fmov	d6,x9
2965 	fmov	v6.d[1],x10
2966 	eor	v5.16b,v8.16b,v7.16b
2967 	csel	x6,x2,x6,lo	// x6, w6, is zero at this point
2968 	aesd	v0.16b,v17.16b
2969 	aesimc	v0.16b,v0.16b
2970 	aesd	v1.16b,v17.16b
2971 	aesimc	v1.16b,v1.16b
2972 	aesd	v24.16b,v17.16b
2973 	aesimc	v24.16b,v24.16b
2974 	eor	v17.16b,v9.16b,v7.16b
2975 	// The iv for second block
2976 	extr	x22,x10,x10,#32
2977 	extr	x10,x10,x9,#63
2978 	and	w11,w19,w22,asr #31
2979 	eor	x9,x11,x9,lsl #1
2980 	fmov	d8,x9
2981 	fmov	v8.d[1],x10
2982 
2983 	add	x6,x6,#0x20
	add	x0,x0,x6		// x0 is adjusted to point at the last data
2985 
2986 	mov	x7,x3
2987 
2988 	// The iv for third block
2989 	extr	x22,x10,x10,#32
2990 	extr	x10,x10,x9,#63
2991 	and	w11,w19,w22,asr #31
2992 	eor	x9,x11,x9,lsl #1
2993 	fmov	d9,x9
2994 	fmov	v9.d[1],x10
2995 
2996 	aesd	v0.16b,v20.16b
2997 	aesimc	v0.16b,v0.16b
2998 	aesd	v1.16b,v20.16b
2999 	aesimc	v1.16b,v1.16b
3000 	aesd	v24.16b,v20.16b
3001 	aesimc	v24.16b,v24.16b
3002 	aesd	v0.16b,v21.16b
3003 	aesimc	v0.16b,v0.16b
3004 	aesd	v1.16b,v21.16b
3005 	aesimc	v1.16b,v1.16b
3006 	aesd	v24.16b,v21.16b
3007 	aesimc	v24.16b,v24.16b
3008 	aesd	v0.16b,v22.16b
3009 	aesimc	v0.16b,v0.16b
3010 	aesd	v1.16b,v22.16b
3011 	aesimc	v1.16b,v1.16b
3012 	aesd	v24.16b,v22.16b
3013 	aesimc	v24.16b,v24.16b
3014 	ld1	{v27.16b},[x0],#16
3015 	aesd	v0.16b,v23.16b
3016 	aesd	v1.16b,v23.16b
3017 	aesd	v24.16b,v23.16b
3018 	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3019 	add	w6,w5,#2
3020 	eor	v4.16b,v4.16b,v0.16b
3021 	eor	v5.16b,v5.16b,v1.16b
3022 	eor	v24.16b,v24.16b,v17.16b
3023 	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3024 	st1	{v4.16b},[x1],#16
3025 	st1	{v5.16b},[x1],#16
3026 	st1	{v24.16b},[x1],#16
3027 
3028 	cmn	x2,#0x30
3029 	add	x2,x2,#0x30
3030 	b.eq	Lxts_done
3031 	sub	x2,x2,#0x30
3032 	orr	v28.16b,v3.16b,v3.16b
3033 	orr	v29.16b,v27.16b,v27.16b
3034 	nop
3035 
3036 Lxts_inner_dec_tail:
3037 	// x2 == -0x10 means two blocks left.
3038 	cmn	x2,#0x10
3039 	eor	v1.16b,v28.16b,v6.16b
3040 	eor	v24.16b,v29.16b,v8.16b
3041 	b.eq	Lxts_dec_tail_loop
3042 	eor	v24.16b,v29.16b,v6.16b
3043 Lxts_dec_tail_loop:
3044 	aesd	v1.16b,v16.16b
3045 	aesimc	v1.16b,v1.16b
3046 	aesd	v24.16b,v16.16b
3047 	aesimc	v24.16b,v24.16b
3048 	ld1	{v16.4s},[x7],#16
3049 	subs	w6,w6,#2
3050 	aesd	v1.16b,v17.16b
3051 	aesimc	v1.16b,v1.16b
3052 	aesd	v24.16b,v17.16b
3053 	aesimc	v24.16b,v24.16b
3054 	ld1	{v17.4s},[x7],#16
3055 	b.gt	Lxts_dec_tail_loop
3056 
3057 	aesd	v1.16b,v16.16b
3058 	aesimc	v1.16b,v1.16b
3059 	aesd	v24.16b,v16.16b
3060 	aesimc	v24.16b,v24.16b
3061 	aesd	v1.16b,v17.16b
3062 	aesimc	v1.16b,v1.16b
3063 	aesd	v24.16b,v17.16b
3064 	aesimc	v24.16b,v24.16b
3065 	aesd	v1.16b,v20.16b
3066 	aesimc	v1.16b,v1.16b
3067 	aesd	v24.16b,v20.16b
3068 	aesimc	v24.16b,v24.16b
3069 	cmn	x2,#0x20
3070 	aesd	v1.16b,v21.16b
3071 	aesimc	v1.16b,v1.16b
3072 	aesd	v24.16b,v21.16b
3073 	aesimc	v24.16b,v24.16b
3074 	eor	v5.16b,v6.16b,v7.16b
3075 	aesd	v1.16b,v22.16b
3076 	aesimc	v1.16b,v1.16b
3077 	aesd	v24.16b,v22.16b
3078 	aesimc	v24.16b,v24.16b
3079 	eor	v17.16b,v8.16b,v7.16b
3080 	aesd	v1.16b,v23.16b
3081 	aesd	v24.16b,v23.16b
3082 	b.eq	Lxts_dec_one
3083 	eor	v5.16b,v5.16b,v1.16b
3084 	eor	v17.16b,v17.16b,v24.16b
3085 	orr	v6.16b,v9.16b,v9.16b
3086 	orr	v8.16b,v10.16b,v10.16b
3087 	st1	{v5.16b},[x1],#16
3088 	st1	{v17.16b},[x1],#16
3089 	add	x2,x2,#16
3090 	b	Lxts_done
3091 
3092 Lxts_dec_one:
3093 	eor	v5.16b,v5.16b,v24.16b
3094 	orr	v6.16b,v8.16b,v8.16b
3095 	orr	v8.16b,v9.16b,v9.16b
3096 	st1	{v5.16b},[x1],#16
3097 	add	x2,x2,#32
3098 
3099 Lxts_done:
3100 	tst	x21,#0xf
3101 	b.eq	Lxts_dec_abort
	// Process the last two blocks with cipher stealing.
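	// v0 holds the last full ciphertext block (loaded below if not already).
	// It is decrypted with the tweak in v8; the leading x21 bytes of the result
	// become the final partial plaintext, the stolen ciphertext tail bytes are
	// merged back in, and the composite block is decrypted with the tweak in v6.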
3103 	mov	x7,x3
3104 	cbnz	x2,Lxts_dec_1st_done
3105 	ld1	{v0.16b},[x0],#16
3106 
	// Decrypt the second-to-last ciphertext block to get the last plaintext block
3108 Lxts_dec_1st_done:
3109 	eor	v26.16b,v0.16b,v8.16b
3110 	ldr	w6,[x3,#240]
3111 	ld1	{v0.4s},[x3],#16
3112 	sub	w6,w6,#2
3113 	ld1	{v1.4s},[x3],#16
3114 Loop_final_2nd_dec:
3115 	aesd	v26.16b,v0.16b
3116 	aesimc	v26.16b,v26.16b
3117 	ld1	{v0.4s},[x3],#16		// load key schedule...
3118 	subs	w6,w6,#2
3119 	aesd	v26.16b,v1.16b
3120 	aesimc	v26.16b,v26.16b
3121 	ld1	{v1.4s},[x3],#16		// load key schedule...
3122 	b.gt	Loop_final_2nd_dec
3123 
3124 	aesd	v26.16b,v0.16b
3125 	aesimc	v26.16b,v26.16b
3126 	ld1	{v0.4s},[x3]
3127 	aesd	v26.16b,v1.16b
3128 	eor	v26.16b,v26.16b,v0.16b
3129 	eor	v26.16b,v26.16b,v8.16b
3130 	st1	{v26.16b},[x1]
3131 
3132 	mov	x20,x0
3133 	add	x13,x1,#16
3134 
	// Merge the tailcnt trailing ciphertext bytes into the second-to-last output block
	// to rebuild the composite ciphertext block; the displaced bytes become the final partial plaintext.
3137 .composite_dec_loop:
3138 	subs	x21,x21,#1
3139 	ldrb	w15,[x1,x21]
3140 	ldrb	w14,[x20,x21]
3141 	strb	w15,[x13,x21]
3142 	strb	w14,[x1,x21]
3143 	b.gt	.composite_dec_loop
3144 Lxts_dec_load_done:
3145 	ld1	{v26.16b},[x1]
3146 	eor	v26.16b,v26.16b,v6.16b
3147 
	// Decrypt the composite block to get the second-to-last plaintext block
3149 	ldr	w6,[x7,#240]
3150 	ld1	{v0.4s},[x7],#16
3151 	sub	w6,w6,#2
3152 	ld1	{v1.4s},[x7],#16
3153 Loop_final_dec:
3154 	aesd	v26.16b,v0.16b
3155 	aesimc	v26.16b,v26.16b
3156 	ld1	{v0.4s},[x7],#16		// load key schedule...
3157 	subs	w6,w6,#2
3158 	aesd	v26.16b,v1.16b
3159 	aesimc	v26.16b,v26.16b
3160 	ld1	{v1.4s},[x7],#16		// load key schedule...
3161 	b.gt	Loop_final_dec
3162 
3163 	aesd	v26.16b,v0.16b
3164 	aesimc	v26.16b,v26.16b
3165 	ld1	{v0.4s},[x7]
3166 	aesd	v26.16b,v1.16b
3167 	eor	v26.16b,v26.16b,v0.16b
3168 	eor	v26.16b,v26.16b,v6.16b
3169 	st1	{v26.16b},[x1]
3170 
3171 Lxts_dec_abort:
3172 	ldp	x21,x22,[sp,#48]
3173 	ldp	d8,d9,[sp,#32]
3174 	ldp	d10,d11,[sp,#16]
3175 	ldp	x19,x20,[sp],#64
3176 
3177 Lxts_dec_final_abort:
3178 	ret
3179 
3180 #endif
3181