1 .text
2 
// Round-constant table for the Iota step, one 64-bit constant per round
// (24 entries).  KeccakF1600_int walks it with a post-incremented pointer
// and KeccakF1600_ce reads it with ld1r.
//
// Layout trick: the table is preceded by 64 bytes of zero padding and is
// 24*8 = 192 bytes long, so 64+192 = 256.  Provided ".align 8" yields
// 256-byte alignment (2^8, as the existing comment below relies on), the
// address just past the last constant has its low 8 bits clear, which is
// what the "tst x27,#255" in KeccakF1600_int uses to detect the last round.
3 .align	8	// strategic alignment and padding that allows to use
4 		// address value as loop termination condition...
5 .quad	0,0,0,0,0,0,0,0
6 
7 iotas:
8 .quad	0x0000000000000001
9 .quad	0x0000000000008082
10 .quad	0x800000000000808a
11 .quad	0x8000000080008000
12 .quad	0x000000000000808b
13 .quad	0x0000000080000001
14 .quad	0x8000000080008081
15 .quad	0x8000000000008009
16 .quad	0x000000000000008a
17 .quad	0x0000000000000088
18 .quad	0x0000000080008009
19 .quad	0x000000008000000a
20 .quad	0x000000008000808b
21 .quad	0x800000000000008b
22 .quad	0x8000000000008089
23 .quad	0x8000000000008003
24 .quad	0x8000000000008002
25 .quad	0x8000000000000080
26 .quad	0x000000000000800a
27 .quad	0x800000008000000a
28 .quad	0x8000000080008081
29 .quad	0x8000000000008080
30 .quad	0x0000000080000001
31 .quad	0x8000000080008008
32 
33 
// KeccakF1600_int -- register-resident Keccak permutation (file-internal).
//
// In/out: the 25 state lanes live in general-purpose registers.  Flat lane
//         k (row k/5, column k%5) is held in xk, except lane 18 which is
//         held in x25; i.e. x0-x17, x25, x19-x24.  The permuted state is
//         returned in the same registers.
// Stack:  the caller must own 32 bytes at [sp,#0..31]:
//           [sp,#0]  / [sp,#8]   temporary home of lanes 4 and 9 (Theta)
//           [sp,#16]             running pointer into iotas
//           [sp,#24]             saved (signed) link register
// Scratch: x26, x27, x28 and x30 are used as temporaries; x30 is reloaded
//         from [sp,#24] before returning.  Condition flags are clobbered.
// Loop:   one iteration per round constant; the loop ends when the iotas
//         pointer, after post-increment, has its low 8 bits clear.
34 .align	5
35 KeccakF1600_int:
36 	adr	x28,iotas
37 .long	0xd503233f			// paciasp
38 	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
39 	b	Loop
40 .align	4
41 Loop:
42 	////////////////////////////////////////// Theta
	// Column parities: x26,x27,x28,x30,x4 accumulate the XOR of the five
	// lanes of columns 0..4.  x4 doubles as lane 4 and as the column-4
	// accumulator, so lanes 4 and 9 are parked on the stack first.
43 	eor	x26,x0,x5
44 	stp	x4,x9,[sp,#0]	// offload pair...
45 	eor	x27,x1,x6
46 	eor	x28,x2,x7
47 	eor	x30,x3,x8
48 	eor	x4,x4,x9
49 	eor	x26,x26,x10
50 	eor	x27,x27,x11
51 	eor	x28,x28,x12
52 	eor	x30,x30,x13
53 	eor	x4,x4,x14
54 	eor	x26,x26,x15
55 	eor	x27,x27,x16
56 	eor	x28,x28,x17
57 	eor	x30,x30,x25
58 	eor	x4,x4,x19
59 	eor	x26,x26,x20
60 	eor	x28,x28,x22
61 	eor	x27,x27,x21
62 	eor	x30,x30,x23
63 	eor	x4,x4,x24
64 
	// "ror #63" is a rotate-left by one.  x9 = parity(col 0) ^ rol(parity(col 2),1)
	// is the correction word for column 1 (x1,x6,x11,x16,x21).
65 	eor	x9,x26,x28,ror#63
66 
67 	eor	x1,x1,x9
68 	eor	x6,x6,x9
69 	eor	x11,x11,x9
70 	eor	x16,x16,x9
71 	eor	x21,x21,x9
72 
	// Remaining correction words.  Order matters: each line reads parity
	// registers that a later line overwrites.
	//   x9  -> column 2,  x28 -> column 3,  x30 -> column 4,  x4 -> column 0
73 	eor	x9,x27,x30,ror#63
74 	eor	x28,x28,x4,ror#63
75 	eor	x30,x30,x26,ror#63
76 	eor	x4,x4,x27,ror#63
77 
	// Column 2.  The corrected lane 2 is written to x27 rather than x2, so
	// it survives Rho+Pi overwriting x2 (the "mov" the old comment refers to
	// is folded into this eor).
78 	eor	x27,   x2,x9		// mov	x27,x2
79 	eor	x7,x7,x9
80 	eor	x12,x12,x9
81 	eor	x17,x17,x9
82 	eor	x22,x22,x9
83 
	// Column 0.
84 	eor	x0,x0,x4
85 	eor	x5,x5,x4
86 	eor	x10,x10,x4
87 	eor	x15,x15,x4
88 	eor	x20,x20,x4
89 	ldp	x4,x9,[sp,#0]	// re-load offloaded data
	// Column 3; corrected lane 3 is kept in x26.
90 	eor	x26,   x3,x28		// mov	x26,x3
91 	eor	x8,x8,x28
92 	eor	x13,x13,x28
93 	eor	x25,x25,x28
94 	eor	x23,x23,x28
95 
	// Column 4; corrected lane 4 is kept in x28.
96 	eor	x28,   x4,x30		// mov	x28,x4
97 	eor	x9,x9,x30
98 	eor	x14,x14,x30
99 	eor	x19,x19,x30
100 	eor	x24,x24,x30
101 
102 	////////////////////////////////////////// Rho+Pi
	// Each "ror dst,src,#64-n" rotates src left by n and drops the result
	// in the lane's new position, so rotation and lane permutation are one
	// instruction.  The moves form chains: every destination was already
	// consumed as a source by an earlier line, or was copied to
	// x26/x27/x28/x30 beforehand.
103 	mov	x30,x1
104 	ror	x1,x6,#64-44
105 	//mov	x27,x2
106 	ror	x2,x12,#64-43
107 	//mov	x26,x3
108 	ror	x3,x25,#64-21
109 	//mov	x28,x4
110 	ror	x4,x24,#64-14
111 
112 	ror	x6,x9,#64-20
113 	ror	x12,x13,#64-25
114 	ror	x25,x17,#64-15
115 	ror	x24,x21,#64-2
116 
117 	ror	x9,x22,#64-61
118 	ror	x13,x19,#64-8
119 	ror	x17,x11,#64-10
120 	ror	x21,x8,#64-55
121 
122 	ror	x22,x14,#64-39
123 	ror	x19,x23,#64-56
124 	ror	x11,x7,#64-6
125 	ror	x8,x16,#64-45
126 
127 	ror	x14,x20,#64-18
128 	ror	x23,x15,#64-41
129 	ror	x7,x10,#64-3
130 	ror	x16,x5,#64-36
131 
	// Close the chains with the lanes saved in x26/x30/x28/x27.
132 	ror	x5,x26,#64-28
133 	ror	x10,x30,#64-1
134 	ror	x15,x28,#64-27
135 	ror	x20,x27,#64-62
136 
137 	////////////////////////////////////////// Chi+Iota
	// Per row of five lanes a0..a4:  a[i] ^= ~a[i+1] & a[i+2]  (indices
	// mod 5).  "bic d,n,m" yields n & ~m.  All bic terms that need a lane's
	// old value are computed before that lane is updated.
	// The round-constant fetch and the loop-exit test are interleaved
	// with the row work.
138 	bic	x26,x2,x1
139 	bic	x27,x3,x2
140 	bic	x28,x0,x4
141 	bic	x30,x1,x0
142 	eor	x0,x0,x26
143 	bic	x26,x4,x3
144 	eor	x1,x1,x27
145 	ldr	x27,[sp,#16]
146 	eor	x3,x3,x28
147 	eor	x4,x4,x30
148 	eor	x2,x2,x26
149 	ldr	x30,[x27],#8		// Iota[i++]
150 
151 	bic	x26,x7,x6
	// The flags set here are consumed by the "bne Loop" at the bottom;
	// nothing in between (bic/eor/str) modifies them.
152 	tst	x27,#255			// are we done?
153 	str	x27,[sp,#16]
154 	bic	x27,x8,x7
155 	bic	x28,x5,x9
156 	eor	x0,x0,x30		// A[0][0] ^= Iota
157 	bic	x30,x6,x5
158 	eor	x5,x5,x26
159 	bic	x26,x9,x8
160 	eor	x6,x6,x27
161 	eor	x8,x8,x28
162 	eor	x9,x9,x30
163 	eor	x7,x7,x26
164 
165 	bic	x26,x12,x11
166 	bic	x27,x13,x12
167 	bic	x28,x10,x14
168 	bic	x30,x11,x10
169 	eor	x10,x10,x26
170 	bic	x26,x14,x13
171 	eor	x11,x11,x27
172 	eor	x13,x13,x28
173 	eor	x14,x14,x30
174 	eor	x12,x12,x26
175 
176 	bic	x26,x17,x16
177 	bic	x27,x25,x17
178 	bic	x28,x15,x19
179 	bic	x30,x16,x15
180 	eor	x15,x15,x26
181 	bic	x26,x19,x25
182 	eor	x16,x16,x27
183 	eor	x25,x25,x28
184 	eor	x19,x19,x30
185 	eor	x17,x17,x26
186 
187 	bic	x26,x22,x21
188 	bic	x27,x23,x22
189 	bic	x28,x20,x24
190 	bic	x30,x21,x20
191 	eor	x20,x20,x26
192 	bic	x26,x24,x23
193 	eor	x21,x21,x27
194 	eor	x23,x23,x28
195 	eor	x24,x24,x30
196 	eor	x22,x22,x26
197 
198 	bne	Loop
199 
	// x30 was used as scratch throughout; fetch the signed return address
	// stored at entry and authenticate it.
200 	ldr	x30,[sp,#24]
201 .long	0xd50323bf			// autiasp
202 	ret
203 
204 
205 
// KeccakF1600 -- apply the permutation to a 25-lane state in memory
// (file-internal; called from _SHA3_squeeze).
//
// In:    x0 = pointer to 25 consecutive 64-bit lanes
// Out:   the state at that address is replaced by the permuted state
// Frame: 128 bytes holding x29/x30 and x19-x28 (offsets 0..95 are used),
//        plus 48 bytes below it: [sp,#0..31] is the scratch area that
//        KeccakF1600_int requires, [sp,#32] keeps the state pointer.
// Lanes are loaded into x0-x17, x25, x19-x24; note lane 18 goes to x25
// and x18 is never touched.
206 .align	5
207 KeccakF1600:
208 .long	0xd503233f			// paciasp
209 	stp	x29,x30,[sp,#-128]!
210 	add	x29,sp,#0
211 	stp	x19,x20,[sp,#16]
212 	stp	x21,x22,[sp,#32]
213 	stp	x23,x24,[sp,#48]
214 	stp	x25,x26,[sp,#64]
215 	stp	x27,x28,[sp,#80]
216 	sub	sp,sp,#48
217 
218 	str	x0,[sp,#32]			// offload argument
	// x0 is about to become lane 0, so x26 carries the base address for
	// the remaining loads.
219 	mov	x26,x0
220 	ldp	x0,x1,[x0,#16*0]
221 	ldp	x2,x3,[x26,#16*1]
222 	ldp	x4,x5,[x26,#16*2]
223 	ldp	x6,x7,[x26,#16*3]
224 	ldp	x8,x9,[x26,#16*4]
225 	ldp	x10,x11,[x26,#16*5]
226 	ldp	x12,x13,[x26,#16*6]
227 	ldp	x14,x15,[x26,#16*7]
228 	ldp	x16,x17,[x26,#16*8]
229 	ldp	x25,x19,[x26,#16*9]
230 	ldp	x20,x21,[x26,#16*10]
231 	ldp	x22,x23,[x26,#16*11]
232 	ldr	x24,[x26,#16*12]
233 
234 	bl	KeccakF1600_int
235 
	// x26 is scratch inside KeccakF1600_int; recover the state pointer
	// from the stack and write all 25 lanes back.
236 	ldr	x26,[sp,#32]
237 	stp	x0,x1,[x26,#16*0]
238 	stp	x2,x3,[x26,#16*1]
239 	stp	x4,x5,[x26,#16*2]
240 	stp	x6,x7,[x26,#16*3]
241 	stp	x8,x9,[x26,#16*4]
242 	stp	x10,x11,[x26,#16*5]
243 	stp	x12,x13,[x26,#16*6]
244 	stp	x14,x15,[x26,#16*7]
245 	stp	x16,x17,[x26,#16*8]
246 	stp	x25,x19,[x26,#16*9]
247 	stp	x20,x21,[x26,#16*10]
248 	stp	x22,x23,[x26,#16*11]
249 	str	x24,[x26,#16*12]
250 
	// Restore callee-saved registers via the frame pointer (x29 still
	// points at the 128-byte frame) and drop the 48-byte scratch area.
251 	ldp	x19,x20,[x29,#16]
252 	add	sp,sp,#48
253 	ldp	x21,x22,[x29,#32]
254 	ldp	x23,x24,[x29,#48]
255 	ldp	x25,x26,[x29,#64]
256 	ldp	x27,x28,[x29,#80]
257 	ldp	x29,x30,[sp],#128
258 .long	0xd50323bf			// autiasp
259 	ret
260 
261 
// _SHA3_absorb -- XOR whole input blocks into the state and permute.
//
// In:    x0 = uint64_t A[5][5] (state), x1 = inp, x2 = len, x3 = bsz
// Out:   x0 = number of input bytes left unprocessed (always < bsz)
//
// While len >= bsz: the next bsz bytes of input are XORed, 8 bytes at a
// time, into successive state lanes, then KeccakF1600_int is run on the
// register-resident state.  The state is written back to A only once, on
// exit.  On big-endian builds every input word is byte-reversed first.
//
// The unrolled ladder only tests bsz at 8-byte boundaries, so bsz is
// presumably a non-zero multiple of 8 no larger than 200 -- confirm
// against callers.
//
// Stack below the 128-byte register save frame (64 bytes):
//   [sp,#0..31]  scratch owned by KeccakF1600_int
//   [sp,#32]     A          [sp,#40]  inp
//   [sp,#48]     len        [sp,#56]  bsz
262 .globl	_SHA3_absorb
263 
264 .align	5
265 _SHA3_absorb:
266 .long	0xd503233f			// paciasp
267 	stp	x29,x30,[sp,#-128]!
268 	add	x29,sp,#0
269 	stp	x19,x20,[sp,#16]
270 	stp	x21,x22,[sp,#32]
271 	stp	x23,x24,[sp,#48]
272 	stp	x25,x26,[sp,#64]
273 	stp	x27,x28,[sp,#80]
274 	sub	sp,sp,#64
275 
276 	stp	x0,x1,[sp,#32]			// offload arguments
277 	stp	x2,x3,[sp,#48]
278 
	// x0-x3 are about to hold state lanes; keep the arguments in the
	// registers that KeccakF1600_int treats as scratch (they are reloaded
	// from the stack after every call).
279 	mov	x26,x0			// uint64_t A[5][5]
280 	mov	x27,x1			// const void *inp
281 	mov	x28,x2			// size_t len
282 	mov	x30,x3			// size_t bsz
283 	ldp	x0,x1,[x26,#16*0]
284 	ldp	x2,x3,[x26,#16*1]
285 	ldp	x4,x5,[x26,#16*2]
286 	ldp	x6,x7,[x26,#16*3]
287 	ldp	x8,x9,[x26,#16*4]
288 	ldp	x10,x11,[x26,#16*5]
289 	ldp	x12,x13,[x26,#16*6]
290 	ldp	x14,x15,[x26,#16*7]
291 	ldp	x16,x17,[x26,#16*8]
292 	ldp	x25,x19,[x26,#16*9]
293 	ldp	x20,x21,[x26,#16*10]
294 	ldp	x22,x23,[x26,#16*11]
295 	ldr	x24,[x26,#16*12]
296 	b	Loop_absorb
297 
298 .align	4
299 Loop_absorb:
300 	subs	x26,x28,x30		// len - bsz
301 	blo	Labsorbed
302 
	// The decremented length overwrites the len slot now; it is picked up
	// again as "len" by the ldp after the permutation.
303 	str	x26,[sp,#48]			// save len - bsz
	// Ladder, two lanes per rung: after an even lane j, "cmp bsz,#8*(j+2)"
	// exits on "lower" (block ended with lane j); after the following odd
	// lane the same flags are reused and "beq" exits when the block ended
	// exactly there.  ldr/rev/eor leave the flags untouched.
304 	ldr	x26,[x27],#8		// *inp++
305 #ifdef	__AARCH64EB__
306 	rev	x26,x26
307 #endif
308 	eor	x0,x0,x26
309 	cmp	x30,#8*(0+2)
310 	blo	Lprocess_block
311 	ldr	x26,[x27],#8		// *inp++
312 #ifdef	__AARCH64EB__
313 	rev	x26,x26
314 #endif
315 	eor	x1,x1,x26
316 	beq	Lprocess_block
317 	ldr	x26,[x27],#8		// *inp++
318 #ifdef	__AARCH64EB__
319 	rev	x26,x26
320 #endif
321 	eor	x2,x2,x26
322 	cmp	x30,#8*(2+2)
323 	blo	Lprocess_block
324 	ldr	x26,[x27],#8		// *inp++
325 #ifdef	__AARCH64EB__
326 	rev	x26,x26
327 #endif
328 	eor	x3,x3,x26
329 	beq	Lprocess_block
330 	ldr	x26,[x27],#8		// *inp++
331 #ifdef	__AARCH64EB__
332 	rev	x26,x26
333 #endif
334 	eor	x4,x4,x26
335 	cmp	x30,#8*(4+2)
336 	blo	Lprocess_block
337 	ldr	x26,[x27],#8		// *inp++
338 #ifdef	__AARCH64EB__
339 	rev	x26,x26
340 #endif
341 	eor	x5,x5,x26
342 	beq	Lprocess_block
343 	ldr	x26,[x27],#8		// *inp++
344 #ifdef	__AARCH64EB__
345 	rev	x26,x26
346 #endif
347 	eor	x6,x6,x26
348 	cmp	x30,#8*(6+2)
349 	blo	Lprocess_block
350 	ldr	x26,[x27],#8		// *inp++
351 #ifdef	__AARCH64EB__
352 	rev	x26,x26
353 #endif
354 	eor	x7,x7,x26
355 	beq	Lprocess_block
356 	ldr	x26,[x27],#8		// *inp++
357 #ifdef	__AARCH64EB__
358 	rev	x26,x26
359 #endif
360 	eor	x8,x8,x26
361 	cmp	x30,#8*(8+2)
362 	blo	Lprocess_block
363 	ldr	x26,[x27],#8		// *inp++
364 #ifdef	__AARCH64EB__
365 	rev	x26,x26
366 #endif
367 	eor	x9,x9,x26
368 	beq	Lprocess_block
369 	ldr	x26,[x27],#8		// *inp++
370 #ifdef	__AARCH64EB__
371 	rev	x26,x26
372 #endif
373 	eor	x10,x10,x26
374 	cmp	x30,#8*(10+2)
375 	blo	Lprocess_block
376 	ldr	x26,[x27],#8		// *inp++
377 #ifdef	__AARCH64EB__
378 	rev	x26,x26
379 #endif
380 	eor	x11,x11,x26
381 	beq	Lprocess_block
382 	ldr	x26,[x27],#8		// *inp++
383 #ifdef	__AARCH64EB__
384 	rev	x26,x26
385 #endif
386 	eor	x12,x12,x26
387 	cmp	x30,#8*(12+2)
388 	blo	Lprocess_block
389 	ldr	x26,[x27],#8		// *inp++
390 #ifdef	__AARCH64EB__
391 	rev	x26,x26
392 #endif
393 	eor	x13,x13,x26
394 	beq	Lprocess_block
395 	ldr	x26,[x27],#8		// *inp++
396 #ifdef	__AARCH64EB__
397 	rev	x26,x26
398 #endif
399 	eor	x14,x14,x26
400 	cmp	x30,#8*(14+2)
401 	blo	Lprocess_block
402 	ldr	x26,[x27],#8		// *inp++
403 #ifdef	__AARCH64EB__
404 	rev	x26,x26
405 #endif
406 	eor	x15,x15,x26
407 	beq	Lprocess_block
408 	ldr	x26,[x27],#8		// *inp++
409 #ifdef	__AARCH64EB__
410 	rev	x26,x26
411 #endif
412 	eor	x16,x16,x26
413 	cmp	x30,#8*(16+2)
414 	blo	Lprocess_block
415 	ldr	x26,[x27],#8		// *inp++
416 #ifdef	__AARCH64EB__
417 	rev	x26,x26
418 #endif
419 	eor	x17,x17,x26
420 	beq	Lprocess_block
421 	ldr	x26,[x27],#8		// *inp++
422 #ifdef	__AARCH64EB__
423 	rev	x26,x26
424 #endif
	// lane 18 lives in x25, not x18
425 	eor	x25,x25,x26
426 	cmp	x30,#8*(18+2)
427 	blo	Lprocess_block
428 	ldr	x26,[x27],#8		// *inp++
429 #ifdef	__AARCH64EB__
430 	rev	x26,x26
431 #endif
432 	eor	x19,x19,x26
433 	beq	Lprocess_block
434 	ldr	x26,[x27],#8		// *inp++
435 #ifdef	__AARCH64EB__
436 	rev	x26,x26
437 #endif
438 	eor	x20,x20,x26
439 	cmp	x30,#8*(20+2)
440 	blo	Lprocess_block
441 	ldr	x26,[x27],#8		// *inp++
442 #ifdef	__AARCH64EB__
443 	rev	x26,x26
444 #endif
445 	eor	x21,x21,x26
446 	beq	Lprocess_block
447 	ldr	x26,[x27],#8		// *inp++
448 #ifdef	__AARCH64EB__
449 	rev	x26,x26
450 #endif
451 	eor	x22,x22,x26
452 	cmp	x30,#8*(22+2)
453 	blo	Lprocess_block
454 	ldr	x26,[x27],#8		// *inp++
455 #ifdef	__AARCH64EB__
456 	rev	x26,x26
457 #endif
458 	eor	x23,x23,x26
459 	beq	Lprocess_block
	// Only reached when bsz exceeds 24 lanes: absorb the 25th lane too.
460 	ldr	x26,[x27],#8		// *inp++
461 #ifdef	__AARCH64EB__
462 	rev	x26,x26
463 #endif
464 	eor	x24,x24,x26
465 
466 Lprocess_block:
467 	str	x27,[sp,#40]			// save inp
468 
469 	bl	KeccakF1600_int
470 
	// x27/x28/x30 were scratch during the permutation; x28 now receives
	// the already-decremented length stored at the top of the loop.
471 	ldr	x27,[sp,#40]			// restore arguments
472 	ldp	x28,x30,[sp,#48]
473 	b	Loop_absorb
474 
475 .align	4
476 Labsorbed:
	// Fewer than bsz bytes remain: write the state back to A.
477 	ldr	x27,[sp,#32]
478 	stp	x0,x1,[x27,#16*0]
479 	stp	x2,x3,[x27,#16*1]
480 	stp	x4,x5,[x27,#16*2]
481 	stp	x6,x7,[x27,#16*3]
482 	stp	x8,x9,[x27,#16*4]
483 	stp	x10,x11,[x27,#16*5]
484 	stp	x12,x13,[x27,#16*6]
485 	stp	x14,x15,[x27,#16*7]
486 	stp	x16,x17,[x27,#16*8]
487 	stp	x25,x19,[x27,#16*9]
488 	stp	x20,x21,[x27,#16*10]
489 	stp	x22,x23,[x27,#16*11]
490 	str	x24,[x27,#16*12]
491 
	// x28 still holds the length from before the failed subtraction.
492 	mov	x0,x28			// return value
493 	ldp	x19,x20,[x29,#16]
494 	add	sp,sp,#64
495 	ldp	x21,x22,[x29,#32]
496 	ldp	x23,x24,[x29,#48]
497 	ldp	x25,x26,[x29,#64]
498 	ldp	x27,x28,[x29,#80]
499 	ldp	x29,x30,[sp],#128
500 .long	0xd50323bf			// autiasp
501 	ret
502 
// _SHA3_squeeze -- copy output bytes out of the state, permuting the
// state whenever a full block of bsz bytes has been emitted.
//
// In:    x0 = state (25 x 64-bit lanes), x1 = out, x2 = len, x3 = bsz
// Out:   len bytes written at out; the state is updated by every
//        intermediate permutation.  No return value.
//
// Whole 8-byte words are stored directly (byte-reversed first on
// big-endian builds so the memory image is the same); a final group of
// 1..7 bytes is stored one byte at a time, least-significant byte first.
//
// A zero-length request returns immediately.  Without that guard the code
// would fall into the byte-wise tail, whose "subs/beq" test can never see
// zero when starting from 0, and would store 7 bytes past a buffer the
// caller declared empty.
//
// Register roles: x19 = state base, x20 = out cursor, x21 = bytes still
// to emit, x22 = bsz; x0 = read cursor in the state, x3 = bytes left in
// the current block.
503 .globl	_SHA3_squeeze
504 
505 .align	5
506 _SHA3_squeeze:
507 .long	0xd503233f			// paciasp
508 	stp	x29,x30,[sp,#-48]!
509 	add	x29,sp,#0
510 	stp	x19,x20,[sp,#16]
511 	stp	x21,x22,[sp,#32]
	cbz	x2,Lsqueeze_done		// len == 0: emit nothing, touch nothing
512 
513 	mov	x19,x0			// put aside arguments
514 	mov	x20,x1
515 	mov	x21,x2
516 	mov	x22,x3
517 
518 Loop_squeeze:
519 	ldr	x4,[x0],#8
520 	cmp	x21,#8
521 	blo	Lsqueeze_tail
522 #ifdef	__AARCH64EB__
523 	rev	x4,x4
524 #endif
525 	str	x4,[x20],#8
526 	subs	x21,x21,#8
527 	beq	Lsqueeze_done
528 
	// More output wanted: keep reading lanes until this block is used up.
529 	subs	x3,x3,#8
530 	bhi	Loop_squeeze
531 
	// Block exhausted: permute the state and restart from lane 0.
	// x0 and x3 are caller-saved, so both are re-derived after the call.
532 	mov	x0,x19
533 	bl	KeccakF1600
534 	mov	x0,x19
535 	mov	x3,x22
536 	b	Loop_squeeze
537 
538 .align	4
539 Lsqueeze_tail:
	// 1..7 bytes remain; x4 holds the current lane.  Emit its bytes from
	// the least-significant end, stopping as soon as the count hits zero.
	// The seventh store needs no test: at most 7 bytes can be pending.
540 	strb	w4,[x20],#1
541 	lsr	x4,x4,#8
542 	subs	x21,x21,#1
543 	beq	Lsqueeze_done
544 	strb	w4,[x20],#1
545 	lsr	x4,x4,#8
546 	subs	x21,x21,#1
547 	beq	Lsqueeze_done
548 	strb	w4,[x20],#1
549 	lsr	x4,x4,#8
550 	subs	x21,x21,#1
551 	beq	Lsqueeze_done
552 	strb	w4,[x20],#1
553 	lsr	x4,x4,#8
554 	subs	x21,x21,#1
555 	beq	Lsqueeze_done
556 	strb	w4,[x20],#1
557 	lsr	x4,x4,#8
558 	subs	x21,x21,#1
559 	beq	Lsqueeze_done
560 	strb	w4,[x20],#1
561 	lsr	x4,x4,#8
562 	subs	x21,x21,#1
563 	beq	Lsqueeze_done
564 	strb	w4,[x20],#1
565 
566 Lsqueeze_done:
567 	ldp	x19,x20,[sp,#16]
568 	ldp	x21,x22,[sp,#32]
569 	ldp	x29,x30,[sp],#48
570 .long	0xd50323bf			// autiasp
571 	ret
572 
573 
// KeccakF1600_ce -- Keccak permutation using the SHA3 vector instructions
// (file-internal leaf routine).
//
// In/out: state lane k is held in vector register vk, k = 0..24 (callers
//         load and store the low 64 bits through d0-d24).
// Scratch: v25-v31, x9 (round counter, starts at 24), x10 (pointer into
//         iotas), condition flags.  No stack is used and x30 is left
//         untouched, so the routine returns with a plain "ret".
// The eor3/rax1/xar/bcax instructions are emitted as raw .long words;
// the trailing comment on each word gives the instruction it encodes.
574 .align	5
575 KeccakF1600_ce:
576 	mov	x9,#24
577 	adr	x10,iotas
578 	b	Loop_ce
579 .align	4
580 Loop_ce:
581 	////////////////////////////////////////////////// Theta
	// Column parities into v25..v29: three rows per eor3, then the
	// remaining two rows folded in by a second eor3.
582 .long	0xce0f2a99	//eor3 v25.16b,v20.16b,v15.16b,v10.16b
583 .long	0xce102eba	//eor3 v26.16b,v21.16b,v16.16b,v11.16b
584 .long	0xce1132db	//eor3 v27.16b,v22.16b,v17.16b,v12.16b
585 .long	0xce1236fc	//eor3 v28.16b,v23.16b,v18.16b,v13.16b
586 .long	0xce133b1d	//eor3 v29.16b,v24.16b,v19.16b,v14.16b
587 .long	0xce050339	//eor3 v25.16b,v25.16b,   v5.16b,v0.16b
588 .long	0xce06075a	//eor3 v26.16b,v26.16b,   v6.16b,v1.16b
589 .long	0xce070b7b	//eor3 v27.16b,v27.16b,   v7.16b,v2.16b
590 .long	0xce080f9c	//eor3 v28.16b,v28.16b,   v8.16b,v3.16b
591 .long	0xce0913bd	//eor3 v29.16b,v29.16b,   v9.16b,v4.16b
592 
	// rax1 d,n,m = n ^ rol(m,1): the per-column correction words.  The
	// last three overwrite parity registers in place, so the order of
	// these five lines is significant.
593 .long	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b			// D[1]
594 .long	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b			// D[2]
595 .long	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b			// D[3]
596 .long	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b			// D[4]
597 .long	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b			// D[0]
598 
599 	////////////////////////////////////////////////// Theta+Rho+Pi
	// xar d,n,m,#r = ror(n ^ m, r): applies the column correction and the
	// lane rotation at once, writing the result to the lane's new slot.
	// Destinations are chained so each register is read before it is
	// overwritten; where no free slot exists the result is parked in a
	// scratch or correction register, as the "X=Y" notes record.
600 .long	0xce9efc39	//xar v25.16b,   v1.16b,v30.16b,#64-1 // C[0]=A[2][0]
601 
602 .long	0xce9e50c1	//xar v1.16b,v6.16b,v30.16b,#64-44
603 .long	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
604 .long	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
605 .long	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
606 .long	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18
607 
608 .long	0xce9f085a	//xar v26.16b,   v2.16b,v31.16b,#64-62 // C[1]=A[4][0]
609 
610 .long	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
611 .long	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
612 .long	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
613 .long	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
614 .long	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41
615 
616 .long	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27
617 
618 .long	0xce9ccb1c	//xar v28.16b,   v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
619 .long	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
620 .long	0xce9b2508	//xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
621 .long	0xce9e4e04	//xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
622 .long	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36
623 
624 .long	0xce9b9065	//xar v5.16b,v3.16b,v27.16b,#64-28
625 
	// Lane 0 is not rotated or moved; it only receives its correction word.
626 	eor	v0.16b,v0.16b,v29.16b
627 
628 .long	0xce9bae5b	//xar v27.16b,   v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
629 .long	0xce9fc623	//xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
630 .long	0xce9ed97e	//xar v30.16b,   v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
631 .long	0xce9fe8ff	//xar v31.16b,   v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
632 .long	0xce9df55d	//xar v29.16b,   v10.16b,v29.16b,#64-3 // D[0]=A[1][2]
633 
634 	////////////////////////////////////////////////// Chi+Iota
	// bcax d,n,m,a = n ^ (m & ~a).  Rows are processed from the last to
	// the first, reading the parked values from the registers named in
	// the notes and writing every lane to its final vk.
635 .long	0xce362354	//bcax v20.16b,v26.16b,   v22.16b,v8.16b	// A[1][3]=A[4][1]
636 .long	0xce375915	//bcax v21.16b,v8.16b,v23.16b,v22.16b	// A[1][3]=A[4][1]
637 .long	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
638 .long	0xce3a62f7	//bcax v23.16b,v23.16b,v26.16b,   v24.16b
639 .long	0xce286b18	//bcax v24.16b,v24.16b,v8.16b,v26.16b	// A[1][3]=A[4][1]
640 
	// v26 is free from here on: fetch this round's constant, replicated
	// into both 64-bit elements, and advance the table pointer.
641 	ld1r	{v26.2d},[x10],#8
642 
643 .long	0xce330fd1	//bcax v17.16b,v30.16b,   v19.16b,v3.16b	// A[0][3]=A[3][3]
644 .long	0xce2f4c72	//bcax v18.16b,v3.16b,v15.16b,v19.16b	// A[0][3]=A[3][3]
645 .long	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b
646 .long	0xce3e41ef	//bcax v15.16b,v15.16b,v30.16b,   v16.16b
647 .long	0xce237a10	//bcax v16.16b,v16.16b,v3.16b,v30.16b	// A[0][3]=A[3][3]
648 
649 .long	0xce2c7f2a	//bcax v10.16b,v25.16b,   v12.16b,v31.16b
650 .long	0xce2d33eb	//bcax v11.16b,v31.16b,   v13.16b,v12.16b
651 .long	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
652 .long	0xce3939ad	//bcax v13.16b,v13.16b,v25.16b,   v14.16b
653 .long	0xce3f65ce	//bcax v14.16b,v14.16b,v31.16b,   v25.16b
654 
655 .long	0xce2913a7	//bcax v7.16b,v29.16b,   v9.16b,v4.16b	// A[0][4]=A[1][3]
656 .long	0xce252488	//bcax v8.16b,v4.16b,v5.16b,v9.16b	// A[0][4]=A[1][3]
657 .long	0xce261529	//bcax v9.16b,v9.16b,v6.16b,v5.16b
658 .long	0xce3d18a5	//bcax v5.16b,v5.16b,v29.16b,   v6.16b
659 .long	0xce2474c6	//bcax v6.16b,v6.16b,v4.16b,v29.16b	// A[0][4]=A[1][3]
660 
661 .long	0xce207363	//bcax v3.16b,v27.16b,   v0.16b,v28.16b
662 .long	0xce210384	//bcax v4.16b,v28.16b,   v1.16b,v0.16b
663 .long	0xce220400	//bcax v0.16b,v0.16b,v2.16b,v1.16b
664 .long	0xce3b0821	//bcax v1.16b,v1.16b,v27.16b,   v2.16b
665 .long	0xce3c6c42	//bcax v2.16b,v2.16b,v28.16b,   v27.16b
666 
	// Iota: fold the round constant into lane 0.
667 	eor	v0.16b,v0.16b,v26.16b
668 
669 	subs	x9,x9,#1
670 	bne	Loop_ce
671 
672 	ret
673 
674 
675 
// KeccakF1600_cext -- apply the SHA3-extension permutation to a state in
// memory (file-internal; called from _SHA3_squeeze_cext).
//
// In:    x0 = pointer to 25 consecutive 64-bit lanes
// Out:   the state at that address is replaced by the permuted state
// Frame: 80 bytes: x29/x30 at [sp,#0], d8-d15 at [sp,#16..79].
// x0 is relied upon to survive the call to KeccakF1600_ce, which only
// uses x9 and x10 among the general-purpose registers.
676 .align	5
677 KeccakF1600_cext:
678 .long	0xd503233f		// paciasp
679 	stp	x29,x30,[sp,#-80]!
680 	add	x29,sp,#0
681 	stp	d8,d9,[sp,#16]		// per ABI requirement
682 	stp	d10,d11,[sp,#32]
683 	stp	d12,d13,[sp,#48]
684 	stp	d14,d15,[sp,#64]
	// Lane k -> dk (the low half of vk), k = 0..24.
685 	ldp	d0,d1,[x0,#8*0]
686 	ldp	d2,d3,[x0,#8*2]
687 	ldp	d4,d5,[x0,#8*4]
688 	ldp	d6,d7,[x0,#8*6]
689 	ldp	d8,d9,[x0,#8*8]
690 	ldp	d10,d11,[x0,#8*10]
691 	ldp	d12,d13,[x0,#8*12]
692 	ldp	d14,d15,[x0,#8*14]
693 	ldp	d16,d17,[x0,#8*16]
694 	ldp	d18,d19,[x0,#8*18]
695 	ldp	d20,d21,[x0,#8*20]
696 	ldp	d22,d23,[x0,#8*22]
697 	ldr	d24,[x0,#8*24]
698 	bl	KeccakF1600_ce
	// "bl" overwrote x30; fetch our own signed return address back from
	// the frame so the autiasp below authenticates the right value.
699 	ldr	x30,[sp,#8]
700 	stp	d0,d1,[x0,#8*0]
701 	stp	d2,d3,[x0,#8*2]
702 	stp	d4,d5,[x0,#8*4]
703 	stp	d6,d7,[x0,#8*6]
704 	stp	d8,d9,[x0,#8*8]
705 	stp	d10,d11,[x0,#8*10]
706 	stp	d12,d13,[x0,#8*12]
707 	stp	d14,d15,[x0,#8*14]
708 	stp	d16,d17,[x0,#8*16]
709 	stp	d18,d19,[x0,#8*18]
710 	stp	d20,d21,[x0,#8*20]
711 	stp	d22,d23,[x0,#8*22]
712 	str	d24,[x0,#8*24]
713 
714 	ldp	d8,d9,[sp,#16]
715 	ldp	d10,d11,[sp,#32]
716 	ldp	d12,d13,[sp,#48]
717 	ldp	d14,d15,[sp,#64]
	// x30 was already reloaded above, so only x29 is popped with the frame.
718 	ldr	x29,[sp],#80
719 .long	0xd50323bf		// autiasp
720 	ret
721 
// _SHA3_absorb_cext -- SHA3-extension variant of the absorb routine: XOR
// whole input blocks into the state and permute with KeccakF1600_ce.
//
// In:    x0 = state (25 x 64-bit lanes), x1 = inp, x2 = len, x3 = bsz
// Out:   x0 = number of input bytes left unprocessed (always < bsz)
//
// The state is kept in d0-d24 for the whole call and written back once on
// exit.  x0-x3 stay live across the calls to KeccakF1600_ce, which only
// uses x9 and x10 among the general-purpose registers.  v31 is the input
// staging register; it is reloaded for every word, so its being scratch
// inside KeccakF1600_ce does not matter.
//
// The unrolled ladder only tests bsz at 8-byte boundaries, so bsz is
// presumably a non-zero multiple of 8 no larger than 200 -- confirm
// against callers.
722 .globl	_SHA3_absorb_cext
723 
724 .align	5
725 _SHA3_absorb_cext:
726 .long	0xd503233f		// paciasp
727 	stp	x29,x30,[sp,#-80]!
728 	add	x29,sp,#0
729 	stp	d8,d9,[sp,#16]		// per ABI requirement
730 	stp	d10,d11,[sp,#32]
731 	stp	d12,d13,[sp,#48]
732 	stp	d14,d15,[sp,#64]
733 	ldp	d0,d1,[x0,#8*0]
734 	ldp	d2,d3,[x0,#8*2]
735 	ldp	d4,d5,[x0,#8*4]
736 	ldp	d6,d7,[x0,#8*6]
737 	ldp	d8,d9,[x0,#8*8]
738 	ldp	d10,d11,[x0,#8*10]
739 	ldp	d12,d13,[x0,#8*12]
740 	ldp	d14,d15,[x0,#8*14]
741 	ldp	d16,d17,[x0,#8*16]
742 	ldp	d18,d19,[x0,#8*18]
743 	ldp	d20,d21,[x0,#8*20]
744 	ldp	d22,d23,[x0,#8*22]
745 	ldr	d24,[x0,#8*24]
746 	b	Loop_absorb_ce
747 
748 .align	4
749 Loop_absorb_ce:
	// len is decremented in place; when the subtraction borrows, the exit
	// path adds bsz back to recover the true remainder.
750 	subs	x2,x2,x3		// len - bsz
751 	blo	Labsorbed_ce
	// Ladder, two lanes per rung: after an even lane j, "cmp bsz,#8*(j+2)"
	// exits on "lower"; after the following odd lane the same flags are
	// reused and "beq" exits when the block ended exactly there.  The
	// ldr/rev64/eor in between leave the flags untouched.
752 	ldr	d31,[x1],#8		// *inp++
753 #ifdef	__AARCH64EB__
754 	rev64	v31.16b,v31.16b
755 #endif
756 	eor	v0.16b,v0.16b,v31.16b
757 	cmp	x3,#8*(0+2)
758 	blo	Lprocess_block_ce
759 	ldr	d31,[x1],#8		// *inp++
760 #ifdef	__AARCH64EB__
761 	rev64	v31.16b,v31.16b
762 #endif
763 	eor	v1.16b,v1.16b,v31.16b
764 	beq	Lprocess_block_ce
765 	ldr	d31,[x1],#8		// *inp++
766 #ifdef	__AARCH64EB__
767 	rev64	v31.16b,v31.16b
768 #endif
769 	eor	v2.16b,v2.16b,v31.16b
770 	cmp	x3,#8*(2+2)
771 	blo	Lprocess_block_ce
772 	ldr	d31,[x1],#8		// *inp++
773 #ifdef	__AARCH64EB__
774 	rev64	v31.16b,v31.16b
775 #endif
776 	eor	v3.16b,v3.16b,v31.16b
777 	beq	Lprocess_block_ce
778 	ldr	d31,[x1],#8		// *inp++
779 #ifdef	__AARCH64EB__
780 	rev64	v31.16b,v31.16b
781 #endif
782 	eor	v4.16b,v4.16b,v31.16b
783 	cmp	x3,#8*(4+2)
784 	blo	Lprocess_block_ce
785 	ldr	d31,[x1],#8		// *inp++
786 #ifdef	__AARCH64EB__
787 	rev64	v31.16b,v31.16b
788 #endif
789 	eor	v5.16b,v5.16b,v31.16b
790 	beq	Lprocess_block_ce
791 	ldr	d31,[x1],#8		// *inp++
792 #ifdef	__AARCH64EB__
793 	rev64	v31.16b,v31.16b
794 #endif
795 	eor	v6.16b,v6.16b,v31.16b
796 	cmp	x3,#8*(6+2)
797 	blo	Lprocess_block_ce
798 	ldr	d31,[x1],#8		// *inp++
799 #ifdef	__AARCH64EB__
800 	rev64	v31.16b,v31.16b
801 #endif
802 	eor	v7.16b,v7.16b,v31.16b
803 	beq	Lprocess_block_ce
804 	ldr	d31,[x1],#8		// *inp++
805 #ifdef	__AARCH64EB__
806 	rev64	v31.16b,v31.16b
807 #endif
808 	eor	v8.16b,v8.16b,v31.16b
809 	cmp	x3,#8*(8+2)
810 	blo	Lprocess_block_ce
811 	ldr	d31,[x1],#8		// *inp++
812 #ifdef	__AARCH64EB__
813 	rev64	v31.16b,v31.16b
814 #endif
815 	eor	v9.16b,v9.16b,v31.16b
816 	beq	Lprocess_block_ce
817 	ldr	d31,[x1],#8		// *inp++
818 #ifdef	__AARCH64EB__
819 	rev64	v31.16b,v31.16b
820 #endif
821 	eor	v10.16b,v10.16b,v31.16b
822 	cmp	x3,#8*(10+2)
823 	blo	Lprocess_block_ce
824 	ldr	d31,[x1],#8		// *inp++
825 #ifdef	__AARCH64EB__
826 	rev64	v31.16b,v31.16b
827 #endif
828 	eor	v11.16b,v11.16b,v31.16b
829 	beq	Lprocess_block_ce
830 	ldr	d31,[x1],#8		// *inp++
831 #ifdef	__AARCH64EB__
832 	rev64	v31.16b,v31.16b
833 #endif
834 	eor	v12.16b,v12.16b,v31.16b
835 	cmp	x3,#8*(12+2)
836 	blo	Lprocess_block_ce
837 	ldr	d31,[x1],#8		// *inp++
838 #ifdef	__AARCH64EB__
839 	rev64	v31.16b,v31.16b
840 #endif
841 	eor	v13.16b,v13.16b,v31.16b
842 	beq	Lprocess_block_ce
843 	ldr	d31,[x1],#8		// *inp++
844 #ifdef	__AARCH64EB__
845 	rev64	v31.16b,v31.16b
846 #endif
847 	eor	v14.16b,v14.16b,v31.16b
848 	cmp	x3,#8*(14+2)
849 	blo	Lprocess_block_ce
850 	ldr	d31,[x1],#8		// *inp++
851 #ifdef	__AARCH64EB__
852 	rev64	v31.16b,v31.16b
853 #endif
854 	eor	v15.16b,v15.16b,v31.16b
855 	beq	Lprocess_block_ce
856 	ldr	d31,[x1],#8		// *inp++
857 #ifdef	__AARCH64EB__
858 	rev64	v31.16b,v31.16b
859 #endif
860 	eor	v16.16b,v16.16b,v31.16b
861 	cmp	x3,#8*(16+2)
862 	blo	Lprocess_block_ce
863 	ldr	d31,[x1],#8		// *inp++
864 #ifdef	__AARCH64EB__
865 	rev64	v31.16b,v31.16b
866 #endif
867 	eor	v17.16b,v17.16b,v31.16b
868 	beq	Lprocess_block_ce
869 	ldr	d31,[x1],#8		// *inp++
870 #ifdef	__AARCH64EB__
871 	rev64	v31.16b,v31.16b
872 #endif
873 	eor	v18.16b,v18.16b,v31.16b
874 	cmp	x3,#8*(18+2)
875 	blo	Lprocess_block_ce
876 	ldr	d31,[x1],#8		// *inp++
877 #ifdef	__AARCH64EB__
878 	rev64	v31.16b,v31.16b
879 #endif
880 	eor	v19.16b,v19.16b,v31.16b
881 	beq	Lprocess_block_ce
882 	ldr	d31,[x1],#8		// *inp++
883 #ifdef	__AARCH64EB__
884 	rev64	v31.16b,v31.16b
885 #endif
886 	eor	v20.16b,v20.16b,v31.16b
887 	cmp	x3,#8*(20+2)
888 	blo	Lprocess_block_ce
889 	ldr	d31,[x1],#8		// *inp++
890 #ifdef	__AARCH64EB__
891 	rev64	v31.16b,v31.16b
892 #endif
893 	eor	v21.16b,v21.16b,v31.16b
894 	beq	Lprocess_block_ce
895 	ldr	d31,[x1],#8		// *inp++
896 #ifdef	__AARCH64EB__
897 	rev64	v31.16b,v31.16b
898 #endif
899 	eor	v22.16b,v22.16b,v31.16b
900 	cmp	x3,#8*(22+2)
901 	blo	Lprocess_block_ce
902 	ldr	d31,[x1],#8		// *inp++
903 #ifdef	__AARCH64EB__
904 	rev64	v31.16b,v31.16b
905 #endif
906 	eor	v23.16b,v23.16b,v31.16b
907 	beq	Lprocess_block_ce
	// Only reached when bsz exceeds 24 lanes: absorb the 25th lane too.
908 	ldr	d31,[x1],#8		// *inp++
909 #ifdef	__AARCH64EB__
910 	rev64	v31.16b,v31.16b
911 #endif
912 	eor	v24.16b,v24.16b,v31.16b
913 
914 Lprocess_block_ce:
915 
916 	bl	KeccakF1600_ce
917 
918 	b	Loop_absorb_ce
919 
920 .align	4
921 Labsorbed_ce:
	// Fewer than bsz bytes remain: write the state back.
922 	stp	d0,d1,[x0,#8*0]
923 	stp	d2,d3,[x0,#8*2]
924 	stp	d4,d5,[x0,#8*4]
925 	stp	d6,d7,[x0,#8*6]
926 	stp	d8,d9,[x0,#8*8]
927 	stp	d10,d11,[x0,#8*10]
928 	stp	d12,d13,[x0,#8*12]
929 	stp	d14,d15,[x0,#8*14]
930 	stp	d16,d17,[x0,#8*16]
931 	stp	d18,d19,[x0,#8*18]
932 	stp	d20,d21,[x0,#8*20]
933 	stp	d22,d23,[x0,#8*22]
934 	str	d24,[x0,#8*24]
	// x2 holds (remaining - bsz) after the borrowed subtraction; undo it.
935 	add	x0,x2,x3		// return value
936 
937 	ldp	d8,d9,[sp,#16]
938 	ldp	d10,d11,[sp,#32]
939 	ldp	d12,d13,[sp,#48]
940 	ldp	d14,d15,[sp,#64]
	// The frame slot still holds the signed return address saved at entry;
	// the intervening "bl" calls only changed the register copy.
941 	ldp	x29,x30,[sp],#80
942 .long	0xd50323bf		// autiasp
943 	ret
944 
// _SHA3_squeeze_cext -- SHA3-extension variant of the squeeze routine:
// copy output bytes out of the state, permuting it with KeccakF1600_cext
// whenever a full block of bsz bytes has been emitted.
//
// In:    x0 = state (25 x 64-bit lanes), x1 = out, x2 = len, x3 = bsz
// Out:   len bytes written at out; the state is updated by every
//        intermediate permutation.  No return value.
//
// Whole 8-byte words are stored directly (byte-reversed first on
// big-endian builds); a final group of 1..7 bytes is stored one byte at a
// time, least-significant byte first.
//
// A zero-length request returns immediately.  Without that guard the code
// would fall into the byte-wise tail, whose "subs/beq" test can never see
// zero when starting from 0, and would store 7 bytes past a buffer the
// caller declared empty.
//
// Register roles: x9 = read cursor in the state, x10 = bytes left in the
// current block.  x0-x3 stay live across the call to KeccakF1600_cext,
// which, together with KeccakF1600_ce, only uses x9 and x10 among the
// general-purpose argument/scratch registers.
945 .globl	_SHA3_squeeze_cext
946 
947 .align	5
948 _SHA3_squeeze_cext:
949 .long	0xd503233f		// paciasp
950 	stp	x29,x30,[sp,#-16]!
951 	add	x29,sp,#0
	cbz	x2,Lsqueeze_done_ce	// len == 0: emit nothing, touch nothing
952 	mov	x9,x0
953 	mov	x10,x3
954 
955 Loop_squeeze_ce:
956 	ldr	x4,[x9],#8
957 	cmp	x2,#8
958 	blo	Lsqueeze_tail_ce
959 #ifdef	__AARCH64EB__
960 	rev	x4,x4
961 #endif
962 	str	x4,[x1],#8
	// Flags are still those of "cmp x2,#8": equal means this word was
	// the last one requested.
963 	beq	Lsqueeze_done_ce
964 
965 	sub	x2,x2,#8
966 	subs	x10,x10,#8
967 	bhi	Loop_squeeze_ce
968 
	// Block exhausted: permute the state (x0 still points at it) and
	// restart from lane 0.  "bl" overwrote x30, so the signed return
	// address is fetched back from the frame.
969 	bl	KeccakF1600_cext
970 	ldr	x30,[sp,#8]
971 	mov	x9,x0
972 	mov	x10,x3
973 	b	Loop_squeeze_ce
974 
975 .align	4
976 Lsqueeze_tail_ce:
	// 1..7 bytes remain; x4 holds the current lane.  Emit its bytes from
	// the least-significant end, stopping as soon as the count hits zero.
	// The seventh store needs no test: at most 7 bytes can be pending.
977 	strb	w4,[x1],#1
978 	lsr	x4,x4,#8
979 	subs	x2,x2,#1
980 	beq	Lsqueeze_done_ce
981 	strb	w4,[x1],#1
982 	lsr	x4,x4,#8
983 	subs	x2,x2,#1
984 	beq	Lsqueeze_done_ce
985 	strb	w4,[x1],#1
986 	lsr	x4,x4,#8
987 	subs	x2,x2,#1
988 	beq	Lsqueeze_done_ce
989 	strb	w4,[x1],#1
990 	lsr	x4,x4,#8
991 	subs	x2,x2,#1
992 	beq	Lsqueeze_done_ce
993 	strb	w4,[x1],#1
994 	lsr	x4,x4,#8
995 	subs	x2,x2,#1
996 	beq	Lsqueeze_done_ce
997 	strb	w4,[x1],#1
998 	lsr	x4,x4,#8
999 	subs	x2,x2,#1
1000 	beq	Lsqueeze_done_ce
1001 	strb	w4,[x1],#1
1002 
1003 Lsqueeze_done_ce:
	// x30 already holds the signed return address on every path that
	// reaches this point, so only x29 is popped with the frame.
1004 	ldr	x29,[sp],#16
1005 .long	0xd50323bf		// autiasp
1006 	ret
1007 
// NUL-terminated ASCII identification string embedded in the object:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1008 .byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1009 .align	2
1010