/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
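	/*
	 * Note (illustration only): vmull.p8 performs eight independent
	 * 8x8 -> 16 bit carryless multiplies, one per byte lane; in C-like
	 * pseudocode a single vmull.p8 amounts to
	 *
	 *	for (i = 0; i < 8; i++)
	 *		r[i] = clmul_8x8(a[i], b[i]);
	 *
	 * (clmul_8x8 being a hypothetical 8x8 carryless multiply).  The macro
	 * below therefore assembles the full 64x64 product from the
	 * "diagonal" product D = A*B plus correction terms computed from the
	 * byte-rotated operands A1..A3 and B1..B4, each masked (k16/k32/k48)
	 * and shifted into position before being XORed into the result.
	 */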
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
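	//
	// Note: the callers set MASK to 0xe1 splatted across the lane and
	// shifted left by 57, i.e. the 64-bit constant 0xc200000000000000.
	// 0xe1 is the bit-reversed form of 0x87 = x^7 + x^2 + x + 1, the low
	// coefficients of the GHASH polynomial x^128 + x^7 + x^2 + x + 1, so
	// the two vmull.p64 by MASK below implement the reduction of the
	// double-width product modulo that polynomial, one 64-bit fold at a
	// time, in the bit-reflected representation GHASH uses.
	//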
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
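	//
	// Note: this computes the same reduction modulo
	// x^128 + x^7 + x^2 + x + 1 as __pmull_reduce_p64, but with plain
	// 64-bit shifts and XORs instead of a carryless multiply by the
	// 0xc2... constant: the shift counts used below (57 = 64 - 7,
	// 62 = 64 - 2, 63 = 64 - 1, and the small right shifts that follow)
	// all come from the exponents 7, 2 and 1 of the polynomial.
	//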
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

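	/*
	 * Core GHASH loop: r0 = #blocks, r1 = digest, r2 = src, r3 = key
	 * (see the prototype comments below).  \pn selects the p64 or p8
	 * multiply/reduce primitives, \enc optionally names an enc_/dec_
	 * macro so the GCM entry points can interleave en/decryption with
	 * the hashing, \aggregate enables the 4-way aggregated path once the
	 * remaining block count is a multiple of 4, and \head processes an
	 * optional head block whose pointer is passed on the stack.
	 */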
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
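	/*
	 * The loads below expect the hash key material at [r3]: SHASH first,
	 * followed by HH and HH3-HH4, which the aggregated path uses as
	 * higher powers of H.  The exact layout is defined by the ghash_key
	 * structure on the C side.
	 */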
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

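	/*
	 * round applies a single AES round (AESE + AESMC) with round key \rk
	 * to each register in \regs.  aes_encrypt runs the whole schedule
	 * loaded from [\rkp] over the given blocks, using \rounds to pick
	 * the AES-128/192/256 path, and finishes with the final AESE plus
	 * AddRoundKey (veor), as the last round has no MixColumns.
	 */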
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

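	/*
	 * The helpers below build CTR-mode input block(s) from the 12 byte
	 * IV at [r5] and the 32-bit counter in r7 (byte swapped into the
	 * last word of the block), encrypt them with the AES round keys at
	 * r3 + 64, and return the keystream in e0 (e0-e3 for the 4x
	 * variant).  The single and 4x variants advance r7 past the counter
	 * values they consume; the _final variant additionally encrypts
	 * counter value 1 into e1 for use in computing the tag.
	 */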
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

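	/*
	 * Combined en/decryption helpers invoked from ghash_update via the
	 * \enc parameter: generate keystream using the routines above, XOR
	 * it with the block(s) already loaded from the source, and store the
	 * result at the destination pointer in r4.  The dec_ variants XOR
	 * into the keystream registers instead, so the original ciphertext
	 * remains available for the GHASH computation.
	 */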
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
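	/*
	 * The first four arguments arrive in r0-r3; dst, iv, rounds and
	 * counter are fetched from the stack (offsets 24 and 32 account for
	 * the six registers pushed on entry).  The same argument layout is
	 * used by pmull_gcm_decrypt and the *_final entry points, and all of
	 * them assume the AES round keys start at offset 64 of the gcm_key
	 * structure (see pmull_aes_encrypt above).
	 */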
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
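	/*
	 * Byte-select table used by the *_final routines to deal with a
	 * partial tail block: 16 bytes of 0xff, the identity permutation
	 * 0..15, then 16 more bytes of 0xff.  Loading 16 bytes at an offset
	 * into this table yields a vtbl index vector whose out-of-range 0xff
	 * entries produce zero bytes, so only the bytes that belong to the
	 * tail are kept.
	 */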
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff