/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
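
	//
	// __pmull_p64/__pmull2_p64 multiply two 64-bit polynomials over
	// GF(2) into a 128-bit product, taken from the low resp. high
	// halves of the source registers. The callers below combine three
	// such multiplies into one 128x128 bit carry-less multiply,
	// Karatsuba style. Illustrative sketch only, with a = a1:a0 and
	// b = b1:b0 split into 64-bit halves:
	//
	//	a * b = (a1*b1) << 128 ^ m << 64 ^ (a0*b0)
	//	m     = (a1 ^ a0) * (b1 ^ b0) ^ a1*b1 ^ a0*b0
	//
	// which is why every block multiply in this file issues exactly
	// three PMULLs, annotated "a1 * b1", "a0 * b0" and
	// "(a1 + a0)(b1 + b0)".
	//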

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
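
	//
	// The __pmull*_p8 macros above synthesize a 64x64 bit carry-less
	// multiply out of 8x8 bit PMULL operations, for cores that only
	// implement the 8-bit form. Rough idea (illustration only): with B
	// the loop-invariant hash key, B1..B4 are byte rotations of B
	// prepared in __pmull_pre_p8, A1..A3 are byte rotations of the
	// per-block operand A, and D, E, F, ... K above are byte-wise
	// partial products; their sums, masked with k00_16/k32_48 and
	// shifted into position by the ext #15/#14/#13/#12 sequence,
	// reassemble the full 64x64 bit product in \rq.
	//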

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
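
	//
	// __pmull_pre_p64 expects the caller to have precomputed H^2..H^4
	// at x3 + 16, and folds the Karatsuba middle-term factors up
	// front: after the trn1/trn2/eor sequences,
	//
	//	SHASH2 = { H.lo ^ H.hi,         (H^2).lo ^ (H^2).hi }
	//	HH34   = { (H^3).lo ^ (H^3).hi, (H^4).lo ^ (H^4).hi }
	//
	// so the "(a1 + a0)(b1 + b0)" multiplies in the 4-way loop can be
	// issued as pmull/pmull2 on a single register. MASK holds the
	// reduction constant 0xe1 << 57 (see __pmull_reduce_p64 below).
	//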

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm
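
	//
	// perm1/perm2/perm3 end up as tbl index vectors that rotate each
	// 64-bit lane of a register by 1, 2 and 3 bytes, mirroring the
	// ext #1/#2/#3 used in the non-pmull2 variant but operating on
	// both lanes at once. sh1..sh4 and ss1..ss4 are the matching byte
	// rotations of SHASH and SHASH2, precomputed here because the hash
	// key is loop invariant.
	//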

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm
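
	//
	// Background: GHASH multiplies in GF(2^128) modulo
	// x^128 + x^7 + x^2 + x + 1, in the bit-reflected representation
	// that GCM mandates. MASK = 0xe1 << 57 (i.e. 0xc2 in the top byte
	// of each doubleword) is the standard constant for folding the
	// upper 128 bits of the 256-bit product back into the lower 128
	// bits using two more PMULLs, which is what this macro does.
	//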

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm
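
	//
	// Roughly speaking, this is the same reduction as above with the
	// multiplications by the low terms of the field polynomial
	// (x^7 + x^2 + x) spelled out as the shl #57/#62/#63 and
	// ushr #1/#6 shift-and-XOR steps, so no 64x64 bit PMULL is needed.
	//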

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

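	//
	// 4-way aggregated GHASH: with four input blocks C1..C4 and X the
	// current accumulator, each iteration of the loop below computes
	//
	//	X := (X ^ C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
	//
	// accumulating the per-block products into XL2/XM2/XH2 and paying
	// for only one reduction per four blocks.
	//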
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm
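
	//
	// The 2:/3:/4: path above is the generic single-block update
	// X := (X ^ C) * H, i.e. one Karatsuba multiply plus one reduction
	// per 16-byte block. The p64 variant only takes it for leading
	// blocks, until the remaining count is a multiple of four (see the
	// tbnz pair at label 0:); the p8 variant uses it for every block.
	//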

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)
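
	//
	// Both entry points implement the same update (x0 = #blocks,
	// x1 = dg, x2 = src, x3 = key, x4 = optional head block): _p64
	// assumes the full 64x64 bit PMULL instruction, _p8 only the 8-bit
	// form. The C caller is expected to pick the variant that matches
	// the CPU's crypto extensions.
	//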

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm
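
	//
	// K0..K5 always hold the first six AES round keys and KK/KL/KM the
	// last three (loaded from \rk + \rounds * 16 - 32), so the same
	// final-round sequence works for 10, 12 or 14 rounds. K6..K9 are
	// reloaded on demand by enc_block/pmull_gcm_enc_4x, presumably
	// because there are not enough spare vector registers to keep the
	// whole schedule live.
	//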

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm
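
	//
	// enc_round is one ordinary AES round: aese does AddRoundKey (with
	// \key), SubBytes and ShiftRows, and aesmc does MixColumns.
	// enc_qround applies the same round to four states so the four
	// keystream blocks proceed through the rounds in lockstep.
	//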

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm
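
	//
	// enc_block dispatches on the round count (10/12/14): bit 2 of
	// \rounds is only set for 192/256 bit keys, and bit 1 then
	// distinguishes 14 rounds (256) from 12 (192), which is what the
	// tbnz/tbz tests above encode. The final round is the bare
	// aese + eor with KM, i.e. AddRoundKey without MixColumns, as the
	// AES specification requires.
	//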

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
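	/*
	 * Worked example (illustration only): with 47 bytes left, the
	 * final block holds x19 = 47 % 16 = 15 bytes, so x11 = 1 and the
	 * last load starts one byte early. The permute vector T1, loaded
	 * from .Lpermute_table + 16 + 1, then shifts that overlapping
	 * 16-byte load so the 15 valid bytes land at the start of INP3
	 * with the tail zeroed. x14/x15/x16 come out as 0/16/15, INP0 is a
	 * don't-care duplicate (only w9 = 3 blocks are consumed), and the
	 * src/dst pointers are only rewound when a single block remains.
	 */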
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b
	st1		{XL.16b}, [x10]			// store tag

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm
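
	//
	// Register/stack usage as consumed by the macro above: x0 = #bytes,
	// x1 = dst, x2 = src, x3 = key (H followed by H^2..H^4), x4 = dg,
	// x5 = counter block, x6 = AES round keys, x7 = #rounds, and the
	// tag pointer comes from the first stack argument (read from
	// [sp, #32] once the 32-byte frame has been pushed). A NULL tag
	// pointer means "update dg and the counter only" (the 5: path).
	//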

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
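
	//
	// pmull_gcm_ghash_4x folds w9 (1..4) blocks from INP0..INP3 into
	// the running digest using the same aggregation as the update loop
	// above: X := (X ^ B1)*H^w9 ^ B2*H^(w9-1) ^ ... ^ Bw9*H, with one
	// reduction at the end. The blocks are right-aligned (the last one
	// always in INP3), and for w9 < 4 the 0:/1:/2: stubs enter the
	// chain at the matching power of H.
	//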

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)
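
	//
	// pmull_gcm_enc_4x produces four blocks of AES-CTR keystream and
	// XORs them into INP0..INP3 in place. On entry w8 already holds
	// the lower counter word incremented past this batch, so the four
	// per-block counters are w8-4 .. w8-1, byte-swapped into the
	// big-endian form GCM uses before being inserted into KS0..KS3.
	// For a short final batch only the last w9 keystream blocks end up
	// being used, matching the right-aligned data layout.
	//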

	.section	".rodata", "a"
	.align		6
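	/*
	 * Sliding window over { 16 x 0xff, bytes 0..15, 16 x 0xff,
	 * bytes 0..15 }: loading a tbl/tbx index vector at an offset into
	 * .Lpermute_table selects a shifted view of the identity
	 * permutation, with the 0xff entries yielding zero bytes for tbl
	 * and leaving the destination untouched for tbx. Used above to
	 * shift and mask partial blocks.
	 */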
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous