/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Select an architecture/FPU combination that makes the assembler
	 * accept the vmull.p64 and aese/aesmc instructions used below.
	 */
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

1562306a36Sopenharmony_ci	SHASH		.req	q0
1662306a36Sopenharmony_ci	T1		.req	q1
1762306a36Sopenharmony_ci	XL		.req	q2
1862306a36Sopenharmony_ci	XM		.req	q3
1962306a36Sopenharmony_ci	XH		.req	q4
2062306a36Sopenharmony_ci	IN1		.req	q4
2162306a36Sopenharmony_ci
2262306a36Sopenharmony_ci	SHASH_L		.req	d0
2362306a36Sopenharmony_ci	SHASH_H		.req	d1
2462306a36Sopenharmony_ci	T1_L		.req	d2
2562306a36Sopenharmony_ci	T1_H		.req	d3
2662306a36Sopenharmony_ci	XL_L		.req	d4
2762306a36Sopenharmony_ci	XL_H		.req	d5
2862306a36Sopenharmony_ci	XM_L		.req	d6
2962306a36Sopenharmony_ci	XM_H		.req	d7
3062306a36Sopenharmony_ci	XH_L		.req	d8
3162306a36Sopenharmony_ci
3262306a36Sopenharmony_ci	t0l		.req	d10
3362306a36Sopenharmony_ci	t0h		.req	d11
3462306a36Sopenharmony_ci	t1l		.req	d12
3562306a36Sopenharmony_ci	t1h		.req	d13
3662306a36Sopenharmony_ci	t2l		.req	d14
3762306a36Sopenharmony_ci	t2h		.req	d15
3862306a36Sopenharmony_ci	t3l		.req	d16
3962306a36Sopenharmony_ci	t3h		.req	d17
4062306a36Sopenharmony_ci	t4l		.req	d18
4162306a36Sopenharmony_ci	t4h		.req	d19
4262306a36Sopenharmony_ci
4362306a36Sopenharmony_ci	t0q		.req	q5
4462306a36Sopenharmony_ci	t1q		.req	q6
4562306a36Sopenharmony_ci	t2q		.req	q7
4662306a36Sopenharmony_ci	t3q		.req	q8
4762306a36Sopenharmony_ci	t4q		.req	q9
4862306a36Sopenharmony_ci	XH2		.req	q9
4962306a36Sopenharmony_ci
5062306a36Sopenharmony_ci	s1l		.req	d20
5162306a36Sopenharmony_ci	s1h		.req	d21
5262306a36Sopenharmony_ci	s2l		.req	d22
5362306a36Sopenharmony_ci	s2h		.req	d23
5462306a36Sopenharmony_ci	s3l		.req	d24
5562306a36Sopenharmony_ci	s3h		.req	d25
5662306a36Sopenharmony_ci	s4l		.req	d26
5762306a36Sopenharmony_ci	s4h		.req	d27
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci	MASK		.req	d28
6062306a36Sopenharmony_ci	SHASH2_p8	.req	d28
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_ci	k16		.req	d29
6362306a36Sopenharmony_ci	k32		.req	d30
6462306a36Sopenharmony_ci	k48		.req	d31
6562306a36Sopenharmony_ci	SHASH2_p64	.req	d31
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci	HH		.req	q10
6862306a36Sopenharmony_ci	HH3		.req	q11
6962306a36Sopenharmony_ci	HH4		.req	q12
7062306a36Sopenharmony_ci	HH34		.req	q13
7162306a36Sopenharmony_ci
7262306a36Sopenharmony_ci	HH_L		.req	d20
7362306a36Sopenharmony_ci	HH_H		.req	d21
7462306a36Sopenharmony_ci	HH3_L		.req	d22
7562306a36Sopenharmony_ci	HH3_H		.req	d23
7662306a36Sopenharmony_ci	HH4_L		.req	d24
7762306a36Sopenharmony_ci	HH4_H		.req	d25
7862306a36Sopenharmony_ci	HH34_L		.req	d26
7962306a36Sopenharmony_ci	HH34_H		.req	d27
8062306a36Sopenharmony_ci	SHASH2_H	.req	d29
8162306a36Sopenharmony_ci
8262306a36Sopenharmony_ci	XL2		.req	q5
8362306a36Sopenharmony_ci	XM2		.req	q6
8462306a36Sopenharmony_ci	T2		.req	q7
8562306a36Sopenharmony_ci	T3		.req	q8
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci	XL2_L		.req	d10
8862306a36Sopenharmony_ci	XL2_H		.req	d11
8962306a36Sopenharmony_ci	XM2_L		.req	d12
9062306a36Sopenharmony_ci	XM2_H		.req	d13
9162306a36Sopenharmony_ci	T3_L		.req	d16
9262306a36Sopenharmony_ci	T3_H		.req	d17
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_ci	.text
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
9762306a36Sopenharmony_ci	vmull.p64	\rd, \rn, \rm
9862306a36Sopenharmony_ci	.endm
9962306a36Sopenharmony_ci
10062306a36Sopenharmony_ci	/*
10162306a36Sopenharmony_ci	 * This implementation of 64x64 -> 128 bit polynomial multiplication
10262306a36Sopenharmony_ci	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
10362306a36Sopenharmony_ci	 * "Fast Software Polynomial Multiplication on ARM Processors Using
10462306a36Sopenharmony_ci	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
10562306a36Sopenharmony_ci	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
10662306a36Sopenharmony_ci	 *
10762306a36Sopenharmony_ci	 * It has been slightly tweaked for in-order performance, and to allow
10862306a36Sopenharmony_ci	 * 'rq' to overlap with 'ad' or 'bd'.
10962306a36Sopenharmony_ci	 */
11062306a36Sopenharmony_ci	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
11162306a36Sopenharmony_ci	vext.8		t0l, \ad, \ad, #1	@ A1
11262306a36Sopenharmony_ci	.ifc		\b1, t4l
11362306a36Sopenharmony_ci	vext.8		t4l, \bd, \bd, #1	@ B1
11462306a36Sopenharmony_ci	.endif
11562306a36Sopenharmony_ci	vmull.p8	t0q, t0l, \bd		@ F = A1*B
11662306a36Sopenharmony_ci	vext.8		t1l, \ad, \ad, #2	@ A2
11762306a36Sopenharmony_ci	vmull.p8	t4q, \ad, \b1		@ E = A*B1
11862306a36Sopenharmony_ci	.ifc		\b2, t3l
11962306a36Sopenharmony_ci	vext.8		t3l, \bd, \bd, #2	@ B2
12062306a36Sopenharmony_ci	.endif
12162306a36Sopenharmony_ci	vmull.p8	t1q, t1l, \bd		@ H = A2*B
12262306a36Sopenharmony_ci	vext.8		t2l, \ad, \ad, #3	@ A3
12362306a36Sopenharmony_ci	vmull.p8	t3q, \ad, \b2		@ G = A*B2
12462306a36Sopenharmony_ci	veor		t0q, t0q, t4q		@ L = E + F
12562306a36Sopenharmony_ci	.ifc		\b3, t4l
12662306a36Sopenharmony_ci	vext.8		t4l, \bd, \bd, #3	@ B3
12762306a36Sopenharmony_ci	.endif
12862306a36Sopenharmony_ci	vmull.p8	t2q, t2l, \bd		@ J = A3*B
12962306a36Sopenharmony_ci	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
13062306a36Sopenharmony_ci	veor		t1q, t1q, t3q		@ M = G + H
13162306a36Sopenharmony_ci	.ifc		\b4, t3l
13262306a36Sopenharmony_ci	vext.8		t3l, \bd, \bd, #4	@ B4
13362306a36Sopenharmony_ci	.endif
13462306a36Sopenharmony_ci	vmull.p8	t4q, \ad, \b3		@ I = A*B3
13562306a36Sopenharmony_ci	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
13662306a36Sopenharmony_ci	vmull.p8	t3q, \ad, \b4		@ K = A*B4
13762306a36Sopenharmony_ci	vand		t0h, t0h, k48
13862306a36Sopenharmony_ci	vand		t1h, t1h, k32
13962306a36Sopenharmony_ci	veor		t2q, t2q, t4q		@ N = I + J
14062306a36Sopenharmony_ci	veor		t0l, t0l, t0h
14162306a36Sopenharmony_ci	veor		t1l, t1l, t1h
14262306a36Sopenharmony_ci	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
14362306a36Sopenharmony_ci	vand		t2h, t2h, k16
14462306a36Sopenharmony_ci	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
14562306a36Sopenharmony_ci	vmov.i64	t3h, #0
14662306a36Sopenharmony_ci	vext.8		t0q, t0q, t0q, #15
14762306a36Sopenharmony_ci	veor		t2l, t2l, t2h
14862306a36Sopenharmony_ci	vext.8		t1q, t1q, t1q, #14
14962306a36Sopenharmony_ci	vmull.p8	\rq, \ad, \bd		@ D = A*B
15062306a36Sopenharmony_ci	vext.8		t2q, t2q, t2q, #13
15162306a36Sopenharmony_ci	vext.8		t3q, t3q, t3q, #12
15262306a36Sopenharmony_ci	veor		t0q, t0q, t1q
15362306a36Sopenharmony_ci	veor		t2q, t2q, t3q
15462306a36Sopenharmony_ci	veor		\rq, \rq, t0q
15562306a36Sopenharmony_ci	veor		\rq, \rq, t2q
15662306a36Sopenharmony_ci	.endm
15762306a36Sopenharmony_ci
15862306a36Sopenharmony_ci	//
15962306a36Sopenharmony_ci	// PMULL (64x64->128) based reduction for CPUs that can do
16062306a36Sopenharmony_ci	// it in a single instruction.
16162306a36Sopenharmony_ci	//
16262306a36Sopenharmony_ci	.macro		__pmull_reduce_p64
16362306a36Sopenharmony_ci	vmull.p64	T1, XL_L, MASK
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_ci	veor		XH_L, XH_L, XM_H
16662306a36Sopenharmony_ci	vext.8		T1, T1, T1, #8
16762306a36Sopenharmony_ci	veor		XL_H, XL_H, XM_L
16862306a36Sopenharmony_ci	veor		T1, T1, XL
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	vmull.p64	XL, T1_H, MASK
17162306a36Sopenharmony_ci	.endm
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	//
17462306a36Sopenharmony_ci	// Alternative reduction for CPUs that lack support for the
17562306a36Sopenharmony_ci	// 64x64->128 PMULL instruction
17662306a36Sopenharmony_ci	//
17762306a36Sopenharmony_ci	.macro		__pmull_reduce_p8
17862306a36Sopenharmony_ci	veor		XL_H, XL_H, XM_L
17962306a36Sopenharmony_ci	veor		XH_L, XH_L, XM_H
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	vshl.i64	T1, XL, #57
18262306a36Sopenharmony_ci	vshl.i64	T2, XL, #62
18362306a36Sopenharmony_ci	veor		T1, T1, T2
18462306a36Sopenharmony_ci	vshl.i64	T2, XL, #63
18562306a36Sopenharmony_ci	veor		T1, T1, T2
18662306a36Sopenharmony_ci	veor		XL_H, XL_H, T1_L
18762306a36Sopenharmony_ci	veor		XH_L, XH_L, T1_H
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci	vshr.u64	T1, XL, #1
19062306a36Sopenharmony_ci	veor		XH, XH, XL
19162306a36Sopenharmony_ci	veor		XL, XL, T1
19262306a36Sopenharmony_ci	vshr.u64	T1, T1, #6
19362306a36Sopenharmony_ci	vshr.u64	XL, XL, #1
19462306a36Sopenharmony_ci	.endm
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci	.macro		ghash_update, pn, enc, aggregate=1, head=1
19762306a36Sopenharmony_ci	vld1.64		{XL}, [r1]
19862306a36Sopenharmony_ci
19962306a36Sopenharmony_ci	.if		\head
20062306a36Sopenharmony_ci	/* do the head block first, if supplied */
20162306a36Sopenharmony_ci	ldr		ip, [sp]
20262306a36Sopenharmony_ci	teq		ip, #0
20362306a36Sopenharmony_ci	beq		0f
20462306a36Sopenharmony_ci	vld1.64		{T1}, [ip]
20562306a36Sopenharmony_ci	teq		r0, #0
20662306a36Sopenharmony_ci	b		3f
20762306a36Sopenharmony_ci	.endif
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci0:	.ifc		\pn, p64
21062306a36Sopenharmony_ci	.if		\aggregate
21162306a36Sopenharmony_ci	tst		r0, #3			// skip until #blocks is a
21262306a36Sopenharmony_ci	bne		2f			// round multiple of 4
21362306a36Sopenharmony_ci
21462306a36Sopenharmony_ci	vld1.8		{XL2-XM2}, [r2]!
21562306a36Sopenharmony_ci1:	vld1.8		{T2-T3}, [r2]!
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	.ifnb		\enc
21862306a36Sopenharmony_ci	\enc\()_4x	XL2, XM2, T2, T3
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci	add		ip, r3, #16
22162306a36Sopenharmony_ci	vld1.64		{HH}, [ip, :128]!
22262306a36Sopenharmony_ci	vld1.64		{HH3-HH4}, [ip, :128]
22362306a36Sopenharmony_ci
22462306a36Sopenharmony_ci	veor		SHASH2_p64, SHASH_L, SHASH_H
22562306a36Sopenharmony_ci	veor		SHASH2_H, HH_L, HH_H
22662306a36Sopenharmony_ci	veor		HH34_L, HH3_L, HH3_H
22762306a36Sopenharmony_ci	veor		HH34_H, HH4_L, HH4_H
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci	vmov.i8		MASK, #0xe1
23062306a36Sopenharmony_ci	vshl.u64	MASK, MASK, #57
23162306a36Sopenharmony_ci	.endif
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	vrev64.8	XL2, XL2
23462306a36Sopenharmony_ci	vrev64.8	XM2, XM2
23562306a36Sopenharmony_ci
23662306a36Sopenharmony_ci	subs		r0, r0, #4
23762306a36Sopenharmony_ci
23862306a36Sopenharmony_ci	vext.8		T1, XL2, XL2, #8
23962306a36Sopenharmony_ci	veor		XL2_H, XL2_H, XL_L
24062306a36Sopenharmony_ci	veor		XL, XL, T1
24162306a36Sopenharmony_ci
24262306a36Sopenharmony_ci	vrev64.8	T1, T3
24362306a36Sopenharmony_ci	vrev64.8	T3, T2
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
24662306a36Sopenharmony_ci	veor		XL2_H, XL2_H, XL_H
24762306a36Sopenharmony_ci	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
24862306a36Sopenharmony_ci	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_ci	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
25162306a36Sopenharmony_ci	veor		XM2_L, XM2_L, XM2_H
25262306a36Sopenharmony_ci	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
25362306a36Sopenharmony_ci	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci	veor		XH, XH, XH2
25662306a36Sopenharmony_ci	veor		XL, XL, XL2
25762306a36Sopenharmony_ci	veor		XM, XM, XM2
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
26062306a36Sopenharmony_ci	veor		T3_L, T3_L, T3_H
26162306a36Sopenharmony_ci	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
26262306a36Sopenharmony_ci	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci	veor		XH, XH, XH2
26562306a36Sopenharmony_ci	veor		XL, XL, XL2
26662306a36Sopenharmony_ci	veor		XM, XM, XM2
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
26962306a36Sopenharmony_ci	veor		T1_L, T1_L, T1_H
27062306a36Sopenharmony_ci	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
27162306a36Sopenharmony_ci	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci	veor		XH, XH, XH2
27462306a36Sopenharmony_ci	veor		XL, XL, XL2
27562306a36Sopenharmony_ci	veor		XM, XM, XM2
27662306a36Sopenharmony_ci
27762306a36Sopenharmony_ci	beq		4f
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	vld1.8		{XL2-XM2}, [r2]!
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_ci	veor		T1, XL, XH
28262306a36Sopenharmony_ci	veor		XM, XM, T1
28362306a36Sopenharmony_ci
28462306a36Sopenharmony_ci	__pmull_reduce_p64
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	veor		T1, T1, XH
28762306a36Sopenharmony_ci	veor		XL, XL, T1
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	b		1b
29062306a36Sopenharmony_ci	.endif
29162306a36Sopenharmony_ci	.endif
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci2:	vld1.8		{T1}, [r2]!
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	.ifnb		\enc
29662306a36Sopenharmony_ci	\enc\()_1x	T1
29762306a36Sopenharmony_ci	veor		SHASH2_p64, SHASH_L, SHASH_H
29862306a36Sopenharmony_ci	vmov.i8		MASK, #0xe1
29962306a36Sopenharmony_ci	vshl.u64	MASK, MASK, #57
30062306a36Sopenharmony_ci	.endif
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	subs		r0, r0, #1
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_ci3:	/* multiply XL by SHASH in GF(2^128) */
30562306a36Sopenharmony_ci	vrev64.8	T1, T1
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci	vext.8		IN1, T1, T1, #8
30862306a36Sopenharmony_ci	veor		T1_L, T1_L, XL_H
30962306a36Sopenharmony_ci	veor		XL, XL, IN1
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
31262306a36Sopenharmony_ci	veor		T1, T1, XL
31362306a36Sopenharmony_ci	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
31462306a36Sopenharmony_ci	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
31562306a36Sopenharmony_ci
31662306a36Sopenharmony_ci4:	veor		T1, XL, XH
31762306a36Sopenharmony_ci	veor		XM, XM, T1
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_ci	__pmull_reduce_\pn
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ci	veor		T1, T1, XH
32262306a36Sopenharmony_ci	veor		XL, XL, T1
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci	bne		0b
32562306a36Sopenharmony_ci	.endm
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	/*
32862306a36Sopenharmony_ci	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
32962306a36Sopenharmony_ci	 *			   struct ghash_key const *k, const char *head)
33062306a36Sopenharmony_ci	 */
33162306a36Sopenharmony_ciENTRY(pmull_ghash_update_p64)
33262306a36Sopenharmony_ci	vld1.64		{SHASH}, [r3]!
33362306a36Sopenharmony_ci	vld1.64		{HH}, [r3]!
33462306a36Sopenharmony_ci	vld1.64		{HH3-HH4}, [r3]
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_ci	veor		SHASH2_p64, SHASH_L, SHASH_H
33762306a36Sopenharmony_ci	veor		SHASH2_H, HH_L, HH_H
33862306a36Sopenharmony_ci	veor		HH34_L, HH3_L, HH3_H
33962306a36Sopenharmony_ci	veor		HH34_H, HH4_L, HH4_H
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	vmov.i8		MASK, #0xe1
34262306a36Sopenharmony_ci	vshl.u64	MASK, MASK, #57
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_ci	ghash_update	p64
34562306a36Sopenharmony_ci	vst1.64		{XL}, [r1]
34662306a36Sopenharmony_ci
34762306a36Sopenharmony_ci	bx		lr
34862306a36Sopenharmony_ciENDPROC(pmull_ghash_update_p64)
34962306a36Sopenharmony_ci
35062306a36Sopenharmony_ciENTRY(pmull_ghash_update_p8)
35162306a36Sopenharmony_ci	vld1.64		{SHASH}, [r3]
35262306a36Sopenharmony_ci	veor		SHASH2_p8, SHASH_L, SHASH_H
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	vext.8		s1l, SHASH_L, SHASH_L, #1
35562306a36Sopenharmony_ci	vext.8		s2l, SHASH_L, SHASH_L, #2
35662306a36Sopenharmony_ci	vext.8		s3l, SHASH_L, SHASH_L, #3
35762306a36Sopenharmony_ci	vext.8		s4l, SHASH_L, SHASH_L, #4
35862306a36Sopenharmony_ci	vext.8		s1h, SHASH_H, SHASH_H, #1
35962306a36Sopenharmony_ci	vext.8		s2h, SHASH_H, SHASH_H, #2
36062306a36Sopenharmony_ci	vext.8		s3h, SHASH_H, SHASH_H, #3
36162306a36Sopenharmony_ci	vext.8		s4h, SHASH_H, SHASH_H, #4
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci	vmov.i64	k16, #0xffff
36462306a36Sopenharmony_ci	vmov.i64	k32, #0xffffffff
36562306a36Sopenharmony_ci	vmov.i64	k48, #0xffffffffffff
36662306a36Sopenharmony_ci
36762306a36Sopenharmony_ci	ghash_update	p8
36862306a36Sopenharmony_ci	vst1.64		{XL}, [r1]
36962306a36Sopenharmony_ci
37062306a36Sopenharmony_ci	bx		lr
37162306a36Sopenharmony_ciENDPROC(pmull_ghash_update_p8)
37262306a36Sopenharmony_ci
37362306a36Sopenharmony_ci	e0		.req	q9
37462306a36Sopenharmony_ci	e1		.req	q10
37562306a36Sopenharmony_ci	e2		.req	q11
37662306a36Sopenharmony_ci	e3		.req	q12
37762306a36Sopenharmony_ci	e0l		.req	d18
37862306a36Sopenharmony_ci	e0h		.req	d19
37962306a36Sopenharmony_ci	e2l		.req	d22
38062306a36Sopenharmony_ci	e2h		.req	d23
38162306a36Sopenharmony_ci	e3l		.req	d24
38262306a36Sopenharmony_ci	e3h		.req	d25
38362306a36Sopenharmony_ci	ctr		.req	q13
38462306a36Sopenharmony_ci	ctr0		.req	d26
38562306a36Sopenharmony_ci	ctr1		.req	d27
38662306a36Sopenharmony_ci
38762306a36Sopenharmony_ci	ek0		.req	q14
38862306a36Sopenharmony_ci	ek1		.req	q15
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci	.macro		round, rk:req, regs:vararg
39162306a36Sopenharmony_ci	.irp		r, \regs
39262306a36Sopenharmony_ci	aese.8		\r, \rk
39362306a36Sopenharmony_ci	aesmc.8		\r, \r
39462306a36Sopenharmony_ci	.endr
39562306a36Sopenharmony_ci	.endm
39662306a36Sopenharmony_ci
39762306a36Sopenharmony_ci	.macro		aes_encrypt, rkp, rounds, regs:vararg
39862306a36Sopenharmony_ci	vld1.8		{ek0-ek1}, [\rkp, :128]!
39962306a36Sopenharmony_ci	cmp		\rounds, #12
40062306a36Sopenharmony_ci	blt		.L\@			// AES-128
40162306a36Sopenharmony_ci
40262306a36Sopenharmony_ci	round		ek0, \regs
40362306a36Sopenharmony_ci	vld1.8		{ek0}, [\rkp, :128]!
40462306a36Sopenharmony_ci	round		ek1, \regs
40562306a36Sopenharmony_ci	vld1.8		{ek1}, [\rkp, :128]!
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci	beq		.L\@			// AES-192
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	round		ek0, \regs
41062306a36Sopenharmony_ci	vld1.8		{ek0}, [\rkp, :128]!
41162306a36Sopenharmony_ci	round		ek1, \regs
41262306a36Sopenharmony_ci	vld1.8		{ek1}, [\rkp, :128]!
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_ci.L\@:	.rept		4
41562306a36Sopenharmony_ci	round		ek0, \regs
41662306a36Sopenharmony_ci	vld1.8		{ek0}, [\rkp, :128]!
41762306a36Sopenharmony_ci	round		ek1, \regs
41862306a36Sopenharmony_ci	vld1.8		{ek1}, [\rkp, :128]!
41962306a36Sopenharmony_ci	.endr
42062306a36Sopenharmony_ci
42162306a36Sopenharmony_ci	round		ek0, \regs
42262306a36Sopenharmony_ci	vld1.8		{ek0}, [\rkp, :128]
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	.irp		r, \regs
42562306a36Sopenharmony_ci	aese.8		\r, ek1
42662306a36Sopenharmony_ci	.endr
42762306a36Sopenharmony_ci	.irp		r, \regs
42862306a36Sopenharmony_ci	veor		\r, \r, ek0
42962306a36Sopenharmony_ci	.endr
43062306a36Sopenharmony_ci	.endm
43162306a36Sopenharmony_ci
43262306a36Sopenharmony_cipmull_aes_encrypt:
43362306a36Sopenharmony_ci	add		ip, r5, #4
43462306a36Sopenharmony_ci	vld1.8		{ctr0}, [r5]		// load 12 byte IV
43562306a36Sopenharmony_ci	vld1.8		{ctr1}, [ip]
43662306a36Sopenharmony_ci	rev		r8, r7
43762306a36Sopenharmony_ci	vext.8		ctr1, ctr1, ctr1, #4
43862306a36Sopenharmony_ci	add		r7, r7, #1
43962306a36Sopenharmony_ci	vmov.32		ctr1[1], r8
44062306a36Sopenharmony_ci	vmov		e0, ctr
44162306a36Sopenharmony_ci
44262306a36Sopenharmony_ci	add		ip, r3, #64
44362306a36Sopenharmony_ci	aes_encrypt	ip, r6, e0
44462306a36Sopenharmony_ci	bx		lr
44562306a36Sopenharmony_ciENDPROC(pmull_aes_encrypt)
44662306a36Sopenharmony_ci
44762306a36Sopenharmony_cipmull_aes_encrypt_4x:
44862306a36Sopenharmony_ci	add		ip, r5, #4
44962306a36Sopenharmony_ci	vld1.8		{ctr0}, [r5]
45062306a36Sopenharmony_ci	vld1.8		{ctr1}, [ip]
45162306a36Sopenharmony_ci	rev		r8, r7
45262306a36Sopenharmony_ci	vext.8		ctr1, ctr1, ctr1, #4
45362306a36Sopenharmony_ci	add		r7, r7, #1
45462306a36Sopenharmony_ci	vmov.32		ctr1[1], r8
45562306a36Sopenharmony_ci	rev		ip, r7
45662306a36Sopenharmony_ci	vmov		e0, ctr
45762306a36Sopenharmony_ci	add		r7, r7, #1
45862306a36Sopenharmony_ci	vmov.32		ctr1[1], ip
45962306a36Sopenharmony_ci	rev		r8, r7
46062306a36Sopenharmony_ci	vmov		e1, ctr
46162306a36Sopenharmony_ci	add		r7, r7, #1
46262306a36Sopenharmony_ci	vmov.32		ctr1[1], r8
46362306a36Sopenharmony_ci	rev		ip, r7
46462306a36Sopenharmony_ci	vmov		e2, ctr
46562306a36Sopenharmony_ci	add		r7, r7, #1
46662306a36Sopenharmony_ci	vmov.32		ctr1[1], ip
46762306a36Sopenharmony_ci	vmov		e3, ctr
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	add		ip, r3, #64
47062306a36Sopenharmony_ci	aes_encrypt	ip, r6, e0, e1, e2, e3
47162306a36Sopenharmony_ci	bx		lr
47262306a36Sopenharmony_ciENDPROC(pmull_aes_encrypt_4x)
47362306a36Sopenharmony_ci
47462306a36Sopenharmony_cipmull_aes_encrypt_final:
47562306a36Sopenharmony_ci	add		ip, r5, #4
47662306a36Sopenharmony_ci	vld1.8		{ctr0}, [r5]
47762306a36Sopenharmony_ci	vld1.8		{ctr1}, [ip]
47862306a36Sopenharmony_ci	rev		r8, r7
47962306a36Sopenharmony_ci	vext.8		ctr1, ctr1, ctr1, #4
48062306a36Sopenharmony_ci	mov		r7, #1 << 24		// BE #1 for the tag
48162306a36Sopenharmony_ci	vmov.32		ctr1[1], r8
48262306a36Sopenharmony_ci	vmov		e0, ctr
48362306a36Sopenharmony_ci	vmov.32		ctr1[1], r7
48462306a36Sopenharmony_ci	vmov		e1, ctr
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_ci	add		ip, r3, #64
48762306a36Sopenharmony_ci	aes_encrypt	ip, r6, e0, e1
48862306a36Sopenharmony_ci	bx		lr
48962306a36Sopenharmony_ciENDPROC(pmull_aes_encrypt_final)
49062306a36Sopenharmony_ci
49162306a36Sopenharmony_ci	.macro		enc_1x, in0
49262306a36Sopenharmony_ci	bl		pmull_aes_encrypt
49362306a36Sopenharmony_ci	veor		\in0, \in0, e0
49462306a36Sopenharmony_ci	vst1.8		{\in0}, [r4]!
49562306a36Sopenharmony_ci	.endm
49662306a36Sopenharmony_ci
49762306a36Sopenharmony_ci	.macro		dec_1x, in0
49862306a36Sopenharmony_ci	bl		pmull_aes_encrypt
49962306a36Sopenharmony_ci	veor		e0, e0, \in0
50062306a36Sopenharmony_ci	vst1.8		{e0}, [r4]!
50162306a36Sopenharmony_ci	.endm
50262306a36Sopenharmony_ci
50362306a36Sopenharmony_ci	.macro		enc_4x, in0, in1, in2, in3
50462306a36Sopenharmony_ci	bl		pmull_aes_encrypt_4x
50562306a36Sopenharmony_ci
50662306a36Sopenharmony_ci	veor		\in0, \in0, e0
50762306a36Sopenharmony_ci	veor		\in1, \in1, e1
50862306a36Sopenharmony_ci	veor		\in2, \in2, e2
50962306a36Sopenharmony_ci	veor		\in3, \in3, e3
51062306a36Sopenharmony_ci
51162306a36Sopenharmony_ci	vst1.8		{\in0-\in1}, [r4]!
51262306a36Sopenharmony_ci	vst1.8		{\in2-\in3}, [r4]!
51362306a36Sopenharmony_ci	.endm
51462306a36Sopenharmony_ci
51562306a36Sopenharmony_ci	.macro		dec_4x, in0, in1, in2, in3
51662306a36Sopenharmony_ci	bl		pmull_aes_encrypt_4x
51762306a36Sopenharmony_ci
51862306a36Sopenharmony_ci	veor		e0, e0, \in0
51962306a36Sopenharmony_ci	veor		e1, e1, \in1
52062306a36Sopenharmony_ci	veor		e2, e2, \in2
52162306a36Sopenharmony_ci	veor		e3, e3, \in3
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci	vst1.8		{e0-e1}, [r4]!
52462306a36Sopenharmony_ci	vst1.8		{e2-e3}, [r4]!
52562306a36Sopenharmony_ci	.endm
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci	/*
52862306a36Sopenharmony_ci	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
52962306a36Sopenharmony_ci	 *			  struct gcm_key const *k, char *dst,
53062306a36Sopenharmony_ci	 *			  char *iv, int rounds, u32 counter)
53162306a36Sopenharmony_ci	 */
53262306a36Sopenharmony_ciENTRY(pmull_gcm_encrypt)
53362306a36Sopenharmony_ci	push		{r4-r8, lr}
53462306a36Sopenharmony_ci	ldrd		r4, r5, [sp, #24]
53562306a36Sopenharmony_ci	ldrd		r6, r7, [sp, #32]
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci	vld1.64		{SHASH}, [r3]
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_ci	ghash_update	p64, enc, head=0
54062306a36Sopenharmony_ci	vst1.64		{XL}, [r1]
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci	pop		{r4-r8, pc}
54362306a36Sopenharmony_ciENDPROC(pmull_gcm_encrypt)
54462306a36Sopenharmony_ci
54562306a36Sopenharmony_ci	/*
54662306a36Sopenharmony_ci	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
54762306a36Sopenharmony_ci	 *			  struct gcm_key const *k, char *dst,
54862306a36Sopenharmony_ci	 *			  char *iv, int rounds, u32 counter)
54962306a36Sopenharmony_ci	 */
55062306a36Sopenharmony_ciENTRY(pmull_gcm_decrypt)
55162306a36Sopenharmony_ci	push		{r4-r8, lr}
55262306a36Sopenharmony_ci	ldrd		r4, r5, [sp, #24]
55362306a36Sopenharmony_ci	ldrd		r6, r7, [sp, #32]
55462306a36Sopenharmony_ci
55562306a36Sopenharmony_ci	vld1.64		{SHASH}, [r3]
55662306a36Sopenharmony_ci
55762306a36Sopenharmony_ci	ghash_update	p64, dec, head=0
55862306a36Sopenharmony_ci	vst1.64		{XL}, [r1]
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_ci	pop		{r4-r8, pc}
56162306a36Sopenharmony_ciENDPROC(pmull_gcm_decrypt)
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_ci	/*
56462306a36Sopenharmony_ci	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
56562306a36Sopenharmony_ci	 *			    struct gcm_key const *k, char *head,
56662306a36Sopenharmony_ci	 *			    char *iv, int rounds, u32 counter)
56762306a36Sopenharmony_ci	 */
56862306a36Sopenharmony_ciENTRY(pmull_gcm_enc_final)
56962306a36Sopenharmony_ci	push		{r4-r8, lr}
57062306a36Sopenharmony_ci	ldrd		r4, r5, [sp, #24]
57162306a36Sopenharmony_ci	ldrd		r6, r7, [sp, #32]
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci	bl		pmull_aes_encrypt_final
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci	cmp		r0, #0
57662306a36Sopenharmony_ci	beq		.Lenc_final
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci	mov_l		ip, .Lpermute
57962306a36Sopenharmony_ci	sub		r4, r4, #16
58062306a36Sopenharmony_ci	add		r8, ip, r0
58162306a36Sopenharmony_ci	add		ip, ip, #32
58262306a36Sopenharmony_ci	add		r4, r4, r0
58362306a36Sopenharmony_ci	sub		ip, ip, r0
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	vld1.8		{e3}, [r8]		// permute vector for key stream
58662306a36Sopenharmony_ci	vld1.8		{e2}, [ip]		// permute vector for ghash input
58762306a36Sopenharmony_ci
58862306a36Sopenharmony_ci	vtbl.8		e3l, {e0}, e3l
58962306a36Sopenharmony_ci	vtbl.8		e3h, {e0}, e3h
59062306a36Sopenharmony_ci
59162306a36Sopenharmony_ci	vld1.8		{e0}, [r4]		// encrypt tail block
59262306a36Sopenharmony_ci	veor		e0, e0, e3
59362306a36Sopenharmony_ci	vst1.8		{e0}, [r4]
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_ci	vtbl.8		T1_L, {e0}, e2l
59662306a36Sopenharmony_ci	vtbl.8		T1_H, {e0}, e2h
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	vld1.64		{XL}, [r1]
59962306a36Sopenharmony_ci.Lenc_final:
60062306a36Sopenharmony_ci	vld1.64		{SHASH}, [r3, :128]
60162306a36Sopenharmony_ci	vmov.i8		MASK, #0xe1
60262306a36Sopenharmony_ci	veor		SHASH2_p64, SHASH_L, SHASH_H
60362306a36Sopenharmony_ci	vshl.u64	MASK, MASK, #57
60462306a36Sopenharmony_ci	mov		r0, #1
60562306a36Sopenharmony_ci	bne		3f			// process head block first
60662306a36Sopenharmony_ci	ghash_update	p64, aggregate=0, head=0
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci	vrev64.8	XL, XL
60962306a36Sopenharmony_ci	vext.8		XL, XL, XL, #8
61062306a36Sopenharmony_ci	veor		XL, XL, e1
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci	sub		r2, r2, #16		// rewind src pointer
61362306a36Sopenharmony_ci	vst1.8		{XL}, [r2]		// store tag
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci	pop		{r4-r8, pc}
61662306a36Sopenharmony_ciENDPROC(pmull_gcm_enc_final)
61762306a36Sopenharmony_ci
61862306a36Sopenharmony_ci	/*
61962306a36Sopenharmony_ci	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
62062306a36Sopenharmony_ci	 *			   struct gcm_key const *k, char *head,
62162306a36Sopenharmony_ci	 *			   char *iv, int rounds, u32 counter,
62262306a36Sopenharmony_ci	 *			   const char *otag, int authsize)
62362306a36Sopenharmony_ci	 */
62462306a36Sopenharmony_ciENTRY(pmull_gcm_dec_final)
62562306a36Sopenharmony_ci	push		{r4-r8, lr}
62662306a36Sopenharmony_ci	ldrd		r4, r5, [sp, #24]
62762306a36Sopenharmony_ci	ldrd		r6, r7, [sp, #32]
62862306a36Sopenharmony_ci
62962306a36Sopenharmony_ci	bl		pmull_aes_encrypt_final
63062306a36Sopenharmony_ci
63162306a36Sopenharmony_ci	cmp		r0, #0
63262306a36Sopenharmony_ci	beq		.Ldec_final
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	mov_l		ip, .Lpermute
63562306a36Sopenharmony_ci	sub		r4, r4, #16
63662306a36Sopenharmony_ci	add		r8, ip, r0
63762306a36Sopenharmony_ci	add		ip, ip, #32
63862306a36Sopenharmony_ci	add		r4, r4, r0
63962306a36Sopenharmony_ci	sub		ip, ip, r0
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci	vld1.8		{e3}, [r8]		// permute vector for key stream
64262306a36Sopenharmony_ci	vld1.8		{e2}, [ip]		// permute vector for ghash input
64362306a36Sopenharmony_ci
64462306a36Sopenharmony_ci	vtbl.8		e3l, {e0}, e3l
64562306a36Sopenharmony_ci	vtbl.8		e3h, {e0}, e3h
64662306a36Sopenharmony_ci
64762306a36Sopenharmony_ci	vld1.8		{e0}, [r4]
64862306a36Sopenharmony_ci
64962306a36Sopenharmony_ci	vtbl.8		T1_L, {e0}, e2l
65062306a36Sopenharmony_ci	vtbl.8		T1_H, {e0}, e2h
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci	veor		e0, e0, e3
65362306a36Sopenharmony_ci	vst1.8		{e0}, [r4]
65462306a36Sopenharmony_ci
65562306a36Sopenharmony_ci	vld1.64		{XL}, [r1]
65662306a36Sopenharmony_ci.Ldec_final:
65762306a36Sopenharmony_ci	vld1.64		{SHASH}, [r3]
65862306a36Sopenharmony_ci	vmov.i8		MASK, #0xe1
65962306a36Sopenharmony_ci	veor		SHASH2_p64, SHASH_L, SHASH_H
66062306a36Sopenharmony_ci	vshl.u64	MASK, MASK, #57
66162306a36Sopenharmony_ci	mov		r0, #1
66262306a36Sopenharmony_ci	bne		3f			// process head block first
66362306a36Sopenharmony_ci	ghash_update	p64, aggregate=0, head=0
66462306a36Sopenharmony_ci
66562306a36Sopenharmony_ci	vrev64.8	XL, XL
66662306a36Sopenharmony_ci	vext.8		XL, XL, XL, #8
66762306a36Sopenharmony_ci	veor		XL, XL, e1
66862306a36Sopenharmony_ci
66962306a36Sopenharmony_ci	mov_l		ip, .Lpermute
67062306a36Sopenharmony_ci	ldrd		r2, r3, [sp, #40]	// otag and authsize
67162306a36Sopenharmony_ci	vld1.8		{T1}, [r2]
67262306a36Sopenharmony_ci	add		ip, ip, r3
67362306a36Sopenharmony_ci	vceq.i8		T1, T1, XL		// compare tags
67462306a36Sopenharmony_ci	vmvn		T1, T1			// 0 for eq, -1 for ne
67562306a36Sopenharmony_ci
67662306a36Sopenharmony_ci	vld1.8		{e0}, [ip]
67762306a36Sopenharmony_ci	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
67862306a36Sopenharmony_ci	vtbl.8		XL_H, {T1}, e0h
67962306a36Sopenharmony_ci
68062306a36Sopenharmony_ci	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
68162306a36Sopenharmony_ci	vpmin.s8	XL_L, XL_L, XL_L
68262306a36Sopenharmony_ci	vmov.32		r0, XL_L[0]		// fail if != 0x0
68362306a36Sopenharmony_ci
68462306a36Sopenharmony_ci	pop		{r4-r8, pc}
68562306a36Sopenharmony_ciENDPROC(pmull_gcm_dec_final)
68662306a36Sopenharmony_ci
68762306a36Sopenharmony_ci	.section	".rodata", "a", %progbits
68862306a36Sopenharmony_ci	.align		5
68962306a36Sopenharmony_ci.Lpermute:
69062306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
69162306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
69262306a36Sopenharmony_ci	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
69362306a36Sopenharmony_ci	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
69462306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
69562306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
696