/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */
762306a36Sopenharmony_ci
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	/*
	 * Register aliases used by the GHASH routines in this file.
	 * XH and IN1 are two names for the same register (v7).
	 */
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	/* constant masks built by __pmull_pre_p8, read by __pmull_p8_tail */
	k00_16		.req	v8
	k32_48		.req	v9

	/* scratch registers of the 8-bit polynomial multiply */
	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	/* tbl index vectors built by __pmull_pre_p8 */
	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	/* tbl-permuted copies of SHASH, computed once in __pmull_pre_p8 */
	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	/* ext-rotated copies of the low half of SHASH2 (__pmull_pre_p8) */
	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	/*
	 * Aliases used by the 64-bit PMULL code paths. They occupy v8-v19,
	 * i.e. the same registers as k00_16/k32_48, t3-t9 and perm1-perm3
	 * above, so the two sets cannot be live at the same time.
	 */
	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	.macro		__pmull_p64, rd, rn, rm
6462306a36Sopenharmony_ci	pmull		\rd\().1q, \rn\().1d, \rm\().1d
6562306a36Sopenharmony_ci	.endm
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_ci	.macro		__pmull2_p64, rd, rn, rm
6862306a36Sopenharmony_ci	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
6962306a36Sopenharmony_ci	.endm
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci	.macro		__pmull_p8, rq, ad, bd
7262306a36Sopenharmony_ci	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
7362306a36Sopenharmony_ci	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
7462306a36Sopenharmony_ci	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci	__pmull_p8_\bd	\rq, \ad
7762306a36Sopenharmony_ci	.endm
7862306a36Sopenharmony_ci
7962306a36Sopenharmony_ci	.macro		__pmull2_p8, rq, ad, bd
8062306a36Sopenharmony_ci	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
8162306a36Sopenharmony_ci	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
8262306a36Sopenharmony_ci	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci	__pmull2_p8_\bd	\rq, \ad
8562306a36Sopenharmony_ci	.endm
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_ci	.macro		__pmull_p8_SHASH, rq, ad
8862306a36Sopenharmony_ci	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
8962306a36Sopenharmony_ci	.endm
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci	.macro		__pmull_p8_SHASH2, rq, ad
9262306a36Sopenharmony_ci	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
9362306a36Sopenharmony_ci	.endm
9462306a36Sopenharmony_ci
9562306a36Sopenharmony_ci	.macro		__pmull2_p8_SHASH, rq, ad
9662306a36Sopenharmony_ci	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
9762306a36Sopenharmony_ci	.endm
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
10062306a36Sopenharmony_ci	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
10162306a36Sopenharmony_ci	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
10262306a36Sopenharmony_ci	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
10362306a36Sopenharmony_ci	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
10462306a36Sopenharmony_ci	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
10562306a36Sopenharmony_ci	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
10662306a36Sopenharmony_ci	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
10762306a36Sopenharmony_ci	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci	eor		t3.16b, t3.16b, t4.16b			// L = E + F
11062306a36Sopenharmony_ci	eor		t5.16b, t5.16b, t6.16b			// M = G + H
11162306a36Sopenharmony_ci	eor		t7.16b, t7.16b, t8.16b			// N = I + J
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ci	uzp1		t4.2d, t3.2d, t5.2d
11462306a36Sopenharmony_ci	uzp2		t3.2d, t3.2d, t5.2d
11562306a36Sopenharmony_ci	uzp1		t6.2d, t7.2d, t9.2d
11662306a36Sopenharmony_ci	uzp2		t7.2d, t7.2d, t9.2d
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci	// t3 = (L) (P0 + P1) << 8
11962306a36Sopenharmony_ci	// t5 = (M) (P2 + P3) << 16
12062306a36Sopenharmony_ci	eor		t4.16b, t4.16b, t3.16b
12162306a36Sopenharmony_ci	and		t3.16b, t3.16b, k32_48.16b
12262306a36Sopenharmony_ci
12362306a36Sopenharmony_ci	// t7 = (N) (P4 + P5) << 24
12462306a36Sopenharmony_ci	// t9 = (K) (P6 + P7) << 32
12562306a36Sopenharmony_ci	eor		t6.16b, t6.16b, t7.16b
12662306a36Sopenharmony_ci	and		t7.16b, t7.16b, k00_16.16b
12762306a36Sopenharmony_ci
12862306a36Sopenharmony_ci	eor		t4.16b, t4.16b, t3.16b
12962306a36Sopenharmony_ci	eor		t6.16b, t6.16b, t7.16b
13062306a36Sopenharmony_ci
13162306a36Sopenharmony_ci	zip2		t5.2d, t4.2d, t3.2d
13262306a36Sopenharmony_ci	zip1		t3.2d, t4.2d, t3.2d
13362306a36Sopenharmony_ci	zip2		t9.2d, t6.2d, t7.2d
13462306a36Sopenharmony_ci	zip1		t7.2d, t6.2d, t7.2d
13562306a36Sopenharmony_ci
13662306a36Sopenharmony_ci	ext		t3.16b, t3.16b, t3.16b, #15
13762306a36Sopenharmony_ci	ext		t5.16b, t5.16b, t5.16b, #14
13862306a36Sopenharmony_ci	ext		t7.16b, t7.16b, t7.16b, #13
13962306a36Sopenharmony_ci	ext		t9.16b, t9.16b, t9.16b, #12
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_ci	eor		t3.16b, t3.16b, t5.16b
14262306a36Sopenharmony_ci	eor		t7.16b, t7.16b, t9.16b
14362306a36Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t3.16b
14462306a36Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t7.16b
14562306a36Sopenharmony_ci	.endm
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	.macro		__pmull_pre_p64
14862306a36Sopenharmony_ci	add		x8, x3, #16
14962306a36Sopenharmony_ci	ld1		{HH.2d-HH4.2d}, [x8]
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	trn1		SHASH2.2d, SHASH.2d, HH.2d
15262306a36Sopenharmony_ci	trn2		T1.2d, SHASH.2d, HH.2d
15362306a36Sopenharmony_ci	eor		SHASH2.16b, SHASH2.16b, T1.16b
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	trn1		HH34.2d, HH3.2d, HH4.2d
15662306a36Sopenharmony_ci	trn2		T1.2d, HH3.2d, HH4.2d
15762306a36Sopenharmony_ci	eor		HH34.16b, HH34.16b, T1.16b
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci	movi		MASK.16b, #0xe1
16062306a36Sopenharmony_ci	shl		MASK.2d, MASK.2d, #57
16162306a36Sopenharmony_ci	.endm
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci	.macro		__pmull_pre_p8
16462306a36Sopenharmony_ci	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
16562306a36Sopenharmony_ci	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci	// k00_16 := 0x0000000000000000_000000000000ffff
16862306a36Sopenharmony_ci	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
16962306a36Sopenharmony_ci	movi		k32_48.2d, #0xffffffff
17062306a36Sopenharmony_ci	mov		k32_48.h[2], k32_48.h[0]
17162306a36Sopenharmony_ci	ushr		k00_16.2d, k32_48.2d, #32
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	// prepare the permutation vectors
17462306a36Sopenharmony_ci	mov_q		x5, 0x080f0e0d0c0b0a09
17562306a36Sopenharmony_ci	movi		T1.8b, #8
17662306a36Sopenharmony_ci	dup		perm1.2d, x5
17762306a36Sopenharmony_ci	eor		perm1.16b, perm1.16b, T1.16b
17862306a36Sopenharmony_ci	ushr		perm2.2d, perm1.2d, #8
17962306a36Sopenharmony_ci	ushr		perm3.2d, perm1.2d, #16
18062306a36Sopenharmony_ci	ushr		T1.2d, perm1.2d, #24
18162306a36Sopenharmony_ci	sli		perm2.2d, perm1.2d, #56
18262306a36Sopenharmony_ci	sli		perm3.2d, perm1.2d, #48
18362306a36Sopenharmony_ci	sli		T1.2d, perm1.2d, #40
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_ci	// precompute loop invariants
18662306a36Sopenharmony_ci	tbl		sh1.16b, {SHASH.16b}, perm1.16b
18762306a36Sopenharmony_ci	tbl		sh2.16b, {SHASH.16b}, perm2.16b
18862306a36Sopenharmony_ci	tbl		sh3.16b, {SHASH.16b}, perm3.16b
18962306a36Sopenharmony_ci	tbl		sh4.16b, {SHASH.16b}, T1.16b
19062306a36Sopenharmony_ci	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
19162306a36Sopenharmony_ci	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
19262306a36Sopenharmony_ci	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
19362306a36Sopenharmony_ci	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
19462306a36Sopenharmony_ci	.endm
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci	//
19762306a36Sopenharmony_ci	// PMULL (64x64->128) based reduction for CPUs that can do
19862306a36Sopenharmony_ci	// it in a single instruction.
19962306a36Sopenharmony_ci	//
20062306a36Sopenharmony_ci	.macro		__pmull_reduce_p64
20162306a36Sopenharmony_ci	pmull		T2.1q, XL.1d, MASK.1d
20262306a36Sopenharmony_ci	eor		XM.16b, XM.16b, T1.16b
20362306a36Sopenharmony_ci
20462306a36Sopenharmony_ci	mov		XH.d[0], XM.d[1]
20562306a36Sopenharmony_ci	mov		XM.d[1], XL.d[0]
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	eor		XL.16b, XM.16b, T2.16b
20862306a36Sopenharmony_ci	ext		T2.16b, XL.16b, XL.16b, #8
20962306a36Sopenharmony_ci	pmull		XL.1q, XL.1d, MASK.1d
21062306a36Sopenharmony_ci	.endm
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	//
21362306a36Sopenharmony_ci	// Alternative reduction for CPUs that lack support for the
21462306a36Sopenharmony_ci	// 64x64->128 PMULL instruction
21562306a36Sopenharmony_ci	//
21662306a36Sopenharmony_ci	.macro		__pmull_reduce_p8
21762306a36Sopenharmony_ci	eor		XM.16b, XM.16b, T1.16b
21862306a36Sopenharmony_ci
21962306a36Sopenharmony_ci	mov		XL.d[1], XM.d[0]
22062306a36Sopenharmony_ci	mov		XH.d[0], XM.d[1]
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_ci	shl		T1.2d, XL.2d, #57
22362306a36Sopenharmony_ci	shl		T2.2d, XL.2d, #62
22462306a36Sopenharmony_ci	eor		T2.16b, T2.16b, T1.16b
22562306a36Sopenharmony_ci	shl		T1.2d, XL.2d, #63
22662306a36Sopenharmony_ci	eor		T2.16b, T2.16b, T1.16b
22762306a36Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
22862306a36Sopenharmony_ci	eor		T2.16b, T2.16b, T1.16b
22962306a36Sopenharmony_ci
23062306a36Sopenharmony_ci	mov		XL.d[1], T2.d[0]
23162306a36Sopenharmony_ci	mov		XH.d[0], T2.d[1]
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	ushr		T2.2d, XL.2d, #1
23462306a36Sopenharmony_ci	eor		XH.16b, XH.16b, XL.16b
23562306a36Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
23662306a36Sopenharmony_ci	ushr		T2.2d, T2.2d, #6
23762306a36Sopenharmony_ci	ushr		XL.2d, XL.2d, #1
23862306a36Sopenharmony_ci	.endm
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci	.macro		__pmull_ghash, pn
24162306a36Sopenharmony_ci	ld1		{SHASH.2d}, [x3]
24262306a36Sopenharmony_ci	ld1		{XL.2d}, [x1]
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	__pmull_pre_\pn
24562306a36Sopenharmony_ci
24662306a36Sopenharmony_ci	/* do the head block first, if supplied */
24762306a36Sopenharmony_ci	cbz		x4, 0f
24862306a36Sopenharmony_ci	ld1		{T1.2d}, [x4]
24962306a36Sopenharmony_ci	mov		x4, xzr
25062306a36Sopenharmony_ci	b		3f
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci0:	.ifc		\pn, p64
25362306a36Sopenharmony_ci	tbnz		w0, #0, 2f		// skip until #blocks is a
25462306a36Sopenharmony_ci	tbnz		w0, #1, 2f		// round multiple of 4
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci1:	ld1		{XM3.16b-TT4.16b}, [x2], #64
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_ci	sub		w0, w0, #4
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	rev64		T1.16b, XM3.16b
26162306a36Sopenharmony_ci	rev64		T2.16b, XH3.16b
26262306a36Sopenharmony_ci	rev64		TT4.16b, TT4.16b
26362306a36Sopenharmony_ci	rev64		TT3.16b, TT3.16b
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_ci	ext		IN1.16b, TT4.16b, TT4.16b, #8
26662306a36Sopenharmony_ci	ext		XL3.16b, TT3.16b, TT3.16b, #8
26762306a36Sopenharmony_ci
26862306a36Sopenharmony_ci	eor		TT4.16b, TT4.16b, IN1.16b
26962306a36Sopenharmony_ci	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
27062306a36Sopenharmony_ci	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
27162306a36Sopenharmony_ci	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci	eor		TT3.16b, TT3.16b, XL3.16b
27462306a36Sopenharmony_ci	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
27562306a36Sopenharmony_ci	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
27662306a36Sopenharmony_ci	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	ext		IN1.16b, T2.16b, T2.16b, #8
27962306a36Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL3.16b
28062306a36Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH3.16b
28162306a36Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM3.16b
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	eor		T2.16b, T2.16b, IN1.16b
28462306a36Sopenharmony_ci	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
28562306a36Sopenharmony_ci	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
28662306a36Sopenharmony_ci	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL3.16b
28962306a36Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH3.16b
29062306a36Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM3.16b
29162306a36Sopenharmony_ci
29262306a36Sopenharmony_ci	ext		IN1.16b, T1.16b, T1.16b, #8
29362306a36Sopenharmony_ci	ext		TT3.16b, XL.16b, XL.16b, #8
29462306a36Sopenharmony_ci	eor		XL.16b, XL.16b, IN1.16b
29562306a36Sopenharmony_ci	eor		T1.16b, T1.16b, TT3.16b
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
29862306a36Sopenharmony_ci	eor		T1.16b, T1.16b, XL.16b
29962306a36Sopenharmony_ci	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
30062306a36Sopenharmony_ci	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	eor		XL.16b, XL.16b, XL2.16b
30362306a36Sopenharmony_ci	eor		XH.16b, XH.16b, XH2.16b
30462306a36Sopenharmony_ci	eor		XM.16b, XM.16b, XM2.16b
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_ci	eor		T2.16b, XL.16b, XH.16b
30762306a36Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
30862306a36Sopenharmony_ci	eor		XM.16b, XM.16b, T2.16b
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci	__pmull_reduce_p64
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_ci	eor		T2.16b, T2.16b, XH.16b
31362306a36Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
31462306a36Sopenharmony_ci
31562306a36Sopenharmony_ci	cbz		w0, 5f
31662306a36Sopenharmony_ci	b		1b
31762306a36Sopenharmony_ci	.endif
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_ci2:	ld1		{T1.2d}, [x2], #16
32062306a36Sopenharmony_ci	sub		w0, w0, #1
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci3:	/* multiply XL by SHASH in GF(2^128) */
32362306a36Sopenharmony_ciCPU_LE(	rev64		T1.16b, T1.16b	)
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_ci	ext		T2.16b, XL.16b, XL.16b, #8
32662306a36Sopenharmony_ci	ext		IN1.16b, T1.16b, T1.16b, #8
32762306a36Sopenharmony_ci	eor		T1.16b, T1.16b, T2.16b
32862306a36Sopenharmony_ci	eor		XL.16b, XL.16b, IN1.16b
32962306a36Sopenharmony_ci
33062306a36Sopenharmony_ci	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
33162306a36Sopenharmony_ci	eor		T1.16b, T1.16b, XL.16b
33262306a36Sopenharmony_ci	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
33362306a36Sopenharmony_ci	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci4:	eor		T2.16b, XL.16b, XH.16b
33662306a36Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
33762306a36Sopenharmony_ci	eor		XM.16b, XM.16b, T2.16b
33862306a36Sopenharmony_ci
33962306a36Sopenharmony_ci	__pmull_reduce_\pn
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	eor		T2.16b, T2.16b, XH.16b
34262306a36Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_ci	cbnz		w0, 0b
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_ci5:	st1		{XL.2d}, [x1]
34762306a36Sopenharmony_ci	ret
34862306a36Sopenharmony_ci	.endm
34962306a36Sopenharmony_ci
35062306a36Sopenharmony_ci	/*
35162306a36Sopenharmony_ci	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
35262306a36Sopenharmony_ci	 *			   struct ghash_key const *k, const char *head)
35362306a36Sopenharmony_ci	 */
35462306a36Sopenharmony_ciSYM_TYPED_FUNC_START(pmull_ghash_update_p64)
35562306a36Sopenharmony_ci	__pmull_ghash	p64
35662306a36Sopenharmony_ciSYM_FUNC_END(pmull_ghash_update_p64)
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ciSYM_TYPED_FUNC_START(pmull_ghash_update_p8)
35962306a36Sopenharmony_ci	__pmull_ghash	p8
36062306a36Sopenharmony_ciSYM_FUNC_END(pmull_ghash_update_p8)
36162306a36Sopenharmony_ci
36262306a36Sopenharmony_ci	KS0		.req	v8
36362306a36Sopenharmony_ci	KS1		.req	v9
36462306a36Sopenharmony_ci	KS2		.req	v10
36562306a36Sopenharmony_ci	KS3		.req	v11
36662306a36Sopenharmony_ci
36762306a36Sopenharmony_ci	INP0		.req	v21
36862306a36Sopenharmony_ci	INP1		.req	v22
36962306a36Sopenharmony_ci	INP2		.req	v23
37062306a36Sopenharmony_ci	INP3		.req	v24
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	K0		.req	v25
37362306a36Sopenharmony_ci	K1		.req	v26
37462306a36Sopenharmony_ci	K2		.req	v27
37562306a36Sopenharmony_ci	K3		.req	v28
37662306a36Sopenharmony_ci	K4		.req	v12
37762306a36Sopenharmony_ci	K5		.req	v13
37862306a36Sopenharmony_ci	K6		.req	v4
37962306a36Sopenharmony_ci	K7		.req	v5
38062306a36Sopenharmony_ci	K8		.req	v14
38162306a36Sopenharmony_ci	K9		.req	v15
38262306a36Sopenharmony_ci	KK		.req	v29
38362306a36Sopenharmony_ci	KL		.req	v30
38462306a36Sopenharmony_ci	KM		.req	v31
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	.macro		load_round_keys, rounds, rk, tmp
38762306a36Sopenharmony_ci	add		\tmp, \rk, #64
38862306a36Sopenharmony_ci	ld1		{K0.4s-K3.4s}, [\rk]
38962306a36Sopenharmony_ci	ld1		{K4.4s-K5.4s}, [\tmp]
39062306a36Sopenharmony_ci	add		\tmp, \rk, \rounds, lsl #4
39162306a36Sopenharmony_ci	sub		\tmp, \tmp, #32
39262306a36Sopenharmony_ci	ld1		{KK.4s-KM.4s}, [\tmp]
39362306a36Sopenharmony_ci	.endm
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci	.macro		enc_round, state, key
39662306a36Sopenharmony_ci	aese		\state\().16b, \key\().16b
39762306a36Sopenharmony_ci	aesmc		\state\().16b, \state\().16b
39862306a36Sopenharmony_ci	.endm
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_ci	.macro		enc_qround, s0, s1, s2, s3, key
40162306a36Sopenharmony_ci	enc_round	\s0, \key
40262306a36Sopenharmony_ci	enc_round	\s1, \key
40362306a36Sopenharmony_ci	enc_round	\s2, \key
40462306a36Sopenharmony_ci	enc_round	\s3, \key
40562306a36Sopenharmony_ci	.endm
40662306a36Sopenharmony_ci
40762306a36Sopenharmony_ci	.macro		enc_block, state, rounds, rk, tmp
40862306a36Sopenharmony_ci	add		\tmp, \rk, #96
40962306a36Sopenharmony_ci	ld1		{K6.4s-K7.4s}, [\tmp], #32
41062306a36Sopenharmony_ci	.irp		key, K0, K1, K2, K3, K4 K5
41162306a36Sopenharmony_ci	enc_round	\state, \key
41262306a36Sopenharmony_ci	.endr
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_ci	tbnz		\rounds, #2, .Lnot128_\@
41562306a36Sopenharmony_ci.Lout256_\@:
41662306a36Sopenharmony_ci	enc_round	\state, K6
41762306a36Sopenharmony_ci	enc_round	\state, K7
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci.Lout192_\@:
42062306a36Sopenharmony_ci	enc_round	\state, KK
42162306a36Sopenharmony_ci	aese		\state\().16b, KL.16b
42262306a36Sopenharmony_ci	eor		\state\().16b, \state\().16b, KM.16b
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	.subsection	1
42562306a36Sopenharmony_ci.Lnot128_\@:
42662306a36Sopenharmony_ci	ld1		{K8.4s-K9.4s}, [\tmp], #32
42762306a36Sopenharmony_ci	enc_round	\state, K6
42862306a36Sopenharmony_ci	enc_round	\state, K7
42962306a36Sopenharmony_ci	ld1		{K6.4s-K7.4s}, [\tmp]
43062306a36Sopenharmony_ci	enc_round	\state, K8
43162306a36Sopenharmony_ci	enc_round	\state, K9
43262306a36Sopenharmony_ci	tbz		\rounds, #1, .Lout192_\@
43362306a36Sopenharmony_ci	b		.Lout256_\@
43462306a36Sopenharmony_ci	.previous
43562306a36Sopenharmony_ci	.endm
43662306a36Sopenharmony_ci
43762306a36Sopenharmony_ci	.align		6
43862306a36Sopenharmony_ci	.macro		pmull_gcm_do_crypt, enc
43962306a36Sopenharmony_ci	frame_push	1
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	load_round_keys	x7, x6, x8
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	ld1		{SHASH.2d}, [x3], #16
44462306a36Sopenharmony_ci	ld1		{HH.2d-HH4.2d}, [x3]
44562306a36Sopenharmony_ci
44662306a36Sopenharmony_ci	trn1		SHASH2.2d, SHASH.2d, HH.2d
44762306a36Sopenharmony_ci	trn2		T1.2d, SHASH.2d, HH.2d
44862306a36Sopenharmony_ci	eor		SHASH2.16b, SHASH2.16b, T1.16b
44962306a36Sopenharmony_ci
45062306a36Sopenharmony_ci	trn1		HH34.2d, HH3.2d, HH4.2d
45162306a36Sopenharmony_ci	trn2		T1.2d, HH3.2d, HH4.2d
45262306a36Sopenharmony_ci	eor		HH34.16b, HH34.16b, T1.16b
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_ci	ld1		{XL.2d}, [x4]
45562306a36Sopenharmony_ci
45662306a36Sopenharmony_ci	cbz		x0, 3f				// tag only?
45762306a36Sopenharmony_ci
45862306a36Sopenharmony_ci	ldr		w8, [x5, #12]			// load lower counter
45962306a36Sopenharmony_ciCPU_LE(	rev		w8, w8		)
46062306a36Sopenharmony_ci
46162306a36Sopenharmony_ci0:	mov		w9, #4				// max blocks per round
46262306a36Sopenharmony_ci	add		x10, x0, #0xf
46362306a36Sopenharmony_ci	lsr		x10, x10, #4			// remaining blocks
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_ci	subs		x0, x0, #64
46662306a36Sopenharmony_ci	csel		w9, w10, w9, mi
46762306a36Sopenharmony_ci	add		w8, w8, w9
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	bmi		1f
47062306a36Sopenharmony_ci	ld1		{INP0.16b-INP3.16b}, [x2], #64
47162306a36Sopenharmony_ci	.subsection	1
47262306a36Sopenharmony_ci	/*
47362306a36Sopenharmony_ci	 * Populate the four input registers right to left with up to 63 bytes
47462306a36Sopenharmony_ci	 * of data, using overlapping loads to avoid branches.
47562306a36Sopenharmony_ci	 *
47662306a36Sopenharmony_ci	 *                INP0     INP1     INP2     INP3
47762306a36Sopenharmony_ci	 *  1 byte     |        |        |        |x       |
47862306a36Sopenharmony_ci	 * 16 bytes    |        |        |        |xxxxxxxx|
47962306a36Sopenharmony_ci	 * 17 bytes    |        |        |xxxxxxxx|x       |
48062306a36Sopenharmony_ci	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
48162306a36Sopenharmony_ci	 * etc etc
48262306a36Sopenharmony_ci	 *
48362306a36Sopenharmony_ci	 * Note that this code may read up to 15 bytes before the start of
48462306a36Sopenharmony_ci	 * the input. It is up to the calling code to ensure this is safe if
48562306a36Sopenharmony_ci	 * this happens in the first iteration of the loop (i.e., when the
48662306a36Sopenharmony_ci	 * input size is < 16 bytes)
48762306a36Sopenharmony_ci	 */
48862306a36Sopenharmony_ci1:	mov		x15, #16
48962306a36Sopenharmony_ci	ands		x19, x0, #0xf
49062306a36Sopenharmony_ci	csel		x19, x19, x15, ne
49162306a36Sopenharmony_ci	adr_l		x17, .Lpermute_table + 16
49262306a36Sopenharmony_ci
49362306a36Sopenharmony_ci	sub		x11, x15, x19
49462306a36Sopenharmony_ci	add		x12, x17, x11
49562306a36Sopenharmony_ci	sub		x17, x17, x11
49662306a36Sopenharmony_ci	ld1		{T1.16b}, [x12]
49762306a36Sopenharmony_ci	sub		x10, x1, x11
49862306a36Sopenharmony_ci	sub		x11, x2, x11
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci	cmp		x0, #-16
50162306a36Sopenharmony_ci	csel		x14, x15, xzr, gt
50262306a36Sopenharmony_ci	cmp		x0, #-32
50362306a36Sopenharmony_ci	csel		x15, x15, xzr, gt
50462306a36Sopenharmony_ci	cmp		x0, #-48
50562306a36Sopenharmony_ci	csel		x16, x19, xzr, gt
50662306a36Sopenharmony_ci	csel		x1, x1, x10, gt
50762306a36Sopenharmony_ci	csel		x2, x2, x11, gt
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	ld1		{INP0.16b}, [x2], x14
51062306a36Sopenharmony_ci	ld1		{INP1.16b}, [x2], x15
51162306a36Sopenharmony_ci	ld1		{INP2.16b}, [x2], x16
51262306a36Sopenharmony_ci	ld1		{INP3.16b}, [x2]
51362306a36Sopenharmony_ci	tbl		INP3.16b, {INP3.16b}, T1.16b
51462306a36Sopenharmony_ci	b		2f
51562306a36Sopenharmony_ci	.previous
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci2:	.if		\enc == 0
51862306a36Sopenharmony_ci	bl		pmull_gcm_ghash_4x
51962306a36Sopenharmony_ci	.endif
52062306a36Sopenharmony_ci
52162306a36Sopenharmony_ci	bl		pmull_gcm_enc_4x
52262306a36Sopenharmony_ci
52362306a36Sopenharmony_ci	tbnz		x0, #63, 6f
52462306a36Sopenharmony_ci	st1		{INP0.16b-INP3.16b}, [x1], #64
52562306a36Sopenharmony_ci	.if		\enc == 1
52662306a36Sopenharmony_ci	bl		pmull_gcm_ghash_4x
52762306a36Sopenharmony_ci	.endif
52862306a36Sopenharmony_ci	bne		0b
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci3:	ldr		x10, [sp, #.Lframe_local_offset]
53162306a36Sopenharmony_ci	cbz		x10, 5f				// output tag?
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ci	ld1		{INP3.16b}, [x10]		// load lengths[]
53462306a36Sopenharmony_ci	mov		w9, #1
53562306a36Sopenharmony_ci	bl		pmull_gcm_ghash_4x
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci	mov		w11, #(0x1 << 24)		// BE '1U'
53862306a36Sopenharmony_ci	ld1		{KS0.16b}, [x5]
53962306a36Sopenharmony_ci	mov		KS0.s[3], w11
54062306a36Sopenharmony_ci
54162306a36Sopenharmony_ci	enc_block	KS0, x7, x6, x12
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_ci	ext		XL.16b, XL.16b, XL.16b, #8
54462306a36Sopenharmony_ci	rev64		XL.16b, XL.16b
54562306a36Sopenharmony_ci	eor		XL.16b, XL.16b, KS0.16b
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_ci	.if		\enc == 1
54862306a36Sopenharmony_ci	st1		{XL.16b}, [x10]			// store tag
54962306a36Sopenharmony_ci	.else
55062306a36Sopenharmony_ci	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
55162306a36Sopenharmony_ci	adr_l		x17, .Lpermute_table
55262306a36Sopenharmony_ci	ld1		{KS0.16b}, [x11]		// load supplied tag
55362306a36Sopenharmony_ci	add		x17, x17, x12
55462306a36Sopenharmony_ci	ld1		{KS1.16b}, [x17]		// load permute vector
55562306a36Sopenharmony_ci
55662306a36Sopenharmony_ci	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
55762306a36Sopenharmony_ci	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
55862306a36Sopenharmony_ci	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
55962306a36Sopenharmony_ci	sminv		b0, XL.16b			// signed minimum across XL
56062306a36Sopenharmony_ci	smov		w0, v0.b[0]			// return b0
56162306a36Sopenharmony_ci	.endif
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_ci4:	frame_pop
56462306a36Sopenharmony_ci	ret
56562306a36Sopenharmony_ci
56662306a36Sopenharmony_ci5:
56762306a36Sopenharmony_ciCPU_LE(	rev		w8, w8		)
56862306a36Sopenharmony_ci	str		w8, [x5, #12]			// store lower counter
56962306a36Sopenharmony_ci	st1		{XL.2d}, [x4]
57062306a36Sopenharmony_ci	b		4b
57162306a36Sopenharmony_ci
57262306a36Sopenharmony_ci6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
57362306a36Sopenharmony_ci	sub		x17, x17, x19, lsl #1
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci	cmp		w9, #1
57662306a36Sopenharmony_ci	beq		7f
57762306a36Sopenharmony_ci	.subsection	1
57862306a36Sopenharmony_ci7:	ld1		{INP2.16b}, [x1]
57962306a36Sopenharmony_ci	tbx		INP2.16b, {INP3.16b}, T1.16b
58062306a36Sopenharmony_ci	mov		INP3.16b, INP2.16b
58162306a36Sopenharmony_ci	b		8f
58262306a36Sopenharmony_ci	.previous
58362306a36Sopenharmony_ci
58462306a36Sopenharmony_ci	st1		{INP0.16b}, [x1], x14
58562306a36Sopenharmony_ci	st1		{INP1.16b}, [x1], x15
58662306a36Sopenharmony_ci	st1		{INP2.16b}, [x1], x16
58762306a36Sopenharmony_ci	tbl		INP3.16b, {INP3.16b}, T1.16b
58862306a36Sopenharmony_ci	tbx		INP3.16b, {INP2.16b}, T2.16b
58962306a36Sopenharmony_ci8:	st1		{INP3.16b}, [x1]
59062306a36Sopenharmony_ci
59162306a36Sopenharmony_ci	.if		\enc == 1
59262306a36Sopenharmony_ci	ld1		{T1.16b}, [x17]
59362306a36Sopenharmony_ci	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
59462306a36Sopenharmony_ci	bl		pmull_gcm_ghash_4x
59562306a36Sopenharmony_ci	.endif
59662306a36Sopenharmony_ci	b		3b
59762306a36Sopenharmony_ci	.endm
59862306a36Sopenharmony_ci
59962306a36Sopenharmony_ci	/*
60062306a36Sopenharmony_ci	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
60162306a36Sopenharmony_ci	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
60262306a36Sopenharmony_ci	 *			  int rounds, u8 tag)
60362306a36Sopenharmony_ci	 */
60462306a36Sopenharmony_ciSYM_FUNC_START(pmull_gcm_encrypt)
60562306a36Sopenharmony_ci	pmull_gcm_do_crypt	1
60662306a36Sopenharmony_ciSYM_FUNC_END(pmull_gcm_encrypt)
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci	/*
60962306a36Sopenharmony_ci	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
61062306a36Sopenharmony_ci	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
61162306a36Sopenharmony_ci	 *			  int rounds, u8 tag)
61262306a36Sopenharmony_ci	 */
61362306a36Sopenharmony_ciSYM_FUNC_START(pmull_gcm_decrypt)
61462306a36Sopenharmony_ci	pmull_gcm_do_crypt	0
61562306a36Sopenharmony_ciSYM_FUNC_END(pmull_gcm_decrypt)
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ciSYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
61862306a36Sopenharmony_ci	movi		MASK.16b, #0xe1
61962306a36Sopenharmony_ci	shl		MASK.2d, MASK.2d, #57
62062306a36Sopenharmony_ci
62162306a36Sopenharmony_ci	rev64		T1.16b, INP0.16b
62262306a36Sopenharmony_ci	rev64		T2.16b, INP1.16b
62362306a36Sopenharmony_ci	rev64		TT3.16b, INP2.16b
62462306a36Sopenharmony_ci	rev64		TT4.16b, INP3.16b
62562306a36Sopenharmony_ci
62662306a36Sopenharmony_ci	ext		XL.16b, XL.16b, XL.16b, #8
62762306a36Sopenharmony_ci
62862306a36Sopenharmony_ci	tbz		w9, #2, 0f			// <4 blocks?
62962306a36Sopenharmony_ci	.subsection	1
63062306a36Sopenharmony_ci0:	movi		XH2.16b, #0
63162306a36Sopenharmony_ci	movi		XM2.16b, #0
63262306a36Sopenharmony_ci	movi		XL2.16b, #0
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	tbz		w9, #0, 1f			// 2 blocks?
63562306a36Sopenharmony_ci	tbz		w9, #1, 2f			// 1 block?
63662306a36Sopenharmony_ci
63762306a36Sopenharmony_ci	eor		T2.16b, T2.16b, XL.16b
63862306a36Sopenharmony_ci	ext		T1.16b, T2.16b, T2.16b, #8
63962306a36Sopenharmony_ci	b		.Lgh3
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci1:	eor		TT3.16b, TT3.16b, XL.16b
64262306a36Sopenharmony_ci	ext		T2.16b, TT3.16b, TT3.16b, #8
64362306a36Sopenharmony_ci	b		.Lgh2
64462306a36Sopenharmony_ci
64562306a36Sopenharmony_ci2:	eor		TT4.16b, TT4.16b, XL.16b
64662306a36Sopenharmony_ci	ext		IN1.16b, TT4.16b, TT4.16b, #8
64762306a36Sopenharmony_ci	b		.Lgh1
64862306a36Sopenharmony_ci	.previous
64962306a36Sopenharmony_ci
65062306a36Sopenharmony_ci	eor		T1.16b, T1.16b, XL.16b
65162306a36Sopenharmony_ci	ext		IN1.16b, T1.16b, T1.16b, #8
65262306a36Sopenharmony_ci
65362306a36Sopenharmony_ci	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
65462306a36Sopenharmony_ci	eor		T1.16b, T1.16b, IN1.16b
65562306a36Sopenharmony_ci	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
65662306a36Sopenharmony_ci	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci	ext		T1.16b, T2.16b, T2.16b, #8
65962306a36Sopenharmony_ci.Lgh3:	eor		T2.16b, T2.16b, T1.16b
66062306a36Sopenharmony_ci	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
66162306a36Sopenharmony_ci	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
66262306a36Sopenharmony_ci	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
66362306a36Sopenharmony_ci
66462306a36Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH.16b
66562306a36Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL.16b
66662306a36Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM.16b
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_ci	ext		T2.16b, TT3.16b, TT3.16b, #8
66962306a36Sopenharmony_ci.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
67062306a36Sopenharmony_ci	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
67162306a36Sopenharmony_ci	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
67262306a36Sopenharmony_ci	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
67362306a36Sopenharmony_ci
67462306a36Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH.16b
67562306a36Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL.16b
67662306a36Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM.16b
67762306a36Sopenharmony_ci
67862306a36Sopenharmony_ci	ext		IN1.16b, TT4.16b, TT4.16b, #8
67962306a36Sopenharmony_ci.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
68062306a36Sopenharmony_ci	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
68162306a36Sopenharmony_ci	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
68262306a36Sopenharmony_ci	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
68362306a36Sopenharmony_ci
68462306a36Sopenharmony_ci	eor		XH.16b, XH.16b, XH2.16b
68562306a36Sopenharmony_ci	eor		XL.16b, XL.16b, XL2.16b
68662306a36Sopenharmony_ci	eor		XM.16b, XM.16b, XM2.16b
68762306a36Sopenharmony_ci
68862306a36Sopenharmony_ci	eor		T2.16b, XL.16b, XH.16b
68962306a36Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
69062306a36Sopenharmony_ci	eor		XM.16b, XM.16b, T2.16b
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	__pmull_reduce_p64
69362306a36Sopenharmony_ci
69462306a36Sopenharmony_ci	eor		T2.16b, T2.16b, XH.16b
69562306a36Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
69662306a36Sopenharmony_ci
69762306a36Sopenharmony_ci	ret
69862306a36Sopenharmony_ciSYM_FUNC_END(pmull_gcm_ghash_4x)
69962306a36Sopenharmony_ci
/*
 * pmull_gcm_enc_4x - produce four AES-CTR keystream blocks and XOR them
 * into INP0..INP3 in place.
 *
 * Inputs, as read by this routine:
 *   x5        - address of the 16-byte counter block; it is loaded whole
 *               and the last 32-bit lane of each copy is then replaced
 *   w8        - counter value; the four blocks use w8 - 4 .. w8 - 1,
 *               each byte-reversed before being inserted
 *   x6        - base of the round key array; keys are fetched from
 *               offset 96 onwards
 *   x7        - key size selector: bit 2 set takes the .Lnot128 path,
 *               where bit 1 then chooses between .Lout192 and .Lout256
 *               (presumably the AES round count 10/12/14 - confirm
 *               against the caller)
 *   K0-K5, KK, KL, KM
 *             - round keys that are used here but never loaded here, so
 *               they must be live on entry (their setup is outside this
 *               routine)
 *   INP0-INP3 - the four data blocks
 *
 * Output:
 *   INP0-INP3 - input XOR keystream
 *
 * Clobbers: x10, w11-w13, KS0-KS3, K6, K7 and, on the .Lnot128 path,
 * K8 and K9.
 */
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	// byte-swap each counter before it is placed in the counter block
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	// w10 has been consumed by the ins above, so x10 is free for reuse
	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32	// keys at x6 + 96 .. x6 + 127
	// enc_qround is defined elsewhere in this file; presumably it applies
	// one AES round with \key to all four blocks - TODO confirm
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	// The extra rounds for the larger key sizes are emitted in subsection 1,
	// i.e. out of line, so that the x7 bit 2 clear case falls straight
	// through to .Lout256 below.
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32	// keys at x6 + 128 .. x6 + 159
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	// K6/K7 have been consumed; refill them for the shared .Lout256 rounds
	ld1		{K6.4s-K7.4s}, [x10]		// keys at x6 + 160 .. x6 + 191
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192		// bit 1 clear: skip the K6/K7 rounds
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	// last round: AESE (AddRoundKey + SubBytes + ShiftRows) with KL and no
	// MixColumns step, followed by the final key addition with KM below
	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	// apply the keystream to the data blocks in place
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)
76462306a36Sopenharmony_ci
76562306a36Sopenharmony_ci	.section	".rodata", "a"
76662306a36Sopenharmony_ci	.align		6
76762306a36Sopenharmony_ci.Lpermute_table:
76862306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
76962306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
77062306a36Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
77162306a36Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
77262306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
77362306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
77462306a36Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
77562306a36Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
77662306a36Sopenharmony_ci	.previous
777