18c2ecf20Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Accelerated GHASH implementation with ARMv8 PMULL instructions.
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
68c2ecf20Sopenharmony_ci */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci#include <linux/linkage.h>
98c2ecf20Sopenharmony_ci#include <asm/assembler.h>
108c2ecf20Sopenharmony_ci
118c2ecf20Sopenharmony_ci	SHASH		.req	v0	// hash key vector, loaded from [x3]
128c2ecf20Sopenharmony_ci	SHASH2		.req	v1	// XORed key halves for the (a1 + a0)(b1 + b0) products
138c2ecf20Sopenharmony_ci	T1		.req	v2	// scratch / incoming data block
148c2ecf20Sopenharmony_ci	T2		.req	v3	// scratch
158c2ecf20Sopenharmony_ci	MASK		.req	v4	// constant used by __pmull_reduce_p64
168c2ecf20Sopenharmony_ci	XM		.req	v5	// middle partial product
178c2ecf20Sopenharmony_ci	XL		.req	v6	// running digest / low partial product
188c2ecf20Sopenharmony_ci	XH		.req	v7	// high partial product
198c2ecf20Sopenharmony_ci	IN1		.req	v7	// NOTE: deliberately the same register as XH
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_ci	k00_16		.req	v8	// bit masks built by __pmull_pre_p8
228c2ecf20Sopenharmony_ci	k32_48		.req	v9
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci	t3		.req	v10	// t3-t9: temporaries of the 8-bit multiply (__pmull_p8_tail)
258c2ecf20Sopenharmony_ci	t4		.req	v11
268c2ecf20Sopenharmony_ci	t5		.req	v12
278c2ecf20Sopenharmony_ci	t6		.req	v13
288c2ecf20Sopenharmony_ci	t7		.req	v14
298c2ecf20Sopenharmony_ci	t8		.req	v15
308c2ecf20Sopenharmony_ci	t9		.req	v16
318c2ecf20Sopenharmony_ci
328c2ecf20Sopenharmony_ci	perm1		.req	v17	// perm1-perm3: tbl index vectors built by __pmull_pre_p8
338c2ecf20Sopenharmony_ci	perm2		.req	v18
348c2ecf20Sopenharmony_ci	perm3		.req	v19
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci	sh1		.req	v20	// sh1-sh4: byte-rotated copies of SHASH (8-bit multiply)
378c2ecf20Sopenharmony_ci	sh2		.req	v21
388c2ecf20Sopenharmony_ci	sh3		.req	v22
398c2ecf20Sopenharmony_ci	sh4		.req	v23
408c2ecf20Sopenharmony_ci
418c2ecf20Sopenharmony_ci	ss1		.req	v24	// ss1-ss4: byte-rotated copies of SHASH2 (8-bit multiply)
428c2ecf20Sopenharmony_ci	ss2		.req	v25
438c2ecf20Sopenharmony_ci	ss3		.req	v26
448c2ecf20Sopenharmony_ci	ss4		.req	v27
458c2ecf20Sopenharmony_ci
468c2ecf20Sopenharmony_ci	XL2		.req	v8	// v8-v19 get a second set of names for the 4-way PMULL code;
478c2ecf20Sopenharmony_ci	XM2		.req	v9	// they overlap k00_16 ... perm3 above, so the two name sets
488c2ecf20Sopenharmony_ci	XH2		.req	v10	// must not be live at the same time
498c2ecf20Sopenharmony_ci	XL3		.req	v11
508c2ecf20Sopenharmony_ci	XM3		.req	v12
518c2ecf20Sopenharmony_ci	XH3		.req	v13
528c2ecf20Sopenharmony_ci	TT3		.req	v14
538c2ecf20Sopenharmony_ci	TT4		.req	v15
548c2ecf20Sopenharmony_ci	HH		.req	v16	// HH, HH3, HH4: three vectors stored right after SHASH in the key
558c2ecf20Sopenharmony_ci	HH3		.req	v17	// (presumably higher powers of the hash key - confirm in the C glue)
568c2ecf20Sopenharmony_ci	HH4		.req	v18
578c2ecf20Sopenharmony_ci	HH34		.req	v19	// XORed halves of HH3 and HH4
588c2ecf20Sopenharmony_ci
598c2ecf20Sopenharmony_ci	.text
608c2ecf20Sopenharmony_ci	.arch		armv8-a+crypto
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci	.macro		__pmull_p64, rd, rn, rm
	// rd (128 bit) = polynomial product of the low 64-bit lanes of rn and rm
638c2ecf20Sopenharmony_ci	pmull		\rd\().1q, \rn\().1d, \rm\().1d
648c2ecf20Sopenharmony_ci	.endm
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	.macro		__pmull2_p64, rd, rn, rm
	// rd (128 bit) = polynomial product of the high 64-bit lanes of rn and rm
678c2ecf20Sopenharmony_ci	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
688c2ecf20Sopenharmony_ci	.endm
698c2ecf20Sopenharmony_ci
708c2ecf20Sopenharmony_ci	.macro		__pmull_p8, rq, ad, bd
	// Low-half 64x64 multiply built from 8-bit pmull operations.
	//   rq : destination register
	//   ad : operand A (its low 8 bytes are used)
	//   bd : name of operand B, SHASH or SHASH2; it selects the tail macro
	// Prepares the byte-rotated copies A1..A3 of A in t3/t5/t7 and then
	// expands the __pmull_p8_<bd> macro, which does the actual multiply.
718c2ecf20Sopenharmony_ci	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
728c2ecf20Sopenharmony_ci	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
738c2ecf20Sopenharmony_ci	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3
748c2ecf20Sopenharmony_ci
758c2ecf20Sopenharmony_ci	__pmull_p8_\bd	\rq, \ad
768c2ecf20Sopenharmony_ci	.endm
778c2ecf20Sopenharmony_ci
788c2ecf20Sopenharmony_ci	.macro		__pmull2_p8, rq, ad, bd
	// High-half counterpart of __pmull_p8: the rotated copies A1..A3 are
	// produced over all 16 bytes with tbl and the perm1-perm3 index
	// vectors (set up by __pmull_pre_p8), then __pmull2_p8_<bd> is expanded.
798c2ecf20Sopenharmony_ci	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
808c2ecf20Sopenharmony_ci	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
818c2ecf20Sopenharmony_ci	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_ci	__pmull2_p8_\bd	\rq, \ad
848c2ecf20Sopenharmony_ci	.endm
858c2ecf20Sopenharmony_ci
868c2ecf20Sopenharmony_ci	.macro		__pmull_p8_SHASH, rq, ad
	// Low-half multiply by SHASH: 8b arrangement, empty pmull suffix,
	// pre-rotated key copies sh1-sh4.
878c2ecf20Sopenharmony_ci	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
888c2ecf20Sopenharmony_ci	.endm
898c2ecf20Sopenharmony_ci
908c2ecf20Sopenharmony_ci	.macro		__pmull_p8_SHASH2, rq, ad
	// Low-half multiply by SHASH2: 8b arrangement, empty pmull suffix,
	// pre-rotated copies ss1-ss4.
918c2ecf20Sopenharmony_ci	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
928c2ecf20Sopenharmony_ci	.endm
938c2ecf20Sopenharmony_ci
948c2ecf20Sopenharmony_ci	.macro		__pmull2_p8_SHASH, rq, ad
	// High-half multiply by SHASH: 16b arrangement and suffix 2, so the
	// tail macro emits pmull2; pre-rotated key copies sh1-sh4.
958c2ecf20Sopenharmony_ci	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
968c2ecf20Sopenharmony_ci	.endm
978c2ecf20Sopenharmony_ci
988c2ecf20Sopenharmony_ci	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	// Core of the 8-bit based 64x64 polynomial multiply.
	//   rq       : destination register (receives D plus the shifted terms)
	//   ad, bd   : operands A and B, already carrying their arrangement
	//   nb       : arrangement suffix, 8b or 16b
	//   t        : text appended to the pmull mnemonic (empty or 2)
	//   b1 .. b4 : byte-rotated copies B1..B4 of B
	// On entry t3/t5/t7 must hold the rotated copies A1..A3 of A (set up
	// by __pmull_p8 / __pmull2_p8). Clobbers t3-t9 and reads the masks
	// k00_16 and k32_48.
998c2ecf20Sopenharmony_ci	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
1008c2ecf20Sopenharmony_ci	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
1018c2ecf20Sopenharmony_ci	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
1028c2ecf20Sopenharmony_ci	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
1038c2ecf20Sopenharmony_ci	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
1048c2ecf20Sopenharmony_ci	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
1058c2ecf20Sopenharmony_ci	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
1068c2ecf20Sopenharmony_ci	pmull\t		\rq\().8h, \ad, \bd			// D = A*B
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci	eor		t3.16b, t3.16b, t4.16b			// L = E + F
1098c2ecf20Sopenharmony_ci	eor		t5.16b, t5.16b, t6.16b			// M = G + H
1108c2ecf20Sopenharmony_ci	eor		t7.16b, t7.16b, t8.16b			// N = I + J
1118c2ecf20Sopenharmony_ci
	// gather the low 64-bit lanes of each pair into t4/t6 and the high
	// lanes into t3/t7, so both pairs are masked with one and each
1128c2ecf20Sopenharmony_ci	uzp1		t4.2d, t3.2d, t5.2d
1138c2ecf20Sopenharmony_ci	uzp2		t3.2d, t3.2d, t5.2d
1148c2ecf20Sopenharmony_ci	uzp1		t6.2d, t7.2d, t9.2d
1158c2ecf20Sopenharmony_ci	uzp2		t7.2d, t7.2d, t9.2d
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_ci	// t3 = (L) (P0 + P1) << 8
1188c2ecf20Sopenharmony_ci	// t5 = (M) (P2 + P3) << 16
1198c2ecf20Sopenharmony_ci	eor		t4.16b, t4.16b, t3.16b
1208c2ecf20Sopenharmony_ci	and		t3.16b, t3.16b, k32_48.16b
1218c2ecf20Sopenharmony_ci
1228c2ecf20Sopenharmony_ci	// t7 = (N) (P4 + P5) << 24
1238c2ecf20Sopenharmony_ci	// t9 = (K) (P6 + P7) << 32
1248c2ecf20Sopenharmony_ci	eor		t6.16b, t6.16b, t7.16b
1258c2ecf20Sopenharmony_ci	and		t7.16b, t7.16b, k00_16.16b
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_ci	eor		t4.16b, t4.16b, t3.16b
1288c2ecf20Sopenharmony_ci	eor		t6.16b, t6.16b, t7.16b
1298c2ecf20Sopenharmony_ci
	// put the lanes back into per-term order (inverse of the uzp above)
1308c2ecf20Sopenharmony_ci	zip2		t5.2d, t4.2d, t3.2d
1318c2ecf20Sopenharmony_ci	zip1		t3.2d, t4.2d, t3.2d
1328c2ecf20Sopenharmony_ci	zip2		t9.2d, t6.2d, t7.2d
1338c2ecf20Sopenharmony_ci	zip1		t7.2d, t6.2d, t7.2d
1348c2ecf20Sopenharmony_ci
	// rotate each 128-bit term left by 1, 2, 3 and 4 bytes respectively
1358c2ecf20Sopenharmony_ci	ext		t3.16b, t3.16b, t3.16b, #15
1368c2ecf20Sopenharmony_ci	ext		t5.16b, t5.16b, t5.16b, #14
1378c2ecf20Sopenharmony_ci	ext		t7.16b, t7.16b, t7.16b, #13
1388c2ecf20Sopenharmony_ci	ext		t9.16b, t9.16b, t9.16b, #12
1398c2ecf20Sopenharmony_ci
	// fold all shifted terms into D
1408c2ecf20Sopenharmony_ci	eor		t3.16b, t3.16b, t5.16b
1418c2ecf20Sopenharmony_ci	eor		t7.16b, t7.16b, t9.16b
1428c2ecf20Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t3.16b
1438c2ecf20Sopenharmony_ci	eor		\rq\().16b, \rq\().16b, t7.16b
1448c2ecf20Sopenharmony_ci	.endm
1458c2ecf20Sopenharmony_ci
1468c2ecf20Sopenharmony_ci	.macro		__pmull_pre_p64
	// Loop-invariant setup for the 64-bit PMULL path. Expects the key
	// pointer in x3 and SHASH already loaded; clobbers x8 and T1.
1478c2ecf20Sopenharmony_ci	add		x8, x3, #16
1488c2ecf20Sopenharmony_ci	ld1		{HH.2d-HH4.2d}, [x8]	// HH, HH3, HH4 = the 48 bytes following SHASH
1498c2ecf20Sopenharmony_ci
	// SHASH2 = { SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi }
1508c2ecf20Sopenharmony_ci	trn1		SHASH2.2d, SHASH.2d, HH.2d
1518c2ecf20Sopenharmony_ci	trn2		T1.2d, SHASH.2d, HH.2d
1528c2ecf20Sopenharmony_ci	eor		SHASH2.16b, SHASH2.16b, T1.16b
1538c2ecf20Sopenharmony_ci
	// HH34 = { HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi }
1548c2ecf20Sopenharmony_ci	trn1		HH34.2d, HH3.2d, HH4.2d
1558c2ecf20Sopenharmony_ci	trn2		T1.2d, HH3.2d, HH4.2d
1568c2ecf20Sopenharmony_ci	eor		HH34.16b, HH34.16b, T1.16b
1578c2ecf20Sopenharmony_ci
	// each 64-bit lane of MASK becomes 0xc200000000000000
1588c2ecf20Sopenharmony_ci	movi		MASK.16b, #0xe1
1598c2ecf20Sopenharmony_ci	shl		MASK.2d, MASK.2d, #57
1608c2ecf20Sopenharmony_ci	.endm
1618c2ecf20Sopenharmony_ci
1628c2ecf20Sopenharmony_ci	.macro		__pmull_pre_p8
	// Loop-invariant setup for the 8-bit multiply path. Expects SHASH to
	// be loaded; clobbers x5 and T1.
	// SHASH2 = SHASH ^ (SHASH with its 64-bit halves swapped)
1638c2ecf20Sopenharmony_ci	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
1648c2ecf20Sopenharmony_ci	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci	// k00_16 := 0x0000000000000000_000000000000ffff
1678c2ecf20Sopenharmony_ci	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
1688c2ecf20Sopenharmony_ci	movi		k32_48.2d, #0xffffffff
1698c2ecf20Sopenharmony_ci	mov		k32_48.h[2], k32_48.h[0]
1708c2ecf20Sopenharmony_ci	ushr		k00_16.2d, k32_48.2d, #32
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	// prepare the permutation vectors
	// perm1 rotates each 8-byte half by one byte: the constant holds the
	// indices for the upper half, and XORing 8 into the lower eight bytes
	// turns them into the indices for the lower half. perm2, perm3 and T1
	// are perm1 rotated further, i.e. rotations by 2, 3 and 4 bytes.
1738c2ecf20Sopenharmony_ci	mov_q		x5, 0x080f0e0d0c0b0a09
1748c2ecf20Sopenharmony_ci	movi		T1.8b, #8
1758c2ecf20Sopenharmony_ci	dup		perm1.2d, x5
1768c2ecf20Sopenharmony_ci	eor		perm1.16b, perm1.16b, T1.16b
1778c2ecf20Sopenharmony_ci	ushr		perm2.2d, perm1.2d, #8
1788c2ecf20Sopenharmony_ci	ushr		perm3.2d, perm1.2d, #16
1798c2ecf20Sopenharmony_ci	ushr		T1.2d, perm1.2d, #24
1808c2ecf20Sopenharmony_ci	sli		perm2.2d, perm1.2d, #56
1818c2ecf20Sopenharmony_ci	sli		perm3.2d, perm1.2d, #48
1828c2ecf20Sopenharmony_ci	sli		T1.2d, perm1.2d, #40
1838c2ecf20Sopenharmony_ci
1848c2ecf20Sopenharmony_ci	// precompute loop invariants
	// sh1-sh4: SHASH with both halves rotated by 1..4 bytes
	// ss1-ss4: low 8 bytes of SHASH2 rotated by 1..4 bytes
1858c2ecf20Sopenharmony_ci	tbl		sh1.16b, {SHASH.16b}, perm1.16b
1868c2ecf20Sopenharmony_ci	tbl		sh2.16b, {SHASH.16b}, perm2.16b
1878c2ecf20Sopenharmony_ci	tbl		sh3.16b, {SHASH.16b}, perm3.16b
1888c2ecf20Sopenharmony_ci	tbl		sh4.16b, {SHASH.16b}, T1.16b
1898c2ecf20Sopenharmony_ci	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
1908c2ecf20Sopenharmony_ci	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
1918c2ecf20Sopenharmony_ci	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
1928c2ecf20Sopenharmony_ci	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
1938c2ecf20Sopenharmony_ci	.endm
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_ci	//
1968c2ecf20Sopenharmony_ci	// PMULL (64x64->128) based reduction for CPUs that can do
1978c2ecf20Sopenharmony_ci	// it in a single instruction.
1988c2ecf20Sopenharmony_ci	//
	// Inputs: XL, XM, XH (partial products), T1 (prepared by the caller
	// with ext T1, XL, XH, #8) and MASK. The result is left in XL and T2;
	// every caller in this file finishes with
	//   eor T2, T2, XH ; eor XL, XL, T2
	//
1998c2ecf20Sopenharmony_ci	.macro		__pmull_reduce_p64
2008c2ecf20Sopenharmony_ci	pmull		T2.1q, XL.1d, MASK.1d
2018c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, T1.16b
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	mov		XH.d[0], XM.d[1]
2048c2ecf20Sopenharmony_ci	mov		XM.d[1], XL.d[0]
2058c2ecf20Sopenharmony_ci
2068c2ecf20Sopenharmony_ci	eor		XL.16b, XM.16b, T2.16b
2078c2ecf20Sopenharmony_ci	ext		T2.16b, XL.16b, XL.16b, #8	// T2 = XL with its halves swapped
2088c2ecf20Sopenharmony_ci	pmull		XL.1q, XL.1d, MASK.1d
2098c2ecf20Sopenharmony_ci	.endm
2108c2ecf20Sopenharmony_ci
2118c2ecf20Sopenharmony_ci	//
2128c2ecf20Sopenharmony_ci	// Alternative reduction for CPUs that lack support for the
2138c2ecf20Sopenharmony_ci	// 64x64->128 PMULL instruction
2148c2ecf20Sopenharmony_ci	//
	// Same inputs and outputs as __pmull_reduce_p64 except that MASK is
	// not used: the multiplications are replaced by the shift/XOR
	// sequences below. Clobbers T1 and T2.
	//
2158c2ecf20Sopenharmony_ci	.macro		__pmull_reduce_p8
2168c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, T1.16b
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci	mov		XL.d[1], XM.d[0]
2198c2ecf20Sopenharmony_ci	mov		XH.d[0], XM.d[1]
2208c2ecf20Sopenharmony_ci
	// T2 = (XL << 57) ^ (XL << 62) ^ (XL << 63) ^ ext(XL, XH, #8)
2218c2ecf20Sopenharmony_ci	shl		T1.2d, XL.2d, #57
2228c2ecf20Sopenharmony_ci	shl		T2.2d, XL.2d, #62
2238c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, T1.16b
2248c2ecf20Sopenharmony_ci	shl		T1.2d, XL.2d, #63
2258c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, T1.16b
2268c2ecf20Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
2278c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, T1.16b
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_ci	mov		XL.d[1], T2.d[0]
2308c2ecf20Sopenharmony_ci	mov		XH.d[0], T2.d[1]
2318c2ecf20Sopenharmony_ci
	// the two ushr on T2 compose to a shift by 7; the final shift of XL
	// must stay after the eor that still reads the unshifted XL
2328c2ecf20Sopenharmony_ci	ushr		T2.2d, XL.2d, #1
2338c2ecf20Sopenharmony_ci	eor		XH.16b, XH.16b, XL.16b
2348c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
2358c2ecf20Sopenharmony_ci	ushr		T2.2d, T2.2d, #6
2368c2ecf20Sopenharmony_ci	ushr		XL.2d, XL.2d, #1
2378c2ecf20Sopenharmony_ci	.endm
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_ci	.macro		__pmull_ghash, pn
	// GHASH update loop shared by the p64 and p8 entry points.
	//   pn : p64 or p8, selects the pre/multiply/reduce macro family
	// Registers as used below:
	//   w0 = number of blocks left to read from x2
	//   x1 = digest, loaded into XL here and stored back at label 5
	//   x2 = source data, advanced as it is consumed
	//   x3 = key material (SHASH, followed by HH/HH3/HH4 for p64)
	//   x4 = optional head block, processed first when non-zero
2408c2ecf20Sopenharmony_ci	ld1		{SHASH.2d}, [x3]
2418c2ecf20Sopenharmony_ci	ld1		{XL.2d}, [x1]
2428c2ecf20Sopenharmony_ci
2438c2ecf20Sopenharmony_ci	__pmull_pre_\pn
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci	/* do the head block first, if supplied */
2468c2ecf20Sopenharmony_ci	cbz		x4, 0f
2478c2ecf20Sopenharmony_ci	ld1		{T1.2d}, [x4]
2488c2ecf20Sopenharmony_ci	mov		x4, xzr			// the head is consumed only once
2498c2ecf20Sopenharmony_ci	b		3f			// w0 is not decremented for the head
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci0:	.ifc		\pn, p64
2528c2ecf20Sopenharmony_ci	tbnz		w0, #0, 2f		// skip until #blocks is a
2538c2ecf20Sopenharmony_ci	tbnz		w0, #1, 2f		// round multiple of 4
2548c2ecf20Sopenharmony_ci
	/* 4-way loop (p64 only): aggregate four blocks, reduce once */
2558c2ecf20Sopenharmony_ci1:	ld1		{XM3.16b-TT4.16b}, [x2], #64
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_ci	sub		w0, w0, #4
2588c2ecf20Sopenharmony_ci
	/* T1, T2, TT3, TT4 = blocks 0..3 of this group, bytes reversed per 64-bit lane */
2598c2ecf20Sopenharmony_ci	rev64		T1.16b, XM3.16b
2608c2ecf20Sopenharmony_ci	rev64		T2.16b, XH3.16b
2618c2ecf20Sopenharmony_ci	rev64		TT4.16b, TT4.16b
2628c2ecf20Sopenharmony_ci	rev64		TT3.16b, TT3.16b
2638c2ecf20Sopenharmony_ci
2648c2ecf20Sopenharmony_ci	ext		IN1.16b, TT4.16b, TT4.16b, #8
2658c2ecf20Sopenharmony_ci	ext		XL3.16b, TT3.16b, TT3.16b, #8
2668c2ecf20Sopenharmony_ci
	/* block 3 times SHASH */
2678c2ecf20Sopenharmony_ci	eor		TT4.16b, TT4.16b, IN1.16b
2688c2ecf20Sopenharmony_ci	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
2698c2ecf20Sopenharmony_ci	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
2708c2ecf20Sopenharmony_ci	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
2718c2ecf20Sopenharmony_ci
	/* block 2 times HH */
2728c2ecf20Sopenharmony_ci	eor		TT3.16b, TT3.16b, XL3.16b
2738c2ecf20Sopenharmony_ci	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
2748c2ecf20Sopenharmony_ci	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
2758c2ecf20Sopenharmony_ci	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
2768c2ecf20Sopenharmony_ci
2778c2ecf20Sopenharmony_ci	ext		IN1.16b, T2.16b, T2.16b, #8
2788c2ecf20Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL3.16b
2798c2ecf20Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH3.16b
2808c2ecf20Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM3.16b
2818c2ecf20Sopenharmony_ci
	/* block 1 times HH3 */
2828c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, IN1.16b
2838c2ecf20Sopenharmony_ci	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
2848c2ecf20Sopenharmony_ci	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
2858c2ecf20Sopenharmony_ci	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
2868c2ecf20Sopenharmony_ci
2878c2ecf20Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL3.16b
2888c2ecf20Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH3.16b
2898c2ecf20Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM3.16b
2908c2ecf20Sopenharmony_ci
	/* (digest ^ block 0) times HH4 */
2918c2ecf20Sopenharmony_ci	ext		IN1.16b, T1.16b, T1.16b, #8
2928c2ecf20Sopenharmony_ci	ext		TT3.16b, XL.16b, XL.16b, #8
2938c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, IN1.16b
2948c2ecf20Sopenharmony_ci	eor		T1.16b, T1.16b, TT3.16b
2958c2ecf20Sopenharmony_ci
2968c2ecf20Sopenharmony_ci	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
2978c2ecf20Sopenharmony_ci	eor		T1.16b, T1.16b, XL.16b
2988c2ecf20Sopenharmony_ci	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
2998c2ecf20Sopenharmony_ci	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
3008c2ecf20Sopenharmony_ci
	/* add the accumulated products of blocks 1..3 */
3018c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, XL2.16b
3028c2ecf20Sopenharmony_ci	eor		XH.16b, XH.16b, XH2.16b
3038c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, XM2.16b
3048c2ecf20Sopenharmony_ci
3058c2ecf20Sopenharmony_ci	eor		T2.16b, XL.16b, XH.16b
3068c2ecf20Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
3078c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, T2.16b
3088c2ecf20Sopenharmony_ci
3098c2ecf20Sopenharmony_ci	__pmull_reduce_p64
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, XH.16b
3128c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
3138c2ecf20Sopenharmony_ci
3148c2ecf20Sopenharmony_ci	cbz		w0, 5f
3158c2ecf20Sopenharmony_ci	b		1b
3168c2ecf20Sopenharmony_ci	.endif
3178c2ecf20Sopenharmony_ci
	/* single block path: the only path for p8, the remainder path for p64 */
3188c2ecf20Sopenharmony_ci2:	ld1		{T1.2d}, [x2], #16
3198c2ecf20Sopenharmony_ci	sub		w0, w0, #1
3208c2ecf20Sopenharmony_ci
3218c2ecf20Sopenharmony_ci3:	/* multiply XL by SHASH in GF(2^128) */
3228c2ecf20Sopenharmony_ciCPU_LE(	rev64		T1.16b, T1.16b	)
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_ci	ext		T2.16b, XL.16b, XL.16b, #8
3258c2ecf20Sopenharmony_ci	ext		IN1.16b, T1.16b, T1.16b, #8
3268c2ecf20Sopenharmony_ci	eor		T1.16b, T1.16b, T2.16b
3278c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, IN1.16b
3288c2ecf20Sopenharmony_ci
	/* XH shares v7 with IN1, so IN1 must be consumed (eor above) before this */
3298c2ecf20Sopenharmony_ci	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
3308c2ecf20Sopenharmony_ci	eor		T1.16b, T1.16b, XL.16b
3318c2ecf20Sopenharmony_ci	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
3328c2ecf20Sopenharmony_ci	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
3338c2ecf20Sopenharmony_ci
3348c2ecf20Sopenharmony_ci4:	eor		T2.16b, XL.16b, XH.16b
3358c2ecf20Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
3368c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, T2.16b
3378c2ecf20Sopenharmony_ci
3388c2ecf20Sopenharmony_ci	__pmull_reduce_\pn
3398c2ecf20Sopenharmony_ci
3408c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, XH.16b
3418c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
3428c2ecf20Sopenharmony_ci
3438c2ecf20Sopenharmony_ci	cbnz		w0, 0b
3448c2ecf20Sopenharmony_ci
3458c2ecf20Sopenharmony_ci5:	st1		{XL.2d}, [x1]
3468c2ecf20Sopenharmony_ci	ret
3478c2ecf20Sopenharmony_ci	.endm
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci	/*
3508c2ecf20Sopenharmony_ci	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
3518c2ecf20Sopenharmony_ci	 *			   struct ghash_key const *k, const char *head)
	 *
	 * Prototype shared by pmull_ghash_update_p64 and pmull_ghash_update_p8
	 * below; the arguments arrive in x0-x4 in the order listed. The p64
	 * variant uses the 64x64 PMULL instruction and additionally reads the
	 * 48 bytes that follow the first 16 bytes of the key.
3528c2ecf20Sopenharmony_ci	 */
3538c2ecf20Sopenharmony_ciSYM_FUNC_START(pmull_ghash_update_p64)
3548c2ecf20Sopenharmony_ci	__pmull_ghash	p64
3558c2ecf20Sopenharmony_ciSYM_FUNC_END(pmull_ghash_update_p64)
3568c2ecf20Sopenharmony_ci
3578c2ecf20Sopenharmony_ciSYM_FUNC_START(pmull_ghash_update_p8)
	/* GHASH update built on the 8-bit polynomial multiply; one block per iteration */
3588c2ecf20Sopenharmony_ci	__pmull_ghash	p8
3598c2ecf20Sopenharmony_ciSYM_FUNC_END(pmull_ghash_update_p8)
3608c2ecf20Sopenharmony_ci
3618c2ecf20Sopenharmony_ci	KS0		.req	v8	// KS0-KS3: AES states of the four counter blocks;
3628c2ecf20Sopenharmony_ci	KS1		.req	v9	// same registers as XL2, XM2, XH2 and XL3
3638c2ecf20Sopenharmony_ci	KS2		.req	v10
3648c2ecf20Sopenharmony_ci	KS3		.req	v11
3658c2ecf20Sopenharmony_ci
3668c2ecf20Sopenharmony_ci	INP0		.req	v21	// INP0-INP3: the up to four data blocks of one iteration
3678c2ecf20Sopenharmony_ci	INP1		.req	v22
3688c2ecf20Sopenharmony_ci	INP2		.req	v23
3698c2ecf20Sopenharmony_ci	INP3		.req	v24
3708c2ecf20Sopenharmony_ci
3718c2ecf20Sopenharmony_ci	K0		.req	v25	// K0-K5: first six round keys, loaded once by load_round_keys
3728c2ecf20Sopenharmony_ci	K1		.req	v26
3738c2ecf20Sopenharmony_ci	K2		.req	v27
3748c2ecf20Sopenharmony_ci	K3		.req	v28
3758c2ecf20Sopenharmony_ci	K4		.req	v12	// same register as XM3
3768c2ecf20Sopenharmony_ci	K5		.req	v13	// same register as XH3
3778c2ecf20Sopenharmony_ci	K6		.req	v4	// same register as MASK
3788c2ecf20Sopenharmony_ci	K7		.req	v5	// same register as XM
3798c2ecf20Sopenharmony_ci	K8		.req	v14	// same register as TT3
3808c2ecf20Sopenharmony_ci	K9		.req	v15	// same register as TT4
3818c2ecf20Sopenharmony_ci	KK		.req	v29	// KK, KL, KM: the last three round keys
3828c2ecf20Sopenharmony_ci	KL		.req	v30
3838c2ecf20Sopenharmony_ci	KM		.req	v31
3848c2ecf20Sopenharmony_ci
3858c2ecf20Sopenharmony_ci	.macro		load_round_keys, rounds, rk, tmp
	// Load the round keys that stay resident for the whole call.
	//   rounds : register holding the number of AES rounds
	//   rk     : register pointing at the first round key
	//   tmp    : scratch general purpose register, clobbered
	// K0-K3 come from rk, K4-K5 from rk + 64, and KK/KL/KM are the three
	// 16-byte keys starting at rk + 16 * rounds - 32.
3868c2ecf20Sopenharmony_ci	add		\tmp, \rk, #64
3878c2ecf20Sopenharmony_ci	ld1		{K0.4s-K3.4s}, [\rk]
3888c2ecf20Sopenharmony_ci	ld1		{K4.4s-K5.4s}, [\tmp]
3898c2ecf20Sopenharmony_ci	add		\tmp, \rk, \rounds, lsl #4
3908c2ecf20Sopenharmony_ci	sub		\tmp, \tmp, #32
3918c2ecf20Sopenharmony_ci	ld1		{KK.4s-KM.4s}, [\tmp]
3928c2ecf20Sopenharmony_ci	.endm
3938c2ecf20Sopenharmony_ci
3948c2ecf20Sopenharmony_ci	.macro		enc_round, state, key
	// One full AES encryption round on state: aese with key, then aesmc
3958c2ecf20Sopenharmony_ci	aese		\state\().16b, \key\().16b
3968c2ecf20Sopenharmony_ci	aesmc		\state\().16b, \state\().16b
3978c2ecf20Sopenharmony_ci	.endm
3988c2ecf20Sopenharmony_ci
3998c2ecf20Sopenharmony_ci	.macro		enc_qround, s0, s1, s2, s3, key
	// Apply enc_round with the same round key to four independent states
4008c2ecf20Sopenharmony_ci	enc_round	\s0, \key
4018c2ecf20Sopenharmony_ci	enc_round	\s1, \key
4028c2ecf20Sopenharmony_ci	enc_round	\s2, \key
4038c2ecf20Sopenharmony_ci	enc_round	\s3, \key
4048c2ecf20Sopenharmony_ci	.endm
4058c2ecf20Sopenharmony_ci
4068c2ecf20Sopenharmony_ci	.macro		enc_block, state, rounds, rk, tmp
	// AES-encrypt the single block in state.
	//   rounds : register holding the round count; only bits 2 and 1 are
	//            tested (bit 2 clear takes the short fall-through path)
	//   rk     : register pointing at the round keys
	//   tmp    : scratch general purpose register, clobbered
	// Relies on K0-K5 and KK/KL/KM as loaded by load_round_keys. K6/K7
	// are (re)loaded here; the longer paths also load K8/K9.
4078c2ecf20Sopenharmony_ci	add		\tmp, \rk, #96
4088c2ecf20Sopenharmony_ci	ld1		{K6.4s-K7.4s}, [\tmp], #32
	// NOTE(review): K4 and K5 below are separated by whitespace only,
	// unlike the comma separated list in pmull_gcm_enc_4x; presumably the
	// assembler accepts both forms - confirm before touching.
4098c2ecf20Sopenharmony_ci	.irp		key, K0, K1, K2, K3, K4 K5
4108c2ecf20Sopenharmony_ci	enc_round	\state, \key
4118c2ecf20Sopenharmony_ci	.endr
4128c2ecf20Sopenharmony_ci
4138c2ecf20Sopenharmony_ci	tbnz		\rounds, #2, .Lnot128_\@
4148c2ecf20Sopenharmony_ci.Lout256_\@:
4158c2ecf20Sopenharmony_ci	enc_round	\state, K6
4168c2ecf20Sopenharmony_ci	enc_round	\state, K7
4178c2ecf20Sopenharmony_ci
4188c2ecf20Sopenharmony_ci.Lout192_\@:
	// last rounds: the final aese is not followed by aesmc and the last
	// round key is applied with a plain eor
4198c2ecf20Sopenharmony_ci	enc_round	\state, KK
4208c2ecf20Sopenharmony_ci	aese		\state\().16b, KL.16b
4218c2ecf20Sopenharmony_ci	eor		\state\().16b, \state\().16b, KM.16b
4228c2ecf20Sopenharmony_ci
	// out-of-line path for the longer key sizes, kept in subsection 1 so
	// the short path above falls straight through
4238c2ecf20Sopenharmony_ci	.subsection	1
4248c2ecf20Sopenharmony_ci.Lnot128_\@:
4258c2ecf20Sopenharmony_ci	ld1		{K8.4s-K9.4s}, [\tmp], #32
4268c2ecf20Sopenharmony_ci	enc_round	\state, K6
4278c2ecf20Sopenharmony_ci	enc_round	\state, K7
4288c2ecf20Sopenharmony_ci	ld1		{K6.4s-K7.4s}, [\tmp]	// K6/K7 now hold the two keys after K9
4298c2ecf20Sopenharmony_ci	enc_round	\state, K8
4308c2ecf20Sopenharmony_ci	enc_round	\state, K9
4318c2ecf20Sopenharmony_ci	tbz		\rounds, #1, .Lout192_\@
4328c2ecf20Sopenharmony_ci	b		.Lout256_\@
4338c2ecf20Sopenharmony_ci	.previous
4348c2ecf20Sopenharmony_ci	.endm
4358c2ecf20Sopenharmony_ci
4368c2ecf20Sopenharmony_ci	.align		6
4378c2ecf20Sopenharmony_ci	.macro		pmull_gcm_do_crypt, enc
	// Body shared by pmull_gcm_encrypt (enc == 1) and pmull_gcm_decrypt
	// (enc == 0). Registers as used below:
	//   x0 = number of bytes left (goes negative in the last iteration)
	//   x1 = destination, x2 = source
	//   x3 = GHASH key material, x4 = digest (loaded into XL)
	//   x5 = counter block, x6 = AES round keys, x7 = number of rounds
	//   first stack argument = pointer that holds lengths[] on entry and
	//        receives the tag; when it is 0 the counter and digest are
	//        written back instead
	//   w8 = low 32 bits of the counter in CPU byte order
	//   w9 = number of blocks handled in the current iteration (1..4)
	// x19 is callee-saved and used as scratch, hence the 32-byte frame.
4388c2ecf20Sopenharmony_ci	stp		x29, x30, [sp, #-32]!
4398c2ecf20Sopenharmony_ci	mov		x29, sp
4408c2ecf20Sopenharmony_ci	str		x19, [sp, #24]
4418c2ecf20Sopenharmony_ci
4428c2ecf20Sopenharmony_ci	load_round_keys	x7, x6, x8
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci	ld1		{SHASH.2d}, [x3], #16
4458c2ecf20Sopenharmony_ci	ld1		{HH.2d-HH4.2d}, [x3]
4468c2ecf20Sopenharmony_ci
	// same SHASH2 / HH34 setup as in __pmull_pre_p64
4478c2ecf20Sopenharmony_ci	trn1		SHASH2.2d, SHASH.2d, HH.2d
4488c2ecf20Sopenharmony_ci	trn2		T1.2d, SHASH.2d, HH.2d
4498c2ecf20Sopenharmony_ci	eor		SHASH2.16b, SHASH2.16b, T1.16b
4508c2ecf20Sopenharmony_ci
4518c2ecf20Sopenharmony_ci	trn1		HH34.2d, HH3.2d, HH4.2d
4528c2ecf20Sopenharmony_ci	trn2		T1.2d, HH3.2d, HH4.2d
4538c2ecf20Sopenharmony_ci	eor		HH34.16b, HH34.16b, T1.16b
4548c2ecf20Sopenharmony_ci
4558c2ecf20Sopenharmony_ci	ld1		{XL.2d}, [x4]
4568c2ecf20Sopenharmony_ci
4578c2ecf20Sopenharmony_ci	cbz		x0, 3f				// tag only?
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	ldr		w8, [x5, #12]			// load lower counter
4608c2ecf20Sopenharmony_ciCPU_LE(	rev		w8, w8		)
4618c2ecf20Sopenharmony_ci
4628c2ecf20Sopenharmony_ci0:	mov		w9, #4				// max blocks per round
4638c2ecf20Sopenharmony_ci	add		x10, x0, #0xf
4648c2ecf20Sopenharmony_ci	lsr		x10, x10, #4			// remaining blocks
4658c2ecf20Sopenharmony_ci
	// the flags set here are also consumed by the bne 0b at the bottom
	// of the loop, so nothing in between may change them
4668c2ecf20Sopenharmony_ci	subs		x0, x0, #64
4678c2ecf20Sopenharmony_ci	csel		w9, w10, w9, mi
4688c2ecf20Sopenharmony_ci	add		w8, w8, w9
4698c2ecf20Sopenharmony_ci
4708c2ecf20Sopenharmony_ci	bmi		1f
4718c2ecf20Sopenharmony_ci	ld1		{INP0.16b-INP3.16b}, [x2], #64
4728c2ecf20Sopenharmony_ci	.subsection	1
4738c2ecf20Sopenharmony_ci	/*
4748c2ecf20Sopenharmony_ci	 * Populate the four input registers right to left with up to 63 bytes
4758c2ecf20Sopenharmony_ci	 * of data, using overlapping loads to avoid branches.
4768c2ecf20Sopenharmony_ci	 *
4778c2ecf20Sopenharmony_ci	 *                INP0     INP1     INP2     INP3
4788c2ecf20Sopenharmony_ci	 *  1 byte     |        |        |        |x       |
4798c2ecf20Sopenharmony_ci	 * 16 bytes    |        |        |        |xxxxxxxx|
4808c2ecf20Sopenharmony_ci	 * 17 bytes    |        |        |xxxxxxxx|x       |
4818c2ecf20Sopenharmony_ci	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
4828c2ecf20Sopenharmony_ci	 * etc etc
4838c2ecf20Sopenharmony_ci	 *
4848c2ecf20Sopenharmony_ci	 * Note that this code may read up to 15 bytes before the start of
4858c2ecf20Sopenharmony_ci	 * the input. It is up to the calling code to ensure this is safe if
4868c2ecf20Sopenharmony_ci	 * this happens in the first iteration of the loop (i.e., when the
4878c2ecf20Sopenharmony_ci	 * input size is < 16 bytes)
4888c2ecf20Sopenharmony_ci	 */
4898c2ecf20Sopenharmony_ci1:	mov		x15, #16
4908c2ecf20Sopenharmony_ci	ands		x19, x0, #0xf
4918c2ecf20Sopenharmony_ci	csel		x19, x19, x15, ne		// x19 = bytes in the last block (1..16)
4928c2ecf20Sopenharmony_ci	adr_l		x17, .Lpermute_table + 16
4938c2ecf20Sopenharmony_ci
4948c2ecf20Sopenharmony_ci	sub		x11, x15, x19			// x11 = 16 - x19
4958c2ecf20Sopenharmony_ci	add		x12, x17, x11
4968c2ecf20Sopenharmony_ci	sub		x17, x17, x11			// kept for the store path at label 6
4978c2ecf20Sopenharmony_ci	ld1		{T1.16b}, [x12]			// permute vector for the last input block
4988c2ecf20Sopenharmony_ci	sub		x10, x1, x11			// dst and src moved back by x11, used
4998c2ecf20Sopenharmony_ci	sub		x11, x2, x11			// only in the single block case below
5008c2ecf20Sopenharmony_ci
	// x14/x15/x16 = post-increments after INP0/INP1/INP2; a zero stride
	// makes the following register load from the same address again
5018c2ecf20Sopenharmony_ci	cmp		x0, #-16
5028c2ecf20Sopenharmony_ci	csel		x14, x15, xzr, gt
5038c2ecf20Sopenharmony_ci	cmp		x0, #-32
5048c2ecf20Sopenharmony_ci	csel		x15, x15, xzr, gt
5058c2ecf20Sopenharmony_ci	cmp		x0, #-48
5068c2ecf20Sopenharmony_ci	csel		x16, x19, xzr, gt
5078c2ecf20Sopenharmony_ci	csel		x1, x1, x10, gt
5088c2ecf20Sopenharmony_ci	csel		x2, x2, x11, gt
5098c2ecf20Sopenharmony_ci
5108c2ecf20Sopenharmony_ci	ld1		{INP0.16b}, [x2], x14
5118c2ecf20Sopenharmony_ci	ld1		{INP1.16b}, [x2], x15
5128c2ecf20Sopenharmony_ci	ld1		{INP2.16b}, [x2], x16
5138c2ecf20Sopenharmony_ci	ld1		{INP3.16b}, [x2]
5148c2ecf20Sopenharmony_ci	tbl		INP3.16b, {INP3.16b}, T1.16b
5158c2ecf20Sopenharmony_ci	b		2f
5168c2ecf20Sopenharmony_ci	.previous
5178c2ecf20Sopenharmony_ci
	// decryption hashes the input before it is transformed, encryption
	// hashes the output afterwards
5188c2ecf20Sopenharmony_ci2:	.if		\enc == 0
5198c2ecf20Sopenharmony_ci	bl		pmull_gcm_ghash_4x
5208c2ecf20Sopenharmony_ci	.endif
5218c2ecf20Sopenharmony_ci
5228c2ecf20Sopenharmony_ci	bl		pmull_gcm_enc_4x
5238c2ecf20Sopenharmony_ci
5248c2ecf20Sopenharmony_ci	tbnz		x0, #63, 6f			// x0 negative: partial final round
5258c2ecf20Sopenharmony_ci	st1		{INP0.16b-INP3.16b}, [x1], #64
5268c2ecf20Sopenharmony_ci	.if		\enc == 1
5278c2ecf20Sopenharmony_ci	bl		pmull_gcm_ghash_4x
5288c2ecf20Sopenharmony_ci	.endif
5298c2ecf20Sopenharmony_ci	bne		0b
5308c2ecf20Sopenharmony_ci
	// reload x19 and fetch the first stack argument of the caller, which
	// sits just above the 32-byte frame
5318c2ecf20Sopenharmony_ci3:	ldp		x19, x10, [sp, #24]
5328c2ecf20Sopenharmony_ci	cbz		x10, 5f				// output tag?
5338c2ecf20Sopenharmony_ci
5348c2ecf20Sopenharmony_ci	ld1		{INP3.16b}, [x10]		// load lengths[]
5358c2ecf20Sopenharmony_ci	mov		w9, #1
5368c2ecf20Sopenharmony_ci	bl		pmull_gcm_ghash_4x
5378c2ecf20Sopenharmony_ci
5388c2ecf20Sopenharmony_ci	mov		w11, #(0x1 << 24)		// BE '1U'
5398c2ecf20Sopenharmony_ci	ld1		{KS0.16b}, [x5]
5408c2ecf20Sopenharmony_ci	mov		KS0.s[3], w11
5418c2ecf20Sopenharmony_ci
5428c2ecf20Sopenharmony_ci	enc_block	KS0, x7, x6, x12
5438c2ecf20Sopenharmony_ci
	// ext #8 followed by rev64 reverses all 16 bytes of the digest
5448c2ecf20Sopenharmony_ci	ext		XL.16b, XL.16b, XL.16b, #8
5458c2ecf20Sopenharmony_ci	rev64		XL.16b, XL.16b
5468c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, KS0.16b
5478c2ecf20Sopenharmony_ci	st1		{XL.16b}, [x10]			// store tag
5488c2ecf20Sopenharmony_ci
5498c2ecf20Sopenharmony_ci4:	ldp		x29, x30, [sp], #32
5508c2ecf20Sopenharmony_ci	ret
5518c2ecf20Sopenharmony_ci
	// no tag requested: write back the counter and digest for a later call
5528c2ecf20Sopenharmony_ci5:
5538c2ecf20Sopenharmony_ciCPU_LE(	rev		w8, w8		)
5548c2ecf20Sopenharmony_ci	str		w8, [x5, #12]			// store lower counter
5558c2ecf20Sopenharmony_ci	st1		{XL.2d}, [x4]
5568c2ecf20Sopenharmony_ci	b		4b
5578c2ecf20Sopenharmony_ci
	// partial final round: store the blocks with the same strides that
	// were used for loading
5588c2ecf20Sopenharmony_ci6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
5598c2ecf20Sopenharmony_ci	sub		x17, x17, x19, lsl #1
5608c2ecf20Sopenharmony_ci
5618c2ecf20Sopenharmony_ci	cmp		w9, #1
5628c2ecf20Sopenharmony_ci	beq		7f
5638c2ecf20Sopenharmony_ci	.subsection	1
	// single block: merge the result into the 16 bytes already present at
	// the moved-back destination, so a full vector store can be used
5648c2ecf20Sopenharmony_ci7:	ld1		{INP2.16b}, [x1]
5658c2ecf20Sopenharmony_ci	tbx		INP2.16b, {INP3.16b}, T1.16b
5668c2ecf20Sopenharmony_ci	mov		INP3.16b, INP2.16b
5678c2ecf20Sopenharmony_ci	b		8f
5688c2ecf20Sopenharmony_ci	.previous
5698c2ecf20Sopenharmony_ci
5708c2ecf20Sopenharmony_ci	st1		{INP0.16b}, [x1], x14
5718c2ecf20Sopenharmony_ci	st1		{INP1.16b}, [x1], x15
5728c2ecf20Sopenharmony_ci	st1		{INP2.16b}, [x1], x16
5738c2ecf20Sopenharmony_ci	tbl		INP3.16b, {INP3.16b}, T1.16b
5748c2ecf20Sopenharmony_ci	tbx		INP3.16b, {INP2.16b}, T2.16b
5758c2ecf20Sopenharmony_ci8:	st1		{INP3.16b}, [x1]
5768c2ecf20Sopenharmony_ci
5778c2ecf20Sopenharmony_ci	.if		\enc == 1
5788c2ecf20Sopenharmony_ci	ld1		{T1.16b}, [x17]
5798c2ecf20Sopenharmony_ci	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
5808c2ecf20Sopenharmony_ci	bl		pmull_gcm_ghash_4x
5818c2ecf20Sopenharmony_ci	.endif
5828c2ecf20Sopenharmony_ci	b		3b
5838c2ecf20Sopenharmony_ci	.endm
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci	/*
5868c2ecf20Sopenharmony_ci	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
5878c2ecf20Sopenharmony_ci	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
5888c2ecf20Sopenharmony_ci	 *			  u32 const rk[], int rounds, u8 tag[])
	 *
	 * The first argument is consumed as a byte count, x6 carries the AES
	 * round keys and x7 the round count. tag is the ninth argument and is
	 * therefore read from the stack; it holds lengths[] on entry and may
	 * be NULL, in which case no tag is produced and ctr/dg are updated.
5898c2ecf20Sopenharmony_ci	 */
5908c2ecf20Sopenharmony_ciSYM_FUNC_START(pmull_gcm_encrypt)
5918c2ecf20Sopenharmony_ci	pmull_gcm_do_crypt	1
5928c2ecf20Sopenharmony_ciSYM_FUNC_END(pmull_gcm_encrypt)
5938c2ecf20Sopenharmony_ci
5948c2ecf20Sopenharmony_ci	/*
5958c2ecf20Sopenharmony_ci	 * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
5968c2ecf20Sopenharmony_ci	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
5978c2ecf20Sopenharmony_ci	 *			  u32 const rk[], int rounds, u8 tag[])
	 *
	 * Same argument layout as pmull_gcm_encrypt: byte count in the first
	 * argument, round keys in x6, round count in x7, tag pointer on the
	 * stack. The input is hashed before it is decrypted.
5988c2ecf20Sopenharmony_ci	 */
5998c2ecf20Sopenharmony_ciSYM_FUNC_START(pmull_gcm_decrypt)
6008c2ecf20Sopenharmony_ci	pmull_gcm_do_crypt	0
6018c2ecf20Sopenharmony_ciSYM_FUNC_END(pmull_gcm_decrypt)
6028c2ecf20Sopenharmony_ci
6038c2ecf20Sopenharmony_ciSYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	// Fold up to four blocks held in INP0-INP3 into the digest in XL.
	//   w9 = number of valid blocks (1..4). With fewer than four, the
	//        blocks sit in the highest numbered INP registers and the
	//        multiply chain is entered at .Lgh3, .Lgh2 or .Lgh1.
	// Uses SHASH, SHASH2, HH, HH3, HH4 and HH34 as set up by the caller.
	// MASK shares v4 with K6, presumably the reason it is rebuilt here on
	// every call.
6048c2ecf20Sopenharmony_ci	movi		MASK.16b, #0xe1
6058c2ecf20Sopenharmony_ci	shl		MASK.2d, MASK.2d, #57
6068c2ecf20Sopenharmony_ci
6078c2ecf20Sopenharmony_ci	rev64		T1.16b, INP0.16b
6088c2ecf20Sopenharmony_ci	rev64		T2.16b, INP1.16b
6098c2ecf20Sopenharmony_ci	rev64		TT3.16b, INP2.16b
6108c2ecf20Sopenharmony_ci	rev64		TT4.16b, INP3.16b
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci	ext		XL.16b, XL.16b, XL.16b, #8	// swap the halves of the digest
6138c2ecf20Sopenharmony_ci
6148c2ecf20Sopenharmony_ci	tbz		w9, #2, 0f			// <4 blocks?
6158c2ecf20Sopenharmony_ci	.subsection	1
	// fewer than four blocks: start from zeroed accumulators and fold the
	// digest into the first block that is present
6168c2ecf20Sopenharmony_ci0:	movi		XH2.16b, #0
6178c2ecf20Sopenharmony_ci	movi		XM2.16b, #0
6188c2ecf20Sopenharmony_ci	movi		XL2.16b, #0
6198c2ecf20Sopenharmony_ci
6208c2ecf20Sopenharmony_ci	tbz		w9, #0, 1f			// 2 blocks?
6218c2ecf20Sopenharmony_ci	tbz		w9, #1, 2f			// 1 block?
6228c2ecf20Sopenharmony_ci
	// three blocks
6238c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, XL.16b
6248c2ecf20Sopenharmony_ci	ext		T1.16b, T2.16b, T2.16b, #8
6258c2ecf20Sopenharmony_ci	b		.Lgh3
6268c2ecf20Sopenharmony_ci
6278c2ecf20Sopenharmony_ci1:	eor		TT3.16b, TT3.16b, XL.16b
6288c2ecf20Sopenharmony_ci	ext		T2.16b, TT3.16b, TT3.16b, #8
6298c2ecf20Sopenharmony_ci	b		.Lgh2
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci2:	eor		TT4.16b, TT4.16b, XL.16b
6328c2ecf20Sopenharmony_ci	ext		IN1.16b, TT4.16b, TT4.16b, #8
6338c2ecf20Sopenharmony_ci	b		.Lgh1
6348c2ecf20Sopenharmony_ci	.previous
6358c2ecf20Sopenharmony_ci
	// four blocks: (digest ^ block 0) times HH4
6368c2ecf20Sopenharmony_ci	eor		T1.16b, T1.16b, XL.16b
6378c2ecf20Sopenharmony_ci	ext		IN1.16b, T1.16b, T1.16b, #8
6388c2ecf20Sopenharmony_ci
6398c2ecf20Sopenharmony_ci	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
6408c2ecf20Sopenharmony_ci	eor		T1.16b, T1.16b, IN1.16b
6418c2ecf20Sopenharmony_ci	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
6428c2ecf20Sopenharmony_ci	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
6438c2ecf20Sopenharmony_ci
	// next block times HH3
6448c2ecf20Sopenharmony_ci	ext		T1.16b, T2.16b, T2.16b, #8
6458c2ecf20Sopenharmony_ci.Lgh3:	eor		T2.16b, T2.16b, T1.16b
6468c2ecf20Sopenharmony_ci	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
6478c2ecf20Sopenharmony_ci	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
6488c2ecf20Sopenharmony_ci	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH.16b
6518c2ecf20Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL.16b
6528c2ecf20Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM.16b
6538c2ecf20Sopenharmony_ci
	// next block times HH
6548c2ecf20Sopenharmony_ci	ext		T2.16b, TT3.16b, TT3.16b, #8
6558c2ecf20Sopenharmony_ci.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
6568c2ecf20Sopenharmony_ci	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
6578c2ecf20Sopenharmony_ci	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
6588c2ecf20Sopenharmony_ci	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
6598c2ecf20Sopenharmony_ci
6608c2ecf20Sopenharmony_ci	eor		XH2.16b, XH2.16b, XH.16b
6618c2ecf20Sopenharmony_ci	eor		XL2.16b, XL2.16b, XL.16b
6628c2ecf20Sopenharmony_ci	eor		XM2.16b, XM2.16b, XM.16b
6638c2ecf20Sopenharmony_ci
	// last block times SHASH. IN1 and XH are the same register, so the
	// pmull that writes XL must read IN1 before pmull2 overwrites it
6648c2ecf20Sopenharmony_ci	ext		IN1.16b, TT4.16b, TT4.16b, #8
6658c2ecf20Sopenharmony_ci.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
6668c2ecf20Sopenharmony_ci	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
6678c2ecf20Sopenharmony_ci	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
6688c2ecf20Sopenharmony_ci	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
6698c2ecf20Sopenharmony_ci
6708c2ecf20Sopenharmony_ci	eor		XH.16b, XH.16b, XH2.16b
6718c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, XL2.16b
6728c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, XM2.16b
6738c2ecf20Sopenharmony_ci
6748c2ecf20Sopenharmony_ci	eor		T2.16b, XL.16b, XH.16b
6758c2ecf20Sopenharmony_ci	ext		T1.16b, XL.16b, XH.16b, #8
6768c2ecf20Sopenharmony_ci	eor		XM.16b, XM.16b, T2.16b
6778c2ecf20Sopenharmony_ci
6788c2ecf20Sopenharmony_ci	__pmull_reduce_p64
6798c2ecf20Sopenharmony_ci
6808c2ecf20Sopenharmony_ci	eor		T2.16b, T2.16b, XH.16b
6818c2ecf20Sopenharmony_ci	eor		XL.16b, XL.16b, T2.16b
6828c2ecf20Sopenharmony_ci
6838c2ecf20Sopenharmony_ci	ret
6848c2ecf20Sopenharmony_ciSYM_FUNC_END(pmull_gcm_ghash_4x)
6858c2ecf20Sopenharmony_ci
/*
 * pmull_gcm_enc_4x - produce four blocks of AES keystream and XOR them
 * into INP0..INP3 in place.
 *
 * Inputs, as read by the code below:
 *   x5  - address of a 16-byte counter block; 32-bit lane 3 is replaced
 *         per block, the remaining lanes are used as loaded
 *   x6  - base address of the round keys; keys from byte offset 96
 *         upwards are (re)loaded here
 *   x7  - bits 2 and 1 select the code path (presumably the AES round
 *         count, 10/12/14 - TODO confirm against the caller)
 *   w8  - counter value; the four blocks use w8 - 4 .. w8 - 1
 *         (presumably the caller has already advanced it by four -
 *         TODO confirm)
 *   K0-K5, KK, KL, KM - used but not loaded here, so they must be live
 *         on entry
 *
 * Clobbers: x10, w11-w13, KS0-KS3, K6-K7, plus K8-K9 on the .Lnot128 path.
 */
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// counter block template

	// Per-block counter values, byte-reversed before insertion.
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	.irp		ctr, w10, w11, w12, w13
	rev		\ctr, \ctr
	.endr

	// Replicate the template, then patch lane 3 of every copy.
	.irp		ks, KS1, KS2, KS3
	mov		\ks\().16b, KS0.16b
	.endr
	ins		KS0.s[3], w10
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// x10 = round keys + 96
	ld1		{K6.4s-K7.4s}, [x10], #32	// x10 advances to offset 128
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	// Bit 2 of x7 clear: fall through to .Lout256 with the K6/K7 loaded
	// above. Bit 2 set: detour through the out-of-line block first.
	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32	// keys at offsets 128 and 144
	enc_qround	KS0, KS1, KS2, KS3, K6
	enc_qround	KS0, KS1, KS2, KS3, K7
	ld1		{K6.4s-K7.4s}, [x10]		// K6/K7 reused for offsets 160, 176
	enc_qround	KS0, KS1, KS2, KS3, K8
	enc_qround	KS0, KS1, KS2, KS3, K9
	tbz		x7, #1, .Lout192		// bit 1 clear: skip the K6/K7 rounds
	b		.Lout256
	.previous

.Lout256:
	enc_qround	KS0, KS1, KS2, KS3, K6
	enc_qround	KS0, KS1, KS2, KS3, K7

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	// Last round: aese with KL (no aesmc), then XOR with KM.
	.irp		ks, KS0, KS1, KS2, KS3
	aese		\ks\().16b, KL.16b
	.endr

	.irp		ks, KS0, KS1, KS2, KS3
	eor		\ks\().16b, \ks\().16b, KM.16b
	.endr

	// Apply the keystream to the four data blocks in place.
	.irp		blk, 0, 1, 2, 3
	eor		INP\blk\().16b, INP\blk\().16b, KS\blk\().16b
	.endr

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)
7508c2ecf20Sopenharmony_ci
7518c2ecf20Sopenharmony_ci	.section	".rodata", "a"
7528c2ecf20Sopenharmony_ci	.align		6
7538c2ecf20Sopenharmony_ci.Lpermute_table:
7548c2ecf20Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
7558c2ecf20Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
7568c2ecf20Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
7578c2ecf20Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
7588c2ecf20Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
7598c2ecf20Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
7608c2ecf20Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
7618c2ecf20Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
7628c2ecf20Sopenharmony_ci	.previous
763