/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in RFC 8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
1062306a36Sopenharmony_ci
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"
1562306a36Sopenharmony_ci
.arch	armv8-a+crypto

/*
 * Define an assembler symbol .Lv<N>.4s = N for every vector register that
 * is passed to the sm4e macro below, so that the macro can turn its textual
 * operand (e.g. "v24.4s") into a register number.
 */
.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd.4s, vn.4s
 *
 * Emit the instruction word 0xcec08400 with the register number of \vn
 * placed at bit 5 and the register number of \vd at bit 0.
 *
 * Only the registers listed in the .irp above (v0-v3, v24-v31) can be used
 * as operands: for any other register the .Lv<N>.4s symbol is never set.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
2562306a36Sopenharmony_ci
/* Register macros */

/* Used for both encryption and decryption */
#define	RHASH	v21	/* running GHASH value (kept bit-reversed, see rbit) */
#define	RRCONST	v22	/* reduction constant, ld1r'ed from .Lghash_rconst */
#define RZERO	v23	/* must be all-zero while the helper macros run */
3262306a36Sopenharmony_ci
/* Helper macros. */

/*
 * PMUL_128x128 - 128 x 128 -> 256 bit carry-less multiplication
 *
 * input:    m0, m1 (both are only read)
 * output:   r0:r1 (low 128-bits in r0, high in r1)
 * clobbers: T0, T1
 *
 * Four 64 x 64 pmull products are formed (lo*lo, lo*hi, hi*lo, hi*hi); the
 * two cross products are XORed together and then XORed into r0:r1 at a
 * 64-bit offset, using RZERO to supply the zero lanes of the shift.
 *
 * RZERO must be all-zero. r0, T0 and T1 must not alias m0 or m1; r1 is
 * written by the last instruction that reads m0/m1 and may alias them.
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r1.16b, r1.16b, T0.16b;
5062306a36Sopenharmony_ci
/*
 * PMUL_128x128_4x - four independent PMUL_128x128 operations, with the
 * instruction streams interleaved step by step.
 *
 * input:    (m0, m1), (m2, m3), (m4, m5), (m6, m7)
 * output:   r0:r1, r2:r3, r4:r5, r6:r7 (low 128-bits in the even register,
 *           high in the odd one); the four products are NOT summed here
 * clobbers: T0-T7
 *
 * RZERO must be all-zero. As in PMUL_128x128, each odd (high) result
 * register is written by the last instruction that reads its inputs, so
 * it may be the same register as the corresponding m0/m2/m4/m6 input.
 */
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,			\
			r2, r3, m2, m3, T2, T3,			\
			r4, r5, m4, m5, T4, T5,			\
			r6, r7, m6, m7, T6, T7)			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
		ext		T6.16b, m7.16b, m7.16b, #8;	\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
		pmull		r6.1q, m6.1d, m7.1d;		\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
		pmull		T7.1q, m6.1d, T6.1d;		\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
		pmull2		T6.1q, m6.2d, T6.2d;		\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
		pmull2		r7.1q, m6.2d, m7.2d;		\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
		eor		T6.16b, T6.16b, T7.16b;		\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
		ext		T7.16b, RZERO.16b, T6.16b, #8;	\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
		ext		T6.16b, T6.16b, RZERO.16b, #8;	\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
		eor		r6.16b, r6.16b, T7.16b;		\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
		eor		r7.16b, r7.16b, T6.16b;
9562306a36Sopenharmony_ci
/*
 * REDUCTION - fold a 256-bit carry-less product down to 128 bits
 *
 * input:    r0:r1 (low 128-bits in r0, high in r1), rconst
 * output:   a
 * clobbers: r0, r1 (both are modified in place), T0, T1
 *
 * Step 1: the upper 64 bits of r1 are multiplied by rconst and the 128-bit
 *         product is XORed into r0:r1 at a 64-bit offset.
 * Step 2: the (updated) lower 64 bits of r1 are multiplied by rconst and
 *         XORed with r0 to give the result.
 *
 * RZERO must be all-zero.
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
		pmull2		T0.1q, r1.2d, rconst.2d;	\
		ext		T1.16b, T0.16b, RZERO.16b, #8;	\
		ext		T0.16b, RZERO.16b, T0.16b, #8;	\
		eor		r1.16b, r1.16b, T1.16b;		\
		eor		r0.16b, r0.16b, T0.16b;		\
		pmull		T0.1q, r1.1d, rconst.1d;	\
		eor		a.16b, r0.16b, T0.16b;
10862306a36Sopenharmony_ci
/*
 * SM4_CRYPT_PMUL_128x128_BLK - process one block through the sm4e sequence
 * while computing one PMUL_128x128, with the two instruction streams
 * interleaved (cipher steps at one tab, multiply steps at two tabs).
 *
 * cipher:   b0 is transformed in place: rev32, eight sm4e steps using
 *           v24..v31 as the second operand, then rev64 / ext #8 / rev32.
 *           b0 must be one of the registers known to the sm4e macro.
 * multiply: input m0, m1; output r0:r1 (low 128-bits in r0, high in r1)
 * clobbers: T0, T1
 *
 * v24-v31 must already hold the key material and RZERO must be all-zero.
 */
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
	rev32			b0.16b, b0.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
	sm4e			b0.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
	sm4e			b0.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
	sm4e			b0.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
	sm4e			b0.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
	sm4e			b0.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
	rev64			b0.4s, b0.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	rev32			b0.16b, b0.16b;
13262306a36Sopenharmony_ci
/*
 * SM4_CRYPT_PMUL_128x128_BLK3 - three-block version of
 * SM4_CRYPT_PMUL_128x128_BLK.
 *
 * cipher:   b0, b1, b2 are each transformed in place by the same
 *           rev32 / 8 x sm4e (v24..v31) / rev64 / ext #8 / rev32 sequence.
 * multiply: three products (m0 * m1), (m2 * m3), (m4 * m5) are computed
 *           and then XORed together, so on exit
 *             r0 = low  128 bits of the sum of the three products
 *             r1 = high 128 bits of the sum of the three products
 * clobbers: r2, r3, r4, r5 (hold the individual products), T0-T5
 *
 * v24-v31 must already hold the key material and RZERO must be all-zero.
 */
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,			\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		ext		T0.16b, m1.16b, m1.16b, #8;	\
		ext		T2.16b, m3.16b, m3.16b, #8;	\
		ext		T4.16b, m5.16b, m5.16b, #8;	\
	sm4e			b0.4s, v24.4s;			\
	sm4e			b1.4s, v24.4s;			\
	sm4e			b2.4s, v24.4s;			\
		pmull		r0.1q, m0.1d, m1.1d;		\
		pmull		r2.1q, m2.1d, m3.1d;		\
		pmull		r4.1q, m4.1d, m5.1d;		\
	sm4e			b0.4s, v25.4s;			\
	sm4e			b1.4s, v25.4s;			\
	sm4e			b2.4s, v25.4s;			\
		pmull		T1.1q, m0.1d, T0.1d;		\
		pmull		T3.1q, m2.1d, T2.1d;		\
		pmull		T5.1q, m4.1d, T4.1d;		\
	sm4e			b0.4s, v26.4s;			\
	sm4e			b1.4s, v26.4s;			\
	sm4e			b2.4s, v26.4s;			\
		pmull2		T0.1q, m0.2d, T0.2d;		\
		pmull2		T2.1q, m2.2d, T2.2d;		\
		pmull2		T4.1q, m4.2d, T4.2d;		\
	sm4e			b0.4s, v27.4s;			\
	sm4e			b1.4s, v27.4s;			\
	sm4e			b2.4s, v27.4s;			\
		pmull2		r1.1q, m0.2d, m1.2d;		\
		pmull2		r3.1q, m2.2d, m3.2d;		\
		pmull2		r5.1q, m4.2d, m5.2d;		\
	sm4e			b0.4s, v28.4s;			\
	sm4e			b1.4s, v28.4s;			\
	sm4e			b2.4s, v28.4s;			\
		eor		T0.16b, T0.16b, T1.16b;		\
		eor		T2.16b, T2.16b, T3.16b;		\
		eor		T4.16b, T4.16b, T5.16b;		\
	sm4e			b0.4s, v29.4s;			\
	sm4e			b1.4s, v29.4s;			\
	sm4e			b2.4s, v29.4s;			\
		ext		T1.16b, RZERO.16b, T0.16b, #8;	\
		ext		T3.16b, RZERO.16b, T2.16b, #8;	\
		ext		T5.16b, RZERO.16b, T4.16b, #8;	\
	sm4e			b0.4s, v30.4s;			\
	sm4e			b1.4s, v30.4s;			\
	sm4e			b2.4s, v30.4s;			\
		ext		T0.16b, T0.16b, RZERO.16b, #8;	\
		ext		T2.16b, T2.16b, RZERO.16b, #8;	\
		ext		T4.16b, T4.16b, RZERO.16b, #8;	\
	sm4e			b0.4s, v31.4s;			\
	sm4e			b1.4s, v31.4s;			\
	sm4e			b2.4s, v31.4s;			\
		eor		r0.16b, r0.16b, T1.16b;		\
		eor		r2.16b, r2.16b, T3.16b;		\
		eor		r4.16b, r4.16b, T5.16b;		\
	rev64			b0.4s, b0.4s;			\
	rev64			b1.4s, b1.4s;			\
	rev64			b2.4s, b2.4s;			\
		eor		r1.16b, r1.16b, T0.16b;		\
		eor		r3.16b, r3.16b, T2.16b;		\
		eor		r5.16b, r5.16b, T4.16b;		\
	ext			b0.16b, b0.16b, b0.16b, #8;	\
	ext			b1.16b, b1.16b, b1.16b, #8;	\
	ext			b2.16b, b2.16b, b2.16b, #8;	\
		eor		r0.16b, r0.16b, r2.16b;		\
		eor		r1.16b, r1.16b, r3.16b;		\
	rev32			b0.16b, b0.16b;			\
	rev32			b1.16b, b1.16b;			\
	rev32			b2.16b, b2.16b;			\
		eor		r0.16b, r0.16b, r4.16b;		\
		eor		r1.16b, r1.16b, r5.16b;
20762306a36Sopenharmony_ci
/*
 * inc32_le128 - emit the current counter block and post-increment it
 *
 * implicit input:  x8 = first 8 counter bytes, x9 = last 8 counter bytes,
 *                  both already byte-reversed with rev by the caller
 * output:          vctr = counter block with each 64-bit lane
 *                  byte-reversed back (value BEFORE the increment)
 * implicit output: low 32 bits of x9 incremented by one with 32-bit
 *                  wrap-around; the upper 32 bits of x9 and all of x8 are
 *                  left untouched
 * clobbers:        x6 (written through w6)
 */
#define inc32_le128(vctr)					\
		mov		vctr.d[1], x9;			\
		add		w6, w9, #1;			\
		mov		vctr.d[0], x8;			\
		bfi		x9, x6, #0, #32;		\
		rev64		vctr.16b, vctr.16b;
21462306a36Sopenharmony_ci
/*
 * GTAG_HASH_LENGTHS - absorb the lengths block into GHASH and produce the
 * authentication tag.
 *
 * arguments:       vctr0 = scratch register that receives CTR0 and is then
 *                          run through the sm4e sequence (must be a
 *                          register known to the sm4e macro)
 *                  vlen  = scratch register that receives the 16 bytes at
 *                          [x7]
 * implicit input:  x7 = pointer to the lengths block, x8/x9 = counter as
 *                  set up for inc32_le128, RHASH, RH1, RRCONST, RZERO,
 *                  key material in v24-v31
 * output:          RHASH = rbit(GHASH result) XOR processed CTR0, i.e. the
 *                  value that the callers store as the tag
 * clobbers:        x6, low 32 bits of x9 (forced to 1), RR0, RR1,
 *                  RTMP0-RTMP3, vctr0, vlen
 *
 * RR0, RR1, RH1 and RTMP0-RTMP3 are resolved at the point of expansion, so
 * this macro can only be used where those names are defined.
 */
#define GTAG_HASH_LENGTHS(vctr0, vlen)					\
		ld1		{vlen.16b}, [x7];			\
		/* construct CTR0 */					\
		/* the low 32 bits of the initial counter are be32(1) */	\
		mov		x6, #0x1;				\
		bfi		x9, x6, #0, #32;			\
		mov		vctr0.d[0], x8;				\
		mov		vctr0.d[1], x9;				\
		rbit		vlen.16b, vlen.16b;			\
		rev64		vctr0.16b, vctr0.16b;			\
		/* authtag = GCTR(CTR0, GHASH) */			\
		eor		RHASH.16b, RHASH.16b, vlen.16b;		\
		SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
					   RTMP0, RTMP1);		\
		REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
		rbit		RHASH.16b, RHASH.16b;			\
		eor		RHASH.16b, RHASH.16b, vctr0.16b;
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci
/* Register macros for encrypt and ghash */

/*
 * High halves of the 256-bit products. These can be the same as the input
 * registers v0-v3 because the multiply macros write the high half with the
 * last instruction that reads the corresponding input.
 */
#define	RR1	v0
#define	RR3	v1
#define	RR5	v2
#define	RR7	v3

/* Low halves of the 256-bit products */
#define	RR0	v4
#define	RR2	v5
#define	RR4	v6
#define	RR6	v7

/* Scratch registers for the multiply/reduce macros (and input staging) */
#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

/* GHASH table entries: H^1 .. H^4, in table order */
#define	RH1	v16
#define	RH2	v17
#define	RH3	v18
#define	RH4	v19
26062306a36Sopenharmony_ci
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/*
	 * Derive the hash key H and store H^1..H^4 (bit-reversed
	 * representation) as four consecutive 16-byte table entries.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: ghash table (64 bytes are written)
	 *
	 * clobbers: x2 and the vector registers used below; SM4_PREPARE and
	 * SM4_CRYPT_BLK_BE come from sm4-ce-asm.h (presumably they load the
	 * round keys and encrypt one block - see that header).
	 */
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 = H ^ 1 * H ^ 1 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 = H ^ 2 * H ^ 1 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 = H ^ 2 * H ^ 2 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
29762306a36Sopenharmony_ci
.align 3
SYM_FUNC_START(pmull_ghash_update)
	/*
	 * Fold nblocks 16-byte blocks from src into the GHASH value.
	 *
	 * input:
	 *   x0: ghash table (four 16-byte entries, loaded into RH1..RH4)
	 *   x1: ghash result (read on entry, rewritten on exit)
	 *   x2: src
	 *   w3: nblocks (unsigned; 0 leaves the ghash result unchanged)
	 *
	 * clobbers: x2, w3, x4 and the vector registers used below
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/*
	 * Nothing to hash: without this check the 1x loop below would
	 * decrement w3 from 0 and wrap around. The rbit at .Lghash_end
	 * undoes the rbit above, so the stored value is unchanged.
	 */
	cbz		w3, .Lghash_end

.Lghash_loop_4x:
	/* nblocks is unsigned, so use an unsigned comparison */
	cmp		w3, #4
	blo		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	/* sum the four products before a single reduction */
	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	/* w3 is non-zero whenever this loop is entered */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)
37262306a36Sopenharmony_ci
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/*
	 * CTR-encrypt nbytes from src to dst and fold the produced
	 * ciphertext into the GHASH value.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes (unsigned)
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 *
	 * output:
	 *   x7 == NULL: the advanced counter is written back to [x3] and
	 *               the updated ghash value to [x5]
	 *   x7 != NULL: the tag produced by GTAG_HASH_LENGTHS is written
	 *               to [x5]; the counter at [x3] is not written back
	 *
	 * clobbers: x0-x2, w4, x6, x8, x9 and the vector registers used
	 * below. SM4_PREPARE, SM4_CRYPT_BLK and SM4_CRYPT_BLK4 come from
	 * sm4-ce-asm.h (presumably round key load and 1/4 block
	 * encryption - see that header).
	 */
	SM4_PREPARE(x0)

	/* x8:x9 = counter with each half byte-reversed, for inc32_le128 */
	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	/* x6 (table pointer) is dead from here on and reused as scratch */
	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
	/* nbytes is unsigned, so use an unsigned comparison */
	cmp		w4, #(4 * 16)
	blo		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	/* sum the four products before a single reduction */
	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	/* unsigned comparison, see above */
	cmp		w4, #16
	blo		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* here 0 < w4 < 16: one partial block is left */

	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/*
	 * load permute table: 16 bytes starting at
	 * .Lcts_permute_table + 32 - w4 (table is defined elsewhere in
	 * this file). x0 (CTX) is no longer needed and is reused.
	 */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top crypted byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	/* no lengths block: this is not the final call */
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
53862306a36Sopenharmony_ci
/*
 * Drop the encrypt/ghash register assignment; the names are redefined
 * below with a different assignment.
 */
#undef	RR1
#undef	RR3
#undef	RR5
#undef	RR7
#undef	RR0
#undef	RR2
#undef	RR4
#undef	RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef	RH1
#undef	RH2
#undef	RH3
#undef	RH4
55962306a36Sopenharmony_ci
56062306a36Sopenharmony_ci
/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

/*
 * High halves of the 256-bit products. The decrypt path first places the
 * bit-reversed inputs in v6-v8 and then reuses the same registers here.
 */
#define	RR1	v6
#define	RR3	v7
#define	RR5	v8

/* Low halves of the 256-bit products */
#define	RR0	v9
#define	RR2	v10
#define	RR4	v11

/* Scratch registers for the multiply/reduce macros */
#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

/* GHASH table entries: only the first three are used for decrypt */
#define	RH1	v18
#define	RH2	v19
#define	RH3	v20
58362306a36Sopenharmony_ci
58462306a36Sopenharmony_ci.align 3
58562306a36Sopenharmony_ciSYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
58662306a36Sopenharmony_ci	/* input:
58762306a36Sopenharmony_ci	 *   x0: round key array, CTX
58862306a36Sopenharmony_ci	 *   x1: dst
58962306a36Sopenharmony_ci	 *   x2: src
59062306a36Sopenharmony_ci	 *   x3: ctr (big endian, 128 bit)
59162306a36Sopenharmony_ci	 *   w4: nbytes
59262306a36Sopenharmony_ci	 *   x5: ghash result
59362306a36Sopenharmony_ci	 *   x6: ghash table
59462306a36Sopenharmony_ci	 *   x7: lengths (only for last block)
59562306a36Sopenharmony_ci	 */
59662306a36Sopenharmony_ci	SM4_PREPARE(x0)
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	ldp		x8, x9, [x3]
59962306a36Sopenharmony_ci	rev		x8, x8
60062306a36Sopenharmony_ci	rev		x9, x9
60162306a36Sopenharmony_ci
60262306a36Sopenharmony_ci	ld1		{RH1.16b-RH3.16b}, [x6]
60362306a36Sopenharmony_ci
60462306a36Sopenharmony_ci	ld1		{RHASH.16b}, [x5]
60562306a36Sopenharmony_ci	rbit		RHASH.16b, RHASH.16b
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci	adr_l		x6, .Lghash_rconst
60862306a36Sopenharmony_ci	ld1r		{RRCONST.2d}, [x6]
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_ci	eor		RZERO.16b, RZERO.16b, RZERO.16b
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci	cbz		w4, .Lgcm_dec_hash_len
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci.Lgcm_dec_loop_3x:
61562306a36Sopenharmony_ci	cmp		w4, #(3 * 16)
61662306a36Sopenharmony_ci	blt		.Lgcm_dec_loop_1x
61762306a36Sopenharmony_ci
61862306a36Sopenharmony_ci	sub		w4, w4, #(3 * 16)
61962306a36Sopenharmony_ci
62062306a36Sopenharmony_ci	ld1		{v3.16b-v5.16b}, [x2], #(3 * 16)
62162306a36Sopenharmony_ci
62262306a36Sopenharmony_ci	/* construct CTRs */
62362306a36Sopenharmony_ci	inc32_le128(v0)			/* +0 */
62462306a36Sopenharmony_ci	rbit		v6.16b, v3.16b
62562306a36Sopenharmony_ci	inc32_le128(v1)			/* +1 */
62662306a36Sopenharmony_ci	rbit		v7.16b, v4.16b
62762306a36Sopenharmony_ci	inc32_le128(v2)			/* +2 */
62862306a36Sopenharmony_ci	rbit		v8.16b, v5.16b
62962306a36Sopenharmony_ci
63062306a36Sopenharmony_ci	eor		RHASH.16b, RHASH.16b, v6.16b
63162306a36Sopenharmony_ci
63262306a36Sopenharmony_ci	/* decrypt & ghash update */
63362306a36Sopenharmony_ci	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
63462306a36Sopenharmony_ci				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
63562306a36Sopenharmony_ci				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
63662306a36Sopenharmony_ci				    RR4, RR5, v8, RH1, RTMP4, RTMP5)
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v3.16b
63962306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v4.16b
64062306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v5.16b
64162306a36Sopenharmony_ci
64262306a36Sopenharmony_ci	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
64362306a36Sopenharmony_ci
64462306a36Sopenharmony_ci	st1		{v0.16b-v2.16b}, [x1], #(3 * 16)
64562306a36Sopenharmony_ci
64662306a36Sopenharmony_ci	cbz		w4, .Lgcm_dec_hash_len
64762306a36Sopenharmony_ci	b		.Lgcm_dec_loop_3x
64862306a36Sopenharmony_ci
64962306a36Sopenharmony_ci.Lgcm_dec_loop_1x:
65062306a36Sopenharmony_ci	cmp		w4, #16
65162306a36Sopenharmony_ci	blt		.Lgcm_dec_tail
65262306a36Sopenharmony_ci
65362306a36Sopenharmony_ci	sub		w4, w4, #16
65462306a36Sopenharmony_ci
65562306a36Sopenharmony_ci	ld1		{v3.16b}, [x2], #16
65662306a36Sopenharmony_ci
65762306a36Sopenharmony_ci	/* construct CTRs */
65862306a36Sopenharmony_ci	inc32_le128(v0)
65962306a36Sopenharmony_ci	rbit		v6.16b, v3.16b
66062306a36Sopenharmony_ci
66162306a36Sopenharmony_ci	eor		RHASH.16b, RHASH.16b, v6.16b
66262306a36Sopenharmony_ci
66362306a36Sopenharmony_ci	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
66462306a36Sopenharmony_ci
66562306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v3.16b
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_ci	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
66862306a36Sopenharmony_ci
66962306a36Sopenharmony_ci	st1		{v0.16b}, [x1], #16
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	cbz		w4, .Lgcm_dec_hash_len
67262306a36Sopenharmony_ci	b		.Lgcm_dec_loop_1x
67362306a36Sopenharmony_ci
67462306a36Sopenharmony_ci.Lgcm_dec_tail:
67562306a36Sopenharmony_ci	/* construct CTRs */
67662306a36Sopenharmony_ci	inc32_le128(v0)
67762306a36Sopenharmony_ci	SM4_CRYPT_BLK(v0)
67862306a36Sopenharmony_ci
67962306a36Sopenharmony_ci	/* load permute table */
68062306a36Sopenharmony_ci	adr_l		x0, .Lcts_permute_table
68162306a36Sopenharmony_ci	add		x0, x0, #32
68262306a36Sopenharmony_ci	sub		x0, x0, w4, uxtw
68362306a36Sopenharmony_ci	ld1		{v3.16b}, [x0]
68462306a36Sopenharmony_ci
68562306a36Sopenharmony_ci.Lgcm_dec_tail_loop:
68662306a36Sopenharmony_ci	/* do decrypt */
68762306a36Sopenharmony_ci	ldrb		w0, [x2], #1	/* get 1 byte from input */
68862306a36Sopenharmony_ci	umov		w6, v0.b[0]	/* get top crypted byte */
68962306a36Sopenharmony_ci	eor		w6, w6, w0	/* w6 = CTR ^ input */
69062306a36Sopenharmony_ci	strb		w6, [x1], #1	/* store out byte */
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	/* shift right out one byte */
69362306a36Sopenharmony_ci	ext		v0.16b, v0.16b, v0.16b, #1
69462306a36Sopenharmony_ci	/* the last ciphertext is placed in high bytes */
69562306a36Sopenharmony_ci	ins		v0.b[15], w0
69662306a36Sopenharmony_ci
69762306a36Sopenharmony_ci	subs		w4, w4, #1
69862306a36Sopenharmony_ci	bne		.Lgcm_dec_tail_loop
69962306a36Sopenharmony_ci
70062306a36Sopenharmony_ci	/* padding last block with zeros */
70162306a36Sopenharmony_ci	tbl		v0.16b, {v0.16b}, v3.16b
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ci	/* ghash update */
70462306a36Sopenharmony_ci	rbit		v0.16b, v0.16b
70562306a36Sopenharmony_ci	eor		RHASH.16b, RHASH.16b, v0.16b
70662306a36Sopenharmony_ci	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
70762306a36Sopenharmony_ci	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci.Lgcm_dec_hash_len:
71062306a36Sopenharmony_ci	cbz		x7, .Lgcm_dec_end
71162306a36Sopenharmony_ci
71262306a36Sopenharmony_ci	GTAG_HASH_LENGTHS(v1, v3)
71362306a36Sopenharmony_ci
71462306a36Sopenharmony_ci	b		.Lgcm_dec_ret
71562306a36Sopenharmony_ci
71662306a36Sopenharmony_ci.Lgcm_dec_end:
71762306a36Sopenharmony_ci	/* store new CTR */
71862306a36Sopenharmony_ci	rev		x8, x8
71962306a36Sopenharmony_ci	rev		x9, x9
72062306a36Sopenharmony_ci	stp		x8, x9, [x3]
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	rbit		RHASH.16b, RHASH.16b
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_ci.Lgcm_dec_ret:
72562306a36Sopenharmony_ci	/* store new MAC */
72662306a36Sopenharmony_ci	st1		{RHASH.2d}, [x5]
72762306a36Sopenharmony_ci
72862306a36Sopenharmony_ci	ret
72962306a36Sopenharmony_ciSYM_FUNC_END(sm4_ce_pmull_gcm_dec)
73062306a36Sopenharmony_ci
73162306a36Sopenharmony_ci	.section	".rodata", "a"
	/* GAS on AArch64 takes a power-of-two exponent here: 2^4 = 16 bytes. */
73162306a36Sopenharmony_ci	.align 4
	/*
	 * Index vectors for the TBL that zero-pads a partial final block.
	 *
	 * Layout (48 bytes): 16 x 0xff, the identity indices 0x0..0xf, 16 x 0xff.
	 *
	 * The tail path loads 16 bytes starting at
	 * (.Lcts_permute_table + 32 - n), where n is the number of leftover
	 * bytes.  For 1 <= n <= 15 that vector holds the indices (16 - n)..15
	 * followed by (16 - n) bytes of 0xff.  TBL writes zero for an index
	 * that is out of range for its single source register, so the n bytes
	 * sitting in the top lanes of the source are moved down to lanes
	 * 0..n-1 and every remaining lane becomes zero.
	 */
73162306a36Sopenharmony_ci.Lcts_permute_table:
73162306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
73162306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
73662306a36Sopenharmony_ci	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
73762306a36Sopenharmony_ci	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
73862306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
73962306a36Sopenharmony_ci	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
74062306a36Sopenharmony_ci
	/*
	 * 0x87 has bits 7, 2, 1 and 0 set, i.e. x^7 + x^2 + x + 1: the low-order
	 * terms of the GCM field polynomial x^128 + x^7 + x^2 + x + 1.
	 * NOTE(review): presumably this is the value held in RRCONST and used
	 * by REDUCTION(); the load is not adjacent to this definition - confirm.
	 */
74162306a36Sopenharmony_ci.Lghash_rconst:
74262306a36Sopenharmony_ci	.quad		0x87
743