162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-or-later */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * SM4 Cipher Algorithm for ARMv8 NEON
462306a36Sopenharmony_ci * as specified in
562306a36Sopenharmony_ci * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Copyright (C) 2022, Alibaba Group.
862306a36Sopenharmony_ci * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
962306a36Sopenharmony_ci */
1062306a36Sopenharmony_ci
1162306a36Sopenharmony_ci#include <linux/linkage.h>
1262306a36Sopenharmony_ci#include <asm/assembler.h>
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci/* Register macros */
1562306a36Sopenharmony_ci
/*
 * Vector register aliases used by every macro and function below.
 *
 * RTMP0-RTMP3 (v8-v11) are pure scratch.
 * RTMP4-RTMP7 name the SAME physical registers (v12-v15) as
 * RX0/RX1/RKEY/RIV, so any macro that touches RTMP4-RTMP7 destroys the
 * round-input, round-key and IV registers.
 *
 * NOTE(review): v8-v15 are callee-saved under AAPCS64 and nothing in the
 * visible code saves or restores them - presumably every caller invokes
 * these routines from a context where the NEON state is already saved;
 * confirm against the callers.
 */
1662306a36Sopenharmony_ci#define RTMP0	v8
1762306a36Sopenharmony_ci#define RTMP1	v9
1862306a36Sopenharmony_ci#define RTMP2	v10
1962306a36Sopenharmony_ci#define RTMP3	v11
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci#define RTMP4	v12
2262306a36Sopenharmony_ci#define RTMP5	v13
2362306a36Sopenharmony_ci#define RTMP6	v14
2462306a36Sopenharmony_ci#define RTMP7	v15
2562306a36Sopenharmony_ci
/* Working registers; these alias RTMP4-RTMP7 above. */
2662306a36Sopenharmony_ci#define RX0	v12
2762306a36Sopenharmony_ci#define RX1	v13
2862306a36Sopenharmony_ci#define RKEY	v14
2962306a36Sopenharmony_ci#define RIV	v15
3062306a36Sopenharmony_ci
3162306a36Sopenharmony_ci/* Helper macros. */
3262306a36Sopenharmony_ci
/*
 * SM4_PREPARE - load the 256-byte table at crypto_sm4_sbox into v16-v31
 * (four 64-byte chunks, one per group of four registers) so the round
 * macros can index it with tbl/tbx.
 *
 * Clobbers: x5 (left pointing at the last 64-byte chunk), v16-v31.
 */
3362306a36Sopenharmony_ci#define SM4_PREPARE()                                           \
3462306a36Sopenharmony_ci	adr_l		x5, crypto_sm4_sbox;                    \
3562306a36Sopenharmony_ci	ld1		{v16.16b-v19.16b}, [x5], #64;           \
3662306a36Sopenharmony_ci	ld1		{v20.16b-v23.16b}, [x5], #64;           \
3762306a36Sopenharmony_ci	ld1		{v24.16b-v27.16b}, [x5], #64;           \
3862306a36Sopenharmony_ci	ld1		{v28.16b-v31.16b}, [x5];
3962306a36Sopenharmony_ci
/*
 * transpose_4x4 - in-place transpose of the 4x4 matrix of 32-bit lanes
 * held in s0..s3: afterwards lane j of s<i> holds what was lane i of
 * s<j>.
 *
 * The 32-bit zips interleave row pairs; the 64-bit zips then gather the
 * matching halves.
 *
 * Clobbers: RTMP0-RTMP3 (v8-v11). s0..s3 must not be any of those.
 */
4062306a36Sopenharmony_ci#define transpose_4x4(s0, s1, s2, s3)                           \
4162306a36Sopenharmony_ci	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
4262306a36Sopenharmony_ci	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
4362306a36Sopenharmony_ci	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
4462306a36Sopenharmony_ci	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
4562306a36Sopenharmony_ci	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
4662306a36Sopenharmony_ci	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
4762306a36Sopenharmony_ci	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
4862306a36Sopenharmony_ci	zip2		s3.2d, RTMP2.2d, RTMP3.2d;
4962306a36Sopenharmony_ci
/*
 * transpose_4x4_2x - two independent 4x4 transposes of 32-bit lanes,
 * one over s0..s3 and one over s4..s7, with the instructions of the two
 * halves grouped together. Each half produces the same result as
 * transpose_4x4 on its four registers.
 *
 * Clobbers: RTMP0-RTMP7 (v8-v15). Because RTMP4-RTMP7 alias
 * RX0/RX1/RKEY/RIV, this destroys those registers as well.
 */
5062306a36Sopenharmony_ci#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
5162306a36Sopenharmony_ci	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
5262306a36Sopenharmony_ci	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
5362306a36Sopenharmony_ci	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
5462306a36Sopenharmony_ci	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
5562306a36Sopenharmony_ci	zip1		RTMP4.4s, s4.4s, s5.4s;                 \
5662306a36Sopenharmony_ci	zip1		RTMP5.4s, s6.4s, s7.4s;                 \
5762306a36Sopenharmony_ci	zip2		RTMP6.4s, s4.4s, s5.4s;                 \
5862306a36Sopenharmony_ci	zip2		RTMP7.4s, s6.4s, s7.4s;                 \
5962306a36Sopenharmony_ci	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
6062306a36Sopenharmony_ci	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
6162306a36Sopenharmony_ci	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
6262306a36Sopenharmony_ci	zip2		s3.2d, RTMP2.2d, RTMP3.2d;              \
6362306a36Sopenharmony_ci	zip1		s4.2d, RTMP4.2d, RTMP5.2d;              \
6462306a36Sopenharmony_ci	zip2		s5.2d, RTMP4.2d, RTMP5.2d;              \
6562306a36Sopenharmony_ci	zip1		s6.2d, RTMP6.2d, RTMP7.2d;              \
6662306a36Sopenharmony_ci	zip2		s7.2d, RTMP6.2d, RTMP7.2d;
6762306a36Sopenharmony_ci
/*
 * rotate_clockwise_4x4 - in-place transpose of the 4x4 matrix of 32-bit
 * lanes in s0..s3 with the source order reversed: afterwards s<k> holds
 * { s3[k], s2[k], s1[k], s0[k] } in lanes 0..3.
 *
 * The zip operands are swapped relative to transpose_4x4 (s1 before s0,
 * s3 before s2, and the s3/s2 pair before the s1/s0 pair), which is what
 * produces the reversed order.
 *
 * Clobbers: RTMP0-RTMP3 (v8-v11) only; v12-v15 are left untouched.
 */
6862306a36Sopenharmony_ci#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
6962306a36Sopenharmony_ci	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
7062306a36Sopenharmony_ci	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
7162306a36Sopenharmony_ci	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
7262306a36Sopenharmony_ci	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
7362306a36Sopenharmony_ci	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
7462306a36Sopenharmony_ci	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
7562306a36Sopenharmony_ci	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
7662306a36Sopenharmony_ci	zip2		s3.2d, RTMP3.2d, RTMP1.2d;
7762306a36Sopenharmony_ci
/*
 * rotate_clockwise_4x4_2x - two independent reversed transposes, one
 * over s0..s3 and one over s4..s7. Each half produces the same result as
 * rotate_clockwise_4x4 on its four registers.
 *
 * Clobbers: RTMP0-RTMP7 (v8-v15). Because RTMP4-RTMP7 alias
 * RX0/RX1/RKEY/RIV, a caller that needs RIV preserved must use two
 * rotate_clockwise_4x4 invocations instead.
 */
7862306a36Sopenharmony_ci#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
7962306a36Sopenharmony_ci	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
8062306a36Sopenharmony_ci	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
8162306a36Sopenharmony_ci	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
8262306a36Sopenharmony_ci	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
8362306a36Sopenharmony_ci	zip1		RTMP4.4s, s5.4s, s4.4s;                 \
8462306a36Sopenharmony_ci	zip1		RTMP6.4s, s7.4s, s6.4s;                 \
8562306a36Sopenharmony_ci	zip2		RTMP5.4s, s5.4s, s4.4s;                 \
8662306a36Sopenharmony_ci	zip2		RTMP7.4s, s7.4s, s6.4s;                 \
8762306a36Sopenharmony_ci	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
8862306a36Sopenharmony_ci	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
8962306a36Sopenharmony_ci	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
9062306a36Sopenharmony_ci	zip2		s3.2d, RTMP3.2d, RTMP1.2d;              \
9162306a36Sopenharmony_ci	zip1		s4.2d, RTMP6.2d, RTMP4.2d;              \
9262306a36Sopenharmony_ci	zip2		s5.2d, RTMP6.2d, RTMP4.2d;              \
9362306a36Sopenharmony_ci	zip1		s6.2d, RTMP7.2d, RTMP5.2d;              \
9462306a36Sopenharmony_ci	zip2		s7.2d, RTMP7.2d, RTMP5.2d;
9562306a36Sopenharmony_ci
/*
 * ROUND4 - one cipher round applied to four 32-bit lanes at once.
 *
 *   round       lane index (0-3) of RKEY holding this round's key word
 *   s0          updated in place: s0 ^= L(sbox(rk ^ s1 ^ s2 ^ s3))
 *   s1, s2, s3  read only
 *
 * The byte substitution walks the 256-byte table in v16-v31 in four
 * 64-byte steps: tbl handles indices 0-63 (and yields 0 for anything
 * larger), then the index is reduced by 64 before each tbx, which fills
 * in only the bytes whose reduced index is in range and leaves the others
 * as they were.
 *
 * The linear step computes
 *   L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 * building each rotate from a shl/sri pair.
 *
 * Reads: RKEY, v16-v31. Clobbers: RX0 (v12), RTMP0-RTMP3 (v8-v11).
 */
9662306a36Sopenharmony_ci#define ROUND4(round, s0, s1, s2, s3)                           \
9762306a36Sopenharmony_ci	dup		RX0.4s, RKEY.s[round];                  \
9862306a36Sopenharmony_ci	/* rk ^ s1 ^ s2 ^ s3 */                                 \
9962306a36Sopenharmony_ci	eor		RTMP1.16b, s2.16b, s3.16b;              \
10062306a36Sopenharmony_ci	eor		RX0.16b, RX0.16b, s1.16b;               \
10162306a36Sopenharmony_ci	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
10262306a36Sopenharmony_ci                                                                \
10362306a36Sopenharmony_ci	/* sbox, non-linear part */                             \
10462306a36Sopenharmony_ci	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
10562306a36Sopenharmony_ci	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
10662306a36Sopenharmony_ci	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
10762306a36Sopenharmony_ci	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
10862306a36Sopenharmony_ci	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
10962306a36Sopenharmony_ci	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
11062306a36Sopenharmony_ci	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
11162306a36Sopenharmony_ci	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
11262306a36Sopenharmony_ci                                                                \
11362306a36Sopenharmony_ci	/* linear part */                                       \
11462306a36Sopenharmony_ci	shl		RTMP1.4s, RTMP0.4s, #8;                 \
11562306a36Sopenharmony_ci	shl		RTMP2.4s, RTMP0.4s, #16;                \
11662306a36Sopenharmony_ci	shl		RTMP3.4s, RTMP0.4s, #24;                \
11762306a36Sopenharmony_ci	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
11862306a36Sopenharmony_ci	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
11962306a36Sopenharmony_ci	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
12062306a36Sopenharmony_ci	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
12162306a36Sopenharmony_ci	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
12262306a36Sopenharmony_ci	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
12362306a36Sopenharmony_ci	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
12462306a36Sopenharmony_ci	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
12562306a36Sopenharmony_ci	shl		RTMP2.4s, RTMP1.4s, 2;                  \
12662306a36Sopenharmony_ci	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
12762306a36Sopenharmony_ci	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
12862306a36Sopenharmony_ci	/* s0 ^= RTMP3 */                                       \
12962306a36Sopenharmony_ci	eor		s0.16b, s0.16b, RTMP3.16b;
13062306a36Sopenharmony_ci
/*
 * SM4_CRYPT_BLK4_BE - run all 32 rounds over the four-lane state in
 * b0..b3, for input whose 32-bit words have already been byte-reversed
 * by the caller (SM4_CRYPT_BLK4 is the variant that does the rev32
 * itself).
 *
 * The loop executes 8 times; each pass loads four round-key words from
 * [x0] (post-incrementing by 16) and applies four ROUND4 steps with the
 * register roles rotated. Afterwards every word is byte-reversed again
 * and rotate_clockwise_4x4 reorders the lanes. x0 is then moved back by
 * the 128 bytes consumed so it points at the first round key again.
 *
 * In this file the inputs always come from an ld4 de-interleaving load
 * or from transpose_4x4, i.e. b<i> carries word i of each of four blocks.
 *
 * Uses numeric local label 4. Clobbers: x6, condition flags (subs),
 * RKEY (v14), RX0 (v12), RTMP0-RTMP3 (v8-v11). v13 and v15 are not
 * touched.
 */
13162306a36Sopenharmony_ci#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
13262306a36Sopenharmony_ci	mov		x6, 8;                                  \
13362306a36Sopenharmony_ci4:                                                              \
13462306a36Sopenharmony_ci	ld1		{RKEY.4s}, [x0], #16;                   \
13562306a36Sopenharmony_ci	subs		x6, x6, #1;                             \
13662306a36Sopenharmony_ci                                                                \
13762306a36Sopenharmony_ci	ROUND4(0, b0, b1, b2, b3);                              \
13862306a36Sopenharmony_ci	ROUND4(1, b1, b2, b3, b0);                              \
13962306a36Sopenharmony_ci	ROUND4(2, b2, b3, b0, b1);                              \
14062306a36Sopenharmony_ci	ROUND4(3, b3, b0, b1, b2);                              \
14162306a36Sopenharmony_ci                                                                \
14262306a36Sopenharmony_ci	bne		4b;                                     \
14362306a36Sopenharmony_ci                                                                \
14462306a36Sopenharmony_ci	rev32		b0.16b, b0.16b;                         \
14562306a36Sopenharmony_ci	rev32		b1.16b, b1.16b;                         \
14662306a36Sopenharmony_ci	rev32		b2.16b, b2.16b;                         \
14762306a36Sopenharmony_ci	rev32		b3.16b, b3.16b;                         \
14862306a36Sopenharmony_ci                                                                \
14962306a36Sopenharmony_ci	rotate_clockwise_4x4(b0, b1, b2, b3);                   \
15062306a36Sopenharmony_ci                                                                \
15162306a36Sopenharmony_ci	/* repoint to rkey */                                   \
15262306a36Sopenharmony_ci	sub		x0, x0, #128;
15362306a36Sopenharmony_ci
/*
 * SM4_CRYPT_BLK4 - byte-reverse every 32-bit word of b0..b3, then hand
 * over to SM4_CRYPT_BLK4_BE for the 32 rounds and the final reordering.
 *
 * Clobbers: everything SM4_CRYPT_BLK4_BE clobbers (x6, flags, v8-v12,
 * v14); x0 is restored by that macro.
 */
15462306a36Sopenharmony_ci#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
15562306a36Sopenharmony_ci	rev32		b0.16b, b0.16b;                         \
15662306a36Sopenharmony_ci	rev32		b1.16b, b1.16b;                         \
15762306a36Sopenharmony_ci	rev32		b2.16b, b2.16b;                         \
15862306a36Sopenharmony_ci	rev32		b3.16b, b3.16b;                         \
15962306a36Sopenharmony_ci	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
16062306a36Sopenharmony_ci
/*
 * ROUND8 - one cipher round applied to two independent four-lane states,
 * (s0..s3) and (t0..t3), with their instructions interleaved.
 *
 *   round       lane index (0-3) of RKEY holding this round's key word
 *   s0, t0      updated in place: x0 ^= L(sbox(rk ^ x1 ^ x2 ^ x3))
 *   s1-s3, t1-t3  read only
 *
 * Same substitution scheme as ROUND4: tbl over the first 64 table bytes,
 * then three rounds of "index -= 64; tbx" over the remaining chunks in
 * v20-v31. The linear step yields
 *   L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 * accumulated in RTMP0 (s side) and RTMP1 (t side).
 *
 * Reads: RKEY, v16-v31. Clobbers: RX0, RX1 (v12, v13), RTMP0-RTMP3
 * (v8-v11). RKEY (v14) and v15 are not written.
 */
16162306a36Sopenharmony_ci#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
16262306a36Sopenharmony_ci	/* rk ^ s1 ^ s2 ^ s3 */                                 \
16362306a36Sopenharmony_ci	dup		RX0.4s, RKEY.s[round];                  \
16462306a36Sopenharmony_ci	eor		RTMP0.16b, s2.16b, s3.16b;              \
16562306a36Sopenharmony_ci	mov		RX1.16b, RX0.16b;                       \
16662306a36Sopenharmony_ci	eor		RTMP1.16b, t2.16b, t3.16b;              \
16762306a36Sopenharmony_ci	eor		RX0.16b, RX0.16b, s1.16b;               \
16862306a36Sopenharmony_ci	eor		RX1.16b, RX1.16b, t1.16b;               \
16962306a36Sopenharmony_ci	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
17062306a36Sopenharmony_ci	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
17162306a36Sopenharmony_ci                                                                \
17262306a36Sopenharmony_ci	/* sbox, non-linear part */                             \
17362306a36Sopenharmony_ci	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
17462306a36Sopenharmony_ci	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
17562306a36Sopenharmony_ci	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
17662306a36Sopenharmony_ci	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
17762306a36Sopenharmony_ci	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
17862306a36Sopenharmony_ci	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
17962306a36Sopenharmony_ci	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
18062306a36Sopenharmony_ci	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
18162306a36Sopenharmony_ci	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
18262306a36Sopenharmony_ci	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
18362306a36Sopenharmony_ci	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
18462306a36Sopenharmony_ci	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
18562306a36Sopenharmony_ci	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
18662306a36Sopenharmony_ci	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
18762306a36Sopenharmony_ci	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
18862306a36Sopenharmony_ci                                                                \
18962306a36Sopenharmony_ci	/* linear part */                                       \
19062306a36Sopenharmony_ci	shl		RX0.4s, RTMP0.4s, #8;                   \
19162306a36Sopenharmony_ci	shl		RX1.4s, RTMP1.4s, #8;                   \
19262306a36Sopenharmony_ci	shl		RTMP2.4s, RTMP0.4s, #16;                \
19362306a36Sopenharmony_ci	shl		RTMP3.4s, RTMP1.4s, #16;                \
19462306a36Sopenharmony_ci	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
19562306a36Sopenharmony_ci	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
19662306a36Sopenharmony_ci	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
19762306a36Sopenharmony_ci	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
19862306a36Sopenharmony_ci	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
19962306a36Sopenharmony_ci	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
20062306a36Sopenharmony_ci	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
20162306a36Sopenharmony_ci	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
20262306a36Sopenharmony_ci	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
20362306a36Sopenharmony_ci	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
20462306a36Sopenharmony_ci	shl		RTMP2.4s, RTMP0.4s, #24;                \
20562306a36Sopenharmony_ci	shl		RTMP3.4s, RTMP1.4s, #24;                \
20662306a36Sopenharmony_ci	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
20762306a36Sopenharmony_ci	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
20862306a36Sopenharmony_ci	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
20962306a36Sopenharmony_ci	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
21062306a36Sopenharmony_ci	shl		RTMP2.4s, RX0.4s, #2;                   \
21162306a36Sopenharmony_ci	shl		RTMP3.4s, RX1.4s, #2;                   \
21262306a36Sopenharmony_ci	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
21362306a36Sopenharmony_ci	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
21462306a36Sopenharmony_ci	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
21562306a36Sopenharmony_ci	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
21662306a36Sopenharmony_ci	/* s0/t0 ^= RTMP0/1 */                                  \
21762306a36Sopenharmony_ci	eor		s0.16b, s0.16b, RTMP0.16b;              \
21862306a36Sopenharmony_ci	eor		t0.16b, t0.16b, RTMP1.16b;
21962306a36Sopenharmony_ci
/*
 * SM4_CRYPT_BLK8_norotate - run all 32 rounds over two four-lane states,
 * b0..b3 and b4..b7, WITHOUT the final lane reordering.
 *
 * Every 32-bit word is byte-reversed on entry and again on exit. The
 * loop executes 8 times; each pass loads four round-key words from [x0]
 * (post-incrementing by 16) and applies four ROUND8 steps with the
 * register roles rotated. x0 is moved back by 128 at the end so it
 * points at the first round key again.
 *
 * The caller must still apply a rotate_clockwise_4x4 style reordering to
 * each half; leaving it out here lets a caller pick a variant that does
 * not clobber v15.
 *
 * Uses numeric local label 8. Clobbers: x6, condition flags (subs),
 * RKEY (v14), RX0/RX1 (v12, v13), RTMP0-RTMP3 (v8-v11). v15 (RIV) is
 * not touched.
 */
22062306a36Sopenharmony_ci#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
22162306a36Sopenharmony_ci	rev32		b0.16b, b0.16b;                         \
22262306a36Sopenharmony_ci	rev32		b1.16b, b1.16b;                         \
22362306a36Sopenharmony_ci	rev32		b2.16b, b2.16b;                         \
22462306a36Sopenharmony_ci	rev32		b3.16b, b3.16b;                         \
22562306a36Sopenharmony_ci	rev32		b4.16b, b4.16b;                         \
22662306a36Sopenharmony_ci	rev32		b5.16b, b5.16b;                         \
22762306a36Sopenharmony_ci	rev32		b6.16b, b6.16b;                         \
22862306a36Sopenharmony_ci	rev32		b7.16b, b7.16b;                         \
22962306a36Sopenharmony_ci                                                                \
23062306a36Sopenharmony_ci	mov		x6, 8;                                  \
23162306a36Sopenharmony_ci8:                                                              \
23262306a36Sopenharmony_ci	ld1		{RKEY.4s}, [x0], #16;                   \
23362306a36Sopenharmony_ci	subs		x6, x6, #1;                             \
23462306a36Sopenharmony_ci                                                                \
23562306a36Sopenharmony_ci	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
23662306a36Sopenharmony_ci	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
23762306a36Sopenharmony_ci	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
23862306a36Sopenharmony_ci	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
23962306a36Sopenharmony_ci                                                                \
24062306a36Sopenharmony_ci	bne		8b;                                     \
24162306a36Sopenharmony_ci                                                                \
24262306a36Sopenharmony_ci	rev32		b0.16b, b0.16b;                         \
24362306a36Sopenharmony_ci	rev32		b1.16b, b1.16b;                         \
24462306a36Sopenharmony_ci	rev32		b2.16b, b2.16b;                         \
24562306a36Sopenharmony_ci	rev32		b3.16b, b3.16b;                         \
24662306a36Sopenharmony_ci	rev32		b4.16b, b4.16b;                         \
24762306a36Sopenharmony_ci	rev32		b5.16b, b5.16b;                         \
24862306a36Sopenharmony_ci	rev32		b6.16b, b6.16b;                         \
24962306a36Sopenharmony_ci	rev32		b7.16b, b7.16b;                         \
25062306a36Sopenharmony_ci                                                                \
25162306a36Sopenharmony_ci	/* repoint to rkey */                                   \
25262306a36Sopenharmony_ci	sub		x0, x0, #128;
25362306a36Sopenharmony_ci
/*
 * SM4_CRYPT_BLK8 - full eight-block pass: SM4_CRYPT_BLK8_norotate over
 * b0..b7 followed by rotate_clockwise_4x4_2x to reorder the lanes of
 * both halves.
 *
 * Clobbers: x6, condition flags, v8-v15. Because the 2x rotate uses
 * RTMP4-RTMP7, RIV (v15) is destroyed; callers that keep an IV in RIV
 * must use SM4_CRYPT_BLK8_norotate plus two rotate_clockwise_4x4
 * instead. x0 is restored by SM4_CRYPT_BLK8_norotate.
 *
 * The last line deliberately carries no line continuation, so the
 * definition ends here regardless of what follows it.
 */
25462306a36Sopenharmony_ci#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)			\
25562306a36Sopenharmony_ci	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
25662306a36Sopenharmony_ci	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_ci
/*
 * sm4_neon_crypt - run the 32 cipher rounds over nblocks independent
 * 16-byte blocks, reading from src and writing to dst.
 *
 * Blocks are consumed eight at a time, then (at most once) four at a
 * time, then a 1-3 block tail. Returns without touching memory when
 * nblocks is zero.
 *
 * Clobbers: x1-x3, x5, x6, condition flags, v0-v31. x0 is advanced and
 * restored by the SM4_CRYPT_BLK* macros.
 */
25962306a36Sopenharmony_ci.align 3
26062306a36Sopenharmony_ciSYM_FUNC_START(sm4_neon_crypt)
26162306a36Sopenharmony_ci	/* input:
26262306a36Sopenharmony_ci	 *   x0: round key array, CTX
26362306a36Sopenharmony_ci	 *   x1: dst
26462306a36Sopenharmony_ci	 *   x2: src
26562306a36Sopenharmony_ci	 *   w3: nblocks
26662306a36Sopenharmony_ci	 */
	/*
	 * A zero count would otherwise fall through to the tail path,
	 * which always loads and stores at least one block.
	 */
	cbz		w3, .Lcrypt_end

26762306a36Sopenharmony_ci	SM4_PREPARE()
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci.Lcrypt_loop_8x:
	/* w3 goes negative (bit 31 set) when fewer than 8 blocks remain */
27062306a36Sopenharmony_ci	sub		w3, w3, #8
27162306a36Sopenharmony_ci	tbnz		w3, #31, .Lcrypt_4x
27262306a36Sopenharmony_ci
	/* ld4 de-interleaves: v<i> receives 32-bit word i of four blocks */
27362306a36Sopenharmony_ci	ld4		{v0.4s-v3.4s}, [x2], #64
27462306a36Sopenharmony_ci	ld4		{v4.4s-v7.4s}, [x2], #64
27562306a36Sopenharmony_ci
27662306a36Sopenharmony_ci	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x1], #64
27962306a36Sopenharmony_ci	st1		{v4.16b-v7.16b}, [x1], #64
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_ci	cbz		w3, .Lcrypt_end
28262306a36Sopenharmony_ci	b		.Lcrypt_loop_8x
28362306a36Sopenharmony_ci
28462306a36Sopenharmony_ci.Lcrypt_4x:
	/* undo the speculative subtraction: w3 = blocks left (1..7) */
28562306a36Sopenharmony_ci	add		w3, w3, #8
28662306a36Sopenharmony_ci	cmp		w3, #4
28762306a36Sopenharmony_ci	blt		.Lcrypt_tail
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	sub		w3, w3, #4
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci	ld4		{v0.4s-v3.4s}, [x2], #64
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	SM4_CRYPT_BLK4(v0, v1, v2, v3)
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x1], #64
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci	cbz		w3, .Lcrypt_end
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_ci.Lcrypt_tail:
	/*
	 * 1..3 blocks left. The flags from this cmp steer both branches
	 * below; the ld1 instructions in between do not modify them.
	 * Registers that are not loaded keep stale contents, which are
	 * processed but never stored.
	 */
30062306a36Sopenharmony_ci	cmp		w3, #2
30162306a36Sopenharmony_ci	ld1		{v0.16b}, [x2], #16
30262306a36Sopenharmony_ci	blt		.Lcrypt_tail_load_done
30362306a36Sopenharmony_ci	ld1		{v1.16b}, [x2], #16
30462306a36Sopenharmony_ci	beq		.Lcrypt_tail_load_done
30562306a36Sopenharmony_ci	ld1		{v2.16b}, [x2], #16
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci.Lcrypt_tail_load_done:
	/* blocks were loaded whole, so regroup them by word first */
30862306a36Sopenharmony_ci	transpose_4x4(v0, v1, v2, v3)
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci	SM4_CRYPT_BLK4(v0, v1, v2, v3)
31162306a36Sopenharmony_ci
	/* the macro's subs destroyed the flags, so compare again */
31262306a36Sopenharmony_ci	cmp		w3, #2
31362306a36Sopenharmony_ci	st1		{v0.16b}, [x1], #16
31462306a36Sopenharmony_ci	blt		.Lcrypt_end
31562306a36Sopenharmony_ci	st1		{v1.16b}, [x1], #16
31662306a36Sopenharmony_ci	beq		.Lcrypt_end
31762306a36Sopenharmony_ci	st1		{v2.16b}, [x1], #16
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_ci.Lcrypt_end:
32062306a36Sopenharmony_ci	ret
32162306a36Sopenharmony_ciSYM_FUNC_END(sm4_neon_crypt)
32262306a36Sopenharmony_ci
/*
 * sm4_neon_cbc_dec - CBC-chained pass over nblocks 16-byte blocks: each
 * output block is the cipher result for input block i XORed with input
 * block i-1 (the IV for the first block). The last input block is
 * written back to [x3] as the next IV.
 *
 * Blocks are consumed eight at a time, then (at most once) four at a
 * time, then a 1-3 block tail. With nblocks == 0 no data is read or
 * written and the IV is stored back unchanged.
 *
 * Clobbers: x1, x2, w4, x5, x6, condition flags, v0-v31. x0 is advanced
 * and restored by the SM4_CRYPT_BLK* macros; x3 is preserved.
 */
32362306a36Sopenharmony_ci.align 3
32462306a36Sopenharmony_ciSYM_FUNC_START(sm4_neon_cbc_dec)
32562306a36Sopenharmony_ci	/* input:
32662306a36Sopenharmony_ci	 *   x0: round key array, CTX
32762306a36Sopenharmony_ci	 *   x1: dst
32862306a36Sopenharmony_ci	 *   x2: src
32962306a36Sopenharmony_ci	 *   x3: iv (big endian, 128 bit)
33062306a36Sopenharmony_ci	 *   w4: nblocks
33162306a36Sopenharmony_ci	 */
33262306a36Sopenharmony_ci	SM4_PREPARE()
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci	ld1		{RIV.16b}, [x3]
33562306a36Sopenharmony_ci
	/*
	 * A zero count would otherwise fall through to the tail path,
	 * which always processes one block and replaces the IV with it.
	 * Branching to the end simply writes the loaded IV back.
	 */
	cbz		w4, .Lcbc_dec_end

33662306a36Sopenharmony_ci.Lcbc_dec_loop_8x:
	/* w4 goes negative (bit 31 set) when fewer than 8 blocks remain */
33762306a36Sopenharmony_ci	sub		w4, w4, #8
33862306a36Sopenharmony_ci	tbnz		w4, #31, .Lcbc_dec_4x
33962306a36Sopenharmony_ci
	/*
	 * De-interleaving loads of 8 blocks. The second load has no
	 * writeback, so x2 is left 64 bytes into this group.
	 */
34062306a36Sopenharmony_ci	ld4		{v0.4s-v3.4s}, [x2], #64
34162306a36Sopenharmony_ci	ld4		{v4.4s-v7.4s}, [x2]
34262306a36Sopenharmony_ci
34362306a36Sopenharmony_ci	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci	/* Avoid overwriting the RIV register */
	/* (the 2x rotate would use RTMP7, which is v15 == RIV) */
34662306a36Sopenharmony_ci	rotate_clockwise_4x4(v0, v1, v2, v3)
34762306a36Sopenharmony_ci	rotate_clockwise_4x4(v4, v5, v6, v7)
34862306a36Sopenharmony_ci
	/* rewind to the first block of this group */
34962306a36Sopenharmony_ci	sub		x2, x2, #64
35062306a36Sopenharmony_ci
	/* must consume RIV before the reload below overwrites v15 */
35162306a36Sopenharmony_ci	eor		v0.16b, v0.16b, RIV.16b
35262306a36Sopenharmony_ci
	/* reload the 8 input blocks untransformed for the chaining XOR */
35362306a36Sopenharmony_ci	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
35462306a36Sopenharmony_ci	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci	eor		v1.16b, v1.16b, RTMP0.16b
35762306a36Sopenharmony_ci	eor		v2.16b, v2.16b, RTMP1.16b
35862306a36Sopenharmony_ci	eor		v3.16b, v3.16b, RTMP2.16b
35962306a36Sopenharmony_ci	eor		v4.16b, v4.16b, RTMP3.16b
36062306a36Sopenharmony_ci	eor		v5.16b, v5.16b, RTMP4.16b
36162306a36Sopenharmony_ci	eor		v6.16b, v6.16b, RTMP5.16b
36262306a36Sopenharmony_ci	eor		v7.16b, v7.16b, RTMP6.16b
36362306a36Sopenharmony_ci
	/*
	 * Last input block becomes the next IV. RTMP7 and RIV are both
	 * v15, so this is a register self-move kept to state the intent.
	 */
36462306a36Sopenharmony_ci	mov		RIV.16b, RTMP7.16b
36562306a36Sopenharmony_ci
36662306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x1], #64
36762306a36Sopenharmony_ci	st1		{v4.16b-v7.16b}, [x1], #64
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci	cbz		w4, .Lcbc_dec_end
37062306a36Sopenharmony_ci	b		.Lcbc_dec_loop_8x
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci.Lcbc_dec_4x:
	/* undo the speculative subtraction: w4 = blocks left (1..7) */
37362306a36Sopenharmony_ci	add		w4, w4, #8
37462306a36Sopenharmony_ci	cmp		w4, #4
37562306a36Sopenharmony_ci	blt		.Lcbc_dec_tail
37662306a36Sopenharmony_ci
37762306a36Sopenharmony_ci	sub		w4, w4, #4
37862306a36Sopenharmony_ci
	/* v0-v3 keep the raw input blocks for the chaining XOR */
37962306a36Sopenharmony_ci	ld1		{v0.16b-v3.16b}, [x2], #64
38062306a36Sopenharmony_ci
	/* byte-reversed working copies go to v4-v7 */
38162306a36Sopenharmony_ci	rev32		v4.16b, v0.16b
38262306a36Sopenharmony_ci	rev32		v5.16b, v1.16b
38362306a36Sopenharmony_ci	rev32		v6.16b, v2.16b
38462306a36Sopenharmony_ci	rev32		v7.16b, v3.16b
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	transpose_4x4(v4, v5, v6, v7)
38762306a36Sopenharmony_ci
38862306a36Sopenharmony_ci	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci	eor		v4.16b, v4.16b, RIV.16b
39162306a36Sopenharmony_ci	eor		v5.16b, v5.16b, v0.16b
39262306a36Sopenharmony_ci	eor		v6.16b, v6.16b, v1.16b
39362306a36Sopenharmony_ci	eor		v7.16b, v7.16b, v2.16b
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci	mov		RIV.16b, v3.16b
39662306a36Sopenharmony_ci
39762306a36Sopenharmony_ci	st1		{v4.16b-v7.16b}, [x1], #64
39862306a36Sopenharmony_ci
39962306a36Sopenharmony_ci	cbz		w4, .Lcbc_dec_end
40062306a36Sopenharmony_ci
40162306a36Sopenharmony_ci.Lcbc_dec_tail:
	/*
	 * 1..3 blocks left. The flags from this cmp steer both branches
	 * below; the ld1 instructions in between do not modify them.
	 */
40262306a36Sopenharmony_ci	cmp		w4, #2
40362306a36Sopenharmony_ci	ld1		{v0.16b}, [x2], #16
40462306a36Sopenharmony_ci	blt		.Lcbc_dec_tail_load_done
40562306a36Sopenharmony_ci	ld1		{v1.16b}, [x2], #16
40662306a36Sopenharmony_ci	beq		.Lcbc_dec_tail_load_done
40762306a36Sopenharmony_ci	ld1		{v2.16b}, [x2], #16
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci.Lcbc_dec_tail_load_done:
	/*
	 * Lanes without a loaded block (always v7, possibly v5/v6 sources)
	 * carry stale data; their results are never stored.
	 */
41062306a36Sopenharmony_ci	rev32		v4.16b, v0.16b
41162306a36Sopenharmony_ci	rev32		v5.16b, v1.16b
41262306a36Sopenharmony_ci	rev32		v6.16b, v2.16b
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_ci	transpose_4x4(v4, v5, v6, v7)
41562306a36Sopenharmony_ci
41662306a36Sopenharmony_ci	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)
41762306a36Sopenharmony_ci
	/*
	 * The macro's subs destroyed the flags, so compare again. RIV is
	 * advanced block by block so that it ends up holding the last
	 * input block actually processed.
	 */
41862306a36Sopenharmony_ci	cmp		w4, #2
41962306a36Sopenharmony_ci	eor		v4.16b, v4.16b, RIV.16b
42062306a36Sopenharmony_ci	mov		RIV.16b, v0.16b
42162306a36Sopenharmony_ci	st1		{v4.16b}, [x1], #16
42262306a36Sopenharmony_ci	blt		.Lcbc_dec_end
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci	eor		v5.16b, v5.16b, v0.16b
42562306a36Sopenharmony_ci	mov		RIV.16b, v1.16b
42662306a36Sopenharmony_ci	st1		{v5.16b}, [x1], #16
42762306a36Sopenharmony_ci	beq		.Lcbc_dec_end
42862306a36Sopenharmony_ci
42962306a36Sopenharmony_ci	eor		v6.16b, v6.16b, v1.16b
43062306a36Sopenharmony_ci	mov		RIV.16b, v2.16b
43162306a36Sopenharmony_ci	st1		{v6.16b}, [x1], #16
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci.Lcbc_dec_end:
43462306a36Sopenharmony_ci	/* store new IV */
43562306a36Sopenharmony_ci	st1		{RIV.16b}, [x3]
43662306a36Sopenharmony_ci
43762306a36Sopenharmony_ci	ret
43862306a36Sopenharmony_ciSYM_FUNC_END(sm4_neon_cbc_dec)
43962306a36Sopenharmony_ci
/*
 * sm4_neon_cfb_dec - CFB-chained pass over nblocks 16-byte blocks: each
 * output block is input block i XORed with the cipher result for input
 * block i-1 (the IV for the first block). The last input block is
 * written back to [x3] as the next IV. v0 carries the IV between
 * iterations.
 *
 * Blocks are consumed eight at a time, then (at most once) four at a
 * time, then a 1-3 block tail. With nblocks == 0 no data is read or
 * written and the IV is stored back unchanged.
 *
 * Clobbers: x1, x2, w4, x5, x6, condition flags, v0-v31. x0 is advanced
 * and restored by the SM4_CRYPT_BLK* macros; x3 is preserved.
 */
44062306a36Sopenharmony_ci.align 3
44162306a36Sopenharmony_ciSYM_FUNC_START(sm4_neon_cfb_dec)
44262306a36Sopenharmony_ci	/* input:
44362306a36Sopenharmony_ci	 *   x0: round key array, CTX
44462306a36Sopenharmony_ci	 *   x1: dst
44562306a36Sopenharmony_ci	 *   x2: src
44662306a36Sopenharmony_ci	 *   x3: iv (big endian, 128 bit)
44762306a36Sopenharmony_ci	 *   w4: nblocks
44862306a36Sopenharmony_ci	 */
44962306a36Sopenharmony_ci	SM4_PREPARE()
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci	ld1		{v0.16b}, [x3]
45262306a36Sopenharmony_ci
	/*
	 * A zero count would otherwise fall through to the tail path,
	 * which always processes one block and replaces the IV with it.
	 * Branching to the end simply writes the loaded IV back.
	 */
	cbz		w4, .Lcfb_dec_end

45362306a36Sopenharmony_ci.Lcfb_dec_loop_8x:
	/* w4 goes negative (bit 31 set) when fewer than 8 blocks remain */
45462306a36Sopenharmony_ci	sub		w4, w4, #8
45562306a36Sopenharmony_ci	tbnz		w4, #31, .Lcfb_dec_4x
45662306a36Sopenharmony_ci
	/*
	 * Cipher inputs are IV, c0..c6: c0-c2 are loaded whole next to
	 * the IV in v0, c3-c6 are loaded de-interleaved (no writeback, so
	 * x2 stays 48 bytes into this group).
	 */
45762306a36Sopenharmony_ci	ld1		{v1.16b-v3.16b}, [x2], #48
45862306a36Sopenharmony_ci	ld4		{v4.4s-v7.4s}, [x2]
45962306a36Sopenharmony_ci
	/* bring the first half into the same by-word layout as ld4 gives */
46062306a36Sopenharmony_ci	transpose_4x4(v0, v1, v2, v3)
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
46362306a36Sopenharmony_ci
	/* rewind to c0 and reload all 8 input blocks untransformed */
46462306a36Sopenharmony_ci	sub		x2, x2, #48
46562306a36Sopenharmony_ci	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
46662306a36Sopenharmony_ci	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64
46762306a36Sopenharmony_ci
46862306a36Sopenharmony_ci	eor		v0.16b, v0.16b, RTMP0.16b
46962306a36Sopenharmony_ci	eor		v1.16b, v1.16b, RTMP1.16b
47062306a36Sopenharmony_ci	eor		v2.16b, v2.16b, RTMP2.16b
47162306a36Sopenharmony_ci	eor		v3.16b, v3.16b, RTMP3.16b
47262306a36Sopenharmony_ci	eor		v4.16b, v4.16b, RTMP4.16b
47362306a36Sopenharmony_ci	eor		v5.16b, v5.16b, RTMP5.16b
47462306a36Sopenharmony_ci	eor		v6.16b, v6.16b, RTMP6.16b
47562306a36Sopenharmony_ci	eor		v7.16b, v7.16b, RTMP7.16b
47662306a36Sopenharmony_ci
47762306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x1], #64
47862306a36Sopenharmony_ci	st1		{v4.16b-v7.16b}, [x1], #64
47962306a36Sopenharmony_ci
	/* last input block (c7) becomes the next IV */
48062306a36Sopenharmony_ci	mov		v0.16b, RTMP7.16b
48162306a36Sopenharmony_ci
48262306a36Sopenharmony_ci	cbz		w4, .Lcfb_dec_end
48362306a36Sopenharmony_ci	b		.Lcfb_dec_loop_8x
48462306a36Sopenharmony_ci
48562306a36Sopenharmony_ci.Lcfb_dec_4x:
	/* undo the speculative subtraction: w4 = blocks left (1..7) */
48662306a36Sopenharmony_ci	add		w4, w4, #8
48762306a36Sopenharmony_ci	cmp		w4, #4
48862306a36Sopenharmony_ci	blt		.Lcfb_dec_tail
48962306a36Sopenharmony_ci
49062306a36Sopenharmony_ci	sub		w4, w4, #4
49162306a36Sopenharmony_ci
	/* v4-v7 keep the raw input blocks c0..c3 for the final XOR */
49262306a36Sopenharmony_ci	ld1		{v4.16b-v7.16b}, [x2], #64
49362306a36Sopenharmony_ci
	/* cipher inputs: IV, c0, c1, c2 (byte-reversed per word) */
49462306a36Sopenharmony_ci	rev32		v0.16b, v0.16b		/* v0 is IV register */
49562306a36Sopenharmony_ci	rev32		v1.16b, v4.16b
49662306a36Sopenharmony_ci	rev32		v2.16b, v5.16b
49762306a36Sopenharmony_ci	rev32		v3.16b, v6.16b
49862306a36Sopenharmony_ci
49962306a36Sopenharmony_ci	transpose_4x4(v0, v1, v2, v3)
50062306a36Sopenharmony_ci
50162306a36Sopenharmony_ci	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)
50262306a36Sopenharmony_ci
50362306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v4.16b
50462306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v5.16b
50562306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v6.16b
50662306a36Sopenharmony_ci	eor		v3.16b, v3.16b, v7.16b
50762306a36Sopenharmony_ci
50862306a36Sopenharmony_ci	st1		{v0.16b-v3.16b}, [x1], #64
50962306a36Sopenharmony_ci
	/* last input block (c3) becomes the next IV */
51062306a36Sopenharmony_ci	mov		v0.16b, v7.16b
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ci	cbz		w4, .Lcfb_dec_end
51362306a36Sopenharmony_ci
51462306a36Sopenharmony_ci.Lcfb_dec_tail:
	/*
	 * 1..3 blocks left. The flags from this cmp steer both branches
	 * below; the ld1 instructions in between do not modify them.
	 */
51562306a36Sopenharmony_ci	cmp		w4, #2
51662306a36Sopenharmony_ci	ld1		{v4.16b}, [x2], #16
51762306a36Sopenharmony_ci	blt		.Lcfb_dec_tail_load_done
51862306a36Sopenharmony_ci	ld1		{v5.16b}, [x2], #16
51962306a36Sopenharmony_ci	beq		.Lcfb_dec_tail_load_done
52062306a36Sopenharmony_ci	ld1		{v6.16b}, [x2], #16
52162306a36Sopenharmony_ci
52262306a36Sopenharmony_ci.Lcfb_dec_tail_load_done:
	/*
	 * Cipher inputs: IV, c0, c1 (v3 and any lane without a loaded
	 * block carry stale data; their results are never stored).
	 */
52362306a36Sopenharmony_ci	rev32		v0.16b, v0.16b		/* v0 is IV register */
52462306a36Sopenharmony_ci	rev32		v1.16b, v4.16b
52562306a36Sopenharmony_ci	rev32		v2.16b, v5.16b
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_ci	transpose_4x4(v0, v1, v2, v3)
52862306a36Sopenharmony_ci
52962306a36Sopenharmony_ci	SM4_CRYPT_BLK4_BE(v0, v1, v2, v3)
53062306a36Sopenharmony_ci
	/*
	 * The macro's subs destroyed the flags, so compare again. v0 is
	 * overwritten after each store so that it ends up holding the
	 * last input block actually processed.
	 */
53162306a36Sopenharmony_ci	cmp		w4, #2
53262306a36Sopenharmony_ci	eor		v0.16b, v0.16b, v4.16b
53362306a36Sopenharmony_ci	st1		{v0.16b}, [x1], #16
53462306a36Sopenharmony_ci	mov		v0.16b, v4.16b
53562306a36Sopenharmony_ci	blt		.Lcfb_dec_end
53662306a36Sopenharmony_ci
53762306a36Sopenharmony_ci	eor		v1.16b, v1.16b, v5.16b
53862306a36Sopenharmony_ci	st1		{v1.16b}, [x1], #16
53962306a36Sopenharmony_ci	mov		v0.16b, v5.16b
54062306a36Sopenharmony_ci	beq		.Lcfb_dec_end
54162306a36Sopenharmony_ci
54262306a36Sopenharmony_ci	eor		v2.16b, v2.16b, v6.16b
54362306a36Sopenharmony_ci	st1		{v2.16b}, [x1], #16
54462306a36Sopenharmony_ci	mov		v0.16b, v6.16b
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci.Lcfb_dec_end:
54762306a36Sopenharmony_ci	/* store new IV */
54862306a36Sopenharmony_ci	st1		{v0.16b}, [x3]
54962306a36Sopenharmony_ci
55062306a36Sopenharmony_ci	ret
55162306a36Sopenharmony_ciSYM_FUNC_END(sm4_neon_cfb_dec)
55262306a36Sopenharmony_ci
/*
 * inc_le128(vctr): materialise the current 128-bit counter in vctr and
 * then advance the counter by one.
 *
 * The running counter is kept in x7 (upper 64 bits) and x8 (lower 64 bits)
 * in CPU byte order.  The two halves are copied into the lanes of vctr and
 * rev64 byte-swaps each 64-bit lane, so despite the macro's name the block
 * left in vctr is the big-endian encoding of the counter.
 *
 * Side effects: x7:x8 are post-incremented (adds/adc propagate the carry
 * from the low into the high half) and the condition flags are clobbered,
 * so any cmp a caller wants to branch on must be issued after this macro.
 */
#define inc_le128(vctr)                             \
		mov		vctr.d[1], x8;      \
		mov		vctr.d[0], x7;      \
		adds		x8, x8, #1;         \
		rev64		vctr.16b, vctr.16b; \
		adc		x7, x7, xzr;

.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * Each src block is XORed with a keystream block derived from the
	 * counter and written to dst.  Blocks are handled 8 at a time while
	 * at least 8 remain, then one batch of 4, then a tail of 1..3.
	 * On return [x3] holds the counter advanced by nblocks.
	 *
	 * nblocks == 0 is a no-op: no memory is read or written.
	 *
	 * Clobbers at least x5, x7, x8, v0-v31 and the condition flags
	 * (v8-v15 via RTMP0-RTMP7, v16-v31 via SM4_PREPARE).
	 */

	/*
	 * The tail path below always processes at least one block, so an
	 * empty request must not be allowed to reach it.
	 */
	cbz		w4, .Lctr_crypt_ret

	SM4_PREPARE()

	/*
	 * x7:x8 = counter in CPU byte order (x7 = upper half).  ldp already
	 * honours the data endianness, so the byte swap is only needed on
	 * little-endian kernels.
	 */
	ldp		x7, x8, [x3]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)

.Lctr_crypt_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks are left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_crypt_4x

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	/* v0-v7: counter blocks in, keystream blocks out */
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* the temporaries are free again, use them for the 8 input blocks */
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1		{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	eor		v4.16b, v4.16b, RTMP4.16b
	eor		v5.16b, v5.16b, RTMP5.16b
	eor		v6.16b, v6.16b, RTMP6.16b
	eor		v7.16b, v7.16b, RTMP7.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end
	b		.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	/* undo the speculative subtraction: w4 = blocks left (0..7) */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_crypt_tail

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/*
	 * 1..3 blocks left.  inc_le128 clobbers the condition flags, so w4
	 * has to be compared again after every use before branching.
	 */
	ld1		{v4.16b}, [x2], #16
	inc_le128(v0)
	cmp		w4, #2
	blt		.Lctr_crypt_tail_load_done

	ld1		{v5.16b}, [x2], #16
	inc_le128(v1)
	cmp		w4, #2
	beq		.Lctr_crypt_tail_load_done

	ld1		{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	/*
	 * Always run a full 4-block pass; the registers that were not given
	 * a fresh counter above hold leftover data and their results are
	 * never stored.
	 */
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/*
	 * One comparison drives both branches below: eor and st1 leave the
	 * condition flags untouched.
	 */
	cmp		w4, #2

	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x1], #16
	blt		.Lctr_crypt_end

	eor		v1.16b, v1.16b, v5.16b
	st1		{v1.16b}, [x1], #16
	beq		.Lctr_crypt_end

	eor		v2.16b, v2.16b, v6.16b
	st1		{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR (converted back to big endian where necessary) */
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	stp		x7, x8, [x3]

.Lctr_crypt_ret:
	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)
680